Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
added new files
- Loading branch information
Showing
2 changed files
with
46 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
def load(filename="words3.txt"):
    """Load a sentiment lexicon from *filename*.

    Each line is expected to be "word tag" where tag is "p" (positive)
    or "n" (negative). Duplicate words are kept once (first occurrence
    wins); lines that do not split into exactly two fields are skipped,
    matching the original's best-effort behavior.

    Args:
        filename: path to the lexicon file (default "words3.txt",
            preserving the original hard-coded behavior).

    Returns:
        (words, scores): parallel lists; scores[i] is 1 for "p", -1 for "n".

    Raises:
        ValueError: if a line carries an unrecognized score tag.
            (The original printed an expletive and called exit(),
            killing the whole process; raising is the library-appropriate
            way to report bad data.)
    """
    words = []
    scores = []
    # "with" guarantees the file handle is closed; the original leaked it.
    with open(filename, "r") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) != 2:
                # Malformed line: the original swallowed the unpacking
                # error with a bare except; skip it explicitly instead.
                continue
            word, score = parts
            if word in words:
                continue  # keep the word list unique
            words.append(word)
            if score == "p":
                scores.append(1)
            elif score == "n":
                scores.append(-1)
            else:
                raise ValueError(
                    "unrecognized score tag %r for word %r" % (score, word)
                )
    return (words, scores)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import math | ||
import nltk | ||
|
||
def tfidf(term, document, documents):
    """Compute the tf-idf weight of *term* in *document*.

    Args:
        term: the word to weight.
        document: the tokenized document (a list of words) being scored.
        documents: the collection, as a list of tokenized documents.

    Returns:
        (1 + log10(tf)) * log10(N / df), where tf is the count of *term*
        in *document*, N is len(documents), and df is the number of
        documents containing *term*. Returns 0 when tf or df is zero,
        which the original noted can happen with inconsistent
        splitting/tokenizing between *document* and *documents*.
    """
    # df: number of documents in the collection in which the term appears.
    all_doc_appearances = sum(1 for doc in documents if term in doc)
    # tf: raw count of the term in this document.
    doc_appearances = document.count(term)
    num_docs = len(documents)
    # Either count being zero would make the logs blow up; return a
    # neutral weight instead (merges the two duplicate branches of the
    # original and drops its commented-out debug prints).
    if doc_appearances == 0 or all_doc_appearances == 0:
        return 0
    return (1 + math.log(doc_appearances, 10)) * math.log(
        float(num_docs) / all_doc_appearances, 10
    )