# ---------------------------------------------------------------------------
# AniaLexicon.py
# ---------------------------------------------------------------------------
def load():
    """Load the sentiment lexicon from ``words3.txt``.

    Each line of the file is expected to be ``"<word> <tag>"`` where the tag
    is ``"p"`` (positive) or ``"n"`` (negative).

    Returns:
        (words, scores): parallel lists — ``words[i]`` is a unique lexicon
        word and ``scores[i]`` is ``1`` for positive or ``-1`` for negative.

    Raises:
        ValueError: if a line carries a polarity tag other than "p" or "n".
            (The original printed an error and called exit(), but the bare
            ``except`` swallowed the SystemExit AND had already appended the
            word, silently desynchronizing the two lists.)

    Lines that do not split into exactly two fields are skipped, matching the
    original's silent tolerance of malformed input.
    """
    words = []
    scores = []
    # ``with`` guarantees the file is closed; the original leaked the handle.
    with open("words3.txt", "r") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) != 2:
                continue  # malformed line: skip, as the old except/pass did
            word, tag = parts
            if word in words:
                continue  # keep only the first occurrence of each word
            # Validate the tag BEFORE appending the word, so words/scores
            # can never get out of step (the original appended first).
            if tag == "p":
                polarity = 1
            elif tag == "n":
                polarity = -1
            else:
                raise ValueError(
                    "unrecognized polarity tag %r for word %r" % (tag, word)
                )
            words.append(word)
            scores.append(polarity)
    return (words, scores)


# ---------------------------------------------------------------------------
# TFIDF.py
# ---------------------------------------------------------------------------
import math

# NOTE(review): the original also did ``import nltk`` but never used it;
# removed since the whole file is visible in this patch.


def tfidf(term, document, documents):
    """Compute the tf-idf weight of ``term`` in ``document``.

    Args:
        term: the word to score.
        document: a tokenized document (list of words).
        documents: the collection, a list of tokenized documents.

    Returns:
        ``(1 + log10(tf)) * log10(N / df)`` where ``tf`` is the count of
        ``term`` in ``document``, ``N = len(documents)`` and ``df`` is the
        number of documents containing ``term``.  Returns 0 when the term
        does not occur in ``document`` (can happen with inconsistent
        tokenization) or in no document at all (possible when ``document``
        is not itself a member of ``documents``).
    """
    tf = document.count(term)  # occurrences of term in this document
    if tf == 0:
        return 0
    df = sum(1 for doc in documents if term in doc)  # document frequency
    if df == 0:
        return 0
    n_docs = len(documents)
    return (1 + math.log(tf, 10)) * math.log(float(n_docs) / df, 10)