Commit

added new files
job13011 committed Apr 3, 2016
1 parent 23db3b3 commit 9c7ab97
Showing 2 changed files with 46 additions and 0 deletions.
21 changes: 21 additions & 0 deletions AniaLexicon.py
@@ -0,0 +1,21 @@
def load():
    # Read the lexicon from words3.txt: one "word label" pair per line,
    # where the label is "p" (positive, score +1) or "n" (negative, score -1).
    f = open("words3.txt", "r")
    lines = f.readlines()
    f.close()
    words = []
    scores = []
    for line in lines:
        try:
            (word, score) = line.rstrip().split(" ")
            if not (word in words):
                words.append(word)
                if score == "p":
                    scores.append(1)
                elif score == "n":
                    scores.append(-1)
                else:
                    print "Error: unexpected label for", word
                    exit()
        except:
            # skip lines that do not split into exactly a word and a label
            pass

    return (words, scores)
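A minimal usage sketch, assuming words3.txt exists in the working directory and the module is imported as AniaLexicon:

# Hypothetical usage sketch; assumes words3.txt is present alongside the script.
import AniaLexicon

words, scores = AniaLexicon.load()
print len(words), "lexicon entries loaded"
print words[0], scores[0]  # a word and its +1 / -1 polarity score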
25 changes: 25 additions & 0 deletions TFIDF.py
@@ -0,0 +1,25 @@
import math
import nltk  # currently unused in this module

# document is assumed to be tokenized (a list of words)
# documents is a list of tokenized docs
def tfidf(term, document, documents):
    all_doc_appearances = 0  # number of documents in which term appears
    for doc in documents:
        if term in doc:
            all_doc_appearances += 1
    doc_appearances = 0  # number of appearances of term in this document
    for word in document:
        if term == word:
            doc_appearances += 1
    num_docs = len(documents)  # number of documents in the collection
    if doc_appearances == 0:
        # This happens sometimes, probably due to inconsistent splitting/tokenizing.
        # print "Error: no occurrences of", term
        return 0
    elif all_doc_appearances == 0:
        # print "Error: term appears in no document:", term
        return 0
    else:
        # tf-idf = (1 + log10(term frequency)) * log10(num_docs / document frequency)
        tfidf = (1 + math.log(doc_appearances, 10)) * math.log(float(num_docs) / all_doc_appearances, 10)
        return tfidf
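
A minimal usage sketch with toy, whitespace-tokenized documents, assuming TFIDF.py is importable from the working directory; the document data here is made up for illustration:

# Hypothetical usage with toy documents.
from TFIDF import tfidf

docs = ["the cat sat".split(), "the dog sat".split(), "the cat ran".split()]
print tfidf("cat", docs[0], docs)  # positive: "cat" is in this doc and in 2 of the 3 docs
print tfidf("the", docs[0], docs)  # 0.0: "the" is in every doc, so log10(3/3) = 0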
