Skip to content
Permalink
9c7ab97c2d
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
25 lines (24 sloc) 911 Bytes
import math
import nltk
# document is assumed to be tokenized (a list of words)
# documents is a list of tokenized docs
def tfidf(term, document, documents):
    """Return the tf-idf weight of ``term`` in ``document``.

    The weight is ``(1 + log10(tf)) * log10(N / df)`` where ``tf`` is the
    number of times ``term`` occurs in ``document``, ``df`` is the number of
    entries in ``documents`` that contain ``term``, and ``N`` is
    ``len(documents)``.

    Args:
        term: The token to score.
        document: A tokenized document (an iterable of tokens, e.g. a list
            of words).
        documents: The collection, as a sized sequence of tokenized
            documents.

    Returns:
        The tf-idf weight as a float, or ``0`` when ``term`` does not occur
        in ``document`` or does not occur in any entry of ``documents``.
    """
    # Term frequency: number of appearances of the term in this document.
    term_count = sum(1 for word in document if term == word)
    if term_count == 0:
        # This happens sometimes, probably due to inconsistent
        # splitting/tokenizing upstream. Bail out before scanning the
        # whole collection.
        return 0

    # Document frequency: number of documents in which the term appears.
    docs_with_term = sum(1 for doc in documents if term in doc)
    if docs_with_term == 0:
        # `document` is not necessarily a member of `documents`, so the
        # term can be absent from the collection; avoid dividing by zero.
        return 0

    num_docs = len(documents)
    # math.log10 is more accurate than math.log(x, 10), which divides two
    # natural logarithms (math.log(1000, 10) == 2.9999999999999996).
    # float() keeps the ratio a true division under Python 2 as well.
    sublinear_tf = 1 + math.log10(term_count)
    idf = math.log10(float(num_docs) / docs_with_term)
    return sublinear_tf * idf