Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
50 lines (45 sloc) 1.41 KB
from __future__ import division
import math
import nltk
# document is assumed to be tokenized (a list of words)
# documents is a list of tokenized docs
def compute_idfs(documents):
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        documents: a list of tokenized documents, each one an iterable of
            hashable tokens (typically a list of words).

    Returns:
        A dict mapping each term to ``log(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents that
        contain the term. An empty corpus yields an empty dict.

    Note:
        The natural logarithm is used here, while the fallback path inside
        ``tfidf`` uses base 10; the two scales differ by a constant factor.
    """
    total_docs = len(documents)
    doc_freq = {}
    for doc in documents:
        # Deduplicate so a term repeated inside one document is counted
        # once: IDF is defined on document frequency, not on raw token
        # counts (raw counts could exceed N and give a negative IDF).
        for term in set(doc):
            doc_freq[term] = doc_freq.get(term, 0) + 1
    # Relies on true division (the file imports it from __future__).
    return {term: math.log(total_docs / df) for term, df in doc_freq.items()}
def tfidf(term, document, documents, idfs=None):
    """Return the log-scaled TF-IDF weight of ``term`` within ``document``.

    The weight is ``(1 + log10(tf)) * idf`` where ``tf`` is the number of
    times ``term`` occurs in ``document``.

    Args:
        term: the token to score.
        document: the tokenized document (an iterable of tokens) in which
            the term frequency is measured.
        documents: the corpus, a list of tokenized documents. Only consulted
            when no precomputed ``idfs`` table is supplied.
        idfs: optional dict mapping terms to precomputed IDF values. When it
            is missing or empty, the IDF is computed on the fly as
            ``log10(N / df)`` over ``documents``.

    Returns:
        The TF-IDF weight as a float, or 0 when the term does not occur in
        ``document``, is absent from a supplied ``idfs`` table, or appears
        in no document of the corpus.
    """
    if idfs:
        if term not in idfs:
            # Term unseen when the table was built: contribute nothing.
            return 0
        idf = idfs[term]
    else:
        containing_docs = sum(1 for doc in documents if term in doc)
        if containing_docs == 0:
            # Avoid dividing by zero when the corpus never mentions the term.
            return 0
        idf = math.log(len(documents) / containing_docs, 10)

    term_count = sum(1 for word in document if term == word)
    if term_count == 0:
        # log10(0) is undefined; a term missing from the document (e.g. due
        # to inconsistent tokenizing) simply has zero weight.
        return 0
    return (1 + math.log(term_count, 10)) * idf
# Martineau and Finin 2009
def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
    """Return the difference between a term's TF-IDF in two corpora.

    Scores ``term`` in ``document`` once against ``positive_set`` (with the
    optional ``pos_idfs`` table) and once against ``negative_set`` (with the
    optional ``neg_idfs`` table), then subtracts the second score from the
    first. The IDF tables are passed through to ``tfidf`` unchanged.
    """
    positive_score = tfidf(term, document, positive_set, pos_idfs)
    negative_score = tfidf(term, document, negative_set, neg_idfs)
    return positive_score - negative_score
You can’t perform that action at this time.