Skip to content
Permalink
4e6b38c6e6
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
49 lines (45 sloc) 1.41 KB
from __future__ import division
import math
import nltk
# document is assumed to be tokenized (a list of words)
# documents is a list of tokenized docs
def compute_idfs(documents):
    """Return a dict mapping each term to its inverse document frequency.

    ``documents`` is a list of tokenized documents (each a list of
    hashable terms).  For every term that occurs anywhere in the
    collection the result holds ``log10(N / df)``, where ``N`` is the
    number of documents and ``df`` is the number of documents that
    contain the term at least once.

    Base-10 logarithms are used so the values agree with the IDF that
    ``tfidf`` computes on the fly when it is not given a precomputed table.

    An empty ``documents`` list yields an empty dict.
    """
    doc_freq = {}
    for doc in documents:
        # set(): a term repeated inside one document must count once,
        # otherwise df can exceed N and the IDF turns negative.
        for term in set(doc):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    # float() keeps the ratio a true division even without the
    # module-level ``from __future__ import division``.
    n_docs = float(len(documents))
    return {term: math.log(n_docs / df, 10) for term, df in doc_freq.items()}
def tfidf(term, document, documents, idfs=None):
    """Return the log-scaled TF-IDF weight of ``term`` in ``document``.

    The weight is ``(1 + log10(tf)) * idf`` where ``tf`` is the number of
    times ``term`` occurs in ``document`` (a list of tokens).

    ``idfs`` is an optional precomputed term -> IDF mapping.  When it is
    omitted or empty, the IDF is computed on the fly as
    ``log10(len(documents) / df)`` with ``df`` the number of entries of
    ``documents`` that contain ``term``.

    Returns 0 when the weight is undefined: the term does not occur in
    ``document``, it is missing from a supplied ``idfs`` table, or (on the
    fallback path) it occurs in none of ``documents``.
    """
    doc_appearances = sum(1 for word in document if term == word)
    if doc_appearances == 0:
        # log10(0) is undefined; an absent term carries no weight.
        return 0.0

    if idfs:
        if term not in idfs:
            return 0.0
        idf = idfs[term]
    else:
        all_doc_appearances = sum(1 for doc in documents if term in doc)
        if all_doc_appearances == 0:
            # Avoid dividing by zero when no document contains the term.
            return 0.0
        idf = math.log(float(len(documents)) / all_doc_appearances, 10)

    return (1 + math.log(doc_appearances, 10)) * idf
# Martineau and Finin 2009
def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
    """Return the Delta TF-IDF score of ``term`` in ``document``.

    The score is the TF-IDF of the term measured against ``positive_set``
    minus its TF-IDF measured against ``negative_set``.  ``pos_idfs`` and
    ``neg_idfs`` are passed through to ``tfidf`` as the precomputed IDF
    tables for the respective collections.
    """
    positive_score = tfidf(term, document, positive_set, pos_idfs)
    negative_score = tfidf(term, document, negative_set, neg_idfs)
    return positive_score - negative_score