# TFIDF.py

from __future__ import division
import math
import nltk

# document is assumed to be tokenized (a list of words)
# documents is a list of tokenized docs

def compute_idfs(documents):
  """Compute the inverse document frequency of every term in a corpus.

  Args:
    documents: a list of tokenized documents, each a list of words.

  Returns:
    A dict mapping each term to log10(N / df), where N is the number of
    documents and df is the number of documents that contain the term.
    An empty corpus yields an empty dict.
  """
  doc_freqs = {}
  for doc in documents:
    # set(): a term repeated inside one document must count only once,
    # otherwise df can exceed N and the IDF turns negative.
    for term in set(doc):
      doc_freqs[term] = doc_freqs.get(term, 0) + 1
  # float(): true division even without `from __future__ import division`.
  n_docs = float(len(documents))
  # Base 10 matches the logarithm tfidf() uses when it derives the IDF itself.
  return {term: math.log10(n_docs / df) for term, df in doc_freqs.items()}

def tfidf(term, document, documents, idfs=None):
  """Return the log-scaled TF-IDF weight of `term` in `document`.

  The weight is (1 + log10(tf)) * idf, where tf is the number of times
  `term` occurs in `document`.

  Args:
    term: the word to score.
    document: a tokenized document (a list of words).
    documents: the corpus, a list of tokenized documents. Only consulted
      when `idfs` is empty or None.
    idfs: optional precomputed mapping of term -> IDF. When empty or None,
      the IDF is computed here as log10(N / df).

  Returns:
    The TF-IDF weight, or 0 when the term does not occur in `document`,
    is missing from `idfs`, or occurs in no document of the corpus.
  """
  term_count = sum(1 for word in document if term == word)
  if term_count == 0:
    # log10(0) is undefined; a term absent from the document carries no weight.
    return 0
  if idfs:
    if term not in idfs:
      return 0
    idf = idfs[term]
  else:
    containing = sum(1 for doc in documents if term in doc)
    if containing == 0:
      # Avoid dividing by zero for a term the corpus never contains.
      return 0
    # float(): true division even without `from __future__ import division`.
    idf = math.log10(float(len(documents)) / containing)
  return (1 + math.log10(term_count)) * idf

# Martineau and Finin 2009
def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
  """Return the Delta TF-IDF score (Martineau and Finin 2009) of `term`.

  The score is the TF-IDF of `term` in `document` measured against
  `positive_set` minus the TF-IDF measured against `negative_set`.
  `pos_idfs` and `neg_idfs` are handed to tfidf() as the IDF tables for
  the positive and negative corpus respectively.
  """
  positive_score = tfidf(term, document, positive_set, pos_idfs)
  negative_score = tfidf(term, document, negative_set, neg_idfs)
  return positive_score - negative_score
	from __future__ import division
	import math
	import nltk

	# document is assumed to be tokenized (a list of words)
	# documents is a list of tokenized docs

	def compute_idfs(documents):
	  """Compute the inverse document frequency of every term in a corpus.

	  Args:
	    documents: a list of tokenized documents, each a list of words.

	  Returns:
	    A dict mapping each term to log10(N / df), where N is the number of
	    documents and df is the number of documents that contain the term.
	    An empty corpus yields an empty dict.
	  """
	  doc_freqs = {}
	  for doc in documents:
	    # set(): a term repeated inside one document must count only once,
	    # otherwise df can exceed N and the IDF turns negative.
	    for term in set(doc):
	      doc_freqs[term] = doc_freqs.get(term, 0) + 1
	  # float(): true division even without `from __future__ import division`.
	  n_docs = float(len(documents))
	  # Base 10 keeps the IDF on the same scale as a base-10 term-frequency log.
	  return {term: math.log10(n_docs / df) for term, df in doc_freqs.items()}

	def tfidf(term, document, documents, idfs=None):
	  """Return the log-scaled TF-IDF weight of `term` in `document`.

	  The weight is (1 + log10(tf)) * idf, where tf is the number of times
	  `term` occurs in `document`.

	  Args:
	    term: the word to score.
	    document: a tokenized document (a list of words).
	    documents: the corpus, a list of tokenized documents. Only consulted
	      when `idfs` is empty or None.
	    idfs: optional precomputed mapping of term -> IDF. When empty or None,
	      the IDF is computed here as log10(N / df).

	  Returns:
	    The TF-IDF weight, or 0 when the term does not occur in `document`,
	    is missing from `idfs`, or occurs in no document of the corpus.
	  """
	  term_count = sum(1 for word in document if term == word)
	  if term_count == 0:
	    # log10(0) is undefined; a term absent from the document carries no weight.
	    return 0
	  if idfs:
	    if term not in idfs:
	      return 0
	    idf = idfs[term]
	  else:
	    containing = sum(1 for doc in documents if term in doc)
	    if containing == 0:
	      # Avoid dividing by zero for a term the corpus never contains.
	      return 0
	    # float(): true division even without `from __future__ import division`.
	    idf = math.log10(float(len(documents)) / containing)
	  return (1 + math.log10(term_count)) * idf

	# Martineau and Finin 2009
	def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
	  """Return the Delta TF-IDF score (Martineau and Finin 2009) of `term`.

	  The score is the TF-IDF of `term` in `document` measured against
	  `positive_set` minus the TF-IDF measured against `negative_set`.
	  `pos_idfs` and `neg_idfs` are handed to tfidf() as the IDF tables for
	  the positive and negative corpus respectively.
	  """
	  positive_score = tfidf(term, document, positive_set, pos_idfs)
	  negative_score = tfidf(term, document, negative_set, neg_idfs)
	  return positive_score - negative_score