from __future__ import division | |

import math | |

import nltk | |

# A `document` is assumed to be tokenized (a list of words).

# `documents` is a list of tokenized documents.

def compute_idfs(documents):
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        documents: A list of tokenized documents, each an iterable of
            hashable terms (typically a list of words).

    Returns:
        A dict mapping each term to ``log10(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents that
        contain the term at least once. Empty when ``documents`` is empty.
    """
    # float() guarantees true division regardless of the interpreter version
    # or whether ``from __future__ import division`` is in effect.
    n_docs = float(len(documents))

    doc_freq = {}
    for doc in documents:
        # Deduplicate per document: a term repeated inside one document must
        # count once, otherwise df can exceed N and the idf turns negative.
        for term in set(doc):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    # Base 10 matches the logarithm tfidf() uses when it derives the idf
    # itself, so both code paths yield the same weights.
    return {
        term: math.log(n_docs / count, 10)
        for term, count in doc_freq.items()
    }

def tfidf(term, document, documents, idfs=None):
    """Return the log-scaled tf-idf weight of ``term`` in ``document``.

    The weight is ``(1 + log10(tf)) * idf``, where ``tf`` is the number of
    times ``term`` occurs in ``document``.

    Args:
        term: The token to score.
        document: The tokenized document (an iterable of terms) being scored.
        documents: The corpus, a list of tokenized documents. Only consulted
            when ``idfs`` is omitted or empty.
        idfs: Optional precomputed mapping of term -> idf. When omitted or
            empty, the idf is derived from ``documents`` as
            ``log10(N / df)``.

    Returns:
        The tf-idf weight as a float. The result is 0 when ``term`` does not
        occur in ``document``, is missing from a supplied ``idfs`` table, or
        occurs in none of ``documents``.
    """
    # Term frequency first: log10(0) is undefined, and a term that is absent
    # from the document carries no weight anyway.
    term_count = sum(1 for word in document if word == term)
    if term_count == 0:
        return 0.0

    if idfs:
        if term not in idfs:
            # The term was not present when the idf table was built.
            return 0.0
        idf = idfs[term]
    else:
        containing = sum(1 for doc in documents if term in doc)
        if containing == 0:
            # The term is in no corpus document; avoid dividing by zero.
            return 0.0
        # float() guarantees true division regardless of interpreter version.
        idf = math.log(float(len(documents)) / containing, 10)

    return (1 + math.log(term_count, 10)) * idf

# Delta TF-IDF, after Martineau and Finin (2009).

def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
    """Return the tf-idf of ``term`` against the positive corpus minus that
    against the negative corpus.

    Args:
        term: The token to score.
        document: The tokenized document in which ``term`` is counted.
        positive_set: Corpus passed to ``tfidf`` for the first operand.
        negative_set: Corpus passed to ``tfidf`` for the second operand.
        pos_idfs: Idf table forwarded to ``tfidf`` with ``positive_set``.
        neg_idfs: Idf table forwarded to ``tfidf`` with ``negative_set``.

    Returns:
        ``tfidf(term, document, positive_set, pos_idfs)`` minus
        ``tfidf(term, document, negative_set, neg_idfs)``.
    """
    positive_weight = tfidf(term, document, positive_set, pos_idfs)
    negative_weight = tfidf(term, document, negative_set, neg_idfs)
    return positive_weight - negative_weight