def make_tfidf(document, documents):
    """Build a length-normalised bag of TF-IDF weights for one document.

    Args:
        document: list of words (pre-tokenized with nltk.word_tokenize()).
        documents: passed unchanged to ``tfidf`` together with ``document``;
            presumably the corpus of pre-tokenized documents -- confirm
            against the TFIDF module.

    Returns:
        A dict mapping each distinct word of ``document`` whose TF-IDF weight
        is non-zero to that weight divided by the Euclidean norm of all kept
        weights. The dict is empty when no word has a non-zero weight.
    """
    weights = {}
    squared_sum = 0
    for word in set(document):
        score = tfidf(word, document, documents)
        if score == 0:
            # Zero-weight words are left out of the bag entirely.
            continue
        weights[word] = score
        squared_sum += score ** 2

    # Euclidean length of the weight vector; only used when at least one
    # weight was kept, in which case it is built from non-zero scores.
    norm = squared_sum ** 0.5
    return {word: score / norm for word, score in weights.items()}