Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
also modified BagOfWords
  • Loading branch information
job13011 committed Apr 2, 2016
1 parent e5cdf0f commit 3672009
Showing 1 changed file with 19 additions and 3 deletions.
22 changes: 19 additions & 3 deletions BagOfWords.py
@@ -1,5 +1,7 @@
from __future__ import division
import nltk
import string
from TFIDF import tfidf

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
Expand Down Expand Up @@ -33,7 +35,7 @@ def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fal

if use_position:
for j in range(len(POSITION_TAGS)):
if float(i)/len(words) < POSITION_THRESHOLDS[j]:
if i/len(words) < POSITION_THRESHOLDS[j]:
n_gram += POSITION_TAGS[j]
break

Expand Down Expand Up @@ -62,5 +64,19 @@ def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fal
length += (bag_of_words[k]**2)
length **= 0.5
for k in bag_of_words.keys():
bag_of_words[k] = float(bag_of_words[k])/length
return bag_of_words
bag_of_words[k] = bag_of_words[k]/length
return bag_of_words

# document is a list of words (pre-tokenized with nltk.word_tokenize()); documents is presumably a list of such word lists
def make_tfidf(document, documents):
    """Return a unit-length bag of tf-idf weights for ``document``.

    Each distinct term of ``document`` is scored with
    ``tfidf(term, document, documents)``. Terms whose score is zero are
    left out; the remaining scores are divided by their Euclidean norm.

    Args:
        document: iterable of hashable terms to score.
        documents: passed through unchanged to ``tfidf`` as its third
            argument.

    Returns:
        dict mapping each retained term to its normalised weight. Empty
        when no term has a nonzero score.
    """
    weights = {}
    squared_total = 0
    for word in set(document):
        score = tfidf(word, document, documents)
        if score == 0:
            # Zero-weight terms contribute nothing and are not stored.
            continue
        weights[word] = score
        squared_total += score ** 2

    # Euclidean (L2) norm of the retained scores.
    norm = squared_total ** 0.5

    # When `weights` is empty the loop body never runs, so a zero norm
    # is never used as a divisor.
    for word in weights:
        weights[word] /= norm
    return weights

0 comments on commit 3672009

Please sign in to comment.