Skip to content

Commit

Permalink
oops here's the latest
Browse files Browse the repository at this point in the history
  • Loading branch information
job13011 committed Mar 29, 2016
1 parent 2a8d5ef commit 3b2a970
Showing 1 changed file with 37 additions and 25 deletions.
62 changes: 37 additions & 25 deletions BagOfWords.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,50 @@
# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list, so the negation words below are only a partial set.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags: a tagged token is joined as word_TAG (e.g. "._."), so it never matches an entry in this list
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags: a tagged token is joined as word_TAG (e.g. "._."), so it never matches an entry in this list
POSITION_TAGS = ["_1Q", "_2H", "_3Q"]
POSITION_THRESHOLDS = [0.25, 0.75, 1]


def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_only=False, gram_length=1, normalize_bags=True):
# ref_bag is used to calculate the total word count across all documents.
def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=True):
bag_of_words = {}
do_negation = False
if use_negation:
do_negation = False

words = nltk.word_tokenize(text)
if use_pos_tags:# and gram_length==1:
if use_pos_tags:
tagged = nltk.pos_tag(words)
tagged = [string.join(t, "_") for t in tagged]
words = tagged
count = 0
words = [string.join(t, "_") for t in tagged]
for i in range(len(words) - gram_length + 1):
n_gram = string.join(words[i:i+gram_length], "_")
if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
do_negation = True
elif n_gram in PUNCTUATION:
do_negation = False
if do_negation:
n_gram = "NOT_" + n_gram

if use_negation:
if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
do_negation = True
elif n_gram in PUNCTUATION:
do_negation = False
if do_negation:
n_gram = "NOT_" + n_gram

if use_position:
for j in range(len(POSITION_TAGS)):
if float(i)/len(words) < POSITION_THRESHOLDS[j]:
n_gram += POSITION_TAGS[j]
break

# LIBSVM won't use strings as keys, so hash to convert to a number.
index = hash(n_gram)
if use_hash:
index = hash(n_gram)
else:
index = n_gram

if not (use_pos_tags and use_adj_only and (tagged[i][1] != "JJ")):
#if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
if (not use_presence) and bag_of_words.has_key(index):
bag_of_words[index] += 1
count += 1
else:
bag_of_words[index] = 1
count += 1

# Add it to the reference bag
if ref_bag != None:
Expand All @@ -46,9 +56,11 @@ def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_onl
else:
ref_bag[index] = 1

# TODO: do this normalization correctly

#if normalize_bags:
# for k in bag_of_words.keys():
# bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
if normalize:
length = 0
for k in bag_of_words.keys():
length += (bag_of_words[k]**2)
length **= 0.5
for k in bag_of_words.keys():
bag_of_words[k] = float(bag_of_words[k])/length
return bag_of_words

0 comments on commit 3b2a970

Please sign in to comment.