From 3b2a970799be115ce9bbcd0d2905906c2ad77abc Mon Sep 17 00:00:00 2001
From: Jack
Date: Tue, 29 Mar 2016 19:46:32 -0400
Subject: [PATCH] oops here's the latest

---
 BagOfWords.py | 62 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/BagOfWords.py b/BagOfWords.py
index 96f700c..bbca85d 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -4,40 +4,50 @@
 # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
 # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
-NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
-PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)
+NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
+PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)
+POSITION_TAGS = ["_1Q", "_2H", "_3Q"]
+POSITION_THRESHOLDS = [0.25, 0.75, 1]
 
-
-def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_only=False, gram_length=1, normalize_bags=True):
+# ref_bag is used to calculate the total word count across all documents.
+def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=True):
     bag_of_words = {}
-    do_negation = False
+    if use_negation:
+        do_negation = False
 
     words = nltk.word_tokenize(text)
-    if use_pos_tags:# and gram_length==1:
+    if use_pos_tags:
         tagged = nltk.pos_tag(words)
-        tagged = [string.join(t, "_") for t in tagged]
-        words = tagged
-    count = 0
+        words = [string.join(t, "_") for t in tagged]
 
     for i in range(len(words) - gram_length + 1):
         n_gram = string.join(words[i:i+gram_length], "_")
-        if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
-            if n_gram in NEGATION_WORDS:
-                do_negation = True
-            elif n_gram in PUNCTUATION:
-                do_negation = False
-            if do_negation:
-                n_gram = "NOT_" + n_gram
+        if use_negation:
+            if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
+                if n_gram in NEGATION_WORDS:
+                    do_negation = True
+                elif n_gram in PUNCTUATION:
+                    do_negation = False
+                if do_negation:
+                    n_gram = "NOT_" + n_gram
+
+        if use_position:
+            for j in range(len(POSITION_TAGS)):
+                if float(i)/len(words) < POSITION_THRESHOLDS[j]:
+                    n_gram += POSITION_TAGS[j]
+                    break
+
         # LIBSVM won't use strings as keys, so hash to convert to a number.
-        index = hash(n_gram)
+        if use_hash:
+            index = hash(n_gram)
+        else:
+            index = n_gram
+
         if not (use_pos_tags and use_adj_only and (tagged[i][1] != "JJ")):
-        #if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
             if (not use_presence) and bag_of_words.has_key(index):
                 bag_of_words[index] += 1
-                count += 1
             else:
                 bag_of_words[index] = 1
-                count += 1
 
             # Add it to the reference bag
             if ref_bag != None:
@@ -46,9 +56,11 @@ def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_onl
                 else:
                     ref_bag[index] = 1
 
-    # TODO do this correctly
-
-    #if normalize_bags:
-    #    for k in bag_of_words.keys():
-    #        bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
+    if normalize:
+        length = 0
+        for k in bag_of_words.keys():
+            length += (bag_of_words[k]**2)
+        length **= 0.5
+        for k in bag_of_words.keys():
+            bag_of_words[k] = float(bag_of_words[k])/length
     return bag_of_words
\ No newline at end of file
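
Not part of the patch, but for reference, here is a minimal sketch of how the reworked make() might be called once this is applied. The module import name, the sample review string, and the shared ref_bag dict are illustrative; it assumes BagOfWords.py is importable, that the unchanged top of the file already imports nltk and string, and that the NLTK "punkt" tokenizer data is installed. Like the module itself, this is Python 2.

# Hypothetical usage sketch for the patched make(); names here are illustrative only.
import BagOfWords

ref_bag = {}  # shared across documents; make() also accumulates corpus-wide counts here
review = "This film isn't good, but the ending was not terrible."

# Unigrams with negation tagging and L2 normalization, hashed keys for LIBSVM.
bag = BagOfWords.make(review, ref_bag=ref_bag, gram_length=1,
                      use_negation=True, use_presence=False,
                      use_hash=True, normalize=True)

# Presence-only bigrams with position tags, keeping string keys for inspection.
bigram_bag = BagOfWords.make(review, gram_length=2, use_presence=True,
                             use_position=True, use_hash=False, normalize=False)

print len(bag), sorted(bigram_bag)[:5]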