From 3b2a970799be115ce9bbcd0d2905906c2ad77abc Mon Sep 17 00:00:00 2001
From: Jack
Date: Tue, 29 Mar 2016 19:46:32 -0400
Subject: [PATCH] oops here's the latest

---
 BagOfWords.py | 62 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/BagOfWords.py b/BagOfWords.py
index 96f700c..bbca85d 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -4,40 +4,50 @@
 # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
 # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
-NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
-PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)
+NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
+PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)
+POSITION_TAGS = ["_1Q", "_2H", "_3Q"]
+POSITION_THRESHOLDS = [0.25, 0.75, 1]
 
-
-def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_only=False, gram_length=1, normalize_bags=True):
+# ref_bag is used to calculate the total word count across all documents.
+def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=True):
     bag_of_words = {}
-    do_negation = False
+    if use_negation:
+        do_negation = False
 
     words = nltk.word_tokenize(text)
-    if use_pos_tags:# and gram_length==1:
+    if use_pos_tags:
         tagged = nltk.pos_tag(words)
-        tagged = [string.join(t, "_") for t in tagged]
-        words = tagged
-    count = 0
+        words = [string.join(t, "_") for t in tagged]
 
     for i in range(len(words) - gram_length + 1):
         n_gram = string.join(words[i:i+gram_length], "_")
-        if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
-            if n_gram in NEGATION_WORDS:
-                do_negation = True
-            elif n_gram in PUNCTUATION:
-                do_negation = False
-            if do_negation:
-                n_gram = "NOT_" + n_gram
+        if use_negation:
+            if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
+                if n_gram in NEGATION_WORDS:
+                    do_negation = True
+                elif n_gram in PUNCTUATION:
+                    do_negation = False
+                if do_negation:
+                    n_gram = "NOT_" + n_gram
+
+        if use_position:
+            for j in range(len(POSITION_TAGS)):
+                if float(i)/len(words) < POSITION_THRESHOLDS[j]:
+                    n_gram += POSITION_TAGS[j]
+                    break
+
         # LIBSVM won't use strings as keys, so hash to convert to a number.
-        index = hash(n_gram)
+        if use_hash:
+            index = hash(n_gram)
+        else:
+            index = n_gram
+
         if not (use_pos_tags and use_adj_only and (tagged[i][1] != "JJ")):
-        #if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
             if (not use_presence) and bag_of_words.has_key(index):
                 bag_of_words[index] += 1
-                count += 1
             else:
                 bag_of_words[index] = 1
-                count += 1
 
             # Add it to the reference bag
             if ref_bag != None:
@@ -46,9 +56,11 @@ def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_onl
                 else:
                     ref_bag[index] = 1
 
-    # TODO do this correctly
-
-    #if normalize_bags:
-    #    for k in bag_of_words.keys():
-    #        bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
+    if normalize:
+        length = 0
+        for k in bag_of_words.keys():
+            length += (bag_of_words[k]**2)
+        length **= 0.5
+        for k in bag_of_words.keys():
+            bag_of_words[k] = float(bag_of_words[k])/length
     return bag_of_words
\ No newline at end of file
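
Not part of the patch, but for reference, here is a minimal sketch of how the reworked make() might be called once this is applied. The module import name, the sample review string, and the shared ref_bag dict are illustrative; it assumes BagOfWords.py is importable, that the unchanged top of the file already imports nltk and string, and that the NLTK "punkt" tokenizer data is installed. Like the module itself, this is Python 2.

# Hypothetical usage sketch for the patched make(); names here are illustrative only.
import BagOfWords

ref_bag = {}  # shared across documents; make() also accumulates corpus-wide counts here
review = "This film isn't good, but the ending was not terrible."

# Unigrams with negation tagging and L2 normalization, hashed keys for LIBSVM.
bag = BagOfWords.make(review, ref_bag=ref_bag, gram_length=1,
                      use_negation=True, use_presence=False,
                      use_hash=True, normalize=True)

# Presence-only bigrams with position tags, keeping string keys for inspection.
bigram_bag = BagOfWords.make(review, gram_length=2, use_presence=True,
                             use_position=True, use_hash=False, normalize=False)

print len(bag), sorted(bigram_bag)[:5]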