
Commit

Moved bag of words to its own file.
job13011 committed Mar 17, 2016
1 parent 47c6a2a commit 593be58
Showing 2 changed files with 163 additions and 113 deletions.
54 changes: 54 additions & 0 deletions BagOfWords.py
@@ -0,0 +1,54 @@
import nltk
import string

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)


def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_only=False, gram_length=1, normalize_bags=True):
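    # Returns {hash(n-gram): count} for this document (the count stays at 1 when
    # use_presence is set). If ref_bag is given, corpus-wide occurrence counts are
    # also accumulated into it as a side effect.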
bag_of_words = {}
do_negation = False

words = nltk.word_tokenize(text)
if use_pos_tags:# and gram_length==1:
        tagged = nltk.pos_tag(words)  # keep the (word, POS-tag) tuples so tagged[i][1] below is the tag
        words = [string.join(t, "_") for t in tagged]  # the n-gram tokens become e.g. "good_JJ"
count = 0
for i in range(len(words) - gram_length + 1):
n_gram = string.join(words[i:i+gram_length], "_")
if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
do_negation = True
elif n_gram in PUNCTUATION:
do_negation = False
if do_negation:
n_gram = "NOT_" + n_gram

# LIBSVM won't use strings as keys, so hash to convert to a number.
index = hash(n_gram)
if not (use_pos_tags and use_adj_only and (tagged[i][1] != "JJ")):
#if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
if (not use_presence) and bag_of_words.has_key(index):
bag_of_words[index] += 1
count += 1
else:
bag_of_words[index] = 1
count += 1

# Add it to the reference bag
if ref_bag != None:
if ref_bag.has_key(index):
ref_bag[index] += 1
else:
ref_bag[index] = 1

# TODO do this correctly

#if normalize_bags:
# for k in bag_of_words.keys():
# bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
return bag_of_words
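
A minimal usage sketch for the new module (not part of the commit; assumes Python 2 with NLTK and its punkt tokenizer data installed — the hashed feature indices vary by platform):

import BagOfWords

corpus_counts = {}                        # shared reference bag across documents
bag = BagOfWords.make("This movie was not good .", ref_bag=corpus_counts)

# Every token after "not", up to the next punctuation mark, is rewritten as
# NOT_<token> before hashing, so the keys are hash("This"), hash("movie"),
# hash("was"), hash("NOT_not"), hash("NOT_good") and hash(".").
print bag[hash("NOT_good")]               # -> 1
print len(corpus_counts)                  # -> 6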
222 changes: 109 additions & 113 deletions review_svm.py
@@ -2,158 +2,161 @@ import os
import random
import string
import time
import sys

import nltk
import svmutil
#import liblinearutil

import BagOfWords

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).

POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
# TODO make this a parameter
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

NORMAL_LENGTH = 1000

# TODO Make these command-line parameters.
# These are now command line parameters! See below...
USE_PRESENCE = False # If true, use presence rather than frequency.
USE_POS_TAGS = True
USE_POS_TAGS = False
USE_ADJ_ONLY = False
GRAM_LENGTH = 2 # Unigrams, bigrams, ...
USE_NEGATION = True
GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
EPSILON = .1 # determines how long the algorithm runs (default is 0.001)
KERNEL_TYPE = 0 # 0: linear, 2: radial basis
NORMALIZE_BAGS = False
USE_LIBLINEAR = False

MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON = .001 # determines how long the algorithm runs (default is 0.001)

KERNEL_TYPE = 0 # 0: linear, 2: radial basis (just use linear)
NORMALIZE_BAGS = True
USE_LIBLINEAR = False # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look
CACHE_SIZE = 512
MIN_OCCURRENCES = 10 # To be included, the word must show up this many times across all documents

def file_to_text(filename):
f = open(filename)
lines = f.readlines()
f.close()
text = string.join(lines, " ")
return text

def generate_filenames(folder_name):
filenames = []
for (folder, x, folder_filenames) in os.walk(folder_name):
for filename in folder_filenames:
if filename.endswith(".txt"):
filenames.append(os.path.join(folder, filename))
return filenames

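# Deal the filenames into num_partitions folds round-robin, so fold sizes differ by at most one.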
def partition_filenames(filenames, num_partitions):
partitions = [[] for i in range(num_partitions)]
for i in range(len(filenames)):
partitions[i % num_partitions].append(filenames[i])
return partitions

def make_bag(text, ref_bag):
bag_of_words = {}
do_negation = False

words = nltk.word_tokenize(text)
if USE_POS_TAGS:# and GRAM_LENGTH==1:
t5 = time.time()
tagged = nltk.pos_tag(words)
tagged = [string.join(t, "_") for t in tagged]
words = tagged
t6 = time.time()
print "Tag time (%d words): %f" % (len(words), (t6-t5))
count = 0
for i in range(len(words) - GRAM_LENGTH + 1):
n_gram = string.join(words[i:i+GRAM_LENGTH], "_")
if (GRAM_LENGTH == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
do_negation = True
elif n_gram in PUNCTUATION:
do_negation = False

if do_negation:
n_gram = "NOT_" + n_gram

# LIBSVM won't use strings as keys, so hash to convert to a number.
index = hash(n_gram)
if not (USE_POS_TAGS and USE_ADJ_ONLY and (tagged[i][1] != "JJ")):
#if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
if (not USE_PRESENCE) and bag_of_words.has_key(index):
bag_of_words[index] += 1
count += 1
print n_gram, "=>", bag_of_words[index]
else:
bag_of_words[index] = 1
count += 1
print n_gram, "=>", bag_of_words[index]

# Add it to the reference bag
if ref_bag.has_key(index):
ref_bag[index] += 1
else:
ref_bag[index] = 1
# Normalize the bag of words. For whatever reason it didn't work very well with small decimals...
if NORMALIZE_BAGS:
for k in bag_of_words.keys():
bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
return bag_of_words
# Set parameters from command-line arguments.
i = 0
try:
args = sys.argv[1:]
while i < len(args):
if args[i] == "--gram-length":
GRAM_LENGTH = int(args[i+1])
i += 2
elif args[i] == "--num-folds":
NUM_FOLDS = int(args[i+1])
i += 2
elif args[i] == "--presence":
USE_PRESENCE = True
i += 1
elif args[i] == "--frequency":
USE_PRESENCE = False
i += 1
elif args[i] == "--use-pos-tags":
USE_POS_TAGS = True
i += 1
elif args[i] == "--use-adj-only":
USE_ADJ_ONLY = True
i += 1
elif args[i] == "--use-negation":
USE_NEGATION = True
i += 1
elif args[i] == "--no-negation":
USE_NEGATION = False
i += 1
elif args[i] == "--threshold":
MIN_OCCURRENCES = int(args[i+1])
i += 2
elif args[i] == "--epsilon":
EPSILON = float(args[i+1])
i += 2
elif args[i] == "--help":
print "Usage:"
print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)"
print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
exit()
else:
print "Error: Invalid argument", args[i]
i += 1
except Exception:
print "Invalid arguments"

t0 = time.time()

pos_filenames = []
neg_filenames = []
next_word_index = 0

for (folder, x, filenames) in os.walk(POS_FOLDER):
for filename in filenames:
if filename.endswith(".txt"):
pos_filenames.append(os.path.join(folder, filename))

for (folder, x, filenames) in os.walk(NEG_FOLDER):
for filename in filenames:
if filename.endswith(".txt"):
neg_filenames.append(os.path.join(folder, filename))

# TEST
pos_filenames = generate_filenames(POS_FOLDER)
neg_filenames = generate_filenames(NEG_FOLDER)

# TEST - to test on a subset of reviews (since some operations [i.e. tagging] are slow)
#pos_filenames = random.sample(pos_filenames, 20)
#neg_filenames = random.sample(neg_filenames, 20)

# Partition reviews into folds.
pos_folds = [[] for i in range(NUM_FOLDS)]
neg_folds = [[] for i in range(NUM_FOLDS)]

for i in range(len(pos_filenames)):
pos_folds[i % NUM_FOLDS].append(pos_filenames[i])

for i in range(len(neg_filenames)):
neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
pos_folds = partition_filenames(pos_filenames, NUM_FOLDS)
neg_folds = partition_filenames(neg_filenames, NUM_FOLDS)

# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF)
word_table = {}
total_word_counts = {}

# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
for filename in pos_folds[i]:
t3 = time.time()
pos_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
t4 = time.time()
print "Bag time:", (t4-t3)
pos_fold_bags[i].append(BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))

for filename in neg_folds[i]:
t3 = time.time()
neg_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
t4 = time.time()
print "Bag time:", (t4-t3)
neg_fold_bags[i].append(
BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))


# Remove words with less than the minimum occurrences threshold.
for k in word_table.keys():
if word_table[k] < MIN_OCCURRENCES:
for bag in (neg_fold_bags + pos_fold_bags):
if bag.has_key(k):
bag.pop(k)

#word_table = make_bag(all_text, use_presence=False)
for k in word_table.keys():
if word_table[k] < MIN_OCCURRENCES:
word_table.pop(k)
num_unique_words = len(word_table.keys())
print "# unique words:", num_unique_words
for k in total_word_counts.keys():
if total_word_counts[k] < MIN_OCCURRENCES:
for fold in (neg_fold_bags + pos_fold_bags):
for bag in fold:
if bag.has_key(k):
bag.pop(k)
total_word_counts.pop(k)

#num_unique_words = len(total_word_counts.keys())
#print "# unique words:", num_unique_words

t1 = time.time()
print "Constructed bags, time:", (t1-t0)
@@ -178,21 +181,13 @@ for i in range(NUM_FOLDS):
train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
train_bags = pos_train_bags + neg_train_bags

# TODO: Investigate LIBSVM training parameters.
# TODO: Why does LIBLINEAR break my computer?
if USE_LIBLINEAR:
pass#m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
else:
m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))
m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))

test_bags = pos_fold_bags[i] + neg_fold_bags[i]
test_filenames = pos_test_filenames + neg_test_filenames
test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

if USE_LIBLINEAR:
pass#(predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
else:
(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)

avg_acc += acc[0]

@@ -208,5 +203,6 @@
t2 = time.time()
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
print "Total time:", (t2-t1)
print "Classification time:", (t2-t1)
print "Total time:", (t2-t0)
