From 0f1aeaee6457aff207d1c266de1ae63b1f97186c Mon Sep 17 00:00:00 2001
From: Jack
Date: Tue, 1 Mar 2016 13:57:22 -0500
Subject: [PATCH] Added cross-validation to SVM

---
 review_svm.py | 118 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 76 insertions(+), 42 deletions(-)

diff --git a/review_svm.py b/review_svm.py
index b9cf07b..5ffaa5b 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -6,38 +6,52 @@
 # Program to classify the movie review dataset using a support vector machine
 # (via LIBSVM), following Pang and Lee (2002).
 
-NEG_FOLDER = "txt_sentoken\\neg"
-POS_FOLDER = "txt_sentoken\\pos"
-NUM_TRAIN = 300
-USE_PRESENCE = True # If true, use presence rather than frequency.
-GRAM_LENGTH = 2
+NEG_FOLDER = "review_polarity\\txt_sentoken\\neg"
+POS_FOLDER = "review_polarity\\txt_sentoken\\pos"
 
-# "Adapting a technique of Das and Chen (2001)..."
+# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
+# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
 NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
-END_SENTENCE_PUNCTUATION = [".", "!", "?"]
+PUNCTUATION = [".", "!", "?", ",", ";"]
+
+# TODO Make these command-line parameters.
+USE_PRESENCE = False # If true, use presence rather than frequency.
+GRAM_LENGTH = 1 # Unigrams, bigrams, ...
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
+EPSILON = 0.01 # LIBSVM termination tolerance (-e); smaller values train longer.
 
 def make_bag(filename):
     f = open(filename)
     lines = f.readlines()
+    f.close()
+    text = " ".join(lines)
     bag_of_words = {}
-    for line in lines:
-        do_negation = False
-        words = line.split(" ")
-        for word in words:
-            if word in NEGATION_WORDS:
+    do_negation = False
+
+    words = text.split(" ")
+    count = 0
+    for i in range(len(words) - GRAM_LENGTH + 1):
+        n_gram = "_".join(words[i:i+GRAM_LENGTH])
+        if GRAM_LENGTH == 1: # Pang and Lee didn't do negation tagging for bigrams.
+            if n_gram in NEGATION_WORDS:
                 do_negation = True
-            if word in END_SENTENCE_PUNCTUATION:
+            if n_gram in PUNCTUATION:
                 do_negation = False
             elif do_negation:
-                word = "NOT_" + word
-            index = hash(word)
-
-            if (not USE_PRESENCE) and bag_of_words.has_key(index):
-                bag_of_words[index] += 1
-            else:
-                bag_of_words[index] = 1
-    f.close()
+                n_gram = "NOT_" + n_gram
+
+        index = hash(n_gram) & 0x7FFFFFFF # hash() can be negative; LIBSVM needs positive feature indices.
+        if (not USE_PRESENCE) and index in bag_of_words:
+            bag_of_words[index] += 1
+            count += 1
+        else:
+            bag_of_words[index] = 1
+            count += 1
+    # Normalize the bag of words.
+    #for k in bag_of_words.keys():
+    #    bag_of_words[k] = float(bag_of_words[k])/count
+
     return bag_of_words
 
 neg_filenames = []
@@ -57,37 +71,57 @@ def make_bag(filename):
 random.shuffle(neg_filenames)
 random.shuffle(pos_filenames)
 
-neg_train_filenames = neg_filenames[:NUM_TRAIN]
-neg_test_filenames = neg_filenames[NUM_TRAIN:]
-pos_train_filenames = pos_filenames[:NUM_TRAIN]
-pos_test_filenames = pos_filenames[NUM_TRAIN:]
+neg_folds = [[] for i in range(NUM_FOLDS)]
+pos_folds = [[] for i in range(NUM_FOLDS)]
+for i in range(len(neg_filenames)):
+    neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
+for i in range(len(pos_filenames)):
+    pos_folds[i % NUM_FOLDS].append(pos_filenames[i])
+
 
 #TRAIN
-neg_train_bags = []
-pos_train_bags = []
+#neg_fold_bags = [[] for i in range(NUM_FOLDS)]
+#pos_fold_bags = [[] for i in range(NUM_FOLDS)]
+
+for i in range(NUM_FOLDS):
+
+    neg_train_bags = []
+    pos_train_bags = []
 
-for filename in neg_train_filenames:
-    neg_train_bags.append(make_bag(filename))
+    neg_train_filenames = []
+    pos_train_filenames = []
+    neg_test_filenames = neg_folds[i]
+    pos_test_filenames = pos_folds[i]
+    for j in range(NUM_FOLDS):
+        if j != i:
+            neg_train_filenames += neg_folds[j]
+            pos_train_filenames += pos_folds[j]
 
-for filename in pos_train_filenames:
-    pos_train_bags.append(make_bag(filename))
+    for filename in neg_train_filenames:
+        neg_train_bags.append(make_bag(filename))
 
-train_labels = [-1] * len(neg_train_bags) + [1] * len(pos_train_bags)
-train_bags = neg_train_bags + pos_train_bags
+    for filename in pos_train_filenames:
+        pos_train_bags.append(make_bag(filename))
 
-# TODO: Investigate LIBSVM training parameters.
-m = svm_train(train_labels, train_bags, "-t 0")
+    train_labels = [-1] * len(neg_train_bags) + [1] * len(pos_train_bags)
+    train_bags = neg_train_bags + pos_train_bags
 
-# TEST
+    # TODO: Investigate LIBSVM training parameters.
+    m = svm_train(train_labels, train_bags, "-t 0 -e %f" % EPSILON)
 
-test_bags = []
-test_filenames = neg_test_filenames + pos_test_filenames
+    # TEST
+    test_bags = []
+    test_filenames = neg_test_filenames + pos_test_filenames
 
-for filename in test_filenames:
-    test_bags.append(make_bag(filename))
+    for filename in test_filenames:
+        test_bags.append(make_bag(filename))
 
-test_labels = [-1] * len(neg_test_filenames) + [1] * len(pos_test_filenames)
-svm_predict(test_labels, test_bags, m)
+    test_labels = [-1] * len(neg_test_filenames) + [1] * len(pos_test_filenames)
+    (predicted_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m)
+    #indices = random.sample(range(len(test_filenames)))
+    #for i in indices:
+    #    filename = test_filenames[i]
+    #    if filename in neg_
 #sorted(bag_of_words.items(), key=lambda (k,v): -v)[:100]
\ No newline at end of file
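
The negation handling in make_bag implements the technique quoted at the top
of the file. A minimal standalone sketch of that step, separated from the
n-gram and hashing logic (illustrative only; tag_negation is a hypothetical
helper, not part of the patch):

    NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
    PUNCTUATION = [".", "!", "?", ",", ";"]

    def tag_negation(words):
        # Prefix NOT_ to every word between a negation word and the
        # first punctuation mark that follows it.
        tagged = []
        do_negation = False
        for word in words:
            if word in PUNCTUATION:
                do_negation = False
                tagged.append(word)
            elif do_negation:
                tagged.append("NOT_" + word)
            else:
                tagged.append(word)
                if word in NEGATION_WORDS:
                    do_negation = True
        return tagged

    # tag_negation(["i", "didn't", "like", "this", "movie", "."])
    # -> ['i', "didn't", 'NOT_like', 'NOT_this', 'NOT_movie', '.']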
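
The cross-validation itself deals the shuffled filenames round-robin into
NUM_FOLDS buckets; each iteration then holds out bucket i for testing and
trains on the rest. The same logic in isolation (hypothetical helper names,
matching the round-robin split the patch uses):

    def make_folds(filenames, num_folds):
        # Deal items round-robin into num_folds buckets.
        folds = [[] for _ in range(num_folds)]
        for i, name in enumerate(filenames):
            folds[i % num_folds].append(name)
        return folds

    def fold_split(folds, i):
        # Bucket i is the test set; the other buckets form the training set.
        test = folds[i]
        train = []
        for j in range(len(folds)):
            if j != i:
                train += folds[j]
        return train, test

    # make_folds(["a", "b", "c", "d", "e", "f"], 3)
    # -> [["a", "d"], ["b", "e"], ["c", "f"]]
    # fold_split([["a", "d"], ["b", "e"], ["c", "f"]], 0)
    # -> (["b", "e", "c", "f"], ["a", "d"])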
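
On the LIBSVM side, "-t 0" selects a linear kernel and "-e" sets the
termination tolerance (a float), which is why make_bag returns a sparse
dict mapping feature index to count. A toy call showing the shapes the
bindings expect, assuming the standard libsvm Python interface (svmutil)
and made-up labels and features:

    from svmutil import svm_train, svm_predict

    train_labels = [-1, -1, 1, 1]
    train_bags = [{1: 2, 2: 1}, {1: 1, 3: 1}, {4: 2}, {4: 1, 5: 1}]

    m = svm_train(train_labels, train_bags, "-t 0 -e 0.01")

    # svm_predict returns (predicted labels, (accuracy, MSE, SCC), values).
    p_labels, p_acc, p_vals = svm_predict([1, -1], [{4: 2}, {1: 1}], m)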