From cf6576fb12fc9bfc6224d1b76af1c1c9c6e0ff2f Mon Sep 17 00:00:00 2001 From: Jack Date: Sat, 16 Apr 2016 10:40:46 -0400 Subject: [PATCH] matplotlib; eval tweaks; comparison --- GlossLexicon.py | 10 +- LexiconEval.py | 80 ++++++--- graph.py | 43 +++++ review_svm.py | 432 +++++++++++++++++++++++++----------------------- 4 files changed, 333 insertions(+), 232 deletions(-) create mode 100644 graph.py diff --git a/GlossLexicon.py b/GlossLexicon.py index 9d0f7d7..351ae1e 100644 --- a/GlossLexicon.py +++ b/GlossLexicon.py @@ -11,13 +11,11 @@ import nltk from nltk.corpus import wordnet as wn import BagOfWords -import MPQALexicon -import AniaLexicon -EXPAND_ITERATIONS = 2 +EXPAND_ITERATIONS = 3 CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy -REMOVE_STOPWORDS = False -USE_STEMMING = False +REMOVE_STOPWORDS = True +USE_STEMMING = True # sync this up with eval! USE_EXAMPLES = True USE_EQUAL_TRAINING = True @@ -90,7 +88,7 @@ def expand_sets(positive, negative, neutral): return (newPositive, newNegative, newNeutral) def do_stem(text): - global stemmer + stemmer = nltk.stem.porter.PorterStemmer() return [stemmer.stem(word) for word in text] def create(test_words, test_labels): diff --git a/LexiconEval.py b/LexiconEval.py index 52e0db8..afff8c2 100644 --- a/LexiconEval.py +++ b/LexiconEval.py @@ -7,11 +7,14 @@ from nltk.corpus import movie_reviews import MPQALexicon import AniaLexicon import GlossLexicon +import XMLParser -USE_STEMMING = False +USE_STEMMING = True # sync this up with lexicon! USE_PARSING = True -LEX_ALG = "gloss" -LEX_SOURCE = "mpqa" +LEX_ALG = "gloss" # "gloss", "conjunction", "none" +LEX_SOURCE = "mpqa" # "mpqa", "ania" +CORPUS = "movies" # "amazon", "movies" +NEG_MOD = 1.5 # Taboada suggested 1.5. # new and improved finite state machine # kinda-sorta based on Taboada 2011. @@ -36,13 +39,9 @@ def calculate_score(text, lexicon): num_neg = 0 num_halfneg = 0 for word in text: - if lexicon.has_key(word): - word_score = lexicon[word] - # EXPERIMENTAL - if word_score < 0: word_score *= 1.5 if state == 0: if lexicon.has_key(word): - score += word_score + score += lexicon[word] num_single += 1 elif word in negators: state = 1 @@ -50,7 +49,7 @@ def calculate_score(text, lexicon): state = 2 elif state == 1: if lexicon.has_key(word): - score += -1 * word_score + score += -1 * lexicon[word] num_neg += 1 state = 0 elif word in intensifiers: @@ -59,7 +58,7 @@ def calculate_score(text, lexicon): state = 0 elif state == 2: if lexicon.has_key(word): - score += 2 * word_score + score += 2 * lexicon[word] num_double += 1 state = 0 else: @@ -68,7 +67,7 @@ def calculate_score(text, lexicon): pass #TODO elif state == 4: if lexicon.has_key(word): - score += -0.5 * word_score + score += -0.5 * lexicon[word] num_halfneg += 1 state = 0 else: @@ -79,9 +78,6 @@ def calculate_score(text, lexicon): def do_stem(text): global stemmer return [stemmer.stem(word) for word in text] - -def get_label(id): - return movie_reviews.categories(fileids=[id])[0] # Used to create a lexicon instance from the words + labels directly (i.e. 
without using an algorithm) def create_lexicon(words, labels): @@ -114,14 +110,24 @@ try: else: print "Invalid lexicon" i += 2 + elif args[i] == "--corpus": + if args[i+1] == "movies": + CORPUS = "movies" + elif args[i+1] == "amazon": + CORPUS = "amazon" + i += 2 elif args[i] == "--help": print "Usage:" - print "--alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)" + print "--algorithm|alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)" print " - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)" print " - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)" - print "--lexicon X: Choose the lexicon to use ('mpqa', 'ania' or 'none')" + print " - none: Use the input lexicon as is" + print "--lexicon|lex X: Choose the lexicon to use ('mpqa', 'ania' or 'none')" print " - mpqa: Use the MPQA lexicon" print " - ania: Use the hand-labeled lexicon from the Brown corpus" + print "--corpus X: Choose the data set to test on" + print " - amazon: Use the Amazon data set" + print " - movies: Use the Pang&Lee movie data set (default)" exit() else: print "Error: Invalid argument", args[i] @@ -132,6 +138,7 @@ except Exception: print "Lexicon =", LEX_SOURCE print "Algorithm =", LEX_ALG +print "Corpus =", CORPUS # Load the test set. A few options here. if LEX_SOURCE == "mpqa": @@ -158,17 +165,45 @@ if LEX_ALG != "none": lex_acc = correct/len(lexicon.items()) print "Lexicon accuracy:", lex_acc +for key in lexicon.keys(): + if lexicon[key] < 0: lexicon[key] *= NEG_MOD + +if CORPUS == "movies": + ids = movie_reviews.fileids() + reviews = [list(movie_reviews.words(fileids=[id])) for id in ids] + labels = [] + for id in ids: + label = movie_reviews.categories(id)[0] + if label == 'pos': + labels.append(1) + elif label == 'neg': + labels.append(-1) +elif CORPUS == "amazon": + (ids, reviews, labels) = XMLParser.get_all_reviews() +else: + print "Invalid corpus!" + exit() + +""" +# It feels like there should be a more efficient way do to this. +shuffled = zip(ids,reviews,labels) +shuffled = shuffled[:20] +ids = [x[0] for x in shuffled] +reviews = [x[1] for x in shuffled] +labels = [x[2] for x in shuffled] +""" + # Iterate through all of the reviews and compute scores by taking the sum of their # component lexicon words. Includes rudimentary negation testing. 
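The net effect of the LexiconEval hunks above: the experimental in-loop boost for negative words is dropped from calculate_score() and replaced by a single pass that multiplies every negative lexicon entry by NEG_MOD (1.5, following Taboada), and the review loop below now works from generic (ids, reviews, labels) triples so either the Pang & Lee movie corpus or the Amazon set can be plugged in. A minimal standalone sketch of that scoring path (illustrative names, not the module's API; negation and intensifier handling omitted):

# Sketch only, not part of the patch: pre-scale negative lexicon entries by
# NEG_MOD once, then score a review as the signed sum of its lexicon words
# and threshold at zero.
NEG_MOD = 1.5

def scale_negatives(lexicon, neg_mod=NEG_MOD):
    for word, score in lexicon.items():
        if score < 0:
            lexicon[word] = score * neg_mod
    return lexicon

def simple_score(words, lexicon):
    return sum(lexicon.get(word, 0.0) for word in words)

lexicon = scale_negatives({"good": 1.0, "bad": -1.0, "dull": -1.0})
print simple_score(["a", "good", "but", "dull", "film"], lexicon)  # 1.0 - 1.5 = -0.5, i.e. negative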
correct = 0 positive = 0 -ids = sorted(movie_reviews.fileids()) scores = [] -for id in ids: - words = list(movie_reviews.words(fileids=[id])) +for i in range(len(reviews)): + words = reviews[i] if USE_STEMMING: words = do_stem(words) + if USE_PARSING: score = calculate_score(words, lexicon) else: @@ -182,14 +217,15 @@ for id in ids: for i in range(len(ids)): id = ids[i] score = scores[i] + label = labels[i] if score >= 0: - sent_value = "pos" + sent_value = 1 positive += 1 #print id, sent_value elif score < 0: - sent_value = "neg" + sent_value = -1 #print id, sent_value - label = get_label(id) + if sent_value == label: correct += 1 diff --git a/graph.py b/graph.py new file mode 100644 index 0000000..04102ef --- /dev/null +++ b/graph.py @@ -0,0 +1,43 @@ +import numpy +from matplotlib import pyplot + +labels = [ + "unigrams, frequency", + "unigrams, frequency, +Position", + "unigrams, presence", + "unigrams, presence, +Position", + "bigrams, frequency", + "bigrams, frequency, +Position", + "bigrams, presence", + "bigrams, presence, +Position", + "delta_tfidf" +] +labels2 = [ + "unigrams, frequency", + "unigrams, frequency, +Position", + "unigrams, presence", + "unigrams, presence, +Position", + "bigrams, frequency", + "bigrams, frequency, +Position", + "bigrams, presence", + "bigrams, presence, +Position", + "delta_tfidf" +] +tops = numpy.arange(len(labels)) +widths = [0.826002649356, 0.784479089868, 0.842490694287, 0.821997146847, 0.807497617378, 0.777000053946, 0.820491149832, 0.795509581438, 0.981992471513] +widths2 = [0.824624634419, 0.808376475678, 0.832750728912, 0.815374570779, 0.797876474366, 0.771876439875, 0.799001849413, 0.768376127015, 0.929999178955] +height = 0.3 +pyplot.barh(tops, widths, height, color="#FF0000") +pyplot.barh(tops+height, widths2, height, color="#00FF00") +pyplot.legend(["Movies", "Amazon"], loc=4) # bottom right +pyplot.yticks(tops+height, labels) +pyplot.xlim(0.5, 1.0) +pyplot.ylim(tops[0]-2*height, tops[-1]+3*height) +pyplot.show() + +""" +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513 +""" \ No newline at end of file diff --git a/review_svm.py b/review_svm.py index 649f4bc..ceae890 100644 --- a/review_svm.py +++ b/review_svm.py @@ -2,13 +2,11 @@ from __future__ import division import os import random import string -import time import sys import nltk from nltk.corpus import movie_reviews import numpy -#import svmutil from sklearn.svm import SVC from sklearn.svm import LinearSVC from TFIDF import delta_tfidf, compute_idfs @@ -43,6 +41,7 @@ NORMALIZE_BAGS = True USE_LIBLINEAR = True # This is supposedly faster for large instances USE_AMAZON = False # Use the Amazon review set, not Pang and Lee. 
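The new graph.py above only displays the Movies-vs-Amazon comparison chart interactively via pyplot.show(); nothing is written to disk. If a saved image is wanted, a savefig call can be added before show(). A sketch under the assumption of an arbitrary output filename (svm_comparison.png is not referenced anywhere in the patch):

# Sketch only, not part of the patch: render graph.py's figure headlessly and save it.
import matplotlib
matplotlib.use("Agg")          # select a non-interactive backend before importing pyplot
from matplotlib import pyplot

# ... build the grouped barh chart exactly as graph.py does ...
pyplot.savefig("svm_comparison.png", dpi=150, bbox_inches="tight")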
+USE_DELTA_TFIDF = False def make_folds(documents, ids, num_partitions): folds = [[] for i in range(num_partitions)] @@ -52,220 +51,245 @@ def make_folds(documents, ids, num_partitions): fold_ids[i % num_partitions].append(ids[i]) return (folds, fold_ids) -def make_bag(text, total_word_counts): - return BagOfWords.make(text, ref_bag=total_word_counts, - gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE, - use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, - normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION, - use_position=USE_POSITION) - -# Set parameters from command-line arguments. -i = 0 -try: - args = sys.argv[1:] - while i < len(args): - if args[i] == "--gram-length": - GRAM_LENGTH = int(args[i+1]) - i += 2 - elif args[i] == "--num-folds": - NUM_FOLDS = int(args[i+1]) - i += 2 - elif args[i] == "--presence": - USE_PRESENCE = True - i += 1 - elif args[i] == "--frequency": - USE_PRESENCE = False - i += 1 - elif args[i] == "--use-pos-tags": - USE_POS_TAGS = True - i += 1 - elif args[i] == "--use-adj-only": - USE_ADJ_ONLY = True - i += 1 - elif args[i] == "--use-negation": - USE_NEGATION = True - i += 1 - elif args[i] == "--no-negation": - USE_NEGATION = False - i += 1 - elif args[i] == "--use-position": - USE_POSITION = True - i += 1 - elif args[i] == "--threshold": - MIN_OCCURRENCES = int(args[i+1]) - i += 2 - elif args[i] == "--epsilon": - EPSILON = float(args[i+1]) - i += 2 - elif args[i] == "--use-amazon": - USE_AMAZON = True - i += 1 - elif args[i] == "--use-delta": - USE_DELTATFIDF = True - i += 1 - elif args[i] == "--help": - print "Usage:" - print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)" - print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)" - print "--presence\t\tUse word presence rather than word frequency (Default: Off)" - print "--frequency\t\tUse word frequency rather than word presence (Default: On)" - print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)" - print "--use-negation\t\tTag words appearing after a negation word (Default: Off)" - print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)" - print "--use-position\t\tTag words according to their position in the text (Default: Off)" - print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)" - print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)" - print "\t\t\t(0 < epsilon < 1; lower = more iterations)" - print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)" - print "--use-delta\t\tUse Delta TFIDF. (Default: Off)" - exit() - else: - print "Error: Invalid argument", args[i] - i += 1 -except Exception: - print "Invalid arguments" - -t0 = time.time() - -positive_ids = [] -negative_ids = [] - -if USE_AMAZON: - # Load the mixed Amazon review dataset. 
- (ids, reviews, labels) = XMLParser.get_all_reviews() - for i in range(len(ids)): +def make_bag(text, total_word_counts, **bag_params): + return BagOfWords.make(text, ref_bag=total_word_counts, **bag_params) + +def from_command_line(): + i = 0 + # Set parameters to default values + gram_length = GRAM_LENGTH + num_folds = NUM_FOLDS + use_presence = USE_PRESENCE + use_pos_tags = USE_POS_TAGS + use_negation = USE_NEGATION + use_position = USE_POSITION + min_occurrences = MIN_OCCURRENCES + use_amazon = USE_AMAZON + try: + args = sys.argv[1:] + while i < len(args): + if args[i] == "--gram-length": + gram_length = int(args[i+1]) + i += 2 + elif args[i] == "--num-folds": + num_folds = int(args[i+1]) + i += 2 + elif args[i] == "--presence": + use_presence = True + i += 1 + elif args[i] == "--frequency": + use_presence = False + i += 1 + elif args[i] == "--use-pos-tags": + use_pos_tags = True + i += 1 + elif args[i] == "--use-adj-only": + use_adj_only = True + i += 1 + elif args[i] == "--use-negation": + use_negation = True + i += 1 + elif args[i] == "--no-negation": + use_negation = False + i += 1 + elif args[i] == "--use-position": + use_position = True + i += 1 + elif args[i] == "--threshold": + min_occurrences = int(args[i+1]) + i += 2 + elif args[i] == "--use-amazon": + use_amazon = True + i += 1 + elif args[i] == "--use-delta": + use_delta = True + i += 1 + elif args[i] == "--help": + print "Usage:" + print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)" + print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)" + print "--presence\t\tUse word presence rather than word frequency (Default: Off)" + print "--frequency\t\tUse word frequency rather than word presence (Default: On)" + print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)" + print "--use-negation\t\tTag words appearing after a negation word (Default: Off)" + print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)" + print "--use-position\t\tTag words according to their position in the text (Default: Off)" + print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)" + print "\t\t\t(0 < epsilon < 1; lower = more iterations)" + print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)" + print "--use-delta\t\tUse Delta TFIDF. (Default: Off)" + exit() + else: + print "Error: Invalid argument", args[i] + i += 1 + classify_reviews(gram_length, num_folds, use_presence, use_negation, use_pos_tags, use_adj_only, min_occurrences, use_amazon, use_delta) + except Exception: + print "Invalid arguments" + +def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=USE_PRESENCE, use_negation=USE_NEGATION, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, + use_position = USE_POSITION, min_occurrences=MIN_OCCURRENCES, use_amazon=USE_AMAZON, use_delta=USE_DELTA_TFIDF): + positive_ids = [] + negative_ids = [] + + if use_amazon: + # Load the mixed Amazon review dataset. + (ids, reviews, labels) = XMLParser.get_all_reviews() + for i in range(len(ids)): + if labels[i] == 1: + positive_ids.append(ids[i]) + elif labels[i] == -1: + negative_ids.append(ids[i]) + else: + # Load the Pang and Lee sentiment dataset. 
+ ids = movie_reviews.fileids() + reviews = [list(movie_reviews.words(fileids = [id])) for id in ids] + labels = [] + for id in ids: + label = movie_reviews.categories(id)[0] + if label == 'pos': + labels.append(1) + positive_ids.append(id) + elif label == 'neg': + labels.append(-1) + negative_ids.append(id) + + positive_reviews = [] + negative_reviews = [] + + for i in range(len(reviews)): if labels[i] == 1: - positive_ids.append(ids[i]) + positive_reviews.append(reviews[i]) elif labels[i] == -1: - negative_ids.append(ids[i]) -else: - # Load the Pang and Lee sentiment dataset. - ids = movie_reviews.fileids() - reviews = [list(movie_reviews.words(fileids = [id])) for id in ids] - labels = [] - for id in ids: - label = movie_reviews.categories(id)[0] - if label == 'pos': - labels.append(1) - positive_ids.append(id) - elif label == 'neg': - labels.append(-1) - negative_ids.append(id) - -positive_reviews = [] -negative_reviews = [] - -for i in range(len(reviews)): - if labels[i] == 1: - positive_reviews.append(reviews[i]) - elif labels[i] == -1: - negative_reviews.append(reviews[i]) + negative_reviews.append(reviews[i]) -#TEST -#positive_reviews = positive_reviews[:200] -#negative_reviews = negative_reviews[:600] -#positive_reviews = random.sample(positive_reviews, 1000) -#negative_reviews = random.sample(negative_reviews, 1000) + #TEST + #positive_reviews = positive_reviews[:200] + #negative_reviews = negative_reviews[:600] + #positive_reviews = random.sample(positive_reviews, 1000) + #negative_reviews = random.sample(negative_reviews, 1000) -# Partition reviews into folds. -(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS) -(neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, NUM_FOLDS) + # Partition reviews into folds. + (pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, num_folds) + (neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, num_folds) -# Count occurrences of every word across all documents -# (this is important for e.g. Delta TFIDF) -total_word_counts = {} + # Count occurrences of every word across all documents + # (this is important for e.g. Delta TFIDF) + total_word_counts = {} -# Construct a bag of words (or n-grams) from each file. -pos_fold_bags = [[] for i in range(NUM_FOLDS)] -neg_fold_bags = [[] for i in range(NUM_FOLDS)] + # Construct a bag of words (or n-grams) from each file. 
+ pos_fold_bags = [[] for i in range(num_folds)] + neg_fold_bags = [[] for i in range(num_folds)] -pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(NUM_FOLDS)] -neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(NUM_FOLDS)] + pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(num_folds)] + neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(num_folds)] -for i in range(NUM_FOLDS): - for review in pos_folds[i]: - if USE_DELTATFIDF: - pos_idfs = pos_fold_idfs[i] - neg_idfs = neg_fold_idfs[i] - pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts)) - else: - pos_fold_bags[i].append(make_bag(review, total_word_counts)) + bag_params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_negation':use_negation, 'use_pos_tags':use_pos_tags, + 'use_adj_only':use_adj_only, 'use_position':use_position} + + for i in range(num_folds): + for review in pos_folds[i]: + if use_delta: + pos_idfs = pos_fold_idfs[i] + neg_idfs = neg_fold_idfs[i] + pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts)) + else: + pos_fold_bags[i].append(make_bag(review, total_word_counts, **bag_params)) + + for review in neg_folds[i]: + if use_delta: + pos_idfs = pos_fold_idfs[i] + neg_idfs = neg_fold_idfs[i] + neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts)) + else: + neg_fold_bags[i].append(make_bag(review, total_word_counts, **bag_params)) + + # Remove words with less than the minimum occurrences threshold. + if min_occurrences > 0: + for k in total_word_counts.keys(): + if total_word_counts[k] < min_occurrences: + for fold in (neg_fold_bags + pos_fold_bags): + for bag in fold: + if bag.has_key(k): + bag.pop(k) + total_word_counts.pop(k) + + avg_acc = 0 + + wordlist = total_word_counts.keys() + + for i in range(num_folds): + pos_train_reviews = [] + neg_train_reviews = [] + pos_train_bags = [] + neg_train_bags = [] - for review in neg_folds[i]: - if USE_DELTATFIDF: - pos_idfs = pos_fold_idfs[i] - neg_idfs = neg_fold_idfs[i] - neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts)) + pos_test_reviews = pos_folds[i] + neg_test_reviews = neg_folds[i] + pos_test_ids = pos_fold_ids[i] + neg_test_ids = neg_fold_ids[i] + for j in range(num_folds): + if j != i: + pos_train_reviews += pos_folds[j] + neg_train_reviews += neg_folds[j] + pos_train_bags += pos_fold_bags[j] + neg_train_bags += neg_fold_bags[j] + + train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags) + train_bags = pos_train_bags + neg_train_bags + + if USE_LIBLINEAR: + classifier = LinearSVC() else: - neg_fold_bags[i].append(make_bag(review, total_word_counts)) - -# Remove words with less than the minimum occurrences threshold. 
-if MIN_OCCURRENCES > 0: - for k in total_word_counts.keys(): - if total_word_counts[k] < MIN_OCCURRENCES: - for fold in (neg_fold_bags + pos_fold_bags): - for bag in fold: - if bag.has_key(k): - bag.pop(k) - total_word_counts.pop(k) + classifier = SVC(kernel="linear",tol=EPSILON) -#num_unique_words = len(total_word_counts.keys()) -#print "# unique words:", num_unique_words + train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags] + classifier.fit(train_vecs, train_labels) -t1 = time.time() -print "Constructed bags, time:", (t1-t0) -avg_acc = 0 - -wordlist = total_word_counts.keys() - -#f = open("results.txt", "w") -for i in range(NUM_FOLDS): - pos_train_reviews = [] - neg_train_reviews = [] - pos_train_bags = [] - neg_train_bags = [] - - pos_test_reviews = pos_folds[i] - neg_test_reviews = neg_folds[i] - pos_test_ids = pos_fold_ids[i] - neg_test_ids = neg_fold_ids[i] - for j in range(NUM_FOLDS): - if j != i: - pos_train_reviews += pos_folds[j] - neg_train_reviews += neg_folds[j] - pos_train_bags += pos_fold_bags[j] - neg_train_bags += neg_fold_bags[j] - - train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags) - train_bags = pos_train_bags + neg_train_bags - - if USE_LIBLINEAR: - classifier = LinearSVC() - else: - classifier = SVC(kernel="linear",tol=EPSILON) - - train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags] - classifier.fit(train_vecs, train_labels) - - test_bags = pos_fold_bags[i] + neg_fold_bags[i] + test_bags = pos_fold_bags[i] + neg_fold_bags[i] + + test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags] + test_reviews = pos_test_reviews + neg_test_reviews + test_ids = pos_test_ids + neg_test_ids + test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews) - test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags] - test_reviews = pos_test_reviews + neg_test_reviews - test_ids = pos_test_ids + neg_test_ids - test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews) - - predicted_labels = classifier.predict(test_vecs) - acc = classifier.score(test_vecs, test_labels) - for i in range(len(test_reviews)): - #f.write("%s\t%d\t%d\n" % (test_ids[i], test_labels[i], predicted_labels[i])) - print("%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])) + predicted_labels = classifier.predict(test_vecs) + acc = classifier.score(test_vecs, test_labels) + avg_acc += acc - avg_acc += acc - -#f.close() + avg_acc /= num_folds + return avg_acc -t2 = time.time() -avg_acc /= NUM_FOLDS -print "Total accuracy:", avg_acc -print "Classification time:", (t2-t1) -print "Total time:", (t2-t0) \ No newline at end of file +def run_configs(): + min_occurrences = 4 + use_negation = True + use_delta = False + use_pos_tags = False + use_adj_only = False + labels = [] + accs = [] + for use_amazon in [False, True]: + for gram_length in [1,2]: + for use_presence in [False, True]: + for (use_pos_tags, use_adj_only) in [(True, False), (True, True)]: + for use_position in [False, True]: + params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only, + 'use_position':use_position, 'use_amazon':use_amazon, 'min_occurrences':min_occurrences, 'use_delta':False} + acc = classify_reviews(**params) + label = "gram_length: %d, use_presence: %s, use_amazon: %s, use_pos_tags: %s, use_adj_only: %s, use_position: %s" % (gram_length, use_presence, use_amazon, use_pos_tags, use_adj_only, use_position) + print label, acc + labels.append(label) + 
accs.append(acc) + # Delta-TFIDF construction doesn't support all parameters (yet). + params = {'use_amazon':use_amazon, 'use_delta':True} + acc = classify_reviews(**params) + label = "delta_tfidf: True, use_amazon: %s" % use_amazon + print label, acc + labels.append(label) + accs.append(acc) + return (labels, accs) + +(labels, accs) = run_configs() +f = open('SVM_RESULTS.txt', 'w') +for (label, acc) in zip(labels, accs): + f.write("%s\t%s\n" % (label, acc)) +f.close()
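run_configs() persists each configuration's accuracy to SVM_RESULTS.txt as tab-separated label/accuracy pairs, so the hard-coded width lists in graph.py could in principle be rebuilt from that file rather than pasted in by hand. A small sketch of the parsing step (only the SVM_RESULTS.txt name comes from the patch; everything else is illustrative):

# Sketch only, not part of the patch: read the label/accuracy pairs written by
# run_configs() back in, e.g. to feed the bar widths in graph.py.
results = []
with open("SVM_RESULTS.txt") as f:
    for line in f:
        label, acc = line.rstrip("\n").split("\t")
        results.append((label, float(acc)))

for (label, acc) in results:
    print "%-70s %.3f" % (label, acc)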