Commit

Updated SVM code for Amazon reviews, Sklearn, NLTK
job13011 committed Apr 9, 2016
1 parent c15369d commit c1f8c1e
Showing 2 changed files with 101 additions and 73 deletions.
25 changes: 18 additions & 7 deletions BagOfWords.py
@@ -1,23 +1,24 @@
from __future__ import division
import nltk
import string
import numpy
import nltk
from TFIDF import tfidf

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)
NEGATION_WORDS = ["not", "n't"]
PUNCTUATION = [".", "!", "?", ",", ";", '(', ')'] #TODO make this work with POS tags (._.)
POSITION_TAGS = ["_1Q", "_2H", "_3Q"]
ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
POSITION_THRESHOLDS = [0.25, 0.75, 1]
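# Rough example of the intended negation effect (assuming the tag is appended as a suffix):
#   "this is n't funny , honestly" -> this is n't funny_NOT , honestly
# POSITION_TAGS / POSITION_THRESHOLDS presumably mark whether a token falls in the
# first quarter, middle half, or last quarter of the document.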

# ref_bag is used to calculate the total word count across all documents.
def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=True):
def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=False):
bag_of_words = {}
if use_negation:
do_negation = False

words = nltk.word_tokenize(text)
if use_pos_tags:
tagged = nltk.pos_tag(words)
words = [string.join(t, "_") for t in tagged]
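# e.g. nltk.pos_tag(["great", "movie"]) -> [("great", "JJ"), ("movie", "NN")],
# which string.join turns into ["great_JJ", "movie_NN"]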
@@ -45,7 +46,7 @@ def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fal
else:
index = n_gram

if not (use_pos_tags and use_adj_only and (tagged[i][1] != "JJ")):
if not (use_pos_tags and use_adj_only and (tagged[i][1] not in ADJECTIVE_TAGS)):
if (not use_presence) and bag_of_words.has_key(index):
bag_of_words[index] += 1
else:
@@ -79,4 +80,14 @@ def make_tfidf(document, documents):
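# (Tail of make_tfidf below: 'factor' appears to hold the sum of squared weights,
# so dividing each entry by factor ** 0.5 L2-normalizes the bag.)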
factor **= 0.5
for key in bag.keys():
bag[key] /= factor
return bag
return bag

def to_vector(bag, wordlist):
vec = []
for word in wordlist:
if bag.has_key(word):
vec.append(bag[word])
else:
vec.append(0)
return vec
#return numpy.array(vec).reshape(1,-1)
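
A minimal, hypothetical usage sketch of how the new helpers fit together (assuming make() returns a {feature: weight} dict):

import BagOfWords

total_counts = {}                                   # shared across all documents
bag = BagOfWords.make("not a great movie .".split(),
                      ref_bag=total_counts, use_negation=True)
wordlist = total_counts.keys()                      # fixed feature ordering
vec = BagOfWords.to_vector(bag, wordlist)           # dense list, ready for an SVM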
149 changes: 83 additions & 66 deletions review_svm.py
@@ -1,75 +1,61 @@
from __future__ import division
import os
import random
import string
import time
import sys

import nltk
import svmutil
from nltk.corpus import movie_reviews
import numpy
#import svmutil
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

import BagOfWords
import XMLParser

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).

POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
# TODO make this a parameter
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

NORMAL_LENGTH = 1000

# These are now command line parameters! See below...
USE_PRESENCE = False # If true, use presence rather than frequency.
USE_POS_TAGS = False
USE_ADJ_ONLY = False
USE_NEGATION = True
USE_POSITION = False
GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
NUM_FOLDS = 5 # For cross-validation (Pang & Lee used 3)

MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON = .001 # determines how long the algorithm runs (default is 0.001)

KERNEL_TYPE = 0 # 0: linear, 2: radial basis (just use linear)
NORMALIZE_BAGS = True
USE_LIBLINEAR = False # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look
USE_LIBLINEAR = True # With sklearn this now selects LinearSVC (liblinear) rather than SVC's libsvm solver.
CACHE_SIZE = 512

def file_to_text(filename):
f = open(filename)
lines = f.readlines()
f.close()
text = string.join(lines, " ")
return text

def generate_filenames(folder_name):
filenames = []
for (folder, x, folder_filenames) in os.walk(folder_name):
for filename in folder_filenames:
if filename.endswith(".txt"):
filenames.append(os.path.join(folder, filename))
return filenames

def partition_filenames(filenames, num_partitions):
partitions = [[] for i in range(num_partitions)]
for i in range(len(filenames)):
partitions[i % num_partitions].append(filenames[i])
return partitions
USE_AMAZON = True # Use the Amazon review set, not Pang and Lee.

def make_folds(documents, num_partitions):
folds = [[] for i in range(num_partitions)]
for i in range(len(documents)):
folds[i % num_partitions].append(documents[i])
return folds
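# e.g. 7 documents with num_partitions = 3 -> [[d0, d3, d6], [d1, d4], [d2, d5]]
# (documents are dealt out round-robin)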

def make_bag(text, total_word_counts):
return BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts,
return BagOfWords.make(text, ref_bag=total_word_counts,
gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE,
use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION,
use_position=USE_POSITION)


# Set parameters from command-line arguments.
i = 0
try:
@@ -129,17 +115,37 @@ except Exception:
print "Invalid arguments"

t0 = time.time()

pos_filenames = generate_filenames(POS_FOLDER)
neg_filenames = generate_filenames(NEG_FOLDER)

# TEST - to test on a subset of reviews (since some operations [i.e. tagging] are slow)
#pos_filenames = random.sample(pos_filenames, 20)
#neg_filenames = random.sample(neg_filenames, 20)
if USE_AMAZON:
# Load the mixed Amazon review dataset.
(ids, reviews, labels) = XMLParser.get_all_reviews()
else:
# Load the Pang and Lee sentiment dataset.
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(id)) for id in ids]
labels = []
for id in ids:
label = movie_reviews.categories(id)[0]
if label == 'pos':
labels.append(1)
elif label == 'neg':
labels.append(-1)
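# (movie_reviews.categories(id) returns ['pos'] or ['neg'] for each file id in
# NLTK's movie review corpus, so labels line up one-to-one with ids.)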

positive_reviews = []
negative_reviews = []
for i in range(len(reviews)):
if labels[i] == 1:
positive_reviews.append(reviews[i])
elif labels[i] == -1:
negative_reviews.append(reviews[i])

#TEST
positive_reviews = random.sample(positive_reviews, 1000)
negative_reviews = random.sample(negative_reviews, 1000)

# Partition reviews into folds.
pos_folds = partition_filenames(pos_filenames, NUM_FOLDS)
neg_folds = partition_filenames(neg_filenames, NUM_FOLDS)
pos_folds = make_folds(positive_reviews, NUM_FOLDS)
neg_folds = make_folds(negative_reviews, NUM_FOLDS)

# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF)
@@ -148,22 +154,23 @@ total_word_counts = {}
# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
for filename in pos_folds[i]:
pos_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
for review in pos_folds[i]:
pos_fold_bags[i].append(make_bag(review, total_word_counts))

for filename in neg_folds[i]:
neg_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
for review in neg_folds[i]:
neg_fold_bags[i].append(make_bag(review, total_word_counts))

# Remove words with less than the minimum occurrences threshold.
for k in total_word_counts.keys():
if total_word_counts[k] < MIN_OCCURRENCES:
for fold in (neg_fold_bags + pos_fold_bags):
for bag in fold:
if bag.has_key(k):
bag.pop(k)
total_word_counts.pop(k)
if MIN_OCCURRENCES > 0:
for k in total_word_counts.keys():
if total_word_counts[k] < MIN_OCCURRENCES:
for fold in (neg_fold_bags + pos_fold_bags):
for bag in fold:
if bag.has_key(k):
bag.pop(k)
total_word_counts.pop(k)
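# (Popping inside the loop is safe under Python 2 because dict.keys() returns a
# list copy; Python 3 would need list(total_word_counts.keys()).)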

#num_unique_words = len(total_word_counts.keys())
#print "# unique words:", num_unique_words
@@ -172,34 +179,45 @@ t1 = time.time()
print "Constructed bags, time:", (t1-t0)
avg_acc = 0

wordlist = total_word_counts.keys()
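# wordlist fixes the feature order shared by every vector built below; with
# MIN_OCCURRENCES = 0 it keeps every observed feature, so vectors can get wide.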
for i in range(NUM_FOLDS):
pos_train_filenames = []
neg_train_filenames = []
pos_train_reviews = []
neg_train_reviews = []
pos_train_bags = []
neg_train_bags = []

pos_test_filenames = pos_folds[i]
neg_test_filenames = neg_folds[i]

pos_test_reviews = pos_folds[i]
neg_test_reviews = neg_folds[i]
for j in range(NUM_FOLDS):
if j != i:
pos_train_filenames += pos_folds[j]
neg_train_filenames += neg_folds[j]
pos_train_reviews += pos_folds[j]
neg_train_reviews += neg_folds[j]
pos_train_bags += pos_fold_bags[j]
neg_train_bags += neg_fold_bags[j]

train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
train_bags = pos_train_bags + neg_train_bags

m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))
#m = svmutil.svm_train(train_labels, train_bags, "-t 0 -e %f -m %d -q" % (EPSILON, CACHE_SIZE))
if USE_LIBLINEAR:
classifier = LinearSVC()
else:
classifier = SVC(kernel="linear",tol=EPSILON)
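# (LinearSVC is backed by liblinear and usually trains much faster on large
# problems; SVC(kernel="linear") uses libsvm, like the original svmutil code.)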

train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags]
classifier.fit(train_vecs, train_labels)

test_bags = pos_fold_bags[i] + neg_fold_bags[i]
test_filenames = pos_test_filenames + neg_test_filenames
test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
test_reviews = pos_test_reviews + neg_test_reviews
test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews)

(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
#(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
predicted_labels = classifier.predict(test_vecs)
acc = classifier.score(test_vecs, test_labels)

avg_acc += acc[0]
avg_acc += acc
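# Note: classifier.score() returns a fraction in [0, 1], whereas svm_predict's
# acc[0] was a percentage, so the printed "Total accuracy" is on a new scale.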

"""
indices = random.sample(range(len(test_filenames)), 10)
@@ -214,5 +232,4 @@ t2 = time.time()
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
print "Classification time:", (t2-t1)
print "Total time:", (t2-t0)

print "Total time:", (t2-t0)
