Commit
added NLTK integration (POS tags), LIBLINEAR support
job13011 committed Mar 15, 2016
1 parent 88fcad2 commit 64bcfa5
Showing 1 changed file with 59 additions and 26 deletions.
review_svm.py (85 changes: 59 additions & 26 deletions)
@@ -1,28 +1,36 @@
import os
import random
import string
-from svmutil import *
import time

+import nltk
+import svmutil
+import liblinearutil

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).

-NEG_FOLDER = "review_polarity\\txt_sentoken\\neg"
-POS_FOLDER = "review_polarity\\txt_sentoken\\pos"
+NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
+POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

NORMAL_LENGTH = 1000

# TODO Make these command-line parameters.
USE_PRESENCE = False # If true, use presence rather than frequency.
-GRAM_LENGTH = 3 # Unigrams, bigrams, ...
+USE_POS_TAGS = False
+USE_ADJ_ONLY = False
+GRAM_LENGTH = 1 # Unigrams, bigrams, ...
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
EPSILON = 0.001 # determines how long the algorithm runs (default is 0.001)
KERNEL_TYPE = 0 # 0: linear, 2: radial basis
+NORMALIZE_BAGS = False
+USE_LIBLINEAR = True
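The Das and Chen (2001) negation tagging quoted in the comment block above is the subtlest part of make_bag below, so here is a minimal standalone sketch of the idea. The word lists mirror the constants above; the sample sentence is invented:

NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

def tag_negation(tokens):
    tagged = []
    do_negation = False
    for token in tokens:
        if token in NEGATION_WORDS:
            do_negation = True      # a negation word starts the marked span
        elif token in PUNCTUATION:
            do_negation = False     # the next punctuation mark ends it
        elif do_negation:
            token = "NOT_" + token  # everything in between gets the NOT_ tag
        tagged.append(token)
    return tagged

print(tag_negation("this movie is not funny or clever .".split()))
# ['this', 'movie', 'is', 'not', 'NOT_funny', 'NOT_or', 'NOT_clever', '.']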

def make_bag(filename):
    f = open(filename)
@@ -32,28 +40,39 @@ def make_bag(filename):
    bag_of_words = {}
    do_negation = False

-    words = text.split(" ")
+    #words = text.split(" ")
+    words = nltk.word_tokenize(text)
+    if USE_POS_TAGS and GRAM_LENGTH == 1:
+        tagged = nltk.pos_tag(words)
    count = 0
    for i in range(len(words) - GRAM_LENGTH + 1):
        n_gram = string.join(words[i:i+GRAM_LENGTH], "_")
        if (GRAM_LENGTH == 1): # Pang and Lee didn't do negation tagging for bigrams.
            if n_gram in NEGATION_WORDS:
                do_negation = True
-            if n_gram in PUNCTUATION:
+            elif n_gram in PUNCTUATION:
                do_negation = False
-            elif do_negation:
+
+            if USE_POS_TAGS:
+                n_gram = string.join(tagged[i], "_")
+            if do_negation:
                n_gram = "NOT_" + n_gram

        index = hash(n_gram)
-        if (not USE_PRESENCE) and bag_of_words.has_key(index):
-            bag_of_words[index] += 1
-            count += 1
-        else:
-            bag_of_words[index] = 1
-            count += 1
+        if not (USE_POS_TAGS and USE_ADJ_ONLY and (tagged[i][1] != "JJ")):
+            if (not USE_PRESENCE) and bag_of_words.has_key(index):
+                bag_of_words[index] += 1
+                #print n_gram + " => " + str(bag_of_words[index])
+                count += 1
+            else:
+                bag_of_words[index] = 1
+                count += 1
+                #print n_gram + " => " + str(bag_of_words[index])
    # Normalize the bag of words. For whatever reason it didn't work very well with small decimals...
-    for k in bag_of_words.keys():
-        bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
+    if NORMALIZE_BAGS:
+        for k in bag_of_words.keys():
+            bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count

    return bag_of_words
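For reference, this is roughly what the new NLTK calls in make_bag produce: nltk.word_tokenize returns a token list, and nltk.pos_tag pairs each token with a Penn Treebank tag (adjectives are tagged JJ, which is what the USE_ADJ_ONLY filter keeps); each feature is then keyed by hash(). The sample text is invented, and the download lines name the models current NLTK releases need, which is an assumption about the local setup rather than part of this commit:

import nltk
# One-time model downloads (assumed setup, not in the commit):
# nltk.download("punkt"); nltk.download("averaged_perceptron_tagger")

text = "an unexpectedly clever film"   # invented sample review text
words = nltk.word_tokenize(text)
tagged = nltk.pos_tag(words)           # list of (word, tag) pairs, e.g. ('clever', 'JJ')

# Unigram features keyed the way make_bag keys them: hash("word_TAG") -> count.
bag = {}
for word, tag in tagged:
    if tag == "JJ":                    # keep adjectives only, as USE_ADJ_ONLY does
        key = hash(word + "_" + tag)
        bag[key] = bag.get(key, 0) + 1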

pos_filenames = []
@@ -64,12 +83,12 @@ def make_bag(filename):
for (folder, x, filenames) in os.walk(POS_FOLDER):
    for filename in filenames:
        if filename.endswith(".txt"):
-            pos_filenames.append(folder + "\\" + filename)
+            pos_filenames.append(os.path.join(folder, filename))

for (folder, x, filenames) in os.walk(NEG_FOLDER):
    for filename in filenames:
        if filename.endswith(".txt"):
-            neg_filenames.append(folder + "\\" + filename)
+            neg_filenames.append(os.path.join(folder, filename))

# Partition reviews into folds.
pos_folds = [[] for i in range(NUM_FOLDS)]
@@ -84,6 +103,8 @@ def make_bag(filename):
# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

print "Constructed bags."

for i in range(NUM_FOLDS):
    for filename in pos_folds[i]:
@@ -93,6 +114,7 @@ def make_bag(filename):
        neg_fold_bags[i].append(make_bag(filename))
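The code that actually fills pos_folds and neg_folds is collapsed in this view. One conventional way to split the filename lists into NUM_FOLDS roughly equal folds, shown here purely as an assumption about what the hidden lines do, is round-robin assignment:

NUM_FOLDS = 3

def partition(filenames, num_folds=NUM_FOLDS):
    # File i lands in fold i % num_folds, so fold sizes differ by at most one.
    folds = [[] for _ in range(num_folds)]
    for i, name in enumerate(filenames):
        folds[i % num_folds].append(name)
    return folds

print(partition(["cv000.txt", "cv001.txt", "cv002.txt", "cv003.txt"]))  # illustrative names
# [['cv000.txt', 'cv003.txt'], ['cv001.txt'], ['cv002.txt']]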


+t1 = time.time()
for i in range(NUM_FOLDS):
    pos_train_bags = []
    neg_train_bags = []
@@ -114,16 +136,27 @@ def make_bag(filename):
    train_bags = pos_train_bags + neg_train_bags

    # TODO: Investigate LIBSVM training parameters.
-    m = svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON))
+    if USE_LIBLINEAR:
+        m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
+    else:
+        m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON))


    test_bags = pos_fold_bags[i] + neg_fold_bags[i]
    test_filenames = pos_test_filenames + neg_test_filenames
    test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

-    (predicted_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m)
+    if USE_LIBLINEAR:
+        (predicted_labels, acc, p_vals) = liblinearutil.predict(test_labels, test_bags, m)
+    else:
+        (predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)

    indices = random.sample(range(len(test_filenames)), 10)
    filenames_labels = {}
    for j in indices:
        filename = test_filenames[j]
        predicted_label = predicted_labels[j]
        filenames_labels[filename] = predicted_label

+t2 = time.time()
+print "Total time:", t2-t1
