Various fixes (min occurrences, etc.)
job13011 committed Mar 16, 2016
1 parent 64bcfa5 commit 5deb398
Showing 1 changed file with 83 additions and 33 deletions.
116 changes: 83 additions & 33 deletions review_svm.py
@@ -5,7 +5,7 @@

import nltk
import svmutil
-import liblinearutil
+#import liblinearutil

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).
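For reference, here is a minimal, hedged sketch of the LIBSVM Python interface the script relies on (illustrative only; the toy features, labels, and option string below are placeholders, not from this commit). svmutil represents each example as a dict mapping integer feature indices to values, which is why make_bag hashes n-grams to numbers further down.

import svmutil

# Two toy reviews as sparse {feature_index: count} dicts, labeled +1 / -1.
train_x = [{1: 2, 5: 1}, {2: 1, 7: 3}]
train_y = [1, -1]

# Train a linear-kernel SVM and predict on the same toy data.
model = svmutil.svm_train(train_y, train_x, "-t 0 -e 0.001")
labels, acc, vals = svmutil.svm_predict(train_y, train_x, model)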
@@ -22,62 +22,77 @@
NORMAL_LENGTH = 1000

# TODO Make these command-line parameters.
-USE_PRESENCE = False # If true, use presence rather than frequency.
-USE_POS_TAGS = False
-USE_ADJ_ONLY = False
-GRAM_LENGTH = 1 # Unigrams, bigrams, ...
-NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
-EPSILON = 0.001 # determines how long the algorithm runs (default is 0.001)
-KERNEL_TYPE = 0 # 0: linear, 2: radial basis
-NORMALIZE_BAGS = False
-USE_LIBLINEAR = True
-
-def make_bag(filename):
+USE_PRESENCE = False # If true, use presence rather than frequency.
+USE_POS_TAGS = True
+USE_ADJ_ONLY = False
+GRAM_LENGTH = 2 # Unigrams, bigrams, ...
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
+EPSILON = .1 # determines how long the algorithm runs (default is 0.001)
+KERNEL_TYPE = 0 # 0: linear, 2: radial basis
+NORMALIZE_BAGS = False
+USE_LIBLINEAR = False
+CACHE_SIZE = 512
+MIN_OCCURRENCES = 10 # To be included, the word must show up this many times across all documents

+def file_to_text(filename):
+f = open(filename)
+lines = f.readlines()
+f.close()
+text = string.join(lines, " ")
+return text
+
+def make_bag(text, ref_bag):
bag_of_words = {}
do_negation = False

#words = text.split(" ")
words = nltk.word_tokenize(text)
-if USE_POS_TAGS and GRAM_LENGTH==1:
+if USE_POS_TAGS:# and GRAM_LENGTH==1:
t5 = time.time()
tagged = nltk.pos_tag(words)
tagged = [string.join(t, "_") for t in tagged]
words = tagged
t6 = time.time()
print "Tag time (%d words): %f" % (len(words), (t6-t5))
count = 0
for i in range(len(words) - GRAM_LENGTH + 1):
-n_gram = string.join(words[i:i+GRAM_LENGTH+1], "_")
+n_gram = string.join(words[i:i+GRAM_LENGTH], "_")
if (GRAM_LENGTH == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
do_negation = True
elif n_gram in PUNCTUATION:
do_negation = False

-if USE_POS_TAGS:
-n_gram = string.join(tagged[i], "_")
if do_negation:
n_gram = "NOT_" + n_gram

# LIBSVM won't use strings as keys, so hash to convert to a number.
index = hash(n_gram)
if not (USE_POS_TAGS and USE_ADJ_ONLY and (tagged[i][1] != "JJ")):
+#if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
if (not USE_PRESENCE) and bag_of_words.has_key(index):
bag_of_words[index] += 1
-#print n_gram + " => " + str(bag_of_words[index])
count += 1
+#print n_gram, "=>", bag_of_words[index]
else:
bag_of_words[index] = 1
count += 1
-#print n_gram + " => " + str(bag_of_words[index])
+#print n_gram, "=>", bag_of_words[index]

+# Add it to the reference bag
+if ref_bag.has_key(index):
+ref_bag[index] += 1
+else:
+ref_bag[index] = 1
# Normalize the bag of words. For whatever reason it didn't work very well with small decimals...
if NORMALIZE_BAGS:
for k in bag_of_words.keys():
bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count

return bag_of_words
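As a rough illustration of what make_bag produces in the unigram case (a sketch under assumptions, not part of the commit; the negation and punctuation sets below are stand-ins for the NEGATION_WORDS and PUNCTUATION lists defined earlier in review_svm.py): every token between a negation word and the next punctuation mark gets a NOT_ prefix, and each resulting token is hashed to a numeric LIBSVM feature index.

# Toy unigram walk-through of negation tagging plus feature hashing.
tokens = ["not", "a", "good", "movie", "."]
bag = {}
negate = False
for tok in tokens:
    if tok in ("not", "no", "n't"):        # stand-in for NEGATION_WORDS
        negate = True
    elif tok in (".", ",", "!", "?"):      # stand-in for PUNCTUATION
        negate = False
    feat = "NOT_" + tok if negate else tok
    bag[hash(feat)] = bag.get(hash(feat), 0) + 1
# Every token from the negation word up to (but not including) the period
# is counted under its NOT_-prefixed hash; the period itself is unprefixed.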

t0 = time.time()

pos_filenames = []
neg_filenames = []
-word_table = {}
-next_word_index = 0

for (folder, x, filenames) in os.walk(POS_FOLDER):
@@ -89,6 +104,10 @@ def make_bag(filename):
for filename in filenames:
if filename.endswith(".txt"):
neg_filenames.append(os.path.join(folder, filename))

+# TEST
+#pos_filenames = random.sample(pos_filenames, 20)
+#neg_filenames = random.sample(neg_filenames, 20)

# Partition reviews into folds.
pos_folds = [[] for i in range(NUM_FOLDS)]
@@ -100,27 +119,51 @@ def make_bag(filename):
for i in range(len(neg_filenames)):
neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
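A brief worked example of the round-robin split above (illustrative; the filenames are made up): with NUM_FOLDS = 3, file i simply lands in fold i % 3.

# Five toy files split round-robin into three folds.
files = ["cv000.txt", "cv001.txt", "cv002.txt", "cv003.txt", "cv004.txt"]
folds = [[] for _ in range(3)]
for i in range(len(files)):
    folds[i % 3].append(files[i])
# folds == [["cv000.txt", "cv003.txt"], ["cv001.txt", "cv004.txt"], ["cv002.txt"]]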

+# Count occurrences of every word across all documents
+# (this is important for e.g. Delta TFIDF)
+word_table = {}

# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

print "Constructed bags."

for i in range(NUM_FOLDS):
for filename in pos_folds[i]:
-pos_fold_bags[i].append(make_bag(filename))
+t3 = time.time()
+pos_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
+t4 = time.time()
+print "Bag time:", (t4-t3)

for filename in neg_folds[i]:
-neg_fold_bags[i].append(make_bag(filename))
+t3 = time.time()
+neg_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
+t4 = time.time()
+print "Bag time:", (t4-t3)


+# Remove words with less than the minimum occurrences threshold.
+for k in word_table.keys():
+if word_table[k] < MIN_OCCURRENCES:
+for bag in (neg_fold_bags + pos_fold_bags):
+if bag.has_key(k):
+bag.pop(k)

-#word_table = make_bag(all_text, use_presence=False)
+for k in word_table.keys():
+if word_table[k] < MIN_OCCURRENCES:
+word_table.pop(k)
+num_unique_words = len(word_table.keys())
+print "# unique words:", num_unique_words

t1 = time.time()
-for i in range(NUM_FOLDS):
-pos_train_bags = []
-neg_train_bags = []
print "Constructed bags, time:", (t1-t0)
avg_acc = 0

for i in range(NUM_FOLDS):
pos_train_filenames = []
neg_train_filenames = []
pos_train_bags = []
neg_train_bags = []

pos_test_filenames = pos_folds[i]
neg_test_filenames = neg_folds[i]
@@ -136,27 +179,34 @@ def make_bag(filename):
train_bags = pos_train_bags + neg_train_bags

# TODO: Investigate LIBSVM training parameters.
+# TODO: Why does LIBLINEAR break my computer?
if USE_LIBLINEAR:
-m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
+pass#m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
else:
-m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON))

+m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))

test_bags = pos_fold_bags[i] + neg_fold_bags[i]
test_filenames = pos_test_filenames + neg_test_filenames
test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

if USE_LIBLINEAR:
-(predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
+pass#(predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
else:
(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)

+avg_acc += acc[0]

"""
indices = random.sample(range(len(test_filenames)), 10)
filenames_labels = {}
for j in indices:
filename = test_filenames[j]
predicted_label = predicted_labels[j]
filenames_labels[filename] = predicted_labels[j]
"""

t2 = time.time()
print "Total time:", t2-t1
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
print "Total time:", (t2-t1)
