
Commit

Delta TFIDF optimization. 98% accuracy, I'm suspicious.
job13011 committed Apr 11, 2016
1 parent 1fd3910 commit 6c944e0
Showing 3 changed files with 68 additions and 45 deletions.
BagOfWords.py: 4 changes (2 additions, 2 deletions)
@@ -84,11 +84,11 @@ def make_tfidf(document, documents):

# As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value.
# Todo: Bigrams?
def make_delta_tfidf(document, positive_set, negative_set, ref_bag):
def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag):
bag = {}
factor = 0
for term in set(document):
weight = delta_tfidf(term, document, positive_set, negative_set)
weight = delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs)
if (weight != 0):
bag[term] = weight
factor += weight**2
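The hunk cuts off after the squared-weight accumulator, so the tail of make_delta_tfidf is not shown here. A minimal sketch of the whole function, assuming the unshown tail L2-normalizes the bag (which is what accumulating weight**2 into factor suggests) and omitting the ref_bag bookkeeping that lies outside the visible hunk:

# Sketch only; make_delta_tfidf_sketch is an illustration, not the committed function.
import math
from TFIDF import delta_tfidf

def make_delta_tfidf_sketch(document, positive_set, negative_set, pos_idfs, neg_idfs):
    bag = {}
    factor = 0
    for term in set(document):
        weight = delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs)
        if weight != 0:
            bag[term] = weight
            factor += weight ** 2
    norm = math.sqrt(factor)
    if norm > 0:  # assumed L2 normalization of the feature values
        for term in bag:
            bag[term] /= norm
    return bag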
TFIDF.py: 41 changes (29 additions, 12 deletions)
@@ -1,18 +1,37 @@
from __future__ import division
import math
import nltk

# document is assumed to be tokenized (a list of words)
# documents is a list of tokenized docs
def tfidf(term, document, documents):
all_doc_appearances = 0 # number of documents in which term appears

def compute_idfs(documents):
idfs = {}
N = len(documents)
for doc in documents:
if term in doc:
all_doc_appearances += 1
for term in doc:
if idfs.has_key(term):
idfs[term] += 1
else:
idfs[term] = 1
for term in idfs.keys():
idfs[term] = math.log(N/idfs[term])
return idfs

def tfidf(term, document, documents, idfs={}):
if idfs == {}:
all_doc_appearances = sum([doc for doc in documents if term in doc])
idf = math.log(len(documents)/all_doc_appearances, 10)
else:
if idfs.has_key(term):
idf = idfs[term]
else:
return 0 # is this supposed to happen???
doc_appearances = 0 # number of appearances of term in this document
for word in document:
if term == word:
doc_appearances += 1
num_docs = len(documents) # number of documents in the collection
"""
if doc_appearances == 0:
#This happens sometimes, probably due to inconsistent splitting/tokenizing.
#print "Error: no occurrences of", term
@@ -21,12 +40,10 @@ def tfidf(term, document, documents):
#print "Error: fuck,", term
return 0
else:
tfidf = (1 + math.log(doc_appearances,10)) * math.log((float(num_docs)/all_doc_appearances), 10)
return tfidf
"""
tfidf = (1 + math.log(doc_appearances,10)) * idf
return tfidf

# Martineau and Finn 2009
def delta_tfidf(term, document, positive_set, negative_set):
return tfidf(term, document, positive_set) - tfidf(term, document, negative_set)

def delta_tfidf_fast(term, document, positive_set, negative_set):
return tfidf(term, document, positive_set) - tfidf(term, document, negative_set)
def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
return tfidf(term, document, positive_set, pos_idfs) - tfidf(term, document, negative_set, neg_idfs)
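The substantive change in TFIDF.py is that document frequencies are now precomputed once per corpus by compute_idfs and passed down as a lookup table, instead of rescanning every document for each term inside tfidf. The weighting itself is still the delta TF-IDF of Martineau and Finin (2009) (spelled "Finn" in the code comments): a term's weight is its TF-IDF against the positive corpus minus its TF-IDF against the negative one. Two details of the committed code are worth flagging: the no-idfs fallback sums the documents themselves rather than counting the documents containing the term, and compute_idfs increments a term's count on every occurrence (collection frequency) rather than once per document, using natural log where tfidf uses base 10. A minimal sketch of the precomputed-IDF idea with those details tightened up, offered as an illustration rather than as the committed code:

# Sketch only (Python 2 to match the repo); the *_sketch names are illustrative.
from __future__ import division
import math

def compute_idfs_sketch(documents):
    # Document frequency of every term, converted to log10(N / df).
    idfs = {}
    N = len(documents)
    for doc in documents:
        for term in set(doc):  # count each document at most once
            idfs[term] = idfs.get(term, 0) + 1
    for term in idfs:
        idfs[term] = math.log(N / idfs[term], 10)
    return idfs

def tfidf_sketch(term, document, idfs):
    doc_appearances = document.count(term)
    if doc_appearances == 0 or term not in idfs:
        return 0
    return (1 + math.log(doc_appearances, 10)) * idfs[term]

def delta_tfidf_sketch(term, document, pos_idfs, neg_idfs):
    return tfidf_sketch(term, document, pos_idfs) - tfidf_sketch(term, document, neg_idfs)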
review_svm.py: 68 changes (37 additions, 31 deletions)
@@ -11,7 +11,7 @@ import numpy
#import svmutil
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from TFIDF import delta_tfidf
from TFIDF import delta_tfidf, compute_idfs

import BagOfWords
import XMLParser
@@ -34,7 +34,7 @@ USE_ADJ_ONLY = False
USE_NEGATION = True
USE_POSITION = False
GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
NUM_FOLDS = 10 # For cross-validation (Pang & Lee used 3)

MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
@@ -44,11 +44,13 @@ USE_LIBLINEAR = True # This is supposedly faster for la

USE_AMAZON = False # Use the Amazon review set, not Pang and Lee.

def make_folds(documents, num_partitions):
def make_folds(documents, ids, num_partitions):
folds = [[] for i in range(num_partitions)]
fold_ids = [[] for i in range(num_partitions)]
for i in range(len(documents)):
folds[i % num_partitions].append(documents[i])
return folds
fold_ids[i % num_partitions].append(ids[i])
return (folds, fold_ids)

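make_folds now returns the review ids alongside the folds, so each test prediction can later be written out with its fileid; the partitioning itself is still round-robin. A toy example of the behaviour, with hypothetical data:

docs = ["d0", "d1", "d2", "d3", "d4"]
ids = ["id0", "id1", "id2", "id3", "id4"]
folds, fold_ids = make_folds(docs, ids, 2)
# folds    -> [["d0", "d2", "d4"], ["d1", "d3"]]
# fold_ids -> [["id0", "id2", "id4"], ["id1", "id3"]]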
def make_bag(text, total_word_counts):
return BagOfWords.make(text, ref_bag=total_word_counts,
@@ -123,30 +125,35 @@ if USE_AMAZON:
else:
# Load the Pang and Lee sentiment dataset.
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(id)) for id in ids]
reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
positive_ids = []
negative_ids = []
labels = []
for id in ids:
label = movie_reviews.categories(id)[0]
if label == 'pos':
labels.append(1)
positive_ids.append(id)
elif label == 'neg':
labels.append(-1)
negative_ids.append(id)

positive_reviews = []
negative_reviews = []

for i in range(len(reviews)):
if labels[i] == 1:
positive_reviews.append(reviews[i])
elif labels[i] == -1:
negative_reviews.append(reviews[i])

#TEST
positive_reviews = random.sample(positive_reviews, 250)
negative_reviews = random.sample(negative_reviews, 250)
#positive_reviews = random.sample(positive_reviews, 25)
#negative_reviews = random.sample(negative_reviews, 25)

# Partition reviews into folds.
pos_folds = make_folds(positive_reviews, NUM_FOLDS)
neg_folds = make_folds(negative_reviews, NUM_FOLDS)
(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
(neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, NUM_FOLDS)

# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF)
@@ -156,24 +163,25 @@ total_word_counts = {}
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(NUM_FOLDS)]
neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
for review in pos_folds[i]:
t3 = time.time()
if USE_DELTATFIDF:
pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts))
pos_idfs = pos_fold_idfs[i]
neg_idfs = neg_fold_idfs[i]
pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts))
else:
pos_fold_bags[i].append(make_bag(review, total_word_counts))
t4 = time.time()
print "Bag time:", (t4-t3)
pos_fold_bags[i].append(make_bag(review, total_word_counts))

for review in neg_folds[i]:
t3 = time.time()
if USE_DELTATFIDF:
neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts))
pos_idfs = pos_fold_idfs[i]
neg_idfs = neg_fold_idfs[i]
neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts))
else:
neg_fold_bags[i].append(make_bag(review, total_word_counts))
t4 = time.time()
print "Bag time:", (t4-t3)

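With pos_fold_idfs and neg_fold_idfs computed once up front, building a delta TF-IDF bag no longer rescans both corpora for every term of every review; each weight reduces to two dictionary lookups. Note, though, that each IDF table is built from a single fold, including the fold that later serves as the test set in that iteration of cross-validation, rather than from the training folds only. Given the commit message's own suspicion about the 98% figure, that is one place to look. A hedged sketch of the training-folds-only alternative (not what this commit does):

# Sketch of an alternative, not the committed behaviour: build the IDF tables
# from the training folds only, inside the cross-validation loop.
for i in range(NUM_FOLDS):
    pos_train = [r for j in range(NUM_FOLDS) if j != i for r in pos_folds[j]]
    neg_train = [r for j in range(NUM_FOLDS) if j != i for r in neg_folds[j]]
    pos_idfs = compute_idfs(pos_train)
    neg_idfs = compute_idfs(neg_train)
    # ...build this iteration's bags with these tables, then train and test as below...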
# Remove words with less than the minimum occurrences threshold.
if MIN_OCCURRENCES > 0:
@@ -193,6 +201,8 @@ print "Constructed bags, time:", (t1-t0)
avg_acc = 0

wordlist = total_word_counts.keys()

f = open("results.txt", "w")
for i in range(NUM_FOLDS):
pos_train_reviews = []
neg_train_reviews = []
@@ -201,6 +211,8 @@ for i in range(NUM_FOLDS):

pos_test_reviews = pos_folds[i]
neg_test_reviews = neg_folds[i]
pos_test_ids = pos_fold_ids[i]
neg_test_ids = neg_fold_ids[i]
for j in range(NUM_FOLDS):
if j != i:
pos_train_reviews += pos_folds[j]
@@ -211,7 +223,6 @@ for i in range(NUM_FOLDS):
train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
train_bags = pos_train_bags + neg_train_bags

#m = svmutil.svm_train(train_labels, train_bags, "-t 0 -e %f -m %d -q" % (EPSILON, CACHE_SIZE))
if USE_LIBLINEAR:
classifier = LinearSVC()
else:
@@ -224,23 +235,18 @@ for i in range(NUM_FOLDS):

test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
test_reviews = pos_test_reviews + neg_test_reviews
test_ids = pos_test_ids + neg_test_ids
test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews)

#(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
predicted_labels = classifier.predict(test_vecs)
acc = classifier.score(test_vecs, test_labels)

for i in range(len(test_reviews)):
print "%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])

avg_acc += acc

"""
indices = random.sample(range(len(test_filenames)), 10)
filenames_labels = {}
for j in indices:
filename = test_filenames[j]
predicted_label = predicted_labels[j]
filenames_labels[filename] = predicted_labels[j]
"""

f.close()

t2 = time.time()
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
