From 444709974b9de669f0a3a44368966d63adfcf5f6 Mon Sep 17 00:00:00 2001 From: Jack Date: Mon, 11 Apr 2016 17:07:29 -0400 Subject: [PATCH] Delta TFIDF, part 1 --- BagOfWords.py | 26 ++- GlossCountJWB.py | 432 ++++++++++++++++++++++++++++++----------------- MPQALexicon.py | 25 ++- TFIDF.py | 9 +- getAdjectives.py | 7 +- review_svm.py | 31 +++- 6 files changed, 352 insertions(+), 178 deletions(-) diff --git a/BagOfWords.py b/BagOfWords.py index c224bda..dbaea22 100644 --- a/BagOfWords.py +++ b/BagOfWords.py @@ -2,7 +2,7 @@ from __future__ import division import string import numpy import nltk -from TFIDF import tfidf +from TFIDF import tfidf, delta_tfidf # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not', # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word." @@ -81,7 +81,29 @@ def make_tfidf(document, documents): for key in bag.keys(): bag[key] /= factor return bag - + +# As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value. +# Todo: Bigrams? +def make_delta_tfidf(document, positive_set, negative_set, ref_bag): + bag = {} + factor = 0 + for term in set(document): + weight = delta_tfidf(term, document, positive_set, negative_set) + if (weight != 0): + bag[term] = weight + factor += weight**2 + factor **= 0.5 + for key in bag.keys(): + bag[key] /= factor + # Add word counts to the reference bag + for term in document: + if ref_bag != None: + if ref_bag.has_key(term): + ref_bag[term] += 1 + else: + ref_bag[term] = 1 + return bag + def to_vector(bag, wordlist): vec = [] for word in wordlist: diff --git a/GlossCountJWB.py b/GlossCountJWB.py index 2ca0964..cf9278c 100644 --- a/GlossCountJWB.py +++ b/GlossCountJWB.py @@ -1,178 +1,302 @@ +from __future__ import division import math +import random +import string +from sets import Set +import numpy +from sklearn.svm import LinearSVC +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import MultinomialNB import nltk from nltk.corpus import wordnet as wn import nltk.classify.util -from nltk.classify import NaiveBayesClassifier from nltk.corpus import movie_reviews -from sets import Set -import string -import random import BagOfWords -from sklearn import svm -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import LogisticRegression import MPQALexicon -import numpy +import AniaLexicon + +EXPAND_ITERATIONS = 2 +CLASSIFIER = "me" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy +REMOVE_STOPWORDS = False +USE_STEMMING = False +USE_EXAMPLES = True + +USE_EQUAL_TRAINING = True +USE_EQUAL_TEST = True +USE_PARSING = True + +POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'] +NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'] # returns tokenized -def get_defs(word, use_examples=True): - defs = [synset.definition() for synset in wn.synsets(word, pos=wn.ADJ)] - if use_examples: - examples = [synset.examples() for synset in wn.synsets(word, pos=wn.ADJ)] - for example in examples: defs += example - return nltk.word_tokenize(string.join(defs)) - -# text and documents are pre-tokenized +def get_defs(word): + defs = [] + for synset in wn.synsets(word, pos=wn.ADJ): + defs += synset.lemma_names() + defs.append(synset.definition()) + if USE_EXAMPLES: + defs += synset.examples() + + tokens = nltk.word_tokenize(string.join(defs)) + if USE_STEMMING: + tokens = do_stem(tokens) + if REMOVE_STOPWORDS: + 
stopwords = set(nltk.corpus.stopwords.words('english')) + if USE_STEMMING: + stopwords = do_stem(stopwords) + tokens = [x for x in tokens if x not in stopwords] + return tokens + +# return a tfidf bag; text and documents are pre-tokenized. def make_bag(text, documents): - #return BagOfWords.make(text, normalize=True, use_negation=False, use_hash=False, use_presence=False) - return BagOfWords.make_tfidf(text, documents) - -def expand_sets(positive,negative,neutral): - newPositive = set(positive) - newNegative = set(negative) - newNeutral = set(neutral) - for word in positive: - for syn in wn.synsets(word, pos=wn.ADJ): - for lemma in syn.lemmas(): - curr = lemma.name() - if curr not in newPositive and curr not in newNegative and curr not in newNeutral: - newPositive.add(curr) - elif curr in newNegative: - newNegative.discard(curr) - newNeutral.add(curr) - for antonym in lemma.antonyms(): - ant = antonym.name() - if ant not in newPositive and ant not in newNegative and ant not in newNeutral: - newNegative.add(ant) - elif ant in newPositive: - newPositive.discard(ant) - newNeutral.add(ant) - - for word in negative: - for syn in wn.synsets(word, pos=wn.ADJ): - for lemma in syn.lemmas(): - curr = lemma.name() - if curr not in newPositive and curr not in newNegative and curr not in newNeutral: - newNegative.add(curr) - elif curr in newPositive: - newPositive.discard(curr) - newNeutral.add(curr) - for antonym in lemma.antonyms(): - ant = antonym.name() - if ant not in newPositive and ant not in newNegative and ant not in newNeutral: - newPositive.add(ant) - elif ant in newNegative: - newNegative.discard(ant) - newNeutral.add(ant) - return (newPositive, newNegative, newNeutral) - -def bag_to_vec(bag, wordlist): - vec = [] - for word in wordlist: - if bag.has_key(word): - vec.append(bag[word]) - else: - vec.append(0) - return vec - -# Set up initial Sets S_p and S_n -neutral = Set([]) -positive = Set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']) -negative = Set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']) - -# Expand on Sets to get S_p' and S_n' -for num in range(1): - (positive, negative, neutral) = expand_sets(positive,negative,neutral); - -# Use the same number of positive and negative training words. -positive = random.sample(positive, min(len(positive), len(negative))) -negative = random.sample(negative, min(len(positive), len(negative))) - -# Train the classifier using the expanded wordlist. -train_wordlist = set(positive + negative) - -train_defs = [get_defs(word) for word in (positive + negative)] - -train_bags = [make_bag(get_defs(word), train_defs) for word in positive] + [make_bag(get_defs(word), train_defs) for word in negative] - -train_labels = [1 for word in positive] + [-1 for word in negative] - -# The classifier needs vectors, not dicts. So we need to convert them to vectors. -# Make a list of all the words contained in them, then make an array with entries -# corresponding to each word. - -train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags] -classifier = svm.LinearSVC() -classifier.fit(train_vecs, train_labels) - -# Load the test set. I'm only using the bag of words structure here to select the words -# with a certain word count threshold. 
-(test_words, test_labels) = MPQALexicon.load(True) + return BagOfWords.make_tfidf(text, documents) -test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False) -test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:500] -test_bags = [] +# Esuli and Sebastiani's algorithm to expand seed sets using WordNet +def expand_sets(positive, negative, neutral): + newPositive = set(positive) + newNegative = set(negative) + newNeutral = set(neutral) + for word in positive: + for syn in wn.synsets(word, pos=wn.ADJ): + for lemma in syn.lemmas(): + curr = lemma.name() + if curr not in newPositive and curr not in newNegative and curr not in newNeutral: + newPositive.add(curr) + elif curr in newNegative: + newNegative.discard(curr) + newNeutral.add(curr) + for antonym in lemma.antonyms(): + ant = antonym.name() + if ant not in newPositive and ant not in newNegative and ant not in newNeutral: + newNegative.add(ant) + elif ant in newPositive: + newPositive.discard(ant) + newNeutral.add(ant) -test_wordlist = filter(lambda x: x != '', test_wordlist) -test_bags = [make_bag(get_defs(word), train_defs) for word in test_wordlist] + for word in negative: + for syn in wn.synsets(word, pos=wn.ADJ): + for lemma in syn.lemmas(): + curr = lemma.name() + if curr not in newPositive and curr not in newNegative and curr not in newNeutral: + newNegative.add(curr) + elif curr in newPositive: + newPositive.discard(curr) + newNeutral.add(curr) + for antonym in lemma.antonyms(): + ant = antonym.name() + if ant not in newPositive and ant not in newNegative and ant not in newNeutral: + newPositive.add(ant) + elif ant in newNegative: + newNegative.discard(ant) + newNeutral.add(ant) + return (newPositive, newNegative, newNeutral) + +def get_label(id): + return movie_reviews.categories(fileids=[id])[0] + +def do_stem(text): + global stemmer + return [stemmer.stem(word) for word in text] -test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags] -predicted_labels = classifier.predict(test_vecs) -word_labels = {} +# new and improved finite state machine +# states are as follows: +# 0 - base +# 1 - negator found +# 2 - intensifier found +# 3 - un-intensifier found (unused) +# 4 - negator + intensifier found +def calculate_score(text, lexicon): + negators = ["not", "n't", "hardly", "barely"] + intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"] + if USE_STEMMING: + negators = do_stem(negators) + intensifiers = do_stem(intensifiers) + + punctuation = [".", "!", "?", ",", ";", '(', ')'] + state = 0 + score = 0 + num_double = 0 + num_single = 0 + num_neg = 0 + num_halfneg = 0 + for word in text: + if state == 0: + if lexicon.has_key(word): + score += lexicon[word] + num_single += 1 + elif word in negators: + state = 1 + elif word in intensifiers: + state = 2 + elif state == 1: + if lexicon.has_key(word): + score += -1 * lexicon[word] + num_neg += 1 + state = 0 + elif word in intensifiers: + state = 4 + else: + state = 0 + elif state == 2: + if lexicon.has_key(word): + score += 2 * lexicon[word] + num_double += 1 + state = 0 + else: + state = 0 + elif state == 3: + pass #TODO + elif state == 4: + if lexicon.has_key(word): + score += -0.5 * lexicon[word] + num_halfneg += 1 + state = 0 + else: + state = 0 + #print num_single, num_neg, num_double, num_halfneg + return score -for i in range(len(test_wordlist)): - key = test_wordlist[i] - word_labels[key] = predicted_labels[i] +def create_lexicon(words, 
labels): + lexicon = {} + for i in range(len(words)): + word = words[i] + label = labels[i] + lexicon[word] = label + return lexicon -pos_words = [w for w in test_wordlist if word_labels[w] > 0] -neg_words = [w for w in test_wordlist if word_labels[w] < 0] +def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels): + # Set up initial Sets S_p and S_n + neutral = [] + #positive = ['good'] + #negative = ['bad'] -# Use the same number of positive and negative words. -length = min(len(pos_words), len(neg_words)) -pos_words = pos_words[:length] -neg_words = neg_words[:length] -word_labels2 = {} -for word in pos_words: - word_labels2[word] = 1 + positive = [word for word in pos_seed] + negative = [word for word in neg_seed] + # Expand on Sets to get S_p' and S_n' + for num in range(EXPAND_ITERATIONS): + (positive, negative, neutral) = expand_sets(positive,negative,neutral) -for word in neg_words: - word_labels2[word] = -1 - -f = open('fuck.txt', 'w') -f.write("[POS]\n\n") -f.write(string.join(pos_words,"\n")) -f.write("\n\n[NEG]\n\n") -f.write(string.join(neg_words,"\n")) -f.close() -#exit() + if USE_STEMMING: + positive = list(set(do_stem(positive))) + negative = list(set(do_stem(negative))) + + # Use the same number of positive and negative training words. + if USE_EQUAL_TRAINING: + length = min(len(positive), len(negative)) + positive = list(positive)[:length] + negative = list(negative)[:length] + + # Train the classifier using the expanded wordlist. + train_defs = [get_defs(word) for word in (positive + negative)] + train_bags = [make_bag(get_defs(word), train_defs) for word in positive] + [make_bag(get_defs(word), train_defs) for word in negative] + + train_labels = [1 for word in positive] + [-1 for word in negative] + + # The classifier needs vectors, not dicts. So we need to convert them to vectors. + # Make a list of all the words contained in them, then make an array with entries + # corresponding to each word. + + # Vector entries correspond to each word in the training word list. + train_wordlist = [] + for tdef in train_defs: + for word in tdef: + train_wordlist.append(word) + train_wordlist = set(train_wordlist) + + train_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in train_bags] + if CLASSIFIER == "nb": + classifier = MultinomialNB() + elif CLASSIFIER == "svm": + classifier = LinearSVC() + elif CLASSIFIER == "me": + classifier = LogisticRegression() + classifier.fit(train_vecs, train_labels) + + test_defs = [get_defs(word) for word in test_words] + test_bags = [make_bag(get_defs(word), test_defs) for word in test_words] + test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags] + + predicted_labels = classifier.predict(test_vecs) + correct = 0 + for i in range(len(test_labels)): + if test_labels[i] == predicted_labels[i]: + correct += 1 + + print "Lexicon accuracy:", correct/len(test_labels) -# Iterate through all of the reviews and find sentiment + word_labels = {} + for i in range(len(test_words)): + key = test_words[i] + word_labels[key] = predicted_labels[i] + + pos_words = set([w for w in test_words if word_labels[w] > 0]) + neg_words = set([w for w in test_words if word_labels[w] < 0]) + + # Use the same number of positive and negative words. 
+ if USE_EQUAL_TEST: + length = min(len(pos_words), len(neg_words)) + pos_words = list(pos_words)[:length] + neg_words = list(neg_words)[:length] + + lexicon = {} + lex2 = {} + for word in pos_words: + lexicon[word] = 1 + + for word in neg_words: + lexicon[word] = -1 + + return lexicon + +if USE_STEMMING: + stemmer = nltk.stem.porter.PorterStemmer() + +# Load the test set. A few options here. +(test_words, test_labels) = MPQALexicon.load(True) +#(test_words, test_labels) = AniaLexicon.load() +if USE_STEMMING: + test_words = do_stem(test_words) + +lexicon = create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels) +#lexicon = create_seed_lexicon(POS_SEED, NEG_SEED) +#lexicon = create_lexicon(test_words, test_labels) + +# Iterate through all of the reviews and compute scores by taking the sum of their +# component lexicon words. Includes rudimentary negation testing. correct = 0 positive = 0 ids = sorted(movie_reviews.fileids()) scores = [] - -for review_id in ids: - words = movie_reviews.words(fileids=[review_id]) - score = 0 - for word in words: - if word_labels2.has_key(word): - score += word_labels2[word] - scores.append(score) -avg_score = float(sum(scores))/len(scores) +for id in ids: + words = list(movie_reviews.words(fileids=[id])) + if USE_STEMMING: + words = do_stem(words) + if USE_PARSING: + scores.append(calculate_score(words, lexicon)) + else: + score = 0 + x = 0 + for word in words: + if lexicon.has_key(word): + score += lexicon[word] + x += 1 + scores.append(score) + print score, x + for i in range(len(ids)): - id = ids[i] - score = scores[i] - if score >= 0:#avg_score: - sent_value = "pos" - positive += 1 - elif score < 0:#avg_score: - sent_value = "neg" - label = movie_reviews.categories(fileids=[id])[0] - if sent_value == label: - correct += 1 - -print "correct:", float(correct)/len(ids) -print "positive:", float(positive)/len(ids) -#print "avg:", avg_score \ No newline at end of file + id = ids[i] + score = scores[i] + if score >= 0: + sent_value = "pos" + positive += 1 + #print id, sent_value + elif score < 0: + sent_value = "neg" + #print id, sent_value + label = get_label(id) + if sent_value == label: + correct += 1 + +print "correct:", correct/len(ids) +print "positive:", positive/len(ids) \ No newline at end of file diff --git a/MPQALexicon.py b/MPQALexicon.py index 6eec7a6..a8c0600 100644 --- a/MPQALexicon.py +++ b/MPQALexicon.py @@ -1,4 +1,6 @@ -def load(): +from nltk.corpus import wordnet as wn + +def load(strong_only=False): filename = "subjclueslen1-HLTEMNLP05.tff" f = open(filename) lines = f.readlines() @@ -10,11 +12,16 @@ def load(): fields = [field for field in fields if "=" in field] #ugh, two lines have a random extra char in them d = dict([field.rstrip().split("=") for field in fields]) (word, label, pos, type) = d["word1"], d["priorpolarity"], d["pos1"], d["type"] - if pos == "adj":# and type == "strongsubj": - if label == "positive": - words.append(word) - labels.append("pos") - elif label == "negative": - words.append(word) - labels.append("neg") - return (words, labels) \ No newline at end of file + if word not in words: + if is_adjective(word): + if not (strong_only and (type != "strongsubj")): + if label == "positive": + words.append(word) + labels.append(1) + elif label == "negative": + words.append(word) + labels.append(-1) + return (words, labels) + +def is_adjective(word): + return (len(wn.synsets(word, wn.ADJ)) > 0) \ No newline at end of file diff --git a/TFIDF.py b/TFIDF.py index 84a9cfe..35039fc 100644 --- a/TFIDF.py +++ b/TFIDF.py 
@@ -22,4 +22,11 @@ def tfidf(term, document, documents): return 0 else: tfidf = (1 + math.log(doc_appearances,10)) * math.log((float(num_docs)/all_doc_appearances), 10) - return tfidf \ No newline at end of file + return tfidf + +# Martineau and Finn 2009 +def delta_tfidf(term, document, positive_set, negative_set): + return tfidf(term, document, positive_set) - tfidf(term, document, negative_set) + +def delta_tfidf_fast(term, document, positive_set, negative_set): + return tfidf(term, document, positive_set) - tfidf(term, document, negative_set) diff --git a/getAdjectives.py b/getAdjectives.py index 8b5423c..7106326 100644 --- a/getAdjectives.py +++ b/getAdjectives.py @@ -35,7 +35,7 @@ def genConj(training): conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") nor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") -f = open('words.txt', 'r+') +f = open('words2.txt', 'w') list1 = [] for word in sc.tagged_sents(): for w in word: @@ -45,5 +45,6 @@ counts = Counter(list1) d = dict(counts) for n in d: - if( d[n] >= 20): - f.write(n+" \n") \ No newline at end of file + if( d[n] >= 15): + f.write(n+" \n") +f.close() \ No newline at end of file diff --git a/review_svm.py b/review_svm.py index 918b003..26535ac 100644 --- a/review_svm.py +++ b/review_svm.py @@ -11,6 +11,7 @@ import numpy #import svmutil from sklearn.svm import SVC from sklearn.svm import LinearSVC +from TFIDF import delta_tfidf import BagOfWords import XMLParser @@ -26,22 +27,22 @@ NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"] PUNCTUATION = [".", "!", "?", ",", ";"] # These are now command line parameters! See below... +USE_DELTATFIDF = True # Martineau and Finn. Excludes some other parameters (e.g. frequency) USE_PRESENCE = False # If true, use presence rather than frequency. USE_POS_TAGS = False USE_ADJ_ONLY = False USE_NEGATION = True USE_POSITION = False GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range -NUM_FOLDS = 5 # For cross-validation (Pang & Lee used 3) +NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3) -MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4) +MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4) EPSILON = .001 # determines how long the algorithm runs (default is 0.001) NORMALIZE_BAGS = True -USE_LIBLINEAR = True # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look -CACHE_SIZE = 512 +USE_LIBLINEAR = True # This is supposedly faster for large instances -USE_AMAZON = True # Use the Amazon review set, not Pang and Lee. +USE_AMAZON = False # Use the Amazon review set, not Pang and Lee. def make_folds(documents, num_partitions): folds = [[] for i in range(num_partitions)] @@ -140,8 +141,8 @@ for i in range(len(reviews)): negative_reviews.append(reviews[i]) #TEST -positive_reviews = random.sample(positive_reviews, 1000) -negative_reviews = random.sample(negative_reviews, 1000) +positive_reviews = random.sample(positive_reviews, 250) +negative_reviews = random.sample(negative_reviews, 250) # Partition reviews into folds. 
pos_folds = make_folds(positive_reviews, NUM_FOLDS) @@ -157,10 +158,22 @@ neg_fold_bags = [[] for i in range(NUM_FOLDS)] for i in range(NUM_FOLDS): for review in pos_folds[i]: - pos_fold_bags[i].append(make_bag(review, total_word_counts)) + t3 = time.time() + if USE_DELTATFIDF: + pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts)) + else: + pos_fold_bags[i].append(make_bag(review, total_word_counts)) + t4 = time.time() + print "Bag time:", (t4-t3) for review in neg_folds[i]: - neg_fold_bags[i].append(make_bag(review, total_word_counts)) + t3 = time.time() + if USE_DELTATFIDF: + neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts)) + else: + neg_fold_bags[i].append(make_bag(review, total_word_counts)) + t4 = time.time() + print "Bag time:", (t4-t3) # Remove words with less than the minimum occurrences threshold. if MIN_OCCURRENCES > 0:
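
For reference, the short sketch below illustrates the Delta TF-IDF weighting this patch introduces (Martineau and Finin 2009), using the same log10-based TF-IDF formula as TFIDF.py: a term's weight is its TF-IDF computed against the positive training documents minus its TF-IDF against the negative ones, so class-discriminating terms get weights of large magnitude while terms equally common in both classes cancel toward zero. The helper names and toy data are illustrative only and not part of the patched modules; note that BagOfWords.make_delta_tfidf additionally L2-normalizes the resulting bag.

from __future__ import division
import math

def _tfidf(term, document, documents):
    # document is a tokenized text; documents is a list of tokenized texts.
    tf = document.count(term)
    df = sum(1 for doc in documents if term in doc)
    if tf == 0 or df == 0:
        return 0
    # Mirrors the formula in TFIDF.tfidf: (1 + log10(tf)) * log10(N / df)
    return (1 + math.log(tf, 10)) * math.log(len(documents) / df, 10)

def delta_tfidf_sketch(term, document, positive_set, negative_set):
    # TF-IDF against the positive class minus TF-IDF against the negative class.
    return _tfidf(term, document, positive_set) - _tfidf(term, document, negative_set)

if __name__ == "__main__":
    pos = [["good", "fun", "film"], ["good", "acting"], ["great", "plot"]]
    neg = [["bad", "boring", "film"], ["bad", "plot"], ["awful", "acting"]]
    review = ["good", "film", "bad", "plot"]
    for t in sorted(set(review)):
        # "good" comes out positive, "bad" negative, "film"/"plot" near zero.
        print t, delta_tfidf_sketch(t, review, pos, neg)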