From 7b5edc3acd289ec24eb5a09744bfe34765363afb Mon Sep 17 00:00:00 2001
From: Jack
Date: Tue, 26 Apr 2016 16:12:29 -0400
Subject: [PATCH] Twitter corpus; random changes from last week

---
 BagOfWords.py    |  22 +++---
 GlossLexicon.py  |   9 +--
 LexiconEval.py   |  24 ++++--
 TFIDF.py         |   2 +-
 TwitterCorpus.py |  30 ++++++++
 cblexicon.py     | 185 +++++++++++++++++++++++++----------------------
 getAdjectives.py |  72 +++++++++---------
 graph.py         |  39 +++++++---
 review_svm.py    | 110 ++++++++++++++++++----------
 9 files changed, 300 insertions(+), 193 deletions(-)
 create mode 100644 TwitterCorpus.py

diff --git a/BagOfWords.py b/BagOfWords.py
index 68c04da..b3007aa 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -14,17 +14,17 @@
 ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
 POSITION_THRESHOLDS = [0.25, 0.75, 1]
 
 # ref_bag is used to calculate the total word count across all documents.
-def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=False):
+def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=False):
     bag_of_words = {}
     if use_negation:
         do_negation = False
 
     if use_pos_tags:
-        tagged = nltk.pos_tag(words)
+        #tagged = nltk.pos_tag(words)
+        tagged = tagger.tag(words) # this is much much faster !!!
         words = [string.join(t, "_") for t in tagged]
 
     for i in range(len(words) - gram_length + 1):
         n_gram = string.join(words[i:i+gram_length], "_")
-
         if use_negation:
             if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
                 if n_gram in NEGATION_WORDS:
@@ -39,13 +39,8 @@ def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fa
                 if i/len(words) < POSITION_THRESHOLDS[j]:
                     n_gram += POSITION_TAGS[j]
                     break
-
-        # LIBSVM won't use strings as keys, so hash to convert to a number.
-        if use_hash:
-            index = hash(n_gram)
-        else:
-            index = n_gram
+        index = n_gram
         if not (use_pos_tags and use_adj_only and (tagged[i][1] not in ADJECTIVE_TAGS)):
             if (not use_presence) and bag_of_words.has_key(index):
                 bag_of_words[index] += 1
@@ -58,7 +53,8 @@ def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fa
                 ref_bag[index] += 1
             else:
                 ref_bag[index] = 1
-
+
+    #length-normalize
     if normalize:
         length = 0
         for k in bag_of_words.keys():
@@ -84,7 +80,7 @@ def make_tfidf(document, documents):
 
 # As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value.
 # Todo: Bigrams?
-def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag):
+def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag, use_pos_tags=False):
     bag = {}
     factor = 0
     for term in set(document):
@@ -112,4 +108,6 @@ def to_vector(bag, wordlist):
         else:
             vec.append(0)
     return vec
-    #return numpy.array(vec).reshape(1,-1)
\ No newline at end of file
+    #return numpy.array(vec).reshape(1,-1)
+
+tagger = nltk.tag.perceptron.PerceptronTagger()
\ No newline at end of file

diff --git a/GlossLexicon.py b/GlossLexicon.py
index 351ae1e..69fadb3 100644
--- a/GlossLexicon.py
+++ b/GlossLexicon.py
@@ -12,10 +12,10 @@
 from nltk.corpus import wordnet as wn
 
 import BagOfWords
 
-EXPAND_ITERATIONS = 3
+EXPAND_ITERATIONS = 2
 CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
-REMOVE_STOPWORDS = True
+REMOVE_STOPWORDS = False
-USE_STEMMING = True # sync this up with eval!
+USE_STEMMING = False # sync this up with eval!
 USE_EXAMPLES = True
 USE_EQUAL_TRAINING = True
 
@@ -144,14 +144,12 @@ def create(test_words, test_labels):
     test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]
     predicted_labels = classifier.predict(test_vecs)
 
-    """
     correct = 0
     for i in range(len(test_labels)):
         if test_labels[i] == predicted_labels[i]: correct += 1
     print "Lexicon accuracy:", correct/len(test_labels)
-    """
 
     word_labels = {}
     for i in range(len(test_words)):
@@ -173,7 +171,6 @@ def create(test_words, test_labels):
         lexicon[word] = 1
 
     for word in neg_words:
-        #lexicon[word] = -1
         lexicon[word] = -1
 
     return lexicon

diff --git a/LexiconEval.py b/LexiconEval.py
index afff8c2..3646fcf 100644
--- a/LexiconEval.py
+++ b/LexiconEval.py
@@ -7,13 +7,15 @@
 from nltk.corpus import movie_reviews
 import MPQALexicon
 import AniaLexicon
 import GlossLexicon
+import LexFromFile
 import XMLParser
+import TwitterCorpus
 
-USE_STEMMING = True # sync this up with lexicon!
+USE_STEMMING = False # sync this up with lexicon!
 USE_PARSING = True
 LEX_ALG = "gloss" # "gloss", "conjunction", "none"
 LEX_SOURCE = "mpqa" # "mpqa", "ania"
-CORPUS = "movies" # "amazon", "movies"
+CORPUS = "movies" # "amazon", "movies", "twitter"
 NEG_MOD = 1.5 # Taboada suggested 1.5.
 # new and improved finite state machine
@@ -115,6 +117,8 @@ try:
                 CORPUS = "movies"
             elif args[i+1] == "amazon":
                 CORPUS = "amazon"
+            elif args[i+1] == "twitter":
+                CORPUS = "twitter"
             i += 2
         elif args[i] == "--help":
             print "Usage:"
@@ -127,6 +131,7 @@ try:
             print " - ania: Use the hand-labeled lexicon from the Brown corpus"
             print "--corpus X: Choose the data set to test on"
             print " - amazon: Use the Amazon data set"
+            print " - twitter: Use the Twitter data set"
             print " - movies: Use the Pang&Lee movie data set (default)"
             exit()
         else:
@@ -142,7 +147,7 @@
 print "Corpus =", CORPUS
 # Load the test set. A few options here.
 if LEX_SOURCE == "mpqa":
-    (test_words, test_labels) = MPQALexicon.load(True)
+    (test_words, test_labels) = MPQALexicon.load(False)
 elif LEX_SOURCE == "ania":
     (test_words, test_labels) = AniaLexicon.load()
 else:
@@ -164,10 +169,12 @@ if LEX_ALG != "none":
     correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
     lex_acc = correct/len(lexicon.items())
     print "Lexicon accuracy:", lex_acc
-
+
+# TODO refactor me again.
+#lexicon = LexFromFile.lexfromfile("cblex.txt")
 for key in lexicon.keys():
     if lexicon[key] < 0: lexicon[key] *= NEG_MOD
-
+
 if CORPUS == "movies":
     ids = movie_reviews.fileids()
     reviews = [list(movie_reviews.words(fileids=[id])) for id in ids]
         labels.append(-1)
 elif CORPUS == "amazon":
     (ids, reviews, labels) = XMLParser.get_all_reviews()
+elif CORPUS == "twitter":
+    (ids, reviews, labels) = TwitterCorpus.load() #they're not reviews but we'll let it slide.
 else:
     print "Invalid corpus!"
     exit()
-
+
 """
 # It feels like there should be a more efficient way do to this.
 shuffled = zip(ids,reviews,labels)
 ids = [x[0] for x in shuffled]
 reviews = [x[1] for x in shuffled]
 labels = [x[2] for x in shuffled]
 """
+#for k in lexicon.keys():
+#    lexicon[k] *= -1
 
 # Iterate through all of the reviews and compute scores by taking the sum of their
 # component lexicon words. Includes rudimentary negation testing.
@@ -203,7 +214,6 @@ for i in range(len(reviews)):
     words = reviews[i]
     if USE_STEMMING:
         words = do_stem(words)
-
     if USE_PARSING:
         score = calculate_score(words, lexicon)
     else:

diff --git a/TFIDF.py b/TFIDF.py
index 202180c..77008ad 100644
--- a/TFIDF.py
+++ b/TFIDF.py
@@ -44,6 +44,6 @@ def tfidf(term, document, documents, idfs={}):
         tfidf = (1 + math.log(doc_appearances,10)) * idf
     return tfidf
 
-# Martineau and Finn 2009
+# Martineau and Finin 2009
 def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
     return tfidf(term, document, positive_set, pos_idfs) - tfidf(term, document, negative_set, neg_idfs)

diff --git a/TwitterCorpus.py b/TwitterCorpus.py
new file mode 100644
index 0000000..635cb5a
--- /dev/null
+++ b/TwitterCorpus.py
@@ -0,0 +1,30 @@
+import nltk
+import string
+import random
+
+def load(sample=True):
+    CONJUNCTIONS = ["and", "but", "or"]
+
+    f = open("Sentiment Analysis Dataset.csv")
+    lines = f.readlines()
+    #lines = lines[:1000]
+    f.close()
+    if sample:
+        lines = random.sample(lines, 10000)
+    ids = []
+    tweets = []
+    labels = []
+    for line in lines[1:]:
+        line = line.replace("\"", "").strip()
+        line2 = ""
+        for c in line:
+            if ord(c) < 128: line2 += c
+        terms = line2.split(",")
+        id = terms[0]
+        label = int(terms[1])
+        if label == 0: label = -1
+        tweet = terms[3]
+        ids.append(id)
+        tweets.append(nltk.word_tokenize(tweet))
+        labels.append(label)
+    return (ids, tweets, labels)
\ No newline at end of file

diff --git a/cblexicon.py b/cblexicon.py
index 0f33cb7..ef6505a 100644
--- a/cblexicon.py
+++ b/cblexicon.py
@@ -6,74 +6,77 @@
 from nltk.corpus import brown
 import random
 from nltk.stem import *
 import time
+import scipy
 from sets import Set
 
-def optimize(set1,set2,conjSet,defSet,dis):
-    i = 0
+"""
+def optimize(set1, set2, conjSet, defSet, dis):
     currentMin = 999999
-    consideredMin = calcScore(set1,set2,conjSet,dis)
+    consideredMin = calcScore(set1, set2, conjSet, dis)
     bestSwapWord = ""
     # Calculate the best word to remove until no moves lessen the function
+    i = 1
     while( currentMin > consideredMin):
         print i
-        i = i + 1
         currentMin = consideredMin
         for word in set1:
             set1.remove(word)
             set2.append(word)
-            test = calcScore(set1,set2,conjSet,dis)
+            test = calcScore(set1, set2, conjSet, dis)
             set2.remove(word)
             set1.append(word)
-            if (test < consideredMin):
+            if test < consideredMin:
                 consideredMin = test
                 bestSwapWord = word
         for word in set2:
             set2.remove(word)
             set1.append(word)
-            test = calcScore(set1,set2,conjSet,dis)
+            test = calcScore(set1, set2, conjSet, dis)
             set1.remove(word)
             set2.append(word)
-            if (test < consideredMin):
+            if test < consideredMin:
                 consideredMin = test
                 bestSwapWord = word
-        if(bestSwapWord in set1):
+        if bestSwapWord in set1:
             set1.remove(bestSwapWord)
             set2.append(bestSwapWord)
         else:
             set2.remove(bestSwapWord)
             set1.append(bestSwapWord)
+        i = i + 1
     # Return the optimized sets
-    return set1,set2
+    return set1, set2
+"""
 
-def optimize2(set1,set2,conjSet,defSet,dis):
+def optimize2(set1, set2, conjSet, defSet, dis):
     currentMin = 999999
-    consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
+    consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
     bestSwapWord = None
     # Calculate the best word to remove until no moves lessen the function
     i = 1
-    while( currentMin > consideredMin):
+    while currentMin > consideredMin:
         t1 = time.time()
         currentMin = consideredMin
-        currentS1 = calcScore(set1,conjSet,dis)
-        currentS2 = calcScore(set2,conjSet,dis)
-        consideredMin = currentS1 + currentS2 #
+        currentS1 = calcScore(set1, conjSet, dis)
+        currentS2 = calcScore(set2, conjSet, dis)
+        consideredMin = currentS1 + currentS2
         for word in set1:
-            test = calcSwap(word,set1,set2,currentS1,currentS2,conjSet,dis)
+            test = calcSwap(word, set1, set2, currentS1, currentS2, conjSet, dis)
             if (test < consideredMin):
                 consideredMin = test
                 bestSwapWord = word
         for word in set2:
-            test = calcSwap(word,set2,set1,currentS2,currentS1,conjSet,dis)
-            if (test < consideredMin):
+            test = calcSwap(word, set2, set1, currentS2, currentS1, conjSet, dis)
+            if test < consideredMin:
                 consideredMin = test
                 bestSwapWord = word
-        if(bestSwapWord in set1):
+        if bestSwapWord in set1:
             set1.remove(bestSwapWord)
             set2.append(bestSwapWord)
-        elif(bestSwapWord in set2):
+        elif bestSwapWord in set2:
             set2.remove(bestSwapWord)
             set1.append(bestSwapWord)
         t2 = time.time()
@@ -81,23 +84,23 @@ def optimize2(set1,set2,conjSet,defSet,dis):
         i += 1
 
     # Return the optimized sets
-    return set1,set2
+    return set1, set2
 
-def constraintSwap(set1,set2,conjSet,defSet,dis):
+def constraintSwap(set1, set2, conjSet, defSet, dis):
     for word in set1:
         stay = 0
         swap = 0
         for otherword in set1:
             if otherword != word:
-                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
+                cats = getDis(word, otherword)
                 stay = stay + cats
-        stay = stay * (1/(len(set1)-1))
+        stay /= (len(set1)-1)
         for otherword in set2:
             if otherword != word:
-                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
+                cats = getDis(word, otherword)
                 swap = swap + cats
-        swap = swap * (1/(len(set2)))
-        if(stay > swap):
+        swap /= len(set2)
+        if stay > swap:
             set1.remove(word)
             set2.append(word)
@@ -106,19 +109,18 @@ def constraintSwap(set1,set2,conjSet,defSet,dis):
         swap = 0
         for otherword in set2:
             if otherword != word:
-                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
-                stay = stay + cats
-        stay = stay * (1/(len(set2)-1))
+                cats = getDis(word, otherword)
+                stay += cats
+        stay /= (len(set2)-1)
         for otherword in set1:
             if otherword != word:
-                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
-                swap = swap + cats
-        swap = swap * (1/(len(set1)))
-        if(stay > swap):
+                cats = getDis(word, otherword)
+                swap += cats
+        swap /= len(set1)
+        if stay > swap:
             set2.remove(word)
             set1.append(word)
-    return set1,set2
-
+    return set1, set2
 
 def calcScore(set,conjSet,dis):
     score = 0
@@ -126,8 +128,8 @@ def calcScore(set,conjSet,dis):
         w1 = set[i]
         for j in range(i, len(set)):
             w2 = set[j]
-            cats = dis[conjSet[w1][0]][conjSet[w2][0]]
-            score = score + cats
+            cats = getDis(w1, w2)
+            score += cats
     return score / len(set)
 
 def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
     score2 = 0
     for w in currSet:
         if word != w:
-            cats = dis[conjSet[word][0]][conjSet[w][0]]
-            score1 = score1 + cats
-    currentCount = ((currentCount* len(currSet)) - score1 )/(len(currSet)-1)
+            cats = getDis(word, w)
+            score1 += cats
+    currentCount = (currentCount * len(currSet) - score1)/(len(currSet)-1)
     #for word in set2:
     for w in opSet:
         if word != w:
-            cats = dis[conjSet[word][0]][conjSet[w][0]]
-            score2 = score2 + cats
-    otherCount = ((otherCount* len(opSet)) + score2 )/(len(opSet)+1)
+            cats = getDis(word, w)
+            score2 += cats
+    otherCount = (otherCount * len(opSet) + score2)/(len(opSet)+1)
     return currentCount + otherCount
 
-def normalize_word(word):
-    return SnowballStemmer("english").stem(word)
-
-def vectorize(conjSet,defSet):
+def vectorize(conjSet, defSet):
     dis = numpy.zeros((len(defSet),len(defSet)))
-    dis.fill(.5)
+    dis.fill(0.5)
     for word in defSet:
         similar = conjSet[word][1]
         dissimilar = conjSet[word][2]
@@ -163,12 +162,10 @@ def vectorize(conjSet,defSet):
             dis[conjSet[word][0]][conjSet[d][0]] = 1
     return dis
 
-def word_feats(words):
-    return dict([(word, True) for word in words])
-
 def genSets():
-    f = open('words.txt', 'r+')
+    f = open('words.txt', 'r')
     content = f.readlines()
+    f.close()
 
     positive = Set([])
     negative = Set([])
@@ -179,53 +176,68 @@ def genSets():
         elif (current[1][0] == 'n'):
             negative.add(current[0])
 
-    return positive,negative
+    return positive, negative
 
 def getConj():
     # Set up the tuple (index, similar, dissimilar)
-    f = open('conj.txt', 'r+')
+    f = open('movieconj.txt', 'r')
     content = f.readlines()
+    f.close()
     d = dict()
     i = 0
     for line in content:
         current = line.split(' ')
+        # WTF is all this index math?
         if current[2] == "but":
            if current[0] in d:
                d[current[0]][2].add(current[1])
            else:
                d[current[0]] = (i,Set(),Set([current[1]]))
-               i = i+1
+               i += 1
            if current[1] in d:
                d[current[1]][2].add(current[0])
            else:
                d[current[1]] = (i,Set(),Set([current[0]]))
-               i = i+1
+               i += 1
        else:
            if current[0] in d:
                d[current[0]][1].add(current[1])
            else:
                d[current[0]] = (i,Set([current[1]]),Set())
-               i = i+1
+               i += 1
            if current[1] in d:
                d[current[1]][1].add(current[0])
            else:
                d[current[1]] = (i,Set([current[0]]),Set())
-               i = i+1
     return d
+               i += 1
    return d
 
-def findFrequency(set1,set2):
+def findFrequency(set1, set2):
    set1Freq = 0
    set2Freq = 0
    for word in brown.words():
-       set1Freq = (set1Freq+1) if (word in set1) else set1Freq
-       set2Freq = (set2Freq+1) if (word in set2) else set2Freq
+       set1Freq = (set1Freq + 1) if (word in set1) else set1Freq
+       set2Freq = (set2Freq + 1) if (word in set2) else set2Freq
    return set1Freq, set2Freq
 
+def getDis(a, b):
+    global dis, conjSet
+    a_index = conjSet[a][0]
+    b_index = conjSet[b][0]
+    """
+    if dis.has_key((a_index,b_index)):
+        return dis[(a_index, b_index)]
+    else:
+        return 0
+    """
+    return dis[a_index][b_index]
+
 def conjunctionData(set1,set2):
-    f = open('conj.txt', 'r+')
+    f = open('movieconj.txt', 'r+')
     content = f.readlines()
+    f.close()
     totalConj = 0
     totalbuts = 0
     correctbuts = 0
@@ -233,27 +245,27 @@ def conjunctionData(set1,set2):
     correctands = 0
     totalors = 0
     correctors = 0
-    totalnors =0
+    totalnors = 0
     correctnors = 0
     for line in content:
-        totalConj = totalConj +1
+        totalConj = totalConj + 1
         current = line.split(' ')
         if current[2] == "but":
-            totalbuts = totalbuts +1
-            if( (current[0] in set1 and current[1] in set2) or (current[0] in set2 and current[1] in set1) ):
-                correctbuts = correctbuts +1
+            totalbuts = totalbuts + 1
+            if (current[0] in set1 and current[1] in set2) or (current[0] in set2 and current[1] in set1):
+                correctbuts = correctbuts + 1
         elif current[2] == "and":
-            totalands = totalands +1
-            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
-                correctands = correctands +1
+            totalands = totalands + 1
+            if (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2):
+                correctands = correctands + 1
         elif current[2] == "or":
-            totalors = totalors +1
-            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
-                correctors = correctors +1
+            totalors = totalors + 1
+            if (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2):
+                correctors = correctors + 1
         elif current[2] == "nor":
-            totalnors = totalnors +1
-            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
-                correctnors = correctnors +1
+            totalnors = totalnors + 1
+            if (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2):
+                correctnors = correctnors + 1
     print "Total Conjunctions: %d" % totalConj
     print "Total ands: %d \n Ands in same set: %d" % (totalands,correctands)
     print "Total ors: %d \n Ors in same set: %d" % (totalors,correctors)
@@ -261,6 +273,7 @@ def conjunctionData(set1,set2):
     print "Total buts: %d \n Buts in opposite sets: %d" % (totalbuts,correctbuts)
 
 def returnCBLexicon():
+    global dis, conjSet
     # Generate positive and negative initial sets
     sets = genSets()
     positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
@@ -282,9 +295,10 @@ def returnCBLexicon():
     bestSet1 = []
     bestSet2 = []
     bestScore = 999999
-    numIterations = 10
+    numIterations = 3
     for i in range(numIterations):
-        set1 = random.sample(defSet, len(defSet)//2)
+        setsize = random.randint(len(defSet)//4, len(defSet)*3//4)
+        set1 = random.sample(defSet, setsize)
         set2 = [x for x in defSet if x not in set1]
 
         # Optimize objective function
@@ -299,12 +313,14 @@ def returnCBLexicon():
             bestScore = score
 
     #Find which set has a higher frequency in the training set
-    (set1Freq,set2Freq) = findFrequency(set1,set2)
+    #(set1Freq,set2Freq) = findFrequency(set1,set2)
-    positive = set1 if (set1Freq>set2Freq) else set2
-    negative = set1 if (set1Freq<set2Freq) else set2
+    #positive = set1 if (set1Freq>set2Freq) else set2
+    #negative = set1 if (set1Freq<set2Freq) else set2
+    positive = set1 if len(set1)>len(set2) else set2
+    negative = set2 if len(set1)>len(set2) else set1

diff --git a/getAdjectives.py b/getAdjectives.py
--- a/getAdjectives.py
+++ b/getAdjectives.py
-    return tagger.tag([word])[0][1].startswith("JJ") and len(wn.synsets(word, wn.ADJ)) > 0
+    #return tagger.tag([word])[0][1].startswith("JJ") and len(wn.synsets(word, wn.ADJ)) > 0 and word in mpqa_words
+    return word in mpqa_words
 
 def genConj():
-    conj = open('movieconj.txt', 'r+')
-    ands = open('ands.txt', 'r+')
-    ors = open('ors.txt', 'r+')
-    buts = open('buts.txt', 'r+')
-    nor = open('nor.txt', 'r+')
-    eor = open('eor.txt', 'r+')
-    j = 0
+    conj = open('movieconj.txt', 'w')
+    global tagger
+    tagger = nltk.tag.perceptron.PerceptronTagger()
+    num_lines = 0
     for review in sorted(movie_reviews.fileids()):    #For every review
-        tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[review])))
-        print j
-        j = j+1
-        for i in range(0,len(tokens)-3):
-            if ((tokens[i][1]== "JJ" or tokens[i][1] == "JJR" or tokens[i][1] == "JJS") and (tokens[i+2][1]== "JJ" or tokens[i+2][1] == "JJR" or tokens[i+2][1] == "JJS")):
-                if (tokens[i+1][0] == "and"):
-                    print tokens[i][0]
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #ands.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-                elif (tokens[i+1][0] == "or"):
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #ors.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-                elif (tokens[i+1][0] == "but"+ "\n"):
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #buts.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-                elif (tokens[i+1][0] == "either-or"):
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #eor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-                elif (tokens[i+1][0] == "neither-nor"):
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #nor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-
-
-
-
+        tokens = movie_reviews.words(fileids=[review])
+        for i in range(0,len(tokens)-2):
+            if isAdj(tokens[i]) and isAdj(tokens[i+2]):
+                line = tokens[i]+ " " + tokens[i+2] + " " + tokens[i+1] + "\n"
+                print line.strip()
+                if (tokens[i+1] == "and"):
+                    conj.write(line)
+                    num_lines += 1
+                elif (tokens[i+1] == "or"):
+                    conj.write(line)
+                    num_lines += 1
+                elif (tokens[i+1] == "but"+ "\n"):
+                    conj.write(line)
+                    num_lines += 1
+                elif (tokens[i+1] == "either-or"):
+                    conj.write(line)
+                    num_lines += 1
+                elif (tokens[i+1] == "neither-nor"):
+                    conj.write(line)
+                    num_lines += 1
+    print num_lines
+
 def doBrown():
     f = open('movieconj.txt', 'w')
     list1 = []
     for word in sc.tagged_sents():
         for w in word:
-            if(w[1] == "JJ" or w[1] == "JJR" or w[1] == "JJS" or w[1] == "JJT"):
+            if w[1].startswith("JJ"):
                 list1.append(w[0])
     counts = Counter(list1)
     d = dict(counts)
@@ -55,4 +58,5 @@ def doBrown():
         f.write(n+" \n")
     f.close()
 
+(mpqa_words, mpqa_labels) = MPQALexicon.load()
 genConj()
\ No newline at end of file

diff --git a/graph.py b/graph.py
index 04102ef..1425f71 100644
--- a/graph.py
+++ b/graph.py
@@ -10,7 +10,9 @@ labels = [
     "bigrams, frequency, +Position",
     "bigrams, presence",
     "bigrams, presence, +Position",
-    "delta_tfidf"
+    "delta_tfidf",
+    "unigrams, presence, +POS",
+    "bigrams, presence, +POS"
 ]
 labels2 = [
     "unigrams, frequency",
@@ -21,23 +23,38 @@ labels2 = [
     "bigrams, frequency, +Position",
     "bigrams, presence",
     "bigrams, presence, +Position",
-    "delta_tfidf"
+    "delta_tfidf",
+    "unigrams, presence, +POS",
+    "bigrams, presence, +POS"
 ]
-tops = numpy.arange(len(labels))
-widths = [0.826002649356, 0.784479089868, 0.842490694287, 0.821997146847, 0.807497617378, 0.777000053946, 0.820491149832, 0.795509581438, 0.981992471513]
-widths2 = [0.824624634419, 0.808376475678, 0.832750728912, 0.815374570779, 0.797876474366, 0.771876439875, 0.799001849413, 0.768376127015, 0.929999178955]
+bottoms = numpy.arange(len(labels))
+widths = [0.826002649356, 0.784479089868, 0.842490694287, 0.821997146847, 0.807497617378, 0.777000053946, 0.820491149832, 0.795509581438, 0.981992471513, 0.836989684295, 0.818997140853]
+widths2 = [0.824624634419, 0.808376475678, 0.832750728912, 0.815374570779, 0.797876474366, 0.771876439875, 0.799001849413, 0.768376127015, 0.929999178955, 0.813999695576, 0.792252879562]
 height = 0.3
-pyplot.barh(tops, widths, height, color="#FF0000")
-pyplot.barh(tops+height, widths2, height, color="#00FF00")
+pyplot.barh(bottoms, widths, height, color="#FF0000")
+pyplot.barh(bottoms-height, widths2, height, color="#00FF00")
 pyplot.legend(["Movies", "Amazon"], loc=4) # bottom right
-pyplot.yticks(tops+height, labels)
+pyplot.yticks(bottoms, labels)
 pyplot.xlim(0.5, 1.0)
-pyplot.ylim(tops[0]-2*height, tops[-1]+3*height)
+#pyplot.ylim(tops[0]-2*height, bottoms[-1]+3*height)
 pyplot.show()
 
 """
+# TODO: Use POS tags on Amazon dataset.
 gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865
 gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402
-gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193
-gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513
+gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position:False 0.77899606193
+gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position:True 0.762512512513
+gram_length: 1, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position:False 0.836989684295
+gram_length: 1, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position:True 0.814993136849
+gram_length: 1, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.769011526497
+gram_length: 1, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.749997002991
+gram_length: 2, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.800000599402
+gram_length: 2, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.762980045914
+gram_length: 2, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position:False 0.707506908106
+gram_length: 2, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position:True 0.669494344644
+gram_length: 2, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position:False 0.818997140853
+gram_length: 2, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position:True 0.78900607194
+gram_length: 2, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.714507921095
+gram_length: 2, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.668990847135
 """
\ No newline at end of file

diff --git a/review_svm.py b/review_svm.py
index ceae890..3920aef 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -9,18 +9,17 @@
 from nltk.corpus import movie_reviews
 import numpy
 from sklearn.svm import SVC
 from sklearn.svm import LinearSVC
-from TFIDF import delta_tfidf, compute_idfs
 
 import BagOfWords
 import XMLParser
+import TwitterCorpus
+from TFIDF import delta_tfidf, compute_idfs
 
-# Program to classify the movie review dataset using a support vector machine
-# (via LIBSVM), following Pang and Lee (2002).
+# Program to classify the movie review dataset using a support vector machine, following Pang and Lee (2002).
 
 # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
 # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
-# TODO make this a parameter
 NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
 PUNCTUATION = [".", "!", "?", ",", ";"]
 
@@ -32,15 +31,15 @@
 USE_ADJ_ONLY = False
 USE_NEGATION = True
 USE_POSITION = False
 GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
-NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finn used 10)
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finin used 10)
 
-MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
+MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
 EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
 NORMALIZE_BAGS = True
 USE_LIBLINEAR = True # This is supposedly faster for large instances
-USE_AMAZON = False # Use the Amazon review set, not Pang and Lee.
+CORPUS = "movies" # "twitter", "amazon", "movies"
 USE_DELTA_TFIDF = False
 
 def make_folds(documents, ids, num_partitions):
@@ -64,7 +63,7 @@ def from_command_line():
     use_negation = USE_NEGATION
     use_position = USE_POSITION
     min_occurrences = MIN_OCCURRENCES
-    use_amazon = USE_AMAZON
+    corpus = CORPUS
     try:
         args = sys.argv[1:]
         while i < len(args):
@@ -98,9 +97,9 @@ def from_command_line():
             elif args[i] == "--threshold":
                 min_occurrences = int(args[i+1])
                 i += 2
-            elif args[i] == "--use-amazon":
-                use_amazon = True
-                i += 1
+            elif args[i] == "--corpus":
+                corpus = args[i+1]
+                i += 2
             elif args[i] == "--use-delta":
                 use_delta = True
                 i += 1
@@ -116,22 +115,22 @@ def from_command_line():
                 print "--use-position\t\tTag words according to their position in the text (Default: Off)"
                 print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
                 print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
-                print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)"
+                print "--corpus\t\tSelect a corpus to evaluate. (amazon, movies, twitter) (Default: movies)"
                 print "--use-delta\t\tUse Delta TFIDF. (Default: Off)"
                 exit()
             else:
                 print "Error: Invalid argument", args[i]
                 i += 1
-        classify_reviews(gram_length, num_folds, use_presence, use_negation, use_pos_tags, use_adj_only, min_occurrences, use_amazon, use_delta)
+        classify_reviews(gram_length, num_folds, use_presence, use_negation, use_pos_tags, use_adj_only, min_occurrences, corpus, use_delta)
     except Exception:
         print "Invalid arguments"
-
+
 def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=USE_PRESENCE, use_negation=USE_NEGATION, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
-                     use_position = USE_POSITION, min_occurrences=MIN_OCCURRENCES, use_amazon=USE_AMAZON, use_delta=USE_DELTA_TFIDF):
+                     use_position = USE_POSITION, min_occurrences=MIN_OCCURRENCES, corpus=CORPUS, use_delta=USE_DELTA_TFIDF, skew=(1,1)):
     positive_ids = []
     negative_ids = []
-    if use_amazon:
+    if corpus == "amazon":
         # Load the mixed Amazon review dataset.
         (ids, reviews, labels) = XMLParser.get_all_reviews()
         for i in range(len(ids)):
@@ -139,7 +138,7 @@ def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=
                 positive_ids.append(ids[i])
             elif labels[i] == -1:
                 negative_ids.append(ids[i])
-    else:
+    elif corpus == "movies":
         # Load the Pang and Lee sentiment dataset.
         ids = movie_reviews.fileids()
         reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
@@ -152,6 +151,13 @@ def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=
             elif label == 'neg':
                 labels.append(-1)
                 negative_ids.append(id)
+    elif corpus == "twitter":
+        (ids, reviews, labels) = TwitterCorpus.load()
+        for i in range(len(ids)):
+            if labels[i] == 1:
+                positive_ids.append(ids[i])
+            elif labels[i] == -1:
+                negative_ids.append(ids[i])
 
     positive_reviews = []
     negative_reviews = []
@@ -162,11 +168,10 @@ def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=
         elif labels[i] == -1:
             negative_reviews.append(reviews[i])
 
-    #TEST
-    #positive_reviews = positive_reviews[:200]
-    #negative_reviews = negative_reviews[:600]
-    #positive_reviews = random.sample(positive_reviews, 1000)
-    #negative_reviews = random.sample(negative_reviews, 1000)
+    num_pos = int(len(positive_reviews) * skew[0])
+    num_neg = int(len(negative_reviews) * skew[1])
+    positive_reviews = random.sample(positive_reviews, num_pos)
+    negative_reviews = random.sample(negative_reviews, num_neg)
 
     # Partition reviews into folds.
     (pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, num_folds)
@@ -262,34 +267,65 @@ def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=
 
 def run_configs():
     min_occurrences = 4
     use_negation = True
-    use_delta = False
-    use_pos_tags = False
-    use_adj_only = False
     labels = []
     accs = []
-    for use_amazon in [False, True]:
-        for gram_length in [1,2]:
-            for use_presence in [False, True]:
-                for (use_pos_tags, use_adj_only) in [(True, False), (True, True)]:
-                    for use_position in [False, True]:
+    #for corpus in ["movies", "amazon", "twitter"]:
+    for corpus in ["amazon", "twitter"]:
+        for use_position in [False, True]:
+            for (use_pos_tags, use_adj_only) in [(False, False), (True, False), (True, True)]:
+                for gram_length in [1,2]:
+                    for use_presence in [False, True]:
                         params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only,
-                                  'use_position':use_position, 'use_amazon':use_amazon, 'min_occurrences':min_occurrences, 'use_delta':False}
+                                  'use_position':use_position, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False}
                         acc = classify_reviews(**params)
-                        label = "gram_length: %d, use_presence: %s, use_amazon: %s, use_pos_tags: %s, use_adj_only: %s, use_position: %s" % (gram_length, use_presence, use_amazon, use_pos_tags, use_adj_only, use_position)
+                        label = "gram_length: %d, use_presence: %s, corpus: %s, use_pos_tags: %s, use_adj_only: %s, use_position: %s" % (gram_length, use_presence, corpus, use_pos_tags, use_adj_only, use_position)
                         print label, acc
                         labels.append(label)
                         accs.append(acc)
 
         # Delta-TFIDF construction doesn't support all parameters (yet).
-        params = {'use_amazon':use_amazon, 'use_delta':True}
+        params = {'corpus':corpus, 'use_delta':True}
         acc = classify_reviews(**params)
-        label = "delta_tfidf: True, use_amazon: %s" % use_amazon
+        label = "delta_tfidf: True, corpus: %s" % corpus
         print label, acc
         labels.append(label)
         accs.append(acc)
     return (labels, accs)
+
+def run_skewed():
+    min_occurrences = 4
+    use_negation = True
+    use_delta = False
+    use_pos_tags = False
+    use_adj_only = False
+    use_position = False
+    use_presence = True
+    labels = []
+    accs = []
+    for corpus in ["movies", "amazon"]:
+        for skew in [(0.2,1), (0.4,1), (0.6,1), (0.8, 1), (1,0.8), (1,0.6), (1,0.4), (1,0.2)]:
+            for gram_length in [1,2]:
+                params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only,
+                          'use_position':use_position, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False, 'skew': skew}
+
+                acc = classify_reviews(**params)
+                label = "corpus: %s, gram_length: %d, skew: (%f, %f)" % (corpus, gram_length, skew[0], skew[1])
+
+                print label, acc
+                labels.append(label)
+                accs.append(acc)
+
+            params = {'gram_length':1, 'use_presence':False, 'use_pos_tags':False, 'use_adj_only':False,
+                      'use_position':False, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False, 'skew': skew}
+
+            acc = classify_reviews(**params)
+            label = "corpus: %s, delta_tfidf: True, skew: (%f, %f)" % (corpus, skew[0], skew[1])
+            print label, acc
+            labels.append(label)
+            accs.append(acc)
 
-(labels, accs) = run_configs()
-f = open('SVM_RESULTS.txt', 'w')
+#(labels, accs) = run_configs()
+(labels, accs) = run_skewed()
+f = open('SVM_RESULTS_SKEW.txt', 'w')
 for (label, acc) in zip(labels, accs):
     f.write("%s\t%s\n" % (label, acc))
-f.close()
+f.close()
\ No newline at end of file
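
The patch's main addition is TwitterCorpus.load(), which returns (ids, tokenized_tweets, labels) with labels in {-1, 1} and, by default, a 10,000-line random sample of "Sentiment Analysis Dataset.csv". Below is a minimal sketch of how a caller might sanity-check the loader before wiring it into LexiconEval or review_svm; the class-balance and majority-class report is illustrative and not part of the patch.

# Sketch: load the sampled Twitter corpus and report its class balance (Python 2, like the rest of the repo).
# Assumes TwitterCorpus.py (added above) and its CSV file sit in the working directory.
import TwitterCorpus

(ids, tweets, labels) = TwitterCorpus.load(sample=True)
num_pos = len([label for label in labels if label == 1])
num_neg = len([label for label in labels if label == -1])
print "tweets loaded:", len(tweets)
print "positive:", num_pos, "negative:", num_neg
# Majority-class rate: a floor that the SVM and lexicon accuracies should beat.
print "majority-class baseline:", max(num_pos, num_neg) / float(len(labels))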
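The other behavioural change in review_svm.py is the skew parameter: run_skewed() passes skew=(pos_fraction, neg_fraction) into classify_reviews, which keeps int(len(positive_reviews) * skew[0]) positives and int(len(negative_reviews) * skew[1]) negatives via random.sample. A standalone illustration of that subsampling step follows; the function and variable names here are hypothetical, not from the repo.

import random

def subsample_by_skew(positive_reviews, negative_reviews, skew=(1, 1)):
    # skew[0] / skew[1] are the fractions of each class to keep, mirroring classify_reviews.
    num_pos = int(len(positive_reviews) * skew[0])
    num_neg = int(len(negative_reviews) * skew[1])
    return (random.sample(positive_reviews, num_pos),
            random.sample(negative_reviews, num_neg))

# Example: keep 20% of the positives and all negatives, as in the (0.2, 1) setting.
(pos, neg) = subsample_by_skew(range(1000), range(1000), skew=(0.2, 1))
print len(pos), len(neg)   # 200 1000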