Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
matplotlib; eval tweaks; comparison
  • Loading branch information
job13011 committed Apr 16, 2016
1 parent 6d16225 commit cf6576f
Show file tree
Hide file tree
Showing 4 changed files with 333 additions and 232 deletions.
10 changes: 4 additions & 6 deletions GlossLexicon.py
Expand Up @@ -11,13 +11,11 @@ import nltk
from nltk.corpus import wordnet as wn

import BagOfWords
import MPQALexicon
import AniaLexicon

EXPAND_ITERATIONS = 2
EXPAND_ITERATIONS = 3
CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
REMOVE_STOPWORDS = False
USE_STEMMING = False
REMOVE_STOPWORDS = True
USE_STEMMING = True # sync this up with eval!
USE_EXAMPLES = True

USE_EQUAL_TRAINING = True
Expand Down Expand Up @@ -90,7 +88,7 @@ def expand_sets(positive, negative, neutral):
return (newPositive, newNegative, newNeutral)

def do_stem(text):
    """Return *text* with every token reduced to its Porter stem.

    text: an iterable of word tokens (strings).
    Returns a new list; the input is not modified.
    """
    # Keep the stemmer local: the old `global stemmer` rebinding mutated
    # module-level state on every call, a side effect no caller used.
    stemmer = nltk.stem.porter.PorterStemmer()
    return [stemmer.stem(word) for word in text]

def create(test_words, test_labels):
Expand Down
80 changes: 58 additions & 22 deletions LexiconEval.py
Expand Up @@ -7,11 +7,14 @@ from nltk.corpus import movie_reviews
import MPQALexicon
import AniaLexicon
import GlossLexicon
import XMLParser

USE_STEMMING = False
USE_STEMMING = True # sync this up with lexicon!
USE_PARSING = True
LEX_ALG = "gloss"
LEX_SOURCE = "mpqa"
LEX_ALG = "gloss" # "gloss", "conjunction", "none"
LEX_SOURCE = "mpqa" # "mpqa", "ania"
CORPUS = "movies" # "amazon", "movies"
NEG_MOD = 1.5 # Taboada suggested 1.5.

# new and improved finite state machine
# kinda-sorta based on Taboada 2011.
Expand All @@ -36,21 +39,17 @@ def calculate_score(text, lexicon):
num_neg = 0
num_halfneg = 0
for word in text:
if lexicon.has_key(word):
word_score = lexicon[word]
# EXPERIMENTAL
if word_score < 0: word_score *= 1.5
if state == 0:
if lexicon.has_key(word):
score += word_score
score += lexicon[word]
num_single += 1
elif word in negators:
state = 1
elif word in intensifiers:
state = 2
elif state == 1:
if lexicon.has_key(word):
score += -1 * word_score
score += -1 * lexicon[word]
num_neg += 1
state = 0
elif word in intensifiers:
Expand All @@ -59,7 +58,7 @@ def calculate_score(text, lexicon):
state = 0
elif state == 2:
if lexicon.has_key(word):
score += 2 * word_score
score += 2 * lexicon[word]
num_double += 1
state = 0
else:
Expand All @@ -68,7 +67,7 @@ def calculate_score(text, lexicon):
pass #TODO
elif state == 4:
if lexicon.has_key(word):
score += -0.5 * word_score
score += -0.5 * lexicon[word]
num_halfneg += 1
state = 0
else:
Expand All @@ -79,9 +78,6 @@ def calculate_score(text, lexicon):
def do_stem(text):
    """Return *text* with each token replaced by its Porter stem.

    Relies on the module-level `stemmer` instance; *text* is an iterable
    of word tokens (strings) and a new list is returned.
    """
    # `global stemmer` was dropped: the declaration is only needed to
    # *rebind* a module-level name, not to read it, so it was dead weight.
    return [stemmer.stem(word) for word in text]

def get_label(id):
    """Return the sentiment category ('pos'/'neg') recorded for a movie-review fileid."""
    categories = movie_reviews.categories(fileids=[id])
    return categories[0]

# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
def create_lexicon(words, labels):
Expand Down Expand Up @@ -114,14 +110,24 @@ try:
else:
print "Invalid lexicon"
i += 2
elif args[i] == "--corpus":
if args[i+1] == "movies":
CORPUS = "movies"
elif args[i+1] == "amazon":
CORPUS = "amazon"
i += 2
elif args[i] == "--help":
print "Usage:"
print "--alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)"
print "--algorithm|alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)"
print " - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)"
print " - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)"
print "--lexicon X: Choose the lexicon to use ('mpqa', 'ania' or 'none')"
print " - none: Use the input lexicon as is"
print "--lexicon|lex X: Choose the lexicon to use ('mpqa', 'ania' or 'none')"
print " - mpqa: Use the MPQA lexicon"
print " - ania: Use the hand-labeled lexicon from the Brown corpus"
print "--corpus X: Choose the data set to test on"
print " - amazon: Use the Amazon data set"
print " - movies: Use the Pang&Lee movie data set (default)"
exit()
else:
print "Error: Invalid argument", args[i]
Expand All @@ -132,6 +138,7 @@ except Exception:

print "Lexicon =", LEX_SOURCE
print "Algorithm =", LEX_ALG
print "Corpus =", CORPUS

# Load the test set. A few options here.
if LEX_SOURCE == "mpqa":
Expand All @@ -158,17 +165,45 @@ if LEX_ALG != "none":
lex_acc = correct/len(lexicon.items())
print "Lexicon accuracy:", lex_acc

for key in lexicon.keys():
if lexicon[key] < 0: lexicon[key] *= NEG_MOD

if CORPUS == "movies":
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(fileids=[id])) for id in ids]
labels = []
for id in ids:
label = movie_reviews.categories(id)[0]
if label == 'pos':
labels.append(1)
elif label == 'neg':
labels.append(-1)
elif CORPUS == "amazon":
(ids, reviews, labels) = XMLParser.get_all_reviews()
else:
print "Invalid corpus!"
exit()

"""
# It feels like there should be a more efficient way to do this.
shuffled = zip(ids,reviews,labels)
shuffled = shuffled[:20]
ids = [x[0] for x in shuffled]
reviews = [x[1] for x in shuffled]
labels = [x[2] for x in shuffled]
"""

# Iterate through all of the reviews and compute scores by taking the sum of their
# component lexicon words. Includes rudimentary negation testing.
correct = 0
positive = 0
ids = sorted(movie_reviews.fileids())
scores = []

for id in ids:
words = list(movie_reviews.words(fileids=[id]))
for i in range(len(reviews)):
words = reviews[i]
if USE_STEMMING:
words = do_stem(words)

if USE_PARSING:
score = calculate_score(words, lexicon)
else:
Expand All @@ -182,14 +217,15 @@ for id in ids:
for i in range(len(ids)):
id = ids[i]
score = scores[i]
label = labels[i]
if score >= 0:
sent_value = "pos"
sent_value = 1
positive += 1
#print id, sent_value
elif score < 0:
sent_value = "neg"
sent_value = -1
#print id, sent_value
label = get_label(id)
if sent_value == label:
correct += 1

Expand Down
43 changes: 43 additions & 0 deletions graph.py
@@ -0,0 +1,43 @@
import numpy
from matplotlib import pyplot

# Horizontal bar chart comparing classifier accuracy per feature
# configuration on two corpora (Movies vs. Amazon).
#
# NOTE(review): the original file defined a second list `labels2` that was a
# byte-for-byte duplicate of `labels` and was never referenced; it has been
# removed as dead code.
labels = [
    "unigrams, frequency",
    "unigrams, frequency, +Position",
    "unigrams, presence",
    "unigrams, presence, +Position",
    "bigrams, frequency",
    "bigrams, frequency, +Position",
    "bigrams, presence",
    "bigrams, presence, +Position",
    "delta_tfidf"
]
tops = numpy.arange(len(labels))
# Accuracy on the Pang & Lee movie data set, one entry per label.
widths = [0.826002649356, 0.784479089868, 0.842490694287, 0.821997146847, 0.807497617378, 0.777000053946, 0.820491149832, 0.795509581438, 0.981992471513]
# Accuracy on the Amazon data set, one entry per label.
widths2 = [0.824624634419, 0.808376475678, 0.832750728912, 0.815374570779, 0.797876474366, 0.771876439875, 0.799001849413, 0.768376127015, 0.929999178955]
height = 0.3  # bar thickness; the two series are offset by one bar height
pyplot.barh(tops, widths, height, color="#FF0000")
pyplot.barh(tops + height, widths2, height, color="#00FF00")
pyplot.legend(["Movies", "Amazon"], loc=4)  # loc=4: bottom right
pyplot.yticks(tops + height, labels)  # tick between each pair of bars
pyplot.xlim(0.5, 1.0)  # accuracies all exceed 0.5; zoom in on the useful range
pyplot.ylim(tops[0] - 2 * height, tops[-1] + 3 * height)
pyplot.show()

"""
gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865
gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402
gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193
gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513
"""

0 comments on commit cf6576f

Please sign in to comment.