refactored lexicon

job13011 · Apr 12, 2016 · ae237e3 · ae237e3
1 parent 1a5c75f
commit ae237e3
Showing 1 changed file with 181 additions and 0 deletions.
diff --git a/LexiconEval.py b/LexiconEval.py
@@ -0,0 +1,181 @@
+from __future__ import division
+import sys
+
+import nltk
+from nltk.corpus import movie_reviews
+
+import MPQALexicon
+import AniaLexicon
+import GlossLexicon
+
+USE_STEMMING = False
+USE_PARSING = True
+LEX_ALG = "gloss"
+LEX_SOURCE = "mpqa"
+
+# Negation- and intensifier-aware scoring, implemented as a small finite
+# state machine over the token stream.  States are as follows:
+# 0 - base
+# 1 - negator found
+# 2 - intensifier found
+# 3 - un-intensifier found (reserved; not implemented, never entered)
+# 4 - negator + intensifier found
+def calculate_score(text, lexicon):
+  """Return the summed sentiment score of the tokens in `text`.
+
+  text    -- iterable of word tokens.
+  lexicon -- mapping from word to a numeric polarity value.
+
+  A lexicon word contributes its value multiplied by a factor that depends
+  on the modifier(s) seen immediately before it:
+    (none)                 ->  1
+    negator                -> -1
+    intensifier            ->  2
+    negator + intensifier  -> -0.5
+  A modifier only affects the very next token(s): any token that is neither
+  a lexicon word nor a continuation of the modifier sequence resets the
+  machine to the base state and is otherwise ignored.
+
+  When the module-level USE_STEMMING flag is set, the modifier word lists
+  are stemmed with do_stem() so they match already-stemmed input.
+  """
+  negators = ["not", "n't", "hardly", "barely"]
+  intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
+  if USE_STEMMING:
+    negators = do_stem(negators)
+    intensifiers = do_stem(intensifiers)
+  # Sets give constant-time membership tests inside the token loop.
+  negators = frozenset(negators)
+  intensifiers = frozenset(intensifiers)
+
+  # Factor applied to a lexicon word found while in each modifier state.
+  multipliers = {1: -1, 2: 2, 4: -0.5}
+
+  state = 0
+  score = 0
+  for word in text:
+    if state == 0:
+      # The lexicon is checked first, so a word that is both a lexicon
+      # entry and a modifier is scored rather than treated as a modifier.
+      if word in lexicon:
+        score += lexicon[word]
+      elif word in negators:
+        state = 1
+      elif word in intensifiers:
+        state = 2
+    else:
+      if word in lexicon:
+        score += multipliers[state] * lexicon[word]
+        state = 0
+      elif state == 1 and word in intensifiers:
+        # "not very X": only a negator may be extended by an intensifier.
+        state = 4
+      else:
+        state = 0
+  return score
+
+def do_stem(text):
+  """Return a list holding the stem of every word in `text`, in order.
+
+  Uses the module-level `stemmer` object, which the script only creates
+  when USE_STEMMING is enabled.
+  """
+  stemmed = []
+  for token in text:
+    stemmed.append(stemmer.stem(token))
+  return stemmed
+
+def get_label(id):
+  """Return the first category movie_reviews reports for the file `id`."""
+  labels = movie_reviews.categories(fileids=[id])
+  return labels[0]
+
+# Parse the command line, overriding the LEX_ALG / LEX_SOURCE defaults.
+# Options that take a value consume two entries of args; an unrecognised
+# value is reported and the current setting is kept.
+i = 0
+args = sys.argv[1:]
+try:
+  while i < len(args):
+    if args[i] in ["--alg", "--algorithm"]:
+      if args[i+1] in ["gloss", "conjunction"]:
+        LEX_ALG = args[i+1]
+      else:
+        print "Invalid algorithm"
+      i += 2
+    elif args[i] in ["--lex", "--lexicon"]:
+      if args[i+1] in ["mpqa", "ania"]:
+        LEX_SOURCE = args[i+1]
+      else:
+        print "Invalid lexicon"
+      i += 2
+    elif args[i] == "--help":
+      # The help text lists only the values this parser actually accepts.
+      print "Usage:"
+      print "--alg X: Choose the algorithm to use ('gloss' or 'conjunction') (default: gloss)"
+      print "  - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)"
+      print "  - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)"
+      print "--lexicon X: Choose the lexicon to use ('mpqa' or 'ania') (default: mpqa)"
+      print "  - mpqa: Use the MPQA lexicon"
+      print "  - ania: Use the hand-labeled lexicon from the Brown corpus"
+      sys.exit(0)
+    else:
+      print "Error: Invalid argument", args[i]
+      i += 1
+except IndexError:
+  # Raised by args[i+1] when an option is given without its value.
+  print "Invalid arguments"
+  sys.exit(1)
+
+print "Lexicon =", LEX_SOURCE
+print "Algorithm =", LEX_ALG
+
+# Load the test set (parallel sequences of words and their labels) from the
+# selected source.
+if LEX_SOURCE == "mpqa":
+  (test_words, test_labels) = MPQALexicon.load(True)
+elif LEX_SOURCE == "ania":
+  (test_words, test_labels) = AniaLexicon.load()
+else:
+  print "Invalid lexicon"
+  sys.exit(1)
+
+if USE_STEMMING:
+  # do_stem() reads this module-level stemmer.
+  stemmer = nltk.stem.porter.PorterStemmer()
+  test_words = do_stem(test_words)
+
+# Build the lexicon with the selected algorithm.  Every branch that cannot
+# produce a lexicon exits, so `lexicon` is always bound below.
+if LEX_ALG == "gloss":
+  lexicon = GlossLexicon.create(test_words, test_labels)
+elif LEX_ALG == "conjunction":
+  print "Error: Conjunction algorithm NYI"
+  sys.exit(1)
+elif LEX_ALG == "none":
+  # NOTE(review): create_lexicon is neither defined nor imported in this
+  # file, so this branch raises NameError as written -- confirm where it
+  # is supposed to come from.
+  lexicon = create_lexicon(test_words, test_labels)
+else:
+  print "Invalid algorithm"
+  sys.exit(1)
+
+# Lexicon accuracy: the number of test words whose lexicon value equals
+# their label, divided by the number of lexicon entries.
+correct = sum(1 for (word, label) in zip(test_words, test_labels)
+              if word in lexicon and label == lexicon[word])
+# Guard against an empty lexicon, which would otherwise divide by zero.
+lex_acc = correct/len(lexicon) if len(lexicon) > 0 else 0.0
+print "Lexicon accuracy:", lex_acc
+
+# Iterate through all of the reviews and compute scores by taking the sum of their
+# component lexicon words.  With USE_PARSING set, calculate_score() also applies
+# rudimentary negation/intensifier handling; otherwise the values are summed as-is.
+correct = 0
+positive = 0
+ids = sorted(movie_reviews.fileids())
+scores = []
+
+for file_id in ids:
+  words = list(movie_reviews.words(fileids=[file_id]))
+  if USE_STEMMING:
+    words = do_stem(words)
+  if USE_PARSING:
+    score = calculate_score(words, lexicon)
+  else:
+    score = sum(lexicon[word] for word in words if word in lexicon)
+  scores.append(score)
+
+# Classify each review from the sign of its score and compare with the
+# corpus label.  A score of exactly 0 is counted as positive.
+for (file_id, score) in zip(ids, scores):
+  if score >= 0:
+    sent_value = "pos"
+    positive += 1
+  else:
+    sent_value = "neg"
+  if sent_value == get_label(file_id):
+    correct += 1
+
+# Fraction classified correctly, and fraction classified as positive.
+print "correct:", correct/len(ids)
+print "positive:", positive/len(ids)