# LexiconEval.py

from __future__ import division
import sys

import nltk
from nltk.corpus import movie_reviews

import MPQALexicon
import AniaLexicon
import GlossLexicon
import XMLParser

USE_STEMMING = True # sync this up with lexicon!
USE_PARSING = True
LEX_ALG = "gloss"   # "gloss", "conjunction", "none"
LEX_SOURCE = "mpqa" # "mpqa", "ania"
CORPUS = "movies"   # "amazon", "movies"
NEG_MOD = 1.5       # Taboada suggested 1.5.

# new and improved finite state machine
# kinda-sorta based on Taboada 2011.
# states are as follows:
# 0 - base
# 1 - negator found
# 2 - intensifier found
# 3 - un-intensifier found (unused)
# 4 - negator + intensifier found
def calculate_score(text, lexicon):
  """Sum lexicon scores over `text`, adjusting for negators and intensifiers.

  A small finite state machine walks the tokens in order:
    0 - base: a lexicon word adds its score; a negator moves to state 1,
        an intensifier to state 2.
    1 - negator seen: a lexicon word adds -1 * score; an intensifier moves
        to state 4; anything else returns to state 0.
    2 - intensifier seen: a lexicon word adds 2 * score; then back to 0.
    4 - negator + intensifier seen: a lexicon word adds -0.5 * score; then
        back to 0.
  (State 3, "un-intensifier", is reserved but never entered.)

  text    -- sequence of tokens; expected to be stemmed already when
             USE_STEMMING is on, since the cue words are stemmed here.
  lexicon -- mapping from word to numeric sentiment score.

  Returns the accumulated score (0 for empty text).
  """
  negators = ["not", "n't", "hardly", "barely"]
  intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
  if USE_STEMMING:
    # Stem the cue words the same way as the text so they can match.
    negators = do_stem(negators)
    intensifiers = do_stem(intensifiers)
  negators = frozenset(negators)
  intensifiers = frozenset(intensifiers)

  state = 0
  score = 0
  for word in text:
    if state == 0:
      # Lexicon membership takes priority over the cue-word lists.
      if word in lexicon:
        score += lexicon[word]
      elif word in negators:
        state = 1
      elif word in intensifiers:
        state = 2
    elif state == 1:
      if word in lexicon:
        score += -1 * lexicon[word]
        state = 0
      elif word in intensifiers:
        state = 4
      else:
        # The negator only reaches the immediately following token.
        state = 0
    elif state == 2:
      if word in lexicon:
        score += 2 * lexicon[word]
      state = 0
    elif state == 4:
      if word in lexicon:
        score += -0.5 * lexicon[word]
      state = 0
  return score

def do_stem(text):
  """Return a new list holding the stemmed form of every token in `text`.

  Uses the module-level `stemmer`, which must have been assigned before
  this is called with a non-empty sequence.
  """
  stemmed = []
  for token in text:
    stemmed.append(stemmer.stem(token))
  return stemmed

# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
def create_lexicon(words, labels):
  """Build a word -> label dict by pairing words[k] with labels[k].

  A word that occurs more than once keeps the label of its last occurrence.
  Extra trailing labels are ignored; an IndexError is raised when `labels`
  is shorter than `words`.
  """
  mapping = {}
  for position, entry in enumerate(words):
    mapping[entry] = labels[position]
  return mapping
  
i = 0
try:
  args = sys.argv[1:]
  while i < len(args):
    if args[i] in ["--alg", "--algorithm"]:
      if args[i+1] == "gloss":
        LEX_ALG = "gloss"
      elif args[i+1] == "conjunction":
        LEX_ALG = "conjunction"
      elif args[i+1] == "none":
        LEX_ALG = "none"
      else:
        print "Invalid algorithm"
      i += 2
    elif args[i] in ["--lex", "--lexicon"]:
      if args[i+1] == "mpqa":
        LEX_SOURCE = "mpqa"
      elif args[i+1] == "ania":
        LEX_SOURCE = "ania"
      else:
        print "Invalid lexicon"
      i += 2
    elif args[i] == "--corpus":
      if args[i+1] == "movies":
        CORPUS = "movies"
      elif args[i+1] == "amazon":
        CORPUS = "amazon"
      i += 2
    elif args[i] == "--help":
      print "Usage:"
      print "--algorithm|alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)"
      print "  - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)"
      print "  - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)"
      print "  - none: Use the input lexicon as is"
      print "--lexicon|lex X: Choose the lexicon to use ('mpqa', 'ania' or 'none')"
      print "  - mpqa: Use the MPQA lexicon"
      print "  - ania: Use the hand-labeled lexicon from the Brown corpus"
      print "--corpus X: Choose the data set to test on"
      print "  - amazon: Use the Amazon data set"
      print "  - movies: Use the Pang&Lee movie data set (default)"
      exit()
    else:
      print "Error: Invalid argument", args[i]
      i += 1
except Exception:
  print "Invalid arguments"
  exit()
  
print "Lexicon =", LEX_SOURCE
print "Algorithm =", LEX_ALG
print "Corpus =", CORPUS

# Load the test set. A few options here.
if LEX_SOURCE == "mpqa":
  (test_words, test_labels) = MPQALexicon.load(True)
elif LEX_SOURCE == "ania":
  (test_words, test_labels) = AniaLexicon.load()
else:
  print "Invalid lexicon"
  exit()
  
if USE_STEMMING:
  stemmer = nltk.stem.porter.PorterStemmer()
  test_words = do_stem(test_words)

# Build the lexicon with the selected algorithm.
if LEX_ALG == "gloss":
  lexicon = GlossLexicon.create(test_words, test_labels)
elif LEX_ALG == "conjunction":
  # Not implemented: stop here, otherwise the code below would fail with a
  # NameError because `lexicon` is never assigned.
  print "Error: Conjunction algorithm NYI"
  exit()
elif LEX_ALG == "none":
  lexicon = create_lexicon(test_words, test_labels)

# Report how many test words the generated lexicon labels identically,
# relative to the lexicon size. Skipped for "none", where the lexicon is
# the test set itself.
if LEX_ALG != "none":
  correct = sum(1 for (word, label) in zip(test_words, test_labels)
                if word in lexicon and label == lexicon[word])
  lex_acc = correct/len(lexicon)
  print "Lexicon accuracy:", lex_acc

# Weight negative entries more heavily (NEG_MOD).
for key in lexicon.keys():
  if lexicon[key] < 0: lexicon[key] *= NEG_MOD
  
if CORPUS == "movies":
  ids = movie_reviews.fileids()
  reviews = [list(movie_reviews.words(fileids=[id])) for id in ids]
  labels = []
  for id in ids:
    label = movie_reviews.categories(id)[0]
    if label == 'pos':
      labels.append(1)
    elif label == 'neg': 
      labels.append(-1)
elif CORPUS == "amazon":
  (ids, reviews, labels) = XMLParser.get_all_reviews()
else:
  print "Invalid corpus!"
  exit()
  
"""
# It feels like there should be a more efficient way to do this.
shuffled = zip(ids,reviews,labels)
shuffled = shuffled[:20]  
ids = [x[0] for x in shuffled]
reviews = [x[1] for x in shuffled]
labels = [x[2] for x in shuffled]
"""

# Iterate through all of the reviews and compute scores by taking the sum of their
# component lexicon words.  Includes rudimentary negation testing.
correct = 0
positive = 0
scores = []

for i in range(len(reviews)):
  words = reviews[i]
  if USE_STEMMING:
    words = do_stem(words)
    
  if USE_PARSING:
    score = calculate_score(words, lexicon)
  else:
    score = 0
    for word in words:
      if lexicon.has_key(word):
        score += lexicon[word]
  scores.append(score)
  #print id, score

for i in range(len(ids)):
  id = ids[i]
  score = scores[i]
  label = labels[i]
  if score >= 0:
    sent_value = 1
    positive += 1
    #print id, sent_value
  elif score < 0:
    sent_value = -1
    #print id, sent_value
    
  if sent_value == label:
    correct += 1

print "correct:", correct/len(ids)
print "positive:", positive/len(ids)