# LexiconEval.py

from __future__ import division
import sys

import nltk
from nltk.corpus import movie_reviews

import MPQALexicon
import AniaLexicon
import GlossLexicon

# When True, the Porter stemmer is applied to the lexicon words, to the
# negator/intensifier lists and to every review before scoring.
USE_STEMMING = False
# When True, reviews are scored with the calculate_score state machine;
# when False, a review's score is the plain sum of its lexicon values.
USE_PARSING = True
# How the lexicon is built: "gloss", "conjunction" or "none" (see --alg).
LEX_ALG = "gloss"
# Which labelled word list is loaded: "mpqa" or "ania" (see --lex).
LEX_SOURCE = "mpqa"

# new and improved finite state machine
# kinda-sorta based on Taboada 2011.
# states are as follows:
# 0 - base
# 1 - negator found
# 2 - intensifier found
# 3 - un-intensifier found (unused)
# 4 - negator + intensifier found
def calculate_score(text, lexicon):
  """Score a tokenized text by summing the lexicon values of its words.

  A small state machine remembers the modifier words seen immediately
  before a lexicon word and scales that word's contribution:

    no modifier                  ->    1 * value
    negator                      ->   -1 * value
    intensifier                  ->    2 * value
    negator, then intensifier    -> -0.5 * value

  Any other non-lexicon word drops the pending modifier.  Negative
  lexicon values are weighted by 1.5 (experimental) before the modifier
  is applied.  A word found in the lexicon is always scored, even if it
  is also a negator or an intensifier.

  text    -- iterable of word tokens; this function does not stem them
  lexicon -- mapping from word to a numeric polarity value

  Returns the accumulated score (0 when no lexicon word occurs).
  """
  negators = ["not", "n't", "hardly", "barely"]
  intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
  if USE_STEMMING:
    # Modifier words are stemmed with the same stemmer as the lexicon.
    negators = do_stem(negators)
    intensifiers = do_stem(intensifiers)

  # State numbers follow the scheme in the header comment; state 3
  # (un-intensifier) was reserved there but is never entered.
  BASE, NEGATED, INTENSIFIED, NEGATED_INTENSIFIED = 0, 1, 2, 4
  multiplier = {BASE: 1, NEGATED: -1, INTENSIFIED: 2, NEGATED_INTENSIFIED: -0.5}

  state = BASE
  score = 0
  for word in text:
    if word in lexicon:
      word_score = lexicon[word]
      # EXPERIMENTAL: negative words weigh more than positive ones.
      if word_score < 0:
        word_score *= 1.5
      score += multiplier[state] * word_score
      state = BASE
    elif state == BASE and word in negators:
      state = NEGATED
    elif state == BASE and word in intensifiers:
      state = INTENSIFIED
    elif state == NEGATED and word in intensifiers:
      state = NEGATED_INTENSIFIED
    else:
      # Any other word cancels whatever modifier was pending.
      state = BASE
  return score

def do_stem(text):
  """Return a list with the stem of every word in `text`.

  Relies on the module-level `stemmer`, which the script only creates
  when USE_STEMMING is enabled.
  """
  global stemmer
  stems = []
  for token in text:
    stems.append(stemmer.stem(token))
  return stems

def get_label(id):
  """Return the first category movie_reviews reports for file `id`."""
  file_categories = movie_reviews.categories(fileids=[id])
  return file_categories[0]

# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
def create_lexicon(words, labels):
  """Pair each word with the label at the same position and return a dict.

  When a word occurs more than once, the label of its last occurrence
  is kept.  Labels beyond len(words) are ignored.
  """
  return {word: labels[position] for position, word in enumerate(words)}

# Command-line handling.  Recognised options:
#   --alg / --algorithm X   "gloss", "conjunction" or "none"
#   --lex / --lexicon X     "mpqa" or "ania"
#   --help                  print the usage text and exit
# An unknown option or value is reported and skipped, leaving the current
# setting in place.  An option given without its value aborts the run.
i = 0
args = sys.argv[1:]
try:
  while i < len(args):
    if args[i] in ["--alg", "--algorithm"]:
      if args[i+1] in ["gloss", "conjunction", "none"]:
        LEX_ALG = args[i+1]
      else:
        print("Invalid algorithm")
      i += 2
    elif args[i] in ["--lex", "--lexicon"]:
      if args[i+1] in ["mpqa", "ania"]:
        LEX_SOURCE = args[i+1]
      else:
        print("Invalid lexicon")
      i += 2
    elif args[i] == "--help":
      print("Usage:")
      print("--alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)")
      print("  - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)")
      print("  - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)")
      print("  - none: Use the loaded lexicon directly, without running an algorithm")
      print("--lexicon X: Choose the lexicon to use ('mpqa' or 'ania') (default: mpqa)")
      print("  - mpqa: Use the MPQA lexicon")
      print("  - ania: Use the hand-labeled lexicon from the Brown corpus")
      sys.exit()
    else:
      print("Error: Invalid argument %s" % args[i])
      i += 1
except IndexError:
  # args[i+1] did not exist: the last option was missing its value.
  print("Invalid arguments")
  sys.exit(1)

print("Lexicon = %s" % LEX_SOURCE)
print("Algorithm = %s" % LEX_ALG)

# Load the test set. A few options here.
if LEX_SOURCE == "mpqa":
  (test_words, test_labels) = MPQALexicon.load(True)
elif LEX_SOURCE == "ania":
  (test_words, test_labels) = AniaLexicon.load()
else:
  print("Invalid lexicon")
  sys.exit(1)

if USE_STEMMING:
  # do_stem reads this module-level stemmer, so create it first.
  stemmer = nltk.stem.porter.PorterStemmer()
  test_words = do_stem(test_words)

if LEX_ALG == "gloss":
  lexicon = GlossLexicon.create(test_words, test_labels)
elif LEX_ALG == "none":
  lexicon = create_lexicon(test_words, test_labels)
elif LEX_ALG == "conjunction":
  # Stop here: no lexicon exists, so everything below would fail on an
  # undefined name.
  print("Error: Conjunction algorithm NYI")
  sys.exit(1)
else:
  print("Invalid algorithm")
  sys.exit(1)

if LEX_ALG != "none":
  # Count test words whose generated label matches the reference label.
  correct = sum(1 for (word, label) in zip(test_words, test_labels)
                if word in lexicon and label == lexicon[word])
  # An empty lexicon would otherwise divide by zero.
  lex_acc = correct/len(lexicon) if lexicon else 0.0
  print("Lexicon accuracy: %s" % lex_acc)

# Score every movie review with the lexicon, classify it as "pos" or "neg"
# from the sign of its score, and report how often that matches the corpus
# label.  With USE_PARSING the negation/intensifier state machine is used;
# otherwise the score is the plain sum of the lexicon values.
correct = 0
positive = 0
ids = sorted(movie_reviews.fileids())
scores = []

for file_id in ids:
  words = list(movie_reviews.words(fileids=[file_id]))
  if USE_STEMMING:
    words = do_stem(words)
  if USE_PARSING:
    score = calculate_score(words, lexicon)
  else:
    score = sum(lexicon[word] for word in words if word in lexicon)
  scores.append(score)
  #print file_id, score

for file_id, score in zip(ids, scores):
  # A score of exactly zero is counted as positive.
  if score >= 0:
    sent_value = "pos"
    positive += 1
  else:
    sent_value = "neg"
  if sent_value == get_label(file_id):
    correct += 1

# True division comes from the __future__ import at the top of the file.
print("correct: %s" % (correct/len(ids)))
print("positive: %s" % (positive/len(ids)))
	from __future__ import division
	import sys

	import nltk
	from nltk.corpus import movie_reviews

	import MPQALexicon
	import AniaLexicon
	import GlossLexicon

	# When True, the Porter stemmer is applied to the lexicon words, to the
	# negator/intensifier lists and to every review before scoring.
	USE_STEMMING = False
	# When True, reviews are scored with the calculate_score state machine;
	# when False, a review's score is the plain sum of its lexicon values.
	USE_PARSING = True
	# How the lexicon is built: "gloss", "conjunction" or "none" (see --alg).
	LEX_ALG = "gloss"
	# Which labelled word list is loaded: "mpqa" or "ania" (see --lex).
	LEX_SOURCE = "mpqa"

	# new and improved finite state machine
	# kinda-sorta based on Taboada 2011.
	# states are as follows:
	# 0 - base
	# 1 - negator found
	# 2 - intensifier found
	# 3 - un-intensifier found (unused)
	# 4 - negator + intensifier found
	def calculate_score(text, lexicon):
	  """Score a tokenized text by summing the lexicon values of its words.

	  A small state machine remembers the modifier words seen immediately
	  before a lexicon word and scales that word's contribution:

	    no modifier                  ->    1 * value
	    negator                      ->   -1 * value
	    intensifier                  ->    2 * value
	    negator, then intensifier    -> -0.5 * value

	  Any other non-lexicon word drops the pending modifier.  Negative
	  lexicon values are weighted by 1.5 (experimental) before the modifier
	  is applied.  A word found in the lexicon is always scored, even if it
	  is also a negator or an intensifier.

	  text    -- iterable of word tokens; this function does not stem them
	  lexicon -- mapping from word to a numeric polarity value

	  Returns the accumulated score (0 when no lexicon word occurs).
	  """
	  negators = ["not", "n't", "hardly", "barely"]
	  intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
	  if USE_STEMMING:
	    # Modifier words are stemmed with the same stemmer as the lexicon.
	    negators = do_stem(negators)
	    intensifiers = do_stem(intensifiers)

	  # State numbers follow the scheme in the header comment; state 3
	  # (un-intensifier) was reserved there but is never entered.
	  BASE, NEGATED, INTENSIFIED, NEGATED_INTENSIFIED = 0, 1, 2, 4
	  multiplier = {BASE: 1, NEGATED: -1, INTENSIFIED: 2, NEGATED_INTENSIFIED: -0.5}

	  state = BASE
	  score = 0
	  for word in text:
	    if word in lexicon:
	      word_score = lexicon[word]
	      # EXPERIMENTAL: negative words weigh more than positive ones.
	      if word_score < 0:
	        word_score *= 1.5
	      score += multiplier[state] * word_score
	      state = BASE
	    elif state == BASE and word in negators:
	      state = NEGATED
	    elif state == BASE and word in intensifiers:
	      state = INTENSIFIED
	    elif state == NEGATED and word in intensifiers:
	      state = NEGATED_INTENSIFIED
	    else:
	      # Any other word cancels whatever modifier was pending.
	      state = BASE
	  return score

	def do_stem(text):
	  """Return a list with the stem of every word in `text`.

	  Relies on the module-level `stemmer`, which the script only creates
	  when USE_STEMMING is enabled.
	  """
	  global stemmer
	  return [stemmer.stem(word) for word in text]

	def get_label(id):
	  """Return the first category movie_reviews reports for file `id`."""
	  return movie_reviews.categories(fileids=[id])[0]

	# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
	def create_lexicon(words, labels):
	  """Pair each word with the label at the same position and return a dict.

	  When a word occurs more than once, the label of its last occurrence
	  is kept.  Labels beyond len(words) are ignored.
	  """
	  lexicon = {}
	  for position, word in enumerate(words):
	    lexicon[word] = labels[position]
	  return lexicon

	# Command-line handling.  Recognised options:
	#   --alg / --algorithm X   "gloss", "conjunction" or "none"
	#   --lex / --lexicon X     "mpqa" or "ania"
	#   --help                  print the usage text and exit
	# An unknown option or value is reported and skipped, leaving the current
	# setting in place.  An option given without its value aborts the run.
	i = 0
	args = sys.argv[1:]
	try:
	  while i < len(args):
	    if args[i] in ["--alg", "--algorithm"]:
	      if args[i+1] in ["gloss", "conjunction", "none"]:
	        LEX_ALG = args[i+1]
	      else:
	        print("Invalid algorithm")
	      i += 2
	    elif args[i] in ["--lex", "--lexicon"]:
	      if args[i+1] in ["mpqa", "ania"]:
	        LEX_SOURCE = args[i+1]
	      else:
	        print("Invalid lexicon")
	      i += 2
	    elif args[i] == "--help":
	      print("Usage:")
	      print("--alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)")
	      print(" - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)")
	      print(" - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)")
	      print(" - none: Use the loaded lexicon directly, without running an algorithm")
	      print("--lexicon X: Choose the lexicon to use ('mpqa' or 'ania') (default: mpqa)")
	      print(" - mpqa: Use the MPQA lexicon")
	      print(" - ania: Use the hand-labeled lexicon from the Brown corpus")
	      sys.exit()
	    else:
	      print("Error: Invalid argument %s" % args[i])
	      i += 1
	except IndexError:
	  # args[i+1] did not exist: the last option was missing its value.
	  print("Invalid arguments")
	  sys.exit(1)

	print("Lexicon = %s" % LEX_SOURCE)
	print("Algorithm = %s" % LEX_ALG)

	# Load the test set. A few options here.
	if LEX_SOURCE == "mpqa":
	  (test_words, test_labels) = MPQALexicon.load(True)
	elif LEX_SOURCE == "ania":
	  (test_words, test_labels) = AniaLexicon.load()
	else:
	  print("Invalid lexicon")
	  sys.exit(1)

	if USE_STEMMING:
	  # do_stem reads this module-level stemmer, so create it first.
	  stemmer = nltk.stem.porter.PorterStemmer()
	  test_words = do_stem(test_words)

	if LEX_ALG == "gloss":
	  lexicon = GlossLexicon.create(test_words, test_labels)
	elif LEX_ALG == "none":
	  lexicon = create_lexicon(test_words, test_labels)
	elif LEX_ALG == "conjunction":
	  # Stop here: no lexicon exists, so everything below would fail on an
	  # undefined name.
	  print("Error: Conjunction algorithm NYI")
	  sys.exit(1)
	else:
	  print("Invalid algorithm")
	  sys.exit(1)

	if LEX_ALG != "none":
	  # Count test words whose generated label matches the reference label.
	  correct = sum(1 for (word, label) in zip(test_words, test_labels)
	                if word in lexicon and label == lexicon[word])
	  # An empty lexicon would otherwise divide by zero.
	  lex_acc = correct/len(lexicon) if lexicon else 0.0
	  print("Lexicon accuracy: %s" % lex_acc)

	# Score every movie review with the lexicon, classify it as "pos" or "neg"
	# from the sign of its score, and report how often that matches the corpus
	# label.  With USE_PARSING the negation/intensifier state machine is used;
	# otherwise the score is the plain sum of the lexicon values.
	correct = 0
	positive = 0
	ids = sorted(movie_reviews.fileids())
	scores = []

	for file_id in ids:
	  words = list(movie_reviews.words(fileids=[file_id]))
	  if USE_STEMMING:
	    words = do_stem(words)
	  if USE_PARSING:
	    score = calculate_score(words, lexicon)
	  else:
	    score = sum(lexicon[word] for word in words if word in lexicon)
	  scores.append(score)
	  #print file_id, score

	for file_id, score in zip(ids, scores):
	  # A score of exactly zero is counted as positive.
	  if score >= 0:
	    sent_value = "pos"
	    positive += 1
	  else:
	    sent_value = "neg"
	  if sent_value == get_label(file_id):
	    correct += 1

	# True division is expected from a `from __future__ import division`
	# at the top of the module.
	print("correct: %s" % (correct/len(ids)))
	print("positive: %s" % (positive/len(ids)))