Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
refactored lexicon
  • Loading branch information
job13011 committed Apr 12, 2016
1 parent 1a5c75f commit ae237e3
Showing 1 changed file with 181 additions and 0 deletions.
181 changes: 181 additions & 0 deletions LexiconEval.py
@@ -0,0 +1,181 @@
from __future__ import division
import sys

import nltk
from nltk.corpus import movie_reviews

import MPQALexicon
import AniaLexicon
import GlossLexicon

USE_STEMMING = False
USE_PARSING = True
LEX_ALG = "gloss"
LEX_SOURCE = "mpqa"

# Finite state machine that scores a text against a sentiment lexicon while
# accounting for simple negation and intensification.  States:
#   0 - base
#   1 - negator found
#   2 - intensifier found
#   4 - negator + intensifier found
# (State 3 was reserved for an "un-intensifier" that was never implemented,
# so it is not part of the machine.)
def calculate_score(text, lexicon):
    """Return the sentiment score of a tokenized text.

    Each word found in ``lexicon`` contributes its lexicon value multiplied
    by a weight that depends on the word(s) immediately preceding it:

    * no modifier:                    1
    * negator (e.g. "not"):          -1
    * intensifier (e.g. "very"):      2
    * negator then intensifier:      -0.5

    Any word that is neither a lexicon entry nor a modifier valid in the
    current state resets the machine to the base state, so a modifier only
    affects the lexicon word directly following it.  A word that is both in
    the lexicon and in a modifier list is treated as a lexicon word.

    Args:
        text: iterable of word tokens.
        lexicon: mapping from word to numeric sentiment value.

    Returns:
        The weighted sum of the lexicon values of the words in ``text``
        (0 when no word is in the lexicon).
    """
    negators = ["not", "n't", "hardly", "barely"]
    intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
    if USE_STEMMING:
        # The text is stemmed by the caller, so the modifiers must be too.
        negators = do_stem(negators)
        intensifiers = do_stem(intensifiers)
    negators = set(negators)
    intensifiers = set(intensifiers)

    # Weight applied to a lexicon word encountered in each state.
    weights = {0: 1, 1: -1, 2: 2, 4: -0.5}

    state = 0
    score = 0
    for word in text:
        if word in lexicon:
            score += weights[state] * lexicon[word]
            state = 0
        elif state == 0 and word in negators:
            state = 1
        elif state == 0 and word in intensifiers:
            state = 2
        elif state == 1 and word in intensifiers:
            state = 4
        else:
            # Anything else (including punctuation) cancels pending modifiers.
            state = 0
    return score

def do_stem(text):
    """Return a new list containing the stem of every word in ``text``.

    Relies on the module-level ``stemmer`` object, which the script only
    creates when ``USE_STEMMING`` is enabled.
    """
    stemmed = []
    for token in text:
        stemmed.append(stemmer.stem(token))
    return stemmed

def get_label(id):
    """Return the first category the movie_reviews corpus lists for file ``id``."""
    categories = movie_reviews.categories(fileids=[id])
    return categories[0]

# Parse the command-line options, overriding the LEX_ALG / LEX_SOURCE
# defaults.  Both options take a value, so they are consumed two arguments
# at a time.  An unrecognised value or flag is reported and skipped, leaving
# the current setting in place.
args = sys.argv[1:]
i = 0
try:
    while i < len(args):
        flag = args[i]
        if flag in ("--alg", "--algorithm"):
            value = args[i + 1]
            if value in ("gloss", "conjunction"):
                LEX_ALG = value
            else:
                print("Invalid algorithm")
            i += 2
        elif flag in ("--lex", "--lexicon"):
            value = args[i + 1]
            if value in ("mpqa", "ania"):
                LEX_SOURCE = value
            else:
                print("Invalid lexicon")
            i += 2
        elif flag == "--help":
            # The help text lists only the values the parser accepts.
            print("Usage:")
            print("--alg X: Choose the algorithm to use ('gloss' or 'conjunction') (default: gloss)")
            print(" - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)")
            print(" - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)")
            print("--lexicon X: Choose the lexicon to use ('mpqa' or 'ania')")
            print(" - mpqa: Use the MPQA lexicon")
            print(" - ania: Use the hand-labeled lexicon from the Brown corpus")
            sys.exit(0)
        else:
            print("Error: Invalid argument %s" % flag)
            i += 1
except IndexError:
    # A flag that requires a value was the last argument.
    print("Invalid arguments")
    sys.exit(1)

print("Lexicon = %s" % LEX_SOURCE)
print("Algorithm = %s" % LEX_ALG)

# Load the test set (parallel sequences of words and their labels) from the
# selected source.
if LEX_SOURCE == "mpqa":
    (test_words, test_labels) = MPQALexicon.load(True)
elif LEX_SOURCE == "ania":
    (test_words, test_labels) = AniaLexicon.load()
else:
    print("Invalid lexicon")
    sys.exit(1)

if USE_STEMMING:
    # do_stem() reads this module-level stemmer.
    stemmer = nltk.stem.porter.PorterStemmer()
    test_words = do_stem(test_words)

# Build the word -> value lexicon with the selected algorithm.
if LEX_ALG == "gloss":
    lexicon = GlossLexicon.create(test_words, test_labels)
elif LEX_ALG == "none":
    # No induction algorithm: use the labelled words themselves.  This
    # replaces a call to create_lexicon(), which is not defined in this file.
    # NOTE(review): assumes the intended lexicon maps each word to its label;
    # confirm.
    lexicon = dict(zip(test_words, test_labels))
else:
    # Without a lexicon nothing below can run, so stop here instead of
    # failing later with a NameError.
    if LEX_ALG == "conjunction":
        print("Error: Conjunction algorithm NYI")
    else:
        print("Invalid algorithm")
    sys.exit(1)

# Count the test words whose lexicon value equals their label, relative to
# the number of lexicon entries.
correct = sum(
    1
    for (word, label) in zip(test_words, test_labels)
    if word in lexicon and label == lexicon[word]
)
lex_acc = correct / len(lexicon) if len(lexicon) else 0.0
print("Lexicon accuracy: %s" % lex_acc)

# Score every movie review as the sum of the lexicon values of its words
# (with negation/intensifier handling when USE_PARSING is set), classify the
# review as "pos" when the score is >= 0 and "neg" otherwise, and report the
# fraction classified correctly and the fraction classified positive.
ids = sorted(movie_reviews.fileids())
scores = []

for fileid in ids:
    words = list(movie_reviews.words(fileids=[fileid]))
    if USE_STEMMING:
        words = do_stem(words)
    if USE_PARSING:
        score = calculate_score(words, lexicon)
    else:
        # Plain bag-of-words sum, ignoring negators and intensifiers.
        score = sum(lexicon[word] for word in words if word in lexicon)
    scores.append(score)

correct = 0
positive = 0
for fileid, score in zip(ids, scores):
    if score >= 0:
        sent_value = "pos"
        positive += 1
    else:
        sent_value = "neg"
    if sent_value == get_label(fileid):
        correct += 1

print("correct: %s" % (correct / len(ids)))
print("positive: %s" % (positive / len(ids)))

0 comments on commit ae237e3

Please sign in to comment.