from __future__ import division
import sys
import nltk
from nltk.corpus import movie_reviews
import MPQALexicon
import AniaLexicon
import GlossLexicon
import LexFromFile
import XMLParser
import TwitterCorpus
USE_STEMMING = False # keep this in sync with the lexicon's stemming setting!
USE_PARSING = True
LEX_ALG = "gloss" # "gloss", "conjunction", "none"
LEX_SOURCE = "mpqa" # "mpqa", "ania"
CORPUS = "movies" # "amazon", "movies", "twitter"
NEG_MOD = 1.5 # multiplier for negative lexicon entries; Taboada et al. (2011) suggest 1.5.
# Finite-state machine for negation and intensification, loosely based on
# Taboada et al. (2011). States:
# 0 - base
# 1 - negator found
# 2 - intensifier found
# 3 - un-intensifier found (unused)
# 4 - negator + intensifier found
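# Worked example (hypothetical lexicon {"good": 1}): scoring "not very good"
# moves the machine 0 -> 1 on "not", 1 -> 4 on "very", and then "good" is
# scored as -0.5 * 1 = -0.5 before the state resets to 0.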
def calculate_score(text, lexicon):
    negators = ["not", "n't", "hardly", "barely"]
    intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
    if USE_STEMMING:
        negators = do_stem(negators)
        intensifiers = do_stem(intensifiers)
    # Currently unused: any token that matches nothing already resets the machine.
    punctuation = [".", "!", "?", ",", ";", "(", ")"]
    state = 0
    score = 0
    num_double = 0   # intensified lexicon hits
    num_single = 0   # plain lexicon hits
    num_neg = 0      # negated lexicon hits
    num_halfneg = 0  # negated + intensified lexicon hits
    for word in text:
        if state == 0:
            if word in lexicon:
                score += lexicon[word]
                num_single += 1
            elif word in negators:
                state = 1
            elif word in intensifiers:
                state = 2
        elif state == 1:
            if word in lexicon:
                score += -1 * lexicon[word]
                num_neg += 1
                state = 0
            elif word in intensifiers:
                state = 4
            else:
                state = 0
        elif state == 2:
            if word in lexicon:
                score += 2 * lexicon[word]
                num_double += 1
            state = 0
        elif state == 3:
            pass # TODO: handle un-intensifiers
        elif state == 4:
            if word in lexicon:
                score += -0.5 * lexicon[word]
                num_halfneg += 1
            state = 0
    #print num_single, num_neg, num_double, num_halfneg
    return score
def do_stem(text):
    global stemmer
    return [stemmer.stem(word) for word in text]
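# e.g. with the Porter stemmer, do_stem(["running", "ponies"]) -> ["run", "poni"]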
# Create a lexicon dict from the words + labels directly (i.e. without running an algorithm)
def create_lexicon(words, labels):
    lexicon = {}
    for word, label in zip(words, labels):
        lexicon[word] = label
    return lexicon
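# Usage sketch (hypothetical words/labels):
#   create_lexicon(["good", "awful"], [1, -1]) == {"good": 1, "awful": -1}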
i = 0
try:
    args = sys.argv[1:]
    while i < len(args):
        if args[i] in ["--alg", "--algorithm"]:
            if args[i+1] == "gloss":
                LEX_ALG = "gloss"
            elif args[i+1] == "conjunction":
                LEX_ALG = "conjunction"
            elif args[i+1] == "none":
                LEX_ALG = "none"
            else:
                print "Invalid algorithm"
            i += 2
        elif args[i] in ["--lex", "--lexicon"]:
            if args[i+1] == "mpqa":
                LEX_SOURCE = "mpqa"
            elif args[i+1] == "ania":
                LEX_SOURCE = "ania"
            else:
                print "Invalid lexicon"
            i += 2
        elif args[i] == "--corpus":
            if args[i+1] == "movies":
                CORPUS = "movies"
            elif args[i+1] == "amazon":
                CORPUS = "amazon"
            elif args[i+1] == "twitter":
                CORPUS = "twitter"
            else:
                print "Invalid corpus"
            i += 2
        elif args[i] == "--help":
            print "Usage:"
            print "--alg|--algorithm X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)"
            print " - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)"
            print " - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)"
            print " - none: Use the input lexicon as is"
            print "--lex|--lexicon X: Choose the lexicon to use ('mpqa' or 'ania') (default: mpqa)"
            print " - mpqa: Use the MPQA lexicon"
            print " - ania: Use the hand-labeled lexicon from the Brown corpus"
            print "--corpus X: Choose the data set to test on ('movies', 'amazon' or 'twitter')"
            print " - amazon: Use the Amazon data set"
            print " - twitter: Use the Twitter data set"
            print " - movies: Use the Pang & Lee movie data set (default)"
            exit()
        else:
            print "Error: Invalid argument", args[i]
            i += 1
except Exception:
    print "Invalid arguments"
    exit()
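# Example invocation (script name hypothetical):
#   python sentiment.py --alg gloss --lex mpqa --corpus movies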
print "Lexicon =", LEX_SOURCE
print "Algorithm =", LEX_ALG
print "Corpus =", CORPUS
# Load the test set. A few options here.
if LEX_SOURCE == "mpqa":
    (test_words, test_labels) = MPQALexicon.load(False)
elif LEX_SOURCE == "ania":
    (test_words, test_labels) = AniaLexicon.load()
else:
    print "Invalid lexicon"
    exit()
if USE_STEMMING:
    stemmer = nltk.stem.porter.PorterStemmer()
    test_words = do_stem(test_words)
if LEX_ALG == "gloss":
    lexicon = GlossLexicon.create(test_words, test_labels)
elif LEX_ALG == "conjunction":
    print "Error: Conjunction algorithm not yet implemented"
    exit()
elif LEX_ALG == "none":
    lexicon = create_lexicon(test_words, test_labels)
if LEX_ALG != "none":
    correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if word in lexicon and label == lexicon[word]])
    lex_acc = correct/len(lexicon)
    print "Lexicon accuracy:", lex_acc
# TODO refactor me again.
#lexicon = LexFromFile.lexfromfile("cblex.txt")
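# Boost the weight of negative lexicon entries (see NEG_MOD above);
# e.g. an entry of -1 becomes -1.5 with the default modifier.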
for key in lexicon.keys():
    if lexicon[key] < 0: lexicon[key] *= NEG_MOD
if CORPUS == "movies":
    ids = movie_reviews.fileids()
    reviews = [list(movie_reviews.words(fileids=[id])) for id in ids]
    labels = []
    for id in ids:
        label = movie_reviews.categories(id)[0]
        if label == 'pos':
            labels.append(1)
        elif label == 'neg':
            labels.append(-1)
elif CORPUS == "amazon":
    (ids, reviews, labels) = XMLParser.get_all_reviews()
elif CORPUS == "twitter":
    (ids, reviews, labels) = TwitterCorpus.load() # they're not reviews, but we'll let it slide
else:
    print "Invalid corpus!"
    exit()
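# From here on, ids/reviews/labels are treated as parallel lists with labels in
# {1, -1} (true for the movies branch above; assumed for the other loaders).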
"""
# It feels like there should be a more efficient way do to this.
shuffled = zip(ids,reviews,labels)
shuffled = shuffled[:20]
ids = [x[0] for x in shuffled]
reviews = [x[1] for x in shuffled]
labels = [x[2] for x in shuffled]
"""
#for k in lexicon.keys():
# lexicon[k] *= -1
# Iterate through all of the reviews and compute scores by summing the values of
# their component lexicon words. Includes rudimentary negation handling.
correct = 0
positive = 0
scores = []
for i in range(len(reviews)):
    words = reviews[i]
    if USE_STEMMING:
        words = do_stem(words)
    if USE_PARSING:
        score = calculate_score(words, lexicon)
    else:
        # Plain bag-of-words sum, with no negation or intensification.
        score = 0
        for word in words:
            if word in lexicon:
                score += lexicon[word]
    scores.append(score)
    #print id, score
for i in range(len(ids)):
    id = ids[i]
    score = scores[i]
    label = labels[i]
    if score >= 0:
        sent_value = 1
        positive += 1
        #print id, sent_value
    elif score < 0:
        sent_value = -1
        #print id, sent_value
    if sent_value == label:
        correct += 1
print "correct:", correct/len(ids)
print "positive:", positive/len(ids)