From ae237e32d0a8ffda8783f4b3900461ab2cb5e608 Mon Sep 17 00:00:00 2001 From: Jack Date: Tue, 12 Apr 2016 14:53:10 -0400 Subject: [PATCH] refactored lexicon --- LexiconEval.py | 181 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 LexiconEval.py diff --git a/LexiconEval.py b/LexiconEval.py new file mode 100644 index 0000000..0c84cad --- /dev/null +++ b/LexiconEval.py @@ -0,0 +1,181 @@ +from __future__ import division +import sys + +import nltk +from nltk.corpus import movie_reviews + +import MPQALexicon +import AniaLexicon +import GlossLexicon + +USE_STEMMING = False +USE_PARSING = True +LEX_ALG = "gloss" +LEX_SOURCE = "mpqa" + +# new and improved finite state machine +# states are as follows: +# 0 - base +# 1 - negator found +# 2 - intensifier found +# 3 - un-intensifier found (unused) +# 4 - negator + intensifier found +def calculate_score(text, lexicon): + negators = ["not", "n't", "hardly", "barely"] + intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"] + if USE_STEMMING: + negators = do_stem(negators) + intensifiers = do_stem(intensifiers) + + punctuation = [".", "!", "?", ",", ";", '(', ')'] + state = 0 + score = 0 + num_double = 0 + num_single = 0 + num_neg = 0 + num_halfneg = 0 + for word in text: + if state == 0: + if lexicon.has_key(word): + score += lexicon[word] + num_single += 1 + elif word in negators: + state = 1 + elif word in intensifiers: + state = 2 + elif state == 1: + if lexicon.has_key(word): + score += -1 * lexicon[word] + num_neg += 1 + state = 0 + elif word in intensifiers: + state = 4 + else: + state = 0 + elif state == 2: + if lexicon.has_key(word): + score += 2 * lexicon[word] + num_double += 1 + state = 0 + else: + state = 0 + elif state == 3: + pass #TODO + elif state == 4: + if lexicon.has_key(word): + score += -0.5 * lexicon[word] + num_halfneg += 1 + state = 0 + else: + state = 0 + #print num_single, num_neg, num_double, num_halfneg + return score + +def do_stem(text): + global stemmer + return [stemmer.stem(word) for word in text] + +def get_label(id): + return movie_reviews.categories(fileids=[id])[0] + +i = 0 +try: + args = sys.argv[1:] + while i < len(args): + if args[i] in ["--alg", "--algorithm"]: + if args[i+1] == "gloss": + LEX_ALG = "gloss" + elif args[i+1] == "conjunction": + LEX_ALG = "conjunction" + else: + print "Invalid algorithm" + i += 2 + elif args[i] in ["--lex", "--lexicon"]: + if args[i+1] == "mpqa": + LEX_SOURCE = "mpqa" + elif args[i+1] == "ania": + LEX_SOURCE = "ania" + else: + print "Invalid lexicon" + i += 2 + elif args[i] == "--help": + print "Usage:" + print "--alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)" + print " - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)" + print " - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)" + print "--lexicon X: Choose the lexicon to use ('mpqa', 'ania' or 'none')" + print " - mpqa: Use the MPQA lexicon" + print " - ania: Use the hand-labeled lexicon from the Brown corpus" + exit() + else: + print "Error: Invalid argument", args[i] + i += 1 +except Exception: + print "Invalid arguments" + exit() + +print "Lexicon =", LEX_SOURCE +print "Algorithm =", LEX_ALG + +# Load the test set. A few options here. +if LEX_SOURCE == "mpqa": + (test_words, test_labels) = MPQALexicon.load(True) +elif LEX_SOURCE == "ania": + (test_words, test_labels) = AniaLexicon.load() +else: + print "Invalid lexicon" + exit() + +if USE_STEMMING: + stemmer = nltk.stem.porter.PorterStemmer() + test_words = do_stem(test_words) + +if LEX_ALG == "gloss": + lexicon = GlossLexicon.create(test_words, test_labels) +elif LEX_ALG == "conjunction": + print "Error: Conjunction algorithm NYI" +elif LEX_ALG == "none": + lexicon = create_lexicon(test_words, test_labels) + +correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]]) +lex_acc = correct/len(lexicon.items()) +print "Lexicon accuracy:", lex_acc + +# Iterate through all of the reviews and compute scores by taking the sum of their +# component lexicon words. Includes rudimentary negation testing. +correct = 0 +positive = 0 +ids = sorted(movie_reviews.fileids()) +scores = [] + +for id in ids: + words = list(movie_reviews.words(fileids=[id])) + if USE_STEMMING: + words = do_stem(words) + if USE_PARSING: + score = calculate_score(words, lexicon) + else: + score = 0 + for word in words: + if lexicon.has_key(word): + score += lexicon[word] + x += 1 + scores.append(score) + #print id, score + +for i in range(len(ids)): + id = ids[i] + score = scores[i] + if score >= 0: + sent_value = "pos" + positive += 1 + #print id, sent_value + elif score < 0: + sent_value = "neg" + #print id, sent_value + label = get_label(id) + if sent_value == label: + correct += 1 + +print "correct:", correct/len(ids) +print "positive:", positive/len(ids) \ No newline at end of file