Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
matplotlib; eval tweaks; comparison
  • Loading branch information
job13011 committed Apr 16, 2016
1 parent 6d16225 commit cf6576f
Show file tree
Hide file tree
Showing 4 changed files with 333 additions and 232 deletions.
10 changes: 4 additions & 6 deletions GlossLexicon.py
Expand Up @@ -11,13 +11,11 @@ import nltk
from nltk.corpus import wordnet as wn

import BagOfWords
import MPQALexicon
import AniaLexicon

EXPAND_ITERATIONS = 2
EXPAND_ITERATIONS = 3
CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
REMOVE_STOPWORDS = False
USE_STEMMING = False
REMOVE_STOPWORDS = True
USE_STEMMING = True # sync this up with eval!
USE_EXAMPLES = True

USE_EQUAL_TRAINING = True
Expand Down Expand Up @@ -90,7 +88,7 @@ def expand_sets(positive, negative, neutral):
return (newPositive, newNegative, newNeutral)

def do_stem(text):
    """Return *text* with every token reduced to its Porter stem.

    text: an iterable of word tokens (strings).
    Returns a new list; the input is not modified.
    """
    # Keep the stemmer local: the old `global stemmer` rebinding mutated
    # module-level state on every call, a side effect no caller used.
    stemmer = nltk.stem.porter.PorterStemmer()
    return [stemmer.stem(word) for word in text]

def create(test_words, test_labels):
Expand Down
80 changes: 58 additions & 22 deletions LexiconEval.py
Expand Up @@ -7,11 +7,14 @@ from nltk.corpus import movie_reviews
import MPQALexicon
import AniaLexicon
import GlossLexicon
import XMLParser

USE_STEMMING = False
USE_STEMMING = True # sync this up with lexicon!
USE_PARSING = True
LEX_ALG = "gloss"
LEX_SOURCE = "mpqa"
LEX_ALG = "gloss" # "gloss", "conjunction", "none"
LEX_SOURCE = "mpqa" # "mpqa", "ania"
CORPUS = "movies" # "amazon", "movies"
NEG_MOD = 1.5 # Taboada suggested 1.5.

# new and improved finite state machine
# kinda-sorta based on Taboada 2011.
Expand All @@ -36,21 +39,17 @@ def calculate_score(text, lexicon):
num_neg = 0
num_halfneg = 0
for word in text:
if lexicon.has_key(word):
word_score = lexicon[word]
# EXPERIMENTAL
if word_score < 0: word_score *= 1.5
if state == 0:
if lexicon.has_key(word):
score += word_score
score += lexicon[word]
num_single += 1
elif word in negators:
state = 1
elif word in intensifiers:
state = 2
elif state == 1:
if lexicon.has_key(word):
score += -1 * word_score
score += -1 * lexicon[word]
num_neg += 1
state = 0
elif word in intensifiers:
Expand All @@ -59,7 +58,7 @@ def calculate_score(text, lexicon):
state = 0
elif state == 2:
if lexicon.has_key(word):
score += 2 * word_score
score += 2 * lexicon[word]
num_double += 1
state = 0
else:
Expand All @@ -68,7 +67,7 @@ def calculate_score(text, lexicon):
pass #TODO
elif state == 4:
if lexicon.has_key(word):
score += -0.5 * word_score
score += -0.5 * lexicon[word]
num_halfneg += 1
state = 0
else:
Expand All @@ -79,9 +78,6 @@ def calculate_score(text, lexicon):
def do_stem(text):
    """Return *text* with each token replaced by its Porter stem.

    Relies on the module-level `stemmer` instance; *text* is an iterable
    of word tokens (strings) and a new list is returned.
    """
    # `global stemmer` was dropped: the declaration is only needed to
    # *rebind* a module-level name, not to read it, so it was dead weight.
    return [stemmer.stem(word) for word in text]

def get_label(id):
    """Return the sentiment category ('pos'/'neg') recorded for a movie-review fileid."""
    categories = movie_reviews.categories(fileids=[id])
    return categories[0]

# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
def create_lexicon(words, labels):
Expand Down Expand Up @@ -114,14 +110,24 @@ try:
else:
print "Invalid lexicon"
i += 2
elif args[i] == "--corpus":
if args[i+1] == "movies":
CORPUS = "movies"
elif args[i+1] == "amazon":
CORPUS = "amazon"
i += 2
elif args[i] == "--help":
print "Usage:"
print "--alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)"
print "--algorithm|alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)"
print " - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)"
print " - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)"
print "--lexicon X: Choose the lexicon to use ('mpqa', 'ania' or 'none')"
print " - none: Use the input lexicon as is"
print "--lexicon|lex X: Choose the lexicon to use ('mpqa', 'ania' or 'none')"
print " - mpqa: Use the MPQA lexicon"
print " - ania: Use the hand-labeled lexicon from the Brown corpus"
print "--corpus X: Choose the data set to test on"
print " - amazon: Use the Amazon data set"
print " - movies: Use the Pang&Lee movie data set (default)"
exit()
else:
print "Error: Invalid argument", args[i]
Expand All @@ -132,6 +138,7 @@ except Exception:

print "Lexicon =", LEX_SOURCE
print "Algorithm =", LEX_ALG
print "Corpus =", CORPUS

# Load the test set. A few options here.
if LEX_SOURCE == "mpqa":
Expand All @@ -158,17 +165,45 @@ if LEX_ALG != "none":
lex_acc = correct/len(lexicon.items())
print "Lexicon accuracy:", lex_acc

for key in lexicon.keys():
if lexicon[key] < 0: lexicon[key] *= NEG_MOD

if CORPUS == "movies":
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(fileids=[id])) for id in ids]
labels = []
for id in ids:
label = movie_reviews.categories(id)[0]
if label == 'pos':
labels.append(1)
elif label == 'neg':
labels.append(-1)
elif CORPUS == "amazon":
(ids, reviews, labels) = XMLParser.get_all_reviews()
else:
print "Invalid corpus!"
exit()

"""
# It feels like there should be a more efficient way to do this.
shuffled = zip(ids,reviews,labels)
shuffled = shuffled[:20]
ids = [x[0] for x in shuffled]
reviews = [x[1] for x in shuffled]
labels = [x[2] for x in shuffled]
"""

# Iterate through all of the reviews and compute scores by taking the sum of their
# component lexicon words. Includes rudimentary negation testing.
correct = 0
positive = 0
ids = sorted(movie_reviews.fileids())
scores = []

for id in ids:
words = list(movie_reviews.words(fileids=[id]))
for i in range(len(reviews)):
words = reviews[i]
if USE_STEMMING:
words = do_stem(words)

if USE_PARSING:
score = calculate_score(words, lexicon)
else:
Expand All @@ -182,14 +217,15 @@ for id in ids:
for i in range(len(ids)):
id = ids[i]
score = scores[i]
label = labels[i]
if score >= 0:
sent_value = "pos"
sent_value = 1
positive += 1
#print id, sent_value
elif score < 0:
sent_value = "neg"
sent_value = -1
#print id, sent_value
label = get_label(id)
if sent_value == label:
correct += 1

Expand Down
43 changes: 43 additions & 0 deletions graph.py
@@ -0,0 +1,43 @@
import numpy
from matplotlib import pyplot

# Horizontal bar chart comparing classifier accuracy per feature
# configuration on two corpora (Movies vs. Amazon).
#
# NOTE(review): the original file defined a second list `labels2` that was a
# byte-for-byte duplicate of `labels` and was never referenced; it has been
# removed as dead code.
labels = [
    "unigrams, frequency",
    "unigrams, frequency, +Position",
    "unigrams, presence",
    "unigrams, presence, +Position",
    "bigrams, frequency",
    "bigrams, frequency, +Position",
    "bigrams, presence",
    "bigrams, presence, +Position",
    "delta_tfidf"
]
tops = numpy.arange(len(labels))
# Accuracy on the Pang & Lee movie data set, one entry per label.
widths = [0.826002649356, 0.784479089868, 0.842490694287, 0.821997146847, 0.807497617378, 0.777000053946, 0.820491149832, 0.795509581438, 0.981992471513]
# Accuracy on the Amazon data set, one entry per label.
widths2 = [0.824624634419, 0.808376475678, 0.832750728912, 0.815374570779, 0.797876474366, 0.771876439875, 0.799001849413, 0.768376127015, 0.929999178955]
height = 0.3  # bar thickness; the two series are offset by one bar height
pyplot.barh(tops, widths, height, color="#FF0000")
pyplot.barh(tops + height, widths2, height, color="#00FF00")
pyplot.legend(["Movies", "Amazon"], loc=4)  # loc=4: bottom right
pyplot.yticks(tops + height, labels)  # tick between each pair of bars
pyplot.xlim(0.5, 1.0)  # accuracies all exceed 0.5; zoom in on the useful range
pyplot.ylim(tops[0] - 2 * height, tops[-1] + 3 * height)
pyplot.show()

"""
gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865
gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402
gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193
gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513
"""

0 comments on commit cf6576f

Please sign in to comment.