Commit

refactored lexicons
job13011 committed Apr 12, 2016
1 parent 6c944e0 commit 1a5c75f
Showing 1 changed file with 10 additions and 121 deletions.
131 changes: 10 additions & 121 deletions GlossCountJWB.py → GlossLexicon.py
@@ -9,21 +9,19 @@ from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import wordnet as wn
import nltk.classify.util
from nltk.corpus import movie_reviews

import BagOfWords
import MPQALexicon
import AniaLexicon

EXPAND_ITERATIONS = 2
CLASSIFIER = "me" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
REMOVE_STOPWORDS = False
USE_STEMMING = False
USE_EXAMPLES = True

USE_EQUAL_TRAINING = True
USE_EQUAL_TEST = True
USE_PARSING = True

POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']
@@ -91,71 +89,10 @@ def expand_sets(positive, negative, neutral):
newNeutral.add(ant)
return (newPositive, newNegative, newNeutral)
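
# The body of expand_sets is collapsed in this view. As a rough, hypothetical
# sketch of the WordNet synonym/antonym expansion it appears to perform
# (NLTK 3.x API; the name expand_sets_sketch and all details below are
# illustrative, not from this commit -- the real function also routes some
# antonyms to the neutral set, per the fragment above):
def expand_sets_sketch(positive, negative, neutral):
    newPositive, newNegative = set(positive), set(negative)
    newNeutral = set(neutral)
    # Expand each seed list; synonyms inherit the seed's polarity, antonyms flip it.
    for (seeds, same, opposite) in [(positive, newPositive, newNegative),
                                    (negative, newNegative, newPositive)]:
        for word in seeds:
            for syn in wn.synsets(word, pos=wn.ADJ):
                for lemma in syn.lemmas():
                    same.add(lemma.name())
                    for ant in lemma.antonyms():
                        opposite.add(ant.name())
    return (newPositive, newNegative, newNeutral)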

def get_label(id):
return movie_reviews.categories(fileids=[id])[0]

def do_stem(text):
global stemmer
return [stemmer.stem(word) for word in text]
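# e.g. do_stem(["running", "runs"]) -> ["run", "run"] under the Porter stemmer
# instantiated near the bottom of the file (illustrative note, not from this commit).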

# new and improved finite state machine
# states are as follows:
# 0 - base
# 1 - negator found
# 2 - intensifier found
# 3 - un-intensifier found (unused)
# 4 - negator + intensifier found
def calculate_score(text, lexicon):
negators = ["not", "n't", "hardly", "barely"]
intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
if USE_STEMMING:
negators = do_stem(negators)
intensifiers = do_stem(intensifiers)

punctuation = [".", "!", "?", ",", ";", '(', ')']
state = 0
score = 0
num_double = 0
num_single = 0
num_neg = 0
num_halfneg = 0
for word in text:
if state == 0:
if lexicon.has_key(word):
score += lexicon[word]
num_single += 1
elif word in negators:
state = 1
elif word in intensifiers:
state = 2
elif state == 1:
if lexicon.has_key(word):
score += -1 * lexicon[word]
num_neg += 1
state = 0
elif word in intensifiers:
state = 4
else:
state = 0
elif state == 2:
if lexicon.has_key(word):
score += 2 * lexicon[word]
num_double += 1
state = 0
else:
state = 0
elif state == 3:
pass #TODO
elif state == 4:
if lexicon.has_key(word):
score += -0.5 * lexicon[word]
num_halfneg += 1
state = 0
else:
state = 0
#print num_single, num_neg, num_double, num_halfneg
return score
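
# Worked examples for the FSM above, with the toy lexicon {"good": 1.0}
# (illustrative only, not part of this commit):
#   "a good movie"   -> +1.0  (state 0: plain lexicon hit)
#   "not good"       -> -1.0  (state 1: negator flips the sign)
#   "very good"      -> +2.0  (state 2: intensifier doubles the weight)
#   "not very good"  -> -0.5  (state 4: negator + intensifier halves and flips)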

def create_lexicon(words, labels):
lexicon = {}
for i in range(len(words)):
@@ -164,14 +101,14 @@ def create_lexicon(words, labels):
lexicon[word] = label
return lexicon
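
# Presumably pairs words[i] with labels[i] (the middle of the loop is collapsed
# in this view), e.g. create_lexicon(["good", "bad"], [1.0, -1.0]) would yield
# {"good": 1.0, "bad": -1.0}.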

-def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
+def create(test_words, test_labels):
# Set up initial Sets S_p and S_n
neutral = []
#positive = ['good']
#negative = ['bad']

-    positive = [word for word in pos_seed]
-    negative = [word for word in neg_seed]
+    positive = [word for word in POS_SEED]
+    negative = [word for word in NEG_SEED]
# Expand on Sets to get S_p' and S_n'
for num in range(EXPAND_ITERATIONS):
(positive, negative, neutral) = expand_sets(positive,negative,neutral)
@@ -217,12 +154,14 @@ def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]

predicted_labels = classifier.predict(test_vecs)
"""
correct = 0
for i in range(len(test_labels)):
if test_labels[i] == predicted_labels[i]:
correct += 1
print "Lexicon accuracy:", correct/len(test_labels)
"""

word_labels = {}
for i in range(len(test_words)):
@@ -244,59 +183,9 @@ def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
lexicon[word] = 1

for word in neg_words:
-        lexicon[word] = -1
+        #lexicon[word] = -1
+        lexicon[word] = -1.5

return lexicon
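
# Usage sketch for the refactored entry point (illustrative; the load(True) call
# and return shape are taken from the script code further down):
#   (test_words, test_labels) = MPQALexicon.load(True)
#   lexicon = create(test_words, test_labels)  # dict: word -> 1 (pos) or -1.5 (neg)
#   score = sum(lexicon.get(w, 0) for w in "a very good movie".split())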

if USE_STEMMING:
stemmer = nltk.stem.porter.PorterStemmer()

# Load the test set. A few options here.
(test_words, test_labels) = MPQALexicon.load(True)
#(test_words, test_labels) = AniaLexicon.load()
if USE_STEMMING:
test_words = do_stem(test_words)

lexicon = create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels)
#lexicon = create_seed_lexicon(POS_SEED, NEG_SEED)
#lexicon = create_lexicon(test_words, test_labels)

# Iterate through all of the reviews and compute scores by taking the sum of their
# component lexicon words. Includes rudimentary negation testing.
correct = 0
positive = 0
ids = sorted(movie_reviews.fileids())
scores = []

for id in ids:
words = list(movie_reviews.words(fileids=[id]))
if USE_STEMMING:
words = do_stem(words)
if USE_PARSING:
scores.append(calculate_score(words, lexicon))
else:
score = 0
x = 0
for word in words:
if lexicon.has_key(word):
score += lexicon[word]
x += 1
scores.append(score)
print score, x

for i in range(len(ids)):
id = ids[i]
score = scores[i]
if score >= 0:
sent_value = "pos"
positive += 1
#print id, sent_value
elif score < 0:
sent_value = "neg"
#print id, sent_value
label = get_label(id)
if sent_value == label:
correct += 1

print "correct:", correct/len(ids)
print "positive:", positive/len(ids)
#create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels)
