Commit
My attempt at lexicons
job13011 committed Mar 19, 2016
1 parent 593be58 commit 2a8d5ef
Showing 4 changed files with 8,387 additions and 0 deletions.
144 changes: 144 additions & 0 deletions GlossCountJWB.py
@@ -0,0 +1,144 @@
import math
import nltk
from nltk.corpus import wordnet as wn
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sets import Set
import string
import random
import BagOfWords
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import MPQALexicon

def get_defs(word):
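    # Join the WordNet glosses (definitions) of every adjective sense of the word
    # into a single space-separated string.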
    return string.join([synset.definition() for synset in wn.synsets(word, pos=wn.ADJ)])

def make_bag(text):
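    # Build a normalized bag-of-words dict for the text via the local BagOfWords
    # module (negation tagging, hashing and presence-only features disabled).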
    return BagOfWords.make(text, normalize=True, use_negation=False, use_hash=False, use_presence=False)

def expand_sets(positive,negative,neutral):
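    # One expansion pass: add the WordNet adjective synonyms (lemma names) of each
    # seed word to that seed's polarity set. A synonym that already sits in the
    # opposite set is removed from it and moved to the neutral set instead.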
    newPositive = set(positive)
    newNegative = set(negative)
    newNeutral = set(neutral)
    for word in positive:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name().split('.')[0]
                if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                    newPositive.add(curr)
                elif curr in newNegative:
                    newNegative.discard(curr)
                    newNeutral.add(curr)

    for word in negative:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name().split('.')[0]
                if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                    newNegative.add(curr)
                elif curr in newPositive:
                    newPositive.discard(curr)
                    newNeutral.add(curr)
    return (newPositive, newNegative, newNeutral)

def bag_to_vec(bag, wordlist):
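    # Convert a bag-of-words dict into a fixed-length count vector with one entry
    # per word in wordlist (0 if the word is absent from the bag).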
    vec = []
    for word in wordlist:
        if bag.has_key(word):
            vec.append(bag[word])
        else:
            vec.append(0)
    return vec

# Set up initial Sets S_p and S_n
#positive = Set(['Good'])
#negative = Set(['Bad'])
neutral = Set([''])
positive = Set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'])
negative = Set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'])

# Expand on Sets to get S_p' and S_n'
for num in range(2):
    newsets = expand_sets(positive, negative, neutral)
    positive = set(newsets[0])
    negative = set(newsets[1])
    neutral = set(newsets[2])

# Use the same number of positive and negative words.
positive = random.sample(positive, min(len(positive), len(negative)))
negative = random.sample(negative, min(len(positive), len(negative)))

# Learn Classifier
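# Each training example is the bag-of-words of a seed word's WordNet glosses,
# labeled +1 if the seed word is positive and -1 if it is negative.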
train_bags = [make_bag(get_defs(word)) for word in positive] + [make_bag(get_defs(word)) for word in negative]

train_labels = [1 for word in positive] + [-1 for word in negative]
train_wordlist = []

# The classifier needs vectors, not dicts, so convert the bags: build a sorted list
# of every word that appears in any training bag, then turn each bag into a vector
# with one entry per word in that list.
for bag in train_bags:
    for word in bag.keys():
        if word not in train_wordlist:
            train_wordlist.append(word)
train_wordlist = sorted(train_wordlist)
train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags]
classifier = MultinomialNB()
#classifier = svm.SVC(kernel="linear")
classifier.fit(train_vecs, train_labels)
# Iterate through all of the reviews and find sentiment
count = 0
correct = 0
ids = sorted(movie_reviews.fileids())

# Load the test set
(test_words, test_labels) = MPQALexicon.load()
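# test_words holds the adjectives from the MPQA subjectivity lexicon; their prior
# polarities in test_labels are not used below (evaluation is done against the
# movie review categories instead).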
#test_words = string.join(list(movie_reviews.words(fileids=ids)))
test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False)
test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:NUM_TEST_WORDS]
test_bags = []
test_wordlist2 = []
for word in test_wordlist:
    defs = get_defs(word)
    if defs != '':
        test_wordlist2.append(word)
        test_bags.append(make_bag(defs))

test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags]
predicted_labels = classifier.predict(test_vecs)
word_labels = {}

for i in range(len(test_wordlist2)):
    key = test_wordlist2[i]
    word_labels[key] = predicted_labels[i]

pos_words = [w for w in test_wordlist2 if word_labels[w] > 0]
neg_words = [w for w in test_wordlist2 if word_labels[w] < 0]
# Use the same number of positive and negative words.
length = min(len(pos_words), len(neg_words))
pos_words = pos_words[:length]
neg_words = neg_words[:length]

scores = {}
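# Score each review by counting induced-lexicon hits: +1 for every token in the
# positive word list, -1 for every token in the negative list. A nonnegative
# total is predicted "pos", otherwise "neg".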
for review_id in ids:
    words = movie_reviews.words(fileids=[review_id])
    score = 0
    for word in words:
        if word in pos_words:
            score += 1
        elif word in neg_words:
            score -= 1
    if score >= 0:
        sent_value = "pos"
        print "Positive (%s)" % review_id
    else:
        sent_value = "neg"
        print "Negative (%s)" % review_id
    if sent_value == movie_reviews.categories(fileids=[review_id])[0]:
        correct += 1
    count += 1
    scores[review_id] = score

print "correct:", float(correct)/count
20 changes: 20 additions & 0 deletions MPQALexicon.py
@@ -0,0 +1,20 @@
def load():
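    # Parse the MPQA subjectivity lexicon (one clue per line of "key=value" fields,
    # e.g. roughly: type=weaksubj len=1 word1=... pos1=adj stemmed1=n priorpolarity=negative)
    # and return the adjectives with a positive or negative prior polarity, together
    # with matching "pos"/"neg" labels.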
    filename = "subjclueslen1-HLTEMNLP05.tff"
    f = open(filename)
    lines = f.readlines()
    f.close()
    words = []
    labels = []
    for line in lines:
        fields = line.split(" ")
        fields = [field for field in fields if "=" in field] # ugh, two lines have a random extra char in them
        d = dict([field.rstrip().split("=") for field in fields])
        (word, label, pos) = d["word1"], d["priorpolarity"], d["pos1"]
        if pos == "adj":
            if label == "positive":
                words.append(word)
                labels.append("pos")
            elif label == "negative":
                words.append(word)
                labels.append("neg")
    return (words, labels)
