Skip to content

Commit

Permalink
Tried to add TFIDF/normalization. 64%...
Browse files Browse the repository at this point in the history
  • Loading branch information
job13011 committed Apr 2, 2016
1 parent 8e9a44c commit 35c95dc
Showing 1 changed file with 66 additions and 57 deletions.
123 changes: 66 additions & 57 deletions GlossCountJWB.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,18 @@
import MPQALexicon
import numpy

# returns tokenized
def get_defs(word, use_examples=True):
    """Return the tokenized WordNet adjective glosses for `word`.

    Looks up every adjective synset of `word`, concatenates their
    definitions (and, when `use_examples` is true, their example
    sentences) into one text, and tokenizes it with nltk.

    word -- the lexicon word to look up
    use_examples -- also include each synset's example sentences
    """
    # Hoist the synset lookup: the original queried WordNet twice.
    synsets = wn.synsets(word, pos=wn.ADJ)
    defs = [synset.definition() for synset in synsets]
    if use_examples:
        # Example sentences give the classifier more context per word.
        for synset in synsets:
            defs += synset.examples()
    # " ".join is equivalent to string.join's default space separator
    # and works on both Python 2 and 3.
    return nltk.word_tokenize(" ".join(defs))

# text and documents are pre-tokenized
def make_bag(text, documents):
    """Build a TF-IDF-weighted bag of words for `text`.

    text -- tokenized word list to vectorize
    documents -- full tokenized corpus, needed for the IDF weights

    Returns whatever BagOfWords.make_tfidf produces (a word->weight dict
    consumed by bag_to_vec).
    """
    # Earlier experiment kept for reference: a plain count bag via
    #   BagOfWords.make(text, normalize=True, use_negation=False,
    #                   use_hash=False, use_presence=False)
    return BagOfWords.make_tfidf(text, documents)

def expand_sets(positive,negative,neutral):
newPositive = set(positive)
Expand Down Expand Up @@ -54,78 +61,71 @@ def expand_sets(positive,negative,neutral):
ant = antonym.name()
if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
newPositive.add(ant)
elif ant in newPositive:
elif ant in newNegative:
newNegative.discard(ant)
newNeutral.add(ant)
return (newPositive, newNegative, newNeutral)

def bag_to_vec(bag, wordlist):
    """Convert a bag-of-words dict into a dense feature vector.

    bag -- dict mapping word -> weight (count, presence, or tf-idf)
    wordlist -- ordered vocabulary; one vector entry per word

    Returns a list of weights aligned with `wordlist`; words absent from
    `bag` contribute 0, so every vector has len(wordlist) entries.
    """
    # dict.get with a default collapses the has_key/else branch
    # (dict.has_key is Python-2-only anyway).
    return [bag.get(word, 0) for word in wordlist]

# Set up initial Sets S_p and S_n
# Seed lexicon for Turney-style bootstrapping; `neutral` starts empty
# and is populated during expansion.
neutral = Set([])
positive = Set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'])
negative = Set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'])

# Expand on Sets to get S_p' and S_n'
# NOTE(review): the two loops below are old/new diff residue — the
# range(3) version and the range(1) version perform the same WordNet
# expansion a different number of times; only one belongs in the file.
for num in range(3):
newsets = expand_sets(positive,negative,neutral);
positive = set(newsets[0])
negative = set(newsets[1])
neutral = set(newsets[2])

# Use the same number of positive and negative words.
for num in range(1):
(positive, negative, neutral) = expand_sets(positive,negative,neutral);

# Use the same number of positive and negative training words.
# NOTE(review): the second sample uses len(positive) AFTER the first
# resampling shrank it — presumably intentional balancing; verify.
positive = random.sample(positive, min(len(positive), len(negative)))
negative = random.sample(negative, min(len(positive), len(negative)))

# Learn Classifier
# NOTE(review): this one-argument make_bag call is the pre-TF-IDF
# version, superseded by the train_bags assignment below (diff residue).
train_bags = [make_bag(get_defs(word)) for word in positive] + [make_bag(get_defs(word)) for word in negative]

# Train the classifier using the expanded wordlist.
# NOTE(review): this value is discarded — train_wordlist is rebuilt from
# the bags further down.
train_wordlist = set(positive + negative)

# Tokenized glosses for every training word; also serves as the
# "document" corpus that make_bag needs for IDF weighting.
train_defs = [get_defs(word) for word in (positive + negative)]

train_bags = [make_bag(get_defs(word), train_defs) for word in positive] + [make_bag(get_defs(word), train_defs) for word in negative]

# +1 label for each positive seed word, -1 for each negative one.
train_labels = [1 for word in positive] + [-1 for word in negative]
train_wordlist = []

# The classifier needs vectors, not dicts. So we need to convert them to vectors.
# Make a list of all the words contained in them, then make an array with entries
# corresponding to each word.
for bag in train_bags:
for word in bag.keys():
if not (word in train_wordlist):
train_wordlist.append(word)
train_wordlist = sorted(train_wordlist)

train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags]
#classifier = MultinomialNB()
# NOTE(review): the SVC(kernel="linear") instance is immediately
# replaced by LinearSVC (diff/experiment residue); only LinearSVC is fit.
classifier = svm.SVC(kernel="linear")
classifier = svm.LinearSVC()
classifier.fit(train_vecs, train_labels)

# Load the test set
# NOTE(review): the next two statements are the old (pre-commit)
# test-set load, superseded by the MPQALexicon.load(True) block below.
(test_words, test_labels) = MPQALexicon.load()
#test_words = string.join(list(movie_reviews.words(fileids=ids)))
test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=True)
test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:NUM_TEST_WORDS]
# Load the test set. I'm only using the bag of words structure here to select the words
# with a certain word count threshold.
(test_words, test_labels) = MPQALexicon.load(True)

# Raw word counts over the MPQA words, used only to rank the wordlist.
test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False)
# Most frequent first; the commented-out slice caps the list size.
test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:500]
# NOTE(review): test_bags/test_wordlist2 and the loop below are the old
# version; the filter + list-comprehension pair after them is the new one.
test_bags = []
test_wordlist2 = []
for word in test_wordlist:
    defs = get_defs(word)
    if defs != '':
    test_wordlist2.append(word)
    test_bags.append(make_bag(defs))

# Keep only non-empty entries, then bag each word's glosses against the
# training corpus so the TF-IDF weights are comparable.
test_wordlist = filter(lambda x: x != '', test_wordlist)
test_bags = [make_bag(get_defs(word), train_defs) for word in test_wordlist]

# Project test bags onto the training vocabulary and classify each word.
test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags]
predicted_labels = classifier.predict(test_vecs)
word_labels = {}

# NOTE(review): old/new duplicate loops — only the test_wordlist version
# matches the current pipeline; test_wordlist2 is the pre-commit name.
for i in range(len(test_wordlist2)):
key = test_wordlist2[i]
word_labels[key] = predicted_labels[i]
for i in range(len(test_wordlist)):
key = test_wordlist[i]
word_labels[key] = predicted_labels[i]

# Split the predicted lexicon into positive and negative word lists
# (again an old test_wordlist2 pair followed by the live pair).
pos_words = [w for w in test_wordlist2 if word_labels[w] > 0]
neg_words = [w for w in test_wordlist2 if word_labels[w] < 0]
pos_words = [w for w in test_wordlist if word_labels[w] > 0]
neg_words = [w for w in test_wordlist if word_labels[w] < 0]

# Use the same number of positive and negative words.
length = min(len(pos_words), len(neg_words))
Expand All @@ -134,36 +134,45 @@ def bag_to_vec(bag, wordlist):
# Map every classified word to a +/-1 sentiment score.
word_labels2 = {}
for word in pos_words:
word_labels2[word] = 1

for word in neg_words:
word_labels2[word] = -1


# Dump the learned lexicon to disk for manual inspection.
f = open('fuck.txt', 'w')
f.write("[POS]\n\n")
f.write(string.join(pos_words,"\n"))
f.write("\n\n[NEG]\n\n")
f.write(string.join(neg_words,"\n"))
f.close()
#exit()

# Iterate through all of the reviews and find sentiment
correct = 0
positive = 0
ids = sorted(movie_reviews.fileids())
scores = []

# Score each review as the sum of its words' lexicon values.
# NOTE(review): the unindented statements right after the `for` line are
# pre-commit diff residue; the indented copies below are the live body.
for review_id in ids:
words = movie_reviews.words(fileids=[review_id])
score = 0
for word in words:
if word_labels2.has_key(word):
score += word_labels2[word]
scores.append(score)
    words = movie_reviews.words(fileids=[review_id])
    score = 0
    for word in words:
        if word_labels2.has_key(word):
            score += word_labels2[word]
    scores.append(score)

avg_score = float(sum(scores))/len(scores)
# Classify each review by its score. The paired avg_score/0 conditions
# are old/new: the mean-score threshold was replaced by a fixed 0.
for i in range(len(ids)):
id = ids[i]
score = scores[i]
if score >= avg_score:
if score >= 0:#avg_score:
sent_value = "pos"
positive += 1
elif score < avg_score:
elif score < 0:#avg_score:
sent_value = "neg"
label = movie_reviews.categories(fileids=[id])[0]
if sent_value == label:
correct += 1

# Report accuracy and predicted-positive rate over movie_reviews.
print "correct:", float(correct)/len(ids)
print "positive:", float(positive)/len(ids)
print "avg:", avg_score
#print "avg:", avg_score
0 comments on commit 35c95dc

Please sign in to comment.