
Commit c458117

meh

job13011 committed Mar 31, 2016
1 parent 77869d0 commit c458117
Showing 5 changed files with 125 additions and 43 deletions.
97 changes: 61 additions & 36 deletions GlossCountJWB.py
@@ -8,15 +8,17 @@
 import string
 import random
 import BagOfWords
-from sklearn.naive_bayes import MultinomialNB
 from sklearn import svm
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import LogisticRegression
 import MPQALexicon
 import numpy
 
 def get_defs(word):
     return string.join([synset.definition() for synset in wn.synsets(word, pos=wn.ADJ)])
 
 def make_bag(text):
-    return BagOfWords.make(text, normalize=True, use_negation=False, use_hash=False, use_presence=False)
+    return BagOfWords.make(text, normalize=True, use_negation=True, use_hash=False, use_presence=True)
 
 def expand_sets(positive,negative,neutral):
     newPositive = set(positive)
@@ -25,42 +27,54 @@ def expand_sets(positive,negative,neutral):
     for word in positive:
         for syn in wn.synsets(word, pos=wn.ADJ):
             for lemma in syn.lemmas():
-                curr = lemma.name().split('.')[0]
+                curr = lemma.name()
                 if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                     newPositive.add(curr)
                 elif curr in newNegative:
                     newNegative.discard(curr)
                     newNeutral.add(curr)
+                for antonym in lemma.antonyms():
+                    ant = antonym.name()
+                    if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
+                        newNegative.add(ant)
+                    elif ant in newPositive:
+                        newPositive.discard(ant)
+                        newNeutral.add(ant)
     for word in negative:
         for syn in wn.synsets(word, pos=wn.ADJ):
             for lemma in syn.lemmas():
-                curr = lemma.name().split('.')[0]
+                curr = lemma.name()
                 if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                     newNegative.add(curr)
                 elif curr in newPositive:
                     newPositive.discard(curr)
                     newNeutral.add(curr)
+                for antonym in lemma.antonyms():
+                    ant = antonym.name()
+                    if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
+                        newPositive.add(ant)
+                    elif ant in newNegative:
+                        newNegative.discard(ant)
+                        newNeutral.add(ant)
     return (newPositive, newNegative, newNeutral)
 
 def bag_to_vec(bag, wordlist):
     vec = []
     for word in wordlist:
         if bag.has_key(word):
             vec.append(bag[word])
         else:
             vec.append(0)
     return vec
 
 # Set up initial Sets S_p and S_n
 #positive = Set(['Good'])
 #negative = Set(['Bad'])
-neutral = Set([''])
+neutral = Set([])
 positive = Set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'])
 negative = Set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'])
 
 # Expand on Sets to get S_p' and S_n'
-for num in range(2):
+for num in range(3):
     newsets = expand_sets(positive,negative,neutral);
     positive = set(newsets[0])
     negative = set(newsets[1])
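Note: the antonym propagation added above leans on NLTK's Lemma.antonyms(). A minimal sketch of one expansion pass (Python 2 like the rest of the repo; the exact words returned depend on the installed WordNet version):

from nltk.corpus import wordnet as wn

positive = set(['good'])
negative = set(['bad'])

# One pass: pull in adjective synonyms of each positive word and
# push their antonyms toward the negative set.
for syn in wn.synsets('good', pos=wn.ADJ):
    for lemma in syn.lemmas():
        positive.add(lemma.name())
        for antonym in lemma.antonyms():
            negative.add(antonym.name())

print sorted(positive)  # 'good' plus synonyms such as 'estimable', 'honorable'
print sorted(negative)  # 'bad' plus antonyms such as 'evil', 'ill'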
@@ -85,18 +99,14 @@ def bag_to_vec(bag, wordlist):
         train_wordlist.append(word)
 train_wordlist = sorted(train_wordlist)
 train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags]
-classifier = MultinomialNB()
-#classifier = svm.SVC(kernel="linear")
+#classifier = MultinomialNB()
+classifier = svm.SVC(kernel="linear")
 classifier.fit(train_vecs, train_labels)
-# Iterate through all of the reviews and find sentiment
-count = 0
-correct = 0
-ids = sorted(movie_reviews.fileids())
 
 # Load the test set
 (test_words, test_labels) = MPQALexicon.load()
 #test_words = string.join(list(movie_reviews.words(fileids=ids)))
-test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False)
+test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=True)
 test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:NUM_TEST_WORDS]
 test_bags = []
 test_wordlist2 = []
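Swapping MultinomialNB for a linear SVC above is a one-line change because both sklearn classifiers share the same fit/predict interface; a toy sketch (made-up vectors and labels, not data from this repo):

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

train_vecs = [[2, 0, 1], [0, 3, 0], [1, 0, 2]]
train_labels = ["pos", "neg", "pos"]

# Either model drops into the same two calls used in GlossCountJWB.py.
for classifier in (MultinomialNB(), svm.SVC(kernel="linear")):
    classifier.fit(train_vecs, train_labels)
    print classifier.predict([[1, 1, 1]])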
@@ -105,7 +115,7 @@ def bag_to_vec(bag, wordlist):
     if defs != '':
         test_wordlist2.append(word)
         test_bags.append(make_bag(defs))
 
 test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags]
 predicted_labels = classifier.predict(test_vecs)
 word_labels = {}
@@ -116,29 +126,44 @@ def bag_to_vec(bag, wordlist):

 pos_words = [w for w in test_wordlist2 if word_labels[w] > 0]
 neg_words = [w for w in test_wordlist2 if word_labels[w] < 0]
 
 # Use the same number of positive and negative words.
 length = min(len(pos_words), len(neg_words))
 pos_words = pos_words[:length]
 neg_words = neg_words[:length]
+word_labels2 = {}
+for word in pos_words:
+    word_labels2[word] = 1
+for word in neg_words:
+    word_labels2[word] = -1
 
-scores = {}
+# Iterate through all of the reviews and find sentiment
+correct = 0
+positive = 0
+ids = sorted(movie_reviews.fileids())
+scores = []
 
 for review_id in ids:
     words = movie_reviews.words(fileids=[review_id])
     score = 0
     for word in words:
-        if word in pos_words:
-            score += 1
-        elif word in neg_words:
-            score -= 1
-    if (score >= 0):
+        if word_labels2.has_key(word):
+            score += word_labels2[word]
+    scores.append(score)
+
+avg_score = float(sum(scores))/len(scores)
+for i in range(len(ids)):
+    id = ids[i]
+    score = scores[i]
+    if score >= avg_score:
         sent_value = "pos"
-        print "Positive (%s)" % review_id
-    else:
+        positive += 1
+    elif score < avg_score:
         sent_value = "neg"
-        print "Negative (%s)" % review_id
-    if (sent_value == movie_reviews.categories(fileids=[review_id])[0]):
+    label = movie_reviews.categories(fileids=[id])[0]
+    if sent_value == label:
         correct += 1
-    count += 1
-    scores[review_id] = score
 
-print "correct:", float(correct)/count
+print "correct:", float(correct)/len(ids)
+print "positive:", float(positive)/len(ids)
+print "avg:", avg_score
4 changes: 2 additions & 2 deletions MPQALexicon.py
@@ -9,8 +9,8 @@ def load():
         fields = line.split(" ")
         fields = [field for field in fields if "=" in field] #ugh, two lines have a random extra char in them
         d = dict([field.rstrip().split("=") for field in fields])
-        (word, label, pos) = d["word1"], d["priorpolarity"], d["pos1"]
-        if pos == "adj":
+        (word, label, pos, type) = d["word1"], d["priorpolarity"], d["pos1"], d["type"]
+        if pos == "adj":# and type == "strongsubj":
             if label == "positive":
                 words.append(word)
                 labels.append("pos")
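Each line of the MPQA subjectivity lexicon is a run of key=value fields, which is why load() can reduce a line to a dict; a sketch on one sample entry (illustrative line, not copied from the shipped file):

line = "type=strongsubj len=1 word1=excellent pos1=adj stemmed1=n priorpolarity=positive"
fields = [field for field in line.split(" ") if "=" in field]
d = dict([field.rstrip().split("=") for field in fields])
print d["word1"], d["pos1"], d["type"], d["priorpolarity"]
# excellent adj strongsubj positive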
11 changes: 11 additions & 0 deletions asdf.py
@@ -0,0 +1,11 @@
+import nltk
+from nltk.corpus import wordnet
+from nltk.corpus import movie_reviews
+from nltk.classify import NaiveBayesClassifier
+
+word = "good"
+syns = wordnet.synsets(word)
+for syn in syns:
+    lemmas = syn.lemmas()
+    for lemma in lemmas:
+        if lemma.antonyms() != []: print lemma.antonyms()
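asdf.py is a scratch probe of the antonym API: for word = "good" it prints the antonym lemma list for each sense that has one, e.g. [Lemma('bad.a.01.bad')] for the main adjective sense (exact output varies with the installed WordNet version).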
36 changes: 36 additions & 0 deletions pos.py
@@ -0,0 +1,36 @@
+import nltk
+import os
+import string
+
+"""
+POS tagging is really slow compared to SVM training and prediction.
+This script processes the reviews beforehand, applies the NLTK POS tagger,
+and saves them in a new folder.
+"""
+
+POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
+NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
+POS_TAGGED_FOLDER = os.path.join("review_polarity","txt_sentoken","pos_tagged")
+NEG_TAGGED_FOLDER = os.path.join("review_polarity","txt_sentoken","neg_tagged")
+
+for (folder_name, tagged_folder_name) in [(POS_FOLDER, POS_TAGGED_FOLDER), (NEG_FOLDER, NEG_TAGGED_FOLDER)]:
+    filenames = []
+    for (folder, x, folder_filenames) in os.walk(folder_name):
+        for filename in folder_filenames:
+            if filename.endswith(".txt"):
+                filenames.append(os.path.join(folder, filename))
+    for filename in filenames:
+        f = open(filename)
+        lines = f.readlines()
+        f.close()
+        text = string.join(lines, " ")
+
+        tokens = nltk.word_tokenize(text)
+        tagged = nltk.pos_tag(tokens)
+        tagged = [string.join(t, "_") for t in tagged]
+        tagged = string.join(tagged, " ")
+        tagged_filename = os.path.join(tagged_folder_name, os.path.split(filename)[-1])
+        f = open(tagged_filename, "w")
+        f.write(tagged)
+        f.close()
+        print "Tagged & saved file", tagged_filename
20 changes: 15 additions & 5 deletions review_svm.py
@@ -29,6 +29,7 @@
 USE_POS_TAGS = False
 USE_ADJ_ONLY = False
 USE_NEGATION = True
+USE_POSITION = False
 GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
 NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
 
@@ -60,6 +61,13 @@ def partition_filenames(filenames, num_partitions):
     for i in range(len(filenames)):
         partitions[i % num_partitions].append(filenames[i])
     return partitions
+
+def make_bag(text, total_word_counts):
+    return BagOfWords.make(text, ref_bag=total_word_counts,
+        gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE,
+        use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
+        normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION,
+        use_position=USE_POSITION)
 
 
 # Set parameters from command-line arguments.
@@ -91,6 +99,9 @@ def partition_filenames(filenames, num_partitions):
     elif args[i] == "--no-negation":
         USE_NEGATION = False
         i += 1
+    elif args[i] == "--use-position":
+        USE_POSITION = True
+        i += 1
     elif args[i] == "--threshold":
         MIN_OCCURRENCES = int(args[i+1])
         i += 2
@@ -104,7 +115,9 @@ def partition_filenames(filenames, num_partitions):
print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
print "--use-negation\t\tTag words appearing after a negation word (Default: Off)"
print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
print "--use-position\t\tTag words according to their position in the text (Default: Off)"
print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
@@ -138,13 +151,10 @@ def partition_filenames(filenames, num_partitions):

 for i in range(NUM_FOLDS):
     for filename in pos_folds[i]:
-        pos_fold_bags[i].append(BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
-            use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))
+        pos_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
 
     for filename in neg_folds[i]:
-        neg_fold_bags[i].append(
-            BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
-            use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))
+        neg_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
 
 # Remove words with less than the minimum occurrences threshold.
 for k in total_word_counts.keys():
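With the flag wired into both the argument loop and the usage text, an example invocation that exercises the new option (flags as listed in the script's usage output):

python review_svm.py --use-position --presence --threshold 4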
