# GlossCountJWB.py

import math
import nltk
from nltk.corpus import wordnet as wn
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sets import Set
import string
import random
import BagOfWords
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import MPQALexicon

def get_defs(word):
    """Return all WordNet adjective glosses of ``word`` as a single string.

    The definition of every adjective synset of ``word`` is collected and the
    definitions are joined with a single space.  A word that has no adjective
    synsets yields the empty string.
    """
    # str.join replaces the long-deprecated string.join(); the separator is
    # the same single space that string.join() used by default.
    return " ".join(synset.definition() for synset in wn.synsets(word, pos=wn.ADJ))

def make_bag(text):
    """Build the bag-of-words representation used for gloss text.

    Delegates to ``BagOfWords.make`` with normalization switched on and the
    negation, hashing and presence options switched off.
    """
    options = dict(
        normalize=True,
        use_negation=False,
        use_hash=False,
        use_presence=False,
    )
    return BagOfWords.make(text, **options)

def _absorb_synonyms(seeds, own, opposite, neutral):
    """Fold the adjective synonyms of ``seeds`` into ``own``, in place.

    For every lemma of every adjective synset of each seed word:
    a lemma currently in ``opposite`` is ambiguous, so it is removed from
    ``opposite`` and recorded in ``neutral``; otherwise it is added to ``own``
    unless it is already known to be neutral.
    """
    for word in seeds:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name().split('.')[0]
                if curr in opposite:
                    opposite.discard(curr)
                    neutral.add(curr)
                elif curr not in neutral:
                    # Adding a word that is already in ``own`` is a no-op.
                    own.add(curr)


def expand_sets(positive, negative, neutral):
    """Grow the sentiment word sets by one round of WordNet synonym expansion.

    The positive words are processed first, then the negative words, each
    against the sets as already updated by the previous step.  A synonym that
    would land in one polarity while being present in the other is moved to
    the neutral set instead.  The arguments themselves are not modified.

    Args:
        positive: iterable of words currently considered positive.
        negative: iterable of words currently considered negative.
        neutral: iterable of words currently considered neutral.

    Returns:
        A tuple ``(new_positive, new_negative, new_neutral)`` of new sets.
    """
    new_positive = set(positive)
    new_negative = set(negative)
    new_neutral = set(neutral)
    _absorb_synonyms(positive, new_positive, new_negative, new_neutral)
    _absorb_synonyms(negative, new_negative, new_positive, new_neutral)
    return (new_positive, new_negative, new_neutral)

def bag_to_vec(bag, wordlist):
    """Convert a bag-of-words mapping into a dense vector.

    Args:
        bag: mapping from word to the value stored for that word.
        wordlist: sequence of words that fixes the order of the entries.

    Returns:
        A list with one entry per word of ``wordlist``: the value stored in
        ``bag`` for that word, or 0 when the word is absent.
    """
    # dict.get replaces dict.has_key(), which is deprecated and does not
    # exist on Python 3 dictionaries.
    return [bag.get(word, 0) for word in wordlist]

# Seed sets S_p and S_n: a handful of clearly positive / negative adjectives.
# The built-in set type is used because the `sets` module is deprecated.
neutral = set([''])
positive = set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'])
negative = set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'])

# Expand the seeds with WordNet synonyms (two rounds) to get S_p' and S_n'.
# expand_sets() already returns fresh sets, so no extra copy is needed.
for _ in range(2):
    positive, negative, neutral = expand_sets(positive, negative, neutral)

# Use the same number of positive and negative words.  The sample size is
# computed once, and the sets are sorted into lists first because
# random.sample() requires a sequence on recent Python versions.
sample_size = min(len(positive), len(negative))
positive = random.sample(sorted(positive), sample_size)
negative = random.sample(sorted(negative), sample_size)

# Learn Classifier
# Each training example is the bag of words built from the glosses of one
# seed word, labelled +1 for a positive word and -1 for a negative word.
train_bags = [make_bag(get_defs(word)) for word in positive] + [make_bag(get_defs(word)) for word in negative]
train_labels = [1] * len(positive) + [-1] * len(negative)

# The classifier needs vectors, not dicts.  So we need to convert them to vectors.
# Collect every word that occurs in any training bag (a set de-duplicates
# without rescanning a list for each word), sort the vocabulary to fix the
# column order, then build one vector per bag.
train_vocab = set()
for bag in train_bags:
    train_vocab.update(bag.keys())
train_wordlist = sorted(train_vocab)
train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags]

# Alternative classifier: svm.SVC(kernel="linear")
classifier = MultinomialNB()
classifier.fit(train_vecs, train_labels)
# Counters and file ids used by the review-scoring loop further down.
count = 0
correct = 0
ids = sorted(movie_reviews.fileids())

# Load the test set
(test_words, test_labels) = MPQALexicon.load()
# str.join replaces the deprecated string.join(); same single-space separator.
test_wordlist_bag = BagOfWords.make(" ".join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False)
# Order the candidate words by descending bag value (presumably the word's
# frequency -- confirm against BagOfWords.make).
test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])

# Keep only the words that have at least one adjective gloss; test_wordlist2
# and test_bags stay index-aligned.
test_bags = []
test_wordlist2 = []
for word in test_wordlist:
    defs = get_defs(word)
    if defs:
        test_wordlist2.append(word)
        test_bags.append(make_bag(defs))

# Vectorize against the training vocabulary and predict a label per word.
test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags]
predicted_labels = classifier.predict(test_vecs)
word_labels = dict(zip(test_wordlist2, predicted_labels))

pos_words = [w for w in test_wordlist2 if word_labels[w] > 0]
neg_words = [w for w in test_wordlist2 if word_labels[w] < 0]
# Use the same number of positive and negative words.  Slicing keeps the
# words that come first in the ordering established above.
length = min(len(pos_words), len(neg_words))
pos_words = pos_words[:length]
neg_words = neg_words[:length]

# Score every review: +1 for each token predicted positive, -1 for each token
# predicted negative.  Membership is tested against sets because the word
# lists would otherwise be scanned once for every token of every review.
pos_lookup = set(pos_words)
neg_lookup = set(neg_words)

scores = {}
for review_id in ids:
    words = movie_reviews.words(fileids=[review_id])
    score = 0
    for word in words:
        if word in pos_lookup:
            score += 1
        elif word in neg_lookup:
            score -= 1
    # A tie (score == 0) is classified as positive.
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    if score >= 0:
        sent_value = "pos"
        print("Positive (%s)" % review_id)
    else:
        sent_value = "neg"
        print("Negative (%s)" % review_id)
    # The first category listed for the review is taken as the gold label.
    if sent_value == movie_reviews.categories(fileids=[review_id])[0]:
        correct += 1
    count += 1
    scores[review_id] = score

# Fraction of reviews whose predicted label matched the corpus category.
print("correct: %s" % (float(correct) / count))
	import math
	import nltk
	from nltk.corpus import wordnet as wn
	import nltk.classify.util
	from nltk.classify import NaiveBayesClassifier
	from nltk.corpus import movie_reviews
	from sets import Set
	import string
	import random
	import BagOfWords
	from sklearn.naive_bayes import MultinomialNB
	from sklearn import svm
	import MPQALexicon

	def get_defs(word):
	    """Return all WordNet adjective glosses of ``word`` as a single string.

	    The definition of every adjective synset of ``word`` is collected and
	    the definitions are joined with a single space.  A word that has no
	    adjective synsets yields the empty string.
	    """
	    # The body indentation was lost in this copy and is restored here.
	    # str.join replaces the long-deprecated string.join(); the separator
	    # is the same single space that string.join() used by default.
	    return " ".join(synset.definition() for synset in wn.synsets(word, pos=wn.ADJ))

	def make_bag(text):
	    """Build the bag-of-words representation used for gloss text.

	    Delegates to ``BagOfWords.make`` with normalization switched on and
	    the negation, hashing and presence options switched off.
	    """
	    # The body indentation was lost in this copy and is restored here.
	    return BagOfWords.make(text, normalize=True, use_negation=False, use_hash=False, use_presence=False)

	def _absorb_synonyms(seeds, own, opposite, neutral):
	    """Fold the adjective synonyms of ``seeds`` into ``own``, in place.

	    For every lemma of every adjective synset of each seed word:
	    a lemma currently in ``opposite`` is ambiguous, so it is removed from
	    ``opposite`` and recorded in ``neutral``; otherwise it is added to
	    ``own`` unless it is already known to be neutral.
	    """
	    for word in seeds:
	        for syn in wn.synsets(word, pos=wn.ADJ):
	            for lemma in syn.lemmas():
	                curr = lemma.name().split('.')[0]
	                if curr in opposite:
	                    opposite.discard(curr)
	                    neutral.add(curr)
	                elif curr not in neutral:
	                    # Adding a word already in ``own`` is a no-op.
	                    own.add(curr)


	def expand_sets(positive, negative, neutral):
	    """Grow the sentiment word sets by one round of synonym expansion.

	    The positive words are processed first, then the negative words, each
	    against the sets as already updated by the previous step.  A synonym
	    that would land in one polarity while being present in the other is
	    moved to the neutral set instead.  The arguments are not modified.

	    Args:
	        positive: iterable of words currently considered positive.
	        negative: iterable of words currently considered negative.
	        neutral: iterable of words currently considered neutral.

	    Returns:
	        A tuple ``(new_positive, new_negative, new_neutral)`` of new sets.
	    """
	    # The nesting of this function was lost in this copy; it is restored
	    # here, with the two mirrored loops factored into one helper.
	    new_positive = set(positive)
	    new_negative = set(negative)
	    new_neutral = set(neutral)
	    _absorb_synonyms(positive, new_positive, new_negative, new_neutral)
	    _absorb_synonyms(negative, new_negative, new_positive, new_neutral)
	    return (new_positive, new_negative, new_neutral)

	def bag_to_vec(bag, wordlist):
	    """Convert a bag-of-words mapping into a dense vector.

	    Args:
	        bag: mapping from word to the value stored for that word.
	        wordlist: sequence of words that fixes the order of the entries.

	    Returns:
	        A list with one entry per word of ``wordlist``: the value stored
	        in ``bag`` for that word, or 0 when the word is absent.
	    """
	    # The body indentation was lost in this copy and is restored here.
	    # dict.get replaces dict.has_key(), which is deprecated and does not
	    # exist on Python 3 dictionaries.
	    return [bag.get(word, 0) for word in wordlist]

	# Seed sets S_p and S_n: a handful of clearly positive / negative adjectives.
	# The built-in set type is used because the `sets` module is deprecated.
	neutral = set([''])
	positive = set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'])
	negative = set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'])

	# Expand the seeds with WordNet synonyms (two rounds) to get S_p' and S_n'.
	# The loop body lost its indentation in this copy; it is restored here.
	for _ in range(2):
	    positive, negative, neutral = expand_sets(positive, negative, neutral)

	# Use the same number of positive and negative words.  The sample size is
	# computed once, and the sets are sorted into lists first because
	# random.sample() requires a sequence on recent Python versions.
	sample_size = min(len(positive), len(negative))
	positive = random.sample(sorted(positive), sample_size)
	negative = random.sample(sorted(negative), sample_size)

	# Learn Classifier
	# Each training example is the bag of words built from the glosses of one
	# seed word, labelled +1 for a positive word and -1 for a negative word.
	train_bags = [make_bag(get_defs(word)) for word in positive] + [make_bag(get_defs(word)) for word in negative]
	train_labels = [1] * len(positive) + [-1] * len(negative)

	# The classifier needs vectors, not dicts. So we need to convert them to vectors.
	# Collect every word that occurs in any training bag (a set de-duplicates
	# without rescanning a list for each word), sort the vocabulary to fix the
	# column order, then build one vector per bag.  The loop nesting was lost
	# in this copy and is restored here.
	train_vocab = set()
	for bag in train_bags:
	    train_vocab.update(bag.keys())
	train_wordlist = sorted(train_vocab)
	train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags]

	# Alternative classifier: svm.SVC(kernel="linear")
	classifier = MultinomialNB()
	classifier.fit(train_vecs, train_labels)
	# Counters and file ids used by the review-scoring loop further down.
	count = 0
	correct = 0
	ids = sorted(movie_reviews.fileids())

	# Load the test set
	(test_words, test_labels) = MPQALexicon.load()
	# str.join replaces the deprecated string.join(); same single-space separator.
	test_wordlist_bag = BagOfWords.make(" ".join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False)
	# Order the candidate words by descending bag value (presumably the word's
	# frequency -- confirm against BagOfWords.make).
	test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])

	# Keep only the words that have at least one adjective gloss; test_wordlist2
	# and test_bags stay index-aligned.  The loop nesting was lost in this copy
	# and is restored here.
	test_bags = []
	test_wordlist2 = []
	for word in test_wordlist:
	    defs = get_defs(word)
	    if defs:
	        test_wordlist2.append(word)
	        test_bags.append(make_bag(defs))

	# Vectorize against the training vocabulary and predict a label per word.
	test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags]
	predicted_labels = classifier.predict(test_vecs)
	word_labels = dict(zip(test_wordlist2, predicted_labels))

	pos_words = [w for w in test_wordlist2 if word_labels[w] > 0]
	neg_words = [w for w in test_wordlist2 if word_labels[w] < 0]
	# Use the same number of positive and negative words.  Slicing keeps the
	# words that come first in the ordering established above.
	length = min(len(pos_words), len(neg_words))
	pos_words = pos_words[:length]
	neg_words = neg_words[:length]

	# Score every review: +1 for each token predicted positive, -1 for each
	# token predicted negative.  Membership is tested against sets because the
	# word lists would otherwise be scanned once for every token of every
	# review.  The loop nesting was lost in this copy and is restored here.
	pos_lookup = set(pos_words)
	neg_lookup = set(neg_words)

	scores = {}
	for review_id in ids:
	    words = movie_reviews.words(fileids=[review_id])
	    score = 0
	    for word in words:
	        if word in pos_lookup:
	            score += 1
	        elif word in neg_lookup:
	            score -= 1
	    # A tie (score == 0) is classified as positive.
	    # Single-argument print(...) produces the same output under the
	    # Python 2 print statement and the Python 3 print function.
	    if score >= 0:
	        sent_value = "pos"
	        print("Positive (%s)" % review_id)
	    else:
	        sent_value = "neg"
	        print("Negative (%s)" % review_id)
	    # The first category listed for the review is taken as the gold label.
	    if sent_value == movie_reviews.categories(fileids=[review_id])[0]:
	        correct += 1
	    count += 1
	    scores[review_id] = score

	# Fraction of reviews whose predicted label matched the corpus category.
	print("correct: %s" % (float(correct) / count))