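"""Build a sentiment lexicon of adjectives.

Small positive and negative seed sets are expanded through WordNet synonyms
and antonyms (following Esuli and Sebastiani), a classifier is trained on
tf-idf bags of the expanded words' WordNet glosses, and the trained
classifier then labels the test words to produce the lexicon.
"""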
from __future__ import division, print_function
import nltk
from nltk.corpus import wordnet as wn
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

import BagOfWords
EXPAND_ITERATIONS = 2
CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
REMOVE_STOPWORDS = False
USE_STEMMING = False # sync this up with eval!
USE_EXAMPLES = True
USE_EQUAL_TRAINING = True
USE_EQUAL_TEST = True
POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']
# Return the tokenized WordNet "gloss" of an adjective: its lemma names,
# definitions, and (optionally) usage examples, concatenated and tokenized.
def get_defs(word):
    defs = []
    for synset in wn.synsets(word, pos=wn.ADJ):
        defs += synset.lemma_names()
        defs.append(synset.definition())
        if USE_EXAMPLES:
            defs += synset.examples()
    tokens = nltk.word_tokenize(' '.join(defs))
    if USE_STEMMING:
        tokens = do_stem(tokens)
    if REMOVE_STOPWORDS:
        stopwords = set(nltk.corpus.stopwords.words('english'))
        if USE_STEMMING:
            # Stem the stopwords too so they still match stemmed tokens.
            stopwords = set(do_stem(stopwords))
        tokens = [x for x in tokens if x not in stopwords]
    return tokens
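# For illustration (actual output depends on the installed WordNet version),
# get_defs('good') yields tokens drawn from glosses such as "having desirable
# or positive qualities", e.g.:
#   ['good', 'having', 'desirable', 'or', 'positive', 'qualities', ...]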
# Return a tf-idf bag for `text`, weighted against `documents`; both are
# pre-tokenized.
def make_bag(text, documents):
    return BagOfWords.make_tfidf(text, documents)
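# As used in this module, the local BagOfWords helper is assumed to expose:
#   make_tfidf(tokens, documents) -> dict mapping word -> tf-idf weight
#   to_vector(bag, wordlist)      -> fixed-length feature vector over wordlist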
# Esuli and Sebastiani's algorithm to expand seed sets using WordNet:
# synonyms keep a word's polarity, antonyms flip it, and a word claimed by
# both polarities is moved to the neutral set.
def expand_sets(positive, negative, neutral):
    newPositive = set(positive)
    newNegative = set(negative)
    newNeutral = set(neutral)
    for word in positive:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name()
                if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                    newPositive.add(curr)
                elif curr in newNegative:
                    newNegative.discard(curr)
                    newNeutral.add(curr)
                for antonym in lemma.antonyms():
                    ant = antonym.name()
                    if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
                        newNegative.add(ant)
                    elif ant in newPositive:
                        newPositive.discard(ant)
                        newNeutral.add(ant)
    for word in negative:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name()
                if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                    newNegative.add(curr)
                elif curr in newPositive:
                    newPositive.discard(curr)
                    newNeutral.add(curr)
                for antonym in lemma.antonyms():
                    ant = antonym.name()
                    if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
                        newPositive.add(ant)
                    elif ant in newNegative:
                        newNegative.discard(ant)
                        newNeutral.add(ant)
    return (newPositive, newNegative, newNeutral)
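# Illustrative single expansion step (results vary with the WordNet version):
# starting from positive = ['good'], synonym lemmas of 'good' such as 'full'
# and 'estimable' join the positive set, while antonyms such as 'bad' and
# 'evil' join the negative set. EXPAND_ITERATIONS passes grow both sets
# before training.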
# Porter-stem every token in an iterable of tokens.
def do_stem(text):
    stemmer = nltk.stem.porter.PorterStemmer()
    return [stemmer.stem(word) for word in text]
def create(test_words, test_labels):
    # Set up the initial seed sets S_p and S_n.
    neutral = []
    #positive = ['good']
    #negative = ['bad']
    positive = list(POS_SEED)
    negative = list(NEG_SEED)
    # Expand the seed sets to get S_p' and S_n'.
    for num in range(EXPAND_ITERATIONS):
        (positive, negative, neutral) = expand_sets(positive, negative, neutral)
    if USE_STEMMING:
        positive = list(set(do_stem(positive)))
        negative = list(set(do_stem(negative)))
    # Use the same number of positive and negative training words.
    if USE_EQUAL_TRAINING:
        length = min(len(positive), len(negative))
        positive = list(positive)[:length]
        negative = list(negative)[:length]
    # Train the classifier on tf-idf bags of the expanded words' glosses.
    # The glosses are fetched once and reused for both the document set and
    # the individual bags.
    train_defs = [get_defs(word) for word in (positive + negative)]
    train_bags = [make_bag(tdef, train_defs) for tdef in train_defs]
    train_labels = [1 for word in positive] + [-1 for word in negative]
    # The classifier needs vectors, not dicts, so convert each bag to a
    # vector whose entries correspond to the words seen in the training
    # definitions.
    train_wordlist = set()
    for tdef in train_defs:
        for word in tdef:
            train_wordlist.add(word)
    train_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in train_bags]
    if CLASSIFIER == "nb":
        classifier = MultinomialNB()
    elif CLASSIFIER == "svm":
        classifier = LinearSVC()
    elif CLASSIFIER == "me":
        classifier = LogisticRegression()
    else:
        raise ValueError("Unknown CLASSIFIER: " + CLASSIFIER)
    classifier.fit(train_vecs, train_labels)
    # Classify the test words by the bags of their glosses and report
    # accuracy against the gold labels.
    test_defs = [get_defs(word) for word in test_words]
    test_bags = [make_bag(tdef, test_defs) for tdef in test_defs]
    test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]
    predicted_labels = classifier.predict(test_vecs)
    correct = 0
    for i in range(len(test_labels)):
        if test_labels[i] == predicted_labels[i]:
            correct += 1
    print("Lexicon accuracy:", correct / len(test_labels))
    word_labels = {}
    for i in range(len(test_words)):
        word_labels[test_words[i]] = predicted_labels[i]
    pos_words = set([w for w in test_words if word_labels[w] > 0])
    neg_words = set([w for w in test_words if word_labels[w] < 0])
    # Use the same number of positive and negative words.
    if USE_EQUAL_TEST:
        length = min(len(pos_words), len(neg_words))
        pos_words = list(pos_words)[:length]
        neg_words = list(neg_words)[:length]
    # Build the lexicon: +1 for positive words, -1 for negative words.
    lexicon = {}
    for word in pos_words:
        lexicon[word] = 1
    for word in neg_words:
        lexicon[word] = -1
    return lexicon
#create(test_words, test_labels)
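# A minimal usage sketch (hypothetical: `test_words` and `test_labels` would
# normally come from a labeled evaluation set supplied by the caller, and the
# NLTK wordnet, punkt, and stopwords data must be installed):
if __name__ == "__main__":
    test_words = ['happy', 'terrible', 'pleasant', 'awful']
    test_labels = [1, -1, 1, -1]
    lexicon = create(test_words, test_labels)
    print(lexicon)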