from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import wordnet as wn
import BagOfWords
EXPAND_ITERATIONS = 2
CLASSIFIER = "svm"  # "nb" = Naive Bayes, "svm" = linear SVM, "me" = maximum entropy (logistic regression)
REMOVE_STOPWORDS = False
USE_STEMMING = False # sync this up with eval!
USE_EXAMPLES = True
USE_EQUAL_TRAINING = True
USE_EQUAL_TEST = True
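# Seven-word paradigm seed sets (the same seeds used by Turney and Littman, 2003).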
POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']
# Return the tokenized WordNet "glosses" for an adjective: lemma names,
# definitions, and (optionally) usage examples, flattened into one token list.
def get_defs(word):
    defs = []
    for synset in wn.synsets(word, pos=wn.ADJ):
        defs += synset.lemma_names()
        defs.append(synset.definition())
        if USE_EXAMPLES:
            defs += synset.examples()
    tokens = nltk.word_tokenize(' '.join(defs))
    if USE_STEMMING:
        tokens = do_stem(tokens)
    if REMOVE_STOPWORDS:
        stopwords = set(nltk.corpus.stopwords.words('english'))
        if USE_STEMMING:
            stopwords = set(do_stem(stopwords))
        tokens = [x for x in tokens if x not in stopwords]
    return tokens
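# Illustrative call (token values depend on the installed WordNet data; needs
# the NLTK 'wordnet' and 'punkt' packages, plus 'stopwords' when
# REMOVE_STOPWORDS is on):
#   get_defs('good')  ->  ['good', 'having', 'desirable', ...]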
# Return a tf-idf bag for text, weighted against the full document list;
# text and documents are pre-tokenized.
def make_bag(text, documents):
    return BagOfWords.make_tfidf(text, documents)
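# Illustrative usage (BagOfWords is a local helper module; its bag is assumed
# to map each token to a tf-idf weight):
#   docs = [get_defs(w) for w in ('good', 'bad')]
#   bag = make_bag(docs[0], docs)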
# Esuli and Sebastiani's algorithm to expand the seed sets using WordNet:
# synonyms keep a word's polarity, antonyms flip it, and any word claimed by
# both polarities is moved to the neutral set.
def expand_sets(positive, negative, neutral):
    newPositive = set(positive)
    newNegative = set(negative)
    newNeutral = set(neutral)
    for word in positive:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name()
                if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                    newPositive.add(curr)
                elif curr in newNegative:
                    newNegative.discard(curr)
                    newNeutral.add(curr)
                for antonym in lemma.antonyms():
                    ant = antonym.name()
                    if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
                        newNegative.add(ant)
                    elif ant in newPositive:
                        newPositive.discard(ant)
                        newNeutral.add(ant)
    for word in negative:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name()
                if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                    newNegative.add(curr)
                elif curr in newPositive:
                    newPositive.discard(curr)
                    newNeutral.add(curr)
                for antonym in lemma.antonyms():
                    ant = antonym.name()
                    if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
                        newPositive.add(ant)
                    elif ant in newNegative:
                        newNegative.discard(ant)
                        newNeutral.add(ant)
    return (newPositive, newNegative, newNeutral)
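# Illustrative expansion step (resulting set sizes vary with the WordNet
# version; create() applies this EXPAND_ITERATIONS times):
#   pos, neg, neu = expand_sets(POS_SEED, NEG_SEED, [])
#   # pos and neg now also hold WordNet synonyms of the seeds, antonyms of
#   # positive seeds land in neg (and vice versa), and words claimed by both
#   # polarities are parked in neu.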
# Porter-stem every token in text.
def do_stem(text):
    stemmer = nltk.stem.porter.PorterStemmer()
    return [stemmer.stem(word) for word in text]
def create(test_words, test_labels):
    # Set up the initial seed sets S_p and S_n.
    neutral = []
    positive = list(POS_SEED)
    negative = list(NEG_SEED)
    # Expand the seed sets to get S_p' and S_n'.
    for _ in range(EXPAND_ITERATIONS):
        (positive, negative, neutral) = expand_sets(positive, negative, neutral)
    # expand_sets returns sets; flatten back to lists for ordering and slicing.
    positive = list(positive)
    negative = list(negative)
    if USE_STEMMING:
        positive = list(set(do_stem(positive)))
        negative = list(set(do_stem(negative)))
    # Use the same number of positive and negative training words.
    if USE_EQUAL_TRAINING:
        length = min(len(positive), len(negative))
        positive = positive[:length]
        negative = negative[:length]
    # Train the classifier on the tokenized glosses of the expanded wordlist.
    train_defs = [get_defs(word) for word in (positive + negative)]
    # train_defs is ordered positive-then-negative, so the bags can reuse it
    # instead of re-fetching each word's glosses.
    train_bags = [make_bag(tdef, train_defs) for tdef in train_defs]
    train_labels = [1] * len(positive) + [-1] * len(negative)
    # The classifier needs vectors, not dicts: collect every word seen during
    # training, and give each vector one entry per word in that list.
    train_wordlist = set(word for tdef in train_defs for word in tdef)
    train_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in train_bags]
    if CLASSIFIER == "nb":
        classifier = MultinomialNB()
    elif CLASSIFIER == "svm":
        classifier = LinearSVC()
    elif CLASSIFIER == "me":
        classifier = LogisticRegression()
    else:
        raise ValueError("unknown CLASSIFIER: %s" % CLASSIFIER)
    classifier.fit(train_vecs, train_labels)
    # Classify the test words by their glosses, using the training vocabulary.
    test_defs = [get_defs(word) for word in test_words]
    test_bags = [make_bag(tdef, test_defs) for tdef in test_defs]
    test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]
    predicted_labels = classifier.predict(test_vecs)
    correct = sum(1 for gold, pred in zip(test_labels, predicted_labels) if gold == pred)
    print("Lexicon accuracy:", correct / len(test_labels))
    word_labels = dict(zip(test_words, predicted_labels))
    pos_words = set(w for w in test_words if word_labels[w] > 0)
    neg_words = set(w for w in test_words if word_labels[w] < 0)
    # Use the same number of positive and negative words.
    if USE_EQUAL_TEST:
        length = min(len(pos_words), len(neg_words))
        pos_words = list(pos_words)[:length]
        neg_words = list(neg_words)[:length]
    # Build the final lexicon: +1 for positive words, -1 for negative.
    lexicon = {}
    for word in pos_words:
        lexicon[word] = 1
    for word in neg_words:
        lexicon[word] = -1
    return lexicon
# Example: lexicon = create(test_words, test_labels)
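# Minimal driver sketch (hypothetical inputs: create() expects a list of
# adjectives plus their gold labels, +1 for positive and -1 for negative):
if __name__ == "__main__":
    demo_words = ['superb', 'wonderful', 'awful', 'horrid']
    demo_labels = [1, 1, -1, -1]
    print(create(demo_words, demo_labels))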