Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/GlossLexicon.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
178 lines (149 sloc)
5.8 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import math | |
import random | |
import string | |
from sets import Set | |
import numpy | |
from sklearn.svm import LinearSVC | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.naive_bayes import MultinomialNB | |
import nltk | |
from nltk.corpus import wordnet as wn | |
import BagOfWords | |
# Number of WordNet expansion passes applied to the seed sets in create().
EXPAND_ITERATIONS = 2
# Which sklearn model create() trains on the gloss bags.
CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
# Drop English stopwords from the tokenized glosses.
REMOVE_STOPWORDS = False
# Porter-stem gloss tokens and expanded seed words.
USE_STEMMING = False # sync this up with eval!
# Include WordNet usage examples alongside lemma names and definitions.
USE_EXAMPLES = True
# Truncate so both polarities contribute equally many training words.
USE_EQUAL_TRAINING = True
# Truncate the output lexicon so both polarities are equally sized.
USE_EQUAL_TEST = True
# Hand-picked adjective seeds for each polarity.
POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']
# returns tokenized
def get_defs(word):
    """Collect WordNet adjective-sense glosses for *word* as a token list.

    Gathers lemma names, definitions and (optionally, per USE_EXAMPLES)
    usage examples from every adjective synset of *word*, tokenizes the
    joined text, then applies the module-level stemming and stopword
    options.  Returns a (possibly empty) list of tokens.
    """
    defs = []
    for synset in wn.synsets(word, pos=wn.ADJ):
        defs += synset.lemma_names()
        defs.append(synset.definition())
        if USE_EXAMPLES:
            defs += synset.examples()
    # string.join() was removed in Python 3; " ".join() reproduces the old
    # default-separator behavior exactly.
    tokens = nltk.word_tokenize(" ".join(defs))
    if USE_STEMMING:
        tokens = do_stem(tokens)
    if REMOVE_STOPWORDS:
        stopwords = set(nltk.corpus.stopwords.words('english'))
        if USE_STEMMING:
            # Stem the stopword list too so stemmed tokens still match;
            # keep it a set for O(1) membership tests below.
            stopwords = set(do_stem(stopwords))
        tokens = [x for x in tokens if x not in stopwords]
    return tokens
# return a tfidf bag; text and documents are pre-tokenized.
def make_bag(text, documents):
    # Thin wrapper so the bag representation can be swapped in one place.
    # `documents` is the corpus used for the idf term of the tf-idf weights.
    return BagOfWords.make_tfidf(text, documents)
# Esuli and Sebastiani's algorithm to expand seed sets using WordNet
def expand_sets(positive, negative, neutral):
    """Run one WordNet expansion step over the polarity seed sets.

    Lemmas of a word's adjective synsets inherit its polarity; antonyms
    receive the opposite polarity.  A word later claimed by the opposite
    polarity is demoted to the neutral set.  Returns the expanded
    (positive, negative, neutral) sets; the inputs are not mutated.

    The positive and negative passes are exact mirrors of each other, so
    both are driven by one helper with the polarity sets swapped.
    """
    new_pos = set(positive)
    new_neg = set(negative)
    new_neu = set(neutral)

    def _absorb(word, same, opposite):
        # Fold word's WordNet neighbors into `same` (synonyms) and
        # `opposite` (antonyms); conflicts are demoted to `new_neu`.
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name()
                if curr not in same and curr not in opposite and curr not in new_neu:
                    same.add(curr)
                elif curr in opposite:
                    opposite.discard(curr)
                    new_neu.add(curr)
                for antonym in lemma.antonyms():
                    ant = antonym.name()
                    if ant not in same and ant not in opposite and ant not in new_neu:
                        opposite.add(ant)
                    elif ant in same:
                        same.discard(ant)
                        new_neu.add(ant)

    # Same order as the original: all positive words first, then negative.
    for word in positive:
        _absorb(word, new_pos, new_neg)
    for word in negative:
        _absorb(word, new_neg, new_pos)
    return (new_pos, new_neg, new_neu)
def do_stem(text):
    """Return the tokens of *text* reduced to their Porter stems."""
    porter = nltk.stem.porter.PorterStemmer()
    return list(map(porter.stem, text))
def create(test_words, test_labels):
    """Build a sentiment lexicon by classifying WordNet glosses.

    Expands the POS_SEED / NEG_SEED adjectives through WordNet
    (EXPAND_ITERATIONS passes), trains the classifier selected by
    CLASSIFIER on tf-idf bags of the expanded words' glosses, then labels
    every word in *test_words* with it.

    Prints the accuracy against *test_labels* and returns a dict mapping
    each kept test word to +1 (positive) or -1 (negative).

    Raises ValueError if CLASSIFIER is not one of "nb", "svm", "me"
    (previously this fell through to a NameError).
    """
    # Set up initial sets S_p and S_n from the seed adjectives.
    neutral = []
    positive = list(POS_SEED)
    negative = list(NEG_SEED)

    # Expand on the sets to get S_p' and S_n'.
    for _ in range(EXPAND_ITERATIONS):
        (positive, negative, neutral) = expand_sets(positive, negative, neutral)
    if USE_STEMMING:
        positive = list(set(do_stem(positive)))
        negative = list(set(do_stem(negative)))

    # Use the same number of positive and negative training words.
    if USE_EQUAL_TRAINING:
        length = min(len(positive), len(negative))
        positive = list(positive)[:length]
        negative = list(negative)[:length]

    # Train the classifier on tf-idf bags of each training word's glosses.
    train_defs = [get_defs(word) for word in (positive + negative)]
    train_bags = [make_bag(get_defs(word), train_defs)
                  for word in (positive + negative)]
    train_labels = [1] * len(positive) + [-1] * len(negative)

    # The classifier needs vectors, not dicts.  Build the vocabulary from
    # every word seen in the training glosses, then map each bag onto it.
    train_wordlist = set()
    for tdef in train_defs:
        train_wordlist.update(tdef)
    train_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in train_bags]

    if CLASSIFIER == "nb":
        classifier = MultinomialNB()
    elif CLASSIFIER == "svm":
        classifier = LinearSVC()
    elif CLASSIFIER == "me":
        classifier = LogisticRegression()
    else:
        raise ValueError("Unknown CLASSIFIER: %r" % (CLASSIFIER,))
    classifier.fit(train_vecs, train_labels)

    # Vectorize the test words against the *training* vocabulary.
    test_defs = [get_defs(word) for word in test_words]
    test_bags = [make_bag(get_defs(word), test_defs) for word in test_words]
    test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]
    predicted_labels = classifier.predict(test_vecs)

    correct = sum(1 for want, got in zip(test_labels, predicted_labels)
                  if want == got)
    # print() function form: the Python 2 print statement is a syntax
    # error under Python 3; true division is already in effect here.
    print("Lexicon accuracy:", correct / len(test_labels))

    word_labels = dict(zip(test_words, predicted_labels))
    pos_words = set(w for w in test_words if word_labels[w] > 0)
    neg_words = set(w for w in test_words if word_labels[w] < 0)

    # Use the same number of positive and negative words.
    if USE_EQUAL_TEST:
        length = min(len(pos_words), len(neg_words))
        pos_words = list(pos_words)[:length]
        neg_words = list(neg_words)[:length]

    lexicon = {}
    for word in pos_words:
        lexicon[word] = 1
    for word in neg_words:
        lexicon[word] = -1
    return lexicon
#create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels) |