from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import wordnet as wn
import BagOfWords
EXPAND_ITERATIONS = 2
CLASSIFIER = "svm"  # "nb" = Naive Bayes, "svm" = linear SVM, "me" = maximum entropy (logistic regression)
REMOVE_STOPWORDS = False
USE_STEMMING = False # sync this up with eval!
USE_EXAMPLES = True
USE_EQUAL_TRAINING = True
USE_EQUAL_TEST = True
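# Seven-word paradigm seed sets (the same seeds used by Turney and Littman, 2003).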
POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']
# Return the tokenized WordNet "glosses" for an adjective: lemma names,
# definitions, and (optionally) usage examples, flattened into one token list.
def get_defs(word):
    defs = []
    for synset in wn.synsets(word, pos=wn.ADJ):
        defs += synset.lemma_names()
        defs.append(synset.definition())
        if USE_EXAMPLES:
            defs += synset.examples()
    tokens = nltk.word_tokenize(' '.join(defs))
    if USE_STEMMING:
        tokens = do_stem(tokens)
    if REMOVE_STOPWORDS:
        stopwords = set(nltk.corpus.stopwords.words('english'))
        if USE_STEMMING:
            stopwords = set(do_stem(stopwords))
        tokens = [x for x in tokens if x not in stopwords]
    return tokens
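# Illustrative call (token values depend on the installed WordNet data; needs
# the NLTK 'wordnet' and 'punkt' packages, plus 'stopwords' when
# REMOVE_STOPWORDS is on):
#   get_defs('good')  ->  ['good', 'having', 'desirable', ...]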
# Return a tf-idf bag for text, weighted against the full document list;
# text and documents are pre-tokenized.
def make_bag(text, documents):
    return BagOfWords.make_tfidf(text, documents)
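# Illustrative usage (BagOfWords is a local helper module; its bag is assumed
# to map each token to a tf-idf weight):
#   docs = [get_defs(w) for w in ('good', 'bad')]
#   bag = make_bag(docs[0], docs)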
# Esuli and Sebastiani's algorithm to expand the seed sets using WordNet:
# synonyms keep a word's polarity, antonyms flip it, and any word claimed by
# both polarities is moved to the neutral set.
def expand_sets(positive, negative, neutral):
    newPositive = set(positive)
    newNegative = set(negative)
    newNeutral = set(neutral)
    for word in positive:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name()
                if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                    newPositive.add(curr)
                elif curr in newNegative:
                    newNegative.discard(curr)
                    newNeutral.add(curr)
                for antonym in lemma.antonyms():
                    ant = antonym.name()
                    if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
                        newNegative.add(ant)
                    elif ant in newPositive:
                        newPositive.discard(ant)
                        newNeutral.add(ant)
    for word in negative:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                curr = lemma.name()
                if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
                    newNegative.add(curr)
                elif curr in newPositive:
                    newPositive.discard(curr)
                    newNeutral.add(curr)
                for antonym in lemma.antonyms():
                    ant = antonym.name()
                    if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
                        newPositive.add(ant)
                    elif ant in newNegative:
                        newNegative.discard(ant)
                        newNeutral.add(ant)
    return (newPositive, newNegative, newNeutral)
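# Illustrative expansion step (resulting set sizes vary with the WordNet
# version; create() applies this EXPAND_ITERATIONS times):
#   pos, neg, neu = expand_sets(POS_SEED, NEG_SEED, [])
#   # pos and neg now also hold WordNet synonyms of the seeds, antonyms of
#   # positive seeds land in neg (and vice versa), and words claimed by both
#   # polarities are parked in neu.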
# Porter-stem every token in text.
def do_stem(text):
    stemmer = nltk.stem.porter.PorterStemmer()
    return [stemmer.stem(word) for word in text]
def create(test_words, test_labels):
    # Set up the initial seed sets S_p and S_n.
    neutral = []
    positive = list(POS_SEED)
    negative = list(NEG_SEED)
    # Expand the seed sets to get S_p' and S_n'.
    for _ in range(EXPAND_ITERATIONS):
        (positive, negative, neutral) = expand_sets(positive, negative, neutral)
    # expand_sets returns sets; flatten back to lists for ordering and slicing.
    positive = list(positive)
    negative = list(negative)
    if USE_STEMMING:
        positive = list(set(do_stem(positive)))
        negative = list(set(do_stem(negative)))
    # Use the same number of positive and negative training words.
    if USE_EQUAL_TRAINING:
        length = min(len(positive), len(negative))
        positive = positive[:length]
        negative = negative[:length]
    # Train the classifier on the tokenized glosses of the expanded wordlist.
    train_defs = [get_defs(word) for word in (positive + negative)]
    # train_defs is ordered positive-then-negative, so the bags can reuse it
    # instead of re-fetching each word's glosses.
    train_bags = [make_bag(tdef, train_defs) for tdef in train_defs]
    train_labels = [1] * len(positive) + [-1] * len(negative)
    # The classifier needs vectors, not dicts: collect every word seen during
    # training, and give each vector one entry per word in that list.
    train_wordlist = set(word for tdef in train_defs for word in tdef)
    train_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in train_bags]
    if CLASSIFIER == "nb":
        classifier = MultinomialNB()
    elif CLASSIFIER == "svm":
        classifier = LinearSVC()
    elif CLASSIFIER == "me":
        classifier = LogisticRegression()
    else:
        raise ValueError("unknown CLASSIFIER: %s" % CLASSIFIER)
    classifier.fit(train_vecs, train_labels)
    # Classify the test words by their glosses, using the training vocabulary.
    test_defs = [get_defs(word) for word in test_words]
    test_bags = [make_bag(tdef, test_defs) for tdef in test_defs]
    test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]
    predicted_labels = classifier.predict(test_vecs)
    correct = sum(1 for gold, pred in zip(test_labels, predicted_labels) if gold == pred)
    print("Lexicon accuracy:", correct / len(test_labels))
    word_labels = dict(zip(test_words, predicted_labels))
    pos_words = set(w for w in test_words if word_labels[w] > 0)
    neg_words = set(w for w in test_words if word_labels[w] < 0)
    # Use the same number of positive and negative words.
    if USE_EQUAL_TEST:
        length = min(len(pos_words), len(neg_words))
        pos_words = list(pos_words)[:length]
        neg_words = list(neg_words)[:length]
    # Build the final lexicon: +1 for positive words, -1 for negative.
    lexicon = {}
    for word in pos_words:
        lexicon[word] = 1
    for word in neg_words:
        lexicon[word] = -1
    return lexicon
# Example: lexicon = create(test_words, test_labels)
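# Minimal driver sketch (hypothetical inputs: create() expects a list of
# adjectives plus their gold labels, +1 for positive and -1 for negative):
if __name__ == "__main__":
    demo_words = ['superb', 'wonderful', 'awful', 'horrid']
    demo_labels = [1, 1, -1, -1]
    print(create(demo_words, demo_labels))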