From 444709974b9de669f0a3a44368966d63adfcf5f6 Mon Sep 17 00:00:00 2001 From: Jack Date: Mon, 11 Apr 2016 17:07:29 -0400 Subject: [PATCH] Delta TFIDF, part 1 --- BagOfWords.py | 26 ++- GlossCountJWB.py | 432 ++++++++++++++++++++++++++++++----------------- MPQALexicon.py | 25 ++- TFIDF.py | 9 +- getAdjectives.py | 7 +- review_svm.py | 31 +++- 6 files changed, 352 insertions(+), 178 deletions(-) diff --git a/BagOfWords.py b/BagOfWords.py index c224bda..dbaea22 100644 --- a/BagOfWords.py +++ b/BagOfWords.py @@ -2,7 +2,7 @@ from __future__ import division import string import numpy import nltk -from TFIDF import tfidf +from TFIDF import tfidf, delta_tfidf # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not', # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word." @@ -81,7 +81,29 @@ def make_tfidf(document, documents): for key in bag.keys(): bag[key] /= factor return bag - + +# As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value. +# Todo: Bigrams? +def make_delta_tfidf(document, positive_set, negative_set, ref_bag): + bag = {} + factor = 0 + for term in set(document): + weight = delta_tfidf(term, document, positive_set, negative_set) + if (weight != 0): + bag[term] = weight + factor += weight**2 + factor **= 0.5 + for key in bag.keys(): + bag[key] /= factor + # Add word counts to the reference bag + for term in document: + if ref_bag != None: + if ref_bag.has_key(term): + ref_bag[term] += 1 + else: + ref_bag[term] = 1 + return bag + def to_vector(bag, wordlist): vec = [] for word in wordlist: diff --git a/GlossCountJWB.py b/GlossCountJWB.py index 2ca0964..cf9278c 100644 --- a/GlossCountJWB.py +++ b/GlossCountJWB.py @@ -1,178 +1,302 @@ +from __future__ import division import math +import random +import string +from sets import Set +import numpy +from sklearn.svm import LinearSVC +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import MultinomialNB import nltk from nltk.corpus import wordnet as wn import nltk.classify.util -from nltk.classify import NaiveBayesClassifier from nltk.corpus import movie_reviews -from sets import Set -import string -import random import BagOfWords -from sklearn import svm -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import LogisticRegression import MPQALexicon -import numpy +import AniaLexicon + +EXPAND_ITERATIONS = 2 +CLASSIFIER = "me" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy +REMOVE_STOPWORDS = False +USE_STEMMING = False +USE_EXAMPLES = True + +USE_EQUAL_TRAINING = True +USE_EQUAL_TEST = True +USE_PARSING = True + +POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'] +NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'] # returns tokenized -def get_defs(word, use_examples=True): - defs = [synset.definition() for synset in wn.synsets(word, pos=wn.ADJ)] - if use_examples: - examples = [synset.examples() for synset in wn.synsets(word, pos=wn.ADJ)] - for example in examples: defs += example - return nltk.word_tokenize(string.join(defs)) - -# text and documents are pre-tokenized +def get_defs(word): + defs = [] + for synset in wn.synsets(word, pos=wn.ADJ): + defs += synset.lemma_names() + defs.append(synset.definition()) + if USE_EXAMPLES: + defs += synset.examples() + + tokens = nltk.word_tokenize(string.join(defs)) + if USE_STEMMING: + tokens = do_stem(tokens) + if REMOVE_STOPWORDS: + 
stopwords = set(nltk.corpus.stopwords.words('english')) + if USE_STEMMING: + stopwords = do_stem(stopwords) + tokens = [x for x in tokens if x not in stopwords] + return tokens + +# return a tfidf bag; text and documents are pre-tokenized. def make_bag(text, documents): - #return BagOfWords.make(text, normalize=True, use_negation=False, use_hash=False, use_presence=False) - return BagOfWords.make_tfidf(text, documents) - -def expand_sets(positive,negative,neutral): - newPositive = set(positive) - newNegative = set(negative) - newNeutral = set(neutral) - for word in positive: - for syn in wn.synsets(word, pos=wn.ADJ): - for lemma in syn.lemmas(): - curr = lemma.name() - if curr not in newPositive and curr not in newNegative and curr not in newNeutral: - newPositive.add(curr) - elif curr in newNegative: - newNegative.discard(curr) - newNeutral.add(curr) - for antonym in lemma.antonyms(): - ant = antonym.name() - if ant not in newPositive and ant not in newNegative and ant not in newNeutral: - newNegative.add(ant) - elif ant in newPositive: - newPositive.discard(ant) - newNeutral.add(ant) - - for word in negative: - for syn in wn.synsets(word, pos=wn.ADJ): - for lemma in syn.lemmas(): - curr = lemma.name() - if curr not in newPositive and curr not in newNegative and curr not in newNeutral: - newNegative.add(curr) - elif curr in newPositive: - newPositive.discard(curr) - newNeutral.add(curr) - for antonym in lemma.antonyms(): - ant = antonym.name() - if ant not in newPositive and ant not in newNegative and ant not in newNeutral: - newPositive.add(ant) - elif ant in newNegative: - newNegative.discard(ant) - newNeutral.add(ant) - return (newPositive, newNegative, newNeutral) - -def bag_to_vec(bag, wordlist): - vec = [] - for word in wordlist: - if bag.has_key(word): - vec.append(bag[word]) - else: - vec.append(0) - return vec - -# Set up initial Sets S_p and S_n -neutral = Set([]) -positive = Set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']) -negative = Set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']) - -# Expand on Sets to get S_p' and S_n' -for num in range(1): - (positive, negative, neutral) = expand_sets(positive,negative,neutral); - -# Use the same number of positive and negative training words. -positive = random.sample(positive, min(len(positive), len(negative))) -negative = random.sample(negative, min(len(positive), len(negative))) - -# Train the classifier using the expanded wordlist. -train_wordlist = set(positive + negative) - -train_defs = [get_defs(word) for word in (positive + negative)] - -train_bags = [make_bag(get_defs(word), train_defs) for word in positive] + [make_bag(get_defs(word), train_defs) for word in negative] - -train_labels = [1 for word in positive] + [-1 for word in negative] - -# The classifier needs vectors, not dicts. So we need to convert them to vectors. -# Make a list of all the words contained in them, then make an array with entries -# corresponding to each word. - -train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags] -classifier = svm.LinearSVC() -classifier.fit(train_vecs, train_labels) - -# Load the test set. I'm only using the bag of words structure here to select the words -# with a certain word count threshold. 
-(test_words, test_labels) = MPQALexicon.load(True) + return BagOfWords.make_tfidf(text, documents) -test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False) -test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:500] -test_bags = [] +# Esuli and Sebastiani's algorithm to expand seed sets using WordNet +def expand_sets(positive, negative, neutral): + newPositive = set(positive) + newNegative = set(negative) + newNeutral = set(neutral) + for word in positive: + for syn in wn.synsets(word, pos=wn.ADJ): + for lemma in syn.lemmas(): + curr = lemma.name() + if curr not in newPositive and curr not in newNegative and curr not in newNeutral: + newPositive.add(curr) + elif curr in newNegative: + newNegative.discard(curr) + newNeutral.add(curr) + for antonym in lemma.antonyms(): + ant = antonym.name() + if ant not in newPositive and ant not in newNegative and ant not in newNeutral: + newNegative.add(ant) + elif ant in newPositive: + newPositive.discard(ant) + newNeutral.add(ant) -test_wordlist = filter(lambda x: x != '', test_wordlist) -test_bags = [make_bag(get_defs(word), train_defs) for word in test_wordlist] + for word in negative: + for syn in wn.synsets(word, pos=wn.ADJ): + for lemma in syn.lemmas(): + curr = lemma.name() + if curr not in newPositive and curr not in newNegative and curr not in newNeutral: + newNegative.add(curr) + elif curr in newPositive: + newPositive.discard(curr) + newNeutral.add(curr) + for antonym in lemma.antonyms(): + ant = antonym.name() + if ant not in newPositive and ant not in newNegative and ant not in newNeutral: + newPositive.add(ant) + elif ant in newNegative: + newNegative.discard(ant) + newNeutral.add(ant) + return (newPositive, newNegative, newNeutral) + +def get_label(id): + return movie_reviews.categories(fileids=[id])[0] + +def do_stem(text): + global stemmer + return [stemmer.stem(word) for word in text] -test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags] -predicted_labels = classifier.predict(test_vecs) -word_labels = {} +# new and improved finite state machine +# states are as follows: +# 0 - base +# 1 - negator found +# 2 - intensifier found +# 3 - un-intensifier found (unused) +# 4 - negator + intensifier found +def calculate_score(text, lexicon): + negators = ["not", "n't", "hardly", "barely"] + intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"] + if USE_STEMMING: + negators = do_stem(negators) + intensifiers = do_stem(intensifiers) + + punctuation = [".", "!", "?", ",", ";", '(', ')'] + state = 0 + score = 0 + num_double = 0 + num_single = 0 + num_neg = 0 + num_halfneg = 0 + for word in text: + if state == 0: + if lexicon.has_key(word): + score += lexicon[word] + num_single += 1 + elif word in negators: + state = 1 + elif word in intensifiers: + state = 2 + elif state == 1: + if lexicon.has_key(word): + score += -1 * lexicon[word] + num_neg += 1 + state = 0 + elif word in intensifiers: + state = 4 + else: + state = 0 + elif state == 2: + if lexicon.has_key(word): + score += 2 * lexicon[word] + num_double += 1 + state = 0 + else: + state = 0 + elif state == 3: + pass #TODO + elif state == 4: + if lexicon.has_key(word): + score += -0.5 * lexicon[word] + num_halfneg += 1 + state = 0 + else: + state = 0 + #print num_single, num_neg, num_double, num_halfneg + return score -for i in range(len(test_wordlist)): - key = test_wordlist[i] - word_labels[key] = predicted_labels[i] +def create_lexicon(words, 
labels): + lexicon = {} + for i in range(len(words)): + word = words[i] + label = labels[i] + lexicon[word] = label + return lexicon -pos_words = [w for w in test_wordlist if word_labels[w] > 0] -neg_words = [w for w in test_wordlist if word_labels[w] < 0] +def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels): + # Set up initial Sets S_p and S_n + neutral = [] + #positive = ['good'] + #negative = ['bad'] -# Use the same number of positive and negative words. -length = min(len(pos_words), len(neg_words)) -pos_words = pos_words[:length] -neg_words = neg_words[:length] -word_labels2 = {} -for word in pos_words: - word_labels2[word] = 1 + positive = [word for word in pos_seed] + negative = [word for word in neg_seed] + # Expand on Sets to get S_p' and S_n' + for num in range(EXPAND_ITERATIONS): + (positive, negative, neutral) = expand_sets(positive,negative,neutral) -for word in neg_words: - word_labels2[word] = -1 - -f = open('fuck.txt', 'w') -f.write("[POS]\n\n") -f.write(string.join(pos_words,"\n")) -f.write("\n\n[NEG]\n\n") -f.write(string.join(neg_words,"\n")) -f.close() -#exit() + if USE_STEMMING: + positive = list(set(do_stem(positive))) + negative = list(set(do_stem(negative))) + + # Use the same number of positive and negative training words. + if USE_EQUAL_TRAINING: + length = min(len(positive), len(negative)) + positive = list(positive)[:length] + negative = list(negative)[:length] + + # Train the classifier using the expanded wordlist. + train_defs = [get_defs(word) for word in (positive + negative)] + train_bags = [make_bag(get_defs(word), train_defs) for word in positive] + [make_bag(get_defs(word), train_defs) for word in negative] + + train_labels = [1 for word in positive] + [-1 for word in negative] + + # The classifier needs vectors, not dicts. So we need to convert them to vectors. + # Make a list of all the words contained in them, then make an array with entries + # corresponding to each word. + + # Vector entries correspond to each word in the training word list. + train_wordlist = [] + for tdef in train_defs: + for word in tdef: + train_wordlist.append(word) + train_wordlist = set(train_wordlist) + + train_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in train_bags] + if CLASSIFIER == "nb": + classifier = MultinomialNB() + elif CLASSIFIER == "svm": + classifier = LinearSVC() + elif CLASSIFIER == "me": + classifier = LogisticRegression() + classifier.fit(train_vecs, train_labels) + + test_defs = [get_defs(word) for word in test_words] + test_bags = [make_bag(get_defs(word), test_defs) for word in test_words] + test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags] + + predicted_labels = classifier.predict(test_vecs) + correct = 0 + for i in range(len(test_labels)): + if test_labels[i] == predicted_labels[i]: + correct += 1 + + print "Lexicon accuracy:", correct/len(test_labels) -# Iterate through all of the reviews and find sentiment + word_labels = {} + for i in range(len(test_words)): + key = test_words[i] + word_labels[key] = predicted_labels[i] + + pos_words = set([w for w in test_words if word_labels[w] > 0]) + neg_words = set([w for w in test_words if word_labels[w] < 0]) + + # Use the same number of positive and negative words. 
+ if USE_EQUAL_TEST: + length = min(len(pos_words), len(neg_words)) + pos_words = list(pos_words)[:length] + neg_words = list(neg_words)[:length] + + lexicon = {} + lex2 = {} + for word in pos_words: + lexicon[word] = 1 + + for word in neg_words: + lexicon[word] = -1 + + return lexicon + +if USE_STEMMING: + stemmer = nltk.stem.porter.PorterStemmer() + +# Load the test set. A few options here. +(test_words, test_labels) = MPQALexicon.load(True) +#(test_words, test_labels) = AniaLexicon.load() +if USE_STEMMING: + test_words = do_stem(test_words) + +lexicon = create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels) +#lexicon = create_seed_lexicon(POS_SEED, NEG_SEED) +#lexicon = create_lexicon(test_words, test_labels) + +# Iterate through all of the reviews and compute scores by taking the sum of their +# component lexicon words. Includes rudimentary negation testing. correct = 0 positive = 0 ids = sorted(movie_reviews.fileids()) scores = [] - -for review_id in ids: - words = movie_reviews.words(fileids=[review_id]) - score = 0 - for word in words: - if word_labels2.has_key(word): - score += word_labels2[word] - scores.append(score) -avg_score = float(sum(scores))/len(scores) +for id in ids: + words = list(movie_reviews.words(fileids=[id])) + if USE_STEMMING: + words = do_stem(words) + if USE_PARSING: + scores.append(calculate_score(words, lexicon)) + else: + score = 0 + x = 0 + for word in words: + if lexicon.has_key(word): + score += lexicon[word] + x += 1 + scores.append(score) + print score, x + for i in range(len(ids)): - id = ids[i] - score = scores[i] - if score >= 0:#avg_score: - sent_value = "pos" - positive += 1 - elif score < 0:#avg_score: - sent_value = "neg" - label = movie_reviews.categories(fileids=[id])[0] - if sent_value == label: - correct += 1 - -print "correct:", float(correct)/len(ids) -print "positive:", float(positive)/len(ids) -#print "avg:", avg_score \ No newline at end of file + id = ids[i] + score = scores[i] + if score >= 0: + sent_value = "pos" + positive += 1 + #print id, sent_value + elif score < 0: + sent_value = "neg" + #print id, sent_value + label = get_label(id) + if sent_value == label: + correct += 1 + +print "correct:", correct/len(ids) +print "positive:", positive/len(ids) \ No newline at end of file diff --git a/MPQALexicon.py b/MPQALexicon.py index 6eec7a6..a8c0600 100644 --- a/MPQALexicon.py +++ b/MPQALexicon.py @@ -1,4 +1,6 @@ -def load(): +from nltk.corpus import wordnet as wn + +def load(strong_only=False): filename = "subjclueslen1-HLTEMNLP05.tff" f = open(filename) lines = f.readlines() @@ -10,11 +12,16 @@ def load(): fields = [field for field in fields if "=" in field] #ugh, two lines have a random extra char in them d = dict([field.rstrip().split("=") for field in fields]) (word, label, pos, type) = d["word1"], d["priorpolarity"], d["pos1"], d["type"] - if pos == "adj":# and type == "strongsubj": - if label == "positive": - words.append(word) - labels.append("pos") - elif label == "negative": - words.append(word) - labels.append("neg") - return (words, labels) \ No newline at end of file + if word not in words: + if is_adjective(word): + if not (strong_only and (type != "strongsubj")): + if label == "positive": + words.append(word) + labels.append(1) + elif label == "negative": + words.append(word) + labels.append(-1) + return (words, labels) + +def is_adjective(word): + return (len(wn.synsets(word, wn.ADJ)) > 0) \ No newline at end of file diff --git a/TFIDF.py b/TFIDF.py index 84a9cfe..35039fc 100644 --- a/TFIDF.py +++ b/TFIDF.py 
@@ -22,4 +22,11 @@ def tfidf(term, document, documents): return 0 else: tfidf = (1 + math.log(doc_appearances,10)) * math.log((float(num_docs)/all_doc_appearances), 10) - return tfidf \ No newline at end of file + return tfidf + +# Martineau and Finn 2009 +def delta_tfidf(term, document, positive_set, negative_set): + return tfidf(term, document, positive_set) - tfidf(term, document, negative_set) + +def delta_tfidf_fast(term, document, positive_set, negative_set): + return tfidf(term, document, positive_set) - tfidf(term, document, negative_set) diff --git a/getAdjectives.py b/getAdjectives.py index 8b5423c..7106326 100644 --- a/getAdjectives.py +++ b/getAdjectives.py @@ -35,7 +35,7 @@ def genConj(training): conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") nor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") -f = open('words.txt', 'r+') +f = open('words2.txt', 'w') list1 = [] for word in sc.tagged_sents(): for w in word: @@ -45,5 +45,6 @@ counts = Counter(list1) d = dict(counts) for n in d: - if( d[n] >= 20): - f.write(n+" \n") \ No newline at end of file + if( d[n] >= 15): + f.write(n+" \n") +f.close() \ No newline at end of file diff --git a/review_svm.py b/review_svm.py index 918b003..26535ac 100644 --- a/review_svm.py +++ b/review_svm.py @@ -11,6 +11,7 @@ import numpy #import svmutil from sklearn.svm import SVC from sklearn.svm import LinearSVC +from TFIDF import delta_tfidf import BagOfWords import XMLParser @@ -26,22 +27,22 @@ NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"] PUNCTUATION = [".", "!", "?", ",", ";"] # These are now command line parameters! See below... +USE_DELTATFIDF = True # Martineau and Finn. Excludes some other parameters (e.g. frequency) USE_PRESENCE = False # If true, use presence rather than frequency. USE_POS_TAGS = False USE_ADJ_ONLY = False USE_NEGATION = True USE_POSITION = False GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range -NUM_FOLDS = 5 # For cross-validation (Pang & Lee used 3) +NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3) -MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4) +MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4) EPSILON = .001 # determines how long the algorithm runs (default is 0.001) NORMALIZE_BAGS = True -USE_LIBLINEAR = True # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look -CACHE_SIZE = 512 +USE_LIBLINEAR = True # This is supposedly faster for large instances -USE_AMAZON = True # Use the Amazon review set, not Pang and Lee. +USE_AMAZON = False # Use the Amazon review set, not Pang and Lee. def make_folds(documents, num_partitions): folds = [[] for i in range(num_partitions)] @@ -140,8 +141,8 @@ for i in range(len(reviews)): negative_reviews.append(reviews[i]) #TEST -positive_reviews = random.sample(positive_reviews, 1000) -negative_reviews = random.sample(negative_reviews, 1000) +positive_reviews = random.sample(positive_reviews, 250) +negative_reviews = random.sample(negative_reviews, 250) # Partition reviews into folds. 
pos_folds = make_folds(positive_reviews, NUM_FOLDS) @@ -157,10 +158,22 @@ neg_fold_bags = [[] for i in range(NUM_FOLDS)] for i in range(NUM_FOLDS): for review in pos_folds[i]: - pos_fold_bags[i].append(make_bag(review, total_word_counts)) + t3 = time.time() + if USE_DELTATFIDF: + pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts)) + else: + pos_fold_bags[i].append(make_bag(review, total_word_counts)) + t4 = time.time() + print "Bag time:", (t4-t3) for review in neg_folds[i]: - neg_fold_bags[i].append(make_bag(review, total_word_counts)) + t3 = time.time() + if USE_DELTATFIDF: + neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts)) + else: + neg_fold_bags[i].append(make_bag(review, total_word_counts)) + t4 = time.time() + print "Bag time:", (t4-t3) # Remove words with less than the minimum occurrences threshold. if MIN_OCCURRENCES > 0:
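
For reference, the short sketch below illustrates the Delta TF-IDF weighting this patch introduces (Martineau and Finin 2009), using the same log10-based TF-IDF formula as TFIDF.py: a term's weight is its TF-IDF computed against the positive training documents minus its TF-IDF against the negative ones, so class-discriminating terms get weights of large magnitude while terms equally common in both classes cancel toward zero. The helper names and toy data are illustrative only and not part of the patched modules; note that BagOfWords.make_delta_tfidf additionally L2-normalizes the resulting bag.

from __future__ import division
import math

def _tfidf(term, document, documents):
    # document is a tokenized text; documents is a list of tokenized texts.
    tf = document.count(term)
    df = sum(1 for doc in documents if term in doc)
    if tf == 0 or df == 0:
        return 0
    # Mirrors the formula in TFIDF.tfidf: (1 + log10(tf)) * log10(N / df)
    return (1 + math.log(tf, 10)) * math.log(len(documents) / df, 10)

def delta_tfidf_sketch(term, document, positive_set, negative_set):
    # TF-IDF against the positive class minus TF-IDF against the negative class.
    return _tfidf(term, document, positive_set) - _tfidf(term, document, negative_set)

if __name__ == "__main__":
    pos = [["good", "fun", "film"], ["good", "acting"], ["great", "plot"]]
    neg = [["bad", "boring", "film"], ["bad", "plot"], ["awful", "acting"]]
    review = ["good", "film", "bad", "plot"]
    for t in sorted(set(review)):
        # "good" comes out positive, "bad" negative, "film"/"plot" near zero.
        print t, delta_tfidf_sketch(t, review, pos, neg)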