From c458117d6eddcb40629d1ea093a61cc6a9715dcd Mon Sep 17 00:00:00 2001
From: Jack
Date: Thu, 31 Mar 2016 19:29:05 -0400
Subject: [PATCH] Expand seed sets with antonyms, switch GlossCount to a linear SVM, and pre-tag reviews

---
 GlossCountJWB.py | 97 ++++++++++++++++++++++++++++++------------------
 MPQALexicon.py   |  4 +-
 asdf.py          | 11 ++++++
 pos.py           | 36 ++++++++++++++++++
 review_svm.py    | 20 +++++++---
 5 files changed, 125 insertions(+), 43 deletions(-)
 create mode 100644 asdf.py
 create mode 100644 pos.py

diff --git a/GlossCountJWB.py b/GlossCountJWB.py
index 2c9d543..8fc5609 100644
--- a/GlossCountJWB.py
+++ b/GlossCountJWB.py
@@ -8,15 +8,17 @@ import string
 import random
 import BagOfWords
-from sklearn.naive_bayes import MultinomialNB
 from sklearn import svm
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import LogisticRegression
 import MPQALexicon
+import numpy
 
 def get_defs(word):
 	return string.join([synset.definition() for synset in wn.synsets(word, pos=wn.ADJ)])
 
 def make_bag(text):
-	return BagOfWords.make(text, normalize=True, use_negation=False, use_hash=False, use_presence=False)
+	return BagOfWords.make(text, normalize=True, use_negation=True, use_hash=False, use_presence=True)
 
 def expand_sets(positive,negative,neutral):
 	newPositive = set(positive)
@@ -25,42 +27,54 @@ def expand_sets(positive,negative,neutral):
 	for word in positive:
 		for syn in wn.synsets(word, pos=wn.ADJ):
 			for lemma in syn.lemmas():
-				curr = lemma.name().split('.')[0]
+				curr = lemma.name()
 				if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
 					newPositive.add(curr)
 				elif curr in newNegative:
 					newNegative.discard(curr)
 					newNeutral.add(curr)
+				for antonym in lemma.antonyms():
+					ant = antonym.name()
+					if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
+						newNegative.add(ant)
+					elif ant in newPositive:
+						newPositive.discard(ant)
+						newNeutral.add(ant)
 	for word in negative:
 		for syn in wn.synsets(word, pos=wn.ADJ):
 			for lemma in syn.lemmas():
-				curr = lemma.name().split('.')[0]
+				curr = lemma.name()
 				if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
 					newNegative.add(curr)
 				elif curr in newPositive:
 					newPositive.discard(curr)
 					newNeutral.add(curr)
+				for antonym in lemma.antonyms():
+					ant = antonym.name()
+					if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
+						newPositive.add(ant)
+					elif ant in newNegative:
+						newNegative.discard(ant)
+						newNeutral.add(ant)
 	return (newPositive, newNegative, newNeutral)
 
 def bag_to_vec(bag, wordlist):
-    vec = []
-    for word in wordlist:
-        if bag.has_key(word):
-            vec.append(bag[word])
-        else:
-            vec.append(0)
-    return vec
+	vec = []
+	for word in wordlist:
+		if bag.has_key(word):
+			vec.append(bag[word])
+		else:
+			vec.append(0)
+	return vec
 
 # Set up initial Sets S_p and S_n
-#positive = Set(['Good'])
-#negative = Set(['Bad'])
-neutral = Set([''])
+neutral = Set([])
 positive = Set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'])
 negative = Set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'])
 
 # Expand on Sets to get S_p' and S_n'
-for num in range(2):
+for num in range(3):
 	newsets = expand_sets(positive,negative,neutral);
 	positive = set(newsets[0])
 	negative = set(newsets[1])
@@ -85,18 +99,14 @@ def bag_to_vec(bag, wordlist):
 		train_wordlist.append(word)
 train_wordlist = sorted(train_wordlist)
 train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags]
-classifier = MultinomialNB()
-#classifier = svm.SVC(kernel="linear")
+#classifier = MultinomialNB()
+classifier = svm.SVC(kernel="linear")
 classifier.fit(train_vecs, train_labels)
 
-# Iterate through all of the reviews and find sentiment
-count = 0
-correct = 0
-ids = sorted(movie_reviews.fileids())
 # Load the test set
 (test_words, test_labels) = MPQALexicon.load()
 #test_words = string.join(list(movie_reviews.words(fileids=ids)))
-test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False)
+test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=True)
 test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:NUM_TEST_WORDS]
 test_bags = []
 test_wordlist2 = []
@@ -105,7 +115,7 @@ def bag_to_vec(bag, wordlist):
 	if defs != '':
 		test_wordlist2.append(word)
 		test_bags.append(make_bag(defs))
-	
+
 test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags]
 predicted_labels = classifier.predict(test_vecs)
 word_labels = {}
@@ -116,29 +126,44 @@ def bag_to_vec(bag, wordlist):
 pos_words = [w for w in test_wordlist2 if word_labels[w] > 0]
 neg_words = [w for w in test_wordlist2 if word_labels[w] < 0]
 
+# Use the same number of positive and negative words.
 length = min(len(pos_words), len(neg_words))
 pos_words = pos_words[:length]
 neg_words = neg_words[:length]
+word_labels2 = {}
+for word in pos_words:
+	word_labels2[word] = 1
+for word in neg_words:
+	word_labels2[word] = -1
 
-scores = {}
+# Iterate through all of the reviews and find sentiment
+correct = 0
+positive = 0
+ids = sorted(movie_reviews.fileids())
+scores = []
+
 for review_id in ids:
 	words = movie_reviews.words(fileids=[review_id])
 	score = 0
 	for word in words:
-		if word in pos_words:
-			score += 1
-		elif word in neg_words:
-			score -= 1
-	if (score >= 0):
+		if word_labels2.has_key(word):
+			score += word_labels2[word]
+	scores.append(score)
+
+avg_score = float(sum(scores))/len(scores)
+for i in range(len(ids)):
+	id = ids[i]
+	score = scores[i]
+	if score >= avg_score:
 		sent_value = "pos"
-		print "Positive (%s)" % review_id
-	else:
+		positive += 1
+	elif score < avg_score:
 		sent_value = "neg"
-		print "Negative (%s)" % review_id
-	if (sent_value == movie_reviews.categories(fileids=[review_id])[0]):
+	label = movie_reviews.categories(fileids=[id])[0]
+	if sent_value == label:
 		correct += 1
-	count += 1
-	scores[review_id] = score
-print "correct:", float(correct)/count
\ No newline at end of file
+print "correct:", float(correct)/len(ids)
+print "positive:", float(positive)/len(ids)
+print "avg:", avg_score
\ No newline at end of file
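The GlossCountJWB.py change above centers on the antonym handling added to expand_sets(): lemmas of the same synset inherit a seed word's polarity, antonyms of those lemmas get the opposite polarity, and a word claimed by both sets is demoted to neutral. Roughly, one expansion step behaves like the following minimal sketch (it assumes NLTK with the WordNet corpus downloaded, and omits the neutral-set bookkeeping):

    # One iteration of seed expansion in the style of expand_sets() above.
    # Synonyms (lemmas of the same adjective synset) keep the seed's polarity;
    # antonyms are pushed into the opposite set.
    from nltk.corpus import wordnet as wn

    positive = set(['good'])
    negative = set(['bad'])

    for word in list(positive):
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                positive.add(lemma.name())      # synonym: same polarity
                for ant in lemma.antonyms():
                    negative.add(ant.name())    # antonym: opposite polarity

    print sorted(positive)
    print sorted(negative)

Running several iterations (the main script now uses three instead of two) grows the seed sets before their WordNet glosses are used as SVM training data.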
diff --git a/MPQALexicon.py b/MPQALexicon.py
index d86c2e1..6eec7a6 100644
--- a/MPQALexicon.py
+++ b/MPQALexicon.py
@@ -9,8 +9,8 @@ def load():
 		fields = line.split(" ")
 		fields = [field for field in fields if "=" in field] #ugh, two lines have a random extra char in them
 		d = dict([field.rstrip().split("=") for field in fields])
-		(word, label, pos) = d["word1"], d["priorpolarity"], d["pos1"]
-		if pos == "adj":
+		(word, label, pos, type) = d["word1"], d["priorpolarity"], d["pos1"], d["type"]
+		if pos == "adj":# and type == "strongsubj":
 			if label == "positive":
 				words.append(word)
 				labels.append("pos")
diff --git a/asdf.py b/asdf.py
new file mode 100644
index 0000000..9c8868d
--- /dev/null
+++ b/asdf.py
@@ -0,0 +1,11 @@
+import nltk
+from nltk.corpus import wordnet
+from nltk.corpus import movie_reviews
+from nltk.classify import NaiveBayesClassifier
+
+word = "good"
+syns = wordnet.synsets(word)
+for syn in syns:
+	lemmas = syn.lemmas()
+	for lemma in lemmas:
+		if lemma.antonyms(): print lemma.antonyms()
\ No newline at end of file
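For context on the MPQALexicon.py change: each line of the MPQA subjectivity lexicon is a run of key=value fields, and load() now also reads the type field so the lexicon can be narrowed to strongly subjective clues (the filter itself is left commented out for now). A sketch of how one entry parses, mirroring the code above; the sample line follows the standard subjclueslen1 format, and the specific entry is illustrative:

    # Parse one MPQA lexicon line the same way MPQALexicon.load() does.
    line = "type=strongsubj len=1 word1=abhor pos1=verb stemmed1=y priorpolarity=negative"
    fields = [field for field in line.split(" ") if "=" in field]
    d = dict([field.rstrip().split("=") for field in fields])
    print d["word1"], d["pos1"], d["type"], d["priorpolarity"]
    # -> abhor verb strongsubj negative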
diff --git a/pos.py b/pos.py
new file mode 100644
index 0000000..f6311c5
--- /dev/null
+++ b/pos.py
@@ -0,0 +1,36 @@
+import nltk
+import os
+import string
+
+"""
+POS tagging is really slow compared to SVM training and prediction.
+This script runs the NLTK POS tagger over the reviews ahead of time
+and saves the tagged text in a new folder.
+"""
+
+POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
+NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
+POS_TAGGED_FOLDER = os.path.join("review_polarity","txt_sentoken","pos_tagged")
+NEG_TAGGED_FOLDER = os.path.join("review_polarity","txt_sentoken","neg_tagged")
+
+for (folder_name, tagged_folder_name) in [(POS_FOLDER, POS_TAGGED_FOLDER), (NEG_FOLDER, NEG_TAGGED_FOLDER)]:
+	filenames = []
+	for (folder, x, folder_filenames) in os.walk(folder_name):
+		for filename in folder_filenames:
+			if filename.endswith(".txt"):
+				filenames.append(os.path.join(folder, filename))
+	for filename in filenames:
+		f = open(filename)
+		lines = f.readlines()
+		f.close()
+		text = string.join(lines, " ")
+
+		tokens = nltk.word_tokenize(text)
+		tagged = nltk.pos_tag(tokens)
+		tagged = [string.join(t, "_") for t in tagged]
+		tagged = string.join(tagged, " ")
+		tagged_filename = os.path.join(tagged_folder_name, os.path.split(filename)[-1])
+		f = open(tagged_filename, "w")
+		f.write(tagged)
+		f.close()
+		print "Tagged & saved file", tagged_filename
\ No newline at end of file
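pos.py stores each review as space-separated word_TAG tokens (string.join(t, "_") above turns ('great', 'JJ') into great_JJ). A consumer can recover words and tags by splitting on the last underscore; a minimal sketch, with an illustrative path into the pos_tagged folder:

    # Read a pre-tagged review written by pos.py and keep only adjectives.
    import os
    path = os.path.join("review_polarity", "txt_sentoken", "pos_tagged", "cv000_29590.txt")
    f = open(path)
    tagged_text = f.read()
    f.close()
    pairs = [tok.rsplit("_", 1) for tok in tagged_text.split(" ") if "_" in tok]
    adjectives = [word for (word, tag) in pairs if tag.startswith("JJ")]
    print adjectives[:10]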
diff --git a/review_svm.py b/review_svm.py
index 6b5dd09..0d3a6bd 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -29,6 +29,7 @@
 USE_POS_TAGS = False
 USE_ADJ_ONLY = False
 USE_NEGATION = True
+USE_POSITION = False
 GRAM_LENGTH = 1		# Unigrams, bigrams, ... TODO use a range
 NUM_FOLDS = 3		# For cross-validation (Pang & Lee used 3)
@@ -60,6 +61,13 @@ def partition_filenames(filenames, num_partitions):
 	for i in range(len(filenames)):
 		partitions[i % num_partitions].append(filenames[i])
 	return partitions
+
+def make_bag(text, total_word_counts):
+	return BagOfWords.make(text, ref_bag=total_word_counts,
+		gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE,
+		use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
+		normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION,
+		use_position=USE_POSITION)
 
 # Set parameters from command-line arguments.
@@ -91,6 +99,9 @@ def partition_filenames(filenames, num_partitions):
 	elif args[i] == "--no-negation":
 		USE_NEGATION = False
 		i += 1
+	elif args[i] == "--use-position":
+		USE_POSITION = True
+		i += 1
 	elif args[i] == "--threshold":
 		MIN_OCCURRENCES = int(args[i+1])
 		i += 2
@@ -104,7 +115,9 @@ def partition_filenames(filenames, num_partitions):
 		print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
 		print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
 		print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
+		print "--use-negation\t\tTag words appearing after a negation word (Default: On)"
 		print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
+		print "--use-position\t\tTag words according to their position in the text (Default: Off)"
 		print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
 		print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
 		print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
@@ -138,13 +151,10 @@ def partition_filenames(filenames, num_partitions):
 
 for i in range(NUM_FOLDS):
 	for filename in pos_folds[i]:
-		pos_fold_bags[i].append(BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
-			use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))
+		pos_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
 	for filename in neg_folds[i]:
-		neg_fold_bags[i].append(
-			BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
-			use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))
+		neg_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
 
 # Remove words with less than the minimum occurrences threshold.
 for k in total_word_counts.keys():
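The new --use-position flag only threads USE_POSITION through to BagOfWords.make; the tagging itself lives in BagOfWords, which this patch does not touch. As a rough illustration of the idea (position marking in the spirit of Pang & Lee; the helper below is hypothetical, not the repo's BagOfWords code), each token can be suffixed with the quarter of the document it occurs in, so that the same word in different regions becomes a distinct feature:

    # Hypothetical sketch of position tagging: suffix each token with its quartile.
    def tag_positions(words):
        if not words:
            return []
        n = len(words)
        tagged = []
        for (i, word) in enumerate(words):
            quarter = min(4 * i / n, 3)  # integer division in Python 2
            tagged.append("%s_Q%d" % (word, quarter + 1))
        return tagged

    print tag_positions(["an", "excellent", "film", "with", "a", "weak", "ending"])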