From cf6576fb12fc9bfc6224d1b76af1c1c9c6e0ff2f Mon Sep 17 00:00:00 2001 From: Jack Date: Sat, 16 Apr 2016 10:40:46 -0400 Subject: [PATCH] matplotlib; eval tweaks; comparison --- GlossLexicon.py | 10 +- LexiconEval.py | 80 ++++++--- graph.py | 43 +++++ review_svm.py | 432 +++++++++++++++++++++++++----------------------- 4 files changed, 333 insertions(+), 232 deletions(-) create mode 100644 graph.py diff --git a/GlossLexicon.py b/GlossLexicon.py index 9d0f7d7..351ae1e 100644 --- a/GlossLexicon.py +++ b/GlossLexicon.py @@ -11,13 +11,11 @@ import nltk from nltk.corpus import wordnet as wn import BagOfWords -import MPQALexicon -import AniaLexicon -EXPAND_ITERATIONS = 2 +EXPAND_ITERATIONS = 3 CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy -REMOVE_STOPWORDS = False -USE_STEMMING = False +REMOVE_STOPWORDS = True +USE_STEMMING = True # sync this up with eval! USE_EXAMPLES = True USE_EQUAL_TRAINING = True @@ -90,7 +88,7 @@ def expand_sets(positive, negative, neutral): return (newPositive, newNegative, newNeutral) def do_stem(text): - global stemmer + stemmer = nltk.stem.porter.PorterStemmer() return [stemmer.stem(word) for word in text] def create(test_words, test_labels): diff --git a/LexiconEval.py b/LexiconEval.py index 52e0db8..afff8c2 100644 --- a/LexiconEval.py +++ b/LexiconEval.py @@ -7,11 +7,14 @@ from nltk.corpus import movie_reviews import MPQALexicon import AniaLexicon import GlossLexicon +import XMLParser -USE_STEMMING = False +USE_STEMMING = True # sync this up with lexicon! USE_PARSING = True -LEX_ALG = "gloss" -LEX_SOURCE = "mpqa" +LEX_ALG = "gloss" # "gloss", "conjunction", "none" +LEX_SOURCE = "mpqa" # "mpqa", "ania" +CORPUS = "movies" # "amazon", "movies" +NEG_MOD = 1.5 # Taboada suggested 1.5. # new and improved finite state machine # kinda-sorta based on Taboada 2011. @@ -36,13 +39,9 @@ def calculate_score(text, lexicon): num_neg = 0 num_halfneg = 0 for word in text: - if lexicon.has_key(word): - word_score = lexicon[word] - # EXPERIMENTAL - if word_score < 0: word_score *= 1.5 if state == 0: if lexicon.has_key(word): - score += word_score + score += lexicon[word] num_single += 1 elif word in negators: state = 1 @@ -50,7 +49,7 @@ def calculate_score(text, lexicon): state = 2 elif state == 1: if lexicon.has_key(word): - score += -1 * word_score + score += -1 * lexicon[word] num_neg += 1 state = 0 elif word in intensifiers: @@ -59,7 +58,7 @@ def calculate_score(text, lexicon): state = 0 elif state == 2: if lexicon.has_key(word): - score += 2 * word_score + score += 2 * lexicon[word] num_double += 1 state = 0 else: @@ -68,7 +67,7 @@ def calculate_score(text, lexicon): pass #TODO elif state == 4: if lexicon.has_key(word): - score += -0.5 * word_score + score += -0.5 * lexicon[word] num_halfneg += 1 state = 0 else: @@ -79,9 +78,6 @@ def calculate_score(text, lexicon): def do_stem(text): global stemmer return [stemmer.stem(word) for word in text] - -def get_label(id): - return movie_reviews.categories(fileids=[id])[0] # Used to create a lexicon instance from the words + labels directly (i.e. 
without using an algorithm) def create_lexicon(words, labels): @@ -114,14 +110,24 @@ try: else: print "Invalid lexicon" i += 2 + elif args[i] == "--corpus": + if args[i+1] == "movies": + CORPUS = "movies" + elif args[i+1] == "amazon": + CORPUS = "amazon" + i += 2 elif args[i] == "--help": print "Usage:" - print "--alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)" + print "--algorithm|alg X: Choose the algorithm to use ('gloss', 'conjunction' or 'none') (default: gloss)" print " - gloss: Use the gloss-based algorithm (Esuli & Sebastiani)" print " - conjunction: Use the conjunction-based algorithm (Hatzivassiloglou & McKeown)" - print "--lexicon X: Choose the lexicon to use ('mpqa', 'ania' or 'none')" + print " - none: Use the input lexicon as is" + print "--lexicon|lex X: Choose the lexicon to use ('mpqa', 'ania' or 'none')" print " - mpqa: Use the MPQA lexicon" print " - ania: Use the hand-labeled lexicon from the Brown corpus" + print "--corpus X: Choose the data set to test on" + print " - amazon: Use the Amazon data set" + print " - movies: Use the Pang&Lee movie data set (default)" exit() else: print "Error: Invalid argument", args[i] @@ -132,6 +138,7 @@ except Exception: print "Lexicon =", LEX_SOURCE print "Algorithm =", LEX_ALG +print "Corpus =", CORPUS # Load the test set. A few options here. if LEX_SOURCE == "mpqa": @@ -158,17 +165,45 @@ if LEX_ALG != "none": lex_acc = correct/len(lexicon.items()) print "Lexicon accuracy:", lex_acc +for key in lexicon.keys(): + if lexicon[key] < 0: lexicon[key] *= NEG_MOD + +if CORPUS == "movies": + ids = movie_reviews.fileids() + reviews = [list(movie_reviews.words(fileids=[id])) for id in ids] + labels = [] + for id in ids: + label = movie_reviews.categories(id)[0] + if label == 'pos': + labels.append(1) + elif label == 'neg': + labels.append(-1) +elif CORPUS == "amazon": + (ids, reviews, labels) = XMLParser.get_all_reviews() +else: + print "Invalid corpus!" + exit() + +""" +# It feels like there should be a more efficient way do to this. +shuffled = zip(ids,reviews,labels) +shuffled = shuffled[:20] +ids = [x[0] for x in shuffled] +reviews = [x[1] for x in shuffled] +labels = [x[2] for x in shuffled] +""" + # Iterate through all of the reviews and compute scores by taking the sum of their # component lexicon words. Includes rudimentary negation testing. 
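The net effect of the LexiconEval hunks above: the experimental in-loop boost for negative words is dropped from calculate_score() and replaced by a single pass that multiplies every negative lexicon entry by NEG_MOD (1.5, following Taboada), and the review loop below now works from generic (ids, reviews, labels) triples so either the Pang & Lee movie corpus or the Amazon set can be plugged in. A minimal standalone sketch of that scoring path (illustrative names, not the module's API; negation and intensifier handling omitted):

# Sketch only, not part of the patch: pre-scale negative lexicon entries by
# NEG_MOD once, then score a review as the signed sum of its lexicon words
# and threshold at zero.
NEG_MOD = 1.5

def scale_negatives(lexicon, neg_mod=NEG_MOD):
    for word, score in lexicon.items():
        if score < 0:
            lexicon[word] = score * neg_mod
    return lexicon

def simple_score(words, lexicon):
    return sum(lexicon.get(word, 0.0) for word in words)

lexicon = scale_negatives({"good": 1.0, "bad": -1.0, "dull": -1.0})
print simple_score(["a", "good", "but", "dull", "film"], lexicon)  # 1.0 - 1.5 = -0.5, i.e. negative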
correct = 0 positive = 0 -ids = sorted(movie_reviews.fileids()) scores = [] -for id in ids: - words = list(movie_reviews.words(fileids=[id])) +for i in range(len(reviews)): + words = reviews[i] if USE_STEMMING: words = do_stem(words) + if USE_PARSING: score = calculate_score(words, lexicon) else: @@ -182,14 +217,15 @@ for id in ids: for i in range(len(ids)): id = ids[i] score = scores[i] + label = labels[i] if score >= 0: - sent_value = "pos" + sent_value = 1 positive += 1 #print id, sent_value elif score < 0: - sent_value = "neg" + sent_value = -1 #print id, sent_value - label = get_label(id) + if sent_value == label: correct += 1 diff --git a/graph.py b/graph.py new file mode 100644 index 0000000..04102ef --- /dev/null +++ b/graph.py @@ -0,0 +1,43 @@ +import numpy +from matplotlib import pyplot + +labels = [ + "unigrams, frequency", + "unigrams, frequency, +Position", + "unigrams, presence", + "unigrams, presence, +Position", + "bigrams, frequency", + "bigrams, frequency, +Position", + "bigrams, presence", + "bigrams, presence, +Position", + "delta_tfidf" +] +labels2 = [ + "unigrams, frequency", + "unigrams, frequency, +Position", + "unigrams, presence", + "unigrams, presence, +Position", + "bigrams, frequency", + "bigrams, frequency, +Position", + "bigrams, presence", + "bigrams, presence, +Position", + "delta_tfidf" +] +tops = numpy.arange(len(labels)) +widths = [0.826002649356, 0.784479089868, 0.842490694287, 0.821997146847, 0.807497617378, 0.777000053946, 0.820491149832, 0.795509581438, 0.981992471513] +widths2 = [0.824624634419, 0.808376475678, 0.832750728912, 0.815374570779, 0.797876474366, 0.771876439875, 0.799001849413, 0.768376127015, 0.929999178955] +height = 0.3 +pyplot.barh(tops, widths, height, color="#FF0000") +pyplot.barh(tops+height, widths2, height, color="#00FF00") +pyplot.legend(["Movies", "Amazon"], loc=4) # bottom right +pyplot.yticks(tops+height, labels) +pyplot.xlim(0.5, 1.0) +pyplot.ylim(tops[0]-2*height, tops[-1]+3*height) +pyplot.show() + +""" +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513 +""" \ No newline at end of file diff --git a/review_svm.py b/review_svm.py index 649f4bc..ceae890 100644 --- a/review_svm.py +++ b/review_svm.py @@ -2,13 +2,11 @@ from __future__ import division import os import random import string -import time import sys import nltk from nltk.corpus import movie_reviews import numpy -#import svmutil from sklearn.svm import SVC from sklearn.svm import LinearSVC from TFIDF import delta_tfidf, compute_idfs @@ -43,6 +41,7 @@ NORMALIZE_BAGS = True USE_LIBLINEAR = True # This is supposedly faster for large instances USE_AMAZON = False # Use the Amazon review set, not Pang and Lee. 
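The new graph.py above only displays the Movies-vs-Amazon comparison chart interactively via pyplot.show(); nothing is written to disk. If a saved image is wanted, a savefig call can be added before show(). A sketch under the assumption of an arbitrary output filename (svm_comparison.png is not referenced anywhere in the patch):

# Sketch only, not part of the patch: render graph.py's figure headlessly and save it.
import matplotlib
matplotlib.use("Agg")          # select a non-interactive backend before importing pyplot
from matplotlib import pyplot

# ... build the grouped barh chart exactly as graph.py does ...
pyplot.savefig("svm_comparison.png", dpi=150, bbox_inches="tight")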
+USE_DELTA_TFIDF = False def make_folds(documents, ids, num_partitions): folds = [[] for i in range(num_partitions)] @@ -52,220 +51,245 @@ def make_folds(documents, ids, num_partitions): fold_ids[i % num_partitions].append(ids[i]) return (folds, fold_ids) -def make_bag(text, total_word_counts): - return BagOfWords.make(text, ref_bag=total_word_counts, - gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE, - use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, - normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION, - use_position=USE_POSITION) - -# Set parameters from command-line arguments. -i = 0 -try: - args = sys.argv[1:] - while i < len(args): - if args[i] == "--gram-length": - GRAM_LENGTH = int(args[i+1]) - i += 2 - elif args[i] == "--num-folds": - NUM_FOLDS = int(args[i+1]) - i += 2 - elif args[i] == "--presence": - USE_PRESENCE = True - i += 1 - elif args[i] == "--frequency": - USE_PRESENCE = False - i += 1 - elif args[i] == "--use-pos-tags": - USE_POS_TAGS = True - i += 1 - elif args[i] == "--use-adj-only": - USE_ADJ_ONLY = True - i += 1 - elif args[i] == "--use-negation": - USE_NEGATION = True - i += 1 - elif args[i] == "--no-negation": - USE_NEGATION = False - i += 1 - elif args[i] == "--use-position": - USE_POSITION = True - i += 1 - elif args[i] == "--threshold": - MIN_OCCURRENCES = int(args[i+1]) - i += 2 - elif args[i] == "--epsilon": - EPSILON = float(args[i+1]) - i += 2 - elif args[i] == "--use-amazon": - USE_AMAZON = True - i += 1 - elif args[i] == "--use-delta": - USE_DELTATFIDF = True - i += 1 - elif args[i] == "--help": - print "Usage:" - print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)" - print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)" - print "--presence\t\tUse word presence rather than word frequency (Default: Off)" - print "--frequency\t\tUse word frequency rather than word presence (Default: On)" - print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)" - print "--use-negation\t\tTag words appearing after a negation word (Default: Off)" - print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)" - print "--use-position\t\tTag words according to their position in the text (Default: Off)" - print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)" - print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)" - print "\t\t\t(0 < epsilon < 1; lower = more iterations)" - print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)" - print "--use-delta\t\tUse Delta TFIDF. (Default: Off)" - exit() - else: - print "Error: Invalid argument", args[i] - i += 1 -except Exception: - print "Invalid arguments" - -t0 = time.time() - -positive_ids = [] -negative_ids = [] - -if USE_AMAZON: - # Load the mixed Amazon review dataset. 
- (ids, reviews, labels) = XMLParser.get_all_reviews() - for i in range(len(ids)): +def make_bag(text, total_word_counts, **bag_params): + return BagOfWords.make(text, ref_bag=total_word_counts, **bag_params) + +def from_command_line(): + i = 0 + # Set parameters to default values + gram_length = GRAM_LENGTH + num_folds = NUM_FOLDS + use_presence = USE_PRESENCE + use_pos_tags = USE_POS_TAGS + use_negation = USE_NEGATION + use_position = USE_POSITION + min_occurrences = MIN_OCCURRENCES + use_amazon = USE_AMAZON + try: + args = sys.argv[1:] + while i < len(args): + if args[i] == "--gram-length": + gram_length = int(args[i+1]) + i += 2 + elif args[i] == "--num-folds": + num_folds = int(args[i+1]) + i += 2 + elif args[i] == "--presence": + use_presence = True + i += 1 + elif args[i] == "--frequency": + use_presence = False + i += 1 + elif args[i] == "--use-pos-tags": + use_pos_tags = True + i += 1 + elif args[i] == "--use-adj-only": + use_adj_only = True + i += 1 + elif args[i] == "--use-negation": + use_negation = True + i += 1 + elif args[i] == "--no-negation": + use_negation = False + i += 1 + elif args[i] == "--use-position": + use_position = True + i += 1 + elif args[i] == "--threshold": + min_occurrences = int(args[i+1]) + i += 2 + elif args[i] == "--use-amazon": + use_amazon = True + i += 1 + elif args[i] == "--use-delta": + use_delta = True + i += 1 + elif args[i] == "--help": + print "Usage:" + print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)" + print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)" + print "--presence\t\tUse word presence rather than word frequency (Default: Off)" + print "--frequency\t\tUse word frequency rather than word presence (Default: On)" + print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)" + print "--use-negation\t\tTag words appearing after a negation word (Default: Off)" + print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)" + print "--use-position\t\tTag words according to their position in the text (Default: Off)" + print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)" + print "\t\t\t(0 < epsilon < 1; lower = more iterations)" + print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)" + print "--use-delta\t\tUse Delta TFIDF. (Default: Off)" + exit() + else: + print "Error: Invalid argument", args[i] + i += 1 + classify_reviews(gram_length, num_folds, use_presence, use_negation, use_pos_tags, use_adj_only, min_occurrences, use_amazon, use_delta) + except Exception: + print "Invalid arguments" + +def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=USE_PRESENCE, use_negation=USE_NEGATION, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, + use_position = USE_POSITION, min_occurrences=MIN_OCCURRENCES, use_amazon=USE_AMAZON, use_delta=USE_DELTA_TFIDF): + positive_ids = [] + negative_ids = [] + + if use_amazon: + # Load the mixed Amazon review dataset. + (ids, reviews, labels) = XMLParser.get_all_reviews() + for i in range(len(ids)): + if labels[i] == 1: + positive_ids.append(ids[i]) + elif labels[i] == -1: + negative_ids.append(ids[i]) + else: + # Load the Pang and Lee sentiment dataset. 
+ ids = movie_reviews.fileids() + reviews = [list(movie_reviews.words(fileids = [id])) for id in ids] + labels = [] + for id in ids: + label = movie_reviews.categories(id)[0] + if label == 'pos': + labels.append(1) + positive_ids.append(id) + elif label == 'neg': + labels.append(-1) + negative_ids.append(id) + + positive_reviews = [] + negative_reviews = [] + + for i in range(len(reviews)): if labels[i] == 1: - positive_ids.append(ids[i]) + positive_reviews.append(reviews[i]) elif labels[i] == -1: - negative_ids.append(ids[i]) -else: - # Load the Pang and Lee sentiment dataset. - ids = movie_reviews.fileids() - reviews = [list(movie_reviews.words(fileids = [id])) for id in ids] - labels = [] - for id in ids: - label = movie_reviews.categories(id)[0] - if label == 'pos': - labels.append(1) - positive_ids.append(id) - elif label == 'neg': - labels.append(-1) - negative_ids.append(id) - -positive_reviews = [] -negative_reviews = [] - -for i in range(len(reviews)): - if labels[i] == 1: - positive_reviews.append(reviews[i]) - elif labels[i] == -1: - negative_reviews.append(reviews[i]) + negative_reviews.append(reviews[i]) -#TEST -#positive_reviews = positive_reviews[:200] -#negative_reviews = negative_reviews[:600] -#positive_reviews = random.sample(positive_reviews, 1000) -#negative_reviews = random.sample(negative_reviews, 1000) + #TEST + #positive_reviews = positive_reviews[:200] + #negative_reviews = negative_reviews[:600] + #positive_reviews = random.sample(positive_reviews, 1000) + #negative_reviews = random.sample(negative_reviews, 1000) -# Partition reviews into folds. -(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS) -(neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, NUM_FOLDS) + # Partition reviews into folds. + (pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, num_folds) + (neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, num_folds) -# Count occurrences of every word across all documents -# (this is important for e.g. Delta TFIDF) -total_word_counts = {} + # Count occurrences of every word across all documents + # (this is important for e.g. Delta TFIDF) + total_word_counts = {} -# Construct a bag of words (or n-grams) from each file. -pos_fold_bags = [[] for i in range(NUM_FOLDS)] -neg_fold_bags = [[] for i in range(NUM_FOLDS)] + # Construct a bag of words (or n-grams) from each file. 
+ pos_fold_bags = [[] for i in range(num_folds)] + neg_fold_bags = [[] for i in range(num_folds)] -pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(NUM_FOLDS)] -neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(NUM_FOLDS)] + pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(num_folds)] + neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(num_folds)] -for i in range(NUM_FOLDS): - for review in pos_folds[i]: - if USE_DELTATFIDF: - pos_idfs = pos_fold_idfs[i] - neg_idfs = neg_fold_idfs[i] - pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts)) - else: - pos_fold_bags[i].append(make_bag(review, total_word_counts)) + bag_params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_negation':use_negation, 'use_pos_tags':use_pos_tags, + 'use_adj_only':use_adj_only, 'use_position':use_position} + + for i in range(num_folds): + for review in pos_folds[i]: + if use_delta: + pos_idfs = pos_fold_idfs[i] + neg_idfs = neg_fold_idfs[i] + pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts)) + else: + pos_fold_bags[i].append(make_bag(review, total_word_counts, **bag_params)) + + for review in neg_folds[i]: + if use_delta: + pos_idfs = pos_fold_idfs[i] + neg_idfs = neg_fold_idfs[i] + neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts)) + else: + neg_fold_bags[i].append(make_bag(review, total_word_counts, **bag_params)) + + # Remove words with less than the minimum occurrences threshold. + if min_occurrences > 0: + for k in total_word_counts.keys(): + if total_word_counts[k] < min_occurrences: + for fold in (neg_fold_bags + pos_fold_bags): + for bag in fold: + if bag.has_key(k): + bag.pop(k) + total_word_counts.pop(k) + + avg_acc = 0 + + wordlist = total_word_counts.keys() + + for i in range(num_folds): + pos_train_reviews = [] + neg_train_reviews = [] + pos_train_bags = [] + neg_train_bags = [] - for review in neg_folds[i]: - if USE_DELTATFIDF: - pos_idfs = pos_fold_idfs[i] - neg_idfs = neg_fold_idfs[i] - neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts)) + pos_test_reviews = pos_folds[i] + neg_test_reviews = neg_folds[i] + pos_test_ids = pos_fold_ids[i] + neg_test_ids = neg_fold_ids[i] + for j in range(num_folds): + if j != i: + pos_train_reviews += pos_folds[j] + neg_train_reviews += neg_folds[j] + pos_train_bags += pos_fold_bags[j] + neg_train_bags += neg_fold_bags[j] + + train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags) + train_bags = pos_train_bags + neg_train_bags + + if USE_LIBLINEAR: + classifier = LinearSVC() else: - neg_fold_bags[i].append(make_bag(review, total_word_counts)) - -# Remove words with less than the minimum occurrences threshold. 
-if MIN_OCCURRENCES > 0: - for k in total_word_counts.keys(): - if total_word_counts[k] < MIN_OCCURRENCES: - for fold in (neg_fold_bags + pos_fold_bags): - for bag in fold: - if bag.has_key(k): - bag.pop(k) - total_word_counts.pop(k) + classifier = SVC(kernel="linear",tol=EPSILON) -#num_unique_words = len(total_word_counts.keys()) -#print "# unique words:", num_unique_words + train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags] + classifier.fit(train_vecs, train_labels) -t1 = time.time() -print "Constructed bags, time:", (t1-t0) -avg_acc = 0 - -wordlist = total_word_counts.keys() - -#f = open("results.txt", "w") -for i in range(NUM_FOLDS): - pos_train_reviews = [] - neg_train_reviews = [] - pos_train_bags = [] - neg_train_bags = [] - - pos_test_reviews = pos_folds[i] - neg_test_reviews = neg_folds[i] - pos_test_ids = pos_fold_ids[i] - neg_test_ids = neg_fold_ids[i] - for j in range(NUM_FOLDS): - if j != i: - pos_train_reviews += pos_folds[j] - neg_train_reviews += neg_folds[j] - pos_train_bags += pos_fold_bags[j] - neg_train_bags += neg_fold_bags[j] - - train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags) - train_bags = pos_train_bags + neg_train_bags - - if USE_LIBLINEAR: - classifier = LinearSVC() - else: - classifier = SVC(kernel="linear",tol=EPSILON) - - train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags] - classifier.fit(train_vecs, train_labels) - - test_bags = pos_fold_bags[i] + neg_fold_bags[i] + test_bags = pos_fold_bags[i] + neg_fold_bags[i] + + test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags] + test_reviews = pos_test_reviews + neg_test_reviews + test_ids = pos_test_ids + neg_test_ids + test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews) - test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags] - test_reviews = pos_test_reviews + neg_test_reviews - test_ids = pos_test_ids + neg_test_ids - test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews) - - predicted_labels = classifier.predict(test_vecs) - acc = classifier.score(test_vecs, test_labels) - for i in range(len(test_reviews)): - #f.write("%s\t%d\t%d\n" % (test_ids[i], test_labels[i], predicted_labels[i])) - print("%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])) + predicted_labels = classifier.predict(test_vecs) + acc = classifier.score(test_vecs, test_labels) + avg_acc += acc - avg_acc += acc - -#f.close() + avg_acc /= num_folds + return avg_acc -t2 = time.time() -avg_acc /= NUM_FOLDS -print "Total accuracy:", avg_acc -print "Classification time:", (t2-t1) -print "Total time:", (t2-t0) \ No newline at end of file +def run_configs(): + min_occurrences = 4 + use_negation = True + use_delta = False + use_pos_tags = False + use_adj_only = False + labels = [] + accs = [] + for use_amazon in [False, True]: + for gram_length in [1,2]: + for use_presence in [False, True]: + for (use_pos_tags, use_adj_only) in [(True, False), (True, True)]: + for use_position in [False, True]: + params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only, + 'use_position':use_position, 'use_amazon':use_amazon, 'min_occurrences':min_occurrences, 'use_delta':False} + acc = classify_reviews(**params) + label = "gram_length: %d, use_presence: %s, use_amazon: %s, use_pos_tags: %s, use_adj_only: %s, use_position: %s" % (gram_length, use_presence, use_amazon, use_pos_tags, use_adj_only, use_position) + print label, acc + labels.append(label) + 
accs.append(acc) + # Delta-TFIDF construction doesn't support all parameters (yet). + params = {'use_amazon':use_amazon, 'use_delta':True} + acc = classify_reviews(**params) + label = "delta_tfidf: True, use_amazon: %s" % use_amazon + print label, acc + labels.append(label) + accs.append(acc) + return (labels, accs) + +(labels, accs) = run_configs() +f = open('SVM_RESULTS.txt', 'w') +for (label, acc) in zip(labels, accs): + f.write("%s\t%s\n" % (label, acc)) +f.close()
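run_configs() persists each configuration's accuracy to SVM_RESULTS.txt as tab-separated label/accuracy pairs, so the hard-coded width lists in graph.py could in principle be rebuilt from that file rather than pasted in by hand. A small sketch of the parsing step (only the SVM_RESULTS.txt name comes from the patch; everything else is illustrative):

# Sketch only, not part of the patch: read the label/accuracy pairs written by
# run_configs() back in, e.g. to feed the bar widths in graph.py.
results = []
with open("SVM_RESULTS.txt") as f:
    for line in f:
        label, acc = line.rstrip("\n").split("\t")
        results.append((label, float(acc)))

for (label, acc) in results:
    print "%-70s %.3f" % (label, acc)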