From 0f1aeaee6457aff207d1c266de1ae63b1f97186c Mon Sep 17 00:00:00 2001
From: Jack
Date: Tue, 1 Mar 2016 13:57:22 -0500
Subject: [PATCH] Added cross-validation to SVM

---
 review_svm.py | 118 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 76 insertions(+), 42 deletions(-)

diff --git a/review_svm.py b/review_svm.py
index b9cf07b..5ffaa5b 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -6,38 +6,52 @@
 # Program to classify the movie review dataset using a support vector machine
 # (via LIBSVM), following Pang and Lee (2002).
 
-NEG_FOLDER = "txt_sentoken\\neg"
-POS_FOLDER = "txt_sentoken\\pos"
-NUM_TRAIN = 300
-USE_PRESENCE = True # If true, use presence rather than frequency.
-GRAM_LENGTH = 2
+NEG_FOLDER = "review_polarity\\txt_sentoken\\neg"
+POS_FOLDER = "review_polarity\\txt_sentoken\\pos"
 
-# "Adapting a technique of Das and Chen (2001)..."
+# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
+# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
 NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
-END_SENTENCE_PUNCTUATION = [".", "!", "?"]
+PUNCTUATION = [".", "!", "?", ",", ";"]
+
+# TODO Make these command-line parameters.
+USE_PRESENCE = False # If true, use presence rather than frequency.
+GRAM_LENGTH = 1 # Unigrams, bigrams, ...
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
+EPSILON = 0.01 # LIBSVM termination tolerance (-e); smaller values train longer.
 
 def make_bag(filename):
     f = open(filename)
     lines = f.readlines()
+    f.close()
+    text = " ".join(lines)
     bag_of_words = {}
-    for line in lines:
-        do_negation = False
-        words = line.split(" ")
-        for word in words:
-            if word in NEGATION_WORDS:
+    do_negation = False
+
+    words = text.split(" ")
+    count = 0
+    for i in range(len(words) - GRAM_LENGTH + 1):
+        n_gram = "_".join(words[i:i+GRAM_LENGTH])
+        if GRAM_LENGTH == 1: # Pang and Lee didn't do negation tagging for bigrams.
+            if n_gram in NEGATION_WORDS:
                 do_negation = True
-            if word in END_SENTENCE_PUNCTUATION:
+            if n_gram in PUNCTUATION:
                 do_negation = False
             elif do_negation:
-                word = "NOT_" + word
-            index = hash(word)
-
-            if (not USE_PRESENCE) and bag_of_words.has_key(index):
-                bag_of_words[index] += 1
-            else:
-                bag_of_words[index] = 1
-    f.close()
+                n_gram = "NOT_" + n_gram
+
+        index = hash(n_gram) & 0x7FFFFFFF # hash() can be negative; LIBSVM needs positive feature indices.
+        if (not USE_PRESENCE) and index in bag_of_words:
+            bag_of_words[index] += 1
+            count += 1
+        else:
+            bag_of_words[index] = 1
+            count += 1
+    # Normalize the bag of words.
+    #for k in bag_of_words.keys():
+    #    bag_of_words[k] = float(bag_of_words[k])/count
+
     return bag_of_words
 
 neg_filenames = []
@@ -57,37 +71,57 @@ def make_bag(filename):
 random.shuffle(neg_filenames)
 random.shuffle(pos_filenames)
 
-neg_train_filenames = neg_filenames[:NUM_TRAIN]
-neg_test_filenames = neg_filenames[NUM_TRAIN:]
-pos_train_filenames = pos_filenames[:NUM_TRAIN]
-pos_test_filenames = pos_filenames[NUM_TRAIN:]
+neg_folds = [[] for i in range(NUM_FOLDS)]
+pos_folds = [[] for i in range(NUM_FOLDS)]
+for i in range(len(neg_filenames)):
+    neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
+for i in range(len(pos_filenames)):
+    pos_folds[i % NUM_FOLDS].append(pos_filenames[i])
+
 
 #TRAIN
-neg_train_bags = []
-pos_train_bags = []
+#neg_fold_bags = [[] for i in range(NUM_FOLDS)]
+#pos_fold_bags = [[] for i in range(NUM_FOLDS)]
+
+for i in range(NUM_FOLDS):
+
+    neg_train_bags = []
+    pos_train_bags = []
 
-for filename in neg_train_filenames:
-    neg_train_bags.append(make_bag(filename))
+    neg_train_filenames = []
+    pos_train_filenames = []
+    neg_test_filenames = neg_folds[i]
+    pos_test_filenames = pos_folds[i]
+    for j in range(NUM_FOLDS):
+        if j != i:
+            neg_train_filenames += neg_folds[j]
+            pos_train_filenames += pos_folds[j]
 
-for filename in pos_train_filenames:
-    pos_train_bags.append(make_bag(filename))
+    for filename in neg_train_filenames:
+        neg_train_bags.append(make_bag(filename))
 
-train_labels = [-1] * len(neg_train_bags) + [1] * len(pos_train_bags)
-train_bags = neg_train_bags + pos_train_bags
+    for filename in pos_train_filenames:
+        pos_train_bags.append(make_bag(filename))
 
-# TODO: Investigate LIBSVM training parameters.
-m = svm_train(train_labels, train_bags, "-t 0")
+    train_labels = [-1] * len(neg_train_bags) + [1] * len(pos_train_bags)
+    train_bags = neg_train_bags + pos_train_bags
 
-# TEST
+    # TODO: Investigate LIBSVM training parameters.
+    m = svm_train(train_labels, train_bags, "-t 0 -e %f" % EPSILON)
 
-test_bags = []
-test_filenames = neg_test_filenames + pos_test_filenames
+    # TEST
+    test_bags = []
+    test_filenames = neg_test_filenames + pos_test_filenames
 
-for filename in test_filenames:
-    test_bags.append(make_bag(filename))
+    for filename in test_filenames:
+        test_bags.append(make_bag(filename))
 
-test_labels = [-1] * len(neg_test_filenames) + [1] * len(pos_test_filenames)
-svm_predict(test_labels, test_bags, m)
+    test_labels = [-1] * len(neg_test_filenames) + [1] * len(pos_test_filenames)
+    (predicted_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m)
+    #indices = random.sample(range(len(test_filenames)))
+    #for i in indices:
+    #    filename = test_filenames[i]
+    #    if filename in neg_
 #sorted(bag_of_words.items(), key=lambda (k,v): -v)[:100]
\ No newline at end of file
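
The negation handling in make_bag implements the technique quoted at the top
of the file. A minimal standalone sketch of that step, separated from the
n-gram and hashing logic (illustrative only; tag_negation is a hypothetical
helper, not part of the patch):

    NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
    PUNCTUATION = [".", "!", "?", ",", ";"]

    def tag_negation(words):
        # Prefix NOT_ to every word between a negation word and the
        # first punctuation mark that follows it.
        tagged = []
        do_negation = False
        for word in words:
            if word in PUNCTUATION:
                do_negation = False
                tagged.append(word)
            elif do_negation:
                tagged.append("NOT_" + word)
            else:
                tagged.append(word)
                if word in NEGATION_WORDS:
                    do_negation = True
        return tagged

    # tag_negation(["i", "didn't", "like", "this", "movie", "."])
    # -> ['i', "didn't", 'NOT_like', 'NOT_this', 'NOT_movie', '.']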
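
The cross-validation itself deals the shuffled filenames round-robin into
NUM_FOLDS buckets; each iteration then holds out bucket i for testing and
trains on the rest. The same logic in isolation (hypothetical helper names,
matching the round-robin split the patch uses):

    def make_folds(filenames, num_folds):
        # Deal items round-robin into num_folds buckets.
        folds = [[] for _ in range(num_folds)]
        for i, name in enumerate(filenames):
            folds[i % num_folds].append(name)
        return folds

    def fold_split(folds, i):
        # Bucket i is the test set; the other buckets form the training set.
        test = folds[i]
        train = []
        for j in range(len(folds)):
            if j != i:
                train += folds[j]
        return train, test

    # make_folds(["a", "b", "c", "d", "e", "f"], 3)
    # -> [["a", "d"], ["b", "e"], ["c", "f"]]
    # fold_split([["a", "d"], ["b", "e"], ["c", "f"]], 0)
    # -> (["b", "e", "c", "f"], ["a", "d"])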
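
On the LIBSVM side, "-t 0" selects a linear kernel and "-e" sets the
termination tolerance (a float), which is why make_bag returns a sparse
dict mapping feature index to count. A toy call showing the shapes the
bindings expect, assuming the standard libsvm Python interface (svmutil)
and made-up labels and features:

    from svmutil import svm_train, svm_predict

    train_labels = [-1, -1, 1, 1]
    train_bags = [{1: 2, 2: 1}, {1: 1, 3: 1}, {4: 2}, {4: 1, 5: 1}]

    m = svm_train(train_labels, train_bags, "-t 0 -e 0.01")

    # svm_predict returns (predicted labels, (accuracy, MSE, SCC), values).
    p_labels, p_acc, p_vals = svm_predict([1, -1], [{4: 2}, {1: 1}], m)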