From 88fcad218cdaf9314159420399d73d23854b1cf3 Mon Sep 17 00:00:00 2001 From: Jack Date: Tue, 1 Mar 2016 17:41:11 -0500 Subject: [PATCH] stuff --- review_svm.py | 98 ++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/review_svm.py b/review_svm.py index 5ffaa5b..def7a9d 100644 --- a/review_svm.py +++ b/review_svm.py @@ -15,11 +15,14 @@ NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"] PUNCTUATION = [".", "!", "?", ",", ";"] +NORMAL_LENGTH = 1000 + # TODO Make these command-line parameters. USE_PRESENCE = False # If true, use presence rather than frequency. -GRAM_LENGTH = 1 # Unigrams, bigrams, ... +GRAM_LENGTH = 3 # Unigrams, bigrams, ... NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3) -EPSILON = 0.01 # determines how long the algorithm runs +EPSILON = 0.001 # determines how long the algorithm runs (default is 0.001) +KERNEL_TYPE = 0 # 0: linear, 2: radial basis def make_bag(filename): f = open(filename) @@ -48,80 +51,79 @@ def make_bag(filename): else: bag_of_words[index] = 1 count += 1 - # Normalize the bag of words. - #for k in bag_of_words.keys(): - # bag_of_words[k] = float(bag_of_words[k])/count - + # Normalize the bag of words. For whatever reason it didn't work very well with small decimals... + for k in bag_of_words.keys(): + bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count return bag_of_words -neg_filenames = [] pos_filenames = [] +neg_filenames = [] word_table = {} next_word_index = 0 -for (folder, x, filenames) in os.walk(NEG_FOLDER): - for filename in filenames: - if filename.endswith(".txt"): - neg_filenames.append(folder + "\\" + filename) - for (folder, x, filenames) in os.walk(POS_FOLDER): for filename in filenames: if filename.endswith(".txt"): pos_filenames.append(folder + "\\" + filename) -random.shuffle(neg_filenames) -random.shuffle(pos_filenames) -neg_folds = [[] for i in range(NUM_FOLDS)] +for (folder, x, filenames) in os.walk(NEG_FOLDER): + for filename in filenames: + if filename.endswith(".txt"): + neg_filenames.append(folder + "\\" + filename) + +# Partition reviews into folds. pos_folds = [[] for i in range(NUM_FOLDS)] -for i in range(len(neg_filenames)): - neg_folds[i % NUM_FOLDS].append(neg_filenames[i]) +neg_folds = [[] for i in range(NUM_FOLDS)] for i in range(len(pos_filenames)): pos_folds[i % NUM_FOLDS].append(pos_filenames[i]) -#TRAIN - -#neg_fold_bags = [[] for i in range(NUM_FOLDS)] -#pos_fold_bags = [[] for i in range(NUM_FOLDS)] +for i in range(len(neg_filenames)): + neg_folds[i % NUM_FOLDS].append(neg_filenames[i]) +# Construct a bag of words (or n-grams) from each file. +pos_fold_bags = [[] for i in range(NUM_FOLDS)] +neg_fold_bags = [[] for i in range(NUM_FOLDS)] + for i in range(NUM_FOLDS): + for filename in pos_folds[i]: + pos_fold_bags[i].append(make_bag(filename)) + + for filename in neg_folds[i]: + neg_fold_bags[i].append(make_bag(filename)) - neg_train_bags = [] + +for i in range(NUM_FOLDS): pos_train_bags = [] + neg_train_bags = [] - neg_train_filenames = [] pos_train_filenames = [] - neg_test_filenames = neg_folds[i] + neg_train_filenames = [] + pos_test_filenames = pos_folds[i] + neg_test_filenames = neg_folds[i] + for j in range(NUM_FOLDS): if j != i: - neg_train_filenames += neg_folds[j] pos_train_filenames += pos_folds[j] + neg_train_filenames += neg_folds[j] + pos_train_bags += pos_fold_bags[j] + neg_train_bags += neg_fold_bags[j] - for filename in neg_train_filenames: - neg_train_bags.append(make_bag(filename)) - - for filename in pos_train_filenames: - pos_train_bags.append(make_bag(filename)) - - train_labels = [-1] * len(neg_train_bags) + [1] * len(pos_train_bags) - train_bags = neg_train_bags + pos_train_bags + train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags) + train_bags = pos_train_bags + neg_train_bags # TODO: Investigate LIBSVM training parameters. - m = svm_train(train_labels, train_bags, "-t 0 -e %d" % EPSILON) - - # TEST - test_bags = [] - test_filenames = neg_test_filenames + pos_test_filenames - - for filename in test_filenames: - test_bags.append(make_bag(filename)) + m = svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON)) - test_labels = [-1] * len(neg_test_filenames) + [1] * len(pos_test_filenames) - (train_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m) - #indices = random.sample(range(len(test_filenames))) - #for i in indices: - # filename = test_filenames[i] - # if filename in neg_ + test_bags = pos_fold_bags[i] + neg_fold_bags[i] + test_filenames = pos_test_filenames + neg_test_filenames + test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames) -#sorted(bag_of_words.items(), key=lambda (k,v): -v)[:100] \ No newline at end of file + (predicted_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m) + indices = random.sample(range(len(test_filenames)), 10) + filenames_labels = {} + for j in indices: + filename = test_filenames[j] + predicted_label = predicted_labels[j] + filenames_labels[filename] = predicted_labels[j] \ No newline at end of file