diff --git a/BagOfWords.py b/BagOfWords.py
new file mode 100644
index 0000000..96f700c
--- /dev/null
+++ b/BagOfWords.py
@@ -0,0 +1,54 @@
+import nltk
+import string
+
+# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
+# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
+# They didn't provide a full list.
+NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
+PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)
+
+
+def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_only=False, gram_length=1, normalize_bags=True):
+    bag_of_words = {}
+    do_negation = False
+
+    words = nltk.word_tokenize(text)
+    if use_pos_tags:# and gram_length==1:
+        tagged = nltk.pos_tag(words)
+        tagged = [string.join(t, "_") for t in tagged]
+        words = tagged
+    count = 0
+    for i in range(len(words) - gram_length + 1):
+        n_gram = string.join(words[i:i+gram_length], "_")
+        if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
+            if n_gram in NEGATION_WORDS:
+                do_negation = True
+            elif n_gram in PUNCTUATION:
+                do_negation = False
+            if do_negation:
+                n_gram = "NOT_" + n_gram
+
+        # LIBSVM won't use strings as keys, so hash to convert to a number.
+        index = hash(n_gram)
+        if not (use_pos_tags and use_adj_only and not tagged[i].endswith("_JJ")): # tagged[i] is "word_TAG" after the join above
+            #if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
+            if (not use_presence) and bag_of_words.has_key(index):
+                bag_of_words[index] += 1
+                count += 1
+            else:
+                bag_of_words[index] = 1
+                count += 1
+
+            # Add it to the reference bag
+            if ref_bag != None:
+                if ref_bag.has_key(index):
+                    ref_bag[index] += 1
+                else:
+                    ref_bag[index] = 1
+
+    # TODO do this correctly
+
+    #if normalize_bags:
+    #    for k in bag_of_words.keys():
+    #        bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
+    return bag_of_words
\ No newline at end of file
diff --git a/review_svm.py b/review_svm.py
index 632c01c..6b5dd09 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -2,37 +2,43 @@ import os
 import random
 import string
 import time
+import sys
 
 import nltk
 import svmutil
-#import liblinearutil
+
+import BagOfWords
 
 # Program to classify the movie review dataset using a support vector machine
 # (via LIBSVM), following Pang and Lee (2002).
 
-NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
-POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
+POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
+NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
 
 # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
 # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
+# TODO make this a parameter
 NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
 PUNCTUATION = [".", "!", "?", ",", ";"]
 
 NORMAL_LENGTH = 1000
 
-# TODO Make these command-line parameters.
+# These are now command-line parameters! See below...
 USE_PRESENCE = False # If true, use presence rather than frequency.
-USE_POS_TAGS = True
+USE_POS_TAGS = False
 USE_ADJ_ONLY = False
-GRAM_LENGTH = 2 # Unigrams, bigrams, ...
+USE_NEGATION = True
+GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
 NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
-EPSILON = .1 # determines how long the algorithm runs (default is 0.001)
-KERNEL_TYPE = 0 # 0: linear, 2: radial basis
-NORMALIZE_BAGS = False
-USE_LIBLINEAR = False
+
+MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
+EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
+
+KERNEL_TYPE = 0 # 0: linear, 2: radial basis (just use linear)
+NORMALIZE_BAGS = True
+USE_LIBLINEAR = False # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look
 
 CACHE_SIZE = 512
-MIN_OCCURRENCES = 10 # To be included, the word must show up this many times across all documents
 
 def file_to_text(filename):
     f = open(filename)
@@ -40,88 +46,91 @@ def file_to_text(filename):
     f.close()
     text = string.join(lines, " ")
     return text
+
+def generate_filenames(folder_name):
+    filenames = []
+    for (folder, x, folder_filenames) in os.walk(folder_name):
+        for filename in folder_filenames:
+            if filename.endswith(".txt"):
+                filenames.append(os.path.join(folder, filename))
+    return filenames
+
+def partition_filenames(filenames, num_partitions):
+    partitions = [[] for i in range(num_partitions)]
+    for i in range(len(filenames)):
+        partitions[i % num_partitions].append(filenames[i])
+    return partitions
 
-def make_bag(text, ref_bag):
-    bag_of_words = {}
-    do_negation = False
-    words = nltk.word_tokenize(text)
-    if USE_POS_TAGS:# and GRAM_LENGTH==1:
-        t5 = time.time()
-        tagged = nltk.pos_tag(words)
-        tagged = [string.join(t, "_") for t in tagged]
-        words = tagged
-        t6 = time.time()
-        print "Tag time (%d words): %f" % (len(words), (t6-t5))
-    count = 0
-    for i in range(len(words) - GRAM_LENGTH + 1):
-        n_gram = string.join(words[i:i+GRAM_LENGTH], "_")
-        if (GRAM_LENGTH == 1): # Pang and Lee didn't do negation tagging for bigrams.
-            if n_gram in NEGATION_WORDS:
-                do_negation = True
-            elif n_gram in PUNCTUATION:
-                do_negation = False
-
-        if do_negation:
-            n_gram = "NOT_" + n_gram
-
-        # LIBSVM won't use strings as keys, so hash to convert to a number.
-        index = hash(n_gram)
-        if not (USE_POS_TAGS and USE_ADJ_ONLY and (tagged[i][1] != "JJ")):
-            #if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
-            if (not USE_PRESENCE) and bag_of_words.has_key(index):
-                bag_of_words[index] += 1
-                count += 1
-                print n_gram, "=>", bag_of_words[index]
-            else:
-                bag_of_words[index] = 1
-                count += 1
-                print n_gram, "=>", bag_of_words[index]
-
-        # Add it to the reference bag
-        if ref_bag.has_key(index):
-            ref_bag[index] += 1
-        else:
-            ref_bag[index] = 1
-
-    # Normalize the bag of words. For whatever reason it didn't work very well with small decimals...
-    if NORMALIZE_BAGS:
-        for k in bag_of_words.keys():
-            bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
-    return bag_of_words
+# Set parameters from command-line arguments.
+i = 0
+try:
+    args = sys.argv[1:]
+    while i < len(args):
+        if args[i] == "--gram-length":
+            GRAM_LENGTH = int(args[i+1])
+            i += 2
+        elif args[i] == "--num-folds":
+            NUM_FOLDS = int(args[i+1])
+            i += 2
+        elif args[i] == "--presence":
+            USE_PRESENCE = True
+            i += 1
+        elif args[i] == "--frequency":
+            USE_PRESENCE = False
+            i += 1
+        elif args[i] == "--use-pos-tags":
+            USE_POS_TAGS = True
+            i += 1
+        elif args[i] == "--use-adj-only":
+            USE_ADJ_ONLY = True
+            i += 1
+        elif args[i] == "--use-negation":
+            USE_NEGATION = True
+            i += 1
+        elif args[i] == "--no-negation":
+            USE_NEGATION = False
+            i += 1
+        elif args[i] == "--threshold":
+            MIN_OCCURRENCES = int(args[i+1])
+            i += 2
+        elif args[i] == "--epsilon":
+            EPSILON = float(args[i+1])
+            i += 2
+        elif args[i] == "--help":
+            print "Usage:"
+            print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
+            print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)"
+            print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
+            print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
+            print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
+            print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
+            print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
+            print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
+            print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
+            exit()
+        else:
+            print "Error: Invalid argument", args[i]
+            i += 1
+except Exception:
+    print "Invalid arguments"
 
 t0 = time.time()
 
-pos_filenames = []
-neg_filenames = []
-next_word_index = 0
-
-for (folder, x, filenames) in os.walk(POS_FOLDER):
-    for filename in filenames:
-        if filename.endswith(".txt"):
-            pos_filenames.append(os.path.join(folder, filename))
-
-for (folder, x, filenames) in os.walk(NEG_FOLDER):
-    for filename in filenames:
-        if filename.endswith(".txt"):
-            neg_filenames.append(os.path.join(folder, filename))
-
-# TEST
+pos_filenames = generate_filenames(POS_FOLDER)
+neg_filenames = generate_filenames(NEG_FOLDER)
+
+# TEST - to test on a subset of reviews (since some operations [i.e. tagging] are slow)
 #pos_filenames = random.sample(pos_filenames, 20)
 #neg_filenames = random.sample(neg_filenames, 20)
 
 # Partition reviews into folds.
-pos_folds = [[] for i in range(NUM_FOLDS)]
-neg_folds = [[] for i in range(NUM_FOLDS)]
-
-for i in range(len(pos_filenames)):
-    pos_folds[i % NUM_FOLDS].append(pos_filenames[i])
-
-for i in range(len(neg_filenames)):
-    neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
+pos_folds = partition_filenames(pos_filenames, NUM_FOLDS)
+neg_folds = partition_filenames(neg_filenames, NUM_FOLDS)
 
 # Count occurrences of every word across all documents
 # (this is important for e.g. Delta TFIDF)
-word_table = {}
+total_word_counts = {}
 
 # Construct a bag of words (or n-grams) from each file.
 pos_fold_bags = [[] for i in range(NUM_FOLDS)]
@@ -129,31 +138,25 @@ neg_fold_bags = [[] for i in range(NUM_FOLDS)]
 
 for i in range(NUM_FOLDS):
     for filename in pos_folds[i]:
-        t3 = time.time()
-        pos_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
-        t4 = time.time()
-        print "Bag time:", (t4-t3)
+        pos_fold_bags[i].append(BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
+            use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))
 
     for filename in neg_folds[i]:
-        t3 = time.time()
-        neg_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
-        t4 = time.time()
-        print "Bag time:", (t4-t3)
+        neg_fold_bags[i].append(
+            BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
+            use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))
 
-
 # Remove words with less than the minimum occurrences threshold.
-for k in word_table.keys():
-    if word_table[k] < MIN_OCCURRENCES:
-        for bag in (neg_fold_bags + pos_fold_bags):
-            if bag.has_key(k):
-                bag.pop(k)
-
-#word_table = make_bag(all_text, use_presence=False)
-for k in word_table.keys():
-    if word_table[k] < MIN_OCCURRENCES:
-        word_table.pop(k)
-num_unique_words = len(word_table.keys())
-print "# unique words:", num_unique_words
+for k in total_word_counts.keys():
+    if total_word_counts[k] < MIN_OCCURRENCES:
+        for fold in (neg_fold_bags + pos_fold_bags):
+            for bag in fold:
+                if bag.has_key(k):
+                    bag.pop(k)
+        total_word_counts.pop(k)
+
+#num_unique_words = len(total_word_counts.keys())
+#print "# unique words:", num_unique_words
 
 t1 = time.time()
 print "Constructed bags, time:", (t1-t0)
@@ -178,21 +181,13 @@ for i in range(NUM_FOLDS):
     train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
     train_bags = pos_train_bags + neg_train_bags
 
-    # TODO: Investigate LIBSVM training parameters.
-    # TODO: Why does LIBLINEAR break my computer?
-    if USE_LIBLINEAR:
-        pass#m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
-    else:
-        m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))
+    m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))
 
     test_bags = pos_fold_bags[i] + neg_fold_bags[i]
     test_filenames = pos_test_filenames + neg_test_filenames
     test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)
 
-    if USE_LIBLINEAR:
-        pass#(predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
-    else:
-        (predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
+    (predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
 
     avg_acc += acc[0]
 
@@ -208,5 +203,6 @@ for i in range(NUM_FOLDS):
 
 t2 = time.time()
 avg_acc /= NUM_FOLDS
 print "Total accuracy:", avg_acc
-print "Total time:", (t2-t1)
+print "Classification time:", (t2-t1)
+print "Total time:", (t2-t0)
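
As a quick sanity check of the new module, BagOfWords.make can be exercised on its own outside review_svm.py. The snippet below is a minimal sketch and not part of the patch: it is written in Python 2 to match the string.join / has_key idioms above, it assumes NLTK and its 'punkt' tokenizer data are installed, and the review sentence is made up purely for illustration.

    import BagOfWords

    total_word_counts = {}  # shared across documents, like total_word_counts in review_svm.py
    bag = BagOfWords.make("The plot is not clever, but the acting is great.",
                          ref_bag=total_word_counts,
                          gram_length=1,
                          use_presence=False)

    # "clever" (between "not" and the comma) is stored under hash("NOT_clever");
    # every key is an integer hash rather than the raw token, so the bags can be
    # fed to svmutil.svm_train / svm_predict the same way review_svm.py does.
    print len(bag), "features;", len(total_word_counts), "entries in the shared counts"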
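With the argument handling added to review_svm.py, a run can now be configured from the shell instead of by editing the constants at the top of the file. For example, a hypothetical invocation such as python review_svm.py --gram-length 2 --presence --threshold 4 --num-folds 3 (roughly the bigram/presence setup from Pang and Lee) overrides GRAM_LENGTH, USE_PRESENCE, MIN_OCCURRENCES and NUM_FOLDS before the folds are built. Unrecognized flags are reported by the "Invalid argument" branch and skipped rather than aborting the run, and --help prints the usage text and exits.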