diff --git a/review_svm.py b/review_svm.py
index 967798a..836aae9 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -5,7 +5,7 @@
 import nltk
 import svmutil
-import liblinearutil
+#import liblinearutil
 
 # Program to classify the movie review dataset using a support vector machine
 # (via LIBSVM), following Pang and Lee (2002).
@@ -22,62 +22,77 @@ NORMAL_LENGTH = 1000
 # TODO Make these command-line parameters.
-USE_PRESENCE = False    # If true, use presence rather than frequency.
-USE_POS_TAGS = False
-USE_ADJ_ONLY = False
-GRAM_LENGTH = 1         # Unigrams, bigrams, ...
-NUM_FOLDS = 3           # For cross-validation (Pang & Lee used 3)
-EPSILON = 0.001         # determines how long the algorithm runs (default is 0.001)
-KERNEL_TYPE = 0         # 0: linear, 2: radial basis
-NORMALIZE_BAGS = False
-USE_LIBLINEAR = True
-
-def make_bag(filename):
+USE_PRESENCE = False    # If true, use presence rather than frequency.
+USE_POS_TAGS = True
+USE_ADJ_ONLY = False
+GRAM_LENGTH = 2         # Unigrams, bigrams, ...
+NUM_FOLDS = 3           # For cross-validation (Pang & Lee used 3)
+EPSILON = 0.1           # determines how long the algorithm runs (default is 0.001)
+KERNEL_TYPE = 0         # 0: linear, 2: radial basis
+NORMALIZE_BAGS = False
+USE_LIBLINEAR = False
+CACHE_SIZE = 512        # LIBSVM kernel cache size, in MB
+MIN_OCCURRENCES = 10    # To be included, a word must show up this many times across all documents
+
+def file_to_text(filename):
     f = open(filename)
     lines = f.readlines()
     f.close()
     text = string.join(lines, " ")
+    return text
+
+def make_bag(text, ref_bag):
     bag_of_words = {}
     do_negation = False
 
-    #words = text.split(" ")
     words = nltk.word_tokenize(text)
-    if USE_POS_TAGS and GRAM_LENGTH==1:
+    if USE_POS_TAGS:  # and GRAM_LENGTH==1:
+        t5 = time.time()
         tagged = nltk.pos_tag(words)
+        # Collapse each (word, tag) pair into a single "word_TAG" token.
+        tagged = [string.join(t, "_") for t in tagged]
+        words = tagged
+        t6 = time.time()
+        print "Tag time (%d words): %f" % (len(words), (t6-t5))
 
     count = 0
     for i in range(len(words) - GRAM_LENGTH + 1):
-        n_gram = string.join(words[i:i+GRAM_LENGTH+1], "_")
+        n_gram = string.join(words[i:i+GRAM_LENGTH], "_")
         if (GRAM_LENGTH == 1): # Pang and Lee didn't do negation tagging for bigrams.
             if n_gram in NEGATION_WORDS:
                 do_negation = True
             elif n_gram in PUNCTUATION:
                 do_negation = False
-            if USE_POS_TAGS:
-                n_gram = string.join(tagged[i], "_")
             if do_negation:
                 n_gram = "NOT_" + n_gram
+        # LIBSVM won't use strings as keys, so hash to convert to a number.
         index = hash(n_gram)
         if not (USE_POS_TAGS and USE_ADJ_ONLY and (tagged[i][1] != "JJ")):
+        #if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
            if (not USE_PRESENCE) and bag_of_words.has_key(index):
                bag_of_words[index] += 1
-                #print n_gram + " => " + str(bag_of_words[index])
                count += 1
+                #print n_gram, "=>", bag_of_words[index]
            else:
                bag_of_words[index] = 1
                count += 1
-                #print n_gram + " => " + str(bag_of_words[index])
+                #print n_gram, "=>", bag_of_words[index]
+
+        # Add it to the reference bag (corpus-wide occurrence counts).
+        if ref_bag.has_key(index):
+            ref_bag[index] += 1
+        else:
+            ref_bag[index] = 1
 
     # Normalize the bag of words. For whatever reason it didn't work very well with small decimals...
     if NORMALIZE_BAGS:
         for k in bag_of_words.keys():
             bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
     return bag_of_words
+
+t0 = time.time()
 
 pos_filenames = []
 neg_filenames = []
-word_table = {}
 
 next_word_index = 0
 
 for (folder, x, filenames) in os.walk(POS_FOLDER):
@@ -89,6 +104,10 @@ def make_bag(filename):
     for filename in filenames:
         if filename.endswith(".txt"):
             neg_filenames.append(os.path.join(folder, filename))
+
+# TEST
+#pos_filenames = random.sample(pos_filenames, 20)
+#neg_filenames = random.sample(neg_filenames, 20)
 
 # Partition reviews into folds.
 pos_folds = [[] for i in range(NUM_FOLDS)]
@@ -100,27 +119,51 @@ def make_bag(filename):
 for i in range(len(neg_filenames)):
     neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
 
+# Count occurrences of every word across all documents
+# (this is important for e.g. Delta TFIDF).
+word_table = {}
+
 # Construct a bag of words (or n-grams) from each file.
 pos_fold_bags = [[] for i in range(NUM_FOLDS)]
 neg_fold_bags = [[] for i in range(NUM_FOLDS)]
-
-print "Constructed bags."
 
 for i in range(NUM_FOLDS):
     for filename in pos_folds[i]:
-        pos_fold_bags[i].append(make_bag(filename))
+        t3 = time.time()
+        pos_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
+        t4 = time.time()
+        print "Bag time:", (t4-t3)
 
     for filename in neg_folds[i]:
-        neg_fold_bags[i].append(make_bag(filename))
+        t3 = time.time()
+        neg_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
+        t4 = time.time()
+        print "Bag time:", (t4-t3)
+
+# Remove words with less than the minimum occurrences threshold.
+for k in word_table.keys():
+    if word_table[k] < MIN_OCCURRENCES:
+        # Each fold holds a list of bags, so flatten the folds first.
+        for bag in sum(neg_fold_bags + pos_fold_bags, []):
+            if bag.has_key(k):
+                bag.pop(k)
+
+#word_table = make_bag(all_text, use_presence=False)
+for k in word_table.keys():
+    if word_table[k] < MIN_OCCURRENCES:
+        word_table.pop(k)
+
+num_unique_words = len(word_table.keys())
+print "# unique words:", num_unique_words
 
 t1 = time.time()
-
-for i in range(NUM_FOLDS):
-    pos_train_bags = []
-    neg_train_bags = []
+print "Constructed bags, time:", (t1-t0)
 
+avg_acc = 0
+for i in range(NUM_FOLDS):
     pos_train_filenames = []
     neg_train_filenames = []
+    pos_train_bags = []
+    neg_train_bags = []
 
     pos_test_filenames = pos_folds[i]
     neg_test_filenames = neg_folds[i]
@@ -136,27 +179,34 @@ def make_bag(filename):
     train_bags = pos_train_bags + neg_train_bags
 
     # TODO: Investigate LIBSVM training parameters.
+    # TODO: Why does LIBLINEAR break my computer?
     if USE_LIBLINEAR:
-        m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
+        pass  #m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
     else:
-        m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON))
-
+        m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))
 
     test_bags = pos_fold_bags[i] + neg_fold_bags[i]
     test_filenames = pos_test_filenames + neg_test_filenames
     test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)
 
     if USE_LIBLINEAR:
-        (predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
+        pass  #(predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
     else:
         (predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
+
+    avg_acc += acc[0]
 
+    """
     indices = random.sample(range(len(test_filenames)), 10)
     filenames_labels = {}
     for j in indices:
         filename = test_filenames[j]
         predicted_label = predicted_labels[j]
         filenames_labels[filename] = predicted_labels[j]
+    """
 
 t2 = time.time()
-print "Total time:", t2-t1
+avg_acc /= NUM_FOLDS
+print "Total accuracy:", avg_acc
+print "Total time:", (t2-t1)
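
For context on the data format the patch relies on: svmutil accepts training data as a list of {feature_index: value} dicts, which is why make_bag hashes each n-gram to an integer key. Below is a minimal sketch, not part of the patch; the feature_index helper and the toy documents are illustrative assumptions. One caveat the sketch guards against: Python's hash() can return negative values, while LIBSVM expects positive feature indices, so the raw hash(n_gram) keys in the patch are risky.

import svmutil

# Hypothetical helper (not in review_svm.py): map an n-gram to a positive
# LIBSVM feature index by masking off the sign bit of Python's hash().
def feature_index(n_gram):
    return (hash(n_gram) & 0x7fffffff) + 1

# Two toy "documents" in the same {index: count} shape make_bag produces;
# "fun_JJ" mimics the word_TAG tokens built when USE_POS_TAGS is on.
train_bags = [{feature_index("good"): 2, feature_index("fun_JJ"): 1},
              {feature_index("bad"): 3}]
train_labels = [1, -1]

# Same option string shape the script builds: linear kernel (-t 0),
# stopping epsilon, kernel cache in MB, quiet mode.
m = svmutil.svm_train(train_labels, train_bags, "-t 0 -e 0.001 -m 512 -q")
(predicted_labels, acc, p_vals) = svmutil.svm_predict(train_labels, train_bags, m)
print "Accuracy:", acc[0]  # acc is (accuracy, MSE, SCC), hence acc[0] in the patch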