"""Classify the Pang & Lee (2002) movie-review polarity dataset with an SVM.

Feature vectors are bags of n-grams (optionally POS-tagged and/or restricted
to adjectives), with the Das & Chen (2001) "NOT_" negation-tagging trick.
Evaluation is NUM_FOLDS-fold cross-validation, trained via LIBLINEAR or
LIBSVM.

NOTE(review): this file was reconstructed from a unified diff; runs of
unchanged lines that fell between hunks were not visible and have been
re-created to match the surrounding context (marked below) -- confirm them
against the original review_svm.py.
"""

import os
import random
import time

import nltk
import liblinearutil   # LIBLINEAR Python bindings (train / predict)
import svmutil         # LIBSVM Python bindings (svm_train / svm_predict)

NEG_FOLDER = os.path.join("review_polarity", "txt_sentoken", "neg")
POS_FOLDER = os.path.join("review_polarity", "txt_sentoken", "pos")

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every
# word between a negation word ('not', 'isn't', 'didn't', etc.) and the first
# punctuation mark following the negation word."
# They didn't provide a full list, so this is a best guess.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

NORMAL_LENGTH = 1000   # Target "document length" used when normalizing bags.

# TODO Make these command-line parameters.
USE_PRESENCE = False   # If true, use presence rather than frequency.
USE_POS_TAGS = False   # Append POS tags to unigrams (needs GRAM_LENGTH == 1).
USE_ADJ_ONLY = False   # Keep only adjective ("JJ") unigrams.
GRAM_LENGTH = 1        # Unigrams, bigrams, ...
NUM_FOLDS = 3          # For cross-validation (Pang & Lee used 3).
EPSILON = 0.001        # Solver termination tolerance (-e; default is 0.001).
KERNEL_TYPE = 0        # LIBSVM only. 0: linear, 2: radial basis.
NORMALIZE_BAGS = False
USE_LIBLINEAR = True   # LIBLINEAR is much faster for linear models.
NUM_SAMPLED = 10       # How many test predictions to record per fold.


def make_bag(filename):
    """Build a sparse feature bag {hash(n-gram): weight} for one review file.

    NOTE: feature indices come from hash(); they are consistent within one
    run (which is all cross-validation needs) but not across runs unless
    PYTHONHASHSEED is fixed.
    """
    # BUG FIX: the original never closed the file handle.
    with open(filename) as f:
        text = f.read()

    bag_of_words = {}
    do_negation = False

    words = nltk.word_tokenize(text)
    tagged = None
    if USE_POS_TAGS and GRAM_LENGTH == 1:
        tagged = nltk.pos_tag(words)

    count = 0
    for i in range(len(words) - GRAM_LENGTH + 1):
        # BUG FIX: the slice was words[i:i+GRAM_LENGTH+1], which silently
        # built (GRAM_LENGTH+1)-grams instead of GRAM_LENGTH-grams.
        n_gram = "_".join(words[i:i + GRAM_LENGTH])
        if GRAM_LENGTH == 1:
            # Pang and Lee didn't do negation tagging for bigrams.
            if n_gram in NEGATION_WORDS:
                do_negation = True
            elif n_gram in PUNCTUATION:
                do_negation = False
            if tagged is not None:
                # e.g. "fantastic_JJ".  BUG FIX: guarded on `tagged`, not on
                # USE_POS_TAGS alone -- the latter raised NameError whenever
                # USE_POS_TAGS was set with GRAM_LENGTH > 1.
                n_gram = "_".join(tagged[i])
            if do_negation:
                n_gram = "NOT_" + n_gram

        if tagged is not None and USE_ADJ_ONLY and tagged[i][1] != "JJ":
            continue  # Adjective-only mode: skip non-JJ tokens.

        index = hash(n_gram)
        if not USE_PRESENCE and index in bag_of_words:
            bag_of_words[index] += 1
        else:
            # Presence mode pins every feature at 1.
            bag_of_words[index] = 1
        count += 1

    # Normalize the bag of words.  (Off by default: "for whatever reason it
    # didn't work very well with small decimals".)
    # BUG FIX: guard count > 0 so an empty review can't divide by zero.
    if NORMALIZE_BAGS and count > 0:
        for k in bag_of_words:
            bag_of_words[k] = float(NORMAL_LENGTH * bag_of_words[k]) / count

    return bag_of_words


def _find_reviews(folder):
    """Return every *.txt path under `folder` (one review per file)."""
    found = []
    for (root, _dirs, filenames) in os.walk(folder):
        for filename in filenames:
            if filename.endswith(".txt"):
                found.append(os.path.join(root, filename))
    return found


pos_filenames = _find_reviews(POS_FOLDER)
neg_filenames = _find_reviews(NEG_FOLDER)

# Partition reviews into folds.
# NOTE(review): the assignment loop was elided between diff hunks; round-robin
# assignment reconstructed here -- confirm against the original file.
pos_folds = [[] for _ in range(NUM_FOLDS)]
neg_folds = [[] for _ in range(NUM_FOLDS)]
for j, filename in enumerate(pos_filenames):
    pos_folds[j % NUM_FOLDS].append(filename)
for j, filename in enumerate(neg_filenames):
    neg_folds[j % NUM_FOLDS].append(filename)

# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for _ in range(NUM_FOLDS)]
neg_fold_bags = [[] for _ in range(NUM_FOLDS)]
for i in range(NUM_FOLDS):
    for filename in pos_folds[i]:
        pos_fold_bags[i].append(make_bag(filename))
    for filename in neg_folds[i]:
        neg_fold_bags[i].append(make_bag(filename))
# BUG FIX: this message used to print *before* the construction loop ran.
print("Constructed bags.")

t1 = time.time()
for i in range(NUM_FOLDS):
    # Train on every fold except i, then test on fold i.
    pos_train_bags = []
    neg_train_bags = []
    for j in range(NUM_FOLDS):
        if j != i:
            pos_train_bags += pos_fold_bags[j]
            neg_train_bags += neg_fold_bags[j]
    train_bags = pos_train_bags + neg_train_bags
    # NOTE(review): train_labels construction was elided between diff hunks;
    # reconstructed to mirror test_labels below.
    train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)

    # TODO: Investigate LIBSVM training parameters.
    if USE_LIBLINEAR:
        m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
    else:
        m = svmutil.svm_train(train_labels, train_bags,
                              "-t %d -e %f" % (KERNEL_TYPE, EPSILON))

    pos_test_filenames = pos_folds[i]
    neg_test_filenames = neg_folds[i]
    test_bags = pos_fold_bags[i] + neg_fold_bags[i]
    test_filenames = pos_test_filenames + neg_test_filenames
    test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

    if USE_LIBLINEAR:
        # BUG FIX: the LIBLINEAR Python API exposes `predict`, not
        # `svm_predict` -- the original raised AttributeError here.
        (predicted_labels, acc, p_vals) = liblinearutil.predict(
            test_labels, test_bags, m)
    else:
        (predicted_labels, acc, p_vals) = svmutil.svm_predict(
            test_labels, test_bags, m)

    # Record predicted labels for a few random test reviews.
    # BUG FIX: random.sample raised ValueError when a fold held < NUM_SAMPLED
    # reviews.
    indices = random.sample(range(len(test_filenames)),
                            min(NUM_SAMPLED, len(test_filenames)))
    filenames_labels = {}
    for j in indices:
        filenames_labels[test_filenames[j]] = predicted_labels[j]

t2 = time.time()
print("Total time:", t2 - t1)