diff --git a/BagOfWords.py b/BagOfWords.py
index 1f8cd32..c224bda 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -1,23 +1,24 @@
 from __future__ import division
-import nltk
 import string
+import numpy
+import nltk

 from TFIDF import tfidf

 # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
 # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
-NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
-PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)
+NEGATION_WORDS = ["not", "n't"]
+PUNCTUATION = [".", "!", "?", ",", ";", '(', ')'] #TODO make this work with POS tags (._.)
 POSITION_TAGS = ["_1Q", "_2H", "_3Q"]
+ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
 POSITION_THRESHOLDS = [0.25, 0.75, 1]

 # ref_bag is used to calculate the total word count across all documents.
-def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=True):
+def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=False):
     bag_of_words = {}
     if use_negation:
         do_negation = False
-        words = nltk.word_tokenize(text)
     if use_pos_tags:
         tagged = nltk.pos_tag(words)
         words = [string.join(t, "_") for t in tagged]
@@ -45,7 +46,7 @@ def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fal
             else:
                 index = n_gram

-            if not (use_pos_tags and use_adj_only and (tagged[i][1] != "JJ")):
+            if not (use_pos_tags and use_adj_only and (tagged[i][1] not in ADJECTIVE_TAGS)):
                 if (not use_presence) and bag_of_words.has_key(index):
                     bag_of_words[index] += 1
                 else:
@@ -79,4 +80,14 @@ def make_tfidf(document, documents):
     factor **= 0.5
     for key in bag.keys():
         bag[key] /= factor
-    return bag
\ No newline at end of file
+    return bag
+
+def to_vector(bag, wordlist):
+    vec = []
+    for word in wordlist:
+        if bag.has_key(word):
+            vec.append(bag[word])
+        else:
+            vec.append(0)
+    return vec
+    #return numpy.array(vec).reshape(1,-1)
\ No newline at end of file
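For orientation, a minimal usage sketch of the revised module (illustrative only, assuming NLTK is available): make() now expects a pre-tokenized word list instead of raw text, and the new to_vector() lays a bag out against a fixed vocabulary so every document yields a vector of the same length. The sample sentence and the sorted vocabulary below are placeholders, not part of the change.

    import nltk
    import BagOfWords

    tokens = nltk.word_tokenize("This film is not good .")   # callers tokenize first now
    bag = BagOfWords.make(tokens, use_negation=True)          # dict mapping n-gram -> weight
    wordlist = sorted(bag.keys())                             # any fixed vocabulary ordering
    vec = BagOfWords.to_vector(bag, wordlist)                 # dense list aligned with wordlist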
diff --git a/review_svm.py b/review_svm.py
index 0d3a6bd..918b003 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -1,3 +1,4 @@
+from __future__ import division
 import os
 import random
 import string
@@ -5,16 +6,18 @@ import time
 import sys

 import nltk
-import svmutil
+from nltk.corpus import movie_reviews
+import numpy
+#import svmutil
+from sklearn.svm import SVC
+from sklearn.svm import LinearSVC

 import BagOfWords
+import XMLParser

 # Program to classify the movie review dataset using a support vector machine
 # (via LIBSVM), following Pang and Lee (2002).

-POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
-NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
-
 # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
 # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
@@ -22,8 +25,6 @@ NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
 NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
 PUNCTUATION = [".", "!", "?", ",", ";"]

-NORMAL_LENGTH = 1000
-
 # These are now command line parameters! See below...
 USE_PRESENCE = False # If true, use presence rather than frequency.
 USE_POS_TAGS = False
@@ -31,45 +32,30 @@ USE_ADJ_ONLY = False
 USE_NEGATION = True
 USE_POSITION = False
 GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
-NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
+NUM_FOLDS = 5 # For cross-validation (Pang & Lee used 3)

-MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
+MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
 EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
-KERNEL_TYPE = 0 # 0: linear, 2: radial basis (just use linear)
 NORMALIZE_BAGS = True
-USE_LIBLINEAR = False # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look
+USE_LIBLINEAR = True # If True, use sklearn's LinearSVC (liblinear); otherwise SVC with a linear kernel.
 CACHE_SIZE = 512

-def file_to_text(filename):
-    f = open(filename)
-    lines = f.readlines()
-    f.close()
-    text = string.join(lines, " ")
-    return text
-
-def generate_filenames(folder_name):
-    filenames = []
-    for (folder, x, folder_filenames) in os.walk(folder_name):
-        for filename in folder_filenames:
-            if filename.endswith(".txt"):
-                filenames.append(os.path.join(folder, filename))
-    return filenames
-
-def partition_filenames(filenames, num_partitions):
-    partitions = [[] for i in range(num_partitions)]
-    for i in range(len(filenames)):
-        partitions[i % num_partitions].append(filenames[i])
-    return partitions
+USE_AMAZON = True # Use the Amazon review set, not Pang and Lee.
+
+def make_folds(documents, num_partitions):
+    folds = [[] for i in range(num_partitions)]
+    for i in range(len(documents)):
+        folds[i % num_partitions].append(documents[i])
+    return folds

 def make_bag(text, total_word_counts):
-    return BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts,
+    return BagOfWords.make(text, ref_bag=total_word_counts,
             gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE,
             use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
             normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION,
             use_position=USE_POSITION)
-
 # Set parameters from command-line arguments.
 i = 0
 try:
@@ -129,17 +115,37 @@ except Exception:
     print "Invalid arguments"

 t0 = time.time()
-
-pos_filenames = generate_filenames(POS_FOLDER)
-neg_filenames = generate_filenames(NEG_FOLDER)
-# TEST - to test on a subset of reviews (since some operations [i.e. tagging] are slow)
-#pos_filenames = random.sample(pos_filenames, 20)
-#neg_filenames = random.sample(neg_filenames, 20)
+if USE_AMAZON:
+    # Load the mixed Amazon review dataset.
+    (ids, reviews, labels) = XMLParser.get_all_reviews()
+else:
+    # Load the Pang and Lee sentiment dataset.
+    ids = movie_reviews.fileids()
+    reviews = [list(movie_reviews.words(id)) for id in ids]
+    labels = []
+    for id in ids:
+        label = movie_reviews.categories(id)[0]
+        if label == 'pos':
+            labels.append(1)
+        elif label == 'neg':
+            labels.append(-1)
+
+positive_reviews = []
+negative_reviews = []
+for i in range(len(reviews)):
+    if labels[i] == 1:
+        positive_reviews.append(reviews[i])
+    elif labels[i] == -1:
+        negative_reviews.append(reviews[i])
+
+#TEST
+positive_reviews = random.sample(positive_reviews, 1000)
+negative_reviews = random.sample(negative_reviews, 1000)

 # Partition reviews into folds.

-pos_folds = partition_filenames(pos_filenames, NUM_FOLDS)
-neg_folds = partition_filenames(neg_filenames, NUM_FOLDS)
+pos_folds = make_folds(positive_reviews, NUM_FOLDS)
+neg_folds = make_folds(negative_reviews, NUM_FOLDS)

 # Count occurrences of every word across all documents
 # (this is important for e.g. Delta TFIDF)
@@ -148,22 +154,23 @@ total_word_counts = {}
 # Construct a bag of words (or n-grams) from each file.
 pos_fold_bags = [[] for i in range(NUM_FOLDS)]
 neg_fold_bags = [[] for i in range(NUM_FOLDS)]
-
+
 for i in range(NUM_FOLDS):
-    for filename in pos_folds[i]:
-        pos_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
+    for review in pos_folds[i]:
+        pos_fold_bags[i].append(make_bag(review, total_word_counts))

-    for filename in neg_folds[i]:
-        neg_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
+    for review in neg_folds[i]:
+        neg_fold_bags[i].append(make_bag(review, total_word_counts))

 # Remove words with less than the minimum occurrences threshold.
-for k in total_word_counts.keys():
-    if total_word_counts[k] < MIN_OCCURRENCES:
-        for fold in (neg_fold_bags + pos_fold_bags):
-            for bag in fold:
-                if bag.has_key(k):
-                    bag.pop(k)
-        total_word_counts.pop(k)
+if MIN_OCCURRENCES > 0:
+    for k in total_word_counts.keys():
+        if total_word_counts[k] < MIN_OCCURRENCES:
+            for fold in (neg_fold_bags + pos_fold_bags):
+                for bag in fold:
+                    if bag.has_key(k):
+                        bag.pop(k)
+            total_word_counts.pop(k)

 #num_unique_words = len(total_word_counts.keys())
 #print "# unique words:", num_unique_words
@@ -172,34 +179,45 @@ t1 = time.time()
 print "Constructed bags, time:", (t1-t0)

 avg_acc = 0
+wordlist = total_word_counts.keys()
 for i in range(NUM_FOLDS):
-    pos_train_filenames = []
-    neg_train_filenames = []
+    pos_train_reviews = []
+    neg_train_reviews = []
     pos_train_bags = []
     neg_train_bags = []

-    pos_test_filenames = pos_folds[i]
-    neg_test_filenames = neg_folds[i]
-
+    pos_test_reviews = pos_folds[i]
+    neg_test_reviews = neg_folds[i]
     for j in range(NUM_FOLDS):
         if j != i:
-            pos_train_filenames += pos_folds[j]
-            neg_train_filenames += neg_folds[j]
+            pos_train_reviews += pos_folds[j]
+            neg_train_reviews += neg_folds[j]
             pos_train_bags += pos_fold_bags[j]
             neg_train_bags += neg_fold_bags[j]

     train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
     train_bags = pos_train_bags + neg_train_bags

-    m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))
+    #m = svmutil.svm_train(train_labels, train_bags, "-t 0 -e %f -m %d -q" % (EPSILON, CACHE_SIZE))
+    if USE_LIBLINEAR:
+        classifier = LinearSVC()
+    else:
+        classifier = SVC(kernel="linear",tol=EPSILON)
+
+    train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags]
+    classifier.fit(train_vecs, train_labels)

     test_bags = pos_fold_bags[i] + neg_fold_bags[i]
-    test_filenames = pos_test_filenames + neg_test_filenames
-    test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)
+
+    test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
+    test_reviews = pos_test_reviews + neg_test_reviews
+    test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews)

-    (predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
+    #(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
+    predicted_labels = classifier.predict(test_vecs)
+    acc = classifier.score(test_vecs, test_labels)

-    avg_acc += acc[0]
+    avg_acc += acc

     """
     indices = random.sample(range(len(test_filenames)), 10)
@@ -214,5 +232,4 @@ t2 = time.time()
 avg_acc /= NUM_FOLDS
 print "Total accuracy:", avg_acc
 print "Classification time:", (t2-t1)
-print "Total time:", (t2-t0)
-
+print "Total time:", (t2-t0)
\ No newline at end of file
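As a sanity check on the svmutil-to-scikit-learn switch above, a self-contained sketch of the fit/predict/score pattern the new cross-validation loop relies on (toy vectors and labels, not the real bag-of-words data). score() returns plain accuracy as a float, which is why the old acc[0] indexing was dropped; LinearSVC is the liblinear path selected by USE_LIBLINEAR, and SVC(kernel="linear") is the fallback.

    from sklearn.svm import SVC, LinearSVC

    train_vecs = [[1, 0, 2], [0, 3, 0], [2, 0, 1], [0, 1, 0]]   # toy bag-of-words vectors
    train_labels = [1, -1, 1, -1]

    classifier = LinearSVC()                  # or: SVC(kernel="linear", tol=0.001)
    classifier.fit(train_vecs, train_labels)

    test_vecs = [[1, 1, 1], [0, 2, 0]]
    test_labels = [1, -1]
    print classifier.predict(test_vecs)             # array of +1/-1 predictions
    print classifier.score(test_vecs, test_labels)  # mean accuracy in [0, 1]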