Commit

Updated SVM code for Amazon reviews, Sklearn, NLTK
job13011 committed Apr 9, 2016
1 parent c15369d commit c1f8c1e
Showing 2 changed files with 101 additions and 73 deletions.
25 changes: 18 additions & 7 deletions BagOfWords.py
@@ -1,23 +1,24 @@
from __future__ import division
import nltk
import string
import numpy
import nltk
from TFIDF import tfidf

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)
NEGATION_WORDS = ["not", "n't"]
PUNCTUATION = [".", "!", "?", ",", ";", '(', ')'] #TODO make this work with POS tags (._.)
POSITION_TAGS = ["_1Q", "_2H", "_3Q"]
ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
POSITION_THRESHOLDS = [0.25, 0.75, 1]
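# Rough example of the intended negation effect (assuming the tag is appended as a suffix):
#   "this is n't funny , honestly" -> this is n't funny_NOT , honestly
# POSITION_TAGS / POSITION_THRESHOLDS presumably mark whether a token falls in the
# first quarter, middle half, or last quarter of the document.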

# ref_bag is used to calculate the total word count across all documents.
def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=True):
def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=False):
bag_of_words = {}
if use_negation:
do_negation = False

words = nltk.word_tokenize(text)
if use_pos_tags:
tagged = nltk.pos_tag(words)
words = [string.join(t, "_") for t in tagged]
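# e.g. nltk.pos_tag(["great", "movie"]) -> [("great", "JJ"), ("movie", "NN")],
# which string.join turns into ["great_JJ", "movie_NN"]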
@@ -45,7 +46,7 @@ def make(text, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fal
else:
index = n_gram

if not (use_pos_tags and use_adj_only and (tagged[i][1] != "JJ")):
if not (use_pos_tags and use_adj_only and (tagged[i][1] not in ADJECTIVE_TAGS)):
if (not use_presence) and bag_of_words.has_key(index):
bag_of_words[index] += 1
else:
@@ -79,4 +80,14 @@ def make_tfidf(document, documents):
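# (Tail of make_tfidf below: 'factor' appears to hold the sum of squared weights,
# so dividing each entry by factor ** 0.5 L2-normalizes the bag.)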
factor **= 0.5
for key in bag.keys():
bag[key] /= factor
return bag
return bag

def to_vector(bag, wordlist):
vec = []
for word in wordlist:
if bag.has_key(word):
vec.append(bag[word])
else:
vec.append(0)
return vec
#return numpy.array(vec).reshape(1,-1)
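
A minimal, hypothetical usage sketch of how the new helpers fit together (assuming make() returns a {feature: weight} dict):

import BagOfWords

total_counts = {}                                   # shared across all documents
bag = BagOfWords.make("not a great movie .".split(),
                      ref_bag=total_counts, use_negation=True)
wordlist = total_counts.keys()                      # fixed feature ordering
vec = BagOfWords.to_vector(bag, wordlist)           # dense list, ready for an SVM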
149 changes: 83 additions & 66 deletions review_svm.py
@@ -1,75 +1,61 @@
from __future__ import division
import os
import random
import string
import time
import sys

import nltk
import svmutil
from nltk.corpus import movie_reviews
import numpy
#import svmutil
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

import BagOfWords
import XMLParser

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).

POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
# TODO make this a parameter
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

NORMAL_LENGTH = 1000

# These are now command line parameters! See below...
USE_PRESENCE = False # If true, use presence rather than frequency.
USE_POS_TAGS = False
USE_ADJ_ONLY = False
USE_NEGATION = True
USE_POSITION = False
GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
NUM_FOLDS = 5 # For cross-validation (Pang & Lee used 3)

MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON = .001 # determines how long the algorithm runs (default is 0.001)

KERNEL_TYPE = 0 # 0: linear, 2: radial basis (just use linear)
NORMALIZE_BAGS = True
USE_LIBLINEAR = False # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look
USE_LIBLINEAR = True # With sklearn this now selects LinearSVC (liblinear) rather than SVC's libsvm solver.
CACHE_SIZE = 512

def file_to_text(filename):
f = open(filename)
lines = f.readlines()
f.close()
text = string.join(lines, " ")
return text

def generate_filenames(folder_name):
filenames = []
for (folder, x, folder_filenames) in os.walk(folder_name):
for filename in folder_filenames:
if filename.endswith(".txt"):
filenames.append(os.path.join(folder, filename))
return filenames

def partition_filenames(filenames, num_partitions):
partitions = [[] for i in range(num_partitions)]
for i in range(len(filenames)):
partitions[i % num_partitions].append(filenames[i])
return partitions
USE_AMAZON = True # Use the Amazon review set, not Pang and Lee.

def make_folds(documents, num_partitions):
folds = [[] for i in range(num_partitions)]
for i in range(len(documents)):
folds[i % num_partitions].append(documents[i])
return folds
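# e.g. 7 documents with num_partitions = 3 -> [[d0, d3, d6], [d1, d4], [d2, d5]]
# (documents are dealt out round-robin)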

def make_bag(text, total_word_counts):
return BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts,
return BagOfWords.make(text, ref_bag=total_word_counts,
gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE,
use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION,
use_position=USE_POSITION)


# Set parameters from command-line arguments.
i = 0
try:
@@ -129,17 +115,37 @@ except Exception:
print "Invalid arguments"

t0 = time.time()

pos_filenames = generate_filenames(POS_FOLDER)
neg_filenames = generate_filenames(NEG_FOLDER)

# TEST - to test on a subset of reviews (since some operations [i.e. tagging] are slow)
#pos_filenames = random.sample(pos_filenames, 20)
#neg_filenames = random.sample(neg_filenames, 20)
if USE_AMAZON:
# Load the mixed Amazon review dataset.
(ids, reviews, labels) = XMLParser.get_all_reviews()
else:
# Load the Pang and Lee sentiment dataset.
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(id)) for id in ids]
labels = []
for id in ids:
label = movie_reviews.categories(id)[0]
if label == 'pos':
labels.append(1)
elif label == 'neg':
labels.append(-1)
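# (movie_reviews.categories(id) returns ['pos'] or ['neg'] for each file id in
# NLTK's movie review corpus, so labels line up one-to-one with ids.)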

positive_reviews = []
negative_reviews = []
for i in range(len(reviews)):
if labels[i] == 1:
positive_reviews.append(reviews[i])
elif labels[i] == -1:
negative_reviews.append(reviews[i])

#TEST
positive_reviews = random.sample(positive_reviews, 1000)
negative_reviews = random.sample(negative_reviews, 1000)

# Partition reviews into folds.
pos_folds = partition_filenames(pos_filenames, NUM_FOLDS)
neg_folds = partition_filenames(neg_filenames, NUM_FOLDS)
pos_folds = make_folds(positive_reviews, NUM_FOLDS)
neg_folds = make_folds(negative_reviews, NUM_FOLDS)

# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF)
@@ -148,22 +154,23 @@ total_word_counts = {}
# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
for filename in pos_folds[i]:
pos_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
for review in pos_folds[i]:
pos_fold_bags[i].append(make_bag(review, total_word_counts))

for filename in neg_folds[i]:
neg_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
for review in neg_folds[i]:
neg_fold_bags[i].append(make_bag(review, total_word_counts))

# Remove words with less than the minimum occurrences threshold.
for k in total_word_counts.keys():
if total_word_counts[k] < MIN_OCCURRENCES:
for fold in (neg_fold_bags + pos_fold_bags):
for bag in fold:
if bag.has_key(k):
bag.pop(k)
total_word_counts.pop(k)
if MIN_OCCURRENCES > 0:
for k in total_word_counts.keys():
if total_word_counts[k] < MIN_OCCURRENCES:
for fold in (neg_fold_bags + pos_fold_bags):
for bag in fold:
if bag.has_key(k):
bag.pop(k)
total_word_counts.pop(k)
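# (Popping inside the loop is safe under Python 2 because dict.keys() returns a
# list copy; Python 3 would need list(total_word_counts.keys()).)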

#num_unique_words = len(total_word_counts.keys())
#print "# unique words:", num_unique_words
@@ -172,34 +179,45 @@ t1 = time.time()
print "Constructed bags, time:", (t1-t0)
avg_acc = 0

wordlist = total_word_counts.keys()
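# wordlist fixes the feature order shared by every vector built below; with
# MIN_OCCURRENCES = 0 it keeps every observed feature, so vectors can get wide.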
for i in range(NUM_FOLDS):
pos_train_filenames = []
neg_train_filenames = []
pos_train_reviews = []
neg_train_reviews = []
pos_train_bags = []
neg_train_bags = []

pos_test_filenames = pos_folds[i]
neg_test_filenames = neg_folds[i]

pos_test_reviews = pos_folds[i]
neg_test_reviews = neg_folds[i]
for j in range(NUM_FOLDS):
if j != i:
pos_train_filenames += pos_folds[j]
neg_train_filenames += neg_folds[j]
pos_train_reviews += pos_folds[j]
neg_train_reviews += neg_folds[j]
pos_train_bags += pos_fold_bags[j]
neg_train_bags += neg_fold_bags[j]

train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
train_bags = pos_train_bags + neg_train_bags

m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))
#m = svmutil.svm_train(train_labels, train_bags, "-t 0 -e %f -m %d -q" % (EPSILON, CACHE_SIZE))
if USE_LIBLINEAR:
classifier = LinearSVC()
else:
classifier = SVC(kernel="linear",tol=EPSILON)
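# (LinearSVC is backed by liblinear and usually trains much faster on large
# problems; SVC(kernel="linear") uses libsvm, like the original svmutil code.)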

train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags]
classifier.fit(train_vecs, train_labels)

test_bags = pos_fold_bags[i] + neg_fold_bags[i]
test_filenames = pos_test_filenames + neg_test_filenames
test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
test_reviews = pos_test_reviews + neg_test_reviews
test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews)

(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
#(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
predicted_labels = classifier.predict(test_vecs)
acc = classifier.score(test_vecs, test_labels)

avg_acc += acc[0]
avg_acc += acc
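# Note: classifier.score() returns a fraction in [0, 1], whereas svm_predict's
# acc[0] was a percentage, so the printed "Total accuracy" is on a new scale.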

"""
indices = random.sample(range(len(test_filenames)), 10)
@@ -214,5 +232,4 @@ t2 = time.time()
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
print "Classification time:", (t2-t1)
print "Total time:", (t2-t0)

print "Total time:", (t2-t0)
