Various fixes (min occurrences, etc.)
job13011 committed Mar 16, 2016
1 parent 64bcfa5 commit 5deb398
Showing 1 changed file with 83 additions and 33 deletions.
116 changes: 83 additions & 33 deletions review_svm.py
@@ -5,7 +5,7 @@

import nltk
import svmutil
-import liblinearutil
+#import liblinearutil

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).
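For reference, here is a minimal, hedged sketch of the LIBSVM Python interface the script relies on (illustrative only; the toy features, labels, and option string below are placeholders, not from this commit). svmutil represents each example as a dict mapping integer feature indices to values, which is why make_bag hashes n-grams to numbers further down.

import svmutil

# Two toy reviews as sparse {feature_index: count} dicts, labeled +1 / -1.
train_x = [{1: 2, 5: 1}, {2: 1, 7: 3}]
train_y = [1, -1]

# Train a linear-kernel SVM and predict on the same toy data.
model = svmutil.svm_train(train_y, train_x, "-t 0 -e 0.001")
labels, acc, vals = svmutil.svm_predict(train_y, train_x, model)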
@@ -22,62 +22,77 @@
NORMAL_LENGTH = 1000

# TODO Make these command-line parameters.
-USE_PRESENCE = False # If true, use presence rather than frequency.
-USE_POS_TAGS = False
-USE_ADJ_ONLY = False
-GRAM_LENGTH = 1 # Unigrams, bigrams, ...
-NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
-EPSILON = 0.001 # determines how long the algorithm runs (default is 0.001)
-KERNEL_TYPE = 0 # 0: linear, 2: radial basis
-NORMALIZE_BAGS = False
-USE_LIBLINEAR = True
-
-def make_bag(filename):
+USE_PRESENCE = False # If true, use presence rather than frequency.
+USE_POS_TAGS = True
+USE_ADJ_ONLY = False
+GRAM_LENGTH = 2 # Unigrams, bigrams, ...
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
+EPSILON = .1 # determines how long the algorithm runs (default is 0.001)
+KERNEL_TYPE = 0 # 0: linear, 2: radial basis
+NORMALIZE_BAGS = False
+USE_LIBLINEAR = False
+CACHE_SIZE = 512
+MIN_OCCURRENCES = 10 # To be included, the word must show up this many times across all documents

+def file_to_text(filename):
+f = open(filename)
+lines = f.readlines()
+f.close()
+text = string.join(lines, " ")
+return text
+
+def make_bag(text, ref_bag):
bag_of_words = {}
do_negation = False

#words = text.split(" ")
words = nltk.word_tokenize(text)
-if USE_POS_TAGS and GRAM_LENGTH==1:
+if USE_POS_TAGS:# and GRAM_LENGTH==1:
t5 = time.time()
tagged = nltk.pos_tag(words)
tagged = [string.join(t, "_") for t in tagged]
words = tagged
t6 = time.time()
print "Tag time (%d words): %f" % (len(words), (t6-t5))
count = 0
for i in range(len(words) - GRAM_LENGTH + 1):
-n_gram = string.join(words[i:i+GRAM_LENGTH+1], "_")
+n_gram = string.join(words[i:i+GRAM_LENGTH], "_")
if (GRAM_LENGTH == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
do_negation = True
elif n_gram in PUNCTUATION:
do_negation = False

-if USE_POS_TAGS:
-n_gram = string.join(tagged[i], "_")
if do_negation:
n_gram = "NOT_" + n_gram

# LIBSVM won't use strings as keys, so hash to convert to a number.
index = hash(n_gram)
if not (USE_POS_TAGS and USE_ADJ_ONLY and (tagged[i][1] != "JJ")):
+#if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
if (not USE_PRESENCE) and bag_of_words.has_key(index):
bag_of_words[index] += 1
-#print n_gram + " => " + str(bag_of_words[index])
count += 1
+#print n_gram, "=>", bag_of_words[index]
else:
bag_of_words[index] = 1
count += 1
-#print n_gram + " => " + str(bag_of_words[index])
+#print n_gram, "=>", bag_of_words[index]

+# Add it to the reference bag
+if ref_bag.has_key(index):
+ref_bag[index] += 1
+else:
+ref_bag[index] = 1
# Normalize the bag of words. For whatever reason it didn't work very well with small decimals...
if NORMALIZE_BAGS:
for k in bag_of_words.keys():
bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count

return bag_of_words
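As a rough illustration of what make_bag produces in the unigram case (a sketch under assumptions, not part of the commit; the negation and punctuation sets below are stand-ins for the NEGATION_WORDS and PUNCTUATION lists defined earlier in review_svm.py): every token between a negation word and the next punctuation mark gets a NOT_ prefix, and each resulting token is hashed to a numeric LIBSVM feature index.

# Toy unigram walk-through of negation tagging plus feature hashing.
tokens = ["not", "a", "good", "movie", "."]
bag = {}
negate = False
for tok in tokens:
    if tok in ("not", "no", "n't"):        # stand-in for NEGATION_WORDS
        negate = True
    elif tok in (".", ",", "!", "?"):      # stand-in for PUNCTUATION
        negate = False
    feat = "NOT_" + tok if negate else tok
    bag[hash(feat)] = bag.get(hash(feat), 0) + 1
# Every token from the negation word up to (but not including) the period
# is counted under its NOT_-prefixed hash; the period itself is unprefixed.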

t0 = time.time()

pos_filenames = []
neg_filenames = []
-word_table = {}
-next_word_index = 0

for (folder, x, filenames) in os.walk(POS_FOLDER):
@@ -89,6 +104,10 @@ def make_bag(filename):
for filename in filenames:
if filename.endswith(".txt"):
neg_filenames.append(os.path.join(folder, filename))

+# TEST
+#pos_filenames = random.sample(pos_filenames, 20)
+#neg_filenames = random.sample(neg_filenames, 20)

# Partition reviews into folds.
pos_folds = [[] for i in range(NUM_FOLDS)]
@@ -100,27 +119,51 @@ def make_bag(filename):
for i in range(len(neg_filenames)):
neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
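A brief worked example of the round-robin split above (illustrative; the filenames are made up): with NUM_FOLDS = 3, file i simply lands in fold i % 3.

# Five toy files split round-robin into three folds.
files = ["cv000.txt", "cv001.txt", "cv002.txt", "cv003.txt", "cv004.txt"]
folds = [[] for _ in range(3)]
for i in range(len(files)):
    folds[i % 3].append(files[i])
# folds == [["cv000.txt", "cv003.txt"], ["cv001.txt", "cv004.txt"], ["cv002.txt"]]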

+# Count occurrences of every word across all documents
+# (this is important for e.g. Delta TFIDF)
+word_table = {}

# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

print "Constructed bags."

for i in range(NUM_FOLDS):
for filename in pos_folds[i]:
-pos_fold_bags[i].append(make_bag(filename))
+t3 = time.time()
+pos_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
+t4 = time.time()
+print "Bag time:", (t4-t3)

for filename in neg_folds[i]:
-neg_fold_bags[i].append(make_bag(filename))
+t3 = time.time()
+neg_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
+t4 = time.time()
+print "Bag time:", (t4-t3)


+# Remove words with less than the minimum occurrences threshold.
+for k in word_table.keys():
+if word_table[k] < MIN_OCCURRENCES:
+for bag in (neg_fold_bags + pos_fold_bags):
+if bag.has_key(k):
+bag.pop(k)

-#word_table = make_bag(all_text, use_presence=False)
+for k in word_table.keys():
+if word_table[k] < MIN_OCCURRENCES:
+word_table.pop(k)
+num_unique_words = len(word_table.keys())
+print "# unique words:", num_unique_words

t1 = time.time()
-for i in range(NUM_FOLDS):
-pos_train_bags = []
-neg_train_bags = []
print "Constructed bags, time:", (t1-t0)
avg_acc = 0

for i in range(NUM_FOLDS):
pos_train_filenames = []
neg_train_filenames = []
pos_train_bags = []
neg_train_bags = []

pos_test_filenames = pos_folds[i]
neg_test_filenames = neg_folds[i]
@@ -136,27 +179,34 @@ def make_bag(filename):
train_bags = pos_train_bags + neg_train_bags

# TODO: Investigate LIBSVM training parameters.
+# TODO: Why does LIBLINEAR break my computer?
if USE_LIBLINEAR:
-m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
+pass#m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
else:
-m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON))

+m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))

test_bags = pos_fold_bags[i] + neg_fold_bags[i]
test_filenames = pos_test_filenames + neg_test_filenames
test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

if USE_LIBLINEAR:
-(predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
+pass#(predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
else:
(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)

+avg_acc += acc[0]

"""
indices = random.sample(range(len(test_filenames)), 10)
filenames_labels = {}
for j in indices:
filename = test_filenames[j]
predicted_label = predicted_labels[j]
filenames_labels[filename] = predicted_labels[j]
"""

t2 = time.time()
print "Total time:", t2-t1
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
print "Total time:", (t2-t1)
