Commit
added NLTK integration (POS tags), LIBLINEAR support
job13011 committed Mar 15, 2016
1 parent 88fcad2 commit 64bcfa5
Showing 1 changed file with 59 additions and 26 deletions.
review_svm.py (85 changes: 59 additions & 26 deletions)
@@ -1,28 +1,36 @@
import os
import random
import string
-from svmutil import *
import time

+import nltk
+import svmutil
+import liblinearutil

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).

-NEG_FOLDER = "review_polarity\\txt_sentoken\\neg"
-POS_FOLDER = "review_polarity\\txt_sentoken\\pos"
+NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
+POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

NORMAL_LENGTH = 1000

# TODO Make these command-line parameters.
USE_PRESENCE = False # If true, use presence rather than frequency.
-GRAM_LENGTH = 3 # Unigrams, bigrams, ...
+USE_POS_TAGS = False
+USE_ADJ_ONLY = False
+GRAM_LENGTH = 1 # Unigrams, bigrams, ...
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
EPSILON = 0.001 # determines how long the algorithm runs (default is 0.001)
KERNEL_TYPE = 0 # 0: linear, 2: radial basis
+NORMALIZE_BAGS = False
+USE_LIBLINEAR = True
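The Das and Chen (2001) negation tagging quoted in the comment block above is the subtlest part of make_bag below, so here is a minimal standalone sketch of the idea. The word lists mirror the constants above; the sample sentence is invented:

NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

def tag_negation(tokens):
    tagged = []
    do_negation = False
    for token in tokens:
        if token in NEGATION_WORDS:
            do_negation = True      # a negation word starts the marked span
        elif token in PUNCTUATION:
            do_negation = False     # the next punctuation mark ends it
        elif do_negation:
            token = "NOT_" + token  # everything in between gets the NOT_ tag
        tagged.append(token)
    return tagged

print(tag_negation("this movie is not funny or clever .".split()))
# ['this', 'movie', 'is', 'not', 'NOT_funny', 'NOT_or', 'NOT_clever', '.']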

def make_bag(filename):
    f = open(filename)
@@ -32,28 +40,39 @@ def make_bag(filename):
    bag_of_words = {}
    do_negation = False

-    words = text.split(" ")
+    #words = text.split(" ")
+    words = nltk.word_tokenize(text)
+    if USE_POS_TAGS and GRAM_LENGTH == 1:
+        tagged = nltk.pos_tag(words)
    count = 0
    for i in range(len(words) - GRAM_LENGTH + 1):
        n_gram = string.join(words[i:i+GRAM_LENGTH], "_")
        if (GRAM_LENGTH == 1): # Pang and Lee didn't do negation tagging for bigrams.
            if n_gram in NEGATION_WORDS:
                do_negation = True
-            if n_gram in PUNCTUATION:
+            elif n_gram in PUNCTUATION:
                do_negation = False
-            elif do_negation:
+
+            if USE_POS_TAGS:
+                n_gram = string.join(tagged[i], "_")
+            if do_negation:
                n_gram = "NOT_" + n_gram

        index = hash(n_gram)
-        if (not USE_PRESENCE) and bag_of_words.has_key(index):
-            bag_of_words[index] += 1
-            count += 1
-        else:
-            bag_of_words[index] = 1
-            count += 1
+        if not (USE_POS_TAGS and USE_ADJ_ONLY and (tagged[i][1] != "JJ")):
+            if (not USE_PRESENCE) and bag_of_words.has_key(index):
+                bag_of_words[index] += 1
+                #print n_gram + " => " + str(bag_of_words[index])
+                count += 1
+            else:
+                bag_of_words[index] = 1
+                count += 1
+                #print n_gram + " => " + str(bag_of_words[index])
    # Normalize the bag of words. For whatever reason it didn't work very well with small decimals...
-    for k in bag_of_words.keys():
-        bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
+    if NORMALIZE_BAGS:
+        for k in bag_of_words.keys():
+            bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count

    return bag_of_words
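For reference, this is roughly what the new NLTK calls in make_bag produce: nltk.word_tokenize returns a token list, and nltk.pos_tag pairs each token with a Penn Treebank tag (adjectives are tagged JJ, which is what the USE_ADJ_ONLY filter keeps); each feature is then keyed by hash(). The sample text is invented, and the download lines name the models current NLTK releases need, which is an assumption about the local setup rather than part of this commit:

import nltk
# One-time model downloads (assumed setup, not in the commit):
# nltk.download("punkt"); nltk.download("averaged_perceptron_tagger")

text = "an unexpectedly clever film"   # invented sample review text
words = nltk.word_tokenize(text)
tagged = nltk.pos_tag(words)           # list of (word, tag) pairs, e.g. ('clever', 'JJ')

# Unigram features keyed the way make_bag keys them: hash("word_TAG") -> count.
bag = {}
for word, tag in tagged:
    if tag == "JJ":                    # keep adjectives only, as USE_ADJ_ONLY does
        key = hash(word + "_" + tag)
        bag[key] = bag.get(key, 0) + 1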

pos_filenames = []
@@ -64,12 +83,12 @@ def make_bag(filename):
for (folder, x, filenames) in os.walk(POS_FOLDER):
    for filename in filenames:
        if filename.endswith(".txt"):
-            pos_filenames.append(folder + "\\" + filename)
+            pos_filenames.append(os.path.join(folder, filename))

for (folder, x, filenames) in os.walk(NEG_FOLDER):
    for filename in filenames:
        if filename.endswith(".txt"):
-            neg_filenames.append(folder + "\\" + filename)
+            neg_filenames.append(os.path.join(folder, filename))

# Partition reviews into folds.
pos_folds = [[] for i in range(NUM_FOLDS)]
@@ -84,6 +103,8 @@ def make_bag(filename):
# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

print "Constructed bags."

for i in range(NUM_FOLDS):
    for filename in pos_folds[i]:
@@ -93,6 +114,7 @@ def make_bag(filename):
        neg_fold_bags[i].append(make_bag(filename))
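The code that actually fills pos_folds and neg_folds is collapsed in this view. One conventional way to split the filename lists into NUM_FOLDS roughly equal folds, shown here purely as an assumption about what the hidden lines do, is round-robin assignment:

NUM_FOLDS = 3

def partition(filenames, num_folds=NUM_FOLDS):
    # File i lands in fold i % num_folds, so fold sizes differ by at most one.
    folds = [[] for _ in range(num_folds)]
    for i, name in enumerate(filenames):
        folds[i % num_folds].append(name)
    return folds

print(partition(["cv000.txt", "cv001.txt", "cv002.txt", "cv003.txt"]))  # illustrative names
# [['cv000.txt', 'cv003.txt'], ['cv001.txt'], ['cv002.txt']]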


+t1 = time.time()
for i in range(NUM_FOLDS):
    pos_train_bags = []
    neg_train_bags = []
@@ -114,16 +136,27 @@ def make_bag(filename):
    train_bags = pos_train_bags + neg_train_bags

    # TODO: Investigate LIBSVM training parameters.
-    m = svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON))
+    if USE_LIBLINEAR:
+        m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
+    else:
+        m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON))


    test_bags = pos_fold_bags[i] + neg_fold_bags[i]
    test_filenames = pos_test_filenames + neg_test_filenames
    test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

-    (predicted_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m)
+    if USE_LIBLINEAR:
+        (predicted_labels, acc, p_vals) = liblinearutil.predict(test_labels, test_bags, m)
+    else:
+        (predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)

    indices = random.sample(range(len(test_filenames)), 10)
    filenames_labels = {}
    for j in indices:
        filename = test_filenames[j]
        predicted_label = predicted_labels[j]
        filenames_labels[filename] = predicted_label

+t2 = time.time()
+print "Total time:", t2-t1
