Commit 0f1aeae: Added cross validation to SVM

job13011 committed Mar 1, 2016
1 parent aef4d1f commit 0f1aeae
Showing 1 changed file with 76 additions and 42 deletions.

review_svm.py, as it stands after this commit:

# (lines above this hunk, presumably the imports, are not shown in the diff)
# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).

NEG_FOLDER = "review_polarity\\txt_sentoken\\neg"
POS_FOLDER = "review_polarity\\txt_sentoken\\pos"

# "Adapting a technique of Das and Chen (2001)..."
# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
END_SENTENCE_PUNCTUATION = [".", "!", "?"]
PUNCTUATION = [".", "!", "?", ",", ";"]
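# Illustrative example (not from the paper): with the dataset's tokenization,
# "i didn't like this movie ." becomes "i NOT_didn't NOT_like NOT_this NOT_movie ."
# (note that make_bag below also prefixes the negation word itself).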

# TODO Make these command-line parameters.
USE_PRESENCE = False  # If true, use feature presence rather than frequency.
GRAM_LENGTH = 1       # Unigrams, bigrams, ...
NUM_FOLDS = 3         # For cross-validation (Pang & Lee used 3).
EPSILON = 0.01        # Tolerance of the SVM termination criterion; smaller values mean longer training.

def make_bag(filename):
    f = open(filename)
    lines = f.readlines()
    f.close()
    text = " ".join(lines)
    bag_of_words = {}
    do_negation = False

    words = text.split(" ")
    count = 0
    for i in range(len(words) - GRAM_LENGTH + 1):
        n_gram = "_".join(words[i:i+GRAM_LENGTH])
        if GRAM_LENGTH == 1:  # Pang and Lee didn't do negation tagging for bigrams.
            if n_gram in NEGATION_WORDS:
                do_negation = True
            if n_gram in PUNCTUATION:
                do_negation = False
            elif do_negation:
                n_gram = "NOT_" + n_gram

        # LIBSVM expects positive feature indices, but Python's hash() can be
        # negative, so shift the hash into the positive range.
        index = abs(hash(n_gram)) + 1
        if (not USE_PRESENCE) and index in bag_of_words:
            bag_of_words[index] += 1
            count += 1
        else:
            bag_of_words[index] = 1
            count += 1
    # Normalize the bag of words.
    #for k in bag_of_words.keys():
    #    bag_of_words[k] = float(bag_of_words[k])/count

    return bag_of_words
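
# Each bag is a sparse {feature index: value} dict, the format LIBSVM's Python
# wrapper accepts directly. Illustrative use (the filename is hypothetical):
#bag = make_bag("review_polarity\\txt_sentoken\\neg\\cv000_29416.txt")
#print "%d distinct features" % len(bag)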

neg_filenames = []
# ... unchanged lines collapsed in the diff: pos_filenames is declared and both
# lists are filled with the review file paths under NEG_FOLDER and POS_FOLDER ...

random.shuffle(neg_filenames)
random.shuffle(pos_filenames)
neg_folds = [[] for i in range(NUM_FOLDS)]
pos_folds = [[] for i in range(NUM_FOLDS)]
for i in range(len(neg_filenames)):
    neg_folds[i % NUM_FOLDS].append(neg_filenames[i])

for i in range(len(pos_filenames)):
    pos_folds[i % NUM_FOLDS].append(pos_filenames[i])
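
# Sanity checks (illustrative): the round-robin assignment above keeps fold
# sizes within one file of each other and loses no files.
assert max(len(f) for f in neg_folds) - min(len(f) for f in neg_folds) <= 1
assert sum(len(f) for f in pos_folds) == len(pos_filenames)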

#neg_fold_bags = [[] for i in range(NUM_FOLDS)]
#pos_fold_bags = [[] for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
    # TRAIN on all folds except fold i, then TEST on fold i.
    neg_train_bags = []
    pos_train_bags = []

    neg_train_filenames = []
    pos_train_filenames = []
    neg_test_filenames = neg_folds[i]
    pos_test_filenames = pos_folds[i]
    for j in range(NUM_FOLDS):
        if j != i:
            neg_train_filenames += neg_folds[j]
            pos_train_filenames += pos_folds[j]

    for filename in neg_train_filenames:
        neg_train_bags.append(make_bag(filename))

    for filename in pos_train_filenames:
        pos_train_bags.append(make_bag(filename))

    train_labels = [-1] * len(neg_train_bags) + [1] * len(pos_train_bags)
    train_bags = neg_train_bags + pos_train_bags

    # TODO: Investigate LIBSVM training parameters.
    # -t 0 selects a linear kernel; -e sets the termination tolerance.
    m = svm_train(train_labels, train_bags, "-t 0 -e %f" % EPSILON)

    # TEST
    test_bags = []
    test_filenames = neg_test_filenames + pos_test_filenames

    for filename in test_filenames:
        test_bags.append(make_bag(filename))

    test_labels = [-1] * len(neg_test_filenames) + [1] * len(pos_test_filenames)
    (predicted_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m)
    #indices = random.sample(range(len(test_filenames)))
    #for i in indices:
    #    filename = test_filenames[i]
    #    if filename in neg_
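
    # acc is LIBSVM's (accuracy%, mean squared error, squared correlation) tuple,
    # so mean cross-validation accuracy could be collected like this (sketch only;
    # assumes fold_accuracies = [] is initialized before the loop):
    #fold_accuracies.append(acc[0])

#print "Mean accuracy over %d folds: %f" % (NUM_FOLDS, sum(fold_accuracies) / NUM_FOLDS)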

# Diagnostic leftover: list a bag's 100 most frequent features.
#sorted(bag_of_words.items(), key=lambda (k,v): -v)[:100]
