
Commit

Moved bag of words to its own file.
job13011 committed Mar 17, 2016
1 parent 47c6a2a commit 593be58
Showing 2 changed files with 163 additions and 113 deletions.
54 changes: 54 additions & 0 deletions BagOfWords.py
@@ -0,0 +1,54 @@
import nltk
import string

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"] #TODO make this work with POS tags (._.)


def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_only=False, gram_length=1, normalize_bags=True):
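    # Returns {hash(n-gram): count} for this document (the count stays at 1 when
    # use_presence is set). If ref_bag is given, corpus-wide occurrence counts are
    # also accumulated into it as a side effect.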
bag_of_words = {}
do_negation = False

words = nltk.word_tokenize(text)
if use_pos_tags:# and gram_length==1:
        tagged = nltk.pos_tag(words)  # keep the (word, POS-tag) tuples so tagged[i][1] below is the tag
        words = [string.join(t, "_") for t in tagged]  # the n-gram tokens become e.g. "good_JJ"
count = 0
for i in range(len(words) - gram_length + 1):
n_gram = string.join(words[i:i+gram_length], "_")
if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
do_negation = True
elif n_gram in PUNCTUATION:
do_negation = False
if do_negation:
n_gram = "NOT_" + n_gram

# LIBSVM won't use strings as keys, so hash to convert to a number.
index = hash(n_gram)
if not (use_pos_tags and use_adj_only and (tagged[i][1] != "JJ")):
#if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
if (not use_presence) and bag_of_words.has_key(index):
bag_of_words[index] += 1
count += 1
else:
bag_of_words[index] = 1
count += 1

# Add it to the reference bag
if ref_bag != None:
if ref_bag.has_key(index):
ref_bag[index] += 1
else:
ref_bag[index] = 1

# TODO do this correctly

#if normalize_bags:
# for k in bag_of_words.keys():
# bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
return bag_of_words
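
A minimal usage sketch for the new module (not part of the commit; assumes Python 2 with NLTK and its punkt tokenizer data installed — the hashed feature indices vary by platform):

import BagOfWords

corpus_counts = {}                        # shared reference bag across documents
bag = BagOfWords.make("This movie was not good .", ref_bag=corpus_counts)

# Every token after "not", up to the next punctuation mark, is rewritten as
# NOT_<token> before hashing, so the keys are hash("This"), hash("movie"),
# hash("was"), hash("NOT_not"), hash("NOT_good") and hash(".").
print bag[hash("NOT_good")]               # -> 1
print len(corpus_counts)                  # -> 6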
222 changes: 109 additions & 113 deletions review_svm.py
@@ -2,158 +2,161 @@ import os
import random
import string
import time
import sys

import nltk
import svmutil
#import liblinearutil

import BagOfWords

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).

POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
# TODO make this a parameter
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

NORMAL_LENGTH = 1000

# TODO Make these command-line parameters.
# These are now command line parameters! See below...
USE_PRESENCE = False # If true, use presence rather than frequency.
USE_POS_TAGS = True
USE_POS_TAGS = False
USE_ADJ_ONLY = False
GRAM_LENGTH = 2 # Unigrams, bigrams, ...
USE_NEGATION = True
GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
EPSILON = .1 # determines how long the algorithm runs (default is 0.001)
KERNEL_TYPE = 0 # 0: linear, 2: radial basis
NORMALIZE_BAGS = False
USE_LIBLINEAR = False

MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON = .001 # determines how long the algorithm runs (default is 0.001)

KERNEL_TYPE = 0 # 0: linear, 2: radial basis (just use linear)
NORMALIZE_BAGS = True
USE_LIBLINEAR = False # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look
CACHE_SIZE = 512
MIN_OCCURRENCES = 10 # To be included, the word must show up this many times across all documents

def file_to_text(filename):
f = open(filename)
lines = f.readlines()
f.close()
text = string.join(lines, " ")
return text

def generate_filenames(folder_name):
filenames = []
for (folder, x, folder_filenames) in os.walk(folder_name):
for filename in folder_filenames:
if filename.endswith(".txt"):
filenames.append(os.path.join(folder, filename))
return filenames

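# Deal the filenames into num_partitions folds round-robin, so fold sizes differ by at most one.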
def partition_filenames(filenames, num_partitions):
partitions = [[] for i in range(num_partitions)]
for i in range(len(filenames)):
partitions[i % num_partitions].append(filenames[i])
return partitions

def make_bag(text, ref_bag):
bag_of_words = {}
do_negation = False

words = nltk.word_tokenize(text)
if USE_POS_TAGS:# and GRAM_LENGTH==1:
t5 = time.time()
tagged = nltk.pos_tag(words)
tagged = [string.join(t, "_") for t in tagged]
words = tagged
t6 = time.time()
print "Tag time (%d words): %f" % (len(words), (t6-t5))
count = 0
for i in range(len(words) - GRAM_LENGTH + 1):
n_gram = string.join(words[i:i+GRAM_LENGTH], "_")
if (GRAM_LENGTH == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
do_negation = True
elif n_gram in PUNCTUATION:
do_negation = False

if do_negation:
n_gram = "NOT_" + n_gram

# LIBSVM won't use strings as keys, so hash to convert to a number.
index = hash(n_gram)
if not (USE_POS_TAGS and USE_ADJ_ONLY and (tagged[i][1] != "JJ")):
#if not (ref_bag != None and ((not ref_bag.has_key(index)) or (ref_bag[index] < MIN_OCCURRENCES))):
if (not USE_PRESENCE) and bag_of_words.has_key(index):
bag_of_words[index] += 1
count += 1
print n_gram, "=>", bag_of_words[index]
else:
bag_of_words[index] = 1
count += 1
print n_gram, "=>", bag_of_words[index]

# Add it to the reference bag
if ref_bag.has_key(index):
ref_bag[index] += 1
else:
ref_bag[index] = 1
# Normalize the bag of words. For whatever reason it didn't work very well with small decimals...
if NORMALIZE_BAGS:
for k in bag_of_words.keys():
bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
return bag_of_words
# Set parameters from command-line arguments.
i = 0
try:
args = sys.argv[1:]
while i < len(args):
if args[i] == "--gram-length":
GRAM_LENGTH = int(args[i+1])
i += 2
elif args[i] == "--num-folds":
NUM_FOLDS = int(args[i+1])
i += 2
elif args[i] == "--presence":
USE_PRESENCE = True
i += 1
elif args[i] == "--frequency":
USE_PRESENCE = False
i += 1
elif args[i] == "--use-pos-tags":
USE_POS_TAGS = True
i += 1
elif args[i] == "--use-adj-only":
USE_ADJ_ONLY = True
i += 1
elif args[i] == "--use-negation":
USE_NEGATION = True
i += 1
elif args[i] == "--no-negation":
USE_NEGATION = False
i += 1
elif args[i] == "--threshold":
MIN_OCCURRENCES = int(args[i+1])
i += 2
elif args[i] == "--epsilon":
EPSILON = float(args[i+1])
i += 2
elif args[i] == "--help":
print "Usage:"
print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)"
print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
exit()
else:
print "Error: Invalid argument", args[i]
i += 1
except Exception:
print "Invalid arguments"

t0 = time.time()

pos_filenames = []
neg_filenames = []
next_word_index = 0

for (folder, x, filenames) in os.walk(POS_FOLDER):
for filename in filenames:
if filename.endswith(".txt"):
pos_filenames.append(os.path.join(folder, filename))

for (folder, x, filenames) in os.walk(NEG_FOLDER):
for filename in filenames:
if filename.endswith(".txt"):
neg_filenames.append(os.path.join(folder, filename))

# TEST
pos_filenames = generate_filenames(POS_FOLDER)
neg_filenames = generate_filenames(NEG_FOLDER)

# TEST - to test on a subset of reviews (since some operations [i.e. tagging] are slow)
#pos_filenames = random.sample(pos_filenames, 20)
#neg_filenames = random.sample(neg_filenames, 20)

# Partition reviews into folds.
pos_folds = [[] for i in range(NUM_FOLDS)]
neg_folds = [[] for i in range(NUM_FOLDS)]

for i in range(len(pos_filenames)):
pos_folds[i % NUM_FOLDS].append(pos_filenames[i])

for i in range(len(neg_filenames)):
neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
pos_folds = partition_filenames(pos_filenames, NUM_FOLDS)
neg_folds = partition_filenames(neg_filenames, NUM_FOLDS)

# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF)
word_table = {}
total_word_counts = {}

# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
for filename in pos_folds[i]:
t3 = time.time()
pos_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
t4 = time.time()
print "Bag time:", (t4-t3)
pos_fold_bags[i].append(BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))

for filename in neg_folds[i]:
t3 = time.time()
neg_fold_bags[i].append(make_bag(file_to_text(filename), ref_bag=word_table))
t4 = time.time()
print "Bag time:", (t4-t3)
neg_fold_bags[i].append(
BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))


# Remove words with less than the minimum occurrences threshold.
for k in word_table.keys():
if word_table[k] < MIN_OCCURRENCES:
for bag in (neg_fold_bags + pos_fold_bags):
if bag.has_key(k):
bag.pop(k)

#word_table = make_bag(all_text, use_presence=False)
for k in word_table.keys():
if word_table[k] < MIN_OCCURRENCES:
word_table.pop(k)
num_unique_words = len(word_table.keys())
print "# unique words:", num_unique_words
for k in total_word_counts.keys():
if total_word_counts[k] < MIN_OCCURRENCES:
for fold in (neg_fold_bags + pos_fold_bags):
for bag in fold:
if bag.has_key(k):
bag.pop(k)
total_word_counts.pop(k)

#num_unique_words = len(total_word_counts.keys())
#print "# unique words:", num_unique_words

t1 = time.time()
print "Constructed bags, time:", (t1-t0)
@@ -178,21 +181,13 @@ for i in range(NUM_FOLDS):
train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
train_bags = pos_train_bags + neg_train_bags

# TODO: Investigate LIBSVM training parameters.
# TODO: Why does LIBLINEAR break my computer?
if USE_LIBLINEAR:
pass#m = liblinearutil.train(train_labels, train_bags, "-e %f" % EPSILON)
else:
m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))
m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))

test_bags = pos_fold_bags[i] + neg_fold_bags[i]
test_filenames = pos_test_filenames + neg_test_filenames
test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

if USE_LIBLINEAR:
pass#(predicted_labels, acc, p_vals) = liblinearutil.svm_predict(test_labels, test_bags, m)
else:
(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)

avg_acc += acc[0]

@@ -208,5 +203,6 @@
t2 = time.time()
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
print "Total time:", (t2-t1)
print "Classification time:", (t2-t1)
print "Total time:", (t2-t0)
