Skip to content

Commit

Permalink
stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
job13011 committed Mar 1, 2016
1 parent 0f1aeae commit 88fcad2
Showing 1 changed file with 50 additions and 48 deletions.
98 changes: 50 additions & 48 deletions review_svm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]

# Scale factor multiplied into every bag-of-words entry when make_bag
# normalizes counts (entry * NORMAL_LENGTH / count).
NORMAL_LENGTH = 1000

# TODO Make these command-line parameters.
# NOTE(review): this listing comes from a commit diff view, so the repeated
# GRAM_LENGTH and EPSILON assignments below are presumably the removed line
# followed by its replacement; if both really are in the file, the later
# assignment is the one that takes effect. Confirm against the actual file.
USE_PRESENCE = False # If true, use presence rather than frequency.
GRAM_LENGTH = 1 # Unigrams, bigrams, ...
GRAM_LENGTH = 3 # Unigrams, bigrams, ...
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
# EPSILON is handed to svm_train as the value of its "-e" option.
EPSILON = 0.01 # determines how long the algorithm runs
EPSILON = 0.001 # determines how long the algorithm runs (default is 0.001)
# KERNEL_TYPE is handed to svm_train as the value of its "-t" option.
KERNEL_TYPE = 0 # 0: linear, 2: radial basis

def make_bag(filename):
f = open(filename)
Expand Down Expand Up @@ -48,80 +51,79 @@ def make_bag(filename):
else:
bag_of_words[index] = 1
count += 1
# Normalize the bag of words.
#for k in bag_of_words.keys():
# bag_of_words[k] = float(bag_of_words[k])/count

# Normalize the bag of words, scaling each entry up by NORMAL_LENGTH. For whatever reason, it didn't work very well with small decimals.
for k in bag_of_words.keys():
bag_of_words[k] = float(NORMAL_LENGTH*bag_of_words[k])/count
return bag_of_words

neg_filenames = []
pos_filenames = []
neg_filenames = []
word_table = {}
next_word_index = 0

for (folder, x, filenames) in os.walk(NEG_FOLDER):
for filename in filenames:
if filename.endswith(".txt"):
neg_filenames.append(folder + "\\" + filename)

for (folder, x, filenames) in os.walk(POS_FOLDER):
for filename in filenames:
if filename.endswith(".txt"):
pos_filenames.append(folder + "\\" + filename)

random.shuffle(neg_filenames)
random.shuffle(pos_filenames)
neg_folds = [[] for i in range(NUM_FOLDS)]
for (folder, x, filenames) in os.walk(NEG_FOLDER):
for filename in filenames:
if filename.endswith(".txt"):
neg_filenames.append(folder + "\\" + filename)

# Partition reviews into folds.
pos_folds = [[] for i in range(NUM_FOLDS)]
for i in range(len(neg_filenames)):
neg_folds[i % NUM_FOLDS].append(neg_filenames[i])
neg_folds = [[] for i in range(NUM_FOLDS)]

for i in range(len(pos_filenames)):
pos_folds[i % NUM_FOLDS].append(pos_filenames[i])

#TRAIN

#neg_fold_bags = [[] for i in range(NUM_FOLDS)]
#pos_fold_bags = [[] for i in range(NUM_FOLDS)]
for i in range(len(neg_filenames)):
neg_folds[i % NUM_FOLDS].append(neg_filenames[i])

# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
for filename in pos_folds[i]:
pos_fold_bags[i].append(make_bag(filename))

for filename in neg_folds[i]:
neg_fold_bags[i].append(make_bag(filename))

neg_train_bags = []

for i in range(NUM_FOLDS):
pos_train_bags = []
neg_train_bags = []

neg_train_filenames = []
pos_train_filenames = []
neg_test_filenames = neg_folds[i]
neg_train_filenames = []

pos_test_filenames = pos_folds[i]
neg_test_filenames = neg_folds[i]

for j in range(NUM_FOLDS):
if j != i:
neg_train_filenames += neg_folds[j]
pos_train_filenames += pos_folds[j]
neg_train_filenames += neg_folds[j]
pos_train_bags += pos_fold_bags[j]
neg_train_bags += neg_fold_bags[j]

for filename in neg_train_filenames:
neg_train_bags.append(make_bag(filename))

for filename in pos_train_filenames:
pos_train_bags.append(make_bag(filename))

train_labels = [-1] * len(neg_train_bags) + [1] * len(pos_train_bags)
train_bags = neg_train_bags + pos_train_bags
train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
train_bags = pos_train_bags + neg_train_bags

# TODO: Investigate LIBSVM training parameters.
m = svm_train(train_labels, train_bags, "-t 0 -e %d" % EPSILON)

# TEST
test_bags = []
test_filenames = neg_test_filenames + pos_test_filenames

for filename in test_filenames:
test_bags.append(make_bag(filename))
m = svm_train(train_labels, train_bags, "-t %d -e %f" % (KERNEL_TYPE, EPSILON))

test_labels = [-1] * len(neg_test_filenames) + [1] * len(pos_test_filenames)
(train_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m)
#indices = random.sample(range(len(test_filenames)))
#for i in indices:
# filename = test_filenames[i]
# if filename in neg_
test_bags = pos_fold_bags[i] + neg_fold_bags[i]
test_filenames = pos_test_filenames + neg_test_filenames
test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

#sorted(bag_of_words.items(), key=lambda (k,v): -v)[:100]
(predicted_labels, acc, p_vals) = svm_predict(test_labels, test_bags, m)
indices = random.sample(range(len(test_filenames)), 10)
filenames_labels = {}
for j in indices:
filename = test_filenames[j]
predicted_label = predicted_labels[j]
filenames_labels[filename] = predicted_labels[j]

0 comments on commit 88fcad2

Please sign in to comment.