
Commit

Delta TFIDF optimization. 98% accuracy, I'm suspicious.
job13011 committed Apr 11, 2016
1 parent 1fd3910 commit 6c944e0
Showing 3 changed files with 68 additions and 45 deletions.
BagOfWords.py: 4 changes (2 additions, 2 deletions)
@@ -84,11 +84,11 @@ def make_tfidf(document, documents):

# As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value.
# Todo: Bigrams?
def make_delta_tfidf(document, positive_set, negative_set, ref_bag):
def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag):
bag = {}
factor = 0
for term in set(document):
weight = delta_tfidf(term, document, positive_set, negative_set)
weight = delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs)
if (weight != 0):
bag[term] = weight
factor += weight**2
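The hunk cuts off after the squared-weight accumulator, so the tail of make_delta_tfidf is not shown here. A minimal sketch of the whole function, assuming the unshown tail L2-normalizes the bag (which is what accumulating weight**2 into factor suggests) and omitting the ref_bag bookkeeping that lies outside the visible hunk:

# Sketch only; make_delta_tfidf_sketch is an illustration, not the committed function.
import math
from TFIDF import delta_tfidf

def make_delta_tfidf_sketch(document, positive_set, negative_set, pos_idfs, neg_idfs):
    bag = {}
    factor = 0
    for term in set(document):
        weight = delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs)
        if weight != 0:
            bag[term] = weight
            factor += weight ** 2
    norm = math.sqrt(factor)
    if norm > 0:  # assumed L2 normalization of the feature values
        for term in bag:
            bag[term] /= norm
    return bag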
TFIDF.py: 41 changes (29 additions, 12 deletions)
@@ -1,18 +1,37 @@
from __future__ import division
import math
import nltk

# document is assumed to be tokenized (a list of words)
# documents is a list of tokenized docs
def tfidf(term, document, documents):
all_doc_appearances = 0 # number of documents in which term appears

def compute_idfs(documents):
idfs = {}
N = len(documents)
for doc in documents:
if term in doc:
all_doc_appearances += 1
for term in doc:
if idfs.has_key(term):
idfs[term] += 1
else:
idfs[term] = 1
for term in idfs.keys():
idfs[term] = math.log(N/idfs[term])
return idfs

def tfidf(term, document, documents, idfs={}):
if idfs == {}:
all_doc_appearances = sum([doc for doc in documents if term in doc])
idf = math.log(len(documents)/all_doc_appearances, 10)
else:
if idfs.has_key(term):
idf = idfs[term]
else:
return 0 # is this supposed to happen???
doc_appearances = 0 # number of appearances of term in this document
for word in document:
if term == word:
doc_appearances += 1
num_docs = len(documents) # number of documents in the collection
"""
if doc_appearances == 0:
#This happens sometimes, probably due to inconsistent splitting/tokenizing.
#print "Error: no occurrences of", term
@@ -21,12 +40,10 @@ def tfidf(term, document, documents):
#print "Error: fuck,", term
return 0
else:
tfidf = (1 + math.log(doc_appearances,10)) * math.log((float(num_docs)/all_doc_appearances), 10)
return tfidf
"""
tfidf = (1 + math.log(doc_appearances,10)) * idf
return tfidf

# Martineau and Finn 2009
def delta_tfidf(term, document, positive_set, negative_set):
return tfidf(term, document, positive_set) - tfidf(term, document, negative_set)

def delta_tfidf_fast(term, document, positive_set, negative_set):
return tfidf(term, document, positive_set) - tfidf(term, document, negative_set)
def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
return tfidf(term, document, positive_set, pos_idfs) - tfidf(term, document, negative_set, neg_idfs)
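The substantive change in TFIDF.py is that document frequencies are now precomputed once per corpus by compute_idfs and passed down as a lookup table, instead of rescanning every document for each term inside tfidf. The weighting itself is still the delta TF-IDF of Martineau and Finin (2009) (spelled "Finn" in the code comments): a term's weight is its TF-IDF against the positive corpus minus its TF-IDF against the negative one. Two details of the committed code are worth flagging: the no-idfs fallback sums the documents themselves rather than counting the documents containing the term, and compute_idfs increments a term's count on every occurrence (collection frequency) rather than once per document, using natural log where tfidf uses base 10. A minimal sketch of the precomputed-IDF idea with those details tightened up, offered as an illustration rather than as the committed code:

# Sketch only (Python 2 to match the repo); the *_sketch names are illustrative.
from __future__ import division
import math

def compute_idfs_sketch(documents):
    # Document frequency of every term, converted to log10(N / df).
    idfs = {}
    N = len(documents)
    for doc in documents:
        for term in set(doc):  # count each document at most once
            idfs[term] = idfs.get(term, 0) + 1
    for term in idfs:
        idfs[term] = math.log(N / idfs[term], 10)
    return idfs

def tfidf_sketch(term, document, idfs):
    doc_appearances = document.count(term)
    if doc_appearances == 0 or term not in idfs:
        return 0
    return (1 + math.log(doc_appearances, 10)) * idfs[term]

def delta_tfidf_sketch(term, document, pos_idfs, neg_idfs):
    return tfidf_sketch(term, document, pos_idfs) - tfidf_sketch(term, document, neg_idfs)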
review_svm.py: 68 changes (37 additions, 31 deletions)
@@ -11,7 +11,7 @@ import numpy
#import svmutil
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from TFIDF import delta_tfidf
from TFIDF import delta_tfidf, compute_idfs

import BagOfWords
import XMLParser
@@ -34,7 +34,7 @@ USE_ADJ_ONLY = False
USE_NEGATION = True
USE_POSITION = False
GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)
NUM_FOLDS = 10 # For cross-validation (Pang & Lee used 3)

MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
@@ -44,11 +44,13 @@ USE_LIBLINEAR = True # This is supposedly faster for la

USE_AMAZON = False # Use the Amazon review set, not Pang and Lee.

def make_folds(documents, num_partitions):
def make_folds(documents, ids, num_partitions):
folds = [[] for i in range(num_partitions)]
fold_ids = [[] for i in range(num_partitions)]
for i in range(len(documents)):
folds[i % num_partitions].append(documents[i])
return folds
fold_ids[i % num_partitions].append(ids[i])
return (folds, fold_ids)

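make_folds now returns the review ids alongside the folds, so each test prediction can later be written out with its fileid; the partitioning itself is still round-robin. A toy example of the behaviour, with hypothetical data:

docs = ["d0", "d1", "d2", "d3", "d4"]
ids = ["id0", "id1", "id2", "id3", "id4"]
folds, fold_ids = make_folds(docs, ids, 2)
# folds    -> [["d0", "d2", "d4"], ["d1", "d3"]]
# fold_ids -> [["id0", "id2", "id4"], ["id1", "id3"]]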
def make_bag(text, total_word_counts):
return BagOfWords.make(text, ref_bag=total_word_counts,
@@ -123,30 +125,35 @@ if USE_AMAZON:
else:
# Load the Pang and Lee sentiment dataset.
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(id)) for id in ids]
reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
positive_ids = []
negative_ids = []
labels = []
for id in ids:
label = movie_reviews.categories(id)[0]
if label == 'pos':
labels.append(1)
positive_ids.append(id)
elif label == 'neg':
labels.append(-1)
negative_ids.append(id)

positive_reviews = []
negative_reviews = []

for i in range(len(reviews)):
if labels[i] == 1:
positive_reviews.append(reviews[i])
elif labels[i] == -1:
negative_reviews.append(reviews[i])

#TEST
positive_reviews = random.sample(positive_reviews, 250)
negative_reviews = random.sample(negative_reviews, 250)
#positive_reviews = random.sample(positive_reviews, 25)
#negative_reviews = random.sample(negative_reviews, 25)

# Partition reviews into folds.
pos_folds = make_folds(positive_reviews, NUM_FOLDS)
neg_folds = make_folds(negative_reviews, NUM_FOLDS)
(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
(neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, NUM_FOLDS)

# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF)
@@ -156,24 +163,25 @@ total_word_counts = {}
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(NUM_FOLDS)]
neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
for review in pos_folds[i]:
t3 = time.time()
if USE_DELTATFIDF:
pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts))
pos_idfs = pos_fold_idfs[i]
neg_idfs = neg_fold_idfs[i]
pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts))
else:
pos_fold_bags[i].append(make_bag(review, total_word_counts))
t4 = time.time()
print "Bag time:", (t4-t3)
pos_fold_bags[i].append(make_bag(review, total_word_counts))

for review in neg_folds[i]:
t3 = time.time()
if USE_DELTATFIDF:
neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts))
pos_idfs = pos_fold_idfs[i]
neg_idfs = neg_fold_idfs[i]
neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts))
else:
neg_fold_bags[i].append(make_bag(review, total_word_counts))
t4 = time.time()
print "Bag time:", (t4-t3)

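With pos_fold_idfs and neg_fold_idfs computed once up front, building a delta TF-IDF bag no longer rescans both corpora for every term of every review; each weight reduces to two dictionary lookups. Note, though, that each IDF table is built from a single fold, including the fold that later serves as the test set in that iteration of cross-validation, rather than from the training folds only. Given the commit message's own suspicion about the 98% figure, that is one place to look. A hedged sketch of the training-folds-only alternative (not what this commit does):

# Sketch of an alternative, not the committed behaviour: build the IDF tables
# from the training folds only, inside the cross-validation loop.
for i in range(NUM_FOLDS):
    pos_train = [r for j in range(NUM_FOLDS) if j != i for r in pos_folds[j]]
    neg_train = [r for j in range(NUM_FOLDS) if j != i for r in neg_folds[j]]
    pos_idfs = compute_idfs(pos_train)
    neg_idfs = compute_idfs(neg_train)
    # ...build this iteration's bags with these tables, then train and test as below...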
# Remove words with less than the minimum occurrences threshold.
if MIN_OCCURRENCES > 0:
@@ -193,6 +201,8 @@ print "Constructed bags, time:", (t1-t0)
avg_acc = 0

wordlist = total_word_counts.keys()

f = open("results.txt", "w")
for i in range(NUM_FOLDS):
pos_train_reviews = []
neg_train_reviews = []
@@ -201,6 +211,8 @@ for i in range(NUM_FOLDS):

pos_test_reviews = pos_folds[i]
neg_test_reviews = neg_folds[i]
pos_test_ids = pos_fold_ids[i]
neg_test_ids = neg_fold_ids[i]
for j in range(NUM_FOLDS):
if j != i:
pos_train_reviews += pos_folds[j]
@@ -211,7 +223,6 @@ for i in range(NUM_FOLDS):
train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
train_bags = pos_train_bags + neg_train_bags

#m = svmutil.svm_train(train_labels, train_bags, "-t 0 -e %f -m %d -q" % (EPSILON, CACHE_SIZE))
if USE_LIBLINEAR:
classifier = LinearSVC()
else:
@@ -224,23 +235,18 @@ for i in range(NUM_FOLDS):

test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
test_reviews = pos_test_reviews + neg_test_reviews
test_ids = pos_test_ids + neg_test_ids
test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews)

#(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
predicted_labels = classifier.predict(test_vecs)
acc = classifier.score(test_vecs, test_labels)

for i in range(len(test_reviews)):
print "%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])

avg_acc += acc

"""
indices = random.sample(range(len(test_filenames)), 10)
filenames_labels = {}
for j in indices:
filename = test_filenames[j]
predicted_label = predicted_labels[j]
filenames_labels[filename] = predicted_labels[j]
"""

f.close()

t2 = time.time()
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
