from __future__ import division
import os
import random
import string
import sys
import nltk
from nltk.corpus import movie_reviews
import numpy
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import BagOfWords
import XMLParser
import TwitterCorpus
from TFIDF import delta_tfidf, compute_idfs
# Program to classify the movie review dataset using a support vector machine, following Pang and Lee (2002).
# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# Pang and Lee did not provide a full list of negation words, so the list below is an approximation.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]
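# Illustrative sketch of the negation-tagging step described above. The real implementation
# is assumed to live in BagOfWords; this helper is defined only to make the technique concrete
# and is not called anywhere. Every token between a negation word and the next punctuation mark
# gets a "NOT_" prefix, so ["didn't", "like", "it", "."] becomes ["didn't", "NOT_like", "NOT_it", "."].
def _example_tag_negation(tokens):
    tagged = []
    negating = False
    for token in tokens:
        if token in NEGATION_WORDS:
            negating = True
            tagged.append(token)
        elif token in PUNCTUATION:
            negating = False
            tagged.append(token)
        elif negating:
            tagged.append("NOT_" + token)
        else:
            tagged.append(token)
    return tagged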
# These are now command line parameters! See below...
USE_DELTA_TFIDF = False # Martineau and Finin's Delta TF-IDF. Excludes some other parameters (e.g. frequency)
USE_PRESENCE = False # If true, use presence rather than frequency.
USE_POS_TAGS = False
USE_ADJ_ONLY = False
USE_NEGATION = True
USE_POSITION = False
GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finin used 10)
MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON = .001 # SVC stopping tolerance; lower values mean more iterations (default is 0.001)
NORMALIZE_BAGS = True
USE_LIBLINEAR = True # LinearSVC (liblinear) trains much faster than SVC (libsvm) on large, sparse problems
CORPUS = "movies" # "twitter", "amazon", "movies"
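# Presence vs. frequency, illustrated on the tokens ["good", "good", "bad"]: a frequency
# bag would be {"good": 2, "bad": 1}, while a presence bag is {"good": 1, "bad": 1}.
# Pang and Lee reported that presence worked slightly better than frequency for this task.
# (The actual bag construction is handled in BagOfWords.)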
def make_folds(documents, ids, num_partitions):
folds = [[] for i in range(num_partitions)]
fold_ids = [[] for i in range(num_partitions)]
for i in range(len(documents)):
folds[i % num_partitions].append(documents[i])
fold_ids[i % num_partitions].append(ids[i])
return (folds, fold_ids)
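# make_folds example: make_folds(["d0", "d1", "d2", "d3", "d4"], [0, 1, 2, 3, 4], 2)
# assigns documents round-robin and returns
# ([["d0", "d2", "d4"], ["d1", "d3"]], [[0, 2, 4], [1, 3]]).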
def make_bag(text, total_word_counts, **bag_params):
return BagOfWords.make(text, ref_bag=total_word_counts, **bag_params)
def from_command_line():
i = 0
# Set parameters to default values
    gram_length = GRAM_LENGTH
    num_folds = NUM_FOLDS
    use_presence = USE_PRESENCE
    use_pos_tags = USE_POS_TAGS
    use_adj_only = USE_ADJ_ONLY
    use_negation = USE_NEGATION
    use_position = USE_POSITION
    min_occurrences = MIN_OCCURRENCES
    corpus = CORPUS
    use_delta = USE_DELTA_TFIDF
try:
args = sys.argv[1:]
while i < len(args):
if args[i] == "--gram-length":
gram_length = int(args[i+1])
i += 2
elif args[i] == "--num-folds":
num_folds = int(args[i+1])
i += 2
elif args[i] == "--presence":
use_presence = True
i += 1
elif args[i] == "--frequency":
use_presence = False
i += 1
elif args[i] == "--use-pos-tags":
use_pos_tags = True
i += 1
elif args[i] == "--use-adj-only":
use_adj_only = True
i += 1
elif args[i] == "--use-negation":
use_negation = True
i += 1
elif args[i] == "--no-negation":
use_negation = False
i += 1
elif args[i] == "--use-position":
use_position = True
i += 1
elif args[i] == "--threshold":
min_occurrences = int(args[i+1])
i += 2
elif args[i] == "--corpus":
corpus = args[i+1]
i += 2
elif args[i] == "--use-delta":
use_delta = True
i += 1
elif args[i] == "--help":
print "Usage:"
print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)"
print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
print "--use-negation\t\tTag words appearing after a negation word (Default: Off)"
print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
print "--use-position\t\tTag words according to their position in the text (Default: Off)"
print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
print "--corpus\t\tSelect a corpus to evaluate. (amazon, movies, twitter) (Default: movies)"
print "--use-delta\t\tUse Delta TFIDF. (Default: Off)"
exit()
else:
print "Error: Invalid argument", args[i]
i += 1
        classify_reviews(gram_length=gram_length, num_folds=num_folds, use_presence=use_presence,
                         use_negation=use_negation, use_pos_tags=use_pos_tags, use_adj_only=use_adj_only,
                         use_position=use_position, min_occurrences=min_occurrences, corpus=corpus,
                         use_delta=use_delta)
    except (IndexError, ValueError):
        print "Invalid arguments (use --help for usage)"
def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=USE_PRESENCE, use_negation=USE_NEGATION, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
use_position = USE_POSITION, min_occurrences=MIN_OCCURRENCES, corpus=CORPUS, use_delta=USE_DELTA_TFIDF, skew=(1,1)):
positive_ids = []
negative_ids = []
if corpus == "amazon":
# Load the mixed Amazon review dataset.
(ids, reviews, labels) = XMLParser.get_all_reviews()
for i in range(len(ids)):
if labels[i] == 1:
positive_ids.append(ids[i])
elif labels[i] == -1:
negative_ids.append(ids[i])
elif corpus == "movies":
# Load the Pang and Lee sentiment dataset.
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
labels = []
for id in ids:
label = movie_reviews.categories(id)[0]
if label == 'pos':
labels.append(1)
positive_ids.append(id)
elif label == 'neg':
labels.append(-1)
negative_ids.append(id)
elif corpus == "twitter":
(ids, reviews, labels) = TwitterCorpus.load()
for i in range(len(ids)):
if labels[i] == 1:
positive_ids.append(ids[i])
elif labels[i] == -1:
negative_ids.append(ids[i])
positive_reviews = []
negative_reviews = []
for i in range(len(reviews)):
if labels[i] == 1:
positive_reviews.append(reviews[i])
elif labels[i] == -1:
negative_reviews.append(reviews[i])
    num_pos = int(len(positive_reviews) * skew[0])
    num_neg = int(len(negative_reviews) * skew[1])
    # Sample indices (rather than the reviews directly) so that reviews and ids stay aligned.
    pos_sample = sorted(random.sample(range(len(positive_reviews)), num_pos))
    neg_sample = sorted(random.sample(range(len(negative_reviews)), num_neg))
    positive_reviews, positive_ids = [positive_reviews[j] for j in pos_sample], [positive_ids[j] for j in pos_sample]
    negative_reviews, negative_ids = [negative_reviews[j] for j in neg_sample], [negative_ids[j] for j in neg_sample]
# Partition reviews into folds.
(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, num_folds)
(neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, num_folds)
# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF)
total_word_counts = {}
# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(num_folds)]
neg_fold_bags = [[] for i in range(num_folds)]
pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(num_folds)]
neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(num_folds)]
bag_params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_negation':use_negation, 'use_pos_tags':use_pos_tags,
'use_adj_only':use_adj_only, 'use_position':use_position}
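    # Delta TF-IDF (Martineau and Finin, 2009) weights each term by how unevenly it is
    # distributed across the positive and negative training documents. A common form of the
    # weight for term t in document d is
    #     V(t, d) = C(t, d) * log2(|P| / P_t) - C(t, d) * log2(|N| / N_t)
    # where C(t, d) is the count of t in d, |P| and |N| are the numbers of positive and
    # negative training documents, and P_t / N_t are the numbers of those documents that
    # contain t. The exact computation is delegated to BagOfWords.make_delta_tfidf and
    # compute_idfs from TFIDF, which may differ in details from this sketch.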
for i in range(num_folds):
for review in pos_folds[i]:
if use_delta:
pos_idfs = pos_fold_idfs[i]
neg_idfs = neg_fold_idfs[i]
pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts))
else:
pos_fold_bags[i].append(make_bag(review, total_word_counts, **bag_params))
for review in neg_folds[i]:
if use_delta:
pos_idfs = pos_fold_idfs[i]
neg_idfs = neg_fold_idfs[i]
neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts))
else:
neg_fold_bags[i].append(make_bag(review, total_word_counts, **bag_params))
# Remove words with less than the minimum occurrences threshold.
if min_occurrences > 0:
for k in total_word_counts.keys():
if total_word_counts[k] < min_occurrences:
for fold in (neg_fold_bags + pos_fold_bags):
for bag in fold:
if bag.has_key(k):
bag.pop(k)
total_word_counts.pop(k)
avg_acc = 0
wordlist = total_word_counts.keys()
for i in range(num_folds):
pos_train_reviews = []
neg_train_reviews = []
pos_train_bags = []
neg_train_bags = []
pos_test_reviews = pos_folds[i]
neg_test_reviews = neg_folds[i]
pos_test_ids = pos_fold_ids[i]
neg_test_ids = neg_fold_ids[i]
for j in range(num_folds):
if j != i:
pos_train_reviews += pos_folds[j]
neg_train_reviews += neg_folds[j]
pos_train_bags += pos_fold_bags[j]
neg_train_bags += neg_fold_bags[j]
train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
train_bags = pos_train_bags + neg_train_bags
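        # LinearSVC uses the liblinear backend, which typically scales much better than the
        # libsvm-based SVC(kernel="linear") on large, sparse bag-of-words feature matrices.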
if USE_LIBLINEAR:
classifier = LinearSVC()
else:
classifier = SVC(kernel="linear",tol=EPSILON)
train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags]
classifier.fit(train_vecs, train_labels)
test_bags = pos_fold_bags[i] + neg_fold_bags[i]
test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
test_reviews = pos_test_reviews + neg_test_reviews
test_ids = pos_test_ids + neg_test_ids
test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews)
predicted_labels = classifier.predict(test_vecs)
acc = classifier.score(test_vecs, test_labels)
avg_acc += acc
avg_acc /= num_folds
return avg_acc
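# Illustrative sketch of the dense feature vectors fed to the SVM above. BagOfWords.to_vector
# is assumed to behave roughly like this (one entry per word in the shared wordlist, zero when
# the word is absent from the bag); the real implementation may differ. Not called anywhere.
def _example_to_vector(bag, wordlist):
    return [bag.get(word, 0) for word in wordlist]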
def run_configs():
min_occurrences = 4
use_negation = True
labels = []
accs = []
#for corpus in ["movies", "amazon", "twitter"]:
for corpus in ["amazon", "twitter"]:
for use_position in [False, True]:
for (use_pos_tags, use_adj_only) in [(False, False), (True, False), (True, True)]:
for gram_length in [1,2]:
for use_presence in [False, True]:
params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only,
'use_position':use_position, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False}
acc = classify_reviews(**params)
label = "gram_length: %d, use_presence: %s, corpus: %s, use_pos_tags: %s, use_adj_only: %s, use_position: %s" % (gram_length, use_presence, corpus, use_pos_tags, use_adj_only, use_position)
print label, acc
labels.append(label)
accs.append(acc)
# Delta-TFIDF construction doesn't support all parameters (yet).
params = {'corpus':corpus, 'use_delta':True}
acc = classify_reviews(**params)
label = "delta_tfidf: True, corpus: %s" % corpus
print label, acc
labels.append(label)
accs.append(acc)
return (labels, accs)
def run_skewed():
min_occurrences = 4
use_negation = True
use_delta = False
use_pos_tags = False
use_adj_only = False
use_position = False
use_presence = True
labels = []
accs = []
for corpus in ["movies", "amazon"]:
for skew in [(0.2,1), (0.4,1), (0.6,1), (0.8, 1), (1,0.8), (1,0.6), (1,0.4), (1,0.2)]:
for gram_length in [1,2]:
params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only,
'use_position':use_position, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False, 'skew': skew}
acc = classify_reviews(**params)
label = "corpus: %s, gram_length: %d, skew: (%f, %f)" % (corpus, gram_length, skew[0], skew[1])
print label, acc
labels.append(label)
accs.append(acc)
params = {'gram_length':1, 'use_presence':False, 'use_pos_tags':False, 'use_adj_only':False,
                      'use_position':False, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':True, 'skew': skew}
acc = classify_reviews(**params)
label = "corpus: %s, delta_tfidf: True, skew: (%f, %f)" % (corpus, skew[0], skew[1])
print label, acc
labels.append(label)
accs.append(acc)
if __name__ == "__main__":
    #(labels, accs) = run_configs()
    (labels, accs) = run_skewed()
    with open('SVM_RESULTS_SKEW.txt', 'w') as f:
        for (label, acc) in zip(labels, accs):
            f.write("%s\t%s\n" % (label, acc))