# review_svm.py

from __future__ import division
import os
import random
import string
import time
import sys

import nltk
from nltk.corpus import movie_reviews
import numpy
#import svmutil
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from TFIDF import delta_tfidf, compute_idfs

import BagOfWords
import XMLParser

# Program to classify the movie review dataset using a support vector machine
# (scikit-learn's SVC or LinearSVC), following Pang and Lee (2002).

# Negation handling, quoted from Pang and Lee (2002):
# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# The paper does not provide a full list of negation words.
# NOTE(review): these two lists are only defined here, not read by the code visible in this script;
# negation tagging is requested through BagOfWords.make(use_negation=...) -- confirm whether
# BagOfWords keeps its own lists.
# TODO make this a parameter
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION    = [".", "!", "?", ",", ";"]

# Default settings.  Most of them can be overridden on the command line; the
# option that sets each one is given in parentheses (see the parsing below).
USE_DELTATFIDF  = False                       # Martineau and Finn.  Bags come from BagOfWords.make_delta_tfidf instead of make_bag, so make_bag's options (e.g. frequency) are not applied (--use-delta)
USE_PRESENCE    = False                       # If true, use presence rather than frequency (--presence / --frequency)
USE_POS_TAGS    = False                       # Passed to BagOfWords.make as use_pos_tags (--use-pos-tags)
USE_ADJ_ONLY    = False                       # Passed to BagOfWords.make as use_adj_only (--use-adj-only)
USE_NEGATION    = True                        # Passed to BagOfWords.make as use_negation; on by default (--use-negation / --no-negation)
USE_POSITION    = False                       # Passed to BagOfWords.make as use_position (--use-position)
GRAM_LENGTH     = 1                           # Unigrams, bigrams, ... (--gram-length) TODO use a range
NUM_FOLDS       = 3                           # For cross-validation (Pang & Lee used 3, Martineau & Finn used 10) (--num-folds)

MIN_OCCURRENCES = 0                           # To be kept, a word's total count across all documents must reach this value; 0 disables the filter (Pang and Lee used 4) (--threshold)
EPSILON         = .001                        # Passed as tol to SVC; determines how long the algorithm runs.  Only used when USE_LIBLINEAR is False (--epsilon)

NORMALIZE_BAGS  = True                        # Passed to BagOfWords.make as normalize; no command-line option
USE_LIBLINEAR   = True                        # Use LinearSVC rather than SVC(kernel="linear"); supposedly faster for large instances.  No command-line option

USE_AMAZON      = False                       # Use the Amazon review set (XMLParser.get_all_reviews), not Pang and Lee (--use-amazon)

def make_folds(documents, ids, num_partitions):
  """Deal documents and their ids round-robin into num_partitions folds.

  The document at position p (and ids[p]) goes to fold p % num_partitions, so
  fold sizes differ by at most one and the original order is kept inside each
  fold.

  Returns a tuple (folds, fold_ids) of two parallel lists of lists.
  """
  folds = []
  fold_ids = []
  for _ in range(num_partitions):
    folds.append([])
    fold_ids.append([])

  for position, document in enumerate(documents):
    slot = position % num_partitions
    folds[slot].append(document)
    fold_ids[slot].append(ids[position])

  return (folds, fold_ids)

def make_bag(text, total_word_counts):
  """Build the bag of words (or n-grams) for one review.

  Forwards text to BagOfWords.make together with the module-level feature
  settings.  total_word_counts is handed over as ref_bag; the script later
  reads it as corpus-wide word counts, so BagOfWords.make presumably updates
  it in place (confirm in BagOfWords).

  Returns whatever BagOfWords.make returns.
  """
  options = dict(
    ref_bag=total_word_counts,
    gram_length=GRAM_LENGTH,
    use_presence=USE_PRESENCE,
    use_pos_tags=USE_POS_TAGS,
    use_adj_only=USE_ADJ_ONLY,
    normalize=NORMALIZE_BAGS,
    use_negation=USE_NEGATION,
    use_position=USE_POSITION,
  )
  return BagOfWords.make(text, **options)

# Set parameters from command-line arguments.

# Switches that take no value: option -> (module-level setting, value to store).
_SWITCH_OPTIONS = {
  "--presence":     ("USE_PRESENCE", True),
  "--frequency":    ("USE_PRESENCE", False),
  "--use-pos-tags": ("USE_POS_TAGS", True),
  "--use-adj-only": ("USE_ADJ_ONLY", True),
  "--use-negation": ("USE_NEGATION", True),
  "--no-negation":  ("USE_NEGATION", False),
  "--use-position": ("USE_POSITION", True),
  "--use-amazon":   ("USE_AMAZON", True),
  "--use-delta":    ("USE_DELTATFIDF", True),
}

# Options followed by a value: option -> (module-level setting, converter).
_VALUE_OPTIONS = {
  "--gram-length": ("GRAM_LENGTH", int),
  "--num-folds":   ("NUM_FOLDS", int),
  "--threshold":   ("MIN_OCCURRENCES", int),
  "--epsilon":     ("EPSILON", float),
}

# Text printed by --help.  The defaults quoted here have to agree with the
# module-level default settings (negation is on, the threshold is 0).
_USAGE_LINES = [
  "Usage:",
  "--gram-length N\t\tUse groups of N consecutive words (Default: 1)",
  "--num-folds N\t\tUse N folds for cross-validation (Default: 3)",
  "--presence\t\tUse word presence rather than word frequency (Default: Off)",
  "--frequency\t\tUse word frequency rather than word presence (Default: On)",
  "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)",
  "--use-negation\t\tTag words appearing after a negation word (Default: On)",
  "--no-negation\t\tDo not tag words appearing after a negation word (Default: Off)",
  "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)",
  "--use-position\t\tTag words according to their position in the text (Default: Off)",
  "--threshold N\t\tOnly include words that appear at least N times across all documents  (Default: 0)",
  "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)",
  "\t\t\t(0 < epsilon < 1; lower = more iterations)",
  "--use-amazon\t\tUse the Amazon data set rather than the movie review set.  (Default: Off)",
  "--use-delta\t\tUse Delta TFIDF.  (Default: Off)",
]

def parse_arguments(args):
  """Translate command-line arguments into setting overrides.

  args is the argument list without the program name.  Returns a dict that
  maps the name of every module-level setting given on the command line to its
  new value; settings that were not mentioned are absent.

  Unknown arguments are reported and skipped.  A value option whose value is
  missing or cannot be converted is reported and stops parsing; overrides
  collected before it are still returned.  --help prints the usage text and
  exits the program.
  """
  overrides = {}
  position = 0
  while position < len(args):
    option = args[position]
    if option == "--help":
      print("\n".join(_USAGE_LINES))
      sys.exit()
    elif option in _SWITCH_OPTIONS:
      (setting, value) = _SWITCH_OPTIONS[option]
      overrides[setting] = value
      position += 1
    elif option in _VALUE_OPTIONS:
      (setting, convert) = _VALUE_OPTIONS[option]
      try:
        overrides[setting] = convert(args[position + 1])
      except (IndexError, ValueError):
        # IndexError: the option was the last argument.
        # ValueError: the value could not be converted to a number.
        print("Invalid arguments: %s requires a value of type %s" % (option, convert.__name__))
        break
      position += 2
    else:
      print("Error: Invalid argument %s" % option)
      position += 1
  return overrides

args = sys.argv[1:]
# Replace the module-level defaults with whatever was given on the command line.
globals().update(parse_arguments(args))
# Main experiment: load the corpus, build one bag per review, then run
# NUM_FOLDS-fold cross-validation with a linear SVM and print the results.
t0 = time.time()

# Load the corpus as ids, reviews and labels (1 = positive, -1 = negative).
if USE_AMAZON:
  # Load the mixed Amazon review dataset.
  (ids, reviews, labels) = XMLParser.get_all_reviews()
else:
  # Load the Pang and Lee sentiment dataset.
  ids = movie_reviews.fileids()
  reviews = [list(movie_reviews.words(fileids = [file_id])) for file_id in ids]
  # Exactly one label per file id keeps labels aligned with reviews even if a
  # file has a category other than 'pos'/'neg' (it gets 0 and is skipped below).
  category_labels = {'pos': 1, 'neg': -1}
  labels = [category_labels.get(movie_reviews.categories(file_id)[0], 0) for file_id in ids]

# Split ids and reviews by label.
# NOTE(review): assumes ids, reviews and labels are parallel sequences of equal
# length (true for the movie review branch; confirm for XMLParser).
positive_ids = []
negative_ids = []
positive_reviews = []
negative_reviews = []
for (doc_id, review, label) in zip(ids, reviews, labels):
  if label == 1:
    positive_ids.append(doc_id)
    positive_reviews.append(review)
  elif label == -1:
    negative_ids.append(doc_id)
    negative_reviews.append(review)

#TEST
#positive_reviews = positive_reviews[:200]
#negative_reviews = negative_reviews[:600]
#positive_reviews = random.sample(positive_reviews, 1000)
#negative_reviews = random.sample(negative_reviews, 1000)

# Partition reviews into folds.
(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
(neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, NUM_FOLDS)

# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF).  The dict is handed to the bag
# builders as a shared reference bag; presumably they update it in place
# (confirm in BagOfWords).
total_word_counts = {}

# The per-fold IDFs are only needed for Delta TFIDF, so skip the work otherwise.
# NOTE(review): each review uses the IDFs of its own fold, including when that
# fold later serves as the test fold.
if USE_DELTATFIDF:
  pos_fold_idfs = [compute_idfs(fold) for fold in pos_folds]
  neg_fold_idfs = [compute_idfs(fold) for fold in neg_folds]

# Construct a bag of words (or n-grams) from each file.  Within a fold the
# positive reviews are processed before the negative ones.
pos_fold_bags = [[] for _ in range(NUM_FOLDS)]
neg_fold_bags = [[] for _ in range(NUM_FOLDS)]

for fold_index in range(NUM_FOLDS):
  fold_pairs = ((pos_folds[fold_index], pos_fold_bags[fold_index]),
                (neg_folds[fold_index], neg_fold_bags[fold_index]))
  for (fold_reviews, fold_bags) in fold_pairs:
    for review in fold_reviews:
      if USE_DELTATFIDF:
        bag = BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews,
                                          pos_fold_idfs[fold_index], neg_fold_idfs[fold_index],
                                          total_word_counts)
      else:
        bag = make_bag(review, total_word_counts)
      fold_bags.append(bag)

# Remove words with less than the minimum occurrences threshold.
if MIN_OCCURRENCES > 0:
  # Collect the rare words first so no dict is modified while it is iterated.
  rare_words = set(word for word in total_word_counts
                   if total_word_counts[word] < MIN_OCCURRENCES)
  if rare_words:
    for fold in (neg_fold_bags + pos_fold_bags):
      for bag in fold:
        # intersection() returns a new set, so deleting from bag here is safe,
        # and each bag is scanned once instead of once per rare word.
        for word in rare_words.intersection(bag):
          del bag[word]
    for word in rare_words:
      del total_word_counts[word]

t1 = time.time()
print("Constructed bags, time: %s" % (t1-t0))
avg_acc = 0

# Fixed vocabulary order shared by every feature vector.
wordlist = list(total_word_counts)

for test_fold in range(NUM_FOLDS):
  # Train on every fold except test_fold.
  pos_train_bags = []
  neg_train_bags = []
  for fold_index in range(NUM_FOLDS):
    if fold_index != test_fold:
      pos_train_bags += pos_fold_bags[fold_index]
      neg_train_bags += neg_fold_bags[fold_index]

  train_bags = pos_train_bags + neg_train_bags
  train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)

  if USE_LIBLINEAR:
    # NOTE(review): EPSILON is not passed here, so --epsilon has no effect while
    # USE_LIBLINEAR is True.  LinearSVC also accepts tol; left unchanged because
    # passing it would alter the default results.
    classifier = LinearSVC()
  else:
    classifier = SVC(kernel="linear",tol=EPSILON)

  train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags]
  classifier.fit(train_vecs, train_labels)

  # Evaluate on the held-out fold: positives first, then negatives.
  test_bags = pos_fold_bags[test_fold] + neg_fold_bags[test_fold]
  test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
  test_ids = pos_fold_ids[test_fold] + neg_fold_ids[test_fold]
  test_labels = [1] * len(pos_folds[test_fold]) + [-1] * len(neg_folds[test_fold])

  predicted_labels = classifier.predict(test_vecs)
  acc = classifier.score(test_vecs, test_labels)
  # One line per test review: id, true label, predicted label.
  for (test_id, true_label, predicted_label) in zip(test_ids, test_labels, predicted_labels):
    print("%s\t%d\t%d" % (test_id, true_label, predicted_label))

  avg_acc += acc

t2 = time.time()
avg_acc /= NUM_FOLDS
print("Total accuracy: %s" % avg_acc)
print("Classification time: %s" % (t2-t1))
print("Total time: %s" % (t2-t0))
	from __future__ import division
	import os
	import random
	import string
	import time
	import sys

	import nltk
	from nltk.corpus import movie_reviews
	import numpy
	#import svmutil
	from sklearn.svm import SVC
	from sklearn.svm import LinearSVC
	from TFIDF import delta_tfidf, compute_idfs

	import BagOfWords
	import XMLParser

	# Program to classify the movie review dataset using a support vector machine
	# (scikit-learn's SVC or LinearSVC), following Pang and Lee (2002).

	# Negation handling, quoted from Pang and Lee (2002):
	# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
	# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
	# The paper does not provide a full list of negation words.
	# NOTE(review): these two lists are only defined here, not read by the code visible in this script;
	# negation tagging is requested through BagOfWords.make(use_negation=...) -- confirm whether
	# BagOfWords keeps its own lists.
	# TODO make this a parameter
	NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
	PUNCTUATION = [".", "!", "?", ",", ";"]

	# Default settings.  Most of them can be overridden on the command line; the
	# option that sets each one is given in parentheses (see the parsing below).
	USE_DELTATFIDF = False # Martineau and Finn. Bags come from BagOfWords.make_delta_tfidf instead of make_bag, so make_bag's options (e.g. frequency) are not applied (--use-delta)
	USE_PRESENCE = False # If true, use presence rather than frequency (--presence / --frequency)
	USE_POS_TAGS = False # Passed to BagOfWords.make as use_pos_tags (--use-pos-tags)
	USE_ADJ_ONLY = False # Passed to BagOfWords.make as use_adj_only (--use-adj-only)
	USE_NEGATION = True # Passed to BagOfWords.make as use_negation; on by default (--use-negation / --no-negation)
	USE_POSITION = False # Passed to BagOfWords.make as use_position (--use-position)
	GRAM_LENGTH = 1 # Unigrams, bigrams, ... (--gram-length) TODO use a range
	NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finn used 10) (--num-folds)

	MIN_OCCURRENCES = 0 # To be kept, a word's total count across all documents must reach this value; 0 disables the filter (Pang and Lee used 4) (--threshold)
	EPSILON = .001 # Passed as tol to SVC; determines how long the algorithm runs. Only used when USE_LIBLINEAR is False (--epsilon)

	NORMALIZE_BAGS = True # Passed to BagOfWords.make as normalize; no command-line option
	USE_LIBLINEAR = True # Use LinearSVC rather than SVC(kernel="linear"); supposedly faster for large instances. No command-line option

	USE_AMAZON = False # Use the Amazon review set (XMLParser.get_all_reviews), not Pang and Lee (--use-amazon)

	def make_folds(documents, ids, num_partitions):
	  """Deal documents and their ids round-robin into num_partitions folds.

	  The document at position p (and ids[p]) goes to fold p % num_partitions, so
	  fold sizes differ by at most one and the original order is kept inside each
	  fold.

	  Returns a tuple (folds, fold_ids) of two parallel lists of lists.
	  """
	  folds = [[] for _ in range(num_partitions)]
	  fold_ids = [[] for _ in range(num_partitions)]
	  for position, document in enumerate(documents):
	    slot = position % num_partitions
	    folds[slot].append(document)
	    fold_ids[slot].append(ids[position])
	  return (folds, fold_ids)

	def make_bag(text, total_word_counts):
	  """Build the bag of words (or n-grams) for one review.

	  Forwards text to BagOfWords.make together with the module-level feature
	  settings.  total_word_counts is handed over as ref_bag; the script later
	  reads it as corpus-wide word counts, so BagOfWords.make presumably updates
	  it in place (confirm in BagOfWords).

	  Returns whatever BagOfWords.make returns.
	  """
	  return BagOfWords.make(text, ref_bag=total_word_counts,
	                         gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE,
	                         use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
	                         normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION,
	                         use_position=USE_POSITION)

	# Set parameters from command-line arguments.

	# Switches that take no value: option -> (module-level setting, value to store).
	_SWITCH_OPTIONS = {
	  "--presence":     ("USE_PRESENCE", True),
	  "--frequency":    ("USE_PRESENCE", False),
	  "--use-pos-tags": ("USE_POS_TAGS", True),
	  "--use-adj-only": ("USE_ADJ_ONLY", True),
	  "--use-negation": ("USE_NEGATION", True),
	  "--no-negation":  ("USE_NEGATION", False),
	  "--use-position": ("USE_POSITION", True),
	  "--use-amazon":   ("USE_AMAZON", True),
	  "--use-delta":    ("USE_DELTATFIDF", True),
	}

	# Options followed by a value: option -> (module-level setting, converter).
	_VALUE_OPTIONS = {
	  "--gram-length": ("GRAM_LENGTH", int),
	  "--num-folds":   ("NUM_FOLDS", int),
	  "--threshold":   ("MIN_OCCURRENCES", int),
	  "--epsilon":     ("EPSILON", float),
	}

	# Text printed by --help.  The defaults quoted here have to agree with the
	# module-level default settings (negation is on, the threshold is 0).
	_USAGE_LINES = [
	  "Usage:",
	  "--gram-length N\t\tUse groups of N consecutive words (Default: 1)",
	  "--num-folds N\t\tUse N folds for cross-validation (Default: 3)",
	  "--presence\t\tUse word presence rather than word frequency (Default: Off)",
	  "--frequency\t\tUse word frequency rather than word presence (Default: On)",
	  "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)",
	  "--use-negation\t\tTag words appearing after a negation word (Default: On)",
	  "--no-negation\t\tDo not tag words appearing after a negation word (Default: Off)",
	  "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)",
	  "--use-position\t\tTag words according to their position in the text (Default: Off)",
	  "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 0)",
	  "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)",
	  "\t\t\t(0 < epsilon < 1; lower = more iterations)",
	  "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)",
	  "--use-delta\t\tUse Delta TFIDF. (Default: Off)",
	]

	def parse_arguments(args):
	  """Translate command-line arguments into setting overrides.

	  args is the argument list without the program name.  Returns a dict that
	  maps the name of every module-level setting given on the command line to its
	  new value; settings that were not mentioned are absent.

	  Unknown arguments are reported and skipped.  A value option whose value is
	  missing or cannot be converted is reported and stops parsing; overrides
	  collected before it are still returned.  --help prints the usage text and
	  exits the program.
	  """
	  overrides = {}
	  position = 0
	  while position < len(args):
	    option = args[position]
	    if option == "--help":
	      print("\n".join(_USAGE_LINES))
	      sys.exit()
	    elif option in _SWITCH_OPTIONS:
	      (setting, value) = _SWITCH_OPTIONS[option]
	      overrides[setting] = value
	      position += 1
	    elif option in _VALUE_OPTIONS:
	      (setting, convert) = _VALUE_OPTIONS[option]
	      try:
	        overrides[setting] = convert(args[position + 1])
	      except (IndexError, ValueError):
	        # IndexError: the option was the last argument.
	        # ValueError: the value could not be converted to a number.
	        print("Invalid arguments: %s requires a value of type %s" % (option, convert.__name__))
	        break
	      position += 2
	    else:
	      print("Error: Invalid argument %s" % option)
	      position += 1
	  return overrides

	args = sys.argv[1:]
	# Replace the module-level defaults with whatever was given on the command line.
	globals().update(parse_arguments(args))

	# Main experiment: load the corpus, build one bag per review, then run
	# NUM_FOLDS-fold cross-validation with a linear SVM and print the results.
	t0 = time.time()

	# Load the corpus as ids, reviews and labels (1 = positive, -1 = negative).
	if USE_AMAZON:
	  # Load the mixed Amazon review dataset.
	  (ids, reviews, labels) = XMLParser.get_all_reviews()
	else:
	  # Load the Pang and Lee sentiment dataset.
	  ids = movie_reviews.fileids()
	  reviews = [list(movie_reviews.words(fileids = [file_id])) for file_id in ids]
	  # Exactly one label per file id keeps labels aligned with reviews even if a
	  # file has a category other than 'pos'/'neg' (it gets 0 and is skipped below).
	  category_labels = {'pos': 1, 'neg': -1}
	  labels = [category_labels.get(movie_reviews.categories(file_id)[0], 0) for file_id in ids]

	# Split ids and reviews by label.
	# NOTE(review): assumes ids, reviews and labels are parallel sequences of equal
	# length (true for the movie review branch; confirm for XMLParser).
	positive_ids = []
	negative_ids = []
	positive_reviews = []
	negative_reviews = []
	for (doc_id, review, label) in zip(ids, reviews, labels):
	  if label == 1:
	    positive_ids.append(doc_id)
	    positive_reviews.append(review)
	  elif label == -1:
	    negative_ids.append(doc_id)
	    negative_reviews.append(review)

	#TEST
	#positive_reviews = positive_reviews[:200]
	#negative_reviews = negative_reviews[:600]
	#positive_reviews = random.sample(positive_reviews, 1000)
	#negative_reviews = random.sample(negative_reviews, 1000)

	# Partition reviews into folds.
	(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
	(neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, NUM_FOLDS)

	# Count occurrences of every word across all documents
	# (this is important for e.g. Delta TFIDF).  The dict is handed to the bag
	# builders as a shared reference bag; presumably they update it in place
	# (confirm in BagOfWords).
	total_word_counts = {}

	# The per-fold IDFs are only needed for Delta TFIDF, so skip the work otherwise.
	# NOTE(review): each review uses the IDFs of its own fold, including when that
	# fold later serves as the test fold.
	if USE_DELTATFIDF:
	  pos_fold_idfs = [compute_idfs(fold) for fold in pos_folds]
	  neg_fold_idfs = [compute_idfs(fold) for fold in neg_folds]

	# Construct a bag of words (or n-grams) from each file.  Within a fold the
	# positive reviews are processed before the negative ones.
	pos_fold_bags = [[] for _ in range(NUM_FOLDS)]
	neg_fold_bags = [[] for _ in range(NUM_FOLDS)]

	for fold_index in range(NUM_FOLDS):
	  fold_pairs = ((pos_folds[fold_index], pos_fold_bags[fold_index]),
	                (neg_folds[fold_index], neg_fold_bags[fold_index]))
	  for (fold_reviews, fold_bags) in fold_pairs:
	    for review in fold_reviews:
	      if USE_DELTATFIDF:
	        bag = BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews,
	                                          pos_fold_idfs[fold_index], neg_fold_idfs[fold_index],
	                                          total_word_counts)
	      else:
	        bag = make_bag(review, total_word_counts)
	      fold_bags.append(bag)

	# Remove words with less than the minimum occurrences threshold.
	if MIN_OCCURRENCES > 0:
	  # Collect the rare words first so no dict is modified while it is iterated.
	  rare_words = set(word for word in total_word_counts
	                   if total_word_counts[word] < MIN_OCCURRENCES)
	  if rare_words:
	    for fold in (neg_fold_bags + pos_fold_bags):
	      for bag in fold:
	        # intersection() returns a new set, so deleting from bag here is safe,
	        # and each bag is scanned once instead of once per rare word.
	        for word in rare_words.intersection(bag):
	          del bag[word]
	    for word in rare_words:
	      del total_word_counts[word]

	t1 = time.time()
	print("Constructed bags, time: %s" % (t1-t0))
	avg_acc = 0

	# Fixed vocabulary order shared by every feature vector.
	wordlist = list(total_word_counts)

	for test_fold in range(NUM_FOLDS):
	  # Train on every fold except test_fold.
	  pos_train_bags = []
	  neg_train_bags = []
	  for fold_index in range(NUM_FOLDS):
	    if fold_index != test_fold:
	      pos_train_bags += pos_fold_bags[fold_index]
	      neg_train_bags += neg_fold_bags[fold_index]

	  train_bags = pos_train_bags + neg_train_bags
	  train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)

	  if USE_LIBLINEAR:
	    # NOTE(review): EPSILON is not passed here, so --epsilon has no effect while
	    # USE_LIBLINEAR is True.  LinearSVC also accepts tol; left unchanged because
	    # passing it would alter the default results.
	    classifier = LinearSVC()
	  else:
	    classifier = SVC(kernel="linear",tol=EPSILON)

	  train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags]
	  classifier.fit(train_vecs, train_labels)

	  # Evaluate on the held-out fold: positives first, then negatives.
	  test_bags = pos_fold_bags[test_fold] + neg_fold_bags[test_fold]
	  test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
	  test_ids = pos_fold_ids[test_fold] + neg_fold_ids[test_fold]
	  test_labels = [1] * len(pos_folds[test_fold]) + [-1] * len(neg_folds[test_fold])

	  predicted_labels = classifier.predict(test_vecs)
	  acc = classifier.score(test_vecs, test_labels)
	  # One line per test review: id, true label, predicted label.
	  for (test_id, true_label, predicted_label) in zip(test_ids, test_labels, predicted_labels):
	    print("%s\t%d\t%d" % (test_id, true_label, predicted_label))

	  avg_acc += acc

	t2 = time.time()
	avg_acc /= NUM_FOLDS
	print("Total accuracy: %s" % avg_acc)
	print("Classification time: %s" % (t2-t1))
	print("Total time: %s" % (t2-t0))