review_svm.py

import os
import random
import string
import time
import sys

import nltk
import svmutil

import BagOfWords

# Program to classify the movie review dataset using a support vector machine
# (via LIBSVM), following Pang and Lee (2002).

POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
# TODO make this a parameter
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION    = [".", "!", "?", ",", ";"]

NORMAL_LENGTH  = 1000

# These are now command line parameters! See below...
USE_PRESENCE    = False                       # If true, use presence rather than frequency.
USE_POS_TAGS    = False
USE_ADJ_ONLY    = False
USE_NEGATION    = True
GRAM_LENGTH     = 1                           # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS       = 3                           # For cross-validation (Pang & Lee used 3)

MIN_OCCURRENCES = 4                           # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON         = .001                        # determines how long the algorithm runs (default is 0.001)

KERNEL_TYPE     = 0                           # 0: linear, 2: radial basis (just use linear)
NORMALIZE_BAGS  = True
USE_LIBLINEAR   = False                       # Not implemented - it murdered my computer and wasn't noticeably faster.  But maybe multicore is worth a look
CACHE_SIZE      = 512

def file_to_text(filename):
  f = open(filename)
  lines = f.readlines()
  f.close()
  text = string.join(lines, " ")
  return text

def generate_filenames(folder_name):
  filenames = []
  for (folder, x, folder_filenames) in os.walk(folder_name):
    for filename in folder_filenames:
      if filename.endswith(".txt"):
        filenames.append(os.path.join(folder, filename))
  return filenames

def partition_filenames(filenames, num_partitions):
  partitions = [[] for i in range(num_partitions)]
  for i in range(len(filenames)):
    partitions[i % num_partitions].append(filenames[i])
  return partitions


# Set parameters from command-line arguments.
i = 0
try:
  args = sys.argv[1:]
  while i < len(args):
    if args[i] == "--gram-length":
      GRAM_LENGTH = int(args[i+1])
      i += 2
    elif args[i] == "--num-folds":
      NUM_FOLDS = int(args[i+1])
      i += 2
    elif args[i] == "--presence":
      USE_PRESENCE = True
      i += 1
    elif args[i] == "--frequency":
      USE_PRESENCE = False
      i += 1
    elif args[i] == "--use-pos-tags":
      USE_POS_TAGS = True
      i += 1
    elif args[i] == "--use-adj-only":
      USE_ADJ_ONLY = True
      i += 1
    elif args[i] == "--use-negation":
      USE_NEGATION = True
      i += 1
    elif args[i] == "--no-negation":
      USE_NEGATION = False
      i += 1
    elif args[i] == "--threshold":
      MIN_OCCURRENCES = int(args[i+1])
      i += 2
    elif args[i] == "--epsilon":
      EPSILON = float(args[i+1])
      i += 2
    elif args[i] == "--help":
      print "Usage:"
      print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
      print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)"
      print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
      print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
      print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
      print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
      print "--threshold N\t\tOnly include words that appear at least N times across all documents  (Default: 4)"
      print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
      print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
      exit()
    else:
      print "Error: Invalid argument", args[i]
      i += 1
except Exception:
  print "Invalid arguments"

t0 = time.time()

pos_filenames = generate_filenames(POS_FOLDER)
neg_filenames = generate_filenames(NEG_FOLDER)

# TEST - to test on a subset of reviews (since some operations [i.e. tagging] are slow)
#pos_filenames = random.sample(pos_filenames, 20)
#neg_filenames = random.sample(neg_filenames, 20)

# Partition reviews into folds.
pos_folds = partition_filenames(pos_filenames, NUM_FOLDS)
neg_folds = partition_filenames(neg_filenames, NUM_FOLDS)

# Count occurrences of every word across all documents
# (this is important for e.g. Delta TFIDF)
total_word_counts = {}

# Construct a bag of words (or n-grams) from each file.
pos_fold_bags = [[] for i in range(NUM_FOLDS)]
neg_fold_bags = [[] for i in range(NUM_FOLDS)]

for i in range(NUM_FOLDS):
  for filename in pos_folds[i]:
    pos_fold_bags[i].append(BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
                            use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))

  for filename in neg_folds[i]:
    neg_fold_bags[i].append(
      BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
                      use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))

# Remove words with less than the minimum occurrences threshold.
for k in total_word_counts.keys():
  if total_word_counts[k] < MIN_OCCURRENCES:
    for fold in (neg_fold_bags + pos_fold_bags):
      for bag in fold:
        if bag.has_key(k):
          bag.pop(k)
    total_word_counts.pop(k)

#num_unique_words = len(total_word_counts.keys())
#print "# unique words:", num_unique_words

t1 = time.time()
print "Constructed bags, time:", (t1-t0)
avg_acc = 0

for i in range(NUM_FOLDS):
  pos_train_filenames = []
  neg_train_filenames = []
  pos_train_bags = []
  neg_train_bags = []

  pos_test_filenames = pos_folds[i]
  neg_test_filenames = neg_folds[i]

  for j in range(NUM_FOLDS):
    if j != i:
      pos_train_filenames += pos_folds[j]
      neg_train_filenames += neg_folds[j]
      pos_train_bags += pos_fold_bags[j]
      neg_train_bags += neg_fold_bags[j]

  train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
  train_bags = pos_train_bags + neg_train_bags

  m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))

  test_bags = pos_fold_bags[i] + neg_fold_bags[i]
  test_filenames = pos_test_filenames + neg_test_filenames
  test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

  (predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)

  avg_acc += acc[0]

  """
  indices = random.sample(range(len(test_filenames)), 10)
  filenames_labels = {}
  for j in indices:
    filename = test_filenames[j]
    predicted_label = predicted_labels[j]
    filenames_labels[filename] = predicted_labels[j]
  """

t2 = time.time()
avg_acc /= NUM_FOLDS
print "Total accuracy:", avg_acc
print "Classification time:", (t2-t1)
print "Total time:", (t2-t0)
	import os
	import random
	import string
	import time
	import sys

	import nltk
	import svmutil

	import BagOfWords

	# Program to classify the movie review dataset using a support vector machine
	# (via LIBSVM), following Pang and Lee (2002).

	POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
	NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")

	# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
	# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
	# They didn't provide a full list.
	# TODO make this a parameter
	NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
	PUNCTUATION = [".", "!", "?", ",", ";"]

	NORMAL_LENGTH = 1000

	# These are now command line parameters! See below...
	USE_PRESENCE = False # If true, use presence rather than frequency.
	USE_POS_TAGS = False
	USE_ADJ_ONLY = False
	USE_NEGATION = True
	GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
	NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3)

	MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
	EPSILON = .001 # determines how long the algorithm runs (default is 0.001)

	KERNEL_TYPE = 0 # 0: linear, 2: radial basis (just use linear)
	NORMALIZE_BAGS = True
	USE_LIBLINEAR = False # Not implemented - it murdered my computer and wasn't noticeably faster. But maybe multicore is worth a look
	CACHE_SIZE = 512

	def file_to_text(filename):
	f = open(filename)
	lines = f.readlines()
	f.close()
	text = string.join(lines, " ")
	return text

	def generate_filenames(folder_name):
	filenames = []
	for (folder, x, folder_filenames) in os.walk(folder_name):
	for filename in folder_filenames:
	if filename.endswith(".txt"):
	filenames.append(os.path.join(folder, filename))
	return filenames

	def partition_filenames(filenames, num_partitions):
	partitions = [[] for i in range(num_partitions)]
	for i in range(len(filenames)):
	partitions[i % num_partitions].append(filenames[i])
	return partitions


	# Set parameters from command-line arguments.
	i = 0
	try:
	args = sys.argv[1:]
	while i < len(args):
	if args[i] == "--gram-length":
	GRAM_LENGTH = int(args[i+1])
	i += 2
	elif args[i] == "--num-folds":
	NUM_FOLDS = int(args[i+1])
	i += 2
	elif args[i] == "--presence":
	USE_PRESENCE = True
	i += 1
	elif args[i] == "--frequency":
	USE_PRESENCE = False
	i += 1
	elif args[i] == "--use-pos-tags":
	USE_POS_TAGS = True
	i += 1
	elif args[i] == "--use-adj-only":
	USE_ADJ_ONLY = True
	i += 1
	elif args[i] == "--use-negation":
	USE_NEGATION = True
	i += 1
	elif args[i] == "--no-negation":
	USE_NEGATION = False
	i += 1
	elif args[i] == "--threshold":
	MIN_OCCURRENCES = int(args[i+1])
	i += 2
	elif args[i] == "--epsilon":
	EPSILON = float(args[i+1])
	i += 2
	elif args[i] == "--help":
	print "Usage:"
	print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
	print "--num-folds N\t\tUse N folds for cross-validation (Default: 3)"
	print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
	print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
	print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
	print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
	print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
	print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
	print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
	exit()
	else:
	print "Error: Invalid argument", args[i]
	i += 1
	except Exception:
	print "Invalid arguments"

	t0 = time.time()

	pos_filenames = generate_filenames(POS_FOLDER)
	neg_filenames = generate_filenames(NEG_FOLDER)

	# TEST - to test on a subset of reviews (since some operations [i.e. tagging] are slow)
	#pos_filenames = random.sample(pos_filenames, 20)
	#neg_filenames = random.sample(neg_filenames, 20)

	# Partition reviews into folds.
	pos_folds = partition_filenames(pos_filenames, NUM_FOLDS)
	neg_folds = partition_filenames(neg_filenames, NUM_FOLDS)

	# Count occurrences of every word across all documents
	# (this is important for e.g. Delta TFIDF)
	total_word_counts = {}

	# Construct a bag of words (or n-grams) from each file.
	pos_fold_bags = [[] for i in range(NUM_FOLDS)]
	neg_fold_bags = [[] for i in range(NUM_FOLDS)]

	for i in range(NUM_FOLDS):
	for filename in pos_folds[i]:
	pos_fold_bags[i].append(BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
	use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))

	for filename in neg_folds[i]:
	neg_fold_bags[i].append(
	BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
	use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))

	# Remove words with less than the minimum occurrences threshold.
	for k in total_word_counts.keys():
	if total_word_counts[k] < MIN_OCCURRENCES:
	for fold in (neg_fold_bags + pos_fold_bags):
	for bag in fold:
	if bag.has_key(k):
	bag.pop(k)
	total_word_counts.pop(k)

	#num_unique_words = len(total_word_counts.keys())
	#print "# unique words:", num_unique_words

	t1 = time.time()
	print "Constructed bags, time:", (t1-t0)
	avg_acc = 0

	for i in range(NUM_FOLDS):
	pos_train_filenames = []
	neg_train_filenames = []
	pos_train_bags = []
	neg_train_bags = []

	pos_test_filenames = pos_folds[i]
	neg_test_filenames = neg_folds[i]

	for j in range(NUM_FOLDS):
	if j != i:
	pos_train_filenames += pos_folds[j]
	neg_train_filenames += neg_folds[j]
	pos_train_bags += pos_fold_bags[j]
	neg_train_bags += neg_fold_bags[j]

	train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
	train_bags = pos_train_bags + neg_train_bags

	m = svmutil.svm_train(train_labels, train_bags, "-t %d -e %f -m %d -q" % (KERNEL_TYPE, EPSILON, CACHE_SIZE))

	test_bags = pos_fold_bags[i] + neg_fold_bags[i]
	test_filenames = pos_test_filenames + neg_test_filenames
	test_labels = [1] * len(pos_test_filenames) + [-1] * len(neg_test_filenames)

	(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)

	avg_acc += acc[0]

	"""
	indices = random.sample(range(len(test_filenames)), 10)
	filenames_labels = {}
	for j in indices:
	filename = test_filenames[j]
	predicted_label = predicted_labels[j]
	filenames_labels[filename] = predicted_labels[j]
	"""

	t2 = time.time()
	avg_acc /= NUM_FOLDS
	print "Total accuracy:", avg_acc
	print "Classification time:", (t2-t1)
	print "Total time:", (t2-t0)