# review_svm.py

from __future__ import division
import os
import random
import string
import sys

import nltk
from nltk.corpus import movie_reviews
import numpy
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

import BagOfWords
import XMLParser
import TwitterCorpus
from TFIDF import delta_tfidf, compute_idfs

# Program to classify sentiment corpora (movie reviews, Amazon reviews, tweets) using a support vector machine,
# following Pang and Lee (2002).

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list, so the lists below are our own choice.
# NOTE(review): neither list is referenced in the code shown in this file -- presumably BagOfWords keeps its own; confirm.
NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION    = [".", "!", "?", ",", ";"]

# Default values for the experiment options. from_command_line and classify_reviews read them,
# and most can be overridden on the command line (see from_command_line below).
USE_DELTATFIDF  = False                       # Delta TFIDF (Martineau and Finin).  Excludes some other parameters (e.g. frequency).  NOTE(review): not referenced in the code shown here; USE_DELTA_TFIDF below is the one classify_reviews uses.
USE_PRESENCE    = False                       # If true, use presence rather than frequency.
USE_POS_TAGS    = False                       # If true, use part-of-speech tags.
USE_ADJ_ONLY    = False                       # If true, use adjectives only (per the --help text this needs POS tags and unigrams).
USE_NEGATION    = True                        # If true, tag words appearing after a negation word.
USE_POSITION    = False                       # If true, tag words according to their position in the text.
GRAM_LENGTH     = 1                           # Unigrams, bigrams, ... TODO use a range
NUM_FOLDS       = 3                           # For cross-validation (Pang & Lee used 3, Martineau & Finin used 10)

MIN_OCCURRENCES = 4                           # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON         = .001                        # Tolerance passed to SVC as tol; determines how long the algorithm runs (default is 0.001). Only used when USE_LIBLINEAR is False.

NORMALIZE_BAGS  = True                        # NOTE(review): not referenced in the code shown here -- confirm whether it is still used.
USE_LIBLINEAR   = True                        # Use LinearSVC instead of SVC(kernel="linear"). This is supposedly faster for large instances

CORPUS = "movies"                             # "twitter", "amazon", "movies"
USE_DELTA_TFIDF = False                       # Default for classify_reviews' use_delta parameter.

def make_folds(documents, ids, num_partitions):
  """Deal documents and their ids round-robin into num_partitions folds.

  The item at position i goes to fold i % num_partitions, and ids[i] goes
  to the same slot of the parallel id structure.

  Returns a (folds, fold_ids) tuple; both are lists of num_partitions lists.
  """
  folds = []
  fold_ids = []
  for _ in range(num_partitions):
    folds.append([])
    fold_ids.append([])

  for position, document in enumerate(documents):
    slot = position % num_partitions
    folds[slot].append(document)
    fold_ids[slot].append(ids[position])

  return (folds, fold_ids)
  
def make_bag(text, total_word_counts, **bag_params):
  """Build a bag for text via BagOfWords.make.

  total_word_counts is handed to BagOfWords.make as its ref_bag argument and
  every extra keyword in bag_params is forwarded unchanged.
  """
  bag = BagOfWords.make(text, ref_bag=total_word_counts, **bag_params)
  return bag

def from_command_line(argv=None):
  """Parse command-line options and run classify_reviews with them.

  argv: list of option strings to parse; defaults to sys.argv[1:].

  Only the options actually given are passed on (as keyword arguments), so
  anything left out keeps classify_reviews' own default.  Unrecognised
  options are reported and skipped.  --help prints the usage text and exits.

  Returns whatever classify_reviews returns, or None when the options are
  malformed (a value is missing or is not an integer); in that case
  "Invalid arguments" is printed and nothing is run.
  """
  args = sys.argv[1:] if argv is None else list(argv)

  # Options that consume the following argument: option -> (keyword, converter).
  value_options = {
    "--gram-length": ('gram_length', int),
    "--num-folds": ('num_folds', int),
    "--threshold": ('min_occurrences', int),
    "--corpus": ('corpus', str),
  }
  # Options that stand alone: option -> (keyword, value).
  switch_options = {
    "--presence": ('use_presence', True),
    "--frequency": ('use_presence', False),
    "--use-pos-tags": ('use_pos_tags', True),
    "--use-adj-only": ('use_adj_only', True),
    "--use-negation": ('use_negation', True),
    "--no-negation": ('use_negation', False),
    "--use-position": ('use_position', True),
    "--use-delta": ('use_delta', True),
  }

  overrides = {}
  i = 0
  # Only parsing is guarded: errors raised by the classifier itself must not
  # be disguised as "Invalid arguments".
  try:
    while i < len(args):
      arg = args[i]
      if arg in value_options:
        (keyword, convert) = value_options[arg]
        overrides[keyword] = convert(args[i + 1])
        i += 2
      elif arg in switch_options:
        (keyword, value) = switch_options[arg]
        overrides[keyword] = value
        i += 1
      elif arg == "--help":
        print("Usage:")
        print("--gram-length N\t\tUse groups of N consecutive words (Default: 1)")
        print("--num-folds N\t\tUse N folds for cross-validation (Default: 3)")
        print("--presence\t\tUse word presence rather than word frequency (Default: Off)")
        print("--frequency\t\tUse word frequency rather than word presence (Default: On)")
        print("--use-pos-tags\t\tUse part-of-speech tags (Default: Off)")
        print("--use-negation\t\tTag words appearing after a negation word (Default: On)")
        print("--no-negation\t\tDo not tag words appearing after a negation word")
        print("--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)")
        print("--use-position\t\tTag words according to their position in the text (Default: Off)")
        print("--threshold N\t\tOnly include words that appear at least N times across all documents  (Default: 4)")
        print("--corpus\t\tSelect a corpus to evaluate. (amazon, movies, twitter)  (Default: movies)")
        print("--use-delta\t\tUse Delta TFIDF.  (Default: Off)")
        sys.exit()
      else:
        print("Error: Invalid argument %s" % arg)
        i += 1
  except (IndexError, ValueError):
    # IndexError: an option is missing its value; ValueError: not an integer.
    print("Invalid arguments")
    return None

  # Keyword arguments keep every value attached to the right parameter no
  # matter how classify_reviews orders its signature.
  return classify_reviews(**overrides)

def _load_corpus(corpus):
  """Load the named corpus as parallel lists (ids, reviews, labels).

  Labels are 1 for positive and -1 for negative documents.
  Raises ValueError if corpus is not "amazon", "movies" or "twitter".
  """
  if corpus == "amazon":
    # Load the mixed Amazon review dataset.
    (ids, reviews, labels) = XMLParser.get_all_reviews()
  elif corpus == "movies":
    # Load the Pang and Lee sentiment dataset.
    ids = movie_reviews.fileids()
    reviews = [list(movie_reviews.words(fileids=[file_id])) for file_id in ids]
    polarity = {'pos': 1, 'neg': -1}
    # Any other category gets label 0: it keeps the lists parallel and is
    # ignored later, because only labels 1 and -1 are selected.
    labels = [polarity.get(movie_reviews.categories(file_id)[0], 0) for file_id in ids]
  elif corpus == "twitter":
    (ids, reviews, labels) = TwitterCorpus.load()
  else:
    raise ValueError("Unknown corpus: %r (expected 'amazon', 'movies' or 'twitter')" % (corpus,))
  return (ids, reviews, labels)


def _prune_rare_words(total_word_counts, fold_bags, min_occurrences):
  """Drop words counted fewer than min_occurrences times, in place.

  The words are removed from total_word_counts and from every bag in
  fold_bags (a list of folds, each a list of dict-like bags).
  """
  rare_words = set(word for word in total_word_counts
                   if total_word_counts[word] < min_occurrences)
  if not rare_words:
    return
  for fold in fold_bags:
    for bag in fold:
      # Collect first, then pop, so the bag is never resized mid-iteration.
      for word in [w for w in bag.keys() if w in rare_words]:
        bag.pop(word)
  for word in rare_words:
    total_word_counts.pop(word)


def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=USE_PRESENCE, use_negation=USE_NEGATION, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
                     use_position = USE_POSITION,  min_occurrences=MIN_OCCURRENCES, corpus=CORPUS, use_delta=USE_DELTA_TFIDF, skew=(1,1)):
  """Cross-validate a linear SVM sentiment classifier and return its mean accuracy.

  Parameters:
    gram_length, use_presence, use_negation, use_pos_tags, use_adj_only,
    use_position -- forwarded to BagOfWords.make (via make_bag) when building
                    each document's bag; not used when use_delta is true.
    num_folds       -- number of cross-validation folds.
    min_occurrences -- words counted fewer than this many times in
                       total_word_counts are dropped (0 disables pruning).
    corpus          -- "amazon", "movies" or "twitter".
    use_delta       -- build bags with BagOfWords.make_delta_tfidf instead.
    skew            -- (positive_fraction, negative_fraction): the share of
                       each class kept, chosen by random sampling.

  Returns the accuracy averaged over the num_folds test folds.
  Raises ValueError for an unknown corpus name.
  """
  (ids, reviews, labels) = _load_corpus(corpus)

  # Keep every review paired with its id so that sampling cannot separate them.
  positive = [(doc_id, review) for (doc_id, review, label) in zip(ids, reviews, labels) if label == 1]
  negative = [(doc_id, review) for (doc_id, review, label) in zip(ids, reviews, labels) if label == -1]

  positive = random.sample(positive, int(len(positive) * skew[0]))
  negative = random.sample(negative, int(len(negative) * skew[1]))

  positive_ids = [doc_id for (doc_id, _) in positive]
  positive_reviews = [review for (_, review) in positive]
  negative_ids = [doc_id for (doc_id, _) in negative]
  negative_reviews = [review for (_, review) in negative]

  # Partition reviews into folds.  The fold ids are not needed for scoring;
  # they stay aligned with the folds in case they are wanted for debugging.
  (pos_folds, _pos_fold_ids) = make_folds(positive_reviews, positive_ids, num_folds)
  (neg_folds, _neg_fold_ids) = make_folds(negative_reviews, negative_ids, num_folds)

  # Count occurrences of every word across all documents
  # (this is important for e.g. Delta TFIDF).  The dict is handed to the bag
  # builders, which are expected to fill it in.
  total_word_counts = {}

  bag_params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_negation':use_negation, 'use_pos_tags':use_pos_tags,
                'use_adj_only':use_adj_only, 'use_position':use_position}

  # Per-fold IDFs are only read by the Delta TFIDF path, so skip them otherwise.
  # NOTE(review): assumes compute_idfs has no side effects -- confirm.
  if use_delta:
    pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(num_folds)]
    neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(num_folds)]

  def build_bag(review, fold_index):
    if use_delta:
      return BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews,
                                         pos_fold_idfs[fold_index], neg_fold_idfs[fold_index], total_word_counts)
    return make_bag(review, total_word_counts, **bag_params)

  # Construct a bag of words (or n-grams) from each review, fold by fold.
  pos_fold_bags = []
  neg_fold_bags = []
  for i in range(num_folds):
    pos_fold_bags.append([build_bag(review, i) for review in pos_folds[i]])
    neg_fold_bags.append([build_bag(review, i) for review in neg_folds[i]])

  # Remove words with less than the minimum occurrences threshold.
  if min_occurrences > 0:
    _prune_rare_words(total_word_counts, neg_fold_bags + pos_fold_bags, min_occurrences)

  wordlist = list(total_word_counts.keys())

  total_acc = 0
  for i in range(num_folds):
    # Fold i is held out for testing; every other fold is used for training.
    pos_train_bags = []
    neg_train_bags = []
    for j in range(num_folds):
      if j != i:
        pos_train_bags += pos_fold_bags[j]
        neg_train_bags += neg_fold_bags[j]

    train_bags = pos_train_bags + neg_train_bags
    train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)

    if USE_LIBLINEAR:
      classifier = LinearSVC()
    else:
      classifier = SVC(kernel="linear",tol=EPSILON)

    train_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in train_bags]
    classifier.fit(train_vecs, train_labels)

    test_bags = pos_fold_bags[i] + neg_fold_bags[i]
    test_labels = [1] * len(pos_fold_bags[i]) + [-1] * len(neg_fold_bags[i])
    test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]

    total_acc += classifier.score(test_vecs, test_labels)

  return total_acc / num_folds
  
def run_configs():
  """Run classify_reviews over a grid of feature settings for each corpus.

  For every corpus the grid covers use_position, (use_pos_tags, use_adj_only),
  gram_length and use_presence, followed by a single Delta TFIDF run.  Each
  result is printed as it is produced.

  Returns (labels, accs): parallel lists with a description of each
  configuration and the accuracy classify_reviews returned for it.
  """
  min_occurrences = 4
  use_negation = True
  labels = []
  accs = []

  def record(label, acc):
    print("%s %s" % (label, acc))
    labels.append(label)
    accs.append(acc)

  # The "movies" corpus is currently left out of this sweep.
  for corpus in ["amazon", "twitter"]:
    for use_position in [False, True]:
      for (use_pos_tags, use_adj_only) in [(False, False), (True, False), (True, True)]:
        for gram_length in [1,2]:
          for use_presence in [False, True]:
            params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_negation':use_negation,
                      'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only, 'use_position':use_position,
                      'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False}
            acc = classify_reviews(**params)
            label = "gram_length: %d, use_presence: %s, corpus: %s, use_pos_tags: %s, use_adj_only: %s, use_position: %s" % (gram_length, use_presence, corpus, use_pos_tags, use_adj_only, use_position)
            record(label, acc)
    # Delta-TFIDF construction doesn't support all parameters (yet).
    params = {'corpus':corpus, 'use_delta':True}
    acc = classify_reviews(**params)
    record("delta_tfidf: True, corpus: %s" % corpus, acc)
  return (labels, accs)
  
def run_skewed():
  """Run classify_reviews with skewed class proportions on each corpus.

  For every corpus and every skew (fraction of positive reviews kept,
  fraction of negative reviews kept) this runs word-presence features with
  gram lengths 1 and 2, followed by one Delta TFIDF run.  Each result is
  printed as it is produced.

  Returns (labels, accs): parallel lists with a description of each
  configuration and the accuracy classify_reviews returned for it.
  """
  min_occurrences = 4
  use_negation = True
  use_pos_tags = False
  use_adj_only = False
  use_position = False
  use_presence = True
  labels = []
  accs = []

  def record(label, acc):
    print("%s %s" % (label, acc))
    labels.append(label)
    accs.append(acc)

  for corpus in ["movies", "amazon"]:
    for skew in  [(0.2,1), (0.4,1), (0.6,1), (0.8, 1), (1,0.8), (1,0.6), (1,0.4), (1,0.2)]:
      for gram_length in [1,2]:
        params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_negation':use_negation,
                  'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only, 'use_position':use_position,
                  'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False, 'skew': skew}

        acc = classify_reviews(**params)
        label = "corpus: %s, gram_length: %d, skew: (%f, %f)" % (corpus, gram_length, skew[0], skew[1])
        record(label, acc)

      # This run is labelled Delta TFIDF, so it has to switch Delta TFIDF on.
      # The bag-of-words feature options are not used on that path.
      params = {'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':True, 'skew': skew}

      acc = classify_reviews(**params)
      label = "corpus: %s, delta_tfidf: True, skew: (%f, %f)" % (corpus, skew[0], skew[1])
      record(label, acc)

  # The module-level driver unpacks this tuple.
  return (labels, accs)

# Run the skew experiment and save one "label<TAB>accuracy" line per configuration.
# To run the feature sweep instead, call run_configs() here.
(labels, accs) = run_skewed()
# The with-block closes the results file even if a write fails part-way.
with open('SVM_RESULTS_SKEW.txt', 'w') as f:
  for (label, acc) in zip(labels, accs):
    f.write("%s\t%s\n" % (label, acc))