From 6c944e023d77cad0b92011d5265a593ca20aa631 Mon Sep 17 00:00:00 2001
From: Jack
Date: Mon, 11 Apr 2016 18:08:36 -0400
Subject: [PATCH] Delta TFIDF optimization. 98% accuracy, I'm suspicious.

---
 BagOfWords.py |  4 +--
 TFIDF.py      | 41 ++++++++++++++++++++++---------
 review_svm.py | 68 ++++++++++++++++++++++++++++-----------------------
 3 files changed, 68 insertions(+), 45 deletions(-)

diff --git a/BagOfWords.py b/BagOfWords.py
index dbaea22..68c04da 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -84,11 +84,11 @@ def make_tfidf(document, documents):
 
 # As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value.
 # Todo: Bigrams?
-def make_delta_tfidf(document, positive_set, negative_set, ref_bag):
+def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag):
     bag = {}
     factor = 0
     for term in set(document):
-        weight = delta_tfidf(term, document, positive_set, negative_set)
+        weight = delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs)
         if (weight != 0):
             bag[term] = weight
             factor += weight**2
diff --git a/TFIDF.py b/TFIDF.py
index 35039fc..f863212 100644
--- a/TFIDF.py
+++ b/TFIDF.py
@@ -1,18 +1,37 @@
+from __future__ import division
 import math
 import nltk
 
 # document is assumed to be tokenized (a list of words)
 # documents is a list of tokenized docs
-def tfidf(term, document, documents):
-    all_doc_appearances = 0 # number of documents in which term appears
+
+def compute_idfs(documents):
+    idfs = {}
+    N = len(documents)
     for doc in documents:
-        if term in doc:
-            all_doc_appearances += 1
+        for term in doc:
+            if idfs.has_key(term):
+                idfs[term] += 1
+            else:
+                idfs[term] = 1
+    for term in idfs.keys():
+        idfs[term] = math.log(N/idfs[term])
+    return idfs
+
+def tfidf(term, document, documents, idfs={}):
+    if idfs == {}:
+        all_doc_appearances = sum([doc for doc in documents if term in doc])
+        idf = math.log(len(documents)/all_doc_appearances, 10)
+    else:
+        if idfs.has_key(term):
+            idf = idfs[term]
+        else:
+            return 0 # is this supposed to happen???
     doc_appearances = 0 # number of appearances of term in this document
     for word in document:
         if term == word:
             doc_appearances += 1
-    num_docs = len(documents) # number of documents in the collection
+    """
     if doc_appearances == 0:
         #This happens sometimes, probably due to inconsistent splitting/tokenizing.
         #print "Error: no occurrences of", term
@@ -21,12 +40,10 @@ def tfidf(term, document, documents):
         #print "Error: fuck,", term
         return 0
     else:
-        tfidf = (1 + math.log(doc_appearances,10)) * math.log((float(num_docs)/all_doc_appearances), 10)
-        return tfidf
+    """
+    tfidf = (1 + math.log(doc_appearances,10)) * idf
+    return tfidf
 
 # Martineau and Finn 2009
-def delta_tfidf(term, document, positive_set, negative_set):
-    return tfidf(term, document, positive_set) - tfidf(term, document, negative_set)
-
-def delta_tfidf_fast(term, document, positive_set, negative_set):
-    return tfidf(term, document, positive_set) - tfidf(term, document, negative_set)
+def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
+    return tfidf(term, document, positive_set, pos_idfs) - tfidf(term, document, negative_set, neg_idfs)
diff --git a/review_svm.py b/review_svm.py
index 26535ac..5fa1573 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -11,7 +11,7 @@ import numpy
 #import svmutil
 from sklearn.svm import SVC
 from sklearn.svm import LinearSVC
-from TFIDF import delta_tfidf
+from TFIDF import delta_tfidf, compute_idfs
 
 import BagOfWords
 import XMLParser
@@ -34,7 +34,7 @@ USE_ADJ_ONLY = False
 USE_NEGATION = True
 USE_POSITION = False
 GRAM_LENGTH = 1         # Unigrams, bigrams, ... TODO use a range
-NUM_FOLDS = 3           # For cross-validation (Pang & Lee used 3)
+NUM_FOLDS = 10          # For cross-validation (Pang & Lee used 3)
 
 MIN_OCCURRENCES = 0#4   # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
 EPSILON = .001          # determines how long the algorithm runs (default is 0.001)
@@ -44,11 +44,13 @@ USE_LIBLINEAR = True         # This is supposedly faster for la
 
 USE_AMAZON = False    # Use the Amazon review set, not Pang and Lee.
 
-def make_folds(documents, num_partitions):
+def make_folds(documents, ids, num_partitions):
     folds = [[] for i in range(num_partitions)]
+    fold_ids = [[] for i in range(num_partitions)]
     for i in range(len(documents)):
         folds[i % num_partitions].append(documents[i])
-    return folds
+        fold_ids[i % num_partitions].append(ids[i])
+    return (folds, fold_ids)
 
 def make_bag(text, total_word_counts):
     return BagOfWords.make(text, ref_bag=total_word_counts,
@@ -123,30 +125,35 @@ if USE_AMAZON:
 else:
     # Load the Pang and Lee sentiment dataset.
     ids = movie_reviews.fileids()
-    reviews = [list(movie_reviews.words(id)) for id in ids]
+    reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
+    positive_ids = []
+    negative_ids = []
     labels = []
     for id in ids:
         label = movie_reviews.categories(id)[0]
         if label == 'pos':
             labels.append(1)
+            positive_ids.append(id)
         elif label == 'neg':
             labels.append(-1)
+            negative_ids.append(id)
 
 positive_reviews = []
 negative_reviews = []
+
 for i in range(len(reviews)):
     if labels[i] == 1:
         positive_reviews.append(reviews[i])
     elif labels[i] == -1:
         negative_reviews.append(reviews[i])
-
+
 #TEST
-positive_reviews = random.sample(positive_reviews, 250)
-negative_reviews = random.sample(negative_reviews, 250)
+#positive_reviews = random.sample(positive_reviews, 25)
+#negative_reviews = random.sample(negative_reviews, 25)
 
 # Partition reviews into folds.
-pos_folds = make_folds(positive_reviews, NUM_FOLDS)
-neg_folds = make_folds(negative_reviews, NUM_FOLDS)
+(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
+(neg_folds, neg_fold_ids) = make_folds(negative_reviews, negative_ids, NUM_FOLDS)
 
 # Count occurrences of every word across all documents
 # (this is important for e.g. Delta TFIDF)
@@ -156,24 +163,25 @@ total_word_counts = {}
 pos_fold_bags = [[] for i in range(NUM_FOLDS)]
 neg_fold_bags = [[] for i in range(NUM_FOLDS)]
 
+pos_fold_idfs = [compute_idfs(pos_folds[i]) for i in range(NUM_FOLDS)]
+neg_fold_idfs = [compute_idfs(neg_folds[i]) for i in range(NUM_FOLDS)]
+
 for i in range(NUM_FOLDS):
     for review in pos_folds[i]:
-        t3 = time.time()
         if USE_DELTATFIDF:
-            pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts))
+            pos_idfs = pos_fold_idfs[i]
+            neg_idfs = neg_fold_idfs[i]
+            pos_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts))
         else:
-            pos_fold_bags[i].append(make_bag(review, total_word_counts))
-        t4 = time.time()
-        print "Bag time:", (t4-t3)
+            pos_fold_bags[i].append(make_bag(review, total_word_counts))
 
     for review in neg_folds[i]:
-        t3 = time.time()
         if USE_DELTATFIDF:
-            neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, total_word_counts))
+            pos_idfs = pos_fold_idfs[i]
+            neg_idfs = neg_fold_idfs[i]
+            neg_fold_bags[i].append(BagOfWords.make_delta_tfidf(review, positive_reviews, negative_reviews, pos_idfs, neg_idfs, total_word_counts))
         else:
             neg_fold_bags[i].append(make_bag(review, total_word_counts))
-        t4 = time.time()
-        print "Bag time:", (t4-t3)
 
 # Remove words with less than the minimum occurrences threshold.
 if MIN_OCCURRENCES > 0:
@@ -193,6 +201,8 @@ print "Constructed bags, time:", (t1-t0)
 avg_acc = 0
 
 wordlist = total_word_counts.keys()
+
+f = open("results.txt", "w")
 for i in range(NUM_FOLDS):
     pos_train_reviews = []
     neg_train_reviews = []
@@ -201,6 +211,8 @@ for i in range(NUM_FOLDS):
 
     pos_test_reviews = pos_folds[i]
     neg_test_reviews = neg_folds[i]
+    pos_test_ids = pos_fold_ids[i]
+    neg_test_ids = neg_fold_ids[i]
     for j in range(NUM_FOLDS):
         if j != i:
             pos_train_reviews += pos_folds[j]
@@ -211,7 +223,6 @@ for i in range(NUM_FOLDS):
     train_labels = [1] * len(pos_train_bags) + [-1] * len(neg_train_bags)
     train_bags = pos_train_bags + neg_train_bags
 
-    #m = svmutil.svm_train(train_labels, train_bags, "-t 0 -e %f -m %d -q" % (EPSILON, CACHE_SIZE))
     if USE_LIBLINEAR:
         classifier = LinearSVC()
     else:
@@ -224,23 +235,18 @@ for i in range(NUM_FOLDS):
 
     test_vecs = [BagOfWords.to_vector(bag, wordlist) for bag in test_bags]
 
     test_reviews = pos_test_reviews + neg_test_reviews
+    test_ids = pos_test_ids + neg_test_ids
     test_labels = [1] * len(pos_test_reviews) + [-1] * len(neg_test_reviews)
 
-    #(predicted_labels, acc, p_vals) = svmutil.svm_predict(test_labels, test_bags, m)
     predicted_labels = classifier.predict(test_vecs)
     acc = classifier.score(test_vecs, test_labels)
-
+    for i in range(len(test_reviews)):
+        print "%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])
+
     avg_acc += acc
-
-    """
-    indices = random.sample(range(len(test_filenames)), 10)
-    filenames_labels = {}
-    for j in indices:
-        filename = test_filenames[j]
-        predicted_label = predicted_labels[j]
-        filenames_labels[filename] = predicted_labels[j]
-    """
+f.close()
+
 t2 = time.time()
 avg_acc /= NUM_FOLDS
 print "Total accuracy:", avg_acc
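
For reference: the weighting this patch wires in is Delta TF-IDF (Martineau and Finin, 2009), where a term's feature value is its frequency in the review times the difference between its IDF over the positive training documents and its IDF over the negative ones, so words spread evenly across both classes score near zero and class-biased words move away from zero. Below is a minimal illustrative sketch of that idea, not a drop-in for the patch's compute_idfs/tfidf (those count token occurrences rather than document frequency, mix natural-log IDF with a base-10 term frequency, and apply a sublinear 1 + log10 TF); the helper names are made up, document-frequency IDF and base-10 logs are assumed, and the log base only rescales the weights.

# Illustrative sketch, not part of the patch above.
from __future__ import division
import math

def class_idfs(documents):
    # documents: list of tokenized docs from one class.
    # Returns {term: log10(N / df(term))} using document frequency.
    N = len(documents)
    df = {}
    for doc in documents:
        for term in set(doc):              # each document counted once per term
            df[term] = df.get(term, 0) + 1
    return dict((term, math.log(N / count, 10)) for term, count in df.items())

def delta_tfidf_weight(term, document, pos_idfs, neg_idfs):
    # tf * (IDF in positive set - IDF in negative set).
    # A term missing from a class contributes 0 from that side,
    # mirroring the patch's "return 0" branch.
    tf = document.count(term)
    if tf == 0:
        return 0.0
    return tf * (pos_idfs.get(term, 0.0) - neg_idfs.get(term, 0.0))

# Tiny usage example: "great" appears in a larger share of the positive toy
# docs than of the negative ones, so it gets a nonzero (here negative) weight.
pos_idfs = class_idfs([["great", "plot"], ["great", "acting"], ["fine", "acting"]])
neg_idfs = class_idfs([["dull", "plot"], ["great", "mess"]])
print(delta_tfidf_weight("great", ["great", "movie"], pos_idfs, neg_idfs))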