diff --git a/GlossLexicon.py b/GlossLexicon.py
index 0fd0c4e..9d0f7d7 100644
--- a/GlossLexicon.py
+++ b/GlossLexicon.py
@@ -93,14 +93,6 @@ def do_stem(text):
     global stemmer
     return [stemmer.stem(word) for word in text]
 
-def create_lexicon(words, labels):
-    lexicon = {}
-    for i in range(len(words)):
-        word = words[i]
-        label = labels[i]
-        lexicon[word] = label
-    return lexicon
-
 def create(test_words, test_labels):
     # Set up initial Sets S_p and S_n
     neutral = []
@@ -184,7 +176,7 @@ def create(test_words, test_labels):
 
     for word in neg_words:
         #lexicon[word] = -1
-        lexicon[word] = -1.5
+        lexicon[word] = -1
 
     return lexicon
 
diff --git a/LexiconEval.py b/LexiconEval.py
index 0c84cad..52e0db8 100644
--- a/LexiconEval.py
+++ b/LexiconEval.py
@@ -14,6 +14,7 @@ LEX_ALG = "gloss"
 LEX_SOURCE = "mpqa"
 
 # new and improved finite state machine
+# kinda-sorta based on Taboada 2011.
 # states are as follows:
 #   0 - base
 #   1 - negator found
@@ -35,9 +36,13 @@ def calculate_score(text, lexicon):
     num_neg = 0
     num_halfneg = 0
     for word in text:
+        if lexicon.has_key(word):
+            word_score = lexicon[word]
+            # EXPERIMENTAL
+            if word_score < 0: word_score *= 1.5
         if state == 0:
             if lexicon.has_key(word):
-                score += lexicon[word]
+                score += word_score
                 num_single += 1
             elif word in negators:
                 state = 1
@@ -45,7 +50,7 @@ def calculate_score(text, lexicon):
                 state = 2
         elif state == 1:
             if lexicon.has_key(word):
-                score += -1 * lexicon[word]
+                score += -1 * word_score
                 num_neg += 1
                 state = 0
             elif word in intensifiers:
@@ -54,7 +59,7 @@ def calculate_score(text, lexicon):
                 state = 0
         elif state == 2:
             if lexicon.has_key(word):
-                score += 2 * lexicon[word]
+                score += 2 * word_score
                 num_double += 1
                 state = 0
             else:
@@ -63,7 +68,7 @@ def calculate_score(text, lexicon):
             pass #TODO
         elif state == 4:
             if lexicon.has_key(word):
-                score += -0.5 * lexicon[word]
+                score += -0.5 * word_score
                 num_halfneg += 1
                 state = 0
             else:
@@ -77,6 +82,15 @@ def do_stem(text):
 
 def get_label(id):
     return movie_reviews.categories(fileids=[id])[0]
+
+# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
+def create_lexicon(words, labels):
+    lexicon = {}
+    for i in range(len(words)):
+        word = words[i]
+        label = labels[i]
+        lexicon[word] = label
+    return lexicon
 
 i = 0
 try:
@@ -87,6 +101,8 @@ try:
             LEX_ALG = "gloss"
         elif args[i+1] == "conjunction":
             LEX_ALG = "conjunction"
+        elif args[i+1] == "none":
+            LEX_ALG = "none"
         else:
             print "Invalid algorithm"
         i += 2
@@ -137,9 +153,10 @@ elif LEX_ALG == "conjunction":
 elif LEX_ALG == "none":
     lexicon = create_lexicon(test_words, test_labels)
 
-correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
-lex_acc = correct/len(lexicon.items())
-print "Lexicon accuracy:", lex_acc
+if LEX_ALG != "none":
+    correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
+    lex_acc = correct/len(lexicon.items())
+    print "Lexicon accuracy:", lex_acc
 
 # Iterate through all of the reviews and compute scores by taking the sum of their
 # component lexicon words. Includes rudimentary negation testing.
@@ -159,7 +176,6 @@ for id in ids:
     for word in words:
         if lexicon.has_key(word):
             score += lexicon[word]
-            x += 1
     scores.append(score)
 
     #print id, score
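The calculate_score change routes every lexicon hit through word_score, so the experimental 1.5x boost on negative words applies uniformly in every state. Below is a minimal runnable sketch of the same state machine, reduced to the base/negator/intensifier states; the word lists are stand-ins (LexiconEval.py defines its own negators and intensifiers), and the half-negation states 3 and 4 are omitted for brevity:

NEGATORS = set(["not", "never"])        # stand-in list; the script defines its own
INTENSIFIERS = set(["very", "really"])  # stand-in list

def score_text(words, lexicon):
    score = 0.0
    state = 0  # 0 = base, 1 = negator found, 2 = intensifier found
    for word in words:
        word_score = lexicon.get(word)
        if word_score is not None and word_score < 0:
            word_score *= 1.5  # the patch's EXPERIMENTAL negative boost
        if state == 0:
            if word_score is not None:
                score += word_score
            elif word in NEGATORS:
                state = 1
            elif word in INTENSIFIERS:
                state = 2
        elif state == 1:
            if word_score is not None:
                score += -1 * word_score  # negator flips the next lexicon word
            state = 0
        elif state == 2:
            if word_score is not None:
                score += 2 * word_score   # intensifier doubles the next lexicon word
            state = 0
    return score

print(score_text("this is not good".split(), {"good": 1.0, "bad": -1.0}))  # -1.0

Because the boost is applied before the state logic, a negated negative such as "not bad" scores +1.5 rather than +1.0, which is worth remembering when comparing scores across runs.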
diff --git a/TFIDF.py b/TFIDF.py
index f863212..202180c 100644
--- a/TFIDF.py
+++ b/TFIDF.py
@@ -20,7 +20,7 @@ def compute_idfs(documents):
 
 def tfidf(term, document, documents, idfs={}):
     if idfs == {}:
-        all_doc_appearances = sum([doc for doc in documents if term in doc])
+        all_doc_appearances = len([doc for doc in documents if term in doc])
         idf = math.log(len(documents)/all_doc_appearances, 10)
     else:
         if idfs.has_key(term):
diff --git a/XMLParser.py b/XMLParser.py
index 77eb31e..2bad81c 100644
--- a/XMLParser.py
+++ b/XMLParser.py
@@ -14,11 +14,11 @@ class ReviewHandler(xml.sax.ContentHandler):
 
     def endElement(self, tag):
         if tag == 'unique_id':
-            self.ids.append(self.data.strip())
+            self.ids.append(clean_data(self.data.strip()))
         elif tag == 'title':
-            self.title = self.data.strip()
+            self.title = clean_data(self.data.strip())
         elif tag == 'review_text':
-            self.reviews.append(nltk.word_tokenize(self.title + ' ' + self.data.strip()))
+            self.reviews.append(nltk.word_tokenize(self.title + ' ' + clean_data(self.data.strip())))
 
     def characters(self, data):
         self.data += data
@@ -28,11 +28,9 @@ def get_reviews(filename):
     f = open(filename, 'rU')
     data = f.read()
+    cleaned_data = clean_data(data)
 
     # hack because review format doesn't have an enclosing tag
-    # hack because there's a malformed ampersand...
-    data = data.replace('&', '&amp;')
-    data = data.replace('\x1a', '')
-    xmldata = '<reviews>' + data + '</reviews>'
+    xmldata = '<reviews>' + cleaned_data + '</reviews>'
     f.close()
 
     handler = ReviewHandler()
@@ -40,6 +38,17 @@
     # Concatenate the review and title.
     return (handler.ids, handler.reviews)
 
+def clean_data(data):
+    # hack because there's a malformed ampersand...
+    cleaned_data = data.replace('&', '&amp;')
+    cleaned_data = cleaned_data.replace('\x1a', '')
+    # hack because there's a u'\ufffd'... wtf is that?
+    cleaned_data2 = ''
+    for char in cleaned_data:
+        if ord(char) < 255:
+            cleaned_data2 += char
+    return cleaned_data2
+
 def get_all_reviews():
     filenames = ['sorted_data_acl/books/positive.review', 'sorted_data_acl/books/negative.review',
                  'sorted_data_acl/dvd/positive.review', 'sorted_data_acl/dvd/negative.review',
diff --git a/cblexicon.py b/cblexicon.py
index 6054d4a..ac41cca 100644
--- a/cblexicon.py
+++ b/cblexicon.py
@@ -228,14 +228,13 @@ class cblexicon:
 
         # Generate dissimilarity matrix
         dis = vectorize(conjSet,defSet)
 
-        """
         # Its Cluster time
         set1 = defSet[len(defSet)//2:]
         set2 = defSet[:len(defSet)//2]
         """
         set1 = random.sample(defSet, len(defSet)//4)
         set2 = [x for x in defSet if x not in set1]
-
+        """
         # Optimize objective function
         sets = optimize2(set1,set2,conjSet,defSet,dis)
         set1 = sets[0]
@@ -243,15 +242,14 @@ class cblexicon:
 
        print(set1)
        print(set2)
 
-        """
-        f = open('set1.txt', 'w+')
-        f2 = open('set1.txt', 'w+')
+        f1 = open('set1.txt', 'w+')
+        f2 = open('set2.txt', 'w+')
        for word in set1:
-            f.write(word + "/n")
+            f1.write(word + "\n")
        for word in set2:
-            f2.write(word + "/n")
-        """
-
+            f2.write(word + "\n")
+        f1.close()
+        f2.close()
 
        # Can we classify and then run bag of words?
        #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
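The one-character TFIDF.py fix matters more than it looks: sum() over the matching documents tries to add the document lists themselves (a TypeError), while the IDF denominator should simply count how many documents contain the term. A self-contained sketch of the corrected computation; the float() cast is an added safeguard against Python 2 integer division, not something the original code does:

import math

def idf(term, documents):
    # Document frequency: the number of documents containing the term.
    df = len([doc for doc in documents if term in doc])
    return math.log(len(documents) / float(df), 10)

docs = [["good", "movie"], ["bad", "movie"], ["good", "plot"]]
print(idf("plot", docs))   # log10(3/1) ~= 0.477
print(idf("movie", docs))  # log10(3/2) ~= 0.176

As in the original, a term that appears in no documents still divides by zero, so callers are expected to query terms drawn from the corpus.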
diff --git a/review_svm.py b/review_svm.py
index 5fa1573..649f4bc 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -27,14 +27,14 @@ NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
 PUNCTUATION = [".", "!", "?", ",", ";"]
 
 # These are now command line parameters! See below...
-USE_DELTATFIDF = True      # Martineau and Finn. Excludes some other parameters (e.g. frequency)
+USE_DELTATFIDF = False     # Martineau and Finn. Excludes some other parameters (e.g. frequency)
 USE_PRESENCE = False       # If true, use presence rather than frequency.
 USE_POS_TAGS = False
 USE_ADJ_ONLY = False
 USE_NEGATION = True
 USE_POSITION = False
 GRAM_LENGTH = 1            # Unigrams, bigrams, ... TODO use a range
-NUM_FOLDS = 10             # For cross-validation (Pang & Lee used 3)
+NUM_FOLDS = 3              # For cross-validation (Pang & Lee used 3, Martineau & Finn used 10)
 
 MIN_OCCURRENCES = 0#4      # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
 EPSILON = .001             # determines how long the algorithm runs (default is 0.001)
@@ -97,6 +97,12 @@ try:
         elif args[i] == "--epsilon":
             EPSILON = float(args[i+1])
             i += 2
+        elif args[i] == "--use-amazon":
+            USE_AMAZON = True
+            i += 1
+        elif args[i] == "--use-delta":
+            USE_DELTATFIDF = True
+            i += 1
         elif args[i] == "--help":
             print "Usage:"
             print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
@@ -110,6 +116,8 @@ try:
             print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
             print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
             print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
+            print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)"
+            print "--use-delta\t\tUse Delta TFIDF. (Default: Off)"
             exit()
         else:
             print "Error: Invalid argument", args[i]
@@ -119,15 +127,21 @@ except Exception:
 
 t0 = time.time()
 
+positive_ids = []
+negative_ids = []
+
 if USE_AMAZON:
     # Load the mixed Amazon review dataset.
     (ids, reviews, labels) = XMLParser.get_all_reviews()
+    for i in range(len(ids)):
+        if labels[i] == 1:
+            positive_ids.append(ids[i])
+        elif labels[i] == -1:
+            negative_ids.append(ids[i])
 else:
     # Load the Pang and Lee sentiment dataset.
     ids = movie_reviews.fileids()
     reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
-    positive_ids = []
-    negative_ids = []
     labels = []
     for id in ids:
         label = movie_reviews.categories(id)[0]
@@ -148,8 +162,10 @@ for i in range(len(reviews)):
         negative_reviews.append(reviews[i])
 
 #TEST
-#positive_reviews = random.sample(positive_reviews, 25)
-#negative_reviews = random.sample(negative_reviews, 25)
+#positive_reviews = positive_reviews[:200]
+#negative_reviews = negative_reviews[:600]
+#positive_reviews = random.sample(positive_reviews, 1000)
+#negative_reviews = random.sample(negative_reviews, 1000)
 
 # Partition reviews into folds.
 (pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
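make_folds itself is not part of this diff, so its definition is assumed here; a hypothetical round-robin version consistent with the call site above, keeping each review aligned with its id, could look like this:

# Hypothetical helper, not the repository's actual implementation.
def make_folds(reviews, ids, num_folds):
    folds = [[] for n in range(num_folds)]
    fold_ids = [[] for n in range(num_folds)]
    for i in range(len(reviews)):
        folds[i % num_folds].append(reviews[i])
        fold_ids[i % num_folds].append(ids[i])
    return (folds, fold_ids)

Any deterministic partition works, as long as reviews and ids stay parallel: the evaluation loop later prints test_ids[i] next to test_labels[i] and predicted_labels[i].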
@@ -202,7 +218,7 @@ avg_acc = 0
 
 wordlist = total_word_counts.keys()
 
-f = open("results.txt", "w")
+#f = open("results.txt", "w")
 for i in range(NUM_FOLDS):
     pos_train_reviews = []
     neg_train_reviews = []
@@ -241,11 +257,12 @@ for i in range(NUM_FOLDS):
     predicted_labels = classifier.predict(test_vecs)
     acc = classifier.score(test_vecs, test_labels)
     for i in range(len(test_reviews)):
-        print "%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])
+        #f.write("%s\t%d\t%d\n" % (test_ids[i], test_labels[i], predicted_labels[i]))
+        print("%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i]))
     avg_acc += acc
 
-f.close()
+#f.close()
 
 t2 = time.time()
 avg_acc /= NUM_FOLDS
diff --git a/svm.bat b/svm.bat
index 66bc4a6..b9dc1a5 100644
--- a/svm.bat
+++ b/svm.bat
@@ -11,4 +11,7 @@ echo (5) Unigrams + POS tags
 python review_svm.py --gram-length 1 --presence --use-pos-tags
 
 echo (6) Adjectives
-python review_svm.py --gram-length 1 --presence --use-pos-tags --use-adj-only
\ No newline at end of file
+python review_svm.py --gram-length 1 --presence --use-pos-tags --use-adj-only
+
+echo (8) Unigrams + Position
+python review_svm.py --gram-length 1 --presence --use-position
\ No newline at end of file
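For reference, here is the per-fold train/score pattern the loop above implements, condensed to a runnable toy. A scikit-learn LinearSVC stands in for whatever classifier the script actually constructs (the predict/score calls in the diff match that API), and the vectors below are fake stand-ins for the word-count features:

from sklearn.svm import LinearSVC

# Three tiny folds of (feature vectors, labels); real features come from word counts.
folds = [([[0.0, 1.0], [1.0, 0.0]], [1, -1]),
         ([[0.1, 0.9], [0.9, 0.1]], [1, -1]),
         ([[0.2, 0.8], [0.8, 0.2]], [1, -1])]

avg_acc = 0.0
for i in range(len(folds)):
    # Train on every fold except fold i, then score the held-out fold i.
    train_vecs, train_labels = [], []
    for j in range(len(folds)):
        if j != i:
            train_vecs += folds[j][0]
            train_labels += folds[j][1]
    test_vecs, test_labels = folds[i]
    classifier = LinearSVC()
    classifier.fit(train_vecs, train_labels)
    avg_acc += classifier.score(test_vecs, test_labels)

print(avg_acc / len(folds))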