From 7b5edc3acd289ec24eb5a09744bfe34765363afb Mon Sep 17 00:00:00 2001
From: Jack
Date: Tue, 26 Apr 2016 16:12:29 -0400
Subject: [PATCH] Twitter corpus; random changes from last week

---
 BagOfWords.py    |  22 +++---
 GlossLexicon.py  |   9 +--
 LexiconEval.py   |  24 ++++--
 TFIDF.py         |   2 +-
 TwitterCorpus.py |  30 ++++++++
 cblexicon.py     | 185 +++++++++++++++++++++++++----------------------
 getAdjectives.py |  72 +++++++++---------
 graph.py         |  39 +++++++---
 review_svm.py    | 110 ++++++++++++++++++----------
 9 files changed, 300 insertions(+), 193 deletions(-)
 create mode 100644 TwitterCorpus.py

diff --git a/BagOfWords.py b/BagOfWords.py
index 68c04da..b3007aa 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -14,17 +14,17 @@
 ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
 POSITION_THRESHOLDS = [0.25, 0.75, 1]
 
 # ref_bag is used to calculate the total word count across all documents.
-def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=False):
+def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=False):
     bag_of_words = {}
     if use_negation:
         do_negation = False
 
     if use_pos_tags:
-        tagged = nltk.pos_tag(words)
+        #tagged = nltk.pos_tag(words)
+        tagged = tagger.tag(words) # this is much much faster !!!
         words = [string.join(t, "_") for t in tagged]
 
     for i in range(len(words) - gram_length + 1):
         n_gram = string.join(words[i:i+gram_length], "_")
-
         if use_negation:
             if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
                 if n_gram in NEGATION_WORDS:
@@ -39,13 +39,8 @@ def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fa
                 if i/len(words) < POSITION_THRESHOLDS[j]:
                     n_gram += POSITION_TAGS[j]
                     break
-
-        # LIBSVM won't use strings as keys, so hash to convert to a number.
-        if use_hash:
-            index = hash(n_gram)
-        else:
-            index = n_gram
+        index = n_gram
         if not (use_pos_tags and use_adj_only and (tagged[i][1] not in ADJECTIVE_TAGS)):
             if (not use_presence) and bag_of_words.has_key(index):
                 bag_of_words[index] += 1
@@ -58,7 +53,8 @@ def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fa
                 ref_bag[index] += 1
             else:
                 ref_bag[index] = 1
-
+
+    #length-normalize
     if normalize:
         length = 0
         for k in bag_of_words.keys():
@@ -84,7 +80,7 @@ def make_tfidf(document, documents):
 
 # As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value.
 # Todo: Bigrams?
-def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag):
+def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag, use_pos_tags=False):
     bag = {}
     factor = 0
     for term in set(document):
@@ -112,4 +108,6 @@ def to_vector(bag, wordlist):
         else:
             vec.append(0)
     return vec
-    #return numpy.array(vec).reshape(1,-1)
\ No newline at end of file
+    #return numpy.array(vec).reshape(1,-1)
+
+tagger = nltk.tag.perceptron.PerceptronTagger()
\ No newline at end of file

diff --git a/GlossLexicon.py b/GlossLexicon.py
index 351ae1e..69fadb3 100644
--- a/GlossLexicon.py
+++ b/GlossLexicon.py
@@ -12,10 +12,10 @@
 from nltk.corpus import wordnet as wn
 
 import BagOfWords
 
-EXPAND_ITERATIONS = 3
+EXPAND_ITERATIONS = 2
 CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
-REMOVE_STOPWORDS = True
+REMOVE_STOPWORDS = False
-USE_STEMMING = True # sync this up with eval!
+USE_STEMMING = False # sync this up with eval!
 USE_EXAMPLES = True
 USE_EQUAL_TRAINING = True
 
@@ -144,14 +144,12 @@ def create(test_words, test_labels):
     test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]
     predicted_labels = classifier.predict(test_vecs)
 
-    """
     correct = 0
     for i in range(len(test_labels)):
         if test_labels[i] == predicted_labels[i]: correct += 1
     print "Lexicon accuracy:", correct/len(test_labels)
-    """
 
     word_labels = {}
     for i in range(len(test_words)):
@@ -173,7 +171,6 @@ def create(test_words, test_labels):
         lexicon[word] = 1
 
     for word in neg_words:
-        #lexicon[word] = -1
         lexicon[word] = -1
 
     return lexicon

diff --git a/LexiconEval.py b/LexiconEval.py
index afff8c2..3646fcf 100644
--- a/LexiconEval.py
+++ b/LexiconEval.py
@@ -7,13 +7,15 @@
 from nltk.corpus import movie_reviews
 import MPQALexicon
 import AniaLexicon
 import GlossLexicon
+import LexFromFile
 import XMLParser
+import TwitterCorpus
 
-USE_STEMMING = True # sync this up with lexicon!
+USE_STEMMING = False # sync this up with lexicon!
 USE_PARSING = True
 LEX_ALG = "gloss" # "gloss", "conjunction", "none"
 LEX_SOURCE = "mpqa" # "mpqa", "ania"
-CORPUS = "movies" # "amazon", "movies"
+CORPUS = "movies" # "amazon", "movies", "twitter"
 NEG_MOD = 1.5 # Taboada suggested 1.5.
 # new and improved finite state machine
@@ -115,6 +117,8 @@ try:
                 CORPUS = "movies"
             elif args[i+1] == "amazon":
                 CORPUS = "amazon"
+            elif args[i+1] == "twitter":
+                CORPUS = "twitter"
             i += 2
         elif args[i] == "--help":
             print "Usage:"
@@ -127,6 +131,7 @@ try:
             print " - ania: Use the hand-labeled lexicon from the Brown corpus"
             print "--corpus X: Choose the data set to test on"
             print " - amazon: Use the Amazon data set"
+            print " - twitter: Use the Twitter data set"
             print " - movies: Use the Pang&Lee movie data set (default)"
             exit()
         else:
@@ -142,7 +147,7 @@
 print "Corpus =", CORPUS
 # Load the test set. A few options here.
 if LEX_SOURCE == "mpqa":
-    (test_words, test_labels) = MPQALexicon.load(True)
+    (test_words, test_labels) = MPQALexicon.load(False)
 elif LEX_SOURCE == "ania":
     (test_words, test_labels) = AniaLexicon.load()
 else:
@@ -164,10 +169,12 @@ if LEX_ALG != "none":
     correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
     lex_acc = correct/len(lexicon.items())
     print "Lexicon accuracy:", lex_acc
-
+
+# TODO refactor me again.
+#lexicon = LexFromFile.lexfromfile("cblex.txt")
 for key in lexicon.keys():
     if lexicon[key] < 0: lexicon[key] *= NEG_MOD
-
+
 if CORPUS == "movies":
     ids = movie_reviews.fileids()
     reviews = [list(movie_reviews.words(fileids=[id])) for id in ids]
         labels.append(-1)
 elif CORPUS == "amazon":
     (ids, reviews, labels) = XMLParser.get_all_reviews()
+elif CORPUS == "twitter":
+    (ids, reviews, labels) = TwitterCorpus.load() #they're not reviews but we'll let it slide.
 else:
     print "Invalid corpus!"
     exit()
-
+
 """
 # It feels like there should be a more efficient way do to this.
 shuffled = zip(ids,reviews,labels)
 ids = [x[0] for x in shuffled]
 reviews = [x[1] for x in shuffled]
 labels = [x[2] for x in shuffled]
 """
+#for k in lexicon.keys():
+#    lexicon[k] *= -1
 
 # Iterate through all of the reviews and compute scores by taking the sum of their
 # component lexicon words. Includes rudimentary negation testing.
@@ -203,7 +214,6 @@ for i in range(len(reviews)):
     words = reviews[i]
     if USE_STEMMING:
         words = do_stem(words)
-
     if USE_PARSING:
         score = calculate_score(words, lexicon)
     else:

diff --git a/TFIDF.py b/TFIDF.py
index 202180c..77008ad 100644
--- a/TFIDF.py
+++ b/TFIDF.py
@@ -44,6 +44,6 @@ def tfidf(term, document, documents, idfs={}):
         tfidf = (1 + math.log(doc_appearances,10)) * idf
     return tfidf
 
-# Martineau and Finn 2009
+# Martineau and Finin 2009
 def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
     return tfidf(term, document, positive_set, pos_idfs) - tfidf(term, document, negative_set, neg_idfs)

diff --git a/TwitterCorpus.py b/TwitterCorpus.py
new file mode 100644
index 0000000..635cb5a
--- /dev/null
+++ b/TwitterCorpus.py
@@ -0,0 +1,30 @@
+import nltk
+import string
+import random
+
+def load(sample=True):
+    CONJUNCTIONS = ["and", "but", "or"]
+
+    f = open("Sentiment Analysis Dataset.csv")
+    lines = f.readlines()
+    #lines = lines[:1000]
+    f.close()
+    if sample:
+        lines = random.sample(lines, 10000)
+    ids = []
+    tweets = []
+    labels = []
+    for line in lines[1:]:
+        line = line.replace("\"", "").strip()
+        line2 = ""
+        for c in line:
+            if ord(c) < 128: line2 += c
+        terms = line2.split(",")
+        id = terms[0]
+        label = int(terms[1])
+        if label == 0: label = -1
+        tweet = terms[3]
+        ids.append(id)
+        tweets.append(nltk.word_tokenize(tweet))
+        labels.append(label)
+    return (ids, tweets, labels)
\ No newline at end of file

diff --git a/cblexicon.py b/cblexicon.py
index 0f33cb7..ef6505a 100644
--- a/cblexicon.py
+++ b/cblexicon.py
@@ -6,74 +6,77 @@
 from nltk.corpus import brown
 import random
 from nltk.stem import *
 import time
+import scipy
 from sets import Set
 
-def optimize(set1,set2,conjSet,defSet,dis):
-    i = 0
+"""
+def optimize(set1, set2, conjSet, defSet, dis):
     currentMin = 999999
-    consideredMin = calcScore(set1,set2,conjSet,dis)
+    consideredMin = calcScore(set1, set2, conjSet, dis)
     bestSwapWord = ""
     # Calculate the best word to remove until no moves lessen the function
+    i = 1
     while( currentMin > consideredMin):
         print i
-        i = i + 1
         currentMin = consideredMin
         for word in set1:
             set1.remove(word)
             set2.append(word)
-            test = calcScore(set1,set2,conjSet,dis)
+            test = calcScore(set1, set2, conjSet, dis)
             set2.remove(word)
             set1.append(word)
-            if (test < consideredMin):
+            if test < consideredMin:
                 consideredMin = test
                 bestSwapWord = word
         for word in set2:
             set2.remove(word)
             set1.append(word)
-            test = calcScore(set1,set2,conjSet,dis)
+            test = calcScore(set1, set2, conjSet, dis)
             set1.remove(word)
             set2.append(word)
-            if (test < consideredMin):
+            if test < consideredMin:
                 consideredMin = test
                 bestSwapWord = word
-        if(bestSwapWord in set1):
+        if bestSwapWord in set1:
             set1.remove(bestSwapWord)
             set2.append(bestSwapWord)
         else:
             set2.remove(bestSwapWord)
             set1.append(bestSwapWord)
+        i = i + 1
     # Return the optimized sets
-    return set1,set2
+    return set1, set2
+"""
 
-def optimize2(set1,set2,conjSet,defSet,dis):
+def optimize2(set1, set2, conjSet, defSet, dis):
     currentMin = 999999
-    consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
+    consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
     bestSwapWord = None
     # Calculate the best word to remove until no moves lessen the function
     i = 1
-    while( currentMin > consideredMin):
+    while currentMin > consideredMin:
         t1 = time.time()
         currentMin = consideredMin
-        currentS1 = calcScore(set1,conjSet,dis)
-        currentS2 = calcScore(set2,conjSet,dis)
-        consideredMin = currentS1 + currentS2 #
+        currentS1 = calcScore(set1, conjSet, dis)
+        currentS2 = calcScore(set2, conjSet, dis)
+        consideredMin = currentS1 + currentS2
         for word in set1:
-            test = calcSwap(word,set1,set2,currentS1,currentS2,conjSet,dis)
+            test = calcSwap(word, set1, set2, currentS1, currentS2, conjSet, dis)
             if (test < consideredMin):
                 consideredMin = test
                 bestSwapWord = word
         for word in set2:
-            test = calcSwap(word,set2,set1,currentS2,currentS1,conjSet,dis)
-            if (test < consideredMin):
+            test = calcSwap(word, set2, set1, currentS2, currentS1, conjSet, dis)
+            if test < consideredMin:
                 consideredMin = test
                 bestSwapWord = word
-        if(bestSwapWord in set1):
+        if bestSwapWord in set1:
             set1.remove(bestSwapWord)
             set2.append(bestSwapWord)
-        elif(bestSwapWord in set2):
+        elif bestSwapWord in set2:
             set2.remove(bestSwapWord)
             set1.append(bestSwapWord)
         t2 = time.time()
@@ -81,23 +84,23 @@ def optimize2(set1,set2,conjSet,defSet,dis):
         i += 1
 
     # Return the optimized sets
-    return set1,set2
+    return set1, set2
 
-def constraintSwap(set1,set2,conjSet,defSet,dis):
+def constraintSwap(set1, set2, conjSet, defSet, dis):
     for word in set1:
         stay = 0
         swap = 0
         for otherword in set1:
             if otherword != word:
-                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
+                cats = getDis(word, otherword)
                 stay = stay + cats
-        stay = stay * (1/(len(set1)-1))
+        stay /= (len(set1)-1)
         for otherword in set2:
             if otherword != word:
-                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
+                cats = getDis(word, otherword)
                 swap = swap + cats
-        swap = swap * (1/(len(set2)))
-        if(stay > swap):
+        swap /= len(set2)
+        if stay > swap:
             set1.remove(word)
             set2.append(word)
@@ -106,19 +109,18 @@ def constraintSwap(set1,set2,conjSet,defSet,dis):
         swap = 0
         for otherword in set2:
             if otherword != word:
-                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
-                stay = stay + cats
-        stay = stay * (1/(len(set2)-1))
+                cats = getDis(word, otherword)
+                stay += cats
+        stay /= (len(set2)-1)
         for otherword in set1:
             if otherword != word:
-                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
-                swap = swap + cats
-        swap = swap * (1/(len(set1)))
-        if(stay > swap):
+                cats = getDis(word, otherword)
+                swap += cats
+        swap /= len(set1)
+        if stay > swap:
             set2.remove(word)
             set1.append(word)
-    return set1,set2
-
+    return set1, set2
 
 def calcScore(set,conjSet,dis):
     score = 0
@@ -126,8 +128,8 @@ def calcScore(set,conjSet,dis):
         w1 = set[i]
         for j in range(i, len(set)):
             w2 = set[j]
-            cats = dis[conjSet[w1][0]][conjSet[w2][0]]
-            score = score + cats
+            cats = getDis(w1, w2)
+            score += cats
     return score / len(set)
 
 def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
     score2 = 0
     for w in currSet:
         if word != w:
-            cats = dis[conjSet[word][0]][conjSet[w][0]]
-            score1 = score1 + cats
-    currentCount = ((currentCount* len(currSet)) - score1 )/(len(currSet)-1)
+            cats = getDis(word, w)
+            score1 += cats
+    currentCount = (currentCount * len(currSet) - score1)/(len(currSet)-1)
     #for word in set2:
     for w in opSet:
         if word != w:
-            cats = dis[conjSet[word][0]][conjSet[w][0]]
-            score2 = score2 + cats
-    otherCount = ((otherCount* len(opSet)) + score2 )/(len(opSet)+1)
+            cats = getDis(word, w)
+            score2 += cats
+    otherCount = (otherCount * len(opSet) + score2)/(len(opSet)+1)
     return currentCount + otherCount
 
-def normalize_word(word):
-    return SnowballStemmer("english").stem(word)
-
-def vectorize(conjSet,defSet):
+def vectorize(conjSet, defSet):
     dis = numpy.zeros((len(defSet),len(defSet)))
-    dis.fill(.5)
+    dis.fill(0.5)
     for word in defSet:
         similar = conjSet[word][1]
         dissimilar = conjSet[word][2]
@@ -163,12 +162,10 @@ def vectorize(conjSet,defSet):
             dis[conjSet[word][0]][conjSet[d][0]] = 1
     return dis
 
-def word_feats(words):
-    return dict([(word, True) for word in words])
-
 def genSets():
-    f = open('words.txt', 'r+')
+    f = open('words.txt', 'r')
     content = f.readlines()
+    f.close()
 
     positive = Set([])
     negative = Set([])
@@ -179,53 +176,68 @@ def genSets():
         elif (current[1][0] == 'n'):
             negative.add(current[0])
 
-    return positive,negative
+    return positive, negative
 
 def getConj():
     # Set up the tuple (index, similar, dissimilar)
-    f = open('conj.txt', 'r+')
+    f = open('movieconj.txt', 'r')
     content = f.readlines()
+    f.close()
     d = dict()
     i = 0
     for line in content:
         current = line.split(' ')
+        # WTF is all this index math?
         if current[2] == "but":
            if current[0] in d:
                d[current[0]][2].add(current[1])
            else:
                d[current[0]] = (i,Set(),Set([current[1]]))
-               i = i+1
+               i += 1
            if current[1] in d:
                d[current[1]][2].add(current[0])
            else:
                d[current[1]] = (i,Set(),Set([current[0]]))
-               i = i+1
+               i += 1
        else:
            if current[0] in d:
                d[current[0]][1].add(current[1])
            else:
                d[current[0]] = (i,Set([current[1]]),Set())
-               i = i+1
+               i += 1
            if current[1] in d:
                d[current[1]][1].add(current[0])
            else:
                d[current[1]] = (i,Set([current[0]]),Set())
-               i = i+1
     return d
+               i += 1
    return d
 
-def findFrequency(set1,set2):
+def findFrequency(set1, set2):
    set1Freq = 0
    set2Freq = 0
    for word in brown.words():
-       set1Freq = (set1Freq+1) if (word in set1) else set1Freq
-       set2Freq = (set2Freq+1) if (word in set2) else set2Freq
+       set1Freq = (set1Freq + 1) if (word in set1) else set1Freq
+       set2Freq = (set2Freq + 1) if (word in set2) else set2Freq
    return set1Freq, set2Freq
 
+def getDis(a, b):
+    global dis, conjSet
+    a_index = conjSet[a][0]
+    b_index = conjSet[b][0]
+    """
+    if dis.has_key((a_index,b_index)):
+        return dis[(a_index, b_index)]
+    else:
+        return 0
+    """
+    return dis[a_index][b_index]
+
 def conjunctionData(set1,set2):
-    f = open('conj.txt', 'r+')
+    f = open('movieconj.txt', 'r+')
     content = f.readlines()
+    f.close()
     totalConj = 0
     totalbuts = 0
     correctbuts = 0
@@ -233,27 +245,27 @@ def conjunctionData(set1,set2):
     correctands = 0
     totalors = 0
     correctors = 0
-    totalnors =0
+    totalnors = 0
     correctnors = 0
     for line in content:
-        totalConj = totalConj +1
+        totalConj = totalConj + 1
         current = line.split(' ')
         if current[2] == "but":
-            totalbuts = totalbuts +1
-            if( (current[0] in set1 and current[1] in set2) or (current[0] in set2 and current[1] in set1) ):
-                correctbuts = correctbuts +1
+            totalbuts = totalbuts + 1
+            if (current[0] in set1 and current[1] in set2) or (current[0] in set2 and current[1] in set1):
+                correctbuts = correctbuts + 1
         elif current[2] == "and":
-            totalands = totalands +1
-            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
-                correctands = correctands +1
+            totalands = totalands + 1
+            if (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2):
+                correctands = correctands + 1
         elif current[2] == "or":
-            totalors = totalors +1
-            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
-                correctors = correctors +1
+            totalors = totalors + 1
+            if (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2):
+                correctors = correctors + 1
         elif current[2] == "nor":
-            totalnors = totalnors +1
-            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
-                correctnors = correctnors +1
+            totalnors = totalnors + 1
+            if (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2):
+                correctnors = correctnors + 1
     print "Total Conjunctions: %d" % totalConj
     print "Total ands: %d \n Ands in same set: %d" % (totalands,correctands)
     print "Total ors: %d \n Ors in same set: %d" % (totalors,correctors)
@@ -261,6 +273,7 @@ def conjunctionData(set1,set2):
     print "Total buts: %d \n Buts in opposite sets: %d" % (totalbuts,correctbuts)
 
 def returnCBLexicon():
+    global dis, conjSet
     # Generate positive and negative initial sets
     sets = genSets()
     positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
@@ -282,9 +295,10 @@ def returnCBLexicon():
     bestSet1 = []
     bestSet2 = []
     bestScore = 999999
-    numIterations = 10
+    numIterations = 3
     for i in range(numIterations):
-        set1 = random.sample(defSet, len(defSet)//2)
+        setsize = random.randint(len(defSet)//4, len(defSet)*3//4)
+        set1 = random.sample(defSet, setsize)
         set2 = [x for x in defSet if x not in set1]
 
         # Optimize objective function
@@ -299,12 +313,14 @@ def returnCBLexicon():
             bestScore = score
 
     #Find which set has a higher frequency in the training set
-    (set1Freq,set2Freq) = findFrequency(set1,set2)
+    #(set1Freq,set2Freq) = findFrequency(set1,set2)
-    positive = set1 if (set1Freq>set2Freq) else set2
-    negative = set1 if (set1Freq<set2Freq) else set2
+    #positive = set1 if (set1Freq>set2Freq) else set2
+    #negative = set1 if (set1Freq<set2Freq) else set2
+    positive = set1 if len(set1)>len(set2) else set2
+    negative = set2 if len(set1)>len(set2) else set1

diff --git a/getAdjectives.py b/getAdjectives.py
--- a/getAdjectives.py
+++ b/getAdjectives.py
-    return tagger.tag([word])[0][1].startswith("JJ") and len(wn.synsets(word, wn.ADJ)) > 0
+    #return tagger.tag([word])[0][1].startswith("JJ") and len(wn.synsets(word, wn.ADJ)) > 0 and word in mpqa_words
+    return word in mpqa_words
 
 def genConj():
-    conj = open('movieconj.txt', 'r+')
-    ands = open('ands.txt', 'r+')
-    ors = open('ors.txt', 'r+')
-    buts = open('buts.txt', 'r+')
-    nor = open('nor.txt', 'r+')
-    eor = open('eor.txt', 'r+')
-    j = 0
+    conj = open('movieconj.txt', 'w')
+    global tagger
+    tagger = nltk.tag.perceptron.PerceptronTagger()
+    num_lines = 0
     for review in sorted(movie_reviews.fileids()):    #For every review
-        tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[review])))
-        print j
-        j = j+1
-        for i in range(0,len(tokens)-3):
-            if ((tokens[i][1]== "JJ" or tokens[i][1] == "JJR" or tokens[i][1] == "JJS") and (tokens[i+2][1]== "JJ" or tokens[i+2][1] == "JJR" or tokens[i+2][1] == "JJS")):
-                if (tokens[i+1][0] == "and"):
-                    print tokens[i][0]
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #ands.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-                elif (tokens[i+1][0] == "or"):
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #ors.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-                elif (tokens[i+1][0] == "but"+ "\n"):
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #buts.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-                elif (tokens[i+1][0] == "either-or"):
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #eor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-                elif (tokens[i+1][0] == "neither-nor"):
-                    conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n")
-                    #nor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n")
-
-
-
-
+        tokens = movie_reviews.words(fileids=[review])
+        for i in range(0,len(tokens)-2):
+            if isAdj(tokens[i]) and isAdj(tokens[i+2]):
+                line = tokens[i]+ " " + tokens[i+2] + " " + tokens[i+1] + "\n"
+                print line.strip()
+                if (tokens[i+1] == "and"):
+                    conj.write(line)
+                    num_lines += 1
+                elif (tokens[i+1] == "or"):
+                    conj.write(line)
+                    num_lines += 1
+                elif (tokens[i+1] == "but"+ "\n"):
+                    conj.write(line)
+                    num_lines += 1
+                elif (tokens[i+1] == "either-or"):
+                    conj.write(line)
+                    num_lines += 1
+                elif (tokens[i+1] == "neither-nor"):
+                    conj.write(line)
+                    num_lines += 1
+    print num_lines
+
 def doBrown():
     f = open('movieconj.txt', 'w')
     list1 = []
     for word in sc.tagged_sents():
         for w in word:
-            if(w[1] == "JJ" or w[1] == "JJR" or w[1] == "JJS" or w[1] == "JJT"):
+            if w[1].startswith("JJ"):
                 list1.append(w[0])
     counts = Counter(list1)
     d = dict(counts)
@@ -55,4 +58,5 @@ def doBrown():
         f.write(n+" \n")
     f.close()
 
+(mpqa_words, mpqa_labels) = MPQALexicon.load()
 genConj()
\ No newline at end of file

diff --git a/graph.py b/graph.py
index 04102ef..1425f71 100644
--- a/graph.py
+++ b/graph.py
@@ -10,7 +10,9 @@ labels = [
     "bigrams, frequency, +Position",
     "bigrams, presence",
     "bigrams, presence, +Position",
-    "delta_tfidf"
+    "delta_tfidf",
+    "unigrams, presence, +POS",
+    "bigrams, presence, +POS"
 ]
 labels2 = [
     "unigrams, frequency",
@@ -21,23 +23,38 @@ labels2 = [
     "bigrams, frequency, +Position",
     "bigrams, presence",
     "bigrams, presence, +Position",
-    "delta_tfidf"
+    "delta_tfidf",
+    "unigrams, presence, +POS",
+    "bigrams, presence, +POS"
 ]
-tops = numpy.arange(len(labels))
-widths = [0.826002649356, 0.784479089868, 0.842490694287, 0.821997146847, 0.807497617378, 0.777000053946, 0.820491149832, 0.795509581438, 0.981992471513]
-widths2 = [0.824624634419, 0.808376475678, 0.832750728912, 0.815374570779, 0.797876474366, 0.771876439875, 0.799001849413, 0.768376127015, 0.929999178955]
+bottoms = numpy.arange(len(labels))
+widths = [0.826002649356, 0.784479089868, 0.842490694287, 0.821997146847, 0.807497617378, 0.777000053946, 0.820491149832, 0.795509581438, 0.981992471513, 0.836989684295, 0.818997140853]
+widths2 = [0.824624634419, 0.808376475678, 0.832750728912, 0.815374570779, 0.797876474366, 0.771876439875, 0.799001849413, 0.768376127015, 0.929999178955, 0.813999695576, 0.792252879562]
 height = 0.3
-pyplot.barh(tops, widths, height, color="#FF0000")
-pyplot.barh(tops+height, widths2, height, color="#00FF00")
+pyplot.barh(bottoms, widths, height, color="#FF0000")
+pyplot.barh(bottoms-height, widths2, height, color="#00FF00")
 pyplot.legend(["Movies", "Amazon"], loc=4) # bottom right
-pyplot.yticks(tops+height, labels)
+pyplot.yticks(bottoms, labels)
 pyplot.xlim(0.5, 1.0)
-pyplot.ylim(tops[0]-2*height, tops[-1]+3*height)
+#pyplot.ylim(tops[0]-2*height, bottoms[-1]+3*height)
 pyplot.show()
 
 """
+# TODO: Use POS tags on Amazon dataset.
 gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865
 gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402
-gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193
-gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513
+gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position:False 0.77899606193
+gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position:True 0.762512512513
+gram_length: 1, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position:False 0.836989684295
+gram_length: 1, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position:True 0.814993136849
+gram_length: 1, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.769011526497
+gram_length: 1, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.749997002991
+gram_length: 2, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.800000599402
+gram_length: 2, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.762980045914
+gram_length: 2, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position:False 0.707506908106
+gram_length: 2, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position:True 0.669494344644
+gram_length: 2, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position:False 0.818997140853
+gram_length: 2, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position:True 0.78900607194
+gram_length: 2, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.714507921095
+gram_length: 2, use_presence: True, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.668990847135
 """
\ No newline at end of file

diff --git a/review_svm.py b/review_svm.py
index ceae890..3920aef 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -9,18 +9,17 @@
 from nltk.corpus import movie_reviews
 import numpy
 from sklearn.svm import SVC
 from sklearn.svm import LinearSVC
-from TFIDF import delta_tfidf, compute_idfs
 
 import BagOfWords
 import XMLParser
+import TwitterCorpus
+from TFIDF import delta_tfidf, compute_idfs
 
-# Program to classify the movie review dataset using a support vector machine
-# (via LIBSVM), following Pang and Lee (2002).
+# Program to classify the movie review dataset using a support vector machine, following Pang and Lee (2002).
 
 # "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
 # 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
 # They didn't provide a full list.
-# TODO make this a parameter
 NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
 PUNCTUATION = [".", "!", "?", ",", ";"]
 
@@ -32,15 +31,15 @@
 USE_ADJ_ONLY = False
 USE_NEGATION = True
 USE_POSITION = False
 GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
-NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finn used 10)
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finin used 10)
 
-MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
+MIN_OCCURRENCES = 4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
 EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
 NORMALIZE_BAGS = True
 USE_LIBLINEAR = True # This is supposedly faster for large instances
-USE_AMAZON = False # Use the Amazon review set, not Pang and Lee.
+CORPUS = "movies" # "twitter", "amazon", "movies"
 USE_DELTA_TFIDF = False
 
 def make_folds(documents, ids, num_partitions):
@@ -64,7 +63,7 @@ def from_command_line():
     use_negation = USE_NEGATION
     use_position = USE_POSITION
     min_occurrences = MIN_OCCURRENCES
-    use_amazon = USE_AMAZON
+    corpus = CORPUS
     try:
         args = sys.argv[1:]
         while i < len(args):
@@ -98,9 +97,9 @@ def from_command_line():
             elif args[i] == "--threshold":
                 min_occurrences = int(args[i+1])
                 i += 2
-            elif args[i] == "--use-amazon":
-                use_amazon = True
-                i += 1
+            elif args[i] == "--corpus":
+                corpus = args[i+1]
+                i += 2
             elif args[i] == "--use-delta":
                 use_delta = True
                 i += 1
@@ -116,22 +115,22 @@ def from_command_line():
                 print "--use-position\t\tTag words according to their position in the text (Default: Off)"
                 print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
                 print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
-                print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)"
+                print "--corpus\t\tSelect a corpus to evaluate. (amazon, movies, twitter) (Default: movies)"
                 print "--use-delta\t\tUse Delta TFIDF. (Default: Off)"
                 exit()
             else:
                 print "Error: Invalid argument", args[i]
                 i += 1
-        classify_reviews(gram_length, num_folds, use_presence, use_negation, use_pos_tags, use_adj_only, min_occurrences, use_amazon, use_delta)
+        classify_reviews(gram_length, num_folds, use_presence, use_negation, use_pos_tags, use_adj_only, min_occurrences, corpus, use_delta)
     except Exception:
         print "Invalid arguments"
-
+
 def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=USE_PRESENCE, use_negation=USE_NEGATION, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
-                     use_position = USE_POSITION, min_occurrences=MIN_OCCURRENCES, use_amazon=USE_AMAZON, use_delta=USE_DELTA_TFIDF):
+                     use_position = USE_POSITION, min_occurrences=MIN_OCCURRENCES, corpus=CORPUS, use_delta=USE_DELTA_TFIDF, skew=(1,1)):
     positive_ids = []
     negative_ids = []
-    if use_amazon:
+    if corpus == "amazon":
         # Load the mixed Amazon review dataset.
         (ids, reviews, labels) = XMLParser.get_all_reviews()
         for i in range(len(ids)):
@@ -139,7 +138,7 @@ def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=
                 positive_ids.append(ids[i])
             elif labels[i] == -1:
                 negative_ids.append(ids[i])
-    else:
+    elif corpus == "movies":
         # Load the Pang and Lee sentiment dataset.
         ids = movie_reviews.fileids()
         reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
@@ -152,6 +151,13 @@ def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=
             elif label == 'neg':
                 labels.append(-1)
                 negative_ids.append(id)
+    elif corpus == "twitter":
+        (ids, reviews, labels) = TwitterCorpus.load()
+        for i in range(len(ids)):
+            if labels[i] == 1:
+                positive_ids.append(ids[i])
+            elif labels[i] == -1:
+                negative_ids.append(ids[i])
 
     positive_reviews = []
     negative_reviews = []
@@ -162,11 +168,10 @@ def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=
         elif labels[i] == -1:
             negative_reviews.append(reviews[i])
 
-    #TEST
-    #positive_reviews = positive_reviews[:200]
-    #negative_reviews = negative_reviews[:600]
-    #positive_reviews = random.sample(positive_reviews, 1000)
-    #negative_reviews = random.sample(negative_reviews, 1000)
+    num_pos = int(len(positive_reviews) * skew[0])
+    num_neg = int(len(negative_reviews) * skew[1])
+    positive_reviews = random.sample(positive_reviews, num_pos)
+    negative_reviews = random.sample(negative_reviews, num_neg)
 
     # Partition reviews into folds.
     (pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, num_folds)
@@ -262,34 +267,65 @@ def classify_reviews(gram_length=GRAM_LENGTH, num_folds=NUM_FOLDS, use_presence=
 
 def run_configs():
     min_occurrences = 4
     use_negation = True
-    use_delta = False
-    use_pos_tags = False
-    use_adj_only = False
     labels = []
     accs = []
-    for use_amazon in [False, True]:
-        for gram_length in [1,2]:
-            for use_presence in [False, True]:
-                for (use_pos_tags, use_adj_only) in [(True, False), (True, True)]:
-                    for use_position in [False, True]:
+    #for corpus in ["movies", "amazon", "twitter"]:
+    for corpus in ["amazon", "twitter"]:
+        for use_position in [False, True]:
+            for (use_pos_tags, use_adj_only) in [(False, False), (True, False), (True, True)]:
+                for gram_length in [1,2]:
+                    for use_presence in [False, True]:
                         params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only,
-                                  'use_position':use_position, 'use_amazon':use_amazon, 'min_occurrences':min_occurrences, 'use_delta':False}
+                                  'use_position':use_position, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False}
                         acc = classify_reviews(**params)
-                        label = "gram_length: %d, use_presence: %s, use_amazon: %s, use_pos_tags: %s, use_adj_only: %s, use_position: %s" % (gram_length, use_presence, use_amazon, use_pos_tags, use_adj_only, use_position)
+                        label = "gram_length: %d, use_presence: %s, corpus: %s, use_pos_tags: %s, use_adj_only: %s, use_position: %s" % (gram_length, use_presence, corpus, use_pos_tags, use_adj_only, use_position)
                         print label, acc
                         labels.append(label)
                         accs.append(acc)
 
         # Delta-TFIDF construction doesn't support all parameters (yet).
-        params = {'use_amazon':use_amazon, 'use_delta':True}
+        params = {'corpus':corpus, 'use_delta':True}
         acc = classify_reviews(**params)
-        label = "delta_tfidf: True, use_amazon: %s" % use_amazon
+        label = "delta_tfidf: True, corpus: %s" % corpus
         print label, acc
         labels.append(label)
         accs.append(acc)
     return (labels, accs)
+
+def run_skewed():
+    min_occurrences = 4
+    use_negation = True
+    use_delta = False
+    use_pos_tags = False
+    use_adj_only = False
+    use_position = False
+    use_presence = True
+    labels = []
+    accs = []
+    for corpus in ["movies", "amazon"]:
+        for skew in [(0.2,1), (0.4,1), (0.6,1), (0.8, 1), (1,0.8), (1,0.6), (1,0.4), (1,0.2)]:
+            for gram_length in [1,2]:
+                params = {'gram_length':gram_length, 'use_presence':use_presence, 'use_pos_tags':use_pos_tags, 'use_adj_only':use_adj_only,
+                          'use_position':use_position, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False, 'skew': skew}
+
+                acc = classify_reviews(**params)
+                label = "corpus: %s, gram_length: %d, skew: (%f, %f)" % (corpus, gram_length, skew[0], skew[1])
+
+                print label, acc
+                labels.append(label)
+                accs.append(acc)
+
+            params = {'gram_length':1, 'use_presence':False, 'use_pos_tags':False, 'use_adj_only':False,
+                      'use_position':False, 'corpus':corpus, 'min_occurrences':min_occurrences, 'use_delta':False, 'skew': skew}
+
+            acc = classify_reviews(**params)
+            label = "corpus: %s, delta_tfidf: True, skew: (%f, %f)" % (corpus, skew[0], skew[1])
+            print label, acc
+            labels.append(label)
+            accs.append(acc)
 
-(labels, accs) = run_configs()
-f = open('SVM_RESULTS.txt', 'w')
+#(labels, accs) = run_configs()
+(labels, accs) = run_skewed()
+f = open('SVM_RESULTS_SKEW.txt', 'w')
 for (label, acc) in zip(labels, accs):
     f.write("%s\t%s\n" % (label, acc))
-f.close()
+f.close()
\ No newline at end of file
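
The patch's main addition is TwitterCorpus.load(), which returns (ids, tokenized_tweets, labels) with labels in {-1, 1} and, by default, a 10,000-line random sample of "Sentiment Analysis Dataset.csv". Below is a minimal sketch of how a caller might sanity-check the loader before wiring it into LexiconEval or review_svm; the class-balance and majority-class report is illustrative and not part of the patch.

# Sketch: load the sampled Twitter corpus and report its class balance (Python 2, like the rest of the repo).
# Assumes TwitterCorpus.py (added above) and its CSV file sit in the working directory.
import TwitterCorpus

(ids, tweets, labels) = TwitterCorpus.load(sample=True)
num_pos = len([label for label in labels if label == 1])
num_neg = len([label for label in labels if label == -1])
print "tweets loaded:", len(tweets)
print "positive:", num_pos, "negative:", num_neg
# Majority-class rate: a floor that the SVM and lexicon accuracies should beat.
print "majority-class baseline:", max(num_pos, num_neg) / float(len(labels))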
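The other behavioural change in review_svm.py is the skew parameter: run_skewed() passes skew=(pos_fraction, neg_fraction) into classify_reviews, which keeps int(len(positive_reviews) * skew[0]) positives and int(len(negative_reviews) * skew[1]) negatives via random.sample. A standalone illustration of that subsampling step follows; the function and variable names here are hypothetical, not from the repo.

import random

def subsample_by_skew(positive_reviews, negative_reviews, skew=(1, 1)):
    # skew[0] / skew[1] are the fractions of each class to keep, mirroring classify_reviews.
    num_pos = int(len(positive_reviews) * skew[0])
    num_neg = int(len(negative_reviews) * skew[1])
    return (random.sample(positive_reviews, num_pos),
            random.sample(negative_reviews, num_neg))

# Example: keep 20% of the positives and all negatives, as in the (0.2, 1) setting.
(pos, neg) = subsample_by_skew(range(1000), range(1000), skew=(0.2, 1))
print len(pos), len(neg)   # 200 1000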