diff --git a/SentiWordLex.py b/SentiWordLex.py
new file mode 100644
index 0000000..85583f5
--- /dev/null
+++ b/SentiWordLex.py
@@ -0,0 +1,44 @@
+from __future__ import division
+import sys
+import time
+
+import nltk
+from nltk.corpus import movie_reviews
+from nltk.corpus import sentiwordnet as swn
+from nltk.corpus import wordnet as wn
+
+start_time = time.time()
+count = 0.00
+correct = 0.00
+ids = sorted(movie_reviews.fileids())
+
+for reviews in ids:  # For every review
+    score = 0.0
+    positive = 0.0
+    negative = 0.0
+    tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[reviews])))  # Tokenize all words with POS
+    for token in tokens:
+        if (token[1] == "JJ" or token[1] == "JJR" or token[1] == "JJS"):  # If adjective, check value
+            if len(wn.synsets(token[0], pos=wn.ADJ)) != 0 and swn.senti_synset(wn.synsets(token[0], pos=wn.ADJ)[0].name()):
+                word = wn.synsets(token[0], pos=wn.ADJ)[0].name()
+                print word
+                print swn.senti_synset(word)
+                positive = positive + swn.senti_synset(word).pos_score()
+                negative = negative + swn.senti_synset(word).neg_score()
+                print "%s, %f, %f" % (word, positive, negative)
+    score = positive - negative
+    if (score < 0):
+        print "Negative at %f" % (score)
+        sentiment = 'neg'
+    else:
+        sentiment = 'pos'
+        print "Positive at %f" % (score)
+    if (sentiment == movie_reviews.categories(fileids=[reviews])[0]):
+        print "Correct"
+        correct = correct + 1.00
+    count = count + 1.00
+
+print correct/count
+print "Seconds: %d" % (time.time() - start_time)
+print "correct:", correct/len(ids)
+print "positive:", positive/len(ids)
\ No newline at end of file
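
Review note on SentiWordLex.py: the script scores a review by summing SentiWordNet positive/negative scores over the first WordNet sense of every adjective, so there is no word-sense disambiguation. A minimal standalone sketch of that per-word lookup, not part of the patch (it assumes the NLTK wordnet and sentiwordnet corpora have been downloaded; "terrible" is only an illustrative input):

    from nltk.corpus import sentiwordnet as swn
    from nltk.corpus import wordnet as wn

    synsets = wn.synsets("terrible", pos=wn.ADJ)      # all adjective senses of the word
    if len(synsets) != 0:
        senti = swn.senti_synset(synsets[0].name())   # first sense only, as the script does
        print senti.pos_score(), senti.neg_score()    # each score lies in [0, 1]

A review is then labeled 'pos' when the summed pos_score exceeds the summed neg_score, and 'neg' otherwise.
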
diff --git a/cblexicon.py b/cblexicon.py
index ac41cca..99c1cdb 100644
--- a/cblexicon.py
+++ b/cblexicon.py
@@ -1,264 +1,313 @@
 from __future__ import division
-import math
 import nltk
-from nltk.corpus import wordnet as wn
-from collections import Counter
 import numpy
-from nltk.corpus import movie_reviews
 import nltk.stem
-from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
-from nltk.classify import NaiveBayesClassifier
+from nltk.corpus import brown
 import random
 from nltk.stem import *
 from sets import Set

-class cblexicon:
-
-    def process(self):
-
-        def optimize(set1,set2,conjSet,defSet,dis):
-            i = 0
-            currentMin = 999999
-            consideredMin = calcScore(set1,set2,conjSet,dis)
-            bestSwapWord = ""
-            # Calculate the best word to remove until no moves lessen the function
-            while( currentMin > consideredMin):
-                print i
-                i = i + 1
-                currentMin = consideredMin
-                for word in set1:
-                    set1.remove(word)
-                    set2.append(word)
-                    test = calcScore(set1,set2,conjSet,dis)
-                    set2.remove(word)
-                    set1.append(word)
-                    if (test < consideredMin):
-                        consideredMin = test
-                        bestSwapWord = word
-                for word in set2:
-                    set2.remove(word)
-                    set1.append(word)
-                    test = calcScore(set1,set2,conjSet,dis)
-                    set1.remove(word)
-                    set2.append(word)
-                    if (test < consideredMin):
-                        consideredMin = test
-                        bestSwapWord = word
-
-                if(bestSwapWord in set1):
-                    set1.remove(bestSwapWord)
-                    set2.append(bestSwapWord)
-                else:
-                    set2.remove(bestSwapWord)
-                    set1.append(bestSwapWord)
-            # Return the optimized sets
-            return set1,set2
-
-        def optimize2(set1,set2,conjSet,defSet,dis):
-            i = 0
-            currentMin = 999999
-            consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
-            bestSwapWord = None
-            print consideredMin
-            # Calculate the best word to remove until no moves lessen the function
-            while( currentMin > consideredMin):
-                print "Iteration #%d: (%d, %d)" % (i, len(set1), len(set2))
-                currentMin = consideredMin
-                currentS1 = calcScore(set1,conjSet,dis)
-                currentS2 = calcScore(set2,conjSet,dis)
-                consideredMin = currentS1 + currentS2 #
-                for word in set1:
-                    test = calcSwap(word,set1,set2,currentS1,currentS2,conjSet,dis)
-                    if (test < consideredMin):
-                        print "found1"
-                        consideredMin = test
-                        bestSwapWord = word
-                for word in set2:
-                    test = calcSwap(word,set2,set1,currentS2,currentS1,conjSet,dis)
-                    if (test < consideredMin):
-                        print "found2"
-                        consideredMin = test
-                        bestSwapWord = word
-                print "New min: %f" % consideredMin
-
-                if(bestSwapWord in set1):
-                    set1.remove(bestSwapWord)
-                    set2.append(bestSwapWord)
-                elif(bestSwapWord in set2):
-                    set2.remove(bestSwapWord)
-                    set1.append(bestSwapWord)
-                i = i + 1
-
-            # Return the optimized sets
-            return set1,set2
-
-        def cluster(set1,set2,conjSet,defSet,dis):
-            for word in set1:
-                score1 = calcScore(word,set1,conjSet,dis)
-                #print "Score 1: %f" % score1
-                score2 = calcScore(word,set2,conjSet,dis)
-                #print "Score 2: %f" % score2
-                if score2 < score1:
-                    print "swap"
-                    set1.remove(word)
-                    set2.append(word)
-            for word in set2:
-                score1 = calcScore(word,set1,conjSet,dis)
-                score2 = calcScore(word,set2,conjSet,dis)
-                if score1 < score2:
-                    set2.remove(word)
-                    set1.append(word)
-            return set1,set2
-
-        def calcScore(set,conjSet,dis):
-            score = 0
-            for i in range(len(set)):
-                w1 = set[i]
-                for j in range(i, len(set)):
-                    w2 = set[j]
-                    cats = dis[conjSet[w1][0]][conjSet[w2][0]]
-                    score = score + cats
-            return score / len(set)
-
-        def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
-            score1 = 0
-            score2 = 0
-            for w in currSet:
-                if word != w:
-                    cats = dis[conjSet[word][0]][conjSet[w][0]]
-                    score1 = score1 + cats
-            currentCount = ((currentCount* len(currSet)) - score1 )/(len(currSet)-1)
-
-            #for word in set2:
-            for w in opSet:
-                if word != w:
-                    cats = dis[conjSet[word][0]][conjSet[w][0]]
-                    score2 = score2 + cats
-            otherCount = ((otherCount* len(opSet)) + score2 )/(len(opSet)+1)
-
-            return currentCount + otherCount
-
-        def normalize_word(word):
-            return SnowballStemmer("english").stem(word)
-
-        def vectorize(conjSet,defSet):
-            dis = numpy.zeros((len(defSet),len(defSet)))
-            dis.fill(.5)
-            for word in defSet:
-                similar = conjSet[word][1]
-                dissimilar = conjSet[word][2]
-                for sim in similar:
-                    dis[conjSet[word][0]][conjSet[sim][0]] = 0
-                for d in dissimilar:
-                    dis[conjSet[word][0]][conjSet[d][0]] = 1
-            return dis
-
-        def word_feats(words):
-            return dict([(word, True) for word in words])
-
-        def genSets():
-            f = open('words.txt', 'r+')
-            content = f.readlines()
-            positive = Set([])
-            negative = Set([])
-
-            for pair in content:
-                current = pair.split(' ')
-                if (current[1][0] == 'p'):
-                    positive.add(current[0])
-                elif (current[1][0] == 'n'):
-                    negative.add(current[0])
-
-            return positive,negative
-
-        def getConj():
-            # Set up the tuple (index, similar, dissimilar)
-            f = open('conj.txt', 'r+')
-            content = f.readlines()
-            d = dict()
-            i = 0
-            for line in content:
-                current = line.split(' ')
-                if current[2] == "but":
-                    if current[0] in d:
-                        d[current[0]][2].add(current[1])
-                    else:
-                        d[current[0]] = (i,Set(),Set([current[1]]))
-                        i = i+1
-                    if current[1] in d:
-                        d[current[1]][2].add(current[0])
-                    else:
-                        d[current[1]] = (i,Set(),Set([current[0]]))
-                        i = i+1
-                else:
-                    if current[0] in d:
-                        d[current[0]][1].add(current[1])
-                    else:
-                        d[current[0]] = (i,Set([current[1]]),Set())
-                        i = i+1
-                    if current[1] in d:
-                        d[current[1]][1].add(current[0])
-                    else:
-                        d[current[1]] = (i,Set([current[0]]),Set())
-                        i = i+1
-            return d
-
-        #Get the Data#
-        """
-        negids = movie_reviews.fileids('neg')
-        posids = movie_reviews.fileids('pos')
-        training = set(negids[:500] + posids[:500])
-        testing = set(negids[500:] + posids[500:])
-        """
-        # Generate positive and negative initial sets
-        sets = genSets()
-        positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
-        negative = random.sample(sets[1], min(len(sets[0]), len(sets[1])))
-
-        # Clustering Setup
-        stopwords = set(nltk.corpus.stopwords.words('english'))
-
-        # Create dictionary (adj, (index, similar, dissimilar))
-        conjSet = getConj()
-
-        # Create list out of all keys of conjSet
-        defSet = conjSet.keys()
-
-        # Generate dissimilarity matrix
-        dis = vectorize(conjSet,defSet)
-
-        # Its Cluster time
-        set1 = defSet[len(defSet)//2:]
-        set2 = defSet[:len(defSet)//2]
-        """
-        set1 = random.sample(defSet, len(defSet)//4)
-        set2 = [x for x in defSet if x not in set1]
-        """
-        # Optimize objective function
-        sets = optimize2(set1,set2,conjSet,defSet,dis)
-        set1 = sets[0]
-        set2 = sets[1]
-
-        print(set1)
-        print(set2)
-        f1 = open('set1.txt', 'w+')
-        f2 = open('set2.txt', 'w+')
+def optimize(set1,set2,conjSet,defSet,dis):
+    i = 0
+    currentMin = 999999
+    consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
+    bestSwapWord = ""
+    # Calculate the best word to remove until no moves lessen the function
+    while( currentMin > consideredMin):
+        print i
+        i = i + 1
+        currentMin = consideredMin
         for word in set1:
-            f1.write(word + "\n")
+            set1.remove(word)
+            set2.append(word)
+            test = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
+            set2.remove(word)
+            set1.append(word)
+            if (test < consideredMin):
+                consideredMin = test
+                bestSwapWord = word
         for word in set2:
-            f2.write(word + "\n")
+            set2.remove(word)
+            set1.append(word)
+            test = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
+            set1.remove(word)
+            set2.append(word)
+            if (test < consideredMin):
+                consideredMin = test
+                bestSwapWord = word
-        f1.close()
-        f2.close()
-        # Can we classify and then run bag of words?
-        #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
-        #posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
-        #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
-        #testfeats = negfeats[500:] + posfeats[500:]
-        #classifier1 = NaiveBayesClassifier.train(trainfeats)
-        #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))
+        if(bestSwapWord in set1):
+            set1.remove(bestSwapWord)
+            set2.append(bestSwapWord)
+        else:
+            set2.remove(bestSwapWord)
+            set1.append(bestSwapWord)
+    # Return the optimized sets
+    return set1,set2
+
+def optimize2(set1,set2,conjSet,defSet,dis):
+    i = 0
+    currentMin = 999999
+    consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
+    bestSwapWord = None
+    # Calculate the best word to remove until no moves lessen the function
+    while( currentMin > consideredMin):
+        currentMin = consideredMin
+        currentS1 = calcScore(set1,conjSet,dis)
+        currentS2 = calcScore(set2,conjSet,dis)
+        consideredMin = currentS1 + currentS2
+        for word in set1:
+            test = calcSwap(word,set1,set2,currentS1,currentS2,conjSet,dis)
+            if (test < consideredMin):
+                consideredMin = test
+                bestSwapWord = word
+        for word in set2:
+            test = calcSwap(word,set2,set1,currentS2,currentS1,conjSet,dis)
+            if (test < consideredMin):
+                consideredMin = test
+                bestSwapWord = word
+
+        if(bestSwapWord in set1):
+            set1.remove(bestSwapWord)
+            set2.append(bestSwapWord)
+        elif(bestSwapWord in set2):
+            set2.remove(bestSwapWord)
+            set1.append(bestSwapWord)
+        i = i + 1
+
+    # Return the optimized sets
+    return set1,set2
+
+def constraintSwap(set1,set2,conjSet,defSet,dis):
+    for word in set1:
+        stay = 0
+        swap = 0
+        for otherword in set1:
+            if otherword != word:
+                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
+                stay = stay + cats
+        stay = stay * (1/(len(set1)-1))
+        for otherword in set2:
+            if otherword != word:
+                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
+                swap = swap + cats
+        swap = swap * (1/(len(set2)))
+        if(stay > swap):
+            set1.remove(word)
+            set2.append(word)
+
+    for word in set2:
+        stay = 0
+        swap = 0
+        for otherword in set2:
+            if otherword != word:
+                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
+                stay = stay + cats
+        stay = stay * (1/(len(set2)-1))
+        for otherword in set1:
+            if otherword != word:
+                cats = dis[conjSet[word][0]][conjSet[otherword][0]]
+                swap = swap + cats
+        swap = swap * (1/(len(set1)))
+        if(stay > swap):
+            set2.remove(word)
+            set1.append(word)
+    return set1,set2
+
+def calcScore(set,conjSet,dis):
+    score = 0
+    for i in range(len(set)):
+        w1 = set[i]
+        for j in range(i, len(set)):
+            w2 = set[j]
+            cats = dis[conjSet[w1][0]][conjSet[w2][0]]
+            score = score + cats
+    return score / len(set)
+
+def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
+    score1 = 0
+    score2 = 0
+    for w in currSet:
+        if word != w:
+            cats = dis[conjSet[word][0]][conjSet[w][0]]
+            score1 = score1 + cats
+    currentCount = ((currentCount * len(currSet)) - score1) / (len(currSet)-1)
+
+    for w in opSet:
+        if word != w:
+            cats = dis[conjSet[word][0]][conjSet[w][0]]
+            score2 = score2 + cats
+    otherCount = ((otherCount * len(opSet)) + score2) / (len(opSet)+1)
+
+    return currentCount + otherCount
+
+def normalize_word(word):
+    return SnowballStemmer("english").stem(word)
+
+def vectorize(conjSet,defSet):
+    dis = numpy.zeros((len(defSet),len(defSet)))
+    dis.fill(.5)
+    for word in defSet:
+        similar = conjSet[word][1]
+        dissimilar = conjSet[word][2]
+        for sim in similar:
+            dis[conjSet[word][0]][conjSet[sim][0]] = 0
+        for d in dissimilar:
+            dis[conjSet[word][0]][conjSet[d][0]] = 1
+    return dis
+
+def word_feats(words):
+    return dict([(word, True) for word in words])
+
+def genSets():
+    f = open('words.txt', 'r+')
+    content = f.readlines()
+    positive = Set([])
+    negative = Set([])
+
+    for pair in content:
+        current = pair.split(' ')
+        if (current[1][0] == 'p'):
+            positive.add(current[0])
+        elif (current[1][0] == 'n'):
+            negative.add(current[0])
+
+    return positive,negative
+
+def getConj():
+    # Set up the tuple (index, similar, dissimilar)
+    f = open('conj.txt', 'r+')
+    content = f.readlines()
+    d = dict()
+    i = 0
+    for line in content:
+        current = line.split(' ')
+        if current[2] == "but":
+            if current[0] in d:
+                d[current[0]][2].add(current[1])
+            else:
+                d[current[0]] = (i,Set(),Set([current[1]]))
+                i = i+1
+            if current[1] in d:
+                d[current[1]][2].add(current[0])
+            else:
+                d[current[1]] = (i,Set(),Set([current[0]]))
+                i = i+1
+        else:
+            if current[0] in d:
+                d[current[0]][1].add(current[1])
+            else:
+                d[current[0]] = (i,Set([current[1]]),Set())
+                i = i+1
+            if current[1] in d:
+                d[current[1]][1].add(current[0])
+            else:
+                d[current[1]] = (i,Set([current[0]]),Set())
+                i = i+1
+    return d
+
+def findFrequency(set1,set2):
+    set1Freq = 0
+    set2Freq = 0
+
+    for word in brown.words():
+        set1Freq = (set1Freq+1) if (word in set1) else set1Freq
+        set2Freq = (set2Freq+1) if (word in set2) else set2Freq
+
+    return set1Freq, set2Freq
+
+def conjunctionData(set1,set2):
+    f = open('conj.txt', 'r+')
+    content = f.readlines()
+    totalConj = 0
+    totalbuts = 0
+    correctbuts = 0
+    totalands = 0
+    correctands = 0
+    totalors = 0
+    correctors = 0
+    totalnors = 0
+    correctnors = 0
+    for line in content:
+        totalConj = totalConj + 1
+        current = line.split(' ')
+        if current[2] == "but":
+            totalbuts = totalbuts + 1
+            if( (current[0] in set1 and current[1] in set2) or (current[0] in set2 and current[1] in set1) ):
+                correctbuts = correctbuts + 1
+        elif current[2] == "and":
+            totalands = totalands + 1
+            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
+                correctands = correctands + 1
+        elif current[2] == "or":
+            totalors = totalors + 1
+            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
+                correctors = correctors + 1
+        elif current[2] == "nor":
+            totalnors = totalnors + 1
+            if( (current[0] in set1 and current[1] in set1) or (current[0] in set2 and current[1] in set2) ):
+                correctnors = correctnors + 1
+    print "Total Conjunctions: %d" % totalConj
+    print "Total ands: %d \n Ands in same set: %d" % (totalands,correctands)
+    print "Total ors: %d \n Ors in same set: %d" % (totalors,correctors)
+    print "Total nors: %d \n Nors in same set: %d" % (totalnors,correctnors)
+    print "Total buts: %d \n Buts in opposite sets: %d" % (totalbuts,correctbuts)
+
+def returnCBLexicon():
+    # Generate positive and negative initial sets
+    sets = genSets()
+    positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
+    negative = random.sample(sets[1], min(len(sets[0]), len(sets[1])))
+
+    # Clustering setup
+    stopwords = set(nltk.corpus.stopwords.words('english'))
+
+    # Create dictionary (adj, (index, similar, dissimilar))
+    conjSet = getConj()
+
+    # Create list out of all keys of conjSet
+    defSet = conjSet.keys()
+
+    # Generate dissimilarity matrix
+    dis = vectorize(conjSet,defSet)
+
+    # It's clustering time
+    set1 = defSet[len(defSet)//2:]
+    set2 = defSet[:len(defSet)//2]
+    """
+    set1 = random.sample(defSet, len(defSet)//4)
+    set2 = [x for x in defSet if x not in set1]
+    """
+    # Optimize objective function
+    (set1,set2) = optimize2(set1,set2,conjSet,defSet,dis)
+    # Check the constraint
+    #(set1,set2) = constraintSwap(set1,set2,conjSet,defSet,dis)
+
+    #f1 = open('set1.txt', 'w+')
+    #f2 = open('set2.txt', 'w+')
+    #for word in set1:
+    #    f1.write(word + "\n")
+    #for word in set2:
+    #    f2.write(word + "\n")
+    #f1.close()
+    #f2.close()
+
+    # Find which set has a higher frequency in the Brown corpus
+    (set1Freq,set2Freq) = findFrequency(set1,set2)
+
+    positive = set1 if (set1Freq > set2Freq) else set2
+    negative = set1 if (set1Freq < set2Freq) else set2
+    return positive,negative
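
Review note on cblexicon.py: the new module clusters adjectives into two polarity sets from conjunction evidence, in the style of Hatzivassiloglou and McKeown's conjunction method. "and"/"or" pairs pull two words toward the same set (dissimilarity 0), "but" pairs push them into opposite sets (dissimilarity 1), unrelated pairs stay at 0.5, and optimize2 hill-climbs single-word swaps to lower the summed within-set dissimilarity. A standalone sketch for sanity-checking vectorize and calcScore without words.txt or conj.txt, not part of the patch (the four words and their conjunction links are invented for illustration, and it assumes cblexicon.py is on the import path):

    from sets import Set
    from cblexicon import vectorize, calcScore

    # Hand-built stand-in for getConj()'s output: word -> (index, and-linked, but-linked)
    conjSet = {
        'good': (0, Set(['nice']), Set(['bad'])),
        'nice': (1, Set(['good']), Set(['poor'])),
        'bad':  (2, Set(['poor']), Set(['good'])),
        'poor': (3, Set(['bad']),  Set(['nice'])),
    }
    dis = vectorize(conjSet, conjSet.keys())

    print dis[conjSet['good'][0]][conjSet['bad'][0]]   # 1.0: "but"-linked pair
    print dis[conjSet['good'][0]][conjSet['nice'][0]]  # 0.0: "and"-linked pair
    print calcScore(['good', 'nice'], conjSet, dis)    # 0.5: cohesive cluster
    print calcScore(['good', 'bad'], conjSet, dis)     # 1.0: mixed cluster scores higher

Lower calcScore means a more internally similar set, which is the quantity optimize2 drives down; findFrequency then uses Brown-corpus frequency to decide which of the two final clusters is the positive one.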