diff --git a/BagOfWords.pyc b/BagOfWords.pyc new file mode 100644 index 0000000..f5e3b49 Binary files /dev/null and b/BagOfWords.pyc differ diff --git a/GlossCount.py b/GlossCount.py index ded48b7..e05cf6b 100644 --- a/GlossCount.py +++ b/GlossCount.py @@ -61,19 +61,13 @@ class GlossCount: classifier = NaiveBayesClassifier.train(trainfeats) print "cat" #print classifier.classify(dict([(word, True) for word in words])) -<<<<<<< HEAD - #print classifier.classify(dict([("bad",True),("bad",True)])) - - -======= print classifier.classify(dict([("bad",True),("bad",True)])) ->>>>>>> parent of 47c6a2a... Bugfix + # Iterate through all of the reviews and find sentiment count = 0.00 correct = 0.00 - for reviews in movie_reviews.fileids(): + for reviews in movie_reviews.fileids(): #For every review score = 0; -<<<<<<< HEAD tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[reviews]))) #Tokenize all words with POS for token in tokens: if (token[1]== "JJ" or token[1] == "JJR" or token[1] == "JJS"): # If adjective, check value @@ -82,15 +76,6 @@ class GlossCount: score = score - 1 elif(sent_value is 'pos'): score = score + 1 -======= - for words in movie_reviews.words(fileids=[reviews]): - if() - sent_value = classifier.classify(dict([(word, True)])) - if(sent_value is 'neg'): - score = score - 1 - elif(sent_value is 'pos'): - score = score + 1 ->>>>>>> parent of 47c6a2a... Bugfix if (score < 0): print "Negative at %d" % (score) sentiment = 'neg' @@ -98,8 +83,12 @@ class GlossCount: sentiment = 'pos' print "Positive at %d" % (score) if (sentiment == movie_reviews.categories(fileids=[reviews])[0]): + print "Correct" correct = correct + 1.00 count = count + 1.00 print correct/count + + + GlossCount().demo() \ No newline at end of file diff --git a/MPQALexicon.pyc b/MPQALexicon.pyc new file mode 100644 index 0000000..d481795 Binary files /dev/null and b/MPQALexicon.pyc differ diff --git a/cblexicon.py b/cblexicon.py index 74dba34..2afaa4d 100644 --- a/cblexicon.py +++ b/cblexicon.py @@ -1,26 +1,119 @@ import math import nltk from nltk.corpus import wordnet as wn -from nltk.corpus import brown as sc from collections import Counter +import numpy +from nltk.corpus import movie_reviews +import nltk.stem +from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance +from nltk.classify import NaiveBayesClassifier +import random +from nltk.stem import * + from sets import Set class cblexicon: - def genSets(self): - f = open('words.txt', 'r+') - content = f.readlines() - positive = Set([]) - negative = Set([]) + def process(self): + + def normalize_word(word): + return SnowballStemmer("english").stem(word) + + def vectorspaced(title,CS,DF): + title_components = CS[title][1] + return numpy.array([ + word in title_components + for word in DF], numpy.short) + + def word_feats(words): + return dict([(word, True) for word in words]) + + def genSets(): + f = open('words.txt', 'r+') + content = f.readlines() + positive = Set([]) + negative = Set([]) - for pair in content: - current = pair.split(' ') - if (current[1][0] == 'p'): - positive.add(current[0]) - elif (current[1][0] == 'n'): - negative.add(current[0]) + for pair in content: + current = pair.split(' ') + if (current[1][0] == 'p'): + positive.add(current[0]) + elif (current[1][0] == 'n'): + negative.add(current[0]) + return positive,negative + + def getConj(): + f = open('conj.txt', 'r+') + content = f.readlines() + d = dict() + i = 0 + for line in content: + current = line.split(' ') + #Add the first adjective + if current[0] in d: + d[current[0]][1].add(current[1]) + else: + d[current[0]] = (i,Set([current[1]])) + i = i+1 + #Add the second adjective + if current[1] in d: + d[current[1]][1].add(current[0]) + else: + d[current[1]] = (i,Set([current[0]])) + i = i+1 + return d + + #Get the Data# + negids = movie_reviews.fileids('neg') + posids = movie_reviews.fileids('pos') + training = set(negids[:500] + posids[:500]) + testing = set(negids[500:] + posids[500:]) + # Generate positive and negative initial sets + sets = genSets() + positive = random.sample(sets[0], min(len(sets[0]), len(sets[1]))) + negative = random.sample(sets[1], min(len(sets[0]), len(sets[1]))) print len(positive) print len(negative) -cblexicon().genSets() \ No newline at end of file + # Clustering Setup + stopwords = set(nltk.corpus.stopwords.words('english')) + # Create dictionary (adj, (index,[associated words])) + conjSet = getConj() + print conjSet + + # Create list out of all keys of conjSet + defSet = conjSet.keys() + + # Its Cluster time + cluster = KMeansClusterer(2, euclidean_distance) + print conjSet["young"] + z = vectorspaced("young",conjSet,defSet) + + for num in z: + if num == 1: + print "one" + + + #cluster.cluster([vectorspaced(title,conjSet,defSet) for title in defSet if title]) + cluster.cluster(vectorspaced("young",conjSet,defSet)) + cluster.cluster(vectorspaced("stiff",conjSet,defSet)) + classified_examples = [ + cluster.classify(vectorspaced(title,conjSet,defSet)) for title in defSet + ] + print classified_examples + + + + + # Can we classify and then run bag of words? + #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids] + #posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids] + #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative] + #testfeats = negfeats[500:] + posfeats[500:] + #classifier1 = NaiveBayesClassifier.train(trainfeats) + #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg")) + + + +cblexicon().process() \ No newline at end of file diff --git a/getAdjectives.py b/getAdjectives.py index af79093..8b5423c 100644 --- a/getAdjectives.py +++ b/getAdjectives.py @@ -4,6 +4,37 @@ from nltk.corpus import brown as sc from collections import Counter + +def genConj(training): + conj = open('conj.txt', 'r+') + ands = open('ands.txt', 'r+') + ors = open('ors.txt', 'r+') + buts = open('buts.txt', 'r+') + nor = open('nor.txt', 'r+') + eor = open('eor.txt', 'r+') + j = 0; + for review in training: #For every review + tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[review]))) + print j + j = j+1 + for i in range(0,len(tokens)-3): + if ((tokens[i][1]== "JJ" or tokens[i][1] == "JJR" or tokens[i][1] == "JJS") and (tokens[i+2][1]== "JJ" or tokens[i+2][1] == "JJR" or tokens[i+2][1] == "JJS")): + if (tokens[i+1][0] == "and"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + ands.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + elif (tokens[i+1][0] == "or"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + ors.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + elif (tokens[i+1][0] == "but"+ "\n"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + buts.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + elif (tokens[i+1][0] == "either-or"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + eor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + elif (tokens[i+1][0] == "neither-nor"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + nor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + f = open('words.txt', 'r+') list1 = [] for word in sc.tagged_sents():