From 66918f812ec6fa757dff549d6ea14c09dc5e84a6 Mon Sep 17 00:00:00 2001 From: Antonia Lewis Date: Sat, 2 Apr 2016 18:07:08 -0400 Subject: [PATCH] Working on clustering in cblexicon --- BagOfWords.pyc | Bin 0 -> 1790 bytes GlossCount.py | 23 +++------ MPQALexicon.pyc | Bin 0 -> 918 bytes cblexicon.py | 119 +++++++++++++++++++++++++++++++++++++++++------ getAdjectives.py | 31 ++++++++++++ 5 files changed, 143 insertions(+), 30 deletions(-) create mode 100644 BagOfWords.pyc create mode 100644 MPQALexicon.pyc diff --git a/BagOfWords.pyc b/BagOfWords.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5e3b492756a1aa524f515753b01b6e68242566f GIT binary patch literal 1790 zcmbVMOK%%h6#niwcKnQ!*lucxN3($lp(5xa2qMxnDNU;?PMw5ER?29|OeSMHGx5xg zNYQwe@*nsa`~sG&*|TBUZAFnPv4HQ~aUHs49%s%y_nhxM@8tfhm48|K`zBo#Ot^93G-q-5S(MFNsF)e+T@#zEV>8A zWu!U8Eokx;gaGmT7E<6!VO&L;N9-734r$d0SqvbpVe9EljOUPp(-`LVk-8|e}&S1v#Ca_na5p4z*V)FCQMGOjv ze}yiYfLlgbHp1Vpvq&q5UrL8*vP7)WON?GgNa-x%$Lg|!EHGV>luSUMW4r)uLg9KE zbW7KeE}%UC;+m2ODrUV()ZI}{@i`RS66>V&oH(TUcoAVicrq+u^jxI{LkU(#n;Eq^ zdxIR36zL4A@y{wPqt7ZVig~sqe=Ld?R%S?WL~O4m{j9>hF;t;k>7y^!TwcB8+P z*cs{C=(QT9;B%N35uaHc{&54B9|&%L0awVH+C&#}SVnaUwaE!cXS+ zk9Bi)w49c&onhh`i*Tq1Yzy#x6uY*L23{Ea;IU=Ed6d}N@!4GtxsHR-Hx~UU2n|T= zguZ7^%g_tiF1o+}U}OJbXU9J5bhq}5_I`ZfvCY8l!OrIXf!Q|xb~}3yjqd)&gFSXLBl9M zbB0ny>5;fFaf?63VMiyn)H*Q%xzV+MX-u{w6=&sr&ovFseqe!VL}*J6M%*`53qqHf zip3-^`AH2{NMK0v5xLDTjt&wpPL4K09fg5&>>>>>> parent of 47c6a2a... Bugfix + # Iterate through all of the reviews and find sentiment count = 0.00 correct = 0.00 - for reviews in movie_reviews.fileids(): + for reviews in movie_reviews.fileids(): #For every review score = 0; -<<<<<<< HEAD tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[reviews]))) #Tokenize all words with POS for token in tokens: if (token[1]== "JJ" or token[1] == "JJR" or token[1] == "JJS"): # If adjective, check value @@ -82,15 +76,6 @@ class GlossCount: score = score - 1 elif(sent_value is 'pos'): score = score + 1 -======= - for words in movie_reviews.words(fileids=[reviews]): - if() - sent_value = classifier.classify(dict([(word, True)])) - if(sent_value is 'neg'): - score = score - 1 - elif(sent_value is 'pos'): - score = score + 1 ->>>>>>> parent of 47c6a2a... Bugfix if (score < 0): print "Negative at %d" % (score) sentiment = 'neg' @@ -98,8 +83,12 @@ class GlossCount: sentiment = 'pos' print "Positive at %d" % (score) if (sentiment == movie_reviews.categories(fileids=[reviews])[0]): + print "Correct" correct = correct + 1.00 count = count + 1.00 print correct/count + + + GlossCount().demo() \ No newline at end of file diff --git a/MPQALexicon.pyc b/MPQALexicon.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4817950205c82c4f7f978b06cbc8130910dd2fd GIT binary patch literal 918 zcmb7CO>fgc5PfSW&Q~M2l?o0$0E&c&NF^>55T!-pkVd6^L_z_{&2FL%uI+d?P$XL% z%J1U;a70{?xG*z8?S%`rcjwK_n>YL6@MpLEZS(h=DgHi+>=PQUOOpn^0&fW@yzB)O z5#9!Yh{iq*w?h?_t^f~|z&T-sQemNR4pf2QfU%mBAA;u^&VHZ_aPaOssyfO#79ljT zs5)Xs8By*Uss?n3vVji~iZO%t0hVJX`Pv2VI`b&EsB;@u3R#J%UqrHrvI$+o7MuFo zMA!<0s)eeJs)MovOTP&l>8asOJEN?0%wuMOeAH%^wUNy}-HIU~sBiS~9e)oa<9G@4o3P<(1+^)nH;6y<$5DNEuL zlQ>c3w8UoCB$Vb23ls@094=gIaosU5zDHv{xjTgRA5305Q@F`q>a*0wlezVir*?i2 z`*?DA^kQ#l&g?Wx`}z9Vk}RhGm+s*<>!?U=(Ay>(sjj-BZmGW7mRMI^CEHb-YSiO` aW4^u1=7Rov1$<8rPcywr%wt&v`S=4DySTCd literal 0 HcmV?d00001 diff --git a/cblexicon.py b/cblexicon.py index 74dba34..2afaa4d 100644 --- a/cblexicon.py +++ b/cblexicon.py @@ -1,26 +1,119 @@ import math import nltk from nltk.corpus import wordnet as wn -from nltk.corpus import brown as sc from collections import Counter +import numpy +from nltk.corpus import movie_reviews +import nltk.stem +from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance +from nltk.classify import NaiveBayesClassifier +import random +from nltk.stem import * + from sets import Set class cblexicon: - def genSets(self): - f = open('words.txt', 'r+') - content = f.readlines() - positive = Set([]) - negative = Set([]) + def process(self): + + def normalize_word(word): + return SnowballStemmer("english").stem(word) + + def vectorspaced(title,CS,DF): + title_components = CS[title][1] + return numpy.array([ + word in title_components + for word in DF], numpy.short) + + def word_feats(words): + return dict([(word, True) for word in words]) + + def genSets(): + f = open('words.txt', 'r+') + content = f.readlines() + positive = Set([]) + negative = Set([]) - for pair in content: - current = pair.split(' ') - if (current[1][0] == 'p'): - positive.add(current[0]) - elif (current[1][0] == 'n'): - negative.add(current[0]) + for pair in content: + current = pair.split(' ') + if (current[1][0] == 'p'): + positive.add(current[0]) + elif (current[1][0] == 'n'): + negative.add(current[0]) + return positive,negative + + def getConj(): + f = open('conj.txt', 'r+') + content = f.readlines() + d = dict() + i = 0 + for line in content: + current = line.split(' ') + #Add the first adjective + if current[0] in d: + d[current[0]][1].add(current[1]) + else: + d[current[0]] = (i,Set([current[1]])) + i = i+1 + #Add the second adjective + if current[1] in d: + d[current[1]][1].add(current[0]) + else: + d[current[1]] = (i,Set([current[0]])) + i = i+1 + return d + + #Get the Data# + negids = movie_reviews.fileids('neg') + posids = movie_reviews.fileids('pos') + training = set(negids[:500] + posids[:500]) + testing = set(negids[500:] + posids[500:]) + # Generate positive and negative initial sets + sets = genSets() + positive = random.sample(sets[0], min(len(sets[0]), len(sets[1]))) + negative = random.sample(sets[1], min(len(sets[0]), len(sets[1]))) print len(positive) print len(negative) -cblexicon().genSets() \ No newline at end of file + # Clustering Setup + stopwords = set(nltk.corpus.stopwords.words('english')) + # Create dictionary (adj, (index,[associated words])) + conjSet = getConj() + print conjSet + + # Create list out of all keys of conjSet + defSet = conjSet.keys() + + # Its Cluster time + cluster = KMeansClusterer(2, euclidean_distance) + print conjSet["young"] + z = vectorspaced("young",conjSet,defSet) + + for num in z: + if num == 1: + print "one" + + + #cluster.cluster([vectorspaced(title,conjSet,defSet) for title in defSet if title]) + cluster.cluster(vectorspaced("young",conjSet,defSet)) + cluster.cluster(vectorspaced("stiff",conjSet,defSet)) + classified_examples = [ + cluster.classify(vectorspaced(title,conjSet,defSet)) for title in defSet + ] + print classified_examples + + + + + # Can we classify and then run bag of words? + #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids] + #posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids] + #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative] + #testfeats = negfeats[500:] + posfeats[500:] + #classifier1 = NaiveBayesClassifier.train(trainfeats) + #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg")) + + + +cblexicon().process() \ No newline at end of file diff --git a/getAdjectives.py b/getAdjectives.py index af79093..8b5423c 100644 --- a/getAdjectives.py +++ b/getAdjectives.py @@ -4,6 +4,37 @@ from nltk.corpus import brown as sc from collections import Counter + +def genConj(training): + conj = open('conj.txt', 'r+') + ands = open('ands.txt', 'r+') + ors = open('ors.txt', 'r+') + buts = open('buts.txt', 'r+') + nor = open('nor.txt', 'r+') + eor = open('eor.txt', 'r+') + j = 0; + for review in training: #For every review + tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[review]))) + print j + j = j+1 + for i in range(0,len(tokens)-3): + if ((tokens[i][1]== "JJ" or tokens[i][1] == "JJR" or tokens[i][1] == "JJS") and (tokens[i+2][1]== "JJ" or tokens[i+2][1] == "JJR" or tokens[i+2][1] == "JJS")): + if (tokens[i+1][0] == "and"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + ands.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + elif (tokens[i+1][0] == "or"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + ors.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + elif (tokens[i+1][0] == "but"+ "\n"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + buts.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + elif (tokens[i+1][0] == "either-or"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + eor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + elif (tokens[i+1][0] == "neither-nor"): + conj.write(tokens[i][0]+ " " + tokens[i+2][0] + " " + tokens[i+1][0] + "\n") + nor.write(tokens[i][0]+ " " + tokens[i+2][0]+ "\n") + f = open('words.txt', 'r+') list1 = [] for word in sc.tagged_sents():