From 66918f812ec6fa757dff549d6ea14c09dc5e84a6 Mon Sep 17 00:00:00 2001
From: Antonia Lewis <antonia.lewis@uconn.edu>
Date: Sat, 2 Apr 2016 18:07:08 -0400
Subject: [PATCH] Working on clustering in cblexicon

---
 BagOfWords.pyc   | Bin 0 -> 1790 bytes
 GlossCount.py    |  23 +++------
 MPQALexicon.pyc  | Bin 0 -> 918 bytes
 cblexicon.py     | 119 +++++++++++++++++++++++++++++++++++++++++------
 getAdjectives.py |  31 ++++++++++++
 5 files changed, 143 insertions(+), 30 deletions(-)
 create mode 100644 BagOfWords.pyc
 create mode 100644 MPQALexicon.pyc
diff --git a/BagOfWords.pyc b/BagOfWords.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5e3b492756a1aa524f515753b01b6e68242566f
GIT binary patch
literal 1790
zcmbVMOK%%h6#niwcKnQ!*lucxN3($lp(5xa2qMxnDNU;?PMw5ER?29|OeSMHGx5xg
zNYQwe@*nsa`~sG&*|TBUZAFnPv4HQ~aUHs49%s%y_nhxM@8tfhm48|K`<r7-t}?%O
z`6fS-xO^0bKyS(`^b}kRZVv7Y+&ugl^tdd*orPP3TY_7LpGS`h6}VNnHTXI7s5}R^
z4tL%x^MXv?n|=E!!XwB6o|n)gl)~Q1A|na@;kBc4WWz`&c@ZVyhdL=pn1L(otgw+M
zVy(#EHgwt0bwfWk^h*O{fBGc*ezu)`?>zBo#Ot^93G-q-5S(MFNsF)e+T@#zEV>8A
zWu!U8Eokx;gaGmT7E<6!VO&L;N9-734r$d0SqvbpVe9EljOU<d(9a`fWfZO~d~0EJ
zS1gHj`IFKG=-G*7YT+j9zC`4$r>Pp(-`LVk-8|e}&S1v#Ca_na5p4z*V)FCQMGOjv
ze}yiYfLlgbHp1Vpvq&q5UrL8*vP7)WON?GgNa-x%$Lg|!EHGV>luSUMW4r)uLg9KE
zbW7KeE}%UC;+m2ODrUV()ZI}{@i`RS66>V&oH(TUcoAVicrq+u^jxI{LkU(#n;Eq^
zdxIR36zL4A@y{wPqt7ZVig~sqe=Ld?R%S?WL~O4m{j9>hF;t;k>7y^!TwcB8+P
z*cs{C=(QT9;B%N35uaHc{&54B9|&%L0awVH+C<tguA1~-Kx36kqn33@DUZmen0)eI
zllL%gn$j^knlJ%swj|aj)<)ZwDNvc`pvckNF%xW?x#a9n(Al>&#}SVnaUwaE!cXS+
zk9Bi)w49c&onhh`i*Tq1Yzy#x6uY*L23{Ea;IU=Ed6d}N@!4GtxsHR-Hx~UU2n|T=
zguZ7^%g_tiF1o+}U}OJbXU9J5bhq}5_I`ZfvCY8l!OrIXf!Q|xb~}3yjqd)&gFS<H
zGK?Ip8=^bi+wR`q+wMG`fSo$YsR@mYJ@B4oIY(iZ3I)lEoJ=Fu-laCWv3FvB$224e
zKX%S+M(gWSjwf)Dcy{Rdjt-)5Dmah5#0!tTYe80sWGZmnz8!_bXHx-76&Q>XLBl9M
zbB0ny>5;fFaf?63VMiyn)H*Q%xzV+MX-u{w6=&sr&ovFseqe!VL}*J6M%*`53qqHf
zip3-^`AH2{NMK0v5xLDTjt&wpPL4K09fg5&<OllbUf^#z+Bv%C_??qO3F-EER?;(P
z;K@uRU4c;r-X*I}T30J-S-oo&RF!whyam<Zx~*=hIaSF~$0}I2)D23@xhgFhR+IbG
zqO45%zFLuAXjdF`=Y=$d+z_%Pq%GtfAyTrgbhLYi<T|JSl~|DwYiE&rG4#HY8YP0x
NH1dn8#W3r+e*y30W#Rw;

literal 0
HcmV?d00001

diff --git a/GlossCount.py b/GlossCount.py
index ded48b7..e05cf6b 100644
--- a/GlossCount.py
+++ b/GlossCount.py
@@ -61,19 +61,13 @@ def expand_sets(positive,negative,neutral):
         classifier = NaiveBayesClassifier.train(trainfeats)
         print "cat"
         #print classifier.classify(dict([(word, True) for word in words]))
-<<<<<<< HEAD
-        #print classifier.classify(dict([("bad",True),("bad",True)]))
-
-
-=======
         print classifier.classify(dict([("bad",True),("bad",True)]))
->>>>>>> parent of 47c6a2a... Bugfix
+
         # Iterate through all of the reviews and find sentiment
         count = 0.00
         correct = 0.00
-        for reviews in movie_reviews.fileids():
+        for reviews in movie_reviews.fileids():     #For every review
             score = 0;
-<<<<<<< HEAD
             tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[reviews])))     #Tokenize all words with POS
             for token in tokens:
                 if (token[1]== "JJ" or token[1] == "JJR" or token[1] == "JJS"):         # If adjective, check value
@@ -82,15 +76,6 @@ def expand_sets(positive,negative,neutral):
                         score = score - 1
                     elif(sent_value is 'pos'):
                         score = score + 1
-=======
-            for words in movie_reviews.words(fileids=[reviews]):
-                if()
-                sent_value = classifier.classify(dict([(word, True)]))
-                if(sent_value is 'neg'):
-                    score = score - 1
-                elif(sent_value is 'pos'):
-                    score = score + 1
->>>>>>> parent of 47c6a2a... Bugfix
             if (score < 0):
                 print "Negative at %d" % (score)
                 sentiment = 'neg'
@@ -98,8 +83,12 @@ def expand_sets(positive,negative,neutral):
                 sentiment = 'pos'
                 print "Positive at %d" % (score)
             if (sentiment == movie_reviews.categories(fileids=[reviews])[0]):
+                print "Correct"
                 correct = correct + 1.00
             count = count + 1.00
         print correct/count
 
+
+
+
 GlossCount().demo()
\ No newline at end of file
diff --git a/MPQALexicon.pyc b/MPQALexicon.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4817950205c82c4f7f978b06cbc8130910dd2fd
GIT binary patch
literal 918
zcmb7CO>fgc5PfSW&Q~M2l?o0$0E&c&NF^>55T!-pkVd6^L_z_{&2FL%uI+d?P$XL%
z%J1U;a70{?xG*z8?S%`rcjwK_n>YL6@MpLEZS(h=DgHi+>=PQUOOpn^0&fW@yzB)O
z5#9!Yh{iq*w?h?_t^f~|z&T-sQemNR4pf2QfU%mBAA;u^&VHZ_aPaOssyfO#79ljT
zs5)Xs8By*Uss?n3vVji~iZO%t0hVJX`Pv2VI`b&EsB;@u3R#J%UqrHrvI$+o7MuFo
zMA!<0s)eeJs)MovO<e*MyC~bxO~JoVhFC->TP&l>8asOJEN?0%wuMOeAH<GS5plT-
z`o_c|iCI90$!9onH~0isCyQyaGA=Rc;O_I`t7nI!;nBVOeLtIdzK(5~ANe}rPg$V{
zp6@0vY*yr15*OBgrn8_(<_10_5bK3+t^?NFk4A!&m0)V-@kNlFUW1rYpXDa?E%FKz
z>%^wUNy}-HIU~sBiS<I&cIrL<0)_L1%>~9e)oa<9G@4o3P<(1+^)nH;6y<$5DNEuL
zlQ>c3w8UoCB$Vb23ls@094=gIaosU5zDHv{xjTgRA5305Q@F`q>a*0wlezVir*?i2
z`*?DA^kQ#l&g?Wx`}z9Vk}RhGm+s*<>!?U=(Ay>(sjj-BZmGW7mRMI^CEHb-YSiO`
aW4^u1=7Rov1$<8rPcywr%wt&v`S=4DySTCd

literal 0
HcmV?d00001

diff --git a/cblexicon.py b/cblexicon.py
index 74dba34..2afaa4d 100644
--- a/cblexicon.py
+++ b/cblexicon.py
@@ -1,26 +1,119 @@
 import math
 import nltk
 from nltk.corpus import wordnet as wn
-from nltk.corpus import brown as sc
 from collections import Counter
+import numpy
+from nltk.corpus import movie_reviews
+import nltk.stem
+from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
+from nltk.classify import NaiveBayesClassifier
+import random
+from nltk.stem import *
+
 from sets import Set
 
 class cblexicon:
 
-    def genSets(self):
-        f = open('words.txt', 'r+')
-        content = f.readlines()
-        positive = Set([])
-        negative = Set([])
+    def process(self):
+
+        def normalize_word(word):
+            return SnowballStemmer("english").stem(word)
+
+        def vectorspaced(title,CS,DF):
+            title_components =  CS[title][1]
+            return numpy.array([
+                 word in title_components
+                 for word in DF], numpy.short)
+
+        def word_feats(words):
+            return dict([(word, True) for word in words])
+
+        def genSets():
+            f = open('words.txt', 'r+')
+            content = f.readlines()
+            positive = Set([])
+            negative = Set([])
 
-        for pair in content:
-            current = pair.split(' ')
-            if (current[1][0] == 'p'):
-                positive.add(current[0])
-            elif (current[1][0] == 'n'):
-                negative.add(current[0])
+            for pair in content:
+                current = pair.split(' ')
+                if (current[1][0] == 'p'):
+                    positive.add(current[0])
+                elif (current[1][0] == 'n'):
+                    negative.add(current[0])
 
+            return positive,negative
+
+        def getConj():
+            """Build the conjunction graph from conj.txt.
+
+            Returns a dict mapping each adjective to (index, Set of the
+            adjectives it was conjoined with); indices follow first appearance.
+            """
+            d = dict()
+            # Read-only is enough; 'with' closes the handle the original leaked
+            with open('conj.txt', 'r') as f:
+                for line in f:
+                    current = line.split()
+                    if len(current) < 2:
+                        continue    # blank or truncated line: no pair to record
+                    # Link the pair in both directions
+                    for word, other in ((current[0], current[1]), (current[1], current[0])):
+                        if word in d:
+                            d[word][1].add(other)
+                        else:
+                            d[word] = (len(d), Set([other]))    # next unused index
+            return d
+
+        #Get the Data#
+        negids = movie_reviews.fileids('neg')
+        posids = movie_reviews.fileids('pos')
+        training = set(negids[:500] + posids[:500])
+        testing = set(negids[500:] + posids[500:])
+        # Generate positive and negative initial sets
+        sets = genSets()
+        positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
+        negative = random.sample(sets[1], min(len(sets[0]), len(sets[1])))
         print len(positive)
         print len(negative)
 
-cblexicon().genSets()
\ No newline at end of file
+        # Clustering Setup
+        stopwords = set(nltk.corpus.stopwords.words('english'))
+        # Create dictionary (adj, (index,[associated words]))
+        conjSet = getConj()
+        print conjSet
+
+        # Create list out of all keys of conjSet
+        defSet = conjSet.keys()
+
+        # Its Cluster time
+        cluster = KMeansClusterer(2, euclidean_distance)
+        print conjSet["young"]
+        z =  vectorspaced("young",conjSet,defSet)
+
+        for num in z:
+            if num == 1:
+                print "one"
+
+
+        #cluster.cluster([vectorspaced(title,conjSet,defSet) for title in defSet if title])
+        cluster.cluster(vectorspaced("young",conjSet,defSet))
+        cluster.cluster(vectorspaced("stiff",conjSet,defSet))
+        classified_examples = [
+                cluster.classify(vectorspaced(title,conjSet,defSet)) for title in defSet
+        ]
+        print classified_examples
+
+
+
+
+        # Can we classify and then run bag of words?
+        #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
+        #posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
+        #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
+        #testfeats = negfeats[500:] + posfeats[500:]
+        #classifier1 = NaiveBayesClassifier.train(trainfeats)
+        #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))
+
+
+
+cblexicon().process()
\ No newline at end of file
diff --git a/getAdjectives.py b/getAdjectives.py
index af79093..8b5423c 100644
--- a/getAdjectives.py
+++ b/getAdjectives.py
@@ -4,6 +4,37 @@
 from collections import Counter
 
 
+
+def genConj(training):
+    """Write adjective pairs joined by a conjunction in *training* reviews to text files.
+
+    Each "ADJ <conj> ADJ" triple appends "adj1 adj2 conj" to conj.txt and
+    "adj1 adj2" to the file for that conjunction.
+    """
+    adj_tags = ("JJ", "JJR", "JJS")
+    # Conjunction token -> name of its per-conjunction output file
+    targets = {"and": 'ands.txt', "or": 'ors.txt', "but": 'buts.txt',
+               "either-or": 'eor.txt', "neither-nor": 'nor.txt'}
+    # 'w' truncates stale output from earlier runs (the old 'r+' left it in place)
+    conj = open('conj.txt', 'w')
+    outs = dict((word, open(name, 'w')) for word, name in targets.items())
+    try:
+        for j, review in enumerate(training):     #For every review
+            tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[review])))
+            print j
+            # Stop at len-2 so the final (i, i+1, i+2) triple is examined too
+            for i in range(len(tokens) - 2):
+                first, link, second = tokens[i], tokens[i+1], tokens[i+2]
+                # The old check compared against "but\n", so "but" pairs were never written
+                if first[1] in adj_tags and second[1] in adj_tags and link[0] in outs:
+                    conj.write(first[0] + " " + second[0] + " " + link[0] + "\n")
+                    outs[link[0]].write(first[0] + " " + second[0] + "\n")
+    finally:
+        # Close every handle so buffered pairs are flushed to disk
+        conj.close()
+        for handle in outs.values():
+            handle.close()
+
 f = open('words.txt', 'r+')
 list1 = []
 for word in sc.tagged_sents():