Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Working on clustering in cblexicon
- Loading branch information
Showing 5 changed files with 143 additions and 30 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,119 @@ | ||
import math | ||
import nltk | ||
from nltk.corpus import wordnet as wn | ||
from nltk.corpus import brown as sc | ||
from collections import Counter | ||
import numpy | ||
from nltk.corpus import movie_reviews | ||
import nltk.stem | ||
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | ||
from nltk.classify import NaiveBayesClassifier | ||
import random | ||
from nltk.stem import * | ||
|
||
from sets import Set | ||
|
||
class cblexicon:
    """Build a sentiment lexicon by clustering conjunction-linked adjectives.

    Seed polarity words are read from ``words.txt`` and conjunction-joined
    adjective pairs from ``conj.txt``; adjectives are then K-means clustered
    into two groups (intended: positive vs. negative).

    NOTE(review): input-file formats are inferred from the parsing code —
    ``words.txt`` lines look like "<word> <p...|n...>" and ``conj.txt`` lines
    look like "<adj1> <adj2> ..."; confirm against the actual data files.
    """

    def genSets(self):
        """Read seed polarity words from ``words.txt``.

        Returns:
            tuple[set, set]: ``(positive, negative)`` word sets. A word goes
            into ``positive`` when its second field starts with ``'p'`` and
            into ``negative`` when it starts with ``'n'``.
        """
        positive = set()
        negative = set()
        # 'r' (not the original 'r+'): this method only reads; `with`
        # guarantees the handle is closed (the original leaked it).
        with open('words.txt', 'r') as f:
            for pair in f:
                current = pair.split(' ')
                # Only the first letter of the polarity token is inspected.
                if current[1][0] == 'p':
                    positive.add(current[0])
                elif current[1][0] == 'n':
                    negative.add(current[0])
        # The original built these sets and silently dropped them; returning
        # them is backward-compatible (callers that ignored None still work).
        return positive, negative

    def process(self):
        """Run the full clustering pipeline (prints diagnostics to stdout)."""

        def normalize_word(word):
            # Stem with the English Snowball stemmer.
            return SnowballStemmer("english").stem(word)

        def vectorspaced(title, CS, DF):
            # Binary membership vector: for each vocabulary word in DF, 1 if
            # it co-occurs with `title` in a conjunction, else 0.
            title_components = CS[title][1]
            return numpy.array(
                [word in title_components for word in DF], numpy.short)

        def word_feats(words):
            # Bag-of-words feature dict for an NLTK classifier.
            return dict((word, True) for word in words)

        def genSets():
            # Seed polarity sets from 'words.txt'; same contract as the
            # method-level genSets above.
            positive = set()
            negative = set()
            with open('words.txt', 'r') as f:
                for pair in f:
                    current = pair.split(' ')
                    if current[1][0] == 'p':
                        positive.add(current[0])
                    elif current[1][0] == 'n':
                        negative.add(current[0])
            return positive, negative

        def getConj():
            # Map: adjective -> (index, set of adjectives it is joined with
            # by a conjunction in 'conj.txt').
            d = dict()
            i = 0
            with open('conj.txt', 'r') as f:
                for line in f:
                    current = line.split(' ')
                    # Add the first adjective of the pair.
                    if current[0] in d:
                        d[current[0]][1].add(current[1])
                    else:
                        d[current[0]] = (i, set([current[1]]))
                        i = i + 1
                    # Add the second adjective of the pair.
                    if current[1] in d:
                        d[current[1]][1].add(current[0])
                    else:
                        d[current[1]] = (i, set([current[0]]))
                        i = i + 1
            return d

        # Get the data: movie-review file ids for later experiments.
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])

        # Generate positive and negative initial sets, balanced to equal size.
        # random.sample requires a sequence (sampling from a set raises
        # TypeError on Python 3.11+), so sort the sets first; sorting also
        # keeps the sampling population order deterministic.
        sets = genSets()
        sample_size = min(len(sets[0]), len(sets[1]))
        positive = random.sample(sorted(sets[0]), sample_size)
        negative = random.sample(sorted(sets[1]), sample_size)
        print(len(positive))
        print(len(negative))

        # Clustering setup.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # adjective -> (index, set of conjunction-linked adjectives)
        conjSet = getConj()
        print(conjSet)

        # Vocabulary: every adjective seen in a conjunction pair.
        defSet = list(conjSet.keys())

        # It's cluster time: K-means with two clusters.
        cluster = KMeansClusterer(2, euclidean_distance)
        print(conjSet["young"])
        z = vectorspaced("young", conjSet, defSet)
        for num in z:
            if num == 1:
                print("one")

        # KMeansClusterer.cluster expects a LIST of vectors; the original
        # passed single vectors in separate calls, which mis-trains the
        # clusterer. Cluster every adjective's vector in one call.
        cluster.cluster(
            [vectorspaced(title, conjSet, defSet) for title in defSet if title])
        classified_examples = [
            cluster.classify(vectorspaced(title, conjSet, defSet))
            for title in defSet
        ]
        print(classified_examples)

        # TODO(review): the original carried commented-out NaiveBayes
        # bag-of-words experiments here (word_feats over movie_reviews);
        # resurrect from history if that direction is pursued.
cblexicon().process() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters