Updates to Conjunctions

job13011 committed on Apr 10, 2016 · 235376b
1 parent cbc5649
commit 235376b
Show file tree

Hide file tree

Showing 2 changed files with 1,922 additions and 36 deletions.
diff --git a/cblexicon.py b/cblexicon.py
@@ -1,3 +1,4 @@
+from __future__ import division
 import math
 import nltk
 from nltk.corpus import wordnet as wn
@@ -16,14 +17,46 @@ class cblexicon:
 
     def process(self):
 
+        def cluster(set1,set2,conjSet,defSet,dis):
+            for word in set1:
+                score1 = calcScore(word,set1,conjSet,dis)
+                #print "Score 1: %f" % score1
+                score2 = calcScore(word,set2,conjSet,dis)
+                #print "Score 2: %f" % score2
+                if score2 < score1:
+                    print "swap"
+                    set1.remove(word)
+                    set2.append(word)
+            for word in set2:
+                score1 = calcScore(word,set1,conjSet,dis)
+                score2 = calcScore(word,set2,conjSet,dis)
+                if score1 < score2:
+                    set2.remove(word)
+                    set1.append(word)
+            return set1,set2
+
+        def calcScore(curr,set,conjSet,dis):
+            score = 0
+            for word in set:
+                if word != curr:
+                    cats = dis[conjSet[curr][0]][conjSet[word][0]]
+                    score = score + cats
+            return score * (1.0/len(set))
+
         def normalize_word(word):
             return SnowballStemmer("english").stem(word)
 
-        def vectorspaced(title,CS,DF):
-            title_components =  CS[title][1]
-            return numpy.array([
-                 word in title_components
-                 for word in DF], numpy.short)
+        def vectorize(conjSet,defSet):
+            dis = numpy.zeros((len(defSet),len(defSet)))
+            dis.fill(.5)
+            for word in defSet:
+                similar = conjSet[word][1]
+                dissimilar = conjSet[word][2]
+                for sim in similar:
+                    dis[conjSet[word][0]][conjSet[sim][0]] = 0
+                for d in dissimilar:
+                    dis[conjSet[word][0]][conjSet[d][0]] = 1
+            return dis
 
         def word_feats(words):
             return dict([(word, True) for word in words])
@@ -44,24 +77,35 @@ def genSets():
             return positive,negative
 
         def getConj():
+            # Set up the tuple (index, similar, dissimilar)
             f = open('conj.txt', 'r+')
             content = f.readlines()
             d = dict()
             i = 0
             for line in content:
                 current = line.split(' ')
-                #Add the first adjective
-                if current[0] in d:
-                    d[current[0]][1].add(current[1])
-                else:
-                    d[current[0]] = (i,Set([current[1]]))
-                    i = i+1
-                #Add the second adjective
-                if current[1] in d:
-                    d[current[1]][1].add(current[0])
+                if current[2] == "but":
+                    if current[0] in d:
+                        d[current[0]][2].add(current[1])
+                    else:
+                        d[current[0]] = (i,Set(),Set([current[1]]))
+                        i = i+1
+                    if current[1] in d:
+                        d[current[1]][2].add(current[0])
+                    else:
+                        d[current[1]] = (i,Set(),Set([current[0]]))
+                        i = i+1
                 else:
-                    d[current[1]] = (i,Set([current[0]]))
-                    i = i+1
+                    if current[0] in d:
+                        d[current[0]][1].add(current[1])
+                    else:
+                        d[current[0]] = (i,Set([current[1]]),Set())
+                        i = i+1
+                    if current[1] in d:
+                        d[current[1]][1].add(current[0])
+                    else:
+                        d[current[1]] = (i,Set([current[0]]),Set())
+                        i = i+1
             return d
 
         #Get the Data#
@@ -73,38 +117,29 @@ def getConj():
         sets = genSets()
         positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
         negative = random.sample(sets[1], min(len(sets[0]), len(sets[1])))
-        print len(positive)
-        print len(negative)
 
         # Clustering Setup
         stopwords = set(nltk.corpus.stopwords.words('english'))
-        # Create dictionary (adj, (index,[associated words]))
+        # Create dictionary (adj, (index, similar, dissimilar))
         conjSet = getConj()
-        print conjSet
 
         # Create list out of all keys of conjSet
         defSet = conjSet.keys()
 
-        # Its Cluster time
-        cluster = KMeansClusterer(2, euclidean_distance)
-        print conjSet["young"]
-        z =  vectorspaced("young",conjSet,defSet)
-
-        for num in z:
-            if num == 1:
-                print "one"
-
+        # Generate dissimilarity matrix
+        dis = vectorize(conjSet,defSet)
 
-        #cluster.cluster([vectorspaced(title,conjSet,defSet) for title in defSet if title])
-        cluster.cluster(vectorspaced("young",conjSet,defSet))
-        cluster.cluster(vectorspaced("stiff",conjSet,defSet))
-        classified_examples = [
-                cluster.classify(vectorspaced(title,conjSet,defSet)) for title in defSet
-        ]
-        print classified_examples
 
+        # Its Cluster time
+        set1 = defSet[len(defSet)//2:]
+        set2 = defSet[:len(defSet)//2]
 
+        for i in range(0,10):
+            sets = cluster(set1,set2,conjSet,defSet,dis)
+            set1 = sets[0]
+            set2 = sets[1]
 
+        print len(set2)
 
         # Can we classify and then run bag of words?
         #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]