minimize the objective function

job13011 · Apr 12, 2016 · 480809d · 480809d
1 parent ae237e3
commit 480809d
Showing 1 changed file with 71 additions and 13 deletions.
diff --git a/cblexicon.py b/cblexicon.py
@@ -17,6 +17,44 @@ class cblexicon:
 
     def process(self):
 
+        def optimize(set1,set2,conjSet,defSet,dis):
+            """Greedily move words between the two sets to lower calcScore.
+
+            Each pass scores every single-word move (set1 -> set2 and
+            set2 -> set1) and applies only the move that lowers the
+            objective the most. The search stops after the first pass in
+            which no move lowers the objective.
+
+            set1, set2 -- lists of words; modified in place and returned.
+            conjSet, dis -- passed through unchanged to calcScore.
+            defSet -- not used here; kept so existing callers keep working.
+
+            Returns the tuple (set1, set2).
+            """
+            i = 0
+            currentMin = calcScore(set1,set2,conjSet,dis)
+            while True:
+                print(i)
+                i = i + 1
+                # Best improving move of this pass as (source, target, index);
+                # stays None when nothing beats the current score.
+                bestMove = None
+                for source, target in ((set1, set2), (set2, set1)):
+                    # Never empty a set: a one-word source is left alone so
+                    # both clusters stay non-empty (an empty set has no
+                    # meaningful average score).
+                    if len(source) < 2:
+                        continue
+                    for index, word in enumerate(source):
+                        # Score the move on copies so the lists are not
+                        # modified while they are being iterated.
+                        trialSource = source[:index] + source[index + 1:]
+                        trialTarget = target + [word]
+                        if source is set1:
+                            test = calcScore(trialSource,trialTarget,conjSet,dis)
+                        else:
+                            test = calcScore(trialTarget,trialSource,conjSet,dis)
+                        if test < currentMin:
+                            currentMin = test
+                            bestMove = (source, target, index)
+
+                if bestMove is None:
+                    # No single move lowers the objective any further.
+                    break
+                # Apply exactly the move that produced the lowest score.
+                source, target, index = bestMove
+                target.append(source.pop(index))
+            # Return the optimized sets
+            return set1,set2
+
         def cluster(set1,set2,conjSet,defSet,dis):
             for word in set1:
                 score1 = calcScore(word,set1,conjSet,dis)
@@ -35,13 +73,23 @@ def cluster(set1,set2,conjSet,defSet,dis):
                     set1.append(word)
             return set1,set2
 
-        def calcScore(curr,set,conjSet,dis):
-            score = 0
-            for word in set:
-                if word != curr:
-                    cats = dis[conjSet[curr][0]][conjSet[word][0]]
-                    score = score + cats
-            return score * (1.0/len(set))
+        def calcScore(set1,set2,conjSet,dis):
+            """Return the objective: the sum of both sets' average dissimilarity.
+
+            For each set, adds dis[conjSet[a][0]][conjSet[b][0]] over every
+            ordered pair of unequal words (a, b) in that set and divides the
+            total by the set's length. An empty set contributes 0.0 instead
+            of raising ZeroDivisionError.
+            """
+            def setScore(words):
+                # Average pairwise dissimilarity inside one set.
+                if len(words) == 0:
+                    return 0.0
+                total = 0
+                for curr in words:
+                    for word in words:
+                        if word != curr:
+                            cats = dis[conjSet[curr][0]][conjSet[word][0]]
+                            total = total + cats
+                return total * (1.0/len(words))
+
+            return setScore(set1) + setScore(set2)
 
         def normalize_word(word):
+            # Reduce word to its stem with the English Snowball stemmer; a new
+            # stemmer object is constructed on every call.
             return SnowballStemmer("english").stem(word)
@@ -120,6 +168,7 @@ def getConj():
 
         # Clustering Setup
         stopwords = set(nltk.corpus.stopwords.words('english'))
+
         # Create dictionary (adj, (index, similar, dissimilar))
         conjSet = getConj()
 
@@ -129,17 +178,26 @@ def getConj():
         # Generate dissimilarity matrix
         dis = vectorize(conjSet,defSet)
 
-
         # Its Cluster time
         set1 = defSet[len(defSet)//2:]
         set2 = defSet[:len(defSet)//2]
 
-        for i in range(0,10):
-            sets = cluster(set1,set2,conjSet,defSet,dis)
-            set1 = sets[0]
-            set2 = sets[1]
+        # Optimize objective function
+        sets = optimize(set1,set2,conjSet,defSet,dis)
+        set1 = sets[0]
+        set2 = sets[1]
+
+        print(set1)
+        print(set2)
+
+        # Write each cluster to its own file, one word per line. The with
+        # blocks close the files so the output is flushed to disk.
+        with open('set1.txt', 'w') as f:
+            for word in set1:
+                f.write(word + "\n")
+        with open('set2.txt', 'w') as f2:
+            for word in set2:
+                f2.write(word + "\n")
+
 
-        print len(set2)
 
         # Can we classify and then run bag of words?
         #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]