diff --git a/cblexicon.py b/cblexicon.py
index d07521c..b2375a8 100644
--- a/cblexicon.py
+++ b/cblexicon.py
@@ -17,6 +17,44 @@ class cblexicon:
 
     def process(self):
 
+        def optimize(set1,set2,conjSet,defSet,dis):
+            i = 0
+            currentMin = 999999
+            consideredMin = calcScore(set1,set2,conjSet,dis)
+            bestSwapWord = ""
+            # Calculate the best word to remove until no moves lessen the function
+            while( currentMin > consideredMin):
+                print i
+                i = i + 1
+                currentMin = consideredMin
+                for word in list(set1):
+                    set1.remove(word)
+                    set2.append(word)
+                    test = calcScore(set1,set2,conjSet,dis)
+                    set2.remove(word)
+                    set1.append(word)
+                    if (test < consideredMin):
+                        consideredMin = test
+                        bestSwapWord = word
+                for word in list(set2):
+                    set2.remove(word)
+                    set1.append(word)
+                    test = calcScore(set1,set2,conjSet,dis)
+                    set1.remove(word)
+                    set2.append(word)
+                    if (test < consideredMin):
+                        consideredMin = test
+                        bestSwapWord = word
+
+                if(bestSwapWord in set1):
+                    set1.remove(bestSwapWord)
+                    set2.append(bestSwapWord)
+                else:
+                    set2.remove(bestSwapWord)
+                    set1.append(bestSwapWord)
+            # Return the optimized sets
+            return set1,set2
+
         def cluster(set1,set2,conjSet,defSet,dis):
             for word in set1:
                 score1 = calcScore(word,set1,conjSet,dis)
@@ -35,13 +73,23 @@ class cblexicon:
                 set1.append(word)
             return set1,set2
 
-        def calcScore(curr,set,conjSet,dis):
-            score = 0
-            for word in set:
-                if word != curr:
-                    cats = dis[conjSet[curr][0]][conjSet[word][0]]
-                    score = score + cats
-            return score * (1.0/len(set))
+        def calcScore(set1,set2,conjSet,dis):
+            score1 = 0
+            score2 = 0
+            for curr in set1:
+                for word in set1:
+                    if word != curr:
+                        cats = dis[conjSet[curr][0]][conjSet[word][0]]
+                        score1 = score1 + cats
+            score1 = score1 * (1.0/len(set1))
+
+            for curr in set2:
+                for word in set2:
+                    if word != curr:
+                        cats = dis[conjSet[curr][0]][conjSet[word][0]]
+                        score2 = score2 + cats
+            score2 = score2 * (1.0/len(set2))
+            return score1 + score2
 
         def normalize_word(word):
             return SnowballStemmer("english").stem(word)
@@ -120,6 +168,7 @@ class cblexicon:
 
         # Clustering Setup
stopwords = set(nltk.corpus.stopwords.words('english')) + # Create dictionary (adj, (index, similar, dissimilar)) conjSet = getConj() @@ -129,17 +178,26 @@ class cblexicon: # Generate dissimilarity matrix dis = vectorize(conjSet,defSet) - # Its Cluster time set1 = defSet[len(defSet)//2:] set2 = defSet[:len(defSet)//2] - for i in range(0,10): - sets = cluster(set1,set2,conjSet,defSet,dis) - set1 = sets[0] - set2 = sets[1] + # Optimize objective function + sets = optimize(set1,set2,conjSet,defSet,dis) + set1 = sets[0] + set2 = sets[1] + + print(set1) + print(set2) + + f = open('set1.txt', 'w+') + f2 = open('set1.txt', 'w+') + for word in set1: + f.write(word + "/n") + for word in set2: + f2.write(word + "/n") + - print len(set2) # Can we classify and then run bag of words? #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]