diff --git a/cblexicon.py b/cblexicon.py index a329d90..6054d4a 100644 --- a/cblexicon.py +++ b/cblexicon.py @@ -59,15 +59,15 @@ class cblexicon: i = 0 currentMin = 999999 consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis) - bestSwapWord = "" + bestSwapWord = None print consideredMin # Calculate the best word to remove until no moves lessen the function while( currentMin > consideredMin): - print i - i = i + 1 + print "Iteration #%d: (%d, %d)" % (i, len(set1), len(set2)) currentMin = consideredMin currentS1 = calcScore(set1,conjSet,dis) currentS2 = calcScore(set2,conjSet,dis) + consideredMin = currentS1 + currentS2 # for word in set1: test = calcSwap(word,set1,set2,currentS1,currentS2,conjSet,dis) if (test < consideredMin): @@ -75,19 +75,20 @@ class cblexicon: consideredMin = test bestSwapWord = word for word in set2: - test = calcSwap(word,set2,set1,currentS1,currentS2,conjSet,dis) + test = calcSwap(word,set2,set1,currentS2,currentS1,conjSet,dis) if (test < consideredMin): print "found2" consideredMin = test bestSwapWord = word - print consideredMin + print "New min: %f" % consideredMin if(bestSwapWord in set1): set1.remove(bestSwapWord) set2.append(bestSwapWord) - else: + elif(bestSwapWord in set2): set2.remove(bestSwapWord) set1.append(bestSwapWord) + i = i + 1 # Return the optimized sets return set1,set2 @@ -112,12 +113,13 @@ class cblexicon: def calcScore(set,conjSet,dis): score = 0 - for curr in set: - for word in set: - if word != curr: - cats = dis[conjSet[curr][0]][conjSet[word][0]] - score = score + cats - return score * (1.0/len(set1)) + for i in range(len(set)): + w1 = set[i] + for j in range(i, len(set)): + w2 = set[j] + cats = dis[conjSet[w1][0]][conjSet[w2][0]] + score = score + cats + return score / len(set) def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis): score1 = 0 @@ -126,13 +128,14 @@ class cblexicon: if word != w: cats = dis[conjSet[word][0]][conjSet[w][0]] score1 = score1 + cats - currentCount = ((currentCount* len(currSet)) - score1 ) * (1/(len(currSet)-1)) + currentCount = ((currentCount* len(currSet)) - score1 )/(len(currSet)-1) - for word in set2: + #for word in set2: + for w in opSet: if word != w: cats = dis[conjSet[word][0]][conjSet[w][0]] score2 = score2 + cats - otherCount = ((otherCount* len(opSet)) + score2 ) * (1/(len(opSet)-1)) + otherCount = ((otherCount* len(opSet)) + score2 )/(len(opSet)+1) return currentCount + otherCount @@ -202,10 +205,12 @@ class cblexicon: return d #Get the Data# + """ negids = movie_reviews.fileids('neg') posids = movie_reviews.fileids('pos') training = set(negids[:500] + posids[:500]) testing = set(negids[500:] + posids[500:]) + """ # Generate positive and negative initial sets sets = genSets() positive = random.sample(sets[0], min(len(sets[0]), len(sets[1]))) @@ -223,25 +228,29 @@ class cblexicon: # Generate dissimilarity matrix dis = vectorize(conjSet,defSet) + """ # Its Cluster time set1 = defSet[len(defSet)//2:] set2 = defSet[:len(defSet)//2] - + """ + set1 = random.sample(defSet, len(defSet)//4) + set2 = [x for x in defSet if x not in set1] + # Optimize objective function sets = optimize2(set1,set2,conjSet,defSet,dis) set1 = sets[0] set2 = sets[1] - + print(set1) print(set2) - + """ f = open('set1.txt', 'w+') f2 = open('set1.txt', 'w+') for word in set1: f.write(word + "/n") for word in set2: f2.write(word + "/n") - + """ # Can we classify and then run bag of words?