Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
minimize the objective function
  • Loading branch information
adl13006 committed Apr 12, 2016
1 parent ae237e3 commit 480809d
Showing 1 changed file with 71 additions and 13 deletions.
84 changes: 71 additions & 13 deletions cblexicon.py
Expand Up @@ -17,6 +17,44 @@ class cblexicon:

def process(self):

def optimize(set1, set2, conjSet, defSet, dis):
    """Greedily move single words between two clusters to lower calcScore.

    Every pass scores each possible single-word move (set1 -> set2 and
    set2 -> set1) with calcScore and applies only the one move that lowers
    the objective the most.  Passes repeat until no move improves the score.

    Args:
        set1: list of words in the first cluster; modified in place.
        set2: list of words in the second cluster; modified in place.
        conjSet: passed through unchanged to calcScore.
        defSet: unused; kept so existing callers keep working.
        dis: passed through unchanged to calcScore.

    Returns:
        The tuple (set1, set2) -- the same list objects that were passed in.
    """
    best_score = calcScore(set1, set2, conjSet, dis)
    i = 0
    while True:
        print(i)  # progress trace: one line per pass
        i = i + 1

        # Best improving move found in this pass: (index, donor, receiver).
        # Reset every pass so a stale move from an earlier pass is never
        # re-applied (which would undo the previous improvement).
        best_move = None
        for donor, receiver, donor_is_first in (
            (set1, set2, True),
            (set2, set1, False),
        ):
            # Never empty a cluster: calcScore averages over the cluster
            # size, so an empty cluster has no defined score.
            if len(donor) < 2:
                continue
            for idx, word in enumerate(donor):
                # Score the move on copies so the lists being iterated are
                # never mutated mid-loop (that skipped candidates before).
                shrunk = donor[:idx] + donor[idx + 1:]
                grown = receiver + [word]
                if donor_is_first:
                    test = calcScore(shrunk, grown, conjSet, dis)
                else:
                    test = calcScore(grown, shrunk, conjSet, dis)
                if test < best_score:
                    best_score = test
                    best_move = (idx, donor, receiver)

        if best_move is None:
            # No single move lowers the objective any further.
            break
        idx, donor, receiver = best_move
        receiver.append(donor.pop(idx))

    # Return the optimized sets
    return set1, set2

def cluster(set1,set2,conjSet,defSet,dis):
for word in set1:
score1 = calcScore(word,set1,conjSet,dis)
Expand All @@ -35,13 +73,23 @@ class cblexicon:
set1.append(word)
return set1,set2

def calcScore(curr, set, conjSet, dis):
    """Return the size-normalised dissimilarity between curr and a cluster.

    Sums dis[i][j] over every word in `set` that is not equal to `curr`,
    where i and j are the first elements of conjSet[curr] and conjSet[word],
    then scales the sum by 1/len(set).  Note the divisor is the full cluster
    size, including `curr` itself when it is a member.

    Args:
        curr: the word being scored.
        set: the cluster (sequence of words) to score against.  The name
            shadows the builtin but is kept for interface compatibility.
        conjSet: mapping from word to a sequence whose element 0 is the
            word's index into dis.
        dis: two-level indexable dissimilarity table.

    Returns:
        The scaled sum as a float; 0.0 for an empty cluster.
    """
    # An empty cluster used to raise ZeroDivisionError; it contributes
    # nothing.  len() is used rather than truthiness so array-like
    # clusters also work.
    if len(set) == 0:
        return 0.0
    score = sum(
        dis[conjSet[curr][0]][conjSet[word][0]]
        for word in set
        if word != curr
    )
    return score * (1.0 / len(set))
def calcScore(set1, set2, conjSet, dis):
    """Return the clustering objective for a two-cluster partition.

    For each cluster, sums dis[i][j] over every ordered pair of unequal
    words in that cluster (i and j being the first elements of the words'
    conjSet entries) and scales the sum by 1/len(cluster).  The objective is
    the sum of the two per-cluster values.

    Args:
        set1: sequence of words in the first cluster.
        set2: sequence of words in the second cluster.
        conjSet: mapping from word to a sequence whose element 0 is the
            word's index into dis.
        dis: two-level indexable dissimilarity table.

    Returns:
        score(set1) + score(set2); an empty cluster contributes 0.0.
    """
    def cluster_score(cluster):
        # Scaled pairwise dissimilarity inside one cluster.
        # An empty cluster used to raise ZeroDivisionError; it has no
        # pairs, so it contributes nothing.
        if len(cluster) == 0:
            return 0.0
        total = 0
        for curr in cluster:
            for word in cluster:
                if word != curr:
                    total = total + dis[conjSet[curr][0]][conjSet[word][0]]
        return total * (1.0 / len(cluster))

    return cluster_score(set1) + cluster_score(set2)

def normalize_word(word):
    """Return the stem of `word` produced by an English SnowballStemmer."""
    stemmer = SnowballStemmer("english")
    stemmed = stemmer.stem(word)
    return stemmed
Expand Down Expand Up @@ -120,6 +168,7 @@ class cblexicon:

# Clustering Setup
stopwords = set(nltk.corpus.stopwords.words('english'))

# Create dictionary (adj, (index, similar, dissimilar))
conjSet = getConj()

Expand All @@ -129,17 +178,26 @@ class cblexicon:
# Generate dissimilarity matrix
dis = vectorize(conjSet,defSet)


# Cluster time: start from a split of defSet down the middle --
# set1 takes the second half, set2 the first half.
set1 = defSet[len(defSet)//2:]
set2 = defSet[:len(defSet)//2]

# Run cluster() ten times, keeping the two lists it returns each time.
# NOTE(review): indentation is lost in this view -- presumably the three
# lines after the `for` form its body; confirm against the real file.
for i in range(0,10):
sets = cluster(set1,set2,conjSet,defSet,dis)
set1 = sets[0]
set2 = sets[1]
# Optimize objective function: refine the partition with optimize() and
# keep the two lists it returns.
sets = optimize(set1,set2,conjSet,defSet,dis)
set1 = sets[0]
set2 = sets[1]

print(set1)
print(set2)

# Persist each cluster to its own file, one word per line.  The second
# handle previously reopened 'set1.txt' (so set2 never got its own file and
# the two handles clobbered each other), and "/n" was written instead of a
# newline.  `with` guarantees both files are flushed and closed.
with open('set1.txt', 'w') as f:
    for word in set1:
        f.write(word + "\n")
with open('set2.txt', 'w') as f2:
    for word in set2:
        f2.write(word + "\n")

# Call form of print works on both Python 2 and 3 for a single argument.
print(len(set2))

# Can we classify and then run bag of words?
#negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
Expand Down

0 comments on commit 480809d

Please sign in to comment.