Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
335 lines (300 sloc) 10.3 KB
from __future__ import division
import nltk
import numpy
import nltk.stem
from nltk.corpus import brown
import random
from nltk.stem import *
import time
import scipy
from sets import Set
"""
def optimize(set1, set2, conjSet, defSet, dis):
currentMin = 999999
consideredMin = calcScore(set1, set2, conjSet, dis)
bestSwapWord = ""
# Calculate the best word to remove until no moves lessen the function
i = 1
while( currentMin > consideredMin):
print i
currentMin = consideredMin
for word in set1:
set1.remove(word)
set2.append(word)
test = calcScore(set1, set2, conjSet, dis)
set2.remove(word)
set1.append(word)
if test < consideredMin:
consideredMin = test
bestSwapWord = word
for word in set2:
set2.remove(word)
set1.append(word)
test = calcScore(set1, set2, conjSet, dis)
set1.remove(word)
set2.append(word)
if test < consideredMin:
consideredMin = test
bestSwapWord = word
if bestSwapWord in set1:
set1.remove(bestSwapWord)
set2.append(bestSwapWord)
else:
set2.remove(bestSwapWord)
set1.append(bestSwapWord)
i = i + 1
# Return the optimized sets
return set1, set2
"""
def optimize2(set1, set2, conjSet, defSet, dis):
    """Greedily move single words between two clusters to lower the total score.

    Each pass scores every possible single-word move with calcSwap, applies
    the one with the lowest predicted total, and prints a progress line.
    The search stops as soon as a pass finds no move that lowers the total.

    Args:
        set1, set2: lists of words; both are modified in place.
        conjSet, dis: passed through to calcScore and calcSwap.
        defSet: unused; kept so existing callers keep working.

    Returns:
        The tuple (set1, set2), i.e. the same two list objects.
    """
    currentMin = 999999
    consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
    i = 1
    while currentMin > consideredMin:
        t1 = time.time()
        currentMin = consideredMin
        currentS1 = calcScore(set1, conjSet, dis)
        currentS2 = calcScore(set2, conjSet, dis)
        consideredMin = currentS1 + currentS2

        # The candidate is reset on every pass: carrying over the previous
        # pass's word would move it back and undo the last improvement.
        bestSwapWord = None
        source = None
        target = None
        for word in set1:
            test = calcSwap(word, set1, set2, currentS1, currentS2, conjSet, dis)
            if test < consideredMin:
                consideredMin = test
                bestSwapWord = word
                source, target = set1, set2
        for word in set2:
            test = calcSwap(word, set2, set1, currentS2, currentS1, conjSet, dis)
            if test < consideredMin:
                consideredMin = test
                bestSwapWord = word
                source, target = set2, set1

        if source is not None:
            source.remove(bestSwapWord)
            target.append(bestSwapWord)

        t2 = time.time()
        print("Iteration: %d\tScore: %f\tTime: %f sec" % (i, consideredMin, t2-t1))
        i += 1

        if source is None:
            # No single move lowers the score any further.
            break
    # Return the optimized sets
    return set1, set2
def constraintSwap(set1, set2, conjSet, defSet, dis):
    """Move every word that sits closer to the opposite cluster than to its own.

    A word is moved when its mean getDis to the other members of its own set
    is greater than its mean getDis to the members of the opposite set.
    set1 is processed first, then set2 (which by then includes any words
    just moved out of set1).

    Args:
        set1, set2: lists of words; both are modified in place.
        conjSet, defSet, dis: unused; kept so existing callers keep working.

    Returns:
        The tuple (set1, set2), i.e. the same two list objects.
    """
    _moveMisfits(set1, set2)
    _moveMisfits(set2, set1)
    return set1, set2


def _sumDis(word, others):
    """Return the summed getDis from word to every different word in others."""
    return sum(getDis(word, other) for other in others if other != word)


def _moveMisfits(home, away):
    """Move words from home to away when they are on average closer to away."""
    # Iterate over a snapshot: removing from the list being iterated would
    # silently skip the word that follows each removed one.
    for word in list(home):
        if len(home) < 2 or not away:
            # A mean over zero words is undefined, so nothing can be compared.
            break
        stay = _sumDis(word, home) / (len(home) - 1)
        swap = _sumDis(word, away) / len(away)
        if stay > swap:
            home.remove(word)
            away.append(word)
def calcScore(set, conjSet, dis):
    """Return the summed pairwise getDis of a word list divided by its length.

    Every unordered pair is counted once, and each word is also paired with
    itself. conjSet and dis are accepted but not used here; getDis reads the
    module-level data. Raises ZeroDivisionError for an empty list.
    """
    total = 0
    for start, first in enumerate(set):
        # Pair `first` with itself and with every word after it.
        for second in set[start:]:
            total += getDis(first, second)
    return total / len(set)
def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
    """Predict the combined score after moving word from currSet to opSet.

    The prediction updates the two current scores incrementally instead of
    rescoring both sets: word's distances to the rest of currSet are taken
    out of that set's total, and its distances to opSet are added to the
    other total.

    Args:
        word: the word being considered for a move.
        currSet: the list that currently holds word.
        opSet: the list word would be moved into.
        currentCount: current score of currSet.
        otherCount: current score of opSet.
        conjSet, dis: unused; kept so existing callers keep working.

    Returns:
        The predicted sum of both scores, or float("inf") when the move
        would leave currSet empty (an empty set has no defined score).
    """
    remaining = len(currSet) - 1
    if remaining <= 0:
        # Moving the last word would require dividing by zero; report the
        # move as infinitely bad so it is never selected.
        return float("inf")

    lost = sum(getDis(word, w) for w in currSet if w != word)
    gained = sum(getDis(word, w) for w in opSet if w != word)

    newCurrent = (currentCount * len(currSet) - lost) / remaining
    newOther = (otherCount * len(opSet) + gained) / (len(opSet) + 1)
    return newCurrent + newOther
def vectorize(conjSet, defSet):
    """Build the square dissimilarity matrix for the words in defSet.

    conjSet maps each word to (index, similar words, dissimilar words).
    Every cell starts at 0.5; cells for similar pairs are set to 0 and cells
    for dissimilar pairs are then set to 1, so a pair listed in both ends
    up as 1.
    """
    size = len(defSet)
    dis = numpy.zeros((size, size))
    dis.fill(0.5)
    for word in defSet:
        entry = conjSet[word]
        row = entry[0]
        for neighbour in entry[1]:
            dis[row, conjSet[neighbour][0]] = 0
        for opposite in entry[2]:
            dis[row, conjSet[opposite][0]] = 1
    return dis
def genSets():
    """Read words.txt and split its words into positive and negative sets.

    Each line is split on single spaces; the first field is the word and the
    first character of the second field is the label ('p' for positive,
    'n' for negative). Lines with any other label, or without a second
    field, are ignored.

    Returns:
        A tuple (positive, negative) of built-in sets of words.
    """
    positive = set()
    negative = set()
    # The with-block closes the file even if a line fails to parse.
    with open('words.txt', 'r') as f:
        for pair in f:
            current = pair.split(' ')
            if len(current) < 2 or not current[1]:
                # Blank or one-field line: there is no label to read.
                continue
            label = current[1][0]
            if label == 'p':
                positive.add(current[0])
            elif label == 'n':
                negative.add(current[0])
    return positive, negative
def getConj():
    """Read movieconj.txt into a table of similar/dissimilar word links.

    Each line is split on single spaces into word1, word2, conjunction and
    any further fields. A "but" line links the two words as dissimilar;
    every other conjunction links them as similar. Links are recorded in
    both directions. Lines with fewer than three fields are ignored.

    Returns:
        A dict mapping each word to a tuple (index, similar, dissimilar),
        where index numbers the words 0, 1, 2, ... in order of first
        appearance and the other two entries are built-in sets of words.
    """
    d = dict()

    def link(word, other, slot):
        # A new word gets the next free index, which is the number of words
        # already registered.
        if word not in d:
            d[word] = (len(d), set(), set())
        d[word][slot].add(other)

    # The with-block closes the file even if a line fails to parse.
    with open('movieconj.txt', 'r') as f:
        for line in f:
            current = line.split(' ')
            if len(current) < 3:
                continue
            # Strip so the conjunction still matches when it is the last
            # field on the line and carries the trailing newline.
            slot = 2 if current[2].strip() == "but" else 1
            link(current[0], current[1], slot)
            link(current[1], current[0], slot)
    return d
def findFrequency(set1, set2):
    """Count how many Brown-corpus tokens appear in each of two word collections.

    Tokens are compared by exact equality, and every occurrence in the corpus
    is counted.

    Args:
        set1, set2: iterables of hashable words.

    Returns:
        A tuple (set1Freq, set2Freq) of occurrence counts.
    """
    # Build the lookup sets once; testing each corpus token against a list
    # would rescan the whole list for every token.
    members1 = frozenset(set1)
    members2 = frozenset(set2)
    set1Freq = 0
    set2Freq = 0
    for word in brown.words():
        if word in members1:
            set1Freq += 1
        if word in members2:
            set2Freq += 1
    return set1Freq, set2Freq
def getDis(a, b):
    """Return the dissimilarity-matrix entry for the word pair (a, b).

    Reads the module-level globals conjSet (word -> tuple whose first item
    is the word's matrix index) and dis (the matrix); both are assigned by
    returnCBLexicon.
    """
    row = conjSet[a][0]
    col = conjSet[b][0]
    return dis[row][col]
def conjunctionData(set1,set2):
    """Print how well two word sets agree with the pairs in movieconj.txt.

    Each line is split on single spaces into word1, word2, conjunction and
    any further fields. An "and", "or" or "nor" pair counts as correct when
    both words are in the same set; a "but" pair counts as correct when the
    words are in different sets. Lines with fewer than three fields are
    ignored; lines with any other conjunction only add to the overall total.

    Args:
        set1, set2: iterables of hashable words.

    Returns:
        None; the totals are printed.
    """
    members1 = frozenset(set1)
    members2 = frozenset(set2)
    totals = {"but": 0, "and": 0, "or": 0, "nor": 0}
    correct = {"but": 0, "and": 0, "or": 0, "nor": 0}
    totalConj = 0

    # The file is only read, so it is opened read-only; the with-block
    # closes it even if a line fails to parse.
    with open('movieconj.txt', 'r') as f:
        for line in f:
            current = line.split(' ')
            if len(current) < 3:
                continue
            totalConj += 1
            # Strip so the conjunction still matches when it is the last
            # field on the line and carries the trailing newline.
            conj = current[2].strip()
            if conj not in totals:
                continue
            totals[conj] += 1
            first = current[0]
            second = current[1]
            if conj == "but":
                agrees = ((first in members1 and second in members2) or
                          (first in members2 and second in members1))
            else:
                agrees = ((first in members1 and second in members1) or
                          (first in members2 and second in members2))
            if agrees:
                correct[conj] += 1

    print("Total Conjunctions: %d" % totalConj)
    print("Total ands: %d \n Ands in same set: %d" % (totals["and"], correct["and"]))
    print("Total ors: %d \n Ors in same set: %d" % (totals["or"], correct["or"]))
    print("Total nors: %d \n Nors in same set: %d" % (totals["nor"], correct["nor"]))
    print("Total buts: %d \n Buts in opposite sets: %d" % (totals["but"], correct["but"]))
def returnCBLexicon(numIterations=3):
    """Cluster the conjunction words into two groups and label them +1 / -1.

    The word table comes from getConj and the dissimilarity matrix from
    vectorize; both are stored in the module globals conjSet and dis, which
    getDis reads. The clustering is restarted numIterations times from a
    random split; each restart runs optimize2 followed by constraintSwap,
    and the split with the lowest combined calcScore is kept.

    Args:
        numIterations: number of random restarts (default 3).

    Returns:
        A dict mapping each word of the larger cluster to 1 and each word of
        the smaller cluster to -1. When both clusters have the same size the
        second one is labelled 1.
    """
    global dis, conjSet
    # Create dictionary (adj, (index, similar, dissimilar))
    conjSet = getConj()
    # A real list is needed because random.sample requires a sequence.
    defSet = list(conjSet)
    # Generate dissimilarity matrix
    dis = vectorize(conjSet, defSet)

    bestSet1 = []
    bestSet2 = []
    bestScore = 999999
    for _ in range(numIterations):
        setsize = random.randint(len(defSet)//4, len(defSet)*3//4)
        set1 = random.sample(defSet, setsize)
        chosen = set(set1)
        set2 = [x for x in defSet if x not in chosen]
        # Optimize objective function
        (set1, set2) = optimize2(set1, set2, conjSet, defSet, dis)
        # Check the constraint
        (set1, set2) = constraintSwap(set1, set2, conjSet, defSet, dis)
        score = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
        print("*** score = %f, bestScore = %f ***" % (score, bestScore))
        if score < bestScore:
            bestSet1 = set1
            bestSet2 = set2
            bestScore = score

    # Label the best split found across all restarts, not merely the last
    # one; the larger cluster is taken to be the positive one.
    if len(bestSet1) > len(bestSet2):
        positive, negative = bestSet1, bestSet2
    else:
        positive, negative = bestSet2, bestSet1

    # Generate Dictionary in correct format
    lexicon = dict((word, 1) for word in positive)
    lexicon.update((word, -1) for word in negative)
    return lexicon
# Build the lexicon and write it to cblex.txt, one "word, polarity" line per
# entry. The with-block closes the file even if a write fails.
lex = returnCBLexicon()
with open("cblex.txt", "w") as f:
    for key, value in lex.items():
        f.write("%s, %d\n" % (key, value))
You can’t perform that action at this time.