Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from __future__ import division
import nltk
import numpy
import nltk.stem
from nltk.corpus import brown
import random
from nltk.stem import *
import time
import scipy
from sets import Set
"""
def optimize(set1, set2, conjSet, defSet, dis):
currentMin = 999999
consideredMin = calcScore(set1, set2, conjSet, dis)
bestSwapWord = ""
# Calculate the best word to remove until no moves lessen the function
i = 1
while( currentMin > consideredMin):
print i
currentMin = consideredMin
for word in set1:
set1.remove(word)
set2.append(word)
test = calcScore(set1, set2, conjSet, dis)
set2.remove(word)
set1.append(word)
if test < consideredMin:
consideredMin = test
bestSwapWord = word
for word in set2:
set2.remove(word)
set1.append(word)
test = calcScore(set1, set2, conjSet, dis)
set1.remove(word)
set2.append(word)
if test < consideredMin:
consideredMin = test
bestSwapWord = word
if bestSwapWord in set1:
set1.remove(bestSwapWord)
set2.append(bestSwapWord)
else:
set2.remove(bestSwapWord)
set1.append(bestSwapWord)
i = i + 1
# Return the optimized sets
return set1, set2
"""
def optimize2(set1, set2, conjSet, defSet, dis):
    """Greedily move single words between two clusters to lower the score.

    The objective is ``calcScore(set1, ...) + calcScore(set2, ...)``. Each
    pass asks ``calcSwap`` for the objective that would result from moving
    each word to the opposite cluster, applies the single best improving
    move, and repeats while the objective keeps dropping.

    Args:
        set1: List of words in the first cluster; modified in place.
        set2: List of words in the second cluster; modified in place.
        conjSet: Forwarded unchanged to ``calcScore`` and ``calcSwap``.
        defSet: Unused here; kept so existing callers keep working.
        dis: Forwarded unchanged to ``calcScore`` and ``calcSwap``.

    Returns:
        The tuple ``(set1, set2)`` -- the same list objects that were passed.
    """
    currentMin = float("inf")
    consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
    i = 1
    while currentMin > consideredMin:
        t1 = time.time()
        currentMin = consideredMin
        currentS1 = calcScore(set1, conjSet, dis)
        currentS2 = calcScore(set2, conjSet, dis)
        consideredMin = currentS1 + currentS2

        # Reset the candidate on every pass. Carrying it over from the
        # previous pass would, on a pass that finds no improving move, push
        # the previously moved word back and undo the last improvement.
        bestSwapWord = None
        fromSet1 = False

        for word in set1:
            test = calcSwap(word, set1, set2, currentS1, currentS2, conjSet, dis)
            if test < consideredMin:
                consideredMin = test
                bestSwapWord = word
                fromSet1 = True
        for word in set2:
            test = calcSwap(word, set2, set1, currentS2, currentS1, conjSet, dis)
            if test < consideredMin:
                consideredMin = test
                bestSwapWord = word
                fromSet1 = False

        # Apply the winning move, if any pass candidate beat the current score.
        if bestSwapWord is not None:
            if fromSet1:
                set1.remove(bestSwapWord)
                set2.append(bestSwapWord)
            else:
                set2.remove(bestSwapWord)
                set1.append(bestSwapWord)

        t2 = time.time()
        print("Iteration: %d\tScore: %f\tTime: %f sec" % (i, consideredMin, t2 - t1))
        i += 1
    # Return the optimized sets
    return set1, set2
def constraintSwap(set1, set2, conjSet, defSet, dis):
    """Move words that are, on average, closer to the opposite cluster.

    For each word the mean dissimilarity to the other members of its own
    cluster ("stay") is compared with the mean dissimilarity to the members
    of the opposite cluster ("swap"); the word is moved when ``stay > swap``.
    ``set1`` is processed first, then ``set2`` (which by then also contains
    any words just moved out of ``set1``).

    Args:
        set1: List of words in the first cluster; modified in place.
        set2: List of words in the second cluster; modified in place.
        conjSet: Unused here (``getDis`` reads the module-level data).
        defSet: Unused here; kept for interface compatibility.
        dis: Unused here (``getDis`` reads the module-level data).

    Returns:
        The tuple ``(set1, set2)`` -- the same list objects that were passed.
    """
    # Iterate over snapshots: removing from a list while looping over that
    # same list makes the iterator skip the element after each removal.
    for word in list(set1):
        if _prefersOtherCluster(word, set1, set2):
            set1.remove(word)
            set2.append(word)
    for word in list(set2):
        if _prefersOtherCluster(word, set2, set1):
            set2.remove(word)
            set1.append(word)
    return set1, set2


def _summedDis(word, others):
    """Return the sum of getDis(word, other) over every other != word."""
    return sum(getDis(word, other) for other in others if other != word)


def _prefersOtherCluster(word, own, other):
    """Return True when word's mean dissimilarity is higher at home than away."""
    # With no peers at home or nobody in the opposite cluster the averages
    # are undefined (division by zero), so the word is left where it is.
    if len(own) < 2 or len(other) == 0:
        return False
    stay = _summedDis(word, own) / (len(own) - 1)
    swap = _summedDis(word, other) / len(other)
    return stay > swap
def calcScore(set, conjSet, dis):
    """Return the summed pairwise dissimilarity of a cluster over its size.

    Every pair of positions ``(i, j)`` with ``i <= j`` is looked up through
    ``getDis``, so each word is also paired with itself once. The sum is
    divided by the number of words.

    Args:
        set: Sequence of words forming the cluster (must support slicing).
        conjSet: Not used by this function.
        dis: Not used by this function.

    Returns:
        The pair-sum divided by ``len(set)``.

    Raises:
        ZeroDivisionError: If ``set`` is empty.
    """
    total = 0
    for position, first in enumerate(set):
        # The slice starts at the word itself, which yields the i <= j pairs.
        for second in set[position:]:
            total += getDis(first, second)
    return total / len(set)
def calcSwap(word, currSet, opSet, currentCount, otherCount, conjSet, dis):
    """Predict the objective after moving ``word`` from ``currSet`` to ``opSet``.

    The cluster scores are updated incrementally instead of being recomputed:
    multiplying a score by its cluster size recovers the pair-sum, the terms
    involving ``word`` are subtracted from one cluster and added to the
    other, and each sum is divided by the new cluster size.

    ``calcScore`` counts each word paired with itself, so the self term
    ``getDis(word, word)`` is included on both sides. This keeps the
    prediction equal to what ``calcScore`` would return after the move,
    provided ``getDis`` is symmetric (``getConj`` records every relation in
    both directions).

    Args:
        word: The word being considered; expected to be in ``currSet``.
        currSet: Cluster that currently holds ``word``.
        opSet: Cluster that would receive ``word``.
        currentCount: Current ``calcScore`` value of ``currSet``.
        otherCount: Current ``calcScore`` value of ``opSet``.
        conjSet: Not used by this function.
        dis: Not used by this function.

    Returns:
        The predicted sum of both cluster scores after the move, or
        ``float("inf")`` when the move would empty ``currSet``.
    """
    # Moving the only remaining word would leave an empty cluster, whose
    # score is undefined (division by zero); report it as never worthwhile.
    if len(currSet) <= 1:
        return float("inf")

    selfDis = getDis(word, word)
    score1 = selfDis + sum(getDis(word, w) for w in currSet if w != word)
    score2 = selfDis + sum(getDis(word, w) for w in opSet if w != word)

    currentCount = (currentCount * len(currSet) - score1) / (len(currSet) - 1)
    otherCount = (otherCount * len(opSet) + score2) / (len(opSet) + 1)
    return currentCount + otherCount
def vectorize(conjSet, defSet):
    """Build the square dissimilarity matrix for the words in ``defSet``.

    Every cell starts at 0.5. For each word, the cell at (word's index,
    other's index) is set to 0 for every word in its "similar" collection
    and then to 1 for every word in its "dissimilar" collection, so a word
    listed in both ends up at 1.

    Args:
        conjSet: Mapping word -> (index, similar words, dissimilar words).
        defSet: Words to process; its length fixes the matrix dimensions.

    Returns:
        A ``len(defSet)`` x ``len(defSet)`` numpy array of floats.
    """
    size = len(defSet)
    matrix = numpy.full((size, size), 0.5)
    for word in defSet:
        entry = conjSet[word]
        row = entry[0]
        for alike in entry[1]:
            matrix[row][conjSet[alike][0]] = 0
        for unlike in entry[2]:
            matrix[row][conjSet[unlike][0]] = 1
    return matrix
def genSets():
    """Read ``words.txt`` and split its words by polarity label.

    Each usable line holds a word followed by a label. A label starting with
    ``'p'`` puts the word in the positive set and one starting with ``'n'``
    puts it in the negative set. Lines with any other label, and blank or
    single-field lines, are ignored.

    Returns:
        A tuple ``(positive, negative)`` of sets of words.
    """
    positive = set()
    negative = set()
    # The with-block closes the file even if reading fails part-way.
    with open('words.txt', 'r') as f:
        for pair in f:
            current = pair.split()
            # A blank or one-field line has no label to inspect; indexing
            # it would raise IndexError.
            if len(current) < 2:
                continue
            if current[1].startswith('p'):
                positive.add(current[0])
            elif current[1].startswith('n'):
                negative.add(current[0])
    return positive, negative
def getConj():
    """Read ``movieconj.txt`` into a word -> relations dictionary.

    Each usable line holds two words followed by the conjunction that joined
    them. The result maps every word to a tuple
    ``(index, similar, dissimilar)``:

    * ``index`` is assigned in order of first appearance, starting at 0
      (first word of a line before the second);
    * ``similar`` is the set of words it was joined to by any conjunction
      other than "but";
    * ``dissimilar`` is the set of words it was joined to by "but".

    Every relation is recorded in both directions. Lines with fewer than
    three fields are skipped. Fields are split on any whitespace, so the
    trailing newline never sticks to the conjunction.

    Returns:
        The dictionary described above.
    """
    d = dict()

    def _relate(word, other, slot):
        """Record ``other`` in ``word``'s similar (1) or dissimilar (2) set."""
        if word not in d:
            # len(d) is read before the insertion, so it is the next free index.
            d[word] = (len(d), set(), set())
        d[word][slot].add(other)

    with open('movieconj.txt', 'r') as f:
        for line in f:
            current = line.split()
            if len(current) < 3:
                continue
            slot = 2 if current[2] == "but" else 1
            _relate(current[0], current[1], slot)
            _relate(current[1], current[0], slot)
    return d
def findFrequency(set1, set2):
    """Count how many Brown-corpus tokens appear in each word collection.

    Tokens are compared by exact equality with the members of ``set1`` and
    ``set2``; a token present in both collections is counted for both.

    Args:
        set1: Collection of (hashable) words.
        set2: Collection of (hashable) words.

    Returns:
        A tuple ``(set1Freq, set2Freq)`` of token counts.
    """
    # Hash the collections once: callers pass lists, and a list membership
    # test per corpus token would rescan the whole list every time.
    members1 = frozenset(set1)
    members2 = frozenset(set2)
    set1Freq = 0
    set2Freq = 0
    for word in brown.words():
        if word in members1:
            set1Freq += 1
        if word in members2:
            set2Freq += 1
    return set1Freq, set2Freq
def getDis(a, b):
    """Look up the dissimilarity stored for the word pair ``(a, b)``.

    Reads two module-level globals: ``conjSet`` supplies each word's index
    (element 0 of its entry) and ``dis`` is indexed by those two indices.

    Raises:
        KeyError: If either word has no entry in ``conjSet``.
    """
    global dis, conjSet
    row, column = conjSet[a][0], conjSet[b][0]
    return dis[row][column]
def conjunctionData(set1, set2):
    """Print how well a two-way word split agrees with ``movieconj.txt``.

    Each usable line of the file holds two words followed by a conjunction.
    An "and", "or" or "nor" line counts as agreeing when both words are in
    the same collection. A "but" line counts as agreeing when the words are
    in opposite collections. Lines with fewer than three fields are skipped
    and lines with any other conjunction only add to the overall total.

    Args:
        set1: Collection of (hashable) words forming one side of the split.
        set2: Collection of (hashable) words forming the other side.

    Returns:
        None; the statistics are printed to standard output.
    """
    first = set(set1)
    second = set(set2)
    totals = {"and": 0, "or": 0, "nor": 0, "but": 0}
    correct = {"and": 0, "or": 0, "nor": 0, "but": 0}
    totalConj = 0

    # The file is only read, so open it read-only; 'r+' would also demand
    # write permission.
    with open('movieconj.txt', 'r') as f:
        for line in f:
            current = line.split()
            if len(current) < 3:
                continue
            totalConj += 1
            left, right, conj = current[0], current[1], current[2]
            if conj not in totals:
                continue
            totals[conj] += 1
            if conj == "but":
                agrees = ((left in first and right in second) or
                          (left in second and right in first))
            else:
                agrees = ((left in first and right in first) or
                          (left in second and right in second))
            if agrees:
                correct[conj] += 1

    print("Total Conjunctions: %d" % totalConj)
    print("Total ands: %d \n Ands in same set: %d" % (totals["and"], correct["and"]))
    print("Total ors: %d \n Ors in same set: %d" % (totals["or"], correct["or"]))
    print("Total nors: %d \n Nors in same set: %d" % (totals["nor"], correct["nor"]))
    print("Total buts: %d \n Buts in opposite sets: %d" % (totals["but"], correct["but"]))
def returnCBLexicon():
    """Cluster the conjunction vocabulary into two polarities.

    Loads the conjunction data with ``getConj``, builds the dissimilarity
    matrix with ``vectorize`` and publishes both as the module-level globals
    ``conjSet`` and ``dis`` (which ``getDis`` reads). It then runs a fixed
    number of random restarts; each restart splits the vocabulary at random,
    refines the split with ``optimize2`` and ``constraintSwap`` and scores it
    with ``calcScore``. The lowest-scoring split is kept.

    The larger cluster of the best split is labelled positive (+1) and the
    other negative (-1). With equal sizes the second cluster is positive.

    Returns:
        A dict mapping each word to ``1`` or ``-1``.
    """
    global dis, conjSet
    # Create dictionary (adj, (index, similar, dissimilar))
    conjSet = getConj()
    # random.sample needs a real sequence, so materialise the keys.
    defSet = list(conjSet)
    # Generate dissimilarity matrix
    dis = vectorize(conjSet, defSet)

    bestSet1 = []
    bestSet2 = []
    bestScore = float("inf")
    numIterations = 3
    for _ in range(numIterations):
        setsize = random.randint(len(defSet) // 4, len(defSet) * 3 // 4)
        set1 = random.sample(defSet, setsize)
        chosen = set(set1)
        set2 = [x for x in defSet if x not in chosen]
        # Optimize objective function
        (set1, set2) = optimize2(set1, set2, conjSet, defSet, dis)
        # Check the constraint
        (set1, set2) = constraintSwap(set1, set2, conjSet, defSet, dis)
        score = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
        print("*** score = %f, bestScore = %f ***" % (score, bestScore))
        if score < bestScore:
            bestSet1 = set1
            bestSet2 = set2
            bestScore = score

    # Label the best restart, not whichever restart happened to run last,
    # and make sure the two polarities are always different clusters.
    if len(bestSet1) > len(bestSet2):
        positive, negative = bestSet1, bestSet2
    else:
        positive, negative = bestSet2, bestSet1

    # Generate Dictionary in correct format
    lexicon = dict.fromkeys(positive, 1)
    lexicon.update(dict.fromkeys(negative, -1))
    return lexicon
# Build the lexicon and write it to cblex.txt, one "word, polarity" per line.
lex = returnCBLexicon()
# The with-block closes the output file even if a write raises.
with open("cblex.txt", "w") as f:
    for key, polarity in lex.items():
        f.write("%s, %d\n" % (key, polarity))