# cblexicon.py

from __future__ import division
import nltk
import numpy
import nltk.stem
from nltk.corpus import brown
import random
from nltk.stem import *
import time
import scipy

from sets import Set

"""
def optimize(set1, set2, conjSet, defSet, dis):
    currentMin = 999999
    consideredMin = calcScore(set1, set2, conjSet, dis)
    bestSwapWord = ""
    # Calculate the best word to remove until no moves lessen the function
    i = 1
    while( currentMin > consideredMin):
        print i
        currentMin = consideredMin
        for word in set1:
            set1.remove(word)
            set2.append(word)
            test = calcScore(set1, set2, conjSet, dis)
            set2.remove(word)
            set1.append(word)
            if test < consideredMin:
                consideredMin = test
                bestSwapWord = word
        for word in set2:
            set2.remove(word)
            set1.append(word)
            test = calcScore(set1, set2, conjSet, dis)
            set1.remove(word)
            set2.append(word)
            if test < consideredMin:
                consideredMin = test
                bestSwapWord = word

        if bestSwapWord in set1:
            set1.remove(bestSwapWord)
            set2.append(bestSwapWord)
        else:
            set2.remove(bestSwapWord)
            set1.append(bestSwapWord)
        i = i + 1
    # Return the optimized sets
    return set1, set2
"""

def optimize2(set1, set2, conjSet, defSet, dis):
    """Greedily move single words between two clusters to lower the total score.

    Each pass scores every one-word move with calcSwap, applies the single
    move with the lowest projected total (if any beats the current total),
    and prints one progress line.  Passes repeat while the total measured at
    the start of a pass is lower than the one recorded by the previous pass.

    Args:
        set1, set2: lists of words; both are modified in place.
        conjSet, dis: forwarded unchanged to calcScore and calcSwap.
        defSet: accepted for signature compatibility; not used here.

    Returns:
        The tuple (set1, set2) -- the same list objects that were passed in.
    """
    currentMin = 999999
    consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
    i = 1
    while currentMin > consideredMin:
        t1 = time.time()
        currentMin = consideredMin
        currentS1 = calcScore(set1, conjSet, dis)
        currentS2 = calcScore(set2, conjSet, dis)
        consideredMin = currentS1 + currentS2

        # The candidate is reset on every pass.  Previously it survived from
        # the pass before, so a pass that found no improving move would move
        # the last word straight back and undo the final improvement.
        bestSwapWord = None
        source = None
        target = None

        # Moves out of a one-word set are not considered: calcSwap divides by
        # len(currSet) - 1 and would raise ZeroDivisionError.
        if len(set1) > 1:
            for word in set1:
                test = calcSwap(word, set1, set2, currentS1, currentS2, conjSet, dis)
                if test < consideredMin:
                    consideredMin = test
                    bestSwapWord, source, target = word, set1, set2
        if len(set2) > 1:
            for word in set2:
                test = calcSwap(word, set2, set1, currentS2, currentS1, conjSet, dis)
                if test < consideredMin:
                    consideredMin = test
                    bestSwapWord, source, target = word, set2, set1

        if source is not None:
            source.remove(bestSwapWord)
            target.append(bestSwapWord)
        t2 = time.time()
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("Iteration: %d\tScore: %f\tTime: %f sec" % (i, consideredMin, t2-t1))
        i += 1

    # Return the optimized sets
    return set1, set2

def constraintSwap(set1, set2, conjSet, defSet, dis):
    """Move every word that is on average closer to the other cluster.

    Words of set1 are examined first, then words of set2 (including any that
    were just moved there).  A word moves when its mean getDis distance to
    the other members of its own list is greater than its mean distance to
    the members of the other list.

    Args:
        set1, set2: lists of words; both are modified in place.
        conjSet, defSet, dis: accepted for signature compatibility; not used
            here (getDis reads the module-level data).

    Returns:
        The tuple (set1, set2) -- the same list objects that were passed in.
    """
    _relocateMisfits(set1, set2)
    _relocateMisfits(set2, set1)
    return set1, set2


def _relocateMisfits(home, away):
    """Move words from `home` to `away` when their mean distance to `away` is lower."""
    # Iterate over a snapshot: removing from `home` while iterating over it
    # makes the list iterator skip the word that follows each removal.
    for word in list(home):
        if len(home) < 2 or not away:
            # One of the two means would be a division by zero; leave the word.
            continue
        stay = 0
        for otherword in home:
            if otherword != word:
                stay += getDis(word, otherword)
        stay /= (len(home) - 1)
        swap = 0
        for otherword in away:
            if otherword != word:
                swap += getDis(word, otherword)
        swap /= len(away)
        if stay > swap:
            home.remove(word)
            away.append(word)

def calcScore(set,conjSet,dis):
    """Return the summed pairwise getDis distance within `set`, divided by its size.

    Every unordered pair is counted once, and each word is also paired with
    itself because the inner walk starts at the outer position.  `conjSet`
    and `dis` are accepted but not read here.
    """
    total = 0
    for start, first in enumerate(set):
        # Slice from the current position so the (first, first) pair is included.
        for second in set[start:]:
            total += getDis(first, second)
    return total / len(set)

def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
    """Project the combined score if `word` moved from `currSet` to `opSet`.

    `currentCount` and `otherCount` are the current scores of `currSet` and
    `opSet`.  Each is scaled back up by its set size, adjusted by the summed
    getDis distance between `word` and the other members, and divided by the
    size the set would have after the move.  `conjSet` and `dis` are accepted
    but not read here.
    """
    sizeHere = len(currSet)
    leaving = 0
    for member in currSet:
        if member != word:
            leaving += getDis(word, member)
    scoreHere = (currentCount * sizeHere - leaving) / (sizeHere - 1)

    sizeThere = len(opSet)
    joining = 0
    for member in opSet:
        if member != word:
            joining += getDis(word, member)
    scoreThere = (otherCount * sizeThere + joining) / (sizeThere + 1)

    return scoreHere + scoreThere

def vectorize(conjSet, defSet):
    """Build the square dissimilarity matrix for the words in `defSet`.

    Every cell starts at 0.5.  For each word, the cells in its row that
    belong to its similar words are set to 0, then the cells that belong to
    its dissimilar words are set to 1.

    Args:
        conjSet: mapping word -> (index, similar words, dissimilar words).
        defSet: the words to process; its length fixes the matrix size.

    Returns:
        A numpy array of shape (len(defSet), len(defSet)).
    """
    size = len(defSet)
    matrix = numpy.full((size, size), 0.5)
    for word in defSet:
        entry = conjSet[word]
        row = entry[0]
        for alike in entry[1]:
            matrix[row, conjSet[alike][0]] = 0
        for unlike in entry[2]:
            matrix[row, conjSet[unlike][0]] = 1
    return matrix

def genSets():
    """Read the seed words from 'words.txt' and split them by polarity label.

    Each line is split on whitespace; the first field is the word and the
    second its label.  A label starting with 'p' puts the word in the
    positive set, one starting with 'n' in the negative set.  Other labels
    and lines with fewer than two fields are ignored.

    Returns:
        (positive, negative): two sets of words.
    """
    # Built-in set replaces the deprecated sets.Set and offers the same methods.
    positive = set()
    negative = set()

    # The with-block closes the file even if a line fails to parse.
    with open('words.txt', 'r') as f:
        for pair in f:
            current = pair.split()
            if len(current) < 2:
                # Blank or label-less line: nothing to classify.
                continue
            label = current[1][0]
            if label == 'p':
                positive.add(current[0])
            elif label == 'n':
                negative.add(current[0])

    return positive, negative

def getConj():
    """Parse 'movieconj.txt' into a word -> (index, similar, dissimilar) mapping.

    Each line is split on whitespace: the first two fields are the words and
    the third is the conjunction joining them.  "but" records the two words
    as dissimilar to each other; any other conjunction records them as
    similar.  Lines with fewer than three fields are ignored.

    Returns:
        dict mapping each word to a tuple (index, similar, dissimilar), where
        index is the order in which the word was first seen (starting at 0)
        and the other two items are sets of words.
    """
    d = dict()
    # The with-block closes the file even if parsing raises.
    with open('movieconj.txt', 'r') as f:
        for line in f:
            current = line.split()
            if len(current) < 3:
                continue
            # Tuple slot 2 collects dissimilar words, slot 1 similar ones.
            slot = 2 if current[2] == "but" else 1
            # Register the relation in both directions, first word first so
            # that indices are handed out in reading order.
            for word, partner in ((current[0], current[1]), (current[1], current[0])):
                if word not in d:
                    # len(d) is the next unused index: one is consumed per new word.
                    d[word] = (len(d), set(), set())
                d[word][slot].add(partner)
    return d

def findFrequency(set1, set2):
    """Count how many tokens of brown.words() occur in each word collection.

    Args:
        set1, set2: collections of (hashable) words.

    Returns:
        (set1Freq, set2Freq): the number of corpus tokens found in set1 and
        in set2 respectively; a token present in both counts for both.
    """
    # Build hash-based lookups once: testing every corpus token against a
    # list costs a full scan of that list per token.
    members1 = frozenset(set1)
    members2 = frozenset(set2)
    set1Freq = 0
    set2Freq = 0

    for word in brown.words():
        if word in members1:
            set1Freq += 1
        if word in members2:
            set2Freq += 1

    return set1Freq, set2Freq

def getDis(a, b):
    """Return the stored dissimilarity between words `a` and `b`.

    Reads two module-level names: `conjSet`, whose entry for a word starts
    with that word's matrix index, and `dis`, which is indexed as
    dis[row][column].
    """
    row = conjSet[a][0]
    column = conjSet[b][0]
    return dis[row][column]

def conjunctionData(set1,set2):
    """Print how well a two-way split agrees with the pairs in 'movieconj.txt'.

    Each line is split on whitespace: the first two fields are the words and
    the third is the conjunction.  For "and", "or" and "nor" a pair agrees
    when both words are in the same collection; for "but" it agrees when
    the words are in different collections.  Lines with fewer than three
    fields are ignored; lines with any other conjunction only count towards
    the overall total.

    Args:
        set1, set2: collections of (hashable) words.

    Returns:
        None; the totals are printed.
    """
    group1 = frozenset(set1)
    group2 = frozenset(set2)
    totals = {"and": 0, "or": 0, "nor": 0, "but": 0}
    correct = {"and": 0, "or": 0, "nor": 0, "but": 0}
    totalConj = 0

    # Plain read mode: the file is only read, so write permission ('r+')
    # must not be required.  The with-block closes it even on error.
    with open('movieconj.txt', 'r') as f:
        for line in f:
            current = line.split()
            if len(current) < 3:
                continue
            totalConj += 1
            conj = current[2]
            if conj not in totals:
                continue
            totals[conj] += 1
            first = current[0]
            second = current[1]
            if conj == "but":
                agrees = (first in group1 and second in group2) or (first in group2 and second in group1)
            else:
                agrees = (first in group1 and second in group1) or (first in group2 and second in group2)
            if agrees:
                correct[conj] += 1

    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("Total Conjunctions: %d" % totalConj)
    print("Total ands: %d \n Ands in same set: %d" % (totals["and"], correct["and"]))
    print("Total ors: %d \n Ors in same set: %d" % (totals["or"], correct["or"]))
    print("Total nors: %d \n Nors in same set: %d" % (totals["nor"], correct["nor"]))
    print("Total buts: %d \n Buts in opposite sets: %d" % (totals["but"], correct["but"]))

def returnCBLexicon(numIterations=3):
    """Cluster the conjunction vocabulary into two groups and label them 1 / -1.

    Loads the conjunction data with getConj and builds the dissimilarity
    matrix with vectorize; both are stored in the module globals `conjSet`
    and `dis`, which getDis reads.  It then runs `numIterations` random
    restarts of optimize2 followed by constraintSwap and keeps the split
    with the lowest combined calcScore.

    Args:
        numIterations: number of random restarts (default 3, the value that
            was previously hard-coded).

    Returns:
        dict mapping each word of the larger group of the best split to 1
        and each word of the other group to -1.  When both groups have the
        same size the second group is the one labelled 1.
    """
    global dis, conjSet

    # Create dictionary (adj, (index, similar, dissimilar))
    conjSet = getConj()

    # Create list out of all keys of conjSet
    defSet = list(conjSet)

    # Generate dissimilarity matrix
    dis = vectorize(conjSet,defSet)

    # Its Cluster time
    bestSet1 = []
    bestSet2 = []
    bestScore = 999999
    for i in range(numIterations):
        setsize = random.randint(len(defSet)//4, len(defSet)*3//4)
        set1 = random.sample(defSet, setsize)
        # Hash lookup keeps building the complement linear in len(defSet).
        chosen = frozenset(set1)
        set2 = [x for x in defSet if x not in chosen]

        # Optimize objective function
        (set1,set2) = optimize2(set1,set2,conjSet,defSet,dis)
        # Check the constraint
        (set1,set2) = constraintSwap(set1,set2,conjSet,defSet,dis)
        score = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
        print("*** score = %f, bestScore = %f ***" % (score, bestScore))
        if score < bestScore:
            bestSet1 = set1
            bestSet2 = set2
            bestScore = score

    # Label the best split, not whichever restart happened to run last.
    # The previous pair of conditionals also assigned the same list to both
    # labels whenever the sizes differed, so one group was lost entirely.
    if len(bestSet1) > len(bestSet2):
        positive, negative = bestSet1, bestSet2
    else:
        positive, negative = bestSet2, bestSet1

    # Generate Dictionary in correct format
    lexicon = dict((word, 1) for word in positive)
    lexicon.update((word, -1) for word in negative)
    return lexicon

# Build the lexicon and write it to cblex.txt, one "<word>, <label>" line per entry.
lex = returnCBLexicon()
# The with-block closes the output file even if a write fails part-way.
with open("cblex.txt", "w") as f:
    for key in lex.keys():
        f.write("%s, %d\n" % (key, lex[key]))
	from __future__ import division
	import nltk
	import numpy
	import nltk.stem
	from nltk.corpus import brown
	import random
	from nltk.stem import *
	import time
	import scipy

	from sets import Set

	"""
	def optimize(set1, set2, conjSet, defSet, dis):
	currentMin = 999999
	consideredMin = calcScore(set1, set2, conjSet, dis)
	bestSwapWord = ""
	# Calculate the best word to remove until no moves lessen the function
	i = 1
	while( currentMin > consideredMin):
	print i
	currentMin = consideredMin
	for word in set1:
	set1.remove(word)
	set2.append(word)
	test = calcScore(set1, set2, conjSet, dis)
	set2.remove(word)
	set1.append(word)
	if test < consideredMin:
	consideredMin = test
	bestSwapWord = word
	for word in set2:
	set2.remove(word)
	set1.append(word)
	test = calcScore(set1, set2, conjSet, dis)
	set1.remove(word)
	set2.append(word)
	if test < consideredMin:
	consideredMin = test
	bestSwapWord = word

	if bestSwapWord in set1:
	set1.remove(bestSwapWord)
	set2.append(bestSwapWord)
	else:
	set2.remove(bestSwapWord)
	set1.append(bestSwapWord)
	i = i + 1
	# Return the optimized sets
	return set1, set2
	"""

	def optimize2(set1, set2, conjSet, defSet, dis):
	    """Greedily move single words between two clusters to lower the total score.

	    Each pass scores every one-word move with calcSwap, applies the single
	    move with the lowest projected total (if any beats the current total),
	    and prints one progress line.  Passes repeat while the total measured at
	    the start of a pass is lower than the one recorded by the previous pass.

	    Args:
	        set1, set2: lists of words; both are modified in place.
	        conjSet, dis: forwarded unchanged to calcScore and calcSwap.
	        defSet: accepted for signature compatibility; not used here.

	    Returns:
	        The tuple (set1, set2) -- the same list objects that were passed in.
	    """
	    currentMin = 999999
	    consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
	    i = 1
	    while currentMin > consideredMin:
	        t1 = time.time()
	        currentMin = consideredMin
	        currentS1 = calcScore(set1, conjSet, dis)
	        currentS2 = calcScore(set2, conjSet, dis)
	        consideredMin = currentS1 + currentS2

	        # The candidate is reset on every pass.  Previously it survived from
	        # the pass before, so a pass that found no improving move would move
	        # the last word straight back and undo the final improvement.
	        bestSwapWord = None
	        source = None
	        target = None

	        # Moves out of a one-word set are not considered: calcSwap divides by
	        # len(currSet) - 1 and would raise ZeroDivisionError.
	        if len(set1) > 1:
	            for word in set1:
	                test = calcSwap(word, set1, set2, currentS1, currentS2, conjSet, dis)
	                if test < consideredMin:
	                    consideredMin = test
	                    bestSwapWord, source, target = word, set1, set2
	        if len(set2) > 1:
	            for word in set2:
	                test = calcSwap(word, set2, set1, currentS2, currentS1, conjSet, dis)
	                if test < consideredMin:
	                    consideredMin = test
	                    bestSwapWord, source, target = word, set2, set1

	        if source is not None:
	            source.remove(bestSwapWord)
	            target.append(bestSwapWord)
	        t2 = time.time()
	        # A single pre-formatted argument prints identically under the
	        # Python 2 print statement and the Python 3 print function.
	        print("Iteration: %d\tScore: %f\tTime: %f sec" % (i, consideredMin, t2-t1))
	        i += 1

	    # Return the optimized sets
	    return set1, set2

	def constraintSwap(set1, set2, conjSet, defSet, dis):
	    """Move every word that is on average closer to the other cluster.

	    Words of set1 are examined first, then words of set2 (including any that
	    were just moved there).  A word moves when its mean getDis distance to
	    the other members of its own list is greater than its mean distance to
	    the members of the other list.

	    Args:
	        set1, set2: lists of words; both are modified in place.
	        conjSet, defSet, dis: accepted for signature compatibility; not used
	            here (getDis reads the module-level data).

	    Returns:
	        The tuple (set1, set2) -- the same list objects that were passed in.
	    """
	    _relocateMisfits(set1, set2)
	    _relocateMisfits(set2, set1)
	    return set1, set2


	def _relocateMisfits(home, away):
	    """Move words from `home` to `away` when their mean distance to `away` is lower."""
	    # Iterate over a snapshot: removing from `home` while iterating over it
	    # makes the list iterator skip the word that follows each removal.
	    for word in list(home):
	        if len(home) < 2 or not away:
	            # One of the two means would be a division by zero; leave the word.
	            continue
	        stay = 0
	        for otherword in home:
	            if otherword != word:
	                stay += getDis(word, otherword)
	        stay /= (len(home) - 1)
	        swap = 0
	        for otherword in away:
	            if otherword != word:
	                swap += getDis(word, otherword)
	        swap /= len(away)
	        if stay > swap:
	            home.remove(word)
	            away.append(word)

	def calcScore(set,conjSet,dis):
	    """Return the summed pairwise getDis distance within `set`, divided by its size.

	    Every unordered pair is counted once, and each word is also paired with
	    itself because the inner loop starts at the outer index.  `conjSet` and
	    `dis` are accepted but not read here.
	    """
	    # Block structure restored; this copy had lost its indentation.
	    score = 0
	    for i in range(len(set)):
	        w1 = set[i]
	        for j in range(i, len(set)):
	            w2 = set[j]
	            score += getDis(w1, w2)
	    return score / len(set)

	def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
	    """Project the combined score if `word` moved from `currSet` to `opSet`.

	    `currentCount` and `otherCount` are the current scores of `currSet` and
	    `opSet`.  Each is scaled back up by its set size, adjusted by the summed
	    getDis distance between `word` and the other members, and divided by the
	    size the set would have after the move.  `conjSet` and `dis` are accepted
	    but not read here.
	    """
	    # Block structure restored; this copy had lost its indentation.
	    score1 = 0
	    score2 = 0
	    for w in currSet:
	        if word != w:
	            score1 += getDis(word, w)
	    currentCount = (currentCount * len(currSet) - score1)/(len(currSet)-1)

	    for w in opSet:
	        if word != w:
	            score2 += getDis(word, w)
	    otherCount = (otherCount * len(opSet) + score2)/(len(opSet)+1)

	    return currentCount + otherCount

	def vectorize(conjSet, defSet):
	    """Build the square dissimilarity matrix for the words in `defSet`.

	    Every cell starts at 0.5.  For each word, the cells in its row that
	    belong to its similar words are set to 0, then the cells that belong to
	    its dissimilar words are set to 1.

	    Args:
	        conjSet: mapping word -> (index, similar words, dissimilar words).
	        defSet: the words to process; its length fixes the matrix size.

	    Returns:
	        A numpy array of shape (len(defSet), len(defSet)).
	    """
	    # Block structure restored; this copy had lost its indentation.
	    dis = numpy.zeros((len(defSet),len(defSet)))
	    dis.fill(0.5)
	    for word in defSet:
	        similar = conjSet[word][1]
	        dissimilar = conjSet[word][2]
	        for sim in similar:
	            dis[conjSet[word][0]][conjSet[sim][0]] = 0
	        for d in dissimilar:
	            dis[conjSet[word][0]][conjSet[d][0]] = 1
	    return dis

	def genSets():
	    """Read the seed words from 'words.txt' and split them by polarity label.

	    Each line is split on whitespace; the first field is the word and the
	    second its label.  A label starting with 'p' puts the word in the
	    positive set, one starting with 'n' in the negative set.  Other labels
	    and lines with fewer than two fields are ignored.

	    Returns:
	        (positive, negative): two sets of words.
	    """
	    # Built-in set replaces the deprecated sets.Set and offers the same methods.
	    positive = set()
	    negative = set()

	    # The with-block closes the file even if a line fails to parse.
	    with open('words.txt', 'r') as f:
	        for pair in f:
	            current = pair.split()
	            if len(current) < 2:
	                # Blank or label-less line: nothing to classify.
	                continue
	            label = current[1][0]
	            if label == 'p':
	                positive.add(current[0])
	            elif label == 'n':
	                negative.add(current[0])

	    return positive, negative

	def getConj():
	    """Parse 'movieconj.txt' into a word -> (index, similar, dissimilar) mapping.

	    Each line is split on whitespace: the first two fields are the words and
	    the third is the conjunction joining them.  "but" records the two words
	    as dissimilar to each other; any other conjunction records them as
	    similar.  Lines with fewer than three fields are ignored.

	    Returns:
	        dict mapping each word to a tuple (index, similar, dissimilar), where
	        index is the order in which the word was first seen (starting at 0)
	        and the other two items are sets of words.
	    """
	    d = dict()
	    # The with-block closes the file even if parsing raises.
	    with open('movieconj.txt', 'r') as f:
	        for line in f:
	            current = line.split()
	            if len(current) < 3:
	                continue
	            # Tuple slot 2 collects dissimilar words, slot 1 similar ones.
	            slot = 2 if current[2] == "but" else 1
	            # Register the relation in both directions, first word first so
	            # that indices are handed out in reading order.
	            for word, partner in ((current[0], current[1]), (current[1], current[0])):
	                if word not in d:
	                    # len(d) is the next unused index: one is consumed per new word.
	                    d[word] = (len(d), set(), set())
	                d[word][slot].add(partner)
	    return d

	def findFrequency(set1, set2):
	    """Count how many tokens of brown.words() occur in each word collection.

	    Args:
	        set1, set2: collections of (hashable) words.

	    Returns:
	        (set1Freq, set2Freq): the number of corpus tokens found in set1 and
	        in set2 respectively; a token present in both counts for both.
	    """
	    # Build hash-based lookups once: testing every corpus token against a
	    # list costs a full scan of that list per token.
	    members1 = frozenset(set1)
	    members2 = frozenset(set2)
	    set1Freq = 0
	    set2Freq = 0

	    for word in brown.words():
	        if word in members1:
	            set1Freq += 1
	        if word in members2:
	            set2Freq += 1

	    return set1Freq, set2Freq

	def getDis(a, b):
	    """Return the stored dissimilarity between words `a` and `b`.

	    Reads two module-level names: `conjSet`, whose entry for a word starts
	    with that word's matrix index, and `dis`, which is indexed as
	    dis[row][column].
	    """
	    # Block structure restored; this copy had lost its indentation.
	    global dis, conjSet
	    a_index = conjSet[a][0]
	    b_index = conjSet[b][0]
	    return dis[a_index][b_index]

	def conjunctionData(set1,set2):
	    """Print how well a two-way split agrees with the pairs in 'movieconj.txt'.

	    Each line is split on whitespace: the first two fields are the words and
	    the third is the conjunction.  For "and", "or" and "nor" a pair agrees
	    when both words are in the same collection; for "but" it agrees when
	    the words are in different collections.  Lines with fewer than three
	    fields are ignored; lines with any other conjunction only count towards
	    the overall total.

	    Args:
	        set1, set2: collections of (hashable) words.

	    Returns:
	        None; the totals are printed.
	    """
	    group1 = frozenset(set1)
	    group2 = frozenset(set2)
	    totals = {"and": 0, "or": 0, "nor": 0, "but": 0}
	    correct = {"and": 0, "or": 0, "nor": 0, "but": 0}
	    totalConj = 0

	    # Plain read mode: the file is only read, so write permission ('r+')
	    # must not be required.  The with-block closes it even on error.
	    with open('movieconj.txt', 'r') as f:
	        for line in f:
	            current = line.split()
	            if len(current) < 3:
	                continue
	            totalConj += 1
	            conj = current[2]
	            if conj not in totals:
	                continue
	            totals[conj] += 1
	            first = current[0]
	            second = current[1]
	            if conj == "but":
	                agrees = (first in group1 and second in group2) or (first in group2 and second in group1)
	            else:
	                agrees = (first in group1 and second in group1) or (first in group2 and second in group2)
	            if agrees:
	                correct[conj] += 1

	    # A single pre-formatted argument prints identically under the Python 2
	    # print statement and the Python 3 print function.
	    print("Total Conjunctions: %d" % totalConj)
	    print("Total ands: %d \n Ands in same set: %d" % (totals["and"], correct["and"]))
	    print("Total ors: %d \n Ors in same set: %d" % (totals["or"], correct["or"]))
	    print("Total nors: %d \n Nors in same set: %d" % (totals["nor"], correct["nor"]))
	    print("Total buts: %d \n Buts in opposite sets: %d" % (totals["but"], correct["but"]))

	def returnCBLexicon(numIterations=3):
	    """Cluster the conjunction vocabulary into two groups and label them 1 / -1.

	    Loads the conjunction data with getConj and builds the dissimilarity
	    matrix with vectorize; both are stored in the module globals `conjSet`
	    and `dis`, which getDis reads.  It then runs `numIterations` random
	    restarts of optimize2 followed by constraintSwap and keeps the split
	    with the lowest combined calcScore.

	    Args:
	        numIterations: number of random restarts (default 3, the value that
	            was previously hard-coded).

	    Returns:
	        dict mapping each word of the larger group of the best split to 1
	        and each word of the other group to -1.  When both groups have the
	        same size the second group is the one labelled 1.
	    """
	    global dis, conjSet

	    # Create dictionary (adj, (index, similar, dissimilar))
	    conjSet = getConj()

	    # Create list out of all keys of conjSet
	    defSet = list(conjSet)

	    # Generate dissimilarity matrix
	    dis = vectorize(conjSet,defSet)

	    # Its Cluster time
	    bestSet1 = []
	    bestSet2 = []
	    bestScore = 999999
	    for i in range(numIterations):
	        setsize = random.randint(len(defSet)//4, len(defSet)*3//4)
	        set1 = random.sample(defSet, setsize)
	        # Hash lookup keeps building the complement linear in len(defSet).
	        chosen = frozenset(set1)
	        set2 = [x for x in defSet if x not in chosen]

	        # Optimize objective function
	        (set1,set2) = optimize2(set1,set2,conjSet,defSet,dis)
	        # Check the constraint
	        (set1,set2) = constraintSwap(set1,set2,conjSet,defSet,dis)
	        score = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
	        print("* score = %f, bestScore = %f *" % (score, bestScore))
	        if score < bestScore:
	            bestSet1 = set1
	            bestSet2 = set2
	            bestScore = score

	    # Label the best split, not whichever restart happened to run last.
	    # The previous pair of conditionals also assigned the same list to both
	    # labels whenever the sizes differed, so one group was lost entirely.
	    if len(bestSet1) > len(bestSet2):
	        positive, negative = bestSet1, bestSet2
	    else:
	        positive, negative = bestSet2, bestSet1

	    # Generate Dictionary in correct format
	    lexicon = dict((word, 1) for word in positive)
	    lexicon.update((word, -1) for word in negative)
	    return lexicon

	# Build the lexicon and write it to cblex.txt, one "<word>, <label>" line per entry.
	lex = returnCBLexicon()
	# The with-block closes the output file even if a write fails part-way.
	with open("cblex.txt", "w") as f:
	    for key in lex.keys():
	        f.write("%s, %d\n" % (key, lex[key]))