# BigData/cblexicon.py
#
# NOTE: the hosting page flagged this file as containing bidirectional Unicode
# text that may be interpreted or compiled differently than it appears. Open it
# in an editor that reveals hidden Unicode characters before trusting it.
from __future__ import division | |
import nltk | |
import numpy | |
import nltk.stem | |
from nltk.corpus import brown | |
import random | |
from nltk.stem import * | |
import time | |
import scipy | |
from sets import Set | |
""" | |
def optimize(set1, set2, conjSet, defSet, dis): | |
currentMin = 999999 | |
consideredMin = calcScore(set1, set2, conjSet, dis) | |
bestSwapWord = "" | |
# Calculate the best word to remove until no moves lessen the function | |
i = 1 | |
while( currentMin > consideredMin): | |
print i | |
currentMin = consideredMin | |
for word in set1: | |
set1.remove(word) | |
set2.append(word) | |
test = calcScore(set1, set2, conjSet, dis) | |
set2.remove(word) | |
set1.append(word) | |
if test < consideredMin: | |
consideredMin = test | |
bestSwapWord = word | |
for word in set2: | |
set2.remove(word) | |
set1.append(word) | |
test = calcScore(set1, set2, conjSet, dis) | |
set1.remove(word) | |
set2.append(word) | |
if test < consideredMin: | |
consideredMin = test | |
bestSwapWord = word | |
if bestSwapWord in set1: | |
set1.remove(bestSwapWord) | |
set2.append(bestSwapWord) | |
else: | |
set2.remove(bestSwapWord) | |
set1.append(bestSwapWord) | |
i = i + 1 | |
# Return the optimized sets | |
return set1, set2 | |
""" | |
def optimize2(set1, set2, conjSet, defSet, dis):
    """Greedily move one word at a time between two clusters to lower the score.

    The objective is calcScore(set1) + calcScore(set2). On every pass each
    word of both clusters is tried as a candidate move (scored with calcSwap)
    and the single best improving move is applied. The loop stops once a pass
    no longer lowers the objective.

    Args:
        set1: list of words in the first cluster; modified in place.
        set2: list of words in the second cluster; modified in place.
        conjSet: conjunction dictionary, forwarded to the scoring helpers.
        defSet: unused; kept so existing callers keep working.
        dis: dissimilarity matrix, forwarded to the scoring helpers.

    Returns:
        The tuple (set1, set2) after optimization (the same list objects).
    """
    currentMin = 999999
    consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
    i = 1
    while currentMin > consideredMin:
        t1 = time.time()
        currentMin = consideredMin
        currentS1 = calcScore(set1, conjSet, dis)
        currentS2 = calcScore(set2, conjSet, dis)
        consideredMin = currentS1 + currentS2

        # Reset on every pass. Otherwise a pass that finds no improving move
        # would re-apply the previous pass's word and silently undo that move.
        bestSwapWord = None
        for word in set1:
            test = calcSwap(word, set1, set2, currentS1, currentS2, conjSet, dis)
            if test < consideredMin:
                consideredMin = test
                bestSwapWord = word
        for word in set2:
            test = calcSwap(word, set2, set1, currentS2, currentS1, conjSet, dis)
            if test < consideredMin:
                consideredMin = test
                bestSwapWord = word

        # Apply the winning move, if any, only after both scans are finished
        # so the lists are never modified while being iterated.
        if bestSwapWord is not None:
            if bestSwapWord in set1:
                set1.remove(bestSwapWord)
                set2.append(bestSwapWord)
            elif bestSwapWord in set2:
                set2.remove(bestSwapWord)
                set1.append(bestSwapWord)

        t2 = time.time()
        print("Iteration: %d\tScore: %f\tTime: %f sec" % (i, consideredMin, t2 - t1))
        i += 1
    # Return the optimized sets
    return set1, set2
def constraintSwap(set1, set2, conjSet, defSet, dis):
    """Move words that are, on average, closer to the other cluster.

    For each word the mean dissimilarity (via getDis) to the other members of
    its own cluster is compared with the mean dissimilarity to the members of
    the opposite cluster. If the word is more dissimilar to its own cluster,
    it is moved. set1 is processed first, then set2 (which by then also
    contains any words just moved out of set1).

    Args:
        set1: list of words in the first cluster; modified in place.
        set2: list of words in the second cluster; modified in place.
        conjSet: unused here; getDis reads the module-level conjSet.
        defSet: unused; kept so existing callers keep working.
        dis: unused here; getDis reads the module-level dis.

    Returns:
        The tuple (set1, set2) (the same list objects).
    """
    _relocateMisfits(set1, set2)
    _relocateMisfits(set2, set1)
    return set1, set2


def _relocateMisfits(source, target):
    """Move every word of source that fits target better into target."""
    # Iterate over a snapshot: source shrinks as words are moved out, and
    # removing from a list while iterating over it skips elements.
    for word in list(source):
        # Both means need a non-empty population. A word that is alone in its
        # cluster, or that has no opposite cluster to compare with, stays put
        # instead of triggering a division by zero.
        if len(source) < 2 or not target:
            continue
        stay = sum(getDis(word, other) for other in source if other != word)
        stay = stay / float(len(source) - 1)
        swap = sum(getDis(word, other) for other in target if other != word)
        swap = swap / float(len(target))
        if stay > swap:
            source.remove(word)
            target.append(word)
def calcScore(set, conjSet, dis):
    """Return the summed pairwise dissimilarity of a cluster divided by its size.

    Every unordered pair of positions (i, j) with j >= i is counted once, so
    each word is also paired with itself.

    Args:
        set: list of words forming the cluster. (The name shadows the
            builtin; it is kept because it is part of the signature.)
        conjSet: unused here; getDis reads the module-level conjSet.
        dis: unused here; getDis reads the module-level dis.

    Returns:
        The score as a float, or 0.0 for an empty cluster.
    """
    size = len(set)
    if size == 0:
        # No pairs to score; also avoids dividing by zero.
        return 0.0
    score = 0
    for i in range(size):
        w1 = set[i]
        for j in range(i, size):
            score += getDis(w1, set[j])
    return score / float(size)
def calcSwap(word, currSet, opSet, currentCount, otherCount, conjSet, dis):
    """Estimate the total score if word moved from currSet to opSet.

    The per-cluster scores are updated incrementally instead of being
    recomputed: word's dissimilarities to its current cluster are subtracted
    from that cluster's un-normalized score, and its dissimilarities to the
    opposite cluster are added to the other one. Each is then re-normalized
    by the new cluster size.

    Args:
        word: the candidate word; expected to be a member of currSet.
        currSet: list of words in the cluster word currently belongs to.
        opSet: list of words in the opposite cluster.
        currentCount: current score of currSet.
        otherCount: current score of opSet.
        conjSet: unused here; getDis reads the module-level conjSet.
        dis: unused here; getDis reads the module-level dis.

    Returns:
        The estimated sum of both cluster scores after the move, or positive
        infinity when the move would leave currSet empty.
    """
    remaining = len(currSet) - 1
    if remaining == 0:
        # Moving the only member would empty the cluster and leave its score
        # undefined (division by zero); rate the move as never worthwhile.
        return float("inf")

    leaving = sum(getDis(word, w) for w in currSet if word != w)
    joining = sum(getDis(word, w) for w in opSet if word != w)

    newCurrent = (currentCount * len(currSet) - leaving) / float(remaining)
    newOther = (otherCount * len(opSet) + joining) / float(len(opSet) + 1)
    return newCurrent + newOther
def vectorize(conjSet, defSet):
    """Build the square dissimilarity matrix for the words in defSet.

    Every cell starts at 0.5. For each word, the cells in its row that belong
    to its "similar" words are set to 0 and those of its "dissimilar" words
    are set to 1 (dissimilar is written last, so it wins on conflict).

    Args:
        conjSet: dict mapping word -> (index, similar words, dissimilar words).
        defSet: the words to process; the matrix has len(defSet) rows/columns.

    Returns:
        A numpy array of shape (len(defSet), len(defSet)).
    """
    size = len(defSet)
    matrix = numpy.full((size, size), 0.5)
    for word in defSet:
        entry = conjSet[word]
        alike = entry[1]
        unlike = entry[2]
        for other in alike:
            matrix[conjSet[word][0]][conjSet[other][0]] = 0
        for other in unlike:
            matrix[conjSet[word][0]][conjSet[other][0]] = 1
    return matrix
def genSets():
    """Read 'words.txt' and split its words into positive and negative sets.

    Each line is split on single spaces: the first field is the word, and the
    first character of the second field selects the set ('p' -> positive,
    'n' -> negative). Lines with any other marker, or without a second field
    (e.g. blank lines), are ignored.

    Returns:
        A tuple (positive, negative) of built-in sets of words.

    Raises:
        IOError: if 'words.txt' cannot be opened.
    """
    positive = set()
    negative = set()
    # The context manager closes the file even if parsing raises.
    with open('words.txt', 'r') as f:
        for pair in f:
            current = pair.split(' ')
            if len(current) < 2 or not current[1]:
                # Nothing to classify on this line.
                continue
            marker = current[1][0]
            if marker == 'p':
                positive.add(current[0])
            elif marker == 'n':
                negative.add(current[0])
    return positive, negative
def getConj():
    """Read 'movieconj.txt' and build the conjunction dictionary.

    Each line is split on single spaces into (word1, word2, conjunction, ...).
    A "but" conjunction records the two words as mutually dissimilar; any
    other conjunction records them as mutually similar. Lines with fewer than
    three fields are skipped.

    Returns:
        dict mapping word -> (index, similar, dissimilar), where index is the
        order in which the word was first seen (0, 1, 2, ...) and similar /
        dissimilar are built-in sets of words.

    Raises:
        IOError: if 'movieconj.txt' cannot be opened.
    """
    d = dict()
    with open('movieconj.txt', 'r') as f:
        for line in f:
            current = line.split(' ')
            if len(current) < 3:
                continue
            first, second = current[0], current[1]
            # strip() so that a conjunction that is the last token on the line
            # (and therefore carries the newline) is still recognized.
            slot = 2 if current[2].strip() == "but" else 1
            _addRelation(d, first, second, slot)
            _addRelation(d, second, first, slot)
    return d


def _addRelation(d, word, other, slot):
    """Record other in word's similar (slot 1) or dissimilar (slot 2) set."""
    if word not in d:
        # Indices are handed out in first-seen order, so the next free index
        # is simply the number of words registered so far.
        d[word] = (len(d), set(), set())
    d[word][slot].add(other)
def findFrequency(set1, set2):
    """Count how many tokens of the Brown corpus occur in each word collection.

    Args:
        set1: collection of (hashable) words.
        set2: collection of (hashable) words.

    Returns:
        A tuple (set1Freq, set2Freq): the number of corpus tokens found in
        set1 and in set2 respectively.
    """
    # Build the lookup sets once: the inputs may be lists, and a list
    # membership test repeated for every corpus token is needlessly slow.
    members1 = frozenset(set1)
    members2 = frozenset(set2)
    set1Freq = 0
    set2Freq = 0
    for word in brown.words():
        if word in members1:
            set1Freq += 1
        if word in members2:
            set2Freq += 1
    return set1Freq, set2Freq
def getDis(a, b):
    """Look up the dissimilarity between words a and b.

    Reads the module-level conjSet (word -> tuple whose first item is the
    word's matrix index) and the module-level dis matrix.

    Args:
        a: word selecting the row.
        b: word selecting the column.

    Returns:
        The value stored at dis[index of a][index of b].
    """
    row = conjSet[a][0]
    col = conjSet[b][0]
    return dis[row][col]
def conjunctionData(set1, set2):
    """Print how well two word sets agree with the pairs in 'movieconj.txt'.

    Each line is split on single spaces into (word1, word2, conjunction, ...).
    For "and", "or" and "nor" a pair counts as matching when both words are in
    the same set; for "but" it matches when the words are in opposite sets.
    Lines with fewer than three fields are skipped; lines with any other
    conjunction only count towards the overall total.

    Args:
        set1: collection of words forming the first set.
        set2: collection of words forming the second set.

    Returns:
        None. The statistics are written to standard output.

    Raises:
        IOError: if 'movieconj.txt' cannot be opened.
    """
    # conjunction -> [number of pairs seen, number of pairs that match]
    tallies = {"and": [0, 0], "or": [0, 0], "nor": [0, 0], "but": [0, 0]}
    totalConj = 0

    # The file is only read, so open it read-only (no write permission needed).
    with open('movieconj.txt', 'r') as f:
        for line in f:
            current = line.split(' ')
            if len(current) < 3:
                continue
            totalConj += 1
            # strip() so a conjunction ending the line is still recognized.
            conj = current[2].strip()
            if conj not in tallies:
                continue
            first, second = current[0], current[1]
            tallies[conj][0] += 1
            if conj == "but":
                matches = ((first in set1 and second in set2) or
                           (first in set2 and second in set1))
            else:
                matches = ((first in set1 and second in set1) or
                           (first in set2 and second in set2))
            if matches:
                tallies[conj][1] += 1

    print("Total Conjunctions: %d" % totalConj)
    print("Total ands: %d \n Ands in same set: %d" % (tallies["and"][0], tallies["and"][1]))
    print("Total ors: %d \n Ors in same set: %d" % (tallies["or"][0], tallies["or"][1]))
    print("Total nors: %d \n Nors in same set: %d" % (tallies["nor"][0], tallies["nor"][1]))
    print("Total buts: %d \n Buts in opposite sets: %d" % (tallies["but"][0], tallies["but"][1]))
def returnCBLexicon(numIterations=3):
    """Cluster the conjunction words into two groups and return a polarity lexicon.

    The conjunction dictionary and its dissimilarity matrix are built and
    published as the module-level globals conjSet and dis (getDis reads
    them). The clustering is then restarted numIterations times from a random
    split; each split is refined with optimize2 and constraintSwap, and the
    split with the lowest total calcScore is kept.

    The larger cluster of the best split is labeled positive (1) and the
    other negative (-1); on a tie the second cluster is labeled positive.

    Args:
        numIterations: number of random restarts (default 3).

    Returns:
        dict mapping each word to 1 (positive) or -1 (negative).
    """
    global dis, conjSet

    # Create dictionary (adj, (index, similar, dissimilar))
    conjSet = getConj()
    # A real list is needed: random.sample requires a sequence.
    defSet = list(conjSet.keys())
    # Generate dissimilarity matrix
    dis = vectorize(conjSet, defSet)

    bestSet1 = []
    bestSet2 = []
    bestScore = 999999
    for _ in range(numIterations):
        setsize = random.randint(len(defSet) // 4, len(defSet) * 3 // 4)
        set1 = random.sample(defSet, setsize)
        chosen = set(set1)
        set2 = [x for x in defSet if x not in chosen]
        # Optimize objective function
        (set1, set2) = optimize2(set1, set2, conjSet, defSet, dis)
        # Check the constraint
        (set1, set2) = constraintSwap(set1, set2, conjSet, defSet, dis)
        score = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
        print("*** score = %f, bestScore = %f ***" % (score, bestScore))
        if score < bestScore:
            bestSet1 = set1
            bestSet2 = set2
            bestScore = score

    # Label the best split found, not merely the last one tried. The two
    # labels are assigned together so both clusters always end up in the
    # lexicon, whichever of them is larger.
    if len(bestSet1) > len(bestSet2):
        positive, negative = bestSet1, bestSet2
    else:
        positive, negative = bestSet2, bestSet1

    # Generate Dictionary in correct format
    lexicon = {word: 1 for word in positive}
    lexicon.update({word: -1 for word in negative})
    return lexicon
# Build the lexicon and write it to 'cblex.txt', one "word, polarity" line per
# entry. The context manager closes the file even if a write fails.
lex = returnCBLexicon()
with open("cblex.txt", "w") as f:
    for key, polarity in lex.items():
        f.write("%s, %d\n" % (key, polarity))