cblexicon.py

from __future__ import division
import math
import nltk
from nltk.corpus import wordnet as wn
from collections import Counter
import numpy
from nltk.corpus import movie_reviews
import nltk.stem
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
from nltk.classify import NaiveBayesClassifier
import random
from nltk.stem import *

from sets import Set

class cblexicon:

    def process(self):

        def optimize(set1,set2,conjSet,defSet,dis):
            i = 0
            currentMin = 999999
            consideredMin = calcScore(set1,set2,conjSet,dis)
            bestSwapWord = ""
            # Calculate the best word to remove until no moves lessen the function
            while( currentMin > consideredMin):
                print i
                i = i + 1
                currentMin = consideredMin
                for word in set1:
                    set1.remove(word)
                    set2.append(word)
                    test = calcScore(set1,set2,conjSet,dis)
                    set2.remove(word)
                    set1.append(word)
                    if (test < consideredMin):
                        consideredMin = test
                        bestSwapWord = word
                for word in set2:
                    set2.remove(word)
                    set1.append(word)
                    test = calcScore(set1,set2,conjSet,dis)
                    set1.remove(word)
                    set2.append(word)
                    if (test < consideredMin):
                        consideredMin = test
                        bestSwapWord = word

                if(bestSwapWord in set1):
                    set1.remove(bestSwapWord)
                    set2.append(bestSwapWord)
                else:
                    set2.remove(bestSwapWord)
                    set1.append(bestSwapWord)
            # Return the optimized sets
            return set1,set2

        def optimize2(set1,set2,conjSet,defSet,dis):
            i = 0
            currentMin = 999999
            consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
            bestSwapWord = None
            print consideredMin
            # Calculate the best word to remove until no moves lessen the function
            while( currentMin > consideredMin):
                print "Iteration #%d: (%d, %d)" % (i, len(set1), len(set2))
                currentMin = consideredMin
                currentS1 = calcScore(set1,conjSet,dis)
                currentS2 = calcScore(set2,conjSet,dis)
                consideredMin = currentS1 + currentS2 #
                for word in set1:
                    test = calcSwap(word,set1,set2,currentS1,currentS2,conjSet,dis)
                    if (test < consideredMin):
                        print "found1"
                        consideredMin = test
                        bestSwapWord = word
                for word in set2:
                    test = calcSwap(word,set2,set1,currentS2,currentS1,conjSet,dis)
                    if (test < consideredMin):
                        print "found2"
                        consideredMin = test
                        bestSwapWord = word
                print "New min: %f" % consideredMin

                if(bestSwapWord in set1):
                    set1.remove(bestSwapWord)
                    set2.append(bestSwapWord)
                elif(bestSwapWord in set2):
                    set2.remove(bestSwapWord)
                    set1.append(bestSwapWord)
                i = i + 1

            # Return the optimized sets
            return set1,set2

        def cluster(set1,set2,conjSet,defSet,dis):
            for word in set1:
                score1 = calcScore(word,set1,conjSet,dis)
                #print "Score 1: %f" % score1
                score2 = calcScore(word,set2,conjSet,dis)
                #print "Score 2: %f" % score2
                if score2 < score1:
                    print "swap"
                    set1.remove(word)
                    set2.append(word)
            for word in set2:
                score1 = calcScore(word,set1,conjSet,dis)
                score2 = calcScore(word,set2,conjSet,dis)
                if score1 < score2:
                    set2.remove(word)
                    set1.append(word)
            return set1,set2

        def calcScore(set,conjSet,dis):
            score = 0
            for i in range(len(set)):
                w1 = set[i]
                for j in range(i, len(set)):
                    w2 = set[j]
                    cats = dis[conjSet[w1][0]][conjSet[w2][0]]
                    score = score + cats
            return score / len(set)

        def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
            score1 = 0
            score2 = 0
            for w in currSet:
                if word != w:
                    cats = dis[conjSet[word][0]][conjSet[w][0]]
                    score1 = score1 + cats
            currentCount = ((currentCount* len(currSet)) - score1 )/(len(currSet)-1)

            #for word in set2:
            for w in opSet:
                if word != w:
                    cats = dis[conjSet[word][0]][conjSet[w][0]]
                    score2 = score2 + cats
            otherCount = ((otherCount* len(opSet)) + score2 )/(len(opSet)+1)

            return currentCount + otherCount

        def normalize_word(word):
            return SnowballStemmer("english").stem(word)

        def vectorize(conjSet,defSet):
            dis = numpy.zeros((len(defSet),len(defSet)))
            dis.fill(.5)
            for word in defSet:
                similar = conjSet[word][1]
                dissimilar = conjSet[word][2]
                for sim in similar:
                    dis[conjSet[word][0]][conjSet[sim][0]] = 0
                for d in dissimilar:
                    dis[conjSet[word][0]][conjSet[d][0]] = 1
            return dis

        def word_feats(words):
            return dict([(word, True) for word in words])

        def genSets():
            f = open('words.txt', 'r+')
            content = f.readlines()
            positive = Set([])
            negative = Set([])

            for pair in content:
                current = pair.split(' ')
                if (current[1][0] == 'p'):
                    positive.add(current[0])
                elif (current[1][0] == 'n'):
                    negative.add(current[0])

            return positive,negative

        def getConj():
            # Set up the tuple (index, similar, dissimilar)
            f = open('conj.txt', 'r+')
            content = f.readlines()
            d = dict()
            i = 0
            for line in content:
                current = line.split(' ')
                if current[2] == "but":
                    if current[0] in d:
                        d[current[0]][2].add(current[1])
                    else:
                        d[current[0]] = (i,Set(),Set([current[1]]))
                        i = i+1
                    if current[1] in d:
                        d[current[1]][2].add(current[0])
                    else:
                        d[current[1]] = (i,Set(),Set([current[0]]))
                        i = i+1
                else:
                    if current[0] in d:
                        d[current[0]][1].add(current[1])
                    else:
                        d[current[0]] = (i,Set([current[1]]),Set())
                        i = i+1
                    if current[1] in d:
                        d[current[1]][1].add(current[0])
                    else:
                        d[current[1]] = (i,Set([current[0]]),Set())
                        i = i+1
            return d

        #Get the Data#
        """
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])
        """
        # Generate positive and negative initial sets
        sets = genSets()
        positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
        negative = random.sample(sets[1], min(len(sets[0]), len(sets[1])))

        # Clustering Setup
        stopwords = set(nltk.corpus.stopwords.words('english'))

        # Create dictionary (adj, (index, similar, dissimilar))
        conjSet = getConj()

        # Create list out of all keys of conjSet
        defSet = conjSet.keys()

        # Generate dissimilarity matrix
        dis = vectorize(conjSet,defSet)

        # Its Cluster time
        set1 = defSet[len(defSet)//2:]
        set2 = defSet[:len(defSet)//2]
        """
        set1 = random.sample(defSet, len(defSet)//4)
        set2 = [x for x in defSet if x not in set1]
        """
        # Optimize objective function
        sets = optimize2(set1,set2,conjSet,defSet,dis)
        set1 = sets[0]
        set2 = sets[1]

        print(set1)
        print(set2)
        f1 = open('set1.txt', 'w+')
        f2 = open('set2.txt', 'w+')
        for word in set1:
            f1.write(word + "\n")
        for word in set2:
            f2.write(word + "\n")
        f1.close()
        f2.close()

        # Can we classify and then run bag of words?
        #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        #posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
        #testfeats = negfeats[500:] + posfeats[500:]
        #classifier1 = NaiveBayesClassifier.train(trainfeats)
        #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))


cblexicon().process()
	from __future__ import division
	import math
	import nltk
	from nltk.corpus import wordnet as wn
	from collections import Counter
	import numpy
	from nltk.corpus import movie_reviews
	import nltk.stem
	from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
	from nltk.classify import NaiveBayesClassifier
	import random
	from nltk.stem import *

	from sets import Set

	class cblexicon:

	def process(self):

	def optimize(set1,set2,conjSet,defSet,dis):
	i = 0
	currentMin = 999999
	consideredMin = calcScore(set1,set2,conjSet,dis)
	bestSwapWord = ""
	# Calculate the best word to remove until no moves lessen the function
	while( currentMin > consideredMin):
	print i
	i = i + 1
	currentMin = consideredMin
	for word in set1:
	set1.remove(word)
	set2.append(word)
	test = calcScore(set1,set2,conjSet,dis)
	set2.remove(word)
	set1.append(word)
	if (test < consideredMin):
	consideredMin = test
	bestSwapWord = word
	for word in set2:
	set2.remove(word)
	set1.append(word)
	test = calcScore(set1,set2,conjSet,dis)
	set1.remove(word)
	set2.append(word)
	if (test < consideredMin):
	consideredMin = test
	bestSwapWord = word

	if(bestSwapWord in set1):
	set1.remove(bestSwapWord)
	set2.append(bestSwapWord)
	else:
	set2.remove(bestSwapWord)
	set1.append(bestSwapWord)
	# Return the optimized sets
	return set1,set2

	def optimize2(set1,set2,conjSet,defSet,dis):
	i = 0
	currentMin = 999999
	consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis)
	bestSwapWord = None
	print consideredMin
	# Calculate the best word to remove until no moves lessen the function
	while( currentMin > consideredMin):
	print "Iteration #%d: (%d, %d)" % (i, len(set1), len(set2))
	currentMin = consideredMin
	currentS1 = calcScore(set1,conjSet,dis)
	currentS2 = calcScore(set2,conjSet,dis)
	consideredMin = currentS1 + currentS2 #
	for word in set1:
	test = calcSwap(word,set1,set2,currentS1,currentS2,conjSet,dis)
	if (test < consideredMin):
	print "found1"
	consideredMin = test
	bestSwapWord = word
	for word in set2:
	test = calcSwap(word,set2,set1,currentS2,currentS1,conjSet,dis)
	if (test < consideredMin):
	print "found2"
	consideredMin = test
	bestSwapWord = word
	print "New min: %f" % consideredMin

	if(bestSwapWord in set1):
	set1.remove(bestSwapWord)
	set2.append(bestSwapWord)
	elif(bestSwapWord in set2):
	set2.remove(bestSwapWord)
	set1.append(bestSwapWord)
	i = i + 1

	# Return the optimized sets
	return set1,set2

	def cluster(set1,set2,conjSet,defSet,dis):
	for word in set1:
	score1 = calcScore(word,set1,conjSet,dis)
	#print "Score 1: %f" % score1
	score2 = calcScore(word,set2,conjSet,dis)
	#print "Score 2: %f" % score2
	if score2 < score1:
	print "swap"
	set1.remove(word)
	set2.append(word)
	for word in set2:
	score1 = calcScore(word,set1,conjSet,dis)
	score2 = calcScore(word,set2,conjSet,dis)
	if score1 < score2:
	set2.remove(word)
	set1.append(word)
	return set1,set2

	def calcScore(set,conjSet,dis):
	score = 0
	for i in range(len(set)):
	w1 = set[i]
	for j in range(i, len(set)):
	w2 = set[j]
	cats = dis[conjSet[w1][0]][conjSet[w2][0]]
	score = score + cats
	return score / len(set)

	def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis):
	score1 = 0
	score2 = 0
	for w in currSet:
	if word != w:
	cats = dis[conjSet[word][0]][conjSet[w][0]]
	score1 = score1 + cats
	currentCount = ((currentCount* len(currSet)) - score1 )/(len(currSet)-1)

	#for word in set2:
	for w in opSet:
	if word != w:
	cats = dis[conjSet[word][0]][conjSet[w][0]]
	score2 = score2 + cats
	otherCount = ((otherCount* len(opSet)) + score2 )/(len(opSet)+1)

	return currentCount + otherCount

	def normalize_word(word):
	return SnowballStemmer("english").stem(word)

	def vectorize(conjSet,defSet):
	dis = numpy.zeros((len(defSet),len(defSet)))
	dis.fill(.5)
	for word in defSet:
	similar = conjSet[word][1]
	dissimilar = conjSet[word][2]
	for sim in similar:
	dis[conjSet[word][0]][conjSet[sim][0]] = 0
	for d in dissimilar:
	dis[conjSet[word][0]][conjSet[d][0]] = 1
	return dis

	def word_feats(words):
	return dict([(word, True) for word in words])

	def genSets():
	f = open('words.txt', 'r+')
	content = f.readlines()
	positive = Set([])
	negative = Set([])

	for pair in content:
	current = pair.split(' ')
	if (current[1][0] == 'p'):
	positive.add(current[0])
	elif (current[1][0] == 'n'):
	negative.add(current[0])

	return positive,negative

	def getConj():
	# Set up the tuple (index, similar, dissimilar)
	f = open('conj.txt', 'r+')
	content = f.readlines()
	d = dict()
	i = 0
	for line in content:
	current = line.split(' ')
	if current[2] == "but":
	if current[0] in d:
	d[current[0]][2].add(current[1])
	else:
	d[current[0]] = (i,Set(),Set([current[1]]))
	i = i+1
	if current[1] in d:
	d[current[1]][2].add(current[0])
	else:
	d[current[1]] = (i,Set(),Set([current[0]]))
	i = i+1
	else:
	if current[0] in d:
	d[current[0]][1].add(current[1])
	else:
	d[current[0]] = (i,Set([current[1]]),Set())
	i = i+1
	if current[1] in d:
	d[current[1]][1].add(current[0])
	else:
	d[current[1]] = (i,Set([current[0]]),Set())
	i = i+1
	return d

	#Get the Data#
	"""
	negids = movie_reviews.fileids('neg')
	posids = movie_reviews.fileids('pos')
	training = set(negids[:500] + posids[:500])
	testing = set(negids[500:] + posids[500:])
	"""
	# Generate positive and negative initial sets
	sets = genSets()
	positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
	negative = random.sample(sets[1], min(len(sets[0]), len(sets[1])))

	# Clustering Setup
	stopwords = set(nltk.corpus.stopwords.words('english'))

	# Create dictionary (adj, (index, similar, dissimilar))
	conjSet = getConj()

	# Create list out of all keys of conjSet
	defSet = conjSet.keys()

	# Generate dissimilarity matrix
	dis = vectorize(conjSet,defSet)

	# Its Cluster time
	set1 = defSet[len(defSet)//2:]
	set2 = defSet[:len(defSet)//2]
	"""
	set1 = random.sample(defSet, len(defSet)//4)
	set2 = [x for x in defSet if x not in set1]
	"""
	# Optimize objective function
	sets = optimize2(set1,set2,conjSet,defSet,dis)
	set1 = sets[0]
	set2 = sets[1]

	print(set1)
	print(set2)
	f1 = open('set1.txt', 'w+')
	f2 = open('set2.txt', 'w+')
	for word in set1:
	f1.write(word + "\n")
	for word in set2:
	f2.write(word + "\n")
	f1.close()
	f2.close()

	# Can we classify and then run bag of words?
	#negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
	#posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
	#trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
	#testfeats = negfeats[500:] + posfeats[500:]
	#classifier1 = NaiveBayesClassifier.train(trainfeats)
	#print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))



	cblexicon().process()