# cblexicon.py

from __future__ import division
import math
import nltk
from nltk.corpus import wordnet as wn
from collections import Counter
import numpy
from nltk.corpus import movie_reviews
import nltk.stem
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
from nltk.classify import NaiveBayesClassifier
import random
from nltk.stem import *

from sets import Set

class cblexicon:
    """Split conjunction-linked words into two clusters.

    ``process`` reads word pairs from a conjunction file, turns them into a
    pairwise dissimilarity matrix (0 for pairs marked similar, 1 for pairs
    marked dissimilar, 0.5 for every other pair) and then repeatedly moves
    each word to the cluster whose average dissimilarity to it is lower.
    """

    def process(self, words_path='words.txt', conj_path='conj.txt',
                iterations=10):
        """Run the whole pipeline and print the size of the second cluster.

        Args:
            words_path: Seed lexicon file, one ``word tag`` pair per line.
            conj_path: Conjunction file, ``word1 word2 conjunction`` per line.
            iterations: Number of reassignment passes over both clusters.

        Returns:
            None. The final size of the second cluster is printed.
        """
        # Get the Data
        # NOTE: the corpus splits, the sampled seed words and the stop words
        # are currently only referenced by the disabled classifier experiment
        # at the end of this method.
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])

        # Generate equally sized positive and negative initial sets.
        # random.sample needs a sequence (sets are rejected by newer Python
        # versions), so the sets are turned into sorted lists first.
        seed_pos, seed_neg = self._genSets(words_path)
        sample_size = min(len(seed_pos), len(seed_neg))
        positive = random.sample(sorted(seed_pos), sample_size)
        negative = random.sample(sorted(seed_neg), sample_size)

        # Clustering Setup
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # Dictionary word -> (index, similar, dissimilar)
        conjSet = self._getConj(conj_path)

        # List of all words; list() keeps the slicing below valid when
        # dict.keys() returns a view instead of a list.
        defSet = list(conjSet)

        # Generate dissimilarity matrix
        dis = self._vectorize(conjSet, defSet)

        # Start from an arbitrary split into two halves, then refine it.
        middle = len(defSet) // 2
        set1 = defSet[middle:]
        set2 = defSet[:middle]
        for _ in range(iterations):
            set1, set2 = self._cluster(set1, set2, conjSet, dis)

        print(len(set2))

        # Disabled experiment: can we classify and then run bag of words?
        #negfeats = [(self._word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        #posfeats = [(self._word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
        #testfeats = negfeats[500:] + posfeats[500:]
        #classifier1 = NaiveBayesClassifier.train(trainfeats)
        #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))

    @staticmethod
    def _cluster(set1, set2, conjSet, dis):
        """Do one reassignment pass over both clusters.

        A word changes cluster only when its score against the other cluster
        is strictly lower than against its own. Both lists are modified in
        place and also returned as ``(set1, set2)``.
        """
        # Iterate over snapshots: removing from a list while looping over that
        # same list makes the loop skip the element after every removal.
        for word in list(set1):
            score1 = cblexicon._calcScore(word, set1, conjSet, dis)
            score2 = cblexicon._calcScore(word, set2, conjSet, dis)
            if score2 < score1:
                print("swap")
                set1.remove(word)
                set2.append(word)
        # The snapshot of set2 includes the words that were just moved in.
        for word in list(set2):
            score1 = cblexicon._calcScore(word, set1, conjSet, dis)
            score2 = cblexicon._calcScore(word, set2, conjSet, dis)
            if score1 < score2:
                set2.remove(word)
                set1.append(word)
        return set1, set2

    @staticmethod
    def _calcScore(curr, members, conjSet, dis):
        """Return the summed dissimilarity of ``curr`` to ``members`` divided
        by ``len(members)``.

        ``curr`` itself is skipped in the sum but still counted in the
        divisor when it is a member. An empty cluster scores 0.0 instead of
        raising ZeroDivisionError.
        """
        if not members:
            return 0.0
        row = dis[conjSet[curr][0]]
        total = 0
        for word in members:
            if word != curr:
                total = total + row[conjSet[word][0]]
        return total * (1.0 / len(members))

    @staticmethod
    def _normalize_word(word):
        """Return the English Snowball stem of ``word``."""
        return SnowballStemmer("english").stem(word)

    @staticmethod
    def _vectorize(conjSet, defSet):
        """Build the square dissimilarity matrix for the words in ``defSet``.

        Rows and columns are addressed by the index stored in ``conjSet``.
        Every cell starts at 0.5; similar pairs are set to 0 and dissimilar
        pairs to 1 (dissimilar wins when a pair is listed as both).
        """
        dis = numpy.zeros((len(defSet), len(defSet)))
        dis.fill(.5)
        for word in defSet:
            row, similar, dissimilar = conjSet[word]
            for sim in similar:
                dis[row][conjSet[sim][0]] = 0
            for other in dissimilar:
                dis[row][conjSet[other][0]] = 1
        return dis

    @staticmethod
    def _word_feats(words):
        """Return a bag-of-words feature dict mapping each word to True."""
        return dict.fromkeys(words, True)

    @staticmethod
    def _genSets(path='words.txt'):
        """Read the seed lexicon and return ``(positive, negative)`` sets.

        Each line holds a word followed by a tag; a tag starting with ``p``
        puts the word into the positive set, one starting with ``n`` into the
        negative set. Lines with fewer than two fields are skipped.
        """
        positive = set()
        negative = set()
        # Read-only mode is enough, and the context manager closes the file.
        with open(path, 'r') as handle:
            for line in handle:
                fields = line.split()
                if len(fields) < 2:
                    continue
                tag = fields[1][0]
                if tag == 'p':
                    positive.add(fields[0])
                elif tag == 'n':
                    negative.add(fields[0])
        return positive, negative

    @staticmethod
    def _getConj(path='conj.txt'):
        """Read the conjunction file into ``word -> (index, similar, dissimilar)``.

        Each line holds two words and a conjunction. ``but`` records the two
        words as dissimilar to each other, any other conjunction as similar.
        Indices are handed out in order of first appearance, starting at 0.
        Lines with fewer than three fields are skipped.
        """
        d = {}

        def link(word, other, slot):
            # A new word gets the next free index, which equals len(d).
            if word not in d:
                d[word] = (len(d), set(), set())
            d[word][slot].add(other)

        with open(path, 'r') as handle:
            for line in handle:
                # split() instead of split(' '): the latter leaves the newline
                # glued to the last field, so a line-final "but" never matched.
                fields = line.split()
                if len(fields) < 3:
                    continue
                slot = 2 if fields[2] == "but" else 1
                link(fields[0], fields[1], slot)
                link(fields[1], fields[0], slot)
        return d

# Script entry: build the lexicon object and run the whole pipeline right away.
_lexicon = cblexicon()
_lexicon.process()
	from __future__ import division
	import math
	import nltk
	from nltk.corpus import wordnet as wn
	from collections import Counter
	import numpy
	from nltk.corpus import movie_reviews
	import nltk.stem
	from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
	from nltk.classify import NaiveBayesClassifier
	import random
	from nltk.stem import *

	from sets import Set

	class cblexicon:
	    """Split conjunction-linked words into two clusters.

	    ``process`` reads word pairs from a conjunction file, turns them into a
	    pairwise dissimilarity matrix (0 for pairs marked similar, 1 for pairs
	    marked dissimilar, 0.5 for every other pair) and then repeatedly moves
	    each word to the cluster whose average dissimilarity to it is lower.
	    """

	    def process(self, words_path='words.txt', conj_path='conj.txt',
	                iterations=10):
	        """Run the whole pipeline and print the size of the second cluster.

	        Args:
	            words_path: Seed lexicon file, one ``word tag`` pair per line.
	            conj_path: Conjunction file, ``word1 word2 conjunction`` per line.
	            iterations: Number of reassignment passes over both clusters.

	        Returns:
	            None. The final size of the second cluster is printed.
	        """
	        # Get the Data
	        # NOTE: the corpus splits, the sampled seed words and the stop words
	        # are currently only referenced by the disabled classifier experiment
	        # at the end of this method.
	        negids = movie_reviews.fileids('neg')
	        posids = movie_reviews.fileids('pos')
	        training = set(negids[:500] + posids[:500])
	        testing = set(negids[500:] + posids[500:])

	        # Generate equally sized positive and negative initial sets.
	        # random.sample needs a sequence (sets are rejected by newer Python
	        # versions), so the sets are turned into sorted lists first.
	        seed_pos, seed_neg = self._genSets(words_path)
	        sample_size = min(len(seed_pos), len(seed_neg))
	        positive = random.sample(sorted(seed_pos), sample_size)
	        negative = random.sample(sorted(seed_neg), sample_size)

	        # Clustering Setup
	        stopwords = set(nltk.corpus.stopwords.words('english'))
	        # Dictionary word -> (index, similar, dissimilar)
	        conjSet = self._getConj(conj_path)

	        # List of all words; list() keeps the slicing below valid when
	        # dict.keys() returns a view instead of a list.
	        defSet = list(conjSet)

	        # Generate dissimilarity matrix
	        dis = self._vectorize(conjSet, defSet)

	        # Start from an arbitrary split into two halves, then refine it.
	        middle = len(defSet) // 2
	        set1 = defSet[middle:]
	        set2 = defSet[:middle]
	        for _ in range(iterations):
	            set1, set2 = self._cluster(set1, set2, conjSet, dis)

	        print(len(set2))

	        # Disabled experiment: can we classify and then run bag of words?
	        #negfeats = [(self._word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
	        #posfeats = [(self._word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
	        #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
	        #testfeats = negfeats[500:] + posfeats[500:]
	        #classifier1 = NaiveBayesClassifier.train(trainfeats)
	        #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))

	    @staticmethod
	    def _cluster(set1, set2, conjSet, dis):
	        """Do one reassignment pass over both clusters.

	        A word changes cluster only when its score against the other cluster
	        is strictly lower than against its own. Both lists are modified in
	        place and also returned as ``(set1, set2)``.
	        """
	        # Iterate over snapshots: removing from a list while looping over that
	        # same list makes the loop skip the element after every removal.
	        for word in list(set1):
	            score1 = cblexicon._calcScore(word, set1, conjSet, dis)
	            score2 = cblexicon._calcScore(word, set2, conjSet, dis)
	            if score2 < score1:
	                print("swap")
	                set1.remove(word)
	                set2.append(word)
	        # The snapshot of set2 includes the words that were just moved in.
	        for word in list(set2):
	            score1 = cblexicon._calcScore(word, set1, conjSet, dis)
	            score2 = cblexicon._calcScore(word, set2, conjSet, dis)
	            if score1 < score2:
	                set2.remove(word)
	                set1.append(word)
	        return set1, set2

	    @staticmethod
	    def _calcScore(curr, members, conjSet, dis):
	        """Return the summed dissimilarity of ``curr`` to ``members`` divided
	        by ``len(members)``.

	        ``curr`` itself is skipped in the sum but still counted in the
	        divisor when it is a member. An empty cluster scores 0.0 instead of
	        raising ZeroDivisionError.
	        """
	        if not members:
	            return 0.0
	        row = dis[conjSet[curr][0]]
	        total = 0
	        for word in members:
	            if word != curr:
	                total = total + row[conjSet[word][0]]
	        return total * (1.0 / len(members))

	    @staticmethod
	    def _normalize_word(word):
	        """Return the English Snowball stem of ``word``."""
	        return SnowballStemmer("english").stem(word)

	    @staticmethod
	    def _vectorize(conjSet, defSet):
	        """Build the square dissimilarity matrix for the words in ``defSet``.

	        Rows and columns are addressed by the index stored in ``conjSet``.
	        Every cell starts at 0.5; similar pairs are set to 0 and dissimilar
	        pairs to 1 (dissimilar wins when a pair is listed as both).
	        """
	        dis = numpy.zeros((len(defSet), len(defSet)))
	        dis.fill(.5)
	        for word in defSet:
	            row, similar, dissimilar = conjSet[word]
	            for sim in similar:
	                dis[row][conjSet[sim][0]] = 0
	            for other in dissimilar:
	                dis[row][conjSet[other][0]] = 1
	        return dis

	    @staticmethod
	    def _word_feats(words):
	        """Return a bag-of-words feature dict mapping each word to True."""
	        return dict.fromkeys(words, True)

	    @staticmethod
	    def _genSets(path='words.txt'):
	        """Read the seed lexicon and return ``(positive, negative)`` sets.

	        Each line holds a word followed by a tag; a tag starting with ``p``
	        puts the word into the positive set, one starting with ``n`` into the
	        negative set. Lines with fewer than two fields are skipped.
	        """
	        positive = set()
	        negative = set()
	        # Read-only mode is enough, and the context manager closes the file.
	        with open(path, 'r') as handle:
	            for line in handle:
	                fields = line.split()
	                if len(fields) < 2:
	                    continue
	                tag = fields[1][0]
	                if tag == 'p':
	                    positive.add(fields[0])
	                elif tag == 'n':
	                    negative.add(fields[0])
	        return positive, negative

	    @staticmethod
	    def _getConj(path='conj.txt'):
	        """Read the conjunction file into ``word -> (index, similar, dissimilar)``.

	        Each line holds two words and a conjunction. ``but`` records the two
	        words as dissimilar to each other, any other conjunction as similar.
	        Indices are handed out in order of first appearance, starting at 0.
	        Lines with fewer than three fields are skipped.
	        """
	        d = {}

	        def link(word, other, slot):
	            # A new word gets the next free index, which equals len(d).
	            if word not in d:
	                d[word] = (len(d), set(), set())
	            d[word][slot].add(other)

	        with open(path, 'r') as handle:
	            for line in handle:
	                # split() instead of split(' '): the latter leaves the newline
	                # glued to the last field, so a line-final "but" never matched.
	                fields = line.split()
	                if len(fields) < 3:
	                    continue
	                slot = 2 if fields[2] == "but" else 1
	                link(fields[0], fields[1], slot)
	                link(fields[1], fields[0], slot)
	        return d



	# Script entry: build the lexicon object and run the whole pipeline right away.
	_lexicon = cblexicon()
	_lexicon.process()