Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/cblexicon.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
154 lines (130 sloc)
5.29 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import math | |
import nltk | |
from nltk.corpus import wordnet as wn | |
from collections import Counter | |
import numpy | |
from nltk.corpus import movie_reviews | |
import nltk.stem | |
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | |
from nltk.classify import NaiveBayesClassifier | |
import random | |
from nltk.stem import * | |
from sets import Set | |
class cblexicon:
    """Build a two-cluster sentiment lexicon from conjunction evidence.

    Adjectives joined by "and" tend to share polarity; adjectives joined by
    "but" tend to have opposite polarity.  `process` reads that evidence from
    ``conj.txt``, builds a pairwise dissimilarity matrix, and iteratively
    repartitions the vocabulary into two polarity clusters.
    """

    def process(self):
        """Run the full pipeline: load data, build the dissimilarity matrix,
        and refine an initial half/half split for 10 rounds, printing cluster
        sizes as it goes.

        Side effects: reads ``words.txt`` and ``conj.txt`` from the working
        directory, downloads/reads nltk corpora, prints progress to stdout.
        Returns None.
        """

        def cluster(set1, set2, conjSet, defSet, dis):
            # Move each word to whichever cluster it is, on average, LESS
            # dissimilar to.  Iterate over snapshots (list(...)) because the
            # loop mutates the live lists; the original iterated the list it
            # was removing from, which silently skips elements.
            for word in list(set1):
                score1 = calcScore(word, set1, conjSet, dis)
                score2 = calcScore(word, set2, conjSet, dis)
                if score2 < score1:
                    print("swap")
                    set1.remove(word)
                    set2.append(word)
            for word in list(set2):
                score1 = calcScore(word, set1, conjSet, dis)
                score2 = calcScore(word, set2, conjSet, dis)
                if score1 < score2:
                    set2.remove(word)
                    set1.append(word)
            return set1, set2

        def calcScore(curr, members, conjSet, dis):
            # Mean dissimilarity between `curr` and the words in `members`.
            # (Divides by len(members) including `curr` itself, matching the
            # original scoring.)  `members` renamed from `set` to avoid
            # shadowing the builtin.
            score = 0
            for word in members:
                if word != curr:
                    score = score + dis[conjSet[curr][0]][conjSet[word][0]]
            return score * (1.0 / len(members))

        def normalize_word(word):
            # Stem so morphological variants collapse to one lexicon entry.
            return SnowballStemmer("english").stem(word)

        def vectorize(conjSet, defSet):
            # dis[i][j] encodes conjunction evidence between words i and j:
            #   0   -> linked by a non-"but" conjunction (same polarity)
            #   1   -> linked by "but" (opposite polarity)
            #   0.5 -> no evidence either way (the default fill)
            dis = numpy.zeros((len(defSet), len(defSet)))
            dis.fill(.5)
            for word in defSet:
                idx, similar, dissimilar = conjSet[word]
                for sim in similar:
                    dis[idx][conjSet[sim][0]] = 0
                for d in dissimilar:
                    dis[idx][conjSet[d][0]] = 1
            return dis

        def word_feats(words):
            # Bag-of-words feature dict for an nltk classifier.
            return dict([(word, True) for word in words])

        def genSets():
            # words.txt lines look like "<word> <tag> ..." where the tag
            # starts with 'p' (positive) or 'n' (negative).  Uses the builtin
            # set type instead of the removed Python 2 `sets.Set`.
            positive = set()
            negative = set()
            with open('words.txt', 'r') as f:
                for pair in f.readlines():
                    current = pair.split(' ')
                    if current[1][0] == 'p':
                        positive.add(current[0])
                    elif current[1][0] == 'n':
                        negative.add(current[0])
            return positive, negative

        def getConj():
            """Build {adjective: (index, similar, dissimilar)} from conj.txt.

            Each line is "<word1> <word2> <conjunction> ...": "but" marks the
            pair as dissimilar, any other conjunction as similar.  `index` is
            the word's row/column in the dissimilarity matrix, assigned in
            first-seen order.  Evidence is recorded symmetrically for both
            words of the pair.
            """
            d = dict()
            i = 0
            with open('conj.txt', 'r') as f:
                content = f.readlines()
            for line in content:
                current = line.split(' ')
                # Tuple slot 2 holds the dissimilar set, slot 1 the similar set.
                bucket = 2 if current[2] == "but" else 1
                for a, b in ((current[0], current[1]),
                             (current[1], current[0])):
                    if a in d:
                        d[a][bucket].add(b)
                    else:
                        entry = (i, set(), set())
                        entry[bucket].add(b)
                        d[a] = entry
                        i = i + 1
            return d

        # --- Corpus setup (training/testing are currently unused downstream) ---
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])

        # Seed equal-sized positive/negative samples from words.txt.
        # Sample from a sorted list: random.sample on a set is deprecated
        # (and removed in Python 3.11+).
        sets = genSets()
        sample_size = min(len(sets[0]), len(sets[1]))
        positive = random.sample(sorted(sets[0]), sample_size)
        negative = random.sample(sorted(sets[1]), sample_size)

        # Clustering setup
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # Dictionary (adj -> (index, similar, dissimilar))
        conjSet = getConj()
        # list() is required on Python 3, where dict.keys() is a view that
        # cannot be sliced.
        defSet = list(conjSet.keys())
        # Pairwise dissimilarity matrix
        dis = vectorize(conjSet, defSet)

        # Start from an arbitrary half/half split and refine for 10 rounds.
        set1 = defSet[len(defSet) // 2:]
        set2 = defSet[:len(defSet) // 2]
        for i in range(0, 10):
            set1, set2 = cluster(set1, set2, conjSet, defSet, dis)
            print(len(set2))
cblexicon().process() |