# cblexicon.py

import math
import nltk
from nltk.corpus import wordnet as wn
from collections import Counter
import numpy
from nltk.corpus import movie_reviews
import nltk.stem
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
from nltk.classify import NaiveBayesClassifier
import random
from nltk.stem import *

from sets import Set

class cblexicon:
    """Conjunction-based lexicon experiment.

    Reads seed polarity words and adjective conjunction pairs from two
    whitespace-separated text files, turns every adjective into a binary
    co-occurrence vector and splits the adjectives into two groups with
    k-means clustering.
    """

    @staticmethod
    def _normalize_word(word):
        """Return the English Snowball stem of *word*."""
        return SnowballStemmer("english").stem(word)

    @staticmethod
    def _word_feats(words):
        """Return a bag-of-words feature dict mapping every word to True."""
        return dict.fromkeys(words, True)

    @staticmethod
    def _vectorspaced(title, CS, DF):
        """Return the binary neighbour vector of *title*.

        CS maps adjective -> (index, set of conjoined adjectives) and DF is
        the ordered vocabulary; component k is 1 when DF[k] is conjoined
        with *title* and 0 otherwise.
        """
        title_components = CS[title][1]
        return numpy.array(
            [word in title_components for word in DF], numpy.short)

    @staticmethod
    def _parse_polarity(lines):
        """Split seed words into (positive, negative) sets.

        Each line is expected to look like ``<word> <label>``; a label that
        starts with 'p' marks a positive word and one that starts with 'n'
        a negative word.  Blank or incomplete lines and other labels are
        ignored instead of raising IndexError.
        """
        positive = set()
        negative = set()
        for line in lines:
            tokens = line.split()
            if len(tokens) < 2:
                continue
            word, label = tokens[0], tokens[1]
            if label.startswith('p'):
                positive.add(word)
            elif label.startswith('n'):
                negative.add(word)
        return positive, negative

    @staticmethod
    def _parse_conjunctions(lines):
        """Build the adjective conjunction graph.

        Each line is expected to hold two adjectives.  Returns a dict
        mapping adjective -> (index, set of conjoined adjectives), where
        index counts adjectives in order of first appearance.
        """
        graph = {}
        for line in lines:
            # split() (rather than split(' ')) drops the trailing newline,
            # so the second adjective is stored under the same spelling it
            # has when it appears first on another line.
            tokens = line.split()
            if len(tokens) < 2:
                continue
            first, second = tokens[0], tokens[1]
            for word, other in ((first, second), (second, first)):
                if word not in graph:
                    graph[word] = (len(graph), set())
                graph[word][1].add(other)
        return graph

    def process(self, words_path='words.txt', conj_path='conj.txt'):
        """Run the experiment and print its intermediate results.

        words_path -- file with ``<word> <label>`` seed polarity lines.
        conj_path  -- file with ``<adjective> <adjective>`` pairs.

        Prints the sizes of the balanced seed samples, the conjunction
        graph, a debug probe for the word "young" (when present) and the
        cluster index assigned to every adjective.  Returns None.
        """
        # Generate balanced positive and negative initial sets.  Files are
        # opened read-only and closed deterministically.
        with open(words_path) as handle:
            positive_seed, negative_seed = self._parse_polarity(handle)
        sample_size = min(len(positive_seed), len(negative_seed))
        # random.sample needs a sequence on newer Python versions, so the
        # sets are turned into sorted lists first.
        positive = random.sample(sorted(positive_seed), sample_size)
        negative = random.sample(sorted(negative_seed), sample_size)
        print(len(positive))
        print(len(negative))

        # Create dictionary adj -> (index, set of associated words).
        with open(conj_path) as handle:
            conjSet = self._parse_conjunctions(handle)
        print(conjSet)

        # Vocabulary ordered by the stored index so that the vector layout
        # is deterministic.
        defSet = sorted(conjSet, key=lambda word: conjSet[word][0])

        # Debug probe: show one adjective and each of its set components.
        probe = "young"
        if probe in conjSet:
            print(conjSet[probe])
            for num in self._vectorspaced(probe, conjSet, defSet):
                if num == 1:
                    print("one")

        # Cluster over ALL adjective vectors at once; the clusterer expects
        # a sequence of vectors, not a single vector.
        vectors = [
            self._vectorspaced(title, conjSet, defSet) for title in defSet
        ]
        if len(vectors) < 2:
            # Two clusters cannot be seeded from fewer than two vectors.
            classified_examples = []
        else:
            cluster = KMeansClusterer(2, euclidean_distance)
            cluster.cluster(vectors)
            classified_examples = [
                cluster.classify(vector) for vector in vectors
            ]
        print(classified_examples)

        # TODO: try training a NaiveBayesClassifier on the seed words
        # ({word: True} -> "pos"/"neg") and evaluating it on bag-of-words
        # features (_word_feats) of the movie_reviews files.

# Build the lexicon as soon as this module is loaded.
_lexicon = cblexicon()
_lexicon.process()