# GlossBayes.py

import math
import nltk
from nltk.corpus import wordnet as wn
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sets import Set


class Solver:
    """Grow positive/negative adjective word sets from seed words via WordNet."""

    def demo(self, positive_seeds=('Good',), negative_seeds=('Bad',),
             rounds=1, lexicon=None):
        """Expand the seed word sets and print them after every round.

        Each round adds the adjective synonyms of every word to the word's own
        set and the antonyms of those synonyms to the opposite set. A word
        that is claimed by one side while already sitting in the other side is
        moved to the neutral set instead.

        Args:
            positive_seeds: iterable of initial positive words (pass a
                list/tuple of words, not a bare string).
            negative_seeds: iterable of initial negative words.
            rounds: number of expansion passes to run.
            lexicon: object exposing ``ADJ`` and ``synsets(word, pos=...)``
                whose synsets provide ``lemmas()`` and whose lemmas provide
                ``name()`` and ``antonyms()``. Defaults to the module-level
                NLTK WordNet reader ``wn``.

        Returns:
            A ``(positive, negative, neutral)`` tuple of sets of strings.
        """
        def word_feats(words):
            # Bag-of-words featureset ({word: True}); only referenced by the
            # disabled classifier evaluation further down.
            return dict.fromkeys(words, True)

        if lexicon is None:
            lexicon = wn

        # Set up initial Sets S_p and S_n
        positive = set(positive_seeds)
        negative = set(negative_seeds)
        neutral = set()

        # Expand on Sets to get S_p' and S_n'
        for _ in range(rounds):
            positive, negative, neutral = self._expand_sets(
                positive, negative, neutral, lexicon)
            print(positive)
            print(negative)

        # TODO: the classifier training / evaluation below is disabled.
        # # Learn Classifier
        # trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
        #
        # #negfeats = [({'insulting': True},'neg'),({'bad':True},'neg')]
        #
        # #trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        # classifier = NaiveBayesClassifier.train(trainfeats)
        #
        #
        # # Testing
        # negids = movie_reviews.fileids('neg')
        # posids = movie_reviews.fileids('pos')
        #
        # negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        # posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        # negcutoff = len(negfeats)*3/4
        # poscutoff = len(posfeats)*3/4
        #
        # testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
        #
        # print 'Dictionary of %d positive words and %d negative words, tested on %d instances' % (len(positive),len(negative), len(testfeats))
        #
        # print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
        # classifier.show_most_informative_features()

        return positive, negative, neutral

    def _expand_sets(self, positive, negative, neutral, lexicon):
        """Run one expansion pass and return new (positive, negative, neutral) sets.

        The inputs are not modified; the pass iterates over the original
        ``positive`` and ``negative`` words while updating copies.
        """
        new_positive = set(positive)
        new_negative = set(negative)
        new_neutral = set(neutral)
        # Positive words are processed first, then negative ones, so a word
        # reached from both sides ends up neutral.
        self._absorb(positive, new_positive, new_negative, new_neutral, lexicon)
        self._absorb(negative, new_negative, new_positive, new_neutral, lexicon)
        return new_positive, new_negative, new_neutral

    def _absorb(self, words, own, opposite, neutral, lexicon):
        """Claim synonyms of ``words`` for ``own`` and their antonyms for ``opposite``."""
        for word in words:
            for synset in lexicon.synsets(word, pos=lexicon.ADJ):
                for lemma in synset.lemmas():
                    self._claim(lemma.name(), own, opposite, neutral)
                    # antonyms() yields Lemma objects; store their names so
                    # the sets stay string-only and membership tests work.
                    for antonym in lemma.antonyms():
                        self._claim(antonym.name(), opposite, own, neutral)

    @staticmethod
    def _claim(word, own, opposite, neutral):
        """Add ``word`` to ``own``, or demote it to ``neutral`` on a conflict."""
        if word in opposite:
            # Contested by both polarities: belongs to neither.
            opposite.discard(word)
            neutral.add(word)
        elif word not in neutral:
            own.add(word)


    #text = nltk.word_tokenize("And now for a production unlike any other a very fuzzy and cute dog")
    #print(text)
    #text = nltk.pos_tag(text)
    #print(text)
    #for token in text:
    #    if(token[1] == "JJ" or token[1] == "JJR" or token[1] == "JJS"):
    #        print(wn.synsets(token[0]))


if __name__ == "__main__":
    # Run the demo only when executed as a script, so importing this module
    # does not trigger the WordNet lookups and printing.
    Solver().demo()