Glossary-based sentiment analysis added

Accuracy is about 53% for the Naive Bayes classifier (GlossBayes.py); the counting approach (GlossCount.py) is not functional yet. Will work on optimizing.
job13011 · Mar 16, 2016 · ac01541 · ac01541
1 parent 3e2c889
commit ac01541
Show file tree

Hide file tree

Showing 2 changed files with 182 additions and 0 deletions.
diff --git a/GlossBayes.py b/GlossBayes.py
@@ -0,0 +1,94 @@
+import math
+import nltk
+from nltk.corpus import wordnet as wn
+import nltk.classify.util
+from nltk.classify import NaiveBayesClassifier
+from nltk.corpus import movie_reviews
+from sets import Set
+
+
+class Solver:
+
+    def demo(self):
+        def word_feats(words):
+            return dict([(word, True) for word in words])
+
+        def expand_sets(positive,negative,neutral):
+            newPositive = set(positive)
+            newNegative = set(negative)
+            newNeutral = set(neutral)
+            # Add Syns to Positive
+            for word in positive:
+                for syn in wn.synsets(word, pos=wn.ADJ):
+                    for lemma in syn.lemmas():
+                        curr = lemma.name().split('.')[0]
+                        if( curr not in newPositive and curr not in newNegative and curr not in newNeutral):
+                            newPositive.add(curr)
+                        elif( curr in newNegative):
+                            newNegative.discard(curr)
+                            newNeutral.add(curr)
+            # Add Syns to Negative
+            for word in negative:
+                for syn in wn.synsets(word, pos=wn.ADJ):
+                    for lemma in syn.lemmas():
+                        curr = lemma.name().split('.')[0]
+                        if( curr not in newPositive and curr not in newNegative and curr not in newNeutral):
+                            newNegative.add(curr)
+                        elif(curr in newPositive):
+                            newPositive.discard(curr)
+                            newNeutral.add(curr)
+            return (newPositive,newNegative,newNeutral)
+
+        # Set up initial Sets S_p and S_n
+        positive = Set(['Good'])
+        negative = Set(['Bad'])
+        neutral = Set([''])
+
+        # Expand on Sets to get S_p' and S_n'
+        for num in range(1,3):
+            newsets = expand_sets(positive,negative,neutral);
+            positive = set(newsets[0])
+            negative = set(newsets[1])
+            neutral = set(newsets[2])
+
+        # Learn Classifier
+        trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
+
+        #negfeats = [({'insulting': True},'neg'),({'bad':True},'neg')]
+
+        #trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
+        classifier = NaiveBayesClassifier.train(trainfeats)
+
+
+        # Testing
+        negids = movie_reviews.fileids('neg')
+        posids = movie_reviews.fileids('pos')
+
+        negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
+        posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
+        negcutoff = len(negfeats)*3/4
+        poscutoff = len(posfeats)*3/4
+
+        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
+
+        print 'Dictionary of %d positive words and %d negative words, tested on %d instances' % (len(positive),len(negative), len(testfeats))
+
+        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
+        classifier.show_most_informative_features()
+
+
+    #text = nltk.word_tokenize("And now for a production unlike any other a very fuzzy and cute dog")
+    #print(text)
+    #text = nltk.pos_tag(text)
+    #print(text)
+    #for token in text:
+    #    if(token[1] == "JJ" or token[1] == "JJR" or token[1] == "JJS"):
+    #        print(wn.synsets(token[0]))
+
+
+Solver().demo()
+
+
+
+
+
diff --git a/GlossCount.py b/GlossCount.py
@@ -0,0 +1,88 @@
+import math
+import nltk
+from nltk.corpus import wordnet as wn
+import nltk.classify.util
+from nltk.classify import NaiveBayesClassifier
+from nltk.corpus import movie_reviews
+from sets import Set
+
+class GlossCount:
+    def demo(self):
+        def value_of(sentiment):
+            if sentiment == 'pos': return 1
+            if sentiment == 'neg': return -1
+            return 0
+
+        def sentiment_score(review):
+            return 0
+
+     #return sum ([value_of(tag) for sentence in dict_tagged_sentences for token in sentence for tag in token[2]])
+
+        def expand_sets(positive,negative,neutral):
+                    newPositive = set(positive)
+                    newNegative = set(negative)
+                    newNeutral = set(neutral)
+                    # Add Syns to Positive
+                    for word in positive:
+                        for syn in wn.synsets(word, pos=wn.ADJ):
+                            for lemma in syn.lemmas():
+                                curr = lemma.name().split('.')[0]
+                                if( curr not in newPositive and curr not in newNegative and curr not in newNeutral):
+                                    newPositive.add(curr)
+                                elif( curr in newNegative):
+                                    newNegative.discard(curr)
+                                    newNeutral.add(curr)
+                    # Add Syns to Negative
+                    for word in negative:
+                        for syn in wn.synsets(word, pos=wn.ADJ):
+                            for lemma in syn.lemmas():
+                                curr = lemma.name().split('.')[0]
+                                if( curr not in newPositive and curr not in newNegative and curr not in newNeutral):
+                                    newNegative.add(curr)
+                                elif(curr in newPositive):
+                                    newPositive.discard(curr)
+                                    newNeutral.add(curr)
+                    return (newPositive,newNegative,newNeutral)
+
+        # Set up initial Sets S_p and S_n
+        positive = Set(['Good'])
+        negative = Set(['Bad'])
+        neutral = Set([''])
+
+        # Expand on Sets to get S_p' and S_n'
+        for num in range(1,2):
+            newsets = expand_sets(positive,negative,neutral);
+            positive = set(newsets[0])
+            negative = set(newsets[1])
+            neutral = set(newsets[2])
+
+        # Learn Classifier
+        trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
+        classifier = NaiveBayesClassifier.train(trainfeats)
+        print "cat"
+        #print classifier.classify(dict([(word, True) for word in words]))
+        print classifier.classify(dict([("bad",True),("bad",True)]))
+        # Iterate through all of the reviews and find sentiment
+        count = 0.00
+        correct = 0.00
+        for reviews in movie_reviews.fileids():
+            score = 0;
+            for words in movie_reviews.words(fileids=[reviews]):
+                if()
+                sent_value = classifier.classify(dict([(word, True)]))
+                if(sent_value is 'neg'):
+                    score = score - 1
+                elif(sent_value is 'pos'):
+                    score = score + 1
+            if (score < 0):
+                print "Negative at %d" % (score)
+                sentiment = 'neg'
+            else:
+                sentiment = 'pos'
+                print "Positive at %d" % (score)
+            if (sentiment == movie_reviews.categories(fileids=[reviews])[0]):
+                correct = correct + 1.00
+            count = count + 1.00
+        print correct/count
+
+GlossCount().demo()