From ac015419fb2ded2ceabe177dd55d616f4822c886 Mon Sep 17 00:00:00 2001
From: Antonia Lewis
Date: Wed, 16 Mar 2016 16:37:28 -0400
Subject: [PATCH] Glossary Based Sentiment Analysis added

Accuracy of about 53% for the Bayes classifier; the counting approach is
not functional yet. Will work on optimizing.
---
 GlossBayes.py | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++
 GlossCount.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 GlossBayes.py
 create mode 100644 GlossCount.py

diff --git a/GlossBayes.py b/GlossBayes.py
new file mode 100644
index 0000000..9983f91
--- /dev/null
+++ b/GlossBayes.py
@@ -0,0 +1,94 @@
+import math
+import nltk
+from nltk.corpus import wordnet as wn
+import nltk.classify.util
+from nltk.classify import NaiveBayesClassifier
+from nltk.corpus import movie_reviews
+from sets import Set
+
+
+class Solver:
+
+    def demo(self):
+        def word_feats(words):
+            return dict([(word, True) for word in words])
+
+        def expand_sets(positive,negative,neutral):
+            newPositive = set(positive)
+            newNegative = set(negative)
+            newNeutral = set(neutral)
+            # Add Syns to Positive
+            for word in positive:
+                for syn in wn.synsets(word, pos=wn.ADJ):
+                    for lemma in syn.lemmas():
+                        curr = lemma.name().split('.')[0]
+                        if( curr not in newPositive and curr not in newNegative and curr not in newNeutral):
+                            newPositive.add(curr)
+                        elif( curr in newNegative):
+                            newNegative.discard(curr)
+                            newNeutral.add(curr)
+            # Add Syns to Negative
+            for word in negative:
+                for syn in wn.synsets(word, pos=wn.ADJ):
+                    for lemma in syn.lemmas():
+                        curr = lemma.name().split('.')[0]
+                        if( curr not in newPositive and curr not in newNegative and curr not in newNeutral):
+                            newNegative.add(curr)
+                        elif(curr in newPositive):
+                            newPositive.discard(curr)
+                            newNeutral.add(curr)
+            return (newPositive,newNegative,newNeutral)
+
+        # Set up initial Sets S_p and S_n
+        positive = Set(['Good'])
+        negative = Set(['Bad'])
+        neutral = Set([''])
+
+        # Expand on Sets to get S_p' and S_n'
+        for num in range(1,3):
+            newsets = expand_sets(positive,negative,neutral)
+            positive = set(newsets[0])
+            negative = set(newsets[1])
+            neutral = set(newsets[2])
+
+        # Learn Classifier
+        trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
+
+        #negfeats = [({'insulting': True},'neg'),({'bad':True},'neg')]
+
+        #trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
+        classifier = NaiveBayesClassifier.train(trainfeats)
+
+
+        # Testing
+        negids = movie_reviews.fileids('neg')
+        posids = movie_reviews.fileids('pos')
+
+        negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
+        posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
+        negcutoff = len(negfeats)*3/4
+        poscutoff = len(posfeats)*3/4
+
+        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
+
+        print 'Dictionary of %d positive words and %d negative words, tested on %d instances' % (len(positive),len(negative),len(testfeats))
+
+        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
+        classifier.show_most_informative_features()
+
+
+        #text = nltk.word_tokenize("And now for a production unlike any other a very fuzzy and cute dog")
+        #print(text)
+        #text = nltk.pos_tag(text)
+        #print(text)
+        #for token in text:
+        #    if(token[1] == "JJ" or token[1] == "JJR" or token[1] == "JJS"):
+        #        print(wn.synsets(token[0]))
+
+
+Solver().demo()
+
+
+
+
+
diff --git a/GlossCount.py b/GlossCount.py
new file mode 100644
index 0000000..2ff4471
--- /dev/null
+++ b/GlossCount.py
@@ -0,0 +1,88 @@
+import math
+import nltk
+from nltk.corpus import wordnet as wn
+import nltk.classify.util
+from nltk.classify import NaiveBayesClassifier
+from nltk.corpus import movie_reviews
+from sets import Set
+
+class GlossCount:
+    def demo(self):
+        def value_of(sentiment):
+            if sentiment == 'pos': return 1
+            if sentiment == 'neg': return -1
+            return 0
+
+        def sentiment_score(review):
+            return 0
+
+            #return sum ([value_of(tag) for sentence in dict_tagged_sentences for token in sentence for tag in token[2]])
+
+        def expand_sets(positive,negative,neutral):
+            newPositive = set(positive)
+            newNegative = set(negative)
+            newNeutral = set(neutral)
+            # Add Syns to Positive
+            for word in positive:
+                for syn in wn.synsets(word, pos=wn.ADJ):
+                    for lemma in syn.lemmas():
+                        curr = lemma.name().split('.')[0]
+                        if( curr not in newPositive and curr not in newNegative and curr not in newNeutral):
+                            newPositive.add(curr)
+                        elif( curr in newNegative):
+                            newNegative.discard(curr)
+                            newNeutral.add(curr)
+            # Add Syns to Negative
+            for word in negative:
+                for syn in wn.synsets(word, pos=wn.ADJ):
+                    for lemma in syn.lemmas():
+                        curr = lemma.name().split('.')[0]
+                        if( curr not in newPositive and curr not in newNegative and curr not in newNeutral):
+                            newNegative.add(curr)
+                        elif(curr in newPositive):
+                            newPositive.discard(curr)
+                            newNeutral.add(curr)
+            return (newPositive,newNegative,newNeutral)
+
+        # Set up initial Sets S_p and S_n
+        positive = Set(['Good'])
+        negative = Set(['Bad'])
+        neutral = Set([''])
+
+        # Expand on Sets to get S_p' and S_n'
+        for num in range(1,2):
+            newsets = expand_sets(positive,negative,neutral)
+            positive = set(newsets[0])
+            negative = set(newsets[1])
+            neutral = set(newsets[2])
+
+        # Learn Classifier
+        trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
+        classifier = NaiveBayesClassifier.train(trainfeats)
+        print "cat"
+        #print classifier.classify(dict([(word, True) for word in words]))
+        print classifier.classify(dict([("bad", True)]))
+        # Iterate through all of the reviews and find sentiment
+        count = 0.00
+        correct = 0.00
+        for reviews in movie_reviews.fileids():
+            score = 0
+            for words in movie_reviews.words(fileids=[reviews]):
+                # Classify each word on its own and tally its sentiment
+                sent_value = classifier.classify(dict([(words, True)]))
+                if(sent_value == 'neg'):
+                    score = score - 1
+                elif(sent_value == 'pos'):
+                    score = score + 1
+            if (score < 0):
+                print "Negative at %d" % (score)
+                sentiment = 'neg'
+            else:
+                sentiment = 'pos'
+                print "Positive at %d" % (score)
+            if (sentiment == movie_reviews.categories(fileids=[reviews])[0]):
+                correct = correct + 1.00
+            count = count + 1.00
+        print correct/count
+
+GlossCount().demo()
\ No newline at end of file
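
Setup sketch for trying the patch locally (my assumption, not part of the patch itself): both scripts target Python 2 (print statements, the deprecated sets module) and rely on NLTK corpora that have to be downloaded once before the first run, roughly as follows.

    # one-time corpus setup (run once in a Python 2 interpreter with nltk installed)
    import nltk
    nltk.download('wordnet')        # used by wn.synsets()/lemmas() in expand_sets
    nltk.download('movie_reviews')  # used as the labelled test set in both scripts

After that, `python GlossBayes.py` and `python GlossCount.py` should run as-is; GlossCount will be slow, since it classifies every word of every review individually.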