Commit
Merge branch 'master' of https://github.uconn.edu/job13011/BigData
Showing 6 changed files with 67,089 additions and 33 deletions.
@@ -0,0 +1,108 @@
from __future__ import division
from math import log
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords

STOP_WORDS = set(stopwords.words('english'))

SPLIT_AMOUNT = 0.6  # fraction of each category used for training

def SplitData():
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    for category in mr.categories():
        type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict.keys():
        li = type_dict[cat]
        size = int(len(li) * SPLIT_AMOUNT)
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]

def tokenize(file_name):
    list_words = re.split(r'\W+', mr.raw(fileids=file_name))
    return [w.lower() for w in list_words
            if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
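
# Illustrative behaviour (exact output depends on the installed NLTK data):
#   tokenize('neg/cv000_29416.txt') would begin something like
#   ['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', ...]
# after dropping stop words, single letters and non-alphabetic tokens.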

def CalculateAccuracy(li_results):
    # One-vs-rest confusion counts for the first predicted category:
    # a = true positive, b = false positive, c = false negative, d = true negative
    a = 0
    b = 0
    c = 0
    d = 0
    cat = li_results[0][1]
    for t in li_results:  # t = (file_name, predicted_category, true_category)
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a / (a + b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category ", cat
    print "precision =", precision

# li = Preprocessor.get_testset_trainset(corpus)
li = SplitData()
testset = li[0][1]
trainset = li[0][0]
# li = Preprocessor.startup()
cat_num_docs = li[2]

# 3) Create a dictionary with a word as the key and a dictionary as the value:
#    in the inner dictionary the category is the key, and the number of
#    training documents of that category in which the word occurs is the value.
#    2-D dict: word -> {pos: count, neg: count}
word_cat_num_doc_dict = {}

# 4) Loop through the movie-review training set to get the entire text from
#    each file; parse the string into individual words with tokenize().
for file_name in trainset:
    list_words = tokenize(file_name)
    cat = mr.categories(fileids=file_name)[0]

    for w in set(list_words):
        word_cat_num_doc_dict[w] = word_cat_num_doc_dict.get(w, {})
        word_cat_num_doc_dict[w][cat] = word_cat_num_doc_dict[w].get(cat, 0)
        word_cat_num_doc_dict[w][cat] += 1

for w in word_cat_num_doc_dict:
    for cat in cat_num_docs:
        nct = word_cat_num_doc_dict[w].get(cat, 0)
        # Laplace smoothing: convert the number of documents containing the
        # word into (count + 1) / (#documents in the category + 2)
        ratio = (nct + 1) / (cat_num_docs[cat] + 2)
        word_cat_num_doc_dict[w][cat] = ratio
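
# Worked example: movie_reviews has 1,000 documents per category, so with
# SPLIT_AMOUNT = 0.6 each category contributes 600 training documents.  A word
# found in, say, 143 positive training reviews (an illustrative count) gets
# (143 + 1) / (600 + 2) ~= 0.239, and a word found in none still gets
# 1/602 instead of 0, so the log() calls below never see zero.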

print "The Classifier is trained."

li_results = []
# 5) As with the training set, loop through the test set to get the individual words
for file_name in testset:
    minimum_neg_log_prob = 1000000000
    min_category = ''
    set_list_words = set(tokenize(file_name))

    # 6) Get the probability for each category,
    #    using the cat_num_docs dictionary to wade through the categories.
    #    This is a Bernoulli naive Bayes model: every vocabulary word
    #    contributes, whether it is present in the document or absent.
    for cat in cat_num_docs:
        neg_log_prob = -log(cat_num_docs[cat] / len(trainset))  # -log(prior)
        for w in word_cat_num_doc_dict:
            if w in set_list_words:
                neg_log_prob -= log(word_cat_num_doc_dict[w][cat])
            else:
                neg_log_prob -= log(1 - word_cat_num_doc_dict[w][cat])
        # the smallest negative log probability is the most probable category
        if minimum_neg_log_prob > neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob

    li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))

CalculateAccuracy(li_results)
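
For reference, the decision rule implemented above is Bernoulli naive Bayes: each category's score is the negative log prior plus the negative log likelihood of every vocabulary word's presence or absence. A minimal self-contained sketch of the same rule, using made-up probability tables (the words and numbers below are illustrative, not taken from the corpus):

from __future__ import division
from math import log

word_probs = {                       # smoothed P(word present | category)
    'great': {'pos': 0.60, 'neg': 0.10},
    'awful': {'pos': 0.05, 'neg': 0.55},
}
priors = {'pos': 0.5, 'neg': 0.5}    # P(category)

def classify(words):
    best_cat, best_nlp = None, float('inf')
    for cat in priors:
        nlp = -log(priors[cat])
        for w, probs in word_probs.items():
            p = probs[cat]
            nlp -= log(p) if w in words else log(1 - p)
        if nlp < best_nlp:           # smallest negative log probability wins
            best_cat, best_nlp = cat, nlp
    return best_cat

print classify(set(['great', 'movie']))   # -> 'pos'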
@@ -0,0 +1,58 @@
import csv
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import accuracy_score

REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'allrevs.csv')

# allrevs.csv contains two columns:
#   the first column is the review content (quoted),
#   the second column is the assigned sentiment (positive or negative)
def load_file():
    with open(REVIEWS) as csv_file:
        reader = csv.reader(csv_file, delimiter=",", quotechar='"')
        reader.next()  # skip the first row (presumably a header)
        data = []
        target = []
        for row in reader:
            # skip rows with missing data
            if row[0] and row[1]:
                data.append(row[0])
                target.append(row[1])

        return data, target

# preprocess creates the term-frequency matrix for the review data set
def preprocess():
    data, target = load_file()
    count_vectorizer = CountVectorizer(binary=True, stop_words='english')
    data = count_vectorizer.fit_transform(data)
    # tfidf_data = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True).fit_transform(data)
    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
    transformer.fit(data)
    tfidf_data = transformer.transform(data)

    return tfidf_data
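
# Note: transformer.fit(data) followed by transformer.transform(data) is
# equivalent here to the commented-out one-line fit_transform(data).  Because
# the vectorizer uses binary=True, the raw counts are 0/1 per word per review,
# so the resulting matrix is effectively word presence weighted by idf and
# then l2-normalized.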

def learn_model(data, target):
    # preparing data for split validation: 60% training, 40% test
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(data, target, test_size=0.4, random_state=43)
    classifier = MultinomialNB().fit(data_train, target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test, predicted)

def evaluate_model(target_true, target_predicted):
    # print classification_report(target_true, target_predicted)
    print "The accuracy score is {:.2%}".format(accuracy_score(target_true, target_predicted))

data, target = load_file()
tf_idf = preprocess()
learn_model(tf_idf, target)
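
A note for anyone re-running this file today: the sklearn.cross_validation module used above was removed in scikit-learn 0.20 in favour of sklearn.model_selection. Below is a hedged sketch of the same flow on a modern scikit-learn / Python 3 stack (the function name learn_model_modern is mine; the split, parameters, and seed mirror the code above). Putting the vectorizer inside a Pipeline and fitting it on the training split only also avoids the mild leakage in preprocess(), which computes TF-IDF statistics over the full corpus before splitting.

# Sketch for scikit-learn >= 0.20 / Python 3 (assumed environment):
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

def learn_model_modern(data, target):
    # same 60/40 split and random seed as learn_model above;
    # `data` is the raw review text from load_file(), not a pre-built matrix
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.4, random_state=43)
    model = make_pipeline(
        CountVectorizer(binary=True, stop_words='english'),
        TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True),
        MultinomialNB())
    # the vectorizer and TF-IDF weights are fitted on the training split only
    model.fit(X_train, y_train)
    print("The accuracy score is {:.2%}".format(
        accuracy_score(y_test, model.predict(X_test))))

# usage: data, target = load_file(); learn_model_modern(data, target)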
(The remaining 4 changed files are not shown in this view.)