Amazon Edits
Added Amazon reviews and updated the code to handle them.
Some cleanup is still required, but everything works.
asp10012 committed Apr 16, 2016
1 parent 6d16225 commit 31fad1a
Showing 7 changed files with 3,920 additions and 66,806 deletions.
54 changes: 45 additions & 9 deletions BernoulliNB.py → BNB.py
@@ -1,30 +1,60 @@
from __future__ import division
from math import log
import random
import csv
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))

SPLIT_AMOUNT = 0.6 # training amount from data

AMAZON = 1
REVIEW_POL={}
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 948, 'neg': 948}

def SplitData():
type_dict={}
docs_count={}
train_test = [[],[]]
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
offset_sample = random.randint(-400,400)
print "offset_sample", offset_sample
if AMAZON:
offset_sample = random.randint(-600,600)
for category in ['pos', 'neg']:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
for cat in type_dict.keys():
li = type_dict[cat]
size=int(len(li)*SPLIT_AMOUNT)
random.shuffle(li)
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
if DEFINED_SIZE:
size = DEFINED_SIZES[cat]
print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
train_test[1].extend(li[size:])
return [train_test,type_dict, docs_count]
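
A quick usage sketch of what the split returns (hedged: the collapsed portion of this file presumably unpacks it the same way; the MNB.py hunk below shows cat_num_docs = li[2], and the printed counts here are illustrative):

li = SplitData()
trainset = li[0][0]    # training documents (file ids, or raw review text when AMAZON == 1)
testset = li[0][1]     # held-out documents
cat_num_docs = li[2]   # per-category training counts, e.g. {'pos': 948, 'neg': 948} with DEFINED_SIZE
print "train:", len(trainset), "test:", len(testset)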

def tokenize(file_name):
list_words = re.split(r'\W+',mr.raw(fileids=file_name))
list_words = ()
if AMAZON:
list_words = re.split(r'\W+',file_name)
else:
list_words = re.split(r'\W+',mr.raw(fileids=file_name))

return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
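
To illustrate what tokenize keeps when AMAZON is set (the argument is then the raw review text, not a corpus file id), a hedged example; the exact output depends on NLTK's English stopword list:

print tokenize("The battery died after two days of use")
# roughly: ['battery', 'died', 'two', 'days', 'use']
# stopwords ('the', 'after', 'of'), non-alphabetic tokens and single-letter words are dropped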


def CalculateAccuracy(li_results):
a=0
b=0
@@ -63,8 +93,11 @@ word_cat_num_doc_dict={}
## Parse the string to get individual words - done by tokenize()
for file_name in trainset:
list_words = tokenize(file_name)
cat = mr.categories(fileids = file_name)[0]

cat = ''
if AMAZON:
cat = REVIEW_POL[file_name]
else:
cat = mr.categories(fileids = file_name)[0]
for w in set(list_words):
word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
@@ -100,8 +133,11 @@ for file_name in testset:
min_category=cat
minimum_neg_log_prob=neg_log_prob

li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))

if AMAZON:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
CalculateAccuracy(li_results)
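
CalculateAccuracy is collapsed in this diff; as a hedged sketch of what a function over these (file, predicted, actual) tuples could look like (not necessarily the repository's implementation):

def calculate_accuracy_sketch(li_results):
    # li_results: list of (document, predicted_label, true_label) tuples
    correct = sum(1 for (_, predicted, actual) in li_results if predicted == actual)
    # true division is in effect via the __future__ import at the top of the file
    accuracy = correct / len(li_results)
    print "Accuracy:", accuracy
    return accuracy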


58 changes: 0 additions & 58 deletions ComplementMNB.py

This file was deleted.

77 changes: 58 additions & 19 deletions MultinomialNB.py → MNB.py
@@ -1,29 +1,56 @@
from __future__ import division
from math import log
import re
import random
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data
# need to change calculations for stuff
# https://www.dataquest.io/blog/naive-bayes-movies/

AMAZON = 1
REVIEW_POL={}
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 948, 'neg': 948}

def SplitData():
type_dict={}
docs_count={}
train_test = [[],[]]
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
offset_sample = random.randint(-400,400)
print "offset_sample", offset_sample
if AMAZON:
offset_sample = random.randint(-600,600)
for category in ['pos', 'neg']:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
for cat in type_dict.keys():
li = type_dict[cat]
size=int(len(li)*SPLIT_AMOUNT)
random.shuffle(li)
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
if DEFINED_SIZE:
size = DEFINED_SIZES[cat]
print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
train_test[1].extend(li[size:])
return [train_test,type_dict, docs_count]

def tokenize(file_name):
list_words = re.split(r'\W+',mr.raw(fileids=file_name))
list_words = ()
if AMAZON:
list_words = re.split(r'\W+',file_name)
else:
list_words = re.split(r'\W+',mr.raw(fileids=file_name))

return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]

def CalculateAccuracy(li_results):
@@ -59,7 +86,8 @@ cat_num_docs = li[2]

##4) Create a) a dictionary with a category as the key and a dictionary of word-occurrences as values
#b) a dictionary with a category as the key and the number of words in it as the value
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
# {pos-> {w1 = 17 times}, {w2 = 32 times}...}
# {neg-> ....}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
@@ -74,7 +102,11 @@ for file_name in trainset:
##7) Check if category exists in dictionary, if not, create an empty dictionary,
#and put word count as zero
#and then insert words into the category's dictionary in both cases and update the word count
cat = mr.categories(fileids = file_name)[0]
cat = ''
if AMAZON:
cat = REVIEW_POL[file_name]
else:
cat = mr.categories(fileids = file_name)[0]
cat_word_dict[cat] = cat_word_dict.get(cat,{})
cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)

@@ -103,8 +135,9 @@ li_results=[]
#9) As in the training set, loop through the test set to get the entire text from each file
##10) Similar step, parse the string to get individual words
for file_name in testset:
print "File: ", file_name
minimum_neg_log_prob=1000000000
# print "File: ", file_name
# minimum_neg_log_prob=1000000000
minimum_neg_log_prob=-1000000000
# minimum_neg_log_prob = 0 # NEW
min_category=''
list_words = tokenize(file_name)
@@ -118,25 +151,31 @@ for file_name in testset:
# print "________________________________________________________________"
# print "________________________________________________________________"
# print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
neg_log_prob=-log(cat_num_docs[cat]/length_train)
# neg_log_prob=-log(cat_num_docs[cat]/length_train) # P(class)
neg_log_prob= log(cat_num_docs[cat]/length_train) # P(class)
# neg_log_prob = cat_num_docs[cat]/length_train
word_dict = cat_word_dict[cat]
count_cat = cat_word_count_dict[cat]
word_dict = cat_word_dict[cat] # word counts for each word in class
count_cat = cat_word_count_dict[cat] # total words in class
for w in list_words:
count_word_train=word_dict.get(w,0)
ratio = (count_word_train+1)/(count_cat+vocab_length)
neg_log_prob-=log(ratio)
# neg_log_prob-=log(ratio)
neg_log_prob+=log(ratio)

# neg_log_prob *= ratio
# print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
# break
# print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
if minimum_neg_log_prob>neg_log_prob:
# if minimum_neg_log_prob<neg_log_prob:
# print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
# print "\n\n", cat, ' :: ', neg_log_prob
# if minimum_neg_log_prob>neg_log_prob:
if minimum_neg_log_prob<neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
# print "Min cat: ", min_category
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
if AMAZON:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
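
For intuition on the scoring in this loop (a worked sketch with made-up counts, not values from the dataset): each class score is log P(class) plus the sum of logs of the Laplace-smoothed word ratios, and after this commit the class with the highest score wins, rather than the one with the lowest negative log probability.

# One smoothed term, with hypothetical counts:
count_word_train = 3      # the word appeared 3 times in this class's training text
count_cat = 5000          # total word tokens seen for this class
vocab_length = 12000      # distinct words across the whole training vocabulary
ratio = (count_word_train + 1) / (count_cat + vocab_length)   # ~0.000235
# an unseen word (count 0) still contributes log(1/17000) instead of log(0),
# which is what the +1 and +vocab_length smoothing buys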

###--------------------DEBUG STATEMENTS----------------------