Amazon Edits
Added Amazon reviews and updated code to handle it.
Some cleanup required, but everything works.
asp10012 committed Apr 16, 2016
1 parent 6d16225 commit 31fad1a
Showing 7 changed files with 3,920 additions and 66,806 deletions.
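
Both updated classifiers read the new amazon_revs.csv the same way: each row is assumed to be (review text, polarity), with row[0] the raw text and row[1] a 'pos'/'neg' label that SplitData() records in the REVIEW_POL lookup. A minimal Python 3 sketch of that assumed layout (the committed code is Python 2 and opens the file in 'rb' mode; load_reviews is an illustrative name, not part of the repo):

    import csv

    def load_reviews(path='amazon_revs.csv'):
        # Mirrors the REVIEW_POL mapping that SplitData() builds:
        # column 0 = raw review text, column 1 = 'pos' or 'neg' label.
        review_pol = {}
        with open(path, newline='') as csvfile:
            for row in csv.reader(csvfile):
                review_pol[row[0]] = row[1]
        return review_pol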
54 changes: 45 additions & 9 deletions BernoulliNB.py → BNB.py
@@ -1,30 +1,60 @@
 from __future__ import division
 from math import log
 import random
+import csv
 import re
 from nltk.corpus import movie_reviews as mr
 from nltk.corpus import stopwords
 STOP_WORDS = set(stopwords.words('english'))
 
 SPLIT_AMOUNT = 0.6 # training amount from data
 
+AMAZON = 1
+REVIEW_POL={}
+DEFINED_SIZE = 1
+DEFINED_SIZES = {'pos': 948, 'neg': 948}
+
 def SplitData():
     type_dict={}
     docs_count={}
     train_test = [[],[]]
-    for category in mr.categories():
-        type_dict[category]=mr.fileids(categories=category)
+    offset_sample = random.randint(-400,400)
+    print "offset_sample", offset_sample
+    if AMAZON:
+        offset_sample = random.randint(-600,600)
+        for category in ['pos', 'neg']:
+            type_dict[category]=[]
+        with open('amazon_revs.csv', 'rb') as csvfile:
+            rev_read = csv.reader(csvfile)
+            for row in rev_read:
+                type_dict[row[1]].append(row[0])
+                REVIEW_POL[row[0]] = row[1]
+    else:
+        for category in mr.categories():
+            type_dict[category]=mr.fileids(categories=category)
     for cat in type_dict.keys():
         li = type_dict[cat]
-        size=int(len(li)*SPLIT_AMOUNT)
         random.shuffle(li)
+        size=int(len(li)*SPLIT_AMOUNT) + offset_sample
+        if DEFINED_SIZE:
+            size = DEFINED_SIZES[cat]
+        print "Category: ", cat, "Size:", size
+        offset_sample *= -1
         docs_count[cat]=size
         train_test[0].extend(li[:size])
         train_test[1].extend(li[size:])
     return [train_test,type_dict, docs_count]
 
 def tokenize(file_name):
-    list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+    list_words = ()
+    if AMAZON:
+        list_words = re.split(r'\W+',file_name)
+    else:
+        list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+
     return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
 
 
 def CalculateAccuracy(li_results):
     a=0
     b=0
@@ -63,8 +93,11 @@ def CalculateAccuracy(li_results):
 ## Parse the string to get individual words - done by get_list_tokens_nltk()
 for file_name in trainset:
     list_words = tokenize(file_name)
-    cat = mr.categories(fileids = file_name)[0]
-
+    cat = ''
+    if AMAZON:
+        cat = REVIEW_POL[file_name]
+    else:
+        cat = mr.categories(fileids = file_name)[0]
     for w in set(list_words):
         word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
         word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
@@ -100,8 +133,11 @@ def CalculateAccuracy(li_results):
             min_category=cat
             minimum_neg_log_prob=neg_log_prob
 
-    li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
-
+    if AMAZON:
+        li_results.append((file_name,min_category,REVIEW_POL[file_name]))
+    else:
+        li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
+    # break
 CalculateAccuracy(li_results)
 
 
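For orientation, BNB.py's word_cat_num_doc_dict records, for every word, how many training documents of each class contain it; classification then combines those document frequencies with the class prior, which is the Bernoulli event model. Below is a self-contained Python 3 sketch of the textbook version of that rule with add-one smoothing. train_bernoulli and classify_bernoulli are hypothetical names, the textbook rule also scores absent words, and the truncated diff does not show whether BNB.py does the same:

    from math import log

    def train_bernoulli(docs):
        # docs: list of (token_list, label) pairs.
        # Counts, per word, the documents of each class that contain it.
        word_doc_counts, class_docs = {}, {}
        for tokens, label in docs:
            class_docs[label] = class_docs.get(label, 0) + 1
            for w in set(tokens):
                word_doc_counts.setdefault(w, {})
                word_doc_counts[w][label] = word_doc_counts[w].get(label, 0) + 1
        return word_doc_counts, class_docs

    def classify_bernoulli(tokens, word_doc_counts, class_docs):
        total = sum(class_docs.values())
        present = set(tokens)
        best, best_lp = None, float('-inf')
        for c, n_c in class_docs.items():
            lp = log(n_c / total)  # log P(class)
            for w, per_class in word_doc_counts.items():
                p_w = (per_class.get(c, 0) + 1) / (n_c + 2)  # add-one over {present, absent}
                lp += log(p_w) if w in present else log(1.0 - p_w)
            if lp > best_lp:
                best, best_lp = c, lp
        return best

A quick smoke test: classify_bernoulli(['great'], *train_bernoulli([(['great'], 'pos'), (['awful'], 'neg')])) returns 'pos'.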
58 changes: 0 additions & 58 deletions ComplementMNB.py

This file was deleted.

77 changes: 58 additions & 19 deletions MultinomialNB.py → MNB.py
@@ -1,29 +1,56 @@
 from __future__ import division
 from math import log
 import re
 import random
+import csv
 from nltk.corpus import movie_reviews as mr
 from nltk.corpus import stopwords
 STOP_WORDS = set(stopwords.words('english'))
 SPLIT_AMOUNT = 0.6 # training amount from data
 # need to change calculations for stuff
 # https://www.dataquest.io/blog/naive-bayes-movies/
+
+AMAZON = 1
+REVIEW_POL={}
+DEFINED_SIZE = 1
+DEFINED_SIZES = {'pos': 948, 'neg': 948}
+
 def SplitData():
     type_dict={}
     docs_count={}
     train_test = [[],[]]
-    for category in mr.categories():
-        type_dict[category]=mr.fileids(categories=category)
+    offset_sample = random.randint(-400,400)
+    print "offset_sample", offset_sample
+    if AMAZON:
+        offset_sample = random.randint(-600,600)
+        for category in ['pos', 'neg']:
+            type_dict[category]=[]
+        with open('amazon_revs.csv', 'rb') as csvfile:
+            rev_read = csv.reader(csvfile)
+            for row in rev_read:
+                type_dict[row[1]].append(row[0])
+                REVIEW_POL[row[0]] = row[1]
+    else:
+        for category in mr.categories():
+            type_dict[category]=mr.fileids(categories=category)
     for cat in type_dict.keys():
         li = type_dict[cat]
-        size=int(len(li)*SPLIT_AMOUNT)
         random.shuffle(li)
+        size=int(len(li)*SPLIT_AMOUNT) + offset_sample
+        if DEFINED_SIZE:
+            size = DEFINED_SIZES[cat]
+        print "Category: ", cat, "Size:", size
+        offset_sample *= -1
         docs_count[cat]=size
         train_test[0].extend(li[:size])
         train_test[1].extend(li[size:])
     return [train_test,type_dict, docs_count]
 
 def tokenize(file_name):
-    list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+    list_words = ()
+    if AMAZON:
+        list_words = re.split(r'\W+',file_name)
+    else:
+        list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+
     return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
 
 def CalculateAccuracy(li_results):
@@ -59,7 +86,8 @@ def CalculateAccuracy(li_results):
 
 ##4)Create a) a dictionary with a category as the key and dictionary of words-occurences as values
 #b) a dictionary with a category as the key and the number of words in it as the value
-# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
+# {pos-> {w1 = 17 times}, {w2 = 32 times}...}
+# {neg-> ....}
 cat_word_dict={}
 # {pos-> 4000 words} {neg-> 7000 words}
 cat_word_count_dict={}
@@ -74,7 +102,11 @@ def CalculateAccuracy(li_results):
     ##7) Check if category exists in dictionary, if not, create an empty dictionary,
     #and put word count as zero
     #and then insert words into the category's dictionary in both cases and update the word count
-    cat = mr.categories(fileids = file_name)[0]
+    cat = ''
+    if AMAZON:
+        cat = REVIEW_POL[file_name]
+    else:
+        cat = mr.categories(fileids = file_name)[0]
     cat_word_dict[cat] = cat_word_dict.get(cat,{})
     cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)
 
@@ -103,8 +135,9 @@ def CalculateAccuracy(li_results):
 #9) Like in the training set,Loop through the test set, to get the entire text from each file
 ##10) Similar step, parse the string to get individual words
 for file_name in testset:
-    print "File: ", file_name
-    minimum_neg_log_prob=1000000000
+    # print "File: ", file_name
+    # minimum_neg_log_prob=1000000000
+    minimum_neg_log_prob=-1000000000
     # minimum_neg_log_prob = 0 # NEW
     min_category=''
     list_words = tokenize(file_name)
@@ -118,25 +151,31 @@ def CalculateAccuracy(li_results):
         # print "________________________________________________________________"
         # print "________________________________________________________________"
         # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
-        neg_log_prob=-log(cat_num_docs[cat]/length_train)
+        # neg_log_prob=-log(cat_num_docs[cat]/length_train) # P(class)
+        neg_log_prob= log(cat_num_docs[cat]/length_train) # P(class)
         # neg_log_prob = cat_num_docs[cat]/length_train
-        word_dict = cat_word_dict[cat]
-        count_cat = cat_word_count_dict[cat]
+        word_dict = cat_word_dict[cat] # word counts for each word in class
+        count_cat = cat_word_count_dict[cat] # total words in class
        for w in list_words:
             count_word_train=word_dict.get(w,0)
             ratio = (count_word_train+1)/(count_cat+vocab_length)
-            neg_log_prob-=log(ratio)
+            # neg_log_prob-=log(ratio)
+            neg_log_prob+=log(ratio)
 
         # neg_log_prob *= ratio
         # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
         # break
-        # print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
-        if minimum_neg_log_prob>neg_log_prob:
-        # if minimum_neg_log_prob<neg_log_prob:
+        # print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
+        # print "\n\n", cat, ' :: ', neg_log_prob
+        # if minimum_neg_log_prob>neg_log_prob:
+        if minimum_neg_log_prob<neg_log_prob:
             min_category=cat
             minimum_neg_log_prob=neg_log_prob
             # print "Min cat: ", min_category
-    li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
+    if AMAZON:
+        li_results.append((file_name,min_category,REVIEW_POL[file_name]))
+    else:
+        li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
     # break
 
 ###--------------------DEBUG STATEMENTS----------------------
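
The substantive change in MNB.py's scoring loop is a sign-convention fix: the old code accumulated -log p and kept the minimum, while the new code accumulates log p (a negative quantity) and keeps the maximum, with the sentinel changed to -1000000000 so it acts as a running maximum. Both rules select the same argmax class. A compact Python 3 sketch of the updated rule (classify_multinomial and its parameters are illustrative names, not the repo's):

    from math import log

    def classify_multinomial(tokens, class_priors, word_counts, class_totals, vocab_size):
        # class_priors: {cat: P(cat)}; word_counts: {cat: {word: count}};
        # class_totals: {cat: total word tokens in cat}.
        best, best_lp = None, float('-inf')
        for cat, prior in class_priors.items():
            lp = log(prior)  # log P(class), as in the updated MNB.py
            for w in tokens:
                count = word_counts[cat].get(w, 0)
                lp += log((count + 1) / (class_totals[cat] + vocab_size))  # Laplace-smoothed ratio
            if lp > best_lp:  # keep the maximum log-probability, matching the flipped comparison
                best, best_lp = cat, lp
        return best

The inner ratio is exactly MNB.py's (count_word_train+1)/(count_cat+vocab_length); only the sign bookkeeping around it changed.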