Amazon Edits
Added Amazon reviews and updated the code to handle them.
Some cleanup is still required, but everything works.
asp10012 committed Apr 16, 2016
1 parent 6d16225 commit 31fad1a
Showing 7 changed files with 3,920 additions and 66,806 deletions.
54 changes: 45 additions & 9 deletions BernoulliNB.py → BNB.py
@@ -1,30 +1,60 @@
from __future__ import division
from math import log
import random
import csv
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))

SPLIT_AMOUNT = 0.6 # training amount from data

AMAZON = 1
REVIEW_POL={}
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 948, 'neg': 948}

def SplitData():
type_dict={}
docs_count={}
train_test = [[],[]]
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
offset_sample = random.randint(-400,400)
print "offset_sample", offset_sample
if AMAZON:
offset_sample = random.randint(-600,600)
for category in ['pos', 'neg']:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
for cat in type_dict.keys():
li = type_dict[cat]
size=int(len(li)*SPLIT_AMOUNT)
random.shuffle(li)
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
if DEFINED_SIZE:
size = DEFINED_SIZES[cat]
print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
train_test[1].extend(li[size:])
return [train_test,type_dict, docs_count]
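
A quick usage sketch of what the split returns (hedged: the collapsed portion of this file presumably unpacks it the same way; the MNB.py hunk below shows cat_num_docs = li[2], and the printed counts here are illustrative):

li = SplitData()
trainset = li[0][0]    # training documents (file ids, or raw review text when AMAZON == 1)
testset = li[0][1]     # held-out documents
cat_num_docs = li[2]   # per-category training counts, e.g. {'pos': 948, 'neg': 948} with DEFINED_SIZE
print "train:", len(trainset), "test:", len(testset)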

def tokenize(file_name):
list_words = re.split(r'\W+',mr.raw(fileids=file_name))
list_words = ()
if AMAZON:
list_words = re.split(r'\W+',file_name)
else:
list_words = re.split(r'\W+',mr.raw(fileids=file_name))

return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
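
To illustrate what tokenize keeps when AMAZON is set (the argument is then the raw review text, not a corpus file id), a hedged example; the exact output depends on NLTK's English stopword list:

print tokenize("The battery died after two days of use")
# roughly: ['battery', 'died', 'two', 'days', 'use']
# stopwords ('the', 'after', 'of'), non-alphabetic tokens and single-letter words are dropped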


def CalculateAccuracy(li_results):
a=0
b=0
@@ -63,8 +93,11 @@ word_cat_num_doc_dict={}
## Parse the string to get individual words - done by tokenize()
for file_name in trainset:
list_words = tokenize(file_name)
cat = mr.categories(fileids = file_name)[0]

cat = ''
if AMAZON:
cat = REVIEW_POL[file_name]
else:
cat = mr.categories(fileids = file_name)[0]
for w in set(list_words):
word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
@@ -100,8 +133,11 @@ for file_name in testset:
min_category=cat
minimum_neg_log_prob=neg_log_prob

li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))

if AMAZON:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
CalculateAccuracy(li_results)
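
CalculateAccuracy is collapsed in this diff; as a hedged sketch of what a function over these (file, predicted, actual) tuples could look like (not necessarily the repository's implementation):

def calculate_accuracy_sketch(li_results):
    # li_results: list of (document, predicted_label, true_label) tuples
    correct = sum(1 for (_, predicted, actual) in li_results if predicted == actual)
    # true division is in effect via the __future__ import at the top of the file
    accuracy = correct / len(li_results)
    print "Accuracy:", accuracy
    return accuracy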


58 changes: 0 additions & 58 deletions ComplementMNB.py

This file was deleted.

77 changes: 58 additions & 19 deletions MultinomialNB.py → MNB.py
@@ -1,29 +1,56 @@
from __future__ import division
from math import log
import re
import random
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data
# need to change calculations for stuff
# https://www.dataquest.io/blog/naive-bayes-movies/

AMAZON = 1
REVIEW_POL={}
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 948, 'neg': 948}

def SplitData():
type_dict={}
docs_count={}
train_test = [[],[]]
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
offset_sample = random.randint(-400,400)
print "offset_sample", offset_sample
if AMAZON:
offset_sample = random.randint(-600,600)
for category in ['pos', 'neg']:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
for cat in type_dict.keys():
li = type_dict[cat]
size=int(len(li)*SPLIT_AMOUNT)
random.shuffle(li)
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
if DEFINED_SIZE:
size = DEFINED_SIZES[cat]
print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
train_test[1].extend(li[size:])
return [train_test,type_dict, docs_count]

def tokenize(file_name):
list_words = re.split(r'\W+',mr.raw(fileids=file_name))
list_words = ()
if AMAZON:
list_words = re.split(r'\W+',file_name)
else:
list_words = re.split(r'\W+',mr.raw(fileids=file_name))

return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]

def CalculateAccuracy(li_results):
@@ -59,7 +86,8 @@ cat_num_docs = li[2]

##4) Create a) a dictionary with a category as the key and a dictionary of word-occurrences as values
#b) a dictionary with a category as the key and the number of words in it as the value
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
# {pos-> {w1 = 17 times}, {w2 = 32 times}...}
# {neg-> ....}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
@@ -74,7 +102,11 @@ for file_name in trainset:
##7) Check if category exists in dictionary, if not, create an empty dictionary,
#and put word count as zero
#and then insert words into the category's dictionary in both cases and update the word count
cat = mr.categories(fileids = file_name)[0]
cat = ''
if AMAZON:
cat = REVIEW_POL[file_name]
else:
cat = mr.categories(fileids = file_name)[0]
cat_word_dict[cat] = cat_word_dict.get(cat,{})
cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)

@@ -103,8 +135,9 @@ li_results=[]
#9) As in the training set, loop through the test set to get the entire text from each file
##10) Similar step, parse the string to get individual words
for file_name in testset:
print "File: ", file_name
minimum_neg_log_prob=1000000000
# print "File: ", file_name
# minimum_neg_log_prob=1000000000
minimum_neg_log_prob=-1000000000
# minimum_neg_log_prob = 0 # NEW
min_category=''
list_words = tokenize(file_name)
@@ -118,25 +151,31 @@ for file_name in testset:
# print "________________________________________________________________"
# print "________________________________________________________________"
# print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
neg_log_prob=-log(cat_num_docs[cat]/length_train)
# neg_log_prob=-log(cat_num_docs[cat]/length_train) # P(class)
neg_log_prob= log(cat_num_docs[cat]/length_train) # P(class)
# neg_log_prob = cat_num_docs[cat]/length_train
word_dict = cat_word_dict[cat]
count_cat = cat_word_count_dict[cat]
word_dict = cat_word_dict[cat] # word counts for each word in class
count_cat = cat_word_count_dict[cat] # total words in class
for w in list_words:
count_word_train=word_dict.get(w,0)
ratio = (count_word_train+1)/(count_cat+vocab_length)
neg_log_prob-=log(ratio)
# neg_log_prob-=log(ratio)
neg_log_prob+=log(ratio)

# neg_log_prob *= ratio
# print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
# break
# print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
if minimum_neg_log_prob>neg_log_prob:
# if minimum_neg_log_prob<neg_log_prob:
# print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
# print "\n\n", cat, ' :: ', neg_log_prob
# if minimum_neg_log_prob>neg_log_prob:
if minimum_neg_log_prob<neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
# print "Min cat: ", min_category
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
if AMAZON:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
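
For intuition on the scoring in this loop (a worked sketch with made-up counts, not values from the dataset): each class score is log P(class) plus the sum of logs of the Laplace-smoothed word ratios, and after this commit the class with the highest score wins, rather than the one with the lowest negative log probability.

# One smoothed term, with hypothetical counts:
count_word_train = 3      # the word appeared 3 times in this class's training text
count_cat = 5000          # total word tokens seen for this class
vocab_length = 12000      # distinct words across the whole training vocabulary
ratio = (count_word_train + 1) / (count_cat + vocab_length)   # ~0.000235
# an unseen word (count 0) still contributes log(1/17000) instead of log(0),
# which is what the +1 and +vocab_length smoothing buys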

###--------------------DEBUG STATEMENTS----------------------