Commit

Naive Bayes stuff
BernoulliNB and MultinomialNB follow the standard algorithm (summing log
probabilities to avoid floating-point underflow).
ComplementMNB implements scikit-learn's multinomial formula, modified for the
paper.
asp10012 committed Mar 29, 2016
1 parent 2a8d5ef commit 6855e36
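
A note on the log trick mentioned above: a review contributes hundreds of per-word probabilities, and their direct product underflows double precision, while the equivalent sum of negative logs stays well within range (which is why both classifiers accumulate neg_log_prob and pick the smallest score). A minimal sketch with made-up numbers:

from math import log

probs = [1e-3] * 2000          # e.g. 2000 word likelihoods of about 0.001 each

product = 1.0
for p in probs:
    product *= p               # underflows to 0.0 long before the loop finishes

neg_log_sum = -sum(log(p) for p in probs)

print(product)                 # 0.0 -- useless for comparing classes
print(neg_log_sum)             # ~13815.5 -- still comparable across classes
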
Showing 4 changed files with 67,038 additions and 0 deletions.
108 changes: 108 additions & 0 deletions BernoulliNB.py
@@ -0,0 +1,108 @@
from __future__ import division
from math import log
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))

SPLIT_AMOUNT = 0.6 # training amount from data

def SplitData():
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    for category in mr.categories():
        type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict.keys():
        li = type_dict[cat]
        size = int(len(li) * SPLIT_AMOUNT)
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]

def tokenize(file_name):
    list_words = re.split(r'\W+', mr.raw(fileids=file_name))
    return [w.lower() for w in list_words if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]

def CalculateAccuracy(li_results):
    a = 0
    b = 0
    c = 0
    d = 0
    cat = li_results[0][1]
    for t in li_results:
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a / (a + b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category " , cat
    print "precision =", precision

# li = Preprocessor.get_testset_trainset(corpus)
li = SplitData()
testset = li[0][1]
trainset = li[0][0]
# li = Preprocessor.startup()
cat_num_docs = li[2]

#3)Create a dictionary with a word as the key and a dictionary as the value
## in the dictionary the category as key and number of documents in that category where it occurs as value
# 2d dict: word -> {pos ...}, {neg ...}
word_cat_num_doc_dict={}

#4)Loop through the reuters dataset, to get the entire text from each file in the training set
## Parse the string to get individual words - done by get_list_tokens_nltk()
for file_name in trainset:
    list_words = tokenize(file_name)
    cat = mr.categories(fileids=file_name)[0]

    for w in set(list_words):
        word_cat_num_doc_dict[w] = word_cat_num_doc_dict.get(w, {})
        word_cat_num_doc_dict[w][cat] = word_cat_num_doc_dict[w].get(cat, 0)
        word_cat_num_doc_dict[w][cat] += 1

for w in word_cat_num_doc_dict:
    for cat in cat_num_docs:
        nct = word_cat_num_doc_dict[w].get(cat, 0)
        # convert #times a word appears into (#times+1)/(#cat_reviews+2)
        ratio = (nct + 1) / (cat_num_docs[cat] + 2)
        word_cat_num_doc_dict[w][cat] = ratio

print "The Classifier is trained and it took"


li_results=[]
#5) Like in the training set,Loop through the test set, to get the individual words
for file_name in testset:
    minimum_neg_log_prob = 1000000000
    min_category = ''
    set_list_words = set(tokenize(file_name))

    ##6) Get the probability for each category,
    # using the cat_num_docs dictionary to wade through the categories
    for cat in cat_num_docs:
        neg_log_prob = -log(cat_num_docs[cat] / len(trainset))
        for w in word_cat_num_doc_dict:
            if w in set_list_words:
                neg_log_prob -= log(word_cat_num_doc_dict[w][cat])
            else:
                neg_log_prob -= log(1 - word_cat_num_doc_dict[w][cat])
        if minimum_neg_log_prob > neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob

    li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))

CalculateAccuracy(li_results)
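
A toy version of the Bernoulli scoring loop above, using hypothetical document counts rather than real movie_reviews statistics: after smoothing each ratio to (nct+1)/(#cat_reviews+2), every vocabulary word contributes a factor, P(w|cat) when the word appears in the review and 1-P(w|cat) when it does not:

from math import log

num_pos_reviews = 800                                      # hypothetical training size for "pos"
doc_freq_pos = {'great': 400, 'boring': 40, 'plot': 500}   # reviews containing each word
review_words = set(['great', 'plot'])                      # words present in one test review

neg_log_prob = -log(0.5)                                   # prior: half the training docs are "pos"
for w, nct in doc_freq_pos.items():
    ratio = (nct + 1.0) / (num_pos_reviews + 2)            # smoothed P(w | pos)
    if w in review_words:
        neg_log_prob -= log(ratio)                         # word present in the review
    else:
        neg_log_prob -= log(1 - ratio)                     # word absent ('boring')

print(neg_log_prob)                                        # repeat per class; smallest score wins
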



58 changes: 58 additions & 0 deletions ComplementMNB.py
@@ -0,0 +1,58 @@
import csv
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import accuracy_score

REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)),'allrevs.csv')
# allrevs.csv contains two columns
# first column is the review content (quoted)
# second column is the assigned sentiment (positive or negative)
def load_file():
    with open(REVIEWS) as csv_file:
        reader = csv.reader(csv_file, delimiter=",", quotechar='"')
        reader.next()  # skip the header row
        data = []
        target = []
        for row in reader:
            # skip missing data
            if row[0] and row[1]:
                data.append(row[0])
                target.append(row[1])

    return data, target

# preprocess creates the term frequency matrix for the review data set
def preprocess():
    data, target = load_file()
    count_vectorizer = CountVectorizer(binary=True, stop_words='english')
    data = count_vectorizer.fit_transform(data)
    # tfidf_data = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True).fit_transform(data)
    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
    transformer.fit(data)
    tfidf_data = transformer.transform(data)

    return tfidf_data

def learn_model(data, target):
    # preparing data for split validation. 60% training, 40% test
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(data, target, test_size=0.4, random_state=43)
    classifier = MultinomialNB().fit(data_train, target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test, predicted)

def evaluate_model(target_true, target_predicted):
    # print classification_report(target_true,target_predicted)
    print "The accuracy score is {:.2%}".format(accuracy_score(target_true, target_predicted))


data,target = load_file()
tf_idf = preprocess()
learn_model(tf_idf,target)
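
For what it's worth, the same CountVectorizer → TfidfTransformer → MultinomialNB chain can be written as a single scikit-learn Pipeline, which also keeps the idf statistics fitted on the training split only (preprocess() above fits them on the full dataset). A sketch reusing load_file(), evaluate_model(), and the same 60/40 split from this file; learn_model_pipeline is a hypothetical helper and the parameter values are carried over, not tuned:

from sklearn.pipeline import Pipeline

def learn_model_pipeline():
    data, target = load_file()
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        data, target, test_size=0.4, random_state=43)
    pipeline = Pipeline([
        ('vect', CountVectorizer(binary=True, stop_words='english')),
        ('tfidf', TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)),
        ('clf', MultinomialNB()),
    ])
    pipeline.fit(data_train, target_train)   # vectorizer and idf are fit on training text only
    evaluate_model(target_test, pipeline.predict(data_test))
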


152 changes: 152 additions & 0 deletions MultinomialNB.py
@@ -0,0 +1,152 @@
from __future__ import division
from math import log
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data
# need to change calculations for stuff
# https://www.dataquest.io/blog/naive-bayes-movies/

def SplitData():
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    for category in mr.categories():
        type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict.keys():
        li = type_dict[cat]
        size = int(len(li) * SPLIT_AMOUNT)
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]

def tokenize(file_name):
    list_words = re.split(r'\W+', mr.raw(fileids=file_name))
    return [w.lower() for w in list_words if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]

def CalculateAccuracy(li_results):
    a = 0
    b = 0
    c = 0
    d = 0
    cat = li_results[0][1]
    for t in li_results:
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a / (a + b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category " , cat
    print "precision =", precision

# li = Preprocessor.get_testset_trainset(corpus)
li = SplitData()
testset = li[0][1]
trainset = li[0][0]
# li = Preprocessor.startup()
cat_num_docs = li[2]



##4)Create a) a dictionary with a category as the key and dictionary of words-occurences as values
#b) a dictionary with a category as the key and the number of words in it as the value
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
#val = my_dict.get(key, mydefaultval)

##5)Loop through the training set, to get the entire text from each file
##6) Parse the string to get individual words
for file_name in trainset:
    list_words = tokenize(file_name)

    ##7) Check if category exists in dictionary, if not, create an empty dictionary,
    # and put word count as zero
    # and then insert words into the category's dictionary in both cases and update the word count
    cat = mr.categories(fileids=file_name)[0]
    cat_word_dict[cat] = cat_word_dict.get(cat, {})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)

    # add number of words to total word count for cat
    cat_word_count_dict[cat] += len(list_words)
    # start count for number of occurrences of each word
    for w in list_words:
        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
        cat_word_dict[cat][w] += 1



##8) Get the vocabulary length
## number of words, total across categories
vocab_length = 0
for dic in cat_word_dict.values():
    vocab_length += len(dic)





#### Congratulations! The classifier is trained; now it is time to run the Multinomial Naive Bayes classifier on the test dataset
length_train = len(trainset)
li_results=[]
#9) Like in the training set,Loop through the test set, to get the entire text from each file
##10) Similar step, parse the string to get individual words
for file_name in testset:
    print "File: ", file_name
    minimum_neg_log_prob = 1000000000
    # minimum_neg_log_prob = 0 # NEW
    min_category = ''
    list_words = tokenize(file_name)

    ##11) Get the probability for each category,
    # can use any of the created dictionaries to wade through the categories
    for cat in cat_word_count_dict:
        neg_log_prob = -log(cat_num_docs[cat] / length_train)
        # neg_log_prob = cat_num_docs[cat]/length_train
        word_dict = cat_word_dict[cat]
        count_cat = cat_word_count_dict[cat]
        for w in list_words:
            count_word_train = word_dict.get(w, 0)
            ratio = (count_word_train + 1) / (count_cat + vocab_length)
            neg_log_prob -= log(ratio)
            # neg_log_prob *= ratio

        # if minimum_neg_log_prob < neg_log_prob:
        if minimum_neg_log_prob > neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob
    li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))

###--------------------DEBUG STATEMENTS----------------------
# for t in li_results:
#     if t[1] != t[2]:
#         print t
###--------------------DEBUG STATEMENTS----------------------


#12) Evaluating the classifier

CalculateAccuracy(li_results)
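
And a toy version of the multinomial scoring loop, again with hypothetical counts: unlike the Bernoulli classifier, only words that actually occur in the test review contribute, each with the Laplace-smoothed ratio (count_word_train + 1) / (count_cat + vocab_length):

from math import log

count_cat = 350000                         # hypothetical total word tokens in "pos" training reviews
vocab_length = 40000                       # hypothetical vocabulary size used for smoothing
word_counts = {'great': 900, 'plot': 1200} # training counts for words seen in the test review
prior = 0.5                                # fraction of training documents that are "pos"

neg_log_prob = -log(prior)
for w in ['great', 'plot', 'unseen']:      # the test review's words; repeats count again
    count_word_train = word_counts.get(w, 0)
    ratio = (count_word_train + 1.0) / (count_cat + vocab_length)
    neg_log_prob -= log(ratio)

print(neg_log_prob)                        # repeat per class; the smallest score is predicted
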
