from __future__ import division
from math import log
import re
import random
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
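# The nltk corpora must be downloaded before the first run; a minimal setup:
#   import nltk
#   nltk.download('movie_reviews')
#   nltk.download('stopwords')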
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6  # fraction of the data used for training
AMAZON = 0  # 1 = classify the Amazon CSV reviews, 0 = the nltk movie_reviews corpus
REVIEW_POL = {}  # review text -> polarity label, filled in AMAZON mode
DEFINED_SIZE = 1  # 1 = use the fixed per-category training sizes below
DEFINED_SIZES = {'pos': 600, 'neg': 600}  # training documents per category
def SplitData():
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    offset_sample = random.randint(-400, 400)
    print "offset_sample", offset_sample
    if AMAZON:
        offset_sample = random.randint(-600, 600)
        for category in ['pos', 'neg']:
            type_dict[category] = []
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    else:
        for category in mr.categories():
            type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict.keys():
        li = type_dict[cat]
        random.shuffle(li)
        size = int(len(li) * SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            size = DEFINED_SIZES[cat]
        print "Category: ", cat, "Size:", size
        offset_sample *= -1
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]
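# A sketch of the structure SplitData() returns (sizes are illustrative):
#   [[train_ids, test_ids],          # train_test: file ids (or review texts
#                                    #   in AMAZON mode) for each split
#    {'pos': [...], 'neg': [...]},   # type_dict: every document per category
#    {'pos': 600, 'neg': 600}]       # docs_count: training docs per category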
def tokenize(file_name):
    if AMAZON:
        # in AMAZON mode, file_name already holds the raw review text
        list_words = re.split(r'\W+', file_name)
    else:
        list_words = re.split(r'\W+', mr.raw(fileids=file_name))
    return [w.lower() for w in list_words
            if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
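# A hedged usage sketch (the exact tokens depend on the corpus text and the
# stopword list); for the first negative movie review this looks roughly like:
#   tokenize('neg/cv000_29416.txt')
#   -> ['plot', 'two', 'teen', 'couples', 'go', 'church', ...]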
def CalculateAccuracy(li_results):
    # Confusion counts relative to the first predicted category, where each
    # t = (document, predicted, actual):
    # a = true positives, b = false positives,
    # c = false negatives, d = true negatives
    a = 0
    b = 0
    c = 0
    d = 0
    cat = li_results[0][1]
    for t in li_results:
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a / (a + b)
    # recall = a / (a + c)
    print "precision =", precision
li = SplitData()
trainset = li[0][0]
testset = li[0][1]
cat_num_docs = li[2]  # number of training documents per category
## 4) Build two dictionaries from the training set:
##    a) cat_word_dict: category -> {word: occurrence count},
##       e.g. {'pos': {'w1': 17, 'w2': 32, ...}, 'neg': {...}}
##    b) cat_word_count_dict: category -> total number of words,
##       e.g. {'pos': 4000, 'neg': 7000}
cat_word_dict = {}
cat_word_count_dict = {}
## 5) Loop through the training set to get the entire text from each file
## 6) Parse the string to get individual words
for file_name in trainset:
    list_words = tokenize(file_name)
    ## 7) Look up the document's category; create its entries on first sight,
    ##    then add the words to the category's dictionary and word count
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids=file_name)[0]
    cat_word_dict[cat] = cat_word_dict.get(cat, {})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)
    # add this document's length to the category's total word count
    cat_word_count_dict[cat] += len(list_words)
    # update the occurrence count for each word
    for w in list_words:
        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + 1
## 8) Get the vocabulary length used in the smoothing denominator below.
##    Summing the per-category dictionary sizes counts a word once per
##    category it occurs in, so this slightly overestimates the number of
##    distinct words
vocab_length = 0
for dic in cat_word_dict.values():
    vocab_length += len(dic)
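# If the usual multinomial-NB |V| (distinct words across the whole training
# corpus) is wanted instead, a set union avoids the double counting; a
# minimal sketch, kept as a comment so behaviour is unchanged:
#   vocab = set()
#   for dic in cat_word_dict.values():
#       vocab.update(dic)
#   vocab_length = len(vocab)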
#### Congratulations! The classifier is trained; now run the multinomial Naive Bayes classifier on the test dataset
length_train = len(trainset)
li_results = []
## 9) As with the training set, loop through the test set to get each file's text
## 10) Similarly, parse the string to get individual words
for file_name in testset:
    # Track the best-scoring category; scores are log-probabilities, so
    # larger is better, and the tracker starts at a very small value
    max_log_prob = -1000000000
    best_category = ''
    list_words = tokenize(file_name)
    ## 11) Score each category; any of the created dictionaries can be used
    ##     to wade through the categories
    for cat in cat_word_count_dict:
        # log P(class): fraction of training documents in this category
        log_prob = log(cat_num_docs[cat] / length_train)
        word_dict = cat_word_dict[cat]        # word counts for each word in class
        count_cat = cat_word_count_dict[cat]  # total words in class
        for w in list_words:
            count_word_train = word_dict.get(w, 0)
            # Laplace-smoothed P(word | class)
            ratio = (count_word_train + 1) / (count_cat + vocab_length)
            log_prob += log(ratio)
        if max_log_prob < log_prob:
            best_category = cat
            max_log_prob = log_prob
    # record (document, predicted label, actual label)
    if AMAZON:
        li_results.append((file_name, best_category, REVIEW_POL[file_name]))
    else:
        li_results.append((file_name, best_category, mr.categories(fileids=file_name)[0]))
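# Worked example of the Laplace-smoothed estimate above, with made-up numbers:
# if 'great' appears 40 times in 'pos' (count_word_train = 40), the 'pos'
# class holds 4000 words (count_cat = 4000) and vocab_length = 16000, then
#   P('great' | 'pos') = (40 + 1) / (4000 + 16000) = 41 / 20000 = 0.00205
# An unseen word gets (0 + 1) / 20000 = 0.00005 instead of a hard zero,
# which keeps log() defined for every test word.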
# Uncomment to list the misclassified documents:
# for t in li_results:
#     if t[1] != t[2]:
#         print t
## 12) Evaluate the classifier
CalculateAccuracy(li_results)