# NOTE(review): the lines that used to be here were GitHub blob-view chrome
# ("Skip to content", commit hash, branch-switcher text, sloc counter) left
# over from a copy/paste of the web UI, not part of the program. They have
# been replaced by this comment so the file is parseable Python.
from __future__ import division
from math import log
import random
import csv
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
# English stop words to exclude from the token stream (requires nltk data).
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data
# Data-source switch: truthy -> read Amazon reviews from amazon_revs.csv,
# falsy -> use the nltk movie_reviews corpus.
AMAZON = 1
# Maps review text (Amazon mode) to its gold polarity label ('pos'/'neg');
# filled in by SplitData() and read back during training/evaluation.
REVIEW_POL={}
# When truthy, per-category training-set sizes come from DEFINED_SIZES
# instead of SPLIT_AMOUNT plus a random offset.
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 948, 'neg': 948}
def SplitData():
    """Split the review corpus into training and test sets.

    Source is either the Amazon CSV (when AMAZON is truthy; also fills the
    module-level REVIEW_POL text->label map) or the nltk movie_reviews
    corpus. Each category list is shuffled, then the first `size` items go
    to training and the rest to test.

    Returns:
        [train_test, type_dict, docs_count] where
        train_test[0]/train_test[1] are the training/test item lists,
        type_dict maps category -> all items of that category, and
        docs_count maps category -> number of training items.
    """
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    # Random perturbation of the split point; its sign is flipped after each
    # category so the total corpus size stays roughly balanced.
    offset_sample = random.randint(-400, 400)
    # Single-argument print works identically under Python 2 and 3.
    print("offset_sample %s" % offset_sample)
    if AMAZON:
        offset_sample = random.randint(-600, 600)
        for category in ['pos', 'neg']:
            type_dict[category] = []
        # NOTE: 'rb' + csv.reader is the Python 2 idiom; Python 3 would need
        # text mode with newline=''.
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                # row[0] = review text, row[1] = polarity label
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    else:
        for category in mr.categories():
            type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict:
        li = type_dict[cat]
        random.shuffle(li)
        size = int(len(li) * SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            # Fixed per-category training size overrides the random split.
            size = DEFINED_SIZES[cat]
        print("Category:  %s Size: %s" % (cat, size))
        offset_sample *= -1
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]
def tokenize(file_name):
    """Tokenize one review into lowercase alphabetic words.

    In Amazon mode (AMAZON truthy) `file_name` is the raw review text
    itself; otherwise it is an nltk movie_reviews fileid whose raw text is
    fetched from the corpus.

    Returns a list of lowercased tokens that are purely alphabetic, longer
    than one character, and not in STOP_WORDS.
    """
    if AMAZON:
        raw_text = file_name  # the "file name" is actually the review text
    else:
        raw_text = mr.raw(fileids=file_name)
    # Split on runs of non-word characters; filtering below drops the empty
    # strings and digit/underscore tokens this can produce.
    words = re.split(r'\W+', raw_text)
    return [w.lower() for w in words
            if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
def CalculateAccuracy(li_results):
    """Print and return the classifier's precision for one category.

    Args:
        li_results: list of (item, predicted_label, actual_label) tuples.
            The reference category is taken from the FIRST tuple's
            predicted label, so precision is reported for that class.

    Returns:
        Precision = TP / (TP + FP) for the reference category, or 0.0 when
        nothing was predicted as that category (or the list is empty).
    """
    if not li_results:
        # Nothing classified: avoid IndexError on an empty result list.
        print("precision = 0.0")
        return 0.0
    ref_cat = li_results[0][1]  # reference category = first prediction
    true_pos = 0   # predicted ref_cat, actually ref_cat
    false_pos = 0  # predicted ref_cat, actually other
    false_neg = 0  # predicted other, actually ref_cat
    true_neg = 0   # predicted other, actually other
    for _, predicted, actual in li_results:
        if predicted == ref_cat:
            if actual == ref_cat:
                true_pos += 1
            else:
                false_pos += 1
        else:
            if actual == ref_cat:
                false_neg += 1
            else:
                true_neg += 1
    predicted_ref = true_pos + false_pos
    # Guard the division: originally this raised ZeroDivisionError when no
    # item was predicted as ref_cat.
    precision = true_pos / predicted_ref if predicted_ref else 0.0
    # recall would be true_pos / (true_pos + false_neg)
    print("precision = %s" % precision)
    return precision
# ---------------------------------------------------------------------------
# Script driver: split the corpus, train a Bernoulli naive Bayes model on
# document frequencies, classify the test set, and report precision.
# ---------------------------------------------------------------------------
li = SplitData()
testset = li[0][1]
trainset = li[0][0]
cat_num_docs = li[2]  # category -> number of training documents

# word -> {category -> value}. During training the value is the number of
# training documents of that category containing the word; after smoothing
# it becomes the probability P(word present | category).
word_cat_num_doc_dict = {}

# Training pass: count, per word, the documents of each category in which
# it appears (set() so a word counts at most once per document).
for file_name in trainset:
    list_words = tokenize(file_name)
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids=file_name)[0]
    for w in set(list_words):
        word_cat_num_doc_dict.setdefault(w, {})
        word_cat_num_doc_dict[w][cat] = word_cat_num_doc_dict[w].get(cat, 0) + 1

# Laplace smoothing: (doc count + 1) / (category size + 2) turns counts into
# Bernoulli probabilities that are never exactly 0 or 1.
for w in word_cat_num_doc_dict:
    for cat in cat_num_docs:
        nct = word_cat_num_doc_dict[w].get(cat, 0)
        word_cat_num_doc_dict[w][cat] = (nct + 1) / (cat_num_docs[cat] + 2)

print("The Classifier is trained and it took")

li_results = []
# Classification pass: for each test document pick the category minimizing
# the negative log posterior (prior plus a term for every vocabulary word,
# present or absent — the Bernoulli model).
for file_name in testset:
    minimum_neg_log_prob = 1000000000  # sentinel larger than any real score
    min_category = ''
    set_list_words = set(tokenize(file_name))
    for cat in cat_num_docs:
        neg_log_prob = -log(cat_num_docs[cat] / len(trainset))  # prior
        for w in word_cat_num_doc_dict:
            if w in set_list_words:
                neg_log_prob -= log(word_cat_num_doc_dict[w][cat])
            else:
                neg_log_prob -= log(1 - word_cat_num_doc_dict[w][cat])
        if minimum_neg_log_prob > neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob
    # Record (item, predicted label, gold label) for evaluation.
    if AMAZON:
        li_results.append((file_name, min_category, REVIEW_POL[file_name]))
    else:
        li_results.append((file_name, min_category,
                           mr.categories(fileids=file_name)[0]))

CalculateAccuracy(li_results)