from __future__ import division
from math import log
import re
import random
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
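# The nltk corpora must be downloaded before the first run; a minimal setup:
#   import nltk
#   nltk.download('movie_reviews')
#   nltk.download('stopwords')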
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6  # fraction of the data used for training
AMAZON = 0  # 1 = classify the Amazon CSV reviews, 0 = the nltk movie_reviews corpus
REVIEW_POL = {}  # review text -> polarity label, filled in AMAZON mode
DEFINED_SIZE = 1  # 1 = use the fixed per-category training sizes below
DEFINED_SIZES = {'pos': 600, 'neg': 600}  # training documents per category
def SplitData():
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    offset_sample = random.randint(-400, 400)
    print "offset_sample", offset_sample
    if AMAZON:
        offset_sample = random.randint(-600, 600)
        for category in ['pos', 'neg']:
            type_dict[category] = []
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    else:
        for category in mr.categories():
            type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict.keys():
        li = type_dict[cat]
        random.shuffle(li)
        size = int(len(li) * SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            size = DEFINED_SIZES[cat]
        print "Category: ", cat, "Size:", size
        offset_sample *= -1
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]
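# A sketch of the structure SplitData() returns (sizes are illustrative):
#   [[train_ids, test_ids],          # train_test: file ids (or review texts
#                                    #   in AMAZON mode) for each split
#    {'pos': [...], 'neg': [...]},   # type_dict: every document per category
#    {'pos': 600, 'neg': 600}]       # docs_count: training docs per category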
def tokenize(file_name):
    if AMAZON:
        # in AMAZON mode, file_name already holds the raw review text
        list_words = re.split(r'\W+', file_name)
    else:
        list_words = re.split(r'\W+', mr.raw(fileids=file_name))
    return [w.lower() for w in list_words
            if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
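# A hedged usage sketch (the exact tokens depend on the corpus text and the
# stopword list); for the first negative movie review this looks roughly like:
#   tokenize('neg/cv000_29416.txt')
#   -> ['plot', 'two', 'teen', 'couples', 'go', 'church', ...]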
def CalculateAccuracy(li_results):
    # Confusion counts relative to the first predicted category, where each
    # t = (document, predicted, actual):
    # a = true positives, b = false positives,
    # c = false negatives, d = true negatives
    a = 0
    b = 0
    c = 0
    d = 0
    cat = li_results[0][1]
    for t in li_results:
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a / (a + b)
    # recall = a / (a + c)
    print "precision =", precision
li = SplitData()
trainset = li[0][0]
testset = li[0][1]
cat_num_docs = li[2]  # number of training documents per category
## 4) Build two dictionaries from the training set:
##    a) cat_word_dict: category -> {word: occurrence count},
##       e.g. {'pos': {'w1': 17, 'w2': 32, ...}, 'neg': {...}}
##    b) cat_word_count_dict: category -> total number of words,
##       e.g. {'pos': 4000, 'neg': 7000}
cat_word_dict = {}
cat_word_count_dict = {}
## 5) Loop through the training set to get the entire text from each file
## 6) Parse the string to get individual words
for file_name in trainset:
    list_words = tokenize(file_name)
    ## 7) Look up the document's category; create its entries on first sight,
    ##    then add the words to the category's dictionary and word count
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids=file_name)[0]
    cat_word_dict[cat] = cat_word_dict.get(cat, {})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)
    # add this document's length to the category's total word count
    cat_word_count_dict[cat] += len(list_words)
    # update the occurrence count for each word
    for w in list_words:
        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + 1
## 8) Get the vocabulary length used in the smoothing denominator below.
##    Summing the per-category dictionary sizes counts a word once per
##    category it occurs in, so this slightly overestimates the number of
##    distinct words
vocab_length = 0
for dic in cat_word_dict.values():
    vocab_length += len(dic)
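# If the usual multinomial-NB |V| (distinct words across the whole training
# corpus) is wanted instead, a set union avoids the double counting; a
# minimal sketch, kept as a comment so behaviour is unchanged:
#   vocab = set()
#   for dic in cat_word_dict.values():
#       vocab.update(dic)
#   vocab_length = len(vocab)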
#### Congratulations! The classifier is trained; now run the multinomial Naive Bayes classifier on the test dataset
length_train = len(trainset)
li_results = []
## 9) As with the training set, loop through the test set to get each file's text
## 10) Similarly, parse the string to get individual words
for file_name in testset:
    # Track the best-scoring category; scores are log-probabilities, so
    # larger is better, and the tracker starts at a very small value
    max_log_prob = -1000000000
    best_category = ''
    list_words = tokenize(file_name)
    ## 11) Score each category; any of the created dictionaries can be used
    ##     to wade through the categories
    for cat in cat_word_count_dict:
        # log P(class): fraction of training documents in this category
        log_prob = log(cat_num_docs[cat] / length_train)
        word_dict = cat_word_dict[cat]        # word counts for each word in class
        count_cat = cat_word_count_dict[cat]  # total words in class
        for w in list_words:
            count_word_train = word_dict.get(w, 0)
            # Laplace-smoothed P(word | class)
            ratio = (count_word_train + 1) / (count_cat + vocab_length)
            log_prob += log(ratio)
        if max_log_prob < log_prob:
            best_category = cat
            max_log_prob = log_prob
    # record (document, predicted label, actual label)
    if AMAZON:
        li_results.append((file_name, best_category, REVIEW_POL[file_name]))
    else:
        li_results.append((file_name, best_category, mr.categories(fileids=file_name)[0]))
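# Worked example of the Laplace-smoothed estimate above, with made-up numbers:
# if 'great' appears 40 times in 'pos' (count_word_train = 40), the 'pos'
# class holds 4000 words (count_cat = 4000) and vocab_length = 16000, then
#   P('great' | 'pos') = (40 + 1) / (4000 + 16000) = 41 / 20000 = 0.00205
# An unseen word gets (0 + 1) / 20000 = 0.00005 instead of a hard zero,
# which keeps log() defined for every test word.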
# Uncomment to list the misclassified documents:
# for t in li_results:
#     if t[1] != t[2]:
#         print t
## 12) Evaluate the classifier
CalculateAccuracy(li_results)