Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/Naive_bayes.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
455 lines (381 sloc)
15.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################### | |
# Full version with all variations included | |
# To improve: create a main function allowing for multiple runs | |
# TF works properly! | |
###################### | |
from __future__ import division | |
from math import log | |
from math import pow | |
import re | |
import csv | |
from nltk.corpus import movie_reviews as mr | |
from nltk.corpus import stopwords | |
import random | |
STOP_WORDS = set(stopwords.words('english')) | |
SPLIT_AMOUNT = 0.6 # training amount from data | |
SHUFFLE = 1 | |
# COMPLEMENT = 0 # 1 - just comp, 2 - delta / one-v-all | |
# WEIGHTED = 0 # 1 - adjust weights | |
# TF = 0 # 1 - log term frew | |
# IDF = 0 # 1 - idf | |
# LENGTH = 0 # 1 - doc length adjust | |
AMAZON = 0 # 1 - use Amazon set | |
TWITTER = 0 # 1 - use Twitter set | |
TWEET_LIMIT = 5000 # we can't use the whole database, so just randomly grab this number of positive and negative reviews | |
OFFSET = 0 # introduced offset (skew) in datasets | |
REVIEW_POL={} | |
li = [] | |
# exit() | |
testset = [] | |
trainset = [] | |
# li = Preprocessor.startup() | |
cat_num_docs = {} | |
##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values | |
#b) a dictionary with a category as the key and the number of words in it as the value | |
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} | |
cat_word_dict={} | |
# {pos-> 4000 words} {neg-> 7000 words} | |
cat_word_count_dict={} | |
#val = my_dict.get(key, mydefaultval) | |
num_docs_word_in = {} | |
vocab_length=0 | |
word_cat_num_doc_dict={} | |
def SplitData(): | |
global REVIEW_POL | |
type_dict={} | |
docs_count={} | |
train_test = [[],[]] | |
# offset_sample = random.randint(-400,400) | |
offset_sample = OFFSET | |
# print "offset_sample", offset_sample | |
categories = ['neg', 'pos'] | |
if AMAZON: | |
# offset_sample = random.randint(-600,600) | |
for category in categories: | |
type_dict[category]=[] | |
with open('amazon_revs.csv', 'rb') as csvfile: | |
rev_read = csv.reader(csvfile) | |
for row in rev_read: | |
type_dict[row[1]].append(row[0]) | |
REVIEW_POL[row[0]] = row[1] | |
elif TWITTER: | |
for category in categories: | |
type_dict[category]=[] | |
with open('tweets.csv', 'rb') as csvfile: | |
rev_read = csv.reader(csvfile) | |
rev_read.next() # skip header row | |
number = [0,0] | |
for row in rev_read: | |
type_dict[ categories[ int(row[1]) ] ].append(row[3].strip()) | |
REVIEW_POL[row[3].strip()] = categories[int(row[1])] | |
number[int(row[1])] += 1 | |
if (number[0]>TWEET_LIMIT and number[1]>TWEET_LIMIT): | |
break | |
else: | |
for category in mr.categories(): | |
type_dict[category]=mr.fileids(categories=category) | |
# if NO_OFF: | |
# offset_sample = 0 | |
for cat in type_dict.keys(): | |
li = type_dict[cat] | |
if SHUFFLE: | |
random.shuffle(li) | |
size=int(len(li)*SPLIT_AMOUNT) + offset_sample | |
# if DEFINED_SIZE: | |
# size = DEFINED_SIZES[cat] | |
# print "Category: ", cat, "Size:", size | |
offset_sample *= -1 | |
docs_count[cat]=size | |
train_test[0].extend(li[:size]) | |
train_test[1].extend(li[size:]) | |
return [train_test,type_dict, docs_count] | |
def tokenize(file_name): | |
list_words = () | |
if AMAZON or TWITTER: | |
list_words = re.split(r'\W+',file_name) | |
else: | |
list_words = re.split(r'\W+',mr.raw(fileids=file_name)) | |
return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS] | |
def CalculateAccuracy(li_results): | |
a=0 | |
b=0 | |
c=0 | |
d=0 | |
cat = li_results[0][1] | |
for t in li_results: | |
if cat==t[1]: | |
if cat==t[2]: | |
a+=1 | |
else: | |
b+=1 | |
else: | |
if cat==t[2]: | |
c+=1 | |
else: | |
d+=1 | |
precision = a/(a+b) | |
# recall = a/(a+c) | |
# print "The following parameters are recorded for the category " , cat | |
# print "precision =", precision | |
return precision | |
def RunWholeThing(): | |
global AMAZON | |
global TWITTER | |
global OFFSET | |
global DEFINED_SIZE | |
global DEFINED_SIZES | |
OFFSET = 0 | |
AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set | |
TWITTER = 0 | |
tested = [' Bern:', ' Mult:', ' TIL :', ' DTIL:', ' CW :', ' TIWC:'] | |
while OFFSET < 400: | |
print "Movies with ", OFFSET | |
ans = DoTheThing() | |
OFFSET = -1*OFFSET | |
if OFFSET != 0: | |
ans2 = DoTheThing() | |
ans3 = [ans , ans2] | |
ans = [sum(e)/len(e) for e in zip(*ans3)] | |
a_i = 0 | |
for a in ans: | |
print tested[a_i], a | |
a_i += 1 | |
OFFSET = -1*OFFSET | |
OFFSET += 300 | |
OFFSET = 0 | |
AMAZON = 1 | |
while OFFSET < 600: | |
print "Amazon with ", OFFSET | |
ans = DoTheThing() | |
OFFSET = -1*OFFSET | |
if OFFSET != 0: | |
ans2 = DoTheThing() | |
ans3 = [ans , ans2] | |
ans = [sum(e)/len(e) for e in zip(*ans3)] | |
a_i = 0 | |
for a in ans: | |
print tested[a_i], a | |
a_i += 1 | |
OFFSET = -1*OFFSET | |
OFFSET += 400 | |
OFFSET = 0 | |
AMAZON = 0 | |
TWITTER = 1 | |
while OFFSET < 1000: | |
print "Twitter with ", OFFSET | |
ans = DoTheThing() | |
OFFSET = -1*OFFSET | |
if OFFSET != 0: | |
ans2 = DoTheThing() | |
ans3 = [ans , ans2] | |
ans = [sum(e)/len(e) for e in zip(*ans3)] | |
a_i = 0 | |
for a in ans: | |
print tested[a_i], a | |
a_i += 1 | |
OFFSET = -1*OFFSET | |
OFFSET += 800 | |
def DoTheThing(): | |
i = 0 | |
reps = 5 | |
b_nb = 0 | |
m_nb = 0 | |
til_nb = 0 | |
dtil_nb = 0 | |
cw_nb = 0 | |
tilcw_nb= 0 | |
while i < reps: | |
TrainMachine() | |
b_nb += TestMachine_Bern()/reps | |
m_nb += TestMachine(0,0,0,0,0)/reps | |
til_nb += TestMachine(1,1,1,0,0)/reps | |
dtil_nb += TestMachine(1,1,1,2,0)/reps | |
cw_nb += TestMachine(0,0,0,1,1)/reps | |
tilcw_nb += TestMachine(1,1,1,1,1)/reps | |
i+=1 | |
# print " Bern: %0.6f\n Mult: %0.6f\n TIL : %0.6f\n DTIL: %0.6f\n CW : %0.6f\n TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb) | |
return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb] | |
# li = Preprocessor.get_testset_trainset(corpus) | |
##5)Loop through the training set, to get the entire text from each file | |
##6) Parse the string to get individual words | |
def TrainMachine(): | |
global cat_word_dict | |
global cat_word_count_dict | |
global num_docs_word_in | |
global word_cat_num_doc_dict | |
global li | |
global testset | |
global trainset | |
global cat_num_docs | |
global vocab_length | |
li = SplitData() | |
testset = li[0][1] | |
trainset = li[0][0] | |
cat_num_docs = li[2] | |
for file_name in trainset: | |
list_words = tokenize(file_name) | |
##7) Check if category exists in dictionary, if not, create an empty dictionary, | |
#and put word count as zero | |
#and then insert words into the category's dictionary in both cases and update the word count | |
cat = '' | |
if AMAZON or TWITTER: | |
cat = REVIEW_POL[file_name] | |
else: | |
cat = mr.categories(fileids = file_name)[0] | |
cat_word_dict[cat] = cat_word_dict.get(cat,{}) | |
cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) | |
# add number of words to total word count for cat | |
cat_word_count_dict[cat]+=len(list_words) | |
# start count for number of occurences for each word | |
counted = [] | |
for w in list_words: | |
cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) | |
cat_word_dict[cat][w]+=1 | |
if w not in counted: | |
counted.append(w) | |
num_docs_word_in[w] = num_docs_word_in.get(w, 0) | |
num_docs_word_in[w] += 1 | |
word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{}) | |
word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0) | |
word_cat_num_doc_dict[w][cat]+=1 | |
for dic in cat_word_dict.values(): | |
vocab_length+=len(dic) | |
for w in word_cat_num_doc_dict: | |
for cat in cat_num_docs: | |
nct = word_cat_num_doc_dict[w].get(cat,0) | |
# convert #times a word appears into #times+1/#cat_reviews+2 | |
ratio = (nct+1)/(cat_num_docs[cat]+2) | |
word_cat_num_doc_dict[w][cat]=ratio | |
# ##8) Get the vocabulary length | |
# ## number of words, total across categories | |
# vocab_length=0 | |
# ####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset | |
# length_train = len(trainset) | |
# print "length of training set ", length_train | |
def TestMachine(t, i, l, c, w): | |
#9) Like in the training set,Loop through the test set, to get the entire text from each file | |
##10) Similar step, parse the string to get individual words | |
global trainset | |
global testset | |
TF = t # 1 - log term frew | |
IDF = i # 1 - idf | |
LENGTH = l # 1 - doc length adjust | |
COMPLEMENT = c # 1 - just comp, 2 - delta / one-v-all | |
WEIGHTED = w # 1 - adjust weights | |
length_train = len(trainset) | |
# print "length train " , length_train, len(testset) | |
li_results=[] | |
for file_name in testset: | |
# print "File: ", file_name | |
# minimum_neg_log_prob=1000000000 | |
minimum_neg_log_prob = -1000000000 # NEW | |
min_category='' | |
list_words = tokenize(file_name) | |
# print file_name | |
##11) Get the probability for each category, | |
#can use any of the created dictionaries to wade through the categories | |
for cat in cat_word_count_dict: | |
# print cat , cat_num_docs[cat]/len(trainset) | |
# print "________________________________________________________________" | |
# print "________________________________________________________________" | |
# print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" | |
# neg_log_prob=-log(cat_num_docs[cat]/length_train) | |
inv_cat = 'pos' | |
if cat == 'pos': | |
inv_cat = 'neg' | |
neg_log_prob = log(cat_num_docs[cat]/length_train, 2) | |
# neg_log_prob = cat_num_docs[cat]/length_train | |
opp_word_dict = cat_word_dict[inv_cat] | |
opp_count_cat = cat_word_count_dict[inv_cat] | |
word_dict = cat_word_dict[cat] | |
count_cat = cat_word_count_dict[cat] | |
## get frequency counts | |
my_word_count = {} | |
for aw in list_words: | |
my_word_count[aw] = my_word_count.get(aw, 0) | |
my_word_count[aw]+=1 | |
## calculate necessary norms | |
length_norm = 0 | |
weight_normalizing_ratio = 0 | |
opp_weight_normalizing_ratio = 0 | |
for kw in my_word_count.keys(): | |
count_word_train=word_dict.get(kw,0) | |
ratio = (count_word_train+1)/(count_cat+vocab_length) | |
# if COMPLEMENT: | |
opp_count_word_train=opp_word_dict.get(kw,0) | |
opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) | |
# weight norm | |
weight_normalizing_ratio += abs(log(ratio, 2)) | |
opp_weight_normalizing_ratio += abs(log(opp_ratio, 2)) | |
if TF: | |
my_word_count[kw] = log(1 + my_word_count[kw]) | |
if IDF: | |
my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,0.01), 2) #IDF | |
## length norm | |
w_freq = my_word_count[kw] | |
length_norm += pow(w_freq, 2) | |
length_norm = pow(length_norm, 0.5) | |
# print "LN: ", length_norm | |
for w in my_word_count.keys(): | |
count_word_train=word_dict.get(w,0) | |
ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c | |
# if COMPLEMENT: | |
opp_count_word_train=opp_word_dict.get(w,0) | |
opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) | |
word_freq = my_word_count[w] | |
if LENGTH: | |
word_freq = word_freq/length_norm # length normalization | |
ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w | |
opp_ratio = log(opp_ratio, 2) | |
if WEIGHTED: | |
ratio = ratio/weight_normalizing_ratio # weight normalization | |
opp_ratio = opp_ratio/opp_weight_normalizing_ratio | |
if COMPLEMENT == 1: # just complement | |
neg_log_prob -= word_freq*opp_ratio | |
else: | |
neg_log_prob += word_freq*ratio # class probability | |
if COMPLEMENT == 2: # one-v-all | |
neg_log_prob += word_freq*ratio | |
# break | |
# print "NLP: ", neg_log_prob | |
# print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob | |
# if minimum_neg_log_prob>neg_log_prob: | |
if minimum_neg_log_prob<neg_log_prob: | |
min_category=cat | |
minimum_neg_log_prob=neg_log_prob | |
# print "Min cat: ", min_category | |
if AMAZON or TWITTER: | |
li_results.append((file_name,min_category,REVIEW_POL[file_name])) | |
else: | |
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) | |
# break | |
#12) Evaluating the classifier | |
precision = CalculateAccuracy(li_results) | |
return precision | |
def TestMachine_Bern(): | |
li_results=[] | |
#5) Like in the training set,Loop through the test set, to get the individual words | |
for file_name in testset: | |
minimum_neg_log_prob=1000000000 | |
min_category='' | |
set_list_words = set(tokenize(file_name)) | |
##6) Get the probability for each category, | |
#using the cat_num_docs dictionary to wade through the categories | |
for cat in cat_num_docs: | |
neg_log_prob=-log(cat_num_docs[cat]/len(trainset)) | |
for w in word_cat_num_doc_dict: | |
if w in set_list_words: | |
neg_log_prob-=log(word_cat_num_doc_dict[w][cat]) | |
else: | |
neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat]) | |
if minimum_neg_log_prob>neg_log_prob: | |
min_category=cat | |
minimum_neg_log_prob=neg_log_prob | |
if AMAZON or TWITTER: | |
li_results.append((file_name,min_category,REVIEW_POL[file_name])) | |
else: | |
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) | |
# break | |
precision = CalculateAccuracy(li_results) | |
return precision | |
RunWholeThing() |