######################
# Full version with all variations included
# To improve: create a main function allowing for multiple runs
# (a hedged sketch of such a wrapper is included at the bottom of this file)
######################
from __future__ import division
from math import log
from math import pow
import re
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
import random

STOP_WORDS = set(stopwords.words('english'))

SPLIT_AMOUNT = 0.6 # fraction of the data used for training
COMPLEMENT = 0 # 1 - complement only, 2 - delta / one-vs-all
WEIGHTED = 0 # 1 - normalize class weights
TF = 0 # 1 - log term frequency transform
IDF = 0 # 1 - inverse document frequency transform
LENGTH = 0 # 1 - document length normalization
AMAZON = 0 # 0 - use movie_reviews, 1 - use the Amazon review set
NO_OFF = 1 # 0 - use a random data-size offset, 1 - no offset
DEFINED_SIZE = 0 # 1 - use DEFINED_SIZES for the pos/neg training sets
DEFINED_SIZES = {'pos': 600, 'neg': 600}

REVIEW_POL = {}
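# The flags above select variations of what this file appears to implement:
# Rennie et al.'s (2003) Transformed Weight-normalized Complement Naive Bayes
# (TWCNB). TF/IDF/LENGTH are the term-frequency, inverse-document-frequency,
# and length-normalization transforms, WEIGHTED is per-class weight
# normalization, and COMPLEMENT switches the decision rule. A hedged sketch of
# the transform chain applied below to a raw count f (with N training docs,
# df of which contain the term):
#
#   tf    = log(f + 1, 2)                   # applied when TF == 1
#   tfidf = tf * log(N / df, 2)             # applied when IDF == 1
#   d     = tfidf / sqrt(sum(tfidf_k**2))   # applied when LENGTH == 1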
def SplitData():
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    offset_sample = random.randint(-400, 400)
    if AMAZON:
        offset_sample = random.randint(-600, 600)
        for category in ['pos', 'neg']:
            type_dict[category] = []
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read: # row = (review_text, polarity)
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    else:
        for category in mr.categories():
            type_dict[category] = mr.fileids(categories=category)
    if NO_OFF:
        offset_sample = 0
    print "offset_sample", offset_sample
    for cat in type_dict.keys():
        li = type_dict[cat]
        # random.shuffle(li)
        size = int(len(li)*SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            size = DEFINED_SIZES[cat]
        print "Category: ", cat, "Size:", size
        offset_sample *= -1 # alternate the sign so one class gains what the other loses
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]
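# Example (sketch): with movie_reviews (1000 'pos' + 1000 'neg' fileids),
# SPLIT_AMOUNT = 0.6 and NO_OFF = 1, SplitData() returns roughly
#   [[<1200 training ids>, <800 test ids>],
#    {'pos': [...], 'neg': [...]},
#    {'pos': 600, 'neg': 600}]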
def tokenize(file_name):
    # When AMAZON is set, `file_name` holds the raw review text itself;
    # otherwise it is an NLTK movie_reviews fileid.
    if AMAZON:
        list_words = re.split(r'\W+', file_name)
    else:
        list_words = re.split(r'\W+', mr.raw(fileids=file_name))
    return [w.lower() for w in list_words if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
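# Example: tokenize on the raw string "The plot was SO bad, I mean it!"
# yields ['plot', 'bad', 'mean'] -- lowercased, with one-letter tokens,
# non-alphabetic tokens, and English stopwords dropped.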
def CalculateAccuracy(li_results):
    # Confusion counts relative to the first result's predicted category:
    # a = true positives, b = false positives, c = false negatives, d = true negatives.
    a = 0
    b = 0
    c = 0
    d = 0
    cat = li_results[0][1]
    for t in li_results: # t = (doc, predicted, actual)
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a/(a+b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category ", cat
    print "precision =", precision
li = SplitData()
testset = li[0][1]
trainset = li[0][0]
cat_num_docs = li[2]
length_train = len(trainset)
print "length of training set ", length_train
##4) Create a) a dictionary with a category as the key and a word->occurrences
#      dictionary as the value, e.g. {'pos': {'w1': 17, 'w2': 32, ...}, 'neg': {...}}
#    b) a dictionary with a category as the key and the number of words in it as
#      the value, e.g. {'pos': 4000, 'neg': 7000}
cat_word_dict = {}
cat_word_count_dict = {}
complete_training_docs_tokens = []
num_docs_word_in = {}
counts_for_w = {}
##5) Loop through the training set to get the entire text from each file
##6) Parse each string to get individual words
for file_name in trainset:
    list_words = tokenize(file_name)
    complete_training_docs_tokens.append(list_words)
    counts_for_w[file_name] = {}
    ##7) Look up the document's category, then record per-document term counts
    #    and the number of training documents each word occurs in
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids=file_name)[0]
    counted = []
    for w in list_words:
        counts_for_w[file_name][w] = counts_for_w[file_name].get(w, 0)
        counts_for_w[file_name][w] += 1
        if w not in counted: # count each word at most once per document
            counted.append(w)
            num_docs_word_in[w] = num_docs_word_in.get(w, 0)
            num_docs_word_in[w] += 1
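# At this point counts_for_w[doc][w] holds the raw term frequency of w in doc,
# and num_docs_word_in[w] holds w's document frequency over the training set.
# Example: if "book" occurs 3 times in doc1 and once in doc2 (and nowhere else),
# then counts_for_w['doc1']['book'] == 3 and num_docs_word_in['book'] == 2.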
for fn in trainset:
    length_norm_val = 0
    if AMAZON:
        cat = REVIEW_POL[fn]
    else:
        cat = mr.categories(fileids=fn)[0]
    cat_word_dict[cat] = cat_word_dict.get(cat, {})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)
    # First pass: apply the TF and IDF transforms and accumulate the L2 norm
    for c_w in counts_for_w[fn].keys():
        if TF:
            counts_for_w[fn][c_w] = log(counts_for_w[fn][c_w] + 1, 2)
        if IDF:
            counts_for_w[fn][c_w] = counts_for_w[fn][c_w]*log(length_train/num_docs_word_in[c_w], 2)
        length_norm_val += (counts_for_w[fn][c_w]*counts_for_w[fn][c_w])
    length_norm_val = pow(length_norm_val, 0.5)
    # Second pass: optionally length-normalize, then fold the document's
    # (transformed) counts into the per-category totals
    for c_w in counts_for_w[fn].keys():
        if LENGTH:
            counts_for_w[fn][c_w] /= length_norm_val
        cat_word_count_dict[cat] += counts_for_w[fn][c_w]
        cat_word_dict[cat][c_w] = cat_word_dict[cat].get(c_w, 0)
        cat_word_dict[cat][c_w] += counts_for_w[fn][c_w]
##8) Get the vocabulary length: the number of distinct words across categories
vocab_length = len(num_docs_word_in.keys())
# print cat_word_dict['pos']['book'], cat_word_dict['neg']['book'] # debug; KeyErrors if 'book' is absent
print "Vocab", vocab_length

# Turn the per-category counts into Laplace-smoothed log weights
for cat in cat_word_dict.keys():
    count_cat = cat_word_count_dict[cat]
    weight_norm_cat = 0
    for w in cat_word_dict[cat].keys():
        cat_word_dict[cat][w] = (cat_word_dict[cat][w]+1)/(count_cat+vocab_length)
        cat_word_dict[cat][w] = log(cat_word_dict[cat][w], 2)
        weight_norm_cat += abs(cat_word_dict[cat][w])
    if WEIGHTED:
        for w in cat_word_dict[cat].keys():
            cat_word_dict[cat][w] = cat_word_dict[cat][w]/weight_norm_cat
# print cat_word_dict['pos']['book'], cat_word_dict['neg']['book'] # debug
# exit() # debug leftover: exiting here skipped the entire test phase below
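# The resulting weight for word w in class c is the smoothed log-likelihood
#   w_{c,w} = log2( (N_{c,w} + 1) / (N_c + |V|) )
# optionally divided by sum_w |w_{c,w}| when WEIGHTED is set.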
#### Congratulations! The classifier is trained; now run the Multinomial Naive
#### Bayes classifier (and its complement variant) on the test set
print 'pos', cat_num_docs['pos']/len(trainset) # class priors
print 'neg', cat_num_docs['neg']/len(trainset)
li_results = []
li_results2 = []
#9) As with the training set, loop through the test set to get the entire text from each file
##10) Parse each string to get individual words
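# Two decision rules are tracked per test document:
#   li_results  -- standard MNB: argmax_c [ log P(c) + sum_w f_w * w_{c,w} ]
#   li_results2 -- complement-style: assign the class whose *complement*
#                  (opposite class) fits the text worst, i.e. the minimal
#                  sum_w f_w * w_{~c,w} (no prior term in this variant).
# Despite its name, minimum_neg_log_prob below tracks the maximum posterior.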
for file_name in testset:
    minimum_neg_log_prob = -1000000000 # best (maximum) log posterior seen so far
    minimum_pos_log_prob = 100000000 # smallest complement likelihood seen so far
    min_category = ''
    max_category = ''
    list_words = tokenize(file_name)
    ##11) Get the probability for each category;
    #     any of the created dictionaries can be used to iterate over the categories
    for cat in cat_word_count_dict:
        inv_cat = 'pos'
        if cat == 'pos':
            inv_cat = 'neg'
        neg_log_prob = log(cat_num_docs[cat]/length_train, 2) # log prior
        pos_log_prob = 0
        opp_word_dict = cat_word_dict[inv_cat]
        opp_count_cat = cat_word_count_dict[inv_cat]
        word_dict = cat_word_dict[cat]
        count_cat = cat_word_count_dict[cat]
        my_word_count = {}
        for aw in list_words:
            my_word_count[aw] = my_word_count.get(aw, 0)
            my_word_count[aw] += 1
            if COMPLEMENT:
                neg_log_prob -= opp_word_dict.get(aw, 0)
            else:
                neg_log_prob += word_dict.get(aw, 0)
            pos_log_prob += opp_word_dict.get(aw, 0)
        # --- Commented-out variant: applies the TF/IDF/length transforms and
        # --- weight normalization on the test side as well, mirroring the
        # --- training-side code above; kept for reference.
        # length_norm = 0
        # weight_normalizing_ratio = 0
        # opp_weight_normalizing_ratio = 0
        # for kw in my_word_count.keys():
        #     count_word_train = word_dict.get(kw, 0)
        #     ratio = (count_word_train+1)/(count_cat+vocab_length)
        #     opp_count_word_train = opp_word_dict.get(kw, 0)
        #     opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)
        #     # weight norm (an abs() version was also tried)
        #     weight_normalizing_ratio += log(ratio, 2)
        #     opp_weight_normalizing_ratio += log(opp_ratio, 2)
        #     if TF:
        #         my_word_count[kw] = log(1 + my_word_count[kw])
        #     if IDF:
        #         my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw, 1)) # IDF
        #     ## length norm
        #     w_freq = my_word_count[kw]
        #     length_norm += (w_freq * w_freq)
        # length_norm = length_norm**(0.5)
        # print "WNR: ", weight_normalizing_ratio
        # for w in my_word_count.keys():
        #     count_word_train = word_dict.get(w, 0)
        #     ratio = (count_word_train+1)/(count_cat+vocab_length) # (Nw,c+1)/(Nc+|V|) = theta_c
        #     opp_count_word_train = opp_word_dict.get(w, 0)
        #     opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)
        #     word_freq = my_word_count[w]
        #     if LENGTH:
        #         word_freq = word_freq/length_norm # length normalization
        #     ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w
        #     opp_ratio = log(opp_ratio, 2)
        #     if WEIGHTED:
        #         ratio = ratio/weight_normalizing_ratio # weight normalization
        #         opp_ratio = opp_ratio/opp_weight_normalizing_ratio
        #     if COMPLEMENT == 1: # just complement
        #         neg_log_prob -= word_freq*opp_ratio
        #     else:
        #         neg_log_prob += word_freq*ratio # class probability
        #     pos_log_prob += word_freq*ratio
        #     if COMPLEMENT == 2: # one-vs-all
        #         neg_log_prob += word_freq*ratio
# print "NLP: ", neg_log_prob | |
# print file_name | |
# print "\n\n", cat, minimum_pos_log_prob , '<' , neg_log_prob | |
# if minimum_pos_log_prob>pos_log_prob: | |
if minimum_neg_log_prob<neg_log_prob: | |
min_category=cat | |
minimum_neg_log_prob=neg_log_prob | |
# minimum_pos_log_prob=pos_log_prob | |
if minimum_pos_log_prob>pos_log_prob: | |
max_category=cat | |
minimum_pos_log_prob=pos_log_prob | |
# print "Min cat: ", min_category | |
if AMAZON: | |
li_results.append((file_name,min_category,REVIEW_POL[file_name])) | |
else: | |
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) | |
# break | |
if AMAZON: | |
li_results2.append((file_name,max_category,REVIEW_POL[file_name])) | |
else: | |
li_results2.append((file_name,max_category,mr.categories(fileids = file_name)[0])) | |
###--------------------DEBUG STATEMENTS----------------------
#for t in li_results:
#    if t[1] != t[2]:
#        print t
###--------------------DEBUG STATEMENTS----------------------
#12) Evaluate the classifier under both decision rules
CalculateAccuracy(li_results)
CalculateAccuracy(li_results2)
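# Hedged sketch for the header's "to improve" note: wrapping the module-level
# script in functions so it can be run multiple times. run_classifier() is a
# hypothetical refactor of the script body above, not an existing function:
#
#   def main(num_runs=5):
#       for _ in range(num_runs):
#           run_classifier() # would re-split the data and report precision
#
#   if __name__ == '__main__':
#       main()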