From 47d86abae52112fc8ce25868505ede371514cf65 Mon Sep 17 00:00:00 2001 From: ap1113 Date: Wed, 20 Apr 2016 19:42:52 -0400 Subject: [PATCH] functions i did a thing - but i did it at 2 am so this might be bad --- OLD_VERSIONS/CWMNB.py | 254 +++++++++++++++++++++++++++ OLD_VERSIONS/TCWNB2.py | 242 ++++++++++++++++++++++++++ OLD_VERSIONS/TWCNB_old.py | 235 +++++++++++++++++++++++++ TWCNB_v0_2.py | 358 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 1089 insertions(+) create mode 100644 OLD_VERSIONS/CWMNB.py create mode 100644 OLD_VERSIONS/TCWNB2.py create mode 100644 OLD_VERSIONS/TWCNB_old.py create mode 100644 TWCNB_v0_2.py diff --git a/OLD_VERSIONS/CWMNB.py b/OLD_VERSIONS/CWMNB.py new file mode 100644 index 0000000..2e2b524 --- /dev/null +++ b/OLD_VERSIONS/CWMNB.py @@ -0,0 +1,254 @@ +###################### +# This version is CWMNB only +###################### + +from __future__ import division +from math import log +import re +import csv +from nltk.corpus import movie_reviews as mr +from nltk.corpus import stopwords +import random +STOP_WORDS = set(stopwords.words('english')) +SPLIT_AMOUNT = 0.6 # training amount from data + +COMPLEMENT = 0 +WEIGHTED = 0 +USE_IDF = 0 +AMAZON = 0 +REVIEW_POL={} +DEFINED_SIZE = 1 +DEFINED_SIZES = {'pos': 600, 'neg': 600} +def SplitData(): + type_dict={} + docs_count={} + train_test = [[],[]] + offset_sample = random.randint(-400,400) + print "offset_sample", offset_sample + if AMAZON: + offset_sample = random.randint(-600,600) + for category in ['pos', 'neg']: + type_dict[category]=[] + with open('amazon_revs.csv', 'rb') as csvfile: + rev_read = csv.reader(csvfile) + for row in rev_read: + type_dict[row[1]].append(row[0]) + REVIEW_POL[row[0]] = row[1] + else: + for category in mr.categories(): + type_dict[category]=mr.fileids(categories=category) + for cat in type_dict.keys(): + li = type_dict[cat] + random.shuffle(li) + size=int(len(li)*SPLIT_AMOUNT) + offset_sample + if DEFINED_SIZE: + size = DEFINED_SIZES[cat] + print "Category: ", cat, "Size:", size + offset_sample *= -1 + docs_count[cat]=size + train_test[0].extend(li[:size]) + train_test[1].extend(li[size:]) + return [train_test,type_dict, docs_count] + +def tokenize(file_name): + list_words = () + if AMAZON: + list_words = re.split(r'\W+',file_name) + else: + list_words = re.split(r'\W+',mr.raw(fileids=file_name)) + + return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + +# li = Preprocessor.get_testset_trainset(corpus) +li = SplitData() +# exit() +testset = li[0][1] +trainset = li[0][0] +# li = Preprocessor.startup() +cat_num_docs = li[2] + + + +##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values + #b) a dictionary with a category as the key and the number of words in it as the value +# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +complete_training_docs_tokens = [] + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words +for file_name in trainset: 
+ list_words = tokenize(file_name) + complete_training_docs_tokens.append(list_words) + + +##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + +# add number of words to total word count for cat + cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + for w in list_words: + cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + cat_word_dict[cat][w]+=1 + + + +##8) Get the vocabulary length +## number of words, total across categories +vocab_length=0 +num_docs_word_in = {} +for dic in cat_word_dict.values(): + vocab_length+=len(dic) + if USE_IDF: + for uniq_word in dic.keys(): + num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1) + num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr) + + + +####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +length_train = len(trainset) +li_results=[] +#9) Like in the training set,Loop through the test set, to get the entire text from each file +##10) Similar step, parse the string to get individual words +for file_name in testset: + # print "File: ", file_name + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW + min_category='' + list_words = tokenize(file_name) + + + +##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + # print cat , cat_num_docs[cat]/len(trainset) + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + neg_log_prob = log(cat_num_docs[cat]/length_train) + + # neg_log_prob = cat_num_docs[cat]/length_train + opp_word_dict = cat_word_dict[inv_cat] + opp_count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] + + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + length_norm = 0 + weight_normalizing_ratio = 0 + for kw in my_word_count.keys(): + count_word_train=word_dict.get(kw,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) + + if COMPLEMENT: + count_word_train=opp_word_dict.get(kw,0) + ratio = (count_word_train+1)/(opp_count_cat+vocab_length) + + # weight norm + weight_normalizing_ratio += abs(log(ratio)) + ## TF + # my_word_count[kw] = log(my_word_count[kw]+1) + ## length norm + # length_norm += (my_word_count[kw]**(2)) + + # length_norm = length_norm**(0.5) + # print "WNR: ", weight_normalizing_ratio + + for w in my_word_count.keys(): + count_word_train=word_dict.get(w,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + + if COMPLEMENT: + count_word_train=opp_word_dict.get(w,0) + ratio = (count_word_train+1)/(opp_count_cat+vocab_length) + + word_freq = my_word_count[w] + + if USE_IDF: + word_freq = 
word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF + # word_freq = word_freq/length_norm # length normalization + + + ratio = log(ratio) # weight factor log(theta_c) = weight_c,w + + if WEIGHTED: + ratio = ratio/weight_normalizing_ratio # weight normalization + + if COMPLEMENT: + neg_log_prob -= word_freq*ratio + else: + neg_log_prob += word_freq*ratio # class probability + + # break + # print "NLP: ", neg_log_prob + # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob + # if minimum_neg_log_prob>neg_log_prob: + if minimum_neg_log_prob1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + +# li = Preprocessor.get_testset_trainset(corpus) +li = SplitData() +# exit() +testset = li[0][1] +trainset = li[0][0] +# li = Preprocessor.startup() +cat_num_docs = li[2] + + + +##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values + #b) a dictionary with a category as the key and the number of words in it as the value +# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +complete_training_docs_tokens = [] + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words +for file_name in trainset: + list_words = tokenize(file_name) + complete_training_docs_tokens.append(list_words) + + +##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + +# add number of words to total word count for cat + cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + for w in list_words: + cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + cat_word_dict[cat][w]+=1 + + + +##8) Get the vocabulary length +## number of words, total across categories +vocab_length=0 +num_docs_word_in = {} +for dic in cat_word_dict.values(): + vocab_length+=len(dic) + if USE_IDF: + for uniq_word in dic.keys(): + num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1) + num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr) + + + +####Congratulations! 
the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +length_train = len(trainset) +li_results=[] +#9) Like in the training set,Loop through the test set, to get the entire text from each file +##10) Similar step, parse the string to get individual words +for file_name in testset: + # print "File: ", file_name + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW + min_category='' + list_words = tokenize(file_name) + + + +##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + # print cat , cat_num_docs[cat]/len(trainset) + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + + neg_log_prob=log(cat_num_docs[cat]/length_train) + + # neg_log_prob = cat_num_docs[cat]/length_train + # word_dict = cat_word_dict[inv_cat] + # count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] + + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + length_norm = 0 + weight_normalizing_ratio = 0 + for kw in my_word_count.keys(): + count_word_train=word_dict.get(kw,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) + ## weight norm + # weight_normalizing_ratio+=log(ratio) + ## TF + # my_word_count[kw] = log(my_word_count[kw]+1) + ## length norm + # length_norm += (my_word_count[kw]**(2)) + + # length_norm = length_norm**(0.5) + # print "WNR: ", weight_normalizing_ratio + + for w in my_word_count.keys(): + count_word_train=word_dict.get(w,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + # neg_log_prob-=log(ratio) + word_freq = my_word_count[w] + if USE_IDF: + word_freq = word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF + # word_freq = word_freq/length_norm # length normalization + + # neg_log_prob += word_freq*log(ratio) #switch to + ratio = log(ratio) # weight factor log(theta_c) = weight_c,w + # ratio = ratio/weight_normalizing_ratio # weight normalization + neg_log_prob += word_freq*ratio # class probability + + + # neg_log_prob *= ratio + # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob + # break + # print "NLP: ", neg_log_prob + # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob + # if minimum_neg_log_prob>neg_log_prob: + if minimum_neg_log_prob1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + +# li = Preprocessor.get_testset_trainset(corpus) +li = SplitData() +# exit() +testset = li[0][1] +trainset = li[0][0] +# li = Preprocessor.startup() +cat_num_docs = li[2] + + + +##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values + #b) a dictionary with a category as the key and the number of words in it as the value +# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> 
....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +complete_training_docs_tokens = [] + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words +for file_name in trainset: + list_words = tokenize(file_name) + complete_training_docs_tokens.append(list_words) + + +##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + +# add number of words to total word count for cat + cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + for w in list_words: + cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + cat_word_dict[cat][w]+=1 + + + +##8) Get the vocabulary length +## number of words, total across categories +vocab_length=0 +num_docs_word_in = {} +for dic in cat_word_dict.values(): + vocab_length+=len(dic) + if USE_IDF: + for uniq_word in dic.keys(): + num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1) + num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr) + + + +####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +length_train = len(trainset) +li_results=[] +#9) Like in the training set,Loop through the test set, to get the entire text from each file +##10) Similar step, parse the string to get individual words +for file_name in testset: + # print "File: ", file_name + minimum_neg_log_prob=1000000000 + # minimum_neg_log_prob = 0 # NEW + min_category='' + list_words = tokenize(file_name) + + + +##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + # print cat , cat_num_docs[cat]/len(trainset) + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + + neg_log_prob=log(cat_num_docs[cat]/length_train) + + # neg_log_prob = cat_num_docs[cat]/length_train + word_dict = cat_word_dict[inv_cat] + count_cat = cat_word_count_dict[inv_cat] + + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + length_norm = 0 + weight_normalizing_ratio = 0 + for kw in my_word_count.keys(): + count_word_train=word_dict.get(kw,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) + ## weight norm + weight_normalizing_ratio+=log(ratio) + ## TF + my_word_count[kw] = log(my_word_count[kw]+1) + ## length norm + length_norm += (my_word_count[kw]**(2)) + + length_norm = length_norm**(0.5) + # print "WNR: ", weight_normalizing_ratio + + for w in my_word_count.keys(): + count_word_train=word_dict.get(w,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + # neg_log_prob-=log(ratio) + word_freq = my_word_count[w] + if USE_IDF: + word_freq = 
word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF + word_freq = word_freq/length_norm # length normalization + + # neg_log_prob += word_freq*log(ratio) #switch to + ratio = log(ratio) # weight factor log(theta_c) = weight_c,w + ratio = ratio/weight_normalizing_ratio # weight normalization + neg_log_prob += word_freq*ratio # class probability + + + # neg_log_prob *= ratio + # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob + # break + # print "NLP: ", neg_log_prob + # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob + if minimum_neg_log_prob>neg_log_prob: + # if minimum_neg_log_prob {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +num_docs_word_in = {} +vocab_length=0 + + + +def SplitData(): + global REVIEW_POL + type_dict={} + docs_count={} + train_test = [[],[]] + # offset_sample = random.randint(-400,400) + offset_sample = OFFSET + print "offset_sample", offset_sample + if AMAZON: + # offset_sample = random.randint(-600,600) + for category in ['pos', 'neg']: + type_dict[category]=[] + with open('amazon_revs.csv', 'rb') as csvfile: + rev_read = csv.reader(csvfile) + for row in rev_read: + type_dict[row[1]].append(row[0]) + REVIEW_POL[row[0]] = row[1] + else: + for category in mr.categories(): + type_dict[category]=mr.fileids(categories=category) + # if NO_OFF: + # offset_sample = 0 + for cat in type_dict.keys(): + li = type_dict[cat] + if SHUFFLE: + random.shuffle(li) + size=int(len(li)*SPLIT_AMOUNT) + offset_sample + # if DEFINED_SIZE: + # size = DEFINED_SIZES[cat] + print "Category: ", cat, "Size:", size + offset_sample *= -1 + docs_count[cat]=size + train_test[0].extend(li[:size]) + train_test[1].extend(li[size:]) + return [train_test,type_dict, docs_count] + +def tokenize(file_name): + list_words = () + if AMAZON: + list_words = re.split(r'\W+',file_name) + else: + list_words = re.split(r'\W+',mr.raw(fileids=file_name)) + + return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + return precision + +def RunWholeThing(): + global AMAZON + global OFFSET + global DEFINED_SIZE + global DEFINED_SIZES + OFFSET = 0 + AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set + while OFFSET < 400: + ans = DoTheThing() + print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Movies @ +/-', OFFSET + print "_____________________________________________________" + OFFSET = -1*OFFSET + ans2 = DoTheThing() + OFFSET = -1*OFFSET + print ans , ans2 + OFFSET += 100 + OFFSET = 0 + AMAZON = 1 + + while OFFSET < 600: + ans = DoTheThing() + print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Amazon @ +/-', OFFSET + print "_____________________________________________________" + OFFSET = -1*OFFSET + ans2 = DoTheThing() + OFFSET = -1*OFFSET + print ans , ans2 + OFFSET += 100 + + + +def DoTheThing(): + i = 0 + reps = 5 + + m_nb = 0 + ti_nb = 0 + til_nb = 0 + cw_nb = 0 + tilcw_nb= 0 + + while i < reps: + TrainMachine() + m_nb += TestMachine(0,0,0,0,0)/5 + ti_nb += TestMachine(1,1,0,0,0)/5 + til_nb += TestMachine(1,1,1,0,0)/5 + cw_nb += 
TestMachine(0,0,0,1,1)/5 + tilcw_nb += TestMachine(1,1,1,1,1)/5 + i+=1 + return (m_nb, ti_nb, til_nb, cw_nb, tilcw_nb) + + +# li = Preprocessor.get_testset_trainset(corpus) + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words +def TrainMachine(): + global cat_word_dict + global cat_word_count_dict + global num_docs_word_in + global li + global testset + global trainset + global cat_num_docs + global vocab_length + + li = SplitData() + testset = li[0][1] + trainset = li[0][0] + cat_num_docs = li[2] + + for file_name in trainset: + list_words = tokenize(file_name) + + + ##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + + # add number of words to total word count for cat + cat_word_count_dict[cat]+=len(list_words) + # start count for number of occurences for each word + counted = [] + for w in list_words: + cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + cat_word_dict[cat][w]+=1 + if w not in counted: + counted.append(w) + num_docs_word_in[w] = num_docs_word_in.get(w, 0) + num_docs_word_in[w] += 1 + + for dic in cat_word_dict.values(): + vocab_length+=len(dic) + + +# ##8) Get the vocabulary length +# ## number of words, total across categories +# vocab_length=0 + + + + +# ####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +# length_train = len(trainset) +# print "length of training set ", length_train + + +def TestMachine(t, i, l, c, w): + #9) Like in the training set,Loop through the test set, to get the entire text from each file + ##10) Similar step, parse the string to get individual words + global trainset + global testset + TF = t # 1 - log term frew + IDF = i # 1 - idf + LENGTH = l # 1 - doc length adjust + COMPLEMENT = c # 1 - just comp, 2 - delta / one-v-all + WEIGHTED = w # 1 - adjust weights + length_train = len(trainset) + # print "length train " , length_train, len(testset) + li_results=[] + + for file_name in testset: + # print "File: ", file_name + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW + min_category='' + list_words = tokenize(file_name) + # print file_name + + + + ##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + # print cat , cat_num_docs[cat]/len(trainset) + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + neg_log_prob = log(cat_num_docs[cat]/length_train, 2) + + # neg_log_prob = cat_num_docs[cat]/length_train + opp_word_dict = cat_word_dict[inv_cat] + opp_count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] + ## get frequency counts + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + ## calculate 
necessary norms + length_norm = 0 + weight_normalizing_ratio = 0 + opp_weight_normalizing_ratio = 0 + for kw in my_word_count.keys(): + count_word_train=word_dict.get(kw,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) + + # if COMPLEMENT: + opp_count_word_train=opp_word_dict.get(kw,0) + opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + # weight norm + weight_normalizing_ratio += abs(log(ratio, 2)) + opp_weight_normalizing_ratio += abs(log(opp_ratio, 2)) + + if TF: + my_word_count[kw] = log(1 + my_word_count[kw]) + + if IDF: + my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,1), 2) #IDF + ## length norm + w_freq = my_word_count[kw] + length_norm += pow(w_freq, 2) + + length_norm = pow(length_norm, 0.5) + # print "LN: ", length_norm + + for w in my_word_count.keys(): + count_word_train=word_dict.get(w,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + + # if COMPLEMENT: + opp_count_word_train=opp_word_dict.get(w,0) + opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + word_freq = my_word_count[w] + + if LENGTH: + word_freq = word_freq/length_norm # length normalization + + + ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w + opp_ratio = log(opp_ratio, 2) + + if WEIGHTED: + ratio = ratio/weight_normalizing_ratio # weight normalization + opp_ratio = opp_ratio/opp_weight_normalizing_ratio + + if COMPLEMENT == 1: # just complement + neg_log_prob -= word_freq*opp_ratio + else: + neg_log_prob += word_freq*ratio # class probability + if COMPLEMENT == 2: # one-v-all + neg_log_prob += word_freq*ratio + + # break + # print "NLP: ", neg_log_prob + # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob + # if minimum_neg_log_prob>neg_log_prob: + if minimum_neg_log_prob
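
For reference, the preprocessing that all four files share (SplitData and tokenize) reduces to the short sketch below. This is a condensed illustration, not the patch's code: it assumes Python 3, uses the same NLTK movie_reviews corpus and stopword list, and keeps the patch's trick of adding the offset to one class's training size and subtracting it from the other's to create a controlled imbalance. The names tokenize and split_reviews are illustrative.

import random
import re
from nltk.corpus import movie_reviews as mr, stopwords

STOP_WORDS = set(stopwords.words("english"))

def tokenize(fileid):
    # Same filter as the patch: alphabetic, length > 1, not a stopword, lowercased.
    words = re.split(r"\W+", mr.raw(fileids=fileid))
    return [w.lower() for w in words
            if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]

def split_reviews(split_amount=0.6, offset=0, shuffle=True):
    # The offset is added to one class's training size and subtracted from the
    # other's, reproducing the patch's deliberate class-size skew.
    train, test, sizes = [], [], {}
    for sign, cat in zip((+1, -1), mr.categories()):
        ids = list(mr.fileids(categories=cat))
        if shuffle:
            random.shuffle(ids)
        size = int(len(ids) * split_amount) + sign * offset
        sizes[cat] = size
        train.extend(ids[:size])
        test.extend(ids[size:])
    return train, test, sizes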
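The baseline configuration (all flags off) is standard multinomial Naive Bayes with Laplace smoothing: per-class word counts, a vocabulary size in the smoothing denominator, and a log-space score log P(c) + sum over words of tf_w * log theta_{c,w}. A minimal, self-contained sketch of that scoring rule, with illustrative names (train_mnb, score_mnb) rather than the patch's global dictionaries:

from collections import Counter, defaultdict
from math import log

def train_mnb(docs):
    """docs: iterable of (tokens, label). Returns the count structures used for scoring."""
    word_counts = defaultdict(Counter)   # label -> Counter of word occurrences
    class_docs = Counter()               # label -> number of training documents
    for tokens, label in docs:
        class_docs[label] += 1
        word_counts[label].update(tokens)
    vocab = {w for counter in word_counts.values() for w in counter}
    return word_counts, class_docs, vocab

def score_mnb(tokens, word_counts, class_docs, vocab):
    """Return the label with the highest log posterior under the multinomial model."""
    total_docs = sum(class_docs.values())
    best_label, best_score = None, float("-inf")
    for label, counter in word_counts.items():
        total_words = sum(counter.values())
        score = log(class_docs[label] / total_docs)                # log prior
        for w, freq in Counter(tokens).items():
            theta = (counter[w] + 1) / (total_words + len(vocab))  # Laplace smoothing
            score += freq * log(theta)
        if score > best_score:
            best_label, best_score = label, score
    return best_label

# Toy usage:
train = [(["great", "fun", "fun"], "pos"), (["boring", "awful"], "neg")]
wc, cd, vocab = train_mnb(train)
print(score_mnb(["fun", "great"], wc, cd, vocab))  # -> 'pos'

One difference worth noting: the patch's vocab_length sums the per-class dictionary sizes, while the sketch uses the size of the shared vocabulary; either choice only changes the smoothing denominator.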
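The TestMachine flags in TWCNB_v0_2.py (TF, IDF, LENGTH, COMPLEMENT, WEIGHTED) correspond to the transforms of the TWCNB classifier (Rennie et al., 2003): log term frequency, inverse document frequency, document-length normalization, complement class counts, and weight normalization. The sketch below separates those steps; helper names are illustrative, and it normalizes the weights over the whole vocabulary as in the paper, whereas the patch accumulates the normalizer over just the test document's words.

from collections import Counter
from math import log, sqrt

def transform_doc(tokens, n_train_docs, doc_freq, use_tf=True, use_idf=True, use_length=True):
    """Turn raw token counts into the transformed frequencies d_w used for scoring."""
    freqs = dict(Counter(tokens))
    if use_tf:                                   # d_w = log(1 + tf_w)
        freqs = {w: log(1 + f) for w, f in freqs.items()}
    if use_idf:                                  # d_w *= log(N / df_w)
        freqs = {w: f * log(n_train_docs / doc_freq.get(w, 1)) for w, f in freqs.items()}
    if use_length:                               # d_w /= ||d||_2
        norm = sqrt(sum(f * f for f in freqs.values())) or 1.0
        freqs = {w: f / norm for w, f in freqs.items()}
    return freqs

def complement_weights(word_counts, vocab, label, normalize=True):
    """Smoothed log weights from the *complement* class counts, optionally divided
    by the sum of their magnitudes (the patch's WEIGHTED flag)."""
    comp = Counter()
    for other, counter in word_counts.items():
        if other != label:
            comp.update(counter)
    total = sum(comp.values())
    weights = {w: log((comp[w] + 1) / (total + len(vocab))) for w in vocab}
    if normalize:
        denom = sum(abs(v) for v in weights.values()) or 1.0
        weights = {w: v / denom for w, v in weights.items()}
    return weights

def score_cnb(doc_freqs, weights_by_label, log_prior):
    """CNB picks the label whose complement weights contribute least:
    argmax_c [ log P(c) - sum_w d_w * weight_{~c,w} ]."""
    return max(weights_by_label,
               key=lambda c: log_prior[c] - sum(f * weights_by_label[c].get(w, 0.0)
                                                for w, f in doc_freqs.items()))

# Typical wiring (word_counts, class_docs, vocab as produced by train_mnb above):
# weights = {c: complement_weights(word_counts, vocab, c) for c in word_counts}
# log_prior = {c: log(class_docs[c] / sum(class_docs.values())) for c in class_docs}
# label = score_cnb(transform_doc(tokens, n_train, doc_freq), weights, log_prior)

Estimating each class's weights from everything outside that class is what makes CNB comparatively robust to skewed training sizes, which is presumably why the patch's OFFSET experiments vary the pos/neg split around the 60% mark.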
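A note on CalculateAccuracy: with a counting documents of the reference class that were classified correctly and b counting documents of that class classified as the other class, the printed a/(a+b) is recall for that class, not precision (the commented-out a/(a+c) is the precision). A small sketch that reports accuracy, precision, and recall from the same (document, true label, predicted label) triples that li_results holds:

def evaluate(results, positive="pos"):
    tp = sum(1 for _, t, p in results if t == positive and p == positive)
    fn = sum(1 for _, t, p in results if t == positive and p != positive)
    fp = sum(1 for _, t, p in results if t != positive and p == positive)
    tn = sum(1 for _, t, p in results if t != positive and p != positive)
    accuracy = (tp + tn) / len(results)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return accuracy, precision, recall

print(evaluate([("d1", "pos", "pos"), ("d2", "neg", "pos"), ("d3", "neg", "neg")]))
# -> accuracy 2/3, precision 0.5, recall 1.0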