From d4e334b097421489d91bcb449a72539aef9d5636 Mon Sep 17 00:00:00 2001 From: ap1113 Date: Mon, 18 Apr 2016 02:08:54 -0400 Subject: [PATCH] clean up twcnb.py now has control variables for all variations/implementations of the algo, incl. delta tf-idf --- MNB.py | 4 +- TWCNB.py | 118 ++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 80 insertions(+), 42 deletions(-) diff --git a/MNB.py b/MNB.py index 948b774..c2678ad 100644 --- a/MNB.py +++ b/MNB.py @@ -8,10 +8,10 @@ from nltk.corpus import stopwords STOP_WORDS = set(stopwords.words('english')) SPLIT_AMOUNT = 0.6 # training amount from data -AMAZON = 1 +AMAZON = 0 REVIEW_POL={} DEFINED_SIZE = 1 -DEFINED_SIZES = {'pos': 948, 'neg': 948} +DEFINED_SIZES = {'pos': 600, 'neg': 600} def SplitData(): type_dict={} diff --git a/TWCNB.py b/TWCNB.py index bb2564c..18354f8 100644 --- a/TWCNB.py +++ b/TWCNB.py @@ -1,3 +1,8 @@ +###################### +# Full version with all variations included +# To improve: create a main function allowing for multiple runs +###################### + from __future__ import division from math import log import re @@ -8,11 +13,17 @@ import random STOP_WORDS = set(stopwords.words('english')) SPLIT_AMOUNT = 0.6 # training amount from data -USE_IDF = 1 -AMAZON = 1 +COMPLEMENT = 2 # 1 - just comp, 2 - delta / one-v-all +WEIGHTED = 0 # 1 - adjust weights +TF = 0 # 1 - log term frew +IDF = 0 # 1 - idf +LENGTH = 0 # 1 - doc length adjust +AMAZON = 1 # 0 - use movie_reviews, 1 - use Amazon set +NO_OFF = 1 # 0 - use random data size offset, 1 - nope +DEFINED_SIZE = 0 # 1 - use DEFINED_SIZES for pos, neg sets +DEFINED_SIZES = {'pos': 700, 'neg': 1100} + REVIEW_POL={} -DEFINED_SIZE = 1 -DEFINED_SIZES = {'pos': 948, 'neg': 948} def SplitData(): type_dict={} docs_count={} @@ -31,6 +42,8 @@ def SplitData(): else: for category in mr.categories(): type_dict[category]=mr.fileids(categories=category) + if NO_OFF: + offset_sample = 0 for cat in type_dict.keys(): li = type_dict[cat] random.shuffle(li) @@ -94,9 +107,10 @@ cat_word_dict={} cat_word_count_dict={} #val = my_dict.get(key, mydefaultval) complete_training_docs_tokens = [] - +num_docs_word_in = {} ##5)Loop through the training set, to get the entire text from each file ##6) Parse the string to get individual words + for file_name in trainset: list_words = tokenize(file_name) complete_training_docs_tokens.append(list_words) @@ -114,36 +128,37 @@ for file_name in trainset: cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) # add number of words to total word count for cat - cat_word_count_dict[cat]+=len(list_words) -# start count for number of occurences for each word + cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + counted = [] for w in list_words: cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) cat_word_dict[cat][w]+=1 + if w not in counted: + counted.append(w) + num_docs_word_in[w] = num_docs_word_in.get(w, 0) + num_docs_word_in[w] += 1 ##8) Get the vocabulary length ## number of words, total across categories vocab_length=0 -num_docs_word_in = {} + for dic in cat_word_dict.values(): vocab_length+=len(dic) - if USE_IDF: - for uniq_word in dic.keys(): - num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1) - num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr) - - + ####Congratulations! 
the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset length_train = len(trainset) +print "length of training set ", length_train li_results=[] #9) Like in the training set,Loop through the test set, to get the entire text from each file ##10) Similar step, parse the string to get individual words for file_name in testset: # print "File: ", file_name - minimum_neg_log_prob=1000000000 - # minimum_neg_log_prob = 0 # NEW + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW min_category='' list_words = tokenize(file_name) @@ -161,12 +176,14 @@ for file_name in testset: if cat == 'pos': inv_cat = 'neg' - - neg_log_prob=log(cat_num_docs[cat]/length_train) + neg_log_prob = log(cat_num_docs[cat]/length_train) # neg_log_prob = cat_num_docs[cat]/length_train - word_dict = cat_word_dict[inv_cat] - count_cat = cat_word_count_dict[inv_cat] + opp_word_dict = cat_word_dict[inv_cat] + opp_count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] my_word_count = {} for aw in list_words: @@ -175,15 +192,27 @@ for file_name in testset: length_norm = 0 weight_normalizing_ratio = 0 + opp_weight_normalizing_ratio = 0 for kw in my_word_count.keys(): - count_word_train=word_dict.get(kw,0) + count_word_train=word_dict.get(kw,0) ratio = (count_word_train+1)/(count_cat+vocab_length) - ## weight norm - weight_normalizing_ratio+=log(ratio) - ## TF - my_word_count[kw] = log(my_word_count[kw]+1) + + # if COMPLEMENT: + opp_count_word_train=opp_word_dict.get(kw,0) + opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + # weight norm + weight_normalizing_ratio += abs(log(ratio)) + opp_weight_normalizing_ratio += abs(log(opp_ratio)) + + if TF: + my_word_count[kw] = log(1 + my_word_count[kw]) + + if IDF: + my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(w,1)) #IDF ## length norm - length_norm += (my_word_count[kw]**(2)) + w_freq = my_word_count[kw] + length_norm += (w_freq * w_freq) length_norm = length_norm**(0.5) # print "WNR: ", weight_normalizing_ratio @@ -191,31 +220,40 @@ for file_name in testset: for w in my_word_count.keys(): count_word_train=word_dict.get(w,0) ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c - # neg_log_prob-=log(ratio) + + # if COMPLEMENT: + opp_count_word_train=opp_word_dict.get(w,0) + opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + word_freq = my_word_count[w] - if USE_IDF: - word_freq = word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF - word_freq = word_freq/length_norm # length normalization - # neg_log_prob += word_freq*log(ratio) #switch to + if LENGTH: + word_freq = word_freq/length_norm # length normalization + + ratio = log(ratio) # weight factor log(theta_c) = weight_c,w - ratio = ratio/weight_normalizing_ratio # weight normalization - neg_log_prob += word_freq*ratio # class probability + opp_ratio = log(opp_ratio) + + if WEIGHTED: + ratio = ratio/weight_normalizing_ratio # weight normalization + opp_ratio = opp_ratio/opp_weight_normalizing_ratio + if COMPLEMENT == 1: # just complement + neg_log_prob -= word_freq*opp_ratio + else: + neg_log_prob += word_freq*ratio # class probability + if COMPLEMENT == 2: # one-v-all + neg_log_prob += word_freq*ratio - # neg_log_prob *= ratio - # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob # break # print "NLP: ", neg_log_prob # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob - if 
minimum_neg_log_prob>neg_log_prob:
-	# if minimum_neg_log_prob<neg_log_prob:
+	if minimum_neg_log_prob<neg_log_prob:
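
The flags introduced above (TF, IDF, LENGTH, WEIGHTED, COMPLEMENT) correspond to the text transforms and complement-class weighting described in Rennie et al., "Tackling the Poor Assumptions of Naive Bayes Text Classifiers" (2003). The sketch below shows one way those toggles can combine when scoring a single test document. It is illustrative only: the function name, parameters, and data layout (plain per-class count dicts) are hypothetical, not the code in TWCNB.py, and it covers only the plain-complement (COMPLEMENT = 1) and multinomial-fallback cases, not the delta / one-versus-all variant.

    # Illustrative TWCNB-style scoring sketch, not the repository's code.
    # Assumes: word_counts[c] is a dict of token -> count for class c,
    # total_counts[c] is the total token count for class c, and
    # num_docs_word_in maps a token to the number of training docs containing it.
    from __future__ import division   # the patched files target Python 2
    from math import log

    def twcnb_score(doc_tokens, cls, classes, word_counts, total_counts,
                    num_docs_word_in, num_train_docs, vocab_size,
                    TF=1, IDF=1, LENGTH=1, WEIGHTED=1, COMPLEMENT=1):
        """Higher score = better match; the class prior term is omitted for brevity."""
        # raw term frequencies of the test document
        tf = {}
        for w in doc_tokens:
            tf[w] = tf.get(w, 0) + 1

        # the three text transforms the TF / IDF / LENGTH flags switch on
        if TF:                                    # dampen raw counts
            tf = {w: log(1 + f) for w, f in tf.items()}
        if IDF:                                   # discount words seen in many docs
            tf = {w: f * log(num_train_docs / num_docs_word_in.get(w, 1))
                  for w, f in tf.items()}
        if LENGTH:                                # L2 document-length normalization
            norm = sum(f * f for f in tf.values()) ** 0.5 or 1.0
            tf = {w: f / norm for w, f in tf.items()}

        if not COMPLEMENT:
            # plain multinomial NB: use the class's own smoothed counts
            own = {w: log((word_counts[cls].get(w, 0) + 1)
                          / (total_counts[cls] + vocab_size)) for w in tf}
            return sum(tf[w] * own[w] for w in tf)

        # complement NB: pool the counts of every other class
        others = [c for c in classes if c != cls]
        comp_total = sum(total_counts[c] for c in others)
        weights = {}
        for w in tf:
            comp_count = sum(word_counts[c].get(w, 0) for c in others)
            weights[w] = log((comp_count + 1) / (comp_total + vocab_size))
        if WEIGHTED:
            # weight normalization, analogous to weight_normalizing_ratio above
            z = sum(abs(v) for v in weights.values()) or 1.0
            weights = {w: v / z for w, v in weights.items()}

        # stronger complement evidence means a worse match, hence the minus sign
        return -sum(tf[w] * weights[w] for w in tf)

Under those assumptions, the predicted label for a tokenized document would be max(classes, key=lambda c: twcnb_score(tokens, c, classes, word_counts, total_counts, num_docs_word_in, num_train_docs, vocab_size)), mirroring the min/argmax selection the patch switches to by initializing minimum_neg_log_prob to -1000000000 and flipping the comparison.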