From 14dc205bb850cc02144693e389d411702e11a571 Mon Sep 17 00:00:00 2001 From: ap1113 Date: Tue, 3 May 2016 17:02:38 -0400 Subject: [PATCH] clean up placed all nb implementations in one code, and moved a bunch of stuff around --- TWCNB_v0_2.py => Naive_bayes.py | 161 ++++++++--- BNB.py => OLD_VERSIONS/BNB.py | 0 MNB.py => OLD_VERSIONS/MNB.py | 0 TFIDF.py => OLD_VERSIONS/TFIDF.py | 0 OLD_VERSIONS/TWCNB.py | 387 +++++++++++++++++++++++++++ TWCNB.py => OLD_VERSIONS/TWCNB_v0.py | 0 OLD_VERSIONS/nb_graph.py | 82 ++++++ get_amazon_revs.py | 9 +- graph_nb_base.py | 82 ++++++ graph_nb_skew.py | 82 ++++++ 10 files changed, 767 insertions(+), 36 deletions(-) rename TWCNB_v0_2.py => Naive_bayes.py (69%) rename BNB.py => OLD_VERSIONS/BNB.py (100%) rename MNB.py => OLD_VERSIONS/MNB.py (100%) rename TFIDF.py => OLD_VERSIONS/TFIDF.py (100%) create mode 100644 OLD_VERSIONS/TWCNB.py rename TWCNB.py => OLD_VERSIONS/TWCNB_v0.py (100%) create mode 100644 OLD_VERSIONS/nb_graph.py create mode 100644 graph_nb_base.py create mode 100644 graph_nb_skew.py diff --git a/TWCNB_v0_2.py b/Naive_bayes.py similarity index 69% rename from TWCNB_v0_2.py rename to Naive_bayes.py index d0db97a..375de9f 100644 --- a/TWCNB_v0_2.py +++ b/Naive_bayes.py @@ -20,8 +20,10 @@ SHUFFLE = 1 # TF = 0 # 1 - log term frew # IDF = 0 # 1 - idf # LENGTH = 0 # 1 - doc length adjust -AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set -OFFSET = 0 # 0 - use random data size offset, 1 - nope +AMAZON = 0 # 1 - use Amazon set +TWITTER = 0 # 1 - use Twitter set +TWEET_LIMIT = 5000 # we can't use the whole database, so just randomly grab this number of positive and negative reviews +OFFSET = 0 # introduced offset (skew) in datasets REVIEW_POL={} @@ -42,6 +44,7 @@ cat_word_count_dict={} num_docs_word_in = {} vocab_length=0 +word_cat_num_doc_dict={} def SplitData(): @@ -51,16 +54,30 @@ def SplitData(): train_test = [[],[]] # offset_sample = random.randint(-400,400) offset_sample = OFFSET - print "offset_sample", offset_sample + # print "offset_sample", offset_sample + categories = ['neg', 'pos'] if AMAZON: # offset_sample = random.randint(-600,600) - for category in ['pos', 'neg']: + for category in categories: type_dict[category]=[] with open('amazon_revs.csv', 'rb') as csvfile: rev_read = csv.reader(csvfile) for row in rev_read: type_dict[row[1]].append(row[0]) REVIEW_POL[row[0]] = row[1] + elif TWITTER: + for category in categories: + type_dict[category]=[] + with open('tweets.csv', 'rb') as csvfile: + rev_read = csv.reader(csvfile) + rev_read.next() # skip header row + number = [0,0] + for row in rev_read: + type_dict[ categories[ int(row[1]) ] ].append(row[3].strip()) + REVIEW_POL[row[3].strip()] = categories[int(row[1])] + number[int(row[1])] += 1 + if (number[0]>TWEET_LIMIT and number[1]>TWEET_LIMIT): + break else: for category in mr.categories(): type_dict[category]=mr.fileids(categories=category) @@ -73,7 +90,7 @@ def SplitData(): size=int(len(li)*SPLIT_AMOUNT) + offset_sample # if DEFINED_SIZE: # size = DEFINED_SIZES[cat] - print "Category: ", cat, "Size:", size + # print "Category: ", cat, "Size:", size offset_sample *= -1 docs_count[cat]=size train_test[0].extend(li[:size]) @@ -82,7 +99,7 @@ def SplitData(): def tokenize(file_name): list_words = () - if AMAZON: + if AMAZON or TWITTER: list_words = re.split(r'\W+',file_name) else: list_words = re.split(r'\W+',mr.raw(fileids=file_name)) @@ -110,59 +127,99 @@ def CalculateAccuracy(li_results): precision = a/(a+b) # recall = a/(a+c) # print "The following parameters are recorded for the 
category " , cat - print "precision =", precision + # print "precision =", precision return precision def RunWholeThing(): global AMAZON + global TWITTER global OFFSET global DEFINED_SIZE global DEFINED_SIZES OFFSET = 0 - AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set + AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set + TWITTER = 0 + tested = [' Bern:', ' Mult:', ' TIL :', ' DTIL:', ' CW :', ' TIWC:'] while OFFSET < 400: - ans = DoTheThing() - print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Movies @ +/-', OFFSET - print "_____________________________________________________" - OFFSET = -1*OFFSET - ans2 = DoTheThing() + print "Movies with ", OFFSET + ans = DoTheThing() OFFSET = -1*OFFSET - print ans , ans2 - OFFSET += 100 + if OFFSET != 0: + ans2 = DoTheThing() + ans3 = [ans , ans2] + ans = [sum(e)/len(e) for e in zip(*ans3)] + a_i = 0 + for a in ans: + print tested[a_i], a + a_i += 1 + + OFFSET = -1*OFFSET + + OFFSET += 300 + OFFSET = 0 AMAZON = 1 + while OFFSET < 600: + print "Amazon with ", OFFSET ans = DoTheThing() - print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Amazon @ +/-', OFFSET - print "_____________________________________________________" OFFSET = -1*OFFSET - ans2 = DoTheThing() + if OFFSET != 0: + ans2 = DoTheThing() + ans3 = [ans , ans2] + ans = [sum(e)/len(e) for e in zip(*ans3)] + a_i = 0 + for a in ans: + print tested[a_i], a + a_i += 1 + + OFFSET = -1*OFFSET + + OFFSET += 400 + + OFFSET = 0 + AMAZON = 0 + TWITTER = 1 + + while OFFSET < 1000: + print "Twitter with ", OFFSET + ans = DoTheThing() OFFSET = -1*OFFSET - print ans , ans2 - OFFSET += 100 + if OFFSET != 0: + ans2 = DoTheThing() + ans3 = [ans , ans2] + ans = [sum(e)/len(e) for e in zip(*ans3)] + a_i = 0 + for a in ans: + print tested[a_i], a + a_i += 1 + OFFSET = -1*OFFSET + OFFSET += 800 def DoTheThing(): i = 0 reps = 5 - + b_nb = 0 m_nb = 0 - ti_nb = 0 til_nb = 0 + dtil_nb = 0 cw_nb = 0 tilcw_nb= 0 while i < reps: TrainMachine() - m_nb += TestMachine(0,0,0,0,0)/5 - ti_nb += TestMachine(1,1,0,0,0)/5 - til_nb += TestMachine(1,1,1,0,0)/5 - cw_nb += TestMachine(0,0,0,1,1)/5 - tilcw_nb += TestMachine(1,1,1,1,1)/5 + b_nb += TestMachine_Bern()/reps + m_nb += TestMachine(0,0,0,0,0)/reps + til_nb += TestMachine(1,1,1,0,0)/reps + dtil_nb += TestMachine(1,1,1,2,0)/reps + cw_nb += TestMachine(0,0,0,1,1)/reps + tilcw_nb += TestMachine(1,1,1,1,1)/reps i+=1 - return (m_nb, ti_nb, til_nb, cw_nb, tilcw_nb) + # print " Bern: %0.6f\n Mult: %0.6f\n TIL : %0.6f\n DTIL: %0.6f\n CW : %0.6f\n TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb) + return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb] # li = Preprocessor.get_testset_trainset(corpus) @@ -173,6 +230,7 @@ def TrainMachine(): global cat_word_dict global cat_word_count_dict global num_docs_word_in + global word_cat_num_doc_dict global li global testset global trainset @@ -192,7 +250,7 @@ def TrainMachine(): #and put word count as zero #and then insert words into the category's dictionary in both cases and update the word count cat = '' - if AMAZON: + if AMAZON or TWITTER: cat = REVIEW_POL[file_name] else: cat = mr.categories(fileids = file_name)[0] @@ -211,8 +269,18 @@ def TrainMachine(): num_docs_word_in[w] = num_docs_word_in.get(w, 0) num_docs_word_in[w] += 1 + word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{}) + word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0) + word_cat_num_doc_dict[w][cat]+=1 + for dic in cat_word_dict.values(): - vocab_length+=len(dic) + vocab_length+=len(dic) + for w in 
word_cat_num_doc_dict: + for cat in cat_num_docs: + nct = word_cat_num_doc_dict[w].get(cat,0) + # convert #times a word appears into #times+1/#cat_reviews+2 + ratio = (nct+1)/(cat_num_docs[cat]+2) + word_cat_num_doc_dict[w][cat]=ratio # ##8) Get the vocabulary length @@ -297,7 +365,7 @@ def TestMachine(t, i, l, c, w): my_word_count[kw] = log(1 + my_word_count[kw]) if IDF: - my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,1), 2) #IDF + my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,0.01), 2) #IDF ## length norm w_freq = my_word_count[kw] length_norm += pow(w_freq, 2) @@ -342,7 +410,7 @@ def TestMachine(t, i, l, c, w): minimum_neg_log_prob=neg_log_prob # print "Min cat: ", min_category - if AMAZON: + if AMAZON or TWITTER: li_results.append((file_name,min_category,REVIEW_POL[file_name])) else: li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) @@ -354,5 +422,34 @@ def TestMachine(t, i, l, c, w): precision = CalculateAccuracy(li_results) return precision +def TestMachine_Bern(): + li_results=[] + #5) Like in the training set,Loop through the test set, to get the individual words + for file_name in testset: + minimum_neg_log_prob=1000000000 + min_category='' + set_list_words = set(tokenize(file_name)) + + ##6) Get the probability for each category, + #using the cat_num_docs dictionary to wade through the categories + for cat in cat_num_docs: + neg_log_prob=-log(cat_num_docs[cat]/len(trainset)) + for w in word_cat_num_doc_dict: + if w in set_list_words: + neg_log_prob-=log(word_cat_num_doc_dict[w][cat]) + else: + neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat]) + if minimum_neg_log_prob>neg_log_prob: + min_category=cat + minimum_neg_log_prob=neg_log_prob + + if AMAZON or TWITTER: + li_results.append((file_name,min_category,REVIEW_POL[file_name])) + else: + li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) + # break + precision = CalculateAccuracy(li_results) + return precision + RunWholeThing() \ No newline at end of file diff --git a/BNB.py b/OLD_VERSIONS/BNB.py similarity index 100% rename from BNB.py rename to OLD_VERSIONS/BNB.py diff --git a/MNB.py b/OLD_VERSIONS/MNB.py similarity index 100% rename from MNB.py rename to OLD_VERSIONS/MNB.py diff --git a/TFIDF.py b/OLD_VERSIONS/TFIDF.py similarity index 100% rename from TFIDF.py rename to OLD_VERSIONS/TFIDF.py diff --git a/OLD_VERSIONS/TWCNB.py b/OLD_VERSIONS/TWCNB.py new file mode 100644 index 0000000..2394283 --- /dev/null +++ b/OLD_VERSIONS/TWCNB.py @@ -0,0 +1,387 @@ +###################### +# Full version with all variations included +# To improve: create a main function allowing for multiple runs +###################### + +from __future__ import division +from math import log +from math import pow +import re +import csv +from nltk.corpus import movie_reviews as mr +from nltk.corpus import stopwords +import random +STOP_WORDS = set(stopwords.words('english')) +SPLIT_AMOUNT = 0.6 # training amount from data + +COMPLEMENT = 0 # 1 - just comp, 2 - delta / one-v-all +WEIGHTED = 0 # 1 - adjust weights +TF = 0 # 1 - log term frew +IDF = 0 # 1 - idf +LENGTH = 0 # 1 - doc length adjust +AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set +NO_OFF = 1 # 0 - use random data size offset, 1 - nope +DEFINED_SIZE = 0 # 1 - use DEFINED_SIZES for pos, neg sets +DEFINED_SIZES = {'pos': 600, 'neg': 600} + +REVIEW_POL={} +def SplitData(): + type_dict={} + docs_count={} + train_test = [[],[]] + offset_sample = 
random.randint(-400,400) + print "offset_sample", offset_sample + if AMAZON: + offset_sample = random.randint(-600,600) + for category in ['pos', 'neg']: + type_dict[category]=[] + with open('amazon_revs.csv', 'rb') as csvfile: + rev_read = csv.reader(csvfile) + for row in rev_read: + type_dict[row[1]].append(row[0]) + REVIEW_POL[row[0]] = row[1] + else: + for category in mr.categories(): + type_dict[category]=mr.fileids(categories=category) + if NO_OFF: + offset_sample = 0 + for cat in type_dict.keys(): + li = type_dict[cat] + # random.shuffle(li) + size=int(len(li)*SPLIT_AMOUNT) + offset_sample + if DEFINED_SIZE: + size = DEFINED_SIZES[cat] + print "Category: ", cat, "Size:", size + offset_sample *= -1 + docs_count[cat]=size + train_test[0].extend(li[:size]) + train_test[1].extend(li[size:]) + return [train_test,type_dict, docs_count] + +def tokenize(file_name): + list_words = () + if AMAZON: + list_words = re.split(r'\W+',file_name) + else: + list_words = re.split(r'\W+',mr.raw(fileids=file_name)) + + return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + +# li = Preprocessor.get_testset_trainset(corpus) +li = SplitData() +# exit() +testset = li[0][1] +trainset = li[0][0] +# li = Preprocessor.startup() +cat_num_docs = li[2] + +length_train = len(trainset) +print "length of training set ", length_train + +##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values + #b) a dictionary with a category as the key and the number of words in it as the value +# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +complete_training_docs_tokens = [] +num_docs_word_in = {} +counts_for_w = {} + + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words + +for file_name in trainset: + list_words = tokenize(file_name) + complete_training_docs_tokens.append(list_words) + # counts_for_w[file_name] = counts_for_w.get(file_name, {}) + counts_for_w[file_name] = {} + +##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + + # cat_word_dict[cat] = cat_word_dict.get(cat,{}) + # cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + +# add number of words to total word count for cat + # cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + counted = [] + for w in list_words: + # cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + # cat_word_dict[cat][w]+=1 + counts_for_w[file_name][w] = counts_for_w[file_name].get(w, 0) + counts_for_w[file_name][w] += 1 + if w not in counted: + counted.append(w) + num_docs_word_in[w] = num_docs_word_in.get(w, 0) + num_docs_word_in[w] += 1 + # break + + + +for fn in trainset: + length_norm_val = 0 + + cat = '' + if AMAZON: + cat = 
REVIEW_POL[fn] + else: + cat = mr.categories(fileids = fn)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + # print fn + "\n_______________________________\n" + # print tokenize(fn) + # print "" + "\n_______________________________\n" + # print counts_for_w[fn]['book'], num_docs_word_in['book'] + for c_w in counts_for_w[fn].keys(): + + # print c_w + + if TF: + counts_for_w[fn][c_w] = log(counts_for_w[fn][c_w] + 1, 2) + # if c_w == 'book' : + # print 'TF: ', counts_for_w[fn]['book'] + if IDF: + counts_for_w[fn][c_w] = counts_for_w[fn][c_w]*log(length_train/num_docs_word_in[c_w], 2) + # if c_w == 'book' : + # print 'IDF: ', counts_for_w[fn]['book'] + length_norm_val += (counts_for_w[fn][c_w]*counts_for_w[fn][c_w]) + length_norm_val = pow(length_norm_val,0.5) + # print counts_for_w[fn]['book'], num_docs_word_in['book'] + # print length_norm_val + + for c_w in counts_for_w[fn].keys(): + if LENGTH: + counts_for_w[fn][c_w] /= length_norm_val + + cat_word_count_dict[cat] += counts_for_w[fn][c_w] + cat_word_dict[cat][c_w] = cat_word_dict[cat].get(c_w, 0) + cat_word_dict[cat][c_w] += counts_for_w[fn][c_w] + +# print cat_word_dict['neg']['book'] +# print cat_word_dict['pos']['book'] + +# exit() +# print "Using LNV: ", length_norm_val +# length_norm_val = length_norm_val**(0.5) +# print "Using sqLNV: ", length_norm_val +# for fn in trainset: + # cat = '' + # if AMAZON: + # cat = REVIEW_POL[fn] + # else: + # cat = mr.categories(fileids = fn)[0] + # cat_word_dict[cat] = cat_word_dict.get(cat,{}) + + # for c_w in counts_for_w[fn].keys(): + # if LENGTH: + # counts_for_w[fn][c_w] /= length_norm_val + + # cat_word_dict[cat][c_w] = cat_word_dict[cat].get(c_w, 0) + # cat_word_dict[cat][c_w] += counts_for_w[fn][c_w] +##8) Get the vocabulary length +## number of words, total across categories +vocab_length=0 + +# for dic in num_docs_word_in.keys(): +vocab_length=len(num_docs_word_in.keys()) +print cat_word_dict['pos']['book'], cat_word_dict['neg']['book'] +print "Vocab", vocab_length +for cat in cat_word_dict.keys(): + count_cat = cat_word_count_dict[cat] + weight_norm_cat = 0 + for w in cat_word_dict[cat].keys(): + cat_word_dict[cat][w] = (cat_word_dict[cat][w]+1)/(count_cat+vocab_length) + cat_word_dict[cat][w] = log ( cat_word_dict[cat][w] , 2) + weight_norm_cat += abs(cat_word_dict[cat][w]) + if WEIGHTED: + for w in cat_word_dict[cat].keys(): + cat_word_dict[cat][w] = cat_word_dict[cat][w]/weight_norm_cat + +print cat_word_dict['pos']['book'], cat_word_dict['neg']['book'] +exit() + + + +####Congratulations! 
the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +print 'pos' , cat_num_docs['pos']/len(trainset) +print 'neg' , cat_num_docs['neg']/len(trainset) +li_results=[] +li_results2=[] +#9) Like in the training set,Loop through the test set, to get the entire text from each file +##10) Similar step, parse the string to get individual words +for file_name in testset: + # print "File: ", file_name + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW + minimum_pos_log_prob = 100000000 + min_category='' + max_category='' + list_words = tokenize(file_name) + + + +##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + + + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + neg_log_prob = log(cat_num_docs[cat]/length_train, 2) + pos_log_prob = 0 + + # neg_log_prob = cat_num_docs[cat]/length_train + opp_word_dict = cat_word_dict[inv_cat] + opp_count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] + + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + if COMPLEMENT: + neg_log_prob -= opp_word_dict.get(aw, 0) + else : + neg_log_prob += word_dict.get(aw, 0) + + pos_log_prob += opp_word_dict.get(aw, 0) + # my_orig_word_count[aw] = my_orig_word_count.get(aw, 0) + # my_orig_word_count[aw]+=1 + + # # length_norm = 0 + # weight_normalizing_ratio = 0 + # opp_weight_normalizing_ratio = 0 + # for kw in my_word_count.keys(): + # count_word_train=word_dict.get(kw,0) + # ratio = (count_word_train+1)/(count_cat+vocab_length) + + # # if COMPLEMENT: + # opp_count_word_train=opp_word_dict.get(kw,0) + # opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + # # weight norm + # # weight_normalizing_ratio += abs(log(ratio, 2)) + # # opp_weight_normalizing_ratio += abs(log(opp_ratio, 2)) + # weight_normalizing_ratio += log(ratio, 2) + # opp_weight_normalizing_ratio += log(opp_ratio, 2) + + # # if TF: + # # my_word_count[kw] = log(1 + my_word_count[kw]) + + # # if IDF: + # # my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(w,1)) #IDF + # # ## length norm + # # w_freq = my_word_count[kw] + # # length_norm += (w_freq * w_freq) + + # length_norm = length_norm**(0.5) + # print "WNR: ", weight_normalizing_ratio + + # for w in my_word_count.keys(): + # count_word_train=word_dict.get(w,0) + # ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + + # # if COMPLEMENT: + # opp_count_word_train=opp_word_dict.get(w,0) + # opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + # word_freq = my_word_count[w] + + # # if LENGTH: + # # word_freq = word_freq/length_norm # length normalization + + + # ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w + # opp_ratio = log(opp_ratio, 2) + + # if WEIGHTED: + # ratio = ratio/weight_normalizing_ratio # weight normalization + # opp_ratio = opp_ratio/opp_weight_normalizing_ratio + + # if COMPLEMENT == 1: # just complement + # neg_log_prob -= word_freq*opp_ratio + # else: + # neg_log_prob += word_freq*ratio # class 
probability + # pos_log_prob += word_freq*ratio + # if COMPLEMENT == 2: # one-v-all + # neg_log_prob += word_freq*ratio + + + # break + # print "NLP: ", neg_log_prob + # print file_name + # print "\n\n", cat, minimum_pos_log_prob , '<' , neg_log_prob + # if minimum_pos_log_prob>pos_log_prob: + if minimum_neg_log_probpos_log_prob: + max_category=cat + minimum_pos_log_prob=pos_log_prob + # print "Min cat: ", min_category + + if AMAZON: + li_results.append((file_name,min_category,REVIEW_POL[file_name])) + else: + li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) + # break + if AMAZON: + li_results2.append((file_name,max_category,REVIEW_POL[file_name])) + else: + li_results2.append((file_name,max_category,mr.categories(fileids = file_name)[0])) + +###--------------------DEBUG STATEMENTS---------------------- +#for t in li_results: + # if t[1]!=t[2]: + # print t +###--------------------DEBUG STATEMENTS---------------------- + +###--------------------DEBUG STATEMENTS---------------------- + +#12) Evaluating the classifier + +CalculateAccuracy(li_results) +CalculateAccuracy(li_results2) diff --git a/TWCNB.py b/OLD_VERSIONS/TWCNB_v0.py similarity index 100% rename from TWCNB.py rename to OLD_VERSIONS/TWCNB_v0.py diff --git a/OLD_VERSIONS/nb_graph.py b/OLD_VERSIONS/nb_graph.py new file mode 100644 index 0000000..7db9e9f --- /dev/null +++ b/OLD_VERSIONS/nb_graph.py @@ -0,0 +1,82 @@ +import numpy +from matplotlib import pyplot + +labels = [ + "Bernoulli", + "Multinomial", + "W C + M", + "TF IDF L W C + M", + "TF IDF L + M", + "delta TF IDF L + M", +] + +tops = numpy.arange(len(labels)) + +# 0 , movies +widths = [ 0.789251270916, + 0.90355565901, + 0.687274830247, + 0.858607720931, + 0.91342342141, + 0.91342342141 ] +# skew , movies +widths2 = [ 0.139696306725, + 0.822764704115, + 0.125, + 0.125, + 0.920448727392, + 0.961479550697 ] + + +# 0 , amazon +widths3 = [ 0.684270469696, + 0.792608446831, + 0.529824561404, + 0.529824561404, + 0.798532433585, + 0.829834039103 ] +# skew , amazon +widths4 = [ 0.298353719071, + 0.73741502689, + 0.219298245614, + 0.219298245614, + 0.751207980863, + 0.771721619137 ] + +# 0 , twitter +widths5 = [ 0.717525563057, + 0.840751614224, + 0.558914441009, + 0.558667842964, + 0.849271128435, + 0.850938806271 ] + +# skew , twitter +widths6 = [ 0.584685826487, + 0.853741247102, + 0.323568358281, + 0.323555359506, + 0.862090804434, + 0.889175806229 ] + + +height = 0.1333333333 +pyplot.barh(tops+height*5, widths, height, color="#50312F") +pyplot.barh(tops+height*4, widths2, height, color="#F17CB0") +pyplot.barh(tops+height*3, widths3, height, color="#375E97") +pyplot.barh(tops+height*2, widths4, height, color="#60BD68") +pyplot.barh(tops+height*1, widths5, height, color="#CB0000") +pyplot.barh(tops+height*0, widths6, height, color="#FAA43A") + +pyplot.legend(["Movies", "Movies - skew", "Amazon", "Amazon - skew", "Twitter", "Twitter-skew"], loc=4) # bottom right +pyplot.yticks(tops+2*height, labels) +pyplot.xlim(0, 1.18) +pyplot.ylim(tops[0]-height, tops[-1]+7*height) +pyplot.show() + +""" +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: 
True, use_adj_only: True, use_position: True 0.762512512513 +""" \ No newline at end of file diff --git a/get_amazon_revs.py b/get_amazon_revs.py index b5c50e3..b89eb31 100644 --- a/get_amazon_revs.py +++ b/get_amazon_revs.py @@ -1,8 +1,9 @@ import csv +import re path = "C:\\Users\\Astha\\Desktop\\amazon_revs" types = ['books' , 'dvd' , 'electronics' , 'kitchen_housewares'] cats = ['positive', 'negative'] -with open('amazon_revs.csv', 'wb') as f: +with open('amazon_revs2.csv', 'wb') as f: cw = csv.writer(f) for t in types: for c in cats: @@ -12,10 +13,10 @@ with open('amazon_revs.csv', 'wb') as f: curr_rev = '' add = 0 for line in txt_p: - line = line.rstrip() - if line == "": + # line = line.rstrip() + if re.match( r'', line ): add = 1 - elif line == "": + elif re.match( r'', line ): add = 0 cw.writerow([curr_rev, short_c]) curr_rev = '' diff --git a/graph_nb_base.py b/graph_nb_base.py new file mode 100644 index 0000000..08aa59e --- /dev/null +++ b/graph_nb_base.py @@ -0,0 +1,82 @@ +import numpy +from matplotlib import pyplot + +labels = [ + "BNB", + "MNB", + "W C", + "TWCNB", + "TF IDF L", + "TF IDF L OvA", +] + +tops = numpy.arange(len(labels)) + +# 0 , movies +widths = [ 0.789251270916, + 0.90355565901, + 0.687274830247, + 0.858607720931, + 0.91342342141, + 0.91342342141 ] +# skew , movies +# widths2 = [ 0.139696306725, + # 0.822764704115, + # 0.125, + # 0.125, + # 0.920448727392, + # 0.961479550697 ] + + +# 0 , amazon +widths3 = [ 0.684270469696, + 0.792608446831, + 0.529824561404, + 0.529824561404, + 0.798532433585, + 0.829834039103 ] +# skew , amazon +# widths4 = [ 0.298353719071, + # 0.73741502689, + # 0.219298245614, + # 0.219298245614, + # 0.751207980863, + # 0.771721619137 ] + +# 0 , twitter +widths5 = [ 0.717525563057, + 0.840751614224, + 0.558914441009, + 0.558667842964, + 0.849271128435, + 0.850938806271 ] + +# skew , twitter +# widths6 = [ 0.584685826487, + # 0.853741247102, + # 0.323568358281, + # 0.323555359506, + # 0.862090804434, + # 0.889175806229 ] + + +height = 0.2 +pyplot.barh(tops+height*2, widths, height, color="#3F681C") +# pyplot.barh(tops+height*4, widths2, height, color="#F17CB0") +pyplot.barh(tops+height*1, widths3, height, color="#5BC8AC") +# pyplot.barh(tops+height*2, widths4, height, color="#60BD68") +pyplot.barh(tops+height*0, widths5, height, color="#CB0000") +# pyplot.barh(tops+height*0, widths6, height, color="#FAA43A") + +pyplot.legend(["Movies", "Amazon", "Twitter"], loc=4) # bottom right +pyplot.yticks(tops+height, labels) +pyplot.xlim(0, 1.18) +pyplot.ylim(tops[0]-height, tops[-1]+4*height) +pyplot.show() + +""" +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513 +""" \ No newline at end of file diff --git a/graph_nb_skew.py b/graph_nb_skew.py new file mode 100644 index 0000000..e578fc0 --- /dev/null +++ b/graph_nb_skew.py @@ -0,0 +1,82 @@ +import numpy +from matplotlib import pyplot + +labels = [ + "BNB", + "MNB", + "W C", + "TWCNB", + "TF IDF L", + "TF IDF L OvA", +] + +tops = numpy.arange(len(labels)) + +# 0 , movies +# widths = [ 0.789251270916, + # 0.90355565901, + # 
0.687274830247, + # 0.858607720931, + # 0.91342342141, + # 0.91342342141 ] +# skew , movies +widths2 = [ 0.139696306725, + 0.822764704115, + 0.125, + 0.125, + 0.920448727392, + 0.961479550697 ] + + +# 0 , amazon +# widths3 = [ 0.684270469696, + # 0.792608446831, + # 0.529824561404, + # 0.529824561404, + # 0.798532433585, + # 0.829834039103 ] +# skew , amazon +widths4 = [ 0.298353719071, + 0.73741502689, + 0.219298245614, + 0.219298245614, + 0.751207980863, + 0.771721619137 ] + +# 0 , twitter +# widths5 = [ 0.717525563057, + # 0.840751614224, + # 0.558914441009, + # 0.558667842964, + # 0.849271128435, + # 0.850938806271 ] + +# skew , twitter +widths6 = [ 0.584685826487, + 0.853741247102, + 0.323568358281, + 0.323555359506, + 0.862090804434, + 0.889175806229 ] + + +height = 0.2 +pyplot.barh(tops+height*2, widths2, height, color="#3F681C") +# pyplot.barh(tops+height*4, widths2, height, color="#F17CB0") +pyplot.barh(tops+height*1, widths4, height, color="#5BC8AC") +# pyplot.barh(tops+height*2, widths4, height, color="#60BD68") +pyplot.barh(tops+height*0, widths6, height, color="#CB0000") +# pyplot.barh(tops+height*0, widths6, height, color="#FAA43A") + +pyplot.legend(["Movies-skew", "Amazon-skew", "Twitter-skew"], loc=4) # bottom right +pyplot.yticks(tops+height, labels) +pyplot.xlim(0, 1.18) +pyplot.ylim(tops[0]-height, tops[-1]+4*height) +pyplot.show() + +""" +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513 +""" \ No newline at end of file
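
A minimal standalone sketch (not part of the patch) of the Bernoulli decision rule that TestMachine_Bern in Naive_bayes.py applies, assuming the per-word, per-class document frequencies have already been Laplace-smoothed to (docs containing w in cat + 1) / (docs in cat + 2), as TrainMachine does when it fills word_cat_num_doc_dict. The names classify_bern, word_cat_prob, and cat_doc_counts are hypothetical stand-ins for the module-level globals; the sketch maximizes the log-probability, which is equivalent to the patch's minimizing of the negative log-probability.

from math import log

def classify_bern(doc_words, word_cat_prob, cat_doc_counts, n_train_docs):
    # Bernoulli NB scores word presence/absence, not counts.
    present = set(doc_words)
    best_cat, best_score = None, float('-inf')
    for cat, n_docs in cat_doc_counts.items():
        score = log(float(n_docs) / n_train_docs)   # log prior P(cat)
        for w, per_cat in word_cat_prob.items():
            p = per_cat[cat]                        # smoothed P(w present | cat), strictly inside (0, 1)
            score += log(p) if w in present else log(1.0 - p)
        if score > best_score:
            best_cat, best_score = cat, score
    return best_cat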