Commit

clean up

TWCNB.py now has control variables for all variations/implementations of the algorithm, including delta TF-IDF
asp10012 committed Apr 18, 2016
1 parent 971ba40 commit d4e334b
Showing 2 changed files with 80 additions and 42 deletions.
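For reference, the control variables introduced in TWCNB.py mirror the transforms described in Rennie et al. (2003), "Tackling the Poor Assumptions of Naive Bayes Text Classifiers": log term frequency (TF), inverse document frequency (IDF), document length normalization (LENGTH), complement counts, and weight normalization (WEIGHTED). The sketch below is not the repository's code; it is a minimal, hypothetical illustration of how such flags typically combine into a complement-NB score. The names (twcnb_score, counts, totals, doc_freq) are invented here.

# Hypothetical sketch, not TWCNB.py itself: one possible flag-controlled
# complement-NB scorer in the spirit of Rennie et al. (2003).
from __future__ import division
from math import log

def twcnb_score(doc_words, cls, counts, totals, vocab_size, doc_freq, n_docs,
                TF=True, IDF=True, LENGTH=True, WEIGHTED=True):
    # counts[c][w] : training count of word w in class c
    # totals[c]    : total word tokens in class c
    # vocab_size   : |V|, used for Laplace smoothing
    # doc_freq[w]  : number of training documents containing w
    # n_docs       : number of training documents
    # Lower score = better fit (class prior term omitted for brevity).
    freq = {}
    for w in doc_words:                       # raw term frequencies of the test doc
        freq[w] = freq.get(w, 0) + 1
    if TF:                                    # dampen counts: d = log(1 + tf)
        freq = dict((w, log(1 + f)) for w, f in freq.items())
    if IDF:                                   # rare words weigh more
        freq = dict((w, f * log(n_docs / doc_freq.get(w, 1))) for w, f in freq.items())
    if LENGTH:                                # L2 document length normalization
        norm = sum(f * f for f in freq.values()) ** 0.5 or 1.0
        freq = dict((w, f / norm) for w, f in freq.items())

    comp_total = sum(t for c, t in totals.items() if c != cls)   # complement totals
    def comp_count(w):
        return sum(counts[c].get(w, 0) for c in counts if c != cls)

    weights = dict((w, log((comp_count(w) + 1) / (comp_total + vocab_size)))
                   for w in freq)             # smoothed complement log-probabilities
    if WEIGHTED:                              # weight normalization
        denom = sum(abs(v) for v in weights.values()) or 1.0
        weights = dict((w, v / denom) for w, v in weights.items())

    return sum(freq[w] * weights[w] for w in freq)

# toy usage: the class with the smallest complement score wins
counts = {'pos': {'good': 3, 'bad': 1}, 'neg': {'good': 1, 'bad': 4}}
totals = {'pos': 4, 'neg': 5}
doc_freq = {'good': 3, 'bad': 3}
scores = dict((c, twcnb_score(['good', 'good', 'bad'], c, counts, totals,
                              vocab_size=2, doc_freq=doc_freq, n_docs=5))
              for c in counts)
print(min(scores, key=scores.get))            # -> 'pos'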
4 changes: 2 additions & 2 deletions MNB.py
@@ -8,10 +8,10 @@ from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data

-AMAZON = 1
+AMAZON = 0
REVIEW_POL={}
DEFINED_SIZE = 1
-DEFINED_SIZES = {'pos': 948, 'neg': 948}
+DEFINED_SIZES = {'pos': 600, 'neg': 600}

def SplitData():
type_dict={}
118 changes: 78 additions & 40 deletions TWCNB.py
@@ -1,3 +1,8 @@
+######################
+# Full version with all variations included
+# To improve: create a main function allowing for multiple runs
+######################
+
from __future__ import division
from math import log
import re
@@ -8,11 +13,17 @@ import random
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data

-USE_IDF = 1
-AMAZON = 1
+COMPLEMENT = 2 # 1 - just comp, 2 - delta / one-v-all
+WEIGHTED = 0 # 1 - adjust weights
+TF = 0 # 1 - log term freq
+IDF = 0 # 1 - idf
+LENGTH = 0 # 1 - doc length adjust
+AMAZON = 1 # 0 - use movie_reviews, 1 - use Amazon set
+NO_OFF = 1 # 0 - use random data size offset, 1 - nope
+DEFINED_SIZE = 0 # 1 - use DEFINED_SIZES for pos, neg sets
+DEFINED_SIZES = {'pos': 700, 'neg': 1100}

REVIEW_POL={}
-DEFINED_SIZE = 1
-DEFINED_SIZES = {'pos': 948, 'neg': 948}
def SplitData():
type_dict={}
docs_count={}
@@ -31,6 +42,8 @@ def SplitData():
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
+if NO_OFF:
+offset_sample = 0
for cat in type_dict.keys():
li = type_dict[cat]
random.shuffle(li)
@@ -94,9 +107,10 @@ cat_word_dict={}
cat_word_count_dict={}
#val = my_dict.get(key, mydefaultval)
complete_training_docs_tokens = []

+num_docs_word_in = {}
##5) Loop through the training set, to get the entire text from each file
##6) Parse the string to get individual words

for file_name in trainset:
list_words = tokenize(file_name)
complete_training_docs_tokens.append(list_words)
@@ -114,36 +128,37 @@ for file_name in trainset:
cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)

# add number of words to total word count for cat
-cat_word_count_dict[cat]+=len(list_words)
-# start count for number of occurrences for each word
+cat_word_count_dict[cat]+=len(list_words)
+# start count for number of occurrences for each word
+counted = []
for w in list_words:
cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
cat_word_dict[cat][w]+=1
+if w not in counted:
+counted.append(w)
+num_docs_word_in[w] = num_docs_word_in.get(w, 0)
+num_docs_word_in[w] += 1



##8) Get the vocabulary length
## number of words, total across categories
vocab_length=0
-num_docs_word_in = {}

for dic in cat_word_dict.values():
vocab_length+=len(dic)
-if USE_IDF:
-for uniq_word in dic.keys():
-num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1)
-num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr)




####Congratulations! The classifier is trained; now it is time to run the Multinomial Naive Bayes classifier on the test dataset
length_train = len(trainset)
print "length of training set ", length_train
li_results=[]
#9) Like in the training set, loop through the test set, to get the entire text from each file
##10) Similar step, parse the string to get individual words
for file_name in testset:
# print "File: ", file_name
-minimum_neg_log_prob=1000000000
-# minimum_neg_log_prob = 0 # NEW
+# minimum_neg_log_prob=1000000000
+minimum_neg_log_prob = -1000000000 # NEW
min_category=''
list_words = tokenize(file_name)

@@ -161,12 +176,14 @@ for file_name in testset:
if cat == 'pos':
inv_cat = 'neg'


-neg_log_prob=log(cat_num_docs[cat]/length_train)
+neg_log_prob = log(cat_num_docs[cat]/length_train)

# neg_log_prob = cat_num_docs[cat]/length_train
-word_dict = cat_word_dict[inv_cat]
-count_cat = cat_word_count_dict[inv_cat]
+opp_word_dict = cat_word_dict[inv_cat]
+opp_count_cat = cat_word_count_dict[inv_cat]

+word_dict = cat_word_dict[cat]
+count_cat = cat_word_count_dict[cat]

my_word_count = {}
for aw in list_words:
@@ -175,47 +192,68 @@

length_norm = 0
weight_normalizing_ratio = 0
+opp_weight_normalizing_ratio = 0
for kw in my_word_count.keys():
-count_word_train=word_dict.get(kw,0)
+count_word_train=word_dict.get(kw,0)
ratio = (count_word_train+1)/(count_cat+vocab_length)
-## weight norm
-weight_normalizing_ratio+=log(ratio)
-## TF
-my_word_count[kw] = log(my_word_count[kw]+1)

+# if COMPLEMENT:
+opp_count_word_train=opp_word_dict.get(kw,0)
+opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)

+# weight norm
+weight_normalizing_ratio += abs(log(ratio))
+opp_weight_normalizing_ratio += abs(log(opp_ratio))

+if TF:
+my_word_count[kw] = log(1 + my_word_count[kw])

+if IDF:
+my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,1)) #IDF
## length norm
-length_norm += (my_word_count[kw]**(2))
+w_freq = my_word_count[kw]
+length_norm += (w_freq * w_freq)

length_norm = length_norm**(0.5)
# print "WNR: ", weight_normalizing_ratio

for w in my_word_count.keys():
count_word_train=word_dict.get(w,0)
ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c
# neg_log_prob-=log(ratio)

+# if COMPLEMENT:
+opp_count_word_train=opp_word_dict.get(w,0)
+opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)

word_freq = my_word_count[w]
-if USE_IDF:
-word_freq = word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF
-word_freq = word_freq/length_norm # length normalization

# neg_log_prob += word_freq*log(ratio) #switch to
+if LENGTH:
+word_freq = word_freq/length_norm # length normalization


ratio = log(ratio) # weight factor log(theta_c) = weight_c,w
-ratio = ratio/weight_normalizing_ratio # weight normalization
-neg_log_prob += word_freq*ratio # class probability
+opp_ratio = log(opp_ratio)

+if WEIGHTED:
+ratio = ratio/weight_normalizing_ratio # weight normalization
+opp_ratio = opp_ratio/opp_weight_normalizing_ratio

+if COMPLEMENT == 1: # just complement
+neg_log_prob -= word_freq*opp_ratio
+else:
+neg_log_prob += word_freq*ratio # class probability
+if COMPLEMENT == 2: # one-v-all
+neg_log_prob += word_freq*ratio

# neg_log_prob *= ratio
# print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
# break
# print "NLP: ", neg_log_prob
# print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
-if minimum_neg_log_prob>neg_log_prob:
-# if minimum_neg_log_prob<neg_log_prob:
+# if minimum_neg_log_prob>neg_log_prob:
+if minimum_neg_log_prob<neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
# print "Min cat: ", min_category
# correct_cat = 'pos'
# if file_name in all_review_cats['neg']:
# correct_cat = 'neg'

if AMAZON:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
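The COMPLEMENT switch above separates a pure complement score from the "delta" / one-versus-all combination. As a point of reference only (the exact arithmetic in TWCNB.py may differ), the two decision rules as described in Rennie et al. (2003) can be sketched as below; the names freq, w_class and w_comp are illustrative, standing for the transformed term frequencies and the (weight-normalized) log-probabilities of the class itself and of its complement.

# Reference sketch only; not the repository's code.
def complement_score(freq, w_comp):
    # CNB: pick the class whose *complement* explains the document worst,
    # i.e. the class with the largest value of this sum.
    return -sum(f * w_comp[t] for t, f in freq.items())

def one_vs_all_score(freq, w_class, w_comp):
    # delta / one-v-all: per-term contrast between the class's own weights
    # and its complement's weights; pick the class with the largest value.
    return sum(f * (w_class[t] - w_comp[t]) for t, f in freq.items())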
