From 14dc205bb850cc02144693e389d411702e11a571 Mon Sep 17 00:00:00 2001 From: ap1113 Date: Tue, 3 May 2016 17:02:38 -0400 Subject: [PATCH] clean up placed all nb implementations in one code, and moved a bunch of stuff around --- TWCNB_v0_2.py => Naive_bayes.py | 161 ++++++++--- BNB.py => OLD_VERSIONS/BNB.py | 0 MNB.py => OLD_VERSIONS/MNB.py | 0 TFIDF.py => OLD_VERSIONS/TFIDF.py | 0 OLD_VERSIONS/TWCNB.py | 387 +++++++++++++++++++++++++++ TWCNB.py => OLD_VERSIONS/TWCNB_v0.py | 0 OLD_VERSIONS/nb_graph.py | 82 ++++++ get_amazon_revs.py | 9 +- graph_nb_base.py | 82 ++++++ graph_nb_skew.py | 82 ++++++ 10 files changed, 767 insertions(+), 36 deletions(-) rename TWCNB_v0_2.py => Naive_bayes.py (69%) rename BNB.py => OLD_VERSIONS/BNB.py (100%) rename MNB.py => OLD_VERSIONS/MNB.py (100%) rename TFIDF.py => OLD_VERSIONS/TFIDF.py (100%) create mode 100644 OLD_VERSIONS/TWCNB.py rename TWCNB.py => OLD_VERSIONS/TWCNB_v0.py (100%) create mode 100644 OLD_VERSIONS/nb_graph.py create mode 100644 graph_nb_base.py create mode 100644 graph_nb_skew.py diff --git a/TWCNB_v0_2.py b/Naive_bayes.py similarity index 69% rename from TWCNB_v0_2.py rename to Naive_bayes.py index d0db97a..375de9f 100644 --- a/TWCNB_v0_2.py +++ b/Naive_bayes.py @@ -20,8 +20,10 @@ SHUFFLE = 1 # TF = 0 # 1 - log term frew # IDF = 0 # 1 - idf # LENGTH = 0 # 1 - doc length adjust -AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set -OFFSET = 0 # 0 - use random data size offset, 1 - nope +AMAZON = 0 # 1 - use Amazon set +TWITTER = 0 # 1 - use Twitter set +TWEET_LIMIT = 5000 # we can't use the whole database, so just randomly grab this number of positive and negative reviews +OFFSET = 0 # introduced offset (skew) in datasets REVIEW_POL={} @@ -42,6 +44,7 @@ cat_word_count_dict={} num_docs_word_in = {} vocab_length=0 +word_cat_num_doc_dict={} def SplitData(): @@ -51,16 +54,30 @@ def SplitData(): train_test = [[],[]] # offset_sample = random.randint(-400,400) offset_sample = OFFSET - print "offset_sample", offset_sample + # print "offset_sample", offset_sample + categories = ['neg', 'pos'] if AMAZON: # offset_sample = random.randint(-600,600) - for category in ['pos', 'neg']: + for category in categories: type_dict[category]=[] with open('amazon_revs.csv', 'rb') as csvfile: rev_read = csv.reader(csvfile) for row in rev_read: type_dict[row[1]].append(row[0]) REVIEW_POL[row[0]] = row[1] + elif TWITTER: + for category in categories: + type_dict[category]=[] + with open('tweets.csv', 'rb') as csvfile: + rev_read = csv.reader(csvfile) + rev_read.next() # skip header row + number = [0,0] + for row in rev_read: + type_dict[ categories[ int(row[1]) ] ].append(row[3].strip()) + REVIEW_POL[row[3].strip()] = categories[int(row[1])] + number[int(row[1])] += 1 + if (number[0]>TWEET_LIMIT and number[1]>TWEET_LIMIT): + break else: for category in mr.categories(): type_dict[category]=mr.fileids(categories=category) @@ -73,7 +90,7 @@ def SplitData(): size=int(len(li)*SPLIT_AMOUNT) + offset_sample # if DEFINED_SIZE: # size = DEFINED_SIZES[cat] - print "Category: ", cat, "Size:", size + # print "Category: ", cat, "Size:", size offset_sample *= -1 docs_count[cat]=size train_test[0].extend(li[:size]) @@ -82,7 +99,7 @@ def SplitData(): def tokenize(file_name): list_words = () - if AMAZON: + if AMAZON or TWITTER: list_words = re.split(r'\W+',file_name) else: list_words = re.split(r'\W+',mr.raw(fileids=file_name)) @@ -110,59 +127,99 @@ def CalculateAccuracy(li_results): precision = a/(a+b) # recall = a/(a+c) # print "The following parameters are recorded for the 
category " , cat - print "precision =", precision + # print "precision =", precision return precision def RunWholeThing(): global AMAZON + global TWITTER global OFFSET global DEFINED_SIZE global DEFINED_SIZES OFFSET = 0 - AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set + AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set + TWITTER = 0 + tested = [' Bern:', ' Mult:', ' TIL :', ' DTIL:', ' CW :', ' TIWC:'] while OFFSET < 400: - ans = DoTheThing() - print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Movies @ +/-', OFFSET - print "_____________________________________________________" - OFFSET = -1*OFFSET - ans2 = DoTheThing() + print "Movies with ", OFFSET + ans = DoTheThing() OFFSET = -1*OFFSET - print ans , ans2 - OFFSET += 100 + if OFFSET != 0: + ans2 = DoTheThing() + ans3 = [ans , ans2] + ans = [sum(e)/len(e) for e in zip(*ans3)] + a_i = 0 + for a in ans: + print tested[a_i], a + a_i += 1 + + OFFSET = -1*OFFSET + + OFFSET += 300 + OFFSET = 0 AMAZON = 1 + while OFFSET < 600: + print "Amazon with ", OFFSET ans = DoTheThing() - print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Amazon @ +/-', OFFSET - print "_____________________________________________________" OFFSET = -1*OFFSET - ans2 = DoTheThing() + if OFFSET != 0: + ans2 = DoTheThing() + ans3 = [ans , ans2] + ans = [sum(e)/len(e) for e in zip(*ans3)] + a_i = 0 + for a in ans: + print tested[a_i], a + a_i += 1 + + OFFSET = -1*OFFSET + + OFFSET += 400 + + OFFSET = 0 + AMAZON = 0 + TWITTER = 1 + + while OFFSET < 1000: + print "Twitter with ", OFFSET + ans = DoTheThing() OFFSET = -1*OFFSET - print ans , ans2 - OFFSET += 100 + if OFFSET != 0: + ans2 = DoTheThing() + ans3 = [ans , ans2] + ans = [sum(e)/len(e) for e in zip(*ans3)] + a_i = 0 + for a in ans: + print tested[a_i], a + a_i += 1 + OFFSET = -1*OFFSET + OFFSET += 800 def DoTheThing(): i = 0 reps = 5 - + b_nb = 0 m_nb = 0 - ti_nb = 0 til_nb = 0 + dtil_nb = 0 cw_nb = 0 tilcw_nb= 0 while i < reps: TrainMachine() - m_nb += TestMachine(0,0,0,0,0)/5 - ti_nb += TestMachine(1,1,0,0,0)/5 - til_nb += TestMachine(1,1,1,0,0)/5 - cw_nb += TestMachine(0,0,0,1,1)/5 - tilcw_nb += TestMachine(1,1,1,1,1)/5 + b_nb += TestMachine_Bern()/reps + m_nb += TestMachine(0,0,0,0,0)/reps + til_nb += TestMachine(1,1,1,0,0)/reps + dtil_nb += TestMachine(1,1,1,2,0)/reps + cw_nb += TestMachine(0,0,0,1,1)/reps + tilcw_nb += TestMachine(1,1,1,1,1)/reps i+=1 - return (m_nb, ti_nb, til_nb, cw_nb, tilcw_nb) + # print " Bern: %0.6f\n Mult: %0.6f\n TIL : %0.6f\n DTIL: %0.6f\n CW : %0.6f\n TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb) + return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb] # li = Preprocessor.get_testset_trainset(corpus) @@ -173,6 +230,7 @@ def TrainMachine(): global cat_word_dict global cat_word_count_dict global num_docs_word_in + global word_cat_num_doc_dict global li global testset global trainset @@ -192,7 +250,7 @@ def TrainMachine(): #and put word count as zero #and then insert words into the category's dictionary in both cases and update the word count cat = '' - if AMAZON: + if AMAZON or TWITTER: cat = REVIEW_POL[file_name] else: cat = mr.categories(fileids = file_name)[0] @@ -211,8 +269,18 @@ def TrainMachine(): num_docs_word_in[w] = num_docs_word_in.get(w, 0) num_docs_word_in[w] += 1 + word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{}) + word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0) + word_cat_num_doc_dict[w][cat]+=1 + for dic in cat_word_dict.values(): - vocab_length+=len(dic) + vocab_length+=len(dic) + for w in 
word_cat_num_doc_dict: + for cat in cat_num_docs: + nct = word_cat_num_doc_dict[w].get(cat,0) + # convert #times a word appears into #times+1/#cat_reviews+2 + ratio = (nct+1)/(cat_num_docs[cat]+2) + word_cat_num_doc_dict[w][cat]=ratio # ##8) Get the vocabulary length @@ -297,7 +365,7 @@ def TestMachine(t, i, l, c, w): my_word_count[kw] = log(1 + my_word_count[kw]) if IDF: - my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,1), 2) #IDF + my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,0.01), 2) #IDF ## length norm w_freq = my_word_count[kw] length_norm += pow(w_freq, 2) @@ -342,7 +410,7 @@ def TestMachine(t, i, l, c, w): minimum_neg_log_prob=neg_log_prob # print "Min cat: ", min_category - if AMAZON: + if AMAZON or TWITTER: li_results.append((file_name,min_category,REVIEW_POL[file_name])) else: li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) @@ -354,5 +422,34 @@ def TestMachine(t, i, l, c, w): precision = CalculateAccuracy(li_results) return precision +def TestMachine_Bern(): + li_results=[] + #5) Like in the training set,Loop through the test set, to get the individual words + for file_name in testset: + minimum_neg_log_prob=1000000000 + min_category='' + set_list_words = set(tokenize(file_name)) + + ##6) Get the probability for each category, + #using the cat_num_docs dictionary to wade through the categories + for cat in cat_num_docs: + neg_log_prob=-log(cat_num_docs[cat]/len(trainset)) + for w in word_cat_num_doc_dict: + if w in set_list_words: + neg_log_prob-=log(word_cat_num_doc_dict[w][cat]) + else: + neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat]) + if minimum_neg_log_prob>neg_log_prob: + min_category=cat + minimum_neg_log_prob=neg_log_prob + + if AMAZON or TWITTER: + li_results.append((file_name,min_category,REVIEW_POL[file_name])) + else: + li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) + # break + precision = CalculateAccuracy(li_results) + return precision + RunWholeThing() \ No newline at end of file diff --git a/BNB.py b/OLD_VERSIONS/BNB.py similarity index 100% rename from BNB.py rename to OLD_VERSIONS/BNB.py diff --git a/MNB.py b/OLD_VERSIONS/MNB.py similarity index 100% rename from MNB.py rename to OLD_VERSIONS/MNB.py diff --git a/TFIDF.py b/OLD_VERSIONS/TFIDF.py similarity index 100% rename from TFIDF.py rename to OLD_VERSIONS/TFIDF.py diff --git a/OLD_VERSIONS/TWCNB.py b/OLD_VERSIONS/TWCNB.py new file mode 100644 index 0000000..2394283 --- /dev/null +++ b/OLD_VERSIONS/TWCNB.py @@ -0,0 +1,387 @@ +###################### +# Full version with all variations included +# To improve: create a main function allowing for multiple runs +###################### + +from __future__ import division +from math import log +from math import pow +import re +import csv +from nltk.corpus import movie_reviews as mr +from nltk.corpus import stopwords +import random +STOP_WORDS = set(stopwords.words('english')) +SPLIT_AMOUNT = 0.6 # training amount from data + +COMPLEMENT = 0 # 1 - just comp, 2 - delta / one-v-all +WEIGHTED = 0 # 1 - adjust weights +TF = 0 # 1 - log term frew +IDF = 0 # 1 - idf +LENGTH = 0 # 1 - doc length adjust +AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set +NO_OFF = 1 # 0 - use random data size offset, 1 - nope +DEFINED_SIZE = 0 # 1 - use DEFINED_SIZES for pos, neg sets +DEFINED_SIZES = {'pos': 600, 'neg': 600} + +REVIEW_POL={} +def SplitData(): + type_dict={} + docs_count={} + train_test = [[],[]] + offset_sample = 
random.randint(-400,400) + print "offset_sample", offset_sample + if AMAZON: + offset_sample = random.randint(-600,600) + for category in ['pos', 'neg']: + type_dict[category]=[] + with open('amazon_revs.csv', 'rb') as csvfile: + rev_read = csv.reader(csvfile) + for row in rev_read: + type_dict[row[1]].append(row[0]) + REVIEW_POL[row[0]] = row[1] + else: + for category in mr.categories(): + type_dict[category]=mr.fileids(categories=category) + if NO_OFF: + offset_sample = 0 + for cat in type_dict.keys(): + li = type_dict[cat] + # random.shuffle(li) + size=int(len(li)*SPLIT_AMOUNT) + offset_sample + if DEFINED_SIZE: + size = DEFINED_SIZES[cat] + print "Category: ", cat, "Size:", size + offset_sample *= -1 + docs_count[cat]=size + train_test[0].extend(li[:size]) + train_test[1].extend(li[size:]) + return [train_test,type_dict, docs_count] + +def tokenize(file_name): + list_words = () + if AMAZON: + list_words = re.split(r'\W+',file_name) + else: + list_words = re.split(r'\W+',mr.raw(fileids=file_name)) + + return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + +# li = Preprocessor.get_testset_trainset(corpus) +li = SplitData() +# exit() +testset = li[0][1] +trainset = li[0][0] +# li = Preprocessor.startup() +cat_num_docs = li[2] + +length_train = len(trainset) +print "length of training set ", length_train + +##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values + #b) a dictionary with a category as the key and the number of words in it as the value +# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +complete_training_docs_tokens = [] +num_docs_word_in = {} +counts_for_w = {} + + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words + +for file_name in trainset: + list_words = tokenize(file_name) + complete_training_docs_tokens.append(list_words) + # counts_for_w[file_name] = counts_for_w.get(file_name, {}) + counts_for_w[file_name] = {} + +##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + + # cat_word_dict[cat] = cat_word_dict.get(cat,{}) + # cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + +# add number of words to total word count for cat + # cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + counted = [] + for w in list_words: + # cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + # cat_word_dict[cat][w]+=1 + counts_for_w[file_name][w] = counts_for_w[file_name].get(w, 0) + counts_for_w[file_name][w] += 1 + if w not in counted: + counted.append(w) + num_docs_word_in[w] = num_docs_word_in.get(w, 0) + num_docs_word_in[w] += 1 + # break + + + +for fn in trainset: + length_norm_val = 0 + + cat = '' + if AMAZON: + cat = 
REVIEW_POL[fn] + else: + cat = mr.categories(fileids = fn)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + # print fn + "\n_______________________________\n" + # print tokenize(fn) + # print "" + "\n_______________________________\n" + # print counts_for_w[fn]['book'], num_docs_word_in['book'] + for c_w in counts_for_w[fn].keys(): + + # print c_w + + if TF: + counts_for_w[fn][c_w] = log(counts_for_w[fn][c_w] + 1, 2) + # if c_w == 'book' : + # print 'TF: ', counts_for_w[fn]['book'] + if IDF: + counts_for_w[fn][c_w] = counts_for_w[fn][c_w]*log(length_train/num_docs_word_in[c_w], 2) + # if c_w == 'book' : + # print 'IDF: ', counts_for_w[fn]['book'] + length_norm_val += (counts_for_w[fn][c_w]*counts_for_w[fn][c_w]) + length_norm_val = pow(length_norm_val,0.5) + # print counts_for_w[fn]['book'], num_docs_word_in['book'] + # print length_norm_val + + for c_w in counts_for_w[fn].keys(): + if LENGTH: + counts_for_w[fn][c_w] /= length_norm_val + + cat_word_count_dict[cat] += counts_for_w[fn][c_w] + cat_word_dict[cat][c_w] = cat_word_dict[cat].get(c_w, 0) + cat_word_dict[cat][c_w] += counts_for_w[fn][c_w] + +# print cat_word_dict['neg']['book'] +# print cat_word_dict['pos']['book'] + +# exit() +# print "Using LNV: ", length_norm_val +# length_norm_val = length_norm_val**(0.5) +# print "Using sqLNV: ", length_norm_val +# for fn in trainset: + # cat = '' + # if AMAZON: + # cat = REVIEW_POL[fn] + # else: + # cat = mr.categories(fileids = fn)[0] + # cat_word_dict[cat] = cat_word_dict.get(cat,{}) + + # for c_w in counts_for_w[fn].keys(): + # if LENGTH: + # counts_for_w[fn][c_w] /= length_norm_val + + # cat_word_dict[cat][c_w] = cat_word_dict[cat].get(c_w, 0) + # cat_word_dict[cat][c_w] += counts_for_w[fn][c_w] +##8) Get the vocabulary length +## number of words, total across categories +vocab_length=0 + +# for dic in num_docs_word_in.keys(): +vocab_length=len(num_docs_word_in.keys()) +print cat_word_dict['pos']['book'], cat_word_dict['neg']['book'] +print "Vocab", vocab_length +for cat in cat_word_dict.keys(): + count_cat = cat_word_count_dict[cat] + weight_norm_cat = 0 + for w in cat_word_dict[cat].keys(): + cat_word_dict[cat][w] = (cat_word_dict[cat][w]+1)/(count_cat+vocab_length) + cat_word_dict[cat][w] = log ( cat_word_dict[cat][w] , 2) + weight_norm_cat += abs(cat_word_dict[cat][w]) + if WEIGHTED: + for w in cat_word_dict[cat].keys(): + cat_word_dict[cat][w] = cat_word_dict[cat][w]/weight_norm_cat + +print cat_word_dict['pos']['book'], cat_word_dict['neg']['book'] +exit() + + + +####Congratulations! 
the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +print 'pos' , cat_num_docs['pos']/len(trainset) +print 'neg' , cat_num_docs['neg']/len(trainset) +li_results=[] +li_results2=[] +#9) Like in the training set,Loop through the test set, to get the entire text from each file +##10) Similar step, parse the string to get individual words +for file_name in testset: + # print "File: ", file_name + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW + minimum_pos_log_prob = 100000000 + min_category='' + max_category='' + list_words = tokenize(file_name) + + + +##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + + + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + neg_log_prob = log(cat_num_docs[cat]/length_train, 2) + pos_log_prob = 0 + + # neg_log_prob = cat_num_docs[cat]/length_train + opp_word_dict = cat_word_dict[inv_cat] + opp_count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] + + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + if COMPLEMENT: + neg_log_prob -= opp_word_dict.get(aw, 0) + else : + neg_log_prob += word_dict.get(aw, 0) + + pos_log_prob += opp_word_dict.get(aw, 0) + # my_orig_word_count[aw] = my_orig_word_count.get(aw, 0) + # my_orig_word_count[aw]+=1 + + # # length_norm = 0 + # weight_normalizing_ratio = 0 + # opp_weight_normalizing_ratio = 0 + # for kw in my_word_count.keys(): + # count_word_train=word_dict.get(kw,0) + # ratio = (count_word_train+1)/(count_cat+vocab_length) + + # # if COMPLEMENT: + # opp_count_word_train=opp_word_dict.get(kw,0) + # opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + # # weight norm + # # weight_normalizing_ratio += abs(log(ratio, 2)) + # # opp_weight_normalizing_ratio += abs(log(opp_ratio, 2)) + # weight_normalizing_ratio += log(ratio, 2) + # opp_weight_normalizing_ratio += log(opp_ratio, 2) + + # # if TF: + # # my_word_count[kw] = log(1 + my_word_count[kw]) + + # # if IDF: + # # my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(w,1)) #IDF + # # ## length norm + # # w_freq = my_word_count[kw] + # # length_norm += (w_freq * w_freq) + + # length_norm = length_norm**(0.5) + # print "WNR: ", weight_normalizing_ratio + + # for w in my_word_count.keys(): + # count_word_train=word_dict.get(w,0) + # ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + + # # if COMPLEMENT: + # opp_count_word_train=opp_word_dict.get(w,0) + # opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + # word_freq = my_word_count[w] + + # # if LENGTH: + # # word_freq = word_freq/length_norm # length normalization + + + # ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w + # opp_ratio = log(opp_ratio, 2) + + # if WEIGHTED: + # ratio = ratio/weight_normalizing_ratio # weight normalization + # opp_ratio = opp_ratio/opp_weight_normalizing_ratio + + # if COMPLEMENT == 1: # just complement + # neg_log_prob -= word_freq*opp_ratio + # else: + # neg_log_prob += word_freq*ratio # class 
probability + # pos_log_prob += word_freq*ratio + # if COMPLEMENT == 2: # one-v-all + # neg_log_prob += word_freq*ratio + + + # break + # print "NLP: ", neg_log_prob + # print file_name + # print "\n\n", cat, minimum_pos_log_prob , '<' , neg_log_prob + # if minimum_pos_log_prob>pos_log_prob: + if minimum_neg_log_probpos_log_prob: + max_category=cat + minimum_pos_log_prob=pos_log_prob + # print "Min cat: ", min_category + + if AMAZON: + li_results.append((file_name,min_category,REVIEW_POL[file_name])) + else: + li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0])) + # break + if AMAZON: + li_results2.append((file_name,max_category,REVIEW_POL[file_name])) + else: + li_results2.append((file_name,max_category,mr.categories(fileids = file_name)[0])) + +###--------------------DEBUG STATEMENTS---------------------- +#for t in li_results: + # if t[1]!=t[2]: + # print t +###--------------------DEBUG STATEMENTS---------------------- + +###--------------------DEBUG STATEMENTS---------------------- + +#12) Evaluating the classifier + +CalculateAccuracy(li_results) +CalculateAccuracy(li_results2) diff --git a/TWCNB.py b/OLD_VERSIONS/TWCNB_v0.py similarity index 100% rename from TWCNB.py rename to OLD_VERSIONS/TWCNB_v0.py diff --git a/OLD_VERSIONS/nb_graph.py b/OLD_VERSIONS/nb_graph.py new file mode 100644 index 0000000..7db9e9f --- /dev/null +++ b/OLD_VERSIONS/nb_graph.py @@ -0,0 +1,82 @@ +import numpy +from matplotlib import pyplot + +labels = [ + "Bernoulli", + "Multinomial", + "W C + M", + "TF IDF L W C + M", + "TF IDF L + M", + "delta TF IDF L + M", +] + +tops = numpy.arange(len(labels)) + +# 0 , movies +widths = [ 0.789251270916, + 0.90355565901, + 0.687274830247, + 0.858607720931, + 0.91342342141, + 0.91342342141 ] +# skew , movies +widths2 = [ 0.139696306725, + 0.822764704115, + 0.125, + 0.125, + 0.920448727392, + 0.961479550697 ] + + +# 0 , amazon +widths3 = [ 0.684270469696, + 0.792608446831, + 0.529824561404, + 0.529824561404, + 0.798532433585, + 0.829834039103 ] +# skew , amazon +widths4 = [ 0.298353719071, + 0.73741502689, + 0.219298245614, + 0.219298245614, + 0.751207980863, + 0.771721619137 ] + +# 0 , twitter +widths5 = [ 0.717525563057, + 0.840751614224, + 0.558914441009, + 0.558667842964, + 0.849271128435, + 0.850938806271 ] + +# skew , twitter +widths6 = [ 0.584685826487, + 0.853741247102, + 0.323568358281, + 0.323555359506, + 0.862090804434, + 0.889175806229 ] + + +height = 0.1333333333 +pyplot.barh(tops+height*5, widths, height, color="#50312F") +pyplot.barh(tops+height*4, widths2, height, color="#F17CB0") +pyplot.barh(tops+height*3, widths3, height, color="#375E97") +pyplot.barh(tops+height*2, widths4, height, color="#60BD68") +pyplot.barh(tops+height*1, widths5, height, color="#CB0000") +pyplot.barh(tops+height*0, widths6, height, color="#FAA43A") + +pyplot.legend(["Movies", "Movies - skew", "Amazon", "Amazon - skew", "Twitter", "Twitter-skew"], loc=4) # bottom right +pyplot.yticks(tops+2*height, labels) +pyplot.xlim(0, 1.18) +pyplot.ylim(tops[0]-height, tops[-1]+7*height) +pyplot.show() + +""" +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: 
True, use_adj_only: True, use_position: True 0.762512512513 +""" \ No newline at end of file diff --git a/get_amazon_revs.py b/get_amazon_revs.py index b5c50e3..b89eb31 100644 --- a/get_amazon_revs.py +++ b/get_amazon_revs.py @@ -1,8 +1,9 @@ import csv +import re path = "C:\\Users\\Astha\\Desktop\\amazon_revs" types = ['books' , 'dvd' , 'electronics' , 'kitchen_housewares'] cats = ['positive', 'negative'] -with open('amazon_revs.csv', 'wb') as f: +with open('amazon_revs2.csv', 'wb') as f: cw = csv.writer(f) for t in types: for c in cats: @@ -12,10 +13,10 @@ with open('amazon_revs.csv', 'wb') as f: curr_rev = '' add = 0 for line in txt_p: - line = line.rstrip() - if line == "": + # line = line.rstrip() + if re.match( r'', line ): add = 1 - elif line == "": + elif re.match( r'', line ): add = 0 cw.writerow([curr_rev, short_c]) curr_rev = '' diff --git a/graph_nb_base.py b/graph_nb_base.py new file mode 100644 index 0000000..08aa59e --- /dev/null +++ b/graph_nb_base.py @@ -0,0 +1,82 @@ +import numpy +from matplotlib import pyplot + +labels = [ + "BNB", + "MNB", + "W C", + "TWCNB", + "TF IDF L", + "TF IDF L OvA", +] + +tops = numpy.arange(len(labels)) + +# 0 , movies +widths = [ 0.789251270916, + 0.90355565901, + 0.687274830247, + 0.858607720931, + 0.91342342141, + 0.91342342141 ] +# skew , movies +# widths2 = [ 0.139696306725, + # 0.822764704115, + # 0.125, + # 0.125, + # 0.920448727392, + # 0.961479550697 ] + + +# 0 , amazon +widths3 = [ 0.684270469696, + 0.792608446831, + 0.529824561404, + 0.529824561404, + 0.798532433585, + 0.829834039103 ] +# skew , amazon +# widths4 = [ 0.298353719071, + # 0.73741502689, + # 0.219298245614, + # 0.219298245614, + # 0.751207980863, + # 0.771721619137 ] + +# 0 , twitter +widths5 = [ 0.717525563057, + 0.840751614224, + 0.558914441009, + 0.558667842964, + 0.849271128435, + 0.850938806271 ] + +# skew , twitter +# widths6 = [ 0.584685826487, + # 0.853741247102, + # 0.323568358281, + # 0.323555359506, + # 0.862090804434, + # 0.889175806229 ] + + +height = 0.2 +pyplot.barh(tops+height*2, widths, height, color="#3F681C") +# pyplot.barh(tops+height*4, widths2, height, color="#F17CB0") +pyplot.barh(tops+height*1, widths3, height, color="#5BC8AC") +# pyplot.barh(tops+height*2, widths4, height, color="#60BD68") +pyplot.barh(tops+height*0, widths5, height, color="#CB0000") +# pyplot.barh(tops+height*0, widths6, height, color="#FAA43A") + +pyplot.legend(["Movies", "Amazon", "Twitter"], loc=4) # bottom right +pyplot.yticks(tops+height, labels) +pyplot.xlim(0, 1.18) +pyplot.ylim(tops[0]-height, tops[-1]+4*height) +pyplot.show() + +""" +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513 +""" \ No newline at end of file diff --git a/graph_nb_skew.py b/graph_nb_skew.py new file mode 100644 index 0000000..e578fc0 --- /dev/null +++ b/graph_nb_skew.py @@ -0,0 +1,82 @@ +import numpy +from matplotlib import pyplot + +labels = [ + "BNB", + "MNB", + "W C", + "TWCNB", + "TF IDF L", + "TF IDF L OvA", +] + +tops = numpy.arange(len(labels)) + +# 0 , movies +# widths = [ 0.789251270916, + # 0.90355565901, + # 
0.687274830247, + # 0.858607720931, + # 0.91342342141, + # 0.91342342141 ] +# skew , movies +widths2 = [ 0.139696306725, + 0.822764704115, + 0.125, + 0.125, + 0.920448727392, + 0.961479550697 ] + + +# 0 , amazon +# widths3 = [ 0.684270469696, + # 0.792608446831, + # 0.529824561404, + # 0.529824561404, + # 0.798532433585, + # 0.829834039103 ] +# skew , amazon +widths4 = [ 0.298353719071, + 0.73741502689, + 0.219298245614, + 0.219298245614, + 0.751207980863, + 0.771721619137 ] + +# 0 , twitter +# widths5 = [ 0.717525563057, + # 0.840751614224, + # 0.558914441009, + # 0.558667842964, + # 0.849271128435, + # 0.850938806271 ] + +# skew , twitter +widths6 = [ 0.584685826487, + 0.853741247102, + 0.323568358281, + 0.323555359506, + 0.862090804434, + 0.889175806229 ] + + +height = 0.2 +pyplot.barh(tops+height*2, widths2, height, color="#3F681C") +# pyplot.barh(tops+height*4, widths2, height, color="#F17CB0") +pyplot.barh(tops+height*1, widths4, height, color="#5BC8AC") +# pyplot.barh(tops+height*2, widths4, height, color="#60BD68") +pyplot.barh(tops+height*0, widths6, height, color="#CB0000") +# pyplot.barh(tops+height*0, widths6, height, color="#FAA43A") + +pyplot.legend(["Movies-skew", "Amazon-skew", "Twitter-skew"], loc=4) # bottom right +pyplot.yticks(tops+height, labels) +pyplot.xlim(0, 1.18) +pyplot.ylim(tops[0]-height, tops[-1]+4*height) +pyplot.show() + +""" +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: False 0.822003140865 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: False, use_position: True 0.781988575402 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: False 0.77899606193 +gram_length: 1, use_presence: False, use_amazon: False, use_pos_tags: True, use_adj_only: True, use_position: True 0.762512512513 +""" \ No newline at end of file
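
A minimal standalone sketch (not part of the patch) of the Bernoulli decision rule that TestMachine_Bern in Naive_bayes.py applies, assuming the per-word, per-class document frequencies have already been Laplace-smoothed to (docs containing w in cat + 1) / (docs in cat + 2), as TrainMachine does when it fills word_cat_num_doc_dict. The names classify_bern, word_cat_prob, and cat_doc_counts are hypothetical stand-ins for the module-level globals; the sketch maximizes the log-probability, which is equivalent to the patch's minimizing of the negative log-probability.

from math import log

def classify_bern(doc_words, word_cat_prob, cat_doc_counts, n_train_docs):
    # Bernoulli NB scores word presence/absence, not counts.
    present = set(doc_words)
    best_cat, best_score = None, float('-inf')
    for cat, n_docs in cat_doc_counts.items():
        score = log(float(n_docs) / n_train_docs)   # log prior P(cat)
        for w, per_cat in word_cat_prob.items():
            p = per_cat[cat]                        # smoothed P(w present | cat), strictly inside (0, 1)
            score += log(p) if w in present else log(1.0 - p)
        if score > best_score:
            best_cat, best_score = cat, score
    return best_cat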