From 47d86abae52112fc8ce25868505ede371514cf65 Mon Sep 17 00:00:00 2001 From: ap1113 Date: Wed, 20 Apr 2016 19:42:52 -0400 Subject: [PATCH] functions i did a thing - but i did it at 2 am so this might be bad --- OLD_VERSIONS/CWMNB.py | 254 +++++++++++++++++++++++++++ OLD_VERSIONS/TCWNB2.py | 242 ++++++++++++++++++++++++++ OLD_VERSIONS/TWCNB_old.py | 235 +++++++++++++++++++++++++ TWCNB_v0_2.py | 358 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 1089 insertions(+) create mode 100644 OLD_VERSIONS/CWMNB.py create mode 100644 OLD_VERSIONS/TCWNB2.py create mode 100644 OLD_VERSIONS/TWCNB_old.py create mode 100644 TWCNB_v0_2.py diff --git a/OLD_VERSIONS/CWMNB.py b/OLD_VERSIONS/CWMNB.py new file mode 100644 index 0000000..2e2b524 --- /dev/null +++ b/OLD_VERSIONS/CWMNB.py @@ -0,0 +1,254 @@ +###################### +# This version is CWMNB only +###################### + +from __future__ import division +from math import log +import re +import csv +from nltk.corpus import movie_reviews as mr +from nltk.corpus import stopwords +import random +STOP_WORDS = set(stopwords.words('english')) +SPLIT_AMOUNT = 0.6 # training amount from data + +COMPLEMENT = 0 +WEIGHTED = 0 +USE_IDF = 0 +AMAZON = 0 +REVIEW_POL={} +DEFINED_SIZE = 1 +DEFINED_SIZES = {'pos': 600, 'neg': 600} +def SplitData(): + type_dict={} + docs_count={} + train_test = [[],[]] + offset_sample = random.randint(-400,400) + print "offset_sample", offset_sample + if AMAZON: + offset_sample = random.randint(-600,600) + for category in ['pos', 'neg']: + type_dict[category]=[] + with open('amazon_revs.csv', 'rb') as csvfile: + rev_read = csv.reader(csvfile) + for row in rev_read: + type_dict[row[1]].append(row[0]) + REVIEW_POL[row[0]] = row[1] + else: + for category in mr.categories(): + type_dict[category]=mr.fileids(categories=category) + for cat in type_dict.keys(): + li = type_dict[cat] + random.shuffle(li) + size=int(len(li)*SPLIT_AMOUNT) + offset_sample + if DEFINED_SIZE: + size = DEFINED_SIZES[cat] + print "Category: ", cat, "Size:", size + offset_sample *= -1 + docs_count[cat]=size + train_test[0].extend(li[:size]) + train_test[1].extend(li[size:]) + return [train_test,type_dict, docs_count] + +def tokenize(file_name): + list_words = () + if AMAZON: + list_words = re.split(r'\W+',file_name) + else: + list_words = re.split(r'\W+',mr.raw(fileids=file_name)) + + return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + +# li = Preprocessor.get_testset_trainset(corpus) +li = SplitData() +# exit() +testset = li[0][1] +trainset = li[0][0] +# li = Preprocessor.startup() +cat_num_docs = li[2] + + + +##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values + #b) a dictionary with a category as the key and the number of words in it as the value +# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +complete_training_docs_tokens = [] + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words +for file_name in trainset: 
+ list_words = tokenize(file_name) + complete_training_docs_tokens.append(list_words) + + +##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + +# add number of words to total word count for cat + cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + for w in list_words: + cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + cat_word_dict[cat][w]+=1 + + + +##8) Get the vocabulary length +## number of words, total across categories +vocab_length=0 +num_docs_word_in = {} +for dic in cat_word_dict.values(): + vocab_length+=len(dic) + if USE_IDF: + for uniq_word in dic.keys(): + num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1) + num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr) + + + +####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +length_train = len(trainset) +li_results=[] +#9) Like in the training set,Loop through the test set, to get the entire text from each file +##10) Similar step, parse the string to get individual words +for file_name in testset: + # print "File: ", file_name + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW + min_category='' + list_words = tokenize(file_name) + + + +##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + # print cat , cat_num_docs[cat]/len(trainset) + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + neg_log_prob = log(cat_num_docs[cat]/length_train) + + # neg_log_prob = cat_num_docs[cat]/length_train + opp_word_dict = cat_word_dict[inv_cat] + opp_count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] + + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + length_norm = 0 + weight_normalizing_ratio = 0 + for kw in my_word_count.keys(): + count_word_train=word_dict.get(kw,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) + + if COMPLEMENT: + count_word_train=opp_word_dict.get(kw,0) + ratio = (count_word_train+1)/(opp_count_cat+vocab_length) + + # weight norm + weight_normalizing_ratio += abs(log(ratio)) + ## TF + # my_word_count[kw] = log(my_word_count[kw]+1) + ## length norm + # length_norm += (my_word_count[kw]**(2)) + + # length_norm = length_norm**(0.5) + # print "WNR: ", weight_normalizing_ratio + + for w in my_word_count.keys(): + count_word_train=word_dict.get(w,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + + if COMPLEMENT: + count_word_train=opp_word_dict.get(w,0) + ratio = (count_word_train+1)/(opp_count_cat+vocab_length) + + word_freq = my_word_count[w] + + if USE_IDF: + word_freq = 
word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF + # word_freq = word_freq/length_norm # length normalization + + + ratio = log(ratio) # weight factor log(theta_c) = weight_c,w + + if WEIGHTED: + ratio = ratio/weight_normalizing_ratio # weight normalization + + if COMPLEMENT: + neg_log_prob -= word_freq*ratio + else: + neg_log_prob += word_freq*ratio # class probability + + # break + # print "NLP: ", neg_log_prob + # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob + # if minimum_neg_log_prob>neg_log_prob: + if minimum_neg_log_prob1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + +# li = Preprocessor.get_testset_trainset(corpus) +li = SplitData() +# exit() +testset = li[0][1] +trainset = li[0][0] +# li = Preprocessor.startup() +cat_num_docs = li[2] + + + +##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values + #b) a dictionary with a category as the key and the number of words in it as the value +# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +complete_training_docs_tokens = [] + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words +for file_name in trainset: + list_words = tokenize(file_name) + complete_training_docs_tokens.append(list_words) + + +##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + +# add number of words to total word count for cat + cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + for w in list_words: + cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + cat_word_dict[cat][w]+=1 + + + +##8) Get the vocabulary length +## number of words, total across categories +vocab_length=0 +num_docs_word_in = {} +for dic in cat_word_dict.values(): + vocab_length+=len(dic) + if USE_IDF: + for uniq_word in dic.keys(): + num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1) + num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr) + + + +####Congratulations! 
the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +length_train = len(trainset) +li_results=[] +#9) Like in the training set,Loop through the test set, to get the entire text from each file +##10) Similar step, parse the string to get individual words +for file_name in testset: + # print "File: ", file_name + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW + min_category='' + list_words = tokenize(file_name) + + + +##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + # print cat , cat_num_docs[cat]/len(trainset) + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + + neg_log_prob=log(cat_num_docs[cat]/length_train) + + # neg_log_prob = cat_num_docs[cat]/length_train + # word_dict = cat_word_dict[inv_cat] + # count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] + + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + length_norm = 0 + weight_normalizing_ratio = 0 + for kw in my_word_count.keys(): + count_word_train=word_dict.get(kw,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) + ## weight norm + # weight_normalizing_ratio+=log(ratio) + ## TF + # my_word_count[kw] = log(my_word_count[kw]+1) + ## length norm + # length_norm += (my_word_count[kw]**(2)) + + # length_norm = length_norm**(0.5) + # print "WNR: ", weight_normalizing_ratio + + for w in my_word_count.keys(): + count_word_train=word_dict.get(w,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + # neg_log_prob-=log(ratio) + word_freq = my_word_count[w] + if USE_IDF: + word_freq = word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF + # word_freq = word_freq/length_norm # length normalization + + # neg_log_prob += word_freq*log(ratio) #switch to + ratio = log(ratio) # weight factor log(theta_c) = weight_c,w + # ratio = ratio/weight_normalizing_ratio # weight normalization + neg_log_prob += word_freq*ratio # class probability + + + # neg_log_prob *= ratio + # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob + # break + # print "NLP: ", neg_log_prob + # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob + # if minimum_neg_log_prob>neg_log_prob: + if minimum_neg_log_prob1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + +# li = Preprocessor.get_testset_trainset(corpus) +li = SplitData() +# exit() +testset = li[0][1] +trainset = li[0][0] +# li = Preprocessor.startup() +cat_num_docs = li[2] + + + +##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values + #b) a dictionary with a category as the key and the number of words in it as the value +# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> 
....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +complete_training_docs_tokens = [] + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words +for file_name in trainset: + list_words = tokenize(file_name) + complete_training_docs_tokens.append(list_words) + + +##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + +# add number of words to total word count for cat + cat_word_count_dict[cat]+=len(list_words) +# start count for number of occurences for each word + for w in list_words: + cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + cat_word_dict[cat][w]+=1 + + + +##8) Get the vocabulary length +## number of words, total across categories +vocab_length=0 +num_docs_word_in = {} +for dic in cat_word_dict.values(): + vocab_length+=len(dic) + if USE_IDF: + for uniq_word in dic.keys(): + num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1) + num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr) + + + +####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +length_train = len(trainset) +li_results=[] +#9) Like in the training set,Loop through the test set, to get the entire text from each file +##10) Similar step, parse the string to get individual words +for file_name in testset: + # print "File: ", file_name + minimum_neg_log_prob=1000000000 + # minimum_neg_log_prob = 0 # NEW + min_category='' + list_words = tokenize(file_name) + + + +##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + # print cat , cat_num_docs[cat]/len(trainset) + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + + neg_log_prob=log(cat_num_docs[cat]/length_train) + + # neg_log_prob = cat_num_docs[cat]/length_train + word_dict = cat_word_dict[inv_cat] + count_cat = cat_word_count_dict[inv_cat] + + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + length_norm = 0 + weight_normalizing_ratio = 0 + for kw in my_word_count.keys(): + count_word_train=word_dict.get(kw,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) + ## weight norm + weight_normalizing_ratio+=log(ratio) + ## TF + my_word_count[kw] = log(my_word_count[kw]+1) + ## length norm + length_norm += (my_word_count[kw]**(2)) + + length_norm = length_norm**(0.5) + # print "WNR: ", weight_normalizing_ratio + + for w in my_word_count.keys(): + count_word_train=word_dict.get(w,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + # neg_log_prob-=log(ratio) + word_freq = my_word_count[w] + if USE_IDF: + word_freq = 
word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF + word_freq = word_freq/length_norm # length normalization + + # neg_log_prob += word_freq*log(ratio) #switch to + ratio = log(ratio) # weight factor log(theta_c) = weight_c,w + ratio = ratio/weight_normalizing_ratio # weight normalization + neg_log_prob += word_freq*ratio # class probability + + + # neg_log_prob *= ratio + # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob + # break + # print "NLP: ", neg_log_prob + # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob + if minimum_neg_log_prob>neg_log_prob: + # if minimum_neg_log_prob {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....} +cat_word_dict={} +# {pos-> 4000 words} {neg-> 7000 words} +cat_word_count_dict={} +#val = my_dict.get(key, mydefaultval) +num_docs_word_in = {} +vocab_length=0 + + + +def SplitData(): + global REVIEW_POL + type_dict={} + docs_count={} + train_test = [[],[]] + # offset_sample = random.randint(-400,400) + offset_sample = OFFSET + print "offset_sample", offset_sample + if AMAZON: + # offset_sample = random.randint(-600,600) + for category in ['pos', 'neg']: + type_dict[category]=[] + with open('amazon_revs.csv', 'rb') as csvfile: + rev_read = csv.reader(csvfile) + for row in rev_read: + type_dict[row[1]].append(row[0]) + REVIEW_POL[row[0]] = row[1] + else: + for category in mr.categories(): + type_dict[category]=mr.fileids(categories=category) + # if NO_OFF: + # offset_sample = 0 + for cat in type_dict.keys(): + li = type_dict[cat] + if SHUFFLE: + random.shuffle(li) + size=int(len(li)*SPLIT_AMOUNT) + offset_sample + # if DEFINED_SIZE: + # size = DEFINED_SIZES[cat] + print "Category: ", cat, "Size:", size + offset_sample *= -1 + docs_count[cat]=size + train_test[0].extend(li[:size]) + train_test[1].extend(li[size:]) + return [train_test,type_dict, docs_count] + +def tokenize(file_name): + list_words = () + if AMAZON: + list_words = re.split(r'\W+',file_name) + else: + list_words = re.split(r'\W+',mr.raw(fileids=file_name)) + + return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS] + + +def CalculateAccuracy(li_results): + a=0 + b=0 + c=0 + d=0 + cat = li_results[0][1] + for t in li_results: + if cat==t[1]: + if cat==t[2]: + a+=1 + else: + b+=1 + else: + if cat==t[2]: + c+=1 + else: + d+=1 + precision = a/(a+b) + # recall = a/(a+c) + # print "The following parameters are recorded for the category " , cat + print "precision =", precision + return precision + +def RunWholeThing(): + global AMAZON + global OFFSET + global DEFINED_SIZE + global DEFINED_SIZES + OFFSET = 0 + AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set + while OFFSET < 400: + ans = DoTheThing() + print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Movies @ +/-', OFFSET + print "_____________________________________________________" + OFFSET = -1*OFFSET + ans2 = DoTheThing() + OFFSET = -1*OFFSET + print ans , ans2 + OFFSET += 100 + OFFSET = 0 + AMAZON = 1 + + while OFFSET < 600: + ans = DoTheThing() + print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Amazon @ +/-', OFFSET + print "_____________________________________________________" + OFFSET = -1*OFFSET + ans2 = DoTheThing() + OFFSET = -1*OFFSET + print ans , ans2 + OFFSET += 100 + + + +def DoTheThing(): + i = 0 + reps = 5 + + m_nb = 0 + ti_nb = 0 + til_nb = 0 + cw_nb = 0 + tilcw_nb= 0 + + while i < reps: + TrainMachine() + m_nb += TestMachine(0,0,0,0,0)/5 + ti_nb += TestMachine(1,1,0,0,0)/5 + til_nb += TestMachine(1,1,1,0,0)/5 + cw_nb += 
TestMachine(0,0,0,1,1)/5 + tilcw_nb += TestMachine(1,1,1,1,1)/5 + i+=1 + return (m_nb, ti_nb, til_nb, cw_nb, tilcw_nb) + + +# li = Preprocessor.get_testset_trainset(corpus) + +##5)Loop through the training set, to get the entire text from each file +##6) Parse the string to get individual words +def TrainMachine(): + global cat_word_dict + global cat_word_count_dict + global num_docs_word_in + global li + global testset + global trainset + global cat_num_docs + global vocab_length + + li = SplitData() + testset = li[0][1] + trainset = li[0][0] + cat_num_docs = li[2] + + for file_name in trainset: + list_words = tokenize(file_name) + + + ##7) Check if category exists in dictionary, if not, create an empty dictionary, + #and put word count as zero + #and then insert words into the category's dictionary in both cases and update the word count + cat = '' + if AMAZON: + cat = REVIEW_POL[file_name] + else: + cat = mr.categories(fileids = file_name)[0] + cat_word_dict[cat] = cat_word_dict.get(cat,{}) + cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0) + + # add number of words to total word count for cat + cat_word_count_dict[cat]+=len(list_words) + # start count for number of occurences for each word + counted = [] + for w in list_words: + cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0) + cat_word_dict[cat][w]+=1 + if w not in counted: + counted.append(w) + num_docs_word_in[w] = num_docs_word_in.get(w, 0) + num_docs_word_in[w] += 1 + + for dic in cat_word_dict.values(): + vocab_length+=len(dic) + + +# ##8) Get the vocabulary length +# ## number of words, total across categories +# vocab_length=0 + + + + +# ####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset +# length_train = len(trainset) +# print "length of training set ", length_train + + +def TestMachine(t, i, l, c, w): + #9) Like in the training set,Loop through the test set, to get the entire text from each file + ##10) Similar step, parse the string to get individual words + global trainset + global testset + TF = t # 1 - log term frew + IDF = i # 1 - idf + LENGTH = l # 1 - doc length adjust + COMPLEMENT = c # 1 - just comp, 2 - delta / one-v-all + WEIGHTED = w # 1 - adjust weights + length_train = len(trainset) + # print "length train " , length_train, len(testset) + li_results=[] + + for file_name in testset: + # print "File: ", file_name + # minimum_neg_log_prob=1000000000 + minimum_neg_log_prob = -1000000000 # NEW + min_category='' + list_words = tokenize(file_name) + # print file_name + + + + ##11) Get the probability for each category, + #can use any of the created dictionaries to wade through the categories + for cat in cat_word_count_dict: + # print cat , cat_num_docs[cat]/len(trainset) + # print "________________________________________________________________" + # print "________________________________________________________________" + # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n" + # neg_log_prob=-log(cat_num_docs[cat]/length_train) + inv_cat = 'pos' + if cat == 'pos': + inv_cat = 'neg' + + neg_log_prob = log(cat_num_docs[cat]/length_train, 2) + + # neg_log_prob = cat_num_docs[cat]/length_train + opp_word_dict = cat_word_dict[inv_cat] + opp_count_cat = cat_word_count_dict[inv_cat] + + word_dict = cat_word_dict[cat] + count_cat = cat_word_count_dict[cat] + ## get frequency counts + my_word_count = {} + for aw in list_words: + my_word_count[aw] = my_word_count.get(aw, 0) + my_word_count[aw]+=1 + + ## calculate 
necessary norms + length_norm = 0 + weight_normalizing_ratio = 0 + opp_weight_normalizing_ratio = 0 + for kw in my_word_count.keys(): + count_word_train=word_dict.get(kw,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) + + # if COMPLEMENT: + opp_count_word_train=opp_word_dict.get(kw,0) + opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + # weight norm + weight_normalizing_ratio += abs(log(ratio, 2)) + opp_weight_normalizing_ratio += abs(log(opp_ratio, 2)) + + if TF: + my_word_count[kw] = log(1 + my_word_count[kw]) + + if IDF: + my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,1), 2) #IDF + ## length norm + w_freq = my_word_count[kw] + length_norm += pow(w_freq, 2) + + length_norm = pow(length_norm, 0.5) + # print "LN: ", length_norm + + for w in my_word_count.keys(): + count_word_train=word_dict.get(w,0) + ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c + + # if COMPLEMENT: + opp_count_word_train=opp_word_dict.get(w,0) + opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length) + + word_freq = my_word_count[w] + + if LENGTH: + word_freq = word_freq/length_norm # length normalization + + + ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w + opp_ratio = log(opp_ratio, 2) + + if WEIGHTED: + ratio = ratio/weight_normalizing_ratio # weight normalization + opp_ratio = opp_ratio/opp_weight_normalizing_ratio + + if COMPLEMENT == 1: # just complement + neg_log_prob -= word_freq*opp_ratio + else: + neg_log_prob += word_freq*ratio # class probability + if COMPLEMENT == 2: # one-v-all + neg_log_prob += word_freq*ratio + + # break + # print "NLP: ", neg_log_prob + # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob + # if minimum_neg_log_prob>neg_log_prob: + if minimum_neg_log_prob
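
For reference, the preprocessing that all four files share (SplitData and tokenize) reduces to the short sketch below. This is a condensed illustration, not the patch's code: it assumes Python 3, uses the same NLTK movie_reviews corpus and stopword list, and keeps the patch's trick of adding the offset to one class's training size and subtracting it from the other's to create a controlled imbalance. The names tokenize and split_reviews are illustrative.

import random
import re
from nltk.corpus import movie_reviews as mr, stopwords

STOP_WORDS = set(stopwords.words("english"))

def tokenize(fileid):
    # Same filter as the patch: alphabetic, length > 1, not a stopword, lowercased.
    words = re.split(r"\W+", mr.raw(fileids=fileid))
    return [w.lower() for w in words
            if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]

def split_reviews(split_amount=0.6, offset=0, shuffle=True):
    # The offset is added to one class's training size and subtracted from the
    # other's, reproducing the patch's deliberate class-size skew.
    train, test, sizes = [], [], {}
    for sign, cat in zip((+1, -1), mr.categories()):
        ids = list(mr.fileids(categories=cat))
        if shuffle:
            random.shuffle(ids)
        size = int(len(ids) * split_amount) + sign * offset
        sizes[cat] = size
        train.extend(ids[:size])
        test.extend(ids[size:])
    return train, test, sizes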
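The baseline configuration (all flags off) is standard multinomial Naive Bayes with Laplace smoothing: per-class word counts, a vocabulary size in the smoothing denominator, and a log-space score log P(c) + sum over words of tf_w * log theta_{c,w}. A minimal, self-contained sketch of that scoring rule, with illustrative names (train_mnb, score_mnb) rather than the patch's global dictionaries:

from collections import Counter, defaultdict
from math import log

def train_mnb(docs):
    """docs: iterable of (tokens, label). Returns the count structures used for scoring."""
    word_counts = defaultdict(Counter)   # label -> Counter of word occurrences
    class_docs = Counter()               # label -> number of training documents
    for tokens, label in docs:
        class_docs[label] += 1
        word_counts[label].update(tokens)
    vocab = {w for counter in word_counts.values() for w in counter}
    return word_counts, class_docs, vocab

def score_mnb(tokens, word_counts, class_docs, vocab):
    """Return the label with the highest log posterior under the multinomial model."""
    total_docs = sum(class_docs.values())
    best_label, best_score = None, float("-inf")
    for label, counter in word_counts.items():
        total_words = sum(counter.values())
        score = log(class_docs[label] / total_docs)                # log prior
        for w, freq in Counter(tokens).items():
            theta = (counter[w] + 1) / (total_words + len(vocab))  # Laplace smoothing
            score += freq * log(theta)
        if score > best_score:
            best_label, best_score = label, score
    return best_label

# Toy usage:
train = [(["great", "fun", "fun"], "pos"), (["boring", "awful"], "neg")]
wc, cd, vocab = train_mnb(train)
print(score_mnb(["fun", "great"], wc, cd, vocab))  # -> 'pos'

One difference worth noting: the patch's vocab_length sums the per-class dictionary sizes, while the sketch uses the size of the shared vocabulary; either choice only changes the smoothing denominator.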
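The TestMachine flags in TWCNB_v0_2.py (TF, IDF, LENGTH, COMPLEMENT, WEIGHTED) correspond to the transforms of the TWCNB classifier (Rennie et al., 2003): log term frequency, inverse document frequency, document-length normalization, complement class counts, and weight normalization. The sketch below separates those steps; helper names are illustrative, and it normalizes the weights over the whole vocabulary as in the paper, whereas the patch accumulates the normalizer over just the test document's words.

from collections import Counter
from math import log, sqrt

def transform_doc(tokens, n_train_docs, doc_freq, use_tf=True, use_idf=True, use_length=True):
    """Turn raw token counts into the transformed frequencies d_w used for scoring."""
    freqs = dict(Counter(tokens))
    if use_tf:                                   # d_w = log(1 + tf_w)
        freqs = {w: log(1 + f) for w, f in freqs.items()}
    if use_idf:                                  # d_w *= log(N / df_w)
        freqs = {w: f * log(n_train_docs / doc_freq.get(w, 1)) for w, f in freqs.items()}
    if use_length:                               # d_w /= ||d||_2
        norm = sqrt(sum(f * f for f in freqs.values())) or 1.0
        freqs = {w: f / norm for w, f in freqs.items()}
    return freqs

def complement_weights(word_counts, vocab, label, normalize=True):
    """Smoothed log weights from the *complement* class counts, optionally divided
    by the sum of their magnitudes (the patch's WEIGHTED flag)."""
    comp = Counter()
    for other, counter in word_counts.items():
        if other != label:
            comp.update(counter)
    total = sum(comp.values())
    weights = {w: log((comp[w] + 1) / (total + len(vocab))) for w in vocab}
    if normalize:
        denom = sum(abs(v) for v in weights.values()) or 1.0
        weights = {w: v / denom for w, v in weights.items()}
    return weights

def score_cnb(doc_freqs, weights_by_label, log_prior):
    """CNB picks the label whose complement weights contribute least:
    argmax_c [ log P(c) - sum_w d_w * weight_{~c,w} ]."""
    return max(weights_by_label,
               key=lambda c: log_prior[c] - sum(f * weights_by_label[c].get(w, 0.0)
                                                for w, f in doc_freqs.items()))

# Typical wiring (word_counts, class_docs, vocab as produced by train_mnb above):
# weights = {c: complement_weights(word_counts, vocab, c) for c in word_counts}
# log_prior = {c: log(class_docs[c] / sum(class_docs.values())) for c in class_docs}
# label = score_cnb(transform_doc(tokens, n_train, doc_freq), weights, log_prior)

Estimating each class's weights from everything outside that class is what makes CNB comparatively robust to skewed training sizes, which is presumably why the patch's OFFSET experiments vary the pos/neg split around the 60% mark.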
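A note on CalculateAccuracy: with a counting documents of the reference class that were classified correctly and b counting documents of that class classified as the other class, the printed a/(a+b) is recall for that class, not precision (the commented-out a/(a+c) is the precision). A small sketch that reports accuracy, precision, and recall from the same (document, true label, predicted label) triples that li_results holds:

def evaluate(results, positive="pos"):
    tp = sum(1 for _, t, p in results if t == positive and p == positive)
    fn = sum(1 for _, t, p in results if t == positive and p != positive)
    fp = sum(1 for _, t, p in results if t != positive and p == positive)
    tn = sum(1 for _, t, p in results if t != positive and p != positive)
    accuracy = (tp + tn) / len(results)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return accuracy, precision, recall

print(evaluate([("d1", "pos", "pos"), ("d2", "neg", "pos"), ("d3", "neg", "neg")]))
# -> accuracy 2/3, precision 0.5, recall 1.0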