OLD_VERSIONS/TWCNB.py

######################
# Full version with all variations included
# To improve: create a main function allowing for multiple runs
######################

from __future__ import division
from math import log
from math import pow
import re
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
import random
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6          # training amount from data 

COMPLEMENT = 0      # 1 - just comp, 2 - delta / one-v-all
WEIGHTED = 0        # 1 - adjust weights
TF = 0              # 1 - log term frew
IDF = 0             # 1 - idf
LENGTH = 0         # 1 - doc length adjust
AMAZON = 0          # 0 - use movie_reviews, 1 - use Amazon set 
NO_OFF = 1          # 0 - use random data size offset, 1 - nope
DEFINED_SIZE = 0    # 1 - use DEFINED_SIZES for pos, neg sets
DEFINED_SIZES = {'pos': 600, 'neg': 600}

REVIEW_POL={}
def SplitData():
    type_dict={}
    docs_count={}
    train_test = [[],[]]
    offset_sample = random.randint(-400,400)
    print "offset_sample", offset_sample
    if AMAZON:
        offset_sample = random.randint(-600,600)
        for category in ['pos', 'neg']:
            type_dict[category]=[]
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    else:
        for category in mr.categories():
            type_dict[category]=mr.fileids(categories=category)
    if NO_OFF:
        offset_sample = 0
    for cat in type_dict.keys():
        li = type_dict[cat]
        # random.shuffle(li)
        size=int(len(li)*SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            size = DEFINED_SIZES[cat]
        print "Category: ", cat, "Size:", size
        offset_sample *= -1
        docs_count[cat]=size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test,type_dict, docs_count]
    
def tokenize(file_name): 
    list_words = ()
    if AMAZON:
        list_words = re.split(r'\W+',file_name)
    else:
        list_words = re.split(r'\W+',mr.raw(fileids=file_name))
    
    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]


def CalculateAccuracy(li_results):
    a=0
    b=0
    c=0
    d=0
    cat = li_results[0][1]
    for t in li_results:
        if cat==t[1]:
            if cat==t[2]:
                a+=1
            else:
                b+=1
        else:
            if cat==t[2]:
                c+=1
            else:
                d+=1
    precision = a/(a+b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category " , cat
    print "precision =", precision
    
# li = Preprocessor.get_testset_trainset(corpus)
li = SplitData()
# exit()
testset = li[0][1]
trainset = li[0][0]
# li = Preprocessor.startup()
cat_num_docs = li[2]
    
length_train = len(trainset)
print "length of training set ", length_train

##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values
          #b) a dictionary with a category as the key and the number of words in it as the value
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
#val = my_dict.get(key, mydefaultval)
complete_training_docs_tokens = []
num_docs_word_in = {} 
counts_for_w = {}


##5)Loop through the training set, to get the entire text from  each file
##6) Parse the string to get individual words

for file_name in trainset:
    list_words = tokenize(file_name)
    complete_training_docs_tokens.append(list_words)
    # counts_for_w[file_name] = counts_for_w.get(file_name, {})
    counts_for_w[file_name] = {}

##7) Check if category exists in dictionary, if not, create an empty dictionary,
    #and put word count as zero
    #and then insert words into the category's dictionary in both cases and update the word count
    cat = ''
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids = file_name)[0]
    
    # cat_word_dict[cat] = cat_word_dict.get(cat,{})
    # cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)
    
# add number of words to total word count for cat
    # cat_word_count_dict[cat]+=len(list_words)      
# start count for number of occurences for each word
    counted = []
    for w in list_words:
        # cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
        # cat_word_dict[cat][w]+=1
        counts_for_w[file_name][w] = counts_for_w[file_name].get(w, 0)
        counts_for_w[file_name][w] += 1
        if w not in counted:
            counted.append(w)
            num_docs_word_in[w] = num_docs_word_in.get(w, 0)
            num_docs_word_in[w] += 1
    # break


for fn in trainset:
    length_norm_val = 0
    
    cat = ''
    if AMAZON:
        cat = REVIEW_POL[fn]
    else:
        cat = mr.categories(fileids = fn)[0]        
    cat_word_dict[cat] = cat_word_dict.get(cat,{})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)
    # print fn + "\n_______________________________\n"
    # print tokenize(fn)
    # print "" + "\n_______________________________\n"
    # print counts_for_w[fn]['book'], num_docs_word_in['book']
    for c_w in counts_for_w[fn].keys():
        
        # print c_w
        
        if TF:
            counts_for_w[fn][c_w] = log(counts_for_w[fn][c_w] + 1, 2)
            # if c_w == 'book' : 
                # print 'TF: ', counts_for_w[fn]['book']
        if IDF:
            counts_for_w[fn][c_w] = counts_for_w[fn][c_w]*log(length_train/num_docs_word_in[c_w], 2)
            # if c_w == 'book' : 
                # print 'IDF: ', counts_for_w[fn]['book']
        length_norm_val += (counts_for_w[fn][c_w]*counts_for_w[fn][c_w])
    length_norm_val = pow(length_norm_val,0.5)
    # print counts_for_w[fn]['book'], num_docs_word_in['book']
    # print length_norm_val
    
    for c_w in counts_for_w[fn].keys():
        if LENGTH:
            counts_for_w[fn][c_w] /= length_norm_val 
        
        cat_word_count_dict[cat] += counts_for_w[fn][c_w]
        cat_word_dict[cat][c_w] = cat_word_dict[cat].get(c_w, 0)
        cat_word_dict[cat][c_w] += counts_for_w[fn][c_w]
    
# print cat_word_dict['neg']['book']
# print cat_word_dict['pos']['book']
    
# exit()   
# print "Using LNV: ", length_norm_val            
# length_norm_val = length_norm_val**(0.5)
# print "Using sqLNV: ", length_norm_val
# for fn in trainset:
    # cat = ''
    # if AMAZON:
        # cat = REVIEW_POL[fn]
    # else:
        # cat = mr.categories(fileids = fn)[0]        
    # cat_word_dict[cat] = cat_word_dict.get(cat,{})
    
    # for c_w in counts_for_w[fn].keys():
        # if LENGTH:
            # counts_for_w[fn][c_w] /= length_norm_val 
        
        # cat_word_dict[cat][c_w] = cat_word_dict[cat].get(c_w, 0)
        # cat_word_dict[cat][c_w] += counts_for_w[fn][c_w]
##8) Get the vocabulary length
## number of words, total across categories
vocab_length=0     
       
# for dic in num_docs_word_in.keys():
vocab_length=len(num_docs_word_in.keys())
print cat_word_dict['pos']['book'], cat_word_dict['neg']['book']
print "Vocab", vocab_length
for cat in cat_word_dict.keys():
    count_cat = cat_word_count_dict[cat]
    weight_norm_cat = 0
    for w in cat_word_dict[cat].keys():
        cat_word_dict[cat][w] = (cat_word_dict[cat][w]+1)/(count_cat+vocab_length)
        cat_word_dict[cat][w] = log ( cat_word_dict[cat][w] , 2)
        weight_norm_cat += abs(cat_word_dict[cat][w])
    if WEIGHTED:    
        for w in cat_word_dict[cat].keys():
            cat_word_dict[cat][w] = cat_word_dict[cat][w]/weight_norm_cat

print cat_word_dict['pos']['book'], cat_word_dict['neg']['book']
exit()

     
####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset
print 'pos' , cat_num_docs['pos']/len(trainset)
print 'neg' , cat_num_docs['neg']/len(trainset)
li_results=[]
li_results2=[]
#9) Like in the training set,Loop through the test set, to get the entire text from  each file
##10) Similar step, parse the string to get individual words
for file_name in testset:
    # print "File: ", file_name
    # minimum_neg_log_prob=1000000000
    minimum_neg_log_prob = -1000000000      # NEW
    minimum_pos_log_prob = 100000000
    min_category=''
    max_category=''
    list_words = tokenize(file_name)


##11) Get the probability for each category,
    #can use any of the created dictionaries to wade through the categories
    for cat in  cat_word_count_dict:
        
        
        # print "________________________________________________________________"
        # print "________________________________________________________________"
        # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
        # neg_log_prob=-log(cat_num_docs[cat]/length_train)
        inv_cat = 'pos'
        if cat == 'pos':
            inv_cat = 'neg'
        
        neg_log_prob = log(cat_num_docs[cat]/length_train, 2)
        pos_log_prob = 0
        
        # neg_log_prob = cat_num_docs[cat]/length_train
        opp_word_dict = cat_word_dict[inv_cat]
        opp_count_cat = cat_word_count_dict[inv_cat]
        
        word_dict = cat_word_dict[cat]
        count_cat = cat_word_count_dict[cat]
                
        my_word_count = {}
        for aw in list_words:
            my_word_count[aw] = my_word_count.get(aw, 0)
            my_word_count[aw]+=1
            
            if COMPLEMENT:
                neg_log_prob -= opp_word_dict.get(aw, 0)
            else :
                neg_log_prob += word_dict.get(aw, 0)
                
            pos_log_prob += opp_word_dict.get(aw, 0)
            # my_orig_word_count[aw] = my_orig_word_count.get(aw, 0)
            # my_orig_word_count[aw]+=1
        
        # # length_norm = 0
        # weight_normalizing_ratio = 0
        # opp_weight_normalizing_ratio = 0
        # for kw in my_word_count.keys():
            # count_word_train=word_dict.get(kw,0)            
            # ratio = (count_word_train+1)/(count_cat+vocab_length)
            
            # # if COMPLEMENT:
            # opp_count_word_train=opp_word_dict.get(kw,0)
            # opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)
            
            # # weight norm
            # # weight_normalizing_ratio += abs(log(ratio, 2))
            # # opp_weight_normalizing_ratio += abs(log(opp_ratio, 2))
            # weight_normalizing_ratio += log(ratio, 2)
            # opp_weight_normalizing_ratio += log(opp_ratio, 2)
            
            # # if TF:
                # # my_word_count[kw] = log(1 + my_word_count[kw])
            
            # # if IDF:
                # # my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(w,1))     #IDF
            # # ## length norm
            # # w_freq = my_word_count[kw]
            # # length_norm += (w_freq * w_freq)
            
        # length_norm = length_norm**(0.5)   
        # print "WNR: ", weight_normalizing_ratio
        
        # for w in my_word_count.keys():
            # count_word_train=word_dict.get(w,0)
            # ratio = (count_word_train+1)/(count_cat+vocab_length)   #Nw,c+1/Nc+|V| = theta_c
            
            # # if COMPLEMENT:
            # opp_count_word_train=opp_word_dict.get(w,0)
            # opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)
            
            # word_freq = my_word_count[w]
            
            # # if LENGTH:
                # # word_freq = word_freq/length_norm       # length normalization
            

            # ratio = log(ratio, 2)      # weight factor     log(theta_c) = weight_c,w
            # opp_ratio = log(opp_ratio, 2)
            
            # if WEIGHTED:
                # ratio = ratio/weight_normalizing_ratio      # weight normalization
                # opp_ratio = opp_ratio/opp_weight_normalizing_ratio
            
            # if COMPLEMENT == 1:                     # just complement
                # neg_log_prob -= word_freq*opp_ratio
            # else:
                # neg_log_prob += word_freq*ratio    # class probability
                # pos_log_prob += word_freq*ratio
            # if COMPLEMENT == 2:                     # one-v-all 
                # neg_log_prob += word_freq*ratio

            
            # break
        # print "NLP: ", neg_log_prob
        # print file_name
        # print "\n\n", cat, minimum_pos_log_prob , '<' , neg_log_prob                 
        # if minimum_pos_log_prob>pos_log_prob:
        if minimum_neg_log_prob<neg_log_prob:
            min_category=cat
            minimum_neg_log_prob=neg_log_prob
            # minimum_pos_log_prob=pos_log_prob
        if minimum_pos_log_prob>pos_log_prob:
            max_category=cat
            minimum_pos_log_prob=pos_log_prob
    # print "Min cat: ", min_category
    
    if AMAZON:
        li_results.append((file_name,min_category,REVIEW_POL[file_name]))
    else:
        li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
    # break
    if AMAZON:
        li_results2.append((file_name,max_category,REVIEW_POL[file_name]))
    else:
        li_results2.append((file_name,max_category,mr.categories(fileids = file_name)[0]))

###--------------------DEBUG STATEMENTS----------------------
#for t in li_results:
 #   if t[1]!=t[2]:
  #      print t
###--------------------DEBUG STATEMENTS----------------------
    
###--------------------DEBUG STATEMENTS----------------------

#12) Evaluating the classifier

CalculateAccuracy(li_results)
CalculateAccuracy(li_results2)