Naive_bayes.py

######################
# Full version with all variations included
# To improve: create a main function allowing for multiple runs
# TF works properly!
######################

from __future__ import division
from math import log
from math import pow
import re
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
import random
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6          # training amount from data
SHUFFLE = 1
# COMPLEMENT = 0      # 1 - just comp, 2 - delta / one-v-all
# WEIGHTED = 0        # 1 - adjust weights
# TF = 0              # 1 - log term frew
# IDF = 0             # 1 - idf
# LENGTH = 0          # 1 - doc length adjust
AMAZON = 0          # 1 - use Amazon set
TWITTER = 0            # 1 - use Twitter set
TWEET_LIMIT = 5000      # we can't use the whole database, so just randomly grab this number of positive and negative reviews
OFFSET = 0          # introduced offset (skew) in datasets

REVIEW_POL={}

li = []
# exit()
testset = []
trainset = []
# li = Preprocessor.startup()
cat_num_docs = {}

##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values
          #b) a dictionary with a category as the key and the number of words in it as the value
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
#val = my_dict.get(key, mydefaultval)
num_docs_word_in = {}
vocab_length=0

word_cat_num_doc_dict={}


def SplitData():
    global REVIEW_POL
    type_dict={}
    docs_count={}
    train_test = [[],[]]
    # offset_sample = random.randint(-400,400)
    offset_sample = OFFSET
    # print "offset_sample", offset_sample
    categories = ['neg', 'pos']
    if AMAZON:
        # offset_sample = random.randint(-600,600)
        for category in categories:
            type_dict[category]=[]
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    elif TWITTER:
        for category in categories:
            type_dict[category]=[]
        with open('tweets.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            rev_read.next()     # skip header row
            number = [0,0]
            for row in rev_read:
                type_dict[ categories[ int(row[1]) ] ].append(row[3].strip())
                REVIEW_POL[row[3].strip()] = categories[int(row[1])]
                number[int(row[1])] += 1
                if (number[0]>TWEET_LIMIT and number[1]>TWEET_LIMIT):
                    break
    else:
        for category in mr.categories():
            type_dict[category]=mr.fileids(categories=category)
    # if NO_OFF:
        # offset_sample = 0
    for cat in type_dict.keys():
        li = type_dict[cat]
        if SHUFFLE:
            random.shuffle(li)
        size=int(len(li)*SPLIT_AMOUNT) + offset_sample
        # if DEFINED_SIZE:
            # size = DEFINED_SIZES[cat]
        # print "Category: ", cat, "Size:", size
        offset_sample *= -1
        docs_count[cat]=size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test,type_dict, docs_count]

def tokenize(file_name):
    list_words = ()
    if AMAZON or TWITTER:
        list_words = re.split(r'\W+',file_name)
    else:
        list_words = re.split(r'\W+',mr.raw(fileids=file_name))

    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]


def CalculateAccuracy(li_results):
    a=0
    b=0
    c=0
    d=0
    cat = li_results[0][1]
    for t in li_results:
        if cat==t[1]:
            if cat==t[2]:
                a+=1
            else:
                b+=1
        else:
            if cat==t[2]:
                c+=1
            else:
                d+=1
    precision = a/(a+b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category " , cat
    # print "precision =", precision
    return precision

def RunWholeThing():
    global AMAZON
    global TWITTER
    global OFFSET
    global DEFINED_SIZE
    global DEFINED_SIZES
    OFFSET = 0
    AMAZON = 0          # 0 - use movie_reviews, 1 - use Amazon set
    TWITTER = 0
    tested = ['  Bern:', '  Mult:', '  TIL :', '  DTIL:', '  CW  :', '  TIWC:']
    while OFFSET < 400:
        print "Movies with ", OFFSET
        ans = DoTheThing()
        OFFSET = -1*OFFSET
        if OFFSET != 0:
            ans2 = DoTheThing()
            ans3 = [ans , ans2]
            ans = [sum(e)/len(e) for e in zip(*ans3)]
        a_i = 0
        for a in ans:
            print tested[a_i], a
            a_i += 1

        OFFSET = -1*OFFSET

        OFFSET += 300

    OFFSET = 0
    AMAZON = 1


    while OFFSET < 600:
        print "Amazon with ", OFFSET
        ans = DoTheThing()
        OFFSET = -1*OFFSET
        if OFFSET != 0:
            ans2 = DoTheThing()
            ans3 = [ans , ans2]
            ans = [sum(e)/len(e) for e in zip(*ans3)]
        a_i = 0
        for a in ans:
            print tested[a_i], a
            a_i += 1

        OFFSET = -1*OFFSET

        OFFSET += 400

    OFFSET = 0
    AMAZON = 0
    TWITTER = 1

    while OFFSET < 1000:
        print "Twitter with ", OFFSET
        ans = DoTheThing()
        OFFSET = -1*OFFSET
        if OFFSET != 0:
            ans2 = DoTheThing()
            ans3 = [ans , ans2]
            ans = [sum(e)/len(e) for e in zip(*ans3)]
        a_i = 0
        for a in ans:
            print tested[a_i], a
            a_i += 1
        OFFSET = -1*OFFSET
        OFFSET += 800


def DoTheThing():
    i = 0
    reps    = 5
    b_nb    = 0
    m_nb    = 0
    til_nb  = 0
    dtil_nb = 0
    cw_nb   = 0
    tilcw_nb= 0

    while i < reps:
        TrainMachine()
        b_nb += TestMachine_Bern()/reps
        m_nb += TestMachine(0,0,0,0,0)/reps
        til_nb += TestMachine(1,1,1,0,0)/reps
        dtil_nb += TestMachine(1,1,1,2,0)/reps
        cw_nb += TestMachine(0,0,0,1,1)/reps
        tilcw_nb += TestMachine(1,1,1,1,1)/reps
        i+=1
    # print "  Bern: %0.6f\n  Mult: %0.6f\n  TIL : %0.6f\n  DTIL: %0.6f\n  CW  : %0.6f\n  TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb)
    return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb]


# li = Preprocessor.get_testset_trainset(corpus)

##5)Loop through the training set, to get the entire text from  each file
##6) Parse the string to get individual words
def TrainMachine():
    global cat_word_dict
    global cat_word_count_dict
    global num_docs_word_in
    global word_cat_num_doc_dict
    global li
    global testset
    global trainset
    global cat_num_docs
    global vocab_length

    li = SplitData()
    testset = li[0][1]
    trainset = li[0][0]
    cat_num_docs = li[2]

    for file_name in trainset:
        list_words = tokenize(file_name)


    ##7) Check if category exists in dictionary, if not, create an empty dictionary,
        #and put word count as zero
        #and then insert words into the category's dictionary in both cases and update the word count
        cat = ''
        if AMAZON or TWITTER:
            cat = REVIEW_POL[file_name]
        else:
            cat = mr.categories(fileids = file_name)[0]
        cat_word_dict[cat] = cat_word_dict.get(cat,{})
        cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)

    # add number of words to total word count for cat
        cat_word_count_dict[cat]+=len(list_words)
    # start count for number of occurences for each word
        counted = []
        for w in list_words:
            cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
            cat_word_dict[cat][w]+=1
            if w not in counted:
                counted.append(w)
                num_docs_word_in[w] = num_docs_word_in.get(w, 0)
                num_docs_word_in[w] += 1

                word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
                word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
                word_cat_num_doc_dict[w][cat]+=1

    for dic in cat_word_dict.values():
        vocab_length+=len(dic)
    for w in word_cat_num_doc_dict:
        for cat in cat_num_docs:
            nct = word_cat_num_doc_dict[w].get(cat,0)
            # convert #times a word appears into #times+1/#cat_reviews+2
            ratio = (nct+1)/(cat_num_docs[cat]+2)
            word_cat_num_doc_dict[w][cat]=ratio


# ##8) Get the vocabulary length
# ## number of words, total across categories
# vocab_length=0


# ####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset
# length_train = len(trainset)
# print "length of training set ", length_train


def TestMachine(t, i, l, c, w):
    #9) Like in the training set,Loop through the test set, to get the entire text from  each file
    ##10) Similar step, parse the string to get individual words
    global trainset
    global testset
    TF = t              # 1 - log term frew
    IDF = i             # 1 - idf
    LENGTH = l          # 1 - doc length adjust
    COMPLEMENT = c      # 1 - just comp, 2 - delta / one-v-all
    WEIGHTED = w        # 1 - adjust weights
    length_train = len(trainset)
    # print "length train " , length_train, len(testset)
    li_results=[]

    for file_name in testset:
        # print "File: ", file_name
        # minimum_neg_log_prob=1000000000
        minimum_neg_log_prob = -1000000000      # NEW
        min_category=''
        list_words = tokenize(file_name)
        # print file_name


    ##11) Get the probability for each category,
        #can use any of the created dictionaries to wade through the categories
        for cat in cat_word_count_dict:
            # print cat , cat_num_docs[cat]/len(trainset)
            # print "________________________________________________________________"
            # print "________________________________________________________________"
            # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
            # neg_log_prob=-log(cat_num_docs[cat]/length_train)
            inv_cat = 'pos'
            if cat == 'pos':
                inv_cat = 'neg'

            neg_log_prob = log(cat_num_docs[cat]/length_train, 2)

            # neg_log_prob = cat_num_docs[cat]/length_train
            opp_word_dict = cat_word_dict[inv_cat]
            opp_count_cat = cat_word_count_dict[inv_cat]

            word_dict = cat_word_dict[cat]
            count_cat = cat_word_count_dict[cat]
            ## get frequency counts
            my_word_count = {}
            for aw in list_words:
                my_word_count[aw] = my_word_count.get(aw, 0)
                my_word_count[aw]+=1

            ## calculate necessary norms
            length_norm = 0
            weight_normalizing_ratio = 0
            opp_weight_normalizing_ratio = 0
            for kw in my_word_count.keys():
                count_word_train=word_dict.get(kw,0)
                ratio = (count_word_train+1)/(count_cat+vocab_length)

                # if COMPLEMENT:
                opp_count_word_train=opp_word_dict.get(kw,0)
                opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)

                # weight norm
                weight_normalizing_ratio += abs(log(ratio, 2))
                opp_weight_normalizing_ratio += abs(log(opp_ratio, 2))

                if TF:
                    my_word_count[kw] = log(1 + my_word_count[kw])

                if IDF:
                    my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,0.01), 2)     #IDF
                ## length norm
                w_freq = my_word_count[kw]
                length_norm += pow(w_freq, 2)

            length_norm = pow(length_norm, 0.5)
            # print "LN: ", length_norm

            for w in my_word_count.keys():
                count_word_train=word_dict.get(w,0)
                ratio = (count_word_train+1)/(count_cat+vocab_length)   #Nw,c+1/Nc+|V| = theta_c

                # if COMPLEMENT:
                opp_count_word_train=opp_word_dict.get(w,0)
                opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)

                word_freq = my_word_count[w]

                if LENGTH:
                    word_freq = word_freq/length_norm       # length normalization


                ratio = log(ratio, 2)      # weight factor     log(theta_c) = weight_c,w
                opp_ratio = log(opp_ratio, 2)

                if WEIGHTED:
                    ratio = ratio/weight_normalizing_ratio      # weight normalization
                    opp_ratio = opp_ratio/opp_weight_normalizing_ratio

                if COMPLEMENT == 1:                     # just complement
                    neg_log_prob -= word_freq*opp_ratio
                else:
                    neg_log_prob += word_freq*ratio    # class probability
                if COMPLEMENT == 2:                     # one-v-all
                    neg_log_prob += word_freq*ratio

                # break
            # print "NLP: ", neg_log_prob
            # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
            # if minimum_neg_log_prob>neg_log_prob:
            if minimum_neg_log_prob<neg_log_prob:
                min_category=cat
                minimum_neg_log_prob=neg_log_prob
        # print "Min cat: ", min_category

        if AMAZON or TWITTER:
            li_results.append((file_name,min_category,REVIEW_POL[file_name]))
        else:
            li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
        # break


    #12) Evaluating the classifier

    precision = CalculateAccuracy(li_results)
    return precision

def TestMachine_Bern():
    li_results=[]
    #5) Like in the training set,Loop through the test set, to get the individual words
    for file_name in testset:
        minimum_neg_log_prob=1000000000
        min_category=''
        set_list_words = set(tokenize(file_name))

    ##6) Get the probability for each category,
        #using the cat_num_docs dictionary to wade through the categories
        for cat in  cat_num_docs:
            neg_log_prob=-log(cat_num_docs[cat]/len(trainset))
            for w in word_cat_num_doc_dict:
                if w in set_list_words:
                    neg_log_prob-=log(word_cat_num_doc_dict[w][cat])
                else:
                    neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat])
            if minimum_neg_log_prob>neg_log_prob:
                min_category=cat
                minimum_neg_log_prob=neg_log_prob

        if AMAZON or TWITTER:
            li_results.append((file_name,min_category,REVIEW_POL[file_name]))
        else:
            li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
        # break
    precision = CalculateAccuracy(li_results)
    return precision


RunWholeThing()
	######################
	# Full version with all variations included
	# To improve: create a main function allowing for multiple runs
	# TF works properly!
	######################

	from __future__ import division
	from math import log
	from math import pow
	import re
	import csv
	from nltk.corpus import movie_reviews as mr
	from nltk.corpus import stopwords
	import random
	STOP_WORDS = set(stopwords.words('english'))
	SPLIT_AMOUNT = 0.6 # training amount from data
	SHUFFLE = 1
	# COMPLEMENT = 0 # 1 - just comp, 2 - delta / one-v-all
	# WEIGHTED = 0 # 1 - adjust weights
	# TF = 0 # 1 - log term frew
	# IDF = 0 # 1 - idf
	# LENGTH = 0 # 1 - doc length adjust
	AMAZON = 0 # 1 - use Amazon set
	TWITTER = 0 # 1 - use Twitter set
	TWEET_LIMIT = 5000 # we can't use the whole database, so just randomly grab this number of positive and negative reviews
	OFFSET = 0 # introduced offset (skew) in datasets

	REVIEW_POL={}

	li = []
	# exit()
	testset = []
	trainset = []
	# li = Preprocessor.startup()
	cat_num_docs = {}

	##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values
	#b) a dictionary with a category as the key and the number of words in it as the value
	# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
	cat_word_dict={}
	# {pos-> 4000 words} {neg-> 7000 words}
	cat_word_count_dict={}
	#val = my_dict.get(key, mydefaultval)
	num_docs_word_in = {}
	vocab_length=0

	word_cat_num_doc_dict={}


	def SplitData():
	global REVIEW_POL
	type_dict={}
	docs_count={}
	train_test = [[],[]]
	# offset_sample = random.randint(-400,400)
	offset_sample = OFFSET
	# print "offset_sample", offset_sample
	categories = ['neg', 'pos']
	if AMAZON:
	# offset_sample = random.randint(-600,600)
	for category in categories:
	type_dict[category]=[]
	with open('amazon_revs.csv', 'rb') as csvfile:
	rev_read = csv.reader(csvfile)
	for row in rev_read:
	type_dict[row[1]].append(row[0])
	REVIEW_POL[row[0]] = row[1]
	elif TWITTER:
	for category in categories:
	type_dict[category]=[]
	with open('tweets.csv', 'rb') as csvfile:
	rev_read = csv.reader(csvfile)
	rev_read.next() # skip header row
	number = [0,0]
	for row in rev_read:
	type_dict[ categories[ int(row[1]) ] ].append(row[3].strip())
	REVIEW_POL[row[3].strip()] = categories[int(row[1])]
	number[int(row[1])] += 1
	if (number[0]>TWEET_LIMIT and number[1]>TWEET_LIMIT):
	break
	else:
	for category in mr.categories():
	type_dict[category]=mr.fileids(categories=category)
	# if NO_OFF:
	# offset_sample = 0
	for cat in type_dict.keys():
	li = type_dict[cat]
	if SHUFFLE:
	random.shuffle(li)
	size=int(len(li)*SPLIT_AMOUNT) + offset_sample
	# if DEFINED_SIZE:
	# size = DEFINED_SIZES[cat]
	# print "Category: ", cat, "Size:", size
	offset_sample *= -1
	docs_count[cat]=size
	train_test[0].extend(li[:size])
	train_test[1].extend(li[size:])
	return [train_test,type_dict, docs_count]

	def tokenize(file_name):
	list_words = ()
	if AMAZON or TWITTER:
	list_words = re.split(r'\W+',file_name)
	else:
	list_words = re.split(r'\W+',mr.raw(fileids=file_name))

	return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]


	def CalculateAccuracy(li_results):
	a=0
	b=0
	c=0
	d=0
	cat = li_results[0][1]
	for t in li_results:
	if cat==t[1]:
	if cat==t[2]:
	a+=1
	else:
	b+=1
	else:
	if cat==t[2]:
	c+=1
	else:
	d+=1
	precision = a/(a+b)
	# recall = a/(a+c)
	# print "The following parameters are recorded for the category " , cat
	# print "precision =", precision
	return precision

	def RunWholeThing():
	global AMAZON
	global TWITTER
	global OFFSET
	global DEFINED_SIZE
	global DEFINED_SIZES
	OFFSET = 0
	AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
	TWITTER = 0
	tested = [' Bern:', ' Mult:', ' TIL :', ' DTIL:', ' CW :', ' TIWC:']
	while OFFSET < 400:
	print "Movies with ", OFFSET
	ans = DoTheThing()
	OFFSET = -1*OFFSET
	if OFFSET != 0:
	ans2 = DoTheThing()
	ans3 = [ans , ans2]
	ans = [sum(e)/len(e) for e in zip(*ans3)]
	a_i = 0
	for a in ans:
	print tested[a_i], a
	a_i += 1

	OFFSET = -1*OFFSET

	OFFSET += 300

	OFFSET = 0
	AMAZON = 1


	while OFFSET < 600:
	print "Amazon with ", OFFSET
	ans = DoTheThing()
	OFFSET = -1*OFFSET
	if OFFSET != 0:
	ans2 = DoTheThing()
	ans3 = [ans , ans2]
	ans = [sum(e)/len(e) for e in zip(*ans3)]
	a_i = 0
	for a in ans:
	print tested[a_i], a
	a_i += 1

	OFFSET = -1*OFFSET

	OFFSET += 400

	OFFSET = 0
	AMAZON = 0
	TWITTER = 1

	while OFFSET < 1000:
	print "Twitter with ", OFFSET
	ans = DoTheThing()
	OFFSET = -1*OFFSET
	if OFFSET != 0:
	ans2 = DoTheThing()
	ans3 = [ans , ans2]
	ans = [sum(e)/len(e) for e in zip(*ans3)]
	a_i = 0
	for a in ans:
	print tested[a_i], a
	a_i += 1
	OFFSET = -1*OFFSET
	OFFSET += 800



	def DoTheThing():
	i = 0
	reps = 5
	b_nb = 0
	m_nb = 0
	til_nb = 0
	dtil_nb = 0
	cw_nb = 0
	tilcw_nb= 0

	while i < reps:
	TrainMachine()
	b_nb += TestMachine_Bern()/reps
	m_nb += TestMachine(0,0,0,0,0)/reps
	til_nb += TestMachine(1,1,1,0,0)/reps
	dtil_nb += TestMachine(1,1,1,2,0)/reps
	cw_nb += TestMachine(0,0,0,1,1)/reps
	tilcw_nb += TestMachine(1,1,1,1,1)/reps
	i+=1
	# print " Bern: %0.6f\n Mult: %0.6f\n TIL : %0.6f\n DTIL: %0.6f\n CW : %0.6f\n TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb)
	return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb]


	# li = Preprocessor.get_testset_trainset(corpus)

	##5)Loop through the training set, to get the entire text from each file
	##6) Parse the string to get individual words
	def TrainMachine():
	global cat_word_dict
	global cat_word_count_dict
	global num_docs_word_in
	global word_cat_num_doc_dict
	global li
	global testset
	global trainset
	global cat_num_docs
	global vocab_length

	li = SplitData()
	testset = li[0][1]
	trainset = li[0][0]
	cat_num_docs = li[2]

	for file_name in trainset:
	list_words = tokenize(file_name)


	##7) Check if category exists in dictionary, if not, create an empty dictionary,
	#and put word count as zero
	#and then insert words into the category's dictionary in both cases and update the word count
	cat = ''
	if AMAZON or TWITTER:
	cat = REVIEW_POL[file_name]
	else:
	cat = mr.categories(fileids = file_name)[0]
	cat_word_dict[cat] = cat_word_dict.get(cat,{})
	cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)

	# add number of words to total word count for cat
	cat_word_count_dict[cat]+=len(list_words)
	# start count for number of occurences for each word
	counted = []
	for w in list_words:
	cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
	cat_word_dict[cat][w]+=1
	if w not in counted:
	counted.append(w)
	num_docs_word_in[w] = num_docs_word_in.get(w, 0)
	num_docs_word_in[w] += 1

	word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
	word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
	word_cat_num_doc_dict[w][cat]+=1

	for dic in cat_word_dict.values():
	vocab_length+=len(dic)
	for w in word_cat_num_doc_dict:
	for cat in cat_num_docs:
	nct = word_cat_num_doc_dict[w].get(cat,0)
	# convert #times a word appears into #times+1/#cat_reviews+2
	ratio = (nct+1)/(cat_num_docs[cat]+2)
	word_cat_num_doc_dict[w][cat]=ratio


	# ##8) Get the vocabulary length
	# ## number of words, total across categories
	# vocab_length=0




	# ####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset
	# length_train = len(trainset)
	# print "length of training set ", length_train


	def TestMachine(t, i, l, c, w):
	#9) Like in the training set,Loop through the test set, to get the entire text from each file
	##10) Similar step, parse the string to get individual words
	global trainset
	global testset
	TF = t # 1 - log term frew
	IDF = i # 1 - idf
	LENGTH = l # 1 - doc length adjust
	COMPLEMENT = c # 1 - just comp, 2 - delta / one-v-all
	WEIGHTED = w # 1 - adjust weights
	length_train = len(trainset)
	# print "length train " , length_train, len(testset)
	li_results=[]

	for file_name in testset:
	# print "File: ", file_name
	# minimum_neg_log_prob=1000000000
	minimum_neg_log_prob = -1000000000 # NEW
	min_category=''
	list_words = tokenize(file_name)
	# print file_name



	##11) Get the probability for each category,
	#can use any of the created dictionaries to wade through the categories
	for cat in cat_word_count_dict:
	# print cat , cat_num_docs[cat]/len(trainset)
	# print "________________________________________________________________"
	# print "________________________________________________________________"
	# print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
	# neg_log_prob=-log(cat_num_docs[cat]/length_train)
	inv_cat = 'pos'
	if cat == 'pos':
	inv_cat = 'neg'

	neg_log_prob = log(cat_num_docs[cat]/length_train, 2)

	# neg_log_prob = cat_num_docs[cat]/length_train
	opp_word_dict = cat_word_dict[inv_cat]
	opp_count_cat = cat_word_count_dict[inv_cat]

	word_dict = cat_word_dict[cat]
	count_cat = cat_word_count_dict[cat]
	## get frequency counts
	my_word_count = {}
	for aw in list_words:
	my_word_count[aw] = my_word_count.get(aw, 0)
	my_word_count[aw]+=1

	## calculate necessary norms
	length_norm = 0
	weight_normalizing_ratio = 0
	opp_weight_normalizing_ratio = 0
	for kw in my_word_count.keys():
	count_word_train=word_dict.get(kw,0)
	ratio = (count_word_train+1)/(count_cat+vocab_length)

	# if COMPLEMENT:
	opp_count_word_train=opp_word_dict.get(kw,0)
	opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)

	# weight norm
	weight_normalizing_ratio += abs(log(ratio, 2))
	opp_weight_normalizing_ratio += abs(log(opp_ratio, 2))

	if TF:
	my_word_count[kw] = log(1 + my_word_count[kw])

	if IDF:
	my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,0.01), 2) #IDF
	## length norm
	w_freq = my_word_count[kw]
	length_norm += pow(w_freq, 2)

	length_norm = pow(length_norm, 0.5)
	# print "LN: ", length_norm

	for w in my_word_count.keys():
	count_word_train=word_dict.get(w,0)
	ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+\|V\| = theta_c

	# if COMPLEMENT:
	opp_count_word_train=opp_word_dict.get(w,0)
	opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)

	word_freq = my_word_count[w]

	if LENGTH:
	word_freq = word_freq/length_norm # length normalization


	ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w
	opp_ratio = log(opp_ratio, 2)

	if WEIGHTED:
	ratio = ratio/weight_normalizing_ratio # weight normalization
	opp_ratio = opp_ratio/opp_weight_normalizing_ratio

	if COMPLEMENT == 1: # just complement
	neg_log_prob -= word_freq*opp_ratio
	else:
	neg_log_prob += word_freq*ratio # class probability
	if COMPLEMENT == 2: # one-v-all
	neg_log_prob += word_freq*ratio

	# break
	# print "NLP: ", neg_log_prob
	# print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
	# if minimum_neg_log_prob>neg_log_prob:
	if minimum_neg_log_prob<neg_log_prob:
	min_category=cat
	minimum_neg_log_prob=neg_log_prob
	# print "Min cat: ", min_category

	if AMAZON or TWITTER:
	li_results.append((file_name,min_category,REVIEW_POL[file_name]))
	else:
	li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
	# break


	#12) Evaluating the classifier

	precision = CalculateAccuracy(li_results)
	return precision

	def TestMachine_Bern():
	li_results=[]
	#5) Like in the training set,Loop through the test set, to get the individual words
	for file_name in testset:
	minimum_neg_log_prob=1000000000
	min_category=''
	set_list_words = set(tokenize(file_name))

	##6) Get the probability for each category,
	#using the cat_num_docs dictionary to wade through the categories
	for cat in cat_num_docs:
	neg_log_prob=-log(cat_num_docs[cat]/len(trainset))
	for w in word_cat_num_doc_dict:
	if w in set_list_words:
	neg_log_prob-=log(word_cat_num_doc_dict[w][cat])
	else:
	neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat])
	if minimum_neg_log_prob>neg_log_prob:
	min_category=cat
	minimum_neg_log_prob=neg_log_prob

	if AMAZON or TWITTER:
	li_results.append((file_name,min_category,REVIEW_POL[file_name]))
	else:
	li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
	# break
	precision = CalculateAccuracy(li_results)
	return precision


	RunWholeThing()