Commit

clean up

TWCNB.py now has control variables for all variations/implementations of the algorithm, including delta TF-IDF
asp10012 committed Apr 18, 2016
1 parent 971ba40 commit d4e334b
Showing 2 changed files with 80 additions and 42 deletions.
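For reference, the control variables introduced in TWCNB.py mirror the transforms described in Rennie et al. (2003), "Tackling the Poor Assumptions of Naive Bayes Text Classifiers": log term frequency (TF), inverse document frequency (IDF), document length normalization (LENGTH), complement counts, and weight normalization (WEIGHTED). The sketch below is not the repository's code; it is a minimal, hypothetical illustration of how such flags typically combine into a complement-NB score. The names (twcnb_score, counts, totals, doc_freq) are invented here.

# Hypothetical sketch, not TWCNB.py itself: one possible flag-controlled
# complement-NB scorer in the spirit of Rennie et al. (2003).
from __future__ import division
from math import log

def twcnb_score(doc_words, cls, counts, totals, vocab_size, doc_freq, n_docs,
                TF=True, IDF=True, LENGTH=True, WEIGHTED=True):
    # counts[c][w] : training count of word w in class c
    # totals[c]    : total word tokens in class c
    # vocab_size   : |V|, used for Laplace smoothing
    # doc_freq[w]  : number of training documents containing w
    # n_docs       : number of training documents
    # Lower score = better fit (class prior term omitted for brevity).
    freq = {}
    for w in doc_words:                       # raw term frequencies of the test doc
        freq[w] = freq.get(w, 0) + 1
    if TF:                                    # dampen counts: d = log(1 + tf)
        freq = dict((w, log(1 + f)) for w, f in freq.items())
    if IDF:                                   # rare words weigh more
        freq = dict((w, f * log(n_docs / doc_freq.get(w, 1))) for w, f in freq.items())
    if LENGTH:                                # L2 document length normalization
        norm = sum(f * f for f in freq.values()) ** 0.5 or 1.0
        freq = dict((w, f / norm) for w, f in freq.items())

    comp_total = sum(t for c, t in totals.items() if c != cls)   # complement totals
    def comp_count(w):
        return sum(counts[c].get(w, 0) for c in counts if c != cls)

    weights = dict((w, log((comp_count(w) + 1) / (comp_total + vocab_size)))
                   for w in freq)             # smoothed complement log-probabilities
    if WEIGHTED:                              # weight normalization
        denom = sum(abs(v) for v in weights.values()) or 1.0
        weights = dict((w, v / denom) for w, v in weights.items())

    return sum(freq[w] * weights[w] for w in freq)

# toy usage: the class with the smallest complement score wins
counts = {'pos': {'good': 3, 'bad': 1}, 'neg': {'good': 1, 'bad': 4}}
totals = {'pos': 4, 'neg': 5}
doc_freq = {'good': 3, 'bad': 3}
scores = dict((c, twcnb_score(['good', 'good', 'bad'], c, counts, totals,
                              vocab_size=2, doc_freq=doc_freq, n_docs=5))
              for c in counts)
print(min(scores, key=scores.get))            # -> 'pos'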
4 changes: 2 additions & 2 deletions MNB.py
@@ -8,10 +8,10 @@ from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data

-AMAZON = 1
+AMAZON = 0
REVIEW_POL={}
DEFINED_SIZE = 1
-DEFINED_SIZES = {'pos': 948, 'neg': 948}
+DEFINED_SIZES = {'pos': 600, 'neg': 600}

def SplitData():
type_dict={}
118 changes: 78 additions & 40 deletions TWCNB.py
@@ -1,3 +1,8 @@
+######################
+# Full version with all variations included
+# To improve: create a main function allowing for multiple runs
+######################
+
from __future__ import division
from math import log
import re
@@ -8,11 +13,17 @@ import random
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data

-USE_IDF = 1
-AMAZON = 1
+COMPLEMENT = 2 # 1 - just comp, 2 - delta / one-v-all
+WEIGHTED = 0 # 1 - adjust weights
+TF = 0 # 1 - log term freq
+IDF = 0 # 1 - idf
+LENGTH = 0 # 1 - doc length adjust
+AMAZON = 1 # 0 - use movie_reviews, 1 - use Amazon set
+NO_OFF = 1 # 0 - use random data size offset, 1 - nope
+DEFINED_SIZE = 0 # 1 - use DEFINED_SIZES for pos, neg sets
+DEFINED_SIZES = {'pos': 700, 'neg': 1100}

REVIEW_POL={}
-DEFINED_SIZE = 1
-DEFINED_SIZES = {'pos': 948, 'neg': 948}
def SplitData():
type_dict={}
docs_count={}
@@ -31,6 +42,8 @@ def SplitData():
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
+if NO_OFF:
+offset_sample = 0
for cat in type_dict.keys():
li = type_dict[cat]
random.shuffle(li)
@@ -94,9 +107,10 @@ cat_word_dict={}
cat_word_count_dict={}
#val = my_dict.get(key, mydefaultval)
complete_training_docs_tokens = []

+num_docs_word_in = {}
##5) Loop through the training set, to get the entire text from each file
##6) Parse the string to get individual words

for file_name in trainset:
list_words = tokenize(file_name)
complete_training_docs_tokens.append(list_words)
@@ -114,36 +128,37 @@ for file_name in trainset:
cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)

# add number of words to total word count for cat
-cat_word_count_dict[cat]+=len(list_words)
-# start count for number of occurrences for each word
+cat_word_count_dict[cat]+=len(list_words)
+# start count for number of occurrences for each word
+counted = []
for w in list_words:
cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
cat_word_dict[cat][w]+=1
+if w not in counted:
+counted.append(w)
+num_docs_word_in[w] = num_docs_word_in.get(w, 0)
+num_docs_word_in[w] += 1



##8) Get the vocabulary length
## number of words, total across categories
vocab_length=0
-num_docs_word_in = {}

for dic in cat_word_dict.values():
vocab_length+=len(dic)
-if USE_IDF:
-for uniq_word in dic.keys():
-num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1)
-num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr)




####Congratulations! The classifier is trained; now it is time to run the Multinomial Naive Bayes classifier on the test dataset
length_train = len(trainset)
print "length of training set ", length_train
li_results=[]
#9) Like in the training set, loop through the test set, to get the entire text from each file
##10) Similar step, parse the string to get individual words
for file_name in testset:
# print "File: ", file_name
-minimum_neg_log_prob=1000000000
-# minimum_neg_log_prob = 0 # NEW
+# minimum_neg_log_prob=1000000000
+minimum_neg_log_prob = -1000000000 # NEW
min_category=''
list_words = tokenize(file_name)

@@ -161,12 +176,14 @@ for file_name in testset:
if cat == 'pos':
inv_cat = 'neg'


-neg_log_prob=log(cat_num_docs[cat]/length_train)
+neg_log_prob = log(cat_num_docs[cat]/length_train)

# neg_log_prob = cat_num_docs[cat]/length_train
-word_dict = cat_word_dict[inv_cat]
-count_cat = cat_word_count_dict[inv_cat]
+opp_word_dict = cat_word_dict[inv_cat]
+opp_count_cat = cat_word_count_dict[inv_cat]

+word_dict = cat_word_dict[cat]
+count_cat = cat_word_count_dict[cat]

my_word_count = {}
for aw in list_words:
@@ -175,47 +192,68 @@

length_norm = 0
weight_normalizing_ratio = 0
+opp_weight_normalizing_ratio = 0
for kw in my_word_count.keys():
-count_word_train=word_dict.get(kw,0)
+count_word_train=word_dict.get(kw,0)
ratio = (count_word_train+1)/(count_cat+vocab_length)
-## weight norm
-weight_normalizing_ratio+=log(ratio)
-## TF
-my_word_count[kw] = log(my_word_count[kw]+1)

+# if COMPLEMENT:
+opp_count_word_train=opp_word_dict.get(kw,0)
+opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)

+# weight norm
+weight_normalizing_ratio += abs(log(ratio))
+opp_weight_normalizing_ratio += abs(log(opp_ratio))

+if TF:
+my_word_count[kw] = log(1 + my_word_count[kw])

+if IDF:
+my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,1)) #IDF
## length norm
-length_norm += (my_word_count[kw]**(2))
+w_freq = my_word_count[kw]
+length_norm += (w_freq * w_freq)

length_norm = length_norm**(0.5)
# print "WNR: ", weight_normalizing_ratio

for w in my_word_count.keys():
count_word_train=word_dict.get(w,0)
ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c
# neg_log_prob-=log(ratio)

+# if COMPLEMENT:
+opp_count_word_train=opp_word_dict.get(w,0)
+opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)

word_freq = my_word_count[w]
-if USE_IDF:
-word_freq = word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF
-word_freq = word_freq/length_norm # length normalization

# neg_log_prob += word_freq*log(ratio) #switch to
+if LENGTH:
+word_freq = word_freq/length_norm # length normalization


ratio = log(ratio) # weight factor log(theta_c) = weight_c,w
-ratio = ratio/weight_normalizing_ratio # weight normalization
-neg_log_prob += word_freq*ratio # class probability
+opp_ratio = log(opp_ratio)

+if WEIGHTED:
+ratio = ratio/weight_normalizing_ratio # weight normalization
+opp_ratio = opp_ratio/opp_weight_normalizing_ratio

+if COMPLEMENT == 1: # just complement
+neg_log_prob -= word_freq*opp_ratio
+else:
+neg_log_prob += word_freq*ratio # class probability
+if COMPLEMENT == 2: # one-v-all
+neg_log_prob += word_freq*ratio

# neg_log_prob *= ratio
# print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
# break
# print "NLP: ", neg_log_prob
# print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
-if minimum_neg_log_prob>neg_log_prob:
-# if minimum_neg_log_prob<neg_log_prob:
+# if minimum_neg_log_prob>neg_log_prob:
+if minimum_neg_log_prob<neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
# print "Min cat: ", min_category
# correct_cat = 'pos'
# if file_name in all_review_cats['neg']:
# correct_cat = 'neg'

if AMAZON:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
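The COMPLEMENT switch above separates a pure complement score from the "delta" / one-versus-all combination. As a point of reference only (the exact arithmetic in TWCNB.py may differ), the two decision rules as described in Rennie et al. (2003) can be sketched as below; the names freq, w_class and w_comp are illustrative, standing for the transformed term frequencies and the (weight-normalized) log-probabilities of the class itself and of its complement.

# Reference sketch only; not the repository's code.
def complement_score(freq, w_comp):
    # CNB: pick the class whose *complement* explains the document worst,
    # i.e. the class with the largest value of this sum.
    return -sum(f * w_comp[t] for t, f in freq.items())

def one_vs_all_score(freq, w_class, w_comp):
    # delta / one-v-all: per-term contrast between the class's own weights
    # and its complement's weights; pick the class with the largest value.
    return sum(f * (w_class[t] - w_comp[t]) for t, f in freq.items())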
