
Commit

functions
i did a thing - but i did it at 2 am so this might be bad
asp10012 committed Apr 20, 2016
1 parent d4e334b commit 47d86ab
Showing 4 changed files with 1,089 additions and 0 deletions.
254 changes: 254 additions & 0 deletions OLD_VERSIONS/CWMNB.py
@@ -0,0 +1,254 @@
######################
# This version is CWMNB only
######################

from __future__ import division
from math import log
import re
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
import random
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data

COMPLEMENT = 0
WEIGHTED = 0
USE_IDF = 0
AMAZON = 0
REVIEW_POL={}
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 600, 'neg': 600}
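# Flag summary (as read from the code below):
#   COMPLEMENT   - score a class using the complement class's word counts (complement NB)
#   WEIGHTED     - divide each word's log weight by the document's summed |log weight|
#   USE_IDF      - multiply term frequencies by log(num training docs / document frequency)
#   AMAZON       - read labelled reviews from amazon_revs.csv instead of nltk movie_reviews
#   DEFINED_SIZE - use the fixed per-category training sizes in DEFINED_SIZES instead of SPLIT_AMOUNT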
def SplitData():
    type_dict={}
    docs_count={}
    train_test = [[],[]]
    offset_sample = random.randint(-400,400)
    print "offset_sample", offset_sample
    if AMAZON:
        offset_sample = random.randint(-600,600)
        for category in ['pos', 'neg']:
            type_dict[category]=[]
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    else:
        for category in mr.categories():
            type_dict[category]=mr.fileids(categories=category)
    for cat in type_dict.keys():
        li = type_dict[cat]
        random.shuffle(li)
        size=int(len(li)*SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            size = DEFINED_SIZES[cat]
        print "Category: ", cat, "Size:", size
        offset_sample *= -1
        docs_count[cat]=size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test,type_dict, docs_count]
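# SplitData() returns [[train_ids, test_ids], {category: all ids}, {category: training size}];
# the module-level code below unpacks it as li[0][0], li[0][1] and li[2].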

def tokenize(file_name):
    list_words = []
    if AMAZON:
        list_words = re.split(r'\W+',file_name)
    else:
        list_words = re.split(r'\W+',mr.raw(fileids=file_name))

    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
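# Illustrative example: tokenize applied to the raw text "This movie was GREAT!!"
# yields ['movie', 'great'] -- punctuation is split away, and single letters and
# stopwords ('this', 'was') are filtered out.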


def CalculateAccuracy(li_results):
    a=0
    b=0
    c=0
    d=0
    cat = li_results[0][1]
    for t in li_results:
        if cat==t[1]:
            if cat==t[2]:
                a+=1
            else:
                b+=1
        else:
            if cat==t[2]:
                c+=1
            else:
                d+=1
    precision = a/(a+b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category " , cat
    print "precision =", precision
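# In CalculateAccuracy, cat is the predicted label of the first test document and each
# tuple t is (file_name, predicted, gold): a = predicted cat and truly cat, b = predicted
# cat but truly the other class, c = truly cat but predicted otherwise, d = neither.
# precision = a/(a+b); overall accuracy would be (a+d)/(a+b+c+d).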

# li = Preprocessor.get_testset_trainset(corpus)
li = SplitData()
# exit()
testset = li[0][1]
trainset = li[0][0]
# li = Preprocessor.startup()
cat_num_docs = li[2]
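# cat_num_docs maps each category to the number of its training documents;
# it is used below only for the class prior log(cat_num_docs[cat]/length_train).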



##4) Create a) a dictionary with a category as the key and a dictionary of word occurrences as its value
#b) a dictionary with a category as the key and the number of words in it as the value
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
#val = my_dict.get(key, mydefaultval)
complete_training_docs_tokens = []

##5) Loop through the training set to get the entire text from each file
##6) Parse the string to get individual words
for file_name in trainset:
    list_words = tokenize(file_name)
    complete_training_docs_tokens.append(list_words)


    ##7) Check if category exists in dictionary, if not, create an empty dictionary,
    #and put word count as zero
    #and then insert words into the category's dictionary in both cases and update the word count
    cat = ''
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids = file_name)[0]
    cat_word_dict[cat] = cat_word_dict.get(cat,{})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)

    # add number of words to total word count for cat
    cat_word_count_dict[cat]+=len(list_words)
    # start count for number of occurrences for each word
    for w in list_words:
        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
        cat_word_dict[cat][w]+=1
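    # At this point cat_word_dict[cat][w] holds the raw training count of word w in
    # category cat, and cat_word_count_dict[cat] holds the total number of filtered
    # tokens seen for that category.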



##8) Get the vocabulary length
## number of words, total across categories
vocab_length=0
num_docs_word_in = {}
for dic in cat_word_dict.values():
    vocab_length+=len(dic)
    if USE_IDF:
        for uniq_word in dic.keys():
            num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1)
            num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr)
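# vocab_length sums the per-category vocabulary sizes (a word seen in both classes is
# counted twice) and is used as the smoothing denominator |V| below. With USE_IDF,
# num_docs_word_in maps each training word to its document frequency (the initial default
# of 1 is immediately overwritten); the later num_docs_word_in.get(w, 1) keeps the IDF
# term defined for words missing from it.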



#### Congratulations! The classifier is trained; now run the Multinomial Naive Bayes classifier on the test dataset
length_train = len(trainset)
li_results=[]
#9) As with the training set, loop through the test set to get the entire text from each file
##10) Similar step, parse the string to get individual words
for file_name in testset:
    # print "File: ", file_name
    # minimum_neg_log_prob=1000000000
    minimum_neg_log_prob = -1000000000 # NEW
    min_category=''
    list_words = tokenize(file_name)



    ##11) Get the probability for each category,
    #can use any of the created dictionaries to wade through the categories
    for cat in cat_word_count_dict:
        # print cat , cat_num_docs[cat]/len(trainset)
        # print "________________________________________________________________"
        # print "________________________________________________________________"
        # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
        # neg_log_prob=-log(cat_num_docs[cat]/length_train)
        inv_cat = 'pos'
        if cat == 'pos':
            inv_cat = 'neg'

        neg_log_prob = log(cat_num_docs[cat]/length_train)
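        # Despite its name (kept from the commented-out negated version above), neg_log_prob
        # starts here as the plain class log prior log P(c) and accumulates per-word terms below.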

        # neg_log_prob = cat_num_docs[cat]/length_train
        opp_word_dict = cat_word_dict[inv_cat]
        opp_count_cat = cat_word_count_dict[inv_cat]

        word_dict = cat_word_dict[cat]
        count_cat = cat_word_count_dict[cat]

        my_word_count = {}
        for aw in list_words:
            my_word_count[aw] = my_word_count.get(aw, 0)
            my_word_count[aw]+=1
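        # my_word_count now maps each token of this test document to its raw term frequency.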

        length_norm = 0
        weight_normalizing_ratio = 0
        for kw in my_word_count.keys():
            count_word_train=word_dict.get(kw,0)
            ratio = (count_word_train+1)/(count_cat+vocab_length)

            if COMPLEMENT:
                count_word_train=opp_word_dict.get(kw,0)
                ratio = (count_word_train+1)/(opp_count_cat+vocab_length)

            # weight norm
            weight_normalizing_ratio += abs(log(ratio))
            ## TF
            # my_word_count[kw] = log(my_word_count[kw]+1)
            ## length norm
            # length_norm += (my_word_count[kw]**(2))

        # length_norm = length_norm**(0.5)
        # print "WNR: ", weight_normalizing_ratio
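        # weight_normalizing_ratio is the sum of |log(smoothed ratio)| over this document's
        # distinct words; when WEIGHTED is set it rescales each word's weight below, along the
        # lines of the weight-normalization step in Rennie et al.'s transformed complement NB.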

        for w in my_word_count.keys():
            count_word_train=word_dict.get(w,0)
            ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c

            if COMPLEMENT:
                count_word_train=opp_word_dict.get(w,0)
                ratio = (count_word_train+1)/(opp_count_cat+vocab_length)

            word_freq = my_word_count[w]

            if USE_IDF:
                word_freq = word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF
            # word_freq = word_freq/length_norm # length normalization


            ratio = log(ratio) # weight factor log(theta_c) = weight_c,w

            if WEIGHTED:
                ratio = ratio/weight_normalizing_ratio # weight normalization

            if COMPLEMENT:
                neg_log_prob -= word_freq*ratio
            else:
                neg_log_prob += word_freq*ratio # class probability

            # break
        # print "NLP: ", neg_log_prob
        # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
        # if minimum_neg_log_prob>neg_log_prob:
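        # NOTE: despite the variable name, this comparison keeps the class with the LARGEST
        # score (most probable); the commented-out line above is the old smallest-negative-log rule.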
        if minimum_neg_log_prob<neg_log_prob:
            min_category=cat
            minimum_neg_log_prob=neg_log_prob
        # print "Min cat: ", min_category

    if AMAZON:
        li_results.append((file_name,min_category,REVIEW_POL[file_name]))
    else:
        li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break

###--------------------DEBUG STATEMENTS----------------------
#for t in li_results:
# if t[1]!=t[2]:
# print t
###--------------------DEBUG STATEMENTS----------------------

###--------------------DEBUG STATEMENTS----------------------

#12) Evaluating the classifier

CalculateAccuracy(li_results)
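# ----------------------------------------------------------------------------------
# Reference sketch (never called): a condensed restatement of the scoring loop above
# for one document and one class, using the trained module-level dictionaries. The
# function and argument names are illustrative only, and the USE_IDF option is omitted.
def score_document_sketch(doc_words, cat, complement=False, weighted=False):
    inv_cat = 'neg' if cat == 'pos' else 'pos'
    src = inv_cat if complement else cat                 # which class's counts feed theta
    counts = cat_word_dict[src]
    total = cat_word_count_dict[src]
    tf = {}
    for w in doc_words:                                  # raw term frequencies
        tf[w] = tf.get(w, 0) + 1
    # smoothed per-word log weights log((N_w + 1) / (N + |V|))
    weights = dict((w, log((counts.get(w, 0) + 1)/(total + vocab_length))) for w in tf)
    norm = sum(abs(v) for v in weights.values()) if weighted else 1.0
    score = log(cat_num_docs[cat]/length_train)          # class log prior
    for w, f in tf.items():
        term = f*weights[w]/norm
        score += -term if complement else term           # complement NB subtracts the weights
    return score                                         # classify with the largest score
# e.g. max(['pos', 'neg'], key=lambda c: score_document_sketch(tokenize(f), c, COMPLEMENT, WEIGHTED))
# mirrors the argmax performed by the test loop above.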
