Commit
clean up

placed all NB implementations in one file and moved a bunch of stuff around
asp10012 committed May 3, 2016
1 parent 0a26108 commit 14dc205
Showing 10 changed files with 767 additions and 36 deletions.
161 changes: 129 additions & 32 deletions TWCNB_v0_2.py → Naive_bayes.py
@@ -20,8 +20,10 @@ SHUFFLE = 1
# TF = 0 # 1 - log term freq
# IDF = 0 # 1 - idf
# LENGTH = 0 # 1 - doc length adjust
AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
OFFSET = 0 # 0 - use random data size offset, 1 - nope
AMAZON = 0 # 1 - use Amazon set
TWITTER = 0 # 1 - use Twitter set
TWEET_LIMIT = 5000 # the full set is too big to use, so stop after this many positive and negative tweets
OFFSET = 0 # introduced offset (skew) in datasets
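# NOTE: with AMAZON = 0 and TWITTER = 0 the NLTK movie_reviews corpus is
# used. RunWholeThing() below overwrites these flags itself, so the values
# set here only matter when calling the train/test routines by hand.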

REVIEW_POL={}

@@ -42,6 +44,7 @@ cat_word_count_dict={}
num_docs_word_in = {}
vocab_length=0

word_cat_num_doc_dict={}


def SplitData():
@@ -51,16 +54,30 @@ def SplitData():
train_test = [[],[]]
# offset_sample = random.randint(-400,400)
offset_sample = OFFSET
print "offset_sample", offset_sample
# print "offset_sample", offset_sample
categories = ['neg', 'pos']
if AMAZON:
# offset_sample = random.randint(-600,600)
for category in ['pos', 'neg']:
for category in categories:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
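# Assumed amazon_revs.csv layout (inferred from the indices above): each
# row is (review_text, label) with label in {'pos', 'neg'}, apparently
# with no header row since nothing is skipped.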
elif TWITTER:
for category in categories:
type_dict[category]=[]
with open('tweets.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
rev_read.next() # skip header row
number = [0,0]
for row in rev_read:
type_dict[ categories[ int(row[1]) ] ].append(row[3].strip())
REVIEW_POL[row[3].strip()] = categories[int(row[1])]
number[int(row[1])] += 1
if (number[0]>TWEET_LIMIT and number[1]>TWEET_LIMIT):
break
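# Assumed tweets.csv layout (inferred from the indices above): a header
# row, then rows whose column 1 holds the 0/1 polarity label and column 3
# the tweet text, e.g. a hypothetical row: 1,0,Sentiment140,"is so sad...".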
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
@@ -73,7 +90,7 @@ def SplitData():
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
# if DEFINED_SIZE:
# size = DEFINED_SIZES[cat]
print "Category: ", cat, "Size:", size
# print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
@@ -82,7 +99,7 @@

def tokenize(file_name):
list_words = []
if AMAZON:
if AMAZON or TWITTER:
list_words = re.split(r'\W+',file_name)
else:
list_words = re.split(r'\W+',mr.raw(fileids=file_name))
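# A quick sanity check of the tokenizer; note the empty string re.split
# leaves behind when the text ends in punctuation:
#   >>> re.split(r'\W+', "Great movie, great cast!")
#   ['Great', 'movie', 'great', 'cast', '']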
@@ -110,59 +127,99 @@ def CalculateAccuracy(li_results):
precision = a/(a+b)
# recall = a/(a+c)
# print "The following parameters are recorded for the category " , cat
print "precision =", precision
# print "precision =", precision
return precision
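# Reading of the fragment above: with a = true positives and b = false
# positives this is standard precision TP/(TP+FP); the matching recall
# a/(a+c) is computed the same way but left commented out.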

def RunWholeThing():
global AMAZON
global TWITTER
global OFFSET
global DEFINED_SIZE
global DEFINED_SIZES
OFFSET = 0
AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
TWITTER = 0
tested = [' Bern:', ' Mult:', ' TIL :', ' DTIL:', ' CW :', ' TIWC:']
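# Label-to-call mapping, per DoTheThing() below: Bern = TestMachine_Bern(),
# Mult = TestMachine(0,0,0,0,0), TIL = TestMachine(1,1,1,0,0),
# DTIL = TestMachine(1,1,1,2,0), CW = TestMachine(0,0,0,1,1),
# TIWC = TestMachine(1,1,1,1,1). The first three flags are TF, IDF and
# length-norm; reading c and w as complement and weight-norm (as in the
# TWCNB paper) is an inference from the old file name.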
while OFFSET < 400:
ans = DoTheThing()
print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Movies @ +/-', OFFSET
print "_____________________________________________________"
OFFSET = -1*OFFSET
ans2 = DoTheThing()
print "Movies with ", OFFSET
ans = DoTheThing()
OFFSET = -1*OFFSET
print ans , ans2
OFFSET += 100
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1

OFFSET = -1*OFFSET

OFFSET += 300

OFFSET = 0
AMAZON = 1


while OFFSET < 600:
print "Amazon with ", OFFSET
ans = DoTheThing()
print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Amazon @ +/-', OFFSET
print "_____________________________________________________"
OFFSET = -1*OFFSET
ans2 = DoTheThing()
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1

OFFSET = -1*OFFSET

OFFSET += 400

OFFSET = 0
AMAZON = 0
TWITTER = 1

while OFFSET < 1000:
print "Twitter with ", OFFSET
ans = DoTheThing()
OFFSET = -1*OFFSET
print ans , ans2
OFFSET += 100
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1
OFFSET = -1*OFFSET
OFFSET += 800



def DoTheThing():
i = 0
reps = 5

b_nb = 0
m_nb = 0
ti_nb = 0
til_nb = 0
dtil_nb = 0
cw_nb = 0
tilcw_nb= 0

while i < reps:
TrainMachine()
m_nb += TestMachine(0,0,0,0,0)/5
ti_nb += TestMachine(1,1,0,0,0)/5
til_nb += TestMachine(1,1,1,0,0)/5
cw_nb += TestMachine(0,0,0,1,1)/5
tilcw_nb += TestMachine(1,1,1,1,1)/5
b_nb += TestMachine_Bern()/reps
m_nb += TestMachine(0,0,0,0,0)/reps
til_nb += TestMachine(1,1,1,0,0)/reps
dtil_nb += TestMachine(1,1,1,2,0)/reps
cw_nb += TestMachine(0,0,0,1,1)/reps
tilcw_nb += TestMachine(1,1,1,1,1)/reps
i+=1
return (m_nb, ti_nb, til_nb, cw_nb, tilcw_nb)
# print " Bern: %0.6f\n Mult: %0.6f\n TIL : %0.6f\n DTIL: %0.6f\n CW : %0.6f\n TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb)
return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb]
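# Each DoTheThing() call reshuffles, retrains and tests reps = 5 times and
# returns the averaged scores in the order of `tested` above, e.g. with
# made-up numbers:
#   >>> DoTheThing()
#   [0.81, 0.79, 0.84, 0.83, 0.82, 0.85]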


# li = Preprocessor.get_testset_trainset(corpus)
@@ -173,6 +230,7 @@ def TrainMachine():
global cat_word_dict
global cat_word_count_dict
global num_docs_word_in
global word_cat_num_doc_dict
global li
global testset
global trainset
@@ -192,7 +250,7 @@ def TrainMachine():
#and put word count as zero
#and then insert words into the category's dictionary in both cases and update the word count
cat = ''
if AMAZON:
if AMAZON or TWITTER:
cat = REVIEW_POL[file_name]
else:
cat = mr.categories(fileids = file_name)[0]
@@ -211,8 +269,18 @@ def TrainMachine():
num_docs_word_in[w] = num_docs_word_in.get(w, 0)
num_docs_word_in[w] += 1

word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
word_cat_num_doc_dict[w][cat]+=1

for dic in cat_word_dict.values():
vocab_length+=len(dic)
vocab_length+=len(dic)
for w in word_cat_num_doc_dict:
for cat in cat_num_docs:
nct = word_cat_num_doc_dict[w].get(cat,0)
# Laplace smoothing: (#docs in cat containing w + 1) / (#docs in cat + 2)
ratio = (nct+1)/(cat_num_docs[cat]+2)
word_cat_num_doc_dict[w][cat]=ratio
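# Worked example with made-up counts: if a word occurs in 40 of 98
# positive training docs, its smoothed estimate is (40+1)/(98+2) = 0.41;
# a word absent from a category still gets 1/(n_cat+2) > 0, so the log()
# calls in TestMachine_Bern below never see zero.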


# ##8) Get the vocabulary length
@@ -297,7 +365,7 @@ def TestMachine(t, i, l, c, w):
my_word_count[kw] = log(1 + my_word_count[kw])

if IDF:
my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,1), 2) #IDF
my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,0.01), 2) #IDF
## length norm
w_freq = my_word_count[kw]
length_norm += pow(w_freq, 2)
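# Worked example with made-up counts: a word seen 3 times in this doc and
# present in 10 of 1000 training docs gets TF = log(1+3) ~= 1.39, then
# TF-IDF = 1.39 * log2(1000/10) ~= 9.2, and contributes 9.2**2 to the
# squared length norm accumulated here.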
@@ -342,7 +410,7 @@ def TestMachine(t, i, l, c, w):
minimum_neg_log_prob=neg_log_prob
# print "Min cat: ", min_category

if AMAZON:
if AMAZON or TWITTER:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
@@ -354,5 +422,34 @@ def TestMachine(t, i, l, c, w):
precision = CalculateAccuracy(li_results)
return precision

def TestMachine_Bern():
li_results=[]
#5) As in the training pass, loop through the test set to get the individual words
for file_name in testset:
minimum_neg_log_prob=1000000000
min_category=''
set_list_words = set(tokenize(file_name))

##6) Get the probability for each category,
#using the cat_num_docs dictionary to wade through the categories
for cat in cat_num_docs:
neg_log_prob=-log(cat_num_docs[cat]/len(trainset))
for w in word_cat_num_doc_dict:
if w in set_list_words:
neg_log_prob-=log(word_cat_num_doc_dict[w][cat])
else:
neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat])
if minimum_neg_log_prob>neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
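# Unlike the multinomial variants above, this Bernoulli score walks the
# whole vocabulary and charges -log(1 - P(w|cat)) for every training word
# absent from the document, which makes it noticeably slower per document.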

if AMAZON or TWITTER:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
precision = CalculateAccuracy(li_results)
return precision


RunWholeThing()
File renamed without changes.
File renamed without changes.
File renamed without changes.
