######################
# Full version with all variations included
# To improve: create a main function allowing for multiple runs
# TF works properly!
######################
from __future__ import division
from math import log
from math import pow
import re
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
import random
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data
SHUFFLE = 1
# COMPLEMENT = 0 # 1 - just comp, 2 - delta / one-v-all
# WEIGHTED = 0 # 1 - adjust weights
# TF = 0 # 1 - log term freq
# IDF = 0 # 1 - idf
# LENGTH = 0 # 1 - doc length adjust
AMAZON = 0 # 1 - use Amazon set
TWITTER = 0 # 1 - use Twitter set
TWEET_LIMIT = 5000 # the full Twitter set is too big to use, so cap how many positive and negative tweets are loaded
OFFSET = 0 # introduced offset (skew) in datasets
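# Example of the skew (illustrative): with the movie_reviews corpus (1000 docs per class)
# and SPLIT_AMOUNT = 0.6, OFFSET = 300 gives one class 600+300 = 900 training docs and the
# other 600-300 = 300 (the sign of the offset alternates per category inside SplitData),
# so OFFSET controls how unbalanced the training set is.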
REVIEW_POL={} # maps each Amazon/Twitter review text to its 'pos'/'neg' label
li = []
# exit()
testset = []
trainset = []
# li = Preprocessor.startup()
cat_num_docs = {}
##4) Create a) a dictionary with the category as the key and a dictionary of word occurrences as the value
#    b) a dictionary with the category as the key and the total number of words in it as the value
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
#val = my_dict.get(key, mydefaultval)
num_docs_word_in = {}
vocab_length=0
word_cat_num_doc_dict={}
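# after TrainMachine runs, word_cat_num_doc_dict looks like
# {w1 -> {'pos': 0.43, 'neg': 0.12}, ...} where each value is the smoothed ratio
# (#docs in the category containing w + 1)/(#docs in the category + 2) used by the
# Bernoulli classifier (the numbers shown here are made up).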
def SplitData():
global REVIEW_POL
type_dict={}
docs_count={}
train_test = [[],[]]
# offset_sample = random.randint(-400,400)
offset_sample = OFFSET
# print "offset_sample", offset_sample
categories = ['neg', 'pos']
if AMAZON:
# offset_sample = random.randint(-600,600)
for category in categories:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
elif TWITTER:
for category in categories:
type_dict[category]=[]
with open('tweets.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
rev_read.next() # skip header row
number = [0,0]
for row in rev_read:
type_dict[ categories[ int(row[1]) ] ].append(row[3].strip())
REVIEW_POL[row[3].strip()] = categories[int(row[1])]
number[int(row[1])] += 1
if (number[0]>TWEET_LIMIT and number[1]>TWEET_LIMIT):
break
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
# if NO_OFF:
# offset_sample = 0
for cat in type_dict.keys():
li = type_dict[cat]
if SHUFFLE:
random.shuffle(li)
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
# if DEFINED_SIZE:
# size = DEFINED_SIZES[cat]
# print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
train_test[1].extend(li[size:])
return [train_test,type_dict, docs_count]
def tokenize(file_name):
    # for the Amazon/Twitter sets, file_name is already the raw review text;
    # for movie_reviews it is an NLTK fileid whose raw text is fetched here
    if AMAZON or TWITTER:
        list_words = re.split(r'\W+',file_name)
    else:
        list_words = re.split(r'\W+',mr.raw(fileids=file_name))
    # keep lowercased alphabetic tokens longer than one character that are not stopwords
    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
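# Illustrative example: for the raw text "This movie was NOT that good!!" the tokenizer
# returns ['movie', 'good'] -- punctuation is split away, single characters are dropped,
# and 'this', 'was', 'not', 'that' are filtered out as NLTK English stopwords.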
def CalculateAccuracy(li_results):
    # each result tuple is (file_name, predicted_category, true_category);
    # counts are taken relative to the category predicted for the first test doc
    a=0 # predicted cat and truly cat (true positives)
    b=0 # predicted cat but truly the other class (false positives)
    c=0 # predicted the other class but truly cat (false negatives)
    d=0 # predicted the other class and truly the other class (true negatives)
    cat = li_results[0][1]
    for t in li_results:
        if cat==t[1]:
            if cat==t[2]:
                a+=1
            else:
                b+=1
        else:
            if cat==t[2]:
                c+=1
            else:
                d+=1
    # despite the function name, the value returned is the precision for category cat
    precision = a/(a+b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category " , cat
    # print "precision =", precision
    return precision
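# Worked example (hypothetical counts): if the first test doc was predicted 'pos' and,
# over the whole test set, 80 docs predicted 'pos' are truly 'pos' (a) while 20 are
# truly 'neg' (b), the function returns 80/(80+20) = 0.8, i.e. the precision of the
# 'pos' predictions.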
def RunWholeThing():
global AMAZON
global TWITTER
global OFFSET
global DEFINED_SIZE
global DEFINED_SIZES
OFFSET = 0
AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
TWITTER = 0
tested = [' Bern:', ' Mult:', ' TIL :', ' DTIL:', ' CW :', ' TIWC:']
while OFFSET < 400:
print "Movies with ", OFFSET
ans = DoTheThing()
OFFSET = -1*OFFSET
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1
OFFSET = -1*OFFSET
OFFSET += 300
OFFSET = 0
AMAZON = 1
while OFFSET < 600:
print "Amazon with ", OFFSET
ans = DoTheThing()
OFFSET = -1*OFFSET
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1
OFFSET = -1*OFFSET
OFFSET += 400
OFFSET = 0
AMAZON = 0
TWITTER = 1
while OFFSET < 1000:
print "Twitter with ", OFFSET
ans = DoTheThing()
OFFSET = -1*OFFSET
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1
OFFSET = -1*OFFSET
OFFSET += 800
def DoTheThing():
i = 0
reps = 5
b_nb = 0
m_nb = 0
til_nb = 0
dtil_nb = 0
cw_nb = 0
tilcw_nb= 0
while i < reps:
TrainMachine()
b_nb += TestMachine_Bern()/reps
m_nb += TestMachine(0,0,0,0,0)/reps
til_nb += TestMachine(1,1,1,0,0)/reps
dtil_nb += TestMachine(1,1,1,2,0)/reps
cw_nb += TestMachine(0,0,0,1,1)/reps
tilcw_nb += TestMachine(1,1,1,1,1)/reps
i+=1
# print " Bern: %0.6f\n Mult: %0.6f\n TIL : %0.6f\n DTIL: %0.6f\n CW : %0.6f\n TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb)
return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb]
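# The returned averages line up with the labels in RunWholeThing's `tested` list
# (reading the flags of TestMachine(t, i, l, c, w) = TF, IDF, LENGTH, COMPLEMENT, WEIGHTED):
#   Bern -- Bernoulli NB, Mult -- plain multinomial NB (0,0,0,0,0),
#   TIL  -- TF+IDF+length norm (1,1,1,0,0), DTIL -- TIL with delta/one-v-all (1,1,1,2,0),
#   CW   -- complement + weight-normalized (0,0,0,1,1), TIWC -- everything combined (1,1,1,1,1).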
# li = Preprocessor.get_testset_trainset(corpus)
##5) Loop through the training set to get the entire text of each file
##6) Parse the string to get individual words
def TrainMachine():
    global cat_word_dict
    global cat_word_count_dict
    global num_docs_word_in
    global word_cat_num_doc_dict
    global li
    global testset
    global trainset
    global cat_num_docs
    global vocab_length
    # reset the trained model so repeated runs (DoTheThing calls TrainMachine once per
    # repetition) do not accumulate counts from earlier train/test splits
    cat_word_dict = {}
    cat_word_count_dict = {}
    num_docs_word_in = {}
    word_cat_num_doc_dict = {}
    vocab_length = 0
    li = SplitData()
    testset = li[0][1]
    trainset = li[0][0]
    cat_num_docs = li[2]
    for file_name in trainset:
        list_words = tokenize(file_name)
        ##7) If the category is not in the dictionaries yet, create an empty word
        #dictionary and a zero word count for it, then insert this document's words
        #and update the counts
        cat = ''
        if AMAZON or TWITTER:
            cat = REVIEW_POL[file_name]
        else:
            cat = mr.categories(fileids = file_name)[0]
        cat_word_dict[cat] = cat_word_dict.get(cat,{})
        cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)
        # add number of words to total word count for cat
        cat_word_count_dict[cat]+=len(list_words)
        # track which words have already been counted for this document, so
        # num_docs_word_in stores document frequencies rather than occurrence counts
        counted = set()
        for w in list_words:
            cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
            cat_word_dict[cat][w]+=1
            if w not in counted:
                counted.add(w)
                num_docs_word_in[w] = num_docs_word_in.get(w, 0)
                num_docs_word_in[w] += 1
                word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
                word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
                word_cat_num_doc_dict[w][cat]+=1
    for dic in cat_word_dict.values():
        vocab_length+=len(dic)
    for w in word_cat_num_doc_dict:
        for cat in cat_num_docs:
            nct = word_cat_num_doc_dict[w].get(cat,0)
            # convert the document count into a smoothed ratio: (#docs containing the word + 1)/(#docs in the category + 2)
            ratio = (nct+1)/(cat_num_docs[cat]+2)
            word_cat_num_doc_dict[w][cat]=ratio
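    # Worked example (hypothetical counts): if 'great' appears in 300 of 700 'pos'
    # training documents, word_cat_num_doc_dict['great']['pos'] becomes
    # (300+1)/(700+2) ~= 0.429, and a word never seen in 'pos' documents gets 1/702
    # rather than zero, which keeps the log() calls in TestMachine_Bern well-defined.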
# ##8) Get the vocabulary length
# ## number of words, total across categories
# vocab_length=0
# ####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset
# length_train = len(trainset)
# print "length of training set ", length_train
def TestMachine(t, i, l, c, w):
    #9) As with the training set, loop through the test set to get the entire text of each file
    ##10) Similar step: parse the string to get individual words
global trainset
global testset
TF = t # 1 - log term frew
IDF = i # 1 - idf
LENGTH = l # 1 - doc length adjust
COMPLEMENT = c # 1 - just comp, 2 - delta / one-v-all
WEIGHTED = w # 1 - adjust weights
length_train = len(trainset)
# print "length train " , length_train, len(testset)
li_results=[]
for file_name in testset:
# print "File: ", file_name
# minimum_neg_log_prob=1000000000
minimum_neg_log_prob = -1000000000 # NEW
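        # Despite the variable name (the old value is kept commented out above),
        # neg_log_prob below is accumulated as a base-2 log-probability and the
        # *largest* score wins (note the "<" comparison further down); this is
        # just a very small starting sentinel.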
min_category=''
list_words = tokenize(file_name)
# print file_name
##11) Get the probability for each category,
#can use any of the created dictionaries to wade through the categories
for cat in cat_word_count_dict:
# print cat , cat_num_docs[cat]/len(trainset)
# print "________________________________________________________________"
# print "________________________________________________________________"
# print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
# neg_log_prob=-log(cat_num_docs[cat]/length_train)
inv_cat = 'pos'
if cat == 'pos':
inv_cat = 'neg'
neg_log_prob = log(cat_num_docs[cat]/length_train, 2)
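            # e.g. with the movie_reviews corpus (2000 docs), OFFSET = 0 and a 0.6 split,
            # cat_num_docs[cat]/length_train = 600/1200 = 0.5, so this prior term is
            # log2(0.5) = -1 for both classes.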
# neg_log_prob = cat_num_docs[cat]/length_train
opp_word_dict = cat_word_dict[inv_cat]
opp_count_cat = cat_word_count_dict[inv_cat]
word_dict = cat_word_dict[cat]
count_cat = cat_word_count_dict[cat]
## get frequency counts
my_word_count = {}
for aw in list_words:
my_word_count[aw] = my_word_count.get(aw, 0)
my_word_count[aw]+=1
## calculate necessary norms
length_norm = 0
weight_normalizing_ratio = 0
opp_weight_normalizing_ratio = 0
for kw in my_word_count.keys():
count_word_train=word_dict.get(kw,0)
ratio = (count_word_train+1)/(count_cat+vocab_length)
# if COMPLEMENT:
opp_count_word_train=opp_word_dict.get(kw,0)
opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)
# weight norm
weight_normalizing_ratio += abs(log(ratio, 2))
opp_weight_normalizing_ratio += abs(log(opp_ratio, 2))
if TF:
my_word_count[kw] = log(1 + my_word_count[kw])
if IDF:
my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,0.01), 2) #IDF
## length norm
w_freq = my_word_count[kw]
length_norm += pow(w_freq, 2)
length_norm = pow(length_norm, 0.5)
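            # length_norm is now the L2 norm of this document's (possibly TF/IDF-transformed)
            # term-frequency vector, sqrt(sum_w freq_w^2); dividing each frequency by it below
            # puts long and short documents on a comparable scale.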
# print "LN: ", length_norm
for w in my_word_count.keys():
count_word_train=word_dict.get(w,0)
ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c
# if COMPLEMENT:
opp_count_word_train=opp_word_dict.get(w,0)
opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)
word_freq = my_word_count[w]
if LENGTH:
word_freq = word_freq/length_norm # length normalization
ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w
opp_ratio = log(opp_ratio, 2)
if WEIGHTED:
ratio = ratio/weight_normalizing_ratio # weight normalization
opp_ratio = opp_ratio/opp_weight_normalizing_ratio
if COMPLEMENT == 1: # just complement
neg_log_prob -= word_freq*opp_ratio
else:
neg_log_prob += word_freq*ratio # class probability
if COMPLEMENT == 2: # one-v-all
neg_log_prob += word_freq*ratio
# break
# print "NLP: ", neg_log_prob
# print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
# if minimum_neg_log_prob>neg_log_prob:
if minimum_neg_log_prob<neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
# print "Min cat: ", min_category
if AMAZON or TWITTER:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
#12) Evaluating the classifier
precision = CalculateAccuracy(li_results)
return precision
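# The TF (log(1+f)), IDF, length-normalization, complement and weight-normalization
# switches above appear to follow the transformations described in Rennie et al.,
# "Tackling the Poor Assumptions of Naive Bayes Text Classifiers" (ICML 2003),
# whose full combination is the TWCNB classifier.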
def TestMachine_Bern():
li_results=[]
    #5) As with the training set, loop through the test set to get the individual words
for file_name in testset:
minimum_neg_log_prob=1000000000
min_category=''
set_list_words = set(tokenize(file_name))
##6) Get the probability for each category,
#using the cat_num_docs dictionary to wade through the categories
for cat in cat_num_docs:
neg_log_prob=-log(cat_num_docs[cat]/len(trainset))
for w in word_cat_num_doc_dict:
if w in set_list_words:
neg_log_prob-=log(word_cat_num_doc_dict[w][cat])
else:
neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat])
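            # i.e. the Bernoulli NB score: -log P(cat) - sum over the vocabulary of
            # [ log p(w|cat) if w occurs in the document, else log(1 - p(w|cat)) ],
            # where p(w|cat) is the smoothed document ratio stored by TrainMachine;
            # the category with the smallest total wins below.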
if minimum_neg_log_prob>neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
if AMAZON or TWITTER:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
precision = CalculateAccuracy(li_results)
return precision
RunWholeThing()