######################
# Full version with all variations included
# To improve: create a main function allowing for multiple runs
# TF works properly!
######################
from __future__ import division
from math import log
from math import pow
import re
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
import random
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # fraction of the data used for training
SHUFFLE = 1
# COMPLEMENT = 0 # 1 - just comp, 2 - delta / one-v-all
# WEIGHTED = 0 # 1 - adjust weights
# TF = 0 # 1 - log term freq
# IDF = 0 # 1 - idf
# LENGTH = 0 # 1 - doc length adjust
AMAZON = 0 # 1 - use Amazon set
TWITTER = 0 # 1 - use Twitter set
TWEET_LIMIT = 5000 # the full dataset is too large, so cap the number of positive and negative tweets read
OFFSET = 0 # introduced offset (skew) in datasets
REVIEW_POL={}
li = []
# exit()
testset = []
trainset = []
# li = Preprocessor.startup()
cat_num_docs = {}
##4) Create a) a dictionary with a category as the key and a dictionary of word-occurrence counts as the value
#   b) a dictionary with a category as the key and the number of words in it as the value
# {pos -> {w1: 17 times, w2: 32 times, ...}}  {neg -> ...}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
#val = my_dict.get(key, mydefaultval)
num_docs_word_in = {}
vocab_length=0
word_cat_num_doc_dict={}
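# num_docs_word_in: word -> number of training documents containing it (used for IDF)
# word_cat_num_doc_dict: word -> {category -> document count}; after TrainMachine() runs,
# the counts are replaced by smoothed P(word present | category), which is what the
# Bernoulli classifier (TestMachine_Bern) uses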
def SplitData():
    global REVIEW_POL
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    # offset_sample = random.randint(-400,400)
    offset_sample = OFFSET
    # print "offset_sample", offset_sample
    categories = ['neg', 'pos']
    if AMAZON:
        # offset_sample = random.randint(-600,600)
        for category in categories:
            type_dict[category] = []
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    elif TWITTER:
        for category in categories:
            type_dict[category] = []
        with open('tweets.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            rev_read.next() # skip header row
            number = [0, 0]
            for row in rev_read:
                type_dict[categories[int(row[1])]].append(row[3].strip())
                REVIEW_POL[row[3].strip()] = categories[int(row[1])]
                number[int(row[1])] += 1
                if (number[0] > TWEET_LIMIT and number[1] > TWEET_LIMIT):
                    break
    else:
        for category in mr.categories():
            type_dict[category] = mr.fileids(categories=category)
    # if NO_OFF:
    #     offset_sample = 0
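    # OFFSET skews the train/test split: offset_sample is added to one class's training size
    # and, because its sign is flipped after each category, subtracted from the other's,
    # producing a deliberately class-imbalanced training set.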
    for cat in type_dict.keys():
        li = type_dict[cat]
        if SHUFFLE:
            random.shuffle(li)
        size = int(len(li)*SPLIT_AMOUNT) + offset_sample
        # if DEFINED_SIZE:
        #     size = DEFINED_SIZES[cat]
        # print "Category: ", cat, "Size:", size
        offset_sample *= -1
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]
def tokenize(file_name):
    list_words = []
    if AMAZON or TWITTER:
        list_words = re.split(r'\W+', file_name)
    else:
        list_words = re.split(r'\W+', mr.raw(fileids=file_name))
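    # Illustrative example: for the raw string "Great movie, loved it!!" this returns
    # ['great', 'movie', 'loved'] -- punctuation is split away, words are lower-cased,
    # and one-letter tokens and stopwords ("it") are dropped.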
    return [w.lower() for w in list_words if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
def CalculateAccuracy(li_results):
    a = 0
    b = 0
    c = 0
    d = 0
    cat = li_results[0][1]
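    # a, b, c, d form a 2x2 confusion matrix relative to `cat` (the first prediction):
    # a = predicted cat and truly cat, b = predicted cat but truly the other class,
    # c = predicted the other class but truly cat, d = predicted and truly the other class.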
    for t in li_results:
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a/(a+b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category " , cat
    # print "precision =", precision
    return precision
def RunWholeThing():
    global AMAZON
    global TWITTER
    global OFFSET
    global DEFINED_SIZE
    global DEFINED_SIZES
    OFFSET = 0
    AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
    TWITTER = 0
    tested = [' Bern:', ' Mult:', ' TIL :', ' DTIL:', ' CW :', ' TIWC:']
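    # The labels above correspond, in order, to the values returned by DoTheThing():
    # Bernoulli NB; plain multinomial NB; TF + IDF + length-normalized (TIL);
    # TIL with the "delta"/one-v-all option (DTIL); complement + weight-normalized (CW);
    # and TF + IDF + length norm + complement + weight-normalized (TIWC).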
    while OFFSET < 400:
        print "Movies with ", OFFSET
        ans = DoTheThing()
        OFFSET = -1*OFFSET
        if OFFSET != 0:
            ans2 = DoTheThing()
            ans3 = [ans, ans2]
            ans = [sum(e)/len(e) for e in zip(*ans3)]
        a_i = 0
        for a in ans:
            print tested[a_i], a
            a_i += 1
        OFFSET = -1*OFFSET
        OFFSET += 300
    OFFSET = 0
    AMAZON = 1
    while OFFSET < 600:
        print "Amazon with ", OFFSET
        ans = DoTheThing()
        OFFSET = -1*OFFSET
        if OFFSET != 0:
            ans2 = DoTheThing()
            ans3 = [ans, ans2]
            ans = [sum(e)/len(e) for e in zip(*ans3)]
        a_i = 0
        for a in ans:
            print tested[a_i], a
            a_i += 1
        OFFSET = -1*OFFSET
        OFFSET += 400
    OFFSET = 0
    AMAZON = 0
    TWITTER = 1
    while OFFSET < 1000:
        print "Twitter with ", OFFSET
        ans = DoTheThing()
        OFFSET = -1*OFFSET
        if OFFSET != 0:
            ans2 = DoTheThing()
            ans3 = [ans, ans2]
            ans = [sum(e)/len(e) for e in zip(*ans3)]
        a_i = 0
        for a in ans:
            print tested[a_i], a
            a_i += 1
        OFFSET = -1*OFFSET
        OFFSET += 800
def DoTheThing():
    i = 0
    reps = 5
    b_nb = 0
    m_nb = 0
    til_nb = 0
    dtil_nb = 0
    cw_nb = 0
    tilcw_nb = 0
    while i < reps:
        TrainMachine()
        b_nb += TestMachine_Bern()/reps
        m_nb += TestMachine(0,0,0,0,0)/reps
        til_nb += TestMachine(1,1,1,0,0)/reps
        dtil_nb += TestMachine(1,1,1,2,0)/reps
        cw_nb += TestMachine(0,0,0,1,1)/reps
        tilcw_nb += TestMachine(1,1,1,1,1)/reps
        i += 1
    # print " Bern: %0.6f\n Mult: %0.6f\n TIL : %0.6f\n DTIL: %0.6f\n CW : %0.6f\n TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb)
    return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb]
# li = Preprocessor.get_testset_trainset(corpus)
##5) Loop through the training set to get the entire text from each file
##6) Parse the string to get the individual words
def TrainMachine():
    global cat_word_dict
    global cat_word_count_dict
    global num_docs_word_in
    global word_cat_num_doc_dict
    global li
    global testset
    global trainset
    global cat_num_docs
    global vocab_length
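    # NOTE: the global count dictionaries above are not cleared here, so repeated calls to
    # TrainMachine() (as DoTheThing() makes) accumulate counts across runs; the header's note
    # about a proper main function for multiple runs presumably refers to this.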
    li = SplitData()
    testset = li[0][1]
    trainset = li[0][0]
    cat_num_docs = li[2]
    for file_name in trainset:
        list_words = tokenize(file_name)
        ##7) Check if the category exists in the dictionary; if not, create an empty dictionary
        # with a word count of zero, then insert the document's words into the category's
        # dictionary and update the word count
        cat = ''
        if AMAZON or TWITTER:
            cat = REVIEW_POL[file_name]
        else:
            cat = mr.categories(fileids=file_name)[0]
        cat_word_dict[cat] = cat_word_dict.get(cat, {})
        cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)
        # add number of words to total word count for cat
        cat_word_count_dict[cat] += len(list_words)
        # start count of the number of occurrences for each word
        counted = []
        for w in list_words:
            cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
            cat_word_dict[cat][w] += 1
            if w not in counted:
                counted.append(w)
                num_docs_word_in[w] = num_docs_word_in.get(w, 0)
                num_docs_word_in[w] += 1
                word_cat_num_doc_dict[w] = word_cat_num_doc_dict.get(w, {})
                word_cat_num_doc_dict[w][cat] = word_cat_num_doc_dict[w].get(cat, 0)
                word_cat_num_doc_dict[w][cat] += 1
    for dic in cat_word_dict.values():
        vocab_length += len(dic)
    for w in word_cat_num_doc_dict:
        for cat in cat_num_docs:
            nct = word_cat_num_doc_dict[w].get(cat, 0)
            # convert the per-category document count into (#docs+1)/(#cat_reviews+2)
            ratio = (nct+1)/(cat_num_docs[cat]+2)
            word_cat_num_doc_dict[w][cat] = ratio
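            # Illustrative example: if a word occurs in 17 of 600 'pos' training documents,
            # the smoothed estimate stored here is (17+1)/(600+2) ~= 0.030, i.e. the
            # P(word present | category) used by the Bernoulli classifier in TestMachine_Bern().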
# ##8) Get the vocabulary length
# ## number of words, total across categories
# vocab_length=0
# ####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset
# length_train = len(trainset)
# print "length of training set ", length_train
def TestMachine(t, i, l, c, w):
    #9) As with the training set, loop through the test set to get the entire text of each file
    ##10) Similarly, parse the string to get the individual words
    global trainset
    global testset
    TF = t # 1 - log term freq
    IDF = i # 1 - idf
    LENGTH = l # 1 - doc length adjust
    COMPLEMENT = c # 1 - just comp, 2 - delta / one-v-all
    WEIGHTED = w # 1 - adjust weights
    length_train = len(trainset)
    # print "length train " , length_train, len(testset)
    li_results = []
    for file_name in testset:
        # print "File: ", file_name
        # minimum_neg_log_prob=1000000000
        minimum_neg_log_prob = -1000000000 # NEW
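        # Despite its name, this variable now tracks the *largest* score seen so far;
        # the comparison further down was flipped (see "NEW") when the sign convention changed.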
        min_category = ''
        list_words = tokenize(file_name)
        # print file_name
        ##11) Get the probability for each category,
        # can use any of the created dictionaries to wade through the categories
        for cat in cat_word_count_dict:
            # print cat , cat_num_docs[cat]/len(trainset)
            # print "________________________________________________________________"
            # print "________________________________________________________________"
            # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
            # neg_log_prob=-log(cat_num_docs[cat]/length_train)
            inv_cat = 'pos'
            if cat == 'pos':
                inv_cat = 'neg'
            neg_log_prob = log(cat_num_docs[cat]/length_train, 2)
            # neg_log_prob = cat_num_docs[cat]/length_train
            opp_word_dict = cat_word_dict[inv_cat]
            opp_count_cat = cat_word_count_dict[inv_cat]
            word_dict = cat_word_dict[cat]
            count_cat = cat_word_count_dict[cat]
            ## get frequency counts
            my_word_count = {}
            for aw in list_words:
                my_word_count[aw] = my_word_count.get(aw, 0)
                my_word_count[aw] += 1
            ## calculate necessary norms
            length_norm = 0
            weight_normalizing_ratio = 0
            opp_weight_normalizing_ratio = 0
            for kw in my_word_count.keys():
                count_word_train = word_dict.get(kw, 0)
                ratio = (count_word_train+1)/(count_cat+vocab_length)
                # if COMPLEMENT:
                opp_count_word_train = opp_word_dict.get(kw, 0)
                opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)
                # weight norm
                weight_normalizing_ratio += abs(log(ratio, 2))
                opp_weight_normalizing_ratio += abs(log(opp_ratio, 2))
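                # weight_normalizing_ratio accumulates |log2(theta_c,w)| over this document's
                # words; when WEIGHTED is set it is used below to normalize each word's weight.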
                if TF:
                    my_word_count[kw] = log(1 + my_word_count[kw])
                if IDF:
                    my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw, 0.01), 2) # IDF
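                # TF replaces the raw count f with log(1 + f); IDF then scales it by
                # log2(length_train / doc_frequency). The 0.01 default means words never seen
                # in training get an especially large IDF.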
                ## length norm
                w_freq = my_word_count[kw]
                length_norm += pow(w_freq, 2)
            length_norm = pow(length_norm, 0.5)
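            # length_norm is the L2 norm of the (possibly TF/IDF-transformed) term-frequency
            # vector; dividing by it below keeps long and short documents comparable.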
# print "LN: ", length_norm
for w in my_word_count.keys():
count_word_train=word_dict.get(w,0)
ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c
# if COMPLEMENT:
opp_count_word_train=opp_word_dict.get(w,0)
opp_ratio = (opp_count_word_train+1)/(opp_count_cat+vocab_length)
word_freq = my_word_count[w]
if LENGTH:
word_freq = word_freq/length_norm # length normalization
ratio = log(ratio, 2) # weight factor log(theta_c) = weight_c,w
opp_ratio = log(opp_ratio, 2)
if WEIGHTED:
ratio = ratio/weight_normalizing_ratio # weight normalization
opp_ratio = opp_ratio/opp_weight_normalizing_ratio
if COMPLEMENT == 1: # just complement
neg_log_prob -= word_freq*opp_ratio
else:
neg_log_prob += word_freq*ratio # class probability
if COMPLEMENT == 2: # one-v-all
neg_log_prob += word_freq*ratio
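                        # COMPLEMENT == 1 scores a document against the opposite class's
                        # statistics (complement NB) and subtracts; as written, COMPLEMENT == 2
                        # ("delta"/one-v-all) adds the in-class term a second time -- the
                        # opposite-class term (opp_ratio) may have been intended here instead.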
            # break
            # print "NLP: ", neg_log_prob
            # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
            # if minimum_neg_log_prob>neg_log_prob:
            if minimum_neg_log_prob < neg_log_prob:
                min_category = cat
                minimum_neg_log_prob = neg_log_prob
        # print "Min cat: ", min_category
        if AMAZON or TWITTER:
            li_results.append((file_name, min_category, REVIEW_POL[file_name]))
        else:
            li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))
        # break
    #12) Evaluating the classifier
    precision = CalculateAccuracy(li_results)
    return precision
def TestMachine_Bern():
    li_results = []
    #5) As with the training set, loop through the test set to get the individual words
    for file_name in testset:
        minimum_neg_log_prob = 1000000000
        min_category = ''
        set_list_words = set(tokenize(file_name))
        ##6) Get the probability for each category,
        # using the cat_num_docs dictionary to wade through the categories
        for cat in cat_num_docs:
            neg_log_prob = -log(cat_num_docs[cat]/len(trainset))
            for w in word_cat_num_doc_dict:
                if w in set_list_words:
                    neg_log_prob -= log(word_cat_num_doc_dict[w][cat])
                else:
                    neg_log_prob -= log(1 - word_cat_num_doc_dict[w][cat])
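            # The loop above accumulates, in negative log space, P(w|cat) for every vocabulary
            # word present in the document and 1 - P(w|cat) for every absent word; the category
            # with the smallest negative log probability (largest probability) wins below.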
            if minimum_neg_log_prob > neg_log_prob:
                min_category = cat
                minimum_neg_log_prob = neg_log_prob
        if AMAZON or TWITTER:
            li_results.append((file_name, min_category, REVIEW_POL[file_name]))
        else:
            li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))
        # break
    precision = CalculateAccuracy(li_results)
    return precision
RunWholeThing()