# NOTE(review): the lines that used to be here were GitHub blob-view chrome
# ("Skip to content", commit hash, branch-switcher text, sloc counter) left
# over from a copy/paste of the web UI, not part of the program. They have
# been replaced by this comment so the file is parseable Python.
from __future__ import division
from math import log
import random
import csv
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
# English stop words to exclude from the token stream (requires nltk data).
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data
# Data-source switch: truthy -> read Amazon reviews from amazon_revs.csv,
# falsy -> use the nltk movie_reviews corpus.
AMAZON = 1
# Maps review text (Amazon mode) to its gold polarity label ('pos'/'neg');
# filled in by SplitData() and read back during training/evaluation.
REVIEW_POL={}
# When truthy, per-category training-set sizes come from DEFINED_SIZES
# instead of SPLIT_AMOUNT plus a random offset.
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 948, 'neg': 948}
def SplitData():
    """Split the review corpus into training and test sets.

    Source is either the Amazon CSV (when AMAZON is truthy; also fills the
    module-level REVIEW_POL text->label map) or the nltk movie_reviews
    corpus. Each category list is shuffled, then the first `size` items go
    to training and the rest to test.

    Returns:
        [train_test, type_dict, docs_count] where
        train_test[0]/train_test[1] are the training/test item lists,
        type_dict maps category -> all items of that category, and
        docs_count maps category -> number of training items.
    """
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    # Random perturbation of the split point; its sign is flipped after each
    # category so the total corpus size stays roughly balanced.
    offset_sample = random.randint(-400, 400)
    # Single-argument print works identically under Python 2 and 3.
    print("offset_sample %s" % offset_sample)
    if AMAZON:
        offset_sample = random.randint(-600, 600)
        for category in ['pos', 'neg']:
            type_dict[category] = []
        # NOTE: 'rb' + csv.reader is the Python 2 idiom; Python 3 would need
        # text mode with newline=''.
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                # row[0] = review text, row[1] = polarity label
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    else:
        for category in mr.categories():
            type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict:
        li = type_dict[cat]
        random.shuffle(li)
        size = int(len(li) * SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            # Fixed per-category training size overrides the random split.
            size = DEFINED_SIZES[cat]
        print("Category:  %s Size: %s" % (cat, size))
        offset_sample *= -1
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]
def tokenize(file_name):
    """Tokenize one review into lowercase alphabetic words.

    In Amazon mode (AMAZON truthy) `file_name` is the raw review text
    itself; otherwise it is an nltk movie_reviews fileid whose raw text is
    fetched from the corpus.

    Returns a list of lowercased tokens that are purely alphabetic, longer
    than one character, and not in STOP_WORDS.
    """
    if AMAZON:
        raw_text = file_name  # the "file name" is actually the review text
    else:
        raw_text = mr.raw(fileids=file_name)
    # Split on runs of non-word characters; filtering below drops the empty
    # strings and digit/underscore tokens this can produce.
    words = re.split(r'\W+', raw_text)
    return [w.lower() for w in words
            if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
def CalculateAccuracy(li_results):
    """Print and return the classifier's precision for one category.

    Args:
        li_results: list of (item, predicted_label, actual_label) tuples.
            The reference category is taken from the FIRST tuple's
            predicted label, so precision is reported for that class.

    Returns:
        Precision = TP / (TP + FP) for the reference category, or 0.0 when
        nothing was predicted as that category (or the list is empty).
    """
    if not li_results:
        # Nothing classified: avoid IndexError on an empty result list.
        print("precision = 0.0")
        return 0.0
    ref_cat = li_results[0][1]  # reference category = first prediction
    true_pos = 0   # predicted ref_cat, actually ref_cat
    false_pos = 0  # predicted ref_cat, actually other
    false_neg = 0  # predicted other, actually ref_cat
    true_neg = 0   # predicted other, actually other
    for _, predicted, actual in li_results:
        if predicted == ref_cat:
            if actual == ref_cat:
                true_pos += 1
            else:
                false_pos += 1
        else:
            if actual == ref_cat:
                false_neg += 1
            else:
                true_neg += 1
    predicted_ref = true_pos + false_pos
    # Guard the division: originally this raised ZeroDivisionError when no
    # item was predicted as ref_cat.
    precision = true_pos / predicted_ref if predicted_ref else 0.0
    # recall would be true_pos / (true_pos + false_neg)
    print("precision = %s" % precision)
    return precision
# ---------------------------------------------------------------------------
# Script driver: split the corpus, train a Bernoulli naive Bayes model on
# document frequencies, classify the test set, and report precision.
# ---------------------------------------------------------------------------
li = SplitData()
testset = li[0][1]
trainset = li[0][0]
cat_num_docs = li[2]  # category -> number of training documents

# word -> {category -> value}. During training the value is the number of
# training documents of that category containing the word; after smoothing
# it becomes the probability P(word present | category).
word_cat_num_doc_dict = {}

# Training pass: count, per word, the documents of each category in which
# it appears (set() so a word counts at most once per document).
for file_name in trainset:
    list_words = tokenize(file_name)
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids=file_name)[0]
    for w in set(list_words):
        word_cat_num_doc_dict.setdefault(w, {})
        word_cat_num_doc_dict[w][cat] = word_cat_num_doc_dict[w].get(cat, 0) + 1

# Laplace smoothing: (doc count + 1) / (category size + 2) turns counts into
# Bernoulli probabilities that are never exactly 0 or 1.
for w in word_cat_num_doc_dict:
    for cat in cat_num_docs:
        nct = word_cat_num_doc_dict[w].get(cat, 0)
        word_cat_num_doc_dict[w][cat] = (nct + 1) / (cat_num_docs[cat] + 2)

print("The Classifier is trained and it took")

li_results = []
# Classification pass: for each test document pick the category minimizing
# the negative log posterior (prior plus a term for every vocabulary word,
# present or absent — the Bernoulli model).
for file_name in testset:
    minimum_neg_log_prob = 1000000000  # sentinel larger than any real score
    min_category = ''
    set_list_words = set(tokenize(file_name))
    for cat in cat_num_docs:
        neg_log_prob = -log(cat_num_docs[cat] / len(trainset))  # prior
        for w in word_cat_num_doc_dict:
            if w in set_list_words:
                neg_log_prob -= log(word_cat_num_doc_dict[w][cat])
            else:
                neg_log_prob -= log(1 - word_cat_num_doc_dict[w][cat])
        if minimum_neg_log_prob > neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob
    # Record (item, predicted label, gold label) for evaluation.
    if AMAZON:
        li_results.append((file_name, min_category, REVIEW_POL[file_name]))
    else:
        li_results.append((file_name, min_category,
                           mr.categories(fileids=file_name)[0]))

CalculateAccuracy(li_results)