Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/OLD_VERSIONS/BNB.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
144 lines (125 sloc)
4.43 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
from math import log | |
import random | |
import csv | |
import re | |
from nltk.corpus import movie_reviews as mr | |
from nltk.corpus import stopwords | |
# English stop words from NLTK; tokenize() drops any token found in this set.
STOP_WORDS = set(stopwords.words('english'))
# Fraction of each category's documents that SplitData() assigns to training
# (a random offset is added; overridden by DEFINED_SIZES when DEFINED_SIZE is truthy).
SPLIT_AMOUNT = 0.6
# Truthy: read the reviews from 'amazon_revs.csv'.
# Falsy: use the NLTK movie_reviews corpus instead.
AMAZON = 1
# Filled by SplitData() in AMAZON mode: CSV column 0 -> CSV column 1.
# It is looked up later to obtain the category of a document.
REVIEW_POL={}
# Truthy: SplitData() takes the training size of a category from DEFINED_SIZES
# instead of deriving it from SPLIT_AMOUNT.
DEFINED_SIZE = 1
# Fixed number of training documents per category (used when DEFINED_SIZE is truthy).
DEFINED_SIZES = {'pos': 948, 'neg': 948}
def SplitData():
    """Load the labelled documents and split them into training and test sets.

    In AMAZON mode the documents are the rows of 'amazon_revs.csv': column 0
    is stored as the document and column 1 as its category, and REVIEW_POL
    is filled with the document -> category mapping as a side effect.
    Otherwise the documents are the file ids of the NLTK movie_reviews corpus.

    Each category is shuffled and its first ``size`` documents go to training,
    the rest to testing. ``size`` is int(len * SPLIT_AMOUNT) plus a random
    offset whose sign alternates between categories, unless DEFINED_SIZE is
    truthy, in which case DEFINED_SIZES[category] is used when present.

    The size is clamped to [0, number of documents in the category] so that
    the recorded training count always equals the number of documents that
    really ended up in the training set.

    Returns:
        A list ``[train_test, type_dict, docs_count]`` where ``train_test`` is
        ``[training_documents, test_documents]``, ``type_dict`` maps each
        category to its (shuffled) documents and ``docs_count`` maps each
        category to its number of training documents.

    Raises:
        KeyError: in AMAZON mode, if a CSV row carries a category other than
            'pos' or 'neg'.
    """
    type_dict = {}
    docs_count = {}
    train_test = [[], []]

    # Draw the offset for the active data source only, so that the value
    # logged below is the one that is actually applied.
    if AMAZON:
        offset_sample = random.randint(-600, 600)
    else:
        offset_sample = random.randint(-400, 400)
    print(" ".join(["offset_sample", str(offset_sample)]))

    if AMAZON:
        for category in ['pos', 'neg']:
            type_dict[category] = []
        # 'rb' is the mode the Python 2 csv module expects.
        with open('amazon_revs.csv', 'rb') as csvfile:
            for row in csv.reader(csvfile):
                # Blank lines come back as short rows; skip them.
                if len(row) < 2:
                    continue
                review, polarity = row[0], row[1]
                type_dict[polarity].append(review)
                REVIEW_POL[review] = polarity
    else:
        for category in mr.categories():
            # Copy into a list because the documents are shuffled in place.
            type_dict[category] = list(mr.fileids(categories=category))

    for cat in type_dict:
        documents = type_dict[cat]
        random.shuffle(documents)
        size = int(len(documents) * SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            size = DEFINED_SIZES.get(cat, size)
        # A negative size would slice from the wrong end and an over-long one
        # would overstate the number of training documents.
        size = max(0, min(size, len(documents)))
        print(" ".join(["Category: ", str(cat), "Size:", str(size)]))
        offset_sample *= -1
        docs_count[cat] = size
        train_test[0].extend(documents[:size])
        train_test[1].extend(documents[size:])

    return [train_test, type_dict, docs_count]
def tokenize(file_name):
    """Return the lowercased content words of one document.

    In AMAZON mode ``file_name`` is itself the text to tokenize; otherwise it
    is a movie_reviews file id whose raw text is fetched from the corpus.
    The text is split on runs of non-word characters. A token is kept only if
    it is purely alphabetic, longer than one character and, once lowercased,
    not in STOP_WORDS.
    """
    if AMAZON:
        raw_text = file_name
    else:
        raw_text = mr.raw(fileids=file_name)

    kept = []
    for piece in re.split(r'\W+', raw_text):
        if not piece.isalpha() or len(piece) <= 1:
            continue
        lowered = piece.lower()
        if lowered in STOP_WORDS:
            continue
        kept.append(lowered)
    return kept
def CalculateAccuracy(li_results):
    """Print and return the precision for one category.

    Despite its name, this reports precision, not accuracy. The category
    examined is the one predicted for the first result.

    Args:
        li_results: sequence of (document, predicted_category,
            actual_category) tuples.

    Returns:
        The fraction of documents predicted as that category whose actual
        category matches, as a float, or None when ``li_results`` is empty
        (nothing is printed in that case).
    """
    if not li_results:
        return None

    cat = li_results[0][1]
    # Actual labels of every document that was predicted as `cat`; the first
    # result is always among them, so the list is never empty.
    actual_of_predicted = [t[2] for t in li_results if t[1] == cat]
    true_positives = sum(1 for actual in actual_of_predicted if actual == cat)

    # float() keeps this a true division even without the __future__ import.
    precision = float(true_positives) / len(actual_of_predicted)
    print("%s %s" % ("precision =", precision))
    return precision
# 1) Split the corpus. SplitData() returns
#    [[training documents, test documents], documents by category,
#     number of training documents by category].
li = SplitData()
testset = li[0][1]
trainset = li[0][0]
cat_num_docs = li[2]

# 2) Count, for every word, the number of training documents of each category
#    that contain it: word -> {category: document count}.
word_cat_num_doc_dict = {}
for file_name in trainset:
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids=file_name)[0]
    # set(): a word counts once per document, however often it occurs in it.
    for w in set(tokenize(file_name)):
        counts_by_cat = word_cat_num_doc_dict.setdefault(w, {})
        counts_by_cat[cat] = counts_by_cat.get(cat, 0) + 1

# 3) Replace each count by the smoothed ratio
#    (count + 1) / (training documents in the category + 2).
#    The float literal keeps this a true division even without the
#    __future__ import.
for w in word_cat_num_doc_dict:
    for cat in cat_num_docs:
        nct = word_cat_num_doc_dict[w].get(cat, 0)
        word_cat_num_doc_dict[w][cat] = (nct + 1.0) / (cat_num_docs[cat] + 2)

print("The Classifier is trained and it took")

# 4) Classify every test document: pick the category with the smallest
#    negative log probability.
li_results = []
for file_name in testset:
    # Start from infinity so that the first category scored is always
    # accepted; a finite sentinel could leave the document unlabelled.
    minimum_neg_log_prob = float('inf')
    min_category = ''
    set_list_words = set(tokenize(file_name))

    for cat in cat_num_docs:
        # Category prior: share of the training documents in this category.
        neg_log_prob = -log(float(cat_num_docs[cat]) / len(trainset))
        # Every vocabulary word contributes: its ratio when the document
        # contains it, one minus its ratio when it does not.
        for w in word_cat_num_doc_dict:
            if w in set_list_words:
                neg_log_prob -= log(word_cat_num_doc_dict[w][cat])
            else:
                neg_log_prob -= log(1 - word_cat_num_doc_dict[w][cat])
        if minimum_neg_log_prob > neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob

    # Record (document, predicted category, actual category).
    if AMAZON:
        li_results.append((file_name, min_category, REVIEW_POL[file_name]))
    else:
        li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))

# 5) Report the precision over the test set.
CalculateAccuracy(li_results)