Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Naive Bayes stuff
BernoulliNB and MultinomialNB follow the standard algorithms (summing log-probabilities to avoid floating-point underflow). ComplementMNB implements scikit-learn's Multinomial formula, modified as described in the paper
- Loading branch information
Showing
4 changed files
with
67,038 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
from __future__ import division | ||
from math import log | ||
import re | ||
from nltk.corpus import movie_reviews as mr | ||
from nltk.corpus import stopwords | ||
STOP_WORDS = set(stopwords.words('english')) | ||
|
||
SPLIT_AMOUNT = 0.6 # training amount from data | ||
|
||
def SplitData():
    """Split the movie_reviews file ids into training and test portions.

    For every category the first ``SPLIT_AMOUNT`` fraction of its file ids
    (truncated to an int) goes to the training list and the rest to the
    test list.

    Returns a three-element list:
        [[train_ids, test_ids],
         {category: all file ids of that category},
         {category: number of training files taken from that category}]
    """
    files_by_category = dict(
        (category, mr.fileids(categories=category))
        for category in mr.categories()
    )

    training_ids, testing_ids = [], []
    train_sizes = {}
    for label, file_ids in files_by_category.items():
        cutoff = int(len(file_ids) * SPLIT_AMOUNT)
        train_sizes[label] = cutoff
        training_ids += file_ids[:cutoff]
        testing_ids += file_ids[cutoff:]

    return [[training_ids, testing_ids], files_by_category, train_sizes]
|
||
def tokenize(file_name):
    """Return the lowercased content words of one movie_reviews file.

    The raw text is split on runs of non-word characters; a piece is kept
    only if it is purely alphabetic, longer than one character, and its
    lowercase form is not in STOP_WORDS. Duplicates and order are preserved.
    """
    tokens = []
    for piece in re.split(r'\W+', mr.raw(fileids=file_name)):
        if not piece.isalpha() or len(piece) <= 1:
            continue
        lowered = piece.lower()
        if lowered not in STOP_WORDS:
            tokens.append(lowered)
    return tokens
|
||
def CalculateAccuracy(li_results):
    """Print and return the precision for one category.

    The category scored is the *predicted* category of the first result.
    Precision is (results predicted as that category whose actual category
    matches) / (all results predicted as that category).

    Args:
        li_results: non-empty sequence of tuples; index 1 is the predicted
            category and index 2 the actual category (index 0 is not read).

    Returns:
        The precision as a float (also printed as ``precision = <value>``).

    Raises:
        ValueError: if ``li_results`` is empty.
    """
    if not li_results:
        raise ValueError("li_results is empty; cannot compute precision")

    cat = li_results[0][1]
    true_positives = 0
    false_positives = 0
    for result in li_results:
        if result[1] != cat:
            # Not predicted as the scored category: irrelevant to precision.
            continue
        if result[2] == cat:
            true_positives += 1
        else:
            false_positives += 1

    # The first result is always predicted as `cat`, so the denominator is
    # at least 1. float() keeps true division even without the __future__
    # import.
    precision = true_positives / float(true_positives + false_positives)
    # Single-argument form prints the same text on Python 2 and Python 3.
    print("precision = %s" % (precision,))
    return precision
|
||
# Split the corpus: li[0] holds [training ids, test ids] and li[2] maps each
# category to the number of training documents taken from it.
li = SplitData()
(trainset, testset), cat_num_docs = li[0], li[2]

# word -> {category -> value}. During counting the value is the number of
# training documents of that category containing the word; afterwards it is
# replaced by the smoothed per-category document frequency.
word_cat_num_doc_dict = {}

# Count, for each training file, every distinct token once under the file's
# first listed category.
for file_name in trainset:
    cat = mr.categories(fileids=file_name)[0]
    for w in set(tokenize(file_name)):
        per_category = word_cat_num_doc_dict.setdefault(w, {})
        per_category[cat] = per_category.get(cat, 0) + 1

# Turn the document counts into smoothed ratios:
# (documents containing the word + 1) / (training documents in category + 2).
for per_category in word_cat_num_doc_dict.values():
    for cat, docs_in_cat in cat_num_docs.items():
        per_category[cat] = (per_category.get(cat, 0) + 1) / (docs_in_cat + 2)

print("The Classifier is trained and it took")
|
||
|
||
# Classify every test file and collect (file id, predicted, actual) tuples.
li_results = []

# The category prior does not depend on the test file, so compute it once.
# Relies on the file-level `from __future__ import division` for true
# division.
neg_log_priors = {}
for cat in cat_num_docs:
    neg_log_priors[cat] = -log(cat_num_docs[cat] / len(trainset))

for file_name in testset:
    # Start from +infinity so that any finite score is accepted; a finite
    # sentinel could leave min_category as '' if every score exceeded it.
    minimum_neg_log_prob = float('inf')
    min_category = ''
    set_list_words = set(tokenize(file_name))

    for cat in cat_num_docs:
        neg_log_prob = neg_log_priors[cat]
        # Every vocabulary word contributes: -log(p) when it occurs in the
        # file, -log(1 - p) when it does not.
        for w in word_cat_num_doc_dict:
            p = word_cat_num_doc_dict[w][cat]
            if w in set_list_words:
                neg_log_prob -= log(p)
            else:
                neg_log_prob -= log(1 - p)
        # Keep the category with the smallest negative log-probability.
        if neg_log_prob < minimum_neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob

    li_results.append(
        (file_name, min_category, mr.categories(fileids=file_name)[0]))

CalculateAccuracy(li_results)
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import csv | ||
import os | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.feature_extraction.text import TfidfTransformer | ||
from sklearn.naive_bayes import MultinomialNB | ||
from sklearn import cross_validation | ||
from sklearn.metrics import classification_report | ||
import numpy as np | ||
from sklearn.metrics import accuracy_score | ||
|
||
REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)),'allrevs.csv') | ||
# review.csv contains two columns | ||
# first column is the review content (quoted) | ||
# second column is the assigned sentiment (positive or negative) | ||
def load_file():
    """Read the REVIEWS CSV file and return ``(data, target)``.

    The first row is discarded (presumably a header row). For each remaining
    row, column 0 is appended to ``data`` and column 1 to ``target``; rows
    with fewer than two columns or with an empty value in either column are
    skipped. An empty file yields two empty lists.

    Returns:
        A tuple ``(data, target)`` of two equally long lists of strings.
    """
    data = []
    target = []
    with open(REVIEWS) as csv_file:
        reader = csv.reader(csv_file, delimiter=",", quotechar='"')
        # next() works on Python 2.6+ and 3; the default avoids StopIteration
        # when the file has no rows at all.
        next(reader, None)
        for row in reader:
            # Skip missing data, including rows too short to index.
            if len(row) >= 2 and row[0] and row[1]:
                data.append(row[0])
                target.append(row[1])
    return data, target
|
||
# preprocess creates the term frequency matrix for the review data set | ||
def preprocess():
    """Load the reviews and return their TF-IDF matrix.

    The review texts from ``load_file()`` are vectorized with a binary
    (presence/absence) CountVectorizer using the built-in English stop-word
    list, then weighted by a TfidfTransformer with L2 normalization, IDF
    enabled, sublinear TF and smoothed IDF.

    Returns:
        The matrix produced by ``TfidfTransformer`` for all loaded reviews.
    """
    data, _ = load_file()
    # `binary` is a boolean flag; the string 'true' only worked by being
    # truthy.
    count_vectorizer = CountVectorizer(binary=True, stop_words='english')
    counts = count_vectorizer.fit_transform(data)
    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
    tfidf_data = transformer.fit_transform(counts)
    return tfidf_data
|
||
def learn_model(data, target):
    """Train MultinomialNB on a 60/40 split and report test accuracy.

    ``data`` and ``target`` are split with ``test_size=0.4`` and a fixed
    ``random_state`` of 43; the classifier is fitted on the training part
    and its predictions for the test part are passed to ``evaluate_model``.
    """
    # NOTE(review): sklearn.cross_validation was removed in scikit-learn
    # 0.20; sklearn.model_selection.train_test_split is its replacement.
    split = cross_validation.train_test_split(data, target, test_size=0.4, random_state=43)
    features_train, features_test, labels_train, labels_test = split

    model = MultinomialNB()
    model.fit(features_train, labels_train)
    evaluate_model(labels_test, model.predict(features_test))
|
||
# | ||
def evaluate_model(target_true, target_predicted):
    """Print the accuracy of the predictions as a percentage (2 decimals)."""
    accuracy = accuracy_score(target_true, target_predicted)
    print("The accuracy score is {:.2%}".format(accuracy))
|
||
|
||
# Script driver: load the labels, build the TF-IDF features, then train and
# evaluate. preprocess() reads the CSV again itself, so `data` is not passed
# on.
data, target = load_file()
tf_idf = preprocess()
learn_model(tf_idf, target)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
from __future__ import division | ||
from math import log | ||
import re | ||
from nltk.corpus import movie_reviews as mr | ||
from nltk.corpus import stopwords | ||
STOP_WORDS = set(stopwords.words('english')) | ||
SPLIT_AMOUNT = 0.6 # training amount from data | ||
# need to change calculations for stuff | ||
# https://www.dataquest.io/blog/naive-bayes-movies/ | ||
|
||
def SplitData():
    """Split the movie_reviews file ids into training and test portions.

    For every category the first ``SPLIT_AMOUNT`` fraction of its file ids
    (truncated to an int) goes to the training list and the rest to the
    test list.

    Returns a three-element list:
        [[train_ids, test_ids],
         {category: all file ids of that category},
         {category: number of training files taken from that category}]
    """
    files_by_category = dict(
        (category, mr.fileids(categories=category))
        for category in mr.categories()
    )

    training_ids, testing_ids = [], []
    train_sizes = {}
    for label, file_ids in files_by_category.items():
        cutoff = int(len(file_ids) * SPLIT_AMOUNT)
        train_sizes[label] = cutoff
        training_ids += file_ids[:cutoff]
        testing_ids += file_ids[cutoff:]

    return [[training_ids, testing_ids], files_by_category, train_sizes]
|
||
def tokenize(file_name):
    """Return the lowercased content words of one movie_reviews file.

    The raw text is split on runs of non-word characters; a piece is kept
    only if it is purely alphabetic, longer than one character, and its
    lowercase form is not in STOP_WORDS. Duplicates and order are preserved.
    """
    tokens = []
    for piece in re.split(r'\W+', mr.raw(fileids=file_name)):
        if not piece.isalpha() or len(piece) <= 1:
            continue
        lowered = piece.lower()
        if lowered not in STOP_WORDS:
            tokens.append(lowered)
    return tokens
|
||
def CalculateAccuracy(li_results):
    """Print and return the precision for one category.

    The category scored is the *predicted* category of the first result.
    Precision is (results predicted as that category whose actual category
    matches) / (all results predicted as that category).

    Args:
        li_results: non-empty sequence of tuples; index 1 is the predicted
            category and index 2 the actual category (index 0 is not read).

    Returns:
        The precision as a float (also printed as ``precision = <value>``).

    Raises:
        ValueError: if ``li_results`` is empty.
    """
    if not li_results:
        raise ValueError("li_results is empty; cannot compute precision")

    cat = li_results[0][1]
    true_positives = 0
    false_positives = 0
    for result in li_results:
        if result[1] != cat:
            # Not predicted as the scored category: irrelevant to precision.
            continue
        if result[2] == cat:
            true_positives += 1
        else:
            false_positives += 1

    # The first result is always predicted as `cat`, so the denominator is
    # at least 1. float() keeps true division even without the __future__
    # import.
    precision = true_positives / float(true_positives + false_positives)
    # Single-argument form prints the same text on Python 2 and Python 3.
    print("precision = %s" % (precision,))
    return precision
|
||
# Split the corpus: li[0] holds [training ids, test ids] and li[2] maps each
# category to the number of training documents taken from it.
li = SplitData()
(trainset, testset), cat_num_docs = li[0], li[2]

# category -> {word -> number of occurrences in that category's training files}
cat_word_dict = {}
# category -> total number of tokens in that category's training files
cat_word_count_dict = {}

# Tokenize every training file and accumulate its token counts under the
# file's first listed category.
for file_name in trainset:
    list_words = tokenize(file_name)
    cat = mr.categories(fileids=file_name)[0]

    word_counts = cat_word_dict.setdefault(cat, {})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0) + len(list_words)

    for w in list_words:
        word_counts[w] = word_counts.get(w, 0) + 1
|
||
|
||
|
||
##8) Get the vocabulary length
## Number of DISTINCT words seen across all categories. Summing the
## per-category dictionary sizes would count a word once per category it
## appears in and inflate the add-one smoothing denominator.
vocabulary = set()
for dic in cat_word_dict.values():
    vocabulary.update(dic)
vocab_length = len(vocabulary)
|
||
|
||
|
||
|
||
|
||
#### The classifier is trained; run Multinomial Naive Bayes on the test set
#### and collect (file id, predicted category, actual category) tuples.
length_train = len(trainset)
li_results = []

for file_name in testset:
    # Same text as the Python 2 statement `print "File: ", file_name`.
    print("File:  %s" % file_name)
    # Start from +infinity so that any finite score is accepted; a finite
    # sentinel could leave min_category as '' if every score exceeded it.
    minimum_neg_log_prob = float('inf')
    min_category = ''
    list_words = tokenize(file_name)

    ## Score each category: negative log prior plus the negative log of the
    ## add-one-smoothed likelihood of every token in the file.
    for cat in cat_word_count_dict:
        # Relies on the file-level `from __future__ import division`.
        neg_log_prob = -log(cat_num_docs[cat] / length_train)
        word_dict = cat_word_dict[cat]
        count_cat = cat_word_count_dict[cat]
        # The smoothing denominator is the same for every token of this
        # category.
        denominator = count_cat + vocab_length
        for w in list_words:
            count_word_train = word_dict.get(w, 0)
            neg_log_prob -= log((count_word_train + 1) / denominator)

        # Keep the category with the smallest negative log-probability.
        if neg_log_prob < minimum_neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob

    li_results.append(
        (file_name, min_category, mr.categories(fileids=file_name)[0]))

###--------------------DEBUG STATEMENTS----------------------
#for t in li_results:
#    if t[1]!=t[2]:
#        print t
###--------------------DEBUG STATEMENTS----------------------

#12) Evaluating the classifier
CalculateAccuracy(li_results)
Oops, something went wrong.