diff --git a/BernoulliNB.py b/BernoulliNB.py
new file mode 100644
index 0000000..775d86c
--- /dev/null
+++ b/BernoulliNB.py
@@ -0,0 +1,108 @@
+from __future__ import division
+from math import log
+import re
+from nltk.corpus import movie_reviews as mr
+from nltk.corpus import stopwords
+STOP_WORDS = set(stopwords.words('english'))
+
+SPLIT_AMOUNT = 0.6  # fraction of the data used for training
+
+def SplitData():
+    # split each category's file ids into training and test portions
+    type_dict = {}
+    docs_count = {}
+    train_test = [[], []]
+    for category in mr.categories():
+        type_dict[category] = mr.fileids(categories=category)
+    for cat in type_dict.keys():
+        li = type_dict[cat]
+        size = int(len(li) * SPLIT_AMOUNT)
+        docs_count[cat] = size
+        train_test[0].extend(li[:size])
+        train_test[1].extend(li[size:])
+    return [train_test, type_dict, docs_count]
+
+def tokenize(file_name):
+    # lowercase alphabetic tokens longer than one character, with stop words removed
+    list_words = re.split(r'\W+', mr.raw(fileids=file_name))
+    return [w.lower() for w in list_words if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
+
+def CalculateAccuracy(li_results):
+    # a, b, c, d form a 2x2 confusion matrix relative to the first predicted category
+    a = 0
+    b = 0
+    c = 0
+    d = 0
+    cat = li_results[0][1]
+    for t in li_results:
+        if cat == t[1]:
+            if cat == t[2]:
+                a += 1
+            else:
+                b += 1
+        else:
+            if cat == t[2]:
+                c += 1
+            else:
+                d += 1
+    precision = a / (a + b)
+    # recall = a/(a+c)
+    # print "The following parameters are recorded for the category " , cat
+    print "precision =", precision
+
+# li = Preprocessor.get_testset_trainset(corpus)
+li = SplitData()
+testset = li[0][1]
+trainset = li[0][0]
+# li = Preprocessor.startup()
+cat_num_docs = li[2]
+
+# 3) Create a dictionary with a word as the key and a dictionary as the value;
+#    the inner dictionary maps a category to the number of training documents
+#    of that category in which the word occurs.
+# 2d dict: word -> {pos: ..., neg: ...}
+word_cat_num_doc_dict = {}
+
+# 4) Loop through the movie reviews training set to get the entire text of each file,
+#    then parse the string into individual words with tokenize().
+for file_name in trainset:
+    list_words = tokenize(file_name)
+    cat = mr.categories(fileids=file_name)[0]
+
+    for w in set(list_words):
+        word_cat_num_doc_dict[w] = word_cat_num_doc_dict.get(w, {})
+        word_cat_num_doc_dict[w][cat] = word_cat_num_doc_dict[w].get(cat, 0)
+        word_cat_num_doc_dict[w][cat] += 1
+
+for w in word_cat_num_doc_dict:
+    for cat in cat_num_docs:
+        nct = word_cat_num_doc_dict[w].get(cat, 0)
+        # Laplace smoothing: P(w|cat) = (doc count + 1) / (category doc count + 2)
+        ratio = (nct + 1) / (cat_num_docs[cat] + 2)
+        word_cat_num_doc_dict[w][cat] = ratio
+
+print "The classifier is trained"
+
+li_results = []
+# 5) As in the training set, loop through the test set to get the individual words.
+for file_name in testset:
+    minimum_neg_log_prob = 1000000000
+    min_category = ''
+    set_list_words = set(tokenize(file_name))
+
+    # 6) Get the negative log probability for each category, using the
+    #    cat_num_docs dictionary to iterate over the categories.
+    for cat in cat_num_docs:
+        neg_log_prob = -log(cat_num_docs[cat] / len(trainset))
+        for w in word_cat_num_doc_dict:
+            # Bernoulli model: absent words contribute log(1 - P(w|cat))
+            if w in set_list_words:
+                neg_log_prob -= log(word_cat_num_doc_dict[w][cat])
+            else:
+                neg_log_prob -= log(1 - word_cat_num_doc_dict[w][cat])
+        if minimum_neg_log_prob > neg_log_prob:
+            min_category = cat
+            minimum_neg_log_prob = neg_log_prob
+
+    li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))
+
+CalculateAccuracy(li_results)
diff --git a/ComplementMNB.py b/ComplementMNB.py
new file mode 100644
index 0000000..b646acc
--- /dev/null
+++ b/ComplementMNB.py
@@ -0,0 +1,58 @@
+import csv
+import os
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn import cross_validation
+from sklearn.metrics import classification_report
+import numpy as np
+from sklearn.metrics import accuracy_score
+
+REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'allrevs.csv')
+# allrevs.csv contains two columns:
+# the first column is the review content (quoted),
+# the second column is the assigned sentiment (positive or negative)
+def load_file():
+    with open(REVIEWS) as csv_file:
+        reader = csv.reader(csv_file, delimiter=",", quotechar='"')
+        reader.next()  # skip the header row
+        data = []
+        target = []
+        for row in reader:
+            # skip rows with missing data
+            if row[0] and row[1]:
+                data.append(row[0])
+                target.append(row[1])
+
+        return data, target
+
+# preprocess creates the tf-idf matrix for the review data set
+def preprocess():
+    data, target = load_file()
+    count_vectorizer = CountVectorizer(binary=True, stop_words='english')
+    data = count_vectorizer.fit_transform(data)
+    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
+    transformer.fit(data)
+    tfidf_data = transformer.transform(data)
+
+    return tfidf_data
+
+def learn_model(data, target):
+    # preparing data for split validation: 60% training, 40% test
+    data_train, data_test, target_train, target_test = cross_validation.train_test_split(data, target, test_size=0.4, random_state=43)
+    classifier = MultinomialNB().fit(data_train, target_train)
+    predicted = classifier.predict(data_test)
+    evaluate_model(target_test, predicted)
+
+def evaluate_model(target_true, target_predicted):
+    # print classification_report(target_true,target_predicted)
+    print "The accuracy score is {:.2%}".format(accuracy_score(target_true, target_predicted))
+
+
+data, target = load_file()
+tf_idf = preprocess()
+learn_model(tf_idf, target)
diff --git a/MultinomialNB.py b/MultinomialNB.py
new file mode 100644
index 0000000..9c56429
--- /dev/null
+++ b/MultinomialNB.py
@@ -0,0 +1,152 @@
+from __future__ import division
+from math import log
+import re
+from nltk.corpus import movie_reviews as mr
+from nltk.corpus import stopwords
+STOP_WORDS = set(stopwords.words('english'))
+SPLIT_AMOUNT = 0.6  # fraction of the data used for training
+# TODO: revisit the probability calculations
+# reference: https://www.dataquest.io/blog/naive-bayes-movies/
+
+def SplitData():
+    # split each category's file ids into training and test portions
+    type_dict = {}
+    docs_count = {}
+    train_test = [[], []]
+    for category in mr.categories():
+        type_dict[category] = mr.fileids(categories=category)
+    for cat in type_dict.keys():
+        li = type_dict[cat]
+        size = int(len(li) * SPLIT_AMOUNT)
+        docs_count[cat] = size
+        train_test[0].extend(li[:size])
+        train_test[1].extend(li[size:])
+    return [train_test, type_dict, docs_count]
+
+def tokenize(file_name):
+    # lowercase alphabetic tokens longer than one character, with stop words removed
+    list_words = re.split(r'\W+', mr.raw(fileids=file_name))
+    return [w.lower() for w in list_words if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
+
+def CalculateAccuracy(li_results):
+    # a, b, c, d form a 2x2 confusion matrix relative to the first predicted category
+    a = 0
+    b = 0
+    c = 0
+    d = 0
+    cat = li_results[0][1]
+    for t in li_results:
+        if cat == t[1]:
+            if cat == t[2]:
+                a += 1
+            else:
+                b += 1
+        else:
+            if cat == t[2]:
+                c += 1
+            else:
+                d += 1
+    precision = a / (a + b)
+    # recall = a/(a+c)
+    # print "The following parameters are recorded for the category " , cat
+    print "precision =", precision
+
+# li = Preprocessor.get_testset_trainset(corpus)
+li = SplitData()
+testset = li[0][1]
+trainset = li[0][0]
+# li = Preprocessor.startup()
+cat_num_docs = li[2]
+
+
+# 4) Create:
+#    a) a dictionary with a category as the key and a dictionary of word occurrences as the value
+#    b) a dictionary with a category as the key and the number of words in it as the value
+# {pos -> {w1: 17, w2: 32, ...}}, {neg -> ...}
+cat_word_dict = {}
+# {pos -> 4000 words}, {neg -> 7000 words}
+cat_word_count_dict = {}
+# val = my_dict.get(key, mydefaultval)
+
+# 5) Loop through the training set to get the entire text from each file.
+# 6) Parse the string to get individual words.
+for file_name in trainset:
+    list_words = tokenize(file_name)
+
+    # 7) If the category is not yet in the dictionaries, create an empty dictionary
+    #    and a zero word count for it; then insert the words into the category's
+    #    dictionary and update the word count.
+    cat = mr.categories(fileids=file_name)[0]
+    cat_word_dict[cat] = cat_word_dict.get(cat, {})
+    cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)
+
+    # add the number of words in this document to the category's total word count
+    cat_word_count_dict[cat] += len(list_words)
+    # count the number of occurrences of each word
+    for w in list_words:
+        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
+        cat_word_dict[cat][w] += 1
+
+# 8) Get the vocabulary length: the number of distinct words, totalled across categories.
+vocab_length = 0
+for dic in cat_word_dict.values():
+    vocab_length += len(dic)
+
+#### The classifier is trained; now run the Multinomial Naive Bayes classifier on the test set.
+length_train = len(trainset)
+li_results = []
+# 9) As in the training set, loop through the test set to get the entire text from each file.
+# 10) Similarly, parse the string to get individual words.
+for file_name in testset:
+    print "File: ", file_name
+    minimum_neg_log_prob = 1000000000
+    # minimum_neg_log_prob = 0 # NEW
+    min_category = ''
+    list_words = tokenize(file_name)
+
+    # 11) Get the negative log probability for each category;
+    #     any of the created dictionaries can be used to iterate over the categories.
+    for cat in cat_word_count_dict:
+        # print cat , cat_num_docs[cat]/len(trainset)
+        # print "________________________________________________________________"
+        # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
+        neg_log_prob = -log(cat_num_docs[cat] / length_train)
+        # neg_log_prob = cat_num_docs[cat]/length_train
+        word_dict = cat_word_dict[cat]
+        count_cat = cat_word_count_dict[cat]
+        for w in list_words:
+            count_word_train = word_dict.get(w, 0)
+            # Laplace-smoothed multinomial likelihood: (count + 1) / (category word count + vocabulary size)
+            ratio = (count_word_train + 1) / (count_cat + vocab_length)
+            neg_log_prob -= log(ratio)
+            # neg_log_prob *= ratio
+            # print w, "Ratio found:", ratio, "new_neg_log:", neg_log_prob
+            # break
+        # print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
+        if minimum_neg_log_prob > neg_log_prob:
+        # if minimum_neg_log_prob