diff --git a/BernoulliNB.py b/BNB.py
similarity index 69%
rename from BernoulliNB.py
rename to BNB.py
index 775d86c..31b39f4 100644
--- a/BernoulliNB.py
+++ b/BNB.py
@@ -1,5 +1,7 @@
 from __future__ import division
 from math import log
+import random
+import csv
 import re
 from nltk.corpus import movie_reviews as mr
 from nltk.corpus import stopwords
@@ -7,24 +9,52 @@ STOP_WORDS = set(stopwords.words('english'))
 
 SPLIT_AMOUNT = 0.6 # training amount from data
 
+AMAZON = 1
+REVIEW_POL={}
+DEFINED_SIZE = 1
+DEFINED_SIZES = {'pos': 948, 'neg': 948}
+
 def SplitData():
     type_dict={}
     docs_count={}
     train_test = [[],[]]
-    for category in mr.categories():
-        type_dict[category]=mr.fileids(categories=category)
+    offset_sample = random.randint(-400,400)
+    print "offset_sample", offset_sample
+    if AMAZON:
+        offset_sample = random.randint(-600,600)
+        for category in ['pos', 'neg']:
+            type_dict[category]=[]
+        with open('amazon_revs.csv', 'rb') as csvfile:
+            rev_read = csv.reader(csvfile)
+            for row in rev_read:
+                type_dict[row[1]].append(row[0])
+                REVIEW_POL[row[0]] = row[1]
+    else:
+        for category in mr.categories():
+            type_dict[category]=mr.fileids(categories=category)
     for cat in type_dict.keys():
         li = type_dict[cat]
-        size=int(len(li)*SPLIT_AMOUNT)
+        random.shuffle(li)
+        size=int(len(li)*SPLIT_AMOUNT) + offset_sample
+        if DEFINED_SIZE:
+            size = DEFINED_SIZES[cat]
+        print "Category: ", cat, "Size:", size
+        offset_sample *= -1
         docs_count[cat]=size
         train_test[0].extend(li[:size])
         train_test[1].extend(li[size:])
     return [train_test,type_dict, docs_count]
-
+
 def tokenize(file_name):
-    list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+    list_words = ()
+    if AMAZON:
+        list_words = re.split(r'\W+',file_name)
+    else:
+        list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+
     return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
+
 def CalculateAccuracy(li_results):
     a=0
     b=0
@@ -63,8 +93,11 @@ word_cat_num_doc_dict={}
 ## Parse the string to get individual words - done by get_list_tokens_nltk()
 for file_name in trainset:
     list_words = tokenize(file_name)
-    cat = mr.categories(fileids = file_name)[0]
-
+    cat = ''
+    if AMAZON:
+        cat = REVIEW_POL[file_name]
+    else:
+        cat = mr.categories(fileids = file_name)[0]
     for w in set(list_words):
         word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
         word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
@@ -100,8 +133,11 @@ for file_name in testset:
             min_category=cat
             minimum_neg_log_prob=neg_log_prob
 
-    li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
-
+    if AMAZON:
+        li_results.append((file_name,min_category,REVIEW_POL[file_name]))
+    else:
+        li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
+
     # break
 
 CalculateAccuracy(li_results)
diff --git a/ComplementMNB.py b/ComplementMNB.py
deleted file mode 100644
index b646acc..0000000
--- a/ComplementMNB.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import csv
-import os
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.feature_extraction.text import TfidfTransformer
-from sklearn.naive_bayes import MultinomialNB
-from sklearn import cross_validation
-from sklearn.metrics import classification_report
-import numpy as np
-from sklearn.metrics import accuracy_score
-
-REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)),'allrevs.csv')
-# review.csv contains two columns
-# first column is the review content (quoted)
-# second column is the assigned sentiment (positive or negative)
-def load_file():
-    with open(REVIEWS) as csv_file:
-        reader = csv.reader(csv_file,delimiter=",",quotechar='"')
-        reader.next()
-        data =[]
-        target = []
-        for row in reader:
-            # skip missing data
-            if row[0] and row[1]:
-                data.append(row[0])
-                target.append(row[1])
-
-        return data,target
-
-# preprocess creates the term frequency matrix for the review data set
-def preprocess():
-    data,target = load_file()
-    count_vectorizer = CountVectorizer(binary='true', stop_words='english')
-    data = count_vectorizer.fit_transform(data)
-    # tfidf_data = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True).fit_transform(data)
-    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
-    transformer.fit(data)
-    tfidf_data = transformer.transform(data)
-
-    return tfidf_data
-
-def learn_model(data,target):
-    # preparing data for split validation. 60% training, 40% test
-    data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.4,random_state=43)
-    classifier = MultinomialNB().fit(data_train,target_train)
-    predicted = classifier.predict(data_test)
-    evaluate_model(target_test,predicted)
-
-#
-def evaluate_model(target_true,target_predicted):
-    # print classification_report(target_true,target_predicted)
-    print "The accuracy score is {:.2%}".format(accuracy_score(target_true,target_predicted))
-
-
-data,target = load_file()
-tf_idf = preprocess()
-learn_model(tf_idf,target)
-
-
diff --git a/MultinomialNB.py b/MNB.py
similarity index 67%
rename from MultinomialNB.py
rename to MNB.py
index 9c56429..948b774 100644
--- a/MultinomialNB.py
+++ b/MNB.py
@@ -1,29 +1,56 @@
 from __future__ import division
 from math import log
 import re
+import random
+import csv
 from nltk.corpus import movie_reviews as mr
 from nltk.corpus import stopwords
 
 STOP_WORDS = set(stopwords.words('english'))
 SPLIT_AMOUNT = 0.6 # training amount from data
-# need to change calculations for stuff
-# https://www.dataquest.io/blog/naive-bayes-movies/
+
+AMAZON = 1
+REVIEW_POL={}
+DEFINED_SIZE = 1
+DEFINED_SIZES = {'pos': 948, 'neg': 948}
 
 def SplitData():
     type_dict={}
     docs_count={}
     train_test = [[],[]]
-    for category in mr.categories():
-        type_dict[category]=mr.fileids(categories=category)
+    offset_sample = random.randint(-400,400)
+    print "offset_sample", offset_sample
+    if AMAZON:
+        offset_sample = random.randint(-600,600)
+        for category in ['pos', 'neg']:
+            type_dict[category]=[]
+        with open('amazon_revs.csv', 'rb') as csvfile:
+            rev_read = csv.reader(csvfile)
+            for row in rev_read:
+                type_dict[row[1]].append(row[0])
+                REVIEW_POL[row[0]] = row[1]
+    else:
+        for category in mr.categories():
+            type_dict[category]=mr.fileids(categories=category)
     for cat in type_dict.keys():
         li = type_dict[cat]
-        size=int(len(li)*SPLIT_AMOUNT)
+        random.shuffle(li)
+        size=int(len(li)*SPLIT_AMOUNT) + offset_sample
+        if DEFINED_SIZE:
+            size = DEFINED_SIZES[cat]
+        print "Category: ", cat, "Size:", size
+        offset_sample *= -1
         docs_count[cat]=size
         train_test[0].extend(li[:size])
         train_test[1].extend(li[size:])
     return [train_test,type_dict, docs_count]
-
+
 def tokenize(file_name):
-    list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+    list_words = ()
+    if AMAZON:
+        list_words = re.split(r'\W+',file_name)
+    else:
+        list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+
     return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
 
 def CalculateAccuracy(li_results):
@@ -59,7 +86,8 @@ cat_num_docs = li[2]
 
 ##4)Create a) a dictionary with a category as the key and dictionary of words-occurences as values
     #b) a dictionary with a category as the key and the number of words in it as the value
-# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
+# {pos-> {w1 = 17 times}, {w2 = 32 times}...}
+# {neg-> ....}
 cat_word_dict={}
 # {pos-> 4000 words} {neg-> 7000 words}
 cat_word_count_dict={}
@@ -74,7 +102,11 @@ for file_name in trainset:
 
 ##7) Check if category exists in dictionary, if not, create an empty dictionary,
     #and put word count as zero
    #and then insert words into the category's dictionary in both cases and update the word count
-    cat = mr.categories(fileids = file_name)[0]
+    cat = ''
+    if AMAZON:
+        cat = REVIEW_POL[file_name]
+    else:
+        cat = mr.categories(fileids = file_name)[0]
     cat_word_dict[cat] = cat_word_dict.get(cat,{})
     cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)
@@ -103,8 +135,9 @@ li_results=[]
 #9) Like in the training set,Loop through the test set, to get the entire text from each file
 ##10) Similar step, parse the string to get individual words
 for file_name in testset:
-    print "File: ", file_name
-    minimum_neg_log_prob=1000000000
+    # print "File: ", file_name
+    # minimum_neg_log_prob=1000000000
+    minimum_neg_log_prob=-1000000000
     # minimum_neg_log_prob = 0 # NEW
     min_category=''
     list_words = tokenize(file_name)
@@ -118,25 +151,31 @@ for file_name in testset:
         # print cat , cat_num_docs[cat]/len(trainset)
         # print "________________________________________________________________"
         # print "________________________________________________________________"
         # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
-        neg_log_prob=-log(cat_num_docs[cat]/length_train)
+        # neg_log_prob=-log(cat_num_docs[cat]/length_train) # P(class)
+        neg_log_prob= log(cat_num_docs[cat]/length_train) # P(class)
         # neg_log_prob = cat_num_docs[cat]/length_train
-        word_dict = cat_word_dict[cat]
-        count_cat = cat_word_count_dict[cat]
+        word_dict = cat_word_dict[cat] # word counts for each word in class
+        count_cat = cat_word_count_dict[cat] # total words in class
         for w in list_words:
             count_word_train=word_dict.get(w,0)
             ratio = (count_word_train+1)/(count_cat+vocab_length)
-            neg_log_prob-=log(ratio)
+            # neg_log_prob-=log(ratio)
+            neg_log_prob+=log(ratio)
             # neg_log_prob *= ratio
             # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
         # break
-        # print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
-        if minimum_neg_log_prob>neg_log_prob:
-        # if minimum_neg_log_prob<neg_log_prob:
+        if minimum_neg_log_prob<neg_log_prob:
+    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
+
+
+def CalculateAccuracy(li_results):
+    a=0
+    b=0
+    c=0
+    d=0
+    cat = li_results[0][1]
+    for t in li_results:
+        if cat==t[1]:
+            if cat==t[2]:
+                a+=1
+            else:
+                b+=1
+        else:
+            if cat==t[2]:
+                c+=1
+            else:
+                d+=1
+    precision = a/(a+b)
+    # recall = a/(a+c)
+    # print "The following parameters are recorded for the category " , cat
+    print "precision =", precision
+
+# li = Preprocessor.get_testset_trainset(corpus)
+li = SplitData()
+# exit()
+testset = li[0][1]
+trainset = li[0][0]
+# li = Preprocessor.startup()
+cat_num_docs = li[2]
+
+
+
+##4)Create a) a dictionary with a category as the key and dictionary of words-occurrences as values
+    #b) a dictionary with a category as the key and the number of words in it as the value
+# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
+cat_word_dict={}
+# {pos-> 4000 words} {neg-> 7000 words}
+cat_word_count_dict={}
+#val = my_dict.get(key, mydefaultval)
+complete_training_docs_tokens = []
+
+##5)Loop through the training set, to get the entire text from each file
+##6) Parse the string to get individual words
+for file_name in trainset:
+    list_words = tokenize(file_name)
+    complete_training_docs_tokens.append(list_words)
+
+
+##7) Check if category exists in dictionary, if not, create an empty dictionary,
+    #and put word count as zero
+    #and then insert words into the category's dictionary in both cases and update the word count
+    cat = ''
+    if AMAZON:
+        cat = REVIEW_POL[file_name]
+    else:
+        cat = mr.categories(fileids = file_name)[0]
+    cat_word_dict[cat] = cat_word_dict.get(cat,{})
+    cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)
+
+# add number of words to total word count for cat
+    cat_word_count_dict[cat]+=len(list_words)
+# start count for number of occurences for each word
+    for w in list_words:
+        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
+        cat_word_dict[cat][w]+=1
+
+
+
+##8) Get the vocabulary length
+## number of words, total across categories
+vocab_length=0
+num_docs_word_in = {}
+for dic in cat_word_dict.values():
+    vocab_length+=len(dic)
+    if USE_IDF:
+        for uniq_word in dic.keys():
+            num_docs_word_in[uniq_word] = num_docs_word_in.get(uniq_word, 1)
+            num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr)
+
+
+
+####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset
+length_train = len(trainset)
+li_results=[]
+#9) Like in the training set,Loop through the test set, to get the entire text from each file
+##10) Similar step, parse the string to get individual words
+for file_name in testset:
+    # print "File: ", file_name
+    minimum_neg_log_prob=1000000000
+    # minimum_neg_log_prob = 0 # NEW
+    min_category=''
+    list_words = tokenize(file_name)
+
+
+
+##11) Get the probability for each category,
+    #can use any of the created dictionaries to wade through the categories
+    for cat in cat_word_count_dict:
+        # print cat , cat_num_docs[cat]/len(trainset)
+        # print "________________________________________________________________"
+        # print "________________________________________________________________"
+        # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
+        # neg_log_prob=-log(cat_num_docs[cat]/length_train)
+        inv_cat = 'pos'
+        if cat == 'pos':
+            inv_cat = 'neg'
+
+
+        neg_log_prob=log(cat_num_docs[cat]/length_train)
+
+        # neg_log_prob = cat_num_docs[cat]/length_train
+        word_dict = cat_word_dict[inv_cat]
+        count_cat = cat_word_count_dict[inv_cat]
+
+        my_word_count = {}
+        for aw in list_words:
+            my_word_count[aw] = my_word_count.get(aw, 0)
+            my_word_count[aw]+=1
+
+        length_norm = 0
+        weight_normalizing_ratio = 0
+        for kw in my_word_count.keys():
+            count_word_train=word_dict.get(kw,0)
+            ratio = (count_word_train+1)/(count_cat+vocab_length)
+            ## weight norm
+            weight_normalizing_ratio+=log(ratio)
+            ## TF
+            my_word_count[kw] = log(my_word_count[kw]+1)
+            ## length norm
+            length_norm += (my_word_count[kw]**(2))
+
+        length_norm = length_norm**(0.5)
+        # print "WNR: ", weight_normalizing_ratio
+
+        for w in my_word_count.keys():
+            count_word_train=word_dict.get(w,0)
+            ratio = (count_word_train+1)/(count_cat+vocab_length) #Nw,c+1/Nc+|V| = theta_c
+            # neg_log_prob-=log(ratio)
+            word_freq = my_word_count[w]
+            if USE_IDF:
+                word_freq = word_freq*log(length_train/num_docs_word_in.get(w,1)) #IDF
+            word_freq = word_freq/length_norm # length normalization
+
+            # neg_log_prob += word_freq*log(ratio) #switch to
+            ratio = log(ratio) # weight factor log(theta_c) = weight_c,w
+            ratio = ratio/weight_normalizing_ratio # weight normalization
+            neg_log_prob += word_freq*ratio # class probability
+
+
+            # neg_log_prob *= ratio
+            # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
+            # break
+        # print "NLP: ", neg_log_prob
+        # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
+        if minimum_neg_log_prob>neg_log_prob:
+            # if minimum_neg_log_prob
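
Note: the per-class score that the new test loop above accumulates (log prior, plus TF-transformed, optionally IDF-weighted, length-normalized term counts multiplied by weight-normalized complement-class log estimates) can be condensed into a single function for readability. The sketch below is illustrative only and is not part of the commit; the parameter names mirror the dictionaries built in the diff (cat_word_dict, cat_word_count_dict, cat_num_docs, vocab_length, num_docs_word_in), and the helper itself (score_document) is hypothetical.

    from __future__ import division
    from math import log

    def score_document(tokens, cat, inv_cat, cat_word_dict, cat_word_count_dict,
                       cat_num_docs, vocab_length, length_train,
                       num_docs_word_in=None):
        # Assumes tokens is non-empty. Complement-class statistics, as in the loop
        # (word_dict and count_cat are taken from inv_cat, the opposite label).
        word_dict = cat_word_dict[inv_cat]
        count_cat = cat_word_count_dict[inv_cat]

        # raw term frequencies for this document
        tf = {}
        for w in tokens:
            tf[w] = tf.get(w, 0) + 1

        # first pass: weight-normalization denominator, TF transform, length norm
        length_norm = 0.0
        weight_normalizing_ratio = 0.0
        for w in tf:
            ratio = (word_dict.get(w, 0) + 1) / (count_cat + vocab_length)
            weight_normalizing_ratio += log(ratio)
            tf[w] = log(tf[w] + 1)
            length_norm += tf[w] ** 2
        length_norm = length_norm ** 0.5

        # second pass: accumulate the class score
        score = log(cat_num_docs[cat] / length_train)       # log P(class)
        for w in tf:
            ratio = (word_dict.get(w, 0) + 1) / (count_cat + vocab_length)
            word_freq = tf[w]
            if num_docs_word_in is not None:                 # optional IDF, as under USE_IDF
                word_freq *= log(length_train / num_docs_word_in.get(w, 1))
            word_freq /= length_norm                         # length normalization
            score += word_freq * (log(ratio) / weight_normalizing_ratio)
        return score

The loop computes this once per candidate class ('pos' and 'neg', with inv_cat the opposite label) and, as the final comparison lines suggest, keeps the class with the smaller score.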