Naive Bayes stuff

BernoulliNB and MultinomialNB follow the standard algorithm (working with log probabilities to avoid floating-point underflow). ComplementMNB uses scikit-learn's Multinomial formula, modified for the paper
job13011 · Mar 29, 2016 · 6855e36 · 6855e36
1 parent 2a8d5ef
commit 6855e36
Show file tree

Hide file tree

Showing 4 changed files with 67,038 additions and 0 deletions.
diff --git a/BernoulliNB.py b/BernoulliNB.py
@@ -0,0 +1,108 @@
+from __future__ import division
+from math import log
+import re
+from nltk.corpus import movie_reviews as mr
+from nltk.corpus import stopwords
+STOP_WORDS = set(stopwords.words('english'))
+
+SPLIT_AMOUNT = 0.6          # training amount from data 
+
+def SplitData():
+    type_dict={}
+    docs_count={}
+    train_test = [[],[]]
+    for category in mr.categories():
+        type_dict[category]=mr.fileids(categories=category)
+    for cat in type_dict.keys():
+        li = type_dict[cat]
+        size=int(len(li)*SPLIT_AMOUNT)
+        docs_count[cat]=size
+        train_test[0].extend(li[:size])
+        train_test[1].extend(li[size:])
+    return [train_test,type_dict, docs_count]
+
+def tokenize(file_name): 
+    list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
+
+def CalculateAccuracy(li_results):
+    a=0
+    b=0
+    c=0
+    d=0
+    cat = li_results[0][1]
+    for t in li_results:
+        if cat==t[1]:
+            if cat==t[2]:
+                a+=1
+            else:
+                b+=1
+        else:
+            if cat==t[2]:
+                c+=1
+            else:
+                d+=1
+    precision = a/(a+b)
+    # recall = a/(a+c)
+    # print "The following parameters are recorded for the category " , cat
+    print "precision =", precision
+
+# li = Preprocessor.get_testset_trainset(corpus)
+li = SplitData()
+testset = li[0][1]
+trainset = li[0][0]
+# li = Preprocessor.startup()
+cat_num_docs = li[2]
+
+#3)Create a dictionary with a word as the key and a dictionary as the value
+     ## in the dictionary the category as key and number of documents in that category where it occurs as value
+# 2d dict: word -> {pos ...}, {neg ...}
+word_cat_num_doc_dict={}
+
+#4)Loop through the reuters dataset, to get the entire text from  each file in the training set
+    ## Parse the string to get individual words - done by get_list_tokens_nltk()
+for file_name in trainset:
+    list_words = tokenize(file_name)
+    cat = mr.categories(fileids = file_name)[0]
+
+    for w in set(list_words):
+       word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
+       word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
+       word_cat_num_doc_dict[w][cat]+=1
+
+for w in word_cat_num_doc_dict:
+    for cat in cat_num_docs:
+        nct = word_cat_num_doc_dict[w].get(cat,0)
+        # convert #times a word appears into #times+1/#cat_reviews+2
+        ratio = (nct+1)/(cat_num_docs[cat]+2)
+        word_cat_num_doc_dict[w][cat]=ratio
+
+print "The Classifier is trained and it took"
+
+
+li_results=[]
+#5) Like in the training set,Loop through the test set, to get the individual words
+for file_name in testset:
+    minimum_neg_log_prob=1000000000
+    min_category=''
+    set_list_words = set(tokenize(file_name))
+
+##6) Get the probability for each category,
+    #using the cat_num_docs dictionary to wade through the categories
+    for cat in  cat_num_docs:
+        neg_log_prob=-log(cat_num_docs[cat]/len(trainset))
+        for w in word_cat_num_doc_dict:
+            if w in set_list_words:
+                neg_log_prob-=log(word_cat_num_doc_dict[w][cat])
+            else:
+                neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat])
+        if minimum_neg_log_prob>neg_log_prob:
+            min_category=cat
+            minimum_neg_log_prob=neg_log_prob
+
+    li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
+
+CalculateAccuracy(li_results)
+
+
+
diff --git a/ComplementMNB.py b/ComplementMNB.py
@@ -0,0 +1,58 @@
+import csv
+import os
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn import cross_validation
+from sklearn.metrics import classification_report
+import numpy as np
+from sklearn.metrics import accuracy_score
+
+REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)),'allrevs.csv')
+# allrevs.csv contains two columns:
+# the first column is the review content (quoted)
+# the second column is the assigned sentiment (positive or negative)
+def load_file():
+    with open(REVIEWS) as csv_file:
+        reader = csv.reader(csv_file,delimiter=",",quotechar='"')
+        reader.next()
+        data =[]
+        target = []
+        for row in reader:
+            # skip missing data
+            if row[0] and row[1]:
+                data.append(row[0])
+                target.append(row[1])
+
+        return data,target
+
+# preprocess creates the term frequency matrix for the review data set
+def preprocess():
+    data,target = load_file()
+    count_vectorizer = CountVectorizer(binary='true', stop_words='english')
+    data = count_vectorizer.fit_transform(data)
+    # tfidf_data = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True).fit_transform(data)
+    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
+    transformer.fit(data)
+    tfidf_data = transformer.transform(data)
+
+    return tfidf_data
+
+def learn_model(data,target):
+    # preparing data for split validation. 60% training, 40% test
+    data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.4,random_state=43)
+    classifier = MultinomialNB().fit(data_train,target_train)
+    predicted = classifier.predict(data_test)
+    evaluate_model(target_test,predicted)
+
+# 
+def evaluate_model(target_true,target_predicted):
+    # print classification_report(target_true,target_predicted)
+    print "The accuracy score is {:.2%}".format(accuracy_score(target_true,target_predicted))
+
+
+# Script entry point: load the labels, build the TF-IDF features, then train
+# and evaluate the classifier.
+# NOTE(review): preprocess() calls load_file() again on its own, so `data`
+# read here is not used below; only `target` is.
+data,target = load_file()
+tf_idf = preprocess()
+learn_model(tf_idf,target)
+
+
diff --git a/MultinomialNB.py b/MultinomialNB.py
@@ -0,0 +1,152 @@
+from __future__ import division
+from math import log
+import re
+from nltk.corpus import movie_reviews as mr
+from nltk.corpus import stopwords
+STOP_WORDS = set(stopwords.words('english'))
+SPLIT_AMOUNT = 0.6          # training amount from data 
+# need to change calculations for stuff
+# https://www.dataquest.io/blog/naive-bayes-movies/
+
+def SplitData():
+    type_dict={}
+    docs_count={}
+    train_test = [[],[]]
+    for category in mr.categories():
+        type_dict[category]=mr.fileids(categories=category)
+    for cat in type_dict.keys():
+        li = type_dict[cat]
+        size=int(len(li)*SPLIT_AMOUNT)
+        docs_count[cat]=size
+        train_test[0].extend(li[:size])
+        train_test[1].extend(li[size:])
+    return [train_test,type_dict, docs_count]
+
+def tokenize(file_name): 
+    list_words = re.split(r'\W+',mr.raw(fileids=file_name))
+    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]
+
+def CalculateAccuracy(li_results):
+    a=0
+    b=0
+    c=0
+    d=0
+    cat = li_results[0][1]
+    for t in li_results:
+        if cat==t[1]:
+            if cat==t[2]:
+                a+=1
+            else:
+                b+=1
+        else:
+            if cat==t[2]:
+                c+=1
+            else:
+                d+=1
+    precision = a/(a+b)
+    # recall = a/(a+c)
+    # print "The following parameters are recorded for the category " , cat
+    print "precision =", precision
+
+# li = Preprocessor.get_testset_trainset(corpus)
+li = SplitData()
+testset = li[0][1]
+trainset = li[0][0]
+# li = Preprocessor.startup()
+cat_num_docs = li[2]
+
+
+
+##4)Create a) a dictionary with a category as the key and dictionary of words-occurences as values
+          #b) a dictionary with a category as the key and the number of words in it as the value
+# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
+cat_word_dict={}
+# {pos-> 4000 words} {neg-> 7000 words}
+cat_word_count_dict={}
+#val = my_dict.get(key, mydefaultval)
+
+##5)Loop through the training set, to get the entire text from  each file
+##6) Parse the string to get individual words
+for file_name in trainset:
+    list_words = tokenize(file_name)
+
+
+##7) Check if category exists in dictionary, if not, create an empty dictionary,
+    #and put word count as zero
+    #and then insert words into the category's dictionary in both cases and update the word count
+    cat = mr.categories(fileids = file_name)[0]
+    cat_word_dict[cat] = cat_word_dict.get(cat,{})
+    cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)
+
+# add number of words to total word count for cat
+    cat_word_count_dict[cat]+=len(list_words)   
+# start count for number of occurences for each word    
+    for w in list_words:
+        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
+        cat_word_dict[cat][w]+=1
+
+
+
+##8) Get the vocabulary length
+## number of words, total across categories
+vocab_length=0            
+for dic in cat_word_dict.values():
+     vocab_length+=len(dic)
+
+
+
+
+
+####Congratulations! the Classifier is trained, now it is time to run the Multinomial Naive Bayes Classifier on the test dataset
+length_train = len(trainset)
+li_results=[]
+#9) Like in the training set,Loop through the test set, to get the entire text from  each file
+##10) Similar step, parse the string to get individual words
+for file_name in testset:
+    print "File: ", file_name
+    minimum_neg_log_prob=1000000000
+    # minimum_neg_log_prob = 0      # NEW
+    min_category=''
+    list_words = tokenize(file_name)
+
+
+
+##11) Get the probability for each category,
+    #can use any of the created dictionaries to wade through the categories
+    for cat in  cat_word_count_dict:
+        # print cat , cat_num_docs[cat]/len(trainset)
+        # print "________________________________________________________________"
+        # print "________________________________________________________________"
+        # print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
+        neg_log_prob=-log(cat_num_docs[cat]/length_train)
+        # neg_log_prob = cat_num_docs[cat]/length_train
+        word_dict = cat_word_dict[cat]
+        count_cat = cat_word_count_dict[cat]
+        for w in list_words:
+            count_word_train=word_dict.get(w,0)
+            ratio = (count_word_train+1)/(count_cat+vocab_length)
+            neg_log_prob-=log(ratio)
+
+            # neg_log_prob *= ratio
+            # print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
+            # break
+        # print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob                 
+        if minimum_neg_log_prob>neg_log_prob:
+        # if minimum_neg_log_prob<neg_log_prob:
+            min_category=cat
+            minimum_neg_log_prob=neg_log_prob
+    # print "Min cat: ", min_category
+    li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
+    # break
+
+###--------------------DEBUG STATEMENTS----------------------
+#for t in li_results:
+ #   if t[1]!=t[2]:
+  #      print t
+###--------------------DEBUG STATEMENTS----------------------
+
+###--------------------DEBUG STATEMENTS----------------------
+
+#12) Evaluating the classifier
+
+CalculateAccuracy(li_results)