Commit
functions
i did a thing - but i did it at 2 am so this might be bad
Showing 4 changed files with 1,089 additions and 0 deletions.
@@ -0,0 +1,254 @@
######################
# This version is CWMNB only
# (CWMNB here presumably stands for Complement Weighted Multinomial Naive
# Bayes -- see the COMPLEMENT and WEIGHTED switches below)
######################

from __future__ import division
from math import log
import re
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
import random

STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6  # training amount from data

COMPLEMENT = 0     # 1 = score each class with the *opposite* class's counts (Complement NB)
WEIGHTED = 0       # 1 = normalize each class's per-word log weights (weight normalization)
USE_IDF = 0        # 1 = scale term frequencies by inverse document frequency
AMAZON = 0         # 1 = read reviews from amazon_revs.csv instead of nltk's movie_reviews
REVIEW_POL = {}    # review text -> polarity label, filled in when AMAZON is set
DEFINED_SIZE = 1   # 1 = use the fixed per-category training sizes below
DEFINED_SIZES = {'pos': 600, 'neg': 600}
def SplitData():
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    offset_sample = random.randint(-400, 400)
    print "offset_sample", offset_sample
    if AMAZON:
        offset_sample = random.randint(-600, 600)
        for category in ['pos', 'neg']:
            type_dict[category] = []
        with open('amazon_revs.csv', 'rb') as csvfile:
            rev_read = csv.reader(csvfile)
            for row in rev_read:
                type_dict[row[1]].append(row[0])
                REVIEW_POL[row[0]] = row[1]
    else:
        for category in mr.categories():
            type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict.keys():
        li = type_dict[cat]
        random.shuffle(li)
        size = int(len(li) * SPLIT_AMOUNT) + offset_sample
        if DEFINED_SIZE:
            size = DEFINED_SIZES[cat]
        print "Category: ", cat, "Size:", size
        offset_sample *= -1  # flip the offset so the second category is offset the other way
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]
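# A sketch (hypothetical values) of what SplitData() hands back, assuming the
# movie_reviews corpus with DEFINED_SIZE = 1:
#   [[train_ids, test_ids],          # train_test: ids for both categories combined
#    {'pos': [...], 'neg': [...]},   # type_dict: every doc id per category
#    {'pos': 600, 'neg': 600}]       # docs_count: training size per category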
def tokenize(file_name):
    # With AMAZON set, file_name is the raw review text itself;
    # otherwise it is an nltk movie_reviews file id.
    if AMAZON:
        list_words = re.split(r'\W+', file_name)
    else:
        list_words = re.split(r'\W+', mr.raw(fileids=file_name))
    return [w.lower() for w in list_words if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]
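# For example (assuming AMAZON = 1, so the argument is treated as raw text):
#   tokenize("It was a GREAT movie!!")  ->  ['great', 'movie']
# since "it", "was" and "a" are stop words, single letters are filtered out,
# and non-alphabetic tokens are dropped.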
def CalculateAccuracy(li_results):
    # li_results holds (file_name, predicted_label, actual_label) tuples;
    # a/b/c/d form a 2x2 confusion matrix relative to the first predicted label.
    a = 0  # predicted cat, actually cat (true positives)
    b = 0  # predicted cat, actually other (false positives)
    c = 0  # predicted other, actually cat (false negatives)
    d = 0  # predicted other, actually other (true negatives)
    cat = li_results[0][1]
    for t in li_results:
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a / (a + b)
    accuracy = (a + d) / (a + b + c + d)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category " , cat
    print "precision =", precision
    print "accuracy =", accuracy
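# A worked example with made-up counts: if the first prediction is 'pos' and the
# run produced a=80, b=20, c=10, d=90, then precision = 80/100 = 0.8 and
# accuracy = 170/200 = 0.85.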
# li = Preprocessor.get_testset_trainset(corpus)
li = SplitData()
# exit()
testset = li[0][1]
trainset = li[0][0]
# li = Preprocessor.startup()
cat_num_docs = li[2]
##4) Create a) a dictionary with a category as the key and a dictionary of word occurrences as values
#    b) a dictionary with a category as the key and the number of words in it as the value
# {pos-> {w1: 17 times, w2: 32 times, ...}} {neg-> ...}
cat_word_dict = {}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict = {}
# val = my_dict.get(key, mydefaultval)
complete_training_docs_tokens = []
##5) Loop through the training set to get the entire text from each file
##6) Parse the string to get individual words
for file_name in trainset:
    list_words = tokenize(file_name)
    complete_training_docs_tokens.append(list_words)

    ##7) Check if the category exists in the dictionary; if not, create an empty
    #    dictionary and put the word count as zero, then insert words into the
    #    category's dictionary and update the word count
    if AMAZON:
        cat = REVIEW_POL[file_name]
    else:
        cat = mr.categories(fileids=file_name)[0]
    cat_word_dict[cat] = cat_word_dict.get(cat, {})
    cat_word_count_dict[cat] = cat_word_count_dict.get(cat, 0)

    # add number of words to total word count for cat
    cat_word_count_dict[cat] += len(list_words)
    # start count for number of occurrences for each word
    for w in list_words:
        cat_word_dict[cat][w] = cat_word_dict[cat].get(w, 0)
        cat_word_dict[cat][w] += 1
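# After this loop the two dictionaries look like (hypothetical numbers):
#   cat_word_dict       -> {'pos': {'great': 212, 'movie': 1490, ...}, 'neg': {...}}
#   cat_word_count_dict -> {'pos': 391234, 'neg': 389120}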
##8) Get the vocabulary length
## number of unique words, totalled across categories (note: a word present in
## both categories is counted once per category)
vocab_length = 0
num_docs_word_in = {}  # word -> number of training docs containing it (for IDF)
for dic in cat_word_dict.values():
    vocab_length += len(dic)
    if USE_IDF:
        for uniq_word in dic.keys():
            num_docs_word_in[uniq_word] = sum(1 for sr in complete_training_docs_tokens if uniq_word in sr)
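# The smoothed per-class word probability used in the test loop below is
#   theta_c,w = (N_w,c + 1) / (N_c + vocab_length)
# e.g. (made-up numbers) a word seen 4 times in 'pos' with N_pos = 4000 and
# vocab_length = 11000 gets theta = 5/15000; the loop works with log(theta).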
####Congratulations! The classifier is trained; now it is time to run the Multinomial Naive Bayes classifier on the test dataset
length_train = len(trainset)
li_results = []
#9) As with the training set, loop through the test set to get the entire text from each file
##10) Similar step: parse the string to get individual words
for file_name in testset:
    # print "File: ", file_name
    # minimum_neg_log_prob=1000000000
    # Despite its name, this now tracks the *maximum* log probability seen so far
    # (the original minimization version is kept commented out above).
    minimum_neg_log_prob = -1000000000  # NEW
    min_category = ''
    list_words = tokenize(file_name)
    ##11) Get the probability for each category;
    #     can use any of the created dictionaries to wade through the categories
    for cat in cat_word_count_dict:
        # print cat , cat_num_docs[cat]/len(trainset)
        # neg_log_prob=-log(cat_num_docs[cat]/length_train)
        inv_cat = 'pos'  # the complement class, used when COMPLEMENT is set
        if cat == 'pos':
            inv_cat = 'neg'

        # log prior: log P(c) = log(docs in c / total training docs)
        neg_log_prob = log(cat_num_docs[cat] / length_train)
        # neg_log_prob = cat_num_docs[cat]/length_train

        opp_word_dict = cat_word_dict[inv_cat]
        opp_count_cat = cat_word_count_dict[inv_cat]

        word_dict = cat_word_dict[cat]
        count_cat = cat_word_count_dict[cat]

        # term frequencies of the test document
        my_word_count = {}
        for aw in list_words:
            my_word_count[aw] = my_word_count.get(aw, 0)
            my_word_count[aw] += 1
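        # e.g. if list_words were ['great', 'movie', 'great'] (hypothetical),
        # my_word_count would now be {'great': 2, 'movie': 1}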
        # first pass: accumulate the weight normalizer, the sum of |log(theta_c,w)|
        length_norm = 0
        weight_normalizing_ratio = 0
        for kw in my_word_count.keys():
            count_word_train = word_dict.get(kw, 0)
            ratio = (count_word_train + 1) / (count_cat + vocab_length)

            if COMPLEMENT:
                count_word_train = opp_word_dict.get(kw, 0)
                ratio = (count_word_train + 1) / (opp_count_cat + vocab_length)

            # weight norm
            weight_normalizing_ratio += abs(log(ratio))
            ## TF
            # my_word_count[kw] = log(my_word_count[kw]+1)
            ## length norm
            # length_norm += (my_word_count[kw]**(2))

        # length_norm = length_norm**(0.5)
        # print "WNR: ", weight_normalizing_ratio
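        # The scoring below follows the standard (complement / weight-normalized)
        # multinomial NB recipe: score(c) = log P(c) + sum_w f_w * weight_c,w, where
        # weight_c,w = log(theta_c,w); with COMPLEMENT the opposite class's theta
        # is used and subtracted, and with WEIGHTED each weight is divided by the
        # normalizer accumulated above.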
        # second pass: accumulate the class score word by word
        for w in my_word_count.keys():
            count_word_train = word_dict.get(w, 0)
            ratio = (count_word_train + 1) / (count_cat + vocab_length)  # (N_w,c+1)/(N_c+|V|) = theta_c

            if COMPLEMENT:
                count_word_train = opp_word_dict.get(w, 0)
                ratio = (count_word_train + 1) / (opp_count_cat + vocab_length)

            word_freq = my_word_count[w]

            if USE_IDF:
                word_freq = word_freq * log(length_train / num_docs_word_in.get(w, 1))  # IDF
                # word_freq = word_freq/length_norm # length normalization

            ratio = log(ratio)  # weight factor log(theta_c) = weight_c,w

            if WEIGHTED:
                ratio = ratio / weight_normalizing_ratio  # weight normalization

            if COMPLEMENT:
                neg_log_prob -= word_freq * ratio
            else:
                neg_log_prob += word_freq * ratio  # class probability

        # break
        # print "NLP: ", neg_log_prob
        # print "\n\n", cat, minimum_neg_log_prob , '<' , neg_log_prob
        # if minimum_neg_log_prob>neg_log_prob:
        if minimum_neg_log_prob < neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob
        # print "Min cat: ", min_category

    if AMAZON:
        li_results.append((file_name, min_category, REVIEW_POL[file_name]))
    else:
        li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))
    # break
###--------------------DEBUG STATEMENTS----------------------
# for t in li_results:
#     if t[1] != t[2]:
#         print t
###--------------------DEBUG STATEMENTS----------------------
#12) Evaluating the classifier
CalculateAccuracy(li_results)
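# To try the other variants, flip the flags at the top of the file; for example,
# COMPLEMENT = 1 with WEIGHTED = 1 should correspond to the weight-normalized
# complement ("CWMNB") setup that the header comment refers to.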