Commit
clean up

placed all NB implementations in one file and moved a bunch of stuff around
asp10012 committed May 3, 2016
1 parent 0a26108 commit 14dc205
Showing 10 changed files with 767 additions and 36 deletions.
161 changes: 129 additions & 32 deletions TWCNB_v0_2.py → Naive_bayes.py
@@ -20,8 +20,10 @@ SHUFFLE = 1
# TF = 0 # 1 - log term freq
# IDF = 0 # 1 - idf
# LENGTH = 0 # 1 - doc length adjust
AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
OFFSET = 0 # 0 - use random data size offset, 1 - nope
AMAZON = 0 # 1 - use Amazon set
TWITTER = 0 # 1 - use Twitter set
TWEET_LIMIT = 5000 # the full set is too big to use, so stop after this many positive and negative tweets
OFFSET = 0 # introduced offset (skew) in datasets
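# NOTE: with AMAZON = 0 and TWITTER = 0 the NLTK movie_reviews corpus is
# used. RunWholeThing() below overwrites these flags itself, so the values
# set here only matter when calling the train/test routines by hand.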

REVIEW_POL={}

@@ -42,6 +44,7 @@ cat_word_count_dict={}
num_docs_word_in = {}
vocab_length=0

word_cat_num_doc_dict={}


def SplitData():
@@ -51,16 +54,30 @@ def SplitData():
train_test = [[],[]]
# offset_sample = random.randint(-400,400)
offset_sample = OFFSET
print "offset_sample", offset_sample
# print "offset_sample", offset_sample
categories = ['neg', 'pos']
if AMAZON:
# offset_sample = random.randint(-600,600)
for category in ['pos', 'neg']:
for category in categories:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
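# Assumed amazon_revs.csv layout (inferred from the indices above): each
# row is (review_text, label) with label in {'pos', 'neg'}, apparently
# with no header row since nothing is skipped.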
elif TWITTER:
for category in categories:
type_dict[category]=[]
with open('tweets.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
rev_read.next() # skip header row
number = [0,0]
for row in rev_read:
type_dict[ categories[ int(row[1]) ] ].append(row[3].strip())
REVIEW_POL[row[3].strip()] = categories[int(row[1])]
number[int(row[1])] += 1
if (number[0]>TWEET_LIMIT and number[1]>TWEET_LIMIT):
break
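# Assumed tweets.csv layout (inferred from the indices above): a header
# row, then rows whose column 1 holds the 0/1 polarity label and column 3
# the tweet text, e.g. a hypothetical row: 1,0,Sentiment140,"is so sad...".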
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
@@ -73,7 +90,7 @@ def SplitData():
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
# if DEFINED_SIZE:
# size = DEFINED_SIZES[cat]
print "Category: ", cat, "Size:", size
# print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
@@ -82,7 +99,7 @@

def tokenize(file_name):
list_words = []
if AMAZON:
if AMAZON or TWITTER:
list_words = re.split(r'\W+',file_name)
else:
list_words = re.split(r'\W+',mr.raw(fileids=file_name))
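# A quick sanity check of the tokenizer; note the empty string re.split
# leaves behind when the text ends in punctuation:
#   >>> re.split(r'\W+', "Great movie, great cast!")
#   ['Great', 'movie', 'great', 'cast', '']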
@@ -110,59 +127,99 @@ def CalculateAccuracy(li_results):
precision = a/(a+b)
# recall = a/(a+c)
# print "The following parameters are recorded for the category " , cat
print "precision =", precision
# print "precision =", precision
return precision
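# Reading of the fragment above: with a = true positives and b = false
# positives this is standard precision TP/(TP+FP); the matching recall
# a/(a+c) is computed the same way but left commented out.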

def RunWholeThing():
global AMAZON
global TWITTER
global OFFSET
global DEFINED_SIZE
global DEFINED_SIZES
OFFSET = 0
AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
AMAZON = 0 # 0 - use movie_reviews, 1 - use Amazon set
TWITTER = 0
tested = [' Bern:', ' Mult:', ' TIL :', ' DTIL:', ' CW :', ' TIWC:']
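# Label-to-call mapping, per DoTheThing() below: Bern = TestMachine_Bern(),
# Mult = TestMachine(0,0,0,0,0), TIL = TestMachine(1,1,1,0,0),
# DTIL = TestMachine(1,1,1,2,0), CW = TestMachine(0,0,0,1,1),
# TIWC = TestMachine(1,1,1,1,1). The first three flags are TF, IDF and
# length-norm; reading c and w as complement and weight-norm (as in the
# TWCNB paper) is an inference from the old file name.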
while OFFSET < 400:
ans = DoTheThing()
print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Movies @ +/-', OFFSET
print "_____________________________________________________"
OFFSET = -1*OFFSET
ans2 = DoTheThing()
print "Movies with ", OFFSET
ans = DoTheThing()
OFFSET = -1*OFFSET
print ans , ans2
OFFSET += 100
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1

OFFSET = -1*OFFSET

OFFSET += 300

OFFSET = 0
AMAZON = 1


while OFFSET < 600:
print "Amazon with ", OFFSET
ans = DoTheThing()
print "Pos/Neg =", cat_num_docs['pos'], cat_num_docs['neg'], 'Amazon @ +/-', OFFSET
print "_____________________________________________________"
OFFSET = -1*OFFSET
ans2 = DoTheThing()
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1

OFFSET = -1*OFFSET

OFFSET += 400

OFFSET = 0
AMAZON = 0
TWITTER = 1

while OFFSET < 1000:
print "Twitter with ", OFFSET
ans = DoTheThing()
OFFSET = -1*OFFSET
print ans , ans2
OFFSET += 100
if OFFSET != 0:
ans2 = DoTheThing()
ans3 = [ans , ans2]
ans = [sum(e)/len(e) for e in zip(*ans3)]
a_i = 0
for a in ans:
print tested[a_i], a
a_i += 1
OFFSET = -1*OFFSET
OFFSET += 800



def DoTheThing():
i = 0
reps = 5

b_nb = 0
m_nb = 0
ti_nb = 0
til_nb = 0
dtil_nb = 0
cw_nb = 0
tilcw_nb= 0

while i < reps:
TrainMachine()
m_nb += TestMachine(0,0,0,0,0)/5
ti_nb += TestMachine(1,1,0,0,0)/5
til_nb += TestMachine(1,1,1,0,0)/5
cw_nb += TestMachine(0,0,0,1,1)/5
tilcw_nb += TestMachine(1,1,1,1,1)/5
b_nb += TestMachine_Bern()/reps
m_nb += TestMachine(0,0,0,0,0)/reps
til_nb += TestMachine(1,1,1,0,0)/reps
dtil_nb += TestMachine(1,1,1,2,0)/reps
cw_nb += TestMachine(0,0,0,1,1)/reps
tilcw_nb += TestMachine(1,1,1,1,1)/reps
i+=1
return (m_nb, ti_nb, til_nb, cw_nb, tilcw_nb)
# print " Bern: %0.6f\n Mult: %0.6f\n TIL : %0.6f\n DTIL: %0.6f\n CW : %0.6f\n TIWC: %0.6f" % (b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb)
return [b_nb, m_nb, til_nb, dtil_nb, cw_nb, tilcw_nb]
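# Each DoTheThing() call reshuffles, retrains and tests reps = 5 times and
# returns the averaged scores in the order of `tested` above, e.g. with
# made-up numbers:
#   >>> DoTheThing()
#   [0.81, 0.79, 0.84, 0.83, 0.82, 0.85]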


# li = Preprocessor.get_testset_trainset(corpus)
@@ -173,6 +230,7 @@ def TrainMachine():
global cat_word_dict
global cat_word_count_dict
global num_docs_word_in
global word_cat_num_doc_dict
global li
global testset
global trainset
@@ -192,7 +250,7 @@ def TrainMachine():
#and put word count as zero
#and then insert words into the category's dictionary in both cases and update the word count
cat = ''
if AMAZON:
if AMAZON or TWITTER:
cat = REVIEW_POL[file_name]
else:
cat = mr.categories(fileids = file_name)[0]
@@ -211,8 +269,18 @@ def TrainMachine():
num_docs_word_in[w] = num_docs_word_in.get(w, 0)
num_docs_word_in[w] += 1

word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
word_cat_num_doc_dict[w][cat]+=1

for dic in cat_word_dict.values():
vocab_length+=len(dic)
vocab_length+=len(dic)
for w in word_cat_num_doc_dict:
for cat in cat_num_docs:
nct = word_cat_num_doc_dict[w].get(cat,0)
# Laplace smoothing: (#docs in cat containing w + 1) / (#docs in cat + 2)
ratio = (nct+1)/(cat_num_docs[cat]+2)
word_cat_num_doc_dict[w][cat]=ratio
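# Worked example with made-up counts: if a word occurs in 40 of 98
# positive training docs, its smoothed estimate is (40+1)/(98+2) = 0.41;
# a word absent from a category still gets 1/(n_cat+2) > 0, so the log()
# calls in TestMachine_Bern below never see zero.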


# ##8) Get the vocabulary length
@@ -297,7 +365,7 @@ def TestMachine(t, i, l, c, w):
my_word_count[kw] = log(1 + my_word_count[kw])

if IDF:
my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,1), 2) #IDF
my_word_count[kw] = my_word_count[kw]*log(length_train/num_docs_word_in.get(kw,0.01), 2) #IDF
## length norm
w_freq = my_word_count[kw]
length_norm += pow(w_freq, 2)
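# Worked example with made-up counts: a word seen 3 times in this doc and
# present in 10 of 1000 training docs gets TF = log(1+3) ~= 1.39, then
# TF-IDF = 1.39 * log2(1000/10) ~= 9.2, and contributes 9.2**2 to the
# squared length norm accumulated here.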
@@ -342,7 +410,7 @@ def TestMachine(t, i, l, c, w):
minimum_neg_log_prob=neg_log_prob
# print "Min cat: ", min_category

if AMAZON:
if AMAZON or TWITTER:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
@@ -354,5 +422,34 @@ def TestMachine(t, i, l, c, w):
precision = CalculateAccuracy(li_results)
return precision

def TestMachine_Bern():
li_results=[]
#5) As in the training pass, loop through the test set to get the individual words
for file_name in testset:
minimum_neg_log_prob=1000000000
min_category=''
set_list_words = set(tokenize(file_name))

##6) Get the probability for each category,
#using the cat_num_docs dictionary to wade through the categories
for cat in cat_num_docs:
neg_log_prob=-log(cat_num_docs[cat]/len(trainset))
for w in word_cat_num_doc_dict:
if w in set_list_words:
neg_log_prob-=log(word_cat_num_doc_dict[w][cat])
else:
neg_log_prob-=log(1-word_cat_num_doc_dict[w][cat])
if minimum_neg_log_prob>neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
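# Unlike the multinomial variants above, this Bernoulli score walks the
# whole vocabulary and charges -log(1 - P(w|cat)) for every training word
# absent from the document, which makes it noticeably slower per document.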

if AMAZON or TWITTER:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
precision = CalculateAccuracy(li_results)
return precision


RunWholeThing()
File renamed without changes.
File renamed without changes.
File renamed without changes.
