
Commit

job13011 committed Apr 16, 2016
2 parents cf6576f + b4c94c3 commit d834a06
Showing 9 changed files with 4,262 additions and 67,055 deletions.
54 changes: 45 additions & 9 deletions BernoulliNB.py → BNB.py
@@ -1,30 +1,60 @@
from __future__ import division
from math import log
import random
import csv
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))

SPLIT_AMOUNT = 0.6 # training amount from data

AMAZON = 1
REVIEW_POL={}
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 948, 'neg': 948}

def SplitData():
type_dict={}
docs_count={}
train_test = [[],[]]
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
offset_sample = random.randint(-400,400)
print "offset_sample", offset_sample
if AMAZON:
offset_sample = random.randint(-600,600)
for category in ['pos', 'neg']:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
for cat in type_dict.keys():
li = type_dict[cat]
size=int(len(li)*SPLIT_AMOUNT)
random.shuffle(li)
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
if DEFINED_SIZE:
size = DEFINED_SIZES[cat]
print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
train_test[1].extend(li[size:])
return [train_test,type_dict, docs_count]

def tokenize(file_name):
list_words = re.split(r'\W+',mr.raw(fileids=file_name))
list_words = ()
if AMAZON:
list_words = re.split(r'\W+',file_name)
else:
list_words = re.split(r'\W+',mr.raw(fileids=file_name))

return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]


def CalculateAccuracy(li_results):
a=0
b=0
@@ -63,8 +93,11 @@ word_cat_num_doc_dict={}
## Parse the string to get individual words - done by get_list_tokens_nltk()
for file_name in trainset:
list_words = tokenize(file_name)
cat = mr.categories(fileids = file_name)[0]

cat = ''
if AMAZON:
cat = REVIEW_POL[file_name]
else:
cat = mr.categories(fileids = file_name)[0]
for w in set(list_words):
word_cat_num_doc_dict[w]=word_cat_num_doc_dict.get(w,{})
word_cat_num_doc_dict[w][cat]=word_cat_num_doc_dict[w].get(cat,0)
@@ -100,8 +133,11 @@ for file_name in testset:
min_category=cat
minimum_neg_log_prob=neg_log_prob

li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))

if AMAZON:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break
CalculateAccuracy(li_results)


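For orientation: the training loop in BNB.py builds word_cat_num_doc_dict, a per-word count of how many training documents in each category contain that word, alongside docs_count with the per-category document totals. The classification hunk is collapsed in this view, so the following is only a generic sketch of how a Bernoulli Naive Bayes score can be computed from those structures; the function name, its signature, and the add-one smoothing with a +2 denominator are illustrative assumptions, not code from this commit.

from __future__ import division
from math import log

def bernoulli_score(doc_words, cat, docs_count, word_cat_num_doc_dict):
    # Hypothetical scorer: log P(cat) plus, for every vocabulary word, the
    # smoothed log probability of it being present in (or absent from) a
    # document of this category. Higher is better.
    total_docs = sum(docs_count.values())
    score = log(docs_count[cat] / total_docs)
    for w, cat_counts in word_cat_num_doc_dict.items():
        p_w = (cat_counts.get(cat, 0) + 1) / (docs_count[cat] + 2)
        if w in doc_words:
            score += log(p_w)
        else:
            score += log(1 - p_w)
    return score

Under that reading, a test document would be assigned the category with the highest score; the file's own loop appears to express the same comparison in negated form, tracking minimum_neg_log_prob and keeping the smallest value.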
58 changes: 0 additions & 58 deletions ComplementMNB.py

This file was deleted.

77 changes: 58 additions & 19 deletions MultinomialNB.py → MNB.py
@@ -1,29 +1,56 @@
from __future__ import division
from math import log
import re
import random
import csv
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))
SPLIT_AMOUNT = 0.6 # training amount from data
# need to change calculations for stuff
# https://www.dataquest.io/blog/naive-bayes-movies/

AMAZON = 1
REVIEW_POL={}
DEFINED_SIZE = 1
DEFINED_SIZES = {'pos': 948, 'neg': 948}

def SplitData():
type_dict={}
docs_count={}
train_test = [[],[]]
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
offset_sample = random.randint(-400,400)
print "offset_sample", offset_sample
if AMAZON:
offset_sample = random.randint(-600,600)
for category in ['pos', 'neg']:
type_dict[category]=[]
with open('amazon_revs.csv', 'rb') as csvfile:
rev_read = csv.reader(csvfile)
for row in rev_read:
type_dict[row[1]].append(row[0])
REVIEW_POL[row[0]] = row[1]
else:
for category in mr.categories():
type_dict[category]=mr.fileids(categories=category)
for cat in type_dict.keys():
li = type_dict[cat]
size=int(len(li)*SPLIT_AMOUNT)
random.shuffle(li)
size=int(len(li)*SPLIT_AMOUNT) + offset_sample
if DEFINED_SIZE:
size = DEFINED_SIZES[cat]
print "Category: ", cat, "Size:", size
offset_sample *= -1
docs_count[cat]=size
train_test[0].extend(li[:size])
train_test[1].extend(li[size:])
return [train_test,type_dict, docs_count]

def tokenize(file_name):
list_words = re.split(r'\W+',mr.raw(fileids=file_name))
list_words = ()
if AMAZON:
list_words = re.split(r'\W+',file_name)
else:
list_words = re.split(r'\W+',mr.raw(fileids=file_name))

return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in STOP_WORDS]

def CalculateAccuracy(li_results):
@@ -59,7 +86,8 @@ cat_num_docs = li[2]

##4)Create a) a dictionary with a category as the key and dictionary of words-occurences as values
#b) a dictionary with a category as the key and the number of words in it as the value
# {pos-> {w1 = 17 times}, {w2 = 32 times}...} {neg-> ....}
# {pos-> {w1 = 17 times}, {w2 = 32 times}...}
# {neg-> ....}
cat_word_dict={}
# {pos-> 4000 words} {neg-> 7000 words}
cat_word_count_dict={}
@@ -74,7 +102,11 @@ for file_name in trainset:
##7) Check if category exists in dictionary, if not, create an empty dictionary,
#and put word count as zero
#and then insert words into the category's dictionary in both cases and update the word count
cat = mr.categories(fileids = file_name)[0]
cat = ''
if AMAZON:
cat = REVIEW_POL[file_name]
else:
cat = mr.categories(fileids = file_name)[0]
cat_word_dict[cat] = cat_word_dict.get(cat,{})
cat_word_count_dict[cat] = cat_word_count_dict.get(cat,0)

@@ -103,8 +135,9 @@ li_results=[]
#9) Like in the training set,Loop through the test set, to get the entire text from each file
##10) Similar step, parse the string to get individual words
for file_name in testset:
print "File: ", file_name
minimum_neg_log_prob=1000000000
# print "File: ", file_name
# minimum_neg_log_prob=1000000000
minimum_neg_log_prob=-1000000000
# minimum_neg_log_prob = 0 # NEW
min_category=''
list_words = tokenize(file_name)
@@ -118,25 +151,31 @@ for file_name in testset:
# print "________________________________________________________________"
# print "________________________________________________________________"
# print "\n\n" , cat, cat, cat, cat, cat, cat, cat, cat, cat, cat, "\n\n"
neg_log_prob=-log(cat_num_docs[cat]/length_train)
# neg_log_prob=-log(cat_num_docs[cat]/length_train) # P(class)
neg_log_prob= log(cat_num_docs[cat]/length_train) # P(class)
# neg_log_prob = cat_num_docs[cat]/length_train
word_dict = cat_word_dict[cat]
count_cat = cat_word_count_dict[cat]
word_dict = cat_word_dict[cat] # word counts for each word in class
count_cat = cat_word_count_dict[cat] # total words in class
for w in list_words:
count_word_train=word_dict.get(w,0)
ratio = (count_word_train+1)/(count_cat+vocab_length)
neg_log_prob-=log(ratio)
# neg_log_prob-=log(ratio)
neg_log_prob+=log(ratio)

# neg_log_prob *= ratio
# print w, "Ratio found:",ratio, "new_neg_log:", neg_log_prob
# break
# print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
if minimum_neg_log_prob>neg_log_prob:
# if minimum_neg_log_prob<neg_log_prob:
# print "\n\n", minimum_neg_log_prob , '<' , neg_log_prob
# print "\n\n", cat, ' :: ', neg_log_prob
# if minimum_neg_log_prob>neg_log_prob:
if minimum_neg_log_prob<neg_log_prob:
min_category=cat
minimum_neg_log_prob=neg_log_prob
# print "Min cat: ", min_category
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
if AMAZON:
li_results.append((file_name,min_category,REVIEW_POL[file_name]))
else:
li_results.append((file_name,min_category,mr.categories(fileids = file_name)[0]))
# break

###--------------------DEBUG STATEMENTS----------------------
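The scoring hunk above is the substantive change in MNB.py: instead of starting from the negated log of the class prior, subtracting log(ratio) per word, and keeping the category with the smallest total, the loop now adds the logs and keeps the largest. The two conventions are equivalent ways of comparing log posteriors. As a standalone restatement, the quantity the test loop maximises after this commit looks roughly like the sketch below (the function name and argument layout are illustrative, not part of the commit):

from __future__ import division
from math import log

def mnb_log_posterior(list_words, cat, cat_num_docs, length_train,
                      cat_word_dict, cat_word_count_dict, vocab_length):
    # log P(cat) plus log P(w | cat) for each token, with Laplace smoothing.
    score = log(cat_num_docs[cat] / length_train)      # class prior
    word_dict = cat_word_dict[cat]                     # word counts for this class
    count_cat = cat_word_count_dict[cat]               # total words in this class
    for w in list_words:
        count_word_train = word_dict.get(w, 0)
        ratio = (count_word_train + 1) / (count_cat + vocab_length)
        score += log(ratio)
    return score

Taking the argmax of this score over the two categories reproduces the min_category selection in the loop; note that the variable is still named neg_log_prob even though, after this change, it holds the (negative-valued) log probability itself rather than its negation.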
44 changes: 44 additions & 0 deletions SentiWordLex.py
@@ -0,0 +1,44 @@
from __future__ import division
import sys
import time

import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

start_time = time.time()
count = 0.00
correct = 0.00
ids = sorted(movie_reviews.fileids())

for reviews in ids: #For every review
score = 0.0
positive = 0.0
negative = 0.0
tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[reviews]))) #Tokenize all words with POS
for token in tokens:
if (token[1]== "JJ" or token[1] == "JJR" or token[1] == "JJS"): # If adjective, check value
if len(wn.synsets(token[0], pos=wn.ADJ)) != 0 and swn.senti_synset(wn.synsets(token[0], pos=wn.ADJ)[0].name()) :
word = wn.synsets(token[0], pos=wn.ADJ)[0].name()
print word
print swn.senti_synset(word)
positive = positive + swn.senti_synset(word).pos_score()
negative = negative + swn.senti_synset(word).neg_score()
print "%s, %d, %d" %(word,positive,negative)
score = positive - negative
if (score < 0):
print "Negative at %f" % (score)
sentiment = 'neg'
else:
sentiment = 'pos'
print "Positive at %d" % (score)
if (sentiment == movie_reviews.categories(fileids=[reviews])[0]):
print "Correct"
correct = correct + 1.00
count = count + 1.00

print correct/count
print "Seconds: %d" %(time.time() - start_time)
print "correct:", correct/len(ids)
print "positive:", positive/len(ids)
