
Commit

job13011 committed Mar 29, 2016
2 parents 3b2a970 + eb7a6d0 commit 77869d0
Showing 6 changed files with 67,089 additions and 33 deletions.
108 changes: 108 additions & 0 deletions BernoulliNB.py
@@ -0,0 +1,108 @@
from __future__ import division
from math import log
import re
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))

SPLIT_AMOUNT = 0.6  # fraction of the data used for training

def SplitData():
    type_dict = {}
    docs_count = {}
    train_test = [[], []]
    for category in mr.categories():
        type_dict[category] = mr.fileids(categories=category)
    for cat in type_dict.keys():
        li = type_dict[cat]
        size = int(len(li) * SPLIT_AMOUNT)
        docs_count[cat] = size
        train_test[0].extend(li[:size])
        train_test[1].extend(li[size:])
    return [train_test, type_dict, docs_count]

def tokenize(file_name):
    list_words = re.split(r'\W+', mr.raw(fileids=file_name))
    return [w.lower() for w in list_words if w.isalpha() and len(w) > 1 and w.lower() not in STOP_WORDS]

def CalculateAccuracy(li_results):
    # Confusion counts relative to the predicted category of the first result:
    # a = true positives, b = false positives, c = false negatives, d = true negatives
    a = 0
    b = 0
    c = 0
    d = 0
    cat = li_results[0][1]
    for t in li_results:
        if cat == t[1]:
            if cat == t[2]:
                a += 1
            else:
                b += 1
        else:
            if cat == t[2]:
                c += 1
            else:
                d += 1
    precision = a / (a + b)
    # recall = a/(a+c)
    # print "The following parameters are recorded for the category ", cat
    print "precision =", precision

# li = Preprocessor.get_testset_trainset(corpus)
li = SplitData()
testset = li[0][1]
trainset = li[0][0]
# li = Preprocessor.startup()
cat_num_docs = li[2]

# 3) Create a nested dictionary with a word as the key and a dictionary as the value;
#    the inner dict maps each category to the number of training documents in that
#    category containing the word, i.e. word -> {pos: n, neg: m}
word_cat_num_doc_dict = {}

# 4) Loop through the movie-review training set to get the full text of each file;
#    tokenize() parses the raw string into individual words.
for file_name in trainset:
    list_words = tokenize(file_name)
    cat = mr.categories(fileids=file_name)[0]

    for w in set(list_words):
        word_cat_num_doc_dict[w] = word_cat_num_doc_dict.get(w, {})
        word_cat_num_doc_dict[w][cat] = word_cat_num_doc_dict[w].get(cat, 0)
        word_cat_num_doc_dict[w][cat] += 1

for w in word_cat_num_doc_dict:
    for cat in cat_num_docs:
        nct = word_cat_num_doc_dict[w].get(cat, 0)
        # Laplace smoothing: turn the document count into (count+1)/(#cat_reviews+2)
        ratio = (nct + 1) / (cat_num_docs[cat] + 2)
        word_cat_num_doc_dict[w][cat] = ratio
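
# Worked example of the smoothing above: a word that appears in 300 of 600
# positive training reviews gets P(w|pos) = (300+1)/(600+2), about 0.5, while a
# word never seen in the class still gets a nonzero 1/602 -- so the log() calls
# in the test loop below never see 0 or 1.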

print "The Classifier is trained and it took"


li_results = []
# 5) As with the training set, loop through the test set to get the individual words.
for file_name in testset:
    minimum_neg_log_prob = 1000000000
    min_category = ''
    set_list_words = set(tokenize(file_name))

    # 6) Compute the negative log probability for each category,
    #    using the cat_num_docs dictionary to iterate over the categories.
    for cat in cat_num_docs:
        neg_log_prob = -log(cat_num_docs[cat] / len(trainset))
        for w in word_cat_num_doc_dict:
            if w in set_list_words:
                neg_log_prob -= log(word_cat_num_doc_dict[w][cat])
            else:
                neg_log_prob -= log(1 - word_cat_num_doc_dict[w][cat])
        if minimum_neg_log_prob > neg_log_prob:
            min_category = cat
            minimum_neg_log_prob = neg_log_prob

    li_results.append((file_name, min_category, mr.categories(fileids=file_name)[0]))

CalculateAccuracy(li_results)
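
Note: the test loop above is the standard Bernoulli Naive Bayes decision rule with Laplace-smoothed estimates. A minimal self-contained sketch of the same scoring function (the helper name and arguments are illustrative, not part of this commit):

from math import log

def bernoulli_nb_score(doc_words, prior, word_probs):
    # doc_words: set of words in the test document
    # prior: P(c), the fraction of training documents in category c
    # word_probs: {word: smoothed P(word|c)} over the training vocabulary
    neg_log_prob = -log(prior)
    for w, p in word_probs.items():
        # present words contribute log P(w|c); absent words contribute log(1 - P(w|c))
        neg_log_prob -= log(p) if w in doc_words else log(1 - p)
    return neg_log_prob  # the category with the smallest score wins

For example, bernoulli_nb_score({'great'}, 0.5, {'great': 0.7, 'awful': 0.1}) is -log(0.5) - log(0.7) - log(0.9), roughly 1.16.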



58 changes: 58 additions & 0 deletions ComplementMNB.py
@@ -0,0 +1,58 @@
import csv
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import accuracy_score

REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'allrevs.csv')
# allrevs.csv contains two columns:
# the first column is the review content (quoted),
# the second column is the assigned sentiment (positive or negative)
def load_file():
    with open(REVIEWS) as csv_file:
        reader = csv.reader(csv_file, delimiter=",", quotechar='"')
        reader.next()  # skip the first (header) row; Python 2 iterator call
        data = []
        target = []
        for row in reader:
            # skip rows with missing data
            if row[0] and row[1]:
                data.append(row[0])
                target.append(row[1])

    return data, target

# preprocess creates the tf-idf weighted term matrix for the review data set
def preprocess():
    data, target = load_file()
    count_vectorizer = CountVectorizer(binary=True, stop_words='english')
    data = count_vectorizer.fit_transform(data)
    # tfidf_data = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True).fit_transform(data)
    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
    transformer.fit(data)
    tfidf_data = transformer.transform(data)

    return tfidf_data

def learn_model(data, target):
    # prepare data for split validation: 60% training, 40% test
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(data, target, test_size=0.4, random_state=43)
    classifier = MultinomialNB().fit(data_train, target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test, predicted)

def evaluate_model(target_true, target_predicted):
    # print classification_report(target_true, target_predicted)
    print "The accuracy score is {:.2%}".format(accuracy_score(target_true, target_predicted))


data, target = load_file()  # target is reused below; preprocess() reloads the data itself
tf_idf = preprocess()
learn_model(tf_idf, target)
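
Note: despite the file name, the model trained here is scikit-learn's standard MultinomialNB; no complement weighting appears in this excerpt. For reference, a minimal sketch of the same vectorize / tf-idf / Naive Bayes flow as a single Pipeline (assumes a recent scikit-learn, where train_test_split lives in sklearn.model_selection; the function name is illustrative):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_and_score(data, target):
    # data: list of raw review strings; target: parallel list of sentiment labels
    model = Pipeline([
        ('counts', CountVectorizer(binary=True, stop_words='english')),
        ('tfidf', TfidfTransformer(norm='l2', sublinear_tf=True)),
        ('nb', MultinomialNB()),
    ])
    data_train, data_test, y_train, y_test = train_test_split(
        data, target, test_size=0.4, random_state=43)
    model.fit(data_train, y_train)
    return accuracy_score(y_test, model.predict(data_test))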


74 changes: 45 additions & 29 deletions GlossBayes.py
@@ -12,7 +12,6 @@ class Solver:
    def demo(self):
        def word_feats(words):
            return dict([(word, True) for word in words])
        def expand_sets(positive, negative, neutral):
            newPositive = set(positive)
            newNegative = set(negative)
@@ -22,21 +21,36 @@ class Solver:
                for syn in wn.synsets(word, pos=wn.ADJ):
                    for lemma in syn.lemmas():
                        curr = lemma.name().split('.')[0]
                        if(curr not in newNegative and curr not in newNeutral):
                            newPositive.add(curr)
                        elif(curr in newNegative):
                            newNegative.discard(curr)
                            newNeutral.add(curr)
                        # Deal with antonyms (lemma.antonyms() yields Lemma objects, so take the name string)
                        for ant in lemma.antonyms():
                            ant = ant.name().split('.')[0]
                            if(ant not in newPositive and ant not in newNeutral):
                                newNegative.add(ant)
                            elif(ant in newPositive):
                                newPositive.discard(ant)
                                newNeutral.add(ant)
            # Add Syns to Negative
            for word in negative:
                for syn in wn.synsets(word, pos=wn.ADJ):
                    for lemma in syn.lemmas():
                        curr = lemma.name().split('.')[0]
                        print curr
                        if(curr not in newPositive and curr not in newNeutral):
                            newNegative.add(curr)
                        elif(curr in newPositive):
                            newPositive.discard(curr)
                            newNeutral.add(curr)
                        # Deal with antonyms
                        for ant in lemma.antonyms():
                            ant = ant.name().split('.')[0]
                            if(ant not in newNegative and ant not in newNeutral):
                                newPositive.add(ant)
                            elif(ant in newNegative):
                                newNegative.discard(ant)
                                newNeutral.add(ant)
            return (newPositive, newNegative, newNeutral)

        # Set up initial Sets S_p and S_n
@@ -45,36 +59,38 @@ class Solver:
        neutral = Set([''])

        # Expand on Sets to get S_p' and S_n'
        for num in range(1,2):
            newsets = expand_sets(positive, negative, neutral)
            positive = set(newsets[0])
            negative = set(newsets[1])
            neutral = set(newsets[2])

        print positive
        print negative

        # # Learn Classifier
        # trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
        #
        # #negfeats = [({'insulting': True},'neg'),({'bad':True},'neg')]
        #
        # #trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        # classifier = NaiveBayesClassifier.train(trainfeats)
        #
        #
        # # Testing
        # negids = movie_reviews.fileids('neg')
        # posids = movie_reviews.fileids('pos')
        #
        # negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        # posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        # negcutoff = len(negfeats)*3/4
        # poscutoff = len(posfeats)*3/4
        #
        # testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
        #
        # print 'Dictionary of %d positive words and %d negative words, tested on %d instances' % (len(positive),len(negative), len(testfeats))
        #
        # print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
        # classifier.show_most_informative_features()


#text = nltk.word_tokenize("And now for a production unlike any other a very fuzzy and cute dog")
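
Note: expand_sets implements one round of WordNet seed expansion: synonyms of positive seeds become positive candidates, their antonyms negative, and symmetrically for the negative seeds. A minimal standalone sketch of that step (the 'good' seed is an assumption; the commit's actual seed sets are hidden behind the fold):

from nltk.corpus import wordnet as wn

def expand_once(seeds):
    # Gather synonym and antonym lemma names for each adjective sense of each seed word.
    syns, ants = set(), set()
    for word in seeds:
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                syns.add(lemma.name())
                for ant in lemma.antonyms():
                    ants.add(ant.name())  # .name() turns the Lemma object into a string
    return syns, ants

pos_words, neg_words = expand_once(['good'])  # antonyms of positive seeds feed the negative set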
10 changes: 6 additions & 4 deletions GlossCount.py
@@ -27,7 +27,7 @@ class GlossCount:
                for syn in wn.synsets(word, pos=wn.ADJ):
                    for lemma in syn.lemmas():
                        curr = lemma.name().split('.')[0]
                        if(curr not in newNegative and curr not in newNeutral):
                            newPositive.add(curr)
                        elif(curr in newNegative):
                            newNegative.discard(curr)
@@ -37,7 +37,7 @@ class GlossCount:
                for syn in wn.synsets(word, pos=wn.ADJ):
                    for lemma in syn.lemmas():
                        curr = lemma.name().split('.')[0]
                        if(curr not in newPositive and curr not in newNeutral):
                            newNegative.add(curr)
                        elif(curr in newPositive):
                            newPositive.discard(curr)
@@ -50,7 +50,7 @@ class GlossCount:
        neutral = Set([''])

        # Expand on Sets to get S_p' and S_n'
        for num in range(1,3):
            newsets = expand_sets(positive, negative, neutral)
            positive = set(newsets[0])
            negative = set(newsets[1])
@@ -63,12 +63,14 @@ class GlossCount:
        classifier = NaiveBayesClassifier.train(trainfeats)
        #print classifier.classify(dict([(word, True) for word in words]))
        #print classifier.classify(dict([("bad",True),("bad",True)]))


        # Iterate through all of the reviews and find sentiment
        count = 0.00
        correct = 0.00
        for reviews in movie_reviews.fileids():  # For every review
            score = 0
            tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[reviews])))  # Tokenize all words with POS
            for token in tokens:
                if (token[1] == "JJ" or token[1] == "JJR" or token[1] == "JJS"):  # If adjective, check value
                    sent_value = classifier.classify(dict([(token[0], True)]))
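
Note: the hunk above ends mid-loop: each adjective token is classified on its own, and its result presumably votes into the review-level score. A minimal sketch of that tallying idea (the +1/-1 voting and the zero threshold are assumptions, since the rest of the loop is hidden behind the fold):

def review_sentiment(tagged_tokens, classify):
    # tagged_tokens: (word, POS-tag) pairs; classify maps a feature dict to 'pos' or 'neg'
    score = 0
    for word, tag in tagged_tokens:
        if tag in ("JJ", "JJR", "JJS"):  # adjectives only, as in the loop above
            score += 1 if classify({word: True}) == 'pos' else -1
    return 'pos' if score >= 0 else 'neg'

Called as review_sentiment(tokens, classifier.classify) with the NLTK classifier trained above.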
