Skip to content
Permalink
cf6576fb12
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
58 lines (48 sloc) 2.1 KB
import csv
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import accuracy_score
# Absolute path to allrevs.csv, resolved relative to the directory containing this script.
REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)),'allrevs.csv')
# allrevs.csv contains two columns
# first column is the review content (quoted)
# second column is the assigned sentiment (positive or negative)
def load_file(path=None):
    """Load review texts and their sentiment labels from a CSV file.

    The first row is treated as a header and skipped. Rows with fewer than
    two columns, or whose first or second column is empty, are skipped too.

    Args:
        path: CSV file to read. Defaults to the module-level ``REVIEWS`` path.

    Returns:
        A ``(data, target)`` tuple of equal-length lists: ``data`` holds the
        first column of every kept row and ``target`` the second column.
    """
    if path is None:
        path = REVIEWS
    data = []
    target = []
    with open(path) as csv_file:
        reader = csv.reader(csv_file, delimiter=",", quotechar='"')
        # next() with a default works on both Python 2 and 3 (reader.next()
        # is Python 2 only) and tolerates a completely empty file.
        next(reader, None)
        for row in reader:
            # Skip missing data. The length check keeps blank lines and
            # single-column rows from raising IndexError.
            if len(row) >= 2 and row[0] and row[1]:
                data.append(row[0])
                target.append(row[1])
    return data, target
# preprocess builds the TF-IDF weighted document-term matrix for the review data set
def preprocess():
    """Build the TF-IDF feature matrix for the review data set.

    Loads the reviews with ``load_file()``, converts them to binary
    term-presence counts (English stop words removed) and re-weights those
    counts with an L2-normalised, smoothed, sublinear TF-IDF transform.

    Returns:
        The matrix returned by ``TfidfTransformer``, fitted on and applied to
        the loaded reviews.
    """
    # Only the review texts are needed here; the labels are ignored.
    reviews, _ = load_file()
    # `binary` must be a real boolean: the string 'true' only worked by
    # truthiness and is rejected by scikit-learn's parameter validation.
    count_vectorizer = CountVectorizer(binary=True, stop_words='english')
    counts = count_vectorizer.fit_transform(reviews)
    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
    # Fit and transform on the same data in a single step.
    return transformer.fit_transform(counts)
def learn_model(data, target):
    """Train a multinomial naive Bayes classifier and report its accuracy.

    Args:
        data: Feature matrix, one row per sample.
        target: Labels aligned with the rows of ``data``.

    The samples are split 60% / 40% into training and test portions using a
    fixed random seed; the classifier is fitted on the training portion and
    its predictions on the test portion are passed to ``evaluate_model``.
    """
    # Hold out 40% of the samples for testing; the fixed seed fixes the split.
    split = cross_validation.train_test_split(
        data, target, test_size=0.4, random_state=43)
    features_fit, features_eval, labels_fit, labels_eval = split

    model = MultinomialNB()
    model.fit(features_fit, labels_fit)

    guesses = model.predict(features_eval)
    evaluate_model(labels_eval, guesses)
#
def evaluate_model(target_true, target_predicted):
    """Print the accuracy of the predictions as a percentage.

    Args:
        target_true: The reference labels.
        target_predicted: The labels predicted by the classifier, in the same
            order as ``target_true``.
    """
    accuracy = accuracy_score(target_true, target_predicted)
    # For a per-class breakdown, print
    # classification_report(target_true, target_predicted) as well.
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("The accuracy score is {:.2%}".format(accuracy))
# Script driver: load the labelled reviews, build the TF-IDF features, then
# train the classifier and print its accuracy.
# NOTE: preprocess() calls load_file() itself, so the CSV is read twice; only
# `target` from this first read is used below.
data,target = load_file()
tf_idf = preprocess()
learn_model(tf_idf,target)