Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/ComplementMNB.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
58 lines (48 sloc)
2.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import os | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.feature_extraction.text import TfidfTransformer | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn import cross_validation | |
from sklearn.metrics import classification_report | |
import numpy as np | |
from sklearn.metrics import accuracy_score | |
REVIEWS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'allrevs.csv')


# allrevs.csv contains two columns:
#   first column is the review content (quoted)
#   second column is the assigned sentiment (positive or negative)
def load_file():
    """Read the labelled reviews stored in the CSV file at ``REVIEWS``.

    The first row is treated as a header and skipped. Rows that have fewer
    than two columns, or whose review text or label is empty, are dropped.

    Returns:
        A ``(data, target)`` tuple of two parallel lists: ``data`` holds the
        first column of every kept row and ``target`` the second column.
    """
    data = []
    target = []
    with open(REVIEWS) as csv_file:
        reader = csv.reader(csv_file, delimiter=",", quotechar='"')
        # The built-in next() works on Python 2.6+ and Python 3 (the
        # reader.next() method only exists on Python 2); the default keeps
        # an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines and short lists for
            # malformed rows, so check the length before indexing.
            if len(row) >= 2 and row[0] and row[1]:
                data.append(row[0])
                target.append(row[1])
    return data, target
# preprocess creates the TF-IDF weighted term matrix for the review data set
def preprocess():
    """Vectorise the reviews returned by ``load_file`` and apply TF-IDF weighting.

    The review texts are first turned into binary term-presence counts with
    English stop words removed, then re-weighted by a ``TfidfTransformer``
    fitted on those same counts.

    Returns:
        The matrix produced by the ``TfidfTransformer``; its rows follow the
        order of the ``data`` list returned by ``load_file``.
    """
    data, _ = load_file()
    # ``binary`` must be a real boolean. The string 'true' only worked through
    # truthiness and is rejected by scikit-learn releases that validate
    # parameter types.
    count_vectorizer = CountVectorizer(binary=True, stop_words='english')
    counts = count_vectorizer.fit_transform(data)
    transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True, smooth_idf=True)
    return transformer.fit_transform(counts)
def learn_model(data, target):
    """Train a multinomial naive Bayes classifier and report its accuracy.

    The samples are split 60% / 40% into training and test sets with a fixed
    random seed, the classifier is fitted on the training part, and its
    predictions for the test part are passed to ``evaluate_model``.

    Args:
        data: feature matrix with one row per sample.
        target: labels aligned with the rows of ``data``.
    """
    # sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
    # in 0.20; prefer its replacement and fall back to the legacy module
    # (imported at the top of this file) on older installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        train_test_split = cross_validation.train_test_split

    # preparing data for split validation. 60% training, 40% test
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.4, random_state=43)
    classifier = MultinomialNB().fit(data_train, target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test, predicted)
# | |
def evaluate_model(target_true, target_predicted):
    """Print the accuracy of the predictions as a percentage with two decimals.

    Args:
        target_true: the expected labels.
        target_predicted: the labels produced by the classifier, in the same
            order as ``target_true``.
    """
    accuracy = accuracy_score(target_true, target_predicted)
    # A single parenthesised argument prints identically on Python 2 and 3,
    # whereas the bare ``print "..."`` statement is a syntax error on Python 3.
    print("The accuracy score is {:.2%}".format(accuracy))
# Script body: load the labels, build the TF-IDF features, then train and
# score the classifier.
data, target = load_file()
# preprocess() reads the CSV again on its own; only `target` from the call
# above is passed on to the model.
tf_idf = preprocess()
learn_model(data=tf_idf, target=target)