Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
binary-classification-of-tweets-about-protest/generate_performance_report.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
106 lines (90 sloc)
3.32 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import LogisticRegression | |
from xgboost import XGBClassifier | |
from sklearn.svm import SVC | |
from utils import get_data,get_model_spec,get_user_menu_input, get_user_string_input | |
from termcolor import colored | |
from pandas import DataFrame | |
import pyinputplus as pyip | |
import copy | |
import os | |
import config | |
# Engineered per-tweet feature names whose importances are appended to each
# report row; presumably this order matches the feature matrix columns that
# get_data() returns — TODO confirm against utils.get_data.
cols = [
    'favorite_count',
    'retweet_count',
    'statuses_count',
    'friends_count',
    'hashtag_count',
    'capitals',
    'exclamation_marks',
    'topic',
]
def normalize(importances):
    """Scale the absolute values of *importances* so they sum to 1.

    Signs are discarded: only the magnitude of each coefficient matters
    when ranking feature importances.

    Args:
        importances: iterable of numbers (e.g. a fitted model's
            coefficient vector). Must be re-iterable (list/array).

    Returns:
        List of floats in [0, 1] summing to 1. When every value is zero
        (or the input is empty) a list of zeros is returned instead of
        raising ZeroDivisionError.
    """
    total = sum(abs(i) for i in importances)
    if total == 0:
        # All-zero (or empty) input: dividing would raise; zeros are the
        # only sensible "importance" to report in that case.
        return [0.0 for _ in importances]
    return [float(abs(each)) / total for each in importances]
# Registry of classifiers to compare. Each entry holds:
#   "model":    zero-argument callable that builds a fresh estimator
#   "accuracy": best accuracy observed so far (updated by the main loop)
#   "params":   feature importances of the best-scoring fit (0 until set)
#   "func":     extracts a per-feature importance vector from a fitted model
models = {
    "XG Boost": {
        # Hyperparameters appear hand-tuned; values are taken as given here.
        "model": lambda: XGBClassifier(
            min_child_weight=0.002,
            gamma=0.001,
            subsample=1.0,
            colsample_bytree=0.01,
            max_depth=35,
            eta=0.3),
        "accuracy": 0,
        "params": 0,
        # XGBoost already exposes normalized importances directly.
        "func": lambda x: x.feature_importances_
    },
    "Logistic Regression": {
        # The class itself serves as the zero-argument factory.
        "model": LogisticRegression,
        "accuracy": 0,
        "params": 0,
        "func": lambda x: normalize(x.coef_[0])
    },
    "Support Vector Machine": {
        # Linear kernel so the fitted SVC exposes coef_ for importances.
        "model": lambda: SVC(kernel='linear'),
        "accuracy": 0,
        "params": 0,
        "func": lambda x: normalize(x.coef_[0])
    },
}
# Model names in insertion order, iterated by the report loop below.
models_name = list(models.keys())
if __name__ == "__main__":
    report_name = get_user_string_input("Name your report")
    # First row is the CSV header: experiment metadata, the four metrics,
    # then one feature-importance column per entry in `cols`.
    csv_file_array = [['N-Gram', 'Cluster', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'] + cols]
    print(colored("Generating report. Please wait", "green"))
    for n in config.grams:
        cur_row = [n]
        for k in config.clusters:
            cur_row.append(k)
            X_train, X_test, Y_train, Y_test = get_data(0, n, k)
            for model_name in models_name:
                # Remember the row length so this model's cells can be
                # removed afterwards, regardless of how many features or
                # metrics there are (the old code hard-coded 13 cells).
                base_len = len(cur_row)
                cur_row.append(model_name)
                accuracy, recall, precision, f1, model = get_model_spec(models[model_name]["model"], X_train, X_test, Y_train, Y_test)
                cur_row.extend([accuracy, precision, recall, f1])
                feature_importances = models[model_name]["func"](model)
                cur_row.extend(feature_importances)
                # Track each model's best accuracy (and the importances of
                # that best fit) across every (n, k) dataset combination.
                if accuracy > models[model_name]["accuracy"]:
                    models[model_name]["accuracy"] = accuracy
                    models[model_name]["params"] = feature_importances
                csv_file_array.append(copy.copy(cur_row))
                del cur_row[base_len:]  # Backtrack: drop this model's cells
            cur_row.pop()  # Backtrack: drop the cluster count k
    report_path = "reports/{report_name}.csv".format(report_name=report_name)
    # Summarize the best score seen for each model.
    for model_name, model in models.items():
        print(colored(model_name, "blue"))
        print("Best Accuracy:", colored(model["accuracy"], "green"))
    # header/index are already in csv_file_array, so suppress pandas' own.
    DataFrame(csv_file_array).to_csv(
        report_path,
        header=False,
        index=False)
    print(colored("The report is available at " + report_path, "green"))