# NOTE: The text that originally stood here ("Skip to content", the branch
# switcher dialog, "Go to file", "Cannot retrieve contributors at this time")
# was GitHub web-page chrome captured when this file was copied from the
# browser. It is not part of the program and is kept only as this comment so
# that the module parses as Python.
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from utils import get_data,get_model_spec,get_user_menu_input, get_user_string_input
from termcolor import colored
from pandas import DataFrame
import pyinputplus as pyip
import copy
import os
import config
# Feature column names. They are appended to the report header, so their
# order determines the order of the trailing CSV columns.
cols = [
    # Tweet engagement counters
    "favorite_count",
    "retweet_count",
    # Account-level counters
    "statuses_count",
    "friends_count",
    # Text-derived features
    "hashtag_count",
    "capitals",
    "exclamation_marks",
    "topic",
]
def normalize(importances):
    """Convert raw coefficients into non-negative weights that sum to 1.

    Each value is replaced by its absolute value divided by the sum of all
    absolute values, so the sign is discarded and only relative magnitude
    is kept.

    Args:
        importances: Iterable of numbers (e.g. one row of model coefficients).

    Returns:
        A list of floats, one per input value. An empty input yields an
        empty list; an input whose values are all zero yields all zeros
        instead of dividing by zero.
    """
    magnitudes = [float(abs(value)) for value in importances]
    total = sum(magnitudes)
    if total == 0:
        # No magnitude to distribute: every share is zero. This avoids the
        # ZeroDivisionError / NaN the unguarded division would produce.
        return [0.0 for _ in magnitudes]
    return [magnitude / total for magnitude in magnitudes]
# Registry of the classifiers compared in the report, keyed by display name.
# Each entry holds:
#   "model"    - zero-argument callable that builds a fresh estimator
#   "accuracy" - best accuracy recorded so far (starts at 0)
#   "params"   - feature importances recorded with that best accuracy
#   "func"     - extracts per-feature importances from a fitted estimator
models = {
    "XG Boost": {
        "model": lambda: XGBClassifier(
            min_child_weight=0.002,
            gamma=0.001,
            subsample=1.0,
            colsample_bytree=0.01,
            max_depth=35,
            eta=0.3,
        ),
        "accuracy": 0,
        "params": 0,
        "func": lambda fitted: fitted.feature_importances_,
    },
    "Logistic Regression": {
        # The class itself serves as the zero-argument factory.
        "model": LogisticRegression,
        "accuracy": 0,
        "params": 0,
        "func": lambda fitted: normalize(fitted.coef_[0]),
    },
    "Support Vector Machine": {
        "model": lambda: SVC(kernel="linear"),
        "accuracy": 0,
        "params": 0,
        "func": lambda fitted: normalize(fitted.coef_[0]),
    },
}

# Display names in registry (insertion) order; iterating a dict yields its keys.
models_name = list(models)
if __name__ == "__main__":
    # Script entry point: for every (n-gram, cluster) pair listed in config,
    # evaluate every registered model, collect one CSV row per evaluation,
    # print each model's best accuracy, and write the rows to
    # reports/<name>.csv.
    report_name = get_user_string_input("Name your report")
    report_path = "reports/{report_name}.csv".format(report_name=report_name)
    # Create the output directory before the (potentially long) evaluation
    # loop so a missing folder cannot make the final write fail afterwards.
    os.makedirs(os.path.dirname(report_path), exist_ok=True)

    # First row is the header: fixed metric columns followed by feature names.
    csv_file_array = [
        ['N-Gram', 'Cluster', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'] + cols
    ]
    print(colored("Generating report. Please wait", "green"))

    for n in config.grams:
        for k in config.clusters:
            # NOTE(review): the meaning of the leading 0 argument is defined
            # in utils.get_data and is not visible here - confirm.
            X_train, X_test, Y_train, Y_test = get_data(0, n, k)
            for model_name in models_name:
                entry = models[model_name]
                # NOTE(review): unpack order (accuracy, recall, precision, f1)
                # is assumed to match utils.get_model_spec - confirm.
                accuracy, recall, precision, f1, model = get_model_spec(
                    entry["model"], X_train, X_test, Y_train, Y_test)
                feature_importances = entry["func"](model)

                # Remember the best accuracy seen per model and the
                # importances that came with it.
                if accuracy > entry["accuracy"]:
                    entry["accuracy"] = accuracy
                    entry["params"] = feature_importances

                # Build each row from scratch, in header order, rather than
                # mutating a shared list and trimming a hard-coded number of
                # trailing cells; this stays correct if the number of
                # features changes.
                csv_file_array.append(
                    [n, k, model_name, accuracy, precision, recall, f1]
                    + list(feature_importances)
                )

    for model_name, model in models.items():
        print(colored(model_name, "blue"))
        print("Best Accuracy:", colored(model["accuracy"], "green"))

    # The header is already the first data row, so suppress pandas' own
    # header and index columns.
    DataFrame(csv_file_array).to_csv(
        report_path,
        header=False,
        index=False)
    print(colored("The report is available at " + report_path, "green"))