Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
binary-classification-of-tweets-about-protest/generate_performance_report.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
106 lines (90 sloc)
3.32 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import LogisticRegression | |
from xgboost import XGBClassifier | |
from sklearn.svm import SVC | |
from utils import get_data,get_model_spec,get_user_menu_input, get_user_string_input | |
from termcolor import colored | |
from pandas import DataFrame | |
import pyinputplus as pyip | |
import copy | |
import os | |
import config | |
# Engineered per-tweet feature names whose importances are appended to each
# report row; presumably this order matches the feature matrix columns that
# get_data() returns — TODO confirm against utils.get_data.
cols = [
    'favorite_count',
    'retweet_count',
    'statuses_count',
    'friends_count',
    'hashtag_count',
    'capitals',
    'exclamation_marks',
    'topic',
]
def normalize(importances):
    """Scale the absolute values of *importances* so they sum to 1.

    Signs are discarded: only the magnitude of each coefficient matters
    when ranking feature importances.

    Args:
        importances: iterable of numbers (e.g. a fitted model's
            coefficient vector). Must be re-iterable (list/array).

    Returns:
        List of floats in [0, 1] summing to 1. When every value is zero
        (or the input is empty) a list of zeros is returned instead of
        raising ZeroDivisionError.
    """
    total = sum(abs(i) for i in importances)
    if total == 0:
        # All-zero (or empty) input: dividing would raise; zeros are the
        # only sensible "importance" to report in that case.
        return [0.0 for _ in importances]
    return [float(abs(each)) / total for each in importances]
# Registry of classifiers to compare. Each entry holds:
#   "model":    zero-argument callable that builds a fresh estimator
#   "accuracy": best accuracy observed so far (updated by the main loop)
#   "params":   feature importances of the best-scoring fit (0 until set)
#   "func":     extracts a per-feature importance vector from a fitted model
models = {
    "XG Boost": {
        # Hyperparameters appear hand-tuned; values are taken as given here.
        "model": lambda: XGBClassifier(
            min_child_weight=0.002,
            gamma=0.001,
            subsample=1.0,
            colsample_bytree=0.01,
            max_depth=35,
            eta=0.3),
        "accuracy": 0,
        "params": 0,
        # XGBoost already exposes normalized importances directly.
        "func": lambda x: x.feature_importances_
    },
    "Logistic Regression": {
        # The class itself serves as the zero-argument factory.
        "model": LogisticRegression,
        "accuracy": 0,
        "params": 0,
        "func": lambda x: normalize(x.coef_[0])
    },
    "Support Vector Machine": {
        # Linear kernel so the fitted SVC exposes coef_ for importances.
        "model": lambda: SVC(kernel='linear'),
        "accuracy": 0,
        "params": 0,
        "func": lambda x: normalize(x.coef_[0])
    },
}
# Model names in insertion order, iterated by the report loop below.
models_name = list(models.keys())
if __name__ == "__main__":
    report_name = get_user_string_input("Name your report")
    # First row is the CSV header: experiment metadata, the four metrics,
    # then one feature-importance column per entry in `cols`.
    csv_file_array = [['N-Gram', 'Cluster', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'] + cols]
    print(colored("Generating report. Please wait", "green"))
    for n in config.grams:
        cur_row = [n]
        for k in config.clusters:
            cur_row.append(k)
            X_train, X_test, Y_train, Y_test = get_data(0, n, k)
            for model_name in models_name:
                # Remember the row length so this model's cells can be
                # removed afterwards, regardless of how many features or
                # metrics there are (the old code hard-coded 13 cells).
                base_len = len(cur_row)
                cur_row.append(model_name)
                accuracy, recall, precision, f1, model = get_model_spec(models[model_name]["model"], X_train, X_test, Y_train, Y_test)
                cur_row.extend([accuracy, precision, recall, f1])
                feature_importances = models[model_name]["func"](model)
                cur_row.extend(feature_importances)
                # Track each model's best accuracy (and the importances of
                # that best fit) across every (n, k) dataset combination.
                if accuracy > models[model_name]["accuracy"]:
                    models[model_name]["accuracy"] = accuracy
                    models[model_name]["params"] = feature_importances
                csv_file_array.append(copy.copy(cur_row))
                del cur_row[base_len:]  # Backtrack: drop this model's cells
            cur_row.pop()  # Backtrack: drop the cluster count k
    report_path = "reports/{report_name}.csv".format(report_name=report_name)
    # Summarize the best score seen for each model.
    for model_name, model in models.items():
        print(colored(model_name, "blue"))
        print("Best Accuracy:", colored(model["accuracy"], "green"))
    # header/index are already in csv_file_array, so suppress pandas' own.
    DataFrame(csv_file_array).to_csv(
        report_path,
        header=False,
        index=False)
    print(colored("The report is available at " + report_path, "green"))