From 109b2959a519b153f6ab000b4705f81e9665fa7d Mon Sep 17 00:00:00 2001
From: Emily A Maciejewski
Date: Sun, 26 Apr 2020 12:30:41 -0400
Subject: [PATCH] Random Forest Code Base

---
 pca.py           |  39 +++++++
 preprocessing.py |  61 +++++++++++
 random_forest.py | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 373 insertions(+)
 create mode 100644 pca.py
 create mode 100644 preprocessing.py
 create mode 100644 random_forest.py

diff --git a/pca.py b/pca.py
new file mode 100644
index 0000000..d19bfd2
--- /dev/null
+++ b/pca.py
@@ -0,0 +1,39 @@
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+import pandas as pd
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import StandardScaler
+
+import preprocessing
+
+sns.set_style("darkgrid", {"axes.facecolor": ".95"})
+
+breastcancer_data = preprocessing.get_data('breast-cancer-wisconsin.data')
+data = preprocessing.process_missing_values(breastcancer_data, remove=False)
+data = pd.DataFrame(data)
+data.columns = breastcancer_data.columns
+
+X = data[data.columns[:-1]].values
+y = data[data.columns[-1]].values
+
+X = StandardScaler().fit_transform(X)
+
+pca = PCA(n_components=2)
+X_pca = pca.fit_transform(X)
+
+X_pca_viz = pd.DataFrame(X_pca)
+X_pca_viz.columns = ["comp1", "comp2"]
+X_pca_viz['labels'] = y
+sns.lmplot(x="comp1", y="comp2", hue="labels", data=X_pca_viz, fit_reg=False)
+plt.show()
+
+
+"""tsne = TSNE()
+X_tsne = tsne.fit_transform(X_pca)
+
+plt.rcParams['figure.figsize'] = (10.0, 10.0)
+proj = pd.DataFrame(X_tsne)
+proj["labels"] = y
+sns.lmplot("comp_1", "comp_2", hue="labels", data=proj.sample(5000), fit_reg=False)"""
\ No newline at end of file
diff --git a/preprocessing.py b/preprocessing.py
new file mode 100644
index 0000000..e3536da
--- /dev/null
+++ b/preprocessing.py
@@ -0,0 +1,61 @@
+import pandas as pd
+import numpy as np
+
+
+def get_data(data_file):
+    data = pd.read_csv(data_file)
+    data = data.drop(columns=['ID'])
+
+    return data
+
+
+# imputation
+def get_average_column_value(data, col):
+    total = 0
+    count = 0
+
+    for i in range(data.shape[0]):
+        if data[i][col] != '?':
+            total += int(data[i][col])
+            count += 1
+
+    return int(round(total/count, 0))
+
+
+def process_missing_values(data, remove=True):
+    vals = data.values
+    new_vals = np.zeros(vals.shape)
+
+    diff = 0
+
+    for i in range(vals.shape[0]):
+        if remove and '?' in vals[i]:
+            new_vals = np.delete(new_vals, i-diff, 0)
+            diff += 1
+            continue
+        for j in range(len(vals[i])):
+            if vals[i][j] == '?':
+                new_vals[i-diff][j] = get_average_column_value(vals, j)
+            elif isinstance(vals[i][j], str):
+                new_vals[i-diff][j] = int(vals[i][j])
+            else:
+                new_vals[i-diff][j] = vals[i][j]
+
+    return new_vals
+
+
+def preprocess():
+    breastcancer_data = get_data('breast-cancer-wisconsin.data')
+
+    removed_data = process_missing_values(breastcancer_data, remove=True)
+    removed = pd.DataFrame(removed_data)
+    removed.columns = breastcancer_data.columns
+
+    average_data = process_missing_values(breastcancer_data, remove=False)
+    average = pd.DataFrame(average_data)
+    average.columns = breastcancer_data.columns
+
+    return [breastcancer_data, removed, average]
+
+
+
diff --git a/random_forest.py b/random_forest.py
new file mode 100644
index 0000000..f312cb1
--- /dev/null
+++ b/random_forest.py
@@ -0,0 +1,273 @@
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import precision_recall_fscore_support, roc_curve
+from scipy import stats
+import preprocessing
+import pickle
+import os
+import matplotlib.pyplot as plt
+
+
+def get_train_test(data):
+    labels = np.array(data.pop('class'))
+    labels = np.where(labels == 2, 0, labels)
+    labels = np.where(labels == 4, 1, labels)
+    train, test, train_labels, test_labels = train_test_split(data, labels,
+                                                              test_size=0.3, random_state=42)
+    return [train, test, train_labels.astype(int), test_labels.astype(int)]
+
+
+def train_random_forest(train, train_labels, model_name,
+                        number_trees=100, split_criteria='gini', weights=None):
+    model = RandomForestClassifier(n_estimators=number_trees, criterion=split_criteria,
+                                   class_weight=weights)
+    model.fit(train, train_labels)
+    pickle.dump(model, open(model_name, 'wb'))
+
+
+def evaluate(model, train, test, train_labels, test_labels):
+    loaded_model = pickle.load(open(model, 'rb'))
+
+    train_predictions = loaded_model.predict(train)
+    test_predictions = loaded_model.predict(test)
+
+    train_accuracy = get_accuracy(train_labels, train_predictions)
+    test_accuracy = get_accuracy(test_labels, test_predictions)
+
+    return [train_accuracy, test_accuracy]
+
+
+def evaluate_other_metrics(model, train, test, train_labels, test_labels):
+    loaded_model = pickle.load(open(model, 'rb'))
+
+    train_predictions = loaded_model.predict(train)
+    test_predictions = loaded_model.predict(test)
+
+    train_accuracy = precision_recall_fscore_support(train_labels, train_predictions, average='binary')
+    test_accuracy = precision_recall_fscore_support(test_labels, test_predictions, average='binary')
+
+    return [train_accuracy, test_accuracy]
+
+
+def get_accuracy(ground_truth, predictions):
+    correct = 0
+    for i in range(len(predictions)):
+        if predictions[i] == ground_truth[i]:
+            correct += 1
+
+    return correct/len(predictions)
+
+
+def investigate_incorrect_predictions(model, x_test, ground_truth, model_name):
+    loaded_model = pickle.load(open(model, 'rb'))
+    predictions = loaded_model.predict(x_test)
+
+    false = []
+    correct = []
+    for i in range(len(predictions)):
+        if predictions[i] != ground_truth[i]:
+            false.append(x_test.values[i])
+        else:
+            correct.append(x_test.values[i])
+
+    f = np.array(false)
+    c = np.array(correct)
+
+    f_mean = np.round(np.mean(f, axis=0), 2)
+    f_mode = stats.mode(f, axis=0)[0][0]
+
+    c_mean = np.round(np.mean(c, axis=0), 2)
+    c_mode = stats.mode(c, axis=0)[0][0]
+
+    """bars = list(x_test.columns)
+    y_pos = np.arange(len(bars))
+    plt.bar(y_pos, f_mean, 0.35, color='b', align='center', label='incorrect')
+    plt.bar(y_pos + 0.35, c_mean, 0.35, color='g', align='center', label='correct')
+    plt.xticks(y_pos, bars, rotation='vertical')
+    plt.ylabel('Mean')
+    plt.title('Mean Investigation: ' + model_name)
+    plt.legend()
+    plt.tight_layout()
+    plt.show()
+
+    bars = list(x_test.columns)
+    y_pos = np.arange(len(bars))
+    plt.bar(y_pos, f_mode, 0.35, color='b', align='center', label='incorrect')
+    plt.bar(y_pos + 0.35, c_mode, 0.35, color='g', align='center', label='correct')
+    plt.xticks(y_pos, bars, rotation='vertical')
+    plt.ylabel('Mode')
+    plt.title('Mode Investigation: ' + model_name)
+    plt.legend()
+    plt.tight_layout()
+    plt.show()"""
+
+    return [f_mean, f_mode, c_mean, c_mode]
+
+
+def roc(model, test, test_labels, title):
+    loaded_model = pickle.load(open(model, 'rb'))
+    y_score = loaded_model.predict_proba(test)[:, 1]
+    false_positive_rate, true_positive_rate, threshold = roc_curve(test_labels, y_score)
+
+    plt.title('ROC Curve: ' + title)
+    plt.plot(false_positive_rate, true_positive_rate)
+    plt.ylabel('True Positive Rate')
+    plt.xlabel('False Positive Rate')
+    plt.ylim(0.9, 1.01)
+    plt.show()
+
+
+def train_all_combos(removed_data, average_data):
+    r_train, r_test, r_train_labels, r_test_labels = get_train_test(removed_data)
+    a_train, a_test, a_train_labels, a_test_labels = get_train_test(average_data)
+
+    # num_trees = [10, 20, 40, 80, 160, 320]
+    num_trees = [640]
+
+    for tree in num_trees:
+        print(tree)
+        train_random_forest(r_train, r_train_labels, 'removed_gini_normal_' + str(tree) +
+                            '.model', number_trees=tree)
+        train_random_forest(r_train, r_train_labels, 'removed_gini_balanced_' + str(tree) +
+                            '.model', number_trees=tree, weights='balanced')
+        train_random_forest(r_train, r_train_labels, 'removed_entropy_normal_' + str(tree) +
+                            '.model', number_trees=tree, split_criteria='entropy')
+        train_random_forest(r_train, r_train_labels, 'removed_entropy_balanced_' + str(tree) +
+                            '.model', number_trees=tree, split_criteria='entropy', weights='balanced')
+
+        train_random_forest(a_train, a_train_labels, 'average_gini_normal_' + str(tree) +
+                            '.model', number_trees=tree)
+        train_random_forest(a_train, a_train_labels, 'average_gini_balanced_' + str(tree) +
+                            '.model', number_trees=tree, weights='balanced')
+        train_random_forest(a_train, a_train_labels, 'average_entropy_normal_' + str(tree) +
+                            '.model', number_trees=tree, split_criteria='entropy')
+        train_random_forest(a_train, a_train_labels, 'average_entropy_balanced_' + str(tree) +
+                            '.model', number_trees=tree, split_criteria='entropy', weights='balanced')
+
+
+def all_combos_roc(removed_data, average_data):
+    # models = ['models/' + model for model in os.listdir('models')]
+    models = ['models_big/' + model for model in os.listdir('models_big')]
+
+    r_train, r_test, r_train_labels, r_test_labels = get_train_test(removed_data)
+    a_train, a_test, a_train_labels, a_test_labels = get_train_test(average_data)
+
+    """for model in models:
+        print(model)
+        if 'average' in model:
+            roc(model, a_test, a_test_labels, model)
+        else:
+            roc(model, r_test, r_test_labels, model)"""
+    roc('models/average_gini_normal_80.model', a_test, a_test_labels, 'Average - Gini - Normal - 80')
+
+
+def investigate(removed_data, average_data):
+    models = ['models/' + model for model in os.listdir('models')]
+
+    r_train, r_test, r_train_labels, r_test_labels = get_train_test(removed_data)
+    a_train, a_test, a_train_labels, a_test_labels = get_train_test(average_data)
+
+    f_mean = []
+    f_mode = []
+    c_mean = []
+    c_mode = []
+
+    for model in models:
+        print(model)
+        if 'average' in model:
+            a, b, c, d = investigate_incorrect_predictions(model, a_test, a_test_labels, model)
+            f_mean.append(a)
+            f_mode.append(b)
+            c_mean.append(c)
+            c_mode.append(d)
+        else:
+            a, b, c, d = investigate_incorrect_predictions(model, r_test, r_test_labels, model)
+            f_mean.append(a)
+            f_mode.append(b)
+            c_mean.append(c)
+            c_mode.append(d)
+
+    models = ['models_big/' + model for model in os.listdir('models_big')]
+
+    for model in models:
+        print(model)
+        if 'average' in model:
+            a, b, c, d = investigate_incorrect_predictions(model, a_test, a_test_labels, model)
+            f_mean.append(a)
+            f_mode.append(b)
+            c_mean.append(c)
+            c_mode.append(d)
+        else:
+            a, b, c, d = investigate_incorrect_predictions(model, r_test, r_test_labels, model)
+            """f_mean.append(a)
+            f_mode.append(b)
+            c_mean.append(c)
+            c_mode.append(d)"""
+
+    f_mean = np.array(f_mean)
+    f_mode = np.array(f_mode)
+    c_mean = np.array(c_mean)
+    c_mode = np.array(c_mode)
+
+    f_mean = np.round(np.mean(f_mean, axis=0), 2)
+    f_mode = stats.mode(f_mode, axis=0)[0][0]
+
+    c_mean = np.round(np.mean(c_mean, axis=0), 2)
+    c_mode = stats.mode(c_mode, axis=0)[0][0]
+
+    bars = list(removed_data.columns)
+    y_pos = np.arange(len(bars))
+    plt.bar(y_pos-0.15, f_mean, 0.3, color='b', align='center', label='incorrect')
+    plt.bar(y_pos+0.15, c_mean, 0.3, color='g', align='center', label='correct')
+    plt.xticks(y_pos, bars, rotation='vertical')
+    plt.ylabel('Mean')
+    plt.title('Mean Investigation: Correct vs. Incorrect Predictions - Average Data')
+    plt.legend()
+    plt.tight_layout()
+    plt.show()
+
+    bars = list(removed_data.columns)
+    y_pos = np.arange(len(bars))
+    plt.bar(y_pos, f_mode, 0.35, color='b', align='center', label='incorrect')
+    plt.bar(y_pos + 0.35, c_mode, 0.35, color='g', align='center', label='correct')
+    plt.xticks(y_pos, bars, rotation='vertical')
+    plt.ylabel('Mode')
+    plt.title('Mode Investigation: Correct vs. Incorrect Predictions - Average Data')
+    plt.legend()
+    plt.tight_layout()
+    plt.show()
+
+
+def all_combos_accuracy(removed_data, average_data):
+    models = ['models/' + model for model in os.listdir('models')]
+    # models = ['models_big/' + model for model in os.listdir('models_big')]
+
+    r_train, r_test, r_train_labels, r_test_labels = get_train_test(removed_data)
+    a_train, a_test, a_train_labels, a_test_labels = get_train_test(average_data)
+
+    for model in models:
+        print(model)
+        if 'average' in model:
+            # train_accuracy, test_accuracy = evaluate(model, a_train, a_test, a_train_labels, a_test_labels)
+            train_accuracy, test_accuracy = evaluate_other_metrics(model, a_train, a_test, a_train_labels,
+                                                                   a_test_labels)
+        else:
+            # train_accuracy, test_accuracy = evaluate(model, r_train, r_test, r_train_labels, r_test_labels)
+            train_accuracy, test_accuracy = evaluate_other_metrics(model, r_train, r_test, r_train_labels,
+                                                                   r_test_labels)
+
+        print(test_accuracy)
+
+
+if __name__ == '__main__':
+    original, removed, average = preprocessing.preprocess()
+    # train_all_combos(removed, average)
+    # all_combos_accuracy(removed, average)
+    # all_combos_roc(removed, average)
+    investigate(removed, average)
+
+
+
+
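Note (not part of the patch): a minimal usage sketch of how the three new modules fit together, assuming the UCI breast-cancer-wisconsin.data file sits in the working directory with a header row that includes an ID column and a class column labelled 2/4, as get_data() and get_train_test() expect. The model filename below is illustrative only; the patch itself writes names such as removed_gini_normal_640.model from train_all_combos().

    # Hedged sketch: preprocess, train one forest on the mean-imputed data, and evaluate it.
    # Assumes breast-cancer-wisconsin.data is present; 'example_average_gini_100.model' is a
    # hypothetical output path, not a file the patch creates.
    import preprocessing
    from random_forest import get_train_test, train_random_forest, evaluate

    original, removed, average = preprocessing.preprocess()
    train, test, train_labels, test_labels = get_train_test(average)

    train_random_forest(train, train_labels, 'example_average_gini_100.model',
                        number_trees=100, split_criteria='gini')
    train_acc, test_acc = evaluate('example_average_gini_100.model',
                                   train, test, train_labels, test_labels)
    print(train_acc, test_acc)

train_random_forest() pickles the fitted classifier to disk and evaluate() reloads it, so the same path string is passed to both. Also note that get_train_test() pops the class column from the DataFrame in place, which is why each frame returned by preprocess() should only be split once.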