
Commit

Random Forest Code Base
eam15110 committed Apr 26, 2020
1 parent a565fa2 commit 109b295
Showing 3 changed files with 373 additions and 0 deletions.
39 changes: 39 additions & 0 deletions pca.py
@@ -0,0 +1,39 @@
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

import preprocessing

sns.set_style("darkgrid", {"axes.facecolor": ".95"})

breastcancer_data = preprocessing.get_data('breast-cancer-wisconsin.data')
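# remove=False replaces '?' entries with the rounded column mean instead of dropping rows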
data = preprocessing.process_missing_values(breastcancer_data, remove=False)
data = pd.DataFrame(data)
data.columns = breastcancer_data.columns

X = data[data.columns[:-1]].values
y = data[data.columns[-1]].values

# standardise the features to zero mean and unit variance so PCA is not dominated by feature scale
X = StandardScaler().fit_transform(X)

# project the standardised features onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

X_pca_viz = pd.DataFrame(X_pca)
X_pca_viz.columns = ["comp1", "comp2"]
X_pca_viz['labels'] = y
sns.lmplot(x="comp1", y="comp2", hue="labels", data=X_pca_viz, fit_reg=False)
plt.show()


"""tsne = TSNE()
X_tsne = tsne.fit_transform(X_pca)
plt.rcParams['figure.figsize'] = (10.0, 10.0)
proj = pd.DataFrame(X_tsne)
proj["labels"] = y
sns.lmplot("comp_1", "comp_2", hue="labels", data=proj.sample(5000), fit_reg=False)"""
61 changes: 61 additions & 0 deletions preprocessing.py
@@ -0,0 +1,61 @@
import pandas as pd
import numpy as np


def get_data(data_file):
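    """Read the data file and drop the sample ID column, which carries no predictive signal."""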
data = pd.read_csv(data_file)
data = data.drop(columns=['ID'])

return data


# imputation
def get_average_column_value(data, col):
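    """Mean of the non-missing ('?') entries in column col, rounded to the nearest integer."""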
total = 0
count = 0

for i in range(data.shape[0]):
if data[i][col] != '?':
total += int(data[i][col])
count += 1

return int(round(total/count, 0))


def process_missing_values(data, remove=True):
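    """Convert the frame to a numeric array, either dropping rows that contain '?'
    (remove=True) or replacing each '?' with the rounded column mean (remove=False)."""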
vals = data.values
new_vals = np.zeros(vals.shape)

diff = 0

for i in range(vals.shape[0]):
for j in range(len(vals[i])):
if vals[i][j] == '?':
if remove:
                    new_vals = np.delete(new_vals, i-diff, 0)
                    diff += 1
                    # skip the rest of this row; without the break the remaining
                    # columns would be written into the previously kept row
                    break
else:
new_vals[i-diff][j] = get_average_column_value(vals, j)
elif isinstance(vals[i][j], str):
new_vals[i-diff][j] = int(vals[i][j])
else:
new_vals[i-diff][j] = vals[i][j]

return new_vals


def preprocess():
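    """Load the data and return [original frame, rows-with-missing-removed, mean-imputed]."""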
breastcancer_data = get_data('breast-cancer-wisconsin.data')

removed_data = process_missing_values(breastcancer_data, remove=True)
removed = pd.DataFrame(removed_data)
removed.columns = breastcancer_data.columns

average_data = process_missing_values(breastcancer_data, remove=False)
average = pd.DataFrame(average_data)
average.columns = breastcancer_data.columns

return [breastcancer_data, removed, average]



273 changes: 273 additions & 0 deletions random_forest.py
@@ -0,0 +1,273 @@
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, roc_curve
from scipy import stats
import preprocessing
import pickle
import os
import matplotlib.pyplot as plt


def get_train_test(data):
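    """Split features/labels 70/30; class labels 2 (benign) and 4 (malignant) are remapped to 0 and 1."""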
labels = np.array(data.pop('class'))
labels = np.where(labels == 2, 0, labels)
labels = np.where(labels == 4, 1, labels)
train, test, train_labels, test_labels = train_test_split(data, labels,
test_size=0.3, random_state=42)
return [train, test, train_labels.astype(int), test_labels.astype(int)]


def train_random_forest(train, train_labels, model_name,
number_trees=100, split_criteria='gini', weights=None):
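    """Fit a RandomForestClassifier with the given settings and pickle it to model_name."""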
model = RandomForestClassifier(n_estimators=number_trees, criterion=split_criteria,
class_weight=weights)
model.fit(train, train_labels)
    with open(model_name, 'wb') as model_file:
        pickle.dump(model, model_file)


def evaluate(model, train, test, train_labels, test_labels):
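    """Load a pickled model and return [train accuracy, test accuracy]."""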
loaded_model = pickle.load(open(model, 'rb'))

train_predictions = loaded_model.predict(train)
test_predictions = loaded_model.predict(test)

train_accuracy = get_accuracy(train_labels, train_predictions)
test_accuracy = get_accuracy(test_labels, test_predictions)

return [train_accuracy, test_accuracy]


def evaluate_other_metrics(model, train, test, train_labels, test_labels):
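    """Load a pickled model and return (precision, recall, f-score, support) for the train and test splits."""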
loaded_model = pickle.load(open(model, 'rb'))

train_predictions = loaded_model.predict(train)
test_predictions = loaded_model.predict(test)

    train_metrics = precision_recall_fscore_support(train_labels, train_predictions, average='binary')
    test_metrics = precision_recall_fscore_support(test_labels, test_predictions, average='binary')

    return [train_metrics, test_metrics]


def get_accuracy(ground_truth, predictions):
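    """Fraction of predictions that match the ground-truth labels."""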
correct = 0
for i in range(len(predictions)):
if predictions[i] == ground_truth[i]:
correct += 1

return correct/len(predictions)


def investigate_incorrect_predictions(model, x_test, ground_truth, model_name):
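    """Return the per-feature mean and mode of misclassified and correctly classified
    test samples, as [false mean, false mode, correct mean, correct mode]."""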
loaded_model = pickle.load(open(model, 'rb'))
predictions = loaded_model.predict(x_test)

false = []
correct = []
for i in range(len(predictions)):
if predictions[i] != ground_truth[i]:
false.append(x_test.values[i])
else:
correct.append(x_test.values[i])

f = np.array(false)
c = np.array(correct)

f_mean = np.round(np.mean(f, axis=0), 2)
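    # stats.mode(...)[0][0] pulls out the per-feature mode row; this indexing assumes
    # the older SciPy behaviour where the reduced axis is kept in the result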
f_mode = stats.mode(f, axis=0)[0][0]

c_mean = np.round(np.mean(c, axis=0), 2)
c_mode = stats.mode(c, axis=0)[0][0]

"""bars = list(x_test.columns)
y_pos = np.arange(len(bars))
plt.bar(y_pos, f_mean, 0.35, color='b', align='center', label='incorrect')
plt.bar(y_pos + 0.35, c_mean, 0.35, color='g', align='center', label='correct')
plt.xticks(y_pos, bars, rotation='vertical')
plt.ylabel('Mean')
plt.title('Mean Investigation: ' + model_name)
plt.legend()
plt.tight_layout()
plt.show()
bars = list(x_test.columns)
y_pos = np.arange(len(bars))
plt.bar(y_pos, f_mode, 0.35, color='b', align='center', label='incorrect')
plt.bar(y_pos + 0.35, c_mode, 0.35, color='g', align='center', label='correct')
plt.xticks(y_pos, bars, rotation='vertical')
plt.ylabel('Mode')
plt.title('Mode Investigation: ' + model_name)
plt.legend()
plt.tight_layout()
plt.show()"""

return [f_mean, f_mode, c_mean, c_mode]


def roc(model, test, test_labels, title):
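    """Plot the ROC curve for a pickled model on the test split (y-axis zoomed to the 0.9-1.01 range)."""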
loaded_model = pickle.load(open(model, 'rb'))
y_score = loaded_model.predict_proba(test)[:, 1]
false_positive_rate, true_positive_rate, threshold = roc_curve(test_labels, y_score)

plt.title('ROC Curve: ' + title)
plt.plot(false_positive_rate, true_positive_rate)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.ylim(0.9, 1.01)
plt.show()


def train_all_combos(removed_data, average_data):
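    """Train forests for every combination of imputation strategy (removed vs. mean-imputed),
    split criterion (gini vs. entropy) and class weighting (None vs. 'balanced'),
    for each forest size in num_trees."""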
r_train, r_test, r_train_labels, r_test_labels = get_train_test(removed_data)
a_train, a_test, a_train_labels, a_test_labels = get_train_test(average_data)

# num_trees = [10, 20, 40, 80, 160, 320]
num_trees = [640]

for tree in num_trees:
print(tree)
train_random_forest(r_train, r_train_labels, 'removed_gini_normal_' + str(tree)
+ '.model', number_trees=tree)
train_random_forest(r_train, r_train_labels, 'removed_gini_balanced_' + str(tree)
+ '.model', number_trees=tree, weights='balanced')
train_random_forest(r_train, r_train_labels, 'removed_entropy_normal_' + str(tree)
+ '.model', number_trees=tree, split_criteria='entropy')
train_random_forest(r_train, r_train_labels, 'removed_entropy_balanced_' + str(tree)
+ '.model', number_trees=tree, split_criteria='entropy', weights='balanced')

train_random_forest(a_train, a_train_labels, 'average_gini_normal_' + str(tree)
+ '.model', number_trees=tree)
train_random_forest(a_train, a_train_labels, 'average_gini_balanced_' + str(tree)
+ '.model', number_trees=tree, weights='balanced')
train_random_forest(a_train, a_train_labels, 'average_entropy_normal_' + str(tree)
+ '.model', number_trees=tree, split_criteria='entropy')
train_random_forest(a_train, a_train_labels, 'average_entropy_balanced_' + str(tree)
+ '.model', number_trees=tree, split_criteria='entropy', weights='balanced')


def all_combos_roc(removed_data, average_data):
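    """Plot ROC curves for saved models (the loop is commented out; only the hard-coded
    average/gini/normal/80-tree model is plotted)."""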
# models = ['models/' + model for model in os.listdir('models')]
models = ['models_big/' + model for model in os.listdir('models_big')]

r_train, r_test, r_train_labels, r_test_labels = get_train_test(removed_data)
a_train, a_test, a_train_labels, a_test_labels = get_train_test(average_data)

"""for model in models:
print(model)
if 'average' in model:
roc(model, a_test, a_test_labels, model)
else:
roc(model, r_test, r_test_labels, model)"""
roc('models/average_gini_normal_80.model', a_test, a_test_labels, 'Average - Gini - Normal - 80')


def investigate(removed_data, average_data):
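    """Aggregate the per-feature mean and mode of incorrect vs. correct predictions across
    the saved models and plot them as grouped bar charts."""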
models = ['models/' + model for model in os.listdir('models')]

r_train, r_test, r_train_labels, r_test_labels = get_train_test(removed_data)
a_train, a_test, a_train_labels, a_test_labels = get_train_test(average_data)

f_mean = []
f_mode = []
c_mean = []
c_mode = []

for model in models:
print(model)
if 'average' in model:
a, b, c, d = investigate_incorrect_predictions(model, a_test, a_test_labels, model)
f_mean.append(a)
f_mode.append(b)
c_mean.append(c)
c_mode.append(d)
else:
a, b, c, d = investigate_incorrect_predictions(model, r_test, r_test_labels, model)
f_mean.append(a)
f_mode.append(b)
c_mean.append(c)
c_mode.append(d)

models = ['models_big/' + model for model in os.listdir('models_big')]

for model in models:
print(model)
if 'average' in model:
a, b, c, d = investigate_incorrect_predictions(model, a_test, a_test_labels, model)
f_mean.append(a)
f_mode.append(b)
c_mean.append(c)
c_mode.append(d)
else:
a, b, c, d = investigate_incorrect_predictions(model, r_test, r_test_labels, model)
"""f_mean.append(a)
f_mode.append(b)
c_mean.append(c)
c_mode.append(d)"""

f_mean = np.array(f_mean)
f_mode = np.array(f_mode)
c_mean = np.array(c_mean)
c_mode = np.array(c_mode)

f_mean = np.round(np.mean(f_mean, axis=0), 2)
f_mode = stats.mode(f_mode, axis=0)[0][0]

c_mean = np.round(np.mean(c_mean, axis=0), 2)
c_mode = stats.mode(c_mode, axis=0)[0][0]

bars = list(removed_data.columns)
y_pos = np.arange(len(bars))
plt.bar(y_pos-0.15, f_mean, 0.3, color='b', align='center', label='incorrect')
plt.bar(y_pos+0.15, c_mean, 0.3, color='g', align='center', label='correct')
plt.xticks(y_pos, bars, rotation='vertical')
plt.ylabel('Mean')
plt.title('Mean Investigation: Correct vs. Incorrect Predictions - Average Data')
plt.legend()
plt.tight_layout()
plt.show()

bars = list(removed_data.columns)
y_pos = np.arange(len(bars))
plt.bar(y_pos, f_mode, 0.35, color='b', align='center', label='incorrect')
plt.bar(y_pos + 0.35, c_mode, 0.35, color='g', align='center', label='correct')
plt.xticks(y_pos, bars, rotation='vertical')
plt.ylabel('Mode')
plt.title('Mode Investigation: Correct vs. Incorrect Predictions - Average Data')
plt.legend()
plt.tight_layout()
plt.show()


def all_combos_accuracy(removed_data, average_data):
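    """Print precision/recall/f-score for each saved model's test split (the plain-accuracy
    path is left commented out)."""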
models = ['models/' + model for model in os.listdir('models')]
# models = ['models_big/' + model for model in os.listdir('models_big')]

r_train, r_test, r_train_labels, r_test_labels = get_train_test(removed_data)
a_train, a_test, a_train_labels, a_test_labels = get_train_test(average_data)

for model in models:
print(model)
if 'average' in model:
# train_accuracy, test_accuracy = evaluate(model, a_train, a_test, a_train_labels, a_test_labels)
train_accuracy, test_accuracy = evaluate_other_metrics(model, a_train, a_test, a_train_labels,
a_test_labels)
else:
# train_accuracy, test_accuracy = evaluate(model, r_train, r_test, r_train_labels, r_test_labels)
train_accuracy, test_accuracy = evaluate_other_metrics(model, r_train, r_test, r_train_labels,
r_test_labels)

print(test_accuracy)


if __name__ == '__main__':
original, removed, average = preprocessing.preprocess()
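    # uncomment the stage to run: model training, metric evaluation, ROC plotting, or the error analysis below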
# train_all_combos(removed, average)
# all_combos_accuracy(removed, average)
# all_combos_roc(removed, average)
investigate(removed, average)




