Skip to content

Commit

Permalink
Add PHM08 and an initial notebook to test it. Also add .gitignore
Browse files Browse the repository at this point in the history
  • Loading branch information
RussellBentley committed Apr 12, 2024
1 parent 87662c8 commit ed2cd79
Show file tree
Hide file tree
Showing 8 changed files with 131,225 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.ipynb_checkpoints
262 changes: 262 additions & 0 deletions Challenge_01.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d93bf412-ff4d-4f04-a8af-3d101b2b1e48",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import math\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.colors import ListedColormap\n",
"from matplotlib import ticker\n",
"import matplotlib as mpl\n",
"\n",
"from sklearn.datasets import make_circles, make_classification, make_moons\n",
"from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n",
"from sklearn.inspection import DecisionBoundaryDisplay\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"from imblearn.ensemble import BalancedRandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fb307961-d858-45e9-80ae-505806ec4063",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train data raw: (45918, 26)\n",
"test data raw: (45918, 26)\n"
]
}
],
"source": [
"# Load the raw challenge tables with np.loadtxt (whitespace-delimited by default).\n",
"train_data_raw = np.loadtxt(\"Challenge_Data/train.txt\")\n",
"print(f\"train data raw: {train_data_raw.shape}\")\n",
"\n",
"test_data_raw = np.loadtxt(\"Challenge_Data/test.txt\")\n",
"# Report the test array's own shape; this line used to print the train shape.\n",
"print(f\"test data raw: {test_data_raw.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "86dba2b5-498f-4cf0-977c-867fb81406ee",
"metadata": {},
"outputs": [],
"source": [
"def process_raw_data(data):\n",
"    \"\"\"Split a raw data table into features and end-of-unit labels.\n",
"\n",
"    Two things happen here:\n",
"      - Strip the unit number (column 0) from the features.\n",
"      - Generate labels: a row gets 1 when the next row carries a different\n",
"        unit number, or when it is the final row of the table; otherwise 0.\n",
"\n",
"    `data` must be two-dimensional. Returns a tuple `(features, labels)`\n",
"    where `features` is `data[:, 1:]` and `labels` is a list of ints with\n",
"    one entry per row.\n",
"    \"\"\"\n",
"    row_count, _ = data.shape\n",
"    final_row = row_count - 1\n",
"    # The `row == final_row` test comes first so the look-ahead at row + 1\n",
"    # is never evaluated past the end of the table.\n",
"    labels = [\n",
"        1 if row == final_row or data[row][0] != data[row + 1][0] else 0\n",
"        for row in range(row_count)\n",
"    ]\n",
"    return (data[:, 1:], labels)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "0afb4e8b-0e22-4296-8e7f-5dd2e7019722",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x train: (45918, 25), y train: 45918, # pos: 218\n",
"x test: (29820, 25), y test: 29820, # pos: 218\n"
]
}
],
"source": [
"# Training split: feature matrix plus one end-of-unit label per row.\n",
"x_train, y_train = process_raw_data(train_data_raw)\n",
"print(f\"x train: {x_train.shape}, y train: {len(y_train)}, # pos: {np.sum(y_train)}\")\n",
"\n",
"# Test split, processed the same way.\n",
"x_test, y_test = process_raw_data(test_data_raw)\n",
"print(f\"x test: {x_test.shape}, y test: {len(y_test)}, # pos: {np.sum(y_test)}\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "205561a6-7d65-422e-ba47-79f4a4e4398c",
"metadata": {},
"outputs": [],
"source": [
"def score_classifier(clf, x, y):\n",
"    \"\"\"Compute and print the performance metrics of a trained classifier.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    clf : trained classifier exposing `score(x, y)` and `predict(x)`\n",
"    x : test samples, passed to `clf` unchanged\n",
"    y : true labels for `x` (0 or 1), iterated in order\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of int\n",
"        One outcome code per (true label, prediction) pair:\n",
"          1 -> true positive\n",
"          2 -> true negative\n",
"          3 -> false positive\n",
"          4 -> false negative\n",
"        A prediction that is neither 0 nor 1 is not counted and gets no code.\n",
"    \"\"\"\n",
"\n",
"    def _ratio(numerator, denominator):\n",
"        # An undefined metric (zero denominator) is reported as 0.\n",
"        return numerator / denominator if denominator != 0 else 0\n",
"\n",
"    score = clf.score(x, y)\n",
"    predictions = clf.predict(x)\n",
"\n",
"    true_pos = 0\n",
"    true_neg = 0\n",
"    false_pos = 0\n",
"    false_neg = 0\n",
"    labels = []\n",
"    # Only y and the predictions are needed here; x is never indexed, so any\n",
"    # container the classifier accepts (including a DataFrame) works.\n",
"    for y_i, y_h in zip(y, predictions):\n",
"        correct = y_i == y_h\n",
"        if y_h == 1:\n",
"            if correct:\n",
"                true_pos += 1\n",
"                labels.append(1)\n",
"            else:\n",
"                false_pos += 1\n",
"                labels.append(3)\n",
"        elif y_h == 0:\n",
"            if correct:\n",
"                true_neg += 1\n",
"                labels.append(2)\n",
"            else:\n",
"                false_neg += 1\n",
"                labels.append(4)\n",
"\n",
"    # Metrics from Table 2 of paper\n",
"    tnr = _ratio(true_neg, true_neg + false_pos)\n",
"    tpr = _ratio(true_pos, true_pos + false_neg)\n",
"    g_mean = math.sqrt(tnr * tpr)\n",
"    precision = _ratio(true_pos, true_pos + false_pos)\n",
"    f_measure = _ratio(2 * precision * tpr, precision + tpr)\n",
"\n",
"    print(f\"TP: {true_pos}, TN: {true_neg}, FP: {false_pos}, FN: {false_neg}\")\n",
"    print(f\"tnr: {tnr}, tpr: {tpr}, g_mean: {g_mean}, precision: {precision}, f_measure: {f_measure}, score: {score}\")\n",
"\n",
"    return labels"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "65e24363-933c-4bf6-bf3d-3a2c524a7dd2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"random forest\n",
"TP: 0, TN: 29602, FP: 0, FN: 218\n",
"tnr: 1.0, tpr: 0.0, g_mean: 0.0, precision: 0, f_measure: 0, score: 0.9926894701542589\n",
"weighted random forest\n",
"TP: 0, TN: 29602, FP: 0, FN: 218\n",
"tnr: 1.0, tpr: 0.0, g_mean: 0.0, precision: 0, f_measure: 0, score: 0.9926894701542589\n",
"balanced forest\n",
"TP: 5, TN: 29531, FP: 71, FN: 213\n",
"tnr: 0.997601513411256, tpr: 0.022935779816513763, g_mean: 0.15126390400958672, precision: 0.06578947368421052, f_measure: 0.034013605442176874, score: 0.9904761904761905\n"
]
}
],
"source": [
"# Fixed seed so that Restart & Run All rebuilds the same forests every time.\n",
"RANDOM_STATE = 42\n",
"\n",
"# Baseline: random forest with default settings.\n",
"# score_classifier already computes and prints clf.score, so it is not\n",
"# evaluated separately here (that would be an extra full prediction pass).\n",
"rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)\n",
"rf_clf.fit(x_train, y_train)\n",
"print(\"random forest\")\n",
"rf_labels = score_classifier(rf_clf, x_test, y_test)\n",
"\n",
"# Weighted forest: class 1 carries weight 0.99 against 0.01 for class 0.\n",
"clf2 = RandomForestClassifier(class_weight={0: 0.01, 1: 0.99}, random_state=RANDOM_STATE)\n",
"clf2.fit(x_train, y_train)\n",
"print(\"weighted random forest\")\n",
"wf_labels = score_classifier(clf2, x_test, y_test)\n",
"\n",
"# Balanced random forest from imbalanced-learn.\n",
"bf_clf = BalancedRandomForestClassifier(sampling_strategy='all', replacement=True, bootstrap=False, random_state=RANDOM_STATE)\n",
"bf_clf.fit(x_train, y_train)\n",
"print(\"balanced forest\")\n",
"bf_labels = score_classifier(bf_clf, x_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "076b6141-37c7-44b0-898c-12d0651691d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"balanced forest\n",
"TP: 8, TN: 29539, FP: 63, FN: 210\n",
"tnr: 0.9978717654212553, tpr: 0.03669724770642202, g_mean: 0.1913613005675611, precision: 0.11267605633802817, f_measure: 0.05536332179930796, score: 0.9908450704225352\n"
]
}
],
"source": [
"# Balanced forest variant: max_features=25 equals the 25 feature columns of\n",
"# x_train, and class weights are set to 'balanced_subsample'.\n",
"# random_state is pinned (42) so re-running the cell reproduces the result.\n",
"bf_clf = BalancedRandomForestClassifier(\n",
"    sampling_strategy='all',\n",
"    replacement=True,\n",
"    bootstrap=False,\n",
"    max_features=25,\n",
"    class_weight='balanced_subsample',\n",
"    random_state=42,\n",
")\n",
"bf_clf.fit(x_train, y_train)\n",
"print(\"balanced forest\")\n",
"bf_labels = score_classifier(bf_clf, x_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2dbb5d04-32d4-4d78-bdf5-66809cf526c1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file not shown.
Loading

0 comments on commit ed2cd79

Please sign in to comment.