Add PHM08 and an initial notebook to test it. Also add .gitignore

rwb11001 · Apr 12, 2024 · ed2cd79 · ed2cd79
1 parent 87662c8
commit ed2cd79
Show file tree

Hide file tree

Showing 8 changed files with 131,225 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.ipynb_checkpoints
diff --git a/Challenge_01.ipynb b/Challenge_01.ipynb
@@ -0,0 +1,262 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d93bf412-ff4d-4f04-a8af-3d101b2b1e48",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import math\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib.colors import ListedColormap\n",
+    "from matplotlib import ticker\n",
+    "import matplotlib as mpl\n",
+    "\n",
+    "from sklearn.datasets import make_circles, make_classification, make_moons\n",
+    "from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n",
+    "from sklearn.inspection import DecisionBoundaryDisplay\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "from imblearn.ensemble import BalancedRandomForestClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fb307961-d858-45e9-80ae-505806ec4063",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train data raw: (45918, 26)\n",
+      "test data raw: (45918, 26)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the raw train and test tables and report the shape of each.\n",
+    "train_data_raw = np.loadtxt(\"Challenge_Data/train.txt\")\n",
+    "print(f\"train data raw: {train_data_raw.shape}\")\n",
+    "\n",
+    "test_data_raw = np.loadtxt(\"Challenge_Data/test.txt\")\n",
+    "# Report the test array's own shape, not the train array's.\n",
+    "print(f\"test data raw: {test_data_raw.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "86dba2b5-498f-4cf0-977c-867fb81406ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We need to do things:\n",
+    "#  - Strip unit number\n",
+    "#  - Generate Labels\n",
+    "def process_raw_data(data):\n",
+    "    (m, _) = data.shape\n",
+    "    y = []\n",
+    "    for i in range(0, m):\n",
+    "        unit = data[i][0]\n",
+    "        if i < m - 1 and unit != data[i + 1][0]:\n",
+    "            y.append(1)\n",
+    "        elif i == m - 1:\n",
+    "            y.append(1)\n",
+    "        else:\n",
+    "            y.append(0)\n",
+    "\n",
+    "    return (data[:,1:], y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "0afb4e8b-0e22-4296-8e7f-5dd2e7019722",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "x train: (45918, 25), y train: 45918, # pos: 218\n",
+      "x test: (29820, 25), y test: 29820, # pos: 218\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Split each raw table into features and per-row labels, then report sizes and positive counts.\n",
+    "x_train, y_train = process_raw_data(train_data_raw)\n",
+    "print(f\"x train: {x_train.shape}, y train: {len(y_train)}, # pos: {np.sum(y_train)}\")\n",
+    "\n",
+    "x_test, y_test = process_raw_data(test_data_raw)\n",
+    "print(f\"x test: {x_test.shape}, y test: {len(y_test)}, # pos: {np.sum(y_test)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "205561a6-7d65-422e-ba47-79f4a4e4398c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute the Perfomance Metrics\n",
+    "#  - A trained classifier\n",
+    "#  - x, y test data\n",
+    "# returns labels, \n",
+    "#  1 -> true positive\n",
+    "#  2 -> true negative\n",
+    "#  3 -> false positive\n",
+    "#  4 -> false negative\n",
+    "def score_classifier(clf, x, y):\n",
+    "    score = clf.score(x, y)\n",
+    "    (m, _) = x.shape\n",
+    "    p = clf.predict(x)\n",
+    "    true_pos = 0\n",
+    "    true_neg = 0\n",
+    "    false_pos = 0\n",
+    "    false_neg = 0\n",
+    "    labels = []\n",
+    "    for i in range(0, m):\n",
+    "        x_i = x[i]\n",
+    "        y_i = y[i]\n",
+    "        y_h = p[i]\n",
+    "\n",
+    "        if y_i == y_h and y_h == 1:\n",
+    "            true_pos += 1\n",
+    "            labels.append(1)\n",
+    "            \n",
+    "        if y_i == y_h and y_h == 0:\n",
+    "            true_neg += 1\n",
+    "            labels.append(2)\n",
+    "\n",
+    "        if y_i != y_h and y_h == 1:\n",
+    "            false_pos += 1\n",
+    "            labels.append(3)\n",
+    "\n",
+    "        if y_i != y_h and y_h == 0:\n",
+    "            false_neg += 1\n",
+    "            labels.append(4)\n",
+    "    \n",
+    "    # Metrics from Table 2 of paper\n",
+    "    tnr = 0\n",
+    "    if true_neg + false_pos != 0:\n",
+    "        tnr = true_neg / (true_neg + false_pos)\n",
+    "\n",
+    "    tpr = 0\n",
+    "    if true_pos + false_neg != 0:\n",
+    "        tpr = true_pos / (true_pos + false_neg)\n",
+    "    \n",
+    "    g_mean = math.sqrt(tnr * tpr)\n",
+    "\n",
+    "    precision = 0\n",
+    "    if true_pos + false_pos != 0:\n",
+    "        precision = true_pos / (true_pos + false_pos)\n",
+    "\n",
+    "    f_measure = 0\n",
+    "    if precision + tpr != 0:\n",
+    "        f_measure = (2 * precision * tpr) / (precision + tpr)\n",
+    "    \n",
+    "    print(f\"TP: {true_pos}, TN: {true_neg}, FP: {false_pos}, FN: {false_neg}\")\n",
+    "    print(f\"tnr: {tnr}, tpr: {tpr}, g_mean: {g_mean}, precision: {precision}, f_measure: {f_measure}, score: {score}\")\n",
+    "\n",
+    "    return labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "65e24363-933c-4bf6-bf3d-3a2c524a7dd2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "random forest\n",
+      "TP: 0, TN: 29602, FP: 0, FN: 218\n",
+      "tnr: 1.0, tpr: 0.0, g_mean: 0.0, precision: 0, f_measure: 0, score: 0.9926894701542589\n",
+      "weighted random forest\n",
+      "TP: 0, TN: 29602, FP: 0, FN: 218\n",
+      "tnr: 1.0, tpr: 0.0, g_mean: 0.0, precision: 0, f_measure: 0, score: 0.9926894701542589\n",
+      "balanced forest\n",
+      "TP: 5, TN: 29531, FP: 71, FN: 213\n",
+      "tnr: 0.997601513411256, tpr: 0.022935779816513763, g_mean: 0.15126390400958672, precision: 0.06578947368421052, f_measure: 0.034013605442176874, score: 0.9904761904761905\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Fixed seed so the forests, and the metrics printed below, are reproducible.\n",
+    "RANDOM_STATE = 42\n",
+    "\n",
+    "# Baseline: plain random forest. score_classifier already reports the\n",
+    "# accuracy, so no separate clf.score() call is needed here.\n",
+    "rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)\n",
+    "rf_clf.fit(x_train, y_train)\n",
+    "print(\"random forest\")\n",
+    "rf_labels = score_classifier(rf_clf, x_test, y_test)\n",
+    "\n",
+    "# Same forest with class 1 weighted 0.99 against 0.01 for class 0.\n",
+    "clf2 = RandomForestClassifier(class_weight={0: 0.01, 1: 0.99}, random_state=RANDOM_STATE)\n",
+    "clf2.fit(x_train, y_train)\n",
+    "print(\"weighted random forest\")\n",
+    "wf_labels = score_classifier(clf2, x_test, y_test)\n",
+    "\n",
+    "# Balanced random forest from imbalanced-learn.\n",
+    "bf_clf = BalancedRandomForestClassifier(\n",
+    "    sampling_strategy='all',\n",
+    "    replacement=True,\n",
+    "    bootstrap=False,\n",
+    "    random_state=RANDOM_STATE,\n",
+    ")\n",
+    "bf_clf.fit(x_train, y_train)\n",
+    "print(\"balanced forest\")\n",
+    "bf_labels = score_classifier(bf_clf, x_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "076b6141-37c7-44b0-898c-12d0651691d3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "balanced forest\n",
+      "TP: 8, TN: 29539, FP: 63, FN: 210\n",
+      "tnr: 0.9978717654212553, tpr: 0.03669724770642202, g_mean: 0.1913613005675611, precision: 0.11267605633802817, f_measure: 0.05536332179930796, score: 0.9908450704225352\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Balanced forest variant with per-subsample class weights.\n",
+    "# max_features=25 equals the number of feature columns in x_train, so every\n",
+    "# split may consider all of them. random_state pins the result so the\n",
+    "# printed metrics are reproducible.\n",
+    "bf_clf = BalancedRandomForestClassifier(\n",
+    "    sampling_strategy='all',\n",
+    "    replacement=True,\n",
+    "    bootstrap=False,\n",
+    "    max_features=25,\n",
+    "    class_weight='balanced_subsample',\n",
+    "    random_state=42,\n",
+    ")\n",
+    "bf_clf.fit(x_train, y_train)\n",
+    "print(\"balanced forest\")\n",
+    "bf_labels = score_classifier(bf_clf, x_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2dbb5d04-32d4-4d78-bdf5-66809cf526c1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Challenge_Data/Challenge Data Description 2016.pdf b/Challenge_Data/Challenge Data Description 2016.pdf