From c9809a35ebdf94c28fb64aece25a7c6614782e89 Mon Sep 17 00:00:00 2001
From: Luis Roberto Mercado Diaz
Date: Mon, 22 Jan 2024 14:49:34 -0500
Subject: [PATCH] HPC FOLDER

Co-Authored-By: Dong Han
---
 .../active_learning/ss_active_learning.py     |  86 +++++
 HPC/final_project/environment.yml             | 329 ++++++++++++++++++
 ...l_attemp_4_1_Dong_Ohm_summary_20231025.csv | 110 ++++++
 HPC/final_project/gp_al_ss_full.sh            |  34 ++
 HPC/final_project/models/ss_gp_model.py       | 119 +++++++
 HPC/final_project/ss_main.py                  |  79 +++++
 HPC/final_project/utils/data_loader.py        | 222 ++++++++++++
 ...l_attemp_4_1_Dong_Ohm_summary_20231025.csv | 110 ++++++
 HPC/final_project/utils/ss_evaluation.py      |  77 ++++
 HPC/final_project/utils/visualization.py      |  81 +++++
 10 files changed, 1247 insertions(+)
 create mode 100644 HPC/final_project/active_learning/ss_active_learning.py
 create mode 100644 HPC/final_project/environment.yml
 create mode 100644 HPC/final_project/final_attemp_4_1_Dong_Ohm_summary_20231025.csv
 create mode 100644 HPC/final_project/gp_al_ss_full.sh
 create mode 100644 HPC/final_project/models/ss_gp_model.py
 create mode 100644 HPC/final_project/ss_main.py
 create mode 100644 HPC/final_project/utils/data_loader.py
 create mode 100644 HPC/final_project/utils/final_attemp_4_1_Dong_Ohm_summary_20231025.csv
 create mode 100644 HPC/final_project/utils/ss_evaluation.py
 create mode 100644 HPC/final_project/utils/visualization.py

diff --git a/HPC/final_project/active_learning/ss_active_learning.py b/HPC/final_project/active_learning/ss_active_learning.py
new file mode 100644
index 0000000..4d15f68
--- /dev/null
+++ b/HPC/final_project/active_learning/ss_active_learning.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:23:23 2023
+
+@author: lrm22005
+"""
+import numpy as np
+import random
+import torch
+from torch.utils.data import DataLoader
+from sklearn.cluster import MiniBatchKMeans
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def label_samples(uncertain_samples, validation_data):
+    labels = [validation_data[sample_id]['label'] for sample_id in uncertain_samples]
+    return uncertain_samples, labels
+
+def stochastic_uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_batches, n_components=2):
+    gp_model.eval()
+    gp_likelihood.eval()
+    uncertain_sample_indices = []
+    # Randomly sample n_batches from val_loader. enumerate() keeps each batch's
+    # position so that batch-local indices can be mapped back to dataset-global
+    # indices below (this assumes val_loader does not shuffle).
+    sampled_batches = random.sample(list(enumerate(val_loader)), n_batches)
+
+    with torch.no_grad():
+        for batch_idx, batch in sampled_batches:
+            # reduced_data = apply_tsne(batch['data'].reshape(batch['data'].size(0), -1), n_components=n_components)
+            # reduced_data_tensor = torch.Tensor(reduced_data).to(device)
+            reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
+            predictions = gp_likelihood(gp_model(reduced_data_tensor))
+            var = predictions.variance
+            # Collapse any per-task variances into one uncertainty score per sample
+            # (assumes the leading dimension of `var` is the batch dimension).
+            sample_uncertainty = var.reshape(var.size(0), -1).sum(dim=1)
+            top_indices = torch.argsort(-sample_uncertainty)[:n_samples]
+            # Map batch-local indices to dataset-global indices, as in the
+            # reference implementation commented out below.
+            uncertain_sample_indices.extend(batch_idx * val_loader.batch_size + int(idx) for idx in top_indices.cpu())
+
+    return uncertain_sample_indices[:n_samples]
+
+# def uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_components=2):
+#     gp_model.eval()
+#     gp_likelihood.eval()
+#     uncertain_sample_indices = []
+#     with torch.no_grad():
+#         for batch_idx, batch in tqdm(enumerate(val_loader), desc='Uncertainty Sampling', unit='batch'):
+#             reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
+#             predictions = gp_likelihood(gp_model(reduced_data_tensor))
+#             var = predictions.variance
+#             top_indices = torch.argsort(-var.flatten())[:n_samples]
+#             batch_uncertain_indices = [batch_idx * val_loader.batch_size + idx for idx in top_indices]
+#             uncertain_sample_indices.extend(batch_uncertain_indices)
+#     return uncertain_sample_indices[:n_samples]
+
+def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100):
+    # Initialize MiniBatchKMeans
+    minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=batch_size)
+
+    # Iterate through data_loader and fit MiniBatchKMeans incrementally.
+    # scikit-learn operates on CPU NumPy arrays, so the data is kept on the CPU
+    # (the original round trip through `device` and back was a no-op).
+    for batch in data_loader:
+        data = batch['data'].view(batch['data'].size(0), -1).cpu().numpy()
+        minibatch_kmeans.partial_fit(data)
+
+    return minibatch_kmeans
+
+# def compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, device):
+#     # Compare K-Means with GP model predictions
+#     all_data, all_labels = [], []
+#     for batch in data_loader:
+#         data = batch['data'].view(batch['data'].size(0), -1).to(device)
+#         labels = batch['label'].to(device)
+#         gp_predictions = gp_model(data).mean.argmax(dim=0).cpu().numpy()
+#         kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
+#         all_labels.append(labels.cpu().numpy())
+#         all_data.append((gp_predictions, kmeans_predictions))
+#     return all_data, np.concatenate(all_labels)
+
+def stochastic_compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, n_batches, device):
+    all_data, all_labels = [], []
+    sampled_batches = random.sample(list(data_loader), n_batches) # Randomly sample n_batches from data_loader
+
+    for batch in sampled_batches:
+        data = batch['data'].view(batch['data'].size(0), -1).to(device)
+        labels = batch['label'].to(device)
+        # The multitask GP mean has shape (batch, num_tasks); argmax over the last
+        # dimension yields one predicted class per sample (dim=0 would collapse the batch).
+        gp_predictions = gp_model(data).mean.argmax(dim=-1).cpu().numpy()
+        kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
+        all_labels.append(labels.cpu().numpy())
+        all_data.append((gp_predictions, kmeans_predictions))
+
+    return all_data, np.concatenate(all_labels)
\ No newline at end of file
diff --git a/HPC/final_project/environment.yml b/HPC/final_project/environment.yml
new file mode 100644
index 0000000..fa488fe
--- /dev/null
+++ b/HPC/final_project/environment.yml
@@ -0,0 +1,329 @@
+name: pytorch
+channels:
+  - pytorch
+  - pyg
+  - nvidia
+  - conda-forge
+  - anaconda
+  - defaults
+dependencies:
+  - alabaster=0.7.12=pyhd3eb1b0_0
+  - anyio=3.5.0=py311haa95532_0
+  - argon2-cffi=21.3.0=pyhd3eb1b0_0
+  - argon2-cffi-bindings=21.2.0=py311h2bbff1b_0
+  - arrow=1.2.3=py311haa95532_1
+  - astroid=2.14.2=py311haa95532_0
+  - asttokens=2.0.5=pyhd3eb1b0_0
+  - async-lru=2.0.4=py311haa95532_0
+  - atomicwrites=1.4.0=py_0
+  - attrs=23.1.0=py311haa95532_0
+  - autopep8=1.6.0=pyhd3eb1b0_1
+  - babel=2.11.0=py311haa95532_0
+  - backcall=0.2.0=pyhd3eb1b0_0
+  - bcrypt=3.2.0=py311h2bbff1b_1
+  - beautifulsoup4=4.12.2=py311haa95532_0
+  - binaryornot=0.4.4=pyhd3eb1b0_1
+  - black=23.11.0=py311haa95532_0
+  - blas=1.0=mkl
+  - bleach=4.1.0=pyhd3eb1b0_0
+  - bottleneck=1.3.5=py311h5bb9823_0
+  - brotli=1.0.9=ha925a31_2
+  - brotli-python=1.0.9=py311hd77b12b_7
+  - bzip2=1.0.8=he774522_0
+  - ca-certificates=2023.11.17=h56e8100_0
+  - certifi=2023.11.17=pyhd8ed1ab_0
+  - cffi=1.16.0=py311h2bbff1b_0
+  - chardet=4.0.0=py311haa95532_1003
+  - charset-normalizer=2.0.4=pyhd3eb1b0_0
+  - click=8.1.7=py311haa95532_0
+  - cloudpickle=2.2.1=py311haa95532_0
+  - colorama=0.4.6=py311haa95532_0
+  - console_shortcut=0.1.1=4
+  - contourpy=1.2.0=py311h59b6b97_0
+  - cookiecutter=2.5.0=py311haa95532_0
+  - cryptography=41.0.7=py311h89fc84f_0
+  - cuda-cccl=12.3.101=0
+  - cuda-cudart=12.1.105=0
+  - cuda-cudart-dev=12.1.105=0
+  - cuda-cupti=12.1.105=0
+  - 
cuda-libraries=12.1.0=0 + - cuda-libraries-dev=12.1.0=0 + - cuda-nvrtc=12.1.105=0 + - cuda-nvrtc-dev=12.1.105=0 + - cuda-nvtx=12.1.105=0 + - cuda-opencl=12.3.101=0 + - cuda-opencl-dev=12.3.101=0 + - cuda-profiler-api=12.3.101=0 + - cuda-runtime=12.1.0=0 + - cycler=0.12.1=pyhd8ed1ab_0 + - daal4py=2023.1.1=py311h30df693_0 + - dal=2023.1.1=h59b6b97_48682 + - debugpy=1.6.7=py311hd77b12b_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - defusedxml=0.7.1=pyhd3eb1b0_0 + - diff-match-patch=20200713=pyhd3eb1b0_0 + - dill=0.3.7=py311haa95532_0 + - docstring-to-markdown=0.11=py311haa95532_0 + - docutils=0.18.1=py311haa95532_3 + - executing=0.8.3=pyhd3eb1b0_0 + - filelock=3.13.1=py311haa95532_0 + - flake8=6.0.0=py311haa95532_0 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.12.1=ha860e81_0 + - fsspec=2023.12.2=pyhca7485f_0 + - giflib=5.2.1=h8cc25b3_3 + - gmpy2=2.1.2=py311h7f96b67_0 + - gpytorch=1.11=pyhd8ed1ab_0 + - icc_rt=2022.1.0=h6049295_2 + - icu=73.1=h6c2663c_0 + - idna=3.4=py311haa95532_0 + - imagesize=1.4.1=py311haa95532_0 + - imbalanced-learn=0.11.0=pyhd8ed1ab_0 + - importlib-metadata=6.0.0=py311haa95532_0 + - importlib_metadata=6.0.0=hd3eb1b0_0 + - inflection=0.5.1=py311haa95532_0 + - intel-openmp=2023.1.0=h59b6b97_46320 + - intervaltree=3.1.0=pyhd3eb1b0_0 + - ipykernel=6.25.0=py311h746a85d_0 + - ipython=8.15.0=py311haa95532_0 + - ipython_genutils=0.2.0=pyhd3eb1b0_1 + - isort=5.9.3=pyhd3eb1b0_0 + - jaraco.classes=3.2.1=pyhd3eb1b0_0 + - jaxtyping=0.2.25=pyhd8ed1ab_0 + - jedi=0.18.1=py311haa95532_1 + - jellyfish=1.0.1=py311h36a85e1_0 + - jinja2=3.1.2=py311haa95532_0 + - joblib=1.2.0=py311haa95532_0 + - jpeg=9e=h2bbff1b_1 + - json5=0.9.6=pyhd3eb1b0_0 + - jsonschema=4.19.2=py311haa95532_0 + - jsonschema-specifications=2023.7.1=py311haa95532_0 + - jupyter-lsp=2.2.0=py311haa95532_0 + - jupyter_client=8.6.0=py311haa95532_0 + - jupyter_core=5.5.0=py311haa95532_0 + - jupyter_events=0.8.0=py311haa95532_0 + - jupyter_server=2.10.0=py311haa95532_0 + - jupyter_server_terminals=0.4.4=py311haa95532_1 + - jupyterlab=4.0.8=py311haa95532_0 + - jupyterlab_pygments=0.1.2=py_0 + - jupyterlab_server=2.25.1=py311haa95532_0 + - jupyterlab_widgets=3.0.9=py311haa95532_0 + - keyring=23.13.1=py311haa95532_0 + - kiwisolver=1.4.4=py311hd77b12b_0 + - krb5=1.20.1=h5b6d351_0 + - lazy-object-proxy=1.6.0=py311h2bbff1b_0 + - lerc=3.0=hd77b12b_0 + - libclang=14.0.6=default_hb5a9fac_1 + - libclang13=14.0.6=default_h8e68704_1 + - libcublas=12.1.0.26=0 + - libcublas-dev=12.1.0.26=0 + - libcufft=11.0.2.4=0 + - libcufft-dev=11.0.2.4=0 + - libcurand=10.3.4.101=0 + - libcurand-dev=10.3.4.101=0 + - libcusolver=11.4.4.55=0 + - libcusolver-dev=11.4.4.55=0 + - libcusparse=12.0.2.55=0 + - libcusparse-dev=12.0.2.55=0 + - libdeflate=1.17=h2bbff1b_1 + - libffi=3.4.4=hd77b12b_0 + - libjpeg-turbo=2.0.0=h196d8e1_0 + - libnpp=12.0.2.50=0 + - libnpp-dev=12.0.2.50=0 + - libnvjitlink=12.1.105=0 + - libnvjitlink-dev=12.1.105=0 + - libnvjpeg=12.1.1.14=0 + - libnvjpeg-dev=12.1.1.14=0 + - libpng=1.6.39=h8cc25b3_0 + - libpq=12.15=h906ac69_1 + - libsodium=1.0.18=h62dcd97_0 + - libspatialindex=1.9.3=h6c2663c_0 + - libtiff=4.5.1=hd77b12b_0 + - libuv=1.44.2=h2bbff1b_0 + - libwebp=1.3.2=hbc33d0d_0 + - libwebp-base=1.3.2=h2bbff1b_0 + - lightning-utilities=0.10.0=pyhd8ed1ab_0 + - linear_operator=0.5.2=pyhd8ed1ab_0 + - lz4-c=1.9.4=h2bbff1b_0 + - markdown-it-py=2.2.0=py311haa95532_1 + - markupsafe=2.1.1=py311h2bbff1b_0 + - matplotlib=3.8.0=py311haa95532_0 + - matplotlib-base=3.8.0=py311hf62ec03_0 + - matplotlib-inline=0.1.6=py311haa95532_0 + - mccabe=0.7.0=pyhd3eb1b0_0 + - 
mdurl=0.1.0=py311haa95532_0 + - mistune=2.0.4=py311haa95532_0 + - mkl=2023.1.0=h6b88ed4_46358 + - mkl-service=2.4.0=py311h2bbff1b_1 + - mkl_fft=1.3.8=py311h2bbff1b_0 + - mkl_random=1.2.4=py311h59b6b97_0 + - more-itertools=10.1.0=py311haa95532_0 + - mpc=1.1.0=h7edee0f_1 + - mpfr=4.0.2=h62dcd97_1 + - mpir=3.0.0=hec2e145_1 + - mpmath=1.3.0=py311haa95532_0 + - munkres=1.1.4=pyh9f0ad1d_0 + - mypy_extensions=1.0.0=py311haa95532_0 + - nbclient=0.8.0=py311haa95532_0 + - nbconvert=7.10.0=py311haa95532_0 + - nbformat=5.9.2=py311haa95532_0 + - nest-asyncio=1.5.6=py311haa95532_0 + - networkx=3.2.1=pyhd8ed1ab_0 + - notebook=7.0.6=py311haa95532_0 + - notebook-shim=0.2.3=py311haa95532_0 + - numexpr=2.8.7=py311h1fcbade_0 + - numpy=1.26.2=py311hdab7c0b_0 + - numpy-base=1.26.2=py311hd01c5d8_0 + - numpydoc=1.5.0=py311haa95532_0 + - openjpeg=2.4.0=h4fc8c34_0 + - openssl=3.0.12=h2bbff1b_0 + - overrides=7.4.0=py311haa95532_0 + - packaging=23.1=py311haa95532_0 + - pandas=2.1.4=py311hf62ec03_0 + - pandas-profiling=1.4.1=0 + - pandocfilters=1.5.0=pyhd3eb1b0_0 + - paramiko=2.8.1=pyhd3eb1b0_0 + - parso=0.8.3=pyhd3eb1b0_0 + - pathspec=0.10.3=py311haa95532_0 + - patsy=0.5.3=py311haa95532_0 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=10.0.1=py311h045eedc_0 + - pip=23.3.1=py311haa95532_0 + - platformdirs=3.10.0=py311haa95532_0 + - pluggy=1.0.0=py311haa95532_1 + - ply=3.11=py311haa95532_0 + - prometheus_client=0.14.1=py311haa95532_0 + - prompt-toolkit=3.0.36=py311haa95532_0 + - psutil=5.9.0=py311h2bbff1b_0 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pycodestyle=2.10.0=py311haa95532_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pydocstyle=6.3.0=py311haa95532_0 + - pyflakes=3.0.1=py311haa95532_0 + - pyg=2.4.0=py311_torch_2.1.0_cu121 + - pygments=2.15.1=py311haa95532_1 + - pylint=2.16.2=py311haa95532_0 + - pylint-venv=2.3.0=py311haa95532_0 + - pyls-spyder=0.4.0=pyhd3eb1b0_0 + - pynacl=1.5.0=py311h8cc25b3_0 + - pyopenssl=23.2.0=py311haa95532_0 + - pyparsing=3.0.9=py311haa95532_0 + - pyqt=5.15.10=py311hd77b12b_0 + - pyqt5-sip=12.13.0=py311h2bbff1b_0 + - pyqtwebengine=5.15.10=py311hd77b12b_0 + - pysocks=1.7.1=py311haa95532_0 + - python=3.11.5=he1021f5_0 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python-fastjsonschema=2.16.2=py311haa95532_0 + - python-json-logger=2.0.7=py311haa95532_0 + - python-lsp-black=1.2.1=py311haa95532_0 + - python-lsp-jsonrpc=1.0.0=pyhd3eb1b0_0 + - python-lsp-server=1.7.2=py311haa95532_0 + - python-slugify=5.0.2=pyhd3eb1b0_0 + - python-tzdata=2023.3=pyhd3eb1b0_0 + - pytoolconfig=1.2.6=py311haa95532_0 + - pytorch=2.1.2=py3.11_cuda12.1_cudnn8_0 + - pytorch-cuda=12.1=hde6ce7c_5 + - pytorch-lightning=2.1.2=pyhd8ed1ab_0 + - pytorch-mutex=1.0=cuda + - pytz=2023.3.post1=py311haa95532_0 + - pywin32=305=py311h2bbff1b_0 + - pywin32-ctypes=0.2.0=py311haa95532_1000 + - pywinpty=2.0.10=py311h5da7b33_0 + - pyyaml=6.0.1=py311h2bbff1b_0 + - pyzmq=25.1.0=py311hd77b12b_0 + - qdarkstyle=3.0.2=pyhd3eb1b0_0 + - qstylizer=0.2.2=py311haa95532_0 + - qt-main=5.15.2=h19c9488_10 + - qt-webengine=5.15.9=h5bd16bc_7 + - qtawesome=1.2.2=py311haa95532_0 + - qtconsole=5.4.2=py311haa95532_0 + - qtpy=2.4.1=py311haa95532_0 + - referencing=0.30.2=py311haa95532_0 + - requests=2.31.0=py311haa95532_0 + - rfc3339-validator=0.1.4=py311haa95532_0 + - rfc3986-validator=0.1.1=py311haa95532_0 + - rich=13.3.5=py311haa95532_0 + - rope=1.7.0=py311haa95532_0 + - rpds-py=0.10.6=py311h062c2fa_0 + - rtree=1.0.1=py311h2eaa2aa_0 + - scikit-learn=1.2.2=py311hd77b12b_1 + - 
scikit-learn-intelex=2023.1.1=py311haa95532_0 + - scipy=1.11.4=py311hc1ccb85_0 + - seaborn=0.12.2=py311haa95532_0 + - send2trash=1.8.2=py311haa95532_0 + - setuptools=68.2.2=py311haa95532_0 + - sip=6.7.12=py311hd77b12b_0 + - six=1.16.0=pyhd3eb1b0_1 + - sniffio=1.2.0=py311haa95532_1 + - snowballstemmer=2.2.0=pyhd3eb1b0_0 + - sortedcontainers=2.4.0=pyhd3eb1b0_0 + - soupsieve=2.5=py311haa95532_0 + - sphinx=5.0.2=py311haa95532_0 + - sphinxcontrib-applehelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-devhelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-htmlhelp=2.0.0=pyhd3eb1b0_0 + - sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0 + - sphinxcontrib-qthelp=1.0.3=pyhd3eb1b0_0 + - sphinxcontrib-serializinghtml=1.1.5=pyhd3eb1b0_0 + - spyder=5.4.3=py311haa95532_1 + - spyder-kernels=2.4.4=py311haa95532_0 + - sqlite=3.41.2=h2bbff1b_0 + - stack_data=0.2.0=pyhd3eb1b0_0 + - statsmodels=0.14.0=py311hd7041d2_0 + - sympy=1.12=py311haa95532_0 + - tbb=2021.8.0=h59b6b97_0 + - terminado=0.17.1=py311haa95532_0 + - text-unidecode=1.3=pyhd3eb1b0_0 + - textdistance=4.2.1=pyhd3eb1b0_0 + - threadpoolctl=2.2.0=pyh0d69192_0 + - three-merge=0.1.1=pyhd3eb1b0_0 + - tinycss2=1.2.1=py311haa95532_0 + - tk=8.6.12=h2bbff1b_0 + - toml=0.10.2=pyhd3eb1b0_0 + - tomlkit=0.11.1=py311haa95532_0 + - torchmetrics=1.2.1=pyhd8ed1ab_0 + - tornado=6.3.3=py311h2bbff1b_0 + - tqdm=4.66.1=pyhd8ed1ab_0 + - traitlets=5.7.1=py311haa95532_0 + - typeguard=2.13.3=pyhd8ed1ab_0 + - typing-extensions=4.7.1=py311haa95532_0 + - typing_extensions=4.7.1=py311haa95532_0 + - tzdata=2023c=h04d1e81_0 + - ujson=5.4.0=py311hd77b12b_0 + - unidecode=1.2.0=pyhd3eb1b0_0 + - urllib3=1.26.18=py311haa95532_0 + - vc=14.2=h21ff451_1 + - vs2015_runtime=14.27.29016=h5e58377_2 + - watchdog=2.1.6=py311haa95532_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - webencodings=0.5.1=py311haa95532_1 + - websocket-client=0.58.0=py311haa95532_4 + - whatthepatch=1.0.2=py311haa95532_0 + - wheel=0.41.2=py311haa95532_0 + - widgetsnbextension=4.0.9=pyhd8ed1ab_0 + - win_inet_pton=1.1.0=py311haa95532_0 + - winpty=0.4.3=4 + - wrapt=1.14.1=py311h2bbff1b_0 + - xz=5.4.5=h8cc25b3_0 + - yaml=0.2.5=he774522_0 + - yapf=0.31.0=pyhd3eb1b0_0 + - zeromq=4.3.4=hd77b12b_0 + - zipp=3.11.0=py311haa95532_0 + - zlib=1.2.13=h8cc25b3_0 + - zstd=1.5.5=hd43e919_0 + - pip: + - comm==0.2.0 + - fqdn==1.5.1 + - ipywidgets==8.1.1 + - isoduration==20.11.0 + - jsonpointer==2.4 + - jupyter==1.0.0 + - jupyter-console==6.6.3 + - torchaudio==2.1.2 + - torchvision==0.16.2 + - uri-template==1.3.0 + - webcolors==1.13 +prefix: C:\Users\lrm22005\AppData\Local\anaconda\envs\pytorch diff --git a/HPC/final_project/final_attemp_4_1_Dong_Ohm_summary_20231025.csv b/HPC/final_project/final_attemp_4_1_Dong_Ohm_summary_20231025.csv new file mode 100644 index 0000000..ab8e70b --- /dev/null +++ b/HPC/final_project/final_attemp_4_1_Dong_Ohm_summary_20231025.csv @@ -0,0 +1,110 @@ +UID,NSR,AF,PACPVC,SVT,Noisy +002,102,0,21,0,0 +003,3193,0,0,1,6865 +005,642,0,587,1,2208 +007,101,0,21,1,864 +011,97,0,2,0,468 +012,2698,0,9,1,10297 +013,7254,0,59,1,8966 +017,0,170,0,0,0 +020,2589,0,26,2,1556 +021,511,0,1,1,322 +022,4122,0,5,1,1110 +024,7116,0,1,0,0 +026,168,0,1,2,0 +027,9366,0,0,0,0 +028,3442,0,403,7,12994 +029,5088,0,1541,12,12987 +030,2386,0,1108,2,15570 +034,0,0,0,6,0 +035,472,0,1108,2,1255 +036,4882,0,1108,2,12845 +037,1050,0,1108,2,6350 +038,272,0,228,12,0 +039,3566,0,2,1,5154 +041,1253,0,0,1,3471 +042,125,0,1,0,415 +044,738,0,1,0,3819 +045,2905,0,155,1,4323 +047,1366,0,106,1,3585 +049,1529,0,17,1,4799 +050,274,0,1,0,6303 +052,1567,0,3,1,2149 +053,3504,0,36,3,7169 
+054,192,0,635,0,0 +055,82,0,34,3,270 +056,62,0,34,3,310 +057,5079,0,34,3,0 +058,1307,0,34,3,1936 +062,2766,0,0,0,4723 +063,1379,0,34,3,1787 +064,3579,0,58,1,3 +068,3134,0,34,0,3634 +069,362,0,66,13,3824 +070,791,0,4,15,1143 +073,1986,0,116,2,4916 +074,87,0,23,1,619 +075,196,13,2110,47,1 +077,2802,0,23,1,4604 +078,5325,0,9,1,0 +080,6305,0,284,5,6673 +082,0,0,0,3,0 +083,78,0,1,0,790 +084,3781,0,0,0,545 +086,10,0,67,1,0 +087,0,0,11,0,0 +088,1644,0,0,0,3 +089,0,0,4,0,0 +090,1979,0,1,0,795 +091,1253,0,4,0,4746 +093,0,0,800,1,0 +094,109,0,0,0,612 +098,82,0,0,0,6166 +099,3373,0,1,1,3749 +100,87,0,1485,193,0 +101,477,0,0,0,4602 +104,0,0,12,1,0 +106,531,0,1,0,2480 +108,0,0,0,0,0 +109,615,0,2,0,886 +110,12,1,261,1,0 +111,109,0,0,0,215 +112,5357,0,642,61,4 +113,0,0,4668,1,0 +118,3544,0,259,2,5056 +119,16,0,0,0,1877 +120,15,0,19,0,12615 +301,0,17,0,0,0 +302,0,2,0,0,0 +305,0,10,0,0,0 +306,0,2,0,0,0 +307,0,39,0,0,0 +310,0,6,0,0,0 +311,0,19,0,0,0 +312,0,3,0,0,0 +318,0,8,0,0,0 +319,0,13,0,0,0 +320,0,8,0,0,0 +321,0,11,0,0,0 +322,0,7,0,0,0 +324,0,6,0,0,0 +325,14,0,0,0,0 +327,2,0,6,0,0 +329,0,15,0,0,0 +400,0,600,0,0,0 +402,0,2213,0,0,0 +405,0,406,0,0,0 +406,0,1902,0,0,0 +407,0,166,0,0,0 +408,866,50,29,0,0 +409,0,584,0,0,0 +410,0,3525,0,0,0 +413,0,4804,0,0,0 +414,0,2004,0,0,0 +415,0,1955,0,0,0 +416,0,762,0,0,0 +419,1383,1491,27,1,0 +420,0,165,0,0,0 +421,0,2016,0,0,0 +422,0,103,0,0,0 +423,0,1459,0,0,0 diff --git a/HPC/final_project/gp_al_ss_full.sh b/HPC/final_project/gp_al_ss_full.sh new file mode 100644 index 0000000..1338f92 --- /dev/null +++ b/HPC/final_project/gp_al_ss_full.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --partition=general-gpu +#SBATCH --ntasks=1 +#SBATCH --mem=32GB +#SBATCH --nodes=1 +#SBATCH --time=01:00:00 +#SBATCH --mail-type=END +#SBATCH --mail-user=luis.mercado_diaz@uconn.edu +#SBATCH --output=sbatch_output_ss_main.txt +#SBATCH --error=sbatch_error_ss_main.txt +#SBATCH --gres=gpu:1 + +# Load necessary modules +module purge # unload all modules from user's environment +module load slurm cuda/11.6 cudnn/8.6.0 + +# Activate Miniconda +source /path/to/miniconda3/etc/profile.d/conda.sh + +# If the environment doesn't exist, create it from the YAML file +if ! 
conda info --envs | grep -q "^pytorch "; then
+    conda env create -f environment.yml
+fi
+
+# Activate your environment
+conda activate pytorch
+
+# Navigate to your project directory (change to your actual directory)
+cd /home/lrm22005/ML_Notebooks/Arrhytmia_GP/final_project/
+
+# Execute your Python script
+python3 ss_main.py
+
+#exit
\ No newline at end of file
diff --git a/HPC/final_project/models/ss_gp_model.py b/HPC/final_project/models/ss_gp_model.py
new file mode 100644
index 0000000..1a1bb90
--- /dev/null
+++ b/HPC/final_project/models/ss_gp_model.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:01:41 2023
+
+@author: lrm22005
+"""
+from tqdm import tqdm
+import torch
+import gpytorch
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # used by train_gp_model below
+
+num_latents = 6 # This should match the complexity of your data or the number of tasks
+num_tasks = 4 # This should match the number of output classes or tasks
+num_inducing_points = 50 # This is independent and should be sufficient for the input space
+
+class MultitaskGPModel(gpytorch.models.ApproximateGP):
+    def __init__(self):
+        # Let's use a different set of inducing points for each latent function
+        inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images
+
+        # We have to mark the CholeskyVariationalDistribution as batch
+        # so that we learn a variational distribution for each task
+        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
+            inducing_points.size(-2), batch_shape=torch.Size([num_latents])
+        )
+
+        # We have to wrap the VariationalStrategy in a LMCVariationalStrategy
+        # so that the output will be a MultitaskMultivariateNormal rather than a batch output
+        variational_strategy = gpytorch.variational.LMCVariationalStrategy(
+            gpytorch.variational.VariationalStrategy(
+                self, inducing_points, variational_distribution, learn_inducing_locations=True
+            ),
+            num_tasks=num_tasks,
+            num_latents=num_latents,
+            latent_dim=-1
+        )
+
+        super().__init__(variational_strategy)
+
+        # The mean and covariance modules should be marked as batch
+        # so we learn a different set of hyperparameters
+        self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents]))
+        self.covar_module = gpytorch.kernels.ScaleKernel(
+            gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])),
+            batch_shape=torch.Size([num_latents])
+        )
+
+    def forward(self, x):
+        # The forward function should be written as if we were dealing with each output
+        # dimension in batch.
+        # x must already be flattened to shape [N, 128*128] so that its last
+        # dimension matches the inducing points; N can be any batch size.
+        # print(f"Input shape: {x.shape}")
+        # x = x.view(x.size(0), -1) # Flattening the images
+        # print(f"Input shape after flattening: {x.shape}") # Debugging input shape
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+
+        # Debugging: Print shapes of intermediate outputs
+        # print(f"Mean shape: {mean_x.shape}, Covariance shape: {covar_x.shape}")
+        latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+        # print(f"Latent prediction shape: {latent_pred.mean.shape}, {latent_pred.covariance_matrix.shape}")
+
+        return latent_pred
+
+
+def train_gp_model(train_x, train_y, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'):
+    model = MultitaskGPModel().to(device)
+    # num_features must match the GP's num_tasks; n_classes is the number of output classes
+    likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=num_tasks, num_classes=n_classes).to(device)
+    model.train()
+    likelihood.train()
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
+    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
+
+    best_val_loss = float('inf')
+    epochs_no_improve = 0
+
+    for i in tqdm(range(num_iterations), desc='Training', unit='iter', leave=False):
+        optimizer.zero_grad()
+        output = model(train_x)
+        loss = -mll(output, train_y)
+        scalar_loss = loss.sum() if loss.numel() > 1 else loss
+        scalar_loss.backward()
+        optimizer.step()
+
+        # Validation step
+        model.eval()
+        likelihood.eval()
+        with torch.no_grad():
+            val_loss = 0.0
+            for val_batch in val_loader:
+                val_x, val_y = val_batch['data'].view(val_batch['data'].size(0), -1).to(device), val_batch['label'].to(device)
+                val_output = model(val_x)
+                val_loss += -mll(val_output, val_y).item()
+            val_loss /= len(val_loader)
+
+        model.train()
+        likelihood.train()
+
+        # Early stopping and checkpointing based on validation loss
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+            epochs_no_improve = 0
+            torch.save({'model_state_dict': model.state_dict(),
+                        'likelihood_state_dict': likelihood.state_dict(),
+                        'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
+        else:
+            epochs_no_improve += 1
+            if epochs_no_improve == patience:
+                print(f"Early stopping triggered at iteration {i+1}")
+                break
+
+    # Load the best model before return
+    checkpoint = torch.load(checkpoint_path)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    likelihood.load_state_dict(checkpoint['likelihood_state_dict'])
+    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+
+    return model, likelihood
\ No newline at end of file
diff --git a/HPC/final_project/ss_main.py b/HPC/final_project/ss_main.py
new file mode 100644
index 0000000..706ea63
--- /dev/null
+++ b/HPC/final_project/ss_main.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:47:27 2023
+
+@author: lrm22005
+"""
+from tqdm import tqdm
+import torch
+# update_train_loader_with_uncertain_samples is defined in utils.data_loader
+from utils.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples
+from models.ss_gp_model import MultitaskGPModel, train_gp_model
+from utils.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data
+from active_learning.ss_active_learning import stochastic_uncertainty_sampling, run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions
+from utils.visualization import plot_comparative_results, plot_training_performance, plot_results
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def main():
+    # Set parameters like n_classes, batch_size, etc.
+    n_classes = 4
+    batch_size = 512
+    clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled = split_uids()
+    # Preprocess data
+    train_loader, val_loader, test_loader = preprocess_data('pt', clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size)
+
+    kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device)
+
+    # Initialize result storage. Note that train_loss stays empty unless
+    # train_gp_model is extended to return its per-iteration losses.
+    results = {
+        'train_loss': [],
+        'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []},
+        'test_metrics': None # This will be filled in with the final test metrics
+    }
+
+    # Initial model training. Each call to train_gp_model fits a fresh model,
+    # so the model kept after this loop is the one trained on the last batch.
+    for train_batch in train_loader:
+        train_x = train_batch['data'].view(train_batch['data'].size(0), -1).to(device)
+        train_y = train_batch['label'].to(device)
+        model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=n_classes)
+
+    active_learning_iterations = 10
+    n_samples = batch_size # Number of uncertain samples to accumulate
+    for iteration in tqdm(range(active_learning_iterations), desc='Active Learning', unit='iteration', leave=True):
+        uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples, n_batches=5, n_components=2)
+
+        # Accumulate indices of uncertain samples
+        accumulated_indices = list(uncertain_sample_indices)
+
+        # Update the training loader with indices of uncertain samples
+        train_loader = update_train_loader_with_uncertain_samples(train_loader, accumulated_indices, batch_size)
+
+        # Re-train the model with the updated train_loader
+        for train_batch in tqdm(train_loader, desc='Batch Training', leave=False):
+            train_x = train_batch['data'].view(train_batch['data'].size(0), -1).to(device) # Flatten the image
+            train_y = train_batch['label'].to(device)
+            model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=n_classes)
+        val_metrics = stochastic_evaluation(model, likelihood, val_loader, device, n_classes, n_batches=5)
+        for metric in ['precision', 'recall', 'f1', 'auc_roc']:
+            results['validation_metrics'][metric].append(val_metrics[metric])
+
+    # Compare K-Means with GP model predictions after retraining
+    gp_vs_kmeans_data, original_labels = stochastic_compare_kmeans_gp_predictions(kmeans_model, model, train_loader, n_batches=5, device=device)
+
+    plot_comparative_results(gp_vs_kmeans_data, original_labels)
+
+    plot_training_performance(results['train_loss'], results['validation_metrics'])
+
+    # Final evaluation on test set
+    classification_result = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes=n_classes)
+    # Store test metrics
+    results['test_metrics'] = classification_result
+    # Now the results dictionary is ready to be used for plotting
+    plot_results(results)
+    # The final test metrics can also be printed or logged
+    print("Final Test Metrics:", results['test_metrics'])
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/HPC/final_project/utils/data_loader.py b/HPC/final_project/utils/data_loader.py
new file mode 100644
index 0000000..8930e6f
--- /dev/null
+++ b/HPC/final_project/utils/data_loader.py
@@ -0,0 +1,222 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:09:02 2023
+
+@author: lrm22005
+"""
+import os
+import pandas as pd
+import torch
+from torch.utils.data import Dataset, DataLoader
+from sklearn.preprocessing import StandardScaler
+
+def split_uids():
+    # ====== Load the per subject arrhythmia summary ======
+    df_summary = pd.read_csv(r'final_attemp_4_1_Dong_Ohm_summary_20231025.csv')
+    df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3)
+
+    df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT']
+    df_summary['sample_AF'] = df_summary['AF']
+
+    df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF'])
+
+    all_UIDs = df_summary['UID'].unique()
+    # ====================================================
+    # ====== AF trial separation ======
+    # R:\ENGR_Chon\Dong\Numbers\Pulsewatch_numbers\Fahimeh_CNNED_general_ExpertSystemwApplication\tbl_file_name\TrainingSet_final_segments
+    AF_trial_Fahimeh_train = ['402','410']
+    AF_trial_Fahimeh_test = ['301', '302', '305', '306', '307', '310', '311',
+                             '312', '318', '319', '320', '321', '322', '324',
+                             '325', '327', '329', '400', '406', '407', '409',
+                             '414']
+    AF_trial_Fahimeh_did_not_use = ['405', '413', '415', '416', '420', '421', '422', '423']
+    AF_trial_paroxysmal_AF = ['408','419']
+
+    AF_trial_train = AF_trial_Fahimeh_train
+    AF_trial_test = AF_trial_Fahimeh_test
+    AF_trial_unlabeled = AF_trial_Fahimeh_did_not_use + AF_trial_paroxysmal_AF
+    print(f'AF trial: {len(AF_trial_train)} training subjects {AF_trial_train}')
+    print(f'AF trial: {len(AF_trial_test)} testing subjects {AF_trial_test}')
+    print(f'AF trial: {len(AF_trial_unlabeled)} unlabeled subjects {AF_trial_unlabeled}')
+    # =================================
+    # === Clinical trial AF subjects separation ===
+    clinical_trial_AF_subjects = ['005', '017', '026', '051', '075', '082']
+
+    remaining_UIDs = []
+    count_NSR = []
+    import math
+    for index, row in df_summary.iterrows():
+        UID = row['UID']
+        this_NSR = row['sample_nonAF']
+        if math.isnan(this_NSR):
+            # There is no segment in this subject, skip this UID.
+            print(f'---------UID {UID} has no segments.------------')
+            continue
+        if UID not in AF_trial_train and UID not in AF_trial_test and UID not in clinical_trial_AF_subjects \
+            and not UID[0] == '3' and not UID[0] == '4':
+            remaining_UIDs.append(UID)
+            count_NSR.append(this_NSR)
+
+    from numpy import random
+    random.seed(seed=42)
+    from numpy.random import choice
+    list_of_candidates = remaining_UIDs
+    number_of_items_to_pick = round(len(list_of_candidates) * 0.15) # 10% labeled for training, 5% for testing.
+    temp_sum = sum(count_NSR)
+    # Subjects with fewer segments have a higher chance of being selected; the weights sum to one.
+    probability_distribution = [(1 - x / temp_sum) / (len(count_NSR) - 1) for x in count_NSR]
+    draw = choice(list_of_candidates, number_of_items_to_pick,
+                  p=probability_distribution, replace=False)
+
+    clinical_trial_train = list(draw[:round(len(list_of_candidates) * 0.1)])
+    clinical_trial_test_nonAF = list(draw[round(len(list_of_candidates) * 0.1):])
+    clinical_trial_test_temp = clinical_trial_test_nonAF + clinical_trial_AF_subjects
+    clinical_trial_test = []
+    for UID in clinical_trial_test_temp:
+        # UID 051 and maybe other UIDs had no segments (unknown reason).
+ if UID in all_UIDs: + clinical_trial_test.append(UID) + + clinical_trial_unlabeled = [] + for UID in all_UIDs: + if UID not in clinical_trial_train and UID not in clinical_trial_test and not UID[0] == '3' and not UID[0] == '4': + clinical_trial_unlabeled.append(UID) + print(f'Clinical trial: selected {len(clinical_trial_train)} UIDs for training {clinical_trial_train}') + print(f'Clinical trial: selected {len(clinical_trial_test)} UIDs for testing {clinical_trial_test}') + print(f'Clinical trial: selected {len(clinical_trial_unlabeled)} UIDs for unlabeled {clinical_trial_unlabeled}') + return clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, UIDs, standardize=True, read_all_labels=False): + self.data_path = data_path + self.labels_path = labels_path + self.UIDs = UIDs + self.standardize = standardize + self.read_all_labels = read_all_labels + self.refresh_dataset() + + def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + def add_uids(self, new_uids): + unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] + self.UIDs.extend(unique_new_uids) + self.refresh_dataset() + + def __len__(self): + return len(self.segment_names) + + def __getitem__(self, idx): + segment_name = self.segment_names[idx] + label = self.labels[segment_name] + time_freq_tensor = self.load_data(segment_name) + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} + + def extract_segment_names_and_labels(self): + segment_names = [] + labels = {} + + for UID in self.UIDs: + label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) + for idx, segment_name in enumerate(label_segment_names): + label_val = label_data['label'].values[idx] + if self.read_all_labels: + # Assign -1 if label is not in [0, 1, 2, 3] + labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2, 3] + if label_val in [0, 1, 2, 3] and segment_name not in segment_names: + segment_names.append(segment_name) + labels[segment_name] = label_val + + return segment_names, labels + + def load_data(self, segment_name): + data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0]) + seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.pt') + try: + time_freq_tensor = torch.load(seg_path) + if self.standardize: + time_freq_tensor = self.standard_scaling(time_freq_tensor) + return time_freq_tensor.clone() + except Exception as e: + print(f"Error processing segment: {segment_name}. 
Exception: {str(e)}") + return torch.zeros((1, 128, 128)) + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) + return torch.Tensor(data) + +def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, read_all_labels=False, drop_last=False, num_workers=4): + dataset = CustomDataset(data_path, labels_path, UIDs, standardize, read_all_labels) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2) + return dataloader + +def get_data_paths(data_format, is_linux=False, is_hpc=False): + if is_linux: + base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" + labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + saving_base_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis" + elif is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + saving_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis" + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch" + labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" + saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case" + if data_format == 'csv': + data_path = os.path.join(base_path, "TFS_csv") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + elif data_format == 'png': + data_path = os.path.join(base_path, "TFS_plots") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + elif data_format == 'pt': + data_path = os.path.join(base_path, "PT_format") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + else: + raise ValueError("Invalid data format. Choose 'csv' or 'png.") + return data_path, labels_path, saving_path + +# Function to extract and preprocess data +def preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size, read_all_labels=True): + # Extracts paths and loads data into train, validation, and test loaders + data_path, labels_path, saving_path = get_data_paths(data_format, is_hpc=True) + train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=True, read_all_labels=False) + val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=True, read_all_labels=read_all_labels) + test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=True, read_all_labels=read_all_labels) + return train_loader, val_loader, test_loader + +def map_samples_to_uids(uncertain_sample_indices, dataset): + """ + Maps indices of uncertain samples back to their corresponding segment names or UIDs. + + Args: + - uncertain_sample_indices: Indices of the uncertain samples in the dataset. + - dataset: The dataset object which contains the mapping of segment names and UIDs. + + Returns: + - List of UIDs or segment names corresponding to the uncertain samples. 
+ """ + return [dataset.segment_names[i] for i in uncertain_sample_indices] + +def update_train_loader_with_uncertain_samples(current_train_loader, new_sample_indices, batch_size, standardize=False, data_format='csv', read_all_labels=True): + # Extract current UIDs from the current_train_loader + current_dataset = current_train_loader.dataset + # Map new_samples back to their corresponding segment names or UIDs + new_uids = map_samples_to_uids(new_sample_indices, current_dataset) + # Add new UIDs to the current dataset and refresh it + current_dataset.add_uids(new_uids) + # Create new DataLoader with the updated dataset + updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=False) + return updated_train_loader \ No newline at end of file diff --git a/HPC/final_project/utils/final_attemp_4_1_Dong_Ohm_summary_20231025.csv b/HPC/final_project/utils/final_attemp_4_1_Dong_Ohm_summary_20231025.csv new file mode 100644 index 0000000..ab8e70b --- /dev/null +++ b/HPC/final_project/utils/final_attemp_4_1_Dong_Ohm_summary_20231025.csv @@ -0,0 +1,110 @@ +UID,NSR,AF,PACPVC,SVT,Noisy +002,102,0,21,0,0 +003,3193,0,0,1,6865 +005,642,0,587,1,2208 +007,101,0,21,1,864 +011,97,0,2,0,468 +012,2698,0,9,1,10297 +013,7254,0,59,1,8966 +017,0,170,0,0,0 +020,2589,0,26,2,1556 +021,511,0,1,1,322 +022,4122,0,5,1,1110 +024,7116,0,1,0,0 +026,168,0,1,2,0 +027,9366,0,0,0,0 +028,3442,0,403,7,12994 +029,5088,0,1541,12,12987 +030,2386,0,1108,2,15570 +034,0,0,0,6,0 +035,472,0,1108,2,1255 +036,4882,0,1108,2,12845 +037,1050,0,1108,2,6350 +038,272,0,228,12,0 +039,3566,0,2,1,5154 +041,1253,0,0,1,3471 +042,125,0,1,0,415 +044,738,0,1,0,3819 +045,2905,0,155,1,4323 +047,1366,0,106,1,3585 +049,1529,0,17,1,4799 +050,274,0,1,0,6303 +052,1567,0,3,1,2149 +053,3504,0,36,3,7169 +054,192,0,635,0,0 +055,82,0,34,3,270 +056,62,0,34,3,310 +057,5079,0,34,3,0 +058,1307,0,34,3,1936 +062,2766,0,0,0,4723 +063,1379,0,34,3,1787 +064,3579,0,58,1,3 +068,3134,0,34,0,3634 +069,362,0,66,13,3824 +070,791,0,4,15,1143 +073,1986,0,116,2,4916 +074,87,0,23,1,619 +075,196,13,2110,47,1 +077,2802,0,23,1,4604 +078,5325,0,9,1,0 +080,6305,0,284,5,6673 +082,0,0,0,3,0 +083,78,0,1,0,790 +084,3781,0,0,0,545 +086,10,0,67,1,0 +087,0,0,11,0,0 +088,1644,0,0,0,3 +089,0,0,4,0,0 +090,1979,0,1,0,795 +091,1253,0,4,0,4746 +093,0,0,800,1,0 +094,109,0,0,0,612 +098,82,0,0,0,6166 +099,3373,0,1,1,3749 +100,87,0,1485,193,0 +101,477,0,0,0,4602 +104,0,0,12,1,0 +106,531,0,1,0,2480 +108,0,0,0,0,0 +109,615,0,2,0,886 +110,12,1,261,1,0 +111,109,0,0,0,215 +112,5357,0,642,61,4 +113,0,0,4668,1,0 +118,3544,0,259,2,5056 +119,16,0,0,0,1877 +120,15,0,19,0,12615 +301,0,17,0,0,0 +302,0,2,0,0,0 +305,0,10,0,0,0 +306,0,2,0,0,0 +307,0,39,0,0,0 +310,0,6,0,0,0 +311,0,19,0,0,0 +312,0,3,0,0,0 +318,0,8,0,0,0 +319,0,13,0,0,0 +320,0,8,0,0,0 +321,0,11,0,0,0 +322,0,7,0,0,0 +324,0,6,0,0,0 +325,14,0,0,0,0 +327,2,0,6,0,0 +329,0,15,0,0,0 +400,0,600,0,0,0 +402,0,2213,0,0,0 +405,0,406,0,0,0 +406,0,1902,0,0,0 +407,0,166,0,0,0 +408,866,50,29,0,0 +409,0,584,0,0,0 +410,0,3525,0,0,0 +413,0,4804,0,0,0 +414,0,2004,0,0,0 +415,0,1955,0,0,0 +416,0,762,0,0,0 +419,1383,1491,27,1,0 +420,0,165,0,0,0 +421,0,2016,0,0,0 +422,0,103,0,0,0 +423,0,1459,0,0,0 diff --git a/HPC/final_project/utils/ss_evaluation.py b/HPC/final_project/utils/ss_evaluation.py new file mode 100644 index 0000000..4729701 --- /dev/null +++ b/HPC/final_project/utils/ss_evaluation.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Dec 18 18:17:40 2023 + +@author: lrm22005 +""" +import numpy as np +import torch +import gpytorch 
+from sklearn.preprocessing import label_binarize
+from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
+from sklearn.metrics import precision_recall_fscore_support
+
+def evaluate_model_on_all_data(model, likelihood, data_loader, device, n_classes):
+    model.eval()
+    likelihood.eval()
+
+    all_predicted_labels = []
+    all_predicted_probs = []
+    all_test_labels = []
+
+    with torch.no_grad(), gpytorch.settings.fast_pred_var():
+        for i, batch in enumerate(data_loader):
+            test_data = batch['data'].view(batch['data'].size(0), -1).to(device)
+            test_labels = batch['label'].to(device)
+
+            predictions = likelihood(model(test_data)).mean
+            # The class dimension is the last one: argmax over dim=-1 yields one
+            # predicted label per sample (argmax over dim=0 would collapse the
+            # batch and trigger the shape-mismatch error below).
+            predicted_labels = predictions.argmax(dim=-1)
+
+            all_predicted_labels.append(predicted_labels.cpu().numpy())
+            # Accumulate per-batch class probabilities so that AUC-ROC is computed
+            # over the whole dataset rather than only the last batch.
+            all_predicted_probs.append(predictions.softmax(dim=-1).cpu().numpy())
+            # Move labels to the CPU before converting to NumPy (required on GPU runs).
+            all_test_labels.append(test_labels.cpu().numpy())
+
+    # Concatenate all batch results
+    all_predicted_labels = np.concatenate(all_predicted_labels, axis=0)
+    all_predicted_probs = np.concatenate(all_predicted_probs, axis=0)
+    all_test_labels = np.concatenate(all_test_labels, axis=0)
+
+    # Verify if the shapes match before proceeding to calculate metrics
+    if all_predicted_labels.shape[0] != all_test_labels.shape[0]:
+        raise ValueError("Mismatch in the number of samples between predicted and actual labels")
+
+    # Compute overall evaluation metrics
+    precision, recall, f1, _ = precision_recall_fscore_support(all_test_labels, all_predicted_labels, average='macro')
+    # For AUC-ROC, score the accumulated predicted probabilities against the
+    # true labels in a one-hot encoded format
+    test_labels_one_hot = label_binarize(all_test_labels, classes=np.arange(n_classes))
+    auc_roc = roc_auc_score(test_labels_one_hot, all_predicted_probs, multi_class='ovr')
+    return {
+        'precision': precision,
+        'recall': recall,
+        'f1': f1,
+        'auc_roc': auc_roc
+    }
+
+def parse_classification_report(report):
+    """Parse a classification report into a dictionary of metrics.
+
+    NOTE: this relies on sklearn's plain-text report layout and is fragile;
+    it assumes the second-to-last line is the macro-average row, e.g.
+    "macro avg  <precision>  <recall>  <f1>  <support>".
+    """
+    lines = report.split('\n')
+    main_metrics = lines[-2].split()
+
+    return {
+        'precision': float(main_metrics[3]),
+        'recall': float(main_metrics[4]),
+        'f1': float(main_metrics[5]),
+        'auc_roc': None # AUC-ROC is not part of the classification report by default
+    }
diff --git a/HPC/final_project/utils/visualization.py b/HPC/final_project/utils/visualization.py
new file mode 100644
index 0000000..3ecf59b
--- /dev/null
+++ b/HPC/final_project/utils/visualization.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:20:55 2023
+
+@author: lrm22005
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix
+
+def plot_training_performance(train_loss, validation_metrics):
+    epochs = range(1, len(train_loss) + 1)
+
+    # Plot training loss
+    plt.figure(figsize=(14, 6))
+    plt.subplot(1, 2, 1)
+    plt.plot(epochs, train_loss, 'b-', label='Training Loss')
+    plt.title('Training Loss')
+    plt.xlabel('Epochs')
+    plt.ylabel('Loss')
+    plt.legend()
+
+    # Plot validation metrics on their own x-axis: the number of validation
+    # points may differ from the number of training-loss entries.
+    plt.subplot(1, 2, 2)
+    val_epochs = range(1, len(validation_metrics['precision']) + 1)
+    plt.plot(val_epochs, validation_metrics['precision'], 'r-', label='Precision')
+    plt.plot(val_epochs, validation_metrics['recall'], 'g-', label='Recall')
+    plt.plot(val_epochs, validation_metrics['f1'], 'b-', label='F1 Score')
+    plt.plot(val_epochs, validation_metrics['auc_roc'], 'y-', label='AUC-ROC')
+    plt.title('Validation Metrics')
+    plt.xlabel('Epochs')
+    plt.ylabel('Metrics')
+    plt.legend()
+
+    plt.tight_layout()
+    plt.show()
+
+def plot_results(results):
+    plt.figure(figsize=(12, 5))
+    plt.subplot(1, 2, 1)
+    plt.plot(results['train_loss'], label='Train Loss')
+    plt.title('Training Loss Over Time')
+    plt.legend()
+
+    plt.subplot(1, 2, 2)
+    for metric in ['precision', 'recall', 'f1']:
+        plt.plot(results['validation_metrics'][metric], label=metric.title())
+    plt.title('Validation Metrics Over Time')
+    plt.legend()
+    plt.show()
+
+    test_metrics = results['test_metrics']
+    print("Test Metrics:")
+    print(f"Precision: {test_metrics['precision']}")
+    print(f"Recall: {test_metrics['recall']}")
+    print(f"F1 Score: {test_metrics['f1']}")
+    print(f"AUC-ROC: {test_metrics['auc_roc']}")
+
+def plot_comparative_results(gp_vs_kmeans_data, original_labels):
+    fig, axes = plt.subplots(1, 2, figsize=(14, 7))
+
+    # Plot 1: Confusion Matrix for GP Predictions vs Original Labels
+    gp_predictions = [pair[0] for pair in gp_vs_kmeans_data]
+    gp_predictions = np.concatenate(gp_predictions)
+    cm_gp = confusion_matrix(original_labels, gp_predictions)
+    sns.heatmap(cm_gp, annot=True, ax=axes[0], fmt='g')
+    axes[0].set_title('GP Model Predictions vs Original Labels')
+    axes[0].set_xlabel('Predicted Labels')
+    axes[0].set_ylabel('True Labels')
+
+    # Plot 2: Confusion Matrix for K-Means Predictions vs Original Labels
+    kmeans_predictions = [pair[1] for pair in gp_vs_kmeans_data]
+    kmeans_predictions = np.concatenate(kmeans_predictions)
+    cm_kmeans = confusion_matrix(original_labels, kmeans_predictions)
+    sns.heatmap(cm_kmeans, annot=True, ax=axes[1], fmt='g')
+    axes[1].set_title('K-Means Predictions vs Original Labels')
+    axes[1].set_xlabel('Predicted Labels')
+    axes[1].set_ylabel('True Labels')
+
+    plt.tight_layout()
+    plt.show()
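
---
Note: ss_main.py imports stochastic_evaluation from utils.ss_evaluation, but ss_evaluation.py in this patch only defines evaluate_model_on_all_data and parse_classification_report, so that import will fail. Below is a minimal sketch of what such a helper might look like, assuming it mirrors evaluate_model_on_all_data while drawing n_batches random batches in the style of the other stochastic_* helpers. The signature is inferred from the call site in ss_main.py; this is not the author's implementation.

import random
import numpy as np
import torch
import gpytorch
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

def stochastic_evaluation(model, likelihood, data_loader, device, n_classes, n_batches):
    model.eval()
    likelihood.eval()
    all_pred, all_true, all_probs = [], [], []
    # Same batch-sampling style as the stochastic_* helpers in ss_active_learning.py
    sampled_batches = random.sample(list(data_loader), n_batches)
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        for batch in sampled_batches:
            x = batch['data'].view(batch['data'].size(0), -1).to(device)
            y = batch['label']
            probs = likelihood(model(x)).mean  # assumed shape: (batch, n_classes)
            all_pred.append(probs.argmax(dim=-1).cpu().numpy())
            # Mirrors the softmax call in evaluate_model_on_all_data
            all_probs.append(probs.softmax(dim=-1).cpu().numpy())
            all_true.append(y.cpu().numpy())
    all_pred = np.concatenate(all_pred)
    all_true = np.concatenate(all_true)
    all_probs = np.concatenate(all_probs)
    precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_pred, average='macro', zero_division=0)
    true_one_hot = label_binarize(all_true, classes=np.arange(n_classes))
    auc_roc = roc_auc_score(true_one_hot, all_probs, multi_class='ovr')
    return {'precision': precision, 'recall': recall, 'f1': f1, 'auc_roc': auc_roc}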