From c9809a35ebdf94c28fb64aece25a7c6614782e89 Mon Sep 17 00:00:00 2001
From: Luis Roberto Mercado Diaz
Date: Mon, 22 Jan 2024 14:49:34 -0500
Subject: [PATCH] HPC FOLDER

Co-Authored-By: Dong Han
---
 .../active_learning/ss_active_learning.py     |  86 +++++
 HPC/final_project/environment.yml             | 329 ++++++++++++++++++
 ...l_attemp_4_1_Dong_Ohm_summary_20231025.csv | 110 ++++++
 HPC/final_project/gp_al_ss_full.sh            |  34 ++
 HPC/final_project/models/ss_gp_model.py       | 119 +++++++
 HPC/final_project/ss_main.py                  |  79 +++++
 HPC/final_project/utils/data_loader.py        | 222 ++++++++++++
 ...l_attemp_4_1_Dong_Ohm_summary_20231025.csv | 110 ++++++
 HPC/final_project/utils/ss_evaluation.py      |  77 ++++
 HPC/final_project/utils/visualization.py      |  81 +++++
 10 files changed, 1247 insertions(+)
 create mode 100644 HPC/final_project/active_learning/ss_active_learning.py
 create mode 100644 HPC/final_project/environment.yml
 create mode 100644 HPC/final_project/final_attemp_4_1_Dong_Ohm_summary_20231025.csv
 create mode 100644 HPC/final_project/gp_al_ss_full.sh
 create mode 100644 HPC/final_project/models/ss_gp_model.py
 create mode 100644 HPC/final_project/ss_main.py
 create mode 100644 HPC/final_project/utils/data_loader.py
 create mode 100644 HPC/final_project/utils/final_attemp_4_1_Dong_Ohm_summary_20231025.csv
 create mode 100644 HPC/final_project/utils/ss_evaluation.py
 create mode 100644 HPC/final_project/utils/visualization.py

diff --git a/HPC/final_project/active_learning/ss_active_learning.py b/HPC/final_project/active_learning/ss_active_learning.py
new file mode 100644
index 0000000..4d15f68
--- /dev/null
+++ b/HPC/final_project/active_learning/ss_active_learning.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:23:23 2023
+
+@author: lrm22005
+"""
+import numpy as np
+import random
+import torch
+from torch.utils.data import DataLoader
+from sklearn.cluster import MiniBatchKMeans
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def label_samples(uncertain_samples, validation_data):
+    labels = [validation_data[sample_id]['label'] for sample_id in uncertain_samples]
+    return uncertain_samples, labels
+
+def stochastic_uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_batches, n_components=2):
+    gp_model.eval()
+    gp_likelihood.eval()
+    uncertain_sample_indices = []
+    # Randomly sample n_batches from val_loader. enumerate() keeps each batch's
+    # position so that batch-local indices can be mapped back to dataset-global
+    # indices below (this assumes val_loader does not shuffle).
+    sampled_batches = random.sample(list(enumerate(val_loader)), n_batches)
+
+    with torch.no_grad():
+        for batch_idx, batch in sampled_batches:
+            # reduced_data = apply_tsne(batch['data'].reshape(batch['data'].size(0), -1), n_components=n_components)
+            # reduced_data_tensor = torch.Tensor(reduced_data).to(device)
+            reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
+            predictions = gp_likelihood(gp_model(reduced_data_tensor))
+            var = predictions.variance
+            # Collapse any per-task variances into one uncertainty score per sample
+            # (assumes the leading dimension of `var` is the batch dimension).
+            sample_uncertainty = var.reshape(var.size(0), -1).sum(dim=1)
+            top_indices = torch.argsort(-sample_uncertainty)[:n_samples]
+            # Map batch-local indices to dataset-global indices, as in the
+            # reference implementation commented out below.
+            uncertain_sample_indices.extend(batch_idx * val_loader.batch_size + int(idx) for idx in top_indices.cpu())
+
+    return uncertain_sample_indices[:n_samples]
+
+# def uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_components=2):
+#     gp_model.eval()
+#     gp_likelihood.eval()
+#     uncertain_sample_indices = []
+#     with torch.no_grad():
+#         for batch_idx, batch in tqdm(enumerate(val_loader), desc='Uncertainty Sampling', unit='batch'):
+#             reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
+#             predictions = gp_likelihood(gp_model(reduced_data_tensor))
+#             var = predictions.variance
+#             top_indices = torch.argsort(-var.flatten())[:n_samples]
+#             batch_uncertain_indices = [batch_idx * val_loader.batch_size + idx for idx in top_indices]
+#             uncertain_sample_indices.extend(batch_uncertain_indices)
+#     return uncertain_sample_indices[:n_samples]
+
+def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100):
+    # Initialize MiniBatchKMeans
+    minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=batch_size)
+
+    # Iterate through data_loader and fit MiniBatchKMeans incrementally.
+    # scikit-learn operates on CPU NumPy arrays, so the data is kept on the CPU
+    # (the original round trip through `device` and back was a no-op).
+    for batch in data_loader:
+        data = batch['data'].view(batch['data'].size(0), -1).cpu().numpy()
+        minibatch_kmeans.partial_fit(data)
+
+    return minibatch_kmeans
+
+# def compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, device):
+#     # Compare K-Means with GP model predictions
+#     all_data, all_labels = [], []
+#     for batch in data_loader:
+#         data = batch['data'].view(batch['data'].size(0), -1).to(device)
+#         labels = batch['label'].to(device)
+#         gp_predictions = gp_model(data).mean.argmax(dim=0).cpu().numpy()
+#         kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
+#         all_labels.append(labels.cpu().numpy())
+#         all_data.append((gp_predictions, kmeans_predictions))
+#     return all_data, np.concatenate(all_labels)
+
+def stochastic_compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, n_batches, device):
+    all_data, all_labels = [], []
+    sampled_batches = random.sample(list(data_loader), n_batches) # Randomly sample n_batches from data_loader
+
+    for batch in sampled_batches:
+        data = batch['data'].view(batch['data'].size(0), -1).to(device)
+        labels = batch['label'].to(device)
+        # The multitask GP mean has shape (batch, num_tasks); argmax over the last
+        # dimension yields one predicted class per sample (dim=0 would collapse the batch).
+        gp_predictions = gp_model(data).mean.argmax(dim=-1).cpu().numpy()
+        kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
+        all_labels.append(labels.cpu().numpy())
+        all_data.append((gp_predictions, kmeans_predictions))
+
+    return all_data, np.concatenate(all_labels)
\ No newline at end of file
diff --git a/HPC/final_project/environment.yml b/HPC/final_project/environment.yml
new file mode 100644
index 0000000..fa488fe
--- /dev/null
+++ b/HPC/final_project/environment.yml
@@ -0,0 +1,329 @@
+name: pytorch
+channels:
+  - pytorch
+  - pyg
+  - nvidia
+  - conda-forge
+  - anaconda
+  - defaults
+dependencies:
+  - alabaster=0.7.12=pyhd3eb1b0_0
+  - anyio=3.5.0=py311haa95532_0
+  - argon2-cffi=21.3.0=pyhd3eb1b0_0
+  - argon2-cffi-bindings=21.2.0=py311h2bbff1b_0
+  - arrow=1.2.3=py311haa95532_1
+  - astroid=2.14.2=py311haa95532_0
+  - asttokens=2.0.5=pyhd3eb1b0_0
+  - async-lru=2.0.4=py311haa95532_0
+  - atomicwrites=1.4.0=py_0
+  - attrs=23.1.0=py311haa95532_0
+  - autopep8=1.6.0=pyhd3eb1b0_1
+  - babel=2.11.0=py311haa95532_0
+  - backcall=0.2.0=pyhd3eb1b0_0
+  - bcrypt=3.2.0=py311h2bbff1b_1
+  - beautifulsoup4=4.12.2=py311haa95532_0
+  - binaryornot=0.4.4=pyhd3eb1b0_1
+  - black=23.11.0=py311haa95532_0
+  - blas=1.0=mkl
+  - bleach=4.1.0=pyhd3eb1b0_0
+  - bottleneck=1.3.5=py311h5bb9823_0
+  - brotli=1.0.9=ha925a31_2
+  - brotli-python=1.0.9=py311hd77b12b_7
+  - bzip2=1.0.8=he774522_0
+  - ca-certificates=2023.11.17=h56e8100_0
+  - certifi=2023.11.17=pyhd8ed1ab_0
+  - cffi=1.16.0=py311h2bbff1b_0
+  - chardet=4.0.0=py311haa95532_1003
+  - charset-normalizer=2.0.4=pyhd3eb1b0_0
+  - click=8.1.7=py311haa95532_0
+  - cloudpickle=2.2.1=py311haa95532_0
+  - colorama=0.4.6=py311haa95532_0
+  - console_shortcut=0.1.1=4
+  - contourpy=1.2.0=py311h59b6b97_0
+  - cookiecutter=2.5.0=py311haa95532_0
+  - cryptography=41.0.7=py311h89fc84f_0
+  - cuda-cccl=12.3.101=0
+  - cuda-cudart=12.1.105=0
+  - cuda-cudart-dev=12.1.105=0
+  - cuda-cupti=12.1.105=0
+  - 
cuda-libraries=12.1.0=0 + - cuda-libraries-dev=12.1.0=0 + - cuda-nvrtc=12.1.105=0 + - cuda-nvrtc-dev=12.1.105=0 + - cuda-nvtx=12.1.105=0 + - cuda-opencl=12.3.101=0 + - cuda-opencl-dev=12.3.101=0 + - cuda-profiler-api=12.3.101=0 + - cuda-runtime=12.1.0=0 + - cycler=0.12.1=pyhd8ed1ab_0 + - daal4py=2023.1.1=py311h30df693_0 + - dal=2023.1.1=h59b6b97_48682 + - debugpy=1.6.7=py311hd77b12b_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - defusedxml=0.7.1=pyhd3eb1b0_0 + - diff-match-patch=20200713=pyhd3eb1b0_0 + - dill=0.3.7=py311haa95532_0 + - docstring-to-markdown=0.11=py311haa95532_0 + - docutils=0.18.1=py311haa95532_3 + - executing=0.8.3=pyhd3eb1b0_0 + - filelock=3.13.1=py311haa95532_0 + - flake8=6.0.0=py311haa95532_0 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.12.1=ha860e81_0 + - fsspec=2023.12.2=pyhca7485f_0 + - giflib=5.2.1=h8cc25b3_3 + - gmpy2=2.1.2=py311h7f96b67_0 + - gpytorch=1.11=pyhd8ed1ab_0 + - icc_rt=2022.1.0=h6049295_2 + - icu=73.1=h6c2663c_0 + - idna=3.4=py311haa95532_0 + - imagesize=1.4.1=py311haa95532_0 + - imbalanced-learn=0.11.0=pyhd8ed1ab_0 + - importlib-metadata=6.0.0=py311haa95532_0 + - importlib_metadata=6.0.0=hd3eb1b0_0 + - inflection=0.5.1=py311haa95532_0 + - intel-openmp=2023.1.0=h59b6b97_46320 + - intervaltree=3.1.0=pyhd3eb1b0_0 + - ipykernel=6.25.0=py311h746a85d_0 + - ipython=8.15.0=py311haa95532_0 + - ipython_genutils=0.2.0=pyhd3eb1b0_1 + - isort=5.9.3=pyhd3eb1b0_0 + - jaraco.classes=3.2.1=pyhd3eb1b0_0 + - jaxtyping=0.2.25=pyhd8ed1ab_0 + - jedi=0.18.1=py311haa95532_1 + - jellyfish=1.0.1=py311h36a85e1_0 + - jinja2=3.1.2=py311haa95532_0 + - joblib=1.2.0=py311haa95532_0 + - jpeg=9e=h2bbff1b_1 + - json5=0.9.6=pyhd3eb1b0_0 + - jsonschema=4.19.2=py311haa95532_0 + - jsonschema-specifications=2023.7.1=py311haa95532_0 + - jupyter-lsp=2.2.0=py311haa95532_0 + - jupyter_client=8.6.0=py311haa95532_0 + - jupyter_core=5.5.0=py311haa95532_0 + - jupyter_events=0.8.0=py311haa95532_0 + - jupyter_server=2.10.0=py311haa95532_0 + - jupyter_server_terminals=0.4.4=py311haa95532_1 + - jupyterlab=4.0.8=py311haa95532_0 + - jupyterlab_pygments=0.1.2=py_0 + - jupyterlab_server=2.25.1=py311haa95532_0 + - jupyterlab_widgets=3.0.9=py311haa95532_0 + - keyring=23.13.1=py311haa95532_0 + - kiwisolver=1.4.4=py311hd77b12b_0 + - krb5=1.20.1=h5b6d351_0 + - lazy-object-proxy=1.6.0=py311h2bbff1b_0 + - lerc=3.0=hd77b12b_0 + - libclang=14.0.6=default_hb5a9fac_1 + - libclang13=14.0.6=default_h8e68704_1 + - libcublas=12.1.0.26=0 + - libcublas-dev=12.1.0.26=0 + - libcufft=11.0.2.4=0 + - libcufft-dev=11.0.2.4=0 + - libcurand=10.3.4.101=0 + - libcurand-dev=10.3.4.101=0 + - libcusolver=11.4.4.55=0 + - libcusolver-dev=11.4.4.55=0 + - libcusparse=12.0.2.55=0 + - libcusparse-dev=12.0.2.55=0 + - libdeflate=1.17=h2bbff1b_1 + - libffi=3.4.4=hd77b12b_0 + - libjpeg-turbo=2.0.0=h196d8e1_0 + - libnpp=12.0.2.50=0 + - libnpp-dev=12.0.2.50=0 + - libnvjitlink=12.1.105=0 + - libnvjitlink-dev=12.1.105=0 + - libnvjpeg=12.1.1.14=0 + - libnvjpeg-dev=12.1.1.14=0 + - libpng=1.6.39=h8cc25b3_0 + - libpq=12.15=h906ac69_1 + - libsodium=1.0.18=h62dcd97_0 + - libspatialindex=1.9.3=h6c2663c_0 + - libtiff=4.5.1=hd77b12b_0 + - libuv=1.44.2=h2bbff1b_0 + - libwebp=1.3.2=hbc33d0d_0 + - libwebp-base=1.3.2=h2bbff1b_0 + - lightning-utilities=0.10.0=pyhd8ed1ab_0 + - linear_operator=0.5.2=pyhd8ed1ab_0 + - lz4-c=1.9.4=h2bbff1b_0 + - markdown-it-py=2.2.0=py311haa95532_1 + - markupsafe=2.1.1=py311h2bbff1b_0 + - matplotlib=3.8.0=py311haa95532_0 + - matplotlib-base=3.8.0=py311hf62ec03_0 + - matplotlib-inline=0.1.6=py311haa95532_0 + - mccabe=0.7.0=pyhd3eb1b0_0 + - 
mdurl=0.1.0=py311haa95532_0 + - mistune=2.0.4=py311haa95532_0 + - mkl=2023.1.0=h6b88ed4_46358 + - mkl-service=2.4.0=py311h2bbff1b_1 + - mkl_fft=1.3.8=py311h2bbff1b_0 + - mkl_random=1.2.4=py311h59b6b97_0 + - more-itertools=10.1.0=py311haa95532_0 + - mpc=1.1.0=h7edee0f_1 + - mpfr=4.0.2=h62dcd97_1 + - mpir=3.0.0=hec2e145_1 + - mpmath=1.3.0=py311haa95532_0 + - munkres=1.1.4=pyh9f0ad1d_0 + - mypy_extensions=1.0.0=py311haa95532_0 + - nbclient=0.8.0=py311haa95532_0 + - nbconvert=7.10.0=py311haa95532_0 + - nbformat=5.9.2=py311haa95532_0 + - nest-asyncio=1.5.6=py311haa95532_0 + - networkx=3.2.1=pyhd8ed1ab_0 + - notebook=7.0.6=py311haa95532_0 + - notebook-shim=0.2.3=py311haa95532_0 + - numexpr=2.8.7=py311h1fcbade_0 + - numpy=1.26.2=py311hdab7c0b_0 + - numpy-base=1.26.2=py311hd01c5d8_0 + - numpydoc=1.5.0=py311haa95532_0 + - openjpeg=2.4.0=h4fc8c34_0 + - openssl=3.0.12=h2bbff1b_0 + - overrides=7.4.0=py311haa95532_0 + - packaging=23.1=py311haa95532_0 + - pandas=2.1.4=py311hf62ec03_0 + - pandas-profiling=1.4.1=0 + - pandocfilters=1.5.0=pyhd3eb1b0_0 + - paramiko=2.8.1=pyhd3eb1b0_0 + - parso=0.8.3=pyhd3eb1b0_0 + - pathspec=0.10.3=py311haa95532_0 + - patsy=0.5.3=py311haa95532_0 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=10.0.1=py311h045eedc_0 + - pip=23.3.1=py311haa95532_0 + - platformdirs=3.10.0=py311haa95532_0 + - pluggy=1.0.0=py311haa95532_1 + - ply=3.11=py311haa95532_0 + - prometheus_client=0.14.1=py311haa95532_0 + - prompt-toolkit=3.0.36=py311haa95532_0 + - psutil=5.9.0=py311h2bbff1b_0 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pycodestyle=2.10.0=py311haa95532_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pydocstyle=6.3.0=py311haa95532_0 + - pyflakes=3.0.1=py311haa95532_0 + - pyg=2.4.0=py311_torch_2.1.0_cu121 + - pygments=2.15.1=py311haa95532_1 + - pylint=2.16.2=py311haa95532_0 + - pylint-venv=2.3.0=py311haa95532_0 + - pyls-spyder=0.4.0=pyhd3eb1b0_0 + - pynacl=1.5.0=py311h8cc25b3_0 + - pyopenssl=23.2.0=py311haa95532_0 + - pyparsing=3.0.9=py311haa95532_0 + - pyqt=5.15.10=py311hd77b12b_0 + - pyqt5-sip=12.13.0=py311h2bbff1b_0 + - pyqtwebengine=5.15.10=py311hd77b12b_0 + - pysocks=1.7.1=py311haa95532_0 + - python=3.11.5=he1021f5_0 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python-fastjsonschema=2.16.2=py311haa95532_0 + - python-json-logger=2.0.7=py311haa95532_0 + - python-lsp-black=1.2.1=py311haa95532_0 + - python-lsp-jsonrpc=1.0.0=pyhd3eb1b0_0 + - python-lsp-server=1.7.2=py311haa95532_0 + - python-slugify=5.0.2=pyhd3eb1b0_0 + - python-tzdata=2023.3=pyhd3eb1b0_0 + - pytoolconfig=1.2.6=py311haa95532_0 + - pytorch=2.1.2=py3.11_cuda12.1_cudnn8_0 + - pytorch-cuda=12.1=hde6ce7c_5 + - pytorch-lightning=2.1.2=pyhd8ed1ab_0 + - pytorch-mutex=1.0=cuda + - pytz=2023.3.post1=py311haa95532_0 + - pywin32=305=py311h2bbff1b_0 + - pywin32-ctypes=0.2.0=py311haa95532_1000 + - pywinpty=2.0.10=py311h5da7b33_0 + - pyyaml=6.0.1=py311h2bbff1b_0 + - pyzmq=25.1.0=py311hd77b12b_0 + - qdarkstyle=3.0.2=pyhd3eb1b0_0 + - qstylizer=0.2.2=py311haa95532_0 + - qt-main=5.15.2=h19c9488_10 + - qt-webengine=5.15.9=h5bd16bc_7 + - qtawesome=1.2.2=py311haa95532_0 + - qtconsole=5.4.2=py311haa95532_0 + - qtpy=2.4.1=py311haa95532_0 + - referencing=0.30.2=py311haa95532_0 + - requests=2.31.0=py311haa95532_0 + - rfc3339-validator=0.1.4=py311haa95532_0 + - rfc3986-validator=0.1.1=py311haa95532_0 + - rich=13.3.5=py311haa95532_0 + - rope=1.7.0=py311haa95532_0 + - rpds-py=0.10.6=py311h062c2fa_0 + - rtree=1.0.1=py311h2eaa2aa_0 + - scikit-learn=1.2.2=py311hd77b12b_1 + - 
scikit-learn-intelex=2023.1.1=py311haa95532_0 + - scipy=1.11.4=py311hc1ccb85_0 + - seaborn=0.12.2=py311haa95532_0 + - send2trash=1.8.2=py311haa95532_0 + - setuptools=68.2.2=py311haa95532_0 + - sip=6.7.12=py311hd77b12b_0 + - six=1.16.0=pyhd3eb1b0_1 + - sniffio=1.2.0=py311haa95532_1 + - snowballstemmer=2.2.0=pyhd3eb1b0_0 + - sortedcontainers=2.4.0=pyhd3eb1b0_0 + - soupsieve=2.5=py311haa95532_0 + - sphinx=5.0.2=py311haa95532_0 + - sphinxcontrib-applehelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-devhelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-htmlhelp=2.0.0=pyhd3eb1b0_0 + - sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0 + - sphinxcontrib-qthelp=1.0.3=pyhd3eb1b0_0 + - sphinxcontrib-serializinghtml=1.1.5=pyhd3eb1b0_0 + - spyder=5.4.3=py311haa95532_1 + - spyder-kernels=2.4.4=py311haa95532_0 + - sqlite=3.41.2=h2bbff1b_0 + - stack_data=0.2.0=pyhd3eb1b0_0 + - statsmodels=0.14.0=py311hd7041d2_0 + - sympy=1.12=py311haa95532_0 + - tbb=2021.8.0=h59b6b97_0 + - terminado=0.17.1=py311haa95532_0 + - text-unidecode=1.3=pyhd3eb1b0_0 + - textdistance=4.2.1=pyhd3eb1b0_0 + - threadpoolctl=2.2.0=pyh0d69192_0 + - three-merge=0.1.1=pyhd3eb1b0_0 + - tinycss2=1.2.1=py311haa95532_0 + - tk=8.6.12=h2bbff1b_0 + - toml=0.10.2=pyhd3eb1b0_0 + - tomlkit=0.11.1=py311haa95532_0 + - torchmetrics=1.2.1=pyhd8ed1ab_0 + - tornado=6.3.3=py311h2bbff1b_0 + - tqdm=4.66.1=pyhd8ed1ab_0 + - traitlets=5.7.1=py311haa95532_0 + - typeguard=2.13.3=pyhd8ed1ab_0 + - typing-extensions=4.7.1=py311haa95532_0 + - typing_extensions=4.7.1=py311haa95532_0 + - tzdata=2023c=h04d1e81_0 + - ujson=5.4.0=py311hd77b12b_0 + - unidecode=1.2.0=pyhd3eb1b0_0 + - urllib3=1.26.18=py311haa95532_0 + - vc=14.2=h21ff451_1 + - vs2015_runtime=14.27.29016=h5e58377_2 + - watchdog=2.1.6=py311haa95532_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - webencodings=0.5.1=py311haa95532_1 + - websocket-client=0.58.0=py311haa95532_4 + - whatthepatch=1.0.2=py311haa95532_0 + - wheel=0.41.2=py311haa95532_0 + - widgetsnbextension=4.0.9=pyhd8ed1ab_0 + - win_inet_pton=1.1.0=py311haa95532_0 + - winpty=0.4.3=4 + - wrapt=1.14.1=py311h2bbff1b_0 + - xz=5.4.5=h8cc25b3_0 + - yaml=0.2.5=he774522_0 + - yapf=0.31.0=pyhd3eb1b0_0 + - zeromq=4.3.4=hd77b12b_0 + - zipp=3.11.0=py311haa95532_0 + - zlib=1.2.13=h8cc25b3_0 + - zstd=1.5.5=hd43e919_0 + - pip: + - comm==0.2.0 + - fqdn==1.5.1 + - ipywidgets==8.1.1 + - isoduration==20.11.0 + - jsonpointer==2.4 + - jupyter==1.0.0 + - jupyter-console==6.6.3 + - torchaudio==2.1.2 + - torchvision==0.16.2 + - uri-template==1.3.0 + - webcolors==1.13 +prefix: C:\Users\lrm22005\AppData\Local\anaconda\envs\pytorch diff --git a/HPC/final_project/final_attemp_4_1_Dong_Ohm_summary_20231025.csv b/HPC/final_project/final_attemp_4_1_Dong_Ohm_summary_20231025.csv new file mode 100644 index 0000000..ab8e70b --- /dev/null +++ b/HPC/final_project/final_attemp_4_1_Dong_Ohm_summary_20231025.csv @@ -0,0 +1,110 @@ +UID,NSR,AF,PACPVC,SVT,Noisy +002,102,0,21,0,0 +003,3193,0,0,1,6865 +005,642,0,587,1,2208 +007,101,0,21,1,864 +011,97,0,2,0,468 +012,2698,0,9,1,10297 +013,7254,0,59,1,8966 +017,0,170,0,0,0 +020,2589,0,26,2,1556 +021,511,0,1,1,322 +022,4122,0,5,1,1110 +024,7116,0,1,0,0 +026,168,0,1,2,0 +027,9366,0,0,0,0 +028,3442,0,403,7,12994 +029,5088,0,1541,12,12987 +030,2386,0,1108,2,15570 +034,0,0,0,6,0 +035,472,0,1108,2,1255 +036,4882,0,1108,2,12845 +037,1050,0,1108,2,6350 +038,272,0,228,12,0 +039,3566,0,2,1,5154 +041,1253,0,0,1,3471 +042,125,0,1,0,415 +044,738,0,1,0,3819 +045,2905,0,155,1,4323 +047,1366,0,106,1,3585 +049,1529,0,17,1,4799 +050,274,0,1,0,6303 +052,1567,0,3,1,2149 +053,3504,0,36,3,7169 
+054,192,0,635,0,0 +055,82,0,34,3,270 +056,62,0,34,3,310 +057,5079,0,34,3,0 +058,1307,0,34,3,1936 +062,2766,0,0,0,4723 +063,1379,0,34,3,1787 +064,3579,0,58,1,3 +068,3134,0,34,0,3634 +069,362,0,66,13,3824 +070,791,0,4,15,1143 +073,1986,0,116,2,4916 +074,87,0,23,1,619 +075,196,13,2110,47,1 +077,2802,0,23,1,4604 +078,5325,0,9,1,0 +080,6305,0,284,5,6673 +082,0,0,0,3,0 +083,78,0,1,0,790 +084,3781,0,0,0,545 +086,10,0,67,1,0 +087,0,0,11,0,0 +088,1644,0,0,0,3 +089,0,0,4,0,0 +090,1979,0,1,0,795 +091,1253,0,4,0,4746 +093,0,0,800,1,0 +094,109,0,0,0,612 +098,82,0,0,0,6166 +099,3373,0,1,1,3749 +100,87,0,1485,193,0 +101,477,0,0,0,4602 +104,0,0,12,1,0 +106,531,0,1,0,2480 +108,0,0,0,0,0 +109,615,0,2,0,886 +110,12,1,261,1,0 +111,109,0,0,0,215 +112,5357,0,642,61,4 +113,0,0,4668,1,0 +118,3544,0,259,2,5056 +119,16,0,0,0,1877 +120,15,0,19,0,12615 +301,0,17,0,0,0 +302,0,2,0,0,0 +305,0,10,0,0,0 +306,0,2,0,0,0 +307,0,39,0,0,0 +310,0,6,0,0,0 +311,0,19,0,0,0 +312,0,3,0,0,0 +318,0,8,0,0,0 +319,0,13,0,0,0 +320,0,8,0,0,0 +321,0,11,0,0,0 +322,0,7,0,0,0 +324,0,6,0,0,0 +325,14,0,0,0,0 +327,2,0,6,0,0 +329,0,15,0,0,0 +400,0,600,0,0,0 +402,0,2213,0,0,0 +405,0,406,0,0,0 +406,0,1902,0,0,0 +407,0,166,0,0,0 +408,866,50,29,0,0 +409,0,584,0,0,0 +410,0,3525,0,0,0 +413,0,4804,0,0,0 +414,0,2004,0,0,0 +415,0,1955,0,0,0 +416,0,762,0,0,0 +419,1383,1491,27,1,0 +420,0,165,0,0,0 +421,0,2016,0,0,0 +422,0,103,0,0,0 +423,0,1459,0,0,0 diff --git a/HPC/final_project/gp_al_ss_full.sh b/HPC/final_project/gp_al_ss_full.sh new file mode 100644 index 0000000..1338f92 --- /dev/null +++ b/HPC/final_project/gp_al_ss_full.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --partition=general-gpu +#SBATCH --ntasks=1 +#SBATCH --mem=32GB +#SBATCH --nodes=1 +#SBATCH --time=01:00:00 +#SBATCH --mail-type=END +#SBATCH --mail-user=luis.mercado_diaz@uconn.edu +#SBATCH --output=sbatch_output_ss_main.txt +#SBATCH --error=sbatch_error_ss_main.txt +#SBATCH --gres=gpu:1 + +# Load necessary modules +module purge # unload all modules from user's environment +module load slurm cuda/11.6 cudnn/8.6.0 + +# Activate Miniconda +source /path/to/miniconda3/etc/profile.d/conda.sh + +# If the environment doesn't exist, create it from the YAML file +if ! 
conda info --envs | grep -q "^pytorch "; then
+    conda env create -f environment.yml
+fi
+
+# Activate your environment
+conda activate pytorch
+
+# Navigate to your project directory (change to your actual directory)
+cd /home/lrm22005/ML_Notebooks/Arrhytmia_GP/final_project/
+
+# Execute your Python script
+python3 ss_main.py
+
+#exit
\ No newline at end of file
diff --git a/HPC/final_project/models/ss_gp_model.py b/HPC/final_project/models/ss_gp_model.py
new file mode 100644
index 0000000..1a1bb90
--- /dev/null
+++ b/HPC/final_project/models/ss_gp_model.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:01:41 2023
+
+@author: lrm22005
+"""
+from tqdm import tqdm
+import torch
+import gpytorch
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # used by train_gp_model below
+
+num_latents = 6 # This should match the complexity of your data or the number of tasks
+num_tasks = 4 # This should match the number of output classes or tasks
+num_inducing_points = 50 # This is independent and should be sufficient for the input space
+
+class MultitaskGPModel(gpytorch.models.ApproximateGP):
+    def __init__(self):
+        # Let's use a different set of inducing points for each latent function
+        inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images
+
+        # We have to mark the CholeskyVariationalDistribution as batch
+        # so that we learn a variational distribution for each task
+        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
+            inducing_points.size(-2), batch_shape=torch.Size([num_latents])
+        )
+
+        # We have to wrap the VariationalStrategy in a LMCVariationalStrategy
+        # so that the output will be a MultitaskMultivariateNormal rather than a batch output
+        variational_strategy = gpytorch.variational.LMCVariationalStrategy(
+            gpytorch.variational.VariationalStrategy(
+                self, inducing_points, variational_distribution, learn_inducing_locations=True
+            ),
+            num_tasks=num_tasks,
+            num_latents=num_latents,
+            latent_dim=-1
+        )
+
+        super().__init__(variational_strategy)
+
+        # The mean and covariance modules should be marked as batch
+        # so we learn a different set of hyperparameters
+        self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents]))
+        self.covar_module = gpytorch.kernels.ScaleKernel(
+            gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])),
+            batch_shape=torch.Size([num_latents])
+        )
+
+    def forward(self, x):
+        # The forward function should be written as if we were dealing with each output
+        # dimension in batch.
+        # x must already be flattened to shape [N, 128*128] so that its last
+        # dimension matches the inducing points; N can be any batch size.
+        # print(f"Input shape: {x.shape}")
+        # x = x.view(x.size(0), -1) # Flattening the images
+        # print(f"Input shape after flattening: {x.shape}") # Debugging input shape
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+
+        # Debugging: Print shapes of intermediate outputs
+        # print(f"Mean shape: {mean_x.shape}, Covariance shape: {covar_x.shape}")
+        latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+        # print(f"Latent prediction shape: {latent_pred.mean.shape}, {latent_pred.covariance_matrix.shape}")
+
+        return latent_pred
+
+
+def train_gp_model(train_x, train_y, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'):
+    model = MultitaskGPModel().to(device)
+    # num_features must match the GP's num_tasks; n_classes is the number of output classes
+    likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=num_tasks, num_classes=n_classes).to(device)
+    model.train()
+    likelihood.train()
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
+    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
+
+    best_val_loss = float('inf')
+    epochs_no_improve = 0
+
+    for i in tqdm(range(num_iterations), desc='Training', unit='iter', leave=False):
+        optimizer.zero_grad()
+        output = model(train_x)
+        loss = -mll(output, train_y)
+        scalar_loss = loss.sum() if loss.numel() > 1 else loss
+        scalar_loss.backward()
+        optimizer.step()
+
+        # Validation step
+        model.eval()
+        likelihood.eval()
+        with torch.no_grad():
+            val_loss = 0.0
+            for val_batch in val_loader:
+                val_x, val_y = val_batch['data'].view(val_batch['data'].size(0), -1).to(device), val_batch['label'].to(device)
+                val_output = model(val_x)
+                val_loss += -mll(val_output, val_y).item()
+            val_loss /= len(val_loader)
+
+        model.train()
+        likelihood.train()
+
+        # Early stopping and checkpointing based on validation loss
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+            epochs_no_improve = 0
+            torch.save({'model_state_dict': model.state_dict(),
+                        'likelihood_state_dict': likelihood.state_dict(),
+                        'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
+        else:
+            epochs_no_improve += 1
+            if epochs_no_improve == patience:
+                print(f"Early stopping triggered at iteration {i+1}")
+                break
+
+    # Load the best model before return
+    checkpoint = torch.load(checkpoint_path)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    likelihood.load_state_dict(checkpoint['likelihood_state_dict'])
+    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+
+    return model, likelihood
\ No newline at end of file
diff --git a/HPC/final_project/ss_main.py b/HPC/final_project/ss_main.py
new file mode 100644
index 0000000..706ea63
--- /dev/null
+++ b/HPC/final_project/ss_main.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:47:27 2023
+
+@author: lrm22005
+"""
+from tqdm import tqdm
+import torch
+# update_train_loader_with_uncertain_samples is defined in utils.data_loader
+from utils.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples
+from models.ss_gp_model import MultitaskGPModel, train_gp_model
+from utils.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data
+from active_learning.ss_active_learning import stochastic_uncertainty_sampling, run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions
+from utils.visualization import plot_comparative_results, plot_training_performance, plot_results
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def main():
+    # Set parameters like n_classes, batch_size, etc.
+    n_classes = 4
+    batch_size = 512
+    clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled = split_uids()
+    # Preprocess data
+    train_loader, val_loader, test_loader = preprocess_data('pt', clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size)
+
+    kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device)
+
+    # Initialize result storage. Note that train_loss stays empty unless
+    # train_gp_model is extended to return its per-iteration losses.
+    results = {
+        'train_loss': [],
+        'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []},
+        'test_metrics': None # This will be filled in with the final test metrics
+    }
+
+    # Initial model training. Each call to train_gp_model fits a fresh model,
+    # so the model kept after this loop is the one trained on the last batch.
+    for train_batch in train_loader:
+        train_x = train_batch['data'].view(train_batch['data'].size(0), -1).to(device)
+        train_y = train_batch['label'].to(device)
+        model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=n_classes)
+
+    active_learning_iterations = 10
+    n_samples = batch_size # Number of uncertain samples to accumulate
+    for iteration in tqdm(range(active_learning_iterations), desc='Active Learning', unit='iteration', leave=True):
+        uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples, n_batches=5, n_components=2)
+
+        # Accumulate indices of uncertain samples
+        accumulated_indices = list(uncertain_sample_indices)
+
+        # Update the training loader with indices of uncertain samples
+        train_loader = update_train_loader_with_uncertain_samples(train_loader, accumulated_indices, batch_size)
+
+        # Re-train the model with the updated train_loader
+        for train_batch in tqdm(train_loader, desc='Batch Training', leave=False):
+            train_x = train_batch['data'].view(train_batch['data'].size(0), -1).to(device) # Flatten the image
+            train_y = train_batch['label'].to(device)
+            model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=n_classes)
+        val_metrics = stochastic_evaluation(model, likelihood, val_loader, device, n_classes, n_batches=5)
+        for metric in ['precision', 'recall', 'f1', 'auc_roc']:
+            results['validation_metrics'][metric].append(val_metrics[metric])
+
+    # Compare K-Means with GP model predictions after retraining
+    gp_vs_kmeans_data, original_labels = stochastic_compare_kmeans_gp_predictions(kmeans_model, model, train_loader, n_batches=5, device=device)
+
+    plot_comparative_results(gp_vs_kmeans_data, original_labels)
+
+    plot_training_performance(results['train_loss'], results['validation_metrics'])
+
+    # Final evaluation on test set
+    classification_result = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes=n_classes)
+    # Store test metrics
+    results['test_metrics'] = classification_result
+    # Now the results dictionary is ready to be used for plotting
+    plot_results(results)
+    # The final test metrics can also be printed or logged
+    print("Final Test Metrics:", results['test_metrics'])
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/HPC/final_project/utils/data_loader.py b/HPC/final_project/utils/data_loader.py
new file mode 100644
index 0000000..8930e6f
--- /dev/null
+++ b/HPC/final_project/utils/data_loader.py
@@ -0,0 +1,222 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:09:02 2023
+
+@author: lrm22005
+"""
+import os
+import pandas as pd
+import torch
+from torch.utils.data import Dataset, DataLoader
+from sklearn.preprocessing import StandardScaler
+
+def split_uids():
+    # ====== Load the per subject arrhythmia summary ======
+    df_summary = pd.read_csv(r'final_attemp_4_1_Dong_Ohm_summary_20231025.csv')
+    df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3)
+
+    df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT']
+    df_summary['sample_AF'] = df_summary['AF']
+
+    df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF'])
+
+    all_UIDs = df_summary['UID'].unique()
+    # ====================================================
+    # ====== AF trial separation ======
+    # R:\ENGR_Chon\Dong\Numbers\Pulsewatch_numbers\Fahimeh_CNNED_general_ExpertSystemwApplication\tbl_file_name\TrainingSet_final_segments
+    AF_trial_Fahimeh_train = ['402','410']
+    AF_trial_Fahimeh_test = ['301', '302', '305', '306', '307', '310', '311',
+                             '312', '318', '319', '320', '321', '322', '324',
+                             '325', '327', '329', '400', '406', '407', '409',
+                             '414']
+    AF_trial_Fahimeh_did_not_use = ['405', '413', '415', '416', '420', '421', '422', '423']
+    AF_trial_paroxysmal_AF = ['408','419']
+
+    AF_trial_train = AF_trial_Fahimeh_train
+    AF_trial_test = AF_trial_Fahimeh_test
+    AF_trial_unlabeled = AF_trial_Fahimeh_did_not_use + AF_trial_paroxysmal_AF
+    print(f'AF trial: {len(AF_trial_train)} training subjects {AF_trial_train}')
+    print(f'AF trial: {len(AF_trial_test)} testing subjects {AF_trial_test}')
+    print(f'AF trial: {len(AF_trial_unlabeled)} unlabeled subjects {AF_trial_unlabeled}')
+    # =================================
+    # === Clinical trial AF subjects separation ===
+    clinical_trial_AF_subjects = ['005', '017', '026', '051', '075', '082']
+
+    remaining_UIDs = []
+    count_NSR = []
+    import math
+    for index, row in df_summary.iterrows():
+        UID = row['UID']
+        this_NSR = row['sample_nonAF']
+        if math.isnan(this_NSR):
+            # There is no segment in this subject, skip this UID.
+            print(f'---------UID {UID} has no segments.------------')
+            continue
+        if UID not in AF_trial_train and UID not in AF_trial_test and UID not in clinical_trial_AF_subjects \
+            and not UID[0] == '3' and not UID[0] == '4':
+            remaining_UIDs.append(UID)
+            count_NSR.append(this_NSR)
+
+    from numpy import random
+    random.seed(seed=42)
+    from numpy.random import choice
+    list_of_candidates = remaining_UIDs
+    number_of_items_to_pick = round(len(list_of_candidates) * 0.15) # 10% labeled for training, 5% for testing.
+    temp_sum = sum(count_NSR)
+    # Subjects with fewer segments have a higher chance of being selected; the weights sum to one.
+    probability_distribution = [(1 - x / temp_sum) / (len(count_NSR) - 1) for x in count_NSR]
+    draw = choice(list_of_candidates, number_of_items_to_pick,
+                  p=probability_distribution, replace=False)
+
+    clinical_trial_train = list(draw[:round(len(list_of_candidates) * 0.1)])
+    clinical_trial_test_nonAF = list(draw[round(len(list_of_candidates) * 0.1):])
+    clinical_trial_test_temp = clinical_trial_test_nonAF + clinical_trial_AF_subjects
+    clinical_trial_test = []
+    for UID in clinical_trial_test_temp:
+        # UID 051 and maybe other UIDs had no segments (unknown reason).
+ if UID in all_UIDs: + clinical_trial_test.append(UID) + + clinical_trial_unlabeled = [] + for UID in all_UIDs: + if UID not in clinical_trial_train and UID not in clinical_trial_test and not UID[0] == '3' and not UID[0] == '4': + clinical_trial_unlabeled.append(UID) + print(f'Clinical trial: selected {len(clinical_trial_train)} UIDs for training {clinical_trial_train}') + print(f'Clinical trial: selected {len(clinical_trial_test)} UIDs for testing {clinical_trial_test}') + print(f'Clinical trial: selected {len(clinical_trial_unlabeled)} UIDs for unlabeled {clinical_trial_unlabeled}') + return clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, UIDs, standardize=True, read_all_labels=False): + self.data_path = data_path + self.labels_path = labels_path + self.UIDs = UIDs + self.standardize = standardize + self.read_all_labels = read_all_labels + self.refresh_dataset() + + def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + def add_uids(self, new_uids): + unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] + self.UIDs.extend(unique_new_uids) + self.refresh_dataset() + + def __len__(self): + return len(self.segment_names) + + def __getitem__(self, idx): + segment_name = self.segment_names[idx] + label = self.labels[segment_name] + time_freq_tensor = self.load_data(segment_name) + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} + + def extract_segment_names_and_labels(self): + segment_names = [] + labels = {} + + for UID in self.UIDs: + label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) + for idx, segment_name in enumerate(label_segment_names): + label_val = label_data['label'].values[idx] + if self.read_all_labels: + # Assign -1 if label is not in [0, 1, 2, 3] + labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2, 3] + if label_val in [0, 1, 2, 3] and segment_name not in segment_names: + segment_names.append(segment_name) + labels[segment_name] = label_val + + return segment_names, labels + + def load_data(self, segment_name): + data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0]) + seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.pt') + try: + time_freq_tensor = torch.load(seg_path) + if self.standardize: + time_freq_tensor = self.standard_scaling(time_freq_tensor) + return time_freq_tensor.clone() + except Exception as e: + print(f"Error processing segment: {segment_name}. 
Exception: {str(e)}") + return torch.zeros((1, 128, 128)) + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) + return torch.Tensor(data) + +def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, read_all_labels=False, drop_last=False, num_workers=4): + dataset = CustomDataset(data_path, labels_path, UIDs, standardize, read_all_labels) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2) + return dataloader + +def get_data_paths(data_format, is_linux=False, is_hpc=False): + if is_linux: + base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" + labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + saving_base_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis" + elif is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + saving_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis" + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch" + labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" + saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case" + if data_format == 'csv': + data_path = os.path.join(base_path, "TFS_csv") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + elif data_format == 'png': + data_path = os.path.join(base_path, "TFS_plots") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + elif data_format == 'pt': + data_path = os.path.join(base_path, "PT_format") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + else: + raise ValueError("Invalid data format. Choose 'csv' or 'png.") + return data_path, labels_path, saving_path + +# Function to extract and preprocess data +def preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size, read_all_labels=True): + # Extracts paths and loads data into train, validation, and test loaders + data_path, labels_path, saving_path = get_data_paths(data_format, is_hpc=True) + train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=True, read_all_labels=False) + val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=True, read_all_labels=read_all_labels) + test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=True, read_all_labels=read_all_labels) + return train_loader, val_loader, test_loader + +def map_samples_to_uids(uncertain_sample_indices, dataset): + """ + Maps indices of uncertain samples back to their corresponding segment names or UIDs. + + Args: + - uncertain_sample_indices: Indices of the uncertain samples in the dataset. + - dataset: The dataset object which contains the mapping of segment names and UIDs. + + Returns: + - List of UIDs or segment names corresponding to the uncertain samples. 
+ """ + return [dataset.segment_names[i] for i in uncertain_sample_indices] + +def update_train_loader_with_uncertain_samples(current_train_loader, new_sample_indices, batch_size, standardize=False, data_format='csv', read_all_labels=True): + # Extract current UIDs from the current_train_loader + current_dataset = current_train_loader.dataset + # Map new_samples back to their corresponding segment names or UIDs + new_uids = map_samples_to_uids(new_sample_indices, current_dataset) + # Add new UIDs to the current dataset and refresh it + current_dataset.add_uids(new_uids) + # Create new DataLoader with the updated dataset + updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=False) + return updated_train_loader \ No newline at end of file diff --git a/HPC/final_project/utils/final_attemp_4_1_Dong_Ohm_summary_20231025.csv b/HPC/final_project/utils/final_attemp_4_1_Dong_Ohm_summary_20231025.csv new file mode 100644 index 0000000..ab8e70b --- /dev/null +++ b/HPC/final_project/utils/final_attemp_4_1_Dong_Ohm_summary_20231025.csv @@ -0,0 +1,110 @@ +UID,NSR,AF,PACPVC,SVT,Noisy +002,102,0,21,0,0 +003,3193,0,0,1,6865 +005,642,0,587,1,2208 +007,101,0,21,1,864 +011,97,0,2,0,468 +012,2698,0,9,1,10297 +013,7254,0,59,1,8966 +017,0,170,0,0,0 +020,2589,0,26,2,1556 +021,511,0,1,1,322 +022,4122,0,5,1,1110 +024,7116,0,1,0,0 +026,168,0,1,2,0 +027,9366,0,0,0,0 +028,3442,0,403,7,12994 +029,5088,0,1541,12,12987 +030,2386,0,1108,2,15570 +034,0,0,0,6,0 +035,472,0,1108,2,1255 +036,4882,0,1108,2,12845 +037,1050,0,1108,2,6350 +038,272,0,228,12,0 +039,3566,0,2,1,5154 +041,1253,0,0,1,3471 +042,125,0,1,0,415 +044,738,0,1,0,3819 +045,2905,0,155,1,4323 +047,1366,0,106,1,3585 +049,1529,0,17,1,4799 +050,274,0,1,0,6303 +052,1567,0,3,1,2149 +053,3504,0,36,3,7169 +054,192,0,635,0,0 +055,82,0,34,3,270 +056,62,0,34,3,310 +057,5079,0,34,3,0 +058,1307,0,34,3,1936 +062,2766,0,0,0,4723 +063,1379,0,34,3,1787 +064,3579,0,58,1,3 +068,3134,0,34,0,3634 +069,362,0,66,13,3824 +070,791,0,4,15,1143 +073,1986,0,116,2,4916 +074,87,0,23,1,619 +075,196,13,2110,47,1 +077,2802,0,23,1,4604 +078,5325,0,9,1,0 +080,6305,0,284,5,6673 +082,0,0,0,3,0 +083,78,0,1,0,790 +084,3781,0,0,0,545 +086,10,0,67,1,0 +087,0,0,11,0,0 +088,1644,0,0,0,3 +089,0,0,4,0,0 +090,1979,0,1,0,795 +091,1253,0,4,0,4746 +093,0,0,800,1,0 +094,109,0,0,0,612 +098,82,0,0,0,6166 +099,3373,0,1,1,3749 +100,87,0,1485,193,0 +101,477,0,0,0,4602 +104,0,0,12,1,0 +106,531,0,1,0,2480 +108,0,0,0,0,0 +109,615,0,2,0,886 +110,12,1,261,1,0 +111,109,0,0,0,215 +112,5357,0,642,61,4 +113,0,0,4668,1,0 +118,3544,0,259,2,5056 +119,16,0,0,0,1877 +120,15,0,19,0,12615 +301,0,17,0,0,0 +302,0,2,0,0,0 +305,0,10,0,0,0 +306,0,2,0,0,0 +307,0,39,0,0,0 +310,0,6,0,0,0 +311,0,19,0,0,0 +312,0,3,0,0,0 +318,0,8,0,0,0 +319,0,13,0,0,0 +320,0,8,0,0,0 +321,0,11,0,0,0 +322,0,7,0,0,0 +324,0,6,0,0,0 +325,14,0,0,0,0 +327,2,0,6,0,0 +329,0,15,0,0,0 +400,0,600,0,0,0 +402,0,2213,0,0,0 +405,0,406,0,0,0 +406,0,1902,0,0,0 +407,0,166,0,0,0 +408,866,50,29,0,0 +409,0,584,0,0,0 +410,0,3525,0,0,0 +413,0,4804,0,0,0 +414,0,2004,0,0,0 +415,0,1955,0,0,0 +416,0,762,0,0,0 +419,1383,1491,27,1,0 +420,0,165,0,0,0 +421,0,2016,0,0,0 +422,0,103,0,0,0 +423,0,1459,0,0,0 diff --git a/HPC/final_project/utils/ss_evaluation.py b/HPC/final_project/utils/ss_evaluation.py new file mode 100644 index 0000000..4729701 --- /dev/null +++ b/HPC/final_project/utils/ss_evaluation.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Dec 18 18:17:40 2023 + +@author: lrm22005 +""" +import numpy as np +import torch +import gpytorch 
+from sklearn.preprocessing import label_binarize
+from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
+from sklearn.metrics import precision_recall_fscore_support
+
+def evaluate_model_on_all_data(model, likelihood, data_loader, device, n_classes):
+    model.eval()
+    likelihood.eval()
+
+    all_predicted_labels = []
+    all_predicted_probs = []
+    all_test_labels = []
+
+    with torch.no_grad(), gpytorch.settings.fast_pred_var():
+        for i, batch in enumerate(data_loader):
+            test_data = batch['data'].view(batch['data'].size(0), -1).to(device)
+            test_labels = batch['label'].to(device)
+
+            predictions = likelihood(model(test_data)).mean
+            # The class dimension is the last one: argmax over dim=-1 yields one
+            # predicted label per sample (argmax over dim=0 would collapse the
+            # batch and trigger the shape-mismatch error below).
+            predicted_labels = predictions.argmax(dim=-1)
+
+            all_predicted_labels.append(predicted_labels.cpu().numpy())
+            # Accumulate per-batch class probabilities so that AUC-ROC is computed
+            # over the whole dataset rather than only the last batch.
+            all_predicted_probs.append(predictions.softmax(dim=-1).cpu().numpy())
+            # Move labels to the CPU before converting to NumPy (required on GPU runs).
+            all_test_labels.append(test_labels.cpu().numpy())
+
+    # Concatenate all batch results
+    all_predicted_labels = np.concatenate(all_predicted_labels, axis=0)
+    all_predicted_probs = np.concatenate(all_predicted_probs, axis=0)
+    all_test_labels = np.concatenate(all_test_labels, axis=0)
+
+    # Verify if the shapes match before proceeding to calculate metrics
+    if all_predicted_labels.shape[0] != all_test_labels.shape[0]:
+        raise ValueError("Mismatch in the number of samples between predicted and actual labels")
+
+    # Compute overall evaluation metrics
+    precision, recall, f1, _ = precision_recall_fscore_support(all_test_labels, all_predicted_labels, average='macro')
+    # For AUC-ROC, score the accumulated predicted probabilities against the
+    # true labels in a one-hot encoded format
+    test_labels_one_hot = label_binarize(all_test_labels, classes=np.arange(n_classes))
+    auc_roc = roc_auc_score(test_labels_one_hot, all_predicted_probs, multi_class='ovr')
+    return {
+        'precision': precision,
+        'recall': recall,
+        'f1': f1,
+        'auc_roc': auc_roc
+    }
+
+def parse_classification_report(report):
+    """Parse a classification report into a dictionary of metrics.
+
+    NOTE: this relies on sklearn's plain-text report layout and is fragile;
+    it assumes the second-to-last line is the macro-average row, e.g.
+    "macro avg  <precision>  <recall>  <f1>  <support>".
+    """
+    lines = report.split('\n')
+    main_metrics = lines[-2].split()
+
+    return {
+        'precision': float(main_metrics[3]),
+        'recall': float(main_metrics[4]),
+        'f1': float(main_metrics[5]),
+        'auc_roc': None # AUC-ROC is not part of the classification report by default
+    }
diff --git a/HPC/final_project/utils/visualization.py b/HPC/final_project/utils/visualization.py
new file mode 100644
index 0000000..3ecf59b
--- /dev/null
+++ b/HPC/final_project/utils/visualization.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:20:55 2023
+
+@author: lrm22005
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix
+
+def plot_training_performance(train_loss, validation_metrics):
+    epochs = range(1, len(train_loss) + 1)
+
+    # Plot training loss
+    plt.figure(figsize=(14, 6))
+    plt.subplot(1, 2, 1)
+    plt.plot(epochs, train_loss, 'b-', label='Training Loss')
+    plt.title('Training Loss')
+    plt.xlabel('Epochs')
+    plt.ylabel('Loss')
+    plt.legend()
+
+    # Plot validation metrics on their own x-axis: the number of validation
+    # points may differ from the number of training-loss entries.
+    plt.subplot(1, 2, 2)
+    val_epochs = range(1, len(validation_metrics['precision']) + 1)
+    plt.plot(val_epochs, validation_metrics['precision'], 'r-', label='Precision')
+    plt.plot(val_epochs, validation_metrics['recall'], 'g-', label='Recall')
+    plt.plot(val_epochs, validation_metrics['f1'], 'b-', label='F1 Score')
+    plt.plot(val_epochs, validation_metrics['auc_roc'], 'y-', label='AUC-ROC')
+    plt.title('Validation Metrics')
+    plt.xlabel('Epochs')
+    plt.ylabel('Metrics')
+    plt.legend()
+
+    plt.tight_layout()
+    plt.show()
+
+def plot_results(results):
+    plt.figure(figsize=(12, 5))
+    plt.subplot(1, 2, 1)
+    plt.plot(results['train_loss'], label='Train Loss')
+    plt.title('Training Loss Over Time')
+    plt.legend()
+
+    plt.subplot(1, 2, 2)
+    for metric in ['precision', 'recall', 'f1']:
+        plt.plot(results['validation_metrics'][metric], label=metric.title())
+    plt.title('Validation Metrics Over Time')
+    plt.legend()
+    plt.show()
+
+    test_metrics = results['test_metrics']
+    print("Test Metrics:")
+    print(f"Precision: {test_metrics['precision']}")
+    print(f"Recall: {test_metrics['recall']}")
+    print(f"F1 Score: {test_metrics['f1']}")
+    print(f"AUC-ROC: {test_metrics['auc_roc']}")
+
+def plot_comparative_results(gp_vs_kmeans_data, original_labels):
+    fig, axes = plt.subplots(1, 2, figsize=(14, 7))
+
+    # Plot 1: Confusion Matrix for GP Predictions vs Original Labels
+    gp_predictions = [pair[0] for pair in gp_vs_kmeans_data]
+    gp_predictions = np.concatenate(gp_predictions)
+    cm_gp = confusion_matrix(original_labels, gp_predictions)
+    sns.heatmap(cm_gp, annot=True, ax=axes[0], fmt='g')
+    axes[0].set_title('GP Model Predictions vs Original Labels')
+    axes[0].set_xlabel('Predicted Labels')
+    axes[0].set_ylabel('True Labels')
+
+    # Plot 2: Confusion Matrix for K-Means Predictions vs Original Labels
+    kmeans_predictions = [pair[1] for pair in gp_vs_kmeans_data]
+    kmeans_predictions = np.concatenate(kmeans_predictions)
+    cm_kmeans = confusion_matrix(original_labels, kmeans_predictions)
+    sns.heatmap(cm_kmeans, annot=True, ax=axes[1], fmt='g')
+    axes[1].set_title('K-Means Predictions vs Original Labels')
+    axes[1].set_xlabel('Predicted Labels')
+    axes[1].set_ylabel('True Labels')
+
+    plt.tight_layout()
+    plt.show()
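
---
Note: ss_main.py imports stochastic_evaluation from utils.ss_evaluation, but ss_evaluation.py in this patch only defines evaluate_model_on_all_data and parse_classification_report, so that import will fail. Below is a minimal sketch of what such a helper might look like, assuming it mirrors evaluate_model_on_all_data while drawing n_batches random batches in the style of the other stochastic_* helpers. The signature is inferred from the call site in ss_main.py; this is not the author's implementation.

import random
import numpy as np
import torch
import gpytorch
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

def stochastic_evaluation(model, likelihood, data_loader, device, n_classes, n_batches):
    model.eval()
    likelihood.eval()
    all_pred, all_true, all_probs = [], [], []
    # Same batch-sampling style as the stochastic_* helpers in ss_active_learning.py
    sampled_batches = random.sample(list(data_loader), n_batches)
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        for batch in sampled_batches:
            x = batch['data'].view(batch['data'].size(0), -1).to(device)
            y = batch['label']
            probs = likelihood(model(x)).mean  # assumed shape: (batch, n_classes)
            all_pred.append(probs.argmax(dim=-1).cpu().numpy())
            # Mirrors the softmax call in evaluate_model_on_all_data
            all_probs.append(probs.softmax(dim=-1).cpu().numpy())
            all_true.append(y.cpu().numpy())
    all_pred = np.concatenate(all_pred)
    all_true = np.concatenate(all_true)
    all_probs = np.concatenate(all_probs)
    precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_pred, average='macro', zero_division=0)
    true_one_hot = label_binarize(all_true, classes=np.arange(n_classes))
    auc_roc = roc_auc_score(true_one_hot, all_probs, multi_class='ovr')
    return {'precision': precision, 'recall': recall, 'f1': f1, 'auc_roc': auc_roc}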