diff --git a/.gitignore b/.gitignore index 11d1435..233cecd 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ VAE.py model_checkpoint.pt GP_original_data.py Attention_network.py +*.pt diff --git a/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc b/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc deleted file mode 100644 index 8d691ba..0000000 Binary files a/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc and /dev/null differ diff --git a/BML_project/active_learning/ss_active_learning.py b/BML_project/active_learning/ss_active_learning.py index 4442a34..2758c81 100644 --- a/BML_project/active_learning/ss_active_learning.py +++ b/BML_project/active_learning/ss_active_learning.py @@ -54,6 +54,7 @@ def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100): for batch in data_loader: data = batch['data'].view(batch['data'].size(0), -1).to(device).cpu().numpy() minibatch_kmeans.partial_fit(data) + # minibatch_kmeans.fit(data) # Dong, 01/22/2024: Debug return minibatch_kmeans diff --git a/BML_project/cassey_CS330_torch.yml b/BML_project/cassey_CS330_torch.yml new file mode 100644 index 0000000..fc86d9e --- /dev/null +++ b/BML_project/cassey_CS330_torch.yml @@ -0,0 +1,272 @@ +name: CS330_torch +channels: + - pytorch + - nvidia + - anaconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - abseil-cpp=20211102.0=h27087fc_1 + - absl-py=2.0.0=pyhd8ed1ab_0 + - aiohttp=3.8.5=py311h5eee18b_0 + - aiosignal=1.3.1=pyhd8ed1ab_0 + - asttokens=2.4.0=pyhd8ed1ab_0 + - async-timeout=4.0.2=py311h06a4308_0 + - attrs=23.1.0=pyh71513ae_1 + - backcall=0.2.0=pyh9f0ad1d_0 + - backports=1.0=pyhd8ed1ab_3 + - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0 + - blas=1.1=openblas + - blinker=1.6.3=pyhd8ed1ab_0 + - bottleneck=1.3.5=py311hbed6279_0 + - brotli=1.0.9=h9c3ff4c_4 + - brotlipy=0.7.0=py311h5eee18b_1002 + - bzip2=1.0.8=h7b6447c_0 + - c-ares=1.19.1=h5eee18b_0 + - ca-certificates=2023.12.12=h06a4308_0 + - cachetools=5.3.1=pyhd8ed1ab_0 + - cairo=1.16.0=hb05425b_5 + - certifi=2023.11.17=py311h06a4308_0 + - cffi=1.15.1=py311h5eee18b_3 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.1.7=unix_pyh707e725_0 + - cloudpickle=2.2.1=pyhd8ed1ab_0 + - colorama=0.4.6=pyhd8ed1ab_0 + - comm=0.1.4=pyhd8ed1ab_0 + - contourpy=1.0.5=py311hdb19cb5_0 + - cryptography=41.0.3=py311hdda0065_0 + - cuda-cudart=11.8.89=0 + - cuda-cupti=11.8.87=0 + - cuda-libraries=11.8.0=0 + - cuda-nvrtc=11.8.89=0 + - cuda-nvtx=11.8.86=0 + - cuda-runtime=11.8.0=0 + - cycler=0.12.1=pyhd8ed1ab_0 + - cyrus-sasl=2.1.28=h52b45da_1 + - dbus=1.13.18=hb2f20db_0 + - debugpy=1.6.7=py311h6a678d5_0 + - decorator=5.1.1=pyhd8ed1ab_0 + - eigen=3.4.0=h4bd325d_0 + - exceptiongroup=1.1.3=pyhd8ed1ab_0 + - executing=1.2.0=pyhd8ed1ab_0 + - expat=2.5.0=h6a678d5_0 + - ffmpeg=4.2.2=h20bf706_0 + - filelock=3.9.0=py311h06a4308_0 + - fontconfig=2.14.1=h4c34cd2_2 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.12.1=h4a9f257_0 + - frozenlist=1.3.3=py311h5eee18b_0 + - fsspec=2023.10.0=pyhca7485f_0 + - giflib=5.2.1=h5eee18b_3 + - glib=2.69.1=he621ea3_2 + - gmp=6.2.1=h295c915_3 + - gmpy2=2.1.2=py311hc9b5ff0_0 + - gnutls=3.6.15=he1e5248_0 + - google-auth=2.23.2=pyhca7485f_0 + - google-auth-oauthlib=1.0.0=pyhd8ed1ab_1 + - googledrivedownloader=0.4=pyhd3deb0d_1 + - gpytorch=1.11=pyhd8ed1ab_0 + - graphite2=1.3.14=h295c915_1 + - grpc-cpp=1.48.2=he1ff14a_1 + - grpcio=1.48.2=py311he1ff14a_1 + - gst-plugins-base=1.14.1=h6a678d5_1 + - 
gstreamer=1.14.1=h5eee18b_1 + - h5py=3.9.0=py311hdd6beaf_0 + - harfbuzz=4.3.0=hf52aaf7_1 + - hdf5=1.12.1=h2b7332f_3 + - icu=58.2=hf484d3e_1000 + - idna=3.4=py311h06a4308_0 + - imageio=2.31.5=pyh8c1a49c_0 + - importlib-metadata=6.8.0=pyha770c72_0 + - importlib_metadata=6.8.0=hd8ed1ab_0 + - iniconfig=1.1.1=pyhd3eb1b0_0 + - intel-openmp=2023.1.0=hdb19cb5_46305 + - ipykernel=6.25.2=pyh2140261_0 + - ipython=8.16.1=pyh0d859eb_0 + - jaxtyping=0.2.25=pyhd8ed1ab_0 + - jedi=0.19.1=pyhd8ed1ab_0 + - jinja2=3.1.2=py311h06a4308_0 + - joblib=1.2.0=py311h06a4308_0 + - jpeg=9e=h5eee18b_1 + - jupyter_client=8.3.1=pyhd8ed1ab_0 + - jupyter_core=4.12.0=py311h38be061_0 + - kiwisolver=1.4.4=py311h6a678d5_0 + - krb5=1.20.1=h143b758_1 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.38=h1181459_1 + - lerc=3.0=h295c915_0 + - libclang=14.0.6=default_hc6dbbc7_1 + - libclang13=14.0.6=default_he11475f_1 + - libcublas=11.11.3.6=0 + - libcufft=10.9.0.58=0 + - libcufile=1.7.2.10=0 + - libcups=2.4.2=h2d74bed_1 + - libcurand=10.3.3.141=0 + - libcurl=7.88.1=h251f7ec_2 + - libcusolver=11.4.1.48=0 + - libcusparse=11.7.5.86=0 + - libdeflate=1.17=h5eee18b_1 + - libedit=3.1.20221030=h5eee18b_0 + - libev=4.33=h7f8727e_1 + - libevent=2.1.12=hdbd6064_1 + - libffi=3.4.4=h6a678d5_0 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran=3.0.0=1 + - libgfortran-ng=11.2.0=h00389a5_1 + - libgfortran5=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h7f8727e_2 + - libidn2=2.3.4=h5eee18b_0 + - libjpeg-turbo=2.0.0=h9bf148f_0 + - libllvm14=14.0.6=hdb19cb5_3 + - libnghttp2=1.52.0=h2d74bed_1 + - libnpp=11.8.0.86=0 + - libnvjpeg=11.9.0.86=0 + - libopenblas=0.3.21=h043d6bf_0 + - libopus=1.3.1=h7f98852_1 + - libpng=1.6.39=h5eee18b_0 + - libpq=12.15=hdbd6064_1 + - libprotobuf=3.20.3=he621ea3_0 + - libsodium=1.0.18=h36c2ea0_1 + - libssh2=1.10.0=hdbd6064_2 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.5.1=h6a678d5_0 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libvpx=1.7.0=h439df22_0 + - libwebp=1.3.2=h11a3e52_0 + - libwebp-base=1.3.2=h5eee18b_0 + - libxcb=1.15=h7f8727e_0 + - libxkbcommon=1.0.1=h5eee18b_1 + - libxml2=2.10.4=hcbfbd50_0 + - libxslt=1.1.37=h2085143_0 + - linear_operator=0.5.2=pyhd8ed1ab_0 + - llvm-openmp=14.0.6=h9e868ea_0 + - lockfile=0.12.2=py311h06a4308_0 + - lz4-c=1.9.4=h6a678d5_0 + - markdown=3.5=pyhd8ed1ab_0 + - markupsafe=2.1.1=py311h5eee18b_0 + - matplotlib=3.7.2=py311h06a4308_0 + - matplotlib-base=3.7.2=py311ha02d727_0 + - matplotlib-inline=0.1.6=pyhd8ed1ab_0 + - mkl=2023.1.0=h213fc3f_46343 + - mkl-service=2.4.0=py311h5eee18b_1 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpmath=1.3.0=py311h06a4308_0 + - multidict=6.0.2=py311h5eee18b_0 + - munkres=1.1.4=pyh9f0ad1d_0 + - mysql=5.7.24=h721c034_2 + - ncurses=6.4=h6a678d5_0 + - nest-asyncio=1.5.6=pyhd8ed1ab_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.1=py311h06a4308_0 + - ninja=1.10.2=h06a4308_5 + - ninja-base=1.10.2=hd09550d_5 + - numexpr=2.8.7=py311h812550d_0 + - numpy=1.26.0=py311h24aa872_0 + - numpy-base=1.26.0=py311hbfb1bba_0 + - oauthlib=3.2.2=pyhd8ed1ab_0 + - openblas=0.3.3=ha44fe06_1 + - opencv=4.6.0=py311h10ae9b0_5 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.4.0=h3ad879b_0 + - openssl=3.0.12=h7f8727e_0 + - opt_einsum=3.3.0=pyhc1e730c_2 + - packaging=23.2=pyhd8ed1ab_0 + - pandas=2.0.3=py311ha02d727_0 + - parso=0.8.3=pyhd8ed1ab_0 + - pcre=8.45=h9c3ff4c_0 + - pexpect=4.8.0=pyh1a96a4e_2 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=10.0.1=py311ha6cbd5a_0 + - 
pip=23.2.1=py311h06a4308_0 + - pixman=0.40.0=h7f8727e_1 + - pluggy=1.0.0=py311h06a4308_1 + - ply=3.11=py_1 + - pretty_errors=1.2.25=pyhd8ed1ab_0 + - prompt-toolkit=3.0.39=pyha770c72_0 + - prompt_toolkit=3.0.39=hd8ed1ab_0 + - protobuf=3.20.3=py311h6a678d5_0 + - psutil=5.9.0=py311h5eee18b_0 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pure_eval=0.2.2=pyhd8ed1ab_0 + - pyasn1=0.5.0=pyhd8ed1ab_0 + - pyasn1-modules=0.3.0=pyhd8ed1ab_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pygments=2.16.1=pyhd8ed1ab_0 + - pyjwt=2.8.0=pyhd8ed1ab_0 + - pyopenssl=23.2.0=py311h06a4308_0 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pyqt=5.15.7=py311h6a678d5_0 + - pyqt5-sip=12.11.0=py311h6a678d5_0 + - pysocks=1.7.1=py311h06a4308_0 + - pytest=7.4.0=py311h06a4308_0 + - python=3.11.5=h955ad1f_0 + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-tzdata=2023.3=pyhd3eb1b0_0 + - python_abi=3.11=2_cp311 + - pytorch=2.1.0=cpu_py311h53e38e9_0 + - pytorch-cuda=11.8=h7e8668a_5 + - pytorch-model-summary=0.1.1=py_0 + - pytorch-mutex=1.0=cuda + - pytz=2023.3.post1=py311h06a4308_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pyyaml=6.0=py311h5eee18b_1 + - pyzmq=25.1.0=py311h6a678d5_0 + - qt-main=5.15.2=h7358343_9 + - qt-webengine=5.15.9=hbbf29b9_6 + - qtwebkit=5.212=h3fafdc1_5 + - re2=2022.04.01=h27087fc_0 + - readline=8.2=h5eee18b_0 + - requests=2.31.0=py311h06a4308_0 + - requests-oauthlib=1.3.1=pyhd8ed1ab_0 + - rsa=4.9=pyhd8ed1ab_0 + - scikit-learn=1.2.2=py311h6a678d5_1 + - scipy=1.11.3=py311h24aa872_0 + - seaborn=0.12.2=py311h06a4308_0 + - setuptools=68.0.0=py311h06a4308_0 + - sip=6.6.2=py311h6a678d5_0 + - six=1.16.0=pyh6c4a22f_0 + - sqlite=3.41.2=h5eee18b_0 + - stack_data=0.6.2=pyhd8ed1ab_0 + - sympy=1.11.1=py311h06a4308_0 + - tbb=2021.8.0=hdb19cb5_0 + - tensorboard=2.14.1=pyhd8ed1ab_0 + - tensorboard-data-server=0.7.0=py311h52d8a92_0 + - threadpoolctl=2.2.0=pyh0d69192_0 + - tk=8.6.12=h1ccaba5_0 + - toml=0.10.2=pyhd8ed1ab_0 + - torchaudio=2.1.0=py311_cu118 + - torchinfo=1.8.0=pyhd8ed1ab_0 + - torchtriton=2.1.0=py311 + - torchvision=0.15.2=cpu_py311h6e929fa_0 + - tornado=6.3.3=py311h5eee18b_0 + - tqdm=4.66.1=pyhd8ed1ab_0 + - traitlets=5.11.2=pyhd8ed1ab_0 + - typeguard=2.13.3=py311h06a4308_0 + - typing-extensions=4.7.1=py311h06a4308_0 + - typing_extensions=4.7.1=py311h06a4308_0 + - tzdata=2023c=h04d1e81_0 + - urllib3=1.26.16=py311h06a4308_0 + - wcwidth=0.2.8=pyhd8ed1ab_0 + - werkzeug=3.0.0=pyhd8ed1ab_0 + - wheel=0.41.2=py311h06a4308_0 + - x264=1!157.20191217=h7b6447c_0 + - xz=5.4.2=h5eee18b_0 + - yaml=0.2.5=h7b6447c_0 + - yarl=1.8.1=py311h5eee18b_0 + - zeromq=4.3.4=h9c3ff4c_1 + - zipp=3.17.0=pyhd8ed1ab_0 + - zlib=1.2.13=h5eee18b_0 + - zstd=1.5.5=hc292b87_0 + - pip: + - beautifulsoup4==4.12.2 + - gdown==4.7.1 + - soupsieve==2.5 + - torchsummary==1.5.1 +prefix: /home/doh16101/anaconda3/envs/CS330_torch diff --git a/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb b/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb new file mode 100644 index 0000000..4495514 --- /dev/null +++ b/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb @@ -0,0 +1,22 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# R:\\ENGR_Chon\\Darren\\NIH_Pulsewatch\\Poincare_pt\\128x128\n", + "# Darren created the PT files again (because UID 120 has missing files in the original csv file)\n", + "# I need to prepare for my interview, and I will tar those PT files again and test your code on Colab later." 
+ ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc b/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc deleted file mode 100644 index 75ee3d4..0000000 Binary files a/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc and /dev/null differ diff --git a/BML_project/models/ss_gp_model.py b/BML_project/models/ss_gp_model.py index c18f06f..355a6fd 100644 --- a/BML_project/models/ss_gp_model.py +++ b/BML_project/models/ss_gp_model.py @@ -4,12 +4,15 @@ @author: lrm22005 """ +import os import numpy as np from tqdm import tqdm import torch import gpytorch from sklearn.metrics import precision_recall_fscore_support, roc_auc_score from sklearn.preprocessing import label_binarize +from utils_gp.data_loader import preprocess_data_train_val,preprocess_data_test +import time num_latents = 6 # This should match the complexity of your data or the number of tasks num_tasks = 4 # This should match the number of output classes or tasks @@ -20,7 +23,8 @@ class MultitaskGPModel(gpytorch.models.ApproximateGP): def __init__(self): # Let's use a different set of inducing points for each latent function - inducing_points = torch.rand(num_latents, num_inducing_points, 127 * 128) # Assuming flattened 128x128 images + inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images + # Dong, 01/22/2024: I will use 128 * 128. # We have to mark the CholeskyVariationalDistribution as batch # so that we learn a variational distribution for each task @@ -69,11 +73,49 @@ def forward(self, x): return latent_pred -def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'): +def train_gp_model(train_loader, val_loader, batch_size,\ + data_format, clinical_trial_train, clinical_trial_test,\ + clinical_trial_unlabeled,\ + num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt',\ + resume_training=False,\ + datackpt_name = 'dataset_checkpoint.pt',modelckpt_name = 'model_checkpoint_full.pt'): + print(f'Debug: resume_training:{resume_training}, checkpoint_path: {checkpoint_path}') model = MultitaskGPModel().to(device) likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=4, num_classes=4).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.1) mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset)) + + # Load checkpoint if resuming training for gp model. + start_epoch = 0 + flag_reload_dataloader = False # We do not need to reset train loader in the new epoch. + ckpt_model_file = os.path.join(checkpoint_path,modelckpt_name) + if resume_training and os.path.exists(ckpt_model_file): + print(f'Debug: loading ckpt: {ckpt_model_file}') + checkpoint = torch.load(ckpt_model_file) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint.get('epoch', 0) # Resume from the same epoch because you did not finished it. + + # Update the dataloader if there are segments finished. + finished_seg_names = checkpoint['finished_seg_names'] + + if len(finished_seg_names) > 0: + # There were segments used in training. Only update the train loader. 
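The resume branch above relies on finished_seg_names being stored in the checkpoint as a list of per-batch lists of segment names. A minimal sketch of that bookkeeping, mirroring remove_finished_segment_names_and_labels in utils_gp/data_loader.py (segment names and labels here are hypothetical):

# Sketch only: drop the segments already trained in the interrupted epoch before rebuilding the loader.
finished_seg_names = [['seg_001', 'seg_002'], ['seg_003']]   # one inner list per finished batch
labels = {'seg_001': 0, 'seg_002': 1, 'seg_003': 2, 'seg_004': 3}
remain_labels = labels.copy()
for batch in finished_seg_names:
    for seg in batch:
        remain_labels.pop(seg)                               # remove each already-trained segment
print(list(remain_labels.keys()))                            # ['seg_004'] goes into the rebuilt train loader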
+ flag_reload_dataloader = True + print('Debug: renewing train_loader now...') + startTime_for_tictoc = time.time() + # ---- Dong, 02/15/2024: I want to test training on large dataset and resume training. ---- + # train_loader,_,_ = preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + # read_all_labels=False) + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=finished_seg_names,\ + read_all_labels=False) + endTime_for_tictoc = time.time() - startTime_for_tictoc + print(f'Debug: took {endTime_for_tictoc} to renew the train_loader') + best_val_loss = float('inf') epochs_no_improve = 0 @@ -85,19 +127,69 @@ def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, pat 'train_loss': [] # Add a list to store training losses } - for epoch in tqdm(range(num_iterations), desc='Training', unit='epoch', leave=False): - for train_batch in train_loader: + for epoch in tqdm(range(start_epoch,num_iterations), desc='Training', unit='epoch', leave=False): + finished_idx = [] + finished_seg_names = [] + for batch_index, train_batch in enumerate(train_loader): + print(f'Debug: now in a new batch of data! {batch_index}/{len(train_loader)}') # train_batch is the image data. model.train() likelihood.train() optimizer.zero_grad() + train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device) # Use reshape here train_y = train_batch['label'].to(device) + # Get finished segment index in dataloader and segment name. + temp_finished_idx = train_batch['idx'] + temp_finished_seg_names = train_batch['segment_name'] + print('Debug: temp_finished_idx:',temp_finished_idx) + print('Debug: temp_finished_segment_name:',temp_finished_seg_names) + finished_idx.append(temp_finished_idx) + finished_seg_names.append(temp_finished_seg_names) output = model(train_x) loss = -mll(output, train_y) metrics['train_loss'].append(loss.item()) # Store the training loss loss.backward() optimizer.step() + save_ckpt_model_path = os.path.join(checkpoint_path,modelckpt_name) + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'best_val_loss': best_val_loss, + 'finished_seg_names':finished_seg_names, + 'finished_idx':finished_idx + # Include other metrics as needed + }, save_ckpt_model_path) + + # Optionally, save the dataset state at intervals or after certain conditions + save_ckpt_dataset_path = os.path.join(checkpoint_path,datackpt_name) + train_loader.dataset.save_checkpoint(save_ckpt_dataset_path) # Here, manage the index as needed + + # import sys + # if epoch == 3 and batch_index == 5: + # sys.exit(f"Debug: Manually stop the program at epoch {epoch} batch {batch_index}.") + + # Reset the finished segments again because we finished one epoch. + finished_idx = [] + finished_seg_names = [] + if flag_reload_dataloader: + print('Debug: reset the train_loader now...') + # Reset the train dataloader now. 
+ startTime_for_tictoc = time.time() + # --- Dong, 02/15/2024: + # train_loader,_,_ = preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + # read_all_labels=False) + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=finished_seg_names,\ + read_all_labels=False) + endTime_for_tictoc = time.time() - startTime_for_tictoc + print(f'Debug: took {endTime_for_tictoc} to reset the train_loader') + flag_reload_dataloader = False # Turn off the flag for reseting train dataloader. + # Stochastic validation model.eval() likelihood.eval() @@ -130,19 +222,46 @@ def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, pat if val_loss < best_val_loss: best_val_loss = val_loss epochs_no_improve = 0 - torch.save({'model_state_dict': model.state_dict(), - 'likelihood_state_dict': likelihood.state_dict(), - 'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path) + # torch.save({'model_state_dict': model.state_dict(), + # 'likelihood_state_dict': likelihood.state_dict(), + # 'optimizer_state_dict': optimizer.state_dict(), + # 'train_loader':train_loader, + # 'val_loader':val_loader + # }, checkpoint_path) else: epochs_no_improve += 1 if epochs_no_improve >= patience: print(f"Early stopping triggered at epoch {epoch+1}") break + + # Save checkpoint at the end of each epoch + save_ckpt_model_path = os.path.join(checkpoint_path,modelckpt_name) + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'best_val_loss': best_val_loss, + 'finished_seg_names':finished_seg_names, + 'finished_idx':finished_idx + # Include other metrics as needed + }, save_ckpt_model_path) + print('Debug: saved model checkpoint with epoch.',save_ckpt_model_path) + + # Optionally, save the dataset state at intervals or after certain conditions + save_ckpt_dataset_path = os.path.join(checkpoint_path,datackpt_name) + train_loader.dataset.save_checkpoint(save_ckpt_dataset_path) # Finished all batches, so start from zero. 
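For reference, a small sketch of what the two saves above leave on disk after every epoch; the file names are the defaults passed to train_gp_model, and the directory is a hypothetical stand-in for checkpoint_path:

import os
import torch

ckpt_dir = './Project_1_analysis'   # hypothetical checkpoint_path
model_ckpt = torch.load(os.path.join(ckpt_dir, 'model_checkpoint_full.pt'), map_location='cpu')
data_ckpt = torch.load(os.path.join(ckpt_dir, 'dataset_checkpoint.pt'), map_location='cpu')
print(sorted(model_ckpt.keys()))    # best_val_loss, epoch, finished_idx, finished_seg_names,
                                    # likelihood_state_dict, model_state_dict, optimizer_state_dict
print(sorted(data_ckpt.keys()))     # labels, segment_names (written by CustomDataset.save_checkpoint)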
+ + if epochs_no_improve >= patience: + print(f"Early stopping triggered at epoch {epoch+1}") + break - checkpoint = torch.load(checkpoint_path) - model.load_state_dict(checkpoint['model_state_dict']) - likelihood.load_state_dict(checkpoint['likelihood_state_dict']) - optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + # Optionally, load the best model at the end of training + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) return model, likelihood, metrics diff --git a/BML_project/ss_main.py b/BML_project/ss_main.py index b784ce4..326f80f 100644 --- a/BML_project/ss_main.py +++ b/BML_project/ss_main.py @@ -4,16 +4,48 @@ @author: lrm22005 """ -import tqdm +from tqdm import tqdm import torch -from utils_gp.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples +from utils_gp.data_loader import preprocess_data_train_val, split_uids, update_train_loader_with_uncertain_samples, preprocess_data_test from models.ss_gp_model import MultitaskGPModel, train_gp_model from utils_gp.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data from active_learning.ss_active_learning import stochastic_uncertainty_sampling, run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions from utils_gp.visualization import plot_comparative_results, plot_training_performance, plot_results +import os +import pickle +from datetime import datetime +now = datetime.now() # Get the time now for model checkpoint saving. + +dt_string = now.strftime("%Y_%m_%d_%H_%M_%S") # YYYY_mm_dd_HH_MM_SS, for model saving. +print("The date and time suffix of the model file is", dt_string) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +class CheckpointManager: + def __init__(self, checkpoint_dir): + self.checkpoint_dir = checkpoint_dir # Store the directory path for checkpoints + if not os.path.exists(checkpoint_dir): # Check if the directory exists + os.makedirs(checkpoint_dir) # Create the directory if it does not exist + + def save_checkpoint(self, loader_name, iteration, additional_state): + # Construct the checkpoint file path using the loader name + checkpoint_path = os.path.join(self.checkpoint_dir, f"{loader_name}_checkpoint.pkl") + checkpoint = { + 'iteration': iteration, # Store the current iteration + 'additional_state': additional_state # Store any additional state information + } + with open(checkpoint_path, 'wb') as f: # Open the file in write-binary mode + pickle.dump(checkpoint, f) # Serialize the checkpoint dictionary to the file + + def load_checkpoint(self, loader_name): + # Construct the checkpoint file path using the loader name + checkpoint_path = os.path.join(self.checkpoint_dir, f"{loader_name}_checkpoint.pkl") + try: + with open(checkpoint_path, 'rb') as f: # Open the file in read-binary mode + return pickle.load(f) # Deserialize the checkpoint file and return it + except FileNotFoundError: # Handle the case where the checkpoint file does not exist + return None # Return None if the file is not found + def main(): # Set parameters like n_classes, batch_size, etc. 
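The CheckpointManager defined above can be exercised on its own; a short usage sketch with a hypothetical directory name:

mgr = CheckpointManager('./al_checkpoints')                  # hypothetical directory
mgr.save_checkpoint('train', iteration=3, additional_state={'note': 'after AL round 3'})
ckpt = mgr.load_checkpoint('train')
if ckpt is not None:
    print(ckpt['iteration'], ckpt['additional_state'])       # 3 {'note': 'after AL round 3'}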
n_classes = 4 @@ -21,7 +53,29 @@ def main(): clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled = split_uids() data_format = 'pt' # Preprocess data - train_loader, val_loader, test_loader = preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size) + # ---- Dong, 02/15/2024: I want to test loading large amount dataset. ---- + # train_loader, val_loader, saving_path = preprocess_data_train_val(data_format = data_format, \ + _, val_loader, saving_path = preprocess_data_train_val(data_format = data_format, \ + clinical_trial_train=clinical_trial_train, \ + clinical_trial_test=clinical_trial_test, \ + batch_size=batch_size,\ + finished_seg_names = [],\ + read_all_labels=False) + # ---- Dong, 02/15/2024: I want to test loading large amount dataset. ---- + # test_loader = preprocess_data_test(data_format = data_format, \ + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=[],\ + read_all_labels=False) + + menu_segment_names = train_loader.dataset.segment_names # All the segments to be run in the training dataset. + menu_labels = train_loader.dataset.labels # All the ground truth labels + print('Debug: len(menu_segment_names)',len(menu_segment_names)) + print('Debug: len(menu_labels)',len(menu_labels)) + + print('Debug: len(train_loader)',len(train_loader)) + print('Debug: dir(train_loader.dataset)',dir(train_loader.dataset)) kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device) @@ -33,7 +87,21 @@ def main(): } # Initial model training - model, likelihood, training_metrics = train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=n_classes) + model, likelihood, training_metrics = train_gp_model( + train_loader = train_loader, + val_loader = val_loader, + num_iterations=50, + n_classes=n_classes, + patience=10, + checkpoint_path=saving_path, + resume_training=True, + datackpt_name = 'dataset_checkpoint.pt', + modelckpt_name = 'model_checkpoint_full.pt', + batch_size=batch_size, + data_format = data_format, + clinical_trial_train = clinical_trial_train, + clinical_trial_test = clinical_trial_test, + clinical_trial_unlabeled=clinical_trial_unlabeled) # Dong: remember to change this function in its code. # Save the training metrics for future visualization results['train_loss'].extend(training_metrics['train_loss']) @@ -42,38 +110,80 @@ def main(): results['validation_metrics']['f1'].extend(training_metrics['f1_score']) # results['validation_metrics']['auc_roc'].extend(training_metrics['auc_roc']) + # --- Dong: copied from GP_Original_Checkpoint.py --- + # Initialize the CheckpointManager + checkpoint_manager = CheckpointManager(saving_path) + + # Attempt to load a training checkpoint + train_checkpoint = checkpoint_manager.load_checkpoint('train') + start_iteration = train_checkpoint['iteration'] if train_checkpoint else 0 + print('Debug: start_iteration is:',start_iteration) + # Dong, 01/25/2024: save it first before entering the active learning. + additional_state = { + 'model_state': model.state_dict(), + 'likelihood':likelihood, + 'val_loader':val_loader, + 'train_loader':train_loader + # Include other states like optimizer, scheduler, etc. 
+ } + checkpoint_manager.save_checkpoint('train', start_iteration, additional_state) + # --------------------------------------------------- + active_learning_iterations = 10 # Active Learning Iterations - for iteration in tqdm(range(active_learning_iterations), desc='Active Learning', unit='iteration', leave=True): + for iteration in tqdm(range(start_iteration,active_learning_iterations), desc='Active Learning', unit='iteration', leave=True): + print(f"Active Learning Iteration: {iteration+1}/{active_learning_iterations}") # Perform uncertainty sampling to select new samples from the validation set - uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=batch_size, n_batches=5) - + uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=50, n_batches=5, device=device) + labeled_samples = label_samples(uncertain_sample_indices, val_loader.dataset) # Update the training loader with uncertain samples - train_loader = update_train_loader_with_uncertain_samples(train_loader, uncertain_sample_indices, batch_size) - print(f"Updated training data size: {len(train_loader.dataset)}") + train_loader = update_train_loader_with_uncertain_samples(train_loader, labeled_samples, batch_size) + + # Optionally, save the dataset state at intervals or after certain conditions + train_loader.dataset.save_checkpoint(dataset_checkpoint_path) # Here, manage the index as needed # Re-train the model with the updated training data - model, likelihood, val_metrics = train_gp_model(train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, checkpoint_path='model_checkpoint_last.pt') + model, likelihood, val_metrics = train_gp_model( + train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, + checkpoint_path=saving_path, resume_training=True, batch_size=batch_size) # Store the validation metrics after each active learning iteration results['validation_metrics']['precision'].append(val_metrics['precision']) results['validation_metrics']['recall'].append(val_metrics['recall']) - results['validation_metrics']['f1'].append(val_metrics['f1']) + results['validation_metrics']['f1'].append(val_metrics['f1_score']) # results['validation_metrics']['auc_roc'].append(val_metrics['auc_roc']) + # Save checkpoint at the end of each iteration + additional_state = { + 'model_state': model.state_dict(), + 'likelihood':likelihood, + 'val_loader':val_loader, + 'train_loader':train_loader + # Include other states like optimizer, scheduler, etc. 
+ } + checkpoint_manager.save_checkpoint('train', iteration, additional_state) + # Compare K-Means with GP model predictions after retraining gp_vs_kmeans_data, original_labels = stochastic_compare_kmeans_gp_predictions(kmeans_model, model, train_loader, n_batches=5, device=device) plot_comparative_results(gp_vs_kmeans_data, original_labels) # Final evaluation on test set + import subprocess + print('Start to run bash script!') + subprocess.call("./BML_project/untar_unlabeled_PT.sh") + print('End to run bash script!') + + test_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=[],\ + read_all_labels=False) test_metrics = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes) test_kmeans_model = run_minibatch_kmeans(test_loader, n_clusters=n_classes, device=device) results['test_metrics'] = test_metrics test_gp_vs_kmeans_data, test_original_labels = stochastic_compare_kmeans_gp_predictions(test_kmeans_model, model, test_loader, n_batches=5, device=device) - - print(f"Length of original_labels: {len(original_labels)}, Length of gp_predictions: {len(gp_predictions)}") plot_comparative_results(test_gp_vs_kmeans_data, test_original_labels) # Visualization of results diff --git a/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc b/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc deleted file mode 100644 index ed2626e..0000000 Binary files a/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc and /dev/null differ diff --git a/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc b/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc deleted file mode 100644 index 46b1836..0000000 Binary files a/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc and /dev/null differ diff --git a/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc b/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc deleted file mode 100644 index f53ef75..0000000 Binary files a/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc and /dev/null differ diff --git a/BML_project/utils_gp/data_loader.py b/BML_project/utils_gp/data_loader.py index fe3bc7e..bd22a79 100644 --- a/BML_project/utils_gp/data_loader.py +++ b/BML_project/utils_gp/data_loader.py @@ -5,6 +5,8 @@ @author: lrm22005 """ import os +# For saving checkpoints +from pathlib import Path import numpy as np import pandas as pd from PIL import Image @@ -12,10 +14,28 @@ from torch.utils.data import Dataset, DataLoader from sklearn.preprocessing import StandardScaler from torchvision.transforms import ToTensor +import socket +# Downsampling image +import cv2 +# import torchvision.transforms as T +# transform for rectangular resize +img_size = 32 # Dong, 01/30/2024: this is for testing the CIFAR10 models. +# transform = T.Resize((img_size,img_size)) def split_uids(): # ====== Load the per subject arrythmia summary ====== - df_summary = pd.read_csv(r'\\grove.ad.uconn.edu\research\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv') + your_computer_name = socket.gethostname() + if your_computer_name == 'localhost.localdomain': + # Dong, 12/09/2023: I am so sick of changing the path every time on different computer. + # This is Cassey's Luis server name. 
+ df_summary = pd.read_csv(r'/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn/final_attemp_4_1_Dong_Ohm_summary_20231025.csv') + elif your_computer_name == 'Darren_computer_name': + # Darren, you can put your computer name in the elif condition to separate it from Luis's computer. + df_summary = pd.read_csv(r'R:\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv') + elif your_computer_name == 'Luis_computer_name': + df_summary = pd.read_csv(r'\\grove.ad.uconn.edu\research\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv') + else: + df_summary = pd.read_csv(r'/content/drive/MyDrive/Adjudication_UConn/final_attemp_4_1_Dong_Ohm_summary_20231025.csv') df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] @@ -88,116 +108,101 @@ def split_uids(): print(f'Clinical trial: selected {len(clinical_trial_test)} UIDs for testing {clinical_trial_test}') print(f'Clinical trial: selected {len(clinical_trial_unlabeled)} UIDs for unlabeled {clinical_trial_unlabeled}') - clinical_trial_train = [clinical_trial_train[0]] - clinical_trial_test = [clinical_trial_test[0]] - clinical_trial_unlabeled = clinical_trial_unlabeled[0:4] + # clinical_trial_train = [clinical_trial_train[0]] + # clinical_trial_test = [clinical_trial_test[0]] + # clinical_trial_unlabeled = clinical_trial_unlabeled[0:4] return clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled +def extract_segment_names_and_labels(UIDs,labels_path,read_all_labels=False): + # Extract all segment names and labels when starting the main function. + # Output: + # segment_names: list of string. + # labels: dictionary, with segment_names as key and label as value. + segment_names = [] + labels = {} + + for UID in UIDs: + label_file = os.path.join(labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + print('Debug: this file exists',label_file) + label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) + for idx, segment_name in enumerate(label_segment_names): + label_val = label_data['label'].values[idx] + if read_all_labels: + # Assign -1 if label is not in [0, 1, 2, 3] + labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2, 3] + if label_val in [0, 1, 2, 3] and segment_name not in segment_names: + segment_names.append(segment_name) + labels[segment_name] = label_val + print('>>> Number of segments in this dataloader:',len(segment_names)) # Dong, 01/29/2024: know the number of segments before running training epochs. + print('>>> Number of labels in this dataloader:',len(labels)) + return segment_names, labels + +def remove_finished_segment_names_and_labels(labels,finished_seg_names): + # From extract_segment_names_and_labels: + # Input: + # labels: dictionary, with segment_names as key and label as value. + # finished_seg_names: list of string. 
+ remain_labels = labels.copy() + print('Debug: type(remain_labels)',type(remain_labels)) + for batch in finished_seg_names: + for key in batch: + remain_labels.pop(key) + print('Debug: len(labels)',len(labels)) + print('Debug: len(remain_labels)',len(remain_labels)) + + return remain_labels + class CustomDataset(Dataset): - def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False, start_idx=0): + def __init__(self, data_path, labels_path, batch_size,segment_names,labels, standardize=True, data_format='csv', read_all_labels=False): self.data_path = data_path self.labels_path = labels_path - self.UIDs = UIDs self.standardize = standardize self.data_format = data_format self.read_all_labels = read_all_labels self.transforms = ToTensor() - self.start_idx = start_idx # Initial batch index to start from, useful for resuming training - self.refresh_dataset() - - # Initialize the current batch index to None, this could be used if you want to track batch progress within the dataset itself - self.current_batch_index = None - - def refresh_dataset(self): - self.segment_names, self.labels = self.extract_segment_names_and_labels() + self.segment_names = segment_names + self.labels = labels - def add_uids(self, new_uids): - unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] - self.UIDs.extend(unique_new_uids) - self.refresh_dataset() + # Initialize the current batch index to None + self.batch_size = batch_size def __len__(self): return len(self.segment_names) def save_checkpoint(self, checkpoint_path): - # Enhanced to automatically include 'start_idx' in the checkpoint checkpoint = { 'segment_names': self.segment_names, - 'labels': self.labels, - 'UIDs': self.UIDs, - 'start_idx': self.start_idx # Now also saving start_idx + 'labels': self.labels + # Save the current batch index if provided } torch.save(checkpoint, checkpoint_path) def load_checkpoint(self, checkpoint_path): checkpoint = torch.load(checkpoint_path) + print('Debug: loaded dataset checkpoint!',checkpoint_path) self.segment_names = checkpoint['segment_names'] self.labels = checkpoint['labels'] - self.UIDs = checkpoint['UIDs'] - # Now also loading and setting start_idx from checkpoint - self.start_idx = checkpoint.get('start_idx', 0) self.refresh_dataset() + # Load the current batch index if it exists in the checkpoint def __getitem__(self, idx): - actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed - segment_name = self.segment_names[actual_idx] + segment_name = self.segment_names[idx] label = self.labels[segment_name] - if hasattr(self, 'all_data') and actual_idx < len(self.all_data): - time_freq_tensor = self.all_data[actual_idx] + if hasattr(self, 'all_data') and idx < len(self.all_data): + time_freq_tensor = self.all_data[idx] else: time_freq_tensor = self.load_data(segment_name) - - return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} - def set_current_batch_index(self, index): - self.current_batch_index = index - - def get_current_batch_index(self): - return self.current_batch_index - - def set_start_idx(self, index): - self.start_idx = index - - def add_data_label_pair(self, data, label): - # Assign a unique ID or name for the new data - new_id = len(self.segment_names) - segment_name = f"new_data_{new_id}" - - # Append the new data and label - self.segment_names.append(segment_name) - self.labels[segment_name] = label - - # Append the new data tensor to an attribute 
that holds all the data - if hasattr(self, 'all_data'): - self.all_data.append(data) - else: - self.all_data = [data] - - def extract_segment_names_and_labels(self): - segment_names = [] - labels = {} - - for UID in self.UIDs: - label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") - if os.path.exists(label_file): - label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) - label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) - for idx, segment_name in enumerate(label_segment_names): - label_val = label_data['label'].values[idx] - if self.read_all_labels: - # Assign -1 if label is not in [0, 1, 2, 3] - labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 - if segment_name not in segment_names: - segment_names.append(segment_name) - else: - # Only add segments with labels in [0, 1, 2, 3] - if label_val in [0, 1, 2, 3] and segment_name not in segment_names: - segment_names.append(segment_name) - labels[segment_name] = label_val - - return segment_names, labels + + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name, 'idx': idx} def load_data(self, segment_name): data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0]) @@ -222,32 +227,57 @@ def load_data(self, segment_name): except Exception as e: print(f"Error processing segment: {segment_name}. Exception: {str(e)}") - return torch.zeros((1, 128, 128)) # Return zeros in case of an error + return torch.zeros((1, img_size, img_size)) # Return zeros in case of an error def standard_scaling(self, data): scaler = StandardScaler() data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) return torch.Tensor(data) -def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', read_all_labels=False, drop_last=False, num_workers=4, start_idx=0): - dataset = CustomDataset(data_path, labels_path, UIDs, standardize, data_format, read_all_labels, start_idx=start_idx) +def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='pt', read_all_labels=False, drop_last=False, num_workers=4,\ + finished_seg_names = []): + # Run the main from the beginning. Load all data into the dataloader. + segment_names, labels = extract_segment_names_and_labels(UIDs,labels_path,read_all_labels=read_all_labels) + if len(finished_seg_names) > 0: + # If any segments have been trained. 
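The new cv2 import and img_size = 32 at the top of data_loader.py suggest the loaded 128x128 time-frequency images are downsampled before use; the resize call itself is not part of this hunk, so the following is only a sketch of that step under that assumption:

import cv2
import numpy as np
import torch

tfs = np.random.rand(128, 128).astype(np.float32)            # hypothetical 128x128 TFS image
small = cv2.resize(tfs, (img_size, img_size), interpolation=cv2.INTER_AREA)   # img_size = 32 defined above
tensor = torch.from_numpy(small).unsqueeze(0)                 # shape (1, img_size, img_size), like the zero fallback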
+ remain_labels = remove_finished_segment_names_and_labels(labels,finished_seg_names) + segment_names = list(remain_labels.keys()) + labels = remain_labels.copy() + dataset = CustomDataset(data_path=data_path, \ + labels_path=labels_path, \ + standardize=standardize, \ + data_format=data_format, \ + read_all_labels=read_all_labels, \ + batch_size=batch_size, + segment_names = segment_names, + labels = labels) dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2) return dataloader -def get_data_paths(data_format, is_linux=False, is_hpc=False): - if is_linux: +def get_data_paths(data_format): + your_computer_name = socket.gethostname() + print('Debug: your_computer_name',your_computer_name) + if your_computer_name == 'localhost.localdomain': base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" saving_base_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis" - elif is_hpc: + elif your_computer_name == 'HPC_computer_name': base_path = "/gpfs/scratchfs1/kic14002/doh16101" labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" saving_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis" - else: + elif your_computer_name == 'Darren_computer_name': # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch base_path = "R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch" labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case" + else: + print('Debug: You are in Google Colab.') + base_path = '/content' + labels_base_path = '/content/drive/MyDrive/Adjudication_UConn' + saving_base_path = '/content/drive/MyDrive/Checkpoint_Colab' + # print('ERROR! YOUR DID NOT GET THE PATH.') + # raise ValueError + if data_format == 'csv': data_path = os.path.join(base_path, "TFS_csv") labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") @@ -262,16 +292,51 @@ def get_data_paths(data_format, is_linux=False, is_hpc=False): saving_path = os.path.join(saving_base_path, "Project_1_analysis") else: raise ValueError("Invalid data format. Choose 'csv' or 'png.") + + # Create the parent path for checkpoints. + Path(saving_path).mkdir(parents=True, exist_ok=True) + return data_path, labels_path, saving_path # Function to extract and preprocess data -def preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size, read_all_labels=False, current_batch_index=0): - start_idx = current_batch_index * batch_size +def preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + read_all_labels=False): + # Extracts paths and loads data into train, validation, and test loaders + data_path, labels_path, saving_path = get_data_paths(data_format) + + train_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_train, \ + batch_size = batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels,\ + finished_seg_names = finished_seg_names) + # Usually the validation set will not need to resume training. 
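The loaders built from the updated CustomDataset yield dictionaries that also carry the bookkeeping fields the trainer records; a small sketch of one batch (shapes assume the default collate and the 'pt' format):

batch = next(iter(train_loader))
print(batch['data'].shape)           # [batch_size, 1, img_size, img_size] time-frequency images
print(batch['label'].shape)          # [batch_size] integer class labels
print(batch['segment_name'][:3])     # list of segment-name strings, collected into finished_seg_names
print(batch['idx'][:3])              # tensor of dataset indices, collected into finished_idx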
+ val_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_test, \ + batch_size=batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels, \ + finished_seg_names = []) + return train_loader, val_loader, saving_path + +# Function to extract and preprocess data +def preprocess_data_test(data_format, clinical_trial_unlabeled, batch_size, finished_seg_names,\ + read_all_labels=False): + # Extracts paths and loads data into train, validation, and test loaders data_path, labels_path, saving_path = get_data_paths(data_format) - train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - return train_loader, val_loader, test_loader + test_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_unlabeled, \ + batch_size=batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels,\ + finished_seg_names=finished_seg_names) + return test_loader def map_samples_to_uids(uncertain_sample_indices, dataset): """ diff --git a/main_darren_v1-8GJQ9R3.py b/main_darren_v1-8GJQ9R3.py new file mode 100644 index 0000000..a84b9a3 --- /dev/null +++ b/main_darren_v1-8GJQ9R3.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Apr 18 12:52:53 2024 + +@author: lrmercadod +""" +import torch +import torch.nn as nn +import time +import datetime as dt +import gpytorch +from sklearn.metrics import precision_recall_fscore_support, roc_auc_score +from sklearn.preprocessing import label_binarize + +# Import my own functions and classes +from utils.pathmaster import PathMaster +from utils.dataloader import preprocess_data + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +num_latents = 6 # This should match the complexity of your data or the number of tasks +num_tasks = 4 # This should match the number of output classes or tasks +num_inducing_points = 50 # This is independent and should be sufficient for the input space + +class MultitaskGPModel(gpytorch.models.ApproximateGP): + def __init__(self): + # Let's use a different set of inducing points for each latent function + inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images + + # We have to mark the CholeskyVariationalDistribution as batch + # so that we learn a variational distribution for each task + variational_distribution = gpytorch.variational.CholeskyVariationalDistribution( + inducing_points.size(-2), batch_shape=torch.Size([num_latents]) + ) + + # We have to wrap the VariationalStrategy in a LMCVariationalStrategy + # so that the output will be a MultitaskMultivariateNormal rather than a batch output + variational_strategy = gpytorch.variational.LMCVariationalStrategy( + gpytorch.variational.VariationalStrategy( + self, inducing_points, variational_distribution, learn_inducing_locations=True + ), + num_tasks=num_tasks, + num_latents=num_latents, + latent_dim=-1 + ) + + 
super().__init__(variational_strategy) + + # The mean and covariance modules should be marked as batch + # so we learn a different set of hyperparameters + self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents])) + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])), + batch_shape=torch.Size([num_latents]) + ) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + return latent_pred + +def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, + checkpoint_path='model_checkpoint.pt', resume_training=False): + model = MultitaskGPModel().to(device) + likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=4, num_classes=4).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset)) + + start_epoch = 0 + if resume_training and os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint.get('epoch', 0) + + best_val_loss = float('inf') + epochs_no_improve = 0 + + metrics = { + 'precision': [], + 'recall': [], + 'f1_score': [], + 'auc_roc': [], + 'train_loss': [] + } + + for epoch in range(start_epoch, num_iterations): + model.train() + likelihood.train() + for train_batch in train_loader: + optimizer.zero_grad() + train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device) + train_y = train_batch['label'].to(device) + output = model(train_x) + loss = -mll(output, train_y) + metrics['train_loss'].append(loss.item()) + loss.backward() + optimizer.step() + + # Stochastic validation + model.eval() + likelihood.eval() + with torch.no_grad(): + val_indices = torch.randperm(len(val_loader.dataset))[:int(0.1 * len(val_loader.dataset))] + val_loss = 0.0 + val_labels = [] + val_predictions = [] + for idx in val_indices: + val_batch = val_loader.dataset[idx] + val_x = val_batch['data'].reshape(-1).unsqueeze(0).to(device) + val_y = torch.tensor([val_batch['label']], device=device) + val_output = model(val_x) + val_loss_batch = -mll(val_output, val_y).sum() + val_loss += val_loss_batch.item() + val_labels.append(val_y.item()) + val_predictions.append(val_output.mean.argmax(dim=-1).item()) + + precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='macro') + auc_roc = roc_auc_score(label_binarize(val_labels, classes=range(n_classes)), + label_binarize(val_predictions, classes=range(n_classes)), + multi_class='ovr') + + metrics['precision'].append(precision) + metrics['recall'].append(recall) + metrics['f1_score'].append(f1) + metrics['auc_roc'].append(auc_roc) + val_loss /= len(val_indices) + + if val_loss < best_val_loss: + best_val_loss = val_loss + epochs_no_improve = 0 + torch.save({'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'epoch': epoch}, checkpoint_path) + else: + epochs_no_improve += 1 + if epochs_no_improve >= patience: + print(f"Early stopping triggered at epoch {epoch+1}") + break + + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + 
model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + + return model, likelihood, metrics + +def evaluate_gp_model(test_loader, model, likelihood, n_classes=4): + model.eval() + likelihood.eval() + test_labels = [] + test_predictions = [] + + with torch.no_grad(): + for test_batch in test_loader: + test_x = test_batch['data'].reshape(test_batch['data'].size(0), -1).to(device) + test_y = test_batch['label'].to(device) + test_output = model(test_x) + test_labels.extend(test_y.tolist()) + test_predictions.extend(test_output.mean.argmax(dim=-1).tolist()) + + precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='macro') + auc_roc = roc_auc_score(label_binarize(test_labels, classes=range(n_classes)), + label_binarize(test_predictions, classes=range(n_classes)), + multi_class='ovr') + + metrics = { + 'precision': precision, + 'recall': recall, + 'f1_score': f1, + 'auc_roc': auc_roc + } + + return metrics + +def main(): + # Device and drives + is_linux = False + is_hpc = False + is_internal = False + is_external = True + binary = False + + # Input + is_tfs = True + + # Database + database = 'mimic3' + + # Initialize the focus + focus = 'thesis_results_database_multiclass' + + # Initialize the file tag + file_tag = 'MIMIC_III' + + # Image resolution + img_res = '128x128_float16' + + # Data type: the type to convert the data into when it is loaded in + data_type = torch.float32 + + # Model type + model_type = torch.float32 + + # Create a PathMaster object + pathmaster = PathMaster(is_linux, is_hpc, is_tfs, is_internal, is_external, focus, file_tag, img_res) + + # Image dimensions + img_channels = 1 + img_size = 128 + downsample = None + standardize = True + + # Run parameters + n_epochs = 100 + if binary: + n_classes = 2 + else: + n_classes = 3 + patience = round(n_epochs / 10) if n_epochs > 50 else 5 + save = True + + # Resume checkpoint + resume_checkpoint_path = None + + # Data loading details + data_format = 'pt' + batch_size = 256 + + # Preprocess database data + test_loader = preprocess_data(database, batch_size, standardize, img_channels, img_size, + downsample, data_type, pathmaster, binary) + + # Training and validation + start_time = time.time() + model, likelihood, metrics = train_gp_model(train_loader, val_loader, n_epochs, + n_classes, patience, save, pathmaster) + end_time = time.time() + time_passed = end_time - start_time + print('\nTraining and validation took %.2f minutes' % (time_passed / 60)) + + # Evaluation + start_time = time.time() + test_metrics = evaluate_gp_model(test_loader, model, likelihood, n_classes) + end_time = time.time() + time_passed = end_time - start_time + print('\nTesting took %.2f seconds' % time_passed) + + print('Test Metrics:') + print('Precision: %.4f' % test_metrics['precision']) + print('Recall: %.4f' % test_metrics['recall']) + print('F1 Score: %.4f' % test_metrics['f1_score']) + print('AUC-ROC: %.4f' % test_metrics['auc_roc']) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/main_darren_v1.py b/main_darren_v1.py new file mode 100644 index 0000000..29ec642 --- /dev/null +++ b/main_darren_v1.py @@ -0,0 +1,265 @@ +import os +import torch +import gpytorch +from sklearn.metrics import precision_recall_fscore_support, roc_auc_score +from sklearn.preprocessing import label_binarize +from torch.utils.data import Dataset, DataLoader +import numpy as np 
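evaluate_gp_model above computes AUC-ROC by one-hot encoding both the labels and the hard argmax predictions; a self-contained sketch of that pattern with hypothetical values:

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

y_true = [0, 2, 1, 3, 0, 1]          # hypothetical labels
y_pred = [0, 2, 1, 1, 0, 2]          # hypothetical argmax predictions
auc = roc_auc_score(label_binarize(y_true, classes=range(4)),
                    label_binarize(y_pred, classes=range(4)), multi_class='ovr')
print(round(auc, 3))

Because the scores are binarized hard predictions rather than class probabilities, the resulting AUC is coarse; that matches how both main_darren scripts compute it.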
+import random +import time + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +num_latents = 6 # This should match the complexity of your data or the number of tasks +num_tasks = 4 # This should match the number of output classes or tasks +num_inducing_points = 50 # This is independent and should be sufficient for the input space + +class MultitaskGPModel(gpytorch.models.ApproximateGP): + def __init__(self): + # Let's use a different set of inducing points for each latent function + inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images + + # We have to mark the CholeskyVariationalDistribution as batch + # so that we learn a variational distribution for each task + variational_distribution = gpytorch.variational.CholeskyVariationalDistribution( + inducing_points.size(-2), batch_shape=torch.Size([num_latents]) + ) + + # We have to wrap the VariationalStrategy in a LMCVariationalStrategy + # so that the output will be a MultitaskMultivariateNormal rather than a batch output + variational_strategy = gpytorch.variational.LMCVariationalStrategy( + gpytorch.variational.VariationalStrategy( + self, inducing_points, variational_distribution, learn_inducing_locations=True + ), + num_tasks=num_tasks, + num_latents=num_latents, + latent_dim=-1 + ) + + super().__init__(variational_strategy) + + # The mean and covariance modules should be marked as batch + # so we learn a different set of hyperparameters + self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents])) + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])), + batch_shape=torch.Size([num_latents]) + ) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + return latent_pred + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, binary=False): + self.data_path = data_path + self.labels_path = labels_path + self.binary = binary + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + def __len__(self): + return len(self.segment_names) + + def __getitem__(self, idx): + segment_name = self.segment_names[idx] + label = self.labels[segment_name] + data_tensor = torch.load(os.path.join(self.data_path, segment_name + '.pt')) + return {'data': data_tensor, 'label': label, 'segment_name': segment_name} + + def extract_segment_names_and_labels(self): + segment_names = [] + labels = {} + + with open(self.labels_path, 'r') as file: + lines = file.readlines() + for line in lines[1:]: # Skip the header line + segment_name, label = line.strip().split(',') + label = int(float(label)) # Convert the label to float first, then to int + if self.binary and label == 2: + label = 0 # Convert PAC/PVC to non-AF (0) for binary classification + segment_names.append(segment_name) + labels[segment_name] = label + + return segment_names, labels + +def load_data(data_path, labels_path, batch_size, binary=False): + dataset = CustomDataset(data_path, labels_path, binary) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) + return dataloader + +def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, + checkpoint_path='model_checkpoint.pt', resume_training=False): + model = MultitaskGPModel().to(device) + likelihood = 
gpytorch.likelihoods.SoftmaxLikelihood(num_features=num_tasks, num_classes=n_classes).to(device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
+    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset))
+
+    start_epoch = 0
+    if resume_training and os.path.exists(checkpoint_path):
+        checkpoint = torch.load(checkpoint_path)
+        model.load_state_dict(checkpoint['model_state_dict'])
+        likelihood.load_state_dict(checkpoint['likelihood_state_dict'])
+        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        start_epoch = checkpoint.get('epoch', 0)
+
+    best_val_loss = float('inf')
+    epochs_no_improve = 0
+
+    metrics = {
+        'precision': [],
+        'recall': [],
+        'f1_score': [],
+        'auc_roc': [],
+        'train_loss': []
+    }
+
+    for epoch in range(start_epoch, num_iterations):
+        model.train()
+        likelihood.train()
+        for train_batch in train_loader:
+            optimizer.zero_grad()
+            train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device)
+            train_y = train_batch['label'].to(device)
+            output = model(train_x)
+            loss = -mll(output, train_y)
+            metrics['train_loss'].append(loss.item())
+            loss.backward()
+            optimizer.step()
+
+        # Stochastic validation
+        model.eval()
+        likelihood.eval()
+        with torch.no_grad():
+            val_indices = torch.randperm(len(val_loader.dataset))[:int(0.1 * len(val_loader.dataset))]
+            val_loss = 0.0
+            val_labels = []
+            val_predictions = []
+            for idx in val_indices:
+                val_batch = val_loader.dataset[idx]
+                val_x = val_batch['data'].reshape(-1).unsqueeze(0).to(device)
+                val_y = torch.tensor([val_batch['label']], device=device)
+                val_output = model(val_x)
+                val_loss_batch = -mll(val_output, val_y).sum()
+                val_loss += val_loss_batch.item()
+                val_labels.append(val_y.item())
+                val_predictions.append(val_output.mean.argmax(dim=-1).item())
+
+            precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='macro')
+            auc_roc = roc_auc_score(label_binarize(val_labels, classes=range(n_classes)),
+                                    label_binarize(val_predictions, classes=range(n_classes)),
+                                    multi_class='ovr')
+
+            metrics['precision'].append(precision)
+            metrics['recall'].append(recall)
+            metrics['f1_score'].append(f1)
+            metrics['auc_roc'].append(auc_roc)
+            val_loss /= len(val_indices)
+
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+            epochs_no_improve = 0
+            torch.save({'model_state_dict': model.state_dict(),
+                        'likelihood_state_dict': likelihood.state_dict(),
+                        'optimizer_state_dict': optimizer.state_dict(),
+                        'epoch': epoch}, checkpoint_path)
+        else:
+            epochs_no_improve += 1
+            if epochs_no_improve >= patience:
+                print(f"Early stopping triggered at epoch {epoch+1}")
+                break
+
+    if os.path.exists(checkpoint_path):
+        checkpoint = torch.load(checkpoint_path)
+        model.load_state_dict(checkpoint['model_state_dict'])
+        likelihood.load_state_dict(checkpoint['likelihood_state_dict'])
+        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+
+    return model, likelihood, metrics
+
+def evaluate_gp_model(test_loader, model, likelihood, n_classes=4):
+    model.eval()
+    likelihood.eval()
+    test_labels = []
+    test_predictions = []
+
+    with torch.no_grad():
+        for test_batch in test_loader:
+            test_x = test_batch['data'].reshape(test_batch['data'].size(0), -1).to(device)
+            test_y = test_batch['label'].to(device)
+            test_output = model(test_x)
+            test_labels.extend(test_y.tolist())
+            test_predictions.extend(test_output.mean.argmax(dim=-1).tolist())
+
+    precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='macro')
+    auc_roc = roc_auc_score(label_binarize(test_labels, classes=range(n_classes)),
+                            label_binarize(test_predictions, classes=range(n_classes)),
+                            multi_class='ovr')
+
+    metrics = {
+        'precision': precision,
+        'recall': recall,
+        'f1_score': f1,
+        'auc_roc': auc_roc
+    }
+
+    return metrics
+
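A note on the AUC-ROC values computed above: both the stochastic validation in train_gp_model and evaluate_gp_model binarize hard argmax predictions before calling roc_auc_score, so the score reflects discrete decisions rather than how well the model ranks classes by confidence. If a probability-based multiclass AUC is preferred, one option is to softmax per-class scores (or pass the model output through the SoftmaxLikelihood to obtain class probabilities) and hand those to roc_auc_score directly. A small self-contained sketch with made-up data; in this file the latent mean has num_tasks columns, so it would either need to match the number of classes or be mapped through the likelihood first:

    import numpy as np
    import torch
    from sklearn.metrics import roc_auc_score

    n_classes = 3
    y_true = np.array([0, 1, 2, 1, 0])                 # illustrative labels, every class present
    scores = torch.randn(len(y_true), n_classes)       # stand-in for per-class latent scores
    probs = torch.softmax(scores, dim=-1).numpy()      # rows sum to 1, shape (n_samples, n_classes)
    auc = roc_auc_score(y_true, probs, multi_class='ovr', labels=list(range(n_classes)))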
+def main():
+    # Paths
+    base_path = r"\\grove.ad.uconn.edu\\research\\ENGR_Chon\Darren\\NIH_Pulsewatch"
+    smote_type = 'Cassey5k_SMOTE'
+    split = 'holdout_60_10_30'
+    data_path_train = os.path.join(base_path, "TFS_pt", smote_type, split, "train")
+    data_path_val = os.path.join(base_path, "TFS_pt", smote_type, split, "validate")
+    data_path_test = os.path.join(base_path, "TFS_pt", smote_type, split, "test")
+    labels_path_train = os.path.join(base_path, "TFS_pt", smote_type, split, "Cassey5k_SMOTE_train_names_labels.csv")
+    labels_path_val = os.path.join(base_path, "TFS_pt", smote_type, split, "Cassey5k_SMOTE_validate_names_labels.csv")
+    labels_path_test = os.path.join(base_path, "TFS_pt", smote_type, split, "Cassey5k_SMOTE_test_names_labels.csv")
+
+    # Parameters
+    binary = False
+    n_epochs = 100
+    if binary:
+        n_classes = 2
+    else:
+        n_classes = 3
+    patience = round(n_epochs / 10) if n_epochs > 50 else 5
+    save = True
+    resume_checkpoint_path = None
+    batch_size = 256
+
+    # Data loading
+    train_loader = load_data(data_path_train, labels_path_train, batch_size, binary)
+    val_loader = load_data(data_path_val, labels_path_val, batch_size, binary)
+    test_loader = load_data(data_path_test, labels_path_test, batch_size, binary)
+
+    # Training and validation
+    start_time = time.time()
+    model, likelihood, metrics = train_gp_model(train_loader, val_loader, n_epochs,
+                                                n_classes, patience)
+    end_time = time.time()
+    time_passed = end_time - start_time
+    print('\nTraining and validation took %.2f minutes' % (time_passed / 60))
+
+    # Evaluation
+    start_time = time.time()
+    test_metrics = evaluate_gp_model(test_loader, model, likelihood, n_classes)
+    end_time = time.time()
+    time_passed = end_time - start_time
+    print('\nTesting took %.2f seconds' % time_passed)
+
+    print('Test Metrics:')
+    print('Precision: %.4f' % test_metrics['precision'])
+    print('Recall: %.4f' % test_metrics['recall'])
+    print('F1 Score: %.4f' % test_metrics['f1_score'])
+    print('AUC-ROC: %.4f' % test_metrics['auc_roc'])
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/transfer_data/tar_PT_files.sh b/transfer_data/tar_PT_files.sh
new file mode 100644
index 0000000..600666d
--- /dev/null
+++ b/transfer_data/tar_PT_files.sh
@@ -0,0 +1,19 @@
+source_path="/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch/PT_format"
+source_path_all="/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch/PT_format/*"
+dest_path="/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch/tar_PT_format"
+if [ ! -d $dest_path ]
+then
+    mkdir -p $dest_path
+    echo "Root dest_path does not exist, just created it: $dest_path"
+fi
+
+for d in $source_path_all; do
+    sub_d=${d##*/}
+    echo "$sub_d"
+    dest_tar="${dest_path}/${sub_d}.tar"
+    if [ !
-f $dest_tar ] + then + tar -C $source_path -cvf $dest_tar $sub_d + echo "Finished archive: $dest_tar" + fi +done; \ No newline at end of file diff --git a/transfer_data/tar_PT_files_single_UID.sh b/transfer_data/tar_PT_files_single_UID.sh new file mode 100644 index 0000000..dc3f3c0 --- /dev/null +++ b/transfer_data/tar_PT_files_single_UID.sh @@ -0,0 +1,5 @@ +source_path="/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch/PT_format" +dest_path="/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch/tar_PT_format" +sub_d="120" +dest_tar="${dest_path}/${sub_d}.tar" +tar -C $source_path -cvf $dest_tar $sub_d \ No newline at end of file diff --git a/utils/__pycache__/dataloader.cpython-310.pyc b/utils/__pycache__/dataloader.cpython-310.pyc new file mode 100644 index 0000000..ae5efbe Binary files /dev/null and b/utils/__pycache__/dataloader.cpython-310.pyc differ diff --git a/utils/__pycache__/dataloader.cpython-311.pyc b/utils/__pycache__/dataloader.cpython-311.pyc new file mode 100644 index 0000000..063e8f7 Binary files /dev/null and b/utils/__pycache__/dataloader.cpython-311.pyc differ diff --git a/utils/__pycache__/dataloader.cpython-312.pyc b/utils/__pycache__/dataloader.cpython-312.pyc new file mode 100644 index 0000000..af61a83 Binary files /dev/null and b/utils/__pycache__/dataloader.cpython-312.pyc differ diff --git a/utils/__pycache__/dataloader.cpython-39.pyc b/utils/__pycache__/dataloader.cpython-39.pyc new file mode 100644 index 0000000..1149806 Binary files /dev/null and b/utils/__pycache__/dataloader.cpython-39.pyc differ diff --git a/utils/__pycache__/dataloader_batch.cpython-310.pyc b/utils/__pycache__/dataloader_batch.cpython-310.pyc new file mode 100644 index 0000000..6b49db6 Binary files /dev/null and b/utils/__pycache__/dataloader_batch.cpython-310.pyc differ diff --git a/utils/__pycache__/dataloader_database.cpython-310.pyc b/utils/__pycache__/dataloader_database.cpython-310.pyc new file mode 100644 index 0000000..4ccea8f Binary files /dev/null and b/utils/__pycache__/dataloader_database.cpython-310.pyc differ diff --git a/utils/__pycache__/dataloader_smote.cpython-310.pyc b/utils/__pycache__/dataloader_smote.cpython-310.pyc new file mode 100644 index 0000000..8070ab1 Binary files /dev/null and b/utils/__pycache__/dataloader_smote.cpython-310.pyc differ diff --git a/utils/__pycache__/get_paths.cpython-310.pyc b/utils/__pycache__/get_paths.cpython-310.pyc new file mode 100644 index 0000000..18b2bb2 Binary files /dev/null and b/utils/__pycache__/get_paths.cpython-310.pyc differ diff --git a/utils/__pycache__/get_paths.cpython-311.pyc b/utils/__pycache__/get_paths.cpython-311.pyc new file mode 100644 index 0000000..6e0bdbe Binary files /dev/null and b/utils/__pycache__/get_paths.cpython-311.pyc differ diff --git a/utils/__pycache__/misc_func.cpython-310.pyc b/utils/__pycache__/misc_func.cpython-310.pyc new file mode 100644 index 0000000..5959146 Binary files /dev/null and b/utils/__pycache__/misc_func.cpython-310.pyc differ diff --git a/utils/__pycache__/model_func.cpython-310.pyc b/utils/__pycache__/model_func.cpython-310.pyc new file mode 100644 index 0000000..94b284b Binary files /dev/null and b/utils/__pycache__/model_func.cpython-310.pyc differ diff --git a/utils/__pycache__/model_func.cpython-311.pyc b/utils/__pycache__/model_func.cpython-311.pyc new file mode 100644 index 0000000..f798459 Binary files /dev/null and b/utils/__pycache__/model_func.cpython-311.pyc differ diff --git a/utils/__pycache__/model_func.cpython-312.pyc 
b/utils/__pycache__/model_func.cpython-312.pyc new file mode 100644 index 0000000..8a31f82 Binary files /dev/null and b/utils/__pycache__/model_func.cpython-312.pyc differ diff --git a/utils/__pycache__/model_func_batch.cpython-310.pyc b/utils/__pycache__/model_func_batch.cpython-310.pyc new file mode 100644 index 0000000..3b0039f Binary files /dev/null and b/utils/__pycache__/model_func_batch.cpython-310.pyc differ diff --git a/utils/__pycache__/pathmaster.cpython-310.pyc b/utils/__pycache__/pathmaster.cpython-310.pyc new file mode 100644 index 0000000..5411969 Binary files /dev/null and b/utils/__pycache__/pathmaster.cpython-310.pyc differ diff --git a/utils/__pycache__/pathmaster.cpython-312.pyc b/utils/__pycache__/pathmaster.cpython-312.pyc new file mode 100644 index 0000000..e70d54b Binary files /dev/null and b/utils/__pycache__/pathmaster.cpython-312.pyc differ diff --git a/utils/__pycache__/pathmaster.cpython-39.pyc b/utils/__pycache__/pathmaster.cpython-39.pyc new file mode 100644 index 0000000..d0ab4b5 Binary files /dev/null and b/utils/__pycache__/pathmaster.cpython-39.pyc differ diff --git a/utils/__pycache__/plot_save_func.cpython-310.pyc b/utils/__pycache__/plot_save_func.cpython-310.pyc new file mode 100644 index 0000000..902fe16 Binary files /dev/null and b/utils/__pycache__/plot_save_func.cpython-310.pyc differ diff --git a/utils/__pycache__/plot_save_func.cpython-311.pyc b/utils/__pycache__/plot_save_func.cpython-311.pyc new file mode 100644 index 0000000..5de7e02 Binary files /dev/null and b/utils/__pycache__/plot_save_func.cpython-311.pyc differ diff --git a/utils/__pycache__/plot_save_func.cpython-312.pyc b/utils/__pycache__/plot_save_func.cpython-312.pyc new file mode 100644 index 0000000..a7005b4 Binary files /dev/null and b/utils/__pycache__/plot_save_func.cpython-312.pyc differ diff --git a/utils/__pycache__/plot_save_func.cpython-39.pyc b/utils/__pycache__/plot_save_func.cpython-39.pyc new file mode 100644 index 0000000..35f1877 Binary files /dev/null and b/utils/__pycache__/plot_save_func.cpython-39.pyc differ diff --git a/utils/__pycache__/train_func.cpython-310.pyc b/utils/__pycache__/train_func.cpython-310.pyc new file mode 100644 index 0000000..c11ce94 Binary files /dev/null and b/utils/__pycache__/train_func.cpython-310.pyc differ diff --git a/utils/__pycache__/train_func.cpython-311.pyc b/utils/__pycache__/train_func.cpython-311.pyc new file mode 100644 index 0000000..8790f6f Binary files /dev/null and b/utils/__pycache__/train_func.cpython-311.pyc differ diff --git a/utils/dataloader.py b/utils/dataloader.py new file mode 100644 index 0000000..4a382e7 --- /dev/null +++ b/utils/dataloader.py @@ -0,0 +1,895 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 26 18:29:59 2024 + +@author: dchen +""" +import os +import numpy as np +import pandas as pd +from PIL import Image +import torch +from torch.utils.data import Dataset, DataLoader +from sklearn.preprocessing import StandardScaler +from torchvision.transforms import ToTensor +import math +from numpy import random +from numpy.random import choice +import cv2 +from pyarrow import csv + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + + +def split_uids_60_10_30(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = 
table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. + print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = ['011', '014', '030', '037', '044', '050', '055', '058', '074', '083', '091', '098', '101', '106', '109', '119'] + uid_nsr_val = ['041', '056', '325'] + uid_nsr_test = ['003', '012', '020', '024', '027', '035', '036', '047'] + + uid_af_train = ['017', '301', '302', '305', '306', '318', '319', '320', '321', '322', '324', '329', '402', '405', '406', '407', '416', '420', '421'] + uid_af_val = ['400', '409', '422'] + uid_af_test = ['307', '310', '311', '312', '410', '413', '414', '415', '423'] + + uid_pacpvc_train = ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = ['045', '054', '112'] + uid_pacpvc_test = ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + # Limit data set size to reduce computational load for optimization + test_set = test_set + + return train_set, val_set, test_set + + +def split_uids_60_10_30_smote(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, 
AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. + print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = ['003', '020', '024', '041', '044', '047', '049', '050', '058', '063', '077', '084', '088', '091', '098', '099', '106', '109', '111', '118', '325'] + uid_nsr_val = ['014', '030', '036', '074'] + uid_nsr_test = ['011', '012', '027', '035', '037', '055', '056', '057', '083', '094', '101', '119'] + + uid_af_train = ['017', '302', '306', '307', '310', '311', '319', '321', '324', '400', '402', '405', '406', '407', '409', '410', '415', '420', '421'] + uid_af_val = ['416', '422', '423'] + uid_af_test = ['301', '305', '312', '318', '320', '322', '329', '413', '414'] + + uid_pacpvc_train = ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = ['045', '054', '112'] + uid_pacpvc_test = ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return train_set, val_set, test_set + + +def split_uids_60_10_30_noPACPVC(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = ['003', '020', '024', '041', '044', '047', '049', '050', '058', '063', '077', '084', '088', '091', '098', '099', '106', '109', '111', '118', '325'] + uid_nsr_val = ['014', '030', '036', '074'] + uid_nsr_test = ['011', '012', '027', '035', '037', '055', '056', '057', '083', '094', '101', '119'] + + uid_af_train = ['017', '302', '306', '307', '310', '311', '319', '321', '324', '400', '402', '405', '406', '407', '409', '410', '415', '420', '421'] + uid_af_val = ['416', '422', '423'] + uid_af_test = ['301', '305', '312', '318', '320', '322', '329', '413', '414'] + + uid_pacpvc_train = [] # ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = [] # ['045', '054', '112'] + uid_pacpvc_test = [] # ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return train_set, val_set, test_set + + +def split_uids_60_10_30_noNSR(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = [] # ['003', '020', '024', '041', '044', '047', '049', '050', '058', '063', '077', '084', '088', '091', '098', '099', '106', '109', '111', '118', '325'] + uid_nsr_val = [] # ['014', '030', '036', '074'] + uid_nsr_test = [] # ['011', '012', '027', '035', '037', '055', '056', '057', '083', '094', '101', '119'] + + uid_af_train = ['017', '302', '306', '307', '310', '311', '319', '321', '324', '400', '402', '405', '406', '407', '409', '410', '415', '420', '421'] + uid_af_val = ['416', '422', '423'] + uid_af_test = ['301', '305', '312', '318', '320', '322', '329', '413', '414'] + + uid_pacpvc_train = ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = ['045', '054', '112'] + uid_pacpvc_test = ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return train_set, val_set, test_set + + +def split_uids_60_10_30_balanced(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = ['041', '044', '047', '050', '058', '063', '091', '098', '106', '111', '325'] + uid_nsr_val = ['014', '030', '036', '074'] + uid_nsr_test = ['011', '012', '027', '035', '037', '055', '056', '057', '083', '094', '101', '119'] + + uid_af_train = ['017', '302', '306', '307', '310', '311', '319', '321', '324', '400', '402', '407', '409', '415', '420', '421'] + uid_af_val = ['416', '422', '423'] + uid_af_test = ['301', '305', '312', '318', '320', '322', '329', '413', '414'] + + uid_pacpvc_train = ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = ['045', '054', '112'] + uid_pacpvc_test = ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return train_set, val_set, test_set + + +def split_uids_2fold_60_40(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_pacpvc_fold1 = ['007', '022', '028', '038', '054', '068', '075', '086', '087', '093', '120', '327'] + uid_pacpvc_fold2 = ['002', '005', '013', '021', '026', '029', '045', '073', '089', '100', '112', '408'] + uid_pacpvc_test = ['039', '042', '052', '053', '064', '069', '070', '078', '080', '090', '104', '110', '113', '419'] + + uid_af_fold1 = ['305', '307', '311', '318', '320', '322', '405', '415', '423'] + uid_af_fold2 = ['301', '319', '321', '324', '329', '400', '406', '409', '416'] + uid_af_test = ['017', '302', '306', '310', '312', '402', '407', '410', '413', '414', '420', '421', '422'] + + uid_nsr_fold1 = ['011', '014', '041', '050', '056', '058', '083', '106', '109'] + uid_nsr_fold2 = ['037', '047', '055', '074', '091', '098', '101', '119', '325'] + uid_nsr_test = ['003', '012', '020', '024', '027', '030', '035', '036', '044', '049', '057', '063', '077', '084', '088', '094', '099', '111', '118'] + + # Total UID counts + total_uid_pacpvc = uid_pacpvc_fold1 + uid_pacpvc_fold2 + uid_pacpvc_test + total_uid_af = uid_af_fold1 + uid_af_fold2 + uid_af_test + total_uid_nsr = uid_nsr_fold1 + uid_nsr_fold2 + uid_nsr_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + cross_val_fold1 = uid_nsr_fold1 + uid_af_fold1 + uid_pacpvc_fold1 + cross_val_fold2 = uid_nsr_fold2 + uid_af_fold2 + uid_pacpvc_fold2 + test = uid_nsr_test + uid_af_test + uid_pacpvc_test + + # # Limit data set size to reduce computational load for optimization + # cross_val_fold1 = uid_nsr_fold1[:2] + uid_af_fold1[:2] + uid_pacpvc_fold1[:2] + # cross_val_fold2 = uid_nsr_fold2[:2] + uid_af_fold2[:2] + uid_pacpvc_fold2[:2] + # test = uid_nsr_test[:2] + uid_af_test[:2] + uid_pacpvc_test[:2] + + return cross_val_fold1, cross_val_fold2, test + + +def split_uids_2fold_60_40_smote(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_fold1 = ['020', '030', '037', '041', '058', '077', '084', '106', '109', '118', '325'] + uid_nsr_fold2 = ['003', '014', '036', '044', '047', '049', '063', '083', '088', '091', '099'] + uid_nsr_test = ['011', '012', '024', '027', '035', '050', '055', '056', '057', '074', '094', '098', '101', '111', '119'] + + uid_af_fold1 = ['302', '306', '307', '402', '405', '415', '420', '421', '422'] + uid_af_fold2 = ['310', '321', '324', '406', '407', '409', '414', '416', '423'] + uid_af_test = ['017', '301', '305', '311', '312', '318', '319', '320', '322', '329', '400', '410', '413'] + + uid_pacpvc_fold1 = ['007', '022', '028', '038', '054', '068', '075', '086', '087', '093', '120', '327'] + uid_pacpvc_fold2 = ['002', '005', '013', '021', '026', '029', '045', '073', '089', '100', '112', '408'] + uid_pacpvc_test = ['039', '042', '052', '053', '064', '069', '070', '078', '080', '090', '104', '110', '113', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_fold1 + uid_nsr_fold2 + uid_nsr_test + total_uid_af = uid_af_fold1 + uid_af_fold2 + uid_af_test + total_uid_pacpvc = uid_pacpvc_fold1 + uid_pacpvc_fold2 + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + cross_val_fold1 = uid_nsr_fold1 + uid_af_fold1 + uid_pacpvc_fold1 + cross_val_fold2 = uid_nsr_fold2 + uid_af_fold2 + uid_pacpvc_fold2 + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return cross_val_fold1, cross_val_fold2, test_set + + +def split_uids(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + all_UIDs = df_summary['UID'].unique() + + # ==================================================== + # ====== AF trial separation ====== + # R:\ENGR_Chon\Dong\Numbers\Pulsewatch_numbers\Fahimeh_CNNED_general_ExpertSystemwApplication\tbl_file_name\TrainingSet_final_segments + AF_trial_Fahimeh_train = ['402','410'] + AF_trial_Fahimeh_test = ['301', '302', '305', '306', '307', '310', '311', + '312', '318', '319', '320', '321', '322', '324', + '325', '327', '329', '400', '406', '407', '409', + '414'] + AF_trial_Fahimeh_did_not_use = ['405', '413', '415', '416', '420', '421', '422', '423'] + AF_trial_paroxysmal_AF = ['408','419'] + + AF_trial_train = AF_trial_Fahimeh_train + AF_trial_test 
= AF_trial_Fahimeh_test + AF_trial_unlabeled = AF_trial_Fahimeh_did_not_use + AF_trial_paroxysmal_AF + print(f'AF trial: {len(AF_trial_train)} training subjects {AF_trial_train}') + print(f'AF trial: {len(AF_trial_test)} testing subjects {AF_trial_test}') + print(f'AF trial: {len(AF_trial_unlabeled)} unlabeled subjects {AF_trial_unlabeled}') + + # ================================= + # === Clinical trial AF subjects separation === + clinical_trial_AF_subjects = ['005', '017', '026', '051', '075', '082'] + + # Filter out AF trial and 0-segment UIDs + remaining_UIDs = [] + count_NSR = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + this_NSR = row['sample_nonAF'] + if math.isnan(row['sample_nonAF_ratio']): # sample_nonAF is never NaN, sample_nonAF_ratio may be NaN + # There is no segment in this subject, skip this UID. + print(f'---------UID {UID} has no segments.------------') + continue # If a UID has no segments, skip the rest of the for loop for this index, row + if UID not in AF_trial_train and UID not in AF_trial_test and UID not in clinical_trial_AF_subjects \ + and UID[0] != '3' and UID[0] != '4': + remaining_UIDs.append(UID) + count_NSR.append(this_NSR) + + # From the candidate UIDs, select a subset to be used for training, validation, and testing + random.seed(seed=42) + + list_of_candidates = remaining_UIDs + number_of_items_to_pick = round(len(list_of_candidates) * 0.25) # 15% labeled for training, 10% for testing. + sum_NSR = sum(count_NSR) + + # probability_distribution = [x/sum_NSR for x in count_NSR] # Proportion of total NSR segments for each UID + probability_distribution = [(1-x/sum_NSR)/ (len(count_NSR)-1) for x in count_NSR] # Subjects with fewer segments have higher chance to be selected. + draw = choice(list_of_candidates, number_of_items_to_pick, + p=probability_distribution, replace=False) + + # Ensures that training set contains both AF and non-AF + clinical_trial_train_nonAF = list(draw[:round(len(list_of_candidates) * 0.12)]) # Draws the first X number of candidates equal to 7% of the total list of candidates + clinical_trial_train_temp = clinical_trial_train_nonAF + clinical_trial_AF_subjects[:round(len(clinical_trial_AF_subjects)/2)] + clinical_trial_train = [] + + for UID in clinical_trial_train_temp: + # UID 051 and 108 and maybe other UIDs had no segments (unknown reason). + if UID in all_UIDs: + clinical_trial_train.append(UID) # Only use the UIDs that are in the summary to test + + # Ensures that the testing set contains both AF and non-AF + clinical_trial_test_nonAF = list(draw[round(len(list_of_candidates) * 0.12):]) # Draws the remaining candidates + clinical_trial_test_temp = clinical_trial_test_nonAF + clinical_trial_AF_subjects[round(len(clinical_trial_AF_subjects)/2):] + clinical_trial_test = [] + for UID in clinical_trial_test_temp: + # UID 051 and 108 and maybe other UIDs had no segments (unknown reason). + if UID in all_UIDs: + clinical_trial_test.append(UID) # Only use the UIDs that are in the summary to test + + # Uses all remaining subset of UIDs from original list not used in training or validating for testing + clinical_trial_unlabeled = [] + for UID in remaining_UIDs: # Changed from all_UIDs to remove UIDs with 0 segments (i.e. 
UID 108) + if UID not in clinical_trial_train and UID not in clinical_trial_test and UID[0] != '3' and UID[0] != '4': + clinical_trial_unlabeled.append(UID) + + # Sum up to 74 UIDs, all of the ones that do not start with '3' or '4' and dropping UID 108 which has 0 segments + print(f'Clinical trial: selected {len(clinical_trial_train)} UIDs for training {clinical_trial_train}') # Contains both non-AF and AF clinical trial subjects + print(f'Clinical trial: selected {len(clinical_trial_test)} UIDs for testing {clinical_trial_test}') # Contains both non-AF and AF clinical trial subjects + print(f'Clinical trial: selected {len(clinical_trial_unlabeled)} UIDs for unlabeled {clinical_trial_unlabeled}') # All remaining clinical trial subjects...probably contains both AF and non-AF + + # Used to make sure the model runs correctly + clinical_trial_train = ['063','416','005'] # Training + clinical_trial_test = ['058','409','054'] # Evaluation + clinical_trial_unlabeled = ['029','036','421'] # Testing + + return clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled + + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False, + start_idx=0, img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True, binary=False): + self.data_path = data_path + self.labels_path = labels_path + self.UIDs = UIDs + self.standardize = standardize + self.data_format = data_format + self.read_all_labels = read_all_labels + self.transforms = ToTensor() + self.start_idx = start_idx # Initial batch index to start from, useful for resuming training + self.img_channels = img_channels + self.img_size = img_size + self.downsample = downsample + self.is_tfs = is_tfs + self.binary = binary + + # Must be manually set so that the image resolution chosen is the one that is returned + self.dtype = data_type + + self.refresh_dataset() + + def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + def add_uids(self, new_uids): + unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] # Appends any unqiue new UID in self.UIDs to unique_new_uids + self.UIDs.extend(unique_new_uids) # Appends unique_new_uids to UIDs + self.refresh_dataset() + + def __len__(self): # Method is implicitly called when len() is used on an instance of CustomDataset + return len(self.segment_names) + + def save_checkpoint(self, checkpoint_path): # Likely not worth using, simply use the save_checkpoint() function in train_func.py + # Enhanced to automatically include 'start_idx' in the checkpoint + checkpoint = { + 'segment_names': self.segment_names, + 'labels': self.labels, + 'UIDs': self.UIDs, + 'start_idx': self.start_idx # Now also saving start_idx + } + torch.save(checkpoint, checkpoint_path) # Using standard Python methods like pickle or json is generally recommended for dictionaries, there are no benefits for using torch.save, no real harm either + + def load_checkpoint(self, checkpoint_path): # Reloads where you started off last time (not where you ended), just use analogous function in train_func.py + checkpoint = torch.load(checkpoint_path) + self.segment_names = checkpoint['segment_names'] # Seems redundant since it is overwritten by refresh_dataset() + self.labels = checkpoint['labels'] # Seems redundant since it is overwritten by refresh_dataset() + self.UIDs = checkpoint['UIDs'] + # Now also loading and setting start_idx from checkpoint + self.start_idx = 
checkpoint.get('start_idx', 0) # Returns 0 if no start_idx found + self.refresh_dataset() + + def __getitem__(self, idx): # Method is implicitly called when getitem() is used on an instance of CustomDataset. It is called batch_size number of times per iteration of dataloader | Loads segments as needed (lazy loading) + actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed (i.e. index falls out of bounds) + segment_name = self.segment_names[actual_idx] + label = self.labels[segment_name] + + if hasattr(self, 'all_data') and actual_idx < len(self.all_data): # When Luis uses adds data to train_loader in main_checkpoints.py, + # new data is added (creating all_data) only after train_loader is created with its original training data. This means that if self.all_data + # exists, then __getitem__ is only be called in order to retrieve data newly added to train_loader in all_data + time_freq_tensor = self.all_data[actual_idx] + else: + time_freq_tensor = self.load_data(segment_name) + + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} + + # When iterating over the dataloader, which returns batches of data, each batch will contain a dictionary with keys corresponding to the data and labels. + + # Since the dataloader's dataset's __getitem__ method returns a dictionary with keys 'data', 'label', and 'segment_name', the returned batch will be a dictionary where: + + # The 'data' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the data. + # The 'label' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the labels. + # The 'segment_name' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the segment_name. + + def set_start_idx(self, index): + self.start_idx = index + + def add_data_label_pair(self, data, label): + # Assign a unique ID or name for the new data + new_id = len(self.segment_names) + segment_name = f"new_data_{new_id}" + + # Append the new data and label + self.segment_names.append(segment_name) + self.labels[segment_name] = label + + # Append the new data tensor to an attribute that holds all of the newly added data + if hasattr(self, 'all_data'): + self.all_data.append(data) + else: + self.all_data = [data] + + # def extract_segment_names_and_labels(self): + # segment_names = [] + # labels = {} + + # for UID in self.UIDs: + # label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") + # if os.path.exists(label_file): + # # label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) # Replaces the original headers with names + + # # Use PyArrow to read csv + # parse_options = csv.ParseOptions(delimiter=',') # Indicate delimiter + # read_options = csv.ReadOptions(column_names=['segment', 'label'], skip_rows=1) # Assign desired column names and skip the first row (headers) + # label_data = csv.read_csv(label_file, parse_options=parse_options, read_options=read_options) + # label_data = label_data.to_pandas() + + # label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) # Splits each segment name by '.' 
and retrieves the first part + # for idx, segment_name in enumerate(label_segment_names): # enumerate() returns the value and corresponding index of each element in an iterable + # label_val = label_data['label'].values[idx] + # # Will only use NSR (0), AF (1), and PAC/PVC(2) and not SVT (3) + # if self.read_all_labels: # If reading all labels, set all labels not 0, 1, or 2 to -1 and return all labels + # # Assign -1 if label is not in [0, 1, 2] + # labels[segment_name] = label_val if label_val in [0, 1, 2] else -1 + # if segment_name not in segment_names: + # segment_names.append(segment_name) + # else: + # # Only add segments with labels in [0, 1, 2] + # if label_val in [0, 1, 2] and segment_name not in segment_names: + # segment_names.append(segment_name) + # labels[segment_name] = label_val # Extracts the labels of the segments retrieved into a dictionary + + # # # Since shuffle=False for the dataloader in preprocess_data(), this is my work around for that while allowing for checkpointing + # # random.seed(seed=42) + # # random.shuffle(segment_names) # Will not affect the labels since the labels are in a dictionary + + # return segment_names, labels + + + def extract_segment_names_and_labels(self): # Only extract the segments and labels of a particular class, temporary solution + segment_names = [] + labels = {} + + # If a subject is not loading and there are no errors, just these lists + uid_nsr = ['011', '014', '041', '050', '056', '058', '083', '106', '109', + '037', '047', '055', '074', '091', '098', '101', '119', '325', + '003', '012', '020', '024', '027', '030', '035', '036', '044', '049', '057', '063', '077', '084', '088', '094', '099', '111', '118'] + uid_af = ['305', '307', '311', '318', '320', '322', '405', '415', '423', + '301', '319', '321', '324', '329', '400', '406', '409', '416', + '017', '302', '306', '310', '312', '402', '407', '410', '413', '414', '420', '421', '422'] + uid_pacpvc = ['007', '022', '028', '038', '054', '068', '075', '086', '087', '093', '120', '327', + '002', '005', '013', '021', '026', '029', '045', '073', '089', '100', '112', '408', + '039', '042', '052', '053', '064', '069', '070', '078', '080', '090', '104', '110', '113', '419'] + + for UID in self.UIDs: + label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + # label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) # Replaces the original headers with names + + # Use PyArrow to read csv + parse_options = csv.ParseOptions(delimiter=',') # Indicate delimiter + read_options = csv.ReadOptions(column_names=['segment', 'label'], skip_rows=1) # Assign desired column names and skip the first row (headers) + label_data = csv.read_csv(label_file, parse_options=parse_options, read_options=read_options) + label_data = label_data.to_pandas() + + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) # Splits each segment name by '.' 
and retrieves the first part + for idx, segment_name in enumerate(label_segment_names): # enumerate() returns the value and corresponding index of each element in an iterable + label_val = label_data['label'].values[idx] + # Will only use NSR (0), AF (1), and PAC/PVC(2) and not SVT (3) + if self.read_all_labels: # If reading all labels, set all labels not 0, 1, or 2 to -1 and return all labels + # Assign -1 if label is not in [0, 1, 2] + labels[segment_name] = label_val if label_val in [0, 1, 2] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2] + if label_val in [0, 1, 2] and segment_name not in segment_names: + # Temporary solution to ensure only segments of a particular class are loaded for each UID + if UID in uid_nsr and label_val == 0: + segment_names.append(segment_name) + labels[segment_name] = label_val + elif UID in uid_af and label_val == 1: + segment_names.append(segment_name) + labels[segment_name] = label_val + elif UID in uid_pacpvc and label_val == 2: + segment_names.append(segment_name) + if self.binary: + labels[segment_name] = 0 + else: + labels[segment_name] = label_val + + return segment_names, labels + + + def load_data(self, segment_name): + data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0]) + if self.is_tfs: + seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.' + self.data_format) + else: + seg_path = os.path.join(data_path_UID, segment_name + '_density_poincare.' + self.data_format) + + + try: # Allows to define a block of code to be executed and specify how to handle any errors that might occur during its execution + if self.data_format == 'csv' and seg_path.endswith('.csv'): + # time_freq_plot = np.array(pd.read_csv(seg_path, header=None)) + + # Use PyArrow to read csv + read_options = csv.ReadOptions(autogenerate_column_names=True) + seg_data = csv.read_csv(seg_path, read_options=read_options) + time_freq_plot = seg_data.to_pandas().to_numpy() + + time_freq_tensor = torch.tensor(time_freq_plot).reshape(self.img_channels, self.img_size, self.img_size) + elif self.data_format == 'png' and seg_path.endswith('.png'): + img = Image.open(seg_path) + img_data = np.array(img) + time_freq_tensor = torch.tensor(img_data).unsqueeze(0) + elif self.data_format == 'pt' and seg_path.endswith('.pt'): + time_freq_tensor = torch.load(seg_path) + else: + raise ValueError("Unsupported file format") + + if self.downsample is not None: + # Downsample the image + # Use OpenCV to resize the array to downsample x downsample using INTER_AREA interpolation + time_freq_array = cv2.resize(np.array(time_freq_tensor.reshape(self.img_size, self.img_size).to('cpu')), (self.downsample, self.downsample), interpolation=cv2.INTER_AREA) + time_freq_tensor = torch.tensor(time_freq_array, dtype=self.dtype).reshape(self.img_channels, self.downsample, self.downsample) + else: + time_freq_tensor = time_freq_tensor.reshape(self.img_channels, self.img_size, self.img_size).to(self.dtype) + + if self.standardize: + time_freq_tensor = self.standard_scaling(time_freq_tensor) # Standardize the data + + return time_freq_tensor + + except Exception as e: + print(f"Error processing segment: {segment_name}. 
Exception: {str(e)}") + if self.downsample is not None: + return torch.zeros((self.img_channels, self.downsample, self.downsample)) # Return zeros in case of an error + else: + return torch.zeros((self.img_channels, self.img_size, self.img_size)) # Return zeros in case of an error + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) # Converts data into 2D array, standardizes it, reshapes it back into 3D (1,X,X) + return torch.tensor(data, dtype=self.dtype) + +def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', + read_all_labels=False, drop_last=False, num_workers=4, start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True, binary=False): + torch.manual_seed(42) + g = torch.Generator() + g.manual_seed(42) + + pin_memory = False + if torch.cuda.is_available(): + pin_memory = True + + dataset = CustomDataset(data_path, labels_path, UIDs, standardize, data_format, read_all_labels, start_idx=start_idx, + img_channels=img_channels, img_size=img_size, downsample=downsample, data_type=data_type, is_tfs=is_tfs, binary=binary) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2, persistent_workers=True, pin_memory=pin_memory, worker_init_fn=seed_worker, generator=g) # Prefetches 2 batches ahead of current training iteration (allows loading of data simultaneously with training). Shuffle is set to False to resume training at a specific batch. + return dataloader + +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + +# Function to extract and preprocess data +def preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size, standardize=False, + read_all_labels=False, img_channels=1, img_size=128, downsample=None, data_type=torch.float32, pathmaster=None, binary=False): + start_idx = 0 + data_path, labels_path = pathmaster.data_paths(data_format) + + if data_format == 'csv': + num_workers = 6 + elif data_format == 'pt': + num_workers = 8 + + train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=standardize, + data_format=data_format, read_all_labels=read_all_labels, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs, binary=binary) + val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=standardize, + data_format=data_format, read_all_labels=read_all_labels, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs, binary=binary) + test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=standardize, + data_format=data_format, read_all_labels=read_all_labels, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs, binary=binary) + return train_loader, val_loader, test_loader + +def map_samples_to_uids(uncertain_sample_indices, dataset): + """ + Maps indices of uncertain samples back to their corresponding segment 
names or UIDs. + + Args: + - uncertain_sample_indices: Indices of the uncertain samples in the dataset. + - dataset: The dataset object which contains the mapping of segment names and UIDs. + + Returns: + - List of UIDs or segment names corresponding to the uncertain samples. + """ + return [dataset.segment_names[i] for i in uncertain_sample_indices] + +def update_train_loader_with_labeled_samples(current_train_loader, labeled_samples, batch_size): # Luis' doesn't seem to use this + """ + Updates the training DataLoader with newly labeled samples. + + Args: + - current_train_loader: The current DataLoader for the training set. + - labeled_samples: A list of tuples, each containing a data tensor and its new label. + - batch_size: Batch size for the DataLoader. + + Returns: + - DataLoader: The updated DataLoader with the new labeled samples. + """ + + # Extract the current dataset from the DataLoader + current_dataset = current_train_loader.dataset + + # Update the dataset with new samples and labels + for data_tensor, label in labeled_samples: + # Assuming the CustomDataset class has a method to add new data and labels + current_dataset.add_data_label_pair(data_tensor, label) + + # Create a new DataLoader with the updated dataset + updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=True, drop_last=False, num_workers=4, prefetch_factor=2) + + return updated_train_loader + +def update_train_loader_with_uncertain_samples(current_train_loader, new_sample_indices, batch_size): # Luis' uses this method for active learning + # Extract current UIDs from the current_train_loader + current_dataset = current_train_loader.dataset + # Map new_samples back to their corresponding segment names or UIDs + new_uids = map_samples_to_uids(new_sample_indices, current_dataset) + # Add new UIDs to the current dataset and refresh it + current_dataset.add_uids(new_uids) + # Create new DataLoader with the updated dataset + updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=False) + return updated_train_loader + + \ No newline at end of file diff --git a/utils/dataloader_database.py b/utils/dataloader_database.py new file mode 100644 index 0000000..c3ab6b1 --- /dev/null +++ b/utils/dataloader_database.py @@ -0,0 +1,223 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 26 18:29:59 2024 + +@author: dchen +""" +import os +import numpy as np +import pandas as pd +from PIL import Image +import torch +from torch.utils.data import Dataset, DataLoader +from sklearn.preprocessing import StandardScaler +from torchvision.transforms import ToTensor +import math +from numpy import random +from numpy.random import choice +import cv2 +from pyarrow import csv + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, standardize=True, data_format='pt', start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True, binary=False): + self.data_path = data_path + self.labels_path = labels_path + self.standardize = standardize + self.data_format = data_format + self.transforms = ToTensor() + self.start_idx = start_idx # Initial batch index to start from, useful for resuming training + self.img_channels = img_channels + self.img_size = img_size + self.downsample = downsample + self.is_tfs = is_tfs + self.dtype = data_type + self.binary = binary + + self.refresh_dataset() + + + 
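A note on the batch structure this database dataset produces: because __getitem__ (below) returns a dictionary per segment, a DataLoader built on this class yields dictionary batches. An illustrative sketch of the shapes, assuming the defaults above (single-channel 128x128 .pt segments; variable names are hypothetical):

    # loader = DataLoader(CustomDataset(data_path, labels_path, data_format='pt'), batch_size=4)
    # batch = next(iter(loader))
    # batch['data'].shape    -> torch.Size([4, 1, 128, 128])
    # batch['label'].shape   -> torch.Size([4])
    # batch['segment_name']  -> list of 4 segment-name strings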
def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + + def __len__(self): # Method is implicitly called when len() is used on an instance of CustomDataset + return len(self.segment_names) + + + def __getitem__(self, idx): # Method is implicitly called when getitem() is used on an instance of CustomDataset. It is called batch_size number of times per iteration of dataloader | Loads segments as needed (lazy loading) + actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed (i.e. index falls out of bounds) + segment_name = self.segment_names[actual_idx] + label = self.labels[segment_name] + + data_tensor = self.load_data(segment_name) + + return {'data': data_tensor, 'label': label, 'segment_name': segment_name} + + # When iterating over the dataloader, which returns batches of data, each batch will contain a dictionary with keys corresponding to the data and labels. + + # Since the dataloader's dataset's __getitem__ method returns a dictionary with keys 'data', 'label', and 'segment_name', the returned batch will be a dictionary where: + + # The 'data' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the data. + # The 'label' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the labels. + # The 'segment_name' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the segment_name. + + def set_start_idx(self, index): + self.start_idx = index + + + def extract_segment_names_and_labels(self): # Only extract the segments and labels of a particular class, temporary solution + segment_names = [] + labels = {} + label_file = self.labels_path + if os.path.exists(label_file): + # Use PyArrow to read csv + parse_options = csv.ParseOptions(delimiter=',') # Indicate delimiter + read_options = csv.ReadOptions(column_names=['segment_names', 'labels'], skip_rows=1) # Assign desired column names and skip the first row (headers) + label_data = csv.read_csv(label_file, parse_options=parse_options, read_options=read_options) + label_data = label_data.to_pandas() + + label_segment_names = label_data['segment_names'] + for idx, segment_name in enumerate(label_segment_names): # enumerate() returns the value and corresponding index of each element in an iterable + label_val = label_data['labels'].values[idx] + + if self.binary and label_val == 2: # If binary is true, set all PAC/PVC to 0 (non-AF) + label_val = 0 + + segment_names.append(segment_name) + labels[segment_name] = label_val + + return segment_names, labels + + + def second_to_last_directory_name(self, path): + # Normalize path separator to '/' + path = path.replace('\\', '/') + + # Split the path into its components + components = path.split('/') + + # Remove empty components + components = [c for c in components if c] + + # Check if the path ends with a separator (indicating it's a directory) + if path.endswith('/'): + # Remove the last empty component + components.pop() + + # If there's only one or zero directories in the path, return None + if len(components) <= 1: + return None + + # Return the name of the second-to-last directory + return components[-2] + + + def load_data(self, segment_name): + seg_path = os.path.join(self.data_path, segment_name + '.' 
+ self.data_format) + + try: # Allows to define a block of code to be executed and specify how to handle any errors that might occur during its execution + if self.data_format == 'csv' and seg_path.endswith('.csv'): + # data_plot = np.array(pd.read_csv(seg_path, header=None)) + + # Use PyArrow to read csv + read_options = csv.ReadOptions(autogenerate_column_names=True) + seg_data = csv.read_csv(seg_path, read_options=read_options) + data_plot = seg_data.to_pandas().to_numpy() + + data_tensor = torch.tensor(data_plot).reshape(self.img_channels, self.img_size, self.img_size) + elif self.data_format == 'png' and seg_path.endswith('.png'): + img = Image.open(seg_path) + img_data = np.array(img) + data_tensor = torch.tensor(img_data).unsqueeze(0) + elif self.data_format == 'pt' and seg_path.endswith('.pt'): + data_tensor = torch.load(seg_path) + else: + raise ValueError("Unsupported file format") + + if self.downsample is not None: + # Downsample the image + # Use OpenCV to resize the array to downsample x downsample using INTER_AREA interpolation + data_array = cv2.resize(np.array(data_tensor.reshape(self.img_size, self.img_size).to('cpu')), (self.downsample, self.downsample), interpolation=cv2.INTER_AREA) + data_tensor = torch.tensor(data_array, dtype=self.dtype).reshape(self.img_channels, self.downsample, self.downsample) + else: + data_tensor = data_tensor.reshape(self.img_channels, self.img_size, self.img_size).to(self.dtype) + + if self.standardize: + data_tensor = self.standard_scaling(data_tensor) # Standardize the data + + return data_tensor + + except Exception as e: + print(f"Error processing segment: {segment_name}. Exception: {str(e)}") + if self.downsample is not None: + return torch.zeros((self.img_channels, self.downsample, self.downsample)) # Return zeros in case of an error + else: + return torch.zeros((self.img_channels, self.img_size, self.img_size)) # Return zeros in case of an error + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) # Converts data into 2D array, standardizes it, reshapes it back into 3D (1,X,X) + return torch.tensor(data, dtype=self.dtype) + +def load_data_split_batched(data_path, labels_path, batch_size, standardize=False, data_format='csv', + drop_last=False, num_workers=4, start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float16, is_tfs=True, binary=False): + torch.manual_seed(42) + g = torch.Generator() + g.manual_seed(42) + + pin_memory = False + if torch.cuda.is_available(): + pin_memory = True + + dataset = CustomDataset(data_path, labels_path, standardize, data_format, start_idx=start_idx, + img_channels=img_channels, img_size=img_size, downsample=downsample, data_type=data_type, is_tfs=is_tfs, binary=binary) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2, persistent_workers=True, pin_memory=pin_memory, worker_init_fn=seed_worker, generator=g) # Prefetches 2 batches ahead of current training iteration (allows loading of data simultaneously with training). Shuffle is set to False to resume training at a specific batch. 
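For reference, the downsampling branch of load_data above shrinks a single-channel time-frequency image with OpenCV's area interpolation and then restores the (channels, H, W) layout. A standalone sketch under assumed sizes (128 down to 64; both values and the random input are placeholders):

import cv2
import numpy as np
import torch

tfs = torch.rand(1, 128, 128)                                    # stand-in for a loaded segment tensor
arr = np.array(tfs.reshape(128, 128).to('cpu'))                  # drop the channel dim for OpenCV
small = cv2.resize(arr, (64, 64), interpolation=cv2.INTER_AREA)  # INTER_AREA is the usual choice when shrinking
tfs_small = torch.tensor(small, dtype=torch.float32).reshape(1, 64, 64)
print(tfs_small.shape)                                           # torch.Size([1, 64, 64])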
+ return dataloader + +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + +# Function to extract and preprocess data +def preprocess_data(database, batch_size, standardize=False, img_channels=1, img_size=128, + downsample=None, data_type=torch.float32, pathmaster=None, binary=False): + start_idx = 0 + + if database == 'DeepBeat' or database == 'deepbeat' or database == 'Deepbeat': + data_path, labels_path = pathmaster.deepbeat_paths() + elif database == 'MIMICIII' or database == 'mimiciii' or database == 'mimicIII' or database == 'mimic3': + data_path, labels_path = pathmaster.mimic3_paths() + elif database == 'Simband' or database == 'simband': + data_path, labels_path = pathmaster.simband_paths() + else: + print('Invalid Database') + + data_format = 'pt' + + num_workers = 1 + + test_loader = load_data_split_batched(data_path, labels_path, batch_size, standardize=standardize, + data_format=data_format, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs, binary=binary) + # loader2 = load_data_split_batched(data_path, labels_path, batch_size, standardize=standardize, + # data_format=data_format, num_workers=num_workers, + # start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + # data_type=data_type, is_tfs=pathmaster.is_tfs, binary=False) + # loader3 = load_data_split_batched(data_path, labels_path, batch_size, standardize=standardize, + # data_format=data_format, num_workers=num_workers, + # start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + # data_type=data_type, is_tfs=pathmaster.is_tfs, binary=False) + return test_loader # loader1, loader2, loader3 + + \ No newline at end of file diff --git a/utils/dataloader_smote.py b/utils/dataloader_smote.py new file mode 100644 index 0000000..9266028 --- /dev/null +++ b/utils/dataloader_smote.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 26 18:29:59 2024 + +@author: dchen +""" +import os +import numpy as np +import pandas as pd +from PIL import Image +import torch +from torch.utils.data import Dataset, DataLoader +from sklearn.preprocessing import StandardScaler +from torchvision.transforms import ToTensor +import math +from numpy import random +from numpy.random import choice +import cv2 +from pyarrow import csv + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + +class CustomDataset(Dataset): + def __init__(self, smote_path, groups, standardize=True, data_format='pt', start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True): + self.smote_path = smote_path + self.standardize = standardize + self.data_format = data_format + self.transforms = ToTensor() + self.start_idx = start_idx # Initial batch index to start from, useful for resuming training + self.img_channels = img_channels + self.img_size = img_size + self.downsample = downsample + self.is_tfs = is_tfs + self.groups = groups + self.dtype = data_type + + self.refresh_dataset() + + + def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + + def __len__(self): # Method is implicitly called when len() is used on an instance of CustomDataset + return len(self.segment_names) + + + def __getitem__(self, idx): # Method is 
implicitly called when getitem() is used on an instance of CustomDataset. It is called batch_size number of times per iteration of dataloader | Loads segments as needed (lazy loading) + actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed (i.e. index falls out of bounds) + segment_name = self.segment_names[actual_idx] + label = self.labels[segment_name] + + data_tensor = self.load_data(segment_name) + + return {'data': data_tensor, 'label': label, 'segment_name': segment_name} + + # When iterating over the dataloader, which returns batches of data, each batch will contain a dictionary with keys corresponding to the data and labels. + + # Since the dataloader's dataset's __getitem__ method returns a dictionary with keys 'data', 'label', and 'segment_name', the returned batch will be a dictionary where: + + # The 'data' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the data. + # The 'label' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the labels. + # The 'segment_name' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the segment_name. + + def set_start_idx(self, index): + self.start_idx = index + + + def extract_segment_names_and_labels(self): # Only extract the segments and labels of a particular class, temporary solution + segment_names = [] + labels = {} + + group_directories = [entry for entry in os.listdir(self.smote_path) if os.path.isdir(os.path.join(self.smote_path, entry))] + group = list(set(self.groups).intersection(set(group_directories)))[0] + + smote_type = self.second_to_last_directory_name(self.smote_path) + label_file = os.path.join(self.smote_path, smote_type + '_' + group + '_names_labels.csv') + if os.path.exists(label_file): + # Use PyArrow to read csv + parse_options = csv.ParseOptions(delimiter=',') # Indicate delimiter + read_options = csv.ReadOptions(column_names=['segment_name', 'label'], skip_rows=1) # Assign desired column names and skip the first row (headers) + label_data = csv.read_csv(label_file, parse_options=parse_options, read_options=read_options) + label_data = label_data.to_pandas() + + label_segment_names = label_data['segment_name'] + for idx, segment_name in enumerate(label_segment_names): # enumerate() returns the value and corresponding index of each element in an iterable + label_val = label_data['label'].values[idx] + segment_names.append(segment_name) + labels[segment_name] = label_val + + return segment_names, labels + + + def second_to_last_directory_name(self, path): + # Normalize path separator to '/' + path = path.replace('\\', '/') + + # Split the path into its components + components = path.split('/') + + # Remove empty components + components = [c for c in components if c] + + # Check if the path ends with a separator (indicating it's a directory) + if path.endswith('/'): + # Remove the last empty component + components.pop() + + # If there's only one or zero directories in the path, return None + if len(components) <= 1: + return None + + # Return the name of the second-to-last directory + return components[-2] + + + def load_data(self, segment_name): + data_path_group = os.path.join(self.smote_path, segment_name.split('_')[1]) + seg_path = os.path.join(data_path_group, segment_name + '.' 
+ self.data_format) + + try: # Allows to define a block of code to be executed and specify how to handle any errors that might occur during its execution + if self.data_format == 'csv' and seg_path.endswith('.csv'): + # data_plot = np.array(pd.read_csv(seg_path, header=None)) + + # Use PyArrow to read csv + read_options = csv.ReadOptions(autogenerate_column_names=True) + seg_data = csv.read_csv(seg_path, read_options=read_options) + data_plot = seg_data.to_pandas().to_numpy() + + data_tensor = torch.tensor(data_plot).reshape(self.img_channels, self.img_size, self.img_size) + elif self.data_format == 'png' and seg_path.endswith('.png'): + img = Image.open(seg_path) + img_data = np.array(img) + data_tensor = torch.tensor(img_data).unsqueeze(0) + elif self.data_format == 'pt' and seg_path.endswith('.pt'): + data_tensor = torch.load(seg_path) + else: + raise ValueError("Unsupported file format") + + if self.downsample is not None: + # Downsample the image + # Use OpenCV to resize the array to downsample x downsample using INTER_AREA interpolation + data_array = cv2.resize(np.array(data_tensor.reshape(self.img_size, self.img_size).to('cpu')), (self.downsample, self.downsample), interpolation=cv2.INTER_AREA) + data_tensor = torch.tensor(data_array, dtype=self.dtype).reshape(self.img_channels, self.downsample, self.downsample) + else: + data_tensor = data_tensor.reshape(self.img_channels, self.img_size, self.img_size).to(self.dtype) + + if self.standardize: + data_tensor = self.standard_scaling(data_tensor) # Standardize the data + + return data_tensor + + except Exception as e: + print(f"Error processing segment: {segment_name}. Exception: {str(e)}") + if self.downsample is not None: + return torch.zeros((self.img_channels, self.downsample, self.downsample)) # Return zeros in case of an error + else: + return torch.zeros((self.img_channels, self.img_size, self.img_size)) # Return zeros in case of an error + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) # Converts data into 2D array, standardizes it, reshapes it back into 3D (1,X,X) + return torch.tensor(data, dtype=self.dtype) + +def load_data_split_batched(smote_path, groups, batch_size, standardize=False, data_format='csv', + drop_last=False, num_workers=4, start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True): + torch.manual_seed(42) + g = torch.Generator() + g.manual_seed(42) + + pin_memory = False + if torch.cuda.is_available(): + pin_memory = True + + dataset = CustomDataset(smote_path, groups, standardize, data_format, start_idx=start_idx, + img_channels=img_channels, img_size=img_size, downsample=downsample, data_type=data_type, is_tfs=is_tfs) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2, persistent_workers=True, pin_memory=pin_memory, worker_init_fn=seed_worker, generator=g) # Prefetches 2 batches ahead of current training iteration (allows loading of data simultaneously with training). Shuffle is set to False to resume training at a specific batch. 
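The standard_scaling helper above flattens each (1, H, W) segment into a 2-D array so that StandardScaler normalizes every column of that single image to zero mean and unit variance, then restores the original shape. A minimal sketch assuming a 128 x 128 segment (the random input is a placeholder):

import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

x = torch.rand(1, 128, 128)                           # one unscaled segment
flat = x.reshape(-1, x.shape[-1]).numpy()             # (128, 128): rows x columns
scaled = StandardScaler().fit_transform(flat)         # per-column zero mean, unit variance
x_std = torch.tensor(scaled.reshape(x.shape), dtype=torch.float32)
print(x_std.mean(dim=1))                              # approximately 0 for every column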
+ return dataloader + +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + +# Function to extract and preprocess data +def preprocess_data(smote_type, split, batch_size, standardize=False, img_channels=1, img_size=128, + downsample=None, data_type=torch.float32, pathmaster=None): + start_idx = 0 + smote_path = pathmaster.smote_path(smote_type, split) + data_format = 'pt' + + num_workers = 8 + + loader1 = load_data_split_batched(smote_path, ['fold1', 'train'], batch_size, standardize=standardize, + data_format=data_format, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs) + loader2 = load_data_split_batched(smote_path, ['fold2', 'validate'], batch_size, standardize=standardize, + data_format=data_format, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs) + loader3 = load_data_split_batched(smote_path, ['test', 'test'], batch_size, standardize=standardize, + data_format=data_format, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs) + return loader1, loader2, loader3 + + \ No newline at end of file diff --git a/utils/get_paths.py b/utils/get_paths.py new file mode 100644 index 0000000..b22752e --- /dev/null +++ b/utils/get_paths.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 27 14:55:43 2024 + +@author: dchen +""" +import os + +def data_paths(data_format, is_linux=False, is_hpc=False): + if is_linux: + base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" + labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + saving_base_path = "/mnt/r/ENGR_Chon/Darren/Honors_Thesis/saves/analysis" + elif is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + saving_base_path = "/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/saves/analysis" + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = r"R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch" # Why double \\ before NIH_Pulsewatch_Database? + labels_base_path = r"R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" # Why double \\ before NIH_Pulsewatch_Database? + saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\saves" # Only when writing to file in the R drive do we need the entire address for the R drive + if data_format == 'csv': + data_path = os.path.join(base_path, "TFS_csv") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "analysis") + elif data_format == 'png': + data_path = os.path.join(base_path, "TFS_plots") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "analysis") + elif data_format == 'pt': + data_path = os.path.join(base_path, "PT_format") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "analysis") + else: + raise ValueError("Invalid data format. 
Choose 'csv', 'png, or 'pt'.") + + return data_path, labels_path, saving_path + + +def models_path(is_linux=False, is_hpc=False): + if is_linux: + models_path = "/mnt/r/ENGR_Chon/Darren/Honors_Thesis/models" + elif is_hpc: + models_path = "/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/models" + else: + models_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\models" + + return models_path + +# Base saving paths +focus = 'misc' +# focus = '2_layers_per_block' +# focus = '2_layers_per_block' +linux_saves_path = '/mnt/r/ENGR_Chon/Darren/Honors_Thesis/saves/' + focus + '/' +hpc_saves_path = '/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/saves/' + focus + '/' +saves_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\saves' + '\\' + focus + '\\' + +def losslists_path(is_linux=False, is_hpc=False): + if is_linux: + losslists_path = linux_saves_path + 'losslists' + elif is_hpc: + losslists_path = hpc_saves_path + 'losslists' + else: + losslists_path = saves_path + 'losslists' + + return losslists_path + + +def runtime_lists_path(is_linux=False, is_hpc=False): + if is_linux: + runtime_lists_path = linux_saves_path + 'runtime_lists' + elif is_hpc: + runtime_lists_path = hpc_saves_path + 'runtime_lists' + else: + runtime_lists_path = saves_path + 'runtime_lists' + + return runtime_lists_path + + +def predictions_path(is_linux=False, is_hpc=False): + if is_linux: + predictions_path = linux_saves_path + 'predictions' + elif is_hpc: + predictions_path = hpc_saves_path + 'predictions' + else: + predictions_path = saves_path + 'predictions' + + return predictions_path + +def prediction_proba_path(is_linux=False, is_hpc=False): + if is_linux: + prediction_proba_path = linux_saves_path + 'prediction_proba' + elif is_hpc: + prediction_proba_path = hpc_saves_path + 'prediction_proba' + else: + prediction_proba_path = saves_path + 'prediction_proba' + + return prediction_proba_path + + +def metrics_path(is_linux=False, is_hpc=False): + if is_linux: + metrics_path = linux_saves_path + 'metrics' + elif is_hpc: + metrics_path = hpc_saves_path + 'metrics' + else: + metrics_path = saves_path + 'metrics' + + return metrics_path + + +def confusion_matrices_path(is_linux=False, is_hpc=False): + if is_linux: + confusion_matrices_path = linux_saves_path + 'confusion_matrices' + elif is_hpc: + confusion_matrices_path = hpc_saves_path + 'confusion_matrices' + else: + confusion_matrices_path = saves_path + 'confusion_matrices' + + return confusion_matrices_path + + +def checkpoints_path(is_linux=False, is_hpc=False): + if is_linux: + checkpoints_path = linux_saves_path + 'checkpoints' + elif is_hpc: + checkpoints_path = hpc_saves_path + 'checkpoints' + else: + checkpoints_path = saves_path + 'checkpoints' + + return checkpoints_path + +def hyperparameters_path(is_linux=False, is_hpc=False): + if is_linux: + hyperparameters_path = linux_saves_path + 'hyperparameters' + elif is_hpc: + hyperparameters_path = hpc_saves_path + 'hyperparameters' + else: + hyperparameters_path = saves_path + 'hyperparameters' + + return hyperparameters_path + +def loss_curves_path(is_linux=False, is_hpc=False): + if is_linux: + loss_curves_path = linux_saves_path + 'loss_curves' + elif is_hpc: + loss_curves_path = hpc_saves_path + 'loss_curves' + else: + loss_curves_path = saves_path + 'loss_curves' + + return loss_curves_path + + diff --git a/utils/misc_func.py b/utils/misc_func.py new file mode 100644 index 0000000..6893a71 --- /dev/null +++ b/utils/misc_func.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 
-*- +""" +Created on Sun Mar 3 03:56:36 2024 + +@author: dchen +""" + +def substring_between_strings(main_string, start_string, end_string): + start_index = main_string.find(start_string) + if start_index == -1: + return None + + end_index = main_string.find(end_string, start_index + len(start_string)) + if end_index == -1: + return None + + return main_string[start_index + len(start_string):end_index] + + +def string_to_boolean(input_string): + if input_string.lower() in ['true', 't', 'yes', 'y', '1']: + return True + elif input_string.lower() in ['false', 'f', 'no', 'n', '0']: + return False + else: + raise ValueError("String does not represent a boolean value") diff --git a/utils/model_func.py b/utils/model_func.py new file mode 100644 index 0000000..95f19de --- /dev/null +++ b/utils/model_func.py @@ -0,0 +1,2145 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 26 14:58:20 2024 + +@author: dchen +""" + +import os +import sys +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from tqdm import tqdm +import random +import time +import torch.autograd as autograd +from torch.cuda.amp import autocast, GradScaler + +# Import my own functions and classes +# from utils import get_paths +from utils import plot_save_func +from models.densenet import DenseNet3 as DenseNet +from models.densenet_configurable import DenseNet as DenseNet_config + +# If GPU is available, use GPU, else use CPU +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + + +def cross_val_2fold_DenseNet(model_hyperparameters, fold1_loader, fold2_loader, model_type=torch.float32, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0001 + + # Define img_channels + img_channels = 1 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + model_fold1, optimizer_fold1, scheduler_fold1, model_fold2, optimizer_fold2, scheduler_fold2, epoch, loss = load_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, pathmaster) + start_epoch = epoch + 1 + best_loss_cross_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck 
= model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + best_loss_cross_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_cross_val = [] + + losslist_train_fold1 = [] + losslist_val_fold1 = [] + + losslist_train_fold2 = [] + losslist_val_fold2 = [] + + # Initialize runtime list + runtime_list = [] + + # Cross-validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Cross-Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + sys.stdout.flush() + + # Fold 1 training =============================================================================================================================================================== + model_fold1.train() + train_cum_loss_fold1 = 0 + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model_fold1(X_train) + + # Regularization (if applicable) + l1 = 0 + for p in model_fold1.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) + lambda_l1 * l1 + train_cum_loss_fold1 += batch_loss_train.item() + + # Clear gradients + optimizer_fold1.zero_grad() + + # Backwards pass + batch_loss_train.backward() + + # Optimizer step + optimizer_fold1.step() + + # Update scheduler + scheduler_fold1.step() + + loss_train_fold1 = train_cum_loss_fold1 / len(fold1_loader) + + sys.stderr.flush() + print('\nTraining for Fold #1 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 1 validation 
============================================================================================================================================================= + model_fold1.eval() + with torch.no_grad(): + val_cum_loss_fold1 = 0 + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + logits, _, _ = model_fold1(X_val) + val_cum_loss_fold1 += criterion_val(logits.float(), Y_val.long()).item() + + loss_val_fold1 = val_cum_loss_fold1 / len(fold2_loader) + + sys.stderr.flush() + print('\nValidation for Fold #1 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 2 training =============================================================================================================================================================== + model_fold2.train() + train_cum_loss_fold2 = 0 + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model_fold2(X_train) + + # Regularization (if applicable) + l1 = 0 + for p in model_fold2.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) + lambda_l1 * l1 + train_cum_loss_fold2 += batch_loss_train.item() + + # Clear gradients + optimizer_fold2.zero_grad() + + # Backwards pass + batch_loss_train.backward() + + # Optimizer step + optimizer_fold2.step() + + # Update scheduler + scheduler_fold2.step() + + loss_train_fold2 = train_cum_loss_fold2 / len(fold2_loader) + + sys.stderr.flush() + print('\nTraining for Fold #2 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 2 validation ============================================================================================================================================================= + model_fold2.eval() + with torch.no_grad(): + val_cum_loss_fold2 = 0 + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + logits, _, _ = model_fold2(X_val) + val_cum_loss_fold2 += criterion_val(logits.float(), Y_val.long()).item() + + loss_val_fold2 = val_cum_loss_fold2 / len(fold1_loader) + + sys.stderr.flush() + print('\nValidation for Fold #2 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + # =============================================================================================================================================================================== + + # Calculate epoch losses + epoch_loss_train = (loss_train_fold1 + loss_train_fold2) / 2 + epoch_loss_cross_val = (loss_val_fold1 + loss_val_fold2) / 2 + + # Append to losslists + losslist_train.append(epoch_loss_train) + losslist_cross_val.append(epoch_loss_cross_val) + + losslist_train_fold1.append(loss_train_fold1) + 
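Both folds above add an L1 penalty over all model parameters to the weighted cross-entropy loss. A self-contained sketch of that composite loss on a toy linear classifier, with lambda_l1 = 0.01 as in the training code (the model, batch, and shapes here are placeholders, not the project's DenseNet):

import torch
import torch.nn as nn

model = nn.Linear(16, 3)                                       # toy stand-in for the DenseNet
criterion = nn.CrossEntropyLoss()
x, y = torch.randn(8, 16), torch.randint(0, 3, (8,))
lambda_l1 = 0.01

logits = model(x)
l1 = sum(p.abs().sum() for p in model.parameters())            # L1 norm of every parameter tensor
loss = criterion(logits, y) + lambda_l1 * l1                   # cross-entropy plus L1 penalty
loss.backward()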
losslist_val_fold1.append(loss_val_fold1) + + losslist_train_fold2.append(loss_train_fold2) + losslist_val_fold2.append(loss_val_fold2) + + # Return the best cross-validation loss and save best checkpoint (epoch) + best_loss_cross_val = save_best_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, epoch_loss_cross_val, best_loss_cross_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Cross-Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_cross_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_cross_val): + break + + # Saving + if save: + title = 'Training and Cross-Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_cross_val, title, save, pathmaster) + + plot_save_func.save_losslists_2fold(losslist_train_fold1, losslist_val_fold1, losslist_train_fold2, losslist_val_fold2, losslist_train, losslist_cross_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def cross_val_2fold_DenseNet_mixed(model_hyperparameters, fold1_loader, fold2_loader, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0005 + + # Define img_channels + img_channels = 1 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + model_fold1, optimizer_fold1, scheduler_fold1, model_fold2, optimizer_fold2, scheduler_fold2, epoch, loss = load_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, pathmaster) + start_epoch = epoch + 1 + best_loss_cross_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck = model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = 
DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + model_fold2 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + best_loss_cross_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_cross_val = [] + + losslist_train_fold1 = [] + losslist_val_fold1 = [] + + losslist_train_fold2 = [] + losslist_val_fold2 = [] + + # Initialize runtime list + runtime_list = [] + + # Initialize predictions lists + predictions_list_train = [] + predictions_list_val = [] + + # Initialize true labels lists + true_labels_list_train = [] + true_labels_list_val = [] + + # Scalers + scaler_fold1 = GradScaler() + scaler_fold2 = GradScaler() + + # Cross-validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Cross-Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + sys.stdout.flush() + + # Epoch predictions + predictions_epoch_train = [] + predictions_epoch_val = [] + + # Fold 1 training =============================================================================================================================================================== + model_fold1.train() + train_cum_loss_fold1 = 0 + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_train.append(torch.reshape(Y_train, (-1,1))) + + with autocast(): + # Forward pass + logits, predictions, _ = model_fold1(X_train) + + predictions_epoch_train.append(torch.reshape(predictions, (-1,1))) + + # Regularization (if applicable) + l1 = 0 + for p in model_fold1.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) + lambda_l1 * l1 + train_cum_loss_fold1 += batch_loss_train.item() + + # Clear gradients + optimizer_fold1.zero_grad() + + # Backwards pass + scaler_fold1.scale(batch_loss_train).backward() + + # Optimizer step + scaler_fold1.step(optimizer_fold1) + + # Scaler update + scaler_fold1.update() + + # Update 
scheduler + scheduler_fold1.step() + + loss_train_fold1 = train_cum_loss_fold1 / len(fold1_loader) + + sys.stderr.flush() + print('\nTraining for Fold #1 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 1 validation ============================================================================================================================================================= + model_fold1.eval() + with torch.no_grad(): + val_cum_loss_fold1 = 0 + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_val.append(torch.reshape(Y_val, (-1,1))) + + logits, predictions, _ = model_fold1(X_val) + + predictions_epoch_val.append(torch.reshape(predictions, (-1,1))) + + val_cum_loss_fold1 += criterion_val(logits.float(), Y_val.long()).item() + + loss_val_fold1 = val_cum_loss_fold1 / len(fold2_loader) + + sys.stderr.flush() + print('\nValidation for Fold #1 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 2 training =============================================================================================================================================================== + model_fold2.train() + train_cum_loss_fold2 = 0 + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_train.append(torch.reshape(Y_train, (-1,1))) + + with autocast(): + # Forward pass + logits, predictions, _ = model_fold2(X_train) + + predictions_epoch_train.append(torch.reshape(predictions, (-1,1))) + + # Regularization (if applicable) + l1 = 0 + for p in model_fold2.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) + lambda_l1 * l1 + train_cum_loss_fold2 += batch_loss_train.item() + + # Clear gradients + optimizer_fold2.zero_grad() + + # Backwards pass + scaler_fold2.scale(batch_loss_train).backward() + + # Optimizer step + scaler_fold2.step(optimizer_fold2) + + # Scaler update + scaler_fold2.update() + + # Update scheduler + scheduler_fold2.step() + + loss_train_fold2 = train_cum_loss_fold2 / len(fold2_loader) + + sys.stderr.flush() + print('\nTraining for Fold #2 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 2 validation ============================================================================================================================================================= + model_fold2.eval() + with torch.no_grad(): + val_cum_loss_fold2 = 0 + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_val.append(torch.reshape(Y_val, (-1,1))) + + logits, 
predictions, _ = model_fold2(X_val) + + predictions_epoch_val.append(torch.reshape(predictions, (-1,1))) + + val_cum_loss_fold2 += criterion_val(logits.float(), Y_val.long()).item() + + loss_val_fold2 = val_cum_loss_fold2 / len(fold1_loader) + + sys.stderr.flush() + print('\nValidation for Fold #2 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + # =============================================================================================================================================================================== + + # Caluclate epoch losses + epoch_loss_train = (loss_train_fold1 + loss_train_fold2) / 2 + epoch_loss_cross_val = (loss_val_fold1 + loss_val_fold2) / 2 + + # Append to losslists + losslist_train.append(epoch_loss_train) + losslist_cross_val.append(epoch_loss_cross_val) + + losslist_train_fold1.append(loss_train_fold1) + losslist_val_fold1.append(loss_val_fold1) + + losslist_train_fold2.append(loss_train_fold2) + losslist_val_fold2.append(loss_val_fold2) + + # Return the best cross-validation loss and save best checkpoint (epoch) + best_loss_cross_val = save_best_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, epoch_loss_cross_val, best_loss_cross_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Cross-Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_cross_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch predictions + predictions_epoch_train = np.array(torch.cat(predictions_epoch_train, dim=0).to('cpu')) + predictions_epoch_val = np.array(torch.cat(predictions_epoch_val, dim=0).to('cpu')) + + predictions_list_train.append(predictions_epoch_train) + predictions_list_val.append(predictions_epoch_val) + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_cross_val): + break + + # Convert true label list into array + true_labels_train = np.array(torch.cat(true_labels_list_train, dim=0).to('cpu')) + true_labels_val = np.array(torch.cat(true_labels_list_val, dim=0).to('cpu')) + + # Saving + if save: + title = 'Training and Cross-Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_cross_val, title, save, pathmaster) + + title = 'Training and Cross-Validation Accuracy' + plot_save_func.accuracy_curves(true_labels_train, true_labels_val, predictions_list_train, predictions_list_val, title, save, pathmaster) + + plot_save_func.save_losslists_2fold(losslist_train_fold1, losslist_val_fold1, losslist_train_fold2, losslist_val_fold2, losslist_train, losslist_cross_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +# Utilizes train() and validate() functions +def cross_val_2fold_DenseNet_func(model_hyperparameters, fold1_loader, fold2_loader, model_type=torch.float32, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0005 + + # Define img_channels + img_channels = 1 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and 
os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + model_fold1, optimizer_fold1, scheduler_fold1, model_fold2, optimizer_fold2, scheduler_fold2, epoch, loss = load_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, pathmaster) + start_epoch = epoch + 1 + best_loss_cross_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck = model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + best_loss_cross_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_cross_val = [] + + losslist_train_fold1 = [] + losslist_val_fold1 = [] + + losslist_train_fold2 = [] + losslist_val_fold2 = [] + + # Initialize runtime list + runtime_list = [] + + # Cross-validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Cross-Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + sys.stdout.flush() + + # Fold 1 
(train on fold1, validate on fold2) + model_fold1, optimizer_fold1, scheduler_fold1, loss_train_fold1 = train(model_fold1, fold1_loader, optimizer_fold1, scheduler_fold1, criterion_train, lambda_l1) + loss_val_fold1 = validate(model_fold1, fold2_loader, criterion_val) + + # Fold 2 (train on fold2, validate on fold1) + model_fold2, optimizer_fold2, scheduler_fold2, loss_train_fold2 = train(model_fold2, fold2_loader, optimizer_fold2, scheduler_fold2, criterion_train, lambda_l1) + loss_val_fold2 = validate(model_fold2, fold1_loader, criterion_val) + + # Caluclate epoch losses + epoch_loss_train = (loss_train_fold1 + loss_train_fold2) / 2 + epoch_loss_cross_val = (loss_val_fold1 + loss_val_fold2) / 2 + + # Append to losslists + losslist_train.append(epoch_loss_train) + losslist_cross_val.append(epoch_loss_cross_val) + + losslist_train_fold1.append(loss_train_fold1) + losslist_val_fold1.append(loss_val_fold1) + + losslist_train_fold2.append(loss_train_fold2) + losslist_val_fold2.append(loss_val_fold2) + + # Return the best cross-validation loss and save best checkpoint (epoch) + best_loss_cross_val = save_best_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, epoch_loss_cross_val, best_loss_cross_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Cross-Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_cross_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_cross_val): + break + + # Saving + if save: + title = 'Training and Cross-Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_cross_val, title, save, pathmaster) + + plot_save_func.save_losslists_2fold(losslist_train_fold1, losslist_val_fold1, losslist_train_fold2, losslist_val_fold2, losslist_train, losslist_cross_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def train_validate_DenseNet(model_hyperparameters, train_loader, val_loader, model_type=torch.float32, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0005 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + scheduler = IdentityScheduler(optimizer) + + model, optimizer, scheduler, epoch, loss = load_checkpoint(model, optimizer, scheduler, pathmaster) + start_epoch = epoch + 1 + best_loss_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + 
growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck = model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on input hyperparameters + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + scheduler = IdentityScheduler(optimizer) + + best_loss_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_val = [] + + # Initialize runtime list + runtime_list = [] + + # Training and validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Training and Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + # Training + model.train() + # Reset training sum of epoch loss and batch_count + sum_epoch_loss_train = 0 + sys.stdout.flush() + for train_batch in tqdm(train_loader, total=len(train_loader), desc='Training Epoch', unit='batch', leave=False): + # Extract input and labels + # train_batch['data'].shape = [batch_size, img_channels, img_size, img_size] + X_train = train_batch['data'].reshape(train_batch['data'].shape[0], train_batch['data'].shape[1], train_batch['data'].shape[-1], train_batch['data'].shape[-1]).to(device=device) + Y_train = train_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model(X_train) + + # Regularization + l1 = 0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate sum of total loss for epoch with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) # Criterion returns a scalar tensor + batch_loss_train += lambda_l1 * l1 + + # Clear gradients + optimizer.zero_grad(set_to_none=True) + + # Backwards pass + batch_loss_train.backward() + + # Optimizer step + optimizer.step() + + # Generate epoch loss + sum_epoch_loss_train += batch_loss_train.item() + + # Update scheduler + scheduler.step() + + # Calculate epoch loss for training + epoch_loss_train = sum_epoch_loss_train / len(train_loader) + losslist_train.append(epoch_loss_train) + + sys.stderr.flush() + print('\nTraining for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Validation + model.eval() + sum_epoch_loss_val = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for val_batch in tqdm(val_loader, total=len(val_loader), desc='Validation Epoch', unit='batch', leave=False): + # Extract input and labels + X_val = val_batch['data'].reshape(val_batch['data'].shape[0], val_batch['data'].shape[1], 
val_batch['data'].shape[-1], val_batch['data'].shape[-1]).to(device=device) + Y_val = val_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model(X_val) + + # Calculate sum of total loss for epoch + sum_epoch_loss_val += criterion_val(logits.float(), Y_val.long()).item() # Criterion returns a scalar tensor + + # Calculate epoch loss for validation + epoch_loss_val = sum_epoch_loss_val / len(val_loader) + losslist_val.append(epoch_loss_val) + + sys.stderr.flush() + print('\nValidation for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # # Temporarily save checkpoint after each epoch + # save_checkpoint(model, optimizer, scheduler, epoch, loss=epoch_loss_val, checkpoint_path=temp_checkpoint_path) + + # Return the best validation loss and save best checkpoint (epoch) + best_loss_val = save_best_checkpoint(model, optimizer, scheduler, epoch, epoch_loss_val, best_loss_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_val): + break + + # Saving + if save: + title = 'Training and Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_val, title, save, pathmaster) + + plot_save_func.save_losslists(losslist_train, losslist_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def train_validate_DenseNet_mixed(model_hyperparameters, train_loader, val_loader, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0005 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + + # Create optimizer and scheduler + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + scheduler = IdentityScheduler(optimizer) + + model, optimizer, scheduler, epoch, loss = load_checkpoint(model, optimizer, scheduler, pathmaster) + start_epoch = epoch + 1 + best_loss_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck = model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on input hyperparameters + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + + # Create optimizer and 
scheduler + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + scheduler = IdentityScheduler(optimizer) + + best_loss_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_val = [] + + # Initialize runtime list + runtime_list = [] + + # Initialize predictions lists + predictions_list_train = [] + predictions_list_val = [] + + # Initialize true labels lists + true_labels_list_train = [] + true_labels_list_val = [] + + # Scalers + scaler = GradScaler() + + # Training and validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Training and Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + # Training + model.train() + # Reset training sum of epoch loss and batch_count + sum_epoch_loss_train = 0 + sys.stdout.flush() + + # Epoch predictions + predictions_epoch_train = [] + predictions_epoch_val = [] + + for train_batch in tqdm(train_loader, total=len(train_loader), desc='Training Epoch', unit='batch', leave=False): + # Extract input and labels + # train_batch['data'].shape = [batch_size, img_channels, img_size, img_size] + X_train = train_batch['data'].reshape(train_batch['data'].shape[0], train_batch['data'].shape[1], train_batch['data'].shape[-1], train_batch['data'].shape[-1]).to(device=device) + Y_train = train_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_train.append(torch.reshape(Y_train, (-1,1))) + + with autocast(): + # Forward pass + logits, predictions, _ = model(X_train) + + predictions_epoch_train.append(torch.reshape(predictions, (-1,1))) + + # Regularization + l1 = 0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate sum of total loss for epoch with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) # Criterion returns a scalar tensor + batch_loss_train += lambda_l1 * l1 + + # Clear gradients + optimizer.zero_grad(set_to_none=True) + + # Backwards pass + scaler.scale(batch_loss_train).backward() + + # Optimizer step + scaler.step(optimizer) + + # Scaler update + scaler.update() + + # Generate epoch loss + sum_epoch_loss_train += batch_loss_train.item() + + # Update scheduler + scheduler.step() + + # Calculate epoch loss for training + epoch_loss_train = sum_epoch_loss_train / len(train_batch) + losslist_train.append(epoch_loss_train) + + sys.stderr.flush() + print('\nTraining for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Validation + model.eval() + sum_epoch_loss_val = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for val_batch in tqdm(val_loader, total=len(val_loader), desc='Validation Epoch', unit='batch', leave=False): + # Extract input and labels + X_val = val_batch['data'].reshape(val_batch['data'].shape[0], val_batch['data'].shape[1], 
val_batch['data'].shape[-1], val_batch['data'].shape[-1]).to(device=device) + Y_val = val_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_val.append(torch.reshape(Y_val, (-1,1))) + + # Forward pass + logits, predictions, _ = model(X_val) + predictions_epoch_val.append(torch.reshape(predictions, (-1,1))) + + # Calculate sum of total loss for epoch + sum_epoch_loss_val += criterion_val(logits.float(), Y_val.long()).item() # Criterion returns a scalar tensor + + # Calculate epoch loss for validation + epoch_loss_val = sum_epoch_loss_val / len(val_loader) + losslist_val.append(epoch_loss_val) + + sys.stderr.flush() + print('\nValidation for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # # Temporarily save checkpoint after each epoch + # save_checkpoint(model, optimizer, scheduler, epoch, loss=epoch_loss_val, checkpoint_path=temp_checkpoint_path) + + # Return the best validation loss and save best checkpoint (epoch) + best_loss_val = save_best_checkpoint(model, optimizer, scheduler, epoch, epoch_loss_val, best_loss_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch predictions + predictions_epoch_train = np.array(torch.cat(predictions_epoch_train, dim=0).to('cpu')) + predictions_epoch_val = np.array(torch.cat(predictions_epoch_val, dim=0).to('cpu')) + + predictions_list_train.append(predictions_epoch_train) + predictions_list_val.append(predictions_epoch_val) + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_val): + break + + # Convert true label list into array + true_labels_train = np.array(torch.cat(true_labels_list_train, dim=0).to('cpu')) + true_labels_val = np.array(torch.cat(true_labels_list_val, dim=0).to('cpu')) + + # Saving + if save: + title = 'Training and Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_val, title, save, pathmaster) + + title = 'Training and Validation Accuracy' + plot_save_func.accuracy_curves(true_labels_train, true_labels_val, predictions_list_train, predictions_list_val, title, save, pathmaster) + + plot_save_func.save_losslists(losslist_train, losslist_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def train_validate_DenseNet_config(config, train_loader, val_loader, + n_epochs=100, n_classes=3, patience=10, save=False, + pathmaster=None): + # # Set filetag + # file_tag = str(dt.datetime.now()) + # # Define characters to replace with underscores + # chars_to_replace = [' ', ':', '.', '-'] + + # # Replace characters with underscores + # for char in chars_to_replace: + # file_tag = file_tag.replace(char, '_') + # pathmaster.set_file_tag(file_tag) + + # Save hyperparameters + model_hyperparameters = { # Default, no bottleneck or compression + 'depth': config['depth'], + 'growth_rate': config['growth_rate'], + 'compression': config['compression'], + 'bottleneck': config['bottleneck'], + 'drop_rate': config['drop_rate'], + 'class_weights': config['class_weights'], + 'learning_rate': config['learning_rate'], + 'num_dense_tran': config['num_dense_tran'], + 'lambda_l1': config['lambda_l1'], + 
'activation': activation_to_string(config['activation']), + } + + if save: + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + img_channels = 1 + + model = DenseNet_config(img_channels, config['depth'], n_classes, config['growth_rate'], config['compression'], + config['bottleneck'], config['drop_rate'], config['activation'], config['num_dense_tran']).to(device=device) + + # Loss function and optimizer + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(config['class_weights']).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate']) + scheduler = IdentityScheduler(optimizer) + + + # Scalers + scaler = GradScaler() + + # Initialize losslists + losslist_train = [] + losslist_val = [] + + # Initialize predictions lists + predictions_list_train = [] + predictions_list_val = [] + + # Initialize true labels lists + true_labels_list_train = [] + true_labels_list_val = [] + + # Initialize runtime list + runtime_list = [] + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Initialize best validation loss + best_loss_val = float('inf') # If no checkpoint is loaded, set to infinity + + start_epoch = 0 + # Training and validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in range(start_epoch, n_epochs): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + # Training + model.train() + # Reset training sum of epoch loss and batch_count + sum_epoch_loss_train = 0 + sys.stdout.flush() + + # Epoch predictions + predictions_epoch_train = [] + predictions_epoch_val = [] + + for train_batch in tqdm(train_loader, total=len(train_loader), desc='Training Epoch', unit='batch', leave=False): + # Extract input and labels + # train_batch['data'].shape = [batch_size, img_channels, img_size, img_size] + X_train = train_batch['data'].reshape(train_batch['data'].shape[0], train_batch['data'].shape[1], train_batch['data'].shape[-1], train_batch['data'].shape[-1]).to(device=device) + Y_train = train_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_train.append(torch.reshape(Y_train, (-1,1))) + + with autocast(): + # Forward pass + logits, predictions, _ = model(X_train) + + predictions_epoch_train.append(torch.reshape(predictions, (-1,1))) + + # Regularization + l1 = 0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate sum of total loss for epoch with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) # Criterion returns a scalar tensor + batch_loss_train += config['lambda_l1'] * l1 + + # Clear gradients + optimizer.zero_grad(set_to_none=True) + + # Backwards pass + scaler.scale(batch_loss_train).backward() + + # Optimizer step + scaler.step(optimizer) + + # Scaler update + scaler.update() + + # Generate epoch loss + sum_epoch_loss_train += batch_loss_train.item() + + # Update scheduler + scheduler.step() + + # Calculate epoch loss for training + epoch_loss_train = sum_epoch_loss_train / len(train_batch) + losslist_train.append(epoch_loss_train) + + sys.stderr.flush() + print('\nTraining for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Validation + model.eval() + sum_epoch_loss_val = 0 + 
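# Note on the mixed-precision loop above: autocast() runs the forward pass and loss in reduced precision where safe,
# while GradScaler's scale()/step()/update() apply the standard loss-scaling recipe so half-precision gradients do not
# underflow. The validation pass below runs in full precision under torch.no_grad(), and its mean cross-entropy
# (epoch_loss_val) is what drives save_best_checkpoint() and the EarlyStoppingCallback.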
with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for val_batch in tqdm(val_loader, total=len(val_loader), desc='Validation Epoch', unit='batch', leave=False): + # Extract input and labels + X_val = val_batch['data'].reshape(val_batch['data'].shape[0], val_batch['data'].shape[1], val_batch['data'].shape[-1], val_batch['data'].shape[-1]).to(device=device) + Y_val = val_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_val.append(torch.reshape(Y_val, (-1,1))) + + # Forward pass + logits, predictions, _ = model(X_val) + predictions_epoch_val.append(torch.reshape(predictions, (-1,1))) + + # Calculate sum of total loss for epoch + sum_epoch_loss_val += criterion_val(logits.float(), Y_val.long()).item() # Criterion returns a scalar tensor + + # Calculate epoch loss for validation + epoch_loss_val = sum_epoch_loss_val / len(val_loader) + losslist_val.append(epoch_loss_val) + + sys.stderr.flush() + print('\nValidation for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Return the best validation loss and save best checkpoint (epoch) + best_loss_val = save_best_checkpoint(model, optimizer, scheduler, epoch, epoch_loss_val, best_loss_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch predictions + predictions_epoch_train = np.array(torch.cat(predictions_epoch_train, dim=0).to('cpu')) + predictions_epoch_val = np.array(torch.cat(predictions_epoch_val, dim=0).to('cpu')) + + predictions_list_train.append(predictions_epoch_train) + predictions_list_val.append(predictions_epoch_val) + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_val): + break + + # Convert true label list into array + true_labels_train = np.array(torch.cat(true_labels_list_train, dim=0).to('cpu')) + true_labels_val = np.array(torch.cat(true_labels_list_val, dim=0).to('cpu')) + + if save: + title = 'Training and Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_val, title, save, pathmaster) + + title = 'Training and Validation Accuracy' + plot_save_func.accuracy_curves(true_labels_train, true_labels_val, predictions_list_train, predictions_list_val, title, save, pathmaster) + + plot_save_func.save_losslists(losslist_train, losslist_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def best_DenseNet_2fold(fold1_loader, fold2_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, _ = load_hyperparameters(pathmaster) + # When testing on the test set, drop_rate should always be 0 + + # Define img_channels + img_channels = 1 + + # Initialize model + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes,
growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If checkpoint is not specified, terminate the function + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.' + + # Load models + model_fold1, model_fold2 = load_model_2fold(model_fold1, model_fold2, pathmaster) + + # Fold 1 ======================================================================================================================================================================= + # Initialize true label lists + true_labels_list_fold1 = [] + + # Intialize output (prediction) lists + predictions_list_fold1 = [] + prediction_proba_list_fold1 = [] + + # Validation + model_fold1.eval() + cum_loss_fold1 = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Testing Fold #1', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list_fold1.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model_fold1(X) + predictions_list_fold1.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list_fold1.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss_fold1 += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss_fold1 = cum_loss_fold1 / len(fold2_loader) + + # Convert true label list into array + true_labels_fold1 = np.array(torch.cat(true_labels_list_fold1, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions_fold1 = np.array(torch.cat(predictions_list_fold1, dim=0).to('cpu')) + prediction_proba_fold1 = np.array(torch.cat(prediction_proba_list_fold1, dim=0).to('cpu')) + + # Fold 2 ======================================================================================================================================================================= + # Initialize true label lists + true_labels_list_fold2 = [] + + # Intialize output (prediction) lists + predictions_list_fold2 = [] + prediction_proba_list_fold2 = [] + + # Validation + model_fold2.eval() + cum_loss_fold2 = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Testing Fold #2', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list_fold2.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = 
model_fold2(X) + predictions_list_fold2.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list_fold2.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss_fold2 += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss_fold2 = cum_loss_fold2 / len(fold1_loader) + + # Convert true label list into array + true_labels_fold2 = np.array(torch.cat(true_labels_list_fold2, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions_fold2 = np.array(torch.cat(predictions_list_fold2, dim=0).to('cpu')) + prediction_proba_fold2 = np.array(torch.cat(prediction_proba_list_fold2, dim=0).to('cpu')) + # ============================================================================================================================================================================== + + # Create overall lists + true_labels = np.concatenate((true_labels_fold1, true_labels_fold2), axis=0) + predictions = np.concatenate((predictions_fold1, predictions_fold2), axis=0) + prediction_proba = np.concatenate((prediction_proba_fold1, prediction_proba_fold2), axis=0) + + # Print mean validation loss + mean_loss = (loss_fold1 + loss_fold2) / 2 + print('\n=====> Fold #1 Loss: %.4f' % loss_fold1) + print('=====> Fold #2 Loss: %.4f' % loss_fold2) + print('=====> Mean Loss: %.4f' % mean_loss) + + # Saving + if save: + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Cross-Validation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster) + + plot_save_func.save_labels(true_labels, pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba(prediction_proba, pathmaster) + plot_save_func.metrics_2fold(true_labels_fold1, true_labels_fold2, predictions_fold1, predictions_fold2, prediction_proba_fold1, prediction_proba_fold2, save, pathmaster) + + clf_names = ['Fold #1', 'Fold #2', 'Combined'] + plot_save_func.mean_roc_curves([true_labels_fold1, true_labels_fold2], [prediction_proba_fold1, prediction_proba_fold2], clf_names, save, pathmaster) + + +# Utilizes test() function +def best_DenseNet_2fold_func(fold1_loader, fold2_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, _ = load_hyperparameters(pathmaster) + # When testing on the test set, drop_rate should always be 0 + + # Define img_channels + img_channels = 1 + + # Initialize model + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If checkpoint is not specified, terminate the function + 
checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.' + + # Load models + model_fold1, model_fold2 = load_model_2fold(model_fold1, model_fold2, pathmaster) + + # Validation + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + + true_labels_fold1, predictions_fold1, prediction_proba_fold1, loss_fold1 = test(model_fold1, fold1_loader, criterion, n_classes) + true_labels_fold2, predictions_fold2, prediction_proba_fold2, loss_fold2 = test(model_fold2, fold2_loader, criterion, n_classes) + + # Create overall arrays + true_labels = np.concatenate((true_labels_fold1, true_labels_fold2), axis=0) + predictions = np.concatenate((predictions_fold1, predictions_fold2), axis=0) + prediction_proba = np.concatenate((prediction_proba_fold1, prediction_proba_fold2), axis=0) + + # Print mean validation loss + mean_loss = (loss_fold1 + loss_fold2) / 2 + print('\n======> Fold #1 Loss: %.4f' % loss_fold1) + print('======> Fold #2 Loss: %.4f' % loss_fold2) + print('======> Mean Loss: %.4f' % mean_loss) + + # Saving + if save: + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Cross-Validation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster) + + plot_save_func.save_labels(true_labels, pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba(prediction_proba, pathmaster) + plot_save_func.metrics_2fold(true_labels_fold1, true_labels_fold2, predictions_fold1, predictions_fold2, prediction_proba_fold1, prediction_proba_fold2, save, pathmaster) + + clf_names = ['Fold #1', 'Fold #2', 'Combined'] + plot_save_func.mean_roc_curves([true_labels_fold1, true_labels_fold2], [prediction_proba_fold1, prediction_proba_fold2], clf_names, save, pathmaster) + + +def best_DenseNet(data_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + # When testing on the test set, drop_rate should always be 0 + + # Initialize model + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If checkpoint is not specified, terminate the function + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.'
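# Single-model evaluation (best_DenseNet): load_model() below restores the best checkpoint saved under pathmaster's
# checkpoints folder, and the loop that follows collects labels, predicted classes, and class probabilities over
# data_loader before reporting the mean cross-entropy loss.
# Hedged usage sketch (test_loader is assumed to be built by the caller; the file tag must match the run whose
# checkpoint is being loaded):
# pathmaster.set_file_tag('best_run_tag')
# best_DenseNet(test_loader, model_type=torch.float32, n_classes=3, save=True, pathmaster=pathmaster)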
+ + # Load model + model = load_model(model, pathmaster) + + # Initialize true label lists + true_labels_list = [] + + # Intialize output (prediction) lists + predictions_list = [] + prediction_proba_list = [] + + # Evaluation + model.eval() + cum_loss = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Testing', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model(X) + predictions_list.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss = cum_loss / len(data_loader) + + # Convert true label list into array + true_labels = np.array(torch.cat(true_labels_list, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions = np.array(torch.cat(predictions_list, dim=0).to('cpu')) + prediction_proba = np.array(torch.cat(prediction_proba_list, dim=0).to('cpu')) + + # Print validation loss + print('\n======> Loss: %.4f' % loss) + + # Saving + if save: + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Evaluation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster) + + plot_save_func.save_labels(true_labels, pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba(prediction_proba, pathmaster) + plot_save_func.metrics(true_labels, predictions, prediction_proba, save, pathmaster) + + plot_save_func.save_classification_report(true_labels, predictions, save, pathmaster) + plot_save_func.save_classification_report_imbalanced(true_labels, predictions, save, pathmaster) + + clf_names = ['Model'] + plot_save_func.mean_roc_curves([true_labels], [prediction_proba], clf_names, save, pathmaster) + plot_save_func.roc_curves(true_labels, prediction_proba, save, pathmaster) + + +def best_DenseNet_config(data_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, _, _, num_dense_tran, _, activation = load_hyperparameters_random_search(pathmaster) + # When testing on the test set, drop_rate, class_weights, learning_rate, and lambda_l1 are not needed + + # Initialize model + model = DenseNet_config(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate, + activation=activation, num_dense_tran=num_dense_tran).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If 
checkpoint is not specified, terminate the function + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.' + + # Load model + model = load_model(model, pathmaster) + + # Initialize true label lists + true_labels_list = [] + + # Intialize output (prediction) lists + predictions_list = [] + prediction_proba_list = [] + + # # Initialize segment names list + # segment_names_list = [] + + # Evaluation + model.eval() + cum_loss = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Testing', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + # Z = data_batch['segment_name'] + # segment_names_list.append(Z) + true_labels_list.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model(X) + predictions_list.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss = cum_loss / len(data_loader) + + # Convert true label list into array + true_labels = np.array(torch.cat(true_labels_list, dim=0).to('cpu')) + + # # Convert segment names list into array + # segment_names = np.concatenate(segment_names_list, axis=0) + # segment_names = segment_names.reshape(-1,1) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions = np.array(torch.cat(predictions_list, dim=0).to('cpu')) + prediction_proba = np.array(torch.cat(prediction_proba_list, dim=0).to('cpu')) + + # Print validation loss + print('\n======> Loss: %.4f' % loss) + + # Saving + if save: + # pathmaster.set_file_tag(pathmaster.file_tag + '_test') + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Evaluation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster) + + plot_save_func.save_labels(true_labels, pathmaster) + # plot_save_func.save_labels(np.hstack([segment_names, true_labels]), pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba(prediction_proba, pathmaster) + plot_save_func.metrics(true_labels, predictions, prediction_proba, save, pathmaster) + + plot_save_func.save_classification_report(true_labels, predictions, save, pathmaster) + plot_save_func.save_classification_report_imbalanced(true_labels, predictions, save, pathmaster) + + clf_names = ['Model'] + plot_save_func.mean_roc_curves([true_labels], [prediction_proba], clf_names, save, pathmaster) + plot_save_func.roc_curves(true_labels, prediction_proba, save, pathmaster) + + +def best_DenseNet_config_binary(data_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + 
checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, _, _, num_dense_tran, _, activation = load_hyperparameters_random_search(pathmaster) + # When testing on the test set, drop_rate, class_weights, learning_rate, and lambda_l1 are not needed + + # Initialize model + model = DenseNet_config(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate, + activation=activation, num_dense_tran=num_dense_tran).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If checkpoint is not specified, terminate the function + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.' + + # Load model + model = load_model(model, pathmaster) + + # Initialize true label lists + true_labels_list = [] + + # Intialize output (prediction) lists + predictions_list = [] + prediction_proba_list = [] + + # Evaluation + model.eval() + cum_loss = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Testing', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model(X) + predictions_list.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss = cum_loss / len(data_loader) + + # Convert true label list into array + true_labels = np.array(torch.cat(true_labels_list, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions = np.array(torch.cat(predictions_list, dim=0).to('cpu')) + prediction_proba = np.array(torch.cat(prediction_proba_list, dim=0).to('cpu')) + + # Print validation loss + print('\n======> Loss: %.4f' % loss) + + # Saving + if save: + # pathmaster.set_file_tag(pathmaster.file_tag + '_test') + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Evaluation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster, class_names=['non-AF', 'AF']) + + plot_save_func.save_labels(true_labels, pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba_binary(prediction_proba, pathmaster) + plot_save_func.metrics_binary(true_labels, predictions, prediction_proba, save, pathmaster) + + plot_save_func.save_classification_report(true_labels, predictions, save, pathmaster) + plot_save_func.save_classification_report_imbalanced(true_labels, predictions, save, pathmaster) + + plot_save_func.roc_curves_binary(true_labels, prediction_proba, save, pathmaster, class_names=['non-AF', 'AF']) + + +def train(model, dataloader, optimizer, scheduler, criterion, regularization): + 
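# Shared single-epoch helpers. train() below makes one pass over `dataloader`, adds L1 regularization scaled by
# `regularization` to the cross-entropy loss, steps the optimizer and scheduler, and returns
# (model, optimizer, scheduler, mean_epoch_loss); validate() and test() are the no-grad counterparts.
# Note that, as written, these helpers rely on a `device` defined at module level rather than receiving it as an
# argument. Example call, taken from the 2-fold routine above:
# model_fold1, optimizer_fold1, scheduler_fold1, loss_train_fold1 = train(model_fold1, fold1_loader, optimizer_fold1, scheduler_fold1, criterion_train, lambda_l1)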
model.train() + cum_loss = 0 + for data_batch in tqdm(dataloader, total=len(dataloader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model(X_train) + + # Regularization (if applicable) + l1 = 0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion(logits.to(torch.float32), Y_train.long()) + regularization * l1 + cum_loss += batch_loss_train.item() + + # Clear gradients + optimizer.zero_grad(set_to_none=True) + + # Backwards pass + batch_loss_train.backward() + + # Optimizer step + optimizer.step() + + # Update scheduler + scheduler.step() + + epoch_loss = cum_loss / len(dataloader) + + return model, optimizer, scheduler, epoch_loss + + +def validate(model, dataloader, criterion): + model.eval() + with torch.no_grad(): + cum_loss = 0 + for data_batch in tqdm(dataloader, total=len(dataloader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + logits, _, _ = model(X_val) + cum_loss += criterion(logits.float(), Y_val.long()).item() + + epoch_loss = cum_loss / len(dataloader) + + return epoch_loss + + +def test(model, dataloader, criterion, n_classes): + # Initialize true label lists + true_labels_list = [] + + # Intialize output (prediction) lists + predictions_list = [] + prediction_proba_list = [] + + # Validation + model.eval() + cum_loss = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(dataloader, total=len(dataloader), desc='Testing', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model(X) + predictions_list.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss = cum_loss / len(dataloader) + + # Convert true label list into array + true_labels = np.array(torch.cat(true_labels_list, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions = np.array(torch.cat(predictions_list, dim=0).to('cpu')) + prediction_proba = np.array(torch.cat(prediction_proba_list, dim=0).to('cpu')) + + return true_labels, predictions, prediction_proba, loss + + +class IdentityScheduler(torch.optim.lr_scheduler._LRScheduler): + def __init__(self, optimizer, last_epoch=-1): + super(IdentityScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + # Returns the current learning rate without any modifications. 
+ return self.base_lrs + + +def save_checkpoint(model, optimizer, scheduler, epoch, loss, checkpoint_path): # Will also be called to save the most recent checkpoint locally in the runtime so I always have the most recent checkpoint + torch.save({ + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'scheduler_state_dict': scheduler.state_dict() if scheduler else IdentityScheduler(optimizer).state_dict(), # Create identity scheduler if missing, actually doesn't work since the parameter is required + 'epoch': epoch, + 'loss': loss + }, checkpoint_path) + +def save_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, loss, checkpoint_path): # Will also be called to save the most recent checkpoint locally in the runtime so I always have the most recent checkpoint + torch.save({ + 'model_fold1_state_dict': model_fold1.state_dict(), + 'model_fold2_state_dict': model_fold2.state_dict(), + 'optimizer_fold1_state_dict': optimizer_fold1.state_dict(), + 'optimizer_fold2_state_dict': optimizer_fold2.state_dict(), + 'scheduler_fold1_state_dict': scheduler_fold1.state_dict(), + 'scheduler_fold2_state_dict': scheduler_fold2.state_dict(), + 'epoch': epoch, + 'loss': loss + }, checkpoint_path) + +def save_best_checkpoint(model, optimizer, scheduler, epoch, current_loss, best_loss, pathmaster): # When training the model, best_loss should be initialized to float.('inf') + # Might be good to have two different checkpoint paths, one for the best and one for the most recent checkpoint, maybe also have temp vs permanent checkpoint paths + if current_loss < best_loss: + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + best_loss = current_loss + save_checkpoint(model, optimizer, scheduler, epoch, best_loss, checkpoint_path) + print('\nNew checkpoint with better loss was saved!') + + return best_loss + else: + return best_loss + + +def save_best_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, current_loss, best_loss, pathmaster): # When training the model, best_loss should be initialized to float.('inf') + # Might be good to have two different checkpoint paths, one for the best and one for the most recent checkpoint, maybe also have temp vs permanent checkpoint paths + if current_loss < best_loss: + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + best_loss = current_loss + save_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, best_loss, checkpoint_path) + print('\nNew checkpoint with better loss was saved!') + + return best_loss + else: + return best_loss + + +def load_checkpoint(model, optimizer, scheduler, pathmaster): + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location=device) + + model.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + scheduler.load_state_dict(checkpoint['scheduler_state_dict']) + start_epoch = checkpoint['epoch'] + loss = checkpoint['loss'] + + print('\nCheckpoint loaded!') + # print(f'Resuming training from 
epoch {start_epoch}, batch {start_batch}') + + return model, optimizer, scheduler, start_epoch, loss + else: + print('\nError! Checkpoint does not exist!') + + +def load_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, pathmaster): + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location=device) + + model_fold1.load_state_dict(checkpoint['model_fold1_state_dict']) + optimizer_fold1.load_state_dict(checkpoint['optimizer_fold1_state_dict']) + scheduler_fold1.load_state_dict(checkpoint['scheduler_fold1_state_dict']) + + model_fold2.load_state_dict(checkpoint['model_fold2_state_dict']) + optimizer_fold2.load_state_dict(checkpoint['optimizer_fold2_state_dict']) + scheduler_fold2.load_state_dict(checkpoint['scheduler_fold2_state_dict']) + + start_epoch = checkpoint['epoch'] + loss = checkpoint['loss'] + + print('\nCheckpoint loaded!') + # print(f'Resuming training from epoch {start_epoch}, batch {start_batch}') + + return model_fold1, optimizer_fold1, scheduler_fold1, model_fold2, optimizer_fold2, scheduler_fold2, start_epoch, loss + else: + print('\nError! Checkpoint does not exist!') + + +def load_model_2fold(model_fold1, model_fold2, pathmaster): + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location=device) + + model_fold1.load_state_dict(checkpoint['model_fold1_state_dict']) + model_fold2.load_state_dict(checkpoint['model_fold2_state_dict']) + + print('\nModels loaded!') + # print(f'Resuming training from epoch {start_epoch}, batch {start_batch}') + + return model_fold1, model_fold2 + else: + print('\nError! Models do not exist!') + + +def load_model(model, pathmaster): + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location=device) + + model.load_state_dict(checkpoint['model_state_dict']) + + print('\nModel loaded!') + # print(f'Resuming training from epoch {start_epoch}, batch {start_batch}') + + return model + else: + print('\nError! 
Model does not exist!') + + +def load_hyperparameters(pathmaster): + hyperparameters_path = pathmaster.hyperparameters_path() + + # Extract model hyperparameters + model_hyperparameters_file = os.path.join(hyperparameters_path, 'hyperparameters_' + pathmaster.file_tag + '.csv') + model_hyperparameters = pd.read_csv(model_hyperparameters_file) + depth = int(model_hyperparameters['depth'].iloc[0]) + growth_rate = int(model_hyperparameters['growth_rate'].iloc[0]) + compression = model_hyperparameters['compression'].iloc[0] + bottleneck = model_hyperparameters['bottleneck'].iloc[0] + drop_rate = model_hyperparameters['drop_rate'].iloc[0] + class_weights = model_hyperparameters['class_weights'] + + return depth, growth_rate, compression, bottleneck, drop_rate, class_weights + + +def load_hyperparameters_random_search(pathmaster): + hyperparameters_path = pathmaster.hyperparameters_path() + + # Extract model hyperparameters + model_hyperparameters_file = os.path.join(hyperparameters_path, 'hyperparameters_' + pathmaster.file_tag + '.csv') + model_hyperparameters = pd.read_csv(model_hyperparameters_file) + depth = int(model_hyperparameters['depth'].iloc[0]) + growth_rate = int(model_hyperparameters['growth_rate'].iloc[0]) + compression = model_hyperparameters['compression'].iloc[0] + bottleneck = model_hyperparameters['bottleneck'].iloc[0] + drop_rate = model_hyperparameters['drop_rate'].iloc[0] + class_weights = model_hyperparameters['class_weights'] + learning_rate = model_hyperparameters['learning_rate'].iloc[0] + num_dense_tran = int(model_hyperparameters['num_dense_tran'].iloc[0]) + lambda_l1 = model_hyperparameters['lambda_l1'].iloc[0] + activation = string_to_activation((model_hyperparameters['activation'].iloc[0])) + + return depth, growth_rate, compression, bottleneck, drop_rate, class_weights, learning_rate, num_dense_tran, lambda_l1, activation + + +def string_to_activation(activation_string): + activation_map = { + 'relu': nn.ReLU(), + 'leaky_relu': nn.LeakyReLU(), + 'sigmoid': nn.Sigmoid(), + 'tanh': nn.Tanh(), + 'softmax': nn.Softmax(), + 'softplus': nn.Softplus(), + 'softshrink': nn.Softshrink(), + 'softmin': nn.Softmin(), + 'log_softmax': nn.LogSoftmax(), + 'elu': nn.ELU(), + 'prelu': nn.PReLU(), + 'relu6': nn.ReLU6(), + 'rrelu': nn.RReLU(), + 'celu': nn.CELU(), + 'selu': nn.SELU(), + 'gelu': nn.GELU(), + 'silu': nn.SiLU(), + # Add more activation functions if needed + } + + return activation_map.get(activation_string, None) + + +def activation_to_string(activation_func): + activation_map = { + nn.ReLU: 'relu', + nn.LeakyReLU: 'leaky_relu', + nn.Sigmoid: 'sigmoid', + nn.Tanh: 'tanh', + nn.Softmax: 'softmax', + nn.Softplus: 'softplus', + nn.Softshrink: 'softshrink', + nn.Softmin: 'softmin', + nn.LogSoftmax: 'log_softmax', + nn.ELU: 'elu', + nn.PReLU: 'prelu', + nn.ReLU6: 'relu6', + nn.RReLU: 'rrelu', + nn.CELU: 'celu', + nn.SELU: 'selu', + nn.GELU: 'gelu', + nn.SiLU: 'silu', + # Add more activation functions if needed + } + + return activation_map.get(activation_func.__class__, 'unknown') + + +class EarlyStoppingCallback: + def __init__(self, patience=10): + self.patience = patience + self.best_loss = float('inf') + self.counter = 0 + self.best_epoch = 0 + + def __call__(self, epoch, current_loss): + if current_loss < self.best_loss: + self.best_loss = current_loss + self.counter = 0 + self.best_epoch = epoch + else: + self.counter += 1 + if self.counter >= self.patience: + print(f"\nEarly stopping at epoch {epoch}. 
No improvement for {self.patience} epochs.") + + return True + + return False \ No newline at end of file diff --git a/utils/pathmaster.py b/utils/pathmaster.py new file mode 100644 index 0000000..38c5718 --- /dev/null +++ b/utils/pathmaster.py @@ -0,0 +1,321 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 4 13:04:27 2024 + +@author: dchen +""" +import os + +class PathMaster(): + def __init__(self, is_linux=False, is_hpc=False, is_tfs=True, is_internal=False, is_external=False, focus='misc', file_tag='temp', img_res='not_an_img_res'): + self.focus = focus + self.file_tag = file_tag + self.is_linux = is_linux + self.is_hpc = is_hpc + self.is_tfs = is_tfs + self.is_internal = is_internal + self.is_external = is_external + self.img_res = img_res + + # Select correct root saves path + if self.is_linux: + if self.is_tfs: + self.saves_path = '/mnt/R/ENGR_Chon/Darren/Honors_Thesis/saves_tfs/' + self.focus + '/' + else: + self.saves_path = '/mnt/R/ENGR_Chon/Darren/Honors_Thesis/saves_poincare/' + self.focus + '/' + elif self.is_hpc: + if self.is_tfs: + self.saves_path = '/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/saves_tfs/' + self.focus + '/' + else: + self.saves_path = '/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/saves_poincare/' + self.focus + '/' + else: # Using your own computer + if self.is_tfs: + self.saves_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\saves_tfs' + '\\' + self.focus + '\\' + else: + self.saves_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\saves_poincare' + '\\' + self.focus + '\\' + + + def set_saves_path(self, saves_path): + self.saves_path = saves_path + + + def set_file_tag(self, file_tag): + self.file_tag = file_tag + + + def set_focus(self, focus): + self.focus = focus + + + def data_paths(self, data_format): + if data_format == 'pt': + # Base path + if self.is_linux: + base_path = "/mnt/R/ENGR_Chon/Darren/NIH_PulseWatch" + labels_base_path = "/mnt/R/ENGR_Chon/Darren/NIH_Pulsewatch" + # labels_base_path = "/mnt/R/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + elif self.is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + else: + if self.is_internal: + base_path = r'C:\\Chon_Lab\\NIH_Pulsewatch' + labels_base_path = r'C:\\Chon_Lab\\NIH_Pulsewatch' + elif self.is_external: + base_path = r'D:\\Chon_Lab\\NIH_Pulsewatch' + labels_base_path = r'D:\\Chon_Lab\\NIH_Pulsewatch' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\\ENGR_Chon\\Darren\\NIH_Pulsewatch" # Why double \\ before NIH_Pulsewatch_Database? + labels_base_path = "R:\\ENGR_Chon\\Darren\\NIH_Pulsewatch" # Why double \\ before NIH_Pulsewatch_Database? 
+ # labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" + + # Type path + if self.is_tfs: + format_path = 'TFS_pt' + else: + format_path = 'Poincare_pt' + + # Join paths + data_path = os.path.join(base_path, format_path, self.img_res) + + else: + if self.is_linux: + base_path = "/mnt/R/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" + labels_base_path = "/mnt/R/ENGR_Chon/Darren/NIH_Pulsewatch" + # labels_base_path = "/mnt/R/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + elif self.is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch" # Why double \\ before NIH_Pulsewatch_Database? + labels_base_path = "R:\ENGR_Chon\Darren\\NIH_Pulsewatch" # Why double \\ before NIH_Pulsewatch_Database? + # labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" + + if data_format == 'csv': + if self.is_tfs: + data_path = os.path.join(base_path, "TFS_csv") + else: + data_path = os.path.join(base_path, "Poincare_Density_csv") + elif data_format == 'png': + if not self.is_tfs: + print('No png image available for Density Poincare plot') + return + data_path = os.path.join(base_path, "TFS_plots") + else: + raise ValueError("Invalid data format. Choose 'csv', 'png, or 'pt'.") + + # Complete labels path + # labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm_2024_02_18_copy") + labels_path = os.path.join(labels_base_path, "Ground_Truths") + + # Check if directories exist + if not os.path.exists(data_path): + print("Data path does not exist") + return + if not os.path.exists(labels_path): + print("Labels path does not exist") + return + + return data_path, labels_path + + + def smote_path(self, smote_type, split): + if self.is_internal: + base_path = r'C:\Chon_Lab\NIH_Pulsewatch' + elif self.is_external: + base_path = r'D:\Chon_Lab\NIH_Pulsewatch' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\ENGR_Chon\Darren\\NIH_Pulsewatch" # Why double \\ before NIH_Pulsewatch_Database? 
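# smote_path() reuses the same base and format folders as data_paths(): the SMOTE-resampled tensors are expected
# under <base_path>/<TFS_pt|Poincare_pt>/<smote_type>/<split>.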
+ + # Type path + if self.is_tfs: + format_path = 'TFS_pt' + else: + format_path = 'Poincare_pt' + + smote_path = os.path.join(base_path, format_path, smote_type, split) + + return smote_path + + + def deepbeat_paths(self): + if self.is_internal: + base_path = r'C:\Chon_Lab\Public_Database\DeepBeat\Concatenated_DeepBeat\test\Darren_conversion' + elif self.is_external: + base_path = r'D:\Chon_Lab\Public_Database\DeepBeat\Concatenated_DeepBeat\test\Darren_conversion' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = r'R:\ENGR_Chon\Darren\Public_Database\DeepBeat\Concatenated_DeepBeat\test\Darren_conversion' + + # Type path + if self.is_tfs: + format_path = 'tfs_float16_pt' + else: + format_path = 'poincare_float16_pt' + + data_path = os.path.join(base_path, format_path) + labels_path = os.path.join(base_path, 'DeepBeat_segment_names_labels_STFT.csv') + + return data_path, labels_path + + + def mimic3_paths(self): + if self.is_internal: + base_path = r'C:\Chon_Lab\Public_Database\PPG_PeakDet_MIMICIII\Darren_conversion' + elif self.is_external: + base_path = r'D:\Chon_Lab\Public_Database\PPG_PeakDet_MIMICIII\Darren_conversion' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = r'R:\ENGR_Chon\Darren\Public_Database\PPG_PeakDet_MIMICIII\Darren_conversion' + + # Type path + if self.is_tfs: + format_path = 'test_tfs_float16_pt' + else: + format_path = 'test_poincare_float16_pt' + + data_path = os.path.join(base_path, format_path) + labels_path = os.path.join(base_path, '2020_Han_Sensors_MIMICIII_Ground_Truth_STFT.csv') + + return data_path, labels_path + + + def simband_paths(self): + if self.is_internal: + base_path = r'C:\Chon_Lab\Public_Database\PPG_PeakDet_Simband\Darren_conversion' + elif self.is_external: + base_path = r'D:\Chon_Lab\Public_Database\PPG_PeakDet_Simband\Darren_conversion' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = r'R:\ENGR_Chon\Darren\Public_Database\PPG_PeakDet_Simband\Darren_conversion' + + # Type path + if self.is_tfs: + format_path = 'tfs_float16_pt' + else: + format_path = 'poincare_float16_pt' + + data_path = os.path.join(base_path, format_path) + labels_path = os.path.join(base_path, 'simband_segments_labels_STFT.csv') + + return data_path, labels_path + + + def summary_path(self): + if self.is_linux: + summary_path = "/mnt/R/ENGR_Chon/Darren/NIH_Pulsewatch/labels_summary_2_18_Darren.csv" + elif self.is_hpc: + summary_path = "/gpfs/scratchfs1/hfp14002/dac20022/NIH_Pulsewatch/labels_summary_2_18_Darren.csv" + else: + if self.is_internal: + summary_path = r'C:\Chon_Lab\NIH_Pulsewatch\labels_summary_2_18_Darren.csv' + elif self.is_external: + summary_path = r'D:\Chon_Lab\NIH_Pulsewatch\labels_summary_2_18_Darren.csv' + else: + summary_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch\labels_summary_2_18_Darren.csv" + + return summary_path + + + def models_path(self): + if self.is_linux: + models_path = "/mnt/R/ENGR_Chon/Darren/Honors_Thesis/models" + elif self.is_hpc: + models_path = "/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/models" + else: + models_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\models" + + return models_path + + + def losslists_path(self): + losslists_path = self.saves_path + 'losslists' + + return losslists_path + + + def runtime_lists_path(self): + runtime_lists_path = self.saves_path + 'runtime_lists' + + return runtime_lists_path + + + def labels_path(self): + labels_path = self.saves_path + 
'labels' + + return labels_path + + + def predictions_path(self): + predictions_path = self.saves_path + 'predictions' + + return predictions_path + + + def prediction_proba_path(self): + prediction_proba_path = self.saves_path + 'prediction_proba' + + return prediction_proba_path + + + def metrics_path(self): + metrics_path = self.saves_path + 'metrics' + + return metrics_path + + + def classification_report_path(self): + classification_report_path = self.saves_path + 'classification_reports' + + return classification_report_path + + + def classification_report_imbalanced_path(self): + classification_report_imbalanced_path = self.saves_path + 'classification_reports_imbalanced' + + return classification_report_imbalanced_path + + + def confusion_matrices_path(self): + confusion_matrices_path = self.saves_path + 'confusion_matrices' + + return confusion_matrices_path + + + def checkpoints_path(self): + checkpoints_path = self.saves_path + 'checkpoints' + + return checkpoints_path + + + def hyperparameters_path(self): + hyperparameters_path = self.saves_path + 'hyperparameters' + + return hyperparameters_path + + + def loss_curves_path(self): + loss_curves_path = self.saves_path + 'loss_curves' + + return loss_curves_path + + + def roc_curves_path(self): + roc_curves_path = self.saves_path + 'roc_curves' + + return roc_curves_path + + + def mean_roc_curves_path(self): + mean_roc_curves_path = self.saves_path + 'mean_roc_curves' + + return mean_roc_curves_path + + + def accuracy_curves_path(self): + accuracy_curves_path = self.saves_path + 'accuracy_curves' + + return accuracy_curves_path \ No newline at end of file diff --git a/utils/plot_save_func.py b/utils/plot_save_func.py new file mode 100644 index 0000000..abe201e --- /dev/null +++ b/utils/plot_save_func.py @@ -0,0 +1,542 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Feb 29 12:06:14 2024 + +@author: dchen +""" +import matplotlib.pyplot as plt +import numpy as np +import os +import pandas as pd +from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve, auc, classification_report +from sklearn.preprocessing import label_binarize +from imblearn.metrics import classification_report_imbalanced + +# For increased csv speed +import pyarrow as pa +from pyarrow import csv + +def save_hyperparameters(hyperparameters, pathmaster): + hyperparameters_path = pathmaster.hyperparameters_path() + hyperparameters_path = os.path.join(hyperparameters_path, 'hyperparameters_' + pathmaster.file_tag + '.csv') + + # If there are class weights, make sure all other columns have same length + if hyperparameters['class_weights'] is not None: + # Update the dictionary + for key, value in hyperparameters.items(): + # If the length of the value is less than max_length + if key != 'class_weights': + # Fill missing values with np.nan + hyperparameters[key] = [value] + [np.nan] * (len(hyperparameters['class_weights']) - 1) + + hyperparameters = pd.DataFrame(hyperparameters) + hyperparameters.to_csv(hyperparameters_path, index=False) + + # # Using PyArrow (need each hyperparameter to be a list) + # hyperparameters_table = pa.Table.from_pydict(hyperparameters) + # csv.write_csv(hyperparameters_table, hyperparameters_path) + + +def save_losslists(losslist_train, losslist_val, pathmaster): # For holdout training and validation + losslists_path = pathmaster.losslists_path() + losslists_path = os.path.join(losslists_path, 'losslists_' + pathmaster.file_tag + '.csv') + # losslists = pd.DataFrame(dtype='float32') + # 
losslists['training'] = losslist_train + # losslists['validation'] = losslist_val + # losslists.to_csv(losslists_path, index=False, chunksize=500) + + # Using PyArrow + # losslists = { + # 'training': losslist_train, + # 'validation': losslist_val + # } + # losslists_table = pa.Table.from_pydict(losslists) + losslists = [np.array(losslist_train).reshape(-1).astype(np.float32), np.array(losslist_val).reshape(-1).astype(np.float32)] + losslists_names = ['training', 'validation'] + losslists_table = pa.Table.from_arrays(losslists, losslists_names) + csv.write_csv(losslists_table, losslists_path) + +def save_losslists_2fold(losslist_train_fold1, losslist_val_fold1, losslist_train_fold2, losslist_val_fold2, losslist_train, losslist_val, pathmaster): # For holdout training and validation + losslists_path = pathmaster.losslists_path() + losslists_path = os.path.join(losslists_path, 'losslists_' + pathmaster.file_tag + '.csv') + # losslists = pd.DataFrame(dtype='float32') + # losslists['training'] = losslist_train + # losslists['validation'] = losslist_val + # losslists.to_csv(losslists_path, index=False, chunksize=500) + + # Using PyArrow + # losslists = { + # 'training': losslist_train, + # 'validation': losslist_val + # } + # losslists_table = pa.Table.from_pydict(losslists) + losslists = [np.array(losslist_train_fold1).reshape(-1).astype(np.float32), np.array(losslist_val_fold1).reshape(-1).astype(np.float32), + np.array(losslist_train_fold2).reshape(-1).astype(np.float32), np.array(losslist_val_fold2).reshape(-1).astype(np.float32), + np.array(losslist_train).reshape(-1).astype(np.float32), np.array(losslist_val).reshape(-1).astype(np.float32)] + losslists_names = ['fold1_training', 'fold1_validation', 'fold2_training', 'fold2_validation', 'mean_training', 'mean_validation'] + losslists_table = pa.Table.from_arrays(losslists, losslists_names) + csv.write_csv(losslists_table, losslists_path) + + +def save_runtime_list(epoch_time_list, pathmaster): + # epoch_time_array = np.array(epoch_time_list).reshape(-1).astype(np.float32) + runtime_lists_path = pathmaster.runtime_lists_path() + runtime_lists_path = os.path.join(runtime_lists_path, 'runtime_lists_' + pathmaster.file_tag + '.csv') + # runtime_list = pd.DataFrame(dtype='float32') + # runtime_list['time_sec'] = epoch_time_list + # runtime_list.to_csv(runtime_lists_path, index=False, chunksize=500) + + # Using PyArrow + runtime_dict = {'epoch_time_sec': epoch_time_list, + 'mean_time_sec': [sum(epoch_time_list)/len(epoch_time_list)] + [np.nan] * (len(epoch_time_list) - 1)} + runtime_table = pa.Table.from_pydict(runtime_dict) + # runtime_table = pa.Table.from_arrays([epoch_time_array, np.array([np.mean(epoch_time_array)])], names=['epoch_time_sec', 'mean_time_sec']) + csv.write_csv(runtime_table, runtime_lists_path) + + +def save_labels(labels, pathmaster): + labels = labels.astype(np.int8) + labels_path = pathmaster.labels_path() + labels_path = os.path.join(labels_path, 'labels_' + pathmaster.file_tag + '.csv') + # labels = pd.DataFrame(np.array(labels), dtype='int') + # labels.to_csv(labels_path, index=False, chunksize=500) + + # Using PyArrow + # labels_dict = {'labels': labels.reshape(-1)} # Convert to 1D array + # labels_table = pa.Table.from_pydict(labels_dict) + labels_table = pa.Table.from_arrays([labels.reshape(-1)], names=['labels']) + csv.write_csv(labels_table, labels_path) + + +def save_predictions(predictions, pathmaster): + predictions = predictions.astype(np.int8) + predictions_path = pathmaster.predictions_path() + 
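The save_losslists/save_runtime_list/save_labels/save_predictions helpers all follow the same PyArrow pattern in place of pandas.DataFrame.to_csv: wrap one or more 1-D numpy arrays in a pyarrow.Table and write it with pyarrow.csv.write_csv, which is faster for these simple numeric columns. A minimal standalone sketch of that pattern (hypothetical output file name, no PathMaster involved):

import numpy as np
import pyarrow as pa
from pyarrow import csv

predictions = np.array([0, 1, 2, 1, 0], dtype=np.int8)  # toy class predictions

# One named column per array; the names become the CSV header row.
table = pa.Table.from_arrays([predictions], names=['predictions'])
csv.write_csv(table, 'predictions_example.csv')  # hypothetical path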
predictions_path = os.path.join(predictions_path, 'predictions_' + pathmaster.file_tag + '.csv') + # predictions = pd.DataFrame(np.array(predictions), dtype='int') + # predictions.to_csv(predictions_path, index=False, chunksize=500) + + # Using PyArrow + # predictions_dict = {'predictions': predictions.reshape(-1)} # Convert to 1D array + # predictions_table = pa.Table.from_pydict(predictions_dict) + predictions_table = pa.Table.from_arrays([predictions.reshape(-1)], names=['predictions']) + csv.write_csv(predictions_table, predictions_path) + + +def save_prediction_proba(prediction_proba, pathmaster): + prediction_proba = prediction_proba.astype(np.float32) + prediction_proba_path = pathmaster.prediction_proba_path() + prediction_proba_path = os.path.join(prediction_proba_path, 'prediction_proba_' + pathmaster.file_tag + '.csv') + # prediction_proba = pd.DataFrame(np.array(prediction_proba), dtype='float32') + # prediction_proba.to_csv(prediction_proba_path, index=False, chunksize=500) + + # Using PyArrow + # # Create PyArrow arrays with specific data type (float64) + # prediction_proba_dict = { + # '0': prediction_proba[:,0], + # '1': prediction_proba[:,1], + # '2': prediction_proba[:,2] + # } + + # Create a PyArrow table + # prediction_proba_Table = pa.Table.from_pydict(prediction_proba_dict) + # col_arrays = [prediction_proba[:,0], prediction_proba[:,1]] + # prediction_proba_Table = pa.Table.from_arrays(col_arrays, names=['0', '1']) + # csv.write_csv(prediction_proba_Table, prediction_proba_path) + col_arrays = [prediction_proba[:,0], prediction_proba[:,1], prediction_proba[:,2]] + prediction_proba_Table = pa.Table.from_arrays(col_arrays, names=['0', '1', '2']) + csv.write_csv(prediction_proba_Table, prediction_proba_path) + + +def save_prediction_proba_binary(prediction_proba, pathmaster): + prediction_proba = prediction_proba.astype(np.float32) + prediction_proba_path = pathmaster.prediction_proba_path() + prediction_proba_path = os.path.join(prediction_proba_path, 'prediction_proba_' + pathmaster.file_tag + '.csv') + # prediction_proba = pd.DataFrame(np.array(prediction_proba), dtype='float32') + # prediction_proba.to_csv(prediction_proba_path, index=False, chunksize=500) + + # Using PyArrow + # # Create PyArrow arrays with specific data type (float64) + # prediction_proba_dict = { + # '0': prediction_proba[:,0], + # '1': prediction_proba[:,1], + # '2': prediction_proba[:,2] + # } + + # Create a PyArrow table + # prediction_proba_Table = pa.Table.from_pydict(prediction_proba_dict) + # col_arrays = [prediction_proba[:,0], prediction_proba[:,1]] + # prediction_proba_Table = pa.Table.from_arrays(col_arrays, names=['0', '1']) + # csv.write_csv(prediction_proba_Table, prediction_proba_path) + col_arrays = [prediction_proba[:,0], prediction_proba[:,1]] + prediction_proba_Table = pa.Table.from_arrays(col_arrays, names=['0', '1']) + csv.write_csv(prediction_proba_Table, prediction_proba_path) + + +def metrics(Y_true, Y_pred, Y_proba, save=False, pathmaster=None): + averages = ['micro', 'macro', 'weighted'] + accuracy_list = [] + precision_list = [] + recall_list = [] + f1_list = [] + auc_list = [] + + for average in averages: + accuracy = accuracy_score(Y_true, Y_pred) + precision, recall, f1, _ = precision_recall_fscore_support(Y_true, Y_pred, average=average) + auc = roc_auc_score(Y_true, Y_proba, average=average, multi_class='ovr') + + accuracy_list.append(accuracy) + precision_list.append(precision) + recall_list.append(recall) + f1_list.append(f1) + auc_list.append(auc) + + metrics = { 
+ 'accuracy': accuracy_list, + 'precision': precision_list, + 'recall': recall_list, + 'f1': f1_list, + 'auc': auc_list + } + + if save: + metrics_path = pathmaster.metrics_path() + metrics_path = os.path.join(metrics_path, 'metrics_' + pathmaster.file_tag + '.csv') + # metrics = pd.DataFrame(metrics, index=[0], dtype='float32') + # metrics.to_csv(metrics_path, index=False) + + # Using PyArrow + metrics_table = pa.Table.from_pydict(metrics) + csv.write_csv(metrics_table, metrics_path) + + +def metrics_binary(Y_true, Y_pred, Y_proba, save=False, pathmaster=None): + averages = ['micro', 'macro', 'weighted'] + accuracy_list = [] + precision_list = [] + recall_list = [] + f1_list = [] + auc_list = [] + + for average in averages: + accuracy = accuracy_score(Y_true, Y_pred) + precision, recall, f1, _ = precision_recall_fscore_support(Y_true, Y_pred, average=average) + auc = roc_auc_score(Y_true, Y_proba[:,1], average=average) + + accuracy_list.append(accuracy) + precision_list.append(precision) + recall_list.append(recall) + f1_list.append(f1) + auc_list.append(auc) + + metrics = { + 'accuracy': accuracy_list, + 'precision': precision_list, + 'recall': recall_list, + 'f1': f1_list, + 'auc': auc_list + } + + if save: + metrics_path = pathmaster.metrics_path() + metrics_path = os.path.join(metrics_path, 'metrics_' + pathmaster.file_tag + '.csv') + # metrics = pd.DataFrame(metrics, index=[0], dtype='float32') + # metrics.to_csv(metrics_path, index=False) + + # Using PyArrow + metrics_table = pa.Table.from_pydict(metrics) + csv.write_csv(metrics_table, metrics_path) + + +def metrics_2fold(Y_true_fold1, Y_true_fold2, Y_pred_fold1, Y_pred_fold2, Y_proba_fold1, Y_proba_fold2, save=False, pathmaster=None): + accuracy_fold1 = accuracy_score(Y_true_fold1, Y_pred_fold1) + precision_fold1, recall_fold1, f1_fold1, _ = precision_recall_fscore_support(Y_true_fold1, Y_pred_fold1, average='weighted') + auc_fold1 = roc_auc_score(Y_true_fold1, Y_proba_fold1, average='weighted', multi_class='ovr') + + accuracy_fold2 = accuracy_score(Y_true_fold2, Y_pred_fold2) + precision_fold2, recall_fold2, f1_fold2, _ = precision_recall_fscore_support(Y_true_fold2, Y_pred_fold2, average='weighted') + auc_fold2 = roc_auc_score(Y_true_fold2, Y_proba_fold2, average='weighted', multi_class='ovr') + + accuracy = accuracy_score(np.concatenate((Y_true_fold1,Y_true_fold2), axis=0), np.concatenate((Y_pred_fold1,Y_pred_fold2), axis=0)) + precision, recall, f1, _ = precision_recall_fscore_support(np.concatenate((Y_true_fold1,Y_true_fold2), axis=0), np.concatenate((Y_pred_fold1,Y_pred_fold2), axis=0), average='weighted') + auc = roc_auc_score(np.concatenate((Y_true_fold1,Y_true_fold2), axis=0), np.concatenate((Y_proba_fold1,Y_proba_fold2), axis=0), average='weighted', multi_class='ovr') + + metrics = { + 'accuracy': [accuracy_fold1, accuracy_fold2, accuracy], + 'precision': [precision_fold1, precision_fold2, precision], + 'recall': [recall_fold1, recall_fold2, recall], + 'f1': [f1_fold1, f1_fold2, f1], + 'auc': [auc_fold1, auc_fold2, auc] + } + + if save: + metrics_path = pathmaster.metrics_path() + metrics_path = os.path.join(metrics_path, 'metrics_' + pathmaster.file_tag + '.csv') + # metrics = pd.DataFrame(metrics, index=[0], dtype='float32') + # metrics.to_csv(metrics_path, index=False) + + # Using PyArrow + metrics_table = pa.Table.from_pydict(metrics) + csv.write_csv(metrics_table, metrics_path) + + +def save_classification_report(Y_true, Y_pred, save=False, pathmaster=None): + report = classification_report(Y_true, Y_pred, 
output_dict=True) + row_labels = ['precision', 'recall', 'f1', 'support'] + + if save: + classification_report_path = pathmaster.classification_report_path() + classification_report_path = os.path.join(classification_report_path, 'classification_report_' + pathmaster.file_tag + '.csv') + report = pd.DataFrame(report) + # report.reset_index(inplace=True) + report.insert(loc=0, column='metrics', value=row_labels) + report.to_csv(classification_report_path, index=False) + + # # Using PyArrow + # report_table = pa.Table.from_pydict(report) + # csv.write_csv(report_table, classification_report_path) + + +def save_classification_report_imbalanced(Y_true, Y_pred, save=False, pathmaster=None): + report_imbalanced = classification_report_imbalanced(Y_true, Y_pred, output_dict=True) + row_labels = ['precision', 'recall', 'specificity', 'f1', 'geo mean', 'iba', 'support'] + + if save: + classification_report_imbalanced_path = pathmaster.classification_report_imbalanced_path() + classification_report_imbalanced_path = os.path.join(classification_report_imbalanced_path, 'classification_report_imbalanced_' + pathmaster.file_tag + '.csv') + report_imbalanced = pd.DataFrame(report_imbalanced) + # report_imbalanced.reset_index(inplace=True) + report_imbalanced.insert(loc=0, column='metrics', value=row_labels) + report_imbalanced.to_csv(classification_report_imbalanced_path, index=False) + + # # Using PyArrow + # report_imbalanced_table = pa.Table.from_pydict(report_imbalanced) + # csv.write_csv(report_imbalanced_table, classification_report_imbalanced_path) + + +def roc_curves(y_test, y_prob, save=False, pathmaster=None, class_names=['NSR', 'AF', 'PAC/PVC']): + # Get the unique class labels + classes = np.unique(y_test) + + if class_names is None: + class_names = np.unique(y_test) + + # Convert labels to binary matrix + y_bin = label_binarize(y_test, classes=classes) + + # Pre-allocate arrays for ROC curves + fpr_mean = np.linspace(0, 1, 100) + tpr_mean = [] + fpr = [] + tpr = [] + AUC = [] + + # Calculate ROC curves for each class + for i, class_label in enumerate(classes): + fpr_i, tpr_i, _ = roc_curve(y_bin[:, i], y_prob[:, i]) + AUC.append(roc_auc_score(y_bin[:, i], y_prob[:, i])) + fpr.append(fpr_i) + tpr.append(tpr_i) + + # Interpolate TPR for mean ROC curve + tpr_mean.append(np.interp(fpr_mean, fpr_i, tpr_i)) + + # Calculate mean and standard deviation for TPR and AUC + tpr_mean = np.mean(np.array(tpr_mean).reshape(len(classes), -1), axis=0) + tpr_stdv = np.std(tpr_mean, axis=0) + mean_auc = auc(fpr_mean, tpr_mean) + std_auc = np.std(AUC) + + # Create the plot + plt.figure(figsize=(12, 9)) + plt.clf() + plt.plot([0, 1], [0, 1], 'k--') + plt.axis([0, 1, 0, 1]) + plt.xlabel('False Positive Rate', fontsize=16) + plt.ylabel('True Positive Rate', fontsize=16) + plt.title('ROC Curves (' + pathmaster.file_tag + ')', fontweight='bold') + + # Plot individual ROC curves + for i in range(len(classes)): + label_str = f"ROC Label {class_names[i]} (AUC = {AUC[i]:.3f})" + plt.plot(fpr[i], tpr[i], linewidth=3, label=label_str) + + # Plot mean ROC curve with standard deviation + plt.plot(fpr_mean, tpr_mean, color='k', label=rf"Mean ROC (AUC = {mean_auc:.3f} $\pm$ {std_auc:.3f})", linewidth=5) + plt.fill_between(fpr_mean, np.maximum(tpr_mean - tpr_stdv, 0), np.minimum(tpr_mean + tpr_stdv, 1), color='grey', alpha=0.2, label=r"$\pm$ 1 std. 
dev.") + + plt.legend(loc="lower right") + + if save: + roc_curves_path = pathmaster.roc_curves_path() + roc_curves_path = os.path.join(roc_curves_path, 'roc_curves_' + pathmaster.file_tag + '.jpg') + plt.savefig(roc_curves_path, dpi=150) + + +def roc_curves_binary(y_test, y_prob, save=False, pathmaster=None, class_names=['Negative', 'Positive']): + y_prob = y_prob[:,1] + # Convert labels to binary matrix + y_bin = label_binarize(y_test, classes=np.unique(y_test)) + + # Pre-allocate arrays for ROC curves + fpr_mean = np.linspace(0, 1, 100) + tpr_mean = [] + fpr = [] + tpr = [] + AUC = [] + + # Calculate ROC curve for the positive class + fpr, tpr, _ = roc_curve(y_bin, y_prob) + AUC = roc_auc_score(y_bin, y_prob) + + # Create the plot + plt.figure(figsize=(12, 9)) + plt.plot([0, 1], [0, 1], 'k--') + plt.plot(fpr, tpr, linewidth=3, label=f'ROC Curve (AUC = {AUC:.3f})') + plt.axis([0, 1, 0, 1]) + plt.xlabel('False Positive Rate', fontsize=16) + plt.ylabel('True Positive Rate', fontsize=16) + plt.title('ROC Curve', fontweight='bold') + plt.legend(loc="lower right") + + if save: + roc_curves_path = pathmaster.roc_curves_path() + roc_curves_path = os.path.join(roc_curves_path, 'roc_curves_' + pathmaster.file_tag + '.jpg') + plt.savefig(roc_curves_path, dpi=150) + + +def mean_roc_curves(Y_tests, Y_probas, clf_names, save=False, pathmaster=None): + # Pre-allocate arrays for ROC curves + fpr_mean = np.linspace(0, 1, 100) + # tpr_mean = np.zeros_like(fpr_mean) + + # Set figure size + plt.figure(figsize=(12,9)) + + # Plot individual mean ROC curves for each classifier + for y_test, y_prob, clf_name in zip(Y_tests, Y_probas, clf_names): + # Get the unique class labels + classes = np.unique(y_test) + + # Convert labels to binary matrix + y_bin = label_binarize(y_test, classes=classes) + + # Pre-allocate arrays for ROC curves + fpr = [] + tpr = [] + AUC = [] + + # Calculate ROC curves for each class + for i, class_label in enumerate(classes): + fpr_i, tpr_i, _ = roc_curve(y_bin[:, i], y_prob[:, i]) + AUC.append(roc_auc_score(y_bin[:, i], y_prob[:, i])) + fpr.append(fpr_i) + tpr.append(tpr_i) + + # Interpolate TPR for mean ROC curve + tpr_interp = [np.interp(fpr_mean, fpr_i, tpr_i) for fpr_i, tpr_i in zip(fpr, tpr)] + tpr_mean = np.mean(tpr_interp, axis=0) + + # Plot mean ROC curve + plt.plot(fpr_mean, tpr_mean, label=f"{clf_name} - Mean ROC (AUC = {auc(fpr_mean, tpr_mean):.3f} $\pm$ {np.std(AUC):.3f})", linewidth=2) + + # Additional plot configurations + plt.plot([0, 1], [0, 1], 'k--') + plt.axis([0, 1, 0, 1]) + plt.xlabel('False Positive Rate', fontsize=12) + plt.ylabel('True Positive Rate', fontsize=12) + plt.title('Mean ROC Curve(s)', fontweight='bold') + plt.legend(loc="lower right") + # plt.show() + + if save: + mean_roc_curves_path = pathmaster.mean_roc_curves_path() + mean_roc_curves_path = os.path.join(mean_roc_curves_path, 'mean_roc_curves_' + pathmaster.file_tag + '.jpg') + plt.savefig(mean_roc_curves_path, dpi=150) + + +def conf_matrix(conf_matrix, title='Confusion Matrix', save=False, pathmaster=None, class_names=['NSR', 'AF', 'PAC/PVC']): + title = title + ' (' + pathmaster.file_tag + ')' + conf_matrix_norm = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis] # Normalize + + plt.figure(figsize=(10, 8)) # Adjust the figure size as per your preference + plt.imshow(conf_matrix_norm, interpolation='nearest', cmap=plt.cm.Blues, vmin=0.0, vmax=1.0) + plt.title(title, fontweight='bold') + plt.colorbar() + tick_marks = np.arange(len(conf_matrix)) + + if class_names is not None: + 
tick_marks = np.arange(len(class_names)) + plt.xticks(tick_marks, class_names) + plt.yticks(tick_marks, class_names) + else: + tick_marks = np.arange(len(conf_matrix)) + plt.xticks(tick_marks, tick_marks) + plt.yticks(tick_marks, tick_marks) + + plt.xlabel('Predicted label') + plt.ylabel('True label') + + # Add counts and percentages in each box + for i in range(conf_matrix.shape[0]): + for j in range(conf_matrix.shape[1]): + percentage = conf_matrix_norm[i, j] * 100 + count = int(conf_matrix[i, j]) + # text_color = 'black' if conf_matrix[i, j] < np.max(conf_matrix) / 1.5 else 'white' + text_color = 'black' if percentage < 80 else 'white' + plt.text(j, i, "{:.2f}%\n{}".format(percentage, count), + horizontalalignment="center", + verticalalignment="center", + color=text_color) + + if save: + confusion_matrices_path = pathmaster.confusion_matrices_path() + confusion_matrices_path = os.path.join(confusion_matrices_path, 'confusion_matrix_' + pathmaster.file_tag + '.jpg') + plt.savefig(confusion_matrices_path, dpi=200) + + # plt.show() + + +def train_val_loss(losslist_train, losslist_val, title='Training and Validation Loss', save=False, pathmaster=None): + title = title + ' (' + pathmaster.file_tag + ')' + plt.figure(figsize=(12, 8)) + plt.plot(range(len(losslist_train)), losslist_train, label='training') + plt.plot(range(len(losslist_val)), losslist_val, label='validation') + plt.legend() + plt.title(title, fontweight='bold') + plt.xlabel('Epochs') + plt.ylabel('Loss') + + if save: + loss_curves_path = pathmaster.loss_curves_path() + loss_curves_path = os.path.join(loss_curves_path, 'loss_curve_' + pathmaster.file_tag + '.jpg') + plt.savefig(loss_curves_path, dpi=150) + + # plt.show() + +def accuracy_curves(Y_true_train, Y_true_val, Y_pred_train, Y_pred_val, title='Training and Validation Accuracy', save=False, pathmaster=None): + accuracy_list_train = [] + accuracy_list_val = [] + epochs_train = range(len(Y_pred_train)) + epochs_val = range(len(Y_pred_val)) + + for predictions in Y_pred_train: + accuracy = accuracy_score(Y_true_train, predictions) + accuracy_list_train.append(accuracy) + for predictions in Y_pred_val: + accuracy = accuracy_score(Y_true_val, predictions) + accuracy_list_val.append(accuracy) + + title = title + ' (' + pathmaster.file_tag + ')' + plt.figure(figsize=(12, 8)) + plt.plot(epochs_train, accuracy_list_train, label='training') + plt.plot(epochs_val, accuracy_list_val, label='validation') + plt.legend() + plt.title(title, fontweight='bold') + plt.xlabel('Epochs') + plt.ylabel('Accuracy') + + if save: + accuracy_curves_path = pathmaster.accuracy_curves_path() + accuracy_curves_path = os.path.join(accuracy_curves_path, 'accuracy_curve_' + pathmaster.file_tag + '.jpg') + plt.savefig(accuracy_curves_path, dpi=150) \ No newline at end of file diff --git a/utils/smote.py b/utils/smote.py new file mode 100644 index 0000000..890d2fb --- /dev/null +++ b/utils/smote.py @@ -0,0 +1,158 @@ +import torch +import torch.nn as nn +import torchvision.transforms as transforms +import os +import csv +from imblearn.over_sampling import SMOTE +import numpy as np +from tqdm import tqdm +import pandas as pd +from concurrent.futures import ProcessPoolExecutor + +import sys +sys.path.append('R:\ENGR_Chon\Darren\Honors_Thesis') + +# Import my own functions and classes +from utils.pathmaster import PathMaster +from utils import dataloader + +def apply_cassey_smote(data, labels): + cassey_smote = SMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5) + data_resampled, 
labels_resampled = cassey_smote.fit_resample(data, labels) + return data_resampled, labels_resampled + +def save_image(i, image, group, save_dir): + # Generate a unique file name with zero-padding + file_name = f'{i+1:06d}' + '_' + group + '_tfs' + + # Convert the image to a PyTorch tensor + tensor_image = torch.tensor(image).to(dtype=torch.float16) + + # Save the tensor to a .pt file + torch.save(tensor_image, os.path.join(save_dir, file_name + '.pt')) + + return file_name + +def save_images_parallel(data_resampled, group, save_dir): + file_names = [] + with ProcessPoolExecutor() as executor: + results = [executor.submit(save_image, i, image, group, save_dir) for i, image in enumerate(data_resampled)] + for future in results: + file_names.append(future.result()) + return file_names + +def main(): + # Initialize save location specifics + smote_type = 'Cassey_SMOTE' + split = '2foldCV_60_40' + groups = ['fold1', 'fold2', 'test'] + + # Device and drives + is_linux = False + is_hpc = False + is_internal = True + is_external = False + + # Input + is_tfs = True + + # Intialize the focus + focus = 'misc' + + # Initialize the file tag + file_tag = 'temp' + + # Image resolution + img_res = '128x128_float16' + + # Data type: the type to convert the data into when it is loaded in + data_type = torch.float32 + + # Create a PathMaster object + pathmaster = PathMaster(is_linux, is_hpc, is_tfs, is_internal, is_external, focus, file_tag, img_res) + + # Image dimensions + img_channels = 1 + img_size = 128 + downsample = None + standardize = None + + # Split UIDs + # train_set, val_set, test_set = dataloader.split_uids(pathmaster) + cross_val_fold1, cross_val_fold2, test_set = dataloader.split_uids_2fold_60_40_smote(pathmaster) + # train_set, val_set, test_set = dataloader.split_uids_60_10_30(pathmaster) + + # Preprocess data + data_format = 'pt' + batch_size = 256 + + # train_loader, val_loader, _ = dataloader.preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, + # batch_size, standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + fold1_loader, fold2_loader, test_loader = dataloader.preprocess_data(data_format, cross_val_fold1, cross_val_fold2, test_set, batch_size, + standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + # train_loader, val_loader, test_loader = dataloader.preprocess_data(data_format, train_set, val_set, test_set, + # batch_size, standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + data_loaders = [fold1_loader, fold2_loader, test_loader] + print() + sys.stdout.flush() + for data_loader, group in tqdm(zip(data_loaders,groups), total=len(data_loaders), desc='SMOTE', unit='Data Loader', leave=False): + sys.stderr.flush() + + # Define your original data and labels + data = np.empty((0,img_size*img_size)) + labels = np.empty((0,1)) + + sys.stdout.flush() + + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Loading', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[-1] * data_batch['data'].shape[-1]).numpy() + Y = data_batch['label'].numpy().reshape(-1,1) + + data = np.concatenate((data, X), axis=0) + labels = np.concatenate((labels, Y), axis=0) + + sys.stderr.flush() + print('\nData shape:', data.shape) + print('Labels shape:', labels.shape) + sys.stdout.flush() + + if group != 'test': + # SMOTE + data_resampled, labels_resampled = 
apply_cassey_smote(data, labels) + data_resampled = data_resampled.reshape(len(data_resampled), img_channels, img_size, img_size) + sys.stderr.flush() + print('\nResampled Data shape:', data_resampled.shape) + print('Resampled Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + else: + data_resampled = data + data_resampled = data_resampled.reshape(len(data_resampled), img_channels, img_size, img_size) + labels_resampled = labels + sys.stderr.flush() + print('\nResampled Data shape:', data_resampled.shape) + print('Resampled Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + + # Define a directory to save the images + # save_dir = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch', smote_type, split, group) + save_dir = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch', smote_type, split, group) + os.makedirs(save_dir, exist_ok=True) + + file_names = save_images_parallel(data_resampled, group, save_dir) + + # Ground truths + data_labels = pd.DataFrame({ + 'segment_name': file_names, + 'label': labels_resampled + }) + + csv_file_name = os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch', smote_type, split, smote_type + '_' + group + '_names_labels.csv') + data_labels.to_csv(csv_file_name, index=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/utils/smote_accelerated.py b/utils/smote_accelerated.py new file mode 100644 index 0000000..555f8cc --- /dev/null +++ b/utils/smote_accelerated.py @@ -0,0 +1,178 @@ +import torch +import torch.nn as nn +import torchvision.transforms as transforms +import os +import csv +from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN +import numpy as np +from tqdm import tqdm +import pandas as pd +from concurrent.futures import ProcessPoolExecutor + +import sys +sys.path.append('R:\ENGR_Chon\Darren\Honors_Thesis') + +# Import my own functions and classes +from utils.pathmaster import PathMaster +from utils import dataloader + +def apply_cassey_smote(data, labels): + cassey_smote = SMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5) + data_resampled, labels_resampled = cassey_smote.fit_resample(data, labels) + return data_resampled, labels_resampled + +def apply_borderline_smote(data, labels): + borderline_smote = BorderlineSMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5) + data_resampled, labels_resampled = borderline_smote.fit_resample(data, labels) + return data_resampled, labels_resampled + +def apply_adasyn(data, labels): + adasyn = ADASYN(random_state=42,sampling_strategy='not majority',n_neighbors=5) + data_resampled, labels_resampled = adasyn.fit_resample(data, labels) + return data_resampled, labels_resampled + +def save_image(i, image, group, save_dir): + + # Generate a unique file name with zero-padding + file_name = f'{i+1:06d}' + '_' + group + '_tfs' + + # Convert the image to a PyTorch tensor + tensor_image = torch.tensor(image).to(dtype=torch.float16) + tensor_image = tensor_image.reshape(tensor_image.size()[-2], tensor_image.size()[-2]) + + + # Save the tensor to a .pt file + torch.save(tensor_image, os.path.join(save_dir, file_name + '.pt')) + + return file_name + +def save_images_parallel(data_resampled, group, save_dir): + file_names = [] + with ProcessPoolExecutor() as executor: + results = [executor.submit(save_image, i, image, group, save_dir) for i, image in enumerate(data_resampled)] + for future in results: + file_names.append(future.result()) + return file_names + 
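The three apply_* wrappers above differ only in which imbalanced-learn resampler they construct (SMOTE, BorderlineSMOTE, or ADASYN); each takes flattened images of shape (n_samples, n_features) plus integer labels and returns an oversampled copy of both. A small self-contained sketch on synthetic data (toy shapes and class counts, purely illustrative):

import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE

rng = np.random.default_rng(42)

# Toy "images": 6 minority-class and 20 majority-class samples, flattened to 16 features.
X = rng.random((26, 16)).astype(np.float32)
y = np.array([1] * 6 + [0] * 20)

# Same arguments as apply_cassey_smote; k_neighbors=5 requires at least 6 minority samples.
smote = SMOTE(random_state=42, sampling_strategy='not majority', k_neighbors=5)
X_res, y_res = smote.fit_resample(X, y)

print(Counter(y))      # 20 majority vs. 6 minority before resampling
print(Counter(y_res))  # minority class synthesized up to the majority count (20 vs. 20)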
+def main(): + # Initialize save location specifics + # smote_type = 'Cassey_SMOTE' + smote_type = 'Borderline_SMOTE' + # smote_type = 'ADASYN' + + # split = '2foldCV_60_40' + split = 'holdout_60_10_30' + + # groups = ['fold1', 'fold2', 'test'] + groups = ['train', 'validate', 'test'] + + # Device and drives + is_linux = False + is_hpc = False + is_internal = True + is_external = False + + # Input + is_tfs = True + + # Intialize the focus + focus = 'misc' + + # Initialize the file tag + file_tag = 'temp' + + # Image resolution + img_res = '128x128_float16' + + # Data type: the type to convert the data into when it is loaded in + data_type = torch.float32 + + # Create a PathMaster object + pathmaster = PathMaster(is_linux, is_hpc, is_tfs, is_internal, is_external, focus, file_tag, img_res) + + # Image dimensions + img_channels = 1 + img_size = 128 + downsample = None + standardize = None + + # Split UIDs + # cross_val_fold1, cross_val_fold2, test_set = dataloader.split_uids_2fold_60_40_smote(pathmaster) + train_set, val_set, test_set = dataloader.split_uids_60_10_30_smote(pathmaster) + + # Preprocess data + data_format = 'pt' + batch_size = 256 + + # fold1_loader, fold2_loader, test_loader = dataloader.preprocess_data(data_format, cross_val_fold1, cross_val_fold2, test_set, batch_size, + # standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + # data_loaders = [fold1_loader, fold2_loader, test_loader] + + train_loader, val_loader, test_loader = dataloader.preprocess_data(data_format, train_set, val_set, test_set, + batch_size, standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + data_loaders = [train_loader, val_loader, test_loader] + print() + sys.stdout.flush() + for data_loader, group in tqdm(zip(data_loaders,groups), total=len(data_loaders), desc='SMOTE', unit='Data Loader', leave=False): + sys.stderr.flush() + + # Define your original data and labels + data = np.empty((0,img_size*img_size)) + labels = np.empty((0,1)) + + sys.stdout.flush() + + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Loading', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[-1] * data_batch['data'].shape[-1]).numpy() + Y = data_batch['label'].numpy().reshape(-1,1) + + data = np.concatenate((data, X), axis=0) + labels = np.concatenate((labels, Y), axis=0) + + sys.stderr.flush() + print('\nData shape:', data.shape) + print('Labels shape:', labels.shape) + sys.stdout.flush() + + if group != 'test': + # SMOTE + # data_resampled, labels_resampled = apply_cassey_smote(data, labels) + data_resampled, labels_resampled = apply_borderline_smote(data, labels) + # data_resampled, labels_resampled = apply_adasyn(data, labels) + data_resampled = data_resampled.reshape(len(data_resampled), img_channels, img_size, img_size) + sys.stderr.flush() + print('\nResampled Data shape:', data_resampled.shape) + print('Resampled Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + else: + data_resampled = data + data_resampled = data_resampled.reshape(len(data_resampled), img_channels, img_size, img_size) + labels_resampled = labels + sys.stderr.flush() + print('\nResampled Data shape:', data_resampled.shape) + print('Resampled Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + + # Define a directory to save the images + # save_dir = 
os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch', smote_type, split, group) + save_dir = os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch\TFS_pt', smote_type, split, group) + os.makedirs(save_dir, exist_ok=True) + + file_names = save_images_parallel(data_resampled, group, save_dir) + + # Ground truths + data_labels = pd.DataFrame({ + 'segment_name': file_names, + 'label': labels_resampled.reshape(-1) + }) + + csv_file_name = os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch\TFS_pt', smote_type, split, smote_type + '_' + group + '_names_labels.csv') + data_labels.to_csv(csv_file_name, index=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/utils/smote_accelerated_lab.py b/utils/smote_accelerated_lab.py new file mode 100644 index 0000000..90201ba --- /dev/null +++ b/utils/smote_accelerated_lab.py @@ -0,0 +1,177 @@ +import torch +import torch.nn as nn +import torchvision.transforms as transforms +import os +import csv +from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN +import numpy as np +from tqdm import tqdm +import pandas as pd +from concurrent.futures import ProcessPoolExecutor + +import sys +sys.path.append('R:\ENGR_Chon\Darren\Honors_Thesis') + +# Import my own functions and classes +from utils.pathmaster import PathMaster +from utils import dataloader + +def apply_cassey_smote(data, labels): + cassey_smote = SMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5) + data_resampled, labels_resampled = cassey_smote.fit_resample(data, labels) + return data_resampled, labels_resampled + +def apply_borderline_smote(data, labels): + borderline_smote = BorderlineSMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5) + data_resampled, labels_resampled = borderline_smote.fit_resample(data, labels) + return data_resampled, labels_resampled + +def apply_adasyn(data, labels): + adasyn = ADASYN(random_state=42,sampling_strategy='not majority',n_neighbors=4) + data_resampled, labels_resampled = adasyn.fit_resample(data, labels) + return data_resampled, labels_resampled + +def save_image(i, image, group, save_dir): + + # Generate a unique file name with zero-padding + file_name = f'{i+1:06d}' + '_' + group + '_tfs' + + # Convert the image to a PyTorch tensor + tensor_image = torch.tensor(image).to(dtype=torch.float16) + tensor_image = tensor_image.reshape(tensor_image.size()[-2], tensor_image.size()[-2]) + + # Save the tensor to a .pt file + torch.save(tensor_image, os.path.join(save_dir, file_name + '.pt')) + + return file_name + +def save_images_parallel(data_resampled, group, save_dir): + file_names = [] + with ProcessPoolExecutor() as executor: + results = [executor.submit(save_image, i, image, group, save_dir) for i, image in enumerate(data_resampled)] + for future in results: + file_names.append(future.result()) + return file_names + +def main(): + # Initialize save location specifics + # smote_type = 'Cassey4k_SMOTE' + # smote_type = 'Borderline5k_SMOTE' + smote_type = 'ADASYN6k' + + # split = '2foldCV_60_40' + split = 'holdout_60_10_30' + + # groups = ['fold1', 'fold2', 'test'] + groups = ['train', 'validate', 'test'] + + # Device and drives + is_linux = False + is_hpc = False + is_internal = False + is_external = False + + # Input + is_tfs = True + + # Intialize the focus + focus = 'misc' + + # Initialize the file tag + file_tag = 'temp' + + # Image resolution + img_res = '128x128_float16' + + # Data type: the type to convert the data into when it is loaded in + data_type = torch.float32 + + # 
Create a PathMaster object + pathmaster = PathMaster(is_linux, is_hpc, is_tfs, is_internal, is_external, focus, file_tag, img_res) + + # Image dimensions + img_channels = 1 + img_size = 128 + downsample = None + standardize = None + + # Split UIDs + # cross_val_fold1, cross_val_fold2, test_set = dataloader.split_uids_2fold_60_40_smote(pathmaster) + train_set, val_set, test_set = dataloader.split_uids_60_10_30_smote(pathmaster) + + # Preprocess data + data_format = 'pt' + batch_size = 256 + + # fold1_loader, fold2_loader, test_loader = dataloader.preprocess_data(data_format, cross_val_fold1, cross_val_fold2, test_set, batch_size, + # standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + # data_loaders = [fold1_loader, fold2_loader, test_loader] + + train_loader, val_loader, test_loader = dataloader.preprocess_data(data_format, train_set, val_set, test_set, + batch_size, standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + data_loaders = [train_loader, val_loader, test_loader] + print() + sys.stdout.flush() + for data_loader, group in tqdm(zip(data_loaders,groups), total=len(data_loaders), desc='SMOTE', unit='Data Loader', leave=False): + sys.stderr.flush() + + # Define your original data and labels + data = np.empty((0,img_size*img_size)) + labels = np.empty((0,1)) + + sys.stdout.flush() + + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Loading', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[-1] * data_batch['data'].shape[-1]).numpy() + Y = data_batch['label'].numpy().reshape(-1,1) + + data = np.concatenate((data, X), axis=0) + labels = np.concatenate((labels, Y), axis=0) + + sys.stderr.flush() + print('\nData shape:', data.shape) + print('Labels shape:', labels.shape) + sys.stdout.flush() + + if group != 'test': + # SMOTE + # data_resampled, labels_resampled = apply_cassey_smote(data, labels) + # data_resampled, labels_resampled = apply_borderline_smote(data, labels) + data_resampled, labels_resampled = apply_adasyn(data, labels) + data_resampled = data_resampled.reshape(len(data_resampled), img_size, img_size) + sys.stderr.flush() + print('\nResampled Data shape:', data_resampled.shape) + print('Resampled Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + else: + data_resampled = data + data_resampled = data_resampled.reshape(len(data_resampled), img_size, img_size) + labels_resampled = labels + sys.stderr.flush() + print('\nData shape:', data_resampled.shape) + print('Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + + # Define a directory to save the images + # save_dir = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch', smote_type, split, group) + save_dir = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch\TFS_pt', smote_type, split, group) + os.makedirs(save_dir, exist_ok=True) + + file_names = save_images_parallel(data_resampled, group, save_dir) + + # Ground truths + data_labels = pd.DataFrame({ + 'segment_name': file_names, + 'label': labels_resampled.reshape(-1) + }) + + csv_file_name = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch\TFS_pt', smote_type, split, smote_type + '_' + group + '_names_labels.csv') + data_labels.to_csv(csv_file_name, index=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git 
a/utils/smote_transfer_location.py b/utils/smote_transfer_location.py new file mode 100644 index 0000000..93ffd22 --- /dev/null +++ b/utils/smote_transfer_location.py @@ -0,0 +1,93 @@ +import os +import pandas as pd +import numpy as np +from PIL import Image +import torch +from concurrent.futures import ProcessPoolExecutor +from pyarrow import csv +import cv2 +from tqdm import tqdm +import sys + + +def preprocess_and_save_data(data_path, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + group_directories = [entry for entry in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, entry))] + for group in tqdm(group_directories, total=len(group_directories), desc='Data Transfer', unit='Group', leave=False): + sys.stderr.flush() + group_path = os.path.join(data_path, group) + group_output_path = os.path.join(output_path, group) + if not os.path.exists(group_output_path): + os.makedirs(group_output_path) + # else: # Only use for resuming converting + # print('Skipping', group) + # continue + files_to_process = [file for file in os.listdir(group_path) if file.endswith(('.csv', '.png', '.pt'))] + with ProcessPoolExecutor() as executor: + executor.map(preprocess_file, [group_path]*len(files_to_process), files_to_process, [group_output_path]*len(files_to_process)) + print() + print(group, 'data transfer done!') + sys.stdout.flush() + +def preprocess_file(group_path, file, group_output_path): + is_tfs = True + if is_tfs: + dtype = torch.float16 + input_size = 128 + else: + dtype = torch.uint8 + input_size = 500 + + downsample = None + + file_path = os.path.join(group_path, file) + if file.endswith('.csv'): + # data = pd.read_csv(file_path, header=None).to_numpy() + + # Use PyArrow + read_options = csv.ReadOptions(autogenerate_column_names=True) + data = csv.read_csv(file_path, read_options=read_options).to_pandas().to_numpy() + + if data.shape != (input_size, input_size): + print(f"Warning: File {file_path} has shape {data.shape} instead of", input_size + 'x', input_size + '.') + elif file.endswith('.png'): + data = np.array(Image.open(file_path)) + if data.shape != (input_size, input_size): + print(f"Warning: Image {file_path} has shape {data.shape} instead of", input_size + 'x', input_size + '.') + elif file.endswith('.pt'): + data = torch.load(file_path) + if data.shape != (input_size, input_size): + print(f"Warning: Image {file_path} has shape {data.shape} instead of", input_size + 'x', input_size + '.') + else: + print('Incorrect data type') + return + + if downsample is not None: + # Downsample the image + # Use OpenCV to resize the array to downsample x downsample using INTER_AREA interpolation + data_array = cv2.resize(np.array(data), (downsample, downsample), interpolation=cv2.INTER_AREA) + data_tensor = torch.tensor(data_array, dtype=dtype).view(downsample, downsample) + elif file.endswith('.pt'): + data_tensor = data.to(dtype).view(input_size, input_size) + else: + data_tensor = torch.tensor(data, dtype=dtype).view(input_size, input_size) + + # base_name, extension = os.path.splitext(file) + output_file_path = os.path.join(group_output_path, file) + torch.save(data_tensor, output_file_path) + +def main(): + smote_type = 'ADASYN6k' + split = 'holdout_60_10_30' + input_path = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_PulseWatch\TFS_pt', smote_type, split) + # input_path = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_PulseWatch\Poincare_pt', smote_type, split) + + output_path = 
os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch\TFS_pt', smote_type, split) + # output_path = os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch\Poincare_pt', smote_type, split) + + preprocess_and_save_data(input_path, output_path) + print('Data transfer complete!') + +if __name__ == '__main__': + main()
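preprocess_and_save_data above walks each group folder, loads every .csv/.png/.pt file, optionally downsamples it, casts it to the target dtype (float16 for TFS, uint8 for Poincare), and re-saves it as a .pt tensor in the destination tree. A condensed sketch of that per-file round trip for the .pt case, with the shape warning written as an f-string (hypothetical helper name, paths, and sizes):

import os
import torch

def convert_pt(src_file, dst_dir, input_size=128, dtype=torch.float16):
    """Load one saved tensor, check its shape, cast it, and re-save it."""
    data = torch.load(src_file)
    if data.shape != (input_size, input_size):
        print(f"Warning: {src_file} has shape {tuple(data.shape)} instead of {input_size}x{input_size}.")
    tensor = data.to(dtype).view(input_size, input_size)
    os.makedirs(dst_dir, exist_ok=True)
    torch.save(tensor, os.path.join(dst_dir, os.path.basename(src_file)))

# convert_pt(r'C:\example\source\000001_train_tfs.pt', r'C:\example\destination')  # illustrative call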