diff --git a/main_darren_v1-8GJQ9R3.py b/main_darren_v1-8GJQ9R3.py new file mode 100644 index 0000000..a84b9a3 --- /dev/null +++ b/main_darren_v1-8GJQ9R3.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Apr 18 12:52:53 2024 + +@author: lrmercadod +""" +import torch +import torch.nn as nn +import time +import datetime as dt +import gpytorch +from sklearn.metrics import precision_recall_fscore_support, roc_auc_score +from sklearn.preprocessing import label_binarize + +# Import my own functions and classes +from utils.pathmaster import PathMaster +from utils.dataloader import preprocess_data + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +num_latents = 6 # This should match the complexity of your data or the number of tasks +num_tasks = 4 # This should match the number of output classes or tasks +num_inducing_points = 50 # This is independent and should be sufficient for the input space + +class MultitaskGPModel(gpytorch.models.ApproximateGP): + def __init__(self): + # Let's use a different set of inducing points for each latent function + inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images + + # We have to mark the CholeskyVariationalDistribution as batch + # so that we learn a variational distribution for each task + variational_distribution = gpytorch.variational.CholeskyVariationalDistribution( + inducing_points.size(-2), batch_shape=torch.Size([num_latents]) + ) + + # We have to wrap the VariationalStrategy in a LMCVariationalStrategy + # so that the output will be a MultitaskMultivariateNormal rather than a batch output + variational_strategy = gpytorch.variational.LMCVariationalStrategy( + gpytorch.variational.VariationalStrategy( + self, inducing_points, variational_distribution, learn_inducing_locations=True + ), + num_tasks=num_tasks, + num_latents=num_latents, + latent_dim=-1 + ) + + super().__init__(variational_strategy) + + # The mean and covariance modules should be marked as batch + # so we learn a different set of hyperparameters + self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents])) + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])), + batch_shape=torch.Size([num_latents]) + ) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + return latent_pred + +def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, + checkpoint_path='model_checkpoint.pt', resume_training=False): + model = MultitaskGPModel().to(device) + likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=4, num_classes=4).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset)) + + start_epoch = 0 + if resume_training and os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint.get('epoch', 0) + + best_val_loss = float('inf') + epochs_no_improve = 0 + + metrics = { + 'precision': [], + 'recall': [], + 'f1_score': [], + 'auc_roc': [], + 'train_loss': [] + } + + for epoch in range(start_epoch, num_iterations): + model.train() + 
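+ # NOTE: this file calls os.path.exists() in the resume and checkpoint-reload branches but never imports os, so an `import os` at the top of the script is needed for those paths to run.
+ # Sketch (an assumption based on common GPyTorch usage, not taken from this script): SoftmaxLikelihood carries learnable mixing weights, so both modules are typically handed to the optimizer, e.g.
+ #     optimizer = torch.optim.Adam([
+ #         {'params': model.parameters()},
+ #         {'params': likelihood.parameters()},
+ #     ], lr=0.1)
+ # As written above, only model.parameters() is optimized, so the likelihood's weights stay at their initial values.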
likelihood.train() + for train_batch in train_loader: + optimizer.zero_grad() + train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device) + train_y = train_batch['label'].to(device) + output = model(train_x) + loss = -mll(output, train_y) + metrics['train_loss'].append(loss.item()) + loss.backward() + optimizer.step() + + # Stochastic validation + model.eval() + likelihood.eval() + with torch.no_grad(): + val_indices = torch.randperm(len(val_loader.dataset))[:int(0.1 * len(val_loader.dataset))] + val_loss = 0.0 + val_labels = [] + val_predictions = [] + for idx in val_indices: + val_batch = val_loader.dataset[idx] + val_x = val_batch['data'].reshape(-1).unsqueeze(0).to(device) + val_y = torch.tensor([val_batch['label']], device=device) + val_output = model(val_x) + val_loss_batch = -mll(val_output, val_y).sum() + val_loss += val_loss_batch.item() + val_labels.append(val_y.item()) + val_predictions.append(val_output.mean.argmax(dim=-1).item()) + + precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='macro') + auc_roc = roc_auc_score(label_binarize(val_labels, classes=range(n_classes)), + label_binarize(val_predictions, classes=range(n_classes)), + multi_class='ovr') + + metrics['precision'].append(precision) + metrics['recall'].append(recall) + metrics['f1_score'].append(f1) + metrics['auc_roc'].append(auc_roc) + val_loss /= len(val_indices) + + if val_loss < best_val_loss: + best_val_loss = val_loss + epochs_no_improve = 0 + torch.save({'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'epoch': epoch}, checkpoint_path) + else: + epochs_no_improve += 1 + if epochs_no_improve >= patience: + print(f"Early stopping triggered at epoch {epoch+1}") + break + + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + + return model, likelihood, metrics + +def evaluate_gp_model(test_loader, model, likelihood, n_classes=4): + model.eval() + likelihood.eval() + test_labels = [] + test_predictions = [] + + with torch.no_grad(): + for test_batch in test_loader: + test_x = test_batch['data'].reshape(test_batch['data'].size(0), -1).to(device) + test_y = test_batch['label'].to(device) + test_output = model(test_x) + test_labels.extend(test_y.tolist()) + test_predictions.extend(test_output.mean.argmax(dim=-1).tolist()) + + precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='macro') + auc_roc = roc_auc_score(label_binarize(test_labels, classes=range(n_classes)), + label_binarize(test_predictions, classes=range(n_classes)), + multi_class='ovr') + + metrics = { + 'precision': precision, + 'recall': recall, + 'f1_score': f1, + 'auc_roc': auc_roc + } + + return metrics + +def main(): + # Device and drives + is_linux = False + is_hpc = False + is_internal = False + is_external = True + binary = False + + # Input + is_tfs = True + + # Database + database = 'mimic3' + + # Initialize the focus + focus = 'thesis_results_database_multiclass' + + # Initialize the file tag + file_tag = 'MIMIC_III' + + # Image resolution + img_res = '128x128_float16' + + # Data type: the type to convert the data into when it is loaded in + data_type = torch.float32 + + # Model type + model_type = torch.float32 + + # Create a 
PathMaster object + pathmaster = PathMaster(is_linux, is_hpc, is_tfs, is_internal, is_external, focus, file_tag, img_res) + + # Image dimensions + img_channels = 1 + img_size = 128 + downsample = None + standardize = True + + # Run parameters + n_epochs = 100 + if binary: + n_classes = 2 + else: + n_classes = 3 + patience = round(n_epochs / 10) if n_epochs > 50 else 5 + save = True + + # Resume checkpoint + resume_checkpoint_path = None + + # Data loading details + data_format = 'pt' + batch_size = 256 + + # Preprocess database data + test_loader = preprocess_data(database, batch_size, standardize, img_channels, img_size, + downsample, data_type, pathmaster, binary) + + # Training and validation + start_time = time.time() + model, likelihood, metrics = train_gp_model(train_loader, val_loader, n_epochs, + n_classes, patience, save, pathmaster) + end_time = time.time() + time_passed = end_time - start_time + print('\nTraining and validation took %.2f minutes' % (time_passed / 60)) + + # Evaluation + start_time = time.time() + test_metrics = evaluate_gp_model(test_loader, model, likelihood, n_classes) + end_time = time.time() + time_passed = end_time - start_time + print('\nTesting took %.2f seconds' % time_passed) + + print('Test Metrics:') + print('Precision: %.4f' % test_metrics['precision']) + print('Recall: %.4f' % test_metrics['recall']) + print('F1 Score: %.4f' % test_metrics['f1_score']) + print('AUC-ROC: %.4f' % test_metrics['auc_roc']) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/main_darren_v1.py b/main_darren_v1.py new file mode 100644 index 0000000..29ec642 --- /dev/null +++ b/main_darren_v1.py @@ -0,0 +1,265 @@ +import os +import torch +import gpytorch +from sklearn.metrics import precision_recall_fscore_support, roc_auc_score +from sklearn.preprocessing import label_binarize +from torch.utils.data import Dataset, DataLoader +import numpy as np +import random +import time + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +num_latents = 6 # This should match the complexity of your data or the number of tasks +num_tasks = 4 # This should match the number of output classes or tasks +num_inducing_points = 50 # This is independent and should be sufficient for the input space + +class MultitaskGPModel(gpytorch.models.ApproximateGP): + def __init__(self): + # Let's use a different set of inducing points for each latent function + inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images + + # We have to mark the CholeskyVariationalDistribution as batch + # so that we learn a variational distribution for each task + variational_distribution = gpytorch.variational.CholeskyVariationalDistribution( + inducing_points.size(-2), batch_shape=torch.Size([num_latents]) + ) + + # We have to wrap the VariationalStrategy in a LMCVariationalStrategy + # so that the output will be a MultitaskMultivariateNormal rather than a batch output + variational_strategy = gpytorch.variational.LMCVariationalStrategy( + gpytorch.variational.VariationalStrategy( + self, inducing_points, variational_distribution, learn_inducing_locations=True + ), + num_tasks=num_tasks, + num_latents=num_latents, + latent_dim=-1 + ) + + super().__init__(variational_strategy) + + # The mean and covariance modules should be marked as batch + # so we learn a different set of hyperparameters + self.mean_module = 
gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents])) + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])), + batch_shape=torch.Size([num_latents]) + ) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + return latent_pred + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, binary=False): + self.data_path = data_path + self.labels_path = labels_path + self.binary = binary + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + def __len__(self): + return len(self.segment_names) + + def __getitem__(self, idx): + segment_name = self.segment_names[idx] + label = self.labels[segment_name] + data_tensor = torch.load(os.path.join(self.data_path, segment_name + '.pt')) + return {'data': data_tensor, 'label': label, 'segment_name': segment_name} + + def extract_segment_names_and_labels(self): + segment_names = [] + labels = {} + + with open(self.labels_path, 'r') as file: + lines = file.readlines() + for line in lines[1:]: # Skip the header line + segment_name, label = line.strip().split(',') + label = int(float(label)) # Convert the label to float first, then to int + if self.binary and label == 2: + label = 0 # Convert PAC/PVC to non-AF (0) for binary classification + segment_names.append(segment_name) + labels[segment_name] = label + + return segment_names, labels + +def load_data(data_path, labels_path, batch_size, binary=False): + dataset = CustomDataset(data_path, labels_path, binary) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) + return dataloader + +def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, + checkpoint_path='model_checkpoint.pt', resume_training=False): + model = MultitaskGPModel().to(device) + likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=4, num_classes=4).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset)) + + start_epoch = 0 + if resume_training and os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint.get('epoch', 0) + + best_val_loss = float('inf') + epochs_no_improve = 0 + + metrics = { + 'precision': [], + 'recall': [], + 'f1_score': [], + 'auc_roc': [], + 'train_loss': [] + } + + for epoch in range(start_epoch, num_iterations): + model.train() + likelihood.train() + for train_batch in train_loader: + optimizer.zero_grad() + train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device) + train_y = train_batch['label'].to(device) + output = model(train_x) + loss = -mll(output, train_y) + metrics['train_loss'].append(loss.item()) + loss.backward() + optimizer.step() + + # Stochastic validation + model.eval() + likelihood.eval() + with torch.no_grad(): + val_indices = torch.randperm(len(val_loader.dataset))[:int(0.1 * len(val_loader.dataset))] + val_loss = 0.0 + val_labels = [] + val_predictions = [] + for idx in val_indices: + val_batch = val_loader.dataset[idx] + val_x = val_batch['data'].reshape(-1).unsqueeze(0).to(device) + val_y = torch.tensor([val_batch['label']], device=device) + 
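+ # Sketch (assumption, not part of the original script): the AUC-ROC computed below scores binarized argmax predictions; class probabilities are usually more informative. With a SoftmaxLikelihood they could be estimated by Monte Carlo, e.g.
+ #     with gpytorch.settings.num_likelihood_samples(16):
+ #         probs = likelihood(model(val_x)).probs.mean(dim=0)  # assumes a Categorical marginal; shape (1, n_classes)
+ #     val_scores.append(probs.squeeze(0).cpu().numpy())  # val_scores is a hypothetical list, not defined in this loop
+ # roc_auc_score(label_binarize(val_labels, classes=range(n_classes)), np.stack(val_scores), multi_class='ovr') would then rank probabilities instead of hard labels.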
val_output = model(val_x) + val_loss_batch = -mll(val_output, val_y).sum() + val_loss += val_loss_batch.item() + val_labels.append(val_y.item()) + val_predictions.append(val_output.mean.argmax(dim=-1).item()) + + precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='macro') + auc_roc = roc_auc_score(label_binarize(val_labels, classes=range(n_classes)), + label_binarize(val_predictions, classes=range(n_classes)), + multi_class='ovr') + + metrics['precision'].append(precision) + metrics['recall'].append(recall) + metrics['f1_score'].append(f1) + metrics['auc_roc'].append(auc_roc) + val_loss /= len(val_indices) + + if val_loss < best_val_loss: + best_val_loss = val_loss + epochs_no_improve = 0 + torch.save({'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'epoch': epoch}, checkpoint_path) + else: + epochs_no_improve += 1 + if epochs_no_improve >= patience: + print(f"Early stopping triggered at epoch {epoch+1}") + break + + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + + return model, likelihood, metrics + +def evaluate_gp_model(test_loader, model, likelihood, n_classes=4): + model.eval() + likelihood.eval() + test_labels = [] + test_predictions = [] + + with torch.no_grad(): + for test_batch in test_loader: + test_x = test_batch['data'].reshape(test_batch['data'].size(0), -1).to(device) + test_y = test_batch['label'].to(device) + test_output = model(test_x) + test_labels.extend(test_y.tolist()) + test_predictions.extend(test_output.mean.argmax(dim=-1).tolist()) + + precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='macro') + auc_roc = roc_auc_score(label_binarize(test_labels, classes=range(n_classes)), + label_binarize(test_predictions, classes=range(n_classes)), + multi_class='ovr') + + metrics = { + 'precision': precision, + 'recall': recall, + 'f1_score': f1, + 'auc_roc': auc_roc + } + + return metrics + +def main(): + # Paths + base_path = r"\\grove.ad.uconn.edu\\research\\ENGR_Chon\Darren\\NIH_Pulsewatch" + smote_type = 'Cassey5k_SMOTE' + split = 'holdout_60_10_30' + data_path_train = os.path.join(base_path, "TFS_pt", smote_type, split, "train") + data_path_val = os.path.join(base_path, "TFS_pt", smote_type, split, "validate") + data_path_test = os.path.join(base_path, "TFS_pt", smote_type, split, "test") + labels_path_train = os.path.join(base_path, "TFS_pt", smote_type, split, "Cassey5k_SMOTE_train_names_labels.csv") + labels_path_val = os.path.join(base_path, "TFS_pt", smote_type, split, "Cassey5k_SMOTE_validate_names_labels.csv") + labels_path_test = os.path.join(base_path, "TFS_pt", smote_type, split, "Cassey5k_SMOTE_test_names_labels.csv") + + # Parameters + binary = False + n_epochs = 100 + if binary: + n_classes = 2 + else: + n_classes = 3 + patience = round(n_epochs / 10) if n_epochs > 50 else 5 + save = True + resume_checkpoint_path = None + batch_size = 256 + + # Data loading + train_loader = load_data(data_path_train, labels_path_train, batch_size, binary) + val_loader = load_data(data_path_val, labels_path_val, batch_size, binary) + test_loader = load_data(data_path_test, labels_path_test, batch_size, binary) + + # Training and validation + start_time = time.time() + model, 
likelihood, metrics = train_gp_model(train_loader, val_loader, n_epochs, + n_classes, patience, save) + end_time = time.time() + time_passed = end_time - start_time + print('\nTraining and validation took %.2f minutes' % (time_passed / 60)) + + # Evaluation + start_time = time.time() + test_metrics = evaluate_gp_model(test_loader, model, likelihood, n_classes) + end_time = time.time() + time_passed = end_time - start_time + print('\nTesting took %.2f seconds' % time_passed) + + print('Test Metrics:') + print('Precision: %.4f' % test_metrics['precision']) + print('Recall: %.4f' % test_metrics['recall']) + print('F1 Score: %.4f' % test_metrics['f1_score']) + print('AUC-ROC: %.4f' % test_metrics['auc_roc']) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/utils/__pycache__/dataloader.cpython-310.pyc b/utils/__pycache__/dataloader.cpython-310.pyc new file mode 100644 index 0000000..ae5efbe Binary files /dev/null and b/utils/__pycache__/dataloader.cpython-310.pyc differ diff --git a/utils/__pycache__/dataloader.cpython-311.pyc b/utils/__pycache__/dataloader.cpython-311.pyc new file mode 100644 index 0000000..063e8f7 Binary files /dev/null and b/utils/__pycache__/dataloader.cpython-311.pyc differ diff --git a/utils/__pycache__/dataloader.cpython-312.pyc b/utils/__pycache__/dataloader.cpython-312.pyc new file mode 100644 index 0000000..af61a83 Binary files /dev/null and b/utils/__pycache__/dataloader.cpython-312.pyc differ diff --git a/utils/__pycache__/dataloader.cpython-39.pyc b/utils/__pycache__/dataloader.cpython-39.pyc new file mode 100644 index 0000000..1149806 Binary files /dev/null and b/utils/__pycache__/dataloader.cpython-39.pyc differ diff --git a/utils/__pycache__/dataloader_batch.cpython-310.pyc b/utils/__pycache__/dataloader_batch.cpython-310.pyc new file mode 100644 index 0000000..6b49db6 Binary files /dev/null and b/utils/__pycache__/dataloader_batch.cpython-310.pyc differ diff --git a/utils/__pycache__/dataloader_database.cpython-310.pyc b/utils/__pycache__/dataloader_database.cpython-310.pyc new file mode 100644 index 0000000..4ccea8f Binary files /dev/null and b/utils/__pycache__/dataloader_database.cpython-310.pyc differ diff --git a/utils/__pycache__/dataloader_smote.cpython-310.pyc b/utils/__pycache__/dataloader_smote.cpython-310.pyc new file mode 100644 index 0000000..8070ab1 Binary files /dev/null and b/utils/__pycache__/dataloader_smote.cpython-310.pyc differ diff --git a/utils/__pycache__/get_paths.cpython-310.pyc b/utils/__pycache__/get_paths.cpython-310.pyc new file mode 100644 index 0000000..18b2bb2 Binary files /dev/null and b/utils/__pycache__/get_paths.cpython-310.pyc differ diff --git a/utils/__pycache__/get_paths.cpython-311.pyc b/utils/__pycache__/get_paths.cpython-311.pyc new file mode 100644 index 0000000..6e0bdbe Binary files /dev/null and b/utils/__pycache__/get_paths.cpython-311.pyc differ diff --git a/utils/__pycache__/misc_func.cpython-310.pyc b/utils/__pycache__/misc_func.cpython-310.pyc new file mode 100644 index 0000000..5959146 Binary files /dev/null and b/utils/__pycache__/misc_func.cpython-310.pyc differ diff --git a/utils/__pycache__/model_func.cpython-310.pyc b/utils/__pycache__/model_func.cpython-310.pyc new file mode 100644 index 0000000..94b284b Binary files /dev/null and b/utils/__pycache__/model_func.cpython-310.pyc differ diff --git a/utils/__pycache__/model_func.cpython-311.pyc b/utils/__pycache__/model_func.cpython-311.pyc new file mode 100644 index 0000000..f798459 Binary files /dev/null and 
b/utils/__pycache__/model_func.cpython-311.pyc differ diff --git a/utils/__pycache__/model_func.cpython-312.pyc b/utils/__pycache__/model_func.cpython-312.pyc new file mode 100644 index 0000000..8a31f82 Binary files /dev/null and b/utils/__pycache__/model_func.cpython-312.pyc differ diff --git a/utils/__pycache__/model_func_batch.cpython-310.pyc b/utils/__pycache__/model_func_batch.cpython-310.pyc new file mode 100644 index 0000000..3b0039f Binary files /dev/null and b/utils/__pycache__/model_func_batch.cpython-310.pyc differ diff --git a/utils/__pycache__/pathmaster.cpython-310.pyc b/utils/__pycache__/pathmaster.cpython-310.pyc new file mode 100644 index 0000000..5411969 Binary files /dev/null and b/utils/__pycache__/pathmaster.cpython-310.pyc differ diff --git a/utils/__pycache__/pathmaster.cpython-312.pyc b/utils/__pycache__/pathmaster.cpython-312.pyc new file mode 100644 index 0000000..e70d54b Binary files /dev/null and b/utils/__pycache__/pathmaster.cpython-312.pyc differ diff --git a/utils/__pycache__/pathmaster.cpython-39.pyc b/utils/__pycache__/pathmaster.cpython-39.pyc new file mode 100644 index 0000000..d0ab4b5 Binary files /dev/null and b/utils/__pycache__/pathmaster.cpython-39.pyc differ diff --git a/utils/__pycache__/plot_save_func.cpython-310.pyc b/utils/__pycache__/plot_save_func.cpython-310.pyc new file mode 100644 index 0000000..902fe16 Binary files /dev/null and b/utils/__pycache__/plot_save_func.cpython-310.pyc differ diff --git a/utils/__pycache__/plot_save_func.cpython-311.pyc b/utils/__pycache__/plot_save_func.cpython-311.pyc new file mode 100644 index 0000000..5de7e02 Binary files /dev/null and b/utils/__pycache__/plot_save_func.cpython-311.pyc differ diff --git a/utils/__pycache__/plot_save_func.cpython-312.pyc b/utils/__pycache__/plot_save_func.cpython-312.pyc new file mode 100644 index 0000000..a7005b4 Binary files /dev/null and b/utils/__pycache__/plot_save_func.cpython-312.pyc differ diff --git a/utils/__pycache__/plot_save_func.cpython-39.pyc b/utils/__pycache__/plot_save_func.cpython-39.pyc new file mode 100644 index 0000000..35f1877 Binary files /dev/null and b/utils/__pycache__/plot_save_func.cpython-39.pyc differ diff --git a/utils/__pycache__/train_func.cpython-310.pyc b/utils/__pycache__/train_func.cpython-310.pyc new file mode 100644 index 0000000..c11ce94 Binary files /dev/null and b/utils/__pycache__/train_func.cpython-310.pyc differ diff --git a/utils/__pycache__/train_func.cpython-311.pyc b/utils/__pycache__/train_func.cpython-311.pyc new file mode 100644 index 0000000..8790f6f Binary files /dev/null and b/utils/__pycache__/train_func.cpython-311.pyc differ diff --git a/utils/dataloader.py b/utils/dataloader.py new file mode 100644 index 0000000..4a382e7 --- /dev/null +++ b/utils/dataloader.py @@ -0,0 +1,895 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 26 18:29:59 2024 + +@author: dchen +""" +import os +import numpy as np +import pandas as pd +from PIL import Image +import torch +from torch.utils.data import Dataset, DataLoader +from sklearn.preprocessing import StandardScaler +from torchvision.transforms import ToTensor +import math +from numpy import random +from numpy.random import choice +import cv2 +from pyarrow import csv + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + + +def split_uids_60_10_30(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = 
pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. + print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = ['011', '014', '030', '037', '044', '050', '055', '058', '074', '083', '091', '098', '101', '106', '109', '119'] + uid_nsr_val = ['041', '056', '325'] + uid_nsr_test = ['003', '012', '020', '024', '027', '035', '036', '047'] + + uid_af_train = ['017', '301', '302', '305', '306', '318', '319', '320', '321', '322', '324', '329', '402', '405', '406', '407', '416', '420', '421'] + uid_af_val = ['400', '409', '422'] + uid_af_test = ['307', '310', '311', '312', '410', '413', '414', '415', '423'] + + uid_pacpvc_train = ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = ['045', '054', '112'] + uid_pacpvc_test = ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + # Limit data set size to reduce computational load for optimization + test_set = test_set + + return train_set, val_set, test_set + + +def split_uids_60_10_30_smote(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = 
df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. + print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = ['003', '020', '024', '041', '044', '047', '049', '050', '058', '063', '077', '084', '088', '091', '098', '099', '106', '109', '111', '118', '325'] + uid_nsr_val = ['014', '030', '036', '074'] + uid_nsr_test = ['011', '012', '027', '035', '037', '055', '056', '057', '083', '094', '101', '119'] + + uid_af_train = ['017', '302', '306', '307', '310', '311', '319', '321', '324', '400', '402', '405', '406', '407', '409', '410', '415', '420', '421'] + uid_af_val = ['416', '422', '423'] + uid_af_test = ['301', '305', '312', '318', '320', '322', '329', '413', '414'] + + uid_pacpvc_train = ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = ['045', '054', '112'] + uid_pacpvc_test = ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return train_set, val_set, test_set + + +def split_uids_60_10_30_noPACPVC(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
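+ # Illustrative sketch (not used by this function): the train/val/test UID lists below are hard-coded, so remaining_UIDs could double as a sanity check that every hard-coded UID actually survives this filter, e.g.
+ #     hard_coded = (uid_nsr_train + uid_af_train + uid_pacpvc_train +
+ #                   uid_nsr_val + uid_af_val + uid_pacpvc_val +
+ #                   uid_nsr_test + uid_af_test + uid_pacpvc_test)
+ #     missing = [u for u in hard_coded if u not in remaining_UIDs]
+ #     assert not missing, f'UIDs missing from the summary: {missing}'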
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = ['003', '020', '024', '041', '044', '047', '049', '050', '058', '063', '077', '084', '088', '091', '098', '099', '106', '109', '111', '118', '325'] + uid_nsr_val = ['014', '030', '036', '074'] + uid_nsr_test = ['011', '012', '027', '035', '037', '055', '056', '057', '083', '094', '101', '119'] + + uid_af_train = ['017', '302', '306', '307', '310', '311', '319', '321', '324', '400', '402', '405', '406', '407', '409', '410', '415', '420', '421'] + uid_af_val = ['416', '422', '423'] + uid_af_test = ['301', '305', '312', '318', '320', '322', '329', '413', '414'] + + uid_pacpvc_train = [] # ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = [] # ['045', '054', '112'] + uid_pacpvc_test = [] # ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return train_set, val_set, test_set + + +def split_uids_60_10_30_noNSR(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = [] # ['003', '020', '024', '041', '044', '047', '049', '050', '058', '063', '077', '084', '088', '091', '098', '099', '106', '109', '111', '118', '325'] + uid_nsr_val = [] # ['014', '030', '036', '074'] + uid_nsr_test = [] # ['011', '012', '027', '035', '037', '055', '056', '057', '083', '094', '101', '119'] + + uid_af_train = ['017', '302', '306', '307', '310', '311', '319', '321', '324', '400', '402', '405', '406', '407', '409', '410', '415', '420', '421'] + uid_af_val = ['416', '422', '423'] + uid_af_test = ['301', '305', '312', '318', '320', '322', '329', '413', '414'] + + uid_pacpvc_train = ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = ['045', '054', '112'] + uid_pacpvc_test = ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return train_set, val_set, test_set + + +def split_uids_60_10_30_balanced(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_train = ['041', '044', '047', '050', '058', '063', '091', '098', '106', '111', '325'] + uid_nsr_val = ['014', '030', '036', '074'] + uid_nsr_test = ['011', '012', '027', '035', '037', '055', '056', '057', '083', '094', '101', '119'] + + uid_af_train = ['017', '302', '306', '307', '310', '311', '319', '321', '324', '400', '402', '407', '409', '415', '420', '421'] + uid_af_val = ['416', '422', '423'] + uid_af_test = ['301', '305', '312', '318', '320', '322', '329', '413', '414'] + + uid_pacpvc_train = ['005', '007', '013', '021', '022', '026', '028', '029', '042', '064', '068', '073', '080', '086', '087', '089', '093', '104', '110', '113', '120', '327', '408'] + uid_pacpvc_val = ['045', '054', '112'] + uid_pacpvc_test = ['002', '038', '039', '052', '053', '069', '070', '075', '078', '090', '100', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_train + uid_nsr_val + uid_nsr_test + total_uid_af = uid_af_train + uid_af_val + uid_af_test + total_uid_pacpvc = uid_pacpvc_train + uid_pacpvc_val + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + train_set = uid_nsr_train + uid_af_train + uid_pacpvc_train + val_set = uid_nsr_val + uid_af_val + uid_pacpvc_val + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return train_set, val_set, test_set + + +def split_uids_2fold_60_40(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_pacpvc_fold1 = ['007', '022', '028', '038', '054', '068', '075', '086', '087', '093', '120', '327'] + uid_pacpvc_fold2 = ['002', '005', '013', '021', '026', '029', '045', '073', '089', '100', '112', '408'] + uid_pacpvc_test = ['039', '042', '052', '053', '064', '069', '070', '078', '080', '090', '104', '110', '113', '419'] + + uid_af_fold1 = ['305', '307', '311', '318', '320', '322', '405', '415', '423'] + uid_af_fold2 = ['301', '319', '321', '324', '329', '400', '406', '409', '416'] + uid_af_test = ['017', '302', '306', '310', '312', '402', '407', '410', '413', '414', '420', '421', '422'] + + uid_nsr_fold1 = ['011', '014', '041', '050', '056', '058', '083', '106', '109'] + uid_nsr_fold2 = ['037', '047', '055', '074', '091', '098', '101', '119', '325'] + uid_nsr_test = ['003', '012', '020', '024', '027', '030', '035', '036', '044', '049', '057', '063', '077', '084', '088', '094', '099', '111', '118'] + + # Total UID counts + total_uid_pacpvc = uid_pacpvc_fold1 + uid_pacpvc_fold2 + uid_pacpvc_test + total_uid_af = uid_af_fold1 + uid_af_fold2 + uid_af_test + total_uid_nsr = uid_nsr_fold1 + uid_nsr_fold2 + uid_nsr_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + cross_val_fold1 = uid_nsr_fold1 + uid_af_fold1 + uid_pacpvc_fold1 + cross_val_fold2 = uid_nsr_fold2 + uid_af_fold2 + uid_pacpvc_fold2 + test = uid_nsr_test + uid_af_test + uid_pacpvc_test + + # # Limit data set size to reduce computational load for optimization + # cross_val_fold1 = uid_nsr_fold1[:2] + uid_af_fold1[:2] + uid_pacpvc_fold1[:2] + # cross_val_fold2 = uid_nsr_fold2[:2] + uid_af_fold2[:2] + uid_pacpvc_fold2[:2] + # test = uid_nsr_test[:2] + uid_af_test[:2] + uid_pacpvc_test[:2] + + return cross_val_fold1, cross_val_fold2, test + + +def split_uids_2fold_60_40_smote(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + # Filter out 0-segment UIDs and UIDs without NSR, AF, and/or PAC/PVC + remaining_UIDs = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + if row['TOTAL'] == 0: + # There is no segment in this subject, skip this UID. 
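+ # Illustrative sketch (not part of the original code): a quick guard against subject leakage between the hard-coded folds and the test set defined below, e.g.
+ #     splits = [set(cross_val_fold1), set(cross_val_fold2), set(test_set)]
+ #     assert all(a.isdisjoint(b) for i, a in enumerate(splits) for b in splits[i + 1:]), 'a UID appears in more than one split'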
+ print(f'---------UID {UID} has no segments.------------') + elif (row['NSR'] > 0 or row['AF'] > 0 or row['PACPVC'] > 0): # Append UID only if it contains NSR, AF, or PAC/PVC + remaining_UIDs.append(UID) + else: + print(f'---------UID {UID} has no AF, NSR, or PAC/PVC segments.------------') + + # Split UIDs + uid_nsr_fold1 = ['020', '030', '037', '041', '058', '077', '084', '106', '109', '118', '325'] + uid_nsr_fold2 = ['003', '014', '036', '044', '047', '049', '063', '083', '088', '091', '099'] + uid_nsr_test = ['011', '012', '024', '027', '035', '050', '055', '056', '057', '074', '094', '098', '101', '111', '119'] + + uid_af_fold1 = ['302', '306', '307', '402', '405', '415', '420', '421', '422'] + uid_af_fold2 = ['310', '321', '324', '406', '407', '409', '414', '416', '423'] + uid_af_test = ['017', '301', '305', '311', '312', '318', '319', '320', '322', '329', '400', '410', '413'] + + uid_pacpvc_fold1 = ['007', '022', '028', '038', '054', '068', '075', '086', '087', '093', '120', '327'] + uid_pacpvc_fold2 = ['002', '005', '013', '021', '026', '029', '045', '073', '089', '100', '112', '408'] + uid_pacpvc_test = ['039', '042', '052', '053', '064', '069', '070', '078', '080', '090', '104', '110', '113', '419'] + + # Total UID counts + total_uid_nsr = uid_nsr_fold1 + uid_nsr_fold2 + uid_nsr_test + total_uid_af = uid_af_fold1 + uid_af_fold2 + uid_af_test + total_uid_pacpvc = uid_pacpvc_fold1 + uid_pacpvc_fold2 + uid_pacpvc_test + total_uid = total_uid_pacpvc + total_uid_af + total_uid_nsr + + print('Number of total and unique UIDs:', len(total_uid),'|', len(np.unique(total_uid))) + print('Number of total and unique NSR UIDs:', len(total_uid_nsr),'|', len(np.unique(total_uid_nsr))) + print('Number of total and unique AF UIDs:', len(total_uid_af),'|', len(np.unique(total_uid_af))) + print('Number of total and unique PAC/PVC UIDs:', len(total_uid_pacpvc),'|', len(np.unique(total_uid_pacpvc))) + + cross_val_fold1 = uid_nsr_fold1 + uid_af_fold1 + uid_pacpvc_fold1 + cross_val_fold2 = uid_nsr_fold2 + uid_af_fold2 + uid_pacpvc_fold2 + test_set = uid_nsr_test + uid_af_test + uid_pacpvc_test + + return cross_val_fold1, cross_val_fold2, test_set + + +def split_uids(pathmaster): + # ====== Load the per subject arrythmia summary ====== + file_path = pathmaster.summary_path() + # df_summary = pd.read_csv(file_path) + + # Read the CSV file using pyarrow.csv.read_csv + table_summary = csv.read_csv(file_path) + df_summary = table_summary.to_pandas() + + df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) # Pads each UIDs with enough zeroes to be 3 characters + + df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] + df_summary['sample_AF'] = df_summary['AF'] + + df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF']) + + all_UIDs = df_summary['UID'].unique() + + # ==================================================== + # ====== AF trial separation ====== + # R:\ENGR_Chon\Dong\Numbers\Pulsewatch_numbers\Fahimeh_CNNED_general_ExpertSystemwApplication\tbl_file_name\TrainingSet_final_segments + AF_trial_Fahimeh_train = ['402','410'] + AF_trial_Fahimeh_test = ['301', '302', '305', '306', '307', '310', '311', + '312', '318', '319', '320', '321', '322', '324', + '325', '327', '329', '400', '406', '407', '409', + '414'] + AF_trial_Fahimeh_did_not_use = ['405', '413', '415', '416', '420', '421', '422', '423'] + AF_trial_paroxysmal_AF = ['408','419'] + + AF_trial_train = AF_trial_Fahimeh_train + AF_trial_test 
= AF_trial_Fahimeh_test + AF_trial_unlabeled = AF_trial_Fahimeh_did_not_use + AF_trial_paroxysmal_AF + print(f'AF trial: {len(AF_trial_train)} training subjects {AF_trial_train}') + print(f'AF trial: {len(AF_trial_test)} testing subjects {AF_trial_test}') + print(f'AF trial: {len(AF_trial_unlabeled)} unlabeled subjects {AF_trial_unlabeled}') + + # ================================= + # === Clinical trial AF subjects separation === + clinical_trial_AF_subjects = ['005', '017', '026', '051', '075', '082'] + + # Filter out AF trial and 0-segment UIDs + remaining_UIDs = [] + count_NSR = [] + + for index, row in df_summary.iterrows(): + UID = row['UID'] + this_NSR = row['sample_nonAF'] + if math.isnan(row['sample_nonAF_ratio']): # sample_nonAF is never NaN, sample_nonAF_ratio may be NaN + # There is no segment in this subject, skip this UID. + print(f'---------UID {UID} has no segments.------------') + continue # If a UID has no segments, skip the rest of the for loop for this index, row + if UID not in AF_trial_train and UID not in AF_trial_test and UID not in clinical_trial_AF_subjects \ + and UID[0] != '3' and UID[0] != '4': + remaining_UIDs.append(UID) + count_NSR.append(this_NSR) + + # From the candidate UIDs, select a subset to be used for training, validation, and testing + random.seed(seed=42) + + list_of_candidates = remaining_UIDs + number_of_items_to_pick = round(len(list_of_candidates) * 0.25) # 15% labeled for training, 10% for testing. + sum_NSR = sum(count_NSR) + + # probability_distribution = [x/sum_NSR for x in count_NSR] # Proportion of total NSR segments for each UID + probability_distribution = [(1-x/sum_NSR)/ (len(count_NSR)-1) for x in count_NSR] # Subjects with fewer segments have higher chance to be selected. + draw = choice(list_of_candidates, number_of_items_to_pick, + p=probability_distribution, replace=False) + + # Ensures that training set contains both AF and non-AF + clinical_trial_train_nonAF = list(draw[:round(len(list_of_candidates) * 0.12)]) # Draws the first X number of candidates equal to 7% of the total list of candidates + clinical_trial_train_temp = clinical_trial_train_nonAF + clinical_trial_AF_subjects[:round(len(clinical_trial_AF_subjects)/2)] + clinical_trial_train = [] + + for UID in clinical_trial_train_temp: + # UID 051 and 108 and maybe other UIDs had no segments (unknown reason). + if UID in all_UIDs: + clinical_trial_train.append(UID) # Only use the UIDs that are in the summary to test + + # Ensures that the testing set contains both AF and non-AF + clinical_trial_test_nonAF = list(draw[round(len(list_of_candidates) * 0.12):]) # Draws the remaining candidates + clinical_trial_test_temp = clinical_trial_test_nonAF + clinical_trial_AF_subjects[round(len(clinical_trial_AF_subjects)/2):] + clinical_trial_test = [] + for UID in clinical_trial_test_temp: + # UID 051 and 108 and maybe other UIDs had no segments (unknown reason). + if UID in all_UIDs: + clinical_trial_test.append(UID) # Only use the UIDs that are in the summary to test + + # Uses all remaining subset of UIDs from original list not used in training or validating for testing + clinical_trial_unlabeled = [] + for UID in remaining_UIDs: # Changed from all_UIDs to remove UIDs with 0 segments (i.e. 
UID 108) + if UID not in clinical_trial_train and UID not in clinical_trial_test and UID[0] != '3' and UID[0] != '4': + clinical_trial_unlabeled.append(UID) + + # Sum up to 74 UIDs, all of the ones that do not start with '3' or '4' and dropping UID 108 which has 0 segments + print(f'Clinical trial: selected {len(clinical_trial_train)} UIDs for training {clinical_trial_train}') # Contains both non-AF and AF clinical trial subjects + print(f'Clinical trial: selected {len(clinical_trial_test)} UIDs for testing {clinical_trial_test}') # Contains both non-AF and AF clinical trial subjects + print(f'Clinical trial: selected {len(clinical_trial_unlabeled)} UIDs for unlabeled {clinical_trial_unlabeled}') # All remaining clinical trial subjects...probably contains both AF and non-AF + + # Used to make sure the model runs correctly + clinical_trial_train = ['063','416','005'] # Training + clinical_trial_test = ['058','409','054'] # Evaluation + clinical_trial_unlabeled = ['029','036','421'] # Testing + + return clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled + + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False, + start_idx=0, img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True, binary=False): + self.data_path = data_path + self.labels_path = labels_path + self.UIDs = UIDs + self.standardize = standardize + self.data_format = data_format + self.read_all_labels = read_all_labels + self.transforms = ToTensor() + self.start_idx = start_idx # Initial batch index to start from, useful for resuming training + self.img_channels = img_channels + self.img_size = img_size + self.downsample = downsample + self.is_tfs = is_tfs + self.binary = binary + + # Must be manually set so that the image resolution chosen is the one that is returned + self.dtype = data_type + + self.refresh_dataset() + + def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + def add_uids(self, new_uids): + unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] # Appends any unqiue new UID in self.UIDs to unique_new_uids + self.UIDs.extend(unique_new_uids) # Appends unique_new_uids to UIDs + self.refresh_dataset() + + def __len__(self): # Method is implicitly called when len() is used on an instance of CustomDataset + return len(self.segment_names) + + def save_checkpoint(self, checkpoint_path): # Likely not worth using, simply use the save_checkpoint() function in train_func.py + # Enhanced to automatically include 'start_idx' in the checkpoint + checkpoint = { + 'segment_names': self.segment_names, + 'labels': self.labels, + 'UIDs': self.UIDs, + 'start_idx': self.start_idx # Now also saving start_idx + } + torch.save(checkpoint, checkpoint_path) # Using standard Python methods like pickle or json is generally recommended for dictionaries, there are no benefits for using torch.save, no real harm either + + def load_checkpoint(self, checkpoint_path): # Reloads where you started off last time (not where you ended), just use analogous function in train_func.py + checkpoint = torch.load(checkpoint_path) + self.segment_names = checkpoint['segment_names'] # Seems redundant since it is overwritten by refresh_dataset() + self.labels = checkpoint['labels'] # Seems redundant since it is overwritten by refresh_dataset() + self.UIDs = checkpoint['UIDs'] + # Now also loading and setting start_idx from checkpoint + self.start_idx = 
checkpoint.get('start_idx', 0) # Returns 0 if no start_idx found + self.refresh_dataset() + + def __getitem__(self, idx): # Method is implicitly called when getitem() is used on an instance of CustomDataset. It is called batch_size number of times per iteration of dataloader | Loads segments as needed (lazy loading) + actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed (i.e. index falls out of bounds) + segment_name = self.segment_names[actual_idx] + label = self.labels[segment_name] + + if hasattr(self, 'all_data') and actual_idx < len(self.all_data): # When Luis uses adds data to train_loader in main_checkpoints.py, + # new data is added (creating all_data) only after train_loader is created with its original training data. This means that if self.all_data + # exists, then __getitem__ is only be called in order to retrieve data newly added to train_loader in all_data + time_freq_tensor = self.all_data[actual_idx] + else: + time_freq_tensor = self.load_data(segment_name) + + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} + + # When iterating over the dataloader, which returns batches of data, each batch will contain a dictionary with keys corresponding to the data and labels. + + # Since the dataloader's dataset's __getitem__ method returns a dictionary with keys 'data', 'label', and 'segment_name', the returned batch will be a dictionary where: + + # The 'data' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the data. + # The 'label' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the labels. + # The 'segment_name' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the segment_name. + + def set_start_idx(self, index): + self.start_idx = index + + def add_data_label_pair(self, data, label): + # Assign a unique ID or name for the new data + new_id = len(self.segment_names) + segment_name = f"new_data_{new_id}" + + # Append the new data and label + self.segment_names.append(segment_name) + self.labels[segment_name] = label + + # Append the new data tensor to an attribute that holds all of the newly added data + if hasattr(self, 'all_data'): + self.all_data.append(data) + else: + self.all_data = [data] + + # def extract_segment_names_and_labels(self): + # segment_names = [] + # labels = {} + + # for UID in self.UIDs: + # label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") + # if os.path.exists(label_file): + # # label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) # Replaces the original headers with names + + # # Use PyArrow to read csv + # parse_options = csv.ParseOptions(delimiter=',') # Indicate delimiter + # read_options = csv.ReadOptions(column_names=['segment', 'label'], skip_rows=1) # Assign desired column names and skip the first row (headers) + # label_data = csv.read_csv(label_file, parse_options=parse_options, read_options=read_options) + # label_data = label_data.to_pandas() + + # label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) # Splits each segment name by '.' 
and retrieves the first part + # for idx, segment_name in enumerate(label_segment_names): # enumerate() returns the value and corresponding index of each element in an iterable + # label_val = label_data['label'].values[idx] + # # Will only use NSR (0), AF (1), and PAC/PVC(2) and not SVT (3) + # if self.read_all_labels: # If reading all labels, set all labels not 0, 1, or 2 to -1 and return all labels + # # Assign -1 if label is not in [0, 1, 2] + # labels[segment_name] = label_val if label_val in [0, 1, 2] else -1 + # if segment_name not in segment_names: + # segment_names.append(segment_name) + # else: + # # Only add segments with labels in [0, 1, 2] + # if label_val in [0, 1, 2] and segment_name not in segment_names: + # segment_names.append(segment_name) + # labels[segment_name] = label_val # Extracts the labels of the segments retrieved into a dictionary + + # # # Since shuffle=False for the dataloader in preprocess_data(), this is my work around for that while allowing for checkpointing + # # random.seed(seed=42) + # # random.shuffle(segment_names) # Will not affect the labels since the labels are in a dictionary + + # return segment_names, labels + + + def extract_segment_names_and_labels(self): # Only extract the segments and labels of a particular class, temporary solution + segment_names = [] + labels = {} + + # If a subject is not loading and there are no errors, just these lists + uid_nsr = ['011', '014', '041', '050', '056', '058', '083', '106', '109', + '037', '047', '055', '074', '091', '098', '101', '119', '325', + '003', '012', '020', '024', '027', '030', '035', '036', '044', '049', '057', '063', '077', '084', '088', '094', '099', '111', '118'] + uid_af = ['305', '307', '311', '318', '320', '322', '405', '415', '423', + '301', '319', '321', '324', '329', '400', '406', '409', '416', + '017', '302', '306', '310', '312', '402', '407', '410', '413', '414', '420', '421', '422'] + uid_pacpvc = ['007', '022', '028', '038', '054', '068', '075', '086', '087', '093', '120', '327', + '002', '005', '013', '021', '026', '029', '045', '073', '089', '100', '112', '408', + '039', '042', '052', '053', '064', '069', '070', '078', '080', '090', '104', '110', '113', '419'] + + for UID in self.UIDs: + label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + # label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) # Replaces the original headers with names + + # Use PyArrow to read csv + parse_options = csv.ParseOptions(delimiter=',') # Indicate delimiter + read_options = csv.ReadOptions(column_names=['segment', 'label'], skip_rows=1) # Assign desired column names and skip the first row (headers) + label_data = csv.read_csv(label_file, parse_options=parse_options, read_options=read_options) + label_data = label_data.to_pandas() + + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) # Splits each segment name by '.' 
and retrieves the first part + for idx, segment_name in enumerate(label_segment_names): # enumerate() returns the value and corresponding index of each element in an iterable + label_val = label_data['label'].values[idx] + # Will only use NSR (0), AF (1), and PAC/PVC(2) and not SVT (3) + if self.read_all_labels: # If reading all labels, set all labels not 0, 1, or 2 to -1 and return all labels + # Assign -1 if label is not in [0, 1, 2] + labels[segment_name] = label_val if label_val in [0, 1, 2] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2] + if label_val in [0, 1, 2] and segment_name not in segment_names: + # Temporary solution to ensure only segments of a particular class are loaded for each UID + if UID in uid_nsr and label_val == 0: + segment_names.append(segment_name) + labels[segment_name] = label_val + elif UID in uid_af and label_val == 1: + segment_names.append(segment_name) + labels[segment_name] = label_val + elif UID in uid_pacpvc and label_val == 2: + segment_names.append(segment_name) + if self.binary: + labels[segment_name] = 0 + else: + labels[segment_name] = label_val + + return segment_names, labels + + + def load_data(self, segment_name): + data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0]) + if self.is_tfs: + seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.' + self.data_format) + else: + seg_path = os.path.join(data_path_UID, segment_name + '_density_poincare.' + self.data_format) + + + try: # Allows to define a block of code to be executed and specify how to handle any errors that might occur during its execution + if self.data_format == 'csv' and seg_path.endswith('.csv'): + # time_freq_plot = np.array(pd.read_csv(seg_path, header=None)) + + # Use PyArrow to read csv + read_options = csv.ReadOptions(autogenerate_column_names=True) + seg_data = csv.read_csv(seg_path, read_options=read_options) + time_freq_plot = seg_data.to_pandas().to_numpy() + + time_freq_tensor = torch.tensor(time_freq_plot).reshape(self.img_channels, self.img_size, self.img_size) + elif self.data_format == 'png' and seg_path.endswith('.png'): + img = Image.open(seg_path) + img_data = np.array(img) + time_freq_tensor = torch.tensor(img_data).unsqueeze(0) + elif self.data_format == 'pt' and seg_path.endswith('.pt'): + time_freq_tensor = torch.load(seg_path) + else: + raise ValueError("Unsupported file format") + + if self.downsample is not None: + # Downsample the image + # Use OpenCV to resize the array to downsample x downsample using INTER_AREA interpolation + time_freq_array = cv2.resize(np.array(time_freq_tensor.reshape(self.img_size, self.img_size).to('cpu')), (self.downsample, self.downsample), interpolation=cv2.INTER_AREA) + time_freq_tensor = torch.tensor(time_freq_array, dtype=self.dtype).reshape(self.img_channels, self.downsample, self.downsample) + else: + time_freq_tensor = time_freq_tensor.reshape(self.img_channels, self.img_size, self.img_size).to(self.dtype) + + if self.standardize: + time_freq_tensor = self.standard_scaling(time_freq_tensor) # Standardize the data + + return time_freq_tensor + + except Exception as e: + print(f"Error processing segment: {segment_name}. 
Exception: {str(e)}") + if self.downsample is not None: + return torch.zeros((self.img_channels, self.downsample, self.downsample)) # Return zeros in case of an error + else: + return torch.zeros((self.img_channels, self.img_size, self.img_size)) # Return zeros in case of an error + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) # Converts data into 2D array, standardizes it, reshapes it back into 3D (1,X,X) + return torch.tensor(data, dtype=self.dtype) + +def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', + read_all_labels=False, drop_last=False, num_workers=4, start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True, binary=False): + torch.manual_seed(42) + g = torch.Generator() + g.manual_seed(42) + + pin_memory = False + if torch.cuda.is_available(): + pin_memory = True + + dataset = CustomDataset(data_path, labels_path, UIDs, standardize, data_format, read_all_labels, start_idx=start_idx, + img_channels=img_channels, img_size=img_size, downsample=downsample, data_type=data_type, is_tfs=is_tfs, binary=binary) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2, persistent_workers=True, pin_memory=pin_memory, worker_init_fn=seed_worker, generator=g) # Prefetches 2 batches ahead of current training iteration (allows loading of data simultaneously with training). Shuffle is set to False to resume training at a specific batch. + return dataloader + +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + +# Function to extract and preprocess data +def preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size, standardize=False, + read_all_labels=False, img_channels=1, img_size=128, downsample=None, data_type=torch.float32, pathmaster=None, binary=False): + start_idx = 0 + data_path, labels_path = pathmaster.data_paths(data_format) + + if data_format == 'csv': + num_workers = 6 + elif data_format == 'pt': + num_workers = 8 + + train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=standardize, + data_format=data_format, read_all_labels=read_all_labels, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs, binary=binary) + val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=standardize, + data_format=data_format, read_all_labels=read_all_labels, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs, binary=binary) + test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=standardize, + data_format=data_format, read_all_labels=read_all_labels, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs, binary=binary) + return train_loader, val_loader, test_loader + +def map_samples_to_uids(uncertain_sample_indices, dataset): + """ + Maps indices of uncertain samples back to their corresponding segment 
names or UIDs. + + Args: + - uncertain_sample_indices: Indices of the uncertain samples in the dataset. + - dataset: The dataset object which contains the mapping of segment names and UIDs. + + Returns: + - List of UIDs or segment names corresponding to the uncertain samples. + """ + return [dataset.segment_names[i] for i in uncertain_sample_indices] + +def update_train_loader_with_labeled_samples(current_train_loader, labeled_samples, batch_size): # Luis' doesn't seem to use this + """ + Updates the training DataLoader with newly labeled samples. + + Args: + - current_train_loader: The current DataLoader for the training set. + - labeled_samples: A list of tuples, each containing a data tensor and its new label. + - batch_size: Batch size for the DataLoader. + + Returns: + - DataLoader: The updated DataLoader with the new labeled samples. + """ + + # Extract the current dataset from the DataLoader + current_dataset = current_train_loader.dataset + + # Update the dataset with new samples and labels + for data_tensor, label in labeled_samples: + # Assuming the CustomDataset class has a method to add new data and labels + current_dataset.add_data_label_pair(data_tensor, label) + + # Create a new DataLoader with the updated dataset + updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=True, drop_last=False, num_workers=4, prefetch_factor=2) + + return updated_train_loader + +def update_train_loader_with_uncertain_samples(current_train_loader, new_sample_indices, batch_size): # Luis' uses this method for active learning + # Extract current UIDs from the current_train_loader + current_dataset = current_train_loader.dataset + # Map new_samples back to their corresponding segment names or UIDs + new_uids = map_samples_to_uids(new_sample_indices, current_dataset) + # Add new UIDs to the current dataset and refresh it + current_dataset.add_uids(new_uids) + # Create new DataLoader with the updated dataset + updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=False) + return updated_train_loader + + \ No newline at end of file diff --git a/utils/dataloader_database.py b/utils/dataloader_database.py new file mode 100644 index 0000000..c3ab6b1 --- /dev/null +++ b/utils/dataloader_database.py @@ -0,0 +1,223 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 26 18:29:59 2024 + +@author: dchen +""" +import os +import numpy as np +import pandas as pd +from PIL import Image +import torch +from torch.utils.data import Dataset, DataLoader +from sklearn.preprocessing import StandardScaler +from torchvision.transforms import ToTensor +import math +from numpy import random +from numpy.random import choice +import cv2 +from pyarrow import csv + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, standardize=True, data_format='pt', start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True, binary=False): + self.data_path = data_path + self.labels_path = labels_path + self.standardize = standardize + self.data_format = data_format + self.transforms = ToTensor() + self.start_idx = start_idx # Initial batch index to start from, useful for resuming training + self.img_channels = img_channels + self.img_size = img_size + self.downsample = downsample + self.is_tfs = is_tfs + self.dtype = data_type + self.binary = binary + + self.refresh_dataset() + + + 
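+    # Illustrative usage sketch (hypothetical paths shown only for clarity; the real
+    # data_path/labels_path come from PathMaster via preprocess_data() at the bottom of this file):
+    #     dataset = CustomDataset('path/to/pt_segments', 'path/to/labels.csv', data_format='pt')
+    #     sample = dataset[0]  # dict with keys 'data', 'label', 'segment_name'
+    #     sample['data'].shape  # (img_channels, img_size, img_size), cast to data_type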
def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + + def __len__(self): # Method is implicitly called when len() is used on an instance of CustomDataset + return len(self.segment_names) + + + def __getitem__(self, idx): # Method is implicitly called when getitem() is used on an instance of CustomDataset. It is called batch_size number of times per iteration of dataloader | Loads segments as needed (lazy loading) + actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed (i.e. index falls out of bounds) + segment_name = self.segment_names[actual_idx] + label = self.labels[segment_name] + + data_tensor = self.load_data(segment_name) + + return {'data': data_tensor, 'label': label, 'segment_name': segment_name} + + # When iterating over the dataloader, which returns batches of data, each batch will contain a dictionary with keys corresponding to the data and labels. + + # Since the dataloader's dataset's __getitem__ method returns a dictionary with keys 'data', 'label', and 'segment_name', the returned batch will be a dictionary where: + + # The 'data' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the data. + # The 'label' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the labels. + # The 'segment_name' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the segment_name. + + def set_start_idx(self, index): + self.start_idx = index + + + def extract_segment_names_and_labels(self): # Only extract the segments and labels of a particular class, temporary solution + segment_names = [] + labels = {} + label_file = self.labels_path + if os.path.exists(label_file): + # Use PyArrow to read csv + parse_options = csv.ParseOptions(delimiter=',') # Indicate delimiter + read_options = csv.ReadOptions(column_names=['segment_names', 'labels'], skip_rows=1) # Assign desired column names and skip the first row (headers) + label_data = csv.read_csv(label_file, parse_options=parse_options, read_options=read_options) + label_data = label_data.to_pandas() + + label_segment_names = label_data['segment_names'] + for idx, segment_name in enumerate(label_segment_names): # enumerate() returns the value and corresponding index of each element in an iterable + label_val = label_data['labels'].values[idx] + + if self.binary and label_val == 2: # If binary is true, set all PAC/PVC to 0 (non-AF) + label_val = 0 + + segment_names.append(segment_name) + labels[segment_name] = label_val + + return segment_names, labels + + + def second_to_last_directory_name(self, path): + # Normalize path separator to '/' + path = path.replace('\\', '/') + + # Split the path into its components + components = path.split('/') + + # Remove empty components + components = [c for c in components if c] + + # Check if the path ends with a separator (indicating it's a directory) + if path.endswith('/'): + # Remove the last empty component + components.pop() + + # If there's only one or zero directories in the path, return None + if len(components) <= 1: + return None + + # Return the name of the second-to-last directory + return components[-2] + + + def load_data(self, segment_name): + seg_path = os.path.join(self.data_path, segment_name + '.' 
+ self.data_format) + + try: # Allows to define a block of code to be executed and specify how to handle any errors that might occur during its execution + if self.data_format == 'csv' and seg_path.endswith('.csv'): + # data_plot = np.array(pd.read_csv(seg_path, header=None)) + + # Use PyArrow to read csv + read_options = csv.ReadOptions(autogenerate_column_names=True) + seg_data = csv.read_csv(seg_path, read_options=read_options) + data_plot = seg_data.to_pandas().to_numpy() + + data_tensor = torch.tensor(data_plot).reshape(self.img_channels, self.img_size, self.img_size) + elif self.data_format == 'png' and seg_path.endswith('.png'): + img = Image.open(seg_path) + img_data = np.array(img) + data_tensor = torch.tensor(img_data).unsqueeze(0) + elif self.data_format == 'pt' and seg_path.endswith('.pt'): + data_tensor = torch.load(seg_path) + else: + raise ValueError("Unsupported file format") + + if self.downsample is not None: + # Downsample the image + # Use OpenCV to resize the array to downsample x downsample using INTER_AREA interpolation + data_array = cv2.resize(np.array(data_tensor.reshape(self.img_size, self.img_size).to('cpu')), (self.downsample, self.downsample), interpolation=cv2.INTER_AREA) + data_tensor = torch.tensor(data_array, dtype=self.dtype).reshape(self.img_channels, self.downsample, self.downsample) + else: + data_tensor = data_tensor.reshape(self.img_channels, self.img_size, self.img_size).to(self.dtype) + + if self.standardize: + data_tensor = self.standard_scaling(data_tensor) # Standardize the data + + return data_tensor + + except Exception as e: + print(f"Error processing segment: {segment_name}. Exception: {str(e)}") + if self.downsample is not None: + return torch.zeros((self.img_channels, self.downsample, self.downsample)) # Return zeros in case of an error + else: + return torch.zeros((self.img_channels, self.img_size, self.img_size)) # Return zeros in case of an error + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) # Converts data into 2D array, standardizes it, reshapes it back into 3D (1,X,X) + return torch.tensor(data, dtype=self.dtype) + +def load_data_split_batched(data_path, labels_path, batch_size, standardize=False, data_format='csv', + drop_last=False, num_workers=4, start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float16, is_tfs=True, binary=False): + torch.manual_seed(42) + g = torch.Generator() + g.manual_seed(42) + + pin_memory = False + if torch.cuda.is_available(): + pin_memory = True + + dataset = CustomDataset(data_path, labels_path, standardize, data_format, start_idx=start_idx, + img_channels=img_channels, img_size=img_size, downsample=downsample, data_type=data_type, is_tfs=is_tfs, binary=binary) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2, persistent_workers=True, pin_memory=pin_memory, worker_init_fn=seed_worker, generator=g) # Prefetches 2 batches ahead of current training iteration (allows loading of data simultaneously with training). Shuffle is set to False to resume training at a specific batch. 
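+    # Descriptive note (added for clarity): the DataLoader above shuffles with a torch.Generator
+    # seeded to 42 and seeds every worker through seed_worker, so the sampling order and any
+    # NumPy/random calls inside workers are repeatable across runs even though shuffle=True.
+    # Each yielded batch is a dict, e.g.:
+    #     batch['data']         -> tensor of shape (batch_size, img_channels, H, W), where H = W = img_size or downsample
+    #     batch['label']        -> 1-D tensor of class indices
+    #     batch['segment_name'] -> list of segment-name strings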
+ return dataloader + +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + +# Function to extract and preprocess data +def preprocess_data(database, batch_size, standardize=False, img_channels=1, img_size=128, + downsample=None, data_type=torch.float32, pathmaster=None, binary=False): + start_idx = 0 + + if database == 'DeepBeat' or database == 'deepbeat' or database == 'Deepbeat': + data_path, labels_path = pathmaster.deepbeat_paths() + elif database == 'MIMICIII' or database == 'mimiciii' or database == 'mimicIII' or database == 'mimic3': + data_path, labels_path = pathmaster.mimic3_paths() + elif database == 'Simband' or database == 'simband': + data_path, labels_path = pathmaster.simband_paths() + else: + print('Invalid Database') + + data_format = 'pt' + + num_workers = 1 + + test_loader = load_data_split_batched(data_path, labels_path, batch_size, standardize=standardize, + data_format=data_format, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs, binary=binary) + # loader2 = load_data_split_batched(data_path, labels_path, batch_size, standardize=standardize, + # data_format=data_format, num_workers=num_workers, + # start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + # data_type=data_type, is_tfs=pathmaster.is_tfs, binary=False) + # loader3 = load_data_split_batched(data_path, labels_path, batch_size, standardize=standardize, + # data_format=data_format, num_workers=num_workers, + # start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + # data_type=data_type, is_tfs=pathmaster.is_tfs, binary=False) + return test_loader # loader1, loader2, loader3 + + \ No newline at end of file diff --git a/utils/dataloader_smote.py b/utils/dataloader_smote.py new file mode 100644 index 0000000..9266028 --- /dev/null +++ b/utils/dataloader_smote.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 26 18:29:59 2024 + +@author: dchen +""" +import os +import numpy as np +import pandas as pd +from PIL import Image +import torch +from torch.utils.data import Dataset, DataLoader +from sklearn.preprocessing import StandardScaler +from torchvision.transforms import ToTensor +import math +from numpy import random +from numpy.random import choice +import cv2 +from pyarrow import csv + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + +class CustomDataset(Dataset): + def __init__(self, smote_path, groups, standardize=True, data_format='pt', start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True): + self.smote_path = smote_path + self.standardize = standardize + self.data_format = data_format + self.transforms = ToTensor() + self.start_idx = start_idx # Initial batch index to start from, useful for resuming training + self.img_channels = img_channels + self.img_size = img_size + self.downsample = downsample + self.is_tfs = is_tfs + self.groups = groups + self.dtype = data_type + + self.refresh_dataset() + + + def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + + def __len__(self): # Method is implicitly called when len() is used on an instance of CustomDataset + return len(self.segment_names) + + + def __getitem__(self, idx): # Method is 
implicitly called when getitem() is used on an instance of CustomDataset. It is called batch_size number of times per iteration of dataloader | Loads segments as needed (lazy loading) + actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed (i.e. index falls out of bounds) + segment_name = self.segment_names[actual_idx] + label = self.labels[segment_name] + + data_tensor = self.load_data(segment_name) + + return {'data': data_tensor, 'label': label, 'segment_name': segment_name} + + # When iterating over the dataloader, which returns batches of data, each batch will contain a dictionary with keys corresponding to the data and labels. + + # Since the dataloader's dataset's __getitem__ method returns a dictionary with keys 'data', 'label', and 'segment_name', the returned batch will be a dictionary where: + + # The 'data' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the data. + # The 'label' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the labels. + # The 'segment_name' key will correspond to a tensor of shape (batch_size, ...), representing the shape of the segment_name. + + def set_start_idx(self, index): + self.start_idx = index + + + def extract_segment_names_and_labels(self): # Only extract the segments and labels of a particular class, temporary solution + segment_names = [] + labels = {} + + group_directories = [entry for entry in os.listdir(self.smote_path) if os.path.isdir(os.path.join(self.smote_path, entry))] + group = list(set(self.groups).intersection(set(group_directories)))[0] + + smote_type = self.second_to_last_directory_name(self.smote_path) + label_file = os.path.join(self.smote_path, smote_type + '_' + group + '_names_labels.csv') + if os.path.exists(label_file): + # Use PyArrow to read csv + parse_options = csv.ParseOptions(delimiter=',') # Indicate delimiter + read_options = csv.ReadOptions(column_names=['segment_name', 'label'], skip_rows=1) # Assign desired column names and skip the first row (headers) + label_data = csv.read_csv(label_file, parse_options=parse_options, read_options=read_options) + label_data = label_data.to_pandas() + + label_segment_names = label_data['segment_name'] + for idx, segment_name in enumerate(label_segment_names): # enumerate() returns the value and corresponding index of each element in an iterable + label_val = label_data['label'].values[idx] + segment_names.append(segment_name) + labels[segment_name] = label_val + + return segment_names, labels + + + def second_to_last_directory_name(self, path): + # Normalize path separator to '/' + path = path.replace('\\', '/') + + # Split the path into its components + components = path.split('/') + + # Remove empty components + components = [c for c in components if c] + + # Check if the path ends with a separator (indicating it's a directory) + if path.endswith('/'): + # Remove the last empty component + components.pop() + + # If there's only one or zero directories in the path, return None + if len(components) <= 1: + return None + + # Return the name of the second-to-last directory + return components[-2] + + + def load_data(self, segment_name): + data_path_group = os.path.join(self.smote_path, segment_name.split('_')[1]) + seg_path = os.path.join(data_path_group, segment_name + '.' 
+ self.data_format) + + try: # Allows to define a block of code to be executed and specify how to handle any errors that might occur during its execution + if self.data_format == 'csv' and seg_path.endswith('.csv'): + # data_plot = np.array(pd.read_csv(seg_path, header=None)) + + # Use PyArrow to read csv + read_options = csv.ReadOptions(autogenerate_column_names=True) + seg_data = csv.read_csv(seg_path, read_options=read_options) + data_plot = seg_data.to_pandas().to_numpy() + + data_tensor = torch.tensor(data_plot).reshape(self.img_channels, self.img_size, self.img_size) + elif self.data_format == 'png' and seg_path.endswith('.png'): + img = Image.open(seg_path) + img_data = np.array(img) + data_tensor = torch.tensor(img_data).unsqueeze(0) + elif self.data_format == 'pt' and seg_path.endswith('.pt'): + data_tensor = torch.load(seg_path) + else: + raise ValueError("Unsupported file format") + + if self.downsample is not None: + # Downsample the image + # Use OpenCV to resize the array to downsample x downsample using INTER_AREA interpolation + data_array = cv2.resize(np.array(data_tensor.reshape(self.img_size, self.img_size).to('cpu')), (self.downsample, self.downsample), interpolation=cv2.INTER_AREA) + data_tensor = torch.tensor(data_array, dtype=self.dtype).reshape(self.img_channels, self.downsample, self.downsample) + else: + data_tensor = data_tensor.reshape(self.img_channels, self.img_size, self.img_size).to(self.dtype) + + if self.standardize: + data_tensor = self.standard_scaling(data_tensor) # Standardize the data + + return data_tensor + + except Exception as e: + print(f"Error processing segment: {segment_name}. Exception: {str(e)}") + if self.downsample is not None: + return torch.zeros((self.img_channels, self.downsample, self.downsample)) # Return zeros in case of an error + else: + return torch.zeros((self.img_channels, self.img_size, self.img_size)) # Return zeros in case of an error + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) # Converts data into 2D array, standardizes it, reshapes it back into 3D (1,X,X) + return torch.tensor(data, dtype=self.dtype) + +def load_data_split_batched(smote_path, groups, batch_size, standardize=False, data_format='csv', + drop_last=False, num_workers=4, start_idx=0, + img_channels=1, img_size=128, downsample=None, data_type=torch.float32, is_tfs=True): + torch.manual_seed(42) + g = torch.Generator() + g.manual_seed(42) + + pin_memory = False + if torch.cuda.is_available(): + pin_memory = True + + dataset = CustomDataset(smote_path, groups, standardize, data_format, start_idx=start_idx, + img_channels=img_channels, img_size=img_size, downsample=downsample, data_type=data_type, is_tfs=is_tfs) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2, persistent_workers=True, pin_memory=pin_memory, worker_init_fn=seed_worker, generator=g) # Prefetches 2 batches ahead of current training iteration (allows loading of data simultaneously with training). Shuffle is set to False to resume training at a specific batch. 
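+    # Descriptive note on the layout this SMOTE loader assumes (inferred from the code above):
+    # smote_path holds one sub-directory per group (e.g. 'fold1', 'fold2', 'train', 'validate', 'test')
+    # plus a label file named '<smote_type>_<group>_names_labels.csv'; each segment is read from
+    # '<group sub-directory>/<segment_name>.<data_format>'. The group that is actually loaded is
+    # taken from the intersection of `groups` with those sub-directory names.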
+ return dataloader + +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + +# Function to extract and preprocess data +def preprocess_data(smote_type, split, batch_size, standardize=False, img_channels=1, img_size=128, + downsample=None, data_type=torch.float32, pathmaster=None): + start_idx = 0 + smote_path = pathmaster.smote_path(smote_type, split) + data_format = 'pt' + + num_workers = 8 + + loader1 = load_data_split_batched(smote_path, ['fold1', 'train'], batch_size, standardize=standardize, + data_format=data_format, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs) + loader2 = load_data_split_batched(smote_path, ['fold2', 'validate'], batch_size, standardize=standardize, + data_format=data_format, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs) + loader3 = load_data_split_batched(smote_path, ['test', 'test'], batch_size, standardize=standardize, + data_format=data_format, num_workers=num_workers, + start_idx=start_idx, img_channels=img_channels, img_size=img_size, downsample=downsample, + data_type=data_type, is_tfs=pathmaster.is_tfs) + return loader1, loader2, loader3 + + \ No newline at end of file diff --git a/utils/get_paths.py b/utils/get_paths.py new file mode 100644 index 0000000..b22752e --- /dev/null +++ b/utils/get_paths.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 27 14:55:43 2024 + +@author: dchen +""" +import os + +def data_paths(data_format, is_linux=False, is_hpc=False): + if is_linux: + base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" + labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + saving_base_path = "/mnt/r/ENGR_Chon/Darren/Honors_Thesis/saves/analysis" + elif is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + saving_base_path = "/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/saves/analysis" + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = r"R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch" # Why double \\ before NIH_Pulsewatch_Database? + labels_base_path = r"R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" # Why double \\ before NIH_Pulsewatch_Database? + saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\saves" # Only when writing to file in the R drive do we need the entire address for the R drive + if data_format == 'csv': + data_path = os.path.join(base_path, "TFS_csv") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "analysis") + elif data_format == 'png': + data_path = os.path.join(base_path, "TFS_plots") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "analysis") + elif data_format == 'pt': + data_path = os.path.join(base_path, "PT_format") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "analysis") + else: + raise ValueError("Invalid data format. 
Choose 'csv', 'png, or 'pt'.") + + return data_path, labels_path, saving_path + + +def models_path(is_linux=False, is_hpc=False): + if is_linux: + models_path = "/mnt/r/ENGR_Chon/Darren/Honors_Thesis/models" + elif is_hpc: + models_path = "/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/models" + else: + models_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\models" + + return models_path + +# Base saving paths +focus = 'misc' +# focus = '2_layers_per_block' +# focus = '2_layers_per_block' +linux_saves_path = '/mnt/r/ENGR_Chon/Darren/Honors_Thesis/saves/' + focus + '/' +hpc_saves_path = '/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/saves/' + focus + '/' +saves_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\saves' + '\\' + focus + '\\' + +def losslists_path(is_linux=False, is_hpc=False): + if is_linux: + losslists_path = linux_saves_path + 'losslists' + elif is_hpc: + losslists_path = hpc_saves_path + 'losslists' + else: + losslists_path = saves_path + 'losslists' + + return losslists_path + + +def runtime_lists_path(is_linux=False, is_hpc=False): + if is_linux: + runtime_lists_path = linux_saves_path + 'runtime_lists' + elif is_hpc: + runtime_lists_path = hpc_saves_path + 'runtime_lists' + else: + runtime_lists_path = saves_path + 'runtime_lists' + + return runtime_lists_path + + +def predictions_path(is_linux=False, is_hpc=False): + if is_linux: + predictions_path = linux_saves_path + 'predictions' + elif is_hpc: + predictions_path = hpc_saves_path + 'predictions' + else: + predictions_path = saves_path + 'predictions' + + return predictions_path + +def prediction_proba_path(is_linux=False, is_hpc=False): + if is_linux: + prediction_proba_path = linux_saves_path + 'prediction_proba' + elif is_hpc: + prediction_proba_path = hpc_saves_path + 'prediction_proba' + else: + prediction_proba_path = saves_path + 'prediction_proba' + + return prediction_proba_path + + +def metrics_path(is_linux=False, is_hpc=False): + if is_linux: + metrics_path = linux_saves_path + 'metrics' + elif is_hpc: + metrics_path = hpc_saves_path + 'metrics' + else: + metrics_path = saves_path + 'metrics' + + return metrics_path + + +def confusion_matrices_path(is_linux=False, is_hpc=False): + if is_linux: + confusion_matrices_path = linux_saves_path + 'confusion_matrices' + elif is_hpc: + confusion_matrices_path = hpc_saves_path + 'confusion_matrices' + else: + confusion_matrices_path = saves_path + 'confusion_matrices' + + return confusion_matrices_path + + +def checkpoints_path(is_linux=False, is_hpc=False): + if is_linux: + checkpoints_path = linux_saves_path + 'checkpoints' + elif is_hpc: + checkpoints_path = hpc_saves_path + 'checkpoints' + else: + checkpoints_path = saves_path + 'checkpoints' + + return checkpoints_path + +def hyperparameters_path(is_linux=False, is_hpc=False): + if is_linux: + hyperparameters_path = linux_saves_path + 'hyperparameters' + elif is_hpc: + hyperparameters_path = hpc_saves_path + 'hyperparameters' + else: + hyperparameters_path = saves_path + 'hyperparameters' + + return hyperparameters_path + +def loss_curves_path(is_linux=False, is_hpc=False): + if is_linux: + loss_curves_path = linux_saves_path + 'loss_curves' + elif is_hpc: + loss_curves_path = hpc_saves_path + 'loss_curves' + else: + loss_curves_path = saves_path + 'loss_curves' + + return loss_curves_path + + diff --git a/utils/misc_func.py b/utils/misc_func.py new file mode 100644 index 0000000..6893a71 --- /dev/null +++ b/utils/misc_func.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 
-*- +""" +Created on Sun Mar 3 03:56:36 2024 + +@author: dchen +""" + +def substring_between_strings(main_string, start_string, end_string): + start_index = main_string.find(start_string) + if start_index == -1: + return None + + end_index = main_string.find(end_string, start_index + len(start_string)) + if end_index == -1: + return None + + return main_string[start_index + len(start_string):end_index] + + +def string_to_boolean(input_string): + if input_string.lower() in ['true', 't', 'yes', 'y', '1']: + return True + elif input_string.lower() in ['false', 'f', 'no', 'n', '0']: + return False + else: + raise ValueError("String does not represent a boolean value") diff --git a/utils/model_func.py b/utils/model_func.py new file mode 100644 index 0000000..95f19de --- /dev/null +++ b/utils/model_func.py @@ -0,0 +1,2145 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 26 14:58:20 2024 + +@author: dchen +""" + +import os +import sys +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from tqdm import tqdm +import random +import time +import torch.autograd as autograd +from torch.cuda.amp import autocast, GradScaler + +# Import my own functions and classes +# from utils import get_paths +from utils import plot_save_func +from models.densenet import DenseNet3 as DenseNet +from models.densenet_configurable import DenseNet as DenseNet_config + +# If GPU is available, use GPU, else use CPU +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Seeds +torch.manual_seed(42) +np.random.seed(42) +random.seed(42) + + +def cross_val_2fold_DenseNet(model_hyperparameters, fold1_loader, fold2_loader, model_type=torch.float32, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0001 + + # Define img_channels + img_channels = 1 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + model_fold1, optimizer_fold1, scheduler_fold1, model_fold2, optimizer_fold2, scheduler_fold2, epoch, loss = load_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, pathmaster) + start_epoch = epoch + 1 + best_loss_cross_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck 
= model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + best_loss_cross_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_cross_val = [] + + losslist_train_fold1 = [] + losslist_val_fold1 = [] + + losslist_train_fold2 = [] + losslist_val_fold2 = [] + + # Initialize runtime list + runtime_list = [] + + # Cross-validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Cross-Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + sys.stdout.flush() + + # Fold 1 training =============================================================================================================================================================== + model_fold1.train() + train_cum_loss_fold1 = 0 + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model_fold1(X_train) + + # Regularization (if applicable) + l1 = 0 + for p in model_fold1.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) + lambda_l1 * l1 + train_cum_loss_fold1 += batch_loss_train.item() + + # Clear gradients + optimizer_fold1.zero_grad() + + # Backwards pass + batch_loss_train.backward() + + # Optimizer step + optimizer_fold1.step() + + # Update scheduler + scheduler_fold1.step() + + loss_train_fold1 = train_cum_loss_fold1 / len(fold1_loader) + + sys.stderr.flush() + print('\nTraining for Fold #1 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 1 validation 
============================================================================================= + model_fold1.eval() + with torch.no_grad(): + val_cum_loss_fold1 = 0 + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + logits, _, _ = model_fold1(X_val) + val_cum_loss_fold1 += criterion_val(logits.float(), Y_val.long()).item() + + loss_val_fold1 = val_cum_loss_fold1 / len(fold2_loader) + + sys.stderr.flush() + print('\nValidation for Fold #1 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 2 training =============================================================================================================================================================== + model_fold2.train() + train_cum_loss_fold2 = 0 + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model_fold2(X_train) + + # Regularization (if applicable) + l1 = 0 + for p in model_fold2.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) + lambda_l1 * l1 + train_cum_loss_fold2 += batch_loss_train.item() + + # Clear gradients + optimizer_fold2.zero_grad() + + # Backwards pass + batch_loss_train.backward() + + # Optimizer step + optimizer_fold2.step() + + # Update scheduler + scheduler_fold2.step() + + loss_train_fold2 = train_cum_loss_fold2 / len(fold2_loader) + + sys.stderr.flush() + print('\nTraining for Fold #2 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 2 validation ============================================================================================================================================================= + model_fold2.eval() + with torch.no_grad(): + val_cum_loss_fold2 = 0 + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + logits, _, _ = model_fold2(X_val) + val_cum_loss_fold2 += criterion_val(logits.float(), Y_val.long()).item() + + loss_val_fold2 = val_cum_loss_fold2 / len(fold1_loader) + + sys.stderr.flush() + print('\nValidation for Fold #2 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + # =============================================================================================================================================================================== + + # Calculate epoch losses + epoch_loss_train = (loss_train_fold1 + loss_train_fold2) / 2 + epoch_loss_cross_val = (loss_val_fold1 + loss_val_fold2) / 2 + + # Append to losslists + losslist_train.append(epoch_loss_train) + losslist_cross_val.append(epoch_loss_cross_val) + + losslist_train_fold1.append(loss_train_fold1) + 
losslist_val_fold1.append(loss_val_fold1) + + losslist_train_fold2.append(loss_train_fold2) + losslist_val_fold2.append(loss_val_fold2) + + # Return the best cross-validation loss and save best checkpoint (epoch) + best_loss_cross_val = save_best_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, epoch_loss_cross_val, best_loss_cross_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Cross-Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_cross_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_cross_val): + break + + # Saving + if save: + title = 'Training and Cross-Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_cross_val, title, save, pathmaster) + + plot_save_func.save_losslists_2fold(losslist_train_fold1, losslist_val_fold1, losslist_train_fold2, losslist_val_fold2, losslist_train, losslist_cross_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def cross_val_2fold_DenseNet_mixed(model_hyperparameters, fold1_loader, fold2_loader, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0005 + + # Define img_channels + img_channels = 1 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + model_fold1, optimizer_fold1, scheduler_fold1, model_fold2, optimizer_fold2, scheduler_fold2, epoch, loss = load_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, pathmaster) + start_epoch = epoch + 1 + best_loss_cross_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck = model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = 
DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + model_fold2 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + best_loss_cross_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_cross_val = [] + + losslist_train_fold1 = [] + losslist_val_fold1 = [] + + losslist_train_fold2 = [] + losslist_val_fold2 = [] + + # Initialize runtime list + runtime_list = [] + + # Initialize predictions lists + predictions_list_train = [] + predictions_list_val = [] + + # Initialize true labels lists + true_labels_list_train = [] + true_labels_list_val = [] + + # Scalers + scaler_fold1 = GradScaler() + scaler_fold2 = GradScaler() + + # Cross-validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Cross-Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + sys.stdout.flush() + + # Epoch predictions + predictions_epoch_train = [] + predictions_epoch_val = [] + + # Fold 1 training =============================================================================================================================================================== + model_fold1.train() + train_cum_loss_fold1 = 0 + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_train.append(torch.reshape(Y_train, (-1,1))) + + with autocast(): + # Forward pass + logits, predictions, _ = model_fold1(X_train) + + predictions_epoch_train.append(torch.reshape(predictions, (-1,1))) + + # Regularization (if applicable) + l1 = 0 + for p in model_fold1.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) + lambda_l1 * l1 + train_cum_loss_fold1 += batch_loss_train.item() + + # Clear gradients + optimizer_fold1.zero_grad() + + # Backwards pass + scaler_fold1.scale(batch_loss_train).backward() + + # Optimizer step + scaler_fold1.step(optimizer_fold1) + + # Scaler update + scaler_fold1.update() + + # Update 
scheduler + scheduler_fold1.step() + + loss_train_fold1 = train_cum_loss_fold1 / len(fold1_loader) + + sys.stderr.flush() + print('\nTraining for Fold #1 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 1 validation ============================================================================================================================================================= + model_fold1.eval() + with torch.no_grad(): + val_cum_loss_fold1 = 0 + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_val.append(torch.reshape(Y_val, (-1,1))) + + logits, predictions, _ = model_fold1(X_val) + + predictions_epoch_val.append(torch.reshape(predictions, (-1,1))) + + val_cum_loss_fold1 += criterion_val(logits.float(), Y_val.long()).item() + + loss_val_fold1 = val_cum_loss_fold1 / len(fold2_loader) + + sys.stderr.flush() + print('\nValidation for Fold #1 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 2 training =============================================================================================================================================================== + model_fold2.train() + train_cum_loss_fold2 = 0 + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_train.append(torch.reshape(Y_train, (-1,1))) + + with autocast(): + # Forward pass + logits, predictions, _ = model_fold2(X_train) + + predictions_epoch_train.append(torch.reshape(predictions, (-1,1))) + + # Regularization (if applicable) + l1 = 0 + for p in model_fold2.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) + lambda_l1 * l1 + train_cum_loss_fold2 += batch_loss_train.item() + + # Clear gradients + optimizer_fold2.zero_grad() + + # Backwards pass + scaler_fold2.scale(batch_loss_train).backward() + + # Optimizer step + scaler_fold2.step(optimizer_fold2) + + # Scaler update + scaler_fold2.update() + + # Update scheduler + scheduler_fold2.step() + + loss_train_fold2 = train_cum_loss_fold2 / len(fold2_loader) + + sys.stderr.flush() + print('\nTraining for Fold #2 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Fold 2 validation ============================================================================================================================================================= + model_fold2.eval() + with torch.no_grad(): + val_cum_loss_fold2 = 0 + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_val.append(torch.reshape(Y_val, (-1,1))) + + logits, 
predictions, _ = model_fold2(X_val) + + predictions_epoch_val.append(torch.reshape(predictions, (-1,1))) + + val_cum_loss_fold2 += criterion_val(logits.float(), Y_val.long()).item() + + loss_val_fold2 = val_cum_loss_fold2 / len(fold1_loader) + + sys.stderr.flush() + print('\nValidation for Fold #2 in Epoch', epoch, 'has been completed!') + sys.stdout.flush() + # =============================================================================================================================================================================== + + # Caluclate epoch losses + epoch_loss_train = (loss_train_fold1 + loss_train_fold2) / 2 + epoch_loss_cross_val = (loss_val_fold1 + loss_val_fold2) / 2 + + # Append to losslists + losslist_train.append(epoch_loss_train) + losslist_cross_val.append(epoch_loss_cross_val) + + losslist_train_fold1.append(loss_train_fold1) + losslist_val_fold1.append(loss_val_fold1) + + losslist_train_fold2.append(loss_train_fold2) + losslist_val_fold2.append(loss_val_fold2) + + # Return the best cross-validation loss and save best checkpoint (epoch) + best_loss_cross_val = save_best_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, epoch_loss_cross_val, best_loss_cross_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Cross-Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_cross_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch predictions + predictions_epoch_train = np.array(torch.cat(predictions_epoch_train, dim=0).to('cpu')) + predictions_epoch_val = np.array(torch.cat(predictions_epoch_val, dim=0).to('cpu')) + + predictions_list_train.append(predictions_epoch_train) + predictions_list_val.append(predictions_epoch_val) + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_cross_val): + break + + # Convert true label list into array + true_labels_train = np.array(torch.cat(true_labels_list_train, dim=0).to('cpu')) + true_labels_val = np.array(torch.cat(true_labels_list_val, dim=0).to('cpu')) + + # Saving + if save: + title = 'Training and Cross-Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_cross_val, title, save, pathmaster) + + title = 'Training and Cross-Validation Accuracy' + plot_save_func.accuracy_curves(true_labels_train, true_labels_val, predictions_list_train, predictions_list_val, title, save, pathmaster) + + plot_save_func.save_losslists_2fold(losslist_train_fold1, losslist_val_fold1, losslist_train_fold2, losslist_val_fold2, losslist_train, losslist_cross_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +# Utilizes train() and validate() functions +def cross_val_2fold_DenseNet_func(model_hyperparameters, fold1_loader, fold2_loader, model_type=torch.float32, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0005 + + # Define img_channels + img_channels = 1 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and 
os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + model_fold1, optimizer_fold1, scheduler_fold1, model_fold2, optimizer_fold2, scheduler_fold2, epoch, loss = load_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, pathmaster) + start_epoch = epoch + 1 + best_loss_cross_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck = model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on loaded hyperparameters + model_fold1 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer_fold1 = torch.optim.Adam(model_fold1.parameters(), lr=lr) + optimizer_fold2 = torch.optim.Adam(model_fold2.parameters(), lr=lr) + + scheduler_fold1 = IdentityScheduler(optimizer_fold1) + scheduler_fold2 = IdentityScheduler(optimizer_fold2) + + best_loss_cross_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_cross_val = [] + + losslist_train_fold1 = [] + losslist_val_fold1 = [] + + losslist_train_fold2 = [] + losslist_val_fold2 = [] + + # Initialize runtime list + runtime_list = [] + + # Cross-validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Cross-Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + sys.stdout.flush() + + # Fold 1 
(train on fold1, validate on fold2) + model_fold1, optimizer_fold1, scheduler_fold1, loss_train_fold1 = train(model_fold1, fold1_loader, optimizer_fold1, scheduler_fold1, criterion_train, lambda_l1) + loss_val_fold1 = validate(model_fold1, fold2_loader, criterion_val) + + # Fold 2 (train on fold2, validate on fold1) + model_fold2, optimizer_fold2, scheduler_fold2, loss_train_fold2 = train(model_fold2, fold2_loader, optimizer_fold2, scheduler_fold2, criterion_train, lambda_l1) + loss_val_fold2 = validate(model_fold2, fold1_loader, criterion_val) + + # Caluclate epoch losses + epoch_loss_train = (loss_train_fold1 + loss_train_fold2) / 2 + epoch_loss_cross_val = (loss_val_fold1 + loss_val_fold2) / 2 + + # Append to losslists + losslist_train.append(epoch_loss_train) + losslist_cross_val.append(epoch_loss_cross_val) + + losslist_train_fold1.append(loss_train_fold1) + losslist_val_fold1.append(loss_val_fold1) + + losslist_train_fold2.append(loss_train_fold2) + losslist_val_fold2.append(loss_val_fold2) + + # Return the best cross-validation loss and save best checkpoint (epoch) + best_loss_cross_val = save_best_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, epoch_loss_cross_val, best_loss_cross_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Cross-Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_cross_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_cross_val): + break + + # Saving + if save: + title = 'Training and Cross-Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_cross_val, title, save, pathmaster) + + plot_save_func.save_losslists_2fold(losslist_train_fold1, losslist_val_fold1, losslist_train_fold2, losslist_val_fold2, losslist_train, losslist_cross_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def train_validate_DenseNet(model_hyperparameters, train_loader, val_loader, model_type=torch.float32, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0005 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + scheduler = IdentityScheduler(optimizer) + + model, optimizer, scheduler, epoch, loss = load_checkpoint(model, optimizer, scheduler, pathmaster) + start_epoch = epoch + 1 + best_loss_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + 
growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck = model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on input hyperparameters + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create optimizer and scheduler + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + scheduler = IdentityScheduler(optimizer) + + best_loss_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_val = [] + + # Initialize runtime list + runtime_list = [] + + # Training and validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Training and Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + # Training + model.train() + # Reset training sum of epoch loss and batch_count + sum_epoch_loss_train = 0 + sys.stdout.flush() + for train_batch in tqdm(train_loader, total=len(train_loader), desc='Training Epoch', unit='batch', leave=False): + # Extract input and labels + # train_batch['data'].shape = [batch_size, img_channels, img_size, img_size] + X_train = train_batch['data'].reshape(train_batch['data'].shape[0], train_batch['data'].shape[1], train_batch['data'].shape[-1], train_batch['data'].shape[-1]).to(device=device) + Y_train = train_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model(X_train) + + # Regularization + l1 = 0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate sum of total loss for epoch with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) # Criterion returns a scalar tensor + batch_loss_train += lambda_l1 * l1 + + # Clear gradients + optimizer.zero_grad(set_to_none=True) + + # Backwards pass + batch_loss_train.backward() + + # Optimizer step + optimizer.step() + + # Generate epoch loss + sum_epoch_loss_train += batch_loss_train.item() + + # Update scheduler + scheduler.step() + + # Calculate epoch loss for training + epoch_loss_train = sum_epoch_loss_train / len(train_loader) + losslist_train.append(epoch_loss_train) + + sys.stderr.flush() + print('\nTraining for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Validation + model.eval() + sum_epoch_loss_val = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for val_batch in tqdm(val_loader, total=len(val_loader), desc='Validation Epoch', unit='batch', leave=False): + # Extract input and labels + X_val = val_batch['data'].reshape(val_batch['data'].shape[0], val_batch['data'].shape[1], 
val_batch['data'].shape[-1], val_batch['data'].shape[-1]).to(device=device) + Y_val = val_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model(X_val) + + # Calculate sum of total loss for epoch + sum_epoch_loss_val += criterion_val(logits.float(), Y_val.long()).item() # Criterion returns a scalar tensor + + # Calculate epoch loss for validation + epoch_loss_val = sum_epoch_loss_val / len(val_loader) + losslist_val.append(epoch_loss_val) + + sys.stderr.flush() + print('\nValidation for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # # Temporarily save checkpoint after each epoch + # save_checkpoint(model, optimizer, scheduler, epoch, loss=epoch_loss_val, checkpoint_path=temp_checkpoint_path) + + # Return the best validation loss and save best checkpoint (epoch) + best_loss_val = save_best_checkpoint(model, optimizer, scheduler, epoch, epoch_loss_val, best_loss_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_val): + break + + # Saving + if save: + title = 'Training and Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_val, title, save, pathmaster) + + plot_save_func.save_losslists(losslist_train, losslist_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def train_validate_DenseNet_mixed(model_hyperparameters, train_loader, val_loader, + n_epochs=100, n_classes=3, patience=10, save=False, + resume_checkpoint_path=None, pathmaster=None): + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Optimizer and scheduler hyperparameters + lr = 0.0005 + + # Resume checkpoint if specified + if resume_checkpoint_path is not None and os.path.exists(resume_checkpoint_path): + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + + # Define DenseNet model based on loaded hyperparameters + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + + # Create optimizer and scheduler + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + scheduler = IdentityScheduler(optimizer) + + model, optimizer, scheduler, epoch, loss = load_checkpoint(model, optimizer, scheduler, pathmaster) + start_epoch = epoch + 1 + best_loss_val = loss + else: + # Extract model hyperparameters + depth = model_hyperparameters['depth'] + growth_rate = model_hyperparameters['growth_rate'] + compression = model_hyperparameters['compression'] + bottleneck = model_hyperparameters['bottleneck'] + drop_rate = model_hyperparameters['drop_rate'] + class_weights = model_hyperparameters['class_weights'] + + # Define DenseNet model based on input hyperparameters + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device) + + # Create optimizer and 
scheduler + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + scheduler = IdentityScheduler(optimizer) + + best_loss_val = float('inf') # If no checkpoint is loaded, set to infinity + start_epoch = 0 + + if save: + # Save hyperparameters + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Create criterion for loss + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + + # Regularization + lambda_l1 = 0.01 + + # Initialize losslists + losslist_train = [] + losslist_val = [] + + # Initialize runtime list + runtime_list = [] + + # Initialize predictions lists + predictions_list_train = [] + predictions_list_val = [] + + # Initialize true labels lists + true_labels_list_train = [] + true_labels_list_val = [] + + # Scalers + scaler = GradScaler() + + # Training and validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in tqdm(range(start_epoch, n_epochs), desc='Training and Validation', unit='epoch', leave=False): # Creates a training progress bar with units of epoch + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + # Training + model.train() + # Reset training sum of epoch loss and batch_count + sum_epoch_loss_train = 0 + sys.stdout.flush() + + # Epoch predictions + predictions_epoch_train = [] + predictions_epoch_val = [] + + for train_batch in tqdm(train_loader, total=len(train_loader), desc='Training Epoch', unit='batch', leave=False): + # Extract input and labels + # train_batch['data'].shape = [batch_size, img_channels, img_size, img_size] + X_train = train_batch['data'].reshape(train_batch['data'].shape[0], train_batch['data'].shape[1], train_batch['data'].shape[-1], train_batch['data'].shape[-1]).to(device=device) + Y_train = train_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_train.append(torch.reshape(Y_train, (-1,1))) + + with autocast(): + # Forward pass + logits, predictions, _ = model(X_train) + + predictions_epoch_train.append(torch.reshape(predictions, (-1,1))) + + # Regularization + l1 = 0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate sum of total loss for epoch with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) # Criterion returns a scalar tensor + batch_loss_train += lambda_l1 * l1 + + # Clear gradients + optimizer.zero_grad(set_to_none=True) + + # Backwards pass + scaler.scale(batch_loss_train).backward() + + # Optimizer step + scaler.step(optimizer) + + # Scaler update + scaler.update() + + # Generate epoch loss + sum_epoch_loss_train += batch_loss_train.item() + + # Update scheduler + scheduler.step() + + # Calculate epoch loss for training + epoch_loss_train = sum_epoch_loss_train / len(train_loader) + losslist_train.append(epoch_loss_train) + + sys.stderr.flush() + print('\nTraining for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Validation + model.eval() + sum_epoch_loss_val = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for val_batch in tqdm(val_loader, total=len(val_loader), desc='Validation Epoch', unit='batch', leave=False): + # Extract input and labels + X_val = val_batch['data'].reshape(val_batch['data'].shape[0], val_batch['data'].shape[1],
val_batch['data'].shape[-1], val_batch['data'].shape[-1]).to(device=device) + Y_val = val_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_val.append(torch.reshape(Y_val, (-1,1))) + + # Forward pass + logits, predictions, _ = model(X_val) + predictions_epoch_val.append(torch.reshape(predictions, (-1,1))) + + # Calculate sum of total loss for epoch + sum_epoch_loss_val += criterion_val(logits.float(), Y_val.long()).item() # Criterion returns a scalar tensor + + # Calculate epoch loss for validation + epoch_loss_val = sum_epoch_loss_val / len(val_loader) + losslist_val.append(epoch_loss_val) + + sys.stderr.flush() + print('\nValidation for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # # Temporarily save checkpoint after each epoch + # save_checkpoint(model, optimizer, scheduler, epoch, loss=epoch_loss_val, checkpoint_path=temp_checkpoint_path) + + # Return the best validation loss and save best checkpoint (epoch) + best_loss_val = save_best_checkpoint(model, optimizer, scheduler, epoch, epoch_loss_val, best_loss_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch predictions + predictions_epoch_train = np.array(torch.cat(predictions_epoch_train, dim=0).to('cpu')) + predictions_epoch_val = np.array(torch.cat(predictions_epoch_val, dim=0).to('cpu')) + + predictions_list_train.append(predictions_epoch_train) + predictions_list_val.append(predictions_epoch_val) + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_val): + break + + # Convert true label list into array + true_labels_train = np.array(torch.cat(true_labels_list_train, dim=0).to('cpu')) + true_labels_val = np.array(torch.cat(true_labels_list_val, dim=0).to('cpu')) + + # Saving + if save: + title = 'Training and Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_val, title, save, pathmaster) + + title = 'Training and Validation Accuracy' + plot_save_func.accuracy_curves(true_labels_train, true_labels_val, predictions_list_train, predictions_list_val, title, save, pathmaster) + + plot_save_func.save_losslists(losslist_train, losslist_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + +def train_validate_DenseNet_config(config, train_loader, val_loader, + n_epochs=100, n_classes=3, patience=10, save=False, + pathmaster=None): + # # Set filetag + # file_tag = str(dt.datetime.now()) + # # Define characters to replace with underscores + # chars_to_replace = [' ', ':', '.', '-'] + + # # Replace characters with underscores + # for char in chars_to_replace: + # file_tag = file_tag.replace(char, '_') + # pathmaster.set_file_tag(file_tag) + + # Save hyperparameters + model_hyperparameters = { # Default, no bottleneck or compression + 'depth': config['depth'], + 'growth_rate': config['growth_rate'], + 'compression': config['compression'], + 'bottleneck': config['bottleneck'], + 'drop_rate': config['drop_rate'], + 'class_weights': config['class_weights'], + 'learning_rate': config['learning_rate'], + 'num_dense_tran': config['num_dense_tran'], + 'lambda_l1': config['lambda_l1'], + 
'activation': activation_to_string(config['activation']), + } + + if save: + plot_save_func.save_hyperparameters(model_hyperparameters, pathmaster) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + img_channels = 1 + + model = DenseNet_config(img_channels, config['depth'], n_classes, config['growth_rate'], config['compression'], + config['bottleneck'], config['drop_rate'], config['activation'], config['num_dense_tran']).to(device=device) + + # Loss function and optimizer + criterion_train = nn.CrossEntropyLoss(weight=torch.tensor(config['class_weights']).to(device=device)) + criterion_val = nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate']) + scheduler = IdentityScheduler(optimizer) + + + # Scalers + scaler = GradScaler() + + # Initialize losslists + losslist_train = [] + losslist_val = [] + + # Initialize predictions lists + predictions_list_train = [] + predictions_list_val = [] + + # Initialize true labels lists + true_labels_list_train = [] + true_labels_list_val = [] + + # Initialize runtime list + runtime_list = [] + + # Create EarlyStoppingCallback object + early_stopping_callback = EarlyStoppingCallback(patience) + + # Initialize best validation loss + best_loss_val = float('inf') # If no checkpoint is loaded, set to infinity + + start_epoch = 0 + # Training and validation + print('\n===========================================================================================') + sys.stdout.flush() + for epoch in range(start_epoch, n_epochs): + start_time = time.time() + sys.stderr.flush() + print("\nEntering Epoch:", epoch) + # Training + model.train() + # Reset training sum of epoch loss and batch_count + sum_epoch_loss_train = 0 + sys.stdout.flush() + + # Epoch predictions + predictions_epoch_train = [] + predictions_epoch_val = [] + + for train_batch in tqdm(train_loader, total=len(train_loader), desc='Training Epoch', unit='batch', leave=False): + # Extract input and labels + # train_batch['data'].shape = [batch_size, img_channels, img_size, img_size] + X_train = train_batch['data'].reshape(train_batch['data'].shape[0], train_batch['data'].shape[1], train_batch['data'].shape[-1], train_batch['data'].shape[-1]).to(device=device) + Y_train = train_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_train.append(torch.reshape(Y_train, (-1,1))) + + with autocast(): + # Forward pass + logits, predictions, _ = model(X_train) + + predictions_epoch_train.append(torch.reshape(predictions, (-1,1))) + + # Regularization + l1 = 0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate sum of total loss for epoch with regularization + batch_loss_train = criterion_train(logits.to(torch.float32), Y_train.long()) # Criterion returns a scalar tensor + batch_loss_train += config['lambda_l1'] * l1 + + # Clear gradients + optimizer.zero_grad(set_to_none=True) + + # Backwards pass + scaler.scale(batch_loss_train).backward() + + # Optimizer step + scaler.step(optimizer) + + # Scaler update + scaler.update() + + # Generate epoch loss + sum_epoch_loss_train += batch_loss_train.item() + + # Update scheduler + scheduler.step() + + # Calculate epoch loss for training + epoch_loss_train = sum_epoch_loss_train / len(train_loader) + losslist_train.append(epoch_loss_train) + + sys.stderr.flush() + print('\nTraining for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Validation + model.eval() + sum_epoch_loss_val = 0 +
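+ # Note: the validation loss below uses plain, unweighted cross-entropy with no L1 term,
+ # while the training loss above uses class weights plus the L1 penalty, i.e. roughly
+ #   train loss per epoch ~ mean over batches of [CE_weighted(logits, y) + config['lambda_l1'] * sum(|w|)]
+ #   val loss per epoch   = mean over batches of [CE(logits, y)]
+ # so the two curves sit on different scales and should not be compared directly.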
with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for val_batch in tqdm(val_loader, total=len(val_loader), desc='Validation Epoch', unit='batch', leave=False): + # Extract input and labels + X_val = val_batch['data'].reshape(val_batch['data'].shape[0], val_batch['data'].shape[1], val_batch['data'].shape[-1], val_batch['data'].shape[-1]).to(device=device) + Y_val = val_batch['label'].to(device=device) + + if epoch == start_epoch: + true_labels_list_val.append(torch.reshape(Y_val, (-1,1))) + + # Forward pass + logits, predictions, _ = model(X_val) + predictions_epoch_val.append(torch.reshape(predictions, (-1,1))) + + # Calculate sum of total loss for epoch + sum_epoch_loss_val += criterion_val(logits.float(), Y_val.long()).item() # Criterion returns a scalar tensor + + # Calculate epoch loss for validation + epoch_loss_val = sum_epoch_loss_val / len(val_loader) + losslist_val.append(epoch_loss_val) + + sys.stderr.flush() + print('\nValidation for Epoch', epoch, 'has been completed!') + sys.stdout.flush() + + # Return the best validation loss and save best checkpoint (epoch) + best_loss_val = save_best_checkpoint(model, optimizer, scheduler, epoch, epoch_loss_val, best_loss_val, pathmaster) + + # Update line + sys.stderr.flush() + print("\n======> Epoch: {}/{}, Training Loss: {:.4f}, Validation Loss: {:.4f}".format(epoch, n_epochs-1, epoch_loss_train, epoch_loss_val)) + print('\n===========================================================================================') + sys.stdout.flush() + + # Add epoch predictions + predictions_epoch_train = np.array(torch.cat(predictions_epoch_train, dim=0).to('cpu')) + predictions_epoch_val = np.array(torch.cat(predictions_epoch_val, dim=0).to('cpu')) + + predictions_list_train.append(predictions_epoch_train) + predictions_list_val.append(predictions_epoch_val) + + # Add epoch time to runtime_list + end_time = time.time() + time_passed = end_time-start_time # in seconds + runtime_list.append(time_passed) + + # Call the early stopping callback + if early_stopping_callback(epoch, epoch_loss_val): + break + + # Convert true label list into array + true_labels_train = np.array(torch.cat(true_labels_list_train, dim=0).to('cpu')) + true_labels_val = np.array(torch.cat(true_labels_list_val, dim=0).to('cpu')) + + if save: + title = 'Training and Validation Loss' + plot_save_func.train_val_loss(losslist_train, losslist_val, title, save, pathmaster) + + title = 'Training and Validation Accuracy' + plot_save_func.accuracy_curves(true_labels_train, true_labels_val, predictions_list_train, predictions_list_val, title, save, pathmaster) + + plot_save_func.save_losslists(losslist_train, losslist_val, pathmaster) + plot_save_func.save_runtime_list(runtime_list, pathmaster) + + + def best_DenseNet_2fold(fold1_loader, fold2_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, _ = load_hyperparameters(pathmaster) + # When testing on the test set, drop_rate should always be 0 + + # Define img_channels + img_channels = 1 + + # Initialize model + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes,
growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If checkpoint is not specified, terminate the function + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.' + + # Load models + model_fold1, model_fold2 = load_model_2fold(model_fold1, model_fold2, pathmaster) + + # Fold 1 ======================================================================================================================================================================= + # Initialize true label lists + true_labels_list_fold1 = [] + + # Intialize output (prediction) lists + predictions_list_fold1 = [] + prediction_proba_list_fold1 = [] + + # Validation + model_fold1.eval() + cum_loss_fold1 = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(fold2_loader, total=len(fold2_loader), desc='Testing Fold #1', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list_fold1.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model_fold1(X) + predictions_list_fold1.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list_fold1.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss_fold1 += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss_fold1 = cum_loss_fold1 / len(fold2_loader) + + # Convert true label list into array + true_labels_fold1 = np.array(torch.cat(true_labels_list_fold1, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions_fold1 = np.array(torch.cat(predictions_list_fold1, dim=0).to('cpu')) + prediction_proba_fold1 = np.array(torch.cat(prediction_proba_list_fold1, dim=0).to('cpu')) + + # Fold 2 ======================================================================================================================================================================= + # Initialize true label lists + true_labels_list_fold2 = [] + + # Intialize output (prediction) lists + predictions_list_fold2 = [] + prediction_proba_list_fold2 = [] + + # Validation + model_fold2.eval() + cum_loss_fold2 = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(fold1_loader, total=len(fold1_loader), desc='Testing Fold #2', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list_fold2.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = 
model_fold2(X) + predictions_list_fold2.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list_fold2.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss_fold2 += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss_fold2 = cum_loss_fold2 / len(fold1_loader) + + # Convert true label list into array + true_labels_fold2 = np.array(torch.cat(true_labels_list_fold2, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions_fold2 = np.array(torch.cat(predictions_list_fold2, dim=0).to('cpu')) + prediction_proba_fold2 = np.array(torch.cat(prediction_proba_list_fold2, dim=0).to('cpu')) + # ============================================================================================================================================================================== + + # Create overall lists + true_labels = np.concatenate((true_labels_fold1, true_labels_fold2), axis=0) + predictions = np.concatenate((predictions_fold1, predictions_fold2), axis=0) + prediction_proba = np.concatenate((prediction_proba_fold1, prediction_proba_fold2), axis=0) + + # Print mean validation loss + mean_loss = (loss_fold1 + loss_fold2) / 2 + print('\n=====> Fold #1 Loss: %.4f' % loss_fold1) + print('=====> Fold #2 Loss: %.4f' % loss_fold2) + print('=====> Mean Loss: %.4f' % mean_loss) + + # Saving + if save: + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Cross-Validation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster) + + plot_save_func.save_labels(true_labels, pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba(prediction_proba, pathmaster) + plot_save_func.metrics_2fold(true_labels_fold1, true_labels_fold2, predictions_fold1, predictions_fold2, prediction_proba_fold1, prediction_proba_fold2, save, pathmaster) + + clf_names = ['Fold #1', 'Fold #2', 'Combined'] + plot_save_func.mean_roc_curves([true_labels_fold1, true_labels_fold2], [prediction_proba_fold1, prediction_proba_fold2], clf_names, save, pathmaster) + + +# Utilizes test() function +def best_DenseNet_2fold_func(fold1_loader, fold2_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, _ = load_hyperparameters(pathmaster) + # When testing on the test set, drop_rate should always be 0 + + # Define img_channels + img_channels = 1 + + # Initialize model + model_fold1 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + model_fold2 = DenseNet(img_channels=img_channels, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If checkpoint is not specified, terminate the function + 
checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.' + + # Load models + model_fold1, model_fold2 = load_model_2fold(model_fold1, model_fold2, pathmaster) + + # Validation (each model is evaluated on its held-out fold, as in best_DenseNet_2fold above) + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + + true_labels_fold1, predictions_fold1, prediction_proba_fold1, loss_fold1 = test(model_fold1, fold2_loader, criterion, n_classes) + true_labels_fold2, predictions_fold2, prediction_proba_fold2, loss_fold2 = test(model_fold2, fold1_loader, criterion, n_classes) + + # Create overall lists + true_labels = np.concatenate((true_labels_fold1, true_labels_fold2), axis=0) + predictions = np.concatenate((predictions_fold1, predictions_fold2), axis=0) + prediction_proba = np.concatenate((prediction_proba_fold1, prediction_proba_fold2), axis=0) + + # Print mean validation loss + mean_loss = (loss_fold1 + loss_fold2) / 2 + print('\n=====> Fold #1 Loss: %.4f' % loss_fold1) + print('=====> Fold #2 Loss: %.4f' % loss_fold2) + print('=====> Mean Loss: %.4f' % mean_loss) + + # Saving + if save: + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Cross-Validation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster) + + plot_save_func.save_labels(true_labels, pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba(prediction_proba, pathmaster) + plot_save_func.metrics_2fold(true_labels_fold1, true_labels_fold2, predictions_fold1, predictions_fold2, prediction_proba_fold1, prediction_proba_fold2, save, pathmaster) + + clf_names = ['Fold #1', 'Fold #2', 'Combined'] + plot_save_func.mean_roc_curves([true_labels_fold1, true_labels_fold2], [prediction_proba_fold1, prediction_proba_fold2], clf_names, save, pathmaster) + + + def best_DenseNet(data_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, class_weights = load_hyperparameters(pathmaster) + # When testing on the test set, drop_rate should always be 0 + + # Initialize model + model = DenseNet(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If checkpoint is not specified, terminate the function + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.'
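+ # The checkpoint is expected at 'checkpoint_' + pathmaster.file_tag + '.pt' inside
+ # pathmaster.checkpoints_path(), i.e. the file written by save_best_checkpoint() during
+ # training; load_model() below restores only the model weights (the optimizer and
+ # scheduler states stored in the checkpoint are not needed at test time).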
+ + # Load model + model = load_model(model, pathmaster) + + # Initialize true label lists + true_labels_list = [] + + # Intialize output (prediction) lists + predictions_list = [] + prediction_proba_list = [] + + # Evaluation + model.eval() + cum_loss = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Testing', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model(X) + predictions_list.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss = cum_loss / len(data_loader) + + # Convert true label list into array + true_labels = np.array(torch.cat(true_labels_list, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions = np.array(torch.cat(predictions_list, dim=0).to('cpu')) + prediction_proba = np.array(torch.cat(prediction_proba_list, dim=0).to('cpu')) + + # Print validation loss + print('\n======> Loss: %.4f' % loss) + + # Saving + if save: + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Evaluation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster) + + plot_save_func.save_labels(true_labels, pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba(prediction_proba, pathmaster) + plot_save_func.metrics(true_labels, predictions, prediction_proba, save, pathmaster) + + plot_save_func.save_classification_report(true_labels, predictions, save, pathmaster) + plot_save_func.save_classification_report_imbalanced(true_labels, predictions, save, pathmaster) + + clf_names = ['Model'] + plot_save_func.mean_roc_curves([true_labels], [prediction_proba], clf_names, save, pathmaster) + plot_save_func.roc_curves(true_labels, prediction_proba, save, pathmaster) + + +def best_DenseNet_config(data_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, _, _, num_dense_tran, _, activation = load_hyperparameters_random_search(pathmaster) + # When testing on the test set, drop_rate, class_weights, learning_rate, and lambda_l1 are not needed + + # Initialize model + model = DenseNet_config(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate, + activation=activation, num_dense_tran=num_dense_tran).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If 
checkpoint is not specified, terminate the function + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.' + + # Load model + model = load_model(model, pathmaster) + + # Initialize true label lists + true_labels_list = [] + + # Intialize output (prediction) lists + predictions_list = [] + prediction_proba_list = [] + + # # Initialize segment names list + # segment_names_list = [] + + # Evaluation + model.eval() + cum_loss = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Testing', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + # Z = data_batch['segment_name'] + # segment_names_list.append(Z) + true_labels_list.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model(X) + predictions_list.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss = cum_loss / len(data_loader) + + # Convert true label list into array + true_labels = np.array(torch.cat(true_labels_list, dim=0).to('cpu')) + + # # Convert segment names list into array + # segment_names = np.concatenate(segment_names_list, axis=0) + # segment_names = segment_names.reshape(-1,1) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions = np.array(torch.cat(predictions_list, dim=0).to('cpu')) + prediction_proba = np.array(torch.cat(prediction_proba_list, dim=0).to('cpu')) + + # Print validation loss + print('\n======> Loss: %.4f' % loss) + + # Saving + if save: + # pathmaster.set_file_tag(pathmaster.file_tag + '_test') + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Evaluation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster) + + plot_save_func.save_labels(true_labels, pathmaster) + # plot_save_func.save_labels(np.hstack([segment_names, true_labels]), pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba(prediction_proba, pathmaster) + plot_save_func.metrics(true_labels, predictions, prediction_proba, save, pathmaster) + + plot_save_func.save_classification_report(true_labels, predictions, save, pathmaster) + plot_save_func.save_classification_report_imbalanced(true_labels, predictions, save, pathmaster) + + clf_names = ['Model'] + plot_save_func.mean_roc_curves([true_labels], [prediction_proba], clf_names, save, pathmaster) + plot_save_func.roc_curves(true_labels, prediction_proba, save, pathmaster) + + +def best_DenseNet_config_binary(data_loader, model_type=torch.float32, n_classes=3, save=False, pathmaster=None): + print('\n===========================================================================================') + + # If GPU is available, use GPU, else use CPU + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Get paths + 
checkpoints_path = pathmaster.checkpoints_path() + + # Load model hyperparameters + depth, growth_rate, compression, bottleneck, drop_rate, _, _, num_dense_tran, _, activation = load_hyperparameters_random_search(pathmaster) + # When testing on the test set, drop_rate, class_weights, learning_rate, and lambda_l1 are not needed + + # Initialize model + model = DenseNet_config(img_channels=1, depth=depth, n_classes=n_classes, growth_rate=growth_rate, + compression=compression, bottleneck=bottleneck, drop_rate=drop_rate, + activation=activation, num_dense_tran=num_dense_tran).to(device=device, dtype=model_type) + + # Create criterion for loss + criterion = nn.CrossEntropyLoss() + + # If checkpoint is not specified, terminate the function + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + assert os.path.exists(checkpoint_path), 'Function terminated. Not a valid checkpoint path.' + + # Load model + model = load_model(model, pathmaster) + + # Initialize true label lists + true_labels_list = [] + + # Intialize output (prediction) lists + predictions_list = [] + prediction_proba_list = [] + + # Evaluation + model.eval() + cum_loss = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Testing', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model(X) + predictions_list.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss = cum_loss / len(data_loader) + + # Convert true label list into array + true_labels = np.array(torch.cat(true_labels_list, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions = np.array(torch.cat(predictions_list, dim=0).to('cpu')) + prediction_proba = np.array(torch.cat(prediction_proba_list, dim=0).to('cpu')) + + # Print validation loss + print('\n======> Loss: %.4f' % loss) + + # Saving + if save: + # pathmaster.set_file_tag(pathmaster.file_tag + '_test') + from sklearn.metrics import confusion_matrix + conf_matrix = confusion_matrix(true_labels, predictions) + title = 'Evaluation Confusion Matrix' + plot_save_func.conf_matrix(conf_matrix, title, save, pathmaster, class_names=['non-AF', 'AF']) + + plot_save_func.save_labels(true_labels, pathmaster) + plot_save_func.save_predictions(predictions, pathmaster) + plot_save_func.save_prediction_proba_binary(prediction_proba, pathmaster) + plot_save_func.metrics_binary(true_labels, predictions, prediction_proba, save, pathmaster) + + plot_save_func.save_classification_report(true_labels, predictions, save, pathmaster) + plot_save_func.save_classification_report_imbalanced(true_labels, predictions, save, pathmaster) + + plot_save_func.roc_curves_binary(true_labels, prediction_proba, save, pathmaster, class_names=['non-AF', 'AF']) + + +def train(model, dataloader, optimizer, scheduler, criterion, regularization): + 
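+ # Runs a single training epoch: cross-entropy loss from `criterion` plus an L1 penalty
+ # (`regularization` * sum of absolute parameter values), one backward pass and optimizer
+ # step per batch, then one scheduler step; returns the updated model, optimizer, scheduler,
+ # and the mean batch loss. Assumes a module-level `device` and a model whose forward pass
+ # returns (logits, predictions, prediction probabilities).
+ # Example call (sketch, mirroring cross_val_2fold_DenseNet_func above):
+ #   model_fold1, optimizer_fold1, scheduler_fold1, loss_train_fold1 = train(
+ #       model_fold1, fold1_loader, optimizer_fold1, scheduler_fold1, criterion_train, lambda_l1)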
model.train() + cum_loss = 0 + for data_batch in tqdm(dataloader, total=len(dataloader), desc='Training', unit='batch', leave=False): + # Extract input and labels + X_train = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_train = data_batch['label'].to(device=device) + + # Forward pass + logits, _, _ = model(X_train) + + # Regularization (if applicable) + l1 = 0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + + # Calculate total loss with regularization + batch_loss_train = criterion(logits.to(torch.float32), Y_train.long()) + regularization * l1 + cum_loss += batch_loss_train.item() + + # Clear gradients + optimizer.zero_grad(set_to_none=True) + + # Backwards pass + batch_loss_train.backward() + + # Optimizer step + optimizer.step() + + # Update scheduler + scheduler.step() + + epoch_loss = cum_loss / len(dataloader) + + return model, optimizer, scheduler, epoch_loss + + +def validate(model, dataloader, criterion): + model.eval() + with torch.no_grad(): + cum_loss = 0 + for data_batch in tqdm(dataloader, total=len(dataloader), desc='Validation', unit='batch', leave=False): + X_val = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y_val = data_batch['label'].to(device=device) + + logits, _, _ = model(X_val) + cum_loss += criterion(logits.float(), Y_val.long()).item() + + epoch_loss = cum_loss / len(dataloader) + + return epoch_loss + + +def test(model, dataloader, criterion, n_classes): + # Initialize true label lists + true_labels_list = [] + + # Intialize output (prediction) lists + predictions_list = [] + prediction_proba_list = [] + + # Validation + model.eval() + cum_loss = 0 + with torch.no_grad(): # Disable gradient computation during validation + sys.stdout.flush() + for data_batch in tqdm(dataloader, total=len(dataloader), desc='Testing', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[1], data_batch['data'].shape[-1], data_batch['data'].shape[-1]).to(device=device) + Y = data_batch['label'].to(device=device) + true_labels_list.append(torch.reshape(Y, (-1,1))) + + # Forward pass + logits, predictions, prediction_proba = model(X) + predictions_list.append(torch.reshape(predictions, (-1,1))) + prediction_proba_list.append(torch.reshape(prediction_proba, (-1,n_classes))) + + # Calculate sum of total loss for epoch + cum_loss += criterion(logits.float(), Y.long()).item() # Criterion returns a scalar tensor + + # Calculate loss for validation + loss = cum_loss / len(dataloader) + + # Convert true label list into array + true_labels = np.array(torch.cat(true_labels_list, dim=0).to('cpu')) + + # Convert the output lists into arrays and concatenate along dim=0 (rows) + predictions = np.array(torch.cat(predictions_list, dim=0).to('cpu')) + prediction_proba = np.array(torch.cat(prediction_proba_list, dim=0).to('cpu')) + + return true_labels, predictions, prediction_proba, loss + + +class IdentityScheduler(torch.optim.lr_scheduler._LRScheduler): + def __init__(self, optimizer, last_epoch=-1): + super(IdentityScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + # Returns the current learning rate without any modifications. 
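+ # IdentityScheduler keeps the learning rate constant; it appears to act as a placeholder so
+ # that every checkpoint carries a scheduler state_dict even when no LR schedule is used.
+ # Recent PyTorch releases also expose this base class as torch.optim.lr_scheduler.LRScheduler;
+ # the underscore-prefixed _LRScheduler name still works but is considered legacy.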
+ return self.base_lrs + + +def save_checkpoint(model, optimizer, scheduler, epoch, loss, checkpoint_path): # Will also be called to save the most recent checkpoint locally in the runtime so I always have the most recent checkpoint + torch.save({ + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'scheduler_state_dict': scheduler.state_dict() if scheduler else IdentityScheduler(optimizer).state_dict(), # Create identity scheduler if missing, actually doesn't work since the parameter is required + 'epoch': epoch, + 'loss': loss + }, checkpoint_path) + +def save_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, loss, checkpoint_path): # Will also be called to save the most recent checkpoint locally in the runtime so I always have the most recent checkpoint + torch.save({ + 'model_fold1_state_dict': model_fold1.state_dict(), + 'model_fold2_state_dict': model_fold2.state_dict(), + 'optimizer_fold1_state_dict': optimizer_fold1.state_dict(), + 'optimizer_fold2_state_dict': optimizer_fold2.state_dict(), + 'scheduler_fold1_state_dict': scheduler_fold1.state_dict(), + 'scheduler_fold2_state_dict': scheduler_fold2.state_dict(), + 'epoch': epoch, + 'loss': loss + }, checkpoint_path) + +def save_best_checkpoint(model, optimizer, scheduler, epoch, current_loss, best_loss, pathmaster): # When training the model, best_loss should be initialized to float.('inf') + # Might be good to have two different checkpoint paths, one for the best and one for the most recent checkpoint, maybe also have temp vs permanent checkpoint paths + if current_loss < best_loss: + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + best_loss = current_loss + save_checkpoint(model, optimizer, scheduler, epoch, best_loss, checkpoint_path) + print('\nNew checkpoint with better loss was saved!') + + return best_loss + else: + return best_loss + + +def save_best_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, current_loss, best_loss, pathmaster): # When training the model, best_loss should be initialized to float.('inf') + # Might be good to have two different checkpoint paths, one for the best and one for the most recent checkpoint, maybe also have temp vs permanent checkpoint paths + if current_loss < best_loss: + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + best_loss = current_loss + save_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, epoch, best_loss, checkpoint_path) + print('\nNew checkpoint with better loss was saved!') + + return best_loss + else: + return best_loss + + +def load_checkpoint(model, optimizer, scheduler, pathmaster): + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location=device) + + model.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + scheduler.load_state_dict(checkpoint['scheduler_state_dict']) + start_epoch = checkpoint['epoch'] + loss = checkpoint['loss'] + + print('\nCheckpoint loaded!') + # print(f'Resuming training from 
epoch {start_epoch}, batch {start_batch}') + + return model, optimizer, scheduler, start_epoch, loss + else: + print('\nError! Checkpoint does not exist!') + + +def load_checkpoint_2fold(model_fold1, model_fold2, optimizer_fold1, optimizer_fold2, scheduler_fold1, scheduler_fold2, pathmaster): + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location=device) + + model_fold1.load_state_dict(checkpoint['model_fold1_state_dict']) + optimizer_fold1.load_state_dict(checkpoint['optimizer_fold1_state_dict']) + scheduler_fold1.load_state_dict(checkpoint['scheduler_fold1_state_dict']) + + model_fold2.load_state_dict(checkpoint['model_fold2_state_dict']) + optimizer_fold2.load_state_dict(checkpoint['optimizer_fold2_state_dict']) + scheduler_fold2.load_state_dict(checkpoint['scheduler_fold2_state_dict']) + + start_epoch = checkpoint['epoch'] + loss = checkpoint['loss'] + + print('\nCheckpoint loaded!') + # print(f'Resuming training from epoch {start_epoch}, batch {start_batch}') + + return model_fold1, optimizer_fold1, scheduler_fold1, model_fold2, optimizer_fold2, scheduler_fold2, start_epoch, loss + else: + print('\nError! Checkpoint does not exist!') + + +def load_model_2fold(model_fold1, model_fold2, pathmaster): + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location=device) + + model_fold1.load_state_dict(checkpoint['model_fold1_state_dict']) + model_fold2.load_state_dict(checkpoint['model_fold2_state_dict']) + + print('\nModels loaded!') + # print(f'Resuming training from epoch {start_epoch}, batch {start_batch}') + + return model_fold1, model_fold2 + else: + print('\nError! Models do not exist!') + + +def load_model(model, pathmaster): + checkpoints_path = pathmaster.checkpoints_path() + checkpoint_path = os.path.join(checkpoints_path, 'checkpoint_' + pathmaster.file_tag + '.pt') + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location=device) + + model.load_state_dict(checkpoint['model_state_dict']) + + print('\nModel loaded!') + # print(f'Resuming training from epoch {start_epoch}, batch {start_batch}') + + return model + else: + print('\nError! 
Model does not exist!') + + +def load_hyperparameters(pathmaster): + hyperparameters_path = pathmaster.hyperparameters_path() + + # Extract model hyperparameters + model_hyperparameters_file = os.path.join(hyperparameters_path, 'hyperparameters_' + pathmaster.file_tag + '.csv') + model_hyperparameters = pd.read_csv(model_hyperparameters_file) + depth = int(model_hyperparameters['depth'].iloc[0]) + growth_rate = int(model_hyperparameters['growth_rate'].iloc[0]) + compression = model_hyperparameters['compression'].iloc[0] + bottleneck = model_hyperparameters['bottleneck'].iloc[0] + drop_rate = model_hyperparameters['drop_rate'].iloc[0] + class_weights = model_hyperparameters['class_weights'] + + return depth, growth_rate, compression, bottleneck, drop_rate, class_weights + + +def load_hyperparameters_random_search(pathmaster): + hyperparameters_path = pathmaster.hyperparameters_path() + + # Extract model hyperparameters + model_hyperparameters_file = os.path.join(hyperparameters_path, 'hyperparameters_' + pathmaster.file_tag + '.csv') + model_hyperparameters = pd.read_csv(model_hyperparameters_file) + depth = int(model_hyperparameters['depth'].iloc[0]) + growth_rate = int(model_hyperparameters['growth_rate'].iloc[0]) + compression = model_hyperparameters['compression'].iloc[0] + bottleneck = model_hyperparameters['bottleneck'].iloc[0] + drop_rate = model_hyperparameters['drop_rate'].iloc[0] + class_weights = model_hyperparameters['class_weights'] + learning_rate = model_hyperparameters['learning_rate'].iloc[0] + num_dense_tran = int(model_hyperparameters['num_dense_tran'].iloc[0]) + lambda_l1 = model_hyperparameters['lambda_l1'].iloc[0] + activation = string_to_activation((model_hyperparameters['activation'].iloc[0])) + + return depth, growth_rate, compression, bottleneck, drop_rate, class_weights, learning_rate, num_dense_tran, lambda_l1, activation + + +def string_to_activation(activation_string): + activation_map = { + 'relu': nn.ReLU(), + 'leaky_relu': nn.LeakyReLU(), + 'sigmoid': nn.Sigmoid(), + 'tanh': nn.Tanh(), + 'softmax': nn.Softmax(), + 'softplus': nn.Softplus(), + 'softshrink': nn.Softshrink(), + 'softmin': nn.Softmin(), + 'log_softmax': nn.LogSoftmax(), + 'elu': nn.ELU(), + 'prelu': nn.PReLU(), + 'relu6': nn.ReLU6(), + 'rrelu': nn.RReLU(), + 'celu': nn.CELU(), + 'selu': nn.SELU(), + 'gelu': nn.GELU(), + 'silu': nn.SiLU(), + # Add more activation functions if needed + } + + return activation_map.get(activation_string, None) + + +def activation_to_string(activation_func): + activation_map = { + nn.ReLU: 'relu', + nn.LeakyReLU: 'leaky_relu', + nn.Sigmoid: 'sigmoid', + nn.Tanh: 'tanh', + nn.Softmax: 'softmax', + nn.Softplus: 'softplus', + nn.Softshrink: 'softshrink', + nn.Softmin: 'softmin', + nn.LogSoftmax: 'log_softmax', + nn.ELU: 'elu', + nn.PReLU: 'prelu', + nn.ReLU6: 'relu6', + nn.RReLU: 'rrelu', + nn.CELU: 'celu', + nn.SELU: 'selu', + nn.GELU: 'gelu', + nn.SiLU: 'silu', + # Add more activation functions if needed + } + + return activation_map.get(activation_func.__class__, 'unknown') + + +class EarlyStoppingCallback: + def __init__(self, patience=10): + self.patience = patience + self.best_loss = float('inf') + self.counter = 0 + self.best_epoch = 0 + + def __call__(self, epoch, current_loss): + if current_loss < self.best_loss: + self.best_loss = current_loss + self.counter = 0 + self.best_epoch = epoch + else: + self.counter += 1 + if self.counter >= self.patience: + print(f"\nEarly stopping at epoch {epoch}. 
No improvement for {self.patience} epochs.") + + return True + + return False \ No newline at end of file diff --git a/utils/pathmaster.py b/utils/pathmaster.py new file mode 100644 index 0000000..38c5718 --- /dev/null +++ b/utils/pathmaster.py @@ -0,0 +1,321 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 4 13:04:27 2024 + +@author: dchen +""" +import os + +class PathMaster(): + def __init__(self, is_linux=False, is_hpc=False, is_tfs=True, is_internal=False, is_external=False, focus='misc', file_tag='temp', img_res='not_an_img_res'): + self.focus = focus + self.file_tag = file_tag + self.is_linux = is_linux + self.is_hpc = is_hpc + self.is_tfs = is_tfs + self.is_internal = is_internal + self.is_external = is_external + self.img_res = img_res + + # Select correct root saves path + if self.is_linux: + if self.is_tfs: + self.saves_path = '/mnt/R/ENGR_Chon/Darren/Honors_Thesis/saves_tfs/' + self.focus + '/' + else: + self.saves_path = '/mnt/R/ENGR_Chon/Darren/Honors_Thesis/saves_poincare/' + self.focus + '/' + elif self.is_hpc: + if self.is_tfs: + self.saves_path = '/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/saves_tfs/' + self.focus + '/' + else: + self.saves_path = '/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/saves_poincare/' + self.focus + '/' + else: # Using your own computer + if self.is_tfs: + self.saves_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\saves_tfs' + '\\' + self.focus + '\\' + else: + self.saves_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\saves_poincare' + '\\' + self.focus + '\\' + + + def set_saves_path(self, saves_path): + self.saves_path = saves_path + + + def set_file_tag(self, file_tag): + self.file_tag = file_tag + + + def set_focus(self, focus): + self.focus = focus + + + def data_paths(self, data_format): + if data_format == 'pt': + # Base path + if self.is_linux: + base_path = "/mnt/R/ENGR_Chon/Darren/NIH_PulseWatch" + labels_base_path = "/mnt/R/ENGR_Chon/Darren/NIH_Pulsewatch" + # labels_base_path = "/mnt/R/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + elif self.is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + else: + if self.is_internal: + base_path = r'C:\\Chon_Lab\\NIH_Pulsewatch' + labels_base_path = r'C:\\Chon_Lab\\NIH_Pulsewatch' + elif self.is_external: + base_path = r'D:\\Chon_Lab\\NIH_Pulsewatch' + labels_base_path = r'D:\\Chon_Lab\\NIH_Pulsewatch' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\\ENGR_Chon\\Darren\\NIH_Pulsewatch" # Why double \\ before NIH_Pulsewatch_Database? + labels_base_path = "R:\\ENGR_Chon\\Darren\\NIH_Pulsewatch" # Why double \\ before NIH_Pulsewatch_Database? 
+ # labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" + + # Type path + if self.is_tfs: + format_path = 'TFS_pt' + else: + format_path = 'Poincare_pt' + + # Join paths + data_path = os.path.join(base_path, format_path, self.img_res) + + else: + if self.is_linux: + base_path = "/mnt/R/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" + labels_base_path = "/mnt/R/ENGR_Chon/Darren/NIH_Pulsewatch" + # labels_base_path = "/mnt/R/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + elif self.is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch" # Why double \\ before NIH_Pulsewatch_Database? + labels_base_path = "R:\ENGR_Chon\Darren\\NIH_Pulsewatch" # Why double \\ before NIH_Pulsewatch_Database? + # labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" + + if data_format == 'csv': + if self.is_tfs: + data_path = os.path.join(base_path, "TFS_csv") + else: + data_path = os.path.join(base_path, "Poincare_Density_csv") + elif data_format == 'png': + if not self.is_tfs: + print('No png image available for Density Poincare plot') + return + data_path = os.path.join(base_path, "TFS_plots") + else: + raise ValueError("Invalid data format. Choose 'csv', 'png, or 'pt'.") + + # Complete labels path + # labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm_2024_02_18_copy") + labels_path = os.path.join(labels_base_path, "Ground_Truths") + + # Check if directories exist + if not os.path.exists(data_path): + print("Data path does not exist") + return + if not os.path.exists(labels_path): + print("Labels path does not exist") + return + + return data_path, labels_path + + + def smote_path(self, smote_type, split): + if self.is_internal: + base_path = r'C:\Chon_Lab\NIH_Pulsewatch' + elif self.is_external: + base_path = r'D:\Chon_Lab\NIH_Pulsewatch' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\ENGR_Chon\Darren\\NIH_Pulsewatch" # Why double \\ before NIH_Pulsewatch_Database? 
+ + # Type path + if self.is_tfs: + format_path = 'TFS_pt' + else: + format_path = 'Poincare_pt' + + smote_path = os.path.join(base_path, format_path, smote_type, split) + + return smote_path + + + def deepbeat_paths(self): + if self.is_internal: + base_path = r'C:\Chon_Lab\Public_Database\DeepBeat\Concatenated_DeepBeat\test\Darren_conversion' + elif self.is_external: + base_path = r'D:\Chon_Lab\Public_Database\DeepBeat\Concatenated_DeepBeat\test\Darren_conversion' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = r'R:\ENGR_Chon\Darren\Public_Database\DeepBeat\Concatenated_DeepBeat\test\Darren_conversion' + + # Type path + if self.is_tfs: + format_path = 'tfs_float16_pt' + else: + format_path = 'poincare_float16_pt' + + data_path = os.path.join(base_path, format_path) + labels_path = os.path.join(base_path, 'DeepBeat_segment_names_labels_STFT.csv') + + return data_path, labels_path + + + def mimic3_paths(self): + if self.is_internal: + base_path = r'C:\Chon_Lab\Public_Database\PPG_PeakDet_MIMICIII\Darren_conversion' + elif self.is_external: + base_path = r'D:\Chon_Lab\Public_Database\PPG_PeakDet_MIMICIII\Darren_conversion' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = r'R:\ENGR_Chon\Darren\Public_Database\PPG_PeakDet_MIMICIII\Darren_conversion' + + # Type path + if self.is_tfs: + format_path = 'test_tfs_float16_pt' + else: + format_path = 'test_poincare_float16_pt' + + data_path = os.path.join(base_path, format_path) + labels_path = os.path.join(base_path, '2020_Han_Sensors_MIMICIII_Ground_Truth_STFT.csv') + + return data_path, labels_path + + + def simband_paths(self): + if self.is_internal: + base_path = r'C:\Chon_Lab\Public_Database\PPG_PeakDet_Simband\Darren_conversion' + elif self.is_external: + base_path = r'D:\Chon_Lab\Public_Database\PPG_PeakDet_Simband\Darren_conversion' + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = r'R:\ENGR_Chon\Darren\Public_Database\PPG_PeakDet_Simband\Darren_conversion' + + # Type path + if self.is_tfs: + format_path = 'tfs_float16_pt' + else: + format_path = 'poincare_float16_pt' + + data_path = os.path.join(base_path, format_path) + labels_path = os.path.join(base_path, 'simband_segments_labels_STFT.csv') + + return data_path, labels_path + + + def summary_path(self): + if self.is_linux: + summary_path = "/mnt/R/ENGR_Chon/Darren/NIH_Pulsewatch/labels_summary_2_18_Darren.csv" + elif self.is_hpc: + summary_path = "/gpfs/scratchfs1/hfp14002/dac20022/NIH_Pulsewatch/labels_summary_2_18_Darren.csv" + else: + if self.is_internal: + summary_path = r'C:\Chon_Lab\NIH_Pulsewatch\labels_summary_2_18_Darren.csv' + elif self.is_external: + summary_path = r'D:\Chon_Lab\NIH_Pulsewatch\labels_summary_2_18_Darren.csv' + else: + summary_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch\labels_summary_2_18_Darren.csv" + + return summary_path + + + def models_path(self): + if self.is_linux: + models_path = "/mnt/R/ENGR_Chon/Darren/Honors_Thesis/models" + elif self.is_hpc: + models_path = "/gpfs/scratchfs1/hfp14002/dac20022/Honors_Thesis/models" + else: + models_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\Honors_Thesis\models" + + return models_path + + + def losslists_path(self): + losslists_path = self.saves_path + 'losslists' + + return losslists_path + + + def runtime_lists_path(self): + runtime_lists_path = self.saves_path + 'runtime_lists' + + return runtime_lists_path + + + def labels_path(self): + labels_path = self.saves_path + 
'labels' + + return labels_path + + + def predictions_path(self): + predictions_path = self.saves_path + 'predictions' + + return predictions_path + + + def prediction_proba_path(self): + prediction_proba_path = self.saves_path + 'prediction_proba' + + return prediction_proba_path + + + def metrics_path(self): + metrics_path = self.saves_path + 'metrics' + + return metrics_path + + + def classification_report_path(self): + classification_report_path = self.saves_path + 'classification_reports' + + return classification_report_path + + + def classification_report_imbalanced_path(self): + classification_report_imbalanced_path = self.saves_path + 'classification_reports_imbalanced' + + return classification_report_imbalanced_path + + + def confusion_matrices_path(self): + confusion_matrices_path = self.saves_path + 'confusion_matrices' + + return confusion_matrices_path + + + def checkpoints_path(self): + checkpoints_path = self.saves_path + 'checkpoints' + + return checkpoints_path + + + def hyperparameters_path(self): + hyperparameters_path = self.saves_path + 'hyperparameters' + + return hyperparameters_path + + + def loss_curves_path(self): + loss_curves_path = self.saves_path + 'loss_curves' + + return loss_curves_path + + + def roc_curves_path(self): + roc_curves_path = self.saves_path + 'roc_curves' + + return roc_curves_path + + + def mean_roc_curves_path(self): + mean_roc_curves_path = self.saves_path + 'mean_roc_curves' + + return mean_roc_curves_path + + + def accuracy_curves_path(self): + accuracy_curves_path = self.saves_path + 'accuracy_curves' + + return accuracy_curves_path \ No newline at end of file diff --git a/utils/plot_save_func.py b/utils/plot_save_func.py new file mode 100644 index 0000000..abe201e --- /dev/null +++ b/utils/plot_save_func.py @@ -0,0 +1,542 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Feb 29 12:06:14 2024 + +@author: dchen +""" +import matplotlib.pyplot as plt +import numpy as np +import os +import pandas as pd +from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve, auc, classification_report +from sklearn.preprocessing import label_binarize +from imblearn.metrics import classification_report_imbalanced + +# For increased csv speed +import pyarrow as pa +from pyarrow import csv + +def save_hyperparameters(hyperparameters, pathmaster): + hyperparameters_path = pathmaster.hyperparameters_path() + hyperparameters_path = os.path.join(hyperparameters_path, 'hyperparameters_' + pathmaster.file_tag + '.csv') + + # If there are class weights, make sure all other columns have same length + if hyperparameters['class_weights'] is not None: + # Update the dictionary + for key, value in hyperparameters.items(): + # If the length of the value is less than max_length + if key != 'class_weights': + # Fill missing values with np.nan + hyperparameters[key] = [value] + [np.nan] * (len(hyperparameters['class_weights']) - 1) + + hyperparameters = pd.DataFrame(hyperparameters) + hyperparameters.to_csv(hyperparameters_path, index=False) + + # # Using PyArrow (need each hyperparameter to be a list) + # hyperparameters_table = pa.Table.from_pydict(hyperparameters) + # csv.write_csv(hyperparameters_table, hyperparameters_path) + + +def save_losslists(losslist_train, losslist_val, pathmaster): # For holdout training and validation + losslists_path = pathmaster.losslists_path() + losslists_path = os.path.join(losslists_path, 'losslists_' + pathmaster.file_tag + '.csv') + # losslists = pd.DataFrame(dtype='float32') + # 
losslists['training'] = losslist_train + # losslists['validation'] = losslist_val + # losslists.to_csv(losslists_path, index=False, chunksize=500) + + # Using PyArrow + # losslists = { + # 'training': losslist_train, + # 'validation': losslist_val + # } + # losslists_table = pa.Table.from_pydict(losslists) + losslists = [np.array(losslist_train).reshape(-1).astype(np.float32), np.array(losslist_val).reshape(-1).astype(np.float32)] + losslists_names = ['training', 'validation'] + losslists_table = pa.Table.from_arrays(losslists, losslists_names) + csv.write_csv(losslists_table, losslists_path) + +def save_losslists_2fold(losslist_train_fold1, losslist_val_fold1, losslist_train_fold2, losslist_val_fold2, losslist_train, losslist_val, pathmaster): # For holdout training and validation + losslists_path = pathmaster.losslists_path() + losslists_path = os.path.join(losslists_path, 'losslists_' + pathmaster.file_tag + '.csv') + # losslists = pd.DataFrame(dtype='float32') + # losslists['training'] = losslist_train + # losslists['validation'] = losslist_val + # losslists.to_csv(losslists_path, index=False, chunksize=500) + + # Using PyArrow + # losslists = { + # 'training': losslist_train, + # 'validation': losslist_val + # } + # losslists_table = pa.Table.from_pydict(losslists) + losslists = [np.array(losslist_train_fold1).reshape(-1).astype(np.float32), np.array(losslist_val_fold1).reshape(-1).astype(np.float32), + np.array(losslist_train_fold2).reshape(-1).astype(np.float32), np.array(losslist_val_fold2).reshape(-1).astype(np.float32), + np.array(losslist_train).reshape(-1).astype(np.float32), np.array(losslist_val).reshape(-1).astype(np.float32)] + losslists_names = ['fold1_training', 'fold1_validation', 'fold2_training', 'fold2_validation', 'mean_training', 'mean_validation'] + losslists_table = pa.Table.from_arrays(losslists, losslists_names) + csv.write_csv(losslists_table, losslists_path) + + +def save_runtime_list(epoch_time_list, pathmaster): + # epoch_time_array = np.array(epoch_time_list).reshape(-1).astype(np.float32) + runtime_lists_path = pathmaster.runtime_lists_path() + runtime_lists_path = os.path.join(runtime_lists_path, 'runtime_lists_' + pathmaster.file_tag + '.csv') + # runtime_list = pd.DataFrame(dtype='float32') + # runtime_list['time_sec'] = epoch_time_list + # runtime_list.to_csv(runtime_lists_path, index=False, chunksize=500) + + # Using PyArrow + runtime_dict = {'epoch_time_sec': epoch_time_list, + 'mean_time_sec': [sum(epoch_time_list)/len(epoch_time_list)] + [np.nan] * (len(epoch_time_list) - 1)} + runtime_table = pa.Table.from_pydict(runtime_dict) + # runtime_table = pa.Table.from_arrays([epoch_time_array, np.array([np.mean(epoch_time_array)])], names=['epoch_time_sec', 'mean_time_sec']) + csv.write_csv(runtime_table, runtime_lists_path) + + +def save_labels(labels, pathmaster): + labels = labels.astype(np.int8) + labels_path = pathmaster.labels_path() + labels_path = os.path.join(labels_path, 'labels_' + pathmaster.file_tag + '.csv') + # labels = pd.DataFrame(np.array(labels), dtype='int') + # labels.to_csv(labels_path, index=False, chunksize=500) + + # Using PyArrow + # labels_dict = {'labels': labels.reshape(-1)} # Convert to 1D array + # labels_table = pa.Table.from_pydict(labels_dict) + labels_table = pa.Table.from_arrays([labels.reshape(-1)], names=['labels']) + csv.write_csv(labels_table, labels_path) + + +def save_predictions(predictions, pathmaster): + predictions = predictions.astype(np.int8) + predictions_path = pathmaster.predictions_path() + 
predictions_path = os.path.join(predictions_path, 'predictions_' + pathmaster.file_tag + '.csv') + # predictions = pd.DataFrame(np.array(predictions), dtype='int') + # predictions.to_csv(predictions_path, index=False, chunksize=500) + + # Using PyArrow + # predictions_dict = {'predictions': predictions.reshape(-1)} # Convert to 1D array + # predictions_table = pa.Table.from_pydict(predictions_dict) + predictions_table = pa.Table.from_arrays([predictions.reshape(-1)], names=['predictions']) + csv.write_csv(predictions_table, predictions_path) + + +def save_prediction_proba(prediction_proba, pathmaster): + prediction_proba = prediction_proba.astype(np.float32) + prediction_proba_path = pathmaster.prediction_proba_path() + prediction_proba_path = os.path.join(prediction_proba_path, 'prediction_proba_' + pathmaster.file_tag + '.csv') + # prediction_proba = pd.DataFrame(np.array(prediction_proba), dtype='float32') + # prediction_proba.to_csv(prediction_proba_path, index=False, chunksize=500) + + # Using PyArrow + # # Create PyArrow arrays with specific data type (float64) + # prediction_proba_dict = { + # '0': prediction_proba[:,0], + # '1': prediction_proba[:,1], + # '2': prediction_proba[:,2] + # } + + # Create a PyArrow table + # prediction_proba_Table = pa.Table.from_pydict(prediction_proba_dict) + # col_arrays = [prediction_proba[:,0], prediction_proba[:,1]] + # prediction_proba_Table = pa.Table.from_arrays(col_arrays, names=['0', '1']) + # csv.write_csv(prediction_proba_Table, prediction_proba_path) + col_arrays = [prediction_proba[:,0], prediction_proba[:,1], prediction_proba[:,2]] + prediction_proba_Table = pa.Table.from_arrays(col_arrays, names=['0', '1', '2']) + csv.write_csv(prediction_proba_Table, prediction_proba_path) + + +def save_prediction_proba_binary(prediction_proba, pathmaster): + prediction_proba = prediction_proba.astype(np.float32) + prediction_proba_path = pathmaster.prediction_proba_path() + prediction_proba_path = os.path.join(prediction_proba_path, 'prediction_proba_' + pathmaster.file_tag + '.csv') + # prediction_proba = pd.DataFrame(np.array(prediction_proba), dtype='float32') + # prediction_proba.to_csv(prediction_proba_path, index=False, chunksize=500) + + # Using PyArrow + # # Create PyArrow arrays with specific data type (float64) + # prediction_proba_dict = { + # '0': prediction_proba[:,0], + # '1': prediction_proba[:,1], + # '2': prediction_proba[:,2] + # } + + # Create a PyArrow table + # prediction_proba_Table = pa.Table.from_pydict(prediction_proba_dict) + # col_arrays = [prediction_proba[:,0], prediction_proba[:,1]] + # prediction_proba_Table = pa.Table.from_arrays(col_arrays, names=['0', '1']) + # csv.write_csv(prediction_proba_Table, prediction_proba_path) + col_arrays = [prediction_proba[:,0], prediction_proba[:,1]] + prediction_proba_Table = pa.Table.from_arrays(col_arrays, names=['0', '1']) + csv.write_csv(prediction_proba_Table, prediction_proba_path) + + +def metrics(Y_true, Y_pred, Y_proba, save=False, pathmaster=None): + averages = ['micro', 'macro', 'weighted'] + accuracy_list = [] + precision_list = [] + recall_list = [] + f1_list = [] + auc_list = [] + + for average in averages: + accuracy = accuracy_score(Y_true, Y_pred) + precision, recall, f1, _ = precision_recall_fscore_support(Y_true, Y_pred, average=average) + auc = roc_auc_score(Y_true, Y_proba, average=average, multi_class='ovr') + + accuracy_list.append(accuracy) + precision_list.append(precision) + recall_list.append(recall) + f1_list.append(f1) + auc_list.append(auc) + + metrics = { 
+ 'accuracy': accuracy_list, + 'precision': precision_list, + 'recall': recall_list, + 'f1': f1_list, + 'auc': auc_list + } + + if save: + metrics_path = pathmaster.metrics_path() + metrics_path = os.path.join(metrics_path, 'metrics_' + pathmaster.file_tag + '.csv') + # metrics = pd.DataFrame(metrics, index=[0], dtype='float32') + # metrics.to_csv(metrics_path, index=False) + + # Using PyArrow + metrics_table = pa.Table.from_pydict(metrics) + csv.write_csv(metrics_table, metrics_path) + + +def metrics_binary(Y_true, Y_pred, Y_proba, save=False, pathmaster=None): + averages = ['micro', 'macro', 'weighted'] + accuracy_list = [] + precision_list = [] + recall_list = [] + f1_list = [] + auc_list = [] + + for average in averages: + accuracy = accuracy_score(Y_true, Y_pred) + precision, recall, f1, _ = precision_recall_fscore_support(Y_true, Y_pred, average=average) + auc = roc_auc_score(Y_true, Y_proba[:,1], average=average) + + accuracy_list.append(accuracy) + precision_list.append(precision) + recall_list.append(recall) + f1_list.append(f1) + auc_list.append(auc) + + metrics = { + 'accuracy': accuracy_list, + 'precision': precision_list, + 'recall': recall_list, + 'f1': f1_list, + 'auc': auc_list + } + + if save: + metrics_path = pathmaster.metrics_path() + metrics_path = os.path.join(metrics_path, 'metrics_' + pathmaster.file_tag + '.csv') + # metrics = pd.DataFrame(metrics, index=[0], dtype='float32') + # metrics.to_csv(metrics_path, index=False) + + # Using PyArrow + metrics_table = pa.Table.from_pydict(metrics) + csv.write_csv(metrics_table, metrics_path) + + +def metrics_2fold(Y_true_fold1, Y_true_fold2, Y_pred_fold1, Y_pred_fold2, Y_proba_fold1, Y_proba_fold2, save=False, pathmaster=None): + accuracy_fold1 = accuracy_score(Y_true_fold1, Y_pred_fold1) + precision_fold1, recall_fold1, f1_fold1, _ = precision_recall_fscore_support(Y_true_fold1, Y_pred_fold1, average='weighted') + auc_fold1 = roc_auc_score(Y_true_fold1, Y_proba_fold1, average='weighted', multi_class='ovr') + + accuracy_fold2 = accuracy_score(Y_true_fold2, Y_pred_fold2) + precision_fold2, recall_fold2, f1_fold2, _ = precision_recall_fscore_support(Y_true_fold2, Y_pred_fold2, average='weighted') + auc_fold2 = roc_auc_score(Y_true_fold2, Y_proba_fold2, average='weighted', multi_class='ovr') + + accuracy = accuracy_score(np.concatenate((Y_true_fold1,Y_true_fold2), axis=0), np.concatenate((Y_pred_fold1,Y_pred_fold2), axis=0)) + precision, recall, f1, _ = precision_recall_fscore_support(np.concatenate((Y_true_fold1,Y_true_fold2), axis=0), np.concatenate((Y_pred_fold1,Y_pred_fold2), axis=0), average='weighted') + auc = roc_auc_score(np.concatenate((Y_true_fold1,Y_true_fold2), axis=0), np.concatenate((Y_proba_fold1,Y_proba_fold2), axis=0), average='weighted', multi_class='ovr') + + metrics = { + 'accuracy': [accuracy_fold1, accuracy_fold2, accuracy], + 'precision': [precision_fold1, precision_fold2, precision], + 'recall': [recall_fold1, recall_fold2, recall], + 'f1': [f1_fold1, f1_fold2, f1], + 'auc': [auc_fold1, auc_fold2, auc] + } + + if save: + metrics_path = pathmaster.metrics_path() + metrics_path = os.path.join(metrics_path, 'metrics_' + pathmaster.file_tag + '.csv') + # metrics = pd.DataFrame(metrics, index=[0], dtype='float32') + # metrics.to_csv(metrics_path, index=False) + + # Using PyArrow + metrics_table = pa.Table.from_pydict(metrics) + csv.write_csv(metrics_table, metrics_path) + + +def save_classification_report(Y_true, Y_pred, save=False, pathmaster=None): + report = classification_report(Y_true, Y_pred, 
output_dict=True) + row_labels = ['precision', 'recall', 'f1', 'support'] + + if save: + classification_report_path = pathmaster.classification_report_path() + classification_report_path = os.path.join(classification_report_path, 'classification_report_' + pathmaster.file_tag + '.csv') + report = pd.DataFrame(report) + # report.reset_index(inplace=True) + report.insert(loc=0, column='metrics', value=row_labels) + report.to_csv(classification_report_path, index=False) + + # # Using PyArrow + # report_table = pa.Table.from_pydict(report) + # csv.write_csv(report_table, classification_report_path) + + +def save_classification_report_imbalanced(Y_true, Y_pred, save=False, pathmaster=None): + report_imbalanced = classification_report_imbalanced(Y_true, Y_pred, output_dict=True) + row_labels = ['precision', 'recall', 'specificity', 'f1', 'geo mean', 'iba', 'support'] + + if save: + classification_report_imbalanced_path = pathmaster.classification_report_imbalanced_path() + classification_report_imbalanced_path = os.path.join(classification_report_imbalanced_path, 'classification_report_imbalanced_' + pathmaster.file_tag + '.csv') + report_imbalanced = pd.DataFrame(report_imbalanced) + # report_imbalanced.reset_index(inplace=True) + report_imbalanced.insert(loc=0, column='metrics', value=row_labels) + report_imbalanced.to_csv(classification_report_imbalanced_path, index=False) + + # # Using PyArrow + # report_imbalanced_table = pa.Table.from_pydict(report_imbalanced) + # csv.write_csv(report_imbalanced_table, classification_report_imbalanced_path) + + +def roc_curves(y_test, y_prob, save=False, pathmaster=None, class_names=['NSR', 'AF', 'PAC/PVC']): + # Get the unique class labels + classes = np.unique(y_test) + + if class_names is None: + class_names = np.unique(y_test) + + # Convert labels to binary matrix + y_bin = label_binarize(y_test, classes=classes) + + # Pre-allocate arrays for ROC curves + fpr_mean = np.linspace(0, 1, 100) + tpr_mean = [] + fpr = [] + tpr = [] + AUC = [] + + # Calculate ROC curves for each class + for i, class_label in enumerate(classes): + fpr_i, tpr_i, _ = roc_curve(y_bin[:, i], y_prob[:, i]) + AUC.append(roc_auc_score(y_bin[:, i], y_prob[:, i])) + fpr.append(fpr_i) + tpr.append(tpr_i) + + # Interpolate TPR for mean ROC curve + tpr_mean.append(np.interp(fpr_mean, fpr_i, tpr_i)) + + # Calculate mean and standard deviation for TPR and AUC + tpr_mean = np.mean(np.array(tpr_mean).reshape(len(classes), -1), axis=0) + tpr_stdv = np.std(tpr_mean, axis=0) + mean_auc = auc(fpr_mean, tpr_mean) + std_auc = np.std(AUC) + + # Create the plot + plt.figure(figsize=(12, 9)) + plt.clf() + plt.plot([0, 1], [0, 1], 'k--') + plt.axis([0, 1, 0, 1]) + plt.xlabel('False Positive Rate', fontsize=16) + plt.ylabel('True Positive Rate', fontsize=16) + plt.title('ROC Curves (' + pathmaster.file_tag + ')', fontweight='bold') + + # Plot individual ROC curves + for i in range(len(classes)): + label_str = f"ROC Label {class_names[i]} (AUC = {AUC[i]:.3f})" + plt.plot(fpr[i], tpr[i], linewidth=3, label=label_str) + + # Plot mean ROC curve with standard deviation + plt.plot(fpr_mean, tpr_mean, color='k', label=rf"Mean ROC (AUC = {mean_auc:.3f} $\pm$ {std_auc:.3f})", linewidth=5) + plt.fill_between(fpr_mean, np.maximum(tpr_mean - tpr_stdv, 0), np.minimum(tpr_mean + tpr_stdv, 1), color='grey', alpha=0.2, label=r"$\pm$ 1 std. 
dev.") + + plt.legend(loc="lower right") + + if save: + roc_curves_path = pathmaster.roc_curves_path() + roc_curves_path = os.path.join(roc_curves_path, 'roc_curves_' + pathmaster.file_tag + '.jpg') + plt.savefig(roc_curves_path, dpi=150) + + +def roc_curves_binary(y_test, y_prob, save=False, pathmaster=None, class_names=['Negative', 'Positive']): + y_prob = y_prob[:,1] + # Convert labels to binary matrix + y_bin = label_binarize(y_test, classes=np.unique(y_test)) + + # Pre-allocate arrays for ROC curves + fpr_mean = np.linspace(0, 1, 100) + tpr_mean = [] + fpr = [] + tpr = [] + AUC = [] + + # Calculate ROC curve for the positive class + fpr, tpr, _ = roc_curve(y_bin, y_prob) + AUC = roc_auc_score(y_bin, y_prob) + + # Create the plot + plt.figure(figsize=(12, 9)) + plt.plot([0, 1], [0, 1], 'k--') + plt.plot(fpr, tpr, linewidth=3, label=f'ROC Curve (AUC = {AUC:.3f})') + plt.axis([0, 1, 0, 1]) + plt.xlabel('False Positive Rate', fontsize=16) + plt.ylabel('True Positive Rate', fontsize=16) + plt.title('ROC Curve', fontweight='bold') + plt.legend(loc="lower right") + + if save: + roc_curves_path = pathmaster.roc_curves_path() + roc_curves_path = os.path.join(roc_curves_path, 'roc_curves_' + pathmaster.file_tag + '.jpg') + plt.savefig(roc_curves_path, dpi=150) + + +def mean_roc_curves(Y_tests, Y_probas, clf_names, save=False, pathmaster=None): + # Pre-allocate arrays for ROC curves + fpr_mean = np.linspace(0, 1, 100) + # tpr_mean = np.zeros_like(fpr_mean) + + # Set figure size + plt.figure(figsize=(12,9)) + + # Plot individual mean ROC curves for each classifier + for y_test, y_prob, clf_name in zip(Y_tests, Y_probas, clf_names): + # Get the unique class labels + classes = np.unique(y_test) + + # Convert labels to binary matrix + y_bin = label_binarize(y_test, classes=classes) + + # Pre-allocate arrays for ROC curves + fpr = [] + tpr = [] + AUC = [] + + # Calculate ROC curves for each class + for i, class_label in enumerate(classes): + fpr_i, tpr_i, _ = roc_curve(y_bin[:, i], y_prob[:, i]) + AUC.append(roc_auc_score(y_bin[:, i], y_prob[:, i])) + fpr.append(fpr_i) + tpr.append(tpr_i) + + # Interpolate TPR for mean ROC curve + tpr_interp = [np.interp(fpr_mean, fpr_i, tpr_i) for fpr_i, tpr_i in zip(fpr, tpr)] + tpr_mean = np.mean(tpr_interp, axis=0) + + # Plot mean ROC curve + plt.plot(fpr_mean, tpr_mean, label=f"{clf_name} - Mean ROC (AUC = {auc(fpr_mean, tpr_mean):.3f} $\pm$ {np.std(AUC):.3f})", linewidth=2) + + # Additional plot configurations + plt.plot([0, 1], [0, 1], 'k--') + plt.axis([0, 1, 0, 1]) + plt.xlabel('False Positive Rate', fontsize=12) + plt.ylabel('True Positive Rate', fontsize=12) + plt.title('Mean ROC Curve(s)', fontweight='bold') + plt.legend(loc="lower right") + # plt.show() + + if save: + mean_roc_curves_path = pathmaster.mean_roc_curves_path() + mean_roc_curves_path = os.path.join(mean_roc_curves_path, 'mean_roc_curves_' + pathmaster.file_tag + '.jpg') + plt.savefig(mean_roc_curves_path, dpi=150) + + +def conf_matrix(conf_matrix, title='Confusion Matrix', save=False, pathmaster=None, class_names=['NSR', 'AF', 'PAC/PVC']): + title = title + ' (' + pathmaster.file_tag + ')' + conf_matrix_norm = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis] # Normalize + + plt.figure(figsize=(10, 8)) # Adjust the figure size as per your preference + plt.imshow(conf_matrix_norm, interpolation='nearest', cmap=plt.cm.Blues, vmin=0.0, vmax=1.0) + plt.title(title, fontweight='bold') + plt.colorbar() + tick_marks = np.arange(len(conf_matrix)) + + if class_names is not None: + 
tick_marks = np.arange(len(class_names)) + plt.xticks(tick_marks, class_names) + plt.yticks(tick_marks, class_names) + else: + tick_marks = np.arange(len(conf_matrix)) + plt.xticks(tick_marks, tick_marks) + plt.yticks(tick_marks, tick_marks) + + plt.xlabel('Predicted label') + plt.ylabel('True label') + + # Add counts and percentages in each box + for i in range(conf_matrix.shape[0]): + for j in range(conf_matrix.shape[1]): + percentage = conf_matrix_norm[i, j] * 100 + count = int(conf_matrix[i, j]) + # text_color = 'black' if conf_matrix[i, j] < np.max(conf_matrix) / 1.5 else 'white' + text_color = 'black' if percentage < 80 else 'white' + plt.text(j, i, "{:.2f}%\n{}".format(percentage, count), + horizontalalignment="center", + verticalalignment="center", + color=text_color) + + if save: + confusion_matrices_path = pathmaster.confusion_matrices_path() + confusion_matrices_path = os.path.join(confusion_matrices_path, 'confusion_matrix_' + pathmaster.file_tag + '.jpg') + plt.savefig(confusion_matrices_path, dpi=200) + + # plt.show() + + +def train_val_loss(losslist_train, losslist_val, title='Training and Validation Loss', save=False, pathmaster=None): + title = title + ' (' + pathmaster.file_tag + ')' + plt.figure(figsize=(12, 8)) + plt.plot(range(len(losslist_train)), losslist_train, label='training') + plt.plot(range(len(losslist_val)), losslist_val, label='validation') + plt.legend() + plt.title(title, fontweight='bold') + plt.xlabel('Epochs') + plt.ylabel('Loss') + + if save: + loss_curves_path = pathmaster.loss_curves_path() + loss_curves_path = os.path.join(loss_curves_path, 'loss_curve_' + pathmaster.file_tag + '.jpg') + plt.savefig(loss_curves_path, dpi=150) + + # plt.show() + +def accuracy_curves(Y_true_train, Y_true_val, Y_pred_train, Y_pred_val, title='Training and Validation Accuracy', save=False, pathmaster=None): + accuracy_list_train = [] + accuracy_list_val = [] + epochs_train = range(len(Y_pred_train)) + epochs_val = range(len(Y_pred_val)) + + for predictions in Y_pred_train: + accuracy = accuracy_score(Y_true_train, predictions) + accuracy_list_train.append(accuracy) + for predictions in Y_pred_val: + accuracy = accuracy_score(Y_true_val, predictions) + accuracy_list_val.append(accuracy) + + title = title + ' (' + pathmaster.file_tag + ')' + plt.figure(figsize=(12, 8)) + plt.plot(epochs_train, accuracy_list_train, label='training') + plt.plot(epochs_val, accuracy_list_val, label='validation') + plt.legend() + plt.title(title, fontweight='bold') + plt.xlabel('Epochs') + plt.ylabel('Accuracy') + + if save: + accuracy_curves_path = pathmaster.accuracy_curves_path() + accuracy_curves_path = os.path.join(accuracy_curves_path, 'accuracy_curve_' + pathmaster.file_tag + '.jpg') + plt.savefig(accuracy_curves_path, dpi=150) \ No newline at end of file diff --git a/utils/smote.py b/utils/smote.py new file mode 100644 index 0000000..890d2fb --- /dev/null +++ b/utils/smote.py @@ -0,0 +1,158 @@ +import torch +import torch.nn as nn +import torchvision.transforms as transforms +import os +import csv +from imblearn.over_sampling import SMOTE +import numpy as np +from tqdm import tqdm +import pandas as pd +from concurrent.futures import ProcessPoolExecutor + +import sys +sys.path.append('R:\ENGR_Chon\Darren\Honors_Thesis') + +# Import my own functions and classes +from utils.pathmaster import PathMaster +from utils import dataloader + +def apply_cassey_smote(data, labels): + cassey_smote = SMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5) + data_resampled, 
labels_resampled = cassey_smote.fit_resample(data, labels) + return data_resampled, labels_resampled + +def save_image(i, image, group, save_dir): + # Generate a unique file name with zero-padding + file_name = f'{i+1:06d}' + '_' + group + '_tfs' + + # Convert the image to a PyTorch tensor + tensor_image = torch.tensor(image).to(dtype=torch.float16) + + # Save the tensor to a .pt file + torch.save(tensor_image, os.path.join(save_dir, file_name + '.pt')) + + return file_name + +def save_images_parallel(data_resampled, group, save_dir): + file_names = [] + with ProcessPoolExecutor() as executor: + results = [executor.submit(save_image, i, image, group, save_dir) for i, image in enumerate(data_resampled)] + for future in results: + file_names.append(future.result()) + return file_names + +def main(): + # Initialize save location specifics + smote_type = 'Cassey_SMOTE' + split = '2foldCV_60_40' + groups = ['fold1', 'fold2', 'test'] + + # Device and drives + is_linux = False + is_hpc = False + is_internal = True + is_external = False + + # Input + is_tfs = True + + # Intialize the focus + focus = 'misc' + + # Initialize the file tag + file_tag = 'temp' + + # Image resolution + img_res = '128x128_float16' + + # Data type: the type to convert the data into when it is loaded in + data_type = torch.float32 + + # Create a PathMaster object + pathmaster = PathMaster(is_linux, is_hpc, is_tfs, is_internal, is_external, focus, file_tag, img_res) + + # Image dimensions + img_channels = 1 + img_size = 128 + downsample = None + standardize = None + + # Split UIDs + # train_set, val_set, test_set = dataloader.split_uids(pathmaster) + cross_val_fold1, cross_val_fold2, test_set = dataloader.split_uids_2fold_60_40_smote(pathmaster) + # train_set, val_set, test_set = dataloader.split_uids_60_10_30(pathmaster) + + # Preprocess data + data_format = 'pt' + batch_size = 256 + + # train_loader, val_loader, _ = dataloader.preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, + # batch_size, standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + fold1_loader, fold2_loader, test_loader = dataloader.preprocess_data(data_format, cross_val_fold1, cross_val_fold2, test_set, batch_size, + standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + # train_loader, val_loader, test_loader = dataloader.preprocess_data(data_format, train_set, val_set, test_set, + # batch_size, standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + data_loaders = [fold1_loader, fold2_loader, test_loader] + print() + sys.stdout.flush() + for data_loader, group in tqdm(zip(data_loaders,groups), total=len(data_loaders), desc='SMOTE', unit='Data Loader', leave=False): + sys.stderr.flush() + + # Define your original data and labels + data = np.empty((0,img_size*img_size)) + labels = np.empty((0,1)) + + sys.stdout.flush() + + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Loading', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[-1] * data_batch['data'].shape[-1]).numpy() + Y = data_batch['label'].numpy().reshape(-1,1) + + data = np.concatenate((data, X), axis=0) + labels = np.concatenate((labels, Y), axis=0) + + sys.stderr.flush() + print('\nData shape:', data.shape) + print('Labels shape:', labels.shape) + sys.stdout.flush() + + if group != 'test': + # SMOTE + data_resampled, labels_resampled = 
apply_cassey_smote(data, labels)
+            data_resampled = data_resampled.reshape(len(data_resampled), img_channels, img_size, img_size)
+            sys.stderr.flush()
+            print('\nResampled Data shape:', data_resampled.shape)
+            print('Resampled Labels shape:', labels_resampled.shape)
+            print()
+            sys.stdout.flush()
+        else:
+            data_resampled = data
+            data_resampled = data_resampled.reshape(len(data_resampled), img_channels, img_size, img_size)
+            labels_resampled = labels
+            sys.stderr.flush()
+            print('\nResampled Data shape:', data_resampled.shape)
+            print('Resampled Labels shape:', labels_resampled.shape)
+            print()
+            sys.stdout.flush()
+
+        # Define a directory to save the images
+        # save_dir = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch', smote_type, split, group)
+        save_dir = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch', smote_type, split, group)
+        os.makedirs(save_dir, exist_ok=True)
+
+        file_names = save_images_parallel(data_resampled, group, save_dir)
+
+        # Ground truths (flatten labels to 1-D so pandas accepts the column, matching the accelerated scripts)
+        data_labels = pd.DataFrame({
+            'segment_name': file_names,
+            'label': labels_resampled.reshape(-1)
+        })
+
+        csv_file_name = os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch', smote_type, split, smote_type + '_' + group + '_names_labels.csv')
+        data_labels.to_csv(csv_file_name, index=False)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/utils/smote_accelerated.py b/utils/smote_accelerated.py
new file mode 100644
index 0000000..555f8cc
--- /dev/null
+++ b/utils/smote_accelerated.py
@@ -0,0 +1,178 @@
+import torch
+import torch.nn as nn
+import torchvision.transforms as transforms
+import os
+import csv
+from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
+import numpy as np
+from tqdm import tqdm
+import pandas as pd
+from concurrent.futures import ProcessPoolExecutor
+
+import sys
+sys.path.append('R:\ENGR_Chon\Darren\Honors_Thesis')
+
+# Import my own functions and classes
+from utils.pathmaster import PathMaster
+from utils import dataloader
+
+def apply_cassey_smote(data, labels):
+    cassey_smote = SMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5)
+    data_resampled, labels_resampled = cassey_smote.fit_resample(data, labels)
+    return data_resampled, labels_resampled
+
+def apply_borderline_smote(data, labels):
+    borderline_smote = BorderlineSMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5)
+    data_resampled, labels_resampled = borderline_smote.fit_resample(data, labels)
+    return data_resampled, labels_resampled
+
+def apply_adasyn(data, labels):
+    adasyn = ADASYN(random_state=42,sampling_strategy='not majority',n_neighbors=5)
+    data_resampled, labels_resampled = adasyn.fit_resample(data, labels)
+    return data_resampled, labels_resampled
+
+def save_image(i, image, group, save_dir):
+    # Generate a unique file name with zero-padding
+    file_name = f'{i+1:06d}' + '_' + group + '_tfs'
+
+    # Convert the image to a PyTorch tensor
+    tensor_image = torch.tensor(image).to(dtype=torch.float16)
+    tensor_image = tensor_image.reshape(tensor_image.size()[-2], tensor_image.size()[-2])
+
+    # Save the tensor to a .pt file
+    torch.save(tensor_image, os.path.join(save_dir, file_name + '.pt'))
+
+    return file_name
+
+def save_images_parallel(data_resampled, group, save_dir):
+    file_names = []
+    with ProcessPoolExecutor() as executor:
+        results = [executor.submit(save_image, i, image, group, save_dir) for i, image in enumerate(data_resampled)]
+        for future in results:
+            file_names.append(future.result())
+    return file_names
+
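+# --- Editor's hedged sketch (not part of the original pipeline) ---------------
+# A minimal sanity check of the SMOTE helper above on synthetic data, assuming
+# the same flattened-image layout used in main() (one row per 128x128 image).
+# The sample counts, seed, and helper name _smote_sanity_check are arbitrary
+# illustrations; each minority class only needs more samples than k_neighbors=5.
+def _smote_sanity_check(img_size=128):
+    rng = np.random.default_rng(0)
+    toy_data = rng.random((30, img_size * img_size)).astype(np.float32)  # 30 fake flattened images
+    toy_labels = np.array([0] * 18 + [1] * 6 + [2] * 6)  # class 0 is the majority
+    data_resampled, labels_resampled = apply_cassey_smote(toy_data, toy_labels)
+    # 'not majority' oversamples classes 1 and 2 up to 18 samples each -> 54 rows total
+    print('Resampled data shape:', data_resampled.shape)
+    print('Class counts:', np.unique(labels_resampled, return_counts=True))
+    return data_resampled, labels_resampled
+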
+def main(): + # Initialize save location specifics + # smote_type = 'Cassey_SMOTE' + smote_type = 'Borderline_SMOTE' + # smote_type = 'ADASYN' + + # split = '2foldCV_60_40' + split = 'holdout_60_10_30' + + # groups = ['fold1', 'fold2', 'test'] + groups = ['train', 'validate', 'test'] + + # Device and drives + is_linux = False + is_hpc = False + is_internal = True + is_external = False + + # Input + is_tfs = True + + # Intialize the focus + focus = 'misc' + + # Initialize the file tag + file_tag = 'temp' + + # Image resolution + img_res = '128x128_float16' + + # Data type: the type to convert the data into when it is loaded in + data_type = torch.float32 + + # Create a PathMaster object + pathmaster = PathMaster(is_linux, is_hpc, is_tfs, is_internal, is_external, focus, file_tag, img_res) + + # Image dimensions + img_channels = 1 + img_size = 128 + downsample = None + standardize = None + + # Split UIDs + # cross_val_fold1, cross_val_fold2, test_set = dataloader.split_uids_2fold_60_40_smote(pathmaster) + train_set, val_set, test_set = dataloader.split_uids_60_10_30_smote(pathmaster) + + # Preprocess data + data_format = 'pt' + batch_size = 256 + + # fold1_loader, fold2_loader, test_loader = dataloader.preprocess_data(data_format, cross_val_fold1, cross_val_fold2, test_set, batch_size, + # standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + # data_loaders = [fold1_loader, fold2_loader, test_loader] + + train_loader, val_loader, test_loader = dataloader.preprocess_data(data_format, train_set, val_set, test_set, + batch_size, standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + data_loaders = [train_loader, val_loader, test_loader] + print() + sys.stdout.flush() + for data_loader, group in tqdm(zip(data_loaders,groups), total=len(data_loaders), desc='SMOTE', unit='Data Loader', leave=False): + sys.stderr.flush() + + # Define your original data and labels + data = np.empty((0,img_size*img_size)) + labels = np.empty((0,1)) + + sys.stdout.flush() + + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Loading', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[-1] * data_batch['data'].shape[-1]).numpy() + Y = data_batch['label'].numpy().reshape(-1,1) + + data = np.concatenate((data, X), axis=0) + labels = np.concatenate((labels, Y), axis=0) + + sys.stderr.flush() + print('\nData shape:', data.shape) + print('Labels shape:', labels.shape) + sys.stdout.flush() + + if group != 'test': + # SMOTE + # data_resampled, labels_resampled = apply_cassey_smote(data, labels) + data_resampled, labels_resampled = apply_borderline_smote(data, labels) + # data_resampled, labels_resampled = apply_adasyn(data, labels) + data_resampled = data_resampled.reshape(len(data_resampled), img_channels, img_size, img_size) + sys.stderr.flush() + print('\nResampled Data shape:', data_resampled.shape) + print('Resampled Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + else: + data_resampled = data + data_resampled = data_resampled.reshape(len(data_resampled), img_channels, img_size, img_size) + labels_resampled = labels + sys.stderr.flush() + print('\nResampled Data shape:', data_resampled.shape) + print('Resampled Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + + # Define a directory to save the images + # save_dir = 
os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch', smote_type, split, group) + save_dir = os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch\TFS_pt', smote_type, split, group) + os.makedirs(save_dir, exist_ok=True) + + file_names = save_images_parallel(data_resampled, group, save_dir) + + # Ground truths + data_labels = pd.DataFrame({ + 'segment_name': file_names, + 'label': labels_resampled.reshape(-1) + }) + + csv_file_name = os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch\TFS_pt', smote_type, split, smote_type + '_' + group + '_names_labels.csv') + data_labels.to_csv(csv_file_name, index=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/utils/smote_accelerated_lab.py b/utils/smote_accelerated_lab.py new file mode 100644 index 0000000..90201ba --- /dev/null +++ b/utils/smote_accelerated_lab.py @@ -0,0 +1,177 @@ +import torch +import torch.nn as nn +import torchvision.transforms as transforms +import os +import csv +from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN +import numpy as np +from tqdm import tqdm +import pandas as pd +from concurrent.futures import ProcessPoolExecutor + +import sys +sys.path.append('R:\ENGR_Chon\Darren\Honors_Thesis') + +# Import my own functions and classes +from utils.pathmaster import PathMaster +from utils import dataloader + +def apply_cassey_smote(data, labels): + cassey_smote = SMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5) + data_resampled, labels_resampled = cassey_smote.fit_resample(data, labels) + return data_resampled, labels_resampled + +def apply_borderline_smote(data, labels): + borderline_smote = BorderlineSMOTE(random_state=42,sampling_strategy='not majority',k_neighbors=5) + data_resampled, labels_resampled = borderline_smote.fit_resample(data, labels) + return data_resampled, labels_resampled + +def apply_adasyn(data, labels): + adasyn = ADASYN(random_state=42,sampling_strategy='not majority',n_neighbors=4) + data_resampled, labels_resampled = adasyn.fit_resample(data, labels) + return data_resampled, labels_resampled + +def save_image(i, image, group, save_dir): + + # Generate a unique file name with zero-padding + file_name = f'{i+1:06d}' + '_' + group + '_tfs' + + # Convert the image to a PyTorch tensor + tensor_image = torch.tensor(image).to(dtype=torch.float16) + tensor_image = tensor_image.reshape(tensor_image.size()[-2], tensor_image.size()[-2]) + + # Save the tensor to a .pt file + torch.save(tensor_image, os.path.join(save_dir, file_name + '.pt')) + + return file_name + +def save_images_parallel(data_resampled, group, save_dir): + file_names = [] + with ProcessPoolExecutor() as executor: + results = [executor.submit(save_image, i, image, group, save_dir) for i, image in enumerate(data_resampled)] + for future in results: + file_names.append(future.result()) + return file_names + +def main(): + # Initialize save location specifics + # smote_type = 'Cassey4k_SMOTE' + # smote_type = 'Borderline5k_SMOTE' + smote_type = 'ADASYN6k' + + # split = '2foldCV_60_40' + split = 'holdout_60_10_30' + + # groups = ['fold1', 'fold2', 'test'] + groups = ['train', 'validate', 'test'] + + # Device and drives + is_linux = False + is_hpc = False + is_internal = False + is_external = False + + # Input + is_tfs = True + + # Intialize the focus + focus = 'misc' + + # Initialize the file tag + file_tag = 'temp' + + # Image resolution + img_res = '128x128_float16' + + # Data type: the type to convert the data into when it is loaded in + data_type = torch.float32 + + # 
Create a PathMaster object + pathmaster = PathMaster(is_linux, is_hpc, is_tfs, is_internal, is_external, focus, file_tag, img_res) + + # Image dimensions + img_channels = 1 + img_size = 128 + downsample = None + standardize = None + + # Split UIDs + # cross_val_fold1, cross_val_fold2, test_set = dataloader.split_uids_2fold_60_40_smote(pathmaster) + train_set, val_set, test_set = dataloader.split_uids_60_10_30_smote(pathmaster) + + # Preprocess data + data_format = 'pt' + batch_size = 256 + + # fold1_loader, fold2_loader, test_loader = dataloader.preprocess_data(data_format, cross_val_fold1, cross_val_fold2, test_set, batch_size, + # standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + # data_loaders = [fold1_loader, fold2_loader, test_loader] + + train_loader, val_loader, test_loader = dataloader.preprocess_data(data_format, train_set, val_set, test_set, + batch_size, standardize, False, img_channels, img_size, downsample, data_type, pathmaster) + data_loaders = [train_loader, val_loader, test_loader] + print() + sys.stdout.flush() + for data_loader, group in tqdm(zip(data_loaders,groups), total=len(data_loaders), desc='SMOTE', unit='Data Loader', leave=False): + sys.stderr.flush() + + # Define your original data and labels + data = np.empty((0,img_size*img_size)) + labels = np.empty((0,1)) + + sys.stdout.flush() + + for data_batch in tqdm(data_loader, total=len(data_loader), desc='Loading', unit='batch', leave=False): + sys.stderr.flush() + + # Extract input and labels + X = data_batch['data'].reshape(data_batch['data'].shape[0], data_batch['data'].shape[-1] * data_batch['data'].shape[-1]).numpy() + Y = data_batch['label'].numpy().reshape(-1,1) + + data = np.concatenate((data, X), axis=0) + labels = np.concatenate((labels, Y), axis=0) + + sys.stderr.flush() + print('\nData shape:', data.shape) + print('Labels shape:', labels.shape) + sys.stdout.flush() + + if group != 'test': + # SMOTE + # data_resampled, labels_resampled = apply_cassey_smote(data, labels) + # data_resampled, labels_resampled = apply_borderline_smote(data, labels) + data_resampled, labels_resampled = apply_adasyn(data, labels) + data_resampled = data_resampled.reshape(len(data_resampled), img_size, img_size) + sys.stderr.flush() + print('\nResampled Data shape:', data_resampled.shape) + print('Resampled Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + else: + data_resampled = data + data_resampled = data_resampled.reshape(len(data_resampled), img_size, img_size) + labels_resampled = labels + sys.stderr.flush() + print('\nData shape:', data_resampled.shape) + print('Labels shape:', labels_resampled.shape) + print() + sys.stdout.flush() + + # Define a directory to save the images + # save_dir = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch', smote_type, split, group) + save_dir = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch\TFS_pt', smote_type, split, group) + os.makedirs(save_dir, exist_ok=True) + + file_names = save_images_parallel(data_resampled, group, save_dir) + + # Ground truths + data_labels = pd.DataFrame({ + 'segment_name': file_names, + 'label': labels_resampled.reshape(-1) + }) + + csv_file_name = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_Pulsewatch\TFS_pt', smote_type, split, smote_type + '_' + group + '_names_labels.csv') + data_labels.to_csv(csv_file_name, index=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git 
a/utils/smote_transfer_location.py b/utils/smote_transfer_location.py
new file mode 100644
index 0000000..93ffd22
--- /dev/null
+++ b/utils/smote_transfer_location.py
@@ -0,0 +1,93 @@
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+import torch
+from concurrent.futures import ProcessPoolExecutor
+from pyarrow import csv
+import cv2
+from tqdm import tqdm
+import sys
+
+
+def preprocess_and_save_data(data_path, output_path):
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+    group_directories = [entry for entry in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, entry))]
+    for group in tqdm(group_directories, total=len(group_directories), desc='Data Transfer', unit='Group', leave=False):
+        sys.stderr.flush()
+        group_path = os.path.join(data_path, group)
+        group_output_path = os.path.join(output_path, group)
+        if not os.path.exists(group_output_path):
+            os.makedirs(group_output_path)
+        # else: # Only use for resuming converting
+        #     print('Skipping', group)
+        #     continue
+        files_to_process = [file for file in os.listdir(group_path) if file.endswith(('.csv', '.png', '.pt'))]
+        with ProcessPoolExecutor() as executor:
+            executor.map(preprocess_file, [group_path]*len(files_to_process), files_to_process, [group_output_path]*len(files_to_process))
+        print()
+        print(group, 'data transfer done!')
+        sys.stdout.flush()
+
+def preprocess_file(group_path, file, group_output_path):
+    is_tfs = True
+    if is_tfs:
+        dtype = torch.float16
+        input_size = 128
+    else:
+        dtype = torch.uint8
+        input_size = 500
+
+    downsample = None
+
+    file_path = os.path.join(group_path, file)
+    if file.endswith('.csv'):
+        # data = pd.read_csv(file_path, header=None).to_numpy()
+
+        # Use PyArrow
+        read_options = csv.ReadOptions(autogenerate_column_names=True)
+        data = csv.read_csv(file_path, read_options=read_options).to_pandas().to_numpy()
+
+        if data.shape != (input_size, input_size):
+            print(f"Warning: File {file_path} has shape {data.shape} instead of {input_size}x{input_size}.")
+    elif file.endswith('.png'):
+        data = np.array(Image.open(file_path))
+        if data.shape != (input_size, input_size):
+            print(f"Warning: Image {file_path} has shape {data.shape} instead of {input_size}x{input_size}.")
+    elif file.endswith('.pt'):
+        data = torch.load(file_path)
+        if data.shape != (input_size, input_size):
+            print(f"Warning: Image {file_path} has shape {data.shape} instead of {input_size}x{input_size}.")
+    else:
+        print('Incorrect data type')
+        return
+
+    if downsample is not None:
+        # Downsample the image
+        # Use OpenCV to resize the array to downsample x downsample using INTER_AREA interpolation
+        data_array = cv2.resize(np.array(data), (downsample, downsample), interpolation=cv2.INTER_AREA)
+        data_tensor = torch.tensor(data_array, dtype=dtype).view(downsample, downsample)
+    elif file.endswith('.pt'):
+        data_tensor = data.to(dtype).view(input_size, input_size)
+    else:
+        data_tensor = torch.tensor(data, dtype=dtype).view(input_size, input_size)
+
+    # base_name, extension = os.path.splitext(file)
+    output_file_path = os.path.join(group_output_path, file)
+    torch.save(data_tensor, output_file_path)
+
+def main():
+    smote_type = 'ADASYN6k'
+    split = 'holdout_60_10_30'
+    input_path = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_PulseWatch\TFS_pt', smote_type, split)
+    # input_path = os.path.join(r'\\grove.ad.uconn.edu\research\ENGR_Chon\Darren\NIH_PulseWatch\Poincare_pt', smote_type, split)
+
+    output_path = 
os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch\TFS_pt', smote_type, split)
+    # output_path = os.path.join(r'C:\Chon_Lab\NIH_Pulsewatch\Poincare_pt', smote_type, split)
+
+    preprocess_and_save_data(input_path, output_path)
+    print('Data transfer complete!')
+
+if __name__ == '__main__':
+    main()
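+
+# --- Editor's hedged sketch (not called by main above) -------------------------
+# One possible spot check after a transfer run: load a single converted tensor
+# and confirm it carries the float16 dtype and 128x128 shape that preprocess_file
+# is expected to write. The group and file names below are placeholders, not real
+# outputs of this repository.
+def _spot_check_output(output_path, group='train', file_name='000001_train_tfs.pt', input_size=128):
+    tensor_path = os.path.join(output_path, group, file_name)
+    if not os.path.exists(tensor_path):
+        print('Nothing to check at', tensor_path)
+        return
+    tensor = torch.load(tensor_path)
+    assert tensor.dtype == torch.float16, f'Unexpected dtype: {tensor.dtype}'
+    assert tuple(tensor.shape) == (input_size, input_size), f'Unexpected shape: {tuple(tensor.shape)}'
+    print('OK:', tensor_path, tensor.dtype, tuple(tensor.shape))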