From 861e7420a147d34802c29fb2d53a2eba8f9ab6ff Mon Sep 17 00:00:00 2001 From: doh16101 Date: Thu, 4 Apr 2024 12:18:20 -0400 Subject: [PATCH] Cassey manually updated the data loader changes. --- .../Colab_example_dataloader_2024_04_04.ipynb | 22 ++ BML_project/models/ss_gp_model.py | 138 ++++++++++- BML_project/ss_main.py | 74 +++++- BML_project/utils_gp/data_loader.py | 219 +++++++++++------- 4 files changed, 348 insertions(+), 105 deletions(-) create mode 100644 BML_project/models/Colab_example_dataloader_2024_04_04.ipynb diff --git a/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb b/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb new file mode 100644 index 0000000..4495514 --- /dev/null +++ b/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb @@ -0,0 +1,22 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# R:\\ENGR_Chon\\Darren\\NIH_Pulsewatch\\Poincare_pt\\128x128\n", + "# Darren created the PT files again (because UID 120 has missing files in the original csv file)\n", + "# I need to prepare for my interview, and I will tar those PT files again and test your code on Colab later." + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/BML_project/models/ss_gp_model.py b/BML_project/models/ss_gp_model.py index e364cec..355a6fd 100644 --- a/BML_project/models/ss_gp_model.py +++ b/BML_project/models/ss_gp_model.py @@ -4,12 +4,15 @@ @author: lrm22005 """ +import os import numpy as np from tqdm import tqdm import torch import gpytorch from sklearn.metrics import precision_recall_fscore_support, roc_auc_score from sklearn.preprocessing import label_binarize +from utils_gp.data_loader import preprocess_data_train_val,preprocess_data_test +import time num_latents = 6 # This should match the complexity of your data or the number of tasks num_tasks = 4 # This should match the number of output classes or tasks @@ -70,11 +73,49 @@ def forward(self, x): return latent_pred -def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'): +def train_gp_model(train_loader, val_loader, batch_size,\ + data_format, clinical_trial_train, clinical_trial_test,\ + clinical_trial_unlabeled,\ + num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt',\ + resume_training=False,\ + datackpt_name = 'dataset_checkpoint.pt',modelckpt_name = 'model_checkpoint_full.pt'): + print(f'Debug: resume_training:{resume_training}, checkpoint_path: {checkpoint_path}') model = MultitaskGPModel().to(device) likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=4, num_classes=4).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.1) mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset)) + + # Load checkpoint if resuming training for gp model. + start_epoch = 0 + flag_reload_dataloader = False # We do not need to reset train loader in the new epoch. 
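+    # Resume bookkeeping: if resume_training is True and a model checkpoint exists, the block
+    # below restores the model/likelihood/optimizer states and the last epoch, and raises
+    # flag_reload_dataloader only when the checkpoint lists segments that were already trained,
+    # so the train loader can be rebuilt without them.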
+ ckpt_model_file = os.path.join(checkpoint_path,modelckpt_name) + if resume_training and os.path.exists(ckpt_model_file): + print(f'Debug: loading ckpt: {ckpt_model_file}') + checkpoint = torch.load(ckpt_model_file) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint.get('epoch', 0) # Resume from the same epoch because you did not finished it. + + # Update the dataloader if there are segments finished. + finished_seg_names = checkpoint['finished_seg_names'] + + if len(finished_seg_names) > 0: + # There were segments used in training. Only update the train loader. + flag_reload_dataloader = True + print('Debug: renewing train_loader now...') + startTime_for_tictoc = time.time() + # ---- Dong, 02/15/2024: I want to test training on large dataset and resume training. ---- + # train_loader,_,_ = preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + # read_all_labels=False) + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=finished_seg_names,\ + read_all_labels=False) + endTime_for_tictoc = time.time() - startTime_for_tictoc + print(f'Debug: took {endTime_for_tictoc} to renew the train_loader') + best_val_loss = float('inf') epochs_no_improve = 0 @@ -86,19 +127,69 @@ def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, pat 'train_loss': [] # Add a list to store training losses } - for epoch in tqdm(range(num_iterations), desc='Training', unit='epoch', leave=False): - for train_batch in train_loader: + for epoch in tqdm(range(start_epoch,num_iterations), desc='Training', unit='epoch', leave=False): + finished_idx = [] + finished_seg_names = [] + for batch_index, train_batch in enumerate(train_loader): + print(f'Debug: now in a new batch of data! {batch_index}/{len(train_loader)}') # train_batch is the image data. model.train() likelihood.train() optimizer.zero_grad() + train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device) # Use reshape here train_y = train_batch['label'].to(device) + # Get finished segment index in dataloader and segment name. 
+ temp_finished_idx = train_batch['idx'] + temp_finished_seg_names = train_batch['segment_name'] + print('Debug: temp_finished_idx:',temp_finished_idx) + print('Debug: temp_finished_segment_name:',temp_finished_seg_names) + finished_idx.append(temp_finished_idx) + finished_seg_names.append(temp_finished_seg_names) output = model(train_x) loss = -mll(output, train_y) metrics['train_loss'].append(loss.item()) # Store the training loss loss.backward() optimizer.step() + save_ckpt_model_path = os.path.join(checkpoint_path,modelckpt_name) + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'best_val_loss': best_val_loss, + 'finished_seg_names':finished_seg_names, + 'finished_idx':finished_idx + # Include other metrics as needed + }, save_ckpt_model_path) + + # Optionally, save the dataset state at intervals or after certain conditions + save_ckpt_dataset_path = os.path.join(checkpoint_path,datackpt_name) + train_loader.dataset.save_checkpoint(save_ckpt_dataset_path) # Here, manage the index as needed + + # import sys + # if epoch == 3 and batch_index == 5: + # sys.exit(f"Debug: Manually stop the program at epoch {epoch} batch {batch_index}.") + + # Reset the finished segments again because we finished one epoch. + finished_idx = [] + finished_seg_names = [] + if flag_reload_dataloader: + print('Debug: reset the train_loader now...') + # Reset the train dataloader now. + startTime_for_tictoc = time.time() + # --- Dong, 02/15/2024: + # train_loader,_,_ = preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + # read_all_labels=False) + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=finished_seg_names,\ + read_all_labels=False) + endTime_for_tictoc = time.time() - startTime_for_tictoc + print(f'Debug: took {endTime_for_tictoc} to reset the train_loader') + flag_reload_dataloader = False # Turn off the flag for reseting train dataloader. 
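For quick reference, here is a condensed, standalone sketch of the per-batch save-and-resume pattern the loop above implements. The nn.Linear model, the Adam optimizer, and the file name are placeholders standing in for MultitaskGPModel, its optimizer, and the real checkpoint path; only the checkpoint layout (epoch, state dicts, finished segment names) mirrors the code above.

import os
import torch
import torch.nn as nn

def save_resume_checkpoint(path, epoch, model, optimizer, finished_seg_names):
    # One file carries both the weights and the bookkeeping needed to skip
    # already-trained segments after an interruption.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'finished_seg_names': finished_seg_names,
    }, path)

def load_resume_checkpoint(path, model, optimizer):
    # Restores the states in place and returns the epoch to restart from plus
    # the list of per-batch segment-name lists already consumed.
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint.get('epoch', 0), checkpoint.get('finished_seg_names', [])

if __name__ == '__main__':
    model = nn.Linear(4, 2)                                   # stand-in for the GP model
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    ckpt_path = 'resume_demo.pt'                              # hypothetical file name
    save_resume_checkpoint(ckpt_path, epoch=3, model=model, optimizer=optimizer,
                           finished_seg_names=[['seg_001', 'seg_002']])
    start_epoch, finished = load_resume_checkpoint(ckpt_path, model, optimizer)
    print(start_epoch, finished)
    os.remove(ckpt_path)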
+ # Stochastic validation model.eval() likelihood.eval() @@ -131,19 +222,46 @@ def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, pat if val_loss < best_val_loss: best_val_loss = val_loss epochs_no_improve = 0 - torch.save({'model_state_dict': model.state_dict(), - 'likelihood_state_dict': likelihood.state_dict(), - 'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path) + # torch.save({'model_state_dict': model.state_dict(), + # 'likelihood_state_dict': likelihood.state_dict(), + # 'optimizer_state_dict': optimizer.state_dict(), + # 'train_loader':train_loader, + # 'val_loader':val_loader + # }, checkpoint_path) else: epochs_no_improve += 1 if epochs_no_improve >= patience: print(f"Early stopping triggered at epoch {epoch+1}") break + + # Save checkpoint at the end of each epoch + save_ckpt_model_path = os.path.join(checkpoint_path,modelckpt_name) + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'best_val_loss': best_val_loss, + 'finished_seg_names':finished_seg_names, + 'finished_idx':finished_idx + # Include other metrics as needed + }, save_ckpt_model_path) + print('Debug: saved model checkpoint with epoch.',save_ckpt_model_path) + + # Optionally, save the dataset state at intervals or after certain conditions + save_ckpt_dataset_path = os.path.join(checkpoint_path,datackpt_name) + train_loader.dataset.save_checkpoint(save_ckpt_dataset_path) # Finished all batches, so start from zero. + + if epochs_no_improve >= patience: + print(f"Early stopping triggered at epoch {epoch+1}") + break - checkpoint = torch.load(checkpoint_path) - model.load_state_dict(checkpoint['model_state_dict']) - likelihood.load_state_dict(checkpoint['likelihood_state_dict']) - optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + # Optionally, load the best model at the end of training + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) return model, likelihood, metrics diff --git a/BML_project/ss_main.py b/BML_project/ss_main.py index bf34fbf..326f80f 100644 --- a/BML_project/ss_main.py +++ b/BML_project/ss_main.py @@ -6,13 +6,18 @@ """ from tqdm import tqdm import torch -from utils_gp.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples +from utils_gp.data_loader import preprocess_data_train_val, split_uids, update_train_loader_with_uncertain_samples, preprocess_data_test from models.ss_gp_model import MultitaskGPModel, train_gp_model from utils_gp.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data from active_learning.ss_active_learning import stochastic_uncertainty_sampling, run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions from utils_gp.visualization import plot_comparative_results, plot_training_performance, plot_results import os import pickle +from datetime import datetime +now = datetime.now() # Get the time now for model checkpoint saving. + +dt_string = now.strftime("%Y_%m_%d_%H_%M_%S") # YYYY_mm_dd_HH_MM_SS, for model saving. 
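As an aside, the patience-based early stopping that train_gp_model (in ss_gp_model.py above) keeps using can be boiled down to the following self-contained sketch; the validation-loss sequence here is invented purely for illustration.

def run_early_stopping_demo(val_losses, patience=10):
    # Mirrors the best_val_loss / epochs_no_improve / patience logic above.
    best_val_loss = float('inf')
    epochs_no_improve = 0
    for epoch, val_loss in enumerate(val_losses):
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0          # reset the counter on any improvement
        else:
            epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break
    return best_val_loss

print(run_early_stopping_demo([1.0, 0.8, 0.9, 0.85, 0.95], patience=2))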
+print("The date and time suffix of the model file is", dt_string) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -48,8 +53,29 @@ def main(): clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled = split_uids() data_format = 'pt' # Preprocess data - train_loader, val_loader, test_loader, saving_path = preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size) + # ---- Dong, 02/15/2024: I want to test loading large amount dataset. ---- + # train_loader, val_loader, saving_path = preprocess_data_train_val(data_format = data_format, \ + _, val_loader, saving_path = preprocess_data_train_val(data_format = data_format, \ + clinical_trial_train=clinical_trial_train, \ + clinical_trial_test=clinical_trial_test, \ + batch_size=batch_size,\ + finished_seg_names = [],\ + read_all_labels=False) + # ---- Dong, 02/15/2024: I want to test loading large amount dataset. ---- + # test_loader = preprocess_data_test(data_format = data_format, \ + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=[],\ + read_all_labels=False) + + menu_segment_names = train_loader.dataset.segment_names # All the segments to be run in the training dataset. + menu_labels = train_loader.dataset.labels # All the ground truth labels + print('Debug: len(menu_segment_names)',len(menu_segment_names)) + print('Debug: len(menu_labels)',len(menu_labels)) + print('Debug: len(train_loader)',len(train_loader)) + print('Debug: dir(train_loader.dataset)',dir(train_loader.dataset)) kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device) @@ -61,7 +87,21 @@ def main(): } # Initial model training - model, likelihood, training_metrics = train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=n_classes) + model, likelihood, training_metrics = train_gp_model( + train_loader = train_loader, + val_loader = val_loader, + num_iterations=50, + n_classes=n_classes, + patience=10, + checkpoint_path=saving_path, + resume_training=True, + datackpt_name = 'dataset_checkpoint.pt', + modelckpt_name = 'model_checkpoint_full.pt', + batch_size=batch_size, + data_format = data_format, + clinical_trial_train = clinical_trial_train, + clinical_trial_test = clinical_trial_test, + clinical_trial_unlabeled=clinical_trial_unlabeled) # Dong: remember to change this function in its code. # Save the training metrics for future visualization results['train_loss'].extend(training_metrics['train_loss']) @@ -77,6 +117,7 @@ def main(): # Attempt to load a training checkpoint train_checkpoint = checkpoint_manager.load_checkpoint('train') start_iteration = train_checkpoint['iteration'] if train_checkpoint else 0 + print('Debug: start_iteration is:',start_iteration) # Dong, 01/25/2024: save it first before entering the active learning. 
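The checkpoint_manager used just above is existing project code that this patch does not show; the stand-in below (with a hypothetical directory and file-naming scheme) only illustrates how saving and reloading the active-learning iteration index lets start_iteration resume where a previous run stopped.

import os
import torch

class SimpleCheckpointManager:
    # Stand-in for the project's checkpoint_manager; not its real implementation.
    def __init__(self, directory):
        self.directory = directory
        os.makedirs(directory, exist_ok=True)

    def save_checkpoint(self, tag, iteration, state):
        torch.save({'iteration': iteration, **state},
                   os.path.join(self.directory, f'{tag}_checkpoint.pt'))

    def load_checkpoint(self, tag):
        path = os.path.join(self.directory, f'{tag}_checkpoint.pt')
        return torch.load(path) if os.path.exists(path) else None

manager = SimpleCheckpointManager('./al_checkpoints')        # hypothetical directory
ckpt = manager.load_checkpoint('train')
start_iteration = ckpt['iteration'] if ckpt else 0           # same resume rule as in main()
manager.save_checkpoint('train', start_iteration, {'note': 'saved before active learning'})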
additional_state = { 'model_state': model.state_dict(), @@ -91,15 +132,20 @@ def main(): active_learning_iterations = 10 # Active Learning Iterations for iteration in tqdm(range(start_iteration,active_learning_iterations), desc='Active Learning', unit='iteration', leave=True): + print(f"Active Learning Iteration: {iteration+1}/{active_learning_iterations}") # Perform uncertainty sampling to select new samples from the validation set - uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=batch_size, n_batches=5) - + uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=50, n_batches=5, device=device) + labeled_samples = label_samples(uncertain_sample_indices, val_loader.dataset) # Update the training loader with uncertain samples - train_loader = update_train_loader_with_uncertain_samples(train_loader, uncertain_sample_indices, batch_size) - print(f"Updated training data size: {len(train_loader.dataset)}") + train_loader = update_train_loader_with_uncertain_samples(train_loader, labeled_samples, batch_size) + + # Optionally, save the dataset state at intervals or after certain conditions + train_loader.dataset.save_checkpoint(dataset_checkpoint_path) # Here, manage the index as needed # Re-train the model with the updated training data - model, likelihood, val_metrics = train_gp_model(train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, checkpoint_path='model_checkpoint_last.pt') + model, likelihood, val_metrics = train_gp_model( + train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, + checkpoint_path=saving_path, resume_training=True, batch_size=batch_size) # Store the validation metrics after each active learning iteration results['validation_metrics']['precision'].append(val_metrics['precision']) @@ -123,13 +169,21 @@ def main(): plot_comparative_results(gp_vs_kmeans_data, original_labels) # Final evaluation on test set + import subprocess + print('Start to run bash script!') + subprocess.call("./BML_project/untar_unlabeled_PT.sh") + print('End to run bash script!') + + test_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=[],\ + read_all_labels=False) test_metrics = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes) test_kmeans_model = run_minibatch_kmeans(test_loader, n_clusters=n_classes, device=device) results['test_metrics'] = test_metrics test_gp_vs_kmeans_data, test_original_labels = stochastic_compare_kmeans_gp_predictions(test_kmeans_model, model, test_loader, n_batches=5, device=device) - - print(f"Length of original_labels: {len(original_labels)}, Length of gp_predictions: {len(gp_predictions)}") plot_comparative_results(test_gp_vs_kmeans_data, test_original_labels) # Visualization of results diff --git a/BML_project/utils_gp/data_loader.py b/BML_project/utils_gp/data_loader.py index e0a6823..bd22a79 100644 --- a/BML_project/utils_gp/data_loader.py +++ b/BML_project/utils_gp/data_loader.py @@ -5,6 +5,8 @@ @author: lrm22005 """ import os +# For saving checkpoints +from pathlib import Path import numpy as np import pandas as pd from PIL import Image @@ -13,6 +15,12 @@ from sklearn.preprocessing import StandardScaler from torchvision.transforms import ToTensor import socket +# Downsampling image +import cv2 +# import torchvision.transforms as T +# transform for rectangular resize +img_size = 32 # 
Dong, 01/30/2024: this is for testing the CIFAR10 models. +# transform = T.Resize((img_size,img_size)) def split_uids(): # ====== Load the per subject arrythmia summary ====== @@ -26,6 +34,8 @@ def split_uids(): df_summary = pd.read_csv(r'R:\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv') elif your_computer_name == 'Luis_computer_name': df_summary = pd.read_csv(r'\\grove.ad.uconn.edu\research\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv') + else: + df_summary = pd.read_csv(r'/content/drive/MyDrive/Adjudication_UConn/final_attemp_4_1_Dong_Ohm_summary_20231025.csv') df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] @@ -103,111 +113,96 @@ def split_uids(): # clinical_trial_unlabeled = clinical_trial_unlabeled[0:4] return clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled +def extract_segment_names_and_labels(UIDs,labels_path,read_all_labels=False): + # Extract all segment names and labels when starting the main function. + # Output: + # segment_names: list of string. + # labels: dictionary, with segment_names as key and label as value. + segment_names = [] + labels = {} + + for UID in UIDs: + label_file = os.path.join(labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + print('Debug: this file exists',label_file) + label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) + for idx, segment_name in enumerate(label_segment_names): + label_val = label_data['label'].values[idx] + if read_all_labels: + # Assign -1 if label is not in [0, 1, 2, 3] + labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2, 3] + if label_val in [0, 1, 2, 3] and segment_name not in segment_names: + segment_names.append(segment_name) + labels[segment_name] = label_val + print('>>> Number of segments in this dataloader:',len(segment_names)) # Dong, 01/29/2024: know the number of segments before running training epochs. + print('>>> Number of labels in this dataloader:',len(labels)) + return segment_names, labels + +def remove_finished_segment_names_and_labels(labels,finished_seg_names): + # From extract_segment_names_and_labels: + # Input: + # labels: dictionary, with segment_names as key and label as value. + # finished_seg_names: list of string. 
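+    #                        Each entry is one finished batch (itself a list of segment
+    #                        names, as collected per batch in train_gp_model), hence the
+    #                        nested loop below.
+    # Output:
+    #   remain_labels: copy of labels with all finished segments removed, used to rebuild
+    #                  a train DataLoader that only iterates over unfinished segments.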
+ remain_labels = labels.copy() + print('Debug: type(remain_labels)',type(remain_labels)) + for batch in finished_seg_names: + for key in batch: + remain_labels.pop(key) + print('Debug: len(labels)',len(labels)) + print('Debug: len(remain_labels)',len(remain_labels)) + + return remain_labels + class CustomDataset(Dataset): - def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False, start_idx=0): + def __init__(self, data_path, labels_path, batch_size,segment_names,labels, standardize=True, data_format='csv', read_all_labels=False): self.data_path = data_path self.labels_path = labels_path - self.UIDs = UIDs self.standardize = standardize self.data_format = data_format self.read_all_labels = read_all_labels self.transforms = ToTensor() - self.start_idx = start_idx # Initial batch index to start from, useful for resuming training - self.refresh_dataset() - - # Initialize the current batch index to None, this could be used if you want to track batch progress within the dataset itself - self.current_batch_index = None + self.segment_names = segment_names + self.labels = labels - def refresh_dataset(self): - self.segment_names, self.labels = self.extract_segment_names_and_labels() - - def add_uids(self, new_uids): - unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] - self.UIDs.extend(unique_new_uids) - self.refresh_dataset() + # Initialize the current batch index to None + self.batch_size = batch_size def __len__(self): return len(self.segment_names) def save_checkpoint(self, checkpoint_path): - # Enhanced to automatically include 'start_idx' in the checkpoint checkpoint = { 'segment_names': self.segment_names, - 'labels': self.labels, - 'UIDs': self.UIDs, - 'start_idx': self.start_idx # Now also saving start_idx + 'labels': self.labels + # Save the current batch index if provided } torch.save(checkpoint, checkpoint_path) def load_checkpoint(self, checkpoint_path): checkpoint = torch.load(checkpoint_path) + print('Debug: loaded dataset checkpoint!',checkpoint_path) self.segment_names = checkpoint['segment_names'] self.labels = checkpoint['labels'] - self.UIDs = checkpoint['UIDs'] - # Now also loading and setting start_idx from checkpoint - self.start_idx = checkpoint.get('start_idx', 0) self.refresh_dataset() + # Load the current batch index if it exists in the checkpoint def __getitem__(self, idx): - actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed - segment_name = self.segment_names[actual_idx] + segment_name = self.segment_names[idx] label = self.labels[segment_name] - if hasattr(self, 'all_data') and actual_idx < len(self.all_data): - time_freq_tensor = self.all_data[actual_idx] + if hasattr(self, 'all_data') and idx < len(self.all_data): + time_freq_tensor = self.all_data[idx] else: time_freq_tensor = self.load_data(segment_name) - return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} - def set_current_batch_index(self, index): - self.current_batch_index = index - - def get_current_batch_index(self): - return self.current_batch_index - - def set_start_idx(self, index): - self.start_idx = index - - def add_data_label_pair(self, data, label): - # Assign a unique ID or name for the new data - new_id = len(self.segment_names) - segment_name = f"new_data_{new_id}" - - # Append the new data and label - self.segment_names.append(segment_name) - self.labels[segment_name] = label - - # Append the new data tensor to an attribute that 
holds all the data - if hasattr(self, 'all_data'): - self.all_data.append(data) - else: - self.all_data = [data] - - def extract_segment_names_and_labels(self): - segment_names = [] - labels = {} - - for UID in self.UIDs: - label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") - if os.path.exists(label_file): - print('Debug: this file exists',label_file) - label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) - label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) - for idx, segment_name in enumerate(label_segment_names): - label_val = label_data['label'].values[idx] - if self.read_all_labels: - # Assign -1 if label is not in [0, 1, 2, 3] - labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 - if segment_name not in segment_names: - segment_names.append(segment_name) - else: - # Only add segments with labels in [0, 1, 2, 3] - if label_val in [0, 1, 2, 3] and segment_name not in segment_names: - segment_names.append(segment_name) - labels[segment_name] = label_val - - return segment_names, labels + + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name, 'idx': idx} def load_data(self, segment_name): data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0]) @@ -232,15 +227,30 @@ def load_data(self, segment_name): except Exception as e: print(f"Error processing segment: {segment_name}. Exception: {str(e)}") - return torch.zeros((1, 128, 128)) # Return zeros in case of an error + return torch.zeros((1, img_size, img_size)) # Return zeros in case of an error def standard_scaling(self, data): scaler = StandardScaler() data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) return torch.Tensor(data) -def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', read_all_labels=False, drop_last=False, num_workers=4, start_idx=0): - dataset = CustomDataset(data_path, labels_path, UIDs, standardize, data_format, read_all_labels, start_idx=start_idx) +def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='pt', read_all_labels=False, drop_last=False, num_workers=4,\ + finished_seg_names = []): + # Run the main from the beginning. Load all data into the dataloader. + segment_names, labels = extract_segment_names_and_labels(UIDs,labels_path,read_all_labels=read_all_labels) + if len(finished_seg_names) > 0: + # If any segments have been trained. + remain_labels = remove_finished_segment_names_and_labels(labels,finished_seg_names) + segment_names = list(remain_labels.keys()) + labels = remain_labels.copy() + dataset = CustomDataset(data_path=data_path, \ + labels_path=labels_path, \ + standardize=standardize, \ + data_format=data_format, \ + read_all_labels=read_all_labels, \ + batch_size=batch_size, + segment_names = segment_names, + labels = labels) dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2) return dataloader @@ -261,8 +271,12 @@ def get_data_paths(data_format): labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case" else: - print('ERROR! 
YOUR DID NOT GET THE PATH.') - raise ValueError + print('Debug: You are in Google Colab.') + base_path = '/content' + labels_base_path = '/content/drive/MyDrive/Adjudication_UConn' + saving_base_path = '/content/drive/MyDrive/Checkpoint_Colab' + # print('ERROR! YOUR DID NOT GET THE PATH.') + # raise ValueError if data_format == 'csv': data_path = os.path.join(base_path, "TFS_csv") @@ -278,16 +292,51 @@ def get_data_paths(data_format): saving_path = os.path.join(saving_base_path, "Project_1_analysis") else: raise ValueError("Invalid data format. Choose 'csv' or 'png.") + + # Create the parent path for checkpoints. + Path(saving_path).mkdir(parents=True, exist_ok=True) + return data_path, labels_path, saving_path # Function to extract and preprocess data -def preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size, read_all_labels=False, current_batch_index=0): - start_idx = current_batch_index * batch_size +def preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + read_all_labels=False): + # Extracts paths and loads data into train, validation, and test loaders + data_path, labels_path, saving_path = get_data_paths(data_format) + + train_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_train, \ + batch_size = batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels,\ + finished_seg_names = finished_seg_names) + # Usually the validation set will not need to resume training. + val_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_test, \ + batch_size=batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels, \ + finished_seg_names = []) + return train_loader, val_loader, saving_path + +# Function to extract and preprocess data +def preprocess_data_test(data_format, clinical_trial_unlabeled, batch_size, finished_seg_names,\ + read_all_labels=False): + # Extracts paths and loads data into train, validation, and test loaders data_path, labels_path, saving_path = get_data_paths(data_format) - train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - return train_loader, val_loader, test_loader + test_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_unlabeled, \ + batch_size=batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels,\ + finished_seg_names=finished_seg_names) + return test_loader def map_samples_to_uids(uncertain_sample_indices, dataset): """
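Taken together, the data_loader.py changes implement a resume-aware loading pattern: build the full segment/label menu, drop the segments a previous run already consumed, and wrap the remainder in a Dataset/DataLoader. The self-contained sketch below illustrates that flow with toy tensors and made-up segment names; CustomDataset's file loading, standardization, and label parsing are deliberately omitted.

import torch
from torch.utils.data import Dataset, DataLoader

class ToySegmentDataset(Dataset):
    # Stands in for CustomDataset: returns data, label, segment name, and index,
    # which is what the training loop records as "finished" after each batch.
    def __init__(self, segment_names, labels):
        self.segment_names = segment_names
        self.labels = labels

    def __len__(self):
        return len(self.segment_names)

    def __getitem__(self, idx):
        name = self.segment_names[idx]
        data = torch.zeros(1, 32, 32)            # placeholder for the Poincare image
        return {'data': data, 'label': self.labels[name], 'segment_name': name, 'idx': idx}

def build_loader(labels, finished_seg_names, batch_size=2):
    # finished_seg_names is a list of batches, each a list of segment names,
    # mirroring how train_gp_model accumulates them per batch.
    remain = dict(labels)
    for batch in finished_seg_names:
        for name in batch:
            remain.pop(name, None)
    dataset = ToySegmentDataset(list(remain.keys()), remain)
    # shuffle=False keeps the segment order deterministic across resumes.
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

labels = {f'seg_{i:03d}': i % 4 for i in range(6)}
loader = build_loader(labels, finished_seg_names=[['seg_000', 'seg_001']])
for batch in loader:
    print(batch['segment_name'], batch['label'])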