From 861e7420a147d34802c29fb2d53a2eba8f9ab6ff Mon Sep 17 00:00:00 2001 From: doh16101 Date: Thu, 4 Apr 2024 12:18:20 -0400 Subject: [PATCH] Cassey manually updated the data loader changes. --- .../Colab_example_dataloader_2024_04_04.ipynb | 22 ++ BML_project/models/ss_gp_model.py | 138 ++++++++++- BML_project/ss_main.py | 74 +++++- BML_project/utils_gp/data_loader.py | 219 +++++++++++------- 4 files changed, 348 insertions(+), 105 deletions(-) create mode 100644 BML_project/models/Colab_example_dataloader_2024_04_04.ipynb diff --git a/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb b/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb new file mode 100644 index 0000000..4495514 --- /dev/null +++ b/BML_project/models/Colab_example_dataloader_2024_04_04.ipynb @@ -0,0 +1,22 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# R:\\ENGR_Chon\\Darren\\NIH_Pulsewatch\\Poincare_pt\\128x128\n", + "# Darren created the PT files again (because UID 120 has missing files in the original csv file)\n", + "# I need to prepare for my interview, and I will tar those PT files again and test your code on Colab later." + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/BML_project/models/ss_gp_model.py b/BML_project/models/ss_gp_model.py index e364cec..355a6fd 100644 --- a/BML_project/models/ss_gp_model.py +++ b/BML_project/models/ss_gp_model.py @@ -4,12 +4,15 @@ @author: lrm22005 """ +import os import numpy as np from tqdm import tqdm import torch import gpytorch from sklearn.metrics import precision_recall_fscore_support, roc_auc_score from sklearn.preprocessing import label_binarize +from utils_gp.data_loader import preprocess_data_train_val,preprocess_data_test +import time num_latents = 6 # This should match the complexity of your data or the number of tasks num_tasks = 4 # This should match the number of output classes or tasks @@ -70,11 +73,49 @@ def forward(self, x): return latent_pred -def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'): +def train_gp_model(train_loader, val_loader, batch_size,\ + data_format, clinical_trial_train, clinical_trial_test,\ + clinical_trial_unlabeled,\ + num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt',\ + resume_training=False,\ + datackpt_name = 'dataset_checkpoint.pt',modelckpt_name = 'model_checkpoint_full.pt'): + print(f'Debug: resume_training:{resume_training}, checkpoint_path: {checkpoint_path}') model = MultitaskGPModel().to(device) likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=4, num_classes=4).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.1) mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset)) + + # Load checkpoint if resuming training for gp model. + start_epoch = 0 + flag_reload_dataloader = False # We do not need to reset train loader in the new epoch. 
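+    # Resume bookkeeping: if resume_training is True and a model checkpoint exists, the block
+    # below restores the model/likelihood/optimizer states and the last epoch, and raises
+    # flag_reload_dataloader only when the checkpoint lists segments that were already trained,
+    # so the train loader can be rebuilt without them.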
+ ckpt_model_file = os.path.join(checkpoint_path,modelckpt_name) + if resume_training and os.path.exists(ckpt_model_file): + print(f'Debug: loading ckpt: {ckpt_model_file}') + checkpoint = torch.load(ckpt_model_file) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint.get('epoch', 0) # Resume from the same epoch because you did not finished it. + + # Update the dataloader if there are segments finished. + finished_seg_names = checkpoint['finished_seg_names'] + + if len(finished_seg_names) > 0: + # There were segments used in training. Only update the train loader. + flag_reload_dataloader = True + print('Debug: renewing train_loader now...') + startTime_for_tictoc = time.time() + # ---- Dong, 02/15/2024: I want to test training on large dataset and resume training. ---- + # train_loader,_,_ = preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + # read_all_labels=False) + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=finished_seg_names,\ + read_all_labels=False) + endTime_for_tictoc = time.time() - startTime_for_tictoc + print(f'Debug: took {endTime_for_tictoc} to renew the train_loader') + best_val_loss = float('inf') epochs_no_improve = 0 @@ -86,19 +127,69 @@ def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, pat 'train_loss': [] # Add a list to store training losses } - for epoch in tqdm(range(num_iterations), desc='Training', unit='epoch', leave=False): - for train_batch in train_loader: + for epoch in tqdm(range(start_epoch,num_iterations), desc='Training', unit='epoch', leave=False): + finished_idx = [] + finished_seg_names = [] + for batch_index, train_batch in enumerate(train_loader): + print(f'Debug: now in a new batch of data! {batch_index}/{len(train_loader)}') # train_batch is the image data. model.train() likelihood.train() optimizer.zero_grad() + train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device) # Use reshape here train_y = train_batch['label'].to(device) + # Get finished segment index in dataloader and segment name. 
+ temp_finished_idx = train_batch['idx'] + temp_finished_seg_names = train_batch['segment_name'] + print('Debug: temp_finished_idx:',temp_finished_idx) + print('Debug: temp_finished_segment_name:',temp_finished_seg_names) + finished_idx.append(temp_finished_idx) + finished_seg_names.append(temp_finished_seg_names) output = model(train_x) loss = -mll(output, train_y) metrics['train_loss'].append(loss.item()) # Store the training loss loss.backward() optimizer.step() + save_ckpt_model_path = os.path.join(checkpoint_path,modelckpt_name) + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'best_val_loss': best_val_loss, + 'finished_seg_names':finished_seg_names, + 'finished_idx':finished_idx + # Include other metrics as needed + }, save_ckpt_model_path) + + # Optionally, save the dataset state at intervals or after certain conditions + save_ckpt_dataset_path = os.path.join(checkpoint_path,datackpt_name) + train_loader.dataset.save_checkpoint(save_ckpt_dataset_path) # Here, manage the index as needed + + # import sys + # if epoch == 3 and batch_index == 5: + # sys.exit(f"Debug: Manually stop the program at epoch {epoch} batch {batch_index}.") + + # Reset the finished segments again because we finished one epoch. + finished_idx = [] + finished_seg_names = [] + if flag_reload_dataloader: + print('Debug: reset the train_loader now...') + # Reset the train dataloader now. + startTime_for_tictoc = time.time() + # --- Dong, 02/15/2024: + # train_loader,_,_ = preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + # read_all_labels=False) + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=finished_seg_names,\ + read_all_labels=False) + endTime_for_tictoc = time.time() - startTime_for_tictoc + print(f'Debug: took {endTime_for_tictoc} to reset the train_loader') + flag_reload_dataloader = False # Turn off the flag for reseting train dataloader. 
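For quick reference, here is a condensed, standalone sketch of the per-batch save-and-resume pattern the loop above implements. The nn.Linear model, the Adam optimizer, and the file name are placeholders standing in for MultitaskGPModel, its optimizer, and the real checkpoint path; only the checkpoint layout (epoch, state dicts, finished segment names) mirrors the code above.

import os
import torch
import torch.nn as nn

def save_resume_checkpoint(path, epoch, model, optimizer, finished_seg_names):
    # One file carries both the weights and the bookkeeping needed to skip
    # already-trained segments after an interruption.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'finished_seg_names': finished_seg_names,
    }, path)

def load_resume_checkpoint(path, model, optimizer):
    # Restores the states in place and returns the epoch to restart from plus
    # the list of per-batch segment-name lists already consumed.
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint.get('epoch', 0), checkpoint.get('finished_seg_names', [])

if __name__ == '__main__':
    model = nn.Linear(4, 2)                                   # stand-in for the GP model
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    ckpt_path = 'resume_demo.pt'                              # hypothetical file name
    save_resume_checkpoint(ckpt_path, epoch=3, model=model, optimizer=optimizer,
                           finished_seg_names=[['seg_001', 'seg_002']])
    start_epoch, finished = load_resume_checkpoint(ckpt_path, model, optimizer)
    print(start_epoch, finished)
    os.remove(ckpt_path)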
+ # Stochastic validation model.eval() likelihood.eval() @@ -131,19 +222,46 @@ def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, pat if val_loss < best_val_loss: best_val_loss = val_loss epochs_no_improve = 0 - torch.save({'model_state_dict': model.state_dict(), - 'likelihood_state_dict': likelihood.state_dict(), - 'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path) + # torch.save({'model_state_dict': model.state_dict(), + # 'likelihood_state_dict': likelihood.state_dict(), + # 'optimizer_state_dict': optimizer.state_dict(), + # 'train_loader':train_loader, + # 'val_loader':val_loader + # }, checkpoint_path) else: epochs_no_improve += 1 if epochs_no_improve >= patience: print(f"Early stopping triggered at epoch {epoch+1}") break + + # Save checkpoint at the end of each epoch + save_ckpt_model_path = os.path.join(checkpoint_path,modelckpt_name) + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'likelihood_state_dict': likelihood.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'best_val_loss': best_val_loss, + 'finished_seg_names':finished_seg_names, + 'finished_idx':finished_idx + # Include other metrics as needed + }, save_ckpt_model_path) + print('Debug: saved model checkpoint with epoch.',save_ckpt_model_path) + + # Optionally, save the dataset state at intervals or after certain conditions + save_ckpt_dataset_path = os.path.join(checkpoint_path,datackpt_name) + train_loader.dataset.save_checkpoint(save_ckpt_dataset_path) # Finished all batches, so start from zero. + + if epochs_no_improve >= patience: + print(f"Early stopping triggered at epoch {epoch+1}") + break - checkpoint = torch.load(checkpoint_path) - model.load_state_dict(checkpoint['model_state_dict']) - likelihood.load_state_dict(checkpoint['likelihood_state_dict']) - optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + # Optionally, load the best model at the end of training + if os.path.exists(checkpoint_path): + checkpoint = torch.load(checkpoint_path) + model.load_state_dict(checkpoint['model_state_dict']) + likelihood.load_state_dict(checkpoint['likelihood_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) return model, likelihood, metrics diff --git a/BML_project/ss_main.py b/BML_project/ss_main.py index bf34fbf..326f80f 100644 --- a/BML_project/ss_main.py +++ b/BML_project/ss_main.py @@ -6,13 +6,18 @@ """ from tqdm import tqdm import torch -from utils_gp.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples +from utils_gp.data_loader import preprocess_data_train_val, split_uids, update_train_loader_with_uncertain_samples, preprocess_data_test from models.ss_gp_model import MultitaskGPModel, train_gp_model from utils_gp.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data from active_learning.ss_active_learning import stochastic_uncertainty_sampling, run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions from utils_gp.visualization import plot_comparative_results, plot_training_performance, plot_results import os import pickle +from datetime import datetime +now = datetime.now() # Get the time now for model checkpoint saving. + +dt_string = now.strftime("%Y_%m_%d_%H_%M_%S") # YYYY_mm_dd_HH_MM_SS, for model saving. 
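As an aside, the patience-based early stopping that train_gp_model (in ss_gp_model.py above) keeps using can be boiled down to the following self-contained sketch; the validation-loss sequence here is invented purely for illustration.

def run_early_stopping_demo(val_losses, patience=10):
    # Mirrors the best_val_loss / epochs_no_improve / patience logic above.
    best_val_loss = float('inf')
    epochs_no_improve = 0
    for epoch, val_loss in enumerate(val_losses):
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0          # reset the counter on any improvement
        else:
            epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break
    return best_val_loss

print(run_early_stopping_demo([1.0, 0.8, 0.9, 0.85, 0.95], patience=2))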
+print("The date and time suffix of the model file is", dt_string) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -48,8 +53,29 @@ def main(): clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled = split_uids() data_format = 'pt' # Preprocess data - train_loader, val_loader, test_loader, saving_path = preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size) + # ---- Dong, 02/15/2024: I want to test loading large amount dataset. ---- + # train_loader, val_loader, saving_path = preprocess_data_train_val(data_format = data_format, \ + _, val_loader, saving_path = preprocess_data_train_val(data_format = data_format, \ + clinical_trial_train=clinical_trial_train, \ + clinical_trial_test=clinical_trial_test, \ + batch_size=batch_size,\ + finished_seg_names = [],\ + read_all_labels=False) + # ---- Dong, 02/15/2024: I want to test loading large amount dataset. ---- + # test_loader = preprocess_data_test(data_format = data_format, \ + train_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=[],\ + read_all_labels=False) + + menu_segment_names = train_loader.dataset.segment_names # All the segments to be run in the training dataset. + menu_labels = train_loader.dataset.labels # All the ground truth labels + print('Debug: len(menu_segment_names)',len(menu_segment_names)) + print('Debug: len(menu_labels)',len(menu_labels)) + print('Debug: len(train_loader)',len(train_loader)) + print('Debug: dir(train_loader.dataset)',dir(train_loader.dataset)) kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device) @@ -61,7 +87,21 @@ def main(): } # Initial model training - model, likelihood, training_metrics = train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=n_classes) + model, likelihood, training_metrics = train_gp_model( + train_loader = train_loader, + val_loader = val_loader, + num_iterations=50, + n_classes=n_classes, + patience=10, + checkpoint_path=saving_path, + resume_training=True, + datackpt_name = 'dataset_checkpoint.pt', + modelckpt_name = 'model_checkpoint_full.pt', + batch_size=batch_size, + data_format = data_format, + clinical_trial_train = clinical_trial_train, + clinical_trial_test = clinical_trial_test, + clinical_trial_unlabeled=clinical_trial_unlabeled) # Dong: remember to change this function in its code. # Save the training metrics for future visualization results['train_loss'].extend(training_metrics['train_loss']) @@ -77,6 +117,7 @@ def main(): # Attempt to load a training checkpoint train_checkpoint = checkpoint_manager.load_checkpoint('train') start_iteration = train_checkpoint['iteration'] if train_checkpoint else 0 + print('Debug: start_iteration is:',start_iteration) # Dong, 01/25/2024: save it first before entering the active learning. 
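The checkpoint_manager used just above is existing project code that this patch does not show; the stand-in below (with a hypothetical directory and file-naming scheme) only illustrates how saving and reloading the active-learning iteration index lets start_iteration resume where a previous run stopped.

import os
import torch

class SimpleCheckpointManager:
    # Stand-in for the project's checkpoint_manager; not its real implementation.
    def __init__(self, directory):
        self.directory = directory
        os.makedirs(directory, exist_ok=True)

    def save_checkpoint(self, tag, iteration, state):
        torch.save({'iteration': iteration, **state},
                   os.path.join(self.directory, f'{tag}_checkpoint.pt'))

    def load_checkpoint(self, tag):
        path = os.path.join(self.directory, f'{tag}_checkpoint.pt')
        return torch.load(path) if os.path.exists(path) else None

manager = SimpleCheckpointManager('./al_checkpoints')        # hypothetical directory
ckpt = manager.load_checkpoint('train')
start_iteration = ckpt['iteration'] if ckpt else 0           # same resume rule as in main()
manager.save_checkpoint('train', start_iteration, {'note': 'saved before active learning'})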
additional_state = { 'model_state': model.state_dict(), @@ -91,15 +132,20 @@ def main(): active_learning_iterations = 10 # Active Learning Iterations for iteration in tqdm(range(start_iteration,active_learning_iterations), desc='Active Learning', unit='iteration', leave=True): + print(f"Active Learning Iteration: {iteration+1}/{active_learning_iterations}") # Perform uncertainty sampling to select new samples from the validation set - uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=batch_size, n_batches=5) - + uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=50, n_batches=5, device=device) + labeled_samples = label_samples(uncertain_sample_indices, val_loader.dataset) # Update the training loader with uncertain samples - train_loader = update_train_loader_with_uncertain_samples(train_loader, uncertain_sample_indices, batch_size) - print(f"Updated training data size: {len(train_loader.dataset)}") + train_loader = update_train_loader_with_uncertain_samples(train_loader, labeled_samples, batch_size) + + # Optionally, save the dataset state at intervals or after certain conditions + train_loader.dataset.save_checkpoint(dataset_checkpoint_path) # Here, manage the index as needed # Re-train the model with the updated training data - model, likelihood, val_metrics = train_gp_model(train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, checkpoint_path='model_checkpoint_last.pt') + model, likelihood, val_metrics = train_gp_model( + train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, + checkpoint_path=saving_path, resume_training=True, batch_size=batch_size) # Store the validation metrics after each active learning iteration results['validation_metrics']['precision'].append(val_metrics['precision']) @@ -123,13 +169,21 @@ def main(): plot_comparative_results(gp_vs_kmeans_data, original_labels) # Final evaluation on test set + import subprocess + print('Start to run bash script!') + subprocess.call("./BML_project/untar_unlabeled_PT.sh") + print('End to run bash script!') + + test_loader = preprocess_data_test(data_format = data_format, \ + clinical_trial_unlabeled=clinical_trial_unlabeled, \ + batch_size=batch_size,\ + finished_seg_names=[],\ + read_all_labels=False) test_metrics = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes) test_kmeans_model = run_minibatch_kmeans(test_loader, n_clusters=n_classes, device=device) results['test_metrics'] = test_metrics test_gp_vs_kmeans_data, test_original_labels = stochastic_compare_kmeans_gp_predictions(test_kmeans_model, model, test_loader, n_batches=5, device=device) - - print(f"Length of original_labels: {len(original_labels)}, Length of gp_predictions: {len(gp_predictions)}") plot_comparative_results(test_gp_vs_kmeans_data, test_original_labels) # Visualization of results diff --git a/BML_project/utils_gp/data_loader.py b/BML_project/utils_gp/data_loader.py index e0a6823..bd22a79 100644 --- a/BML_project/utils_gp/data_loader.py +++ b/BML_project/utils_gp/data_loader.py @@ -5,6 +5,8 @@ @author: lrm22005 """ import os +# For saving checkpoints +from pathlib import Path import numpy as np import pandas as pd from PIL import Image @@ -13,6 +15,12 @@ from sklearn.preprocessing import StandardScaler from torchvision.transforms import ToTensor import socket +# Downsampling image +import cv2 +# import torchvision.transforms as T +# transform for rectangular resize +img_size = 32 # 
Dong, 01/30/2024: this is for testing the CIFAR10 models. +# transform = T.Resize((img_size,img_size)) def split_uids(): # ====== Load the per subject arrythmia summary ====== @@ -26,6 +34,8 @@ def split_uids(): df_summary = pd.read_csv(r'R:\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv') elif your_computer_name == 'Luis_computer_name': df_summary = pd.read_csv(r'\\grove.ad.uconn.edu\research\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv') + else: + df_summary = pd.read_csv(r'/content/drive/MyDrive/Adjudication_UConn/final_attemp_4_1_Dong_Ohm_summary_20231025.csv') df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3) df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT'] @@ -103,111 +113,96 @@ def split_uids(): # clinical_trial_unlabeled = clinical_trial_unlabeled[0:4] return clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled +def extract_segment_names_and_labels(UIDs,labels_path,read_all_labels=False): + # Extract all segment names and labels when starting the main function. + # Output: + # segment_names: list of string. + # labels: dictionary, with segment_names as key and label as value. + segment_names = [] + labels = {} + + for UID in UIDs: + label_file = os.path.join(labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + print('Debug: this file exists',label_file) + label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) + for idx, segment_name in enumerate(label_segment_names): + label_val = label_data['label'].values[idx] + if read_all_labels: + # Assign -1 if label is not in [0, 1, 2, 3] + labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2, 3] + if label_val in [0, 1, 2, 3] and segment_name not in segment_names: + segment_names.append(segment_name) + labels[segment_name] = label_val + print('>>> Number of segments in this dataloader:',len(segment_names)) # Dong, 01/29/2024: know the number of segments before running training epochs. + print('>>> Number of labels in this dataloader:',len(labels)) + return segment_names, labels + +def remove_finished_segment_names_and_labels(labels,finished_seg_names): + # From extract_segment_names_and_labels: + # Input: + # labels: dictionary, with segment_names as key and label as value. + # finished_seg_names: list of string. 
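+    #                        Each entry is one finished batch (itself a list of segment
+    #                        names, as collected per batch in train_gp_model), hence the
+    #                        nested loop below.
+    # Output:
+    #   remain_labels: copy of labels with all finished segments removed, used to rebuild
+    #                  a train DataLoader that only iterates over unfinished segments.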
+ remain_labels = labels.copy() + print('Debug: type(remain_labels)',type(remain_labels)) + for batch in finished_seg_names: + for key in batch: + remain_labels.pop(key) + print('Debug: len(labels)',len(labels)) + print('Debug: len(remain_labels)',len(remain_labels)) + + return remain_labels + class CustomDataset(Dataset): - def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False, start_idx=0): + def __init__(self, data_path, labels_path, batch_size,segment_names,labels, standardize=True, data_format='csv', read_all_labels=False): self.data_path = data_path self.labels_path = labels_path - self.UIDs = UIDs self.standardize = standardize self.data_format = data_format self.read_all_labels = read_all_labels self.transforms = ToTensor() - self.start_idx = start_idx # Initial batch index to start from, useful for resuming training - self.refresh_dataset() - - # Initialize the current batch index to None, this could be used if you want to track batch progress within the dataset itself - self.current_batch_index = None + self.segment_names = segment_names + self.labels = labels - def refresh_dataset(self): - self.segment_names, self.labels = self.extract_segment_names_and_labels() - - def add_uids(self, new_uids): - unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] - self.UIDs.extend(unique_new_uids) - self.refresh_dataset() + # Initialize the current batch index to None + self.batch_size = batch_size def __len__(self): return len(self.segment_names) def save_checkpoint(self, checkpoint_path): - # Enhanced to automatically include 'start_idx' in the checkpoint checkpoint = { 'segment_names': self.segment_names, - 'labels': self.labels, - 'UIDs': self.UIDs, - 'start_idx': self.start_idx # Now also saving start_idx + 'labels': self.labels + # Save the current batch index if provided } torch.save(checkpoint, checkpoint_path) def load_checkpoint(self, checkpoint_path): checkpoint = torch.load(checkpoint_path) + print('Debug: loaded dataset checkpoint!',checkpoint_path) self.segment_names = checkpoint['segment_names'] self.labels = checkpoint['labels'] - self.UIDs = checkpoint['UIDs'] - # Now also loading and setting start_idx from checkpoint - self.start_idx = checkpoint.get('start_idx', 0) self.refresh_dataset() + # Load the current batch index if it exists in the checkpoint def __getitem__(self, idx): - actual_idx = (idx + self.start_idx) % len(self.segment_names) # Adjust index based on start_idx and wrap around if needed - segment_name = self.segment_names[actual_idx] + segment_name = self.segment_names[idx] label = self.labels[segment_name] - if hasattr(self, 'all_data') and actual_idx < len(self.all_data): - time_freq_tensor = self.all_data[actual_idx] + if hasattr(self, 'all_data') and idx < len(self.all_data): + time_freq_tensor = self.all_data[idx] else: time_freq_tensor = self.load_data(segment_name) - return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} - def set_current_batch_index(self, index): - self.current_batch_index = index - - def get_current_batch_index(self): - return self.current_batch_index - - def set_start_idx(self, index): - self.start_idx = index - - def add_data_label_pair(self, data, label): - # Assign a unique ID or name for the new data - new_id = len(self.segment_names) - segment_name = f"new_data_{new_id}" - - # Append the new data and label - self.segment_names.append(segment_name) - self.labels[segment_name] = label - - # Append the new data tensor to an attribute that 
holds all the data - if hasattr(self, 'all_data'): - self.all_data.append(data) - else: - self.all_data = [data] - - def extract_segment_names_and_labels(self): - segment_names = [] - labels = {} - - for UID in self.UIDs: - label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") - if os.path.exists(label_file): - print('Debug: this file exists',label_file) - label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) - label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) - for idx, segment_name in enumerate(label_segment_names): - label_val = label_data['label'].values[idx] - if self.read_all_labels: - # Assign -1 if label is not in [0, 1, 2, 3] - labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 - if segment_name not in segment_names: - segment_names.append(segment_name) - else: - # Only add segments with labels in [0, 1, 2, 3] - if label_val in [0, 1, 2, 3] and segment_name not in segment_names: - segment_names.append(segment_name) - labels[segment_name] = label_val - - return segment_names, labels + + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name, 'idx': idx} def load_data(self, segment_name): data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0]) @@ -232,15 +227,30 @@ def load_data(self, segment_name): except Exception as e: print(f"Error processing segment: {segment_name}. Exception: {str(e)}") - return torch.zeros((1, 128, 128)) # Return zeros in case of an error + return torch.zeros((1, img_size, img_size)) # Return zeros in case of an error def standard_scaling(self, data): scaler = StandardScaler() data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) return torch.Tensor(data) -def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', read_all_labels=False, drop_last=False, num_workers=4, start_idx=0): - dataset = CustomDataset(data_path, labels_path, UIDs, standardize, data_format, read_all_labels, start_idx=start_idx) +def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='pt', read_all_labels=False, drop_last=False, num_workers=4,\ + finished_seg_names = []): + # Run the main from the beginning. Load all data into the dataloader. + segment_names, labels = extract_segment_names_and_labels(UIDs,labels_path,read_all_labels=read_all_labels) + if len(finished_seg_names) > 0: + # If any segments have been trained. + remain_labels = remove_finished_segment_names_and_labels(labels,finished_seg_names) + segment_names = list(remain_labels.keys()) + labels = remain_labels.copy() + dataset = CustomDataset(data_path=data_path, \ + labels_path=labels_path, \ + standardize=standardize, \ + data_format=data_format, \ + read_all_labels=read_all_labels, \ + batch_size=batch_size, + segment_names = segment_names, + labels = labels) dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2) return dataloader @@ -261,8 +271,12 @@ def get_data_paths(data_format): labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case" else: - print('ERROR! 
YOUR DID NOT GET THE PATH.') - raise ValueError + print('Debug: You are in Google Colab.') + base_path = '/content' + labels_base_path = '/content/drive/MyDrive/Adjudication_UConn' + saving_base_path = '/content/drive/MyDrive/Checkpoint_Colab' + # print('ERROR! YOUR DID NOT GET THE PATH.') + # raise ValueError if data_format == 'csv': data_path = os.path.join(base_path, "TFS_csv") @@ -278,16 +292,51 @@ def get_data_paths(data_format): saving_path = os.path.join(saving_base_path, "Project_1_analysis") else: raise ValueError("Invalid data format. Choose 'csv' or 'png.") + + # Create the parent path for checkpoints. + Path(saving_path).mkdir(parents=True, exist_ok=True) + return data_path, labels_path, saving_path # Function to extract and preprocess data -def preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size, read_all_labels=False, current_batch_index=0): - start_idx = current_batch_index * batch_size +def preprocess_data_train_val(data_format, clinical_trial_train, clinical_trial_test, batch_size, finished_seg_names,\ + read_all_labels=False): + # Extracts paths and loads data into train, validation, and test loaders + data_path, labels_path, saving_path = get_data_paths(data_format) + + train_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_train, \ + batch_size = batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels,\ + finished_seg_names = finished_seg_names) + # Usually the validation set will not need to resume training. + val_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_test, \ + batch_size=batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels, \ + finished_seg_names = []) + return train_loader, val_loader, saving_path + +# Function to extract and preprocess data +def preprocess_data_test(data_format, clinical_trial_unlabeled, batch_size, finished_seg_names,\ + read_all_labels=False): + # Extracts paths and loads data into train, validation, and test loaders data_path, labels_path, saving_path = get_data_paths(data_format) - train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels, start_idx=start_idx) - return train_loader, val_loader, test_loader + test_loader = load_data_split_batched(data_path=data_path, \ + labels_path=labels_path, \ + UIDs=clinical_trial_unlabeled, \ + batch_size=batch_size, \ + standardize=True, \ + data_format=data_format, \ + read_all_labels=read_all_labels,\ + finished_seg_names=finished_seg_names) + return test_loader def map_samples_to_uids(uncertain_sample_indices, dataset): """
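Taken together, the data_loader.py changes implement a resume-aware loading pattern: build the full segment/label menu, drop the segments a previous run already consumed, and wrap the remainder in a Dataset/DataLoader. The self-contained sketch below illustrates that flow with toy tensors and made-up segment names; CustomDataset's file loading, standardization, and label parsing are deliberately omitted.

import torch
from torch.utils.data import Dataset, DataLoader

class ToySegmentDataset(Dataset):
    # Stands in for CustomDataset: returns data, label, segment name, and index,
    # which is what the training loop records as "finished" after each batch.
    def __init__(self, segment_names, labels):
        self.segment_names = segment_names
        self.labels = labels

    def __len__(self):
        return len(self.segment_names)

    def __getitem__(self, idx):
        name = self.segment_names[idx]
        data = torch.zeros(1, 32, 32)            # placeholder for the Poincare image
        return {'data': data, 'label': self.labels[name], 'segment_name': name, 'idx': idx}

def build_loader(labels, finished_seg_names, batch_size=2):
    # finished_seg_names is a list of batches, each a list of segment names,
    # mirroring how train_gp_model accumulates them per batch.
    remain = dict(labels)
    for batch in finished_seg_names:
        for name in batch:
            remain.pop(name, None)
    dataset = ToySegmentDataset(list(remain.keys()), remain)
    # shuffle=False keeps the segment order deterministic across resumes.
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

labels = {f'seg_{i:03d}': i % 4 for i in range(6)}
loader = build_loader(labels, finished_seg_names=[['seg_000', 'seg_001']])
for batch in loader:
    print(batch['segment_name'], batch['label'])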