diff --git a/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc b/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc new file mode 100644 index 0000000..45c9875 Binary files /dev/null and b/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc differ diff --git a/BML_project/active_learning/ss_active_learning.py b/BML_project/active_learning/ss_active_learning.py new file mode 100644 index 0000000..546c4ad --- /dev/null +++ b/BML_project/active_learning/ss_active_learning.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Dec 18 18:23:23 2023 + +@author: lrm22005 +""" +import numpy as np +import random +import torch +from torch.utils.data import DataLoader +from sklearn.cluster import MiniBatchKMeans + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def label_samples(uncertain_samples, validation_data): + labels = [validation_data[sample_id]['label'] for sample_id in uncertain_samples] + return uncertain_samples, labels + +def stochastic_uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_batches, n_components=2): + gp_model.eval() + gp_likelihood.eval() + uncertain_sample_indices = [] + sampled_batches = random.sample(list(val_loader), n_batches) # Randomly sample n_batches from val_loader + + with torch.no_grad(): + for batch in sampled_batches: + # reduced_data = apply_tsne(batch['data'].reshape(batch['data'].size(0), -1), n_components=n_components) + # reduced_data_tensor = torch.Tensor(reduced_data).to(device) + reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device) + predictions = gp_likelihood(gp_model(reduced_data_tensor)) + var = predictions.variance + top_indices = torch.argsort(-var.flatten())[:n_samples] + uncertain_sample_indices.extend(top_indices.cpu().numpy()) + + return uncertain_sample_indices[:n_samples] + +# def uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_components=2): +# gp_model.eval() +# gp_likelihood.eval() +# uncertain_sample_indices = [] +# with torch.no_grad(): +# for batch_idx, batch in tqdm(enumerate(val_loader), desc='Uncertainty Sampling', unit='batch'): +# reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device) +# predictions = gp_likelihood(gp_model(reduced_data_tensor)) +# var = predictions.variance +# top_indices = torch.argsort(-var.flatten())[:n_samples] +# batch_uncertain_indices = [batch_idx * val_loader.batch_size + idx for idx in top_indices] +# uncertain_sample_indices.extend(batch_uncertain_indices) +# return uncertain_sample_indices[:n_samples] + +def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100): + # Initialize MiniBatchKMeans + minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=batch_size) + + # Iterate through data_loader and fit MiniBatchKMeans + for batch in data_loader: + data = batch['data'].view(batch['data'].size(0), -1).to(device).cpu().numpy() + minibatch_kmeans.partial_fit(data) + + return minibatch_kmeans + +# def compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, device): +# # Compare K-Means with GP model predictions +# all_data, all_labels = [], [] +# for batch in data_loader: +# data = batch['data'].view(batch['data'].size(0), -1).to(device) +# labels = batch['label'].to(device) +# gp_predictions = gp_model(data).mean.argmax(dim=0).cpu().numpy() +# kmeans_predictions = kmeans_model.predict(data.cpu().numpy()) +# all_labels.append(labels.cpu().numpy()) +# 
all_data.append((gp_predictions, kmeans_predictions))
+#     return all_data, np.concatenate(all_labels)
+
+def stochastic_compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, n_batches, device):
+    all_data, all_labels = [], []
+    sampled_batches = random.sample(list(data_loader), n_batches) # Randomly sample n_batches from data_loader
+
+    for batch in sampled_batches:
+        data = batch['data'].view(batch['data'].size(0), -1).to(device)
+        labels = batch['label'].to(device)
+        gp_predictions = gp_model(data).mean.argmax(dim=-1).cpu().numpy()  # argmax over the class dimension, giving one label per sample
+        kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
+        all_labels.append(labels.cpu().numpy())
+        all_data.append((gp_predictions, kmeans_predictions))
+
+    return all_data, np.concatenate(all_labels)
+
+import random
+
+def refined_uncertainty_sampling(gp_model, gp_likelihood, kmeans_model, data_loader, n_samples, n_batches, uncertainty_threshold=0.2):
+    gp_model.eval()
+    gp_likelihood.eval()
+    uncertain_sample_indices = []
+
+    # Calculate the total number of batches in the DataLoader
+    total_batches = len(data_loader)
+
+    # Ensure that n_batches does not exceed total_batches
+    n_batches = min(n_batches, total_batches)
+
+    # Randomly sample n_batches from data_loader
+    sampled_batches = random.sample(list(data_loader), n_batches)
+
+    with torch.no_grad():
+        for batch in sampled_batches:
+            data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
+            gp_predictions = gp_likelihood(gp_model(data_tensor))
+            kmeans_predictions = kmeans_model.predict(data_tensor.cpu().numpy())
+
+            # Calculate the difference between K-means and GP predictions
+            disagreement = (gp_predictions.mean.argmax(dim=-1).cpu().numpy() != kmeans_predictions).astype(int)
+
+            # Calculate uncertainty based on variance of GP predictions
+            uncertainty = gp_predictions.variance.cpu().numpy()
+
+            # Select samples where the disagreement is high and the model is uncertain
+            uncertain_indices = np.where((disagreement > 0) & (uncertainty > uncertainty_threshold))[0]
+            uncertain_sample_indices.extend(uncertain_indices)
+
+    return uncertain_sample_indices[:n_samples]
diff --git a/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc b/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc
new file mode 100644
index 0000000..7ea0e5a
Binary files /dev/null and b/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc differ
diff --git a/BML_project/models/ss_gp_model.py b/BML_project/models/ss_gp_model.py
new file mode 100644
index 0000000..c18f06f
--- /dev/null
+++ b/BML_project/models/ss_gp_model.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:01:41 2023
+
+@author: lrm22005
+"""
+import numpy as np
+from tqdm import tqdm
+import torch
+import gpytorch
+from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
+from sklearn.preprocessing import label_binarize
+
+num_latents = 6 # This should match the complexity of your data or the number of tasks
+num_tasks = 4 # This should match the number of output classes or tasks
+num_inducing_points = 50 # This is independent and should be sufficient for the input space
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+class MultitaskGPModel(gpytorch.models.ApproximateGP):
+    def __init__(self):
+        # Let's use a different set of inducing points for each latent function
+        inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images, so the inducing points must also have 128 * 128 features
+
+        # We have to mark the CholeskyVariationalDistribution as batch
+        # so 
that we learn a variational distribution for each task + variational_distribution = gpytorch.variational.CholeskyVariationalDistribution( + inducing_points.size(-2), batch_shape=torch.Size([num_latents]) + ) + + # We have to wrap the VariationalStrategy in a LMCVariationalStrategy + # so that the output will be a MultitaskMultivariateNormal rather than a batch output + variational_strategy = gpytorch.variational.LMCVariationalStrategy( + gpytorch.variational.VariationalStrategy( + self, inducing_points, variational_distribution, learn_inducing_locations=True + ), + num_tasks=num_tasks, + num_latents=num_latents, + latent_dim=-1 + ) + + super().__init__(variational_strategy) + + # The mean and covariance modules should be marked as batch + # so we learn a different set of hyperparameters + self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents])) + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])), + batch_shape=torch.Size([num_latents]) + ) + + def forward(self, x): + # The forward function should be written as if we were dealing with each output + # dimension in batch + # Ensure x is correctly shaped. It should have the same last dimension size as inducing_points + # x should be reshaped or sliced to have the shape [?, 1] where ? can be any size + # For example, if x originally has shape [N, D], and D != 1, you need to modify x accordingly + # print(f"Input shape: {x.shape}") + # x = x.view(x.size(0), -1) # Flattening the images + # print(f"Input shape after flattening: {x.shape}") # Debugging input shape + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + + # Debugging: Print shapes of intermediate outputs + # print(f"Mean shape: {mean_x.shape}, Covariance shape: {covar_x.shape}") + latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + # print(f"Latent prediction shape: {latent_pred.mean.shape}, {latent_pred.covariance_matrix.shape}") + + return latent_pred + + +def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'): + model = MultitaskGPModel().to(device) + likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=4, num_classes=4).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset)) + best_val_loss = float('inf') + epochs_no_improve = 0 + + metrics = { + 'precision': [], + 'recall': [], + 'f1_score': [], + 'auc_roc': [], + 'train_loss': [] # Add a list to store training losses + } + + for epoch in tqdm(range(num_iterations), desc='Training', unit='epoch', leave=False): + for train_batch in train_loader: + model.train() + likelihood.train() + optimizer.zero_grad() + train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device) # Use reshape here + train_y = train_batch['label'].to(device) + output = model(train_x) + loss = -mll(output, train_y) + metrics['train_loss'].append(loss.item()) # Store the training loss + loss.backward() + optimizer.step() + + # Stochastic validation + model.eval() + likelihood.eval() + with torch.no_grad(): + val_indices = torch.randperm(len(val_loader.dataset))[:int(1 * len(val_loader.dataset))] + val_loss = 0.0 + val_labels = [] + val_predictions = [] + for idx in val_indices: + val_batch = val_loader.dataset[idx] + val_x = val_batch['data'].reshape(-1).unsqueeze(0).to(device) # Use reshape here + val_y = 
torch.tensor([val_batch['label']], device=device)
+                val_output = model(val_x)
+                val_loss_batch = -mll(val_output, val_y).sum()
+                val_loss += val_loss_batch.item()
+                val_labels.append(val_y.item())
+                val_predictions.append(val_output.mean.argmax(dim=-1).item())
+
+            precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='macro')
+            # auc_roc = roc_auc_score(label_binarize(val_labels, classes=np.arange(n_classes)),
+            #                         label_binarize(val_predictions, classes=np.arange(n_classes)),
+            #                         multi_class='ovr')
+
+            metrics['precision'].append(precision)
+            metrics['recall'].append(recall)
+            metrics['f1_score'].append(f1)
+            # metrics['auc_roc'].append(auc_roc)
+            val_loss /= len(val_indices)
+
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+            epochs_no_improve = 0
+            torch.save({'model_state_dict': model.state_dict(),
+                        'likelihood_state_dict': likelihood.state_dict(),
+                        'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
+        else:
+            epochs_no_improve += 1
+            if epochs_no_improve >= patience:
+                print(f"Early stopping triggered at epoch {epoch+1}")
+                break
+
+    checkpoint = torch.load(checkpoint_path)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    likelihood.load_state_dict(checkpoint['likelihood_state_dict'])
+    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+
+    return model, likelihood, metrics
+
+def semi_supervised_labeling(kmeans_model, gp_model, gp_likelihood, data_loader, confidence_threshold=0.8):
+    gp_model.eval()
+    gp_likelihood.eval()
+    labeled_samples = []
+
+    with torch.no_grad():
+        for batch in data_loader:
+            data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
+            kmeans_predictions = kmeans_model.predict(data_tensor.cpu().numpy())
+            gp_predictions = gp_likelihood(gp_model(data_tensor))
+
+            # Use GP predictions where the model is confident. GPyTorch predictive
+            # distributions expose no .confidence() method, so the top class
+            # probability of the predictive mean serves as the confidence score.
+            confident_indices = gp_predictions.mean.max(dim=-1).values.cpu().numpy() > confidence_threshold
+            for i, confident in enumerate(confident_indices):
+                if confident:
+                    labeled_samples.append((data_tensor[i], gp_predictions.mean.argmax(dim=-1)[i].item()))
+                else:
+                    labeled_samples.append((data_tensor[i], kmeans_predictions[i]))
+
+    return labeled_samples
+
+def calculate_elbo(model, likelihood, data_loader):
+    """
+    Calculates the ELBO (Evidence Lower Bound) score for the model on the given data.
+
+    Args:
+    - model: The trained Gaussian Process model.
+    - likelihood: The likelihood associated with the GP model.
+    - data_loader: DataLoader providing the data over which to calculate ELBO.
+
+    Returns:
+    - elbo_score: The calculated ELBO score.
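+
+    Example (illustrative; names as used in ss_main_ss.py):
+        >>> current_elbo = calculate_elbo(model, likelihood, train_loader)
+        >>> use_threshold_labeling = current_elbo >= elbo_threshold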
+ """ + model.eval() + likelihood.eval() + mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(data_loader.dataset)) + + with torch.no_grad(): + elbo_score = 0.0 + for batch in data_loader: + train_x = batch['data'].reshape(batch['data'].size(0), -1).to(device) + train_y = batch['label'].to(device) + output = model(train_x) + # Calculate the ELBO as the negative loss + elbo_score += -mll(output, train_y).sum().item() + + # Average the ELBO over all data samples + elbo_score /= len(data_loader.dataset) + + return elbo_score diff --git a/BML_project/ss_main.py b/BML_project/ss_main.py new file mode 100644 index 0000000..a610684 --- /dev/null +++ b/BML_project/ss_main.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Dec 18 18:47:27 2023 + +@author: lrm22005 +""" +import tqdm +import torch +from utils.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples +from models.ss_gp_model import MultitaskGPModel, train_gp_model +from utils_gp.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data +from active_learning.ss_active_learning import stochastic_uncertainty_sampling, run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions +from utils.visualization import plot_comparative_results, plot_training_performance, plot_results + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def main(): + # Set parameters like n_classes, batch_size, etc. + n_classes = 4 + batch_size = 1024 + clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled = split_uids() + data_format = 'pt' + # Preprocess data + train_loader, val_loader, test_loader = preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size) + + kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device) + + # Initialize result storage + results = { + 'train_loss': [], + 'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []}, + 'test_metrics': None + } + + # Initial model training + model, likelihood, training_metrics = train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=n_classes) + + # Save the training metrics for future visualization + results['train_loss'].extend(training_metrics['train_loss']) + results['validation_metrics']['precision'].extend(training_metrics['precision']) + results['validation_metrics']['recall'].extend(training_metrics['recall']) + results['validation_metrics']['f1'].extend(training_metrics['f1_score']) + # results['validation_metrics']['auc_roc'].extend(training_metrics['auc_roc']) + + active_learning_iterations = 10 + # Active Learning Iterations + for iteration in tqdm(range(active_learning_iterations), desc='Active Learning', unit='iteration', leave=True): + # Perform uncertainty sampling to select new samples from the validation set + uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=batch_size, n_batches=5) + + # Update the training loader with uncertain samples + train_loader = update_train_loader_with_uncertain_samples(train_loader, uncertain_sample_indices, batch_size) + + # Re-train the model with the updated training data + model, likelihood, val_metrics = train_gp_model(train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, checkpoint_path='model_checkpoint_last.pt') + + # Store the validation metrics after each active learning iteration + 
+        results['validation_metrics']['precision'].extend(val_metrics['precision'])
+        results['validation_metrics']['recall'].extend(val_metrics['recall'])
+        results['validation_metrics']['f1'].extend(val_metrics['f1_score'])  # train_gp_model reports this metric under 'f1_score'
+        # results['validation_metrics']['auc_roc'].extend(val_metrics['auc_roc'])
+
+        # Compare K-Means with GP model predictions after retraining
+        gp_vs_kmeans_data, original_labels = stochastic_compare_kmeans_gp_predictions(kmeans_model, model, train_loader, n_batches=5, device=device)
+
+        plot_comparative_results(gp_vs_kmeans_data, original_labels)
+
+    # Final evaluation on test set
+    test_metrics = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes)
+    test_kmeans_model = run_minibatch_kmeans(test_loader, n_clusters=n_classes, device=device)
+
+    results['test_metrics'] = test_metrics
+    test_gp_vs_kmeans_data, test_original_labels = stochastic_compare_kmeans_gp_predictions(test_kmeans_model, model, test_loader, n_batches=5, device=device)
+    plot_comparative_results(test_gp_vs_kmeans_data, test_original_labels)
+
+    # Visualization of results
+    plot_training_performance(results['train_loss'], results['validation_metrics'])
+    plot_results(results)  # plot_results expects the full results dict, not just the test metrics
+
+    # Print final test metrics
+    print("Final Test Metrics:", results['test_metrics'])
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/BML_project/ss_main_ss.py b/BML_project/ss_main_ss.py
new file mode 100644
index 0000000..0d0aed4
--- /dev/null
+++ b/BML_project/ss_main_ss.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jan 4 14:40:13 2024
+
+@author: lrm22005
+"""
+from tqdm import tqdm
+import torch
+from utils_gp.data_loader import preprocess_data, split_uids, update_train_loader_with_labeled_samples, update_train_loader_with_uncertain_samples
+from models.ss_gp_model import MultitaskGPModel, train_gp_model, semi_supervised_labeling, calculate_elbo
+from utils_gp.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data, threshold_based_labeling, resolve_conflicts
+from active_learning.ss_active_learning import run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions, refined_uncertainty_sampling
+from utils_gp.visualization import plot_comparative_results, plot_training_performance, plot_results
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def main():
+    # Set parameters like n_classes, batch_size, etc.
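+    # Note: batch_size below also serves as n_samples for refined_uncertainty_sampling,
+    # so it controls both the DataLoader batch size and how many uncertain samples
+    # are requested per active-learning iteration.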
+    n_classes = 4
+    batch_size = 1024
+    clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled = split_uids()
+    data_format = 'pt'
+
+    # Preprocess data
+    train_loader, val_loader, test_loader = preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size)
+
+    kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device, batch_size=batch_size)
+
+    # Initialize result storage
+    results = {
+        'train_loss': [],
+        'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []},
+        'test_metrics': None
+    }
+
+    # Initial model training
+    model, likelihood, training_metrics = train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=n_classes)
+
+    # Save the training metrics for future visualization
+    results['train_loss'].extend(training_metrics['train_loss'])
+    results['validation_metrics']['precision'].extend(training_metrics['precision'])
+    results['validation_metrics']['recall'].extend(training_metrics['recall'])
+    results['validation_metrics']['f1'].extend(training_metrics['f1_score'])
+
+    elbo_threshold = -0.5 # Define a threshold for the ELBO score
+    use_threshold_labeling = False # Initially, do not use threshold-based labeling
+
+    active_learning_iterations = 10
+    # Active Learning Iterations
+    for iteration in tqdm(range(active_learning_iterations), desc='Active Learning', unit='iteration', leave=True):
+        # Perform uncertainty sampling to select new samples from the validation set
+        uncertain_sample_indices = refined_uncertainty_sampling(model, likelihood, kmeans_model, val_loader, n_samples=batch_size, n_batches=5)
+
+        # Semi-supervised labeling with K-means and GP model
+        semi_supervised_samples = semi_supervised_labeling(kmeans_model, model, likelihood, val_loader)
+
+        labeled_samples = semi_supervised_samples # Initially, use only semi-supervised samples
+
+        if use_threshold_labeling:
+            # Threshold-based labeling to decide when a sample's predicted label should be trusted
+            threshold_based_samples = threshold_based_labeling(kmeans_model, model, likelihood, val_loader)
+
+            # Combine the two sets of labeled samples
+            # Implement your logic for resolving conflicts between the two methods here
+            labeled_samples = resolve_conflicts(semi_supervised_samples, threshold_based_samples)
+
+        # Update the training loader with uncertain and newly labeled samples
+        train_loader = update_train_loader_with_uncertain_samples(train_loader, uncertain_sample_indices, batch_size)
+        train_loader = update_train_loader_with_labeled_samples(train_loader, labeled_samples, batch_size)
+
+        # Re-train the model with the updated training data
+        model, likelihood, training_metrics = train_gp_model(train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, checkpoint_path='model_checkpoint_last.pt')
+
+        # Store the ELBO score after each active learning iteration
+        current_elbo = calculate_elbo(model, likelihood, train_loader)
+        results.setdefault('elbo', []).append(current_elbo)  # results has no 'elbo' key at initialization
+
+        # Determine if the threshold-based labeling should be used in the next iteration based on the ELBO score
+        if current_elbo >= elbo_threshold:
+            use_threshold_labeling = True
+
+        # Compare K-Means with GP model predictions after retraining
+        gp_vs_kmeans_data, original_labels = stochastic_compare_kmeans_gp_predictions(kmeans_model, model, train_loader, n_batches=5, device=device)
+
+        plot_comparative_results(gp_vs_kmeans_data, original_labels)
+
+    # Final evaluation on test set
+    test_metrics = 
evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes)
+    test_kmeans_model = run_minibatch_kmeans(test_loader, n_clusters=n_classes, device=device)
+
+    results['test_metrics'] = test_metrics
+    test_gp_vs_kmeans_data, test_original_labels = stochastic_compare_kmeans_gp_predictions(test_kmeans_model, model, test_loader, n_batches=5, device=device)
+    plot_comparative_results(test_gp_vs_kmeans_data, test_original_labels)
+
+    # Visualization of results
+    plot_training_performance(results['train_loss'], results['validation_metrics'])
+    plot_results(results)  # plot_results expects the full results dict, not just the test metrics
+
+    # Print final test metrics
+    print("Final Test Metrics:", results['test_metrics'])
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc b/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc
new file mode 100644
index 0000000..0b1a7eb
Binary files /dev/null and b/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc differ
diff --git a/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc b/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc
new file mode 100644
index 0000000..14c7dfa
Binary files /dev/null and b/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc differ
diff --git a/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc b/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc
new file mode 100644
index 0000000..a8c3afd
Binary files /dev/null and b/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc differ
diff --git a/BML_project/utils_gp/data_loader.py b/BML_project/utils_gp/data_loader.py
new file mode 100644
index 0000000..14ed5fe
--- /dev/null
+++ b/BML_project/utils_gp/data_loader.py
@@ -0,0 +1,297 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Dec 18 18:09:02 2023
+
+@author: lrm22005
+"""
+import os
+import numpy as np
+import pandas as pd
+from PIL import Image
+import torch
+from torch.utils.data import Dataset, DataLoader
+from sklearn.preprocessing import StandardScaler
+from torchvision.transforms import ToTensor
+
+def split_uids():
+    # ====== Load the per subject arrythmia summary ======
+    df_summary = pd.read_csv(r'\\grove.ad.uconn.edu\research\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv')
+    df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3)
+
+    df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT']
+    df_summary['sample_AF'] = df_summary['AF']
+
+    df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF'])
+
+    all_UIDs = df_summary['UID'].unique()
+    # ====================================================
+    # ====== AF trial separation ======
+    # R:\ENGR_Chon\Dong\Numbers\Pulsewatch_numbers\Fahimeh_CNNED_general_ExpertSystemwApplication\tbl_file_name\TrainingSet_final_segments
+    AF_trial_Fahimeh_train = ['402','410']
+    AF_trial_Fahimeh_test = ['301', '302', '305', '306', '307', '310', '311',
+                             '312', '318', '319', '320', '321', '322', '324',
+                             '325', '327', '329', '400', '406', '407', '409',
+                             '414']
+    AF_trial_Fahimeh_did_not_use = ['405', '413', '415', '416', '420', '421', '422', '423']
+    AF_trial_paroxysmal_AF = ['408','419']
+
+    AF_trial_train = AF_trial_Fahimeh_train
+    AF_trial_test = AF_trial_Fahimeh_test
+    AF_trial_unlabeled = AF_trial_Fahimeh_did_not_use + AF_trial_paroxysmal_AF
+    print(f'AF trial: {len(AF_trial_train)} training subjects 
{AF_trial_train}') + print(f'AF trial: {len(AF_trial_test)} testing subjects {AF_trial_test}') + print(f'AF trial: {len(AF_trial_unlabeled)} unlabeled subjects {AF_trial_unlabeled}') + # ================================= + # === Clinical trial AF subjects separation === + clinical_trial_AF_subjects = ['005', '017', '026', '051', '075', '082'] + + remaining_UIDs = [] + count_NSR = [] + import math + for index, row in df_summary.iterrows(): + UID = row['UID'] + this_NSR = row['sample_nonAF'] + if math.isnan(this_NSR): + # There is no segment in this subject, skip this UID. + print(f'---------UID {UID} has no segments.------------') + continue + if UID not in AF_trial_train and UID not in AF_trial_test and UID not in clinical_trial_AF_subjects \ + and not UID[0] == '3' and not UID[0] == '4': + remaining_UIDs.append(UID) + count_NSR.append(this_NSR) + + from numpy import random + random.seed(seed=42) + from numpy.random import choice + list_of_candidates = remaining_UIDs + number_of_items_to_pick = round(len(list_of_candidates) * 0.15) # 10% labeled for training, 5% for testing. + temp_sum = sum(count_NSR) + probability_distribution = [x/temp_sum for x in count_NSR] + probability_distribution = [(1-x/temp_sum)/ (len(count_NSR)-1) for x in count_NSR]# Subjects with fewer segments have higher chance to be selected. Make sure the sum is one. + draw = choice(list_of_candidates, number_of_items_to_pick, + p=probability_distribution, replace=False) + + clinical_trial_train = list(draw[:round(len(list_of_candidates) * 0.1)]) + clinical_trial_test_nonAF = list(draw[round(len(list_of_candidates) * 0.1):]) + clinical_trial_test_temp = clinical_trial_test_nonAF + clinical_trial_AF_subjects + clinical_trial_test = [] + for UID in clinical_trial_test_temp: + # UID 051 and maybe other UIDs had no segments (unknown reason). 
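+        # Keep only the UIDs that actually appear in the summary file.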
+ if UID in all_UIDs: + clinical_trial_test.append(UID) + + clinical_trial_unlabeled = [] + for UID in all_UIDs: + if UID not in clinical_trial_train and UID not in clinical_trial_test and not UID[0] == '3' and not UID[0] == '4': + clinical_trial_unlabeled.append(UID) + print(f'Clinical trial: selected {len(clinical_trial_train)} UIDs for training {clinical_trial_train}') + print(f'Clinical trial: selected {len(clinical_trial_test)} UIDs for testing {clinical_trial_test}') + print(f'Clinical trial: selected {len(clinical_trial_unlabeled)} UIDs for unlabeled {clinical_trial_unlabeled}') + + clinical_trial_train = [clinical_trial_train[0]] + clinical_trial_test = [clinical_trial_test[0]] + clinical_trial_unlabeled = clinical_trial_unlabeled[0:4] + + return clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False): + self.data_path = data_path + self.labels_path = labels_path + self.UIDs = UIDs + self.standardize = standardize + self.data_format = data_format + self.read_all_labels = read_all_labels + self.transforms = ToTensor() + self.refresh_dataset() + + def refresh_dataset(self): + # Extract unique segment names and their corresponding labels + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + def add_uids(self, new_uids): + # Ensure new UIDs are unique and not already in the dataset + unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] + + # Add unique new UIDs and refresh the dataset + self.UIDs.extend(unique_new_uids) + self.refresh_dataset() + + def __len__(self): + return len(self.segment_names) + + def __getitem__(self, idx): + segment_name = self.segment_names[idx] + label = self.labels[segment_name] + + if hasattr(self, 'all_data') and idx < len(self.all_data): + # Data is stored in memory + time_freq_tensor = self.all_data[idx] + else: + # Load data on-the-fly based on the segment_name + time_freq_tensor = self.load_data(segment_name) + + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} + + def add_data_label_pair(self, data, label): + # Assign a unique ID or name for the new data + new_id = len(self.segment_names) + segment_name = f"new_data_{new_id}" + + # Append the new data and label + self.segment_names.append(segment_name) + self.labels[segment_name] = label + + # Append the new data tensor to an attribute that holds all the data + if hasattr(self, 'all_data'): + self.all_data.append(data) + else: + self.all_data = [data] + + def extract_segment_names_and_labels(self): + segment_names = [] + labels = {} + + for UID in self.UIDs: + label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) + for idx, segment_name in enumerate(label_segment_names): + label_val = label_data['label'].values[idx] + if self.read_all_labels: + # Assign -1 if label is not in [0, 1, 2, 3] + labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2, 3] + if label_val in [0, 1, 2, 3] and segment_name not in segment_names: + segment_names.append(segment_name) + labels[segment_name] = label_val + + return segment_names, 
labels + + def load_data(self, segment_name): + data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0]) + seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.' + self.data_format) + + try: + if self.data_format == 'csv' and seg_path.endswith('.csv'): + time_freq_plot = np.array(pd.read_csv(seg_path, header=None)) + time_freq_tensor = torch.Tensor(time_freq_plot).reshape(1, 128, 128) + elif self.data_format == 'png' and seg_path.endswith('.png'): + img = Image.open(seg_path) + img_data = np.array(img) + time_freq_tensor = torch.Tensor(img_data).unsqueeze(0) + elif self.data_format == 'pt' and seg_path.endswith('.pt'): + time_freq_tensor = torch.load(seg_path) + else: + raise ValueError("Unsupported file format") + if self.standardize: + time_freq_tensor = self.standard_scaling(time_freq_tensor) # Standardize the data + + return time_freq_tensor.clone() + + except Exception as e: + print(f"Error processing segment: {segment_name}. Exception: {str(e)}") + return torch.zeros((1, 128, 128)) # Return zeros in case of an error + + def standard_scaling(self, data): + scaler = StandardScaler() + data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape) + return torch.Tensor(data) + +def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', read_all_labels=False, drop_last=False, num_workers=4): + dataset = CustomDataset(data_path, labels_path, UIDs, standardize, data_format, read_all_labels) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2) + return dataloader + +def get_data_paths(data_format, is_linux=False, is_hpc=False): + if is_linux: + base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" + labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + saving_base_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis" + elif is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + saving_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis" + else: + # R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch + base_path = "R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch" + labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn" + saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case" + if data_format == 'csv': + data_path = os.path.join(base_path, "TFS_csv") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + elif data_format == 'png': + data_path = os.path.join(base_path, "TFS_plots") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + elif data_format == 'pt': + data_path = os.path.join(base_path, "PT_format") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + else: + raise ValueError("Invalid data format. 
Choose 'csv' or 'png.") + return data_path, labels_path, saving_path + +# Function to extract and preprocess data +def preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size, read_all_labels=False): + # Extracts paths and loads data into train, validation, and test loaders + data_path, labels_path, saving_path = get_data_paths(data_format) + train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels) + val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels) + test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=True, data_format=data_format, read_all_labels=read_all_labels) + return train_loader, val_loader, test_loader + +def map_samples_to_uids(uncertain_sample_indices, dataset): + """ + Maps indices of uncertain samples back to their corresponding segment names or UIDs. + + Args: + - uncertain_sample_indices: Indices of the uncertain samples in the dataset. + - dataset: The dataset object which contains the mapping of segment names and UIDs. + + Returns: + - List of UIDs or segment names corresponding to the uncertain samples. + """ + return [dataset.segment_names[i] for i in uncertain_sample_indices] + +def update_train_loader_with_labeled_samples(current_train_loader, labeled_samples, batch_size): + """ + Updates the training DataLoader with newly labeled samples. + + Args: + - current_train_loader: The current DataLoader for the training set. + - labeled_samples: A list of tuples, each containing a data tensor and its new label. + - batch_size: Batch size for the DataLoader. + + Returns: + - DataLoader: The updated DataLoader with the new labeled samples. 
+ """ + + # Extract the current dataset from the DataLoader + current_dataset = current_train_loader.dataset + + # Update the dataset with new samples and labels + for data_tensor, label in labeled_samples: + # Assuming the CustomDataset class has a method to add new data and labels + current_dataset.add_data_label_pair(data_tensor, label) + + # Create a new DataLoader with the updated dataset + updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=True, drop_last=False, num_workers=4, prefetch_factor=2) + + return updated_train_loader + +def update_train_loader_with_uncertain_samples(current_train_loader, new_sample_indices, batch_size, standardize=False, data_format='csv', read_all_labels=True): + # Extract current UIDs from the current_train_loader + current_dataset = current_train_loader.dataset + # Map new_samples back to their corresponding segment names or UIDs + new_uids = map_samples_to_uids(new_sample_indices, current_dataset) + # Add new UIDs to the current dataset and refresh it + current_dataset.add_uids(new_uids) + # Create new DataLoader with the updated dataset + updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=False) + return updated_train_loader \ No newline at end of file diff --git a/BML_project/utils_gp/ss_evaluation.py b/BML_project/utils_gp/ss_evaluation.py new file mode 100644 index 0000000..5a01ee4 --- /dev/null +++ b/BML_project/utils_gp/ss_evaluation.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Dec 18 18:17:40 2023 + +@author: lrm22005 +""" +import numpy as np +import torch +import gpytorch +from sklearn.preprocessing import label_binarize +from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score +from sklearn.metrics import precision_recall_fscore_support + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def stochastic_evaluation(model, likelihood, data_loader, device, n_classes, n_batches): + model.eval() + likelihood.eval() + random_indices = torch.randperm(len(data_loader.dataset))[:n_batches * data_loader.batch_size] + + all_predicted_labels = [] + all_actual_labels = [] + + with torch.no_grad(), gpytorch.settings.fast_pred_var(): + for idx in random_indices: + batch = data_loader.dataset[idx] + test_data = batch['data'].view(-1).unsqueeze(0).to(device) + test_label = torch.tensor([batch['label']]).to(device) + + predictions = likelihood(model(test_data)).mean + predicted_label = predictions.argmax(dim=-1).item() + + all_predicted_labels.append(predicted_label) + all_actual_labels.append(test_label.item()) + + # Compute metrics + precision, recall, f1, _ = precision_recall_fscore_support(all_actual_labels, all_predicted_labels, average='macro') + # auc_roc = roc_auc_score(label_binarize(all_actual_labels, classes=np.arange(n_classes)), + # label_binarize(all_predicted_labels, classes=np.arange(n_classes)), + # multi_class='ovr') + + return { + 'precision': precision, + 'recall': recall, + 'f1': f1, + # 'auc_roc': auc_roc + } + +def evaluate_model_on_all_data(model, likelihood, data_loader, device, n_classes): + model.eval() + likelihood.eval() + + all_predicted_labels = [] + all_test_labels = [] + + with torch.no_grad(), gpytorch.settings.fast_pred_var(): + for i, batch in enumerate(data_loader): + test_data = batch['data'].view(batch['data'].size(0), -1).to(device) + test_labels = batch['label'].to(device) + # print(f"Test data shape before t-SNE: {test_data.shape}") + + predictions = likelihood(model(test_data)).mean + # 
Debugging - check shape of predictions
+            # print(f"Predictions shape: {predictions.shape}")
+            predicted_labels = predictions.argmax(dim=-1)  # argmax over the class dimension, one label per sample
+
+            # Add debugging information
+            # print(f"Batch {i}: Predicted Labels Shape: {predicted_labels.shape}, Actual Labels Shape: {test_labels.shape}")
+
+            all_predicted_labels.append(predicted_labels.cpu().numpy())
+            all_test_labels.append(test_labels.cpu().numpy())  # .cpu() so this also works when the labels live on the GPU
+
+            # Debug the accumulation of labels
+            # current_predicted = np.concatenate(all_predicted_labels, axis=0)
+            # current_actual = np.concatenate(all_test_labels, axis=0)
+            # print(f"After Batch {i}: Accumulated Predicted Labels: {current_predicted.shape[0]}, Accumulated Actual Labels: {current_actual.shape[0]}")
+
+    # Concatenate all batch results
+    all_predicted_labels = np.concatenate(all_predicted_labels, axis=0)
+    all_test_labels = np.concatenate(all_test_labels, axis=0)
+
+    # Final check
+    # print(f"Final: Total Predicted Labels: {all_predicted_labels.shape[0]}, Total Actual Labels: {all_test_labels.shape[0]}")
+
+    # Verify if the shapes match before proceeding to calculate metrics
+    if all_predicted_labels.shape[0] != all_test_labels.shape[0]:
+        raise ValueError("Mismatch in the number of samples between predicted and actual labels")
+
+    # Compute overall evaluation metrics
+    precision, recall, f1, _ = precision_recall_fscore_support(all_test_labels, all_predicted_labels, average='macro')
+    # For AUC-ROC, one-hot encode the true labels and the predicted labels accumulated
+    # over all batches so both arrays cover every test sample
+    test_labels_one_hot = label_binarize(all_test_labels, classes=np.arange(n_classes))
+    predicted_labels_one_hot = label_binarize(all_predicted_labels, classes=np.arange(n_classes))
+    auc_roc = roc_auc_score(test_labels_one_hot, predicted_labels_one_hot, multi_class='ovr')
+    return {
+        'precision': precision,
+        'recall': recall,
+        'f1': f1,
+        'auc_roc': auc_roc
+    }
+
+def threshold_based_labeling(kmeans_model, gp_model, gp_likelihood, data_loader, agreement_threshold=0.5):
+    gp_model.eval()
+    gp_likelihood.eval()
+    new_labels = []
+
+    with torch.no_grad():
+        for batch in data_loader:
+            data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
+            kmeans_predictions = kmeans_model.predict(data_tensor.cpu().numpy())
+            gp_predictions = gp_likelihood(gp_model(data_tensor))
+
+            # Measure agreement between K-means and GP model
+            agreement = (gp_predictions.mean.argmax(dim=-1).cpu().numpy() == kmeans_predictions).astype(int)
+
+            # Assign labels based on agreement threshold
+            for i, agree in enumerate(agreement):
+                if agree > agreement_threshold:
+                    new_labels.append(gp_predictions.mean.argmax(dim=-1)[i].item())
+                else:
+                    new_labels.append(kmeans_predictions[i])
+
+    return new_labels
+
+def resolve_conflicts(semi_supervised_samples, threshold_based_samples):
+    """
+    Resolves conflicts between two sets of labeled samples.
+
+    Args:
+    - semi_supervised_samples: Labeled samples from the semi_supervised_labeling method.
+    - threshold_based_samples: Labeled samples from the threshold_based_labeling method.
+
+    Returns:
+    - resolved_samples: The resolved set of labeled samples.
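+
+    Example (illustrative; the function expects both inputs as (segment_name, label) pairs):
+        >>> merged = resolve_conflicts([('seg_a', 1)], [('seg_a', 2), ('seg_b', 0)])
+        >>> sorted(merged)  # on conflict the semi-supervised label wins
+        [('seg_a', 1), ('seg_b', 0)]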
+ """ + resolved_samples = [] + + # Create dictionaries for quick lookup + semi_supervised_dict = {segment_name: label for segment_name, label in semi_supervised_samples} + threshold_based_dict = {segment_name: label for segment_name, label in threshold_based_samples} + + # Combine all unique segment names + all_segments = set(semi_supervised_dict.keys()).union(set(threshold_based_dict.keys())) + + for segment_name in all_segments: + if segment_name in semi_supervised_dict and segment_name in threshold_based_dict: + # If there's a conflict, resolve it here. For simplicity, we're taking the label from semi_supervised + # You can implement other strategies like majority vote, confidence weighting, or agreement only + resolved_samples.append((segment_name, semi_supervised_dict[segment_name])) + elif segment_name in semi_supervised_dict: + resolved_samples.append((segment_name, semi_supervised_dict[segment_name])) + elif segment_name in threshold_based_dict: + resolved_samples.append((segment_name, threshold_based_dict[segment_name])) + + return resolved_samples + +def parse_classification_report(report): + """Parse a classification report into a dictionary of metrics.""" + lines = report.split('\n') + main_metrics = lines[-2].split() + + # Assuming the last line is like "accuracy: x macro avg y1 y2 y3 y4" + return { + 'precision': float(main_metrics[3]), + 'recall': float(main_metrics[4]), + 'f1': float(main_metrics[5]), + 'auc_roc': None # AUC-ROC is not part of the classification report by default + } diff --git a/BML_project/utils_gp/visualization.py b/BML_project/utils_gp/visualization.py new file mode 100644 index 0000000..3ecf59b --- /dev/null +++ b/BML_project/utils_gp/visualization.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Dec 18 18:20:55 2023 + +@author: lrm22005 +""" +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.metrics import confusion_matrix + +def plot_training_performance(train_loss, validation_metrics): + epochs = range(1, len(train_loss) + 1) + + # Plot training loss + plt.figure(figsize=(14, 6)) + plt.subplot(1, 2, 1) + plt.plot(epochs, train_loss, 'b-', label='Training Loss') + plt.title('Training Loss') + plt.xlabel('Epochs') + plt.ylabel('Loss') + plt.legend() + + # Plot validation metrics + plt.subplot(1, 2, 2) + plt.plot(epochs, validation_metrics['precision'], 'r-', label='Precision') + plt.plot(epochs, validation_metrics['recall'], 'g-', label='Recall') + plt.plot(epochs, validation_metrics['f1'], 'b-', label='F1 Score') + plt.plot(epochs, validation_metrics['auc_roc'], 'y-', label='AUC-ROC') + plt.title('Validation Metrics') + plt.xlabel('Epochs') + plt.ylabel('Metrics') + plt.legend() + + plt.tight_layout() + plt.show() + +def plot_results(results): + plt.figure(figsize=(12, 5)) + plt.subplot(1, 2, 1) + plt.plot(results['train_loss'], label='Train Loss') + plt.title('Training Loss Over Time') + plt.legend() + + plt.subplot(1, 2, 2) + for metric in ['precision', 'recall', 'f1']: + plt.plot(results['validation_metrics'][metric], label=metric.title()) + plt.title('Validation Metrics Over Time') + plt.legend() + plt.show() + + test_metrics = results['test_metrics'] + print("Test Metrics:") + print(f"Precision: {test_metrics['precision']}") + print(f"Recall: {test_metrics['recall']}") + print(f"F1 Score: {test_metrics['f1']}") + print(f"AUC-ROC: {test_metrics['auc_roc']}") + +def plot_comparative_results(gp_vs_kmeans_data, original_labels): + fig, axes = plt.subplots(1, 2, figsize=(14, 7)) + + # Plot 1: 
Confusion Matrix for GP Predictions vs Original Labels + gp_predictions = [pair[0] for pair in gp_vs_kmeans_data] + gp_predictions = np.concatenate(gp_predictions) + cm_gp = confusion_matrix(original_labels, gp_predictions) + sns.heatmap(cm_gp, annot=True, ax=axes[0], fmt='g') + axes[0].set_title('GP Model Predictions vs Original Labels') + axes[0].set_xlabel('Predicted Labels') + axes[0].set_ylabel('True Labels') + + # Plot 2: Confusion Matrix for K-Means Predictions vs Original Labels + kmeans_predictions = [pair[1] for pair in gp_vs_kmeans_data] + kmeans_predictions = np.concatenate(kmeans_predictions) + cm_kmeans = confusion_matrix(original_labels, kmeans_predictions) + sns.heatmap(cm_kmeans, annot=True, ax=axes[1], fmt='g') + axes[1].set_title('K-Means Predictions vs Original Labels') + axes[1].set_xlabel('Predicted Labels') + axes[1].set_ylabel('True Labels') + + plt.tight_layout() + plt.show() diff --git a/pytorch_file_generation_loader_update.py b/pytorch_file_generation_loader_update.py new file mode 100644 index 0000000..cea6242 --- /dev/null +++ b/pytorch_file_generation_loader_update.py @@ -0,0 +1,374 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Dec 18 17:11:29 2023 + +@author: lrm22005 +""" + +# import os +# import pandas as pd +# import numpy as np +# import torch +# from PIL import Image +# import torch + +#### From first to the last +# def preprocess_and_save_data(data_path, output_path): +# # Make sure the output directory exists +# if not os.path.exists(output_path): +# os.makedirs(output_path) + +# # Traverse the directories for each UID +# for uid in os.listdir(data_path): +# uid_path = os.path.join(data_path, uid) +# if os.path.isdir(uid_path): +# # Make a corresponding directory in the output path +# uid_output_path = os.path.join(output_path, uid) +# if not os.path.exists(uid_output_path): +# os.makedirs(uid_output_path) + +# # Process each file within the UID directory +# for file in os.listdir(uid_path): +# if file.endswith('.csv') or file.endswith('.png'): +# file_path = os.path.join(uid_path, file) +# if file.endswith('.csv'): +# data = pd.read_csv(file_path).values +# else: # if file.endswith('.png'): +# data = np.array(Image.open(file_path)) + +# data_tensor = torch.tensor(data, dtype=torch.float32) +# output_file_path = os.path.join(uid_output_path, file.replace('.csv', '.pt').replace('.png', '.pt')) +# torch.save(data_tensor, output_file_path) + +# # Define your input and output paths +# input_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv' +# output_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\PT_format' + +# # Run the preprocessing function +# preprocess_and_save_data(input_path, output_path) + +#### From the last to the first +# def preprocess_and_save_data(data_path, output_path): +# # Make sure the output directory exists +# if not os.path.exists(output_path): +# os.makedirs(output_path) + +# # Traverse the directories for each UID +# # Get the list of directories and sort them in descending order +# uids = sorted(os.listdir(data_path), reverse=True) +# for uid in uids: +# uid_path = os.path.join(data_path, uid) +# if os.path.isdir(uid_path): +# # Make a corresponding directory in the output path +# uid_output_path = os.path.join(output_path, uid) +# if not os.path.exists(uid_output_path): +# os.makedirs(uid_output_path) + +# # Process each file within the UID directory +# for file in os.listdir(uid_path): +# if file.endswith('.csv') or 
file.endswith('.png'): +# file_path = os.path.join(uid_path, file) +# if file.endswith('.csv'): +# data = pd.read_csv(file_path).values +# else: # if file.endswith('.png'): +# data = np.array(Image.open(file_path)) + +# data_tensor = torch.tensor(data, dtype=torch.float32) +# output_file_path = os.path.join(uid_output_path, file.replace('.csv', '.pt').replace('.png', '.pt')) +# torch.save(data_tensor, output_file_path) + +# # Define your input and output paths +# input_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv' +# output_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\PT_format' + +# # Run the preprocessing function +# preprocess_and_save_data(input_path, output_path) +###################################################################################################################################################### +#### First to last +import os +import pandas as pd +import numpy as np +from PIL import Image +import torch +from concurrent.futures import ThreadPoolExecutor + +def preprocess_file(uid_path, file, uid_output_path): + file_path = os.path.join(uid_path, file) + + if file.endswith('.csv'): + data = pd.read_csv(file_path).values + elif file.endswith('.png'): + data = np.array(Image.open(file_path)) + else: + return + + data_tensor = torch.tensor(data, dtype=torch.float32) + base_name, extension = os.path.splitext(file) + output_file_path = os.path.join(uid_output_path, f'{base_name}.pt') + torch.save(data_tensor, output_file_path) + +def preprocess_and_save_data(data_path, output_path): + # Make sure the output directory exists + if not os.path.exists(output_path): + os.makedirs(output_path) + + # Traverse the directories for each UID + for uid in os.listdir(data_path): + uid_path = os.path.join(data_path, uid) + if os.path.isdir(uid_path): + # Make a corresponding directory in the output path + uid_output_path = os.path.join(output_path, uid) + if not os.path.exists(uid_output_path): + os.makedirs(uid_output_path) + + # Create a ThreadPoolExecutor for parallel processing + with ThreadPoolExecutor() as executor: + files_to_process = [file for file in os.listdir(uid_path) if file.endswith(('.csv', '.png'))] + for file in files_to_process: + executor.submit(preprocess_file, uid_path, file, uid_output_path) + +# Define your input and output paths +input_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv' +output_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\PT_format' + +# Run the preprocessing function +preprocess_and_save_data(input_path, output_path) +###################################################################################################################################################### +#### Last to first +import os +import pandas as pd +import numpy as np +import torch +from PIL import Image +from concurrent.futures import ThreadPoolExecutor + +def process_file(uid_path, file, uid_output_path): + if file.endswith('.csv') or file.endswith('.png'): + file_path = os.path.join(uid_path, file) + if file.endswith('.csv'): + data = pd.read_csv(file_path).values + else: # if file.endswith('.png'): + data = np.array(Image.open(file_path)) + + data_tensor = torch.tensor(data, dtype=torch.float32) + output_file_path = os.path.join(uid_output_path, file.replace('.csv', '.pt').replace('.png', '.pt')) + torch.save(data_tensor, output_file_path) + +def 
preprocess_and_save_data(data_path, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + + uids = sorted(os.listdir(data_path), reverse=True) + for uid in uids: + uid_path = os.path.join(data_path, uid) + if os.path.isdir(uid_path): + uid_output_path = os.path.join(output_path, uid) + if not os.path.exists(uid_output_path): + os.makedirs(uid_output_path) + + # Use a ThreadPoolExecutor to process files in parallel + with ThreadPoolExecutor() as executor: + # Create a list of tasks for the executor + tasks = [executor.submit(process_file, uid_path, file, uid_output_path) for file in os.listdir(uid_path)] + # Wait for all tasks to complete + for task in tasks: + task.result() + +# Define your input and output paths +input_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv' +output_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\PT_format' + +# Run the preprocessing function +preprocess_and_save_data(input_path, output_path) +###################################################################################################################################################### +###################################################################################################################################################### +###################################################################################################################################################### + +from torch.utils.data import Dataset, DataLoader +from sklearn.preprocessing import StandardScaler + +class CustomDataset(Dataset): + def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False): + self.data_path = data_path + self.labels_path = labels_path + self.UIDs = UIDs + self.standardize = standardize + self.data_format = data_format + self.read_all_labels = read_all_labels + self.refresh_dataset() + + def refresh_dataset(self): + self.segment_names, self.labels = self.extract_segment_names_and_labels() + + def add_uids(self, new_uids): + unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs] + self.UIDs.extend(unique_new_uids) + self.refresh_dataset() + + def __len__(self): + return len(self.segment_names) + + def __getitem__(self, idx): + segment_name = self.segment_names[idx] + label = self.labels[segment_name] + time_freq_tensor = self.load_data(segment_name) + return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name} + + def extract_segment_names_and_labels(self): + segment_names = [] + labels = {} + + for UID in self.UIDs: + label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label']) + label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0]) + for idx, segment_name in enumerate(label_segment_names): + label_val = label_data['label'].values[idx] + if self.read_all_labels: + # Assign -1 if label is not in [0, 1, 2, 3] + labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1 + if segment_name not in segment_names: + segment_names.append(segment_name) + else: + # Only add segments with labels in [0, 1, 2, 3] + if label_val in [0, 1, 2, 3] and segment_name not in segment_names: + segment_names.append(segment_name) + labels[segment_name] = label_val + + return segment_names, labels + + def load_data(self, segment_name): + data_path_UID = 
+######################################################################################################################################################
+######################################################################################################################################################
+######################################################################################################################################################
+
+import os
+import pandas as pd
+import numpy as np
+import torch
+from PIL import Image
+from torch.utils.data import Dataset, DataLoader
+from sklearn.preprocessing import StandardScaler
+
+class CustomDataset(Dataset):
+    def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False):
+        self.data_path = data_path
+        self.labels_path = labels_path
+        self.UIDs = UIDs
+        self.standardize = standardize
+        self.data_format = data_format
+        self.read_all_labels = read_all_labels
+        self.refresh_dataset()
+
+    def refresh_dataset(self):
+        self.segment_names, self.labels = self.extract_segment_names_and_labels()
+
+    def add_uids(self, new_uids):
+        unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs]
+        self.UIDs.extend(unique_new_uids)
+        self.refresh_dataset()
+
+    def __len__(self):
+        return len(self.segment_names)
+
+    def __getitem__(self, idx):
+        segment_name = self.segment_names[idx]
+        label = self.labels[segment_name]
+        time_freq_tensor = self.load_data(segment_name)
+        return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name}
+
+    def extract_segment_names_and_labels(self):
+        segment_names = []
+        labels = {}
+
+        for UID in self.UIDs:
+            label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv")
+            if os.path.exists(label_file):
+                label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label'])
+                label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0])
+                for idx, segment_name in enumerate(label_segment_names):
+                    label_val = label_data['label'].values[idx]
+                    if self.read_all_labels:
+                        # Assign -1 if label is not in [0, 1, 2, 3]
+                        labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1
+                        if segment_name not in segment_names:
+                            segment_names.append(segment_name)
+                    else:
+                        # Only add segments with labels in [0, 1, 2, 3]
+                        if label_val in [0, 1, 2, 3] and segment_name not in segment_names:
+                            segment_names.append(segment_name)
+                            labels[segment_name] = label_val
+
+        return segment_names, labels
+
+    def load_data(self, segment_name):
+        data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0])
+        # The file extension follows data_format so each branch below can match its own format
+        seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.' + self.data_format)
+
+        try:
+            if self.data_format == 'csv' and seg_path.endswith('.csv'):
+                time_freq_plot = np.array(pd.read_csv(seg_path, header=None))
+                time_freq_tensor = torch.Tensor(time_freq_plot).reshape(1, 128, 128)
+            elif self.data_format == 'png' and seg_path.endswith('.png'):
+                img = Image.open(seg_path)
+                img_data = np.array(img)
+                time_freq_tensor = torch.Tensor(img_data).unsqueeze(0)
+            elif self.data_format == 'pt' and seg_path.endswith('.pt'):
+                time_freq_tensor = torch.load(seg_path)
+            else:
+                raise ValueError("Unsupported file format")
+
+            if self.standardize:
+                time_freq_tensor = self.standard_scaling(time_freq_tensor)  # Standardize the data
+
+            return time_freq_tensor.clone()
+
+        except Exception as e:
+            print(f"Error processing segment: {segment_name}. Exception: {str(e)}")
+            return torch.zeros((1, 128, 128))  # Return zeros in case of an error
+
+    def standard_scaling(self, data):
+        scaler = StandardScaler()
+        data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape)
+        return torch.Tensor(data)
+
+def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', read_all_labels=True, drop_last=False, num_workers=4):
+    dataset = CustomDataset(data_path, labels_path, UIDs, standardize, data_format, read_all_labels)
+    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2)
+    return dataloader
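+
+# Editor's sketch (illustrative only): building a loader over the .pt files written
+# by the preprocessing scripts above. 'labels_path' and the UID list are hypothetical.
+# example_uids = ['001', '002']
+# loader = load_data_split_batched(output_path, labels_path, example_uids,
+#                                  batch_size=64, standardize=True,
+#                                  data_format='pt', read_all_labels=False)
+# batch = next(iter(loader))
+# print(batch['data'].shape, batch['segment_name'][:3])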
+
+
+import os
+import pandas as pd
+import numpy as np
+from PIL import Image
+import torch
+from concurrent.futures import ThreadPoolExecutor
+
+def preprocess_file(uid_path, file, uid_output_path):
+    file_path = os.path.join(uid_path, file)
+
+    if file.endswith('.csv'):
+        # Ensure that the CSV file is read without an index or header
+        data = pd.read_csv(file_path, header=None).values
+        # Check the shape of the data and log if it's not 128x128
+        if data.shape != (128, 128):
+            print(f"Warning: File {file_path} has shape {data.shape} instead of 128x128.")
+    elif file.endswith('.png'):
+        data = np.array(Image.open(file_path))
+        # Check the shape of the image and log if it's not 128x128
+        if data.shape != (128, 128):
+            print(f"Warning: Image {file_path} has shape {data.shape} instead of 128x128.")
+    else:
+        return  # Skip files that are not CSV or PNG
+
+    # Convert data to a 128x128 tensor
+    data_tensor = torch.tensor(data, dtype=torch.float32).view(128, 128)
+    base_name, extension = os.path.splitext(file)
+    output_file_path = os.path.join(uid_output_path, f'{base_name}.pt')
+    torch.save(data_tensor, output_file_path)
+
+def preprocess_and_save_data(data_path, output_path):
+    # Make sure the output directory exists
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    # Traverse the directories for each UID
+    for uid in os.listdir(data_path):
+        uid_path = os.path.join(data_path, uid)
+        if os.path.isdir(uid_path):
+            # Make a corresponding directory in the output path
+            uid_output_path = os.path.join(output_path, uid)
+            if not os.path.exists(uid_output_path):
+                os.makedirs(uid_output_path)
+
+            # Create a ThreadPoolExecutor for parallel processing
+            with ThreadPoolExecutor() as executor:
+                files_to_process = [file for file in os.listdir(uid_path) if file.endswith(('.csv', '.png'))]
+                for file in files_to_process:
+                    executor.submit(preprocess_file, uid_path, file, uid_output_path)
+
+# Define your input and output paths
+input_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv'
+output_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\PT_format'
+
+# Run the preprocessing function
+preprocess_and_save_data(input_path, output_path)
+
+
+def preprocess_and_save_data(data_path, output_path):
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+    all_uids = [uid for uid in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, uid))]
+    for uid in reversed(all_uids):  # Reverse the list of directories
+        uid_path = os.path.join(data_path, uid)
+        uid_output_path = os.path.join(output_path, uid)
+        if not os.path.exists(uid_output_path):
+            os.makedirs(uid_output_path)
+        with ThreadPoolExecutor() as executor:
+            files_to_process = [file for file in os.listdir(uid_path) if file.endswith(('.csv', '.png'))]
+            for file in files_to_process:
+                executor.submit(preprocess_file, uid_path, file, uid_output_path)
+
+def preprocess_file(uid_path, file, uid_output_path):
+    file_path = os.path.join(uid_path, file)
+    if file.endswith('.csv'):
+        data = pd.read_csv(file_path, header=None).values
+        if data.shape != (128, 128):
+            print(f"Warning: File {file_path} has shape {data.shape} instead of 128x128.")
+    elif file.endswith('.png'):
+        data = np.array(Image.open(file_path))
+        if data.shape != (128, 128):
+            print(f"Warning: Image {file_path} has shape {data.shape} instead of 128x128.")
+    else:
+        return
+    data_tensor = torch.tensor(data, dtype=torch.float32).view(128, 128)
+    base_name, extension = os.path.splitext(file)
+    output_file_path = os.path.join(uid_output_path, f'{base_name}.pt')
+    torch.save(data_tensor, output_file_path)
+
+# Top-level script execution:
+input_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv'
+output_path = r'\\grove.ad.uconn.edu\research\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\PT_format'
+preprocess_and_save_data(input_path, output_path)
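+
+# Editor's sketch (illustrative, not part of the original script): a round-trip
+# check that a converted .pt file loads back as a 128x128 float tensor. The
+# segment file name is hypothetical.
+# sample = torch.load(os.path.join(output_path, '001', '001_example_filt_STFT.pt'))
+# assert sample.shape == (128, 128), f"Unexpected shape: {sample.shape}"
+# assert sample.dtype == torch.float32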
diff --git a/semisupervised_method.py b/semisupervised_method.py
index 547ffcd..2608cd4 100644
--- a/semisupervised_method.py
+++ b/semisupervised_method.py
@@ -64,18 +64,13 @@ def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='
         self.standardize = standardize
         self.data_format = data_format
         self.read_all_labels = read_all_labels
-        self.transforms = ToTensor()
         self.refresh_dataset()
 
     def refresh_dataset(self):
-        # Extract unique segment names and their corresponding labels
         self.segment_names, self.labels = self.extract_segment_names_and_labels()
 
     def add_uids(self, new_uids):
-        # Ensure new UIDs are unique and not already in the dataset
         unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs]
-
-        # Add unique new UIDs and refresh the dataset
         self.UIDs.extend(unique_new_uids)
         self.refresh_dataset()
 
@@ -85,10 +80,7 @@ def __len__(self):
     def __getitem__(self, idx):
         segment_name = self.segment_names[idx]
         label = self.labels[segment_name]
-
-        # Load data on-the-fly based on the segment_name
         time_freq_tensor = self.load_data(segment_name)
-
         return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name}
 
     def extract_segment_names_and_labels(self):
@@ -117,36 +109,24 @@ def extract_segment_names_and_labels(self):
     def load_data(self, segment_name):
         data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0])
-        seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.csv')
-
+        seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.pt')
         try:
-            if self.data_format == 'csv' and seg_path.endswith('.csv'):
-                time_freq_plot = np.array(pd.read_csv(seg_path, header=None))
-                time_freq_tensor = torch.Tensor(time_freq_plot).reshape(1, 128, 128)
-            elif self.data_format == 'png' and seg_path.endswith('.png'):
-                img = Image.open(seg_path)
-                img_data = np.array(img)
-                time_freq_tensor = torch.Tensor(img_data).unsqueeze(0)
-            else:
-                raise ValueError("Unsupported file format")
-
+            time_freq_tensor = torch.load(seg_path)
             if self.standardize:
-                time_freq_tensor = self.standard_scaling(time_freq_tensor)  # Standardize the data
-
+                time_freq_tensor = self.standard_scaling(time_freq_tensor)
             return time_freq_tensor.clone()
-
         except Exception as e:
             print(f"Error processing segment: {segment_name}. Exception: {str(e)}")
-            return torch.zeros((1, 128, 128))  # Return zeros in case of an error
+            return torch.zeros((1, 128, 128))
 
     def standard_scaling(self, data):
         scaler = StandardScaler()
         data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape)
         return torch.Tensor(data)
 
-def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', read_all_labels=True, drop_last=False, num_workers=4):
-    dataset = CustomDataset(data_path, labels_path, UIDs, standardize, data_format, read_all_labels)
-    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers)
+def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, read_all_labels=True, drop_last=False, num_workers=4):
+    # Pass read_all_labels by keyword: CustomDataset still takes data_format before read_all_labels
+    dataset = CustomDataset(data_path, labels_path, UIDs, standardize, read_all_labels=read_all_labels)
+    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2)
     return dataloader
 
 # To validate the len of the dataloader
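
# Editor's sketch (illustrative, not part of the diff above): a quick check of the
# dataloader length, as the trailing comment suggests. Paths and UIDs are hypothetical.
# loader = load_data_split_batched(data_path, labels_path, ['001', '002'], batch_size=32)
# print(len(loader.dataset), 'segments ->', len(loader), 'batches')
# print(next(iter(loader))['data'].shape)  # expect [32, 128, 128] for the 2-D .pt tensors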