# -*- coding: utf-8 -*-
"""
Created on Mon Dec 18 13:49:26 2023
@author: lrm22005
"""
import os
import logging
import numpy as np
import pandas as pd
import torch
from torch.distributions import MultivariateNormal
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score, davies_bouldin_score
def get_data_paths(data_format, is_linux=False, is_hpc=False):
if is_linux:
base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch"
labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn"
saving_base_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis"
elif is_hpc:
base_path = "/gpfs/scratchfs1/kic14002/doh16101"
labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005"
saving_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis"
else:
# R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch
base_path = "R:\ENGR_Chon\Dong\MATLAB_generate_results\\NIH_PulseWatch"
labels_base_path = "R:\ENGR_Chon\\NIH_Pulsewatch_Database\Adjudication_UConn"
saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case"
if data_format == 'csv':
data_path = os.path.join(base_path, "TFS_csv")
labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm")
saving_path = os.path.join(saving_base_path, "Project_1_analysis")
elif data_format == 'png':
data_path = os.path.join(base_path, "TFS_plots")
labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm")
saving_path = os.path.join(saving_base_path, "Project_1_analysis")
else:
raise ValueError("Invalid data format. Choose 'csv' or 'png.")
return data_path, labels_path, saving_path
class CustomDataset(Dataset):
def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False):
self.data_path = data_path
self.labels_path = labels_path
self.UIDs = UIDs
self.standardize = standardize
self.data_format = data_format
self.read_all_labels = read_all_labels
self.refresh_dataset()
def refresh_dataset(self):
self.segment_names, self.labels = self.extract_segment_names_and_labels()
def add_uids(self, new_uids):
unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs]
self.UIDs.extend(unique_new_uids)
self.refresh_dataset()
def __len__(self):
return len(self.segment_names)
def __getitem__(self, idx):
segment_name = self.segment_names[idx]
label = self.labels[segment_name]
time_freq_tensor = self.load_data(segment_name)
return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name}
def extract_segment_names_and_labels(self):
segment_names = []
labels = {}
for UID in self.UIDs:
label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv")
if os.path.exists(label_file):
label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label'])
label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0])
for idx, segment_name in enumerate(label_segment_names):
label_val = label_data['label'].values[idx]
if self.read_all_labels:
# Assign -1 if label is not in [0, 1, 2, 3]
labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1
if segment_name not in segment_names:
segment_names.append(segment_name)
else:
# Only add segments with labels in [0, 1, 2, 3]
if label_val in [0, 1, 2, 3] and segment_name not in segment_names:
segment_names.append(segment_name)
labels[segment_name] = label_val
return segment_names, labels
def load_data(self, segment_name):
data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0])
seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.pt')
try:
time_freq_tensor = torch.load(seg_path)
if self.standardize:
time_freq_tensor = self.standard_scaling(time_freq_tensor)
return time_freq_tensor.clone()
except Exception as e:
print(f"Error processing segment: {segment_name}. Exception: {str(e)}")
return torch.zeros((1, 128, 128))
def standard_scaling(self, data):
scaler = StandardScaler()
data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape)
return torch.Tensor(data)
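# Illustrative per-item access (a sketch; assumes the paths and UID lists defined further below):
# dataset = CustomDataset(data_path, labels_path, UIDs=['402'], standardize=True, data_format='csv', read_all_labels=False)
# item = dataset[0]
# item['data']          # time-frequency tensor, e.g. (1, 128, 128); zeros of that shape if the segment failed to load
# item['label'], item['segment_name']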
def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', read_all_labels=True, drop_last=False, num_workers=4):
dataset = CustomDataset(data_path, labels_path, UIDs, standardize=standardize, data_format=data_format, read_all_labels=read_all_labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2)
return dataloader
# To validate the size of the dataloader:
# number of examples: len(train_loader.dataset)
# number of batches: len(train_loader)
# ====== Load the per-subject arrhythmia summary ======
df_summary = pd.read_csv(r'\\grove.ad.uconn.edu\research\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv')
df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3)
df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT']
df_summary['sample_AF'] = df_summary['AF']
df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF'])
all_UIDs = df_summary['UID'].unique()
# ====================================================
# ====== AF trial separation ======
# R:\ENGR_Chon\Dong\Numbers\Pulsewatch_numbers\Fahimeh_CNNED_general_ExpertSystemwApplication\tbl_file_name\TrainingSet_final_segments
AF_trial_Fahimeh_train = ['402','410']
AF_trial_Fahimeh_test = ['301', '302', '305', '306', '307', '310', '311',
'312', '318', '319', '320', '321', '322', '324',
'325', '327', '329', '400', '406', '407', '409',
'414']
AF_trial_Fahimeh_did_not_use = ['405', '413', '415', '416', '420', '421', '422', '423']
AF_trial_paroxysmal_AF = ['408','419']
AF_trial_train = AF_trial_Fahimeh_train
AF_trial_test = AF_trial_Fahimeh_test
AF_trial_unlabeled = AF_trial_Fahimeh_did_not_use + AF_trial_paroxysmal_AF
print(f'AF trial: {len(AF_trial_train)} training subjects {AF_trial_train}')
print(f'AF trial: {len(AF_trial_test)} testing subjects {AF_trial_test}')
print(f'AF trial: {len(AF_trial_unlabeled)} unlabeled subjects {AF_trial_unlabeled}')
# =================================
# === Clinical trial AF subjects separation ===
clinical_trial_AF_subjects = ['005', '017', '026', '051', '075', '082']
remaining_UIDs = []
count_NSR = []
import math
for index, row in df_summary.iterrows():
UID = row['UID']
this_NSR = row['sample_nonAF']
if math.isnan(this_NSR):
# There is no segment in this subject, skip this UID.
print(f'---------UID {UID} has no segments.------------')
continue
if UID not in AF_trial_train and UID not in AF_trial_test and UID not in clinical_trial_AF_subjects \
and not UID[0] == '3' and not UID[0] == '4':
remaining_UIDs.append(UID)
count_NSR.append(this_NSR)
from numpy import random
random.seed(seed=42)
from numpy.random import choice
list_of_candidates = remaining_UIDs
number_of_items_to_pick = round(len(list_of_candidates) * 0.15) # 10% labeled for training, 5% for testing.
temp_sum = sum(count_NSR)
probability_distribution = [(1 - x / temp_sum) / (len(count_NSR) - 1) for x in count_NSR]  # Subjects with fewer segments get a higher selection probability; the weights sum to one.
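# Worked example of the weighting with hypothetical counts: count_NSR = [100, 300, 600] gives temp_sum = 1000
# and weights [(1 - 0.1)/2, (1 - 0.3)/2, (1 - 0.6)/2] = [0.45, 0.35, 0.20], which sum to one and favour
# subjects with fewer non-AF segments.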
draw = choice(list_of_candidates, number_of_items_to_pick,
p=probability_distribution, replace=False)
clinical_trial_train = list(draw[:round(len(list_of_candidates) * 0.1)])
clinical_trial_test_nonAF = list(draw[round(len(list_of_candidates) * 0.1):])
clinical_trial_test_temp = clinical_trial_test_nonAF + clinical_trial_AF_subjects
clinical_trial_test = []
for UID in clinical_trial_test_temp:
# UID 051 and maybe other UIDs had no segments (unknown reason).
if UID in all_UIDs:
clinical_trial_test.append(UID)
clinical_trial_unlabeled = []
for UID in all_UIDs:
if UID not in clinical_trial_train and UID not in clinical_trial_test and not UID[0] == '3' and not UID[0] == '4':
clinical_trial_unlabeled.append(UID)
print(f'Clinical trial: selected {len(clinical_trial_train)} UIDs for training {clinical_trial_train}')
print(f'Clinical trial: selected {len(clinical_trial_test)} UIDs for testing {clinical_trial_test}')
print(f'Clinical trial: selected {len(clinical_trial_unlabeled)} UIDs for unlabeled {clinical_trial_unlabeled}')
is_linux = False # Set to True if running on Linux, False if on Windows
is_hpc = False # Set to True if running on an HPC, False if on Windows
data_format = 'csv' # Choose 'csv' or 'png'
data_path, labels_path, saving_path = get_data_paths(data_format, is_linux=is_linux, is_hpc=is_hpc)
# clinical_trial_train = [clinical_trial_train[0]]
# clinical_trial_test = [clinical_trial_test[0]]
# clinical_trial_unlabeled = clinical_trial_unlabeled[0:4]
batch_size = 512
train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=True, data_format='csv', read_all_labels=False, drop_last=True)
val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=True, data_format='csv', read_all_labels=False, drop_last=True)
test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=True, data_format='csv', read_all_labels=False, drop_last=True)
# data_iter = iter(train_loader)
# data = next(data_iter)
# data["label"]
'''
Key Points:
Initialization: The script starts by initializing the data loaders for training, validation, and testing.
Initial Training: The model is initially trained on the training set.
Active Learning Loop: During each iteration of active learning, the script:
Performs uncertainty sampling on the validation set.
Updates the training loader with the uncertain samples.
Retrains the model with this updated training set.
Final Evaluation: After the active learning iterations, the model is evaluated on the test set to assess its performance on unseen data.
Notes:
Data Preprocessing: The script assumes that train_batch['data'] from your data loaders is already appropriately preprocessed for your model. If additional preprocessing is required, make sure to include it.
Active Learning Iterations: The number of active learning iterations (active_learning_iterations) and the number of samples per iteration (n_samples) are parameters you can adjust based on your scenario and dataset.
Test Evaluation: The final evaluation on the test set gives you an understanding of how well the model generalizes to new, unseen data.
'''
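# Minimal outline of the loop described above (illustrative; the full implementation follows below):
# model, likelihood = train_gp_model(train_x, train_y, val_loader)                    # initial training
# for _ in range(active_learning_iterations):
#     idx = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples, n_batches=5)
#     train_loader = update_train_loader_with_uncertain_samples(train_loader, idx, data_path, labels_path, batch_size)
#     model, likelihood = train_gp_model(train_x, train_y, val_loader)                # retrain on the updated set
# results['test_metrics'] = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes)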
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import gpytorch
from gpytorch.mlls import VariationalELBO
from gpytorch.kernels import RBFKernel, PeriodicKernel, ScaleKernel
from gpytorch.models import ExactGP
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.means import ConstantMean
from gpytorch.distributions import MultivariateNormal
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
# Assuming CustomDataset and data loader functions are defined as provided
def map_samples_to_uids(uncertain_sample_indices, dataset):
"""
Maps indices of uncertain samples back to their corresponding segment names or UIDs.
Args:
- uncertain_sample_indices: Indices of the uncertain samples in the dataset.
- dataset: The dataset object which contains the mapping of segment names and UIDs.
Returns:
- List of UIDs or segment names corresponding to the uncertain samples.
"""
return [dataset.segment_names[i] for i in uncertain_sample_indices]
def apply_tsne(data, n_components=2):
n_samples = data.shape[0]
perplexity = min(30, n_samples - 1) # Ensure perplexity is less than the number of samples
tsne = TSNE(n_components=n_components, perplexity=perplexity)
return tsne.fit_transform(data)
import torch
import gpytorch
from gpytorch.kernels import SpectralMixtureKernel
from gpytorch.models import AbstractVariationalGP
from gpytorch.variational import CholeskyVariationalDistribution, VariationalStrategy
from gpytorch.distributions import MultivariateNormal
import numpy as np
import torch
import gpytorch
import pandas as pd
from sklearn.cluster import KMeans
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_fscore_support
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.manifold import TSNE
num_latents = 6 # This should match the complexity of your data or the number of tasks
num_tasks = 4 # This should match the number of output classes or tasks
num_inducing_points = 50 # This is independent and should be sufficient for the input space
class MultitaskGPModel(gpytorch.models.ApproximateGP):
def __init__(self):
# Let's use a different set of inducing points for each latent function
inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128) # Assuming flattened 128x128 images
# We have to mark the CholeskyVariationalDistribution as batch
# so that we learn a variational distribution for each task
variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
inducing_points.size(-2), batch_shape=torch.Size([num_latents])
)
# We have to wrap the VariationalStrategy in a LMCVariationalStrategy
# so that the output will be a MultitaskMultivariateNormal rather than a batch output
variational_strategy = gpytorch.variational.LMCVariationalStrategy(
gpytorch.variational.VariationalStrategy(
self, inducing_points, variational_distribution, learn_inducing_locations=True
),
num_tasks=num_tasks,
num_latents=num_latents,
latent_dim=-1
)
super().__init__(variational_strategy)
# The mean and covariance modules should be marked as batch
# so we learn a different set of hyperparameters
self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents]))
self.covar_module = gpytorch.kernels.ScaleKernel(
gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])),
batch_shape=torch.Size([num_latents])
)
def forward(self, x):
# The forward function should be written as if we were dealing with each output
# dimension in batch
# Ensure x has the same last dimension as the inducing points (here 128*128 flattened pixels).
# The data loaders below already flatten each image before calling the model.
# print(f"Input shape: {x.shape}")
# x = x.view(x.size(0), -1) # Flattening the images
# print(f"Input shape after flattening: {x.shape}") # Debugging input shape
mean_x = self.mean_module(x)
covar_x = self.covar_module(x)
# Debugging: Print shapes of intermediate outputs
# print(f"Mean shape: {mean_x.shape}, Covariance shape: {covar_x.shape}")
latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
# print(f"Latent prediction shape: {latent_pred.mean.shape}, {latent_pred.covariance_matrix.shape}")
return latent_pred
# Usage
# model = MultitaskGPModel()
# likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=2, num_classes=4, mixing_weights = False)
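# Shape sanity check (illustrative sketch; assumes flattened 128x128 inputs as configured above):
# model = MultitaskGPModel()
# with torch.no_grad():
#     prior = model(torch.rand(8, 128 * 128))  # MultitaskMultivariateNormal over 8 inputs and num_tasks outputs
#     print(prior.mean.shape)                  # expected: torch.Size([8, 4]), i.e. (batch, num_tasks)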
from tqdm import tqdm
# Initialize result storage
results = {
'train_loss': [],
'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []},
'test_metrics': None # This will be filled in with the final test metrics
}
import torch
import gpytorch
from gpytorch.likelihoods import SoftmaxLikelihood
from gpytorch.functions import log_normal_cdf
def train_gp_model(train_x, train_y, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'):
model = MultitaskGPModel().to(device)
likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=4, num_classes=4).to(device)
model.train()
likelihood.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
best_val_loss = float('inf')
epochs_no_improve = 0
for i in tqdm(range(num_iterations), desc='Training', unit='iter', leave=False):
optimizer.zero_grad()
output = model(train_x)
loss = -mll(output, train_y)
scalar_loss = loss.sum() if loss.numel() > 1 else loss
scalar_loss.backward()
optimizer.step()
# Validation step
model.eval()
likelihood.eval()
with torch.no_grad():
val_loss = 0.0
for val_batch in val_loader:
val_x, val_y = val_batch['data'].view(val_batch['data'].size(0), -1).to(device), val_batch['label'].to(device)
val_output = model(val_x)
val_loss += -mll(val_output, val_y).item()
val_loss /= len(val_loader)
model.train()
likelihood.train()
# Early stopping and checkpointing based on validation loss
if val_loss < best_val_loss:
best_val_loss = val_loss
epochs_no_improve = 0
torch.save({'model_state_dict': model.state_dict(),
'likelihood_state_dict': likelihood.state_dict(),
'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
else:
epochs_no_improve += 1
if epochs_no_improve == patience:
print(f"Early stopping triggered at iteration {i+1}")
break
# Load the best model before return
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint['model_state_dict'])
likelihood.load_state_dict(checkpoint['likelihood_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
return model, likelihood
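# Illustrative call (the main block below does this once per training batch):
# train_x is a (batch, 128*128) float tensor of flattened TFS images; train_y holds integer labels in {0, ..., 3}.
# model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=50, n_classes=4)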
def parse_classification_report(report):
"""Parse a classification report into a dictionary of metrics."""
lines = report.split('\n')
# Assume the second-to-last line is the averaged row, e.g. "weighted avg  0.70  0.61  0.63  225"
main_metrics = lines[-2].split()
return {
'precision': float(main_metrics[2]),
'recall': float(main_metrics[3]),
'f1': float(main_metrics[4]),
'auc_roc': None  # AUC-ROC is not part of the classification report by default
}
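# Example of the assumed report format (second-to-last line of sklearn's classification_report):
#   "weighted avg       0.70      0.61      0.63       225"
# split() -> ['weighted', 'avg', '0.70', '0.61', '0.63', '225'], so indices 2, 3 and 4 hold precision, recall and f1.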
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
def evaluate_model_on_all_data(model, likelihood, data_loader, device, n_classes):
model.eval()
likelihood.eval()
all_predicted_labels = []
all_predicted_probs = []
all_test_labels = []
with torch.no_grad(), gpytorch.settings.fast_pred_var():
for i, batch in enumerate(data_loader):
test_data = batch['data'].view(batch['data'].size(0), -1).to(device)
test_labels = batch['label'].to(device)
# print(f"Test data shape before t-SNE: {test_data.shape}")
predictions = likelihood(model(test_data)).mean
# Debugging - check shape of predictions
# print(f"Predictions shape: {predictions.shape}")
predicted_labels = predictions.argmax(dim=0)
# Add debugging information
# print(f"Batch {i}: Predicted Labels Shape: {predicted_labels.shape}, Actual Labels Shape: {test_labels.shape}")
all_predicted_labels.append(predicted_labels.cpu().numpy())
all_predicted_probs.append(predictions.softmax(dim=-1).cpu().numpy())
all_test_labels.append(test_labels.cpu().numpy())
# Debug the accumulation of labels
current_predicted = np.concatenate(all_predicted_labels, axis=0)
current_actual = np.concatenate(all_test_labels, axis=0)
# print(f"After Batch {i}: Accumulated Predicted Labels: {current_predicted.shape[0]}, Accumulated Actual Labels: {current_actual.shape[0]}")
# Concatenate all batch results
all_predicted_labels = np.concatenate(all_predicted_labels, axis=0)
all_predicted_probs = np.concatenate(all_predicted_probs, axis=0)
all_test_labels = np.concatenate(all_test_labels, axis=0)
# Final check
# print(f"Final: Total Predicted Labels: {all_predicted_labels.shape[0]}, Total Actual Labels: {all_test_labels.shape[0]}")
# Verify if the shapes match before proceeding to calculate metrics
if all_predicted_labels.shape[0] != all_test_labels.shape[0]:
raise ValueError("Mismatch in the number of samples between predicted and actual labels")
# Compute overall evaluation metrics
precision, recall, f1, _ = precision_recall_fscore_support(all_test_labels, all_predicted_labels, average='macro')
# For AUC-ROC, use the accumulated predicted probabilities and one-hot encoded true labels
test_labels_one_hot = label_binarize(all_test_labels, classes=np.arange(n_classes))
auc_roc = roc_auc_score(test_labels_one_hot, all_predicted_probs, multi_class='ovr')
return {
'precision': precision,
'recall': recall,
'f1': f1,
'auc_roc': auc_roc
}
import random
def stochastic_uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_batches, n_components=2):
gp_model.eval()
gp_likelihood.eval()
uncertain_sample_indices = []
sampled_batch_ids = random.sample(range(len(val_loader)), n_batches)  # Randomly pick n_batches from val_loader
with torch.no_grad():
for batch_idx, batch in enumerate(val_loader):
if batch_idx not in sampled_batch_ids:
continue
# reduced_data = apply_tsne(batch['data'].reshape(batch['data'].size(0), -1), n_components=n_components)
# reduced_data_tensor = torch.Tensor(reduced_data).to(device)
reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
predictions = gp_likelihood(gp_model(reduced_data_tensor))
var = predictions.variance
top_indices = torch.argsort(-var.flatten())[:n_samples]
# Offset the within-batch indices so they refer to positions in the full dataset
uncertain_sample_indices.extend(batch_idx * val_loader.batch_size + idx.item() for idx in top_indices.cpu())
return uncertain_sample_indices[:n_samples]
# def uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_components=2):
# gp_model.eval()
# gp_likelihood.eval()
# uncertain_sample_indices = []
# with torch.no_grad():
# for batch_idx, batch in tqdm(enumerate(val_loader), desc='Uncertainty Sampling', unit='batch'):
# reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
# predictions = gp_likelihood(gp_model(reduced_data_tensor))
# var = predictions.variance
# top_indices = torch.argsort(-var.flatten())[:n_samples]
# batch_uncertain_indices = [batch_idx * val_loader.batch_size + idx for idx in top_indices]
# uncertain_sample_indices.extend(batch_uncertain_indices)
# return uncertain_sample_indices[:n_samples]
def label_samples(uncertain_samples, validation_data):
labels = [validation_data[sample_id]['label'] for sample_id in uncertain_samples]
return uncertain_samples, labels
def update_train_loader_with_uncertain_samples(current_train_loader, new_sample_indices, data_path, labels_path, batch_size, standardize=False, data_format='csv', read_all_labels=True):
# Extract current UIDs from the current_train_loader
current_dataset = current_train_loader.dataset
# Map new_samples back to their corresponding segment names or UIDs
new_uids = map_samples_to_uids(new_sample_indices, current_dataset)
# Add new UIDs to the current dataset and refresh it
current_dataset.add_uids(new_uids)
# Create new DataLoader with the updated dataset
updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=False)
return updated_train_loader
def plot_training_performance(train_loss, validation_metrics):
epochs = range(1, len(train_loss) + 1)
# Plot training loss
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs, train_loss, 'b-', label='Training Loss')
plt.title('Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
# Plot validation metrics (x-axis derived from the recorded metric history, which may differ in length from train_loss)
plt.subplot(1, 2, 2)
metric_epochs = range(1, len(validation_metrics['precision']) + 1)
plt.plot(metric_epochs, validation_metrics['precision'], 'r-', label='Precision')
plt.plot(metric_epochs, validation_metrics['recall'], 'g-', label='Recall')
plt.plot(metric_epochs, validation_metrics['f1'], 'b-', label='F1 Score')
plt.plot(metric_epochs, validation_metrics['auc_roc'], 'y-', label='AUC-ROC')
plt.title('Validation Metrics')
plt.xlabel('Epochs')
plt.ylabel('Metrics')
plt.legend()
plt.tight_layout()
plt.show()
def plot_results(results):
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(results['train_loss'], label='Train Loss')
plt.title('Training Loss Over Time')
plt.legend()
plt.subplot(1, 2, 2)
for metric in ['precision', 'recall', 'f1']:
plt.plot(results['validation_metrics'][metric], label=metric.title())
plt.title('Validation Metrics Over Time')
plt.legend()
plt.show()
test_metrics = results['test_metrics']
print("Test Metrics:")
print(f"Precision: {test_metrics['precision']}")
print(f"Recall: {test_metrics['recall']}")
print(f"F1 Score: {test_metrics['f1']}")
print(f"AUC-ROC: {test_metrics['auc_roc']}")
from sklearn.cluster import KMeans
# K-Means Validation Function
def kmeans_validation(model, data_loader, n_clusters, device):
model.eval()
all_data = []
all_predictions = []
with torch.no_grad():
for batch in data_loader:
data = batch['data'].view(batch['data'].size(0), -1).to(device)
labels = batch['label'].to(device)
predictions = model(data).mean.argmax(dim=-1).cpu().numpy()
all_data.extend(data.cpu().numpy())
all_predictions.extend(predictions)
all_data = np.array(all_data)
all_predictions = np.array(all_predictions)
# Perform KMeans clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(all_data)
cluster_labels = kmeans.labels_
# Check label consistency within each cluster
for i in range(n_clusters):
cluster_indices = np.where(cluster_labels == i)[0]
cluster_pred_labels = all_predictions[cluster_indices]
most_common_label = np.bincount(cluster_pred_labels).argmax()
# Compare most_common_label with actual labels in the cluster
# Adjust the model's predictions if necessary
return kmeans
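# Illustrative follow-up (a sketch, not wired into the pipeline): inside kmeans_validation one could map each
# cluster to its majority predicted label and flag samples whose GP prediction disagrees with that majority:
# cluster_to_label = {i: np.bincount(all_predictions[cluster_labels == i]).argmax() for i in range(n_clusters)}
# disagreement_mask = all_predictions != np.array([cluster_to_label[c] for c in cluster_labels])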
from sklearn.cluster import MiniBatchKMeans
def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100):
# Initialize MiniBatchKMeans
minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=batch_size)
# Iterate through data_loader and fit MiniBatchKMeans
for batch in data_loader:
data = batch['data'].view(batch['data'].size(0), -1).to(device).cpu().numpy()
minibatch_kmeans.partial_fit(data)
return minibatch_kmeans
# def compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, device):
# # Compare K-Means with GP model predictions
# all_data, all_labels = [], []
# for batch in data_loader:
# data = batch['data'].view(batch['data'].size(0), -1).to(device)
# labels = batch['label'].to(device)
# gp_predictions = gp_model(data).mean.argmax(dim=0).cpu().numpy()
# kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
# all_labels.append(labels.cpu().numpy())
# all_data.append((gp_predictions, kmeans_predictions))
# return all_data, np.concatenate(all_labels)
def stochastic_compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, n_batches, device):
all_data, all_labels = [], []
sampled_batches = random.sample(list(data_loader), n_batches) # Randomly sample n_batches from data_loader
for batch in sampled_batches:
data = batch['data'].view(batch['data'].size(0), -1).to(device)
labels = batch['label'].to(device)
gp_predictions = gp_model(data).mean.argmax(dim=0).cpu().numpy()
kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
all_labels.append(labels.cpu().numpy())
all_data.append((gp_predictions, kmeans_predictions))
return all_data, np.concatenate(all_labels)
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
def plot_comparative_results(gp_vs_kmeans_data, original_labels):
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
# Plot 1: Confusion Matrix for GP Predictions vs Original Labels
gp_predictions = [pair[0] for pair in gp_vs_kmeans_data]
gp_predictions = np.concatenate(gp_predictions)
cm_gp = confusion_matrix(original_labels, gp_predictions)
sns.heatmap(cm_gp, annot=True, ax=axes[0], fmt='g')
axes[0].set_title('GP Model Predictions vs Original Labels')
axes[0].set_xlabel('Predicted Labels')
axes[0].set_ylabel('True Labels')
# Plot 2: Confusion Matrix for K-Means Predictions vs Original Labels
kmeans_predictions = [pair[1] for pair in gp_vs_kmeans_data]
kmeans_predictions = np.concatenate(kmeans_predictions)
cm_kmeans = confusion_matrix(original_labels, kmeans_predictions)
sns.heatmap(cm_kmeans, annot=True, ax=axes[1], fmt='g')
axes[1].set_title('K-Means Predictions vs Original Labels')
axes[1].set_xlabel('Predicted Labels')
axes[1].set_ylabel('True Labels')
plt.tight_layout()
plt.show()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Main execution
if __name__ == "__main__":
# Define the number of output features and classes first
n_classes = 4 # Assuming you have 4 classes
# Initialize data loaders
train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=False, data_format='csv', read_all_labels=False, drop_last=True)
val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=False, data_format='csv', read_all_labels=True, drop_last=True)
test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=False, data_format='csv', read_all_labels=True, drop_last=True)
kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device)
# Initialize result storage
results = {
'train_loss': [],
'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []},
'test_metrics': None # This will be filled in with the final test metrics
}
# Initial model training
for train_batch in train_loader:
train_x = train_batch['data'].view(train_batch['data'].size(0), -1).to(device)
train_y = train_batch['label'].to(device)
model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=n_classes)
active_learning_iterations = 10
n_samples = batch_size # Number of uncertain samples to accumulate
for iteration in tqdm(range(active_learning_iterations), desc='Active Learning', unit='iteration', leave=True):
uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples, n_batches=5, n_components=2)
# Accumulate indices of uncertain samples
accumulated_indices = []
for idx in uncertain_sample_indices:
accumulated_indices.append(idx)
# Update the training loader with indices of uncertain samples
train_loader = update_train_loader_with_uncertain_samples(train_loader, accumulated_indices, data_path, labels_path, batch_size)
# Re-train the model with the updated train_loader
for train_batch in tqdm(train_loader, desc='Batch Training', leave=False):
train_x = train_batch['data'].view(train_batch['data'].size(0), -1).to(device) # Flatten the image
train_y = train_batch['label'].to(device)
model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=n_classes)
val_metrics = evaluate_model_on_all_data(model, likelihood, val_loader, device, n_classes)
for metric in ['precision', 'recall', 'f1', 'auc_roc']:
results['validation_metrics'][metric].append(val_metrics[metric])
# Compare K-Means with GP model predictions after retraining
gp_vs_kmeans_data, original_labels = stochastic_compare_kmeans_gp_predictions(kmeans_model, model, train_loader, n_batches=5, device=device)
plot_comparative_results(gp_vs_kmeans_data, original_labels)
plot_training_performance(results['train_loss'], results['validation_metrics'])
# Final evaluation on test set
classification_result = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes=n_classes)
# Store test metrics
results['test_metrics'] = classification_result
# Now results dictionary is ready to be used for plotting
plot_results(results)
# You might also want to print or log the final test metrics
print("Final Test Metrics:", results['test_metrics'])