B_ML_Project/semisupervised_method.py
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 18 13:49:26 2023

@author: lrm22005
"""
import os
import logging

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from torch.distributions import MultivariateNormal
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchvision.transforms import ToTensor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score, davies_bouldin_score
def get_data_paths(data_format, is_linux=False, is_hpc=False):
    if is_linux:
        base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch"
        labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn"
        saving_base_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis"
    elif is_hpc:
        base_path = "/gpfs/scratchfs1/kic14002/doh16101"
        labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005"
        saving_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis"
    else:
        # Windows: raw strings so backslashes are not treated as escape sequences
        base_path = r"R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch"
        labels_base_path = r"R:\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn"
        saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case"
    if data_format == 'csv':
        data_path = os.path.join(base_path, "TFS_csv")
        labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm")
        saving_path = os.path.join(saving_base_path, "Project_1_analysis")
    elif data_format == 'png':
        data_path = os.path.join(base_path, "TFS_plots")
        labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm")
        saving_path = os.path.join(saving_base_path, "Project_1_analysis")
    else:
        raise ValueError("Invalid data format. Choose 'csv' or 'png'.")
    return data_path, labels_path, saving_path
class CustomDataset(Dataset):
    """Loads time-frequency segment tensors and their adjudicated labels, per subject UID."""
    def __init__(self, data_path, labels_path, UIDs, standardize=True, data_format='csv', read_all_labels=False):
        self.data_path = data_path
        self.labels_path = labels_path
        self.UIDs = UIDs
        self.standardize = standardize
        self.data_format = data_format
        self.read_all_labels = read_all_labels
        self.refresh_dataset()

    def refresh_dataset(self):
        self.segment_names, self.labels = self.extract_segment_names_and_labels()

    def add_uids(self, new_uids):
        unique_new_uids = [uid for uid in new_uids if uid not in self.UIDs]
        self.UIDs.extend(unique_new_uids)
        self.refresh_dataset()

    def __len__(self):
        return len(self.segment_names)

    def __getitem__(self, idx):
        segment_name = self.segment_names[idx]
        label = self.labels[segment_name]
        time_freq_tensor = self.load_data(segment_name)
        return {'data': time_freq_tensor, 'label': label, 'segment_name': segment_name}

    def extract_segment_names_and_labels(self):
        segment_names = []
        labels = {}
        for UID in self.UIDs:
            label_file = os.path.join(self.labels_path, UID + "_final_attemp_4_1_Dong.csv")
            if os.path.exists(label_file):
                label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label'])
                label_segment_names = label_data['segment'].apply(lambda x: x.split('.')[0])
                for idx, segment_name in enumerate(label_segment_names):
                    label_val = label_data['label'].values[idx]
                    if self.read_all_labels:
                        # Assign -1 if the label is not in [0, 1, 2, 3]
                        labels[segment_name] = label_val if label_val in [0, 1, 2, 3] else -1
                        if segment_name not in segment_names:
                            segment_names.append(segment_name)
                    else:
                        # Only add segments with labels in [0, 1, 2, 3]
                        if label_val in [0, 1, 2, 3] and segment_name not in segment_names:
                            segment_names.append(segment_name)
                            labels[segment_name] = label_val
        return segment_names, labels

    def load_data(self, segment_name):
        data_path_UID = os.path.join(self.data_path, segment_name.split('_')[0])
        seg_path = os.path.join(data_path_UID, segment_name + '_filt_STFT.pt')
        try:
            time_freq_tensor = torch.load(seg_path)
            if self.standardize:
                time_freq_tensor = self.standard_scaling(time_freq_tensor)
            return time_freq_tensor.clone()
        except Exception as e:
            print(f"Error processing segment: {segment_name}. Exception: {str(e)}")
            return torch.zeros((1, 128, 128))  # Fall back to a zero tensor on failure

    def standard_scaling(self, data):
        scaler = StandardScaler()
        data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape)
        return torch.Tensor(data)
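# A minimal usage sketch for CustomDataset (the UIDs below are hypothetical;
# it assumes the label CSVs and *_filt_STFT.pt tensors exist under the paths
# returned by get_data_paths):
# dataset = CustomDataset(data_path, labels_path, UIDs=['005', '017'])
# sample = dataset[0]
# print(sample['segment_name'], sample['label'], sample['data'].shape)  # e.g. torch.Size([1, 128, 128])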
def load_data_split_batched(data_path, labels_path, UIDs, batch_size, standardize=False, data_format='csv', read_all_labels=True, drop_last=False, num_workers=4):
    # Pass dataset options by keyword so read_all_labels cannot be bound to
    # CustomDataset's data_format parameter by position
    dataset = CustomDataset(data_path, labels_path, UIDs, standardize=standardize, data_format=data_format, read_all_labels=read_all_labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=drop_last, num_workers=num_workers, prefetch_factor=2)
    return dataloader
# To validate the dataloader sizes:
# number of batches: len(train_loader)
# number of examples: len(train_loader.dataset)
# ====== Load the per-subject arrhythmia summary ======
df_summary = pd.read_csv(r'\\grove.ad.uconn.edu\research\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm_summary_20231025.csv')
df_summary['UID'] = df_summary['UID'].astype(str).str.zfill(3)
df_summary['sample_nonAF'] = df_summary['NSR'] + df_summary['PACPVC'] + df_summary['SVT']
df_summary['sample_AF'] = df_summary['AF']
df_summary['sample_nonAF_ratio'] = df_summary['sample_nonAF'] / (df_summary['sample_AF'] + df_summary['sample_nonAF'])
all_UIDs = df_summary['UID'].unique()
# ====================================================
# ====== AF trial separation ======
# R:\ENGR_Chon\Dong\Numbers\Pulsewatch_numbers\Fahimeh_CNNED_general_ExpertSystemwApplication\tbl_file_name\TrainingSet_final_segments
AF_trial_Fahimeh_train = ['402', '410']
AF_trial_Fahimeh_test = ['301', '302', '305', '306', '307', '310', '311',
                         '312', '318', '319', '320', '321', '322', '324',
                         '325', '327', '329', '400', '406', '407', '409',
                         '414']
AF_trial_Fahimeh_did_not_use = ['405', '413', '415', '416', '420', '421', '422', '423']
AF_trial_paroxysmal_AF = ['408', '419']
AF_trial_train = AF_trial_Fahimeh_train
AF_trial_test = AF_trial_Fahimeh_test
AF_trial_unlabeled = AF_trial_Fahimeh_did_not_use + AF_trial_paroxysmal_AF
print(f'AF trial: {len(AF_trial_train)} training subjects {AF_trial_train}')
print(f'AF trial: {len(AF_trial_test)} testing subjects {AF_trial_test}')
print(f'AF trial: {len(AF_trial_unlabeled)} unlabeled subjects {AF_trial_unlabeled}')
# =================================
# === Clinical trial AF subjects separation ===
clinical_trial_AF_subjects = ['005', '017', '026', '051', '075', '082']

import math
remaining_UIDs = []
count_NSR = []
for index, row in df_summary.iterrows():
    UID = row['UID']
    this_NSR = row['sample_nonAF']
    if math.isnan(this_NSR):
        # There are no segments for this subject; skip this UID.
        print(f'---------UID {UID} has no segments.------------')
        continue
    if UID not in AF_trial_train and UID not in AF_trial_test and UID not in clinical_trial_AF_subjects \
            and not UID[0] == '3' and not UID[0] == '4':
        remaining_UIDs.append(UID)
        count_NSR.append(this_NSR)

from numpy.random import choice
np.random.seed(42)
list_of_candidates = remaining_UIDs
number_of_items_to_pick = round(len(list_of_candidates) * 0.15)  # 10% labeled for training, 5% for testing.
temp_sum = sum(count_NSR)
# Subjects with fewer segments have a higher chance of being selected; the weights sum to one.
probability_distribution = [(1 - x / temp_sum) / (len(count_NSR) - 1) for x in count_NSR]
draw = choice(list_of_candidates, number_of_items_to_pick,
              p=probability_distribution, replace=False)
clinical_trial_train = list(draw[:round(len(list_of_candidates) * 0.1)])
clinical_trial_test_nonAF = list(draw[round(len(list_of_candidates) * 0.1):])
clinical_trial_test_temp = clinical_trial_test_nonAF + clinical_trial_AF_subjects
clinical_trial_test = []
for UID in clinical_trial_test_temp:
    # UID 051 and possibly other UIDs had no segments (unknown reason).
    if UID in all_UIDs:
        clinical_trial_test.append(UID)
clinical_trial_unlabeled = []
for UID in all_UIDs:
    if UID not in clinical_trial_train and UID not in clinical_trial_test and not UID[0] == '3' and not UID[0] == '4':
        clinical_trial_unlabeled.append(UID)
print(f'Clinical trial: selected {len(clinical_trial_train)} UIDs for training {clinical_trial_train}')
print(f'Clinical trial: selected {len(clinical_trial_test)} UIDs for testing {clinical_trial_test}')
print(f'Clinical trial: selected {len(clinical_trial_unlabeled)} UIDs for unlabeled {clinical_trial_unlabeled}')
is_linux = False  # Set to True if running on Linux, False if on Windows
is_hpc = False  # Set to True if running on an HPC, False if on Windows
data_format = 'csv'  # Choose 'csv' or 'png'
data_path, labels_path, saving_path = get_data_paths(data_format, is_linux=is_linux, is_hpc=is_hpc)

# clinical_trial_train = [clinical_trial_train[0]]
# clinical_trial_test = [clinical_trial_test[0]]
# clinical_trial_unlabeled = clinical_trial_unlabeled[0:4]

batch_size = 512
train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=True, data_format='csv', read_all_labels=False, drop_last=True)
val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=True, data_format='csv', read_all_labels=False, drop_last=True)
test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=True, data_format='csv', read_all_labels=False, drop_last=True)

# data_iter = iter(train_loader)
# data = next(data_iter)
# data["label"]
'''
Key points:
Initialization: The script starts by initializing the data loaders for training, validation, and testing.
Initial training: The model is first trained on the training set.
Active learning loop: During each iteration of active learning, the script:
    - performs uncertainty sampling on the validation set,
    - updates the training loader with the uncertain samples,
    - retrains the model with the updated training set.
Final evaluation: After the active learning iterations, the model is evaluated on the test set to assess its performance on unseen data.

Notes:
Data preprocessing: The script assumes that train_batch['data'] from the data loaders is already appropriately preprocessed for the model. If additional preprocessing is required, make sure to include it.
Active learning iterations: The number of active learning iterations (active_learning_iterations) and the number of samples per iteration (n_samples) are parameters to adjust for your scenario and dataset.
Test evaluation: The final evaluation on the test set indicates how well the model generalizes to new, unseen data.
'''
import gpytorch
from gpytorch.mlls import VariationalELBO, ExactMarginalLogLikelihood
from gpytorch.kernels import RBFKernel, PeriodicKernel, ScaleKernel
from gpytorch.models import ExactGP
from gpytorch.means import ConstantMean
from gpytorch.distributions import MultivariateNormal
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
# Assuming CustomDataset and the data loader functions are defined as provided
def map_samples_to_uids(uncertain_sample_indices, dataset):
    """
    Maps indices of uncertain samples back to the UIDs of their subjects.

    Args:
    - uncertain_sample_indices: Indices of the uncertain samples in the dataset.
    - dataset: The dataset object, which holds the mapping of segment names and UIDs.

    Returns:
    - List of UIDs corresponding to the uncertain samples.
    """
    # Segment names are prefixed with the subject UID (see CustomDataset.load_data),
    # and add_uids() expects UIDs, so strip the segment suffix here
    return [dataset.segment_names[i].split('_')[0] for i in uncertain_sample_indices]
def apply_tsne(data, n_components=2):
    n_samples = data.shape[0]
    perplexity = min(30, n_samples - 1)  # Perplexity must be less than the number of samples
    tsne = TSNE(n_components=n_components, perplexity=perplexity)
    return tsne.fit_transform(data)
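# Illustrative sketch of apply_tsne on random data standing in for flattened
# TFS segments (the shapes here are assumptions, not pipeline values):
# dummy = np.random.rand(200, 128 * 128)
# embedded = apply_tsne(dummy, n_components=2)
# print(embedded.shape)  # (200, 2)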
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import label_binarize
num_latents = 6  # Should match the complexity of the data / the number of tasks
num_tasks = 4  # Should match the number of output classes or tasks
num_inducing_points = 50  # Independent of the above; should be sufficient to cover the input space
class MultitaskGPModel(gpytorch.models.ApproximateGP):
    def __init__(self):
        # Use a different set of inducing points for each latent function,
        # assuming flattened 128x128 images as input
        inducing_points = torch.rand(num_latents, num_inducing_points, 128 * 128)

        # Mark the CholeskyVariationalDistribution as batch so that a
        # variational distribution is learned for each latent function
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            inducing_points.size(-2), batch_shape=torch.Size([num_latents])
        )

        # Wrap the VariationalStrategy in an LMCVariationalStrategy so the
        # output is a MultitaskMultivariateNormal rather than a batch output
        variational_strategy = gpytorch.variational.LMCVariationalStrategy(
            gpytorch.variational.VariationalStrategy(
                self, inducing_points, variational_distribution, learn_inducing_locations=True
            ),
            num_tasks=num_tasks,
            num_latents=num_latents,
            latent_dim=-1
        )
        super().__init__(variational_strategy)

        # The mean and covariance modules are batched so that a different set
        # of hyperparameters is learned per latent function
        self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents]))
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])),
            batch_shape=torch.Size([num_latents])
        )

    def forward(self, x):
        # x must already be flattened to shape [N, 128 * 128], matching the
        # last dimension of the inducing points
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
# Usage
# model = MultitaskGPModel()
# likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=num_tasks, num_classes=num_tasks, mixing_weights=False)
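# Shape sanity check for the model above, on random stand-in data (sketch
# only; the batch size of 8 is arbitrary):
# model = MultitaskGPModel()
# x = torch.rand(8, 128 * 128)  # 8 flattened 128x128 segments
# out = model(x)  # a MultitaskMultivariateNormal
# print(out.mean.shape)  # expected: torch.Size([8, 4]), one column per task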
# Initialize result storage
results = {
    'train_loss': [],
    'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []},
    'test_metrics': None  # Filled in with the final test metrics
}
def train_gp_model(train_x, train_y, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'):
    model = MultitaskGPModel().to(device)
    likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=n_classes, num_classes=n_classes).to(device)
    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

    best_val_loss = float('inf')
    epochs_no_improve = 0

    for i in tqdm(range(num_iterations), desc='Training', unit='iter', leave=False):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        scalar_loss = loss.sum() if loss.numel() > 1 else loss
        scalar_loss.backward()
        optimizer.step()

        # Validation step
        model.eval()
        likelihood.eval()
        with torch.no_grad():
            val_loss = 0.0
            for val_batch in val_loader:
                val_x = val_batch['data'].view(val_batch['data'].size(0), -1).to(device)
                val_y = val_batch['label'].to(device)
                val_output = model(val_x)
                val_loss += -mll(val_output, val_y).sum().item()
        val_loss /= len(val_loader)
        model.train()
        likelihood.train()

        # Early stopping and checkpointing based on validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save({'model_state_dict': model.state_dict(),
                        'likelihood_state_dict': likelihood.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print(f"Early stopping triggered at iteration {i+1}")
                break

    # Load the best checkpoint before returning
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    likelihood.load_state_dict(checkpoint['likelihood_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return model, likelihood
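# Sketch of a single call to train_gp_model, assuming train_loader, val_loader
# and device are defined as in the main block below:
# batch = next(iter(train_loader))
# train_x = batch['data'].view(batch['data'].size(0), -1).to(device)
# train_y = batch['label'].to(device)
# model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=4)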
def parse_classification_report(report):
    """Parse the macro-average row of an sklearn classification report into a dict of metrics."""
    lines = [line for line in report.split('\n') if line.strip()]
    # The row has the form: "macro avg  <precision>  <recall>  <f1-score>  <support>"
    macro_row = next(line for line in lines if line.strip().startswith('macro avg')).split()
    return {
        'precision': float(macro_row[2]),
        'recall': float(macro_row[3]),
        'f1': float(macro_row[4]),
        'auc_roc': None  # AUC-ROC is not part of the classification report by default
    }
from sklearn.metrics import classification_report
def evaluate_model_on_all_data(model, likelihood, data_loader, device, n_classes):
    model.eval()
    likelihood.eval()
    all_predicted_labels = []
    all_test_labels = []
    all_class_probs = []
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        for batch in data_loader:
            test_data = batch['data'].view(batch['data'].size(0), -1).to(device)
            test_labels = batch['label'].to(device)
            # SoftmaxLikelihood returns a Categorical whose probs carry a leading
            # sample dimension; averaging it out yields per-class probabilities
            # of shape [batch, n_classes]
            class_probs = likelihood(model(test_data)).probs.mean(0)
            predicted_labels = class_probs.argmax(dim=-1)
            all_predicted_labels.append(predicted_labels.cpu().numpy())
            all_test_labels.append(test_labels.cpu().numpy())
            all_class_probs.append(class_probs.cpu().numpy())
    # Concatenate all batch results
    all_predicted_labels = np.concatenate(all_predicted_labels, axis=0)
    all_test_labels = np.concatenate(all_test_labels, axis=0)
    all_class_probs = np.concatenate(all_class_probs, axis=0)
    # Verify that the shapes match before calculating metrics
    if all_predicted_labels.shape[0] != all_test_labels.shape[0]:
        raise ValueError("Mismatch in the number of samples between predicted and actual labels")
    # Compute overall evaluation metrics
    precision, recall, f1, _ = precision_recall_fscore_support(all_test_labels, all_predicted_labels, average='macro')
    # AUC-ROC needs the predicted probabilities and one-hot encoded true labels
    test_labels_one_hot = label_binarize(all_test_labels, classes=np.arange(n_classes))
    auc_roc = roc_auc_score(test_labels_one_hot, all_class_probs, multi_class='ovr')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc_roc': auc_roc
    }
import random

def stochastic_uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_batches, n_components=2):
    gp_model.eval()
    gp_likelihood.eval()
    uncertain_sample_indices = []
    # Sample batch positions rather than the batches themselves, so in-batch
    # indices can be mapped back to dataset-level indices (val_loader uses
    # shuffle=False, so the batch order is deterministic)
    sampled_batch_ids = set(random.sample(range(len(val_loader)), n_batches))
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            if batch_idx not in sampled_batch_ids:
                continue
            data = batch['data'].view(batch['data'].size(0), -1).to(device)
            # Use the latent GP's predictive variance, summed over tasks,
            # as the per-sample uncertainty score
            var = gp_model(data).variance.sum(dim=-1)
            top_indices = torch.argsort(-var.flatten())[:n_samples]
            offset = batch_idx * val_loader.batch_size
            uncertain_sample_indices.extend((offset + top_indices).cpu().numpy())
    return uncertain_sample_indices[:n_samples]
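# Sketch of one sampling round, assuming a trained model/likelihood and the
# val_loader from the main block below; the returned indices are positions
# in val_loader.dataset:
# uncertain = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=32, n_batches=5)
# print(len(uncertain), uncertain[:5])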
def label_samples(uncertain_samples, validation_data):
    labels = [validation_data[sample_id]['label'] for sample_id in uncertain_samples]
    return uncertain_samples, labels

def update_train_loader_with_uncertain_samples(current_train_loader, new_sample_indices, source_dataset, batch_size, standardize=False, data_format='csv', read_all_labels=True):
    # Extract the current dataset from current_train_loader
    current_dataset = current_train_loader.dataset
    # The indices come from uncertainty sampling on the validation set, so map
    # them through the validation (source) dataset to recover subject UIDs
    new_uids = map_samples_to_uids(new_sample_indices, source_dataset)
    # Add the new UIDs to the current dataset and refresh it
    current_dataset.add_uids(new_uids)
    # Create a new DataLoader with the updated dataset
    updated_train_loader = DataLoader(current_dataset, batch_size=batch_size, shuffle=False)
    return updated_train_loader
def plot_training_performance(train_loss, validation_metrics):
    plt.figure(figsize=(14, 6))
    # Plot training loss (the list may be empty if losses were not recorded)
    plt.subplot(1, 2, 1)
    plt.plot(range(1, len(train_loss) + 1), train_loss, 'b-', label='Training Loss')
    plt.title('Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    # Plot validation metrics, each against its own number of recorded epochs
    plt.subplot(1, 2, 2)
    for metric, style, name in [('precision', 'r-', 'Precision'), ('recall', 'g-', 'Recall'),
                                ('f1', 'b-', 'F1 Score'), ('auc_roc', 'y-', 'AUC-ROC')]:
        values = validation_metrics[metric]
        plt.plot(range(1, len(values) + 1), values, style, label=name)
    plt.title('Validation Metrics')
    plt.xlabel('Epochs')
    plt.ylabel('Metrics')
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_results(results):
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(results['train_loss'], label='Train Loss')
    plt.title('Training Loss Over Time')
    plt.legend()
    plt.subplot(1, 2, 2)
    for metric in ['precision', 'recall', 'f1']:
        plt.plot(results['validation_metrics'][metric], label=metric.title())
    plt.title('Validation Metrics Over Time')
    plt.legend()
    plt.show()

    test_metrics = results['test_metrics']
    print("Test Metrics:")
    print(f"Precision: {test_metrics['precision']}")
    print(f"Recall: {test_metrics['recall']}")
    print(f"F1 Score: {test_metrics['f1']}")
    print(f"AUC-ROC: {test_metrics['auc_roc']}")
# K-Means validation function
def kmeans_validation(model, data_loader, n_clusters, device):
    model.eval()
    all_data = []
    all_predictions = []
    with torch.no_grad():
        for batch in data_loader:
            data = batch['data'].view(batch['data'].size(0), -1).to(device)
            predictions = model(data).mean.argmax(dim=-1).cpu().numpy()
            all_data.extend(data.cpu().numpy())
            all_predictions.extend(predictions)
    all_data = np.array(all_data)
    all_predictions = np.array(all_predictions)
    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(all_data)
    cluster_labels = kmeans.labels_
    # Check label consistency within each cluster
    for i in range(n_clusters):
        cluster_indices = np.where(cluster_labels == i)[0]
        if cluster_indices.size == 0:
            continue  # Guard against empty clusters
        cluster_pred_labels = all_predictions[cluster_indices]
        most_common_label = np.bincount(cluster_pred_labels).argmax()
        # Compare most_common_label with the actual labels in the cluster
        # and adjust the model's predictions if necessary
    return kmeans
def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100):
    # Initialize MiniBatchKMeans
    minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=batch_size)
    # Iterate through the data_loader and fit MiniBatchKMeans incrementally
    for batch in data_loader:
        data = batch['data'].view(batch['data'].size(0), -1).cpu().numpy()
        minibatch_kmeans.partial_fit(data)
    return minibatch_kmeans
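# Sketch: fit MiniBatchKMeans over a loader, then assign clusters to one batch
# (assumes train_loader and device are defined as in the main block below):
# km = run_minibatch_kmeans(train_loader, n_clusters=4, device=device)
# batch = next(iter(train_loader))
# print(km.predict(batch['data'].view(batch['data'].size(0), -1).numpy())[:10])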
def stochastic_compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, n_batches, device):
    all_data, all_labels = [], []
    sampled_batches = random.sample(list(data_loader), n_batches)  # Randomly sample n_batches from data_loader
    with torch.no_grad():
        for batch in sampled_batches:
            data = batch['data'].view(batch['data'].size(0), -1).to(device)
            labels = batch['label'].to(device)
            # argmax over the task dimension yields one predicted class per sample
            gp_predictions = gp_model(data).mean.argmax(dim=-1).cpu().numpy()
            kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            all_data.append((gp_predictions, kmeans_predictions))
    return all_data, np.concatenate(all_labels)
from sklearn.metrics import confusion_matrix

def plot_comparative_results(gp_vs_kmeans_data, original_labels):
    fig, axes = plt.subplots(1, 2, figsize=(14, 7))
    # Plot 1: Confusion matrix for GP predictions vs original labels
    gp_predictions = np.concatenate([pair[0] for pair in gp_vs_kmeans_data])
    cm_gp = confusion_matrix(original_labels, gp_predictions)
    sns.heatmap(cm_gp, annot=True, ax=axes[0], fmt='g')
    axes[0].set_title('GP Model Predictions vs Original Labels')
    axes[0].set_xlabel('Predicted Labels')
    axes[0].set_ylabel('True Labels')
    # Plot 2: Confusion matrix for K-Means predictions vs original labels
    kmeans_predictions = np.concatenate([pair[1] for pair in gp_vs_kmeans_data])
    cm_kmeans = confusion_matrix(original_labels, kmeans_predictions)
    sns.heatmap(cm_kmeans, annot=True, ax=axes[1], fmt='g')
    axes[1].set_title('K-Means Predictions vs Original Labels')
    axes[1].set_xlabel('Predicted Labels')
    axes[1].set_ylabel('True Labels')
    plt.tight_layout()
    plt.show()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Main execution
if __name__ == "__main__":
    # Define the number of output classes first
    n_classes = 4

    # Initialize data loaders
    train_loader = load_data_split_batched(data_path, labels_path, clinical_trial_train, batch_size, standardize=False, data_format='csv', read_all_labels=False, drop_last=True)
    val_loader = load_data_split_batched(data_path, labels_path, clinical_trial_test, batch_size, standardize=False, data_format='csv', read_all_labels=True, drop_last=True)
    test_loader = load_data_split_batched(data_path, labels_path, clinical_trial_unlabeled, batch_size, standardize=False, data_format='csv', read_all_labels=True, drop_last=True)

    kmeans_model = run_minibatch_kmeans(train_loader, n_clusters=n_classes, device=device)

    # Initialize result storage
    results = {
        'train_loss': [],
        'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []},
        'test_metrics': None  # Filled in with the final test metrics
    }

    # Initial model training. Note that train_gp_model trains from scratch on
    # each call, so after this loop the model reflects the last batch only.
    for train_batch in train_loader:
        train_x = train_batch['data'].view(train_batch['data'].size(0), -1).to(device)
        train_y = train_batch['label'].to(device)
        model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=n_classes)

    active_learning_iterations = 10
    n_samples = batch_size  # Number of uncertain samples to accumulate per iteration
    for iteration in tqdm(range(active_learning_iterations), desc='Active Learning', unit='iteration', leave=True):
        # Collect indices of uncertain validation samples
        uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples, n_batches=5, n_components=2)
        # Update the training loader with the subjects of the uncertain samples
        train_loader = update_train_loader_with_uncertain_samples(train_loader, uncertain_sample_indices, val_loader.dataset, batch_size)
        # Re-train the model with the updated train_loader
        for train_batch in tqdm(train_loader, desc='Batch Training', leave=False):
            train_x = train_batch['data'].view(train_batch['data'].size(0), -1).to(device)  # Flatten the images
            train_y = train_batch['label'].to(device)
            model, likelihood = train_gp_model(train_x, train_y, val_loader, num_iterations=10, n_classes=n_classes)
        val_metrics = evaluate_model_on_all_data(model, likelihood, val_loader, device, n_classes)
        for metric in ['precision', 'recall', 'f1', 'auc_roc']:
            results['validation_metrics'][metric].append(val_metrics[metric])
        # Compare K-Means with GP model predictions after retraining
        gp_vs_kmeans_data, original_labels = stochastic_compare_kmeans_gp_predictions(kmeans_model, model, train_loader, n_batches=5, device=device)
        plot_comparative_results(gp_vs_kmeans_data, original_labels)
        plot_training_performance(results['train_loss'], results['validation_metrics'])

    # Final evaluation on the test set
    classification_result = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes=n_classes)
    results['test_metrics'] = classification_result

    # The results dictionary is now ready for plotting
    plot_results(results)
    print("Final Test Metrics:", results['test_metrics'])