NOTE(review): the line structure and indentation of this patch were restored
from a whitespace-collapsed copy. The placement of blank context lines and the
indentation of context lines are inferred -- confirm against the target files
before applying. Fixes made to the added ('+') lines:
  * ss_active_learning.py: the debug prints use names visible in the hunk
    (labels, all_labels, all_data, data_loader) instead of
    current_batch_labels / original_labels / expected_batch_size, which the
    hunk does not show being defined.
  * main_checkpoints_updated.py: imports come from utils_gp (matching the
    ss_main.py change in this same patch), and the metric loop writes to the
    'f1' key that the results dict actually declares while still reading
    training_metrics['f1_score'] (previously a KeyError on 'f1_score').
  * ss_main.py: the length print reports test_original_labels and
    test_gp_vs_kmeans_data, the values computed on the preceding line.
Hunk line counts are unchanged.

diff --git a/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc b/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc
index 45c9875..8d691ba 100644
Binary files a/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc and b/BML_project/active_learning/__pycache__/ss_active_learning.cpython-311.pyc differ
diff --git a/BML_project/active_learning/ss_active_learning.py b/BML_project/active_learning/ss_active_learning.py
index 4c44836..4442a34 100644
--- a/BML_project/active_learning/ss_active_learning.py
+++ b/BML_project/active_learning/ss_active_learning.py
@@ -57,6 +57,30 @@ def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100):
 
     return minibatch_kmeans
 
+""" from sklearn.cluster import MiniBatchKMeans
+import numpy as np
+
+def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100, n_init='auto'):
+    # Initialize MiniBatchKMeans with explicit n_init to suppress the FutureWarning
+    minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, n_init=n_init, random_state=0, batch_size=batch_size)
+
+    # Prepare an empty list to collect all data for fitting
+    all_data = []
+
+    # Iterate through data_loader and collect data
+    for batch in data_loader:
+        # Assuming 'data' is a key in your batch dict that contains the features
+        data = batch['data'].view(batch['data'].size(0), -1).cpu().numpy() # Adjust as necessary
+        all_data.append(data)
+
+    # Concatenate all data collected from the batches
+    all_data_np = np.concatenate(all_data, axis=0)
+
+    # Fit MiniBatchKMeans with all collected data at once
+    minibatch_kmeans.fit(all_data_np)
+
+    return minibatch_kmeans """
+
 # def compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, device):
 #     # Compare K-Means with GP model predictions
 #     all_data, all_labels = [], []
@@ -80,7 +104,9 @@ def stochastic_compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader
         kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
         all_labels.append(labels.cpu().numpy())
         all_data.append((gp_predictions, kmeans_predictions))
-    
+        print(f"Processed batch size: {len(labels)}, Cumulative original_labels size: {sum(len(batch_labels) for batch_labels in all_labels)}, Cumulative gp_predictions size: {sum(len(gp) for gp, _ in all_data)}")
+        if len(labels) < (getattr(data_loader, 'batch_size', None) or 0):
+            print(f"Last batch processed with size: {len(labels)}")
     return all_data, np.concatenate(all_labels)
 
 import random
diff --git a/BML_project/main_checkpoints_updated.py b/BML_project/main_checkpoints_updated.py
new file mode 100644
index 0000000..234291c
--- /dev/null
+++ b/BML_project/main_checkpoints_updated.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Feb 7 15:34:31 2024
+
+@author: lrm22005
+"""
+import os
+import tqdm
+import torch
+from utils_gp.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples
+from models.ss_gp_model import MultitaskGPModel, train_gp_model
+from utils_gp.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data
+from active_learning.ss_active_learning import stochastic_uncertainty_sampling, run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions, label_samples
+from utils_gp.visualization import plot_comparative_results, plot_training_performance, plot_results
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def main():
+    n_classes = 4
+    batch_size = 1024
+    clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled = split_uids()
+    data_format = 'pt'
+
+    train_loader, val_loader, test_loader = preprocess_data(data_format, clinical_trial_train, clinical_trial_test, clinical_trial_unlabeled, batch_size)
+
+    # Initialize result storage
+    results = {
+        'train_loss': [],
+        'validation_metrics': {'precision': [], 'recall': [], 'f1': [], 'auc_roc': []},
+        'active_learning': {'validation_metrics': []}, # Store validation metrics for each active learning iteration
+        'test_metrics': None
+    }
+
+    # Initial model training
+    model, likelihood, training_metrics = train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=n_classes, patience=10, checkpoint_path='model_checkpoint_full.pt')
+
+    # Save initial training metrics
+    results['train_loss'].extend(training_metrics['train_loss'])
+    for metric, source in [('precision', 'precision'), ('recall', 'recall'), ('f1', 'f1_score')]:
+        results['validation_metrics'][metric].extend(training_metrics[source])
+
+    active_learning_iterations = 10
+    for iteration in tqdm.tqdm(range(active_learning_iterations), desc='Active Learning', unit='iteration'):
+        uncertain_sample_indices = stochastic_uncertainty_sampling(model, likelihood, val_loader, n_samples=batch_size, device=device)
+        train_loader = update_train_loader_with_uncertain_samples(train_loader, uncertain_sample_indices, batch_size)
+
+        # Re-train the model with updated training data
+        model, likelihood, val_metrics = train_gp_model(train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, checkpoint_path='model_checkpoint_last.pt')
+
+        # Store validation metrics for each active learning iteration
+        results['active_learning']['validation_metrics'].append(val_metrics)
+
+    # Final evaluations
+    test_metrics = evaluate_model_on_all_data(model, likelihood, test_loader, device, n_classes)
+    results['test_metrics'] = test_metrics
+
+    # Visualization of results
+    plot_training_performance(results['train_loss'], results['validation_metrics'])
+    plot_results(results['test_metrics']) # Adjust this function to handle the structure of test_metrics
+
+    print("Final Test Metrics:", results['test_metrics'])
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc b/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc
index 7ea0e5a..75ee3d4 100644
Binary files a/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc and b/BML_project/models/__pycache__/ss_gp_model.cpython-311.pyc differ
diff --git a/BML_project/ss_main.py b/BML_project/ss_main.py
index a610684..b784ce4 100644
--- a/BML_project/ss_main.py
+++ b/BML_project/ss_main.py
@@ -6,11 +6,11 @@
 """
 import tqdm
 import torch
-from utils.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples
+from utils_gp.data_loader import preprocess_data, split_uids, update_train_loader_with_uncertain_samples
 from models.ss_gp_model import MultitaskGPModel, train_gp_model
 from utils_gp.ss_evaluation import stochastic_evaluation, evaluate_model_on_all_data
 from active_learning.ss_active_learning import stochastic_uncertainty_sampling, run_minibatch_kmeans, stochastic_compare_kmeans_gp_predictions
-from utils.visualization import plot_comparative_results, plot_training_performance, plot_results
+from utils_gp.visualization import plot_comparative_results, plot_training_performance, plot_results
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -50,6 +50,7 @@ def main():
 
         # Update the training loader with uncertain samples
         train_loader = update_train_loader_with_uncertain_samples(train_loader, uncertain_sample_indices, batch_size)
+        print(f"Updated training data size: {len(train_loader.dataset)}")
 
         # Re-train the model with the updated training data
         model, likelihood, val_metrics = train_gp_model(train_loader, val_loader, num_iterations=10, n_classes=n_classes, patience=10, checkpoint_path='model_checkpoint_last.pt')
@@ -71,6 +72,8 @@ def main():
     results['test_metrics'] = test_metrics
 
     test_gp_vs_kmeans_data, test_original_labels = stochastic_compare_kmeans_gp_predictions(test_kmeans_model, model, test_loader, n_batches=5, device=device)
+
+    print(f"Length of original_labels: {len(test_original_labels)}, Length of gp_predictions: {sum(len(gp) for gp, _ in test_gp_vs_kmeans_data)}")
     plot_comparative_results(test_gp_vs_kmeans_data, test_original_labels)
 
     # Visualization of results
diff --git a/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc b/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc
index 0b1a7eb..ed2626e 100644
Binary files a/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc and b/BML_project/utils_gp/__pycache__/data_loader.cpython-311.pyc differ
diff --git a/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc b/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc
index 14c7dfa..46b1836 100644
Binary files a/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc and b/BML_project/utils_gp/__pycache__/ss_evaluation.cpython-311.pyc differ
diff --git a/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc b/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc
index a8c3afd..f53ef75 100644
Binary files a/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc and b/BML_project/utils_gp/__pycache__/visualization.cpython-311.pyc differ