Skip to content

Commit

Permalink
Model Gaussian Variational Process
Browse files Browse the repository at this point in the history
Add the final version of the model with semi-supervised and supervised methods.

Co-Authored-By: Dong Han <dong.han@uconn.edu>
  • Loading branch information
lrm22005 and doh16101 committed Jan 18, 2024
1 parent 61fb8ee commit 9388a46
Show file tree
Hide file tree
Showing 14 changed files with 1,431 additions and 27 deletions.
Binary file not shown.
120 changes: 120 additions & 0 deletions BML_project/active_learning/ss_active_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 18 18:23:23 2023
@author: lrm22005
"""
import numpy as np
import random
import torch
from torch.utils.data import DataLoader
from sklearn.cluster import MiniBatchKMeans

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def label_samples(uncertain_samples, validation_data):
    """Look up the stored label of every selected sample.

    Args:
        uncertain_samples: Iterable of keys/indices into ``validation_data``.
        validation_data: Indexable container whose items expose a
            ``'label'`` entry.

    Returns:
        A tuple ``(uncertain_samples, labels)``: the first argument is passed
        through unchanged and ``labels`` lists the matching labels in the
        same order.
    """
    labels = []
    for sample_id in uncertain_samples:
        record = validation_data[sample_id]
        labels.append(record['label'])
    return uncertain_samples, labels

def stochastic_uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_batches, n_components=2):
    """Pick high-variance samples from a random subset of validation batches.

    Args:
        gp_model: GP model; switched to eval mode and called on the flattened
            batch data.
        gp_likelihood: Likelihood applied to the model output; the result
            must expose a ``variance`` attribute.
        val_loader: Iterable of batches, each a mapping with a ``'data'``
            tensor whose first dimension is the batch dimension.
        n_samples: Maximum number of indices to return.
        n_batches: Number of batches to draw at random. Values larger than
            the number of available batches are clamped instead of raising.
        n_components: Unused; kept so existing callers keep working.

    Returns:
        A list with at most ``n_samples`` indices.
    """
    gp_model.eval()
    gp_likelihood.eval()
    uncertain_sample_indices = []

    # Materialise the loader once so its length is known, then clamp:
    # random.sample raises ValueError when asked for more items than exist.
    all_batches = list(val_loader)
    n_batches = min(n_batches, len(all_batches))
    sampled_batches = random.sample(all_batches, n_batches)

    with torch.no_grad():
        for batch in sampled_batches:
            reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
            predictions = gp_likelihood(gp_model(reduced_data_tensor))
            var = predictions.variance
            # Largest variance first.
            top_indices = torch.argsort(-var.flatten())[:n_samples]
            uncertain_sample_indices.extend(top_indices.cpu().numpy())

    # NOTE(review): the indices are positions inside each (flattened) batch
    # variance tensor, not dataset indices, and the truncation below keeps
    # the first sampled batch's entries first. Confirm this is what the
    # caller expects before using them to index the dataset.
    return uncertain_sample_indices[:n_samples]

# def uncertainty_sampling(gp_model, gp_likelihood, val_loader, n_samples, n_components=2):
# gp_model.eval()
# gp_likelihood.eval()
# uncertain_sample_indices = []
# with torch.no_grad():
# for batch_idx, batch in tqdm(enumerate(val_loader), desc='Uncertainty Sampling', unit='batch'):
# reduced_data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
# predictions = gp_likelihood(gp_model(reduced_data_tensor))
# var = predictions.variance
# top_indices = torch.argsort(-var.flatten())[:n_samples]
# batch_uncertain_indices = [batch_idx * val_loader.batch_size + idx for idx in top_indices]
# uncertain_sample_indices.extend(batch_uncertain_indices)
# return uncertain_sample_indices[:n_samples]

def run_minibatch_kmeans(data_loader, n_clusters, device, batch_size=100):
    """Fit a MiniBatchKMeans model incrementally over the batches of a loader.

    Args:
        data_loader: Iterable of batches, each a mapping with a ``'data'``
            tensor whose first dimension is the batch dimension.
        n_clusters: Number of clusters passed to ``MiniBatchKMeans``.
        device: Kept for backward compatibility. The clustering runs on
            NumPy arrays on the CPU, so the data is no longer copied to this
            device only to be copied straight back.
        batch_size: ``batch_size`` passed to ``MiniBatchKMeans``.

    Returns:
        The ``MiniBatchKMeans`` instance after ``partial_fit`` has been
        called on every batch (unfitted if the loader yields nothing).
    """
    minibatch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, batch_size=batch_size)

    for batch in data_loader:
        # Flatten each sample to one row; .cpu() is a no-op for CPU tensors.
        data = batch['data'].view(batch['data'].size(0), -1).cpu().numpy()
        minibatch_kmeans.partial_fit(data)

    # NOTE(review thread, doh16101, 2024-01-22): an unrelated
    # "OSError: .../torchvision/video_reader.so: undefined symbol" seen while
    # running this commit was resolved by reinstalling torchvision (0.16.2).
    return minibatch_kmeans

# def compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, device):
# # Compare K-Means with GP model predictions
# all_data, all_labels = [], []
# for batch in data_loader:
# data = batch['data'].view(batch['data'].size(0), -1).to(device)
# labels = batch['label'].to(device)
# gp_predictions = gp_model(data).mean.argmax(dim=0).cpu().numpy()
# kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
# all_labels.append(labels.cpu().numpy())
# all_data.append((gp_predictions, kmeans_predictions))
# return all_data, np.concatenate(all_labels)

def stochastic_compare_kmeans_gp_predictions(kmeans_model, gp_model, data_loader, n_batches, device):
    """Collect GP and K-Means predictions on a random subset of batches.

    Args:
        kmeans_model: Object with a ``predict(ndarray)`` method.
        gp_model: Callable whose output exposes a ``mean`` tensor with one
            row per sample (class/task scores along the last dimension).
        data_loader: Iterable of batches, each a mapping with ``'data'`` and
            ``'label'`` tensors.
        n_batches: Number of batches to draw at random; clamped to the
            number of available batches.
        device: Device the batch tensors are moved to before the GP call.

    Returns:
        ``(all_data, labels)`` where ``all_data`` is a list of
        ``(gp_predictions, kmeans_predictions)`` array pairs, one per sampled
        batch, and ``labels`` is a single array of the concatenated labels
        (empty when no batch was sampled).
    """
    all_data, all_labels = [], []

    batches = list(data_loader)
    # random.sample raises ValueError when asked for more items than exist.
    n_batches = min(n_batches, len(batches))
    sampled_batches = random.sample(batches, n_batches)

    # Inference only: without no_grad the GP output tracks gradients and
    # Tensor.numpy() refuses to convert it.
    with torch.no_grad():
        for batch in sampled_batches:
            data = batch['data'].view(batch['data'].size(0), -1).to(device)
            labels = batch['label'].to(device)
            # One prediction per sample: reduce over the last (class) axis,
            # not over the sample axis.
            gp_predictions = gp_model(data).mean.argmax(dim=-1).cpu().numpy()
            kmeans_predictions = kmeans_model.predict(data.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            all_data.append((gp_predictions, kmeans_predictions))

    if not all_labels:
        # np.concatenate rejects an empty list.
        return all_data, np.array([])
    return all_data, np.concatenate(all_labels)

import random

def refined_uncertainty_sampling(gp_model, gp_likelihood, kmeans_model, data_loader, n_samples, n_batches, uncertainty_threshold=0.2):
    """Select samples where GP and K-Means disagree and GP variance is high.

    Args:
        gp_model: GP model; put in eval mode and called on flattened data.
        gp_likelihood: Likelihood applied to the GP output; the result must
            expose ``mean`` and ``variance``.
        kmeans_model: Object with a ``predict(ndarray)`` method.
        data_loader: Sized iterable of batches, each a mapping with a
            ``'data'`` tensor.
        n_samples: Maximum number of indices to return.
        n_batches: Number of batches to draw at random (capped at the
            loader length).
        uncertainty_threshold: Variance above which a sample counts as
            uncertain.

    Returns:
        A list with at most ``n_samples`` indices taken from ``np.where`` on
        each sampled batch.
    """
    gp_model.eval()
    gp_likelihood.eval()

    # Never request more batches than the loader can provide.
    batch_budget = min(n_batches, len(data_loader))
    chosen_batches = random.sample(list(data_loader), batch_budget)

    selected = []
    with torch.no_grad():
        for batch in chosen_batches:
            raw = batch['data']
            flat = raw.view(raw.size(0), -1).to(device)
            posterior = gp_likelihood(gp_model(flat))
            cluster_ids = kmeans_model.predict(flat.cpu().numpy())

            # 1 where the GP's arg-max class differs from the cluster id.
            gp_classes = posterior.mean.argmax(dim=-1).cpu().numpy()
            disagreement = (gp_classes != cluster_ids).astype(int)

            # GP predictive variance as the uncertainty score.
            uncertainty = posterior.variance.cpu().numpy()

            # Keep entries that both disagree and exceed the threshold.
            mask = (disagreement > 0) & (uncertainty > uncertainty_threshold)
            selected.extend(np.where(mask)[0])

    return selected[:n_samples]
Binary file not shown.
198 changes: 198 additions & 0 deletions BML_project/models/ss_gp_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 18 18:01:41 2023
@author: lrm22005
"""
import numpy as np
from tqdm import tqdm
import torch
import gpytorch
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.preprocessing import label_binarize

# Number of latent GP functions. MultitaskGPModel uses it as the leading
# (batch) dimension of the inducing points, the variational distribution and
# the mean/kernel modules, and passes it to LMCVariationalStrategy.
num_latents = 6 # This should match the complexity of your data or the number of tasks

This comment has been minimized.

Copy link
@doh16101

doh16101 Jan 20, 2024

Author Collaborator

Hi @lrm22005, why should the number of latent functions be six? Why does it have to be larger than the number of tasks?

# Number of outputs of the multitask GP (passed as num_tasks to
# LMCVariationalStrategy in MultitaskGPModel).
num_tasks = 4 # This should match the number of output classes or tasks
# Number of inducing points allocated per latent function.
num_inducing_points = 50 # This is independent and should be sufficient for the input space

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MultitaskGPModel(gpytorch.models.ApproximateGP):
    """Variational multitask GP using an LMC variational strategy.

    ``num_latents`` latent GPs, each with its own inducing points, constant
    mean and scaled RBF kernel, are combined by ``LMCVariationalStrategy``
    into ``num_tasks`` outputs.

    Args:
        input_dim: Length of one flattened input sample (the last dimension
            of the inducing points). Defaults to ``127 * 128`` so existing
            callers and checkpoints keep working.
    """

    def __init__(self, input_dim=127 * 128):
        # Use a different set of inducing points for each latent function.
        # NOTE(review thread, 2024-01-22): the author stated that 127 * 128
        # was a temporary workaround for a shape error and is not the final
        # input size; pass ``input_dim`` explicitly to match the real
        # flattened sample length.
        inducing_points = torch.rand(num_latents, num_inducing_points, input_dim)

        # The CholeskyVariationalDistribution is marked as batch so that a
        # separate variational distribution is learned per latent function.
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            inducing_points.size(-2), batch_shape=torch.Size([num_latents])
        )

        # Wrapping VariationalStrategy in LMCVariationalStrategy makes the
        # output a MultitaskMultivariateNormal rather than a batch output.
        variational_strategy = gpytorch.variational.LMCVariationalStrategy(
            gpytorch.variational.VariationalStrategy(
                self, inducing_points, variational_distribution, learn_inducing_locations=True
            ),
            num_tasks=num_tasks,
            num_latents=num_latents,
            latent_dim=-1
        )

        super().__init__(variational_strategy)

        # Mean and covariance modules are batched so each latent function
        # learns its own hyperparameters.
        self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents]))
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])),
            batch_shape=torch.Size([num_latents])
        )

    def forward(self, x):
        """Return the latent GP prior at ``x`` as a MultivariateNormal.

        Written as if each latent function were handled in batch. Callers in
        this file pass flattened samples; the last dimension of ``x`` is
        presumably expected to equal ``input_dim`` (TODO confirm).
        """
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


def train_gp_model(train_loader, val_loader, num_iterations=50, n_classes=4, patience=10, checkpoint_path='model_checkpoint_full.pt'):
    """Train the multitask GP classifier with early stopping on validation loss.

    Args:
        train_loader: Loader yielding mappings with ``'data'`` and ``'label'``
            tensors; must expose ``.dataset``.
        val_loader: Loader whose ``.dataset`` is indexed sample by sample for
            validation; items are mappings with ``'data'`` and ``'label'``.
        num_iterations: Maximum number of epochs.
        n_classes: Number of classes of the softmax likelihood.
        patience: Number of consecutive epochs without validation improvement
            after which training stops.
        checkpoint_path: File the best model/likelihood/optimizer state is
            written to and restored from.

    Returns:
        ``(model, likelihood, metrics)``. ``metrics`` holds per-step training
        losses and per-epoch macro precision/recall/F1 (``'auc_roc'`` stays
        empty).

    Raises:
        ValueError: If the validation dataset is empty.
    """
    n_val = len(val_loader.dataset)
    if n_val == 0:
        # Previously this surfaced as a ZeroDivisionError after a full epoch.
        raise ValueError("val_loader.dataset is empty; validation loss cannot be computed")

    model = MultitaskGPModel().to(device)
    # num_features must equal the number of GP outputs; num_classes now
    # honours the n_classes argument instead of a hard-coded 4.
    likelihood = gpytorch.likelihoods.SoftmaxLikelihood(num_features=num_tasks, num_classes=n_classes).to(device)
    # Optimise the likelihood parameters (softmax mixing weights) together
    # with the model; otherwise they stay at their random initial values.
    optimizer = torch.optim.Adam(
        [{'params': model.parameters()}, {'params': likelihood.parameters()}],
        lr=0.1,
    )
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(train_loader.dataset))
    best_val_loss = float('inf')
    epochs_no_improve = 0
    checkpoint_saved = False

    metrics = {
        'precision': [],
        'recall': [],
        'f1_score': [],
        'auc_roc': [],
        'train_loss': []  # one entry per optimisation step
    }

    for epoch in tqdm(range(num_iterations), desc='Training', unit='epoch', leave=False):
        model.train()
        likelihood.train()
        for train_batch in train_loader:
            optimizer.zero_grad()
            train_x = train_batch['data'].reshape(train_batch['data'].size(0), -1).to(device)
            train_y = train_batch['label'].to(device)
            output = model(train_x)
            loss = -mll(output, train_y)
            metrics['train_loss'].append(loss.item())
            loss.backward()
            optimizer.step()

        # Validation over a random permutation of the validation set
        # (currently all of it), one sample at a time.
        model.eval()
        likelihood.eval()
        with torch.no_grad():
            # Plain Python ints so any dataset __getitem__ accepts them.
            val_indices = torch.randperm(n_val)[:int(1 * n_val)].tolist()
            val_loss = 0.0
            val_labels = []
            val_predictions = []
            for idx in val_indices:
                val_batch = val_loader.dataset[idx]
                val_x = val_batch['data'].reshape(-1).unsqueeze(0).to(device)
                val_y = torch.tensor([val_batch['label']], device=device)
                val_output = model(val_x)
                val_loss_batch = -mll(val_output, val_y).sum()
                val_loss += val_loss_batch.item()
                val_labels.append(val_y.item())
                # NOTE(review): the prediction is the arg-max of the GP
                # output mean, not of the likelihood's class probabilities;
                # confirm that is the intended decision rule.
                val_predictions.append(val_output.mean.argmax(dim=-1).item())

            precision, recall, f1, _ = precision_recall_fscore_support(
                val_labels, val_predictions, average='macro', zero_division=0
            )

            metrics['precision'].append(precision)
            metrics['recall'].append(recall)
            metrics['f1_score'].append(f1)
            val_loss /= len(val_indices)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_no_improve = 0
                torch.save({'model_state_dict': model.state_dict(),
                            'likelihood_state_dict': likelihood.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
                checkpoint_saved = True
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= patience:
                    print(f"Early stopping triggered at epoch {epoch+1}")
                    break

    if checkpoint_saved:
        # Restore the best state seen during this run.
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        likelihood.load_state_dict(checkpoint['likelihood_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        # Nothing was written this run (no epochs, or the loss was never
        # finite/improving); do not load a possibly stale file from disk.
        print("No checkpoint was saved during this run; returning the model as last trained")

    return model, likelihood, metrics

def semi_supervised_labeling(kmeans_model, gp_model, gp_likelihood, data_loader, confidence_threshold=0.8):
    """Pseudo-label every sample with the GP when confident, else with K-Means.

    Args:
        kmeans_model: Object with a ``predict(ndarray)`` method.
        gp_model: GP model; put in eval mode and called on flattened data.
        gp_likelihood: Likelihood applied to the GP output. Assumed to return
            a distribution exposing class probabilities as ``probs`` (as the
            SoftmaxLikelihood built in ``train_gp_model`` does) — TODO
            confirm for other likelihoods.
        data_loader: Iterable of batches, each a mapping with a ``'data'``
            tensor.
        confidence_threshold: Minimum top-class probability for the GP label
            to be used.

    Returns:
        A list of ``(flattened_sample_tensor, label)`` tuples, one per sample.
    """
    gp_model.eval()
    gp_likelihood.eval()
    labeled_samples = []

    with torch.no_grad():
        for batch in data_loader:
            data_tensor = batch['data'].view(batch['data'].size(0), -1).to(device)
            kmeans_predictions = kmeans_model.predict(data_tensor.cpu().numpy())
            gp_predictions = gp_likelihood(gp_model(data_tensor))

            # The predictive distribution has no ``confidence()`` method.
            # Use the class probabilities instead, averaging over the
            # likelihood-sample dimension when one is present.
            class_probs = gp_predictions.probs
            if class_probs.dim() == 3:
                class_probs = class_probs.mean(dim=0)
            confidence, gp_labels = class_probs.max(dim=-1)
            confident_mask = (confidence > confidence_threshold).cpu().numpy()
            gp_labels = gp_labels.cpu().numpy()

            for i, confident in enumerate(confident_mask):
                if confident:
                    labeled_samples.append((data_tensor[i], int(gp_labels[i])))
                else:
                    # NOTE(review): K-Means cluster ids are used directly as
                    # labels; verify they are aligned with the class ids.
                    labeled_samples.append((data_tensor[i], kmeans_predictions[i]))

    return labeled_samples

def calculate_elbo(model, likelihood, data_loader):
    """
    Calculates the ELBO (Evidence Lower Bound) score for the model on the given data.

    Args:
    - model: The trained Gaussian Process model.
    - likelihood: The likelihood associated with the GP model.
    - data_loader: DataLoader providing the data over which to calculate ELBO.

    Returns:
    - elbo_score: The per-sample ELBO averaged over all samples seen
      (higher is better).

    Raises:
    - ValueError: If the loader yields no samples.
    """
    model.eval()
    likelihood.eval()
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=len(data_loader.dataset))

    weighted_sum = 0.0
    n_seen = 0
    with torch.no_grad():
        for batch in data_loader:
            train_x = batch['data'].reshape(batch['data'].size(0), -1).to(device)
            train_y = batch['label'].to(device)
            output = model(train_x)
            batch_n = train_x.size(0)
            # mll(...) is the ELBO itself (the training loss is its
            # negation), already normalised per data point, so it is not
            # negated here and is weighted by batch size rather than divided
            # by the dataset size a second time.
            weighted_sum += mll(output, train_y).sum().item() * batch_n
            n_seen += batch_n

    if n_seen == 0:
        raise ValueError("data_loader yielded no samples; the ELBO is undefined")

    return weighted_sum / n_seen
Loading

0 comments on commit 9388a46

Please sign in to comment.