From 5fc6745aa3838220222b30a93548163f317b523e Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Tue, 24 Oct 2023 13:30:08 -0400 Subject: [PATCH 1/6] Adding the MFVI and PCA The Mean Field Variance Inference method was added to this code, performing this analysis looking for a probabilistic model to compare the distributions of the data. I would update in a new version a convergence criteria to evaluate in a more complex and significant relevant way the data. --- project_1.py | 247 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 233 insertions(+), 14 deletions(-) diff --git a/project_1.py b/project_1.py index d9fdf85..cad732b 100644 --- a/project_1.py +++ b/project_1.py @@ -9,12 +9,13 @@ import numpy as np import pandas as pd import torch +from torch.distributions import MultivariateNormal from torch.utils.data import DataLoader, TensorDataset import matplotlib.pyplot as plt -from sklearn.decomposition import PCA +from sklearn.decomposition import PCA, silhouette_score, adjusted_rand_score import seaborn as sns -def load_data(data_path, dataset_size=10, train=True, standardize=True): +def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=True): # Load data from the specified data_path dir_list_UID = os.listdir(data_path) UID_list = dir_list_UID[:dataset_size] if train else dir_list_UID[dataset_size:] @@ -38,7 +39,19 @@ def load_data(data_path, dataset_size=10, train=True, standardize=True): if standardize: X_data = standard_scaling(X_data) - return X_data, segment_names + # Extract labels from CSV files + labels = extract_labels(UID_list, labels_path) + + return X_data, segment_names, labels + +def extract_labels(UID_list, labels_path): + labels = {} + for UID in UID_list: + label_file = os.path.join(labels_path, UID + "_final_attemp_4_1_Dong.csv") + if os.path.exists(label_file): + label_data = pd.read_csv(label_file, sep='\t', header=None, names=['segment', 'label']) + labels[UID] = label_data['label'].values + return labels def standard_scaling(tensor): # Z-score normalization (standardization) @@ -57,6 +70,7 @@ def visualize_trends(data, segment_names, num_plots=3): num_samples, _, _ = data.shape for _ in range(num_plots): idx = np.random.randint(0, num_samples) + plt.figure() # Create a new figure for each plot plt.imshow(data[idx].numpy()) plt.title(f"Segment: {segment_names[idx]}") plt.colorbar() @@ -69,36 +83,241 @@ def perform_pca(data, num_components=2): reduced_data = pca.fit_transform(data_flattened.numpy()) return reduced_data, pca -def visualize_correlation_matrix(data): - # Visualize the correlation matrix +def visualize_correlation_matrix(data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=None): + ''' + Usage: + To visualize the correlation matrix for each subject individually, you can call: + visualize_correlation_matrix(train_data, subject_mode=True, save_path="path_to_save_results") + + To visualize the correlation matrix for a specific quantity of subjects (groups), you can call: + visualize_correlation_matrix(train_data, subject_mode=False, num_subjects_to_visualize=5, save_path="path_to_save_results") + ''' + # Visualize the correlation matrix for each subject or subgroup data_flattened = data.view(data.size(0), -1).numpy() - correlation_matrix = np.corrcoef(data_flattened, rowvar=False) - sns.heatmap(correlation_matrix, cmap="coolwarm", xticklabels=False, yticklabels=False) - plt.title("Correlation Matrix") + + subject_names = [filename.split('_')[0] for filename in segment_names] 
+ unique_subjects = list(set(subject_names)) + + if subject_mode: + for subject in unique_subjects: + subject_indices = [i for i, name in enumerate(subject_names) if name == subject] + subject_data = data_flattened[subject_indices] + correlation_matrix = np.corrcoef(subject_data, rowvar=False) + + plt.figure() + sns.heatmap(correlation_matrix, cmap="coolwarm", xticklabels=False, yticklabels=False) + plt.title(f"Correlation Matrix for Subject {subject}") + + if save_path: + subject_save_path = os.path.join(save_path, f"correlation_matrix_subject_{subject}.png") + plt.savefig(subject_save_path) + + plt.show() + + else: # Group mode + if num_subjects_to_visualize is None: + num_subjects_to_visualize = len(unique_subjects) + + for i in range(num_subjects_to_visualize): + subject = unique_subjects[i] + subject_indices = [i for i, name in enumerate(subject_names) if name == subject] + subject_data = data_flattened[subject_indices] + correlation_matrix = np.corrcoef(subject_data, rowvar=False) + + plt.figure() + sns.heatmap(correlation_matrix, cmap="coolwarm", xticklabels=False, yticklabels=False) + plt.title(f"Correlation Matrix for {num_subjects_to_visualize} Subjects {subject}") + + if save_path: + subject_save_path = os.path.join(save_path, f"correlation_matrix_subject_group{subject}.png") + plt.savefig(subject_save_path) + + plt.show() + +# This function computes the log PDF of a multivariate normal distribution +def multivariate_normal_log_pdf_MFVI(x, mu, sigma_sq): + # x: Data points (N x D) + # mu: Means of the components (K x D) + # sigma_sq: Variances of the components (K x D) + N, D = x.shape + K, _ = mu.shape + + log_p = torch.empty(N, K, dtype=x.dtype, device=x.device) + for k in range(K): + cov_matrix = torch.diag(sigma_sq[k]) + mvn = MultivariateNormal(mu[k], cov_matrix) + log_p[:, k] = mvn.log_prob(x) + + return log_p + +def perform_mfvi(data, K, n_optimization_iterations): + N, D = data.shape[0], data.shape[1] * data.shape[2] # Calculate feature dimension D + # Define the variational parameters for the GMM + miu_variational = torch.randn(K, D, requires_grad=True) + log_sigma_variational = torch.randn(K, D, requires_grad=True) + alpha_variational = torch.randn(K, requires_grad=True) + + # Define the optimizer for gradient descent + optimizer = torch.optim.Adam([miu_variational, log_sigma_variational, alpha_variational], lr=0.001) + + for iteration in range(n_optimization_iterations): + # Initialize gradients + optimizer.zero_grad() + + # Compute the Gaussian means and covariances from variational parameters + sigma_variational_sq = torch.exp(log_sigma_variational.clone()) + + # Calculate the responsibilities (E[zi]) + log_pi_variational = torch.digamma(alpha_variational) - torch.digamma(alpha_variational.sum()) + log_resp = log_pi_variational.unsqueeze(0) + multivariate_normal_log_pdf_MFVI(data, miu_variational, sigma_variational_sq) + log_resp_max = log_resp.max(dim=1, keepdim=True).values.clone() + resp = torch.exp(log_resp - log_resp_max).clone() + resp /= resp.sum(dim=1, keepdim=True) + + # Compute the ELBO and perform backpropagation + elbo = -torch.sum(resp * log_resp) + torch.sum(resp * torch.log(resp)) + + # Perform backpropagation with retain_graph=True + elbo.backward(retain_graph=True) + + # Update the variational parameters + optimizer.step() + + # Print progress + if (iteration + 1) % 100 == 0: + print(f"Iteration {iteration + 1}/{n_optimization_iterations}") + + # Extract the learned parameters + miu = miu_variational.detach().numpy() + pi = 
torch.softmax(alpha_variational, dim=0).detach().numpy() + + return miu, pi, resp + +def plot_pca(reduced_data, labels, method='original_labels', save_path=None): + """ + Plot the PCA results, and optionally save the plot. + + Args: + data (torch.Tensor): The data after perform PCA. + labels (list or np.ndarray): The labels or class information for data. + save_path (str, optional): If provided, save the PCA plot to this path. + + Returns: + sklearn.decomposition.PCA: The PCA object containing the results. + """ + + # Create a scatter plot of PCA results + plt.figure(figsize=(8, 6)) + scatter = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap=plt.cm.viridis) + plt.colorbar(scatter, label='Labels') + plt.title('PCA Plot {method}') + plt.xlabel('Principal Component 1') + plt.ylabel('Principal Component 2') + + # Save the plot if save_path is provided + if save_path: + plt.savefig(save_path, f"PCA_analysis_using_{method}.png") + + plt.show() + # Example usage: + # train_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=141, train=True) + # pca = plot_pca(reduced_data, labels, method='original_labels', save_path="pca_plot.png") + +def plot_clusters(data, zi, title, save_path=None): + """ + Plot the data points colored by cluster assignment. + + Args: + data (torch.Tensor): The data points. + zi (torch.Tensor): The cluster assignments. + title (str): The title for the plot. + """ + unique_clusters = torch.unique(zi) + colors = plt.cm.viridis(torch.linspace(0, 1, len(unique_clusters))) + + plt.figure(figsize=(8, 6)) + for i, cluster in enumerate(unique_clusters): + cluster_data = data[zi == cluster] + plt.scatter(cluster_data[:, 0], cluster_data[:, 1], c=colors[i], label=f'Cluster {int(cluster)}') + + plt.title(title) + plt.legend() + plt.xlabel('Feature 1') + plt.ylabel('Feature 2') + + # Save the plot if save_path is provided + if save_path: + filename = title.replace(' ', '_') + ".png" + plt.savefig(os.path.join(save_path, filename)) + plt.show() def main(): is_linux = False # Set to True if running on Linux, False if on Windows + is_hpc = False # Set to True if running on hpc, False if on Windows + if is_linux: data_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch/TFS_csv" + labels_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn/final_attemp_4_1_Dong_Ohm" + saving_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis" + elif is_hpc: + data_path = "/gpfs/scratchfs1/kic14002/doh16101/TFS_csv" + labels_path = "/gpfs/scratchfs1/hfp14002/lrm22005/final_attemp_4_1_Dong_Ohm" + saving_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis" else: data_path = r"R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv" + labels_path = r"R:\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm" + saving_path = r"R:\ENGR_Chon\Luis\Research\Casseys_case\Project_1_analysis" - train_data, segment_names = load_data(data_path, dataset_size=141, train=True) - test_data, _ = load_data(data_path, dataset_size=10, train=False) + train_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=141, train=True) + test_data, _, _ = load_data(data_path, labels_path, dataset_size=141, train=False) train_dataloader = create_dataloader(train_data) test_dataloader = create_dataloader(test_data) # Visualize random trends/segments - visualize_trends(train_data, segment_names, num_plots=3) + visualize_trends(train_data, segment_names, num_plots=20) # 
Perform PCA for dimensionality reduction - reduced_data, pca = perform_pca(train_data, num_components=2) - print("Explained variance ratio:", pca.explained_variance_ratio_) + # reduced_data, pca = perform_pca(train_data, num_components=2) + # print("Explained variance ratio:", pca.explained_variance_ratio_) # Visualize the correlation matrix - visualize_correlation_matrix(train_data) + visualize_correlation_matrix(train_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path) + visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=10, save_path=saving_path) + # visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path) + + # Perform MFVI for your data + K = 4 # Number of clusters + n_optimization_iterations = 100 + miu_mfvi, pi_mfvi, resp_mfvi = perform_mfvi(train_data, K, n_optimization_iterations) + + # Calculate clustering metrics for MFVI + zi_mfvi = np.argmax(resp_mfvi, axis=1) + # Perform PCA for dimensionality reduction + reduced_data, pca = perform_pca(train_data, num_components=2) + print("Explained variance ratio:", pca.explained_variance_ratio_) + + # Create two plots: PCA results and original labels + plt.figure(figsize=(16, 6)) + + # Plot PCA results + plot_pca(reduced_data, zi_mfvi, method='MFVI', save_path="pca_plot.png") + + # Plot original labels + plot_pca(reduced_data, labels, method="original_labels", save_path="pca_plot.png") + + # Calculate clustering metrics for PCA results + silhouette_pca = silhouette_score(reduced_data, zi_mfvi) + ari_pca = adjusted_rand_score(labels, zi_mfvi) + + # Print and compare clustering metrics for PCA + print("PCA Clustering Metrics Comparison:") + print(f"Silhouette Score (PCA): {silhouette_pca}") + print(f"Adjusted Rand Index (PCA vs. True Labels): {ari_pca}") + + # Plot clusters for MFVI results + plot_clusters(reduced_data, torch.from_numpy(zi_mfvi), title="MFVI Clustering Results (Train Data)", save_path=saving_path) if __name__ == "__main__": main() \ No newline at end of file From f7b5b20751fe4b67d991187691e73bea63446e6b Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Tue, 24 Oct 2023 13:47:56 -0400 Subject: [PATCH 2/6] MFVI Convergence This code has the Mean-Field convergence criteria to choose if use it or not and stop the code just if it converge. 
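For reference, a minimal, self-contained sketch of the stopping rule this change introduces. Here step_fn is a hypothetical callback standing in for one optimization step of perform_mfvi (the Adam update plus ELBO evaluation); the 1e-5 default mirrors the patch, and the rest is an illustration rather than the exact implementation:

    def run_with_convergence(step_fn, n_optimization_iterations,
                             convergence_threshold=1e-5, run_until_convergence=True):
        # Run step_fn() up to n_optimization_iterations times and stop early
        # when the absolute change in the ELBO drops below the threshold.
        prev_elbo = float('-inf')
        elbo = prev_elbo
        for iteration in range(n_optimization_iterations):
            elbo = step_fn()  # one gradient step; returns the current ELBO as a float
            if run_until_convergence and abs(elbo - prev_elbo) < convergence_threshold:
                print(f"Converged after {iteration + 1} iterations")
                break
            prev_elbo = elbo
        else:
            print("Reached the specified number of iterations.")
        return elbo

Checking the absolute ELBO change is cheap, but it can trigger on temporary plateaus; a relative-change test or a patience window over several iterations is a common alternative.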
--- project_1.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/project_1.py b/project_1.py index cad732b..ddcd899 100644 --- a/project_1.py +++ b/project_1.py @@ -150,7 +150,7 @@ def multivariate_normal_log_pdf_MFVI(x, mu, sigma_sq): return log_p -def perform_mfvi(data, K, n_optimization_iterations): +def perform_mfvi(data, K, n_optimization_iterations, convergence_threshold=1e-5, run_until_convergence=True): N, D = data.shape[0], data.shape[1] * data.shape[2] # Calculate feature dimension D # Define the variational parameters for the GMM miu_variational = torch.randn(K, D, requires_grad=True) @@ -160,7 +160,9 @@ def perform_mfvi(data, K, n_optimization_iterations): # Define the optimizer for gradient descent optimizer = torch.optim.Adam([miu_variational, log_sigma_variational, alpha_variational], lr=0.001) - for iteration in range(n_optimization_iterations): + prev_elbo = float('-inf') + iteration = 0 + while True: # Initialize gradients optimizer.zero_grad() @@ -187,6 +189,16 @@ def perform_mfvi(data, K, n_optimization_iterations): if (iteration + 1) % 100 == 0: print(f"Iteration {iteration + 1}/{n_optimization_iterations}") + if run_until_convergence: + if iteration > 0 and abs(elbo - prev_elbo) < convergence_threshold: + print(f"Converged after {iteration + 1} iterations") + break + elif iteration == n_optimization_iterations - 1: + print("Reached the specified number of iterations.") + + prev_elbo = elbo + iteration += 1 + # Extract the learned parameters miu = miu_variational.detach().numpy() pi = torch.softmax(alpha_variational, dim=0).detach().numpy() @@ -290,7 +302,7 @@ def main(): # Perform MFVI for your data K = 4 # Number of clusters n_optimization_iterations = 100 - miu_mfvi, pi_mfvi, resp_mfvi = perform_mfvi(train_data, K, n_optimization_iterations) + miu_mfvi, pi_mfvi, resp_mfvi = perform_mfvi(train_data, K, n_optimization_iterations, convergence_threshold=1e-5, run_until_convergence=False) # Calculate clustering metrics for MFVI zi_mfvi = np.argmax(resp_mfvi, axis=1) From 05370595ce6c7366c3ac8660e8f5d3bd13108607 Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Tue, 24 Oct 2023 13:55:19 -0400 Subject: [PATCH 3/6] Update project_1.py --- project_1.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/project_1.py b/project_1.py index ddcd899..5a3d472 100644 --- a/project_1.py +++ b/project_1.py @@ -12,7 +12,8 @@ from torch.distributions import MultivariateNormal from torch.utils.data import DataLoader, TensorDataset import matplotlib.pyplot as plt -from sklearn.decomposition import PCA, silhouette_score, adjusted_rand_score +from sklearn.decomposition import PCA +from sklearn.metrics import silhouette_score, adjusted_rand_score import seaborn as sns def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=True): @@ -281,8 +282,8 @@ def main(): labels_path = r"R:\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm" saving_path = r"R:\ENGR_Chon\Luis\Research\Casseys_case\Project_1_analysis" - train_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=141, train=True) - test_data, _, _ = load_data(data_path, labels_path, dataset_size=141, train=False) + train_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=20, train=True) + test_data, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False) train_dataloader = create_dataloader(train_data) test_dataloader = create_dataloader(test_data) @@ 
-302,7 +303,7 @@ def main(): # Perform MFVI for your data K = 4 # Number of clusters n_optimization_iterations = 100 - miu_mfvi, pi_mfvi, resp_mfvi = perform_mfvi(train_data, K, n_optimization_iterations, convergence_threshold=1e-5, run_until_convergence=False) + miu_mfvi, pi_mfvi, resp_mfvi = perform_mfvi(train_data, K, n_optimization_iterations=1000, convergence_threshold=1e-5, run_until_convergence=False) # Calculate clustering metrics for MFVI zi_mfvi = np.argmax(resp_mfvi, axis=1) From 6edf0c39d27c5999245d2a535c650476dd1a9099 Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Tue, 24 Oct 2023 14:19:23 -0400 Subject: [PATCH 4/6] PNG analysis Adding a way to analyze time series or plots. --- project_1.py | 84 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 27 deletions(-) diff --git a/project_1.py b/project_1.py index 5a3d472..9c52fcf 100644 --- a/project_1.py +++ b/project_1.py @@ -15,34 +15,73 @@ from sklearn.decomposition import PCA from sklearn.metrics import silhouette_score, adjusted_rand_score import seaborn as sns +from PIL import Image # Import the Image module + +def get_data_paths(data_format, is_linux=False, is_hpc=False): + if is_linux: + base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch" + labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn" + saving_base_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis" + elif is_hpc: + base_path = "/gpfs/scratchfs1/kic14002/doh16101" + labels_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005" + saving_base_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis" + else: + base_path = "R:\\ENGR_Chon\\Dong\\MATLAB_generate_results\\NIH_PulseWatch" + labels_base_path = "R:\\ENGR_Chon\\NIH_Pulsewatch_Database\\Adjudication_UConn" + saving_base_path = "R:\\ENGR_Chon\\Luis\\Research\\Casseys_case\\Project_1_analysis" + + if data_format == 'csv': + data_path = os.path.join(base_path, "TFS_csv") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + elif data_format == 'png': + data_path = os.path.join(base_path, "TFS_plots") + labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm") + saving_path = os.path.join(saving_base_path, "Project_1_analysis") + else: + raise ValueError("Invalid data format. Choose 'csv' or 'png.") + + return data_path, labels_path, saving_path + +def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=True, data_format='csv'): + if data_format not in ['csv', 'png']: + raise ValueError("Invalid data_format. 
Choose 'csv' or 'png'.") -def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=True): - # Load data from the specified data_path dir_list_UID = os.listdir(data_path) UID_list = dir_list_UID[:dataset_size] if train else dir_list_UID[dataset_size:] - + X_data = [] segment_names = [] - + for UID in UID_list: data_path_UID = os.path.join(data_path, UID) dir_list_seg = os.listdir(data_path_UID) - + for seg in dir_list_seg[:50]: # Limiting to 50 segments seg_path = os.path.join(data_path_UID, seg) - time_freq_plot = np.array(pd.read_csv(seg_path, header=None)) - time_freq_tensor = torch.Tensor(time_freq_plot).reshape(1, 128, 128) + + if data_format == 'csv' and seg.endswith('.csv'): + time_freq_plot = np.array(pd.read_csv(seg_path, header=None)) + time_freq_tensor = torch.Tensor(time_freq_plot).reshape(1, 128, 128) + elif data_format == 'png' and seg.endswith('.png'): + img = Image.open(seg_path) + img_data = np.array(img) + time_freq_tensor = torch.Tensor(img_data).unsqueeze(0) + else: + continue # Skip other file formats + X_data.append(time_freq_tensor) segment_names.append(seg) # Store segment names - + X_data = torch.cat(X_data, 0) - + if standardize: X_data = standard_scaling(X_data) - + # Extract labels from CSV files labels = extract_labels(UID_list, labels_path) - + return X_data, segment_names, labels def extract_labels(UID_list, labels_path): @@ -269,24 +308,15 @@ def main(): is_linux = False # Set to True if running on Linux, False if on Windows is_hpc = False # Set to True if running on hpc, False if on Windows - if is_linux: - data_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch/TFS_csv" - labels_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn/final_attemp_4_1_Dong_Ohm" - saving_path = "/mnt/r/ENGR_Chon/Luis/Research/Casseys_case/Project_1_analysis" - elif is_hpc: - data_path = "/gpfs/scratchfs1/kic14002/doh16101/TFS_csv" - labels_path = "/gpfs/scratchfs1/hfp14002/lrm22005/final_attemp_4_1_Dong_Ohm" - saving_path = "/gpfs/scratchfs1/hfp14002/lrm22005/Casseys_case/Project_1_analysis" - else: - data_path = r"R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv" - labels_path = r"R:\ENGR_Chon\NIH_Pulsewatch_Database\Adjudication_UConn\final_attemp_4_1_Dong_Ohm" - saving_path = r"R:\ENGR_Chon\Luis\Research\Casseys_case\Project_1_analysis" + data_format = 'csv' # Choose 'csv' or 'png' + + data_path, labels_path, saving_path = get_data_paths(data_format, is_linux=is_linux, is_hpc=is_hpc) - train_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=20, train=True) - test_data, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False) + train_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=20, train=True, data_format=data_format) + # test_data, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False) train_dataloader = create_dataloader(train_data) - test_dataloader = create_dataloader(test_data) + # test_dataloader = create_dataloader(test_data) # Visualize random trends/segments visualize_trends(train_data, segment_names, num_plots=20) @@ -297,7 +327,7 @@ def main(): # Visualize the correlation matrix visualize_correlation_matrix(train_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path) - visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=10, save_path=saving_path) + visualize_correlation_matrix(train_data, segment_names, 
subject_mode=False, num_subjects_to_visualize=20, save_path=saving_path) # visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path) # Perform MFVI for your data From 07b28c3b14be05eacf5a46c0e9c909fc3176d256 Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Tue, 24 Oct 2023 14:20:48 -0400 Subject: [PATCH 5/6] checking version --- project_1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/project_1.py b/project_1.py index 9c52fcf..90d6e78 100644 --- a/project_1.py +++ b/project_1.py @@ -308,7 +308,7 @@ def main(): is_linux = False # Set to True if running on Linux, False if on Windows is_hpc = False # Set to True if running on hpc, False if on Windows - data_format = 'csv' # Choose 'csv' or 'png' + data_format = 'png' # Choose 'csv' or 'png' data_path, labels_path, saving_path = get_data_paths(data_format, is_linux=is_linux, is_hpc=is_hpc) @@ -327,7 +327,7 @@ def main(): # Visualize the correlation matrix visualize_correlation_matrix(train_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path) - visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=20, save_path=saving_path) + visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path) # visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path) # Perform MFVI for your data From 49f69ae90d865962fac8ef1357960a236bc8a194 Mon Sep 17 00:00:00 2001 From: Luis Roberto Mercado Diaz Date: Tue, 24 Oct 2023 14:35:03 -0400 Subject: [PATCH 6/6] defining better parameters Non relevant update --- project_1.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/project_1.py b/project_1.py index 90d6e78..caf0e18 100644 --- a/project_1.py +++ b/project_1.py @@ -105,7 +105,7 @@ def create_dataloader(data, batch_size=64, shuffle=True): data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle) return data_loader -def visualize_trends(data, segment_names, num_plots=3): +def visualize_trends(data, segment_names, num_plots=3, save_path=None): # Visualize random trends/segments num_samples, _, _ = data.shape for _ in range(num_plots): @@ -114,6 +114,9 @@ def visualize_trends(data, segment_names, num_plots=3): plt.imshow(data[idx].numpy()) plt.title(f"Segment: {segment_names[idx]}") plt.colorbar() + if save_path: + subject_save_path = os.path.join(save_path, f"trends_visualization_segment_{segment_names}.png") + plt.savefig(subject_save_path) plt.show() def perform_pca(data, num_components=2): @@ -169,7 +172,7 @@ def visualize_correlation_matrix(data, segment_names, subject_mode=True, num_sub plt.title(f"Correlation Matrix for {num_subjects_to_visualize} Subjects {subject}") if save_path: - subject_save_path = os.path.join(save_path, f"correlation_matrix_subject_group{subject}.png") + subject_save_path = os.path.join(save_path, f"correlation_matrix_subject_group_{subject}.png") plt.savefig(subject_save_path) plt.show() @@ -312,7 +315,7 @@ def main(): data_path, labels_path, saving_path = get_data_paths(data_format, is_linux=is_linux, is_hpc=is_hpc) - train_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=20, train=True, data_format=data_format) + train_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=10, train=True, 
data_format=data_format) # test_data, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False) train_dataloader = create_dataloader(train_data) @@ -327,12 +330,11 @@ def main(): # Visualize the correlation matrix visualize_correlation_matrix(train_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path) - visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path) + # visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path) # visualize_correlation_matrix(train_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path) # Perform MFVI for your data K = 4 # Number of clusters - n_optimization_iterations = 100 miu_mfvi, pi_mfvi, resp_mfvi = perform_mfvi(train_data, K, n_optimization_iterations=1000, convergence_threshold=1e-5, run_until_convergence=False) # Calculate clustering metrics for MFVI