diff --git a/project_1.py b/project_1.py
index 09a30a9..1c645ca 100644
--- a/project_1.py
+++ b/project_1.py
@@ -12,8 +12,11 @@
 from torch.distributions import MultivariateNormal
 from torch.utils.data import DataLoader, TensorDataset
 import matplotlib.pyplot as plt
-from sklearn.decomposition import PCA
-from sklearn.metrics import silhouette_score, adjusted_rand_score
+from sklearn.decomposition import PCA, IncrementalPCA
+from sklearn.manifold import TSNE
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score, davies_bouldin_score
 import seaborn as sns
 from PIL import Image  # Import the Image module
 
@@ -44,16 +47,26 @@ def get_data_paths(data_format, is_linux=False, is_hpc=False):
     return data_path, labels_path, saving_path
 
 
-def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=True, data_format='csv'):
+# Standardize the data
+def standard_scaling(data):
+    scaler = StandardScaler()
+    data_shape = data.shape
+    data = data.view(data_shape[0], -1).numpy()  # Flatten to 2-D for sklearn
+    data = scaler.fit_transform(data)
+    data = data.reshape(data_shape)  # fit_transform returns a NumPy array, so reshape rather than view
+    return torch.Tensor(data)
+
+def load_data(data_path, labels_path, dataset_size=2, train=True, standardize=True, data_format='csv', return_all=False):
     if data_format not in ['csv', 'png']:
         raise ValueError("Invalid data_format. Choose 'csv' or 'png.")
 
     dir_list_UID = os.listdir(data_path)
     UID_list = dir_list_UID[:dataset_size] if train else dir_list_UID[dataset_size:]
 
-    X_data = []
+    X_data = []  # Store all data
     X_data_original = []  # Store original data without standardization
     segment_names = []
+    validated_labels = []  # Store only the label values
 
     for UID in UID_list:
         data_path_UID = os.path.join(data_path, UID)
@@ -72,29 +85,73 @@
             else:
                 continue  # Skip other file formats
 
-            # X_data_original.append(time_freq_tensor.clone())  # Store a copy of the original data
             X_data.append(time_freq_tensor)
+            X_data_original.append(time_freq_tensor.clone())  # Store a copy of the original data
 
-            segment_names.append(seg)  # Store segment names
+            segment_names.append(seg.split('_filt')[0])  # Extract and store segment names
 
     X_data = torch.cat(X_data, 0)
-    # X_data_original = torch.cat(X_data_original, 0)
+    X_data_original = torch.cat(X_data_original, 0)
 
     if standardize:
         X_data = standard_scaling(X_data)  # Standardize the data
 
     # Extract labels from CSV files
-    labels = extract_labels(UID_list, labels_path)
+    labels = extract_labels(UID_list, labels_path, segment_names)
+
+    important_labels = [0.0, 1.0, 2.0, 3.0]  # List of important labels
+
+    # Initialize labels for segments as unlabeled (-1)
+    segment_labels = {segment_name: -1 for segment_name in segment_names}
+
+    for UID in labels.keys():
+        if UID not in UID_list:
+            # Skip UIDs that are not in the dataset
+            continue
+
+        label_data, label_segment_names = labels[UID]
+
+        for idx, segment_label in enumerate(label_data):
+            segment_name = label_segment_names[idx]
+            if segment_label in important_labels:
+                segment_labels[segment_name] = segment_label
+            else:
+                # Set labels that are not in the important list as -1 (Unlabeled)
+                segment_labels[segment_name] = -1
 
-    return X_data_original, X_data, segment_names, labels
+    # Return all segments along with labels
+    if return_all is True:  # only the boolean True; 'labeled'/'unlabeled' must fall through to the branches below
+        return X_data_original, X_data, segment_names, segment_labels, list(segment_labels.values())
 
-def extract_labels(UID_list, labels_path):
+    # Filter out segments that are unlabeled (-1)
+    filtered_segment_names = [segment_name for segment_name, label in segment_labels.items() if label != -1]
+
+    # Filter data to match the filtered segment names
+    filtered_data = torch.stack([X_data[segment_names.index(segment_name)] for segment_name in filtered_segment_names])
+
+    # Return labeled and unlabeled segments along with labels
+    if return_all == 'labeled':
+        return X_data_original, filtered_data, filtered_segment_names, {seg: segment_labels[seg] for seg in filtered_segment_names}, list({seg: segment_labels[seg] for seg in filtered_segment_names}.values())
+
+    # Return unlabeled segments along with labels
+    if return_all == 'unlabeled':
+        unlabeled_segment_names = [segment_name for segment_name, label in segment_labels.items() if label == -1]
+        unlabeled_data = torch.stack([X_data[segment_names.index(segment_name)] for segment_name in unlabeled_segment_names])
+        return X_data_original, unlabeled_data, unlabeled_segment_names, {seg: segment_labels[seg] for seg in unlabeled_segment_names}, list({seg: segment_labels[seg] for seg in unlabeled_segment_names}.values())
+
+    # By default, return only labeled segments along with labels
+    return X_data_original, filtered_data, filtered_segment_names, {seg: segment_labels[seg] for seg in filtered_segment_names}, list({seg: segment_labels[seg] for seg in filtered_segment_names}.values())
+
+
+def extract_labels(UID_list, labels_path, segment_names):
     labels = {}
     for UID in UID_list:
         label_file = os.path.join(labels_path, UID + "_final_attemp_4_1_Dong.csv")
         if os.path.exists(label_file):
-            label_data = pd.read_csv(label_file, sep='\t', header=None, names=['segment', 'label'])
-            labels[UID] = label_data['label'].values
+            label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label'])
+            label_segment_names = label_data['segment'].apply(lambda x: x.split('_')[-1].split('.')[0])
+            labels[UID] = (label_data['label'].values, label_segment_names.values)
+
     return labels
 
 def standard_scaling(tensor):
@@ -139,12 +196,42 @@
     else:
         print("This is a trend analysis for data in an unsupported format.")
 
-def perform_pca(data, num_components=2):
+def perform_pca(data, num_components=2, num_clusters=4):
     # Perform PCA for dimensionality reduction
     data_flattened = data.view(data.size(0), -1)  # Flatten the data
     pca = PCA(n_components=num_components)
     reduced_data = pca.fit_transform(data_flattened.numpy())
-    return reduced_data, pca
+
+    # Cluster the data using K-Means
+    kmeans = KMeans(n_clusters=num_clusters)
+    labels = kmeans.fit_predict(reduced_data)
+
+    return reduced_data, pca, labels
+
+def perform_pca_sgd(data, num_components=2, num_clusters=4, batch_size=64):
+    data_flattened = data.view(data.size(0), -1)
+    ipca = IncrementalPCA(n_components=num_components, batch_size=batch_size)
+    reduced_data = ipca.fit_transform(data_flattened.numpy())
+
+    # Cluster the data using K-Means
+    kmeans = KMeans(n_clusters=num_clusters)
+    labels = kmeans.fit_predict(reduced_data)
+
+    return reduced_data, ipca, labels
+
+# Perform t-SNE for dimensionality reduction, then cluster the embedding with K-Means
+def perform_tsne(data, num_components=2, num_clusters=4):
+    data_flattened = data.view(data.size(0), -1)
+
+    # Perform t-SNE
+    tsne = TSNE(n_components=num_components, perplexity=30, n_iter=300)
+    reduced_data = tsne.fit_transform(data_flattened.numpy())
+
+    # Cluster the data using K-Means
+    kmeans = KMeans(n_clusters=num_clusters)
+    labels = kmeans.fit_predict(reduced_data)
+
+    return reduced_data, labels
 
 def visualize_correlation_matrix(data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=None):
     '''
@@ -268,6 +355,18 @@
     return miu, pi, resp
 
 
+# Function to evaluate clustering and print multiple metrics
+def evaluate_clustering(data, true_labels, predicted_labels):
+    ari = adjusted_rand_score(true_labels, predicted_labels)
+    ami = adjusted_mutual_info_score(true_labels, predicted_labels)
+    silhouette = silhouette_score(data, predicted_labels)
+    davies_bouldin = davies_bouldin_score(data, predicted_labels)
+
+    print(f'Adjusted Rand Index (ARI): {ari}')
+    print(f'Adjusted Mutual Info (AMI): {ami}')
+    print(f'Silhouette Score: {silhouette}')
+    print(f'Davies-Bouldin Index: {davies_bouldin}')
+
 def plot_pca(reduced_data, labels, method='original_labels', save_path=None):
     """
     Plot the PCA results, and optionally save the plot.
@@ -334,54 +433,121 @@ def main():
     data_format = 'csv'  # Choose 'csv' or 'png'
     data_path, labels_path, saving_path = get_data_paths(data_format, is_linux=is_linux, is_hpc=is_hpc)
-    original_data, standardized_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=10, train=False, data_format=data_format)
-    # test_data, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False)
+
+    # Load data with labels and segment names
+    _, labeled_data, _, _, labeled_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all="labeled")
+
+    # Load unlabeled data
+    _, unlabeled_data, _, _, unlabeled_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all="unlabeled")
+
+    # Load all data (labeled and unlabeled)
+    original_data, all_data, segment_names, segment_labels, all_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all=True)
+
+    # test_data, _, _, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False)
 
-    train_dataloader = create_dataloader(standardized_data)
+    train_dataloader = create_dataloader(labeled_data)
     # test_dataloader = create_dataloader(test_data)
 
     # Visualize random trends/segments
-    visualize_trends(standardized_data, original_data, segment_names, num_plots=10, data_format=data_format, save_path=None)
-
-    # Perform PCA for dimensionality reduction
-    # reduced_data, pca = perform_pca(standardized_data, num_components=2)
-    # print("Explained variance ratio:", pca.explained_variance_ratio_)
+    visualize_trends(labeled_data, original_data, segment_names, num_plots=10, data_format=data_format, save_path=saving_path)
 
     # Visualize the correlation matrix
-    visualize_correlation_matrix(standardized_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path)
-    # visualize_correlation_matrix(standardized_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path)
-    # visualize_correlation_matrix(standardized_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path)
+    # visualize_correlation_matrix(labeled_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path)
+    # visualize_correlation_matrix(unlabeled_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path)
+    # visualize_correlation_matrix(all_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path)
 
-    # Perform MFVI for your data
-    K = 4  # Number of clusters
-    miu_mfvi, pi_mfvi, resp_mfvi = perform_mfvi(standardized_data, K, n_optimization_iterations=1000, convergence_threshold=1e-5, run_until_convergence=False)
+    ##### LABELED ######
+    # Perform PCA on labeled data
+    pca_reduced_data, pca, pca_labeled_labels = perform_pca_sgd(labeled_data, num_components=2, num_clusters=4, batch_size=64)
 
-    # Calculate clustering metrics for MFVI
-    zi_mfvi = np.argmax(resp_mfvi, axis=1)
-    # Perform PCA for dimensionality reduction
-    reduced_data, pca = perform_pca(standardized_data, num_components=2)
-    print("Explained variance ratio:", pca.explained_variance_ratio_)
+    # Plot PCA for labeled data
+    plot_pca(pca_reduced_data, labeled_labels, method='PCA on Labeled Data', save_path=saving_path)
+
+    # For PCA on labeled data
+    evaluate_clustering(labeled_data.view(labeled_data.size(0), -1).numpy(), labeled_labels, pca_labeled_labels)
+
+    # Perform t-SNE on labeled data
+    tsne_reduced_data, tsne_labels = perform_tsne(labeled_data)
 
-    # Create two plots: PCA results and original labels
-    plt.figure(figsize=(16, 6))
+    # Plot t-SNE for labeled data
+    plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on Labeled Data', save_path=saving_path)
+
+    # For t-SNE on labeled data
+    evaluate_clustering(tsne_reduced_data, labeled_labels, tsne_labels)
+
+    # Perform MFVI on labeled data
+    miu, pi, resp = perform_mfvi(labeled_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)
 
-    # Plot PCA results
-    plot_pca(reduced_data, zi_mfvi, method='MFVI', save_path="pca_plot.png")
+    # Extract cluster assignments from MFVI
+    mfvi_labels = torch.argmax(resp, dim=1).numpy()
 
-    # Plot original labels
-    plot_pca(reduced_data, labels, method="original_labels", save_path="pca_plot.png")
+    # Plot MFVI for labeled data
+    plot_clusters(labeled_data.view(labeled_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on Labeled Data', save_path=saving_path)
 
-    # Calculate clustering metrics for PCA results
-    silhouette_pca = silhouette_score(reduced_data, zi_mfvi)
-    ari_pca = adjusted_rand_score(labels, zi_mfvi)
+    # For MFVI on labeled data
+    evaluate_clustering(labeled_data.view(labeled_data.size(0), -1).numpy(), labeled_labels, mfvi_labels)
+
+    ##### UNLABELED ######
+    # Perform PCA on unlabeled data
+    pca_reduced_data, pca, pca_unlabeled_labels = perform_pca_sgd(unlabeled_data, num_components=2, num_clusters=4, batch_size=64)
 
-    # Print and compare clustering metrics for PCA
-    print("PCA Clustering Metrics Comparison:")
-    print(f"Silhouette Score (PCA): {silhouette_pca}")
-    print(f"Adjusted Rand Index (PCA vs. True Labels): {ari_pca}")
+    # Plot PCA for unlabeled data
+    plot_pca(pca_reduced_data, unlabeled_labels, method='PCA on Unlabeled Data', save_path=saving_path)
 
-    # Plot clusters for MFVI results
-    plot_clusters(reduced_data, torch.from_numpy(zi_mfvi), title="MFVI Clustering Results (Train Data)", save_path=saving_path)
+    # For PCA on unlabeled data
+    evaluate_clustering(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), unlabeled_labels, pca_unlabeled_labels)
+
+    # Perform t-SNE on unlabeled data
+    tsne_reduced_data, tsne_labels = perform_tsne(unlabeled_data)
+
+    # Plot t-SNE for unlabeled data
+    plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on Unlabeled Data', save_path=saving_path)
+
+    # For t-SNE on unlabeled data
+    evaluate_clustering(tsne_reduced_data, unlabeled_labels, tsne_labels)
+
+    # Perform MFVI on unlabeled data
+    miu, pi, resp = perform_mfvi(unlabeled_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)
+
+    # Extract cluster assignments from MFVI
+    mfvi_labels = torch.argmax(resp, dim=1).numpy()
+
+    # Plot MFVI for unlabeled data
+    plot_clusters(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on Unlabeled Data', save_path=saving_path)
+
+    # For MFVI on unlabeled data
+    evaluate_clustering(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), unlabeled_labels, mfvi_labels)
+
+    ##### ALL DATA ######
+    # Perform PCA on all data
+    pca_reduced_data, pca, pca_labels = perform_pca_sgd(all_data, num_components=2, num_clusters=4, batch_size=64)
+
+    # Plot PCA for all data
+    plot_pca(pca_reduced_data, all_labels, method='PCA on All Data', save_path=saving_path)
+
+    # For PCA on all data
+    evaluate_clustering(all_data.view(all_data.size(0), -1).numpy(), all_labels, pca_labels)
+
+    # Perform t-SNE on all data
+    tsne_reduced_data, tsne_labels = perform_tsne(all_data)
+
+    # Plot t-SNE for all data
+    plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on All Data', save_path=saving_path)
+
+    # For t-SNE on all data
+    evaluate_clustering(tsne_reduced_data, all_labels, tsne_labels)
+
+    # Perform MFVI on all data
+    miu, pi, resp = perform_mfvi(all_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)
+
+    # Extract cluster assignments from MFVI
+    mfvi_labels = torch.argmax(resp, dim=1).numpy()
+
+    # Plot MFVI for all data
+    plot_clusters(all_data.view(all_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on All Data', save_path=saving_path)
+
+    # For MFVI on all data
+    evaluate_clustering(all_data.view(all_data.size(0), -1).numpy(), all_labels, mfvi_labels)
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
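A minimal, self-contained sketch (not part of the diff above) of the clustering-evaluation path this change introduces: flatten the segment tensor, reduce it with IncrementalPCA, cluster the reduced data with K-Means, and score the result with the same four metrics that evaluate_clustering prints. The synthetic tensor, label array, and cluster count below are illustrative assumptions, not project data.

# Sketch only: mirrors perform_pca_sgd + evaluate_clustering on synthetic input.
import torch
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import KMeans
from sklearn.metrics import (adjusted_rand_score, adjusted_mutual_info_score,
                             silhouette_score, davies_bouldin_score)

data = torch.randn(200, 8, 16)                     # stand-in for a (segments, freq, time) tensor; real data comes from load_data()
true_labels = torch.randint(0, 4, (200,)).numpy()  # stand-in for the 0.0-3.0 labels kept by load_data()

flat = data.view(data.size(0), -1).numpy()         # one row per segment
reduced = IncrementalPCA(n_components=2, batch_size=64).fit_transform(flat)
pred = KMeans(n_clusters=4, n_init=10).fit_predict(reduced)

print('ARI:', adjusted_rand_score(true_labels, pred))
print('AMI:', adjusted_mutual_info_score(true_labels, pred))
print('Silhouette:', silhouette_score(flat, pred))
print('Davies-Bouldin:', davies_bouldin_score(flat, pred))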