From 4b6b50eb16afa5172c1166000ffd1e543ada9053 Mon Sep 17 00:00:00 2001
From: Luis Roberto Mercado Diaz
Date: Mon, 23 Oct 2023 15:39:24 -0400
Subject: [PATCH] Initial commit

Starting the coding. I am using modular code because I need to avoid
repeating errors and to simplify debugging.
---
 project_1.py | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 project_1.py

diff --git a/project_1.py b/project_1.py
new file mode 100644
index 0000000..d9fdf85
--- /dev/null
+++ b/project_1.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct 23 14:50:24 2023
+
+@author: lrm22005
+"""
+
+import os
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+import seaborn as sns
+
+def load_data(data_path, dataset_size=10, train=True, standardize=True):
+    # Load data from the specified data_path
+    dir_list_UID = os.listdir(data_path)
+    UID_list = dir_list_UID[:dataset_size] if train else dir_list_UID[dataset_size:]
+
+    X_data = []
+    segment_names = []
+
+    for UID in UID_list:
+        data_path_UID = os.path.join(data_path, UID)
+        dir_list_seg = os.listdir(data_path_UID)
+
+        for seg in dir_list_seg[:50]:  # Limiting to 50 segments
+            seg_path = os.path.join(data_path_UID, seg)
+            time_freq_plot = np.array(pd.read_csv(seg_path, header=None))
+            time_freq_tensor = torch.Tensor(time_freq_plot).reshape(1, 128, 128)
+            X_data.append(time_freq_tensor)
+            segment_names.append(seg)  # Store segment names
+
+    X_data = torch.cat(X_data, 0)
+
+    if standardize:
+        X_data = standard_scaling(X_data)
+
+    return X_data, segment_names
+
+def standard_scaling(tensor):
+    # Z-score normalization (standardization)
+    mean = tensor.mean()
+    std = tensor.std()
+    tensor_standardized = (tensor - mean) / std
+    return tensor_standardized
+
+def create_dataloader(data, batch_size=64, shuffle=True):
+    dataset = TensorDataset(data, data)  # pair each sample with itself as the target
+    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+    return data_loader
+
+def visualize_trends(data, segment_names, num_plots=3):
+    # Visualize random trends/segments
+    num_samples, _, _ = data.shape
+    for _ in range(num_plots):
+        idx = np.random.randint(0, num_samples)
+        plt.imshow(data[idx].numpy())
+        plt.title(f"Segment: {segment_names[idx]}")
+        plt.colorbar()
+        plt.show()
+
+def perform_pca(data, num_components=2):
+    # Perform PCA for dimensionality reduction
+    data_flattened = data.view(data.size(0), -1)  # Flatten the data
+    pca = PCA(n_components=num_components)
+    reduced_data = pca.fit_transform(data_flattened.numpy())
+    return reduced_data, pca
+
+def visualize_correlation_matrix(data):
+    # Visualize the correlation matrix
+    data_flattened = data.view(data.size(0), -1).numpy()
+    correlation_matrix = np.corrcoef(data_flattened, rowvar=False)
+    sns.heatmap(correlation_matrix, cmap="coolwarm", xticklabels=False, yticklabels=False)
+    plt.title("Correlation Matrix")
+    plt.show()
+
+def main():
+    is_linux = False  # Set to True if running on Linux, False if on Windows
+    if is_linux:
+        data_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch/TFS_csv"
+    else:
+        data_path = r"R:\ENGR_Chon\Dong\MATLAB_generate_results\NIH_PulseWatch\TFS_csv"
+
+    train_data, segment_names = load_data(data_path, dataset_size=141, train=True)
+    test_data, _ = load_data(data_path, dataset_size=141, train=False)
+
+    train_dataloader = create_dataloader(train_data)
+    test_dataloader = create_dataloader(test_data)
+
+    # Visualize random trends/segments
+    visualize_trends(train_data, segment_names, num_plots=3)
+
+    # Perform PCA for dimensionality reduction
+    reduced_data, pca = perform_pca(train_data, num_components=2)
+    print("Explained variance ratio:", pca.explained_variance_ratio_)
+
+    # Visualize the correlation matrix
+    visualize_correlation_matrix(train_data)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
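
For quick local verification, a smoke test along these lines could exercise load_data and create_dataloader against synthetic CSVs instead of the shared R:/mnt data directory. This is a minimal sketch, assuming the patched file is importable as project_1 and that each segment CSV holds a 128x128 matrix, as implied by the reshape inside load_data; the temporary directory, UID folder names, and file names below are hypothetical.

import os
import tempfile

import numpy as np
import pandas as pd

from project_1 import load_data, create_dataloader

with tempfile.TemporaryDirectory() as tmp:
    # Create two fake "UID" folders, each holding two 128x128 segment CSVs.
    for uid in ["UID_001", "UID_002"]:
        uid_dir = os.path.join(tmp, uid)
        os.makedirs(uid_dir)
        for i in range(2):
            segment = np.random.rand(128, 128)
            pd.DataFrame(segment).to_csv(
                os.path.join(uid_dir, f"seg_{i}.csv"), header=False, index=False
            )

    # With dataset_size=1, the first UID becomes "train" and the rest "test".
    X_train, names = load_data(tmp, dataset_size=1, train=True)
    loader = create_dataloader(X_train, batch_size=2)
    batch_x, batch_y = next(iter(loader))

    print(X_train.shape)  # expected: torch.Size([2, 128, 128])
    print(len(names))     # expected: 2
    print(batch_x.shape)  # expected: torch.Size([2, 128, 128])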