GENERAL UPDATE
This commit touches the whole codebase, optimizing the data-loading and data-processing procedures. It also reworks the PCA functions, adds t-SNE, and adds a simple way to plot the results.
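For orientation, here is a minimal usage sketch of the reworked pipeline described above (the new return_all loading modes, incremental PCA plus K-Means, t-SNE, and the shared metric helper). The function names come from the diff that follows; the import of project_1, the paths, dataset_size, and cluster count are placeholder assumptions, not values from this commit.

# Hypothetical usage sketch; paths, dataset_size, and cluster count are placeholders.
from project_1 import load_data, perform_pca_sgd, perform_tsne, evaluate_clustering, plot_pca

data_path, labels_path, saving_path = "data/", "labels/", "results/"  # placeholder paths

# Load only the labeled segments; return_all='labeled' also yields the label values.
_, labeled_data, seg_names, seg_label_map, seg_labels = load_data(
    data_path, labels_path, dataset_size=2, train=True,
    data_format='csv', return_all='labeled')

# Incremental PCA + K-Means on the flattened segments, then plot and score.
reduced, ipca, kmeans_labels = perform_pca_sgd(labeled_data, num_components=2, num_clusters=4, batch_size=64)
plot_pca(reduced, list(seg_labels), method='PCA on Labeled Data', save_path=saving_path)
evaluate_clustering(labeled_data.view(labeled_data.size(0), -1).numpy(), list(seg_labels), kmeans_labels)

# t-SNE embedding with K-Means on top, scored with the same helper.
tsne_embedding, tsne_labels = perform_tsne(labeled_data, num_components=2, num_clusters=4)
evaluate_clustering(tsne_embedding, list(seg_labels), tsne_labels)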
lrm22005 committed Oct 25, 2023
1 parent 92fca39 commit f03187d
Showing 1 changed file, project_1.py, with 214 additions and 48 deletions.
@@ -12,8 +12,11 @@
from torch.distributions import MultivariateNormal
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score, davies_bouldin_score
import seaborn as sns
from PIL import Image # Import the Image module

@@ -44,16 +47,26 @@ def get_data_paths(data_format, is_linux=False, is_hpc=False):

return data_path, labels_path, saving_path

def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=True, data_format='csv'):
# Standardize the data
def standard_scaling(data):
scaler = StandardScaler()
data_shape = data.shape
flat = data.view(data_shape[0], -1).numpy()  # flatten each segment for scikit-learn
scaled = scaler.fit_transform(flat)  # returns a NumPy array of z-scored values
return torch.Tensor(scaled).view(data_shape)  # back to a tensor with the original shape

def load_data(data_path, labels_path, dataset_size=2, train=True, standardize=True, data_format='csv', return_all=False):
if data_format not in ['csv', 'png']:
raise ValueError("Invalid data_format. Choose 'csv' or 'png'.")

dir_list_UID = os.listdir(data_path)
UID_list = dir_list_UID[:dataset_size] if train else dir_list_UID[dataset_size:]

X_data = []
X_data = [] # Store all data
X_data_original = [] # Store original data without standardization
segment_names = []
validated_labels = [] # Store only the label values

for UID in UID_list:
data_path_UID = os.path.join(data_path, UID)
@@ -72,29 +85,73 @@ def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=T
else:
continue # Skip other file formats

# X_data_original.append(time_freq_tensor.clone()) # Store a copy of the original data
X_data.append(time_freq_tensor)
X_data_original.append(time_freq_tensor.clone()) # Store a copy of the original data

segment_names.append(seg) # Store segment names
segment_names.append(seg.split('_filt')[0]) # Extract and store segment names

X_data = torch.cat(X_data, 0)
# X_data_original = torch.cat(X_data_original, 0)
X_data_original = torch.cat(X_data_original, 0)

if standardize:
X_data = standard_scaling(X_data) # Standardize the data

# Extract labels from CSV files
labels = extract_labels(UID_list, labels_path)
labels = extract_labels(UID_list, labels_path, segment_names)

important_labels = [0.0, 1.0, 2.0, 3.0] # List of important labels

# Initialize labels for segments as unlabeled (-1)
segment_labels = {segment_name: -1 for segment_name in segment_names}

for UID in labels.keys():
if UID not in UID_list:
# Skip UIDs that are not in the dataset
continue

label_data, label_segment_names = labels[UID]

for idx, segment_label in enumerate(label_data):
segment_name = label_segment_names[idx]
if segment_label in important_labels:
segment_labels[segment_name] = segment_label
else:
# Set labels that are not in the important list as -1 (Unlabeled)
segment_labels[segment_name] = -1

return X_data_original, X_data, segment_names, labels
# Return all segments along with labels
if return_all:
return X_data_original, X_data, segment_names, segment_labels, list(segment_labels.values())

def extract_labels(UID_list, labels_path):
# Filter out segments that are unlabeled (-1)
filtered_segment_names = [segment_name for segment_name, label in segment_labels.items() if label != -1]

# Filter data to match the filtered segment names
filtered_data = torch.stack([X_data[segment_names.index(segment_name)] for segment_name in filtered_segment_names])

# Return labeled and unlabeled segments along with labels
if return_all == 'labeled':
return X_data_original, filtered_data, filtered_segment_names, {seg: segment_labels[seg] for seg in filtered_segment_names}, [segment_labels[seg] for seg in filtered_segment_names]

# Return unlabeled segments along with labels
if return_all == 'unlabeled':
unlabeled_segment_names = [segment_name for segment_name, label in segment_labels.items() if label == -1]
unlabeled_data = torch.stack([X_data[segment_names.index(segment_name)] for segment_name in unlabeled_segment_names])
return X_data_original, unlabeled_data, unlabeled_segment_names, {seg: segment_labels[seg] for seg in unlabeled_segment_names}, [segment_labels[seg] for seg in unlabeled_segment_names]

# By default, return only labeled segments along with labels
return X_data_original, filtered_data, filtered_segment_names, {seg: segment_labels[seg] for seg in filtered_segment_names}, [segment_labels[seg] for seg in filtered_segment_names]


def extract_labels(UID_list, labels_path, segment_names):
labels = {}
for UID in UID_list:
label_file = os.path.join(labels_path, UID + "_final_attemp_4_1_Dong.csv")
if os.path.exists(label_file):
label_data = pd.read_csv(label_file, sep='\t', header=None, names=['segment', 'label'])
labels[UID] = label_data['label'].values
label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label'])
label_segment_names = label_data['segment'].apply(lambda x: x.split('_')[-1].split('.')[0])
labels[UID] = (label_data['label'].values, label_segment_names.values)

return labels

def standard_scaling(tensor):
@@ -139,12 +196,42 @@ def visualize_trends(standardized_data, original_data, segment_names, num_plots=
else:
print("This is a trend analysis for data in an unsupported format.")

def perform_pca(data, num_components=2):
def perform_pca(data, num_components=2, num_clusters=4):
# Perform PCA for dimensionality reduction
data_flattened = data.view(data.size(0), -1) # Flatten the data
pca = PCA(n_components=num_components)
reduced_data = pca.fit_transform(data_flattened.numpy())
return reduced_data, pca

# Cluster the data using K-Means
kmeans = KMeans(n_clusters=num_clusters)
labels = kmeans.fit_predict(reduced_data)

return reduced_data, pca, labels

def perform_pca_sgd(data, num_components=2, num_clusters=4, batch_size=64):
data_flattened = data.view(data.size(0), -1)
ipca = IncrementalPCA(n_components=num_components, batch_size=batch_size)
reduced_data = ipca.fit_transform(data_flattened.numpy())

# Cluster the data using K-Means
kmeans = KMeans(n_clusters=num_clusters)
labels = kmeans.fit_predict(reduced_data)

return reduced_data, ipca, labels

# t-SNE for dimensionality reduction, with K-Means clustering on the embedding
def perform_tsne(data, num_components=2, num_clusters=4):
data_flattened = data.view(data.size(0), -1)

# Perform t-SNE
tsne = TSNE(n_components=num_components, perplexity=30, n_iter=300)
reduced_data = tsne.fit_transform(data_flattened.numpy())

# Cluster the data using K-Means
kmeans = KMeans(n_clusters=num_clusters)
labels = kmeans.fit_predict(reduced_data)

return reduced_data, labels

def visualize_correlation_matrix(data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=None):
'''
@@ -268,6 +355,18 @@ def perform_mfvi(data, K, n_optimization_iterations, convergence_threshold=1e-5,

return miu, pi, resp

# Function to evaluate clustering and print multiple metrics
def evaluate_clustering(data, true_labels, predicted_labels):
ari = adjusted_rand_score(true_labels, predicted_labels)
ami = adjusted_mutual_info_score(true_labels, predicted_labels)
silhouette = silhouette_score(data, predicted_labels)
davies_bouldin = davies_bouldin_score(data, predicted_labels)

print(f'Adjusted Rand Index (ARI): {ari}')
print(f'Adjusted Mutual Info (AMI): {ami}')
print(f'Silhouette Score: {silhouette}')
print(f'Davies-Bouldin Index: {davies_bouldin}')

def plot_pca(reduced_data, labels, method='original_labels', save_path=None):
"""
Plot the PCA results, and optionally save the plot.
@@ -334,54 +433,121 @@ def main():
data_format = 'csv' # Choose 'csv' or 'png'

data_path, labels_path, saving_path = get_data_paths(data_format, is_linux=is_linux, is_hpc=is_hpc)
original_data, standardized_data, segment_names, labels = load_data(data_path, labels_path, dataset_size=10, train=False, data_format=data_format)
# test_data, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False)

# Load data with labels and segment names
_, labeled_data, _, _, labeled_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all="labeled")

# Load unlabeled data
_, unlabeled_data, _, _, unlabeled_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all="unlabeled")

# Load all data (labeled and unlabeled)
original_data, all_data, segment_names, segment_labels, all_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all=True)

# test_data, _, _, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False)

train_dataloader = create_dataloader(standardized_data)
train_dataloader = create_dataloader(labeled_data)
# test_dataloader = create_dataloader(test_data)

# Visualize random trends/segments
visualize_trends(standardized_data, original_data, segment_names, num_plots=10, data_format=data_format, save_path=None)

# Perform PCA for dimensionality reduction
# reduced_data, pca = perform_pca(standardized_data, num_components=2)
# print("Explained variance ratio:", pca.explained_variance_ratio_)
visualize_trends(labeled_data, original_data, segment_names, num_plots=10, data_format=data_format, save_path=saving_path)

# Visualize the correlation matrix
visualize_correlation_matrix(standardized_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path)
# visualize_correlation_matrix(standardized_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path)
# visualize_correlation_matrix(standardized_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path)
# visualize_correlation_matrix(labeled_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path)
# visualize_correlation_matrix(unlabeled_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path)
# visualize_correlation_matrix(all_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path)

# Perform MFVI for your data
K = 4 # Number of clusters
miu_mfvi, pi_mfvi, resp_mfvi = perform_mfvi(standardized_data, K, n_optimization_iterations=1000, convergence_threshold=1e-5, run_until_convergence=False)
##### LABELED ######
# Perform PCA on labeled data
pca_reduced_data, pca, pca_labeled_labels = perform_pca_sgd(labeled_data, num_components=2, num_clusters=4, batch_size=64)

# Calculate clustering metrics for MFVI
zi_mfvi = np.argmax(resp_mfvi, axis=1)
# Perform PCA for dimensionality reduction
reduced_data, pca = perform_pca(standardized_data, num_components=2)
print("Explained variance ratio:", pca.explained_variance_ratio_)
# Plot PCA for labeled data
plot_pca(pca_reduced_data, labeled_labels, method='PCA on Labeled Data', save_path=saving_path)

# For PCA on labeled data
evaluate_clustering(labeled_data.view(labeled_data.size(0), -1).numpy(), labeled_labels, pca_labeled_labels)

# Perform t-SNE on labeled data
tsne_reduced_data, tsne_labels = perform_tsne(labeled_data)

# Create two plots: PCA results and original labels
plt.figure(figsize=(16, 6))
# Plot t-SNE for labeled data
plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on Labeled Data', save_path=saving_path)

# For t-SNE on labeled data
evaluate_clustering(tsne_reduced_data, labeled_labels, tsne_labels)

# Perform MFVI on labeled data
miu, pi, resp = perform_mfvi(labeled_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)

# Plot PCA results
plot_pca(reduced_data, zi_mfvi, method='MFVI', save_path="pca_plot.png")
# Extract cluster assignments from MFVI
mfvi_labels = torch.argmax(resp, dim=1).numpy()

# Plot original labels
plot_pca(reduced_data, labels, method="original_labels", save_path="pca_plot.png")
# Plot MFVI for labeled data
plot_clusters(labeled_data.view(labeled_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on Labeled Data', save_path=saving_path)

# Calculate clustering metrics for PCA results
silhouette_pca = silhouette_score(reduced_data, zi_mfvi)
ari_pca = adjusted_rand_score(labels, zi_mfvi)
# For MFVI on labeled data
evaluate_clustering(labeled_data.view(labeled_data.size(0), -1).numpy(), labeled_labels, mfvi_labels)

##### UNLABELED ######
# Perform PCA on unlabeled data
pca_reduced_data, pca, pca_unlabeled_labels = perform_pca_sgd(unlabeled_data, num_components=2, num_clusters=4, batch_size=64)

# Print and compare clustering metrics for PCA
print("PCA Clustering Metrics Comparison:")
print(f"Silhouette Score (PCA): {silhouette_pca}")
print(f"Adjusted Rand Index (PCA vs. True Labels): {ari_pca}")
# Plot PCA for unlabeled data
plot_pca(pca_reduced_data, unlabeled_labels, method='PCA on Unlabeled Data', save_path=saving_path)

# Plot clusters for MFVI results
plot_clusters(reduced_data, torch.from_numpy(zi_mfvi), title="MFVI Clustering Results (Train Data)", save_path=saving_path)
# For PCA on unlabeled data
evaluate_clustering(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), unlabeled_labels, pca_unlabeled_labels)

# Perform t-SNE on unlabeled data
tsne_reduced_data, tsne_labels = perform_tsne(unlabeled_data)

# Plot t-SNE for unlabeled data
plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on Unlabeled Data', save_path=saving_path)

# For t-SNE on unlabeled data
evaluate_clustering(tsne_reduced_data, unlabeled_labels, tsne_labels)

# Perform MFVI on unlabeled data
miu, pi, resp = perform_mfvi(unlabeled_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)

# Extract cluster assignments from MFVI
mfvi_labels = torch.argmax(resp, dim=1).numpy()

# Plot MFVI for unlabeled data
plot_clusters(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on Unlabeled Data', save_path=saving_path)

# For MFVI on unlabeled data
evaluate_clustering(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), unlabeled_labels, mfvi_labels)

##### ALL DATA ######
# Perform PCA on all data
pca_reduced_data, pca, pca_labels = perform_pca_sgd(all_data, num_components=2, num_clusters=4, batch_size=64)

# Plot PCA for all data
plot_pca(pca_reduced_data, all_labels, method='PCA on All Data', save_path=saving_path)

# For PCA on all data
evaluate_clustering(all_data.view(all_data.size(0), -1).numpy(), all_labels, pca_labels)

# Perform t-SNE on all data
tsne_reduced_data, tsne_labels = perform_tsne(all_data)

# Plot t-SNE for all data
plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on All Data', save_path=saving_path)

# For t-SNE on all data
evaluate_clustering(tsne_reduced_data, all_labels, tsne_labels)

# Perform MFVI on all data
miu, pi, resp = perform_mfvi(all_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)

# Extract cluster assignments from MFVI
mfvi_labels = torch.argmax(resp, dim=1).numpy()

# Plot MFVI for all data
plot_clusters(all_data.view(all_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on All Data', save_path=saving_path)

# For MFVI on all data
evaluate_clustering(all_data.view(all_data.size(0), -1).numpy(), all_labels, mfvi_labels)

if __name__ == "__main__":
main()
