Merge branch 'Luis'
lrm22005 committed Oct 25, 2023
2 parents 0fac61d + f03187d commit 8be2756
Showing 1 changed file with 213 additions and 47 deletions.
260 changes: 213 additions & 47 deletions project_1.py
@@ -12,8 +12,11 @@
from torch.distributions import MultivariateNormal
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score, davies_bouldin_score
import seaborn as sns
from PIL import Image # Import the Image module

@@ -44,16 +47,26 @@ def get_data_paths(data_format, is_linux=False, is_hpc=False):

return data_path, labels_path, saving_path

# Standardize the data
def standard_scaling(data):
    scaler = StandardScaler()
    data_shape = data.shape
    data = data.view(data_shape[0], -1)
    data = scaler.fit_transform(data.numpy())  # fit_transform returns a NumPy array
    data = torch.Tensor(data).view(data_shape)  # back to a tensor in the original shape
    return data
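# Illustrative usage of standard_scaling (the shape below is hypothetical, not taken from
# the real dataset): scaling is fit on the flattened segments and the original shape restored.
#   example = torch.randn(8, 128, 128)   # (n_segments, freq_bins, time_steps) -- made-up sizes
#   scaled = standard_scaling(example)   # same shape, features standardized across segments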

def load_data(data_path, labels_path, dataset_size=2, train=True, standardize=True, data_format='csv', return_all=False):
if data_format not in ['csv', 'png']:
        raise ValueError("Invalid data_format. Choose 'csv' or 'png'.")

dir_list_UID = os.listdir(data_path)
UID_list = dir_list_UID[:dataset_size] if train else dir_list_UID[dataset_size:]

X_data = []
X_data = [] # Store all data
X_data_original = [] # Store original data without standardization
segment_names = []
validated_labels = [] # Store only the label values

for UID in UID_list:
data_path_UID = os.path.join(data_path, UID)
@@ -72,10 +85,10 @@ def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=T
else:
continue # Skip other file formats

            X_data_original.append(time_freq_tensor.clone())  # Store a copy of the original data
            X_data.append(time_freq_tensor)

            segment_names.append(seg.split('_filt')[0])  # Extract and store segment names

X_data = torch.cat(X_data, 0)
X_data_original = torch.cat(X_data_original, 0)
@@ -84,17 +97,61 @@ def load_data(data_path, labels_path, dataset_size=10, train=True, standardize=T
X_data = standard_scaling(X_data) # Standardize the data

# Extract labels from CSV files
    labels = extract_labels(UID_list, labels_path, segment_names)

important_labels = [0.0, 1.0, 2.0, 3.0] # List of important labels

# Initialize labels for segments as unlabeled (-1)
segment_labels = {segment_name: -1 for segment_name in segment_names}

for UID in labels.keys():
if UID not in UID_list:
# Skip UIDs that are not in the dataset
continue

label_data, label_segment_names = labels[UID]

for idx, segment_label in enumerate(label_data):
segment_name = label_segment_names[idx]
if segment_label in important_labels:
segment_labels[segment_name] = segment_label
else:
# Set labels that are not in the important list as -1 (Unlabeled)
segment_labels[segment_name] = -1

# Return all segments along with labels
    if return_all is True:
        return X_data_original, X_data, segment_names, segment_labels, list(segment_labels.values())

# Filter out segments that are unlabeled (-1)
filtered_segment_names = [segment_name for segment_name, label in segment_labels.items() if label != -1]

# Filter data to match the filtered segment names
filtered_data = torch.stack([X_data[segment_names.index(segment_name)] for segment_name in filtered_segment_names])

    # Return only labeled segments along with their labels
    if return_all == 'labeled':
        filtered_labels = {seg: segment_labels[seg] for seg in filtered_segment_names}
        return X_data_original, filtered_data, filtered_segment_names, filtered_labels, list(filtered_labels.values())

# Return unlabeled segments along with labels
    if return_all == 'unlabeled':
        unlabeled_segment_names = [segment_name for segment_name, label in segment_labels.items() if label == -1]
        unlabeled_data = torch.stack([X_data[segment_names.index(segment_name)] for segment_name in unlabeled_segment_names])
        unlabeled_labels = {seg: segment_labels[seg] for seg in unlabeled_segment_names}
        return X_data_original, unlabeled_data, unlabeled_segment_names, unlabeled_labels, list(unlabeled_labels.values())

    # By default, return only labeled segments along with their labels
    filtered_labels = {seg: segment_labels[seg] for seg in filtered_segment_names}
    return X_data_original, filtered_data, filtered_segment_names, filtered_labels, list(filtered_labels.values())
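# Illustrative calls showing how return_all selects segments (paths are placeholders):
#   _, X, names, label_map, y = load_data(dp, lp, return_all='labeled')    # only segments labeled 0-3
#   _, X, names, label_map, y = load_data(dp, lp, return_all='unlabeled')  # only segments left at -1
#   _, X, names, label_map, y = load_data(dp, lp, return_all=True)         # every segment
#   _, X, names, label_map, y = load_data(dp, lp)                          # default: labeled only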


def extract_labels(UID_list, labels_path, segment_names):
labels = {}
for UID in UID_list:
label_file = os.path.join(labels_path, UID + "_final_attemp_4_1_Dong.csv")
if os.path.exists(label_file):
            label_data = pd.read_csv(label_file, sep=',', header=0, names=['segment', 'label'])
            label_segment_names = label_data['segment'].apply(lambda x: x.split('_')[-1].split('.')[0])
            labels[UID] = (label_data['label'].values, label_segment_names.values)

return labels
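# Hypothetical sketch of the per-UID label file this parser expects (the real
# '..._final_attemp_4_1_Dong.csv' layout may differ): a comma-separated file whose first
# column ends in the segment identifier and whose second column holds the label, e.g.
#   segment,label
#   record_seg0001.csv,0.0
#   record_seg0002.csv,2.0
# The lambda above keeps only the token after the final '_' and before the '.', i.e. 'seg0001'.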

def standard_scaling(tensor):
@@ -139,12 +196,42 @@ def visualize_trends(standardized_data, original_data, segment_names, num_plots=
else:
print("This is a trend analysis for data in an unsupported format.")

def perform_pca(data, num_components=2, num_clusters=4):
# Perform PCA for dimensionality reduction
data_flattened = data.view(data.size(0), -1) # Flatten the data
pca = PCA(n_components=num_components)
reduced_data = pca.fit_transform(data_flattened.numpy())

# Cluster the data using K-Means
kmeans = KMeans(n_clusters=num_clusters)
labels = kmeans.fit_predict(reduced_data)

return reduced_data, pca, labels

def perform_pca_sgd(data, num_components=2, num_clusters=4, batch_size=64):
data_flattened = data.view(data.size(0), -1)
ipca = IncrementalPCA(n_components=num_components, batch_size=batch_size)
reduced_data = ipca.fit_transform(data_flattened.numpy())

# Cluster the data using K-Means
kmeans = KMeans(n_clusters=num_clusters)
labels = kmeans.fit_predict(reduced_data)

return reduced_data, ipca, labels
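# Illustrative usage of the incremental variant (batch_size=64 is arbitrary, not tuned):
#   emb, ipca, km = perform_pca_sgd(X, num_components=2, num_clusters=4, batch_size=64)
#   print(ipca.explained_variance_ratio_)   # variance captured by each retained component
# IncrementalPCA fits in mini-batches, so it avoids holding the full flattened matrix in
# memory at once, which is the reason to prefer it over plain PCA for larger datasets.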

# Perform t-SNE for dimensionality reduction, then cluster the embedding with K-Means
def perform_tsne(data, num_components=2, num_clusters=4):
data_flattened = data.view(data.size(0), -1)

# Perform t-SNE
tsne = TSNE(n_components=num_components, perplexity=30, n_iter=300)
reduced_data = tsne.fit_transform(data_flattened.numpy())

# Cluster the data using K-Means
kmeans = KMeans(n_clusters=num_clusters)
labels = kmeans.fit_predict(reduced_data)

return reduced_data, labels
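# Note: scikit-learn's TSNE only provides fit_transform, not transform, so the embedding
# (and the K-Means model fit on it) cannot be reused to project new, unseen segments;
# it has to be recomputed for each dataset passed to perform_tsne.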

def visualize_correlation_matrix(data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=None):
'''
@@ -268,6 +355,18 @@ def perform_mfvi(data, K, n_optimization_iterations, convergence_threshold=1e-5,

return miu, pi, resp
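# resp is the (n_segments x K) responsibility matrix of the variational posterior; hard
# cluster assignments are recovered downstream with torch.argmax(resp, dim=1), as done in main().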

# Function to evaluate clustering and print multiple metrics
def evaluate_clustering(data, true_labels, predicted_labels):
ari = adjusted_rand_score(true_labels, predicted_labels)
ami = adjusted_mutual_info_score(true_labels, predicted_labels)
silhouette = silhouette_score(data, predicted_labels)
davies_bouldin = davies_bouldin_score(data, predicted_labels)

print(f'Adjusted Rand Index (ARI): {ari}')
print(f'Adjusted Mutual Info (AMI): {ami}')
print(f'Silhouette Score: {silhouette}')
print(f'Davies-Bouldin Index: {davies_bouldin}')
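# Illustrative call (array names are placeholders): ARI and AMI compare predicted clusters
# against ground-truth labels and are permutation-invariant, while silhouette and
# Davies-Bouldin score the geometry of the predicted clusters alone.
#   evaluate_clustering(X_flat, y_true, y_pred)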

def plot_pca(reduced_data, labels, method='original_labels', save_path=None):
"""
Plot the PCA results, and optionally save the plot.
Expand Down Expand Up @@ -334,54 +433,121 @@ def main():
data_format = 'csv' # Choose 'csv' or 'png'

data_path, labels_path, saving_path = get_data_paths(data_format, is_linux=is_linux, is_hpc=is_hpc)

# Load data with labels and segment names
_, labeled_data, _, _, labeled_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all="labeled")

# Load unlabeled data
_, unlabeled_data, _, _, unlabeled_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all="unlabeled")

# Load all data (labeled and unlabeled)
original_data, all_data, segment_names, segment_labels, all_labels = load_data(data_path, labels_path, dataset_size=141, train=True, data_format=data_format, return_all=True)

# test_data, _, _, _, _ = load_data(data_path, labels_path, dataset_size=30, train=False)

train_dataloader = create_dataloader(labeled_data)
# test_dataloader = create_dataloader(test_data)

# Visualize random trends/segments
visualize_trends(labeled_data, original_data, segment_names, num_plots=10, data_format=data_format, save_path=saving_path)

# Visualize the correlation matrix
# visualize_correlation_matrix(labeled_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path)
# visualize_correlation_matrix(unlabeled_data, segment_names, subject_mode=True, num_subjects_to_visualize=None, save_path=saving_path)
# visualize_correlation_matrix(all_data, segment_names, subject_mode=False, num_subjects_to_visualize=None, save_path=saving_path)

##### LABELED ######
# Perform PCA on labeled data
    pca_reduced_data, pca, pca_labeled_labels = perform_pca_sgd(labeled_data, num_components=2, num_clusters=4, batch_size=64)

# Plot PCA for labeled data
plot_pca(pca_reduced_data, labeled_labels, method='PCA on Labeled Data', save_path=saving_path)

# For PCA on labeled data
evaluate_clustering(labeled_data.view(labeled_data.size(0), -1).numpy(), labeled_labels, pca_labeled_labels)

# Perform t-SNE on labeled data
tsne_reduced_data, tsne_labels = perform_tsne(labeled_data)

# Plot t-SNE for labeled data
plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on Labeled Data', save_path=saving_path)

# For t-SNE on labeled data
evaluate_clustering(tsne_reduced_data, labeled_labels, tsne_labels)

# Perform MFVI on labeled data
miu, pi, resp = perform_mfvi(labeled_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)

# Extract cluster assignments from MFVI
mfvi_labels = torch.argmax(resp, dim=1).numpy()

# Plot MFVI for labeled data
plot_clusters(labeled_data.view(labeled_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on Labeled Data', save_path=saving_path)

# For MFVI on labeled data
evaluate_clustering(labeled_data.view(labeled_data.size(0), -1).numpy(), labeled_labels, mfvi_labels)

##### UNLABELED ######
# Perform PCA on unlabeled data
pca_reduced_data, pca, pca_unlabeled_labels = perform_pca_sgd(unlabeled_data, num_components=2, num_clusters=4, batch_size=64)

# Plot PCA for unlabeled data
plot_pca(pca_reduced_data, unlabeled_labels, method='PCA on Unlabeled Data', save_path=saving_path)

# For PCA on unlabeled data
evaluate_clustering(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), unlabeled_labels, pca_unlabeled_labels)

# Perform t-SNE on unlabeled data
tsne_reduced_data, tsne_labels = perform_tsne(unlabeled_data)

# Plot t-SNE for unlabeled data
plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on Unlabeled Data', save_path=saving_path)

# For t-SNE on unlabeled data
evaluate_clustering(tsne_reduced_data, unlabeled_labels, tsne_labels)

# Perform MFVI on unlabeled data
miu, pi, resp = perform_mfvi(unlabeled_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)

# Extract cluster assignments from MFVI
mfvi_labels = torch.argmax(resp, dim=1).numpy()

# Plot MFVI for unlabeled data
plot_clusters(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on Unlabeled Data', save_path=saving_path)

# For MFVI on unlabeled data
evaluate_clustering(unlabeled_data.view(unlabeled_data.size(0), -1).numpy(), unlabeled_labels, mfvi_labels)

##### ALL DATA ######
# Perform PCA on all data
    pca_reduced_data, pca, pca_labels = perform_pca_sgd(all_data, num_components=2, num_clusters=4, batch_size=64)

# Plot PCA for all data
plot_pca(pca_reduced_data, all_labels, method='PCA on All Data', save_path=saving_path)

# For PCA on all data
evaluate_clustering(all_data.view(all_data.size(0), -1).numpy(), all_labels, pca_labels)

# Perform t-SNE on all data
tsne_reduced_data, tsne_labels = perform_tsne(all_data)

# Plot t-SNE for all data
plot_clusters(tsne_reduced_data, tsne_labels, 't-SNE on All Data', save_path=saving_path)

# For t-SNE on all data
evaluate_clustering(tsne_reduced_data, all_labels, tsne_labels)

# Perform MFVI on all data
miu, pi, resp = perform_mfvi(all_data, K=4, n_optimization_iterations=300, convergence_threshold=1e-5, run_until_convergence=False)

# Extract cluster assignments from MFVI
mfvi_labels = torch.argmax(resp, dim=1).numpy()

# Plot MFVI for all data
plot_clusters(all_data.view(all_data.size(0), -1).numpy(), mfvi_labels, 'MFVI on All Data', save_path=saving_path)

# For MFVI on all data
evaluate_clustering(all_data.view(all_data.size(0), -1).numpy(), all_labels, mfvi_labels)

if __name__ == "__main__":
main()
