ERROR IN MFVI
I found that the matrix dimensions in multivariate_normal_log_pdf_MFVI were generating errors; solved by treating the input as an N x H x W tensor and flattening each 2D segment before evaluating the multivariate normal log PDF. Also removed the unused visualize_correlation_matrix step and guarded load_data against datasets with no labeled segments.
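
A minimal sketch of the failure and the fix, with hypothetical sizes (the real tensors come from load_data): MultivariateNormal.log_prob expects points with event size D = H * W, so each 2D segment must be flattened first.

import torch
from torch.distributions import MultivariateNormal

# Hypothetical sizes: N segments of H x W time-frequency data,
# K mixture components with diagonal covariance, D = H * W.
N, H, W, K = 4, 3, 5, 2
x = torch.randn(N, H, W)
mu = torch.zeros(K, H * W)
sigma_sq = torch.ones(K, H * W)

log_p = torch.empty(N, K)
for k in range(K):
    mvn = MultivariateNormal(mu[k], torch.diag(sigma_sq[k]))
    # mvn.log_prob(x) on the raw (N, H, W) tensor raises a shape error;
    # flattening each segment to a D-vector matches the event shape.
    log_p[:, k] = mvn.log_prob(x.view(N, -1))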
lrm22005 committed Nov 2, 2023
1 parent 249ffbb commit b93972d
Showing 2 changed files with 38 additions and 100 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@

+error_log.txt
+progress.log
135 changes: 35 additions & 100 deletions project_1.py
@@ -16,11 +16,13 @@
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
+from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score, davies_bouldin_score
import seaborn as sns
from PIL import Image # Import the Image module

+os.environ['OMP_NUM_THREADS'] = '3'
# Create a logger
logger = logging.getLogger(__name__)
logging.basicConfig(filename='error_log.txt', level=logging.ERROR)
@@ -35,6 +37,7 @@
progress_logger.addHandler(progress_handler)

def get_data_paths(data_format, is_linux=False, is_hpc=False):
+    log_progress("Code execution get_data_paths")
    if is_linux:
        base_path = "/mnt/r/ENGR_Chon/Dong/MATLAB_generate_results/NIH_PulseWatch"
        labels_base_path = "/mnt/r/ENGR_Chon/NIH_Pulsewatch_Database/Adjudication_UConn"
@@ -46,8 +49,7 @@ def get_data_paths(data_format, is_linux=False, is_hpc=False):
    else:
        base_path = "R:\\ENGR_Chon\\Dong\\MATLAB_generate_results\\NIH_PulseWatch"
        labels_base_path = "R:\\ENGR_Chon\\NIH_Pulsewatch_Database\\Adjudication_UConn"
-        saving_base_path = "R:\\ENGR_Chon\\Luis\\Research\\Casseys_case\\Project_1_analysis"
+        saving_base_path = r"\\grove.ad.uconn.edu\research\ENGR_Chon\Luis\Research\Casseys_case"
    if data_format == 'csv':
        data_path = os.path.join(base_path, "TFS_csv")
        labels_path = os.path.join(labels_base_path, "final_attemp_4_1_Dong_Ohm")
@@ -58,16 +60,13 @@ def get_data_paths(data_format, is_linux=False, is_hpc=False):
        saving_path = os.path.join(saving_base_path, "Project_1_analysis")
    else:
        raise ValueError("Invalid data format. Choose 'csv' or 'png'.")
+    log_progress("Code execution completed get_data_paths")
    return data_path, labels_path, saving_path

# Standardize the data
def standard_scaling(data):
    scaler = StandardScaler()
-    data_shape = data.shape
-    data = data.view(data_shape[0], -1)
-    data = scaler.fit_transform(data)
-    data = data.view(data_shape)
+    data = scaler.fit_transform(data.reshape(-1, data.shape[-1])).reshape(data.shape)
    return torch.Tensor(data)
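
For reference, a sketch of what the rewritten standard_scaling computes, assuming an (N, H, W) NumPy input: StandardScaler only accepts 2D arrays, so the data is viewed as rows of length W, scaled per column, and reshaped back.

import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

batch = np.random.randn(8, 64, 32)         # hypothetical sizes (N, H, W)
flat = batch.reshape(-1, batch.shape[-1])  # (N * H, W): one row per time slice
scaled = StandardScaler().fit_transform(flat).reshape(batch.shape)
out = torch.Tensor(scaled)                 # same shape as the input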

def load_data(data_path, labels_path, dataset_size=2, train=True, standardize=True, data_format='csv', return_all=False):
@@ -108,7 +107,6 @@ def load_data(data_path, labels_path, dataset_size=2, train=True, standardize=True, data_format='csv', return_all=False):
                logger.error(f"Error processing segment: {seg} in UID: {UID}. Exception: {str(e)}")
                logger.error(f"Error processing segment: {time_freq_plot.size()} in UID: {UID}. Exception: {str(e)}")
                # You can also add more information to the error log, such as the value of time_freq_plot.
                continue  # Continue to the next segment

    X_data = torch.cat(X_data, 0)
    X_data_original = torch.cat(X_data_original, 0)
@@ -141,18 +139,24 @@ def load_data(data_path, labels_path, dataset_size=2, train=True, standardize=True, data_format='csv', return_all=False):

    # Filter out segments that are unlabeled (-1)
    filtered_segment_names = [segment_name for segment_name, label in segment_labels.items() if label != -1]
-    # Filter data to match the filtered segment names
-    filtered_data = torch.stack([X_data[segment_names.index(segment_name)] for segment_name in filtered_segment_names])
+    # Check if there are no labeled segments
+    if not filtered_segment_names:
+        filtered_data = None  # Set filtered_data to None
+    else:
+        # Filter data to match the filtered segment names
+        filtered_data = torch.stack([X_data[segment_names.index(segment_name)] for segment_name in filtered_segment_names])

    # Return all segments along with labels
    if return_all is True:
        return X_data_original, X_data, segment_names, segment_labels, segment_labels.values()

    # Return labeled and unlabeled segments along with labels
    if return_all == 'labeled':
-        return X_data_original, filtered_data, filtered_segment_names, {seg: segment_labels[seg] for seg in filtered_segment_names}, {seg: segment_labels[seg] for seg in filtered_segment_names}.values()
+        if filtered_data is not None:
+            return X_data_original, filtered_data, filtered_segment_names, {seg: segment_labels[seg] for seg in filtered_segment_names}, {seg: segment_labels[seg] for seg in filtered_segment_names}.values()
+        else:
+            return X_data_original, None, [], {}, []

    # Return unlabeled segments along with labels
    elif return_all == 'unlabeled':
        unlabeled_segment_names = [segment_name for segment_name, label in segment_labels.items() if label == -1]
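
The filtered_data guard added above matters because torch.stack raises on an empty list, so a dataset with no labeled segments used to crash before reaching any return. A quick check:

import torch

try:
    torch.stack([])  # what the old unconditional stack did with no labels
except RuntimeError as e:
    print(e)         # e.g. "stack expects a non-empty TensorList"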
@@ -207,8 +211,8 @@ def visualize_trends(standardized_data, original_data, segment_names, num_plots=
        plt.tight_layout()

        if save_path:
-            subject_save_path = os.path.join(save_path, f"trends_visualization_segment_{segment_names[idx]}.png")
-            plt.savefig(subject_save_path)
+            subject_save_path = os.path.join(save_path, f"trends_{segment_names[idx]}.png")
+            plt.savefig(subject_save_path, dpi=400, format='png')
        plt.show()
    elif data_format == 'png':
        print("This is a trend analysis for PNG data format.")
@@ -233,7 +237,7 @@ def perform_pca_sgd(data, num_components=2, num_clusters=4, batch_size=64):
    reduced_data = ipca.fit_transform(data_flattened.numpy())

    # Cluster the data using K-Means
-    kmeans = KMeans(n_clusters=num_clusters)
+    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init='auto')
    labels = kmeans.fit_predict(reduced_data)

    return reduced_data, ipca, labels
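
One caveat on the KMeans change: n_init='auto' requires scikit-learn 1.2 or newer, and with init='k-means++' it resolves to a single initialization. On older versions an explicit count is needed; a sketch with n_init=10, the long-standing default:

from sklearn.cluster import KMeans

# Explicit restart count for scikit-learn < 1.2, where 'auto' is not accepted.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10)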
@@ -247,94 +251,30 @@ def perform_tsne(data, num_components=2, num_clusters=4):
    reduced_data = tsne.fit_transform(data_flattened.numpy())

    # Cluster the data using K-Means
-    kmeans = KMeans(n_clusters=num_clusters)
+    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init='auto')
    labels = kmeans.fit_predict(reduced_data)

    return reduced_data, labels

-def visualize_correlation_matrix(data, segment_names, subject_mode=True, num_subjects_to_visualize=None, batch_size=32, method='pearson', save_path=None):
-    data_flattened = data.view(data.size(0), -1).numpy()
-
-    if subject_mode:
-        subject_names = [filename.split('_')[0] for filename in segment_names]
-        unique_subjects = list(set(subject_names))
-    else:
-        subject_names = [filename.split('_')[0] for filename in segment_names]
-        unique_subjects = list(set(subject_names))
-
-    if num_subjects_to_visualize is None:
-        num_subjects_to_visualize = len(unique_subjects)
-
-    for i in range(num_subjects_to_visualize):
-        subject = unique_subjects[i]
-        subject_indices = [j for j, name in enumerate(subject_names) if name == subject]
-        subject_data = data_flattened[subject_indices]
-
-        # Shuffle the data to avoid bias
-        np.random.shuffle(subject_data)
-
-        # Calculate the number of batches
-        num_batches = len(subject_data) // batch_size
-
-        batch_correlations = []
-
-        for batch_index in range(num_batches):
-            start = batch_index * batch_size
-            end = (batch_index + 1) * batch_size
-            batch = subject_data[start:end]
-
-            # Calculate the correlation matrix for the batch
-            correlation_matrix = np.corrcoef(batch, rowvar=False)
-
-            # Calculate the mean or median of the per-batch correlations
-            if method == 'mean':
-                batch_correlation = np.mean(correlation_matrix)
-            elif method == 'median':
-                batch_correlation = np.median(correlation_matrix)
-
-            batch_correlations.append(batch_correlation)
-
-        # Aggregate the batch correlations
-        overall_correlation = np.mean(batch_correlations)  # You can use median instead of mean if needed
-
-        # Calculate confidence intervals on the aggregated correlation
-        batch_correlations = np.array(batch_correlations)
-        ci_lower = np.percentile(batch_correlations, 2.5)
-        ci_upper = np.percentile(batch_correlations, 97.5)
-
-        # Print or save the results
-        print(f"Overall Correlation for {num_subjects_to_visualize} Subjects {subject}: {overall_correlation:.4f}")
-        print(f"Confidence Intervals: [{ci_lower:.4f}, {ci_upper:.4f}]")
-
-        if save_path:
-            subject_save_path = os.path.join(save_path, f"correlation_matrix_subject_group_{subject}.png")
-            plt.figure()
-            sns.heatmap(correlation_matrix, cmap="coolwarm", xticklabels=False, yticklabels=False)
-            plt.title(f"Correlation Matrix for {num_subjects_to_visualize} Subjects {subject}")
-            plt.savefig(subject_save_path)
-            plt.close()
-
-        # Plot the per-batch correlations over time/batches
-        plt.figure()
-        plt.plot(batch_correlations)
-        plt.xlabel("Batch Index")
-        plt.ylabel("Correlation")
-        plt.title(f"Per-Batch Correlations for {num_subjects_to_visualize} Subjects {subject}")
-        plt.show()
-
-    return reduced_data, labels

# This function computes the log PDF of a multivariate normal distribution
def multivariate_normal_log_pdf_MFVI(x, mu, sigma_sq):
-    # x: Data points (N x D)
+    # x: Data points (N x H x W)
    # mu: Means of the components (K x D)
    # sigma_sq: Variances of the components (K x D)
-    N, D = x.shape
-    K, _ = mu.shape
+    N, H, W = x.shape  # Get the dimensions of the data tensor
+    K, D = mu.shape

    log_p = torch.empty(N, K, dtype=x.dtype, device=x.device)
    for k in range(K):
        # Create a covariance matrix for each component
        cov_matrix = torch.diag(sigma_sq[k])
-        mvn = MultivariateNormal(mu[k], cov_matrix)
-        log_p[:, k] = mvn.log_prob(x)
+        # Calculate the log PDF for each data point
+        for n in range(N):
+            data_point = x[n].view(-1)  # Flatten the 2D slice to a 1D vector
+            mvn = MultivariateNormal(mu[k], cov_matrix)
+            log_p[n, k] = mvn.log_prob(data_point)

    return log_p
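
Because the covariance here is diagonal, the same N x K log-PDF matrix can be computed without either Python loop by using independent Normals and broadcasting; a sketch of that alternative, not part of this commit:

import torch
from torch.distributions import Normal

def multivariate_normal_log_pdf_diag(x, mu, sigma_sq):
    # x: (N, H, W); mu, sigma_sq: (K, D) with D = H * W
    x_flat = x.view(x.shape[0], 1, -1)         # (N, 1, D)
    dist = Normal(mu, sigma_sq.sqrt())         # batch shape (K, D)
    return dist.log_prob(x_flat).sum(dim=-1)   # sum over D -> (N, K)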

@@ -550,11 +490,6 @@ def visualize_and_analyze_data(data, original_data, segments, labels, data_type,
    except Exception as e:
        handle_error(f"Visualizing trends for {data_type} data", e)

-    try:
-        visualize_correlation_matrix(data, segments, subject_mode=False, num_subjects_to_visualize=None, batch_size=32, method='pearson', save_path=saving_path)
-    except Exception as e:
-        handle_error(f"Visualizing correlation matrix for {data_type} data", e)
-
    try:
        pca_reduced_data, pca_labels, tsne_reduced_data, tsne_labels = perform_dimensionality_reduction(data, data_type, saving_path)
    except Exception as e:
@@ -681,7 +616,7 @@ def main(case_to_run):
        unlabeled_original_data, unlabeled_data, unlabeled_segments, unlabeled_segments_labels, unlabeled_labels = load_data(data_path, labels_path, dataset_size=10, train=True, data_format=data_format, return_all="unlabeled")
        results['unlabeled'] = process_unlabeled_data(unlabeled_data, unlabeled_original_data, unlabeled_segments, unlabeled_labels, saving_path, data_format)
    elif case_to_run == "all_data":
-        all_original_data, all_data, all_segment_names, segment_labels, all_labels = load_data(data_path, labels_path, dataset_size=10, train=True, data_format=data_format, return_all=True)
+        all_original_data, all_data, all_segment_names, segment_labels, all_labels = load_data(data_path, labels_path, dataset_size=1, train=True, data_format=data_format, return_all=True)
        results['all_data'] = process_all_data(all_data, all_original_data, all_segment_names, all_labels, saving_path, data_format)
    else:
        log_progress("Invalid case specified. Please use 'labeled', 'unlabeled', or 'all_data'.")