Initial code upload

sfp-lab · Feb 24, 2021 · 7dcc13d · 7dcc13d
commit 7dcc13d
Show file tree

Hide file tree

Showing 14 changed files with 458,347 additions and 0 deletions.
diff --git a/analysis_notebook.Rmd b/analysis_notebook.Rmd
diff --git a/analyze_our_methylation_data.R b/analyze_our_methylation_data.R
diff --git a/atac_seq_analysis.R b/atac_seq_analysis.R
diff --git a/common_functions.py b/common_functions.py
diff --git a/erosion_cluster.py b/erosion_cluster.py
@@ -0,0 +1,41 @@
+import pandas as pd
+from sklearn.cluster import KMeans
+import pickle
+
+# input_filename="E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz"
+# cluster_distance_output="D:/pinterlab/x_erosion/first_eight_erosion_cluster_distances_df.csv"
+# cluster_df_output="D:/pinterlab/x_erosion/first_eight_erosion_cluster_df.csv"
+
+# input_filename="E:/pinterlab/x_erosion_data/second_eight_samples_beta_matrix.csv.gz"
+# cluster_distance_output="D:/pinterlab/x_erosion/second_eight_erosion_cluster_distances_df.csv"
+# cluster_df_output="D:/pinterlab/x_erosion/second_eight_erosion_cluster_df.csv"
+
+input_filename="E:/pinterlab/x_erosion_data/third_eight_samples_beta_matrix.csv.gz"
+cluster_distance_output="D:/pinterlab/x_erosion/third_eight_erosion_cluster_distances_df.csv"
+cluster_df_output="D:/pinterlab/x_erosion/third_eight_erosion_cluster_df.csv"
+
+
+methylation_df = pd.read_csv(input_filename, index_col=[0])
+
+[kmeans_model, cluster_map_dict, probes_to_keep, imp] = pickle.load(open("D:/pinterlab/x_erosion/erosion_kmeans_model.pkl", "rb"))
+
+subset_meth_df = methylation_df.loc[probes_to_keep,:]
+
+imputed_meth_df = pd.DataFrame(imp.transform(subset_meth_df.T), index=list(subset_meth_df.T.index), columns=list(subset_meth_df.T.columns))
+
+predicted_clusters = kmeans_model.predict(imputed_meth_df)
+
+cluster_distances = kmeans_model.transform(imputed_meth_df)
+corrected_cluster_distances = [x[[int(i) for i in cluster_map_dict.keys()]].tolist() for x in cluster_distances]
+# print(pd.DataFrame(corrected_cluster_distances))
+cluster_dist_df = pd.DataFrame(corrected_cluster_distances, index=list(subset_meth_df.T.index), columns=[0,1,2,3,4,5])
+cluster_dist_df.to_csv(cluster_distance_output)
+
+fixed_clusters = [cluster_map_dict[x] for x in predicted_clusters]
+
+cluster_df = pd.DataFrame([list(subset_meth_df.T.index), fixed_clusters]).T
+cluster_df.columns = ['sample_name', 'cluster']
+
+cluster_df.to_csv(cluster_df_output, index=False)
+
+# imputed_meth_df.T.to_csv("imputed_data.csv.gz")
diff --git a/erosion_cluster_dist.py b/erosion_cluster_dist.py
@@ -0,0 +1,37 @@
+import pandas as pd
+from sklearn.cluster import KMeans
+import pickle
+
+# input_filename="E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz"
+# cluster_distance_output="D:/pinterlab/x_erosion/first_eight_erosion_cluster_distances_df.csv"
+# cluster_df_output="D:/pinterlab/x_erosion/first_eight_erosion_cluster_df.csv"
+
+input_filename="D:/pinterlab/x_erosion/female_df_no_h9_chrX.csv.gz"
+cluster_distance_output="D:/pinterlab/x_erosion/female_df_no_h9_chrX_erosion_cluster_distances_df.csv"
+# cluster_df_output="D:/pinterlab/x_erosion/second_eight_erosion_cluster_df.csv"
+
+
+methylation_df = pd.read_csv(input_filename, index_col=[0])
+
+[kmeans_model, cluster_map_dict, probes_to_keep, imp] = pickle.load(open("D:/pinterlab/x_erosion/erosion_kmeans_model.pkl", "rb"))
+
+subset_meth_df = methylation_df.loc[probes_to_keep,:]
+
+imputed_meth_df = pd.DataFrame(imp.transform(subset_meth_df.T), index=list(subset_meth_df.T.index), columns=list(subset_meth_df.T.columns))
+
+predicted_clusters = kmeans_model.predict(imputed_meth_df)
+
+cluster_distances = kmeans_model.transform(imputed_meth_df)
+corrected_cluster_distances = [x[[int(i) for i in cluster_map_dict.keys()]].tolist() for x in cluster_distances]
+# print(pd.DataFrame(corrected_cluster_distances))
+cluster_dist_df = pd.DataFrame(corrected_cluster_distances, index=list(subset_meth_df.T.index), columns=[0,1,2,3,4,5])
+cluster_dist_df.to_csv(cluster_distance_output)
+
+fixed_clusters = [cluster_map_dict[x] for x in predicted_clusters]
+
+cluster_df = pd.DataFrame([list(subset_meth_df.T.index), fixed_clusters]).T
+cluster_df.columns = ['sample_name', 'cluster']
+
+# cluster_df.to_csv(cluster_df_output, index=False)
+
+# imputed_meth_df.T.to_csv("imputed_data.csv.gz")
diff --git a/erosion_kmeans_model.pkl b/erosion_kmeans_model.pkl