Skip to content

Commit

Permalink
Initial code upload
Browse files Browse the repository at this point in the history
  • Loading branch information
prb11009 committed Feb 24, 2021
0 parents commit 7dcc13d
Show file tree
Hide file tree
Showing 14 changed files with 458,347 additions and 0 deletions.
8,039 changes: 8,039 additions & 0 deletions analysis_notebook.Rmd

Large diffs are not rendered by default.

1,295 changes: 1,295 additions & 0 deletions analyze_our_methylation_data.R

Large diffs are not rendered by default.

2,175 changes: 2,175 additions & 0 deletions atac_seq_analysis.R

Large diffs are not rendered by default.

742 changes: 742 additions & 0 deletions common_functions.py

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions erosion_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd
from sklearn.cluster import KMeans
import pickle

# input_filename="E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz"
# cluster_distance_output="D:/pinterlab/x_erosion/first_eight_erosion_cluster_distances_df.csv"
# cluster_df_output="D:/pinterlab/x_erosion/first_eight_erosion_cluster_df.csv"

# input_filename="E:/pinterlab/x_erosion_data/second_eight_samples_beta_matrix.csv.gz"
# cluster_distance_output="D:/pinterlab/x_erosion/second_eight_erosion_cluster_distances_df.csv"
# cluster_df_output="D:/pinterlab/x_erosion/second_eight_erosion_cluster_df.csv"

# Paths for the batch being scored. To process a different batch, swap in one
# of the commented-out path sets above.
input_filename = "E:/pinterlab/x_erosion_data/third_eight_samples_beta_matrix.csv.gz"
cluster_distance_output = "D:/pinterlab/x_erosion/third_eight_erosion_cluster_distances_df.csv"
cluster_df_output = "D:/pinterlab/x_erosion/third_eight_erosion_cluster_df.csv"
model_filename = "D:/pinterlab/x_erosion/erosion_kmeans_model.pkl"

# The first CSV column becomes the row index (probe ids); the remaining
# columns are samples.
methylation_df = pd.read_csv(input_filename, index_col=[0])

# SECURITY NOTE: pickle.load can execute arbitrary code, so only load a model
# file that comes from a trusted source.
# The pickle holds four objects: the clustering model, a mapping from the
# model's raw cluster labels to the reported labels, the probes the model
# expects, and the imputer.
# NOTE(review): presumably these are fitted scikit-learn objects, which is why
# the file imports KMeans without using the name directly -- confirm.
with open(model_filename, "rb") as model_file:
    kmeans_model, cluster_map_dict, probes_to_keep, imp = pickle.load(model_file)

# Keep only the probes listed in the pickle, in the pickle's order.
subset_meth_df = methylation_df.loc[probes_to_keep, :]
sample_names = list(subset_meth_df.columns)

# Transpose so each sample is a row and each probe is a column before the
# imputer's transform is applied.
imputed_meth_df = pd.DataFrame(
    imp.transform(subset_meth_df.T),
    index=sample_names,
    columns=list(subset_meth_df.index),
)

predicted_clusters = kmeans_model.predict(imputed_meth_df)

# One row per sample, one column per cluster of the model.
cluster_distances = kmeans_model.transform(imputed_meth_df)

# Reorder the distance columns to follow the key order of cluster_map_dict,
# then label them 0..k-1 (k is the number of mapped clusters).
cluster_order = [int(raw_label) for raw_label in cluster_map_dict]
corrected_cluster_distances = cluster_distances[:, cluster_order].tolist()
cluster_dist_df = pd.DataFrame(
    corrected_cluster_distances,
    index=sample_names,
    columns=list(range(len(cluster_order))),
)
cluster_dist_df.to_csv(cluster_distance_output)

# Translate the model's raw cluster labels into the reported labels.
fixed_clusters = [cluster_map_dict[raw_label] for raw_label in predicted_clusters]

cluster_df = pd.DataFrame({'sample_name': sample_names, 'cluster': fixed_clusters})

cluster_df.to_csv(cluster_df_output, index=False)

# imputed_meth_df.T.to_csv("imputed_data.csv.gz")
37 changes: 37 additions & 0 deletions erosion_cluster_dist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd
from sklearn.cluster import KMeans
import pickle

# input_filename="E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz"
# cluster_distance_output="D:/pinterlab/x_erosion/first_eight_erosion_cluster_distances_df.csv"
# cluster_df_output="D:/pinterlab/x_erosion/first_eight_erosion_cluster_df.csv"

input_filename = "D:/pinterlab/x_erosion/female_df_no_h9_chrX.csv.gz"
cluster_distance_output = "D:/pinterlab/x_erosion/female_df_no_h9_chrX_erosion_cluster_distances_df.csv"
# cluster_df_output="D:/pinterlab/x_erosion/second_eight_erosion_cluster_df.csv"
model_filename = "D:/pinterlab/x_erosion/erosion_kmeans_model.pkl"

# The first CSV column becomes the row index (probe ids); the remaining
# columns are samples.
methylation_df = pd.read_csv(input_filename, index_col=[0])

# SECURITY NOTE: pickle.load can execute arbitrary code, so only load a model
# file that comes from a trusted source.
# The pickle holds four objects: the clustering model, a mapping from the
# model's raw cluster labels to the reported labels, the probes the model
# expects, and the imputer.
# NOTE(review): presumably these are fitted scikit-learn objects, which is why
# the file imports KMeans without using the name directly -- confirm.
with open(model_filename, "rb") as model_file:
    kmeans_model, cluster_map_dict, probes_to_keep, imp = pickle.load(model_file)

# Keep only the probes listed in the pickle, in the pickle's order.
subset_meth_df = methylation_df.loc[probes_to_keep, :]
sample_names = list(subset_meth_df.columns)

# Transpose so each sample is a row and each probe is a column before the
# imputer's transform is applied.
imputed_meth_df = pd.DataFrame(
    imp.transform(subset_meth_df.T),
    index=sample_names,
    columns=list(subset_meth_df.index),
)

predicted_clusters = kmeans_model.predict(imputed_meth_df)

# One row per sample, one column per cluster of the model.
cluster_distances = kmeans_model.transform(imputed_meth_df)

# Reorder the distance columns to follow the key order of cluster_map_dict,
# then label them 0..k-1 (k is the number of mapped clusters).
cluster_order = [int(raw_label) for raw_label in cluster_map_dict]
corrected_cluster_distances = cluster_distances[:, cluster_order].tolist()
cluster_dist_df = pd.DataFrame(
    corrected_cluster_distances,
    index=sample_names,
    columns=list(range(len(cluster_order))),
)
cluster_dist_df.to_csv(cluster_distance_output)

# The per-sample cluster table is still built so that the commented-out
# writer further down keeps working if it is re-enabled; this script itself
# only writes the distance table.
fixed_clusters = [cluster_map_dict[raw_label] for raw_label in predicted_clusters]

cluster_df = pd.DataFrame({'sample_name': sample_names, 'cluster': fixed_clusters})

# cluster_df.to_csv(cluster_df_output, index=False)

# imputed_meth_df.T.to_csv("imputed_data.csv.gz")
Binary file added erosion_kmeans_model.pkl
Binary file not shown.
Loading

0 comments on commit 7dcc13d

Please sign in to comment.