Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
contiguous_x_erosion/analyze_our_methylation_data.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1498 lines (1296 sloc)
80.6 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(minfi) | |
process_array_as_450k <- function(samples_in_order, array_registry_location, data_directory, basename_id) { | |
array_registry_df <- readxl::read_excel(array_registry_location) | |
batch_samples_df <- array_registry_df %>% | |
filter(`Tube Label` %in% samples_in_order) %>% | |
dplyr::rename(description=`...4`) %>% | |
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], | |
cell_type=str_split(description, " ", simplify = T)[,2], | |
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), | |
Sentrix_Position=sprintf("R0%dC01", 1:8), | |
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), | |
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), | |
Sample_Name=description) | |
RGset <- read.metharray.exp(targets = batch_samples_df) | |
gmset <- preprocessIllumina(RGset) | |
gmset <- mapToGenome(gmset) | |
gmset_450k <- convertArray(gmset, outType="IlluminaHumanMethylation450k") | |
all_beta_mat <- getBeta(gmset_450k) | |
colnames(all_beta_mat) <- batch_samples_df[match(colnames(all_beta_mat), batch_samples_df$basename_nodir),]$Sample_Name | |
return(as.data.frame(all_beta_mat) %>% rownames_to_column("rn")) | |
} | |
# from email from Bo | |
samples_in_order_first_batch <- c("ar 52", "ar 54", "ar 56", | |
"ar 58", "ar 60", "ar 62", | |
"ar 64", "ar 66") | |
first_beta_mat <- process_array_as_450k(samples_in_order_first_batch, | |
"E:/pinterlab/x_erosion_data/Array_registry_update_201013.xlsx", | |
"E:/pinterlab/x_erosion_data/first_eight_samples", | |
"204668830015") | |
write_csv(as.data.frame(all_beta_mat) %>% rownames_to_column("rn"), "E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz") | |
#second batch of array | |
samples_in_order_second_batch <- c("ar 47", "ar 48", "ar72", "ar73", | |
"ar74", "ar75", "ar76", "ar77") | |
second_beta_mat <- process_array_as_450k(samples_in_order_second_batch, | |
"E:/pinterlab/x_erosion_data/Array_registry_update_201116.xlsx", | |
"E:/pinterlab/x_erosion_data/second_eight_samples", | |
"204792210038") | |
write_csv(second_beta_mat, "E:/pinterlab/x_erosion_data/second_eight_samples_beta_matrix.csv.gz") | |
#third batch of array | |
samples_in_order_third_batch <- c("ar86", "ar87", "ar88", | |
"ar89", "ar90", "ar91", | |
"ar92", "ar93") | |
third_beta_mat <- process_array_as_450k(samples_in_order_third_batch, | |
"E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx", | |
"E:/pinterlab/x_erosion_data/third_eight_samples", | |
"205003700119") | |
write_csv(third_beta_mat, "E:/pinterlab/x_erosion_data/third_eight_samples_beta_matrix.csv.gz") | |
first_erosion_clusters_df <- read_csv("first_eight_erosion_cluster_df.csv") | |
second_erosion_clusters_df <- read_csv("second_eight_erosion_cluster_df.csv") | |
third_erosion_clusters_df <- read_csv("third_eight_erosion_cluster_df.csv") | |
combined_table_df <- rbind(first_erosion_clusters_df, second_erosion_clusters_df, third_erosion_clusters_df) %>% | |
mutate(cell_line=case_when(grepl("C19XX", sample_name) ~ "C19", | |
grepl("C23XX", sample_name) ~ "C23", | |
grepl("C6XX", sample_name) ~ "C6"), | |
passage=as.numeric(stringr::str_split(sample_name, " p", simplify=T)[,2])) %>% | |
dplyr::arrange(cell_line, passage) | |
# plot in heatmap | |
darcy_erosion_clusters_df <- read_csv("E:/pinterlab/darcy_data_xx_xo/methylation_data/erosion_cluster_df.csv") %>% | |
dplyr::filter(sample_name %in% c("ar34", "ar37", "ar40", "ar41", "ar42")) %>% | |
dplyr::mutate(sample_name=case_when(sample_name == "ar34" ~ "ar34_C6xx_p7", | |
sample_name == "ar37" ~ "ar37_C11XX_p7", | |
sample_name == "ar40" ~ "ar40_C13XX_p10", | |
sample_name == "ar41" ~ "ar41_C19XX_p9", | |
sample_name == "ar42" ~ "ar42_C23XX_p10")) | |
darcy_beta_mat <- read_csv("E:/pinterlab/darcy_data_xx_xo/methylation_data/beta_matrix.csv.gz") %>% | |
dplyr::select(rn, | |
ar34_C6xx_p7=ar34, | |
ar37_C11XX_p7=ar37, | |
ar40_C13XX_p10=ar40, | |
ar41_C19XX_p9=ar41, | |
ar42_C23XX_p10=ar42) | |
first_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz") | |
second_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/second_eight_samples_beta_matrix.csv.gz") | |
third_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/third_eight_samples_beta_matrix.csv.gz") | |
female_df_no_h9_chrX <- read_csv("D:/pinterlab/x_erosion/female_df_no_h9_chrX.csv.gz") | |
all_dmp_df <- read_csv('D:/pinterlab/x_erosion/all_dmp_df.csv.gz') | |
# do this based on genes and when the promoters change their methylation. | |
x_annotation_df <- read_csv("D:/pinterlab/x_erosion/x_annotation.csv.gz") | |
statistics <- read_csv('D:/pinterlab/x_erosion/variance_statistics_noh9_nooutliers.csv.gz') | |
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] | |
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname) | |
heatmap_df <- inner_join(female_df_no_h9_chrX %>% filter(rn %in% x_annotation_df$rowname), | |
dplyr::select(all_dmp_df, rowname, key_transition, passes_threshold, delta_increasing, hg38_refgene_group), by=c("rn"="rowname")) | |
heatmap_df <- arrange(heatmap_df, desc(passes_threshold), desc(delta_increasing), desc(key_transition)) %>% | |
left_join(darcy_beta_mat, by=c("rn")) %>% | |
left_join(first_eight_beta_mat, by=c("rn")) %>% | |
left_join(second_eight_beta_mat, by=c("rn")) %>% | |
left_join(third_eight_beta_mat, by=c("rn")) | |
ordered_sites <- heatmap_df$rn | |
heatmap_df[heatmap_df$passes_threshold == FALSE, "cluster"] <- 10 | |
heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == T), "cluster"] <- heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == T), "key_transition"] - 1 | |
heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == F), "cluster"] <- (heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == F), "key_transition"] - 1) + 5 | |
heatmap_df <- dplyr::select(heatmap_df, -key_transition, -passes_threshold, -delta_increasing, -hg38_refgene_group) | |
sample_clusters_df <- read_csv("D:/pinterlab/x_erosion/saved_sample_clusters.csv", | |
col_names = c('name','cluster')) %>% | |
bind_rows(darcy_erosion_clusters_df %>% dplyr::rename(name=sample_name)) %>% | |
bind_rows(first_erosion_clusters_df %>% dplyr::rename(name=sample_name)) %>% | |
bind_rows(second_erosion_clusters_df %>% dplyr::rename(name=sample_name)) %>% | |
bind_rows(third_erosion_clusters_df %>% dplyr::rename(name=sample_name)) %>% | |
arrange(cluster) | |
ordered_sample_names <- sample_clusters_df$name | |
heatmap_df <- reshape2::melt(heatmap_df, id.vars=c("rn", "cluster")) %>% dplyr::rename(site=rn, sample_name=variable) | |
for (sample_row in 1:nrow(sample_clusters_df)) { | |
cluster_name <- unlist(sample_clusters_df[sample_row, "name"]) | |
cluster <- unlist(sample_clusters_df[sample_row, "cluster"]) | |
heatmap_df[heatmap_df$sample_name == cluster_name, "sample_cluster"] <- cluster | |
} | |
sigvar_heatmap_plot <- ggplot(heatmap_df %>% filter(site %in% sig_x_sites), aes(x=factor(sample_name, levels = ordered_sample_names), | |
y=factor(site, levels = ordered_sites), fill=value)) + | |
geom_tile() + | |
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){ | |
return(list( | |
"0"="1↑", | |
"1"="2↑", | |
"2"="3↑", | |
"3"="4↑", | |
"4"="5↑", | |
"5"="1↓", | |
"6"="2↓", | |
"7"="3↓", | |
"8"="4↓", | |
"9"="5↓", | |
"10"="~" | |
)[c]) | |
})) + | |
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) + | |
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\n??=0", "50% methylation\n??=0.5", "methylated\n??=1"))+ | |
labs(x="Female Samples", | |
y="CpG Sites with Greater Variance in Female Samples", | |
fill="DNA Methylation") + | |
theme(axis.text.y=element_blank(), | |
axis.text.x = element_text(angle = 90, hjust=1, size=5), | |
axis.ticks.y=element_blank(), | |
legend.position="top", | |
legend.justification = "center", | |
legend.title = element_text(size = 6, vjust = .75), | |
legend.text = element_text(size = 6), | |
# plot.background = element_rect(fill = "lightgrey"), | |
# panel.border = element_rect(colour = "black", fill=NA, size=1), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0, "mm"), | |
strip.background.x = element_rect(color="black"), | |
strip.background.y = element_rect(fill = NA, colour = "black", size = .7), | |
strip.text = element_text(size=6), | |
strip.text.y = element_text(angle=180), | |
axis.title = element_text(size=8)) | |
# only our data | |
sigvar_heatmap_plot_our_data_only <- ggplot(heatmap_df %>% filter(site %in% sig_x_sites & grepl("C19XX|C23XX", sample_name)), aes(x=factor(sample_name, levels = ordered_sample_names), | |
y=factor(site, levels = ordered_sites), fill=value)) + | |
geom_tile() + | |
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){ | |
return(list( | |
"0"="1↑", | |
"1"="2↑", | |
"2"="3↑", | |
"3"="4↑", | |
"4"="5↑", | |
"5"="1↓", | |
"6"="2↓", | |
"7"="3↓", | |
"8"="4↓", | |
"9"="5↓", | |
"10"="~" | |
)[c]) | |
})) + | |
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) + | |
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+ | |
labs(x="Female Samples", | |
y="CpG Sites with Greater Variance in Female Samples", | |
fill="DNA Methylation") + | |
theme(axis.text.y=element_blank(), | |
axis.text.x = element_text(angle = 90, hjust=1, size=5), | |
axis.ticks.y=element_blank(), | |
legend.position="top", | |
legend.justification = "center", | |
legend.title = element_text(size = 6, vjust = .75), | |
legend.text = element_text(size = 6), | |
# plot.background = element_rect(fill = "lightgrey"), | |
# panel.border = element_rect(colour = "black", fill=NA, size=1), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0, "mm"), | |
strip.background.x = element_rect(color="black"), | |
strip.background.y = element_rect(fill = NA, colour = "black", size = .7), | |
strip.text = element_text(size=6), | |
strip.text.y = element_text(angle=180), | |
axis.title = element_text(size=8)) | |
ggsave("figs/our_data/our_data_only_squished_heatmap.png", sigvar_heatmap_plot_our_data_only, height=245, width=30, units = "mm") | |
library(radiant.data) | |
#calculate_continuous_cluster_assignment | |
first_cluster_dist_df <- read_csv("D:/pinterlab/x_erosion/first_eight_erosion_cluster_distances_df.csv") %>% | |
dplyr::rename(sample_name=`X1`) %>% | |
left_join(first_erosion_clusters_df, by="sample_name") %>% | |
dplyr::mutate(x0=case_when(cluster == 0 ~ `0`, | |
cluster == 1 ~ `1`, | |
cluster == 2 ~ `2`, | |
cluster == 3 ~ `3`, | |
cluster == 4 ~ `4`, | |
cluster == 5 ~ `5`), | |
x1=case_when(cluster == 0 ~ `1`, | |
cluster == 1 ~ pmin(`2`, `0`), | |
cluster == 2 ~ pmin(`3`, `1`), | |
cluster == 3 ~ pmin(`4`, `2`), | |
cluster == 4 ~ pmin(`5`, `3`), | |
cluster == 5 ~ `4`), | |
direction=case_when(cluster == 0 ~ 1, | |
cluster == 1 ~ c(1, -1)[which.pmin(`2`, `0`)], | |
cluster == 2 ~ c(1, -1)[which.pmin(`3`, `1`)], | |
cluster == 3 ~ c(1, -1)[which.pmin(`4`, `2`)], | |
cluster == 4 ~ c(1, -1)[which.pmin(`5`, `3`)], | |
cluster == 5 ~ `4`), | |
delta=-log((x1/x0)-.8335, base=45)+.03, | |
continuous_cluster=cluster+(direction * delta)) | |
second_cluster_dist_df <- read_csv("D:/pinterlab/x_erosion/second_eight_erosion_cluster_distances_df.csv") %>% | |
dplyr::rename(sample_name=`X1`) %>% | |
left_join(second_erosion_clusters_df, by="sample_name") %>% | |
dplyr::mutate(x0=case_when(cluster == 0 ~ `0`, | |
cluster == 1 ~ `1`, | |
cluster == 2 ~ `2`, | |
cluster == 3 ~ `3`, | |
cluster == 4 ~ `4`, | |
cluster == 5 ~ `5`), | |
x1=case_when(cluster == 0 ~ `1`, | |
cluster == 1 ~ pmin(`2`, `0`), | |
cluster == 2 ~ pmin(`3`, `1`), | |
cluster == 3 ~ pmin(`4`, `2`), | |
cluster == 4 ~ pmin(`5`, `3`), | |
cluster == 5 ~ `4`), | |
direction=case_when(cluster == 0 ~ 1, | |
cluster == 1 ~ c(1, -1)[which.pmin(`2`, `0`)], | |
cluster == 2 ~ c(1, -1)[which.pmin(`3`, `1`)], | |
cluster == 3 ~ c(1, -1)[which.pmin(`4`, `2`)], | |
cluster == 4 ~ c(1, -1)[which.pmin(`5`, `3`)], | |
cluster == 5 ~ `4`), | |
delta=-log((x1/x0)-.8335, base=45)+.03, | |
continuous_cluster=cluster+(direction * delta)) | |
third_cluster_dist_df <- read_csv("D:/pinterlab/x_erosion/third_eight_erosion_cluster_distances_df.csv") %>% | |
dplyr::rename(sample_name=`X1`) %>% | |
left_join(third_erosion_clusters_df, by="sample_name") %>% | |
dplyr::mutate(x0=case_when(cluster == 0 ~ `0`, | |
cluster == 1 ~ `1`, | |
cluster == 2 ~ `2`, | |
cluster == 3 ~ `3`, | |
cluster == 4 ~ `4`, | |
cluster == 5 ~ `5`), | |
x1=case_when(cluster == 0 ~ `1`, | |
cluster == 1 ~ pmin(`2`, `0`), | |
cluster == 2 ~ pmin(`3`, `1`), | |
cluster == 3 ~ pmin(`4`, `2`), | |
cluster == 4 ~ pmin(`5`, `3`), | |
cluster == 5 ~ `4`), | |
direction=case_when(cluster == 0 ~ 1, | |
cluster == 1 ~ c(1, -1)[which.pmin(`2`, `0`)], | |
cluster == 2 ~ c(1, -1)[which.pmin(`3`, `1`)], | |
cluster == 3 ~ c(1, -1)[which.pmin(`4`, `2`)], | |
cluster == 4 ~ c(1, -1)[which.pmin(`5`, `3`)], | |
cluster == 5 ~ `4`), | |
delta=-log((x1/x0)-.8335, base=45)+.03, | |
continuous_cluster=cluster+(direction * delta)) | |
continuous_cluster_df <- rbind(first_cluster_dist_df, second_cluster_dist_df, third_cluster_dist_df) %>% | |
dplyr::mutate(cell_line=str_split(sample_name, " ", simplify = T)[,1], | |
passage=as.numeric(str_replace(str_split(sample_name, " ", simplify = T)[,4], "p", ""))) %>% | |
arrange(cell_line, passage) %>% | |
dplyr::select(sample_name, cell_line, passage, cluster, continuous_cluster) | |
write_csv(continuous_cluster_df, "continuous_cluster_df.csv") | |
# y=-log_45(x-.8335)+.03 | |
#do the same on the old darcy data | |
old_erosion_clusters_df <- read_csv("E:/pinterlab/darcy_data_xx_xo/methylation_data/erosion_cluster_df.csv") | |
old_cluster_dist_df <- read_csv("D:/pinterlab/x_erosion/old_erosion_cluster_distances_df.csv") %>% | |
dplyr::rename(sample_name=`X1`) %>% | |
filter(sample_name %in% c("ar34", "ar37", "ar40", "ar38", "ar39", "ar42", "ar41")) %>% | |
left_join(old_erosion_clusters_df, by="sample_name") %>% | |
dplyr::mutate(x0=case_when(cluster == 0 ~ `0`, | |
cluster == 1 ~ `1`, | |
cluster == 2 ~ `2`, | |
cluster == 3 ~ `3`, | |
cluster == 4 ~ `4`, | |
cluster == 5 ~ `5`), | |
x1=case_when(cluster == 0 ~ `1`, | |
cluster == 1 ~ pmin(`2`, `0`), | |
cluster == 2 ~ pmin(`3`, `1`), | |
cluster == 3 ~ pmin(`4`, `2`), | |
cluster == 4 ~ pmin(`5`, `3`), | |
cluster == 5 ~ `4`), | |
direction=case_when(cluster == 0 ~ 1, | |
cluster == 1 ~ c(1, -1)[which.pmin(`2`, `0`)], | |
cluster == 2 ~ c(1, -1)[which.pmin(`3`, `1`)], | |
cluster == 3 ~ c(1, -1)[which.pmin(`4`, `2`)], | |
cluster == 4 ~ c(1, -1)[which.pmin(`5`, `3`)], | |
cluster == 5 ~ `4`), | |
delta=-log((x1/x0)-.8335, base=45)+.03, | |
delta=case_when(between(delta,0,.5) ~ delta, | |
delta < 0 ~ 0, | |
delta > .5 ~ .5), | |
continuous_cluster=cluster+(direction * delta), | |
description=case_when(sample_name == "ar34" ~ "C6 XX p7", | |
sample_name == "ar37" ~ "C11 XX p7", | |
sample_name == "ar40" ~ "C13 XX p10", | |
sample_name == "ar38" ~ "XXY p21", | |
sample_name == "ar39" ~ "H9 p40", | |
sample_name == "ar41" ~ "C19 p9", | |
sample_name == "ar42" ~ "C23 XX p10")) | |
analysis_sample_distances <- read_csv("female_df_no_h9_chrX_erosion_cluster_distances_df.csv") | |
saved_sample_clusters <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
analysis_sample_distances <- analysis_sample_distances %>% | |
dplyr::rename(sample_name=`X1`) %>% | |
left_join(saved_sample_clusters, by="sample_name") %>% | |
dplyr::mutate(cluster_dist=dplyr::case_when(cluster==0 ~ `0`, | |
cluster==1 ~ `1`, | |
cluster==2 ~ `2`, | |
cluster==3 ~ `3`, | |
cluster==4 ~ `4`, | |
cluster==5 ~ `5`), | |
data_type="analysis_samples") | |
our_sample_distances <- read_csv("D:/pinterlab/x_erosion/first_eight_erosion_cluster_distances_df.csv") %>% | |
dplyr::rename(sample_name=`X1`) %>% | |
left_join(read_csv("first_eight_erosion_cluster_df.csv"), by="sample_name") %>% | |
rbind(read_csv("D:/pinterlab/x_erosion/second_eight_erosion_cluster_distances_df.csv") %>% | |
dplyr::rename(sample_name=`X1`) %>% | |
left_join(read_csv("second_eight_erosion_cluster_df.csv"), by="sample_name")) %>% | |
dplyr::mutate(cluster_dist=dplyr::case_when(cluster==0 ~ `0`, | |
cluster==1 ~ `1`, | |
cluster==2 ~ `2`, | |
cluster==3 ~ `3`, | |
cluster==4 ~ `4`, | |
cluster==5 ~ `5`), | |
data_type="our_samples") | |
ggplot(analysis_sample_distances, aes(x=cluster_dist, color=data_type)) + | |
geom_density() + | |
geom_vline(data=our_sample_distances, mapping=aes(xintercept=cluster_dist, color=data_type)) + | |
facet_grid(rows = vars(cluster)) + | |
labs(title="Distribution of cluster distances for samples") + | |
theme_bw() + | |
scale_color_nejm() | |
ggsave("figs/our_data/cluster_distance_distribution.png") | |
# add to PCA | |
female_df_no_h9_chrX <- read_csv('female_df_no_h9_chrX.csv.gz') | |
female_df_only_h9_chrX <- read_csv("female_sig_x_sites_df_only_h9.csv.gz") | |
female_beta_df_chrX <- left_join(female_df_no_h9_chrX, female_df_only_h9_chrX, by=c("rn"="rowname")) | |
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv") | |
first_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz") | |
second_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/second_eight_samples_beta_matrix.csv.gz") | |
third_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/third_eight_samples_beta_matrix.csv.gz") | |
our_data_beta_df <- left_join(first_eight_beta_mat, second_eight_beta_mat, by="rn") %>% | |
left_join(third_eight_beta_mat, by="rn") | |
x_annotation_df <- read_csv("D:/pinterlab/x_erosion/x_annotation.csv.gz") | |
statistics <- read_csv('D:/pinterlab/x_erosion/variance_statistics_noh9_nooutliers.csv.gz') | |
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] | |
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname) | |
no_na_sigxsites <- na.omit(female_beta_df_chrX %>% | |
left_join(our_data_beta_df, by="rn") %>% | |
filter(rn %in% sig_x_sites))$rn | |
female_sig_sites_pca <- prcomp(t(na.omit(female_beta_df_chrX %>% filter(rn %in% no_na_sigxsites) %>% select(-rn)))) | |
female_sig_sites_pca_df <- rownames_to_column(as.data.frame(female_sig_sites_pca$x), var = "sample_name") | |
female_sig_sites_pca_df <- left_join(female_sig_sites_pca_df, sample_annotation_df, by = "sample_name") | |
# female_sig_sites_pca_df <- left_join(female_sig_sites_pca_df, female_avg_x_methylation_df, by = "sample_name") | |
female_sig_sites_pca_df <- left_join(female_sig_sites_pca_df, tidy(colMeans(female_beta_df_chrX %>% select(-rn))) %>% rename(sample_name=names, meanXMethylation=x), by = "sample_name") | |
female_no_h9_sig_sites_pov <- sprintf("%2.2f%%", 100* female_sig_sites_pca$sdev^2/sum(female_sig_sites_pca$sdev^2)) | |
female_no_h9_sig_sites_pc1_label <- sprintf("PC1 (%s)", female_no_h9_sig_sites_pov[1]) | |
female_no_h9_sig_sites_pc2_label <- sprintf("PC2 (%s)", female_no_h9_sig_sites_pov[2]) | |
our_data_beta_df_sigxsites <- our_data_beta_df %>% | |
filter(rn %in% no_na_sigxsites) | |
our_data_pca <- predict(female_sig_sites_pca, t(our_data_beta_df_sigxsites %>% dplyr::select(-rn))) | |
clusters_pData <- read.csv("saved_sample_clusters.csv", | |
header = FALSE, | |
row.names = 1, | |
col.names=c('name','cluster')) | |
female_sig_sites_pca_df$python_clusters <- clusters_pData[female_sig_sites_pca_df$sample_name,] | |
color_clusters <- clusters_pData | |
color_clusters$cluster <- as.factor(color_clusters$cluster) | |
continuous_cluster_df <- read_csv("continuous_cluster_df.csv") | |
pca_female_no_h9_by_python_6 <- ggplot(female_sig_sites_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
stat_ellipse(data = female_sig_sites_pca_df %>% filter(cell_line != "WA09"), mapping = aes(x=PC1, y=PC2, fill=as.factor(python_clusters)), alpha=.2, size=.2, geom="polygon") + | |
geom_line(female_sig_sites_pca_df %>% filter(cell_line %in% c("HDF51IPS3", "HDF51IPS1", "HDF51IPS2", "HDF51IPS11", "HDF51IPS12", "HDF51IPS6", "HDF51IPS14", "WA09") & grepl("Nazor_", sample_name)), | |
mapping = aes(x=PC1, y=PC2, group=cell_line), size=.3, alpha=.4) + | |
geom_point(aes(color=as.factor(python_clusters), shape=(cell_line == "WA09")), size=.5) + | |
# geom_text_repel(female_no_h9_sig_sites_pca_df %>% filter(cell_line %in% correctly_ordered_by_sig_names & grepl("Nazor_", sample_name)), mapping = aes(x=PC1, y=PC2, label=passage), size=2, color="grey", segment.size=.15, min.segment.length=.25, segment.color="darkgrey") + | |
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) + | |
theme_bw() + | |
labs(x=female_no_h9_sig_sites_pc1_label, | |
y=female_no_h9_sig_sites_pc2_label, | |
title = "PCA of Female Samples", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_color_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") + | |
scale_fill_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000", guide=F) + | |
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(color=guide_legend(nrow=1)) + | |
theme(legend.key.width = unit(1, units = "mm"), | |
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) + | |
scale_x_continuous(limits=c(-11, 20), expand = c(0,0)) + | |
scale_y_continuous(limits=c(-8,10.5), expand = c(0,0)) | |
ggsave("figs/main/fig1_pca_female_with_h9.pdf", pca_female_no_h9_by_python_6, width = 72, height = 38, units = "mm") | |
our_data_pca_df <- as.data.frame(our_data_pca) %>% | |
rownames_to_column(var="sample_name") %>% | |
left_join(continuous_cluster_df, by="sample_name") | |
library(ggrepel) | |
our_sample_pca <- ggplot(our_data_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(data = female_sig_sites_pca_df, mapping = aes(x=PC1, y=PC2, color=as.factor(python_clusters), shape=(cell_line == "WA09")), alpha=.2, size=.4) + | |
geom_point(aes(color=as.factor(cluster), shape=(cell_line == "WA09")), size=.6) + | |
stat_ellipse(data = female_sig_sites_pca_df %>% filter(cell_line != "WA09"), mapping = aes(x=PC1, y=PC2, fill=as.factor(python_clusters)), alpha=.2, size=.2, geom="polygon") + | |
geom_text_repel(aes(label=passage, color=as.factor(cluster)), size=2, min.segment.length = 0, segment.size = .15) + | |
theme_bw() + | |
labs(x=female_no_h9_sig_sites_pc1_label, | |
y=female_no_h9_sig_sites_pc2_label, | |
title = "PCA of Our Female Samples", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) + | |
scale_color_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") + | |
scale_fill_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000", guide=F) + | |
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(color=guide_legend(nrow=1)) + | |
theme(legend.key.width = unit(1, units = "mm"), | |
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) + | |
scale_x_continuous(limits=c(-11, 20)) + | |
scale_y_continuous(limits=c(-8,10.5)) | |
ggsave("figs/our_data/female-pca-by-cluster-our-data.pdf",plot=our_sample_pca, height = 84, width = 84, units = 'mm') | |
c19_sample_pca <- ggplot(our_data_pca_df %>% filter(cell_line=="C19XX"), mapping = aes(x=PC1, y=PC2)) + | |
stat_ellipse(data = female_sig_sites_pca_df %>% filter(cell_line != "WA09"), mapping = aes(x=PC1, y=PC2, fill=as.factor(python_clusters)), alpha=.2, size=.2, geom="polygon") + | |
geom_point(aes(color=as.factor(cluster), shape=(cell_line == "WA09")), size=.6) + | |
geom_text_repel(aes(label=passage, color=as.factor(cluster)), size=2, min.segment.length = 0, segment.size = .15, show.legend = F) + | |
theme_bw() + | |
labs(x=female_no_h9_sig_sites_pc1_label, | |
y=female_no_h9_sig_sites_pc2_label, | |
title = "PCA of C19 Female Samples", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) + | |
scale_color_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") + | |
scale_fill_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000", guide=F) + | |
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(color=guide_legend(nrow=1)) + | |
theme(legend.key.width = unit(1, units = "mm"), | |
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) + | |
scale_x_continuous(limits=c(-11, 20), expand = c(0,0)) + | |
scale_y_continuous(limits=c(-8,10.5), expand = c(0,0)) | |
ggsave("figs/main/fig1_female-pca-by-cluster-c19only.pdf",plot=c19_sample_pca, height = 38, width = 72, units = 'mm') | |
c23_sample_pca <- ggplot(our_data_pca_df %>% filter(cell_line=="C23XX"), mapping = aes(x=PC1, y=PC2)) + | |
stat_ellipse(data = female_sig_sites_pca_df %>% filter(cell_line != "WA09"), mapping = aes(x=PC1, y=PC2, fill=as.factor(python_clusters)), alpha=.2, size=.2, geom="polygon") + | |
geom_point(aes(color=as.factor(cluster), shape=(cell_line == "WA09")), size=.6) + | |
geom_text_repel(aes(label=passage, color=as.factor(cluster)), size=2, min.segment.length = 0, segment.size = .15, show.legend = F) + | |
theme_bw() + | |
labs(x=female_no_h9_sig_sites_pc1_label, | |
y=female_no_h9_sig_sites_pc2_label, | |
title = "PCA of C23 Female Samples", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) + | |
scale_color_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") + | |
scale_fill_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000", guide=F) + | |
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(color=guide_legend(nrow=1)) + | |
theme(legend.key.width = unit(1, units = "mm"), | |
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) + | |
scale_x_continuous(limits=c(-11, 20), expand = c(0,0)) + | |
scale_y_continuous(limits=c(-8,10.5), expand = c(0,0)) | |
ggsave("figs/main/fig1_female-pca-by-cluster-c23only.pdf",plot=c23_sample_pca, height = 38, width = 72, units = 'mm') | |
# percent completion plots | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
demethylation_dmps <- all_dmp_df %>% | |
filter(delta_increasing == F & passes_threshold == T & hg38_chr == "chrX") | |
changing_x_dmps <- all_dmp_df %>% | |
filter(passes_threshold == T & hg38_chr == "chrX") | |
percent_completion_df <- our_data_beta_df %>% | |
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>% | |
right_join(changing_x_dmps %>% select(rowname, | |
cluster_0_mean, | |
cluster_1_mean, | |
cluster_2_mean, | |
cluster_3_mean, | |
cluster_4_mean, | |
cluster_5_mean, | |
key_transition, | |
passes_threshold, | |
delta_increasing, | |
delta), by=c("rn"="rowname")) %>% | |
mutate(sample_delta=case_when(key_transition==1 ~ beta_value - cluster_0_mean, | |
key_transition==2 ~ beta_value - cluster_1_mean, | |
key_transition==3 ~ beta_value - cluster_2_mean, | |
key_transition==4 ~ beta_value - cluster_3_mean, | |
key_transition==5 ~ beta_value - cluster_4_mean), | |
as_predicted=case_when(delta_increasing ~ sample_delta >= delta*.9, | |
delta_increasing == F ~ sample_delta <= delta*.9)) %>% | |
drop_na() %>% | |
group_by(key_transition, sample_name, delta_increasing) %>% | |
summarise(percent_as_predicted=sum(as_predicted)/n(), as_predicted=sum(as_predicted), total=n()) %>% | |
left_join(continuous_cluster_df, by="sample_name") | |
percent_completion_df_grouped <- our_data_beta_df %>% | |
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>% | |
left_join(continuous_cluster_df, by="sample_name") %>% | |
filter(sample_name != "C6XX") %>% | |
group_by(cluster, rn) %>% | |
summarise(our_mean_beta=mean(beta_value)) %>% | |
mutate(cluster_name=sprintf("our_cluster_%d_mean",cluster)) %>% | |
reshape2::dcast(rn~cluster_name, value.var="our_mean_beta") %>% | |
right_join(changing_x_dmps %>% select(rowname, | |
cluster_0_mean, | |
cluster_1_mean, | |
cluster_2_mean, | |
cluster_3_mean, | |
cluster_4_mean, | |
cluster_5_mean, | |
key_transition, | |
passes_threshold, | |
delta_increasing, | |
delta), by=c("rn"="rowname")) %>% | |
filter(passes_threshold==T) %>% | |
mutate(our_delta=case_when(key_transition==1 ~ our_cluster_1_mean-our_cluster_0_mean, | |
key_transition==2 ~ our_cluster_2_mean-our_cluster_1_mean), | |
sign_matches=delta*our_delta > 0, | |
delta_magnitude_matches=case_when(delta_increasing ~ our_delta >= delta*.9, | |
!delta_increasing ~ our_delta <= delta*.9)) %>% | |
drop_na() %>% | |
group_by(key_transition, delta_increasing) %>% | |
summarise(percent_as_predicted_sign=sum(sign_matches)/n(), | |
as_predicted_sign=sum(sign_matches), | |
percent_as_predicted_magnitude=sum(delta_magnitude_matches)/n(), | |
as_predicted_magnitude=sum(delta_magnitude_matches), | |
total=n()) | |
#normalize by cluster 0 samples | |
cluster_0_mean_completion <- percent_completion_df %>% | |
filter(cluster==0) %>% | |
group_by(cell_line, delta_increasing, key_transition) %>% | |
summarise(mean_cluster0_percent=min(percent_as_predicted)) | |
percent_completion_df <- percent_completion_df %>% | |
left_join(cluster_0_mean_completion, by=c("cell_line", "delta_increasing", "key_transition")) %>% | |
mutate(cluster_0_adjusted_percent_as_predicted=percent_as_predicted-mean_cluster0_percent) | |
library(ggsci) | |
percent_demethylated_plot <- ggplot(percent_completion_df %>% filter(cell_line %in% c("C19XX", "C23XX") & delta_increasing == F), aes(x=passage, y=percent_as_predicted, color=as.factor(key_transition))) + | |
geom_line() + | |
geom_point(size=.5) + | |
facet_grid(rows=vars(cell_line)) + | |
scale_color_futurama() + | |
labs(title = "Fraction of Transition Specific DMPs Demethylated", | |
y="Fraction of Transition-Specific DMPs Demethylated", | |
x="Passage Number", | |
color = "Transition") + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/our_data/percent_demethylated_plot.pdf", percent_completion_plot, width = 174/3, height=80, units = "mm") | |
percent_demethylated_plot_cluster0_norm <- ggplot(percent_completion_df %>% filter(cell_line %in% c("C19XX", "C23XX") & delta_increasing == F), aes(x=passage, y=cluster_0_adjusted_percent_as_predicted, color=as.factor(key_transition))) + | |
geom_line() + | |
geom_point(size=.5) + | |
facet_grid(rows=vars(cell_line)) + | |
scale_color_futurama() + | |
labs(title = "Fraction of Transition Specific DMPs Demethylated Adjusted to cluster A", | |
y="Fraction of Transition-Specific DMPs Demethylated", | |
x="Passage Number", | |
color = "Transition") + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/our_data/percent_demethylated_plot_norm_to_clusterA.pdf", percent_demethylated_plot_cluster0_norm, width = 174/3, height=80, units = "mm") | |
percent_methylated_plot <- ggplot(percent_completion_df %>% filter(cell_line %in% c("C19XX", "C23XX") & delta_increasing == T), aes(x=passage, y=percent_as_predicted, color=as.factor(key_transition))) + | |
geom_line() + | |
geom_point(size=.5) + | |
facet_grid(cols=vars(cell_line)) + | |
scale_color_futurama() + | |
labs(title = "Fraction of Transition Specific DMPs Methylated", | |
y="Fraction of Transition-Specific DMPs Methylated", | |
x="Passage Number", | |
color = "Transition") + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm"), | |
) | |
ggsave("figs/our_data/percent_methylated_plot.pdf", percent_completion_plot, width = 174/3, height=80, units = "mm") | |
cluster_0_demethylated_probes_c19 <- (our_data_beta_df %>% | |
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>% | |
right_join(changing_x_dmps %>% select(rowname, | |
cluster_0_mean, | |
cluster_1_mean, | |
cluster_2_mean, | |
cluster_3_mean, | |
cluster_4_mean, | |
cluster_5_mean, | |
key_transition, | |
passes_threshold, | |
delta_increasing, | |
delta), by=c("rn"="rowname")) %>% | |
mutate(sample_delta=case_when(key_transition==1 ~ beta_value - cluster_0_mean, | |
key_transition==2 ~ beta_value - cluster_1_mean, | |
key_transition==3 ~ beta_value - cluster_2_mean, | |
key_transition==4 ~ beta_value - cluster_3_mean, | |
key_transition==5 ~ beta_value - cluster_4_mean), | |
as_predicted=case_when(delta_increasing ~ sample_delta >= delta*.9, | |
delta_increasing == F ~ sample_delta <= delta*.9)) %>% | |
drop_na() %>% | |
left_join(continuous_cluster_df, by="sample_name") %>% | |
filter(cluster == 0 & delta_increasing == F & as_predicted == T & cell_line == "C19XX"))$rn | |
cluster_0_demethylated_probes_c23 <- (our_data_beta_df %>% | |
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>% | |
right_join(changing_x_dmps %>% select(rowname, | |
cluster_0_mean, | |
cluster_1_mean, | |
cluster_2_mean, | |
cluster_3_mean, | |
cluster_4_mean, | |
cluster_5_mean, | |
key_transition, | |
passes_threshold, | |
delta_increasing, | |
delta), by=c("rn"="rowname")) %>% | |
mutate(sample_delta=case_when(key_transition==1 ~ beta_value - cluster_0_mean, | |
key_transition==2 ~ beta_value - cluster_1_mean, | |
key_transition==3 ~ beta_value - cluster_2_mean, | |
key_transition==4 ~ beta_value - cluster_3_mean, | |
key_transition==5 ~ beta_value - cluster_4_mean), | |
as_predicted=case_when(delta_increasing ~ sample_delta >= delta*.9, | |
delta_increasing == F ~ sample_delta <= delta*.9)) %>% | |
drop_na() %>% | |
left_join(continuous_cluster_df, by="sample_name") %>% | |
filter(cluster == 0 & delta_increasing == F & as_predicted == T & cell_line == "C23XX"))$rn | |
percent_completion_df <- our_data_beta_df %>% | |
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>% | |
right_join(changing_x_dmps %>% select(rowname, | |
cluster_0_mean, | |
cluster_1_mean, | |
cluster_2_mean, | |
cluster_3_mean, | |
cluster_4_mean, | |
cluster_5_mean, | |
key_transition, | |
passes_threshold, | |
delta_increasing, | |
delta), by=c("rn"="rowname")) %>% | |
mutate(sample_delta=case_when(key_transition==1 ~ beta_value - cluster_0_mean, | |
key_transition==2 ~ beta_value - cluster_1_mean, | |
key_transition==3 ~ beta_value - cluster_2_mean, | |
key_transition==4 ~ beta_value - cluster_3_mean, | |
key_transition==5 ~ beta_value - cluster_4_mean), | |
as_predicted=case_when(delta_increasing == T ~ sample_delta >= delta*.9, | |
delta_increasing == F ~ sample_delta <= delta*.9)) %>% | |
drop_na() %>% | |
left_join(continuous_cluster_df, by="sample_name") %>% | |
filter((cell_line == "C19XX" & !(rn %in% cluster_0_demethylated_probes_c19)) | (cell_line == "C23XX" & !(rn %in% cluster_0_demethylated_probes_c23))) %>% | |
group_by(key_transition, sample_name, delta_increasing, cell_line, cluster, passage) %>% | |
summarise(percent_as_predicted=sum(as_predicted)/n()) | |
percent_demethylated_plot_noclusterAprobes <- ggplot(percent_completion_df %>% filter(cell_line %in% c("C19XX", "C23XX") & delta_increasing == F), aes(x=passage, y=percent_as_predicted, color=as.factor(key_transition))) + | |
geom_line() + | |
geom_point(size=.5) + | |
facet_grid(cols=vars(cell_line)) + | |
scale_color_futurama() + | |
labs(title = "Fraction of Transition-Specific DMPs Demethylated", | |
y="Fraction of DMPs Demethylated", | |
x="Passage Number", | |
color = "Transition") + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm"), | |
strip.text.x = element_text(size=6, margin=margin(t=1, b=1)), | |
strip.background.x = element_rect()) | |
ggsave("figs/main/fig1_percent_demethylated_plot_noclusterAdemethylatedprobes.pdf", percent_demethylated_plot_noclusterAprobes, width = 72, height=42, units = "mm") | |
# run differential methylation analysis | |
array_registry_df <- readxl::read_excel("E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx") | |
samples_in_order <- c("ar 52", "ar 54", "ar 56", | |
"ar 58", "ar 60", "ar 62", | |
"ar 64", "ar 66") | |
data_directory <- "E:/pinterlab/x_erosion_data/first_eight_samples" | |
basename_id <- "204668830015" | |
first_batch_samples_df <- array_registry_df %>% | |
filter(`Tube Label` %in% samples_in_order) %>% | |
dplyr::rename(description=`...4`) %>% | |
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], | |
cell_type=str_split(description, " ", simplify = T)[,2], | |
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), | |
Sentrix_Position=sprintf("R0%dC01", 1:8), | |
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), | |
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), | |
Sample_Name=description) | |
#second batch of array | |
samples_in_order <- c("ar 47", "ar 48", "ar72", "ar73", | |
"ar74", "ar75", "ar76", "ar77") | |
data_directory <- "E:/pinterlab/x_erosion_data/second_eight_samples" | |
basename_id <- "204792210038" | |
second_batch_samples_df <- array_registry_df %>% | |
filter(`Tube Label` %in% samples_in_order) %>% | |
dplyr::rename(description=`...4`) %>% | |
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], | |
cell_type=str_split(description, " ", simplify = T)[,2], | |
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), | |
Sentrix_Position=sprintf("R0%dC01", 1:8), | |
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), | |
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), | |
Sample_Name=description) | |
#third batch of array | |
samples_in_order <- c("ar86", "ar87", "ar88", | |
"ar89", "ar90", "ar91", | |
"ar92", "ar93") | |
data_directory <- "E:/pinterlab/x_erosion_data/third_eight_samples" | |
basename_id <- "205003700119" | |
third_batch_samples_df <- array_registry_df %>% | |
filter(`Tube Label` %in% samples_in_order) %>% | |
dplyr::rename(description=`...4`) %>% | |
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], | |
cell_type=str_split(description, " ", simplify = T)[,2], | |
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), | |
Sentrix_Position=sprintf("R0%dC01", 1:8), | |
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), | |
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), | |
Sample_Name=description) | |
sample_info_df <- rbind(first_batch_samples_df, second_batch_samples_df, third_batch_samples_df) %>% | |
left_join(continuous_cluster_df, by=c("description"="sample_name", "cell_line", "passage")) | |
RGset <- read.metharray.exp(targets = sample_info_df) | |
mset <- preprocessIllumina(RGset) | |
gmset <- mapToGenome(mset) | |
beta_mat_850k <- getBeta(gmset) | |
colnames(beta_mat_850k) <- sample_info_df[match(colnames(beta_mat_850k), sample_info_df$basename_nodir),]$Sample_Name | |
write_csv(as.data.frame(beta_mat_850k) %>% rownames_to_column(var="rn"), "our_data_beta_mat_850k.csv.gz") | |
site_annotation_850k <- getAnnotation(gmset) | |
write_csv(site_annotation_850k %>% as.data.frame() %>% rownames_to_column(), "site_annotation_850k.csv.gz") | |
our_data_dmp_dir <- 'dmp_between_our_sample_clusters' | |
dir.create(our_data_dmp_dir) | |
all_dmp_df <- data.frame() | |
transition_columns <- c() | |
our_data_beta_mat_850k <- read_csv("our_data_beta_mat_850k.csv.gz") %>% column_to_rownames(var="rn") | |
site_annotation_850k <- read_csv("site_annotation_850k.csv.gz") | |
cluster_means <- tibble(rowname=character()) | |
for (n2 in min(sample_info_df$cluster):(max(sample_info_df$cluster))) { | |
print(n2) | |
cluster_samples <- filter(sample_info_df, cluster %in% c(n2))$Sample_Name | |
mean_df <- as.data.frame(rowMeans(na.omit(our_data_beta_mat_850k[cluster_samples]))) %>% | |
rownames_to_column() %>% | |
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2) | |
cluster_means <- full_join(cluster_means, mean_df, by="rowname") | |
} | |
for (n in min(sample_info_df$cluster):(max(sample_info_df$cluster)-1)) { | |
print(sprintf('comparing clusters %d and %d', n, n+1)) | |
cluster_1_group <- filter(sample_info_df, cluster <= n)$Sample_Name | |
cluster_2_group <- filter(sample_info_df, cluster > n)$Sample_Name | |
temp_cluster_data <- sample_info_df #column_to_rownames(clusters_pData, var="name") | |
temp_cluster_data[temp_cluster_data$Sample_Name %in% cluster_1_group, 'cluster'] <- 0 | |
temp_cluster_data[temp_cluster_data$Sample_Name %in% cluster_2_group, 'cluster'] <- 1 | |
clusters_pair <- temp_cluster_data$Sample_Name | |
subset_beta <- na.omit(our_data_beta_mat_850k[,clusters_pair]) | |
subset_clusters <- as.matrix(temp_cluster_data %>% dplyr::select(Sample_Name, cluster) %>% column_to_rownames(var="Sample_Name")) | |
subset_clusters[1] <- as.factor(subset_clusters[1]) | |
# subset_clusters <- as.matrix(temp_cluster_data[clusters_pair,]) | |
# subset_clusters[1] <- as.factor(subset_clusters[1]) | |
# row.names(subset_clusters) <- clusters_pair | |
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical") | |
# subset_beta_mat <- as.matrix(subset_beta) | |
# row.names(subset_beta_mat) <- rownames(subset_beta) | |
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname") | |
dmp_df <- left_join(dmp_df, site_annotation_850k %>% select(rowname, | |
hg19_chr=chr, | |
hg19_pos=pos, | |
hg19_gene=UCSC_RefGene_Name, | |
hg19_refgene_group=UCSC_RefGene_Group), by="rowname") | |
dmp_df <- left_join(dmp_df, cluster_means, by="rowname") | |
write.csv(dmp_df, | |
file = sprintf('%s/dmp_table_%d_%d.csv',our_data_dmp_dir, n, n+1), | |
quote = FALSE) | |
if (n + 1 == 1) { | |
all_dmp_df <- dmp_df %>% select(rowname, | |
hg19_chr, | |
hg19_pos, | |
hg19_gene, | |
hg19_refgene_group, | |
matches("cluster_._mean")) | |
} | |
all_dmp_df <- full_join(all_dmp_df, dmp_df %>% select(rowname, !!sprintf('transition_%s_qval', n + 1):=qval), by="rowname") | |
transition_columns <- append(transition_columns, sprintf('transition_%s_qval', n + 1)) | |
} | |
library(radiant.data) | |
q_val_threshold <- .01 | |
all_dmp_df <- all_dmp_df %>% | |
dplyr::mutate(key_transition=case_when(transition_1_qval < transition_2_qval ~ 1, | |
transition_2_qval < transition_1_qval ~ 2), | |
passes_threshold=case_when(key_transition==1 ~ transition_1_qval <= q_val_threshold, | |
key_transition==2 ~ transition_2_qval <= q_val_threshold), | |
delta=case_when(key_transition==1 ~ cluster_1_mean - cluster_0_mean, | |
key_transition==2 ~ cluster_2_mean - cluster_1_mean), | |
delta_increasing=delta>0) | |
write_csv(all_dmp_df, "our_data_all_dmp_df.csv.gz") | |
our_data_all_dmp_df <- read_csv("our_data_all_dmp_df.csv.gz") | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
#heatmap of XIST and FIRRE | |
our_data_beta_mat_850k <- read_csv("our_data_beta_mat_850k.csv.gz") | |
desired_genes <- c("XIST", "FIRRE") | |
gene_order <- c(F,T) | |
subset_heatmap_df <- inner_join(our_data_beta_mat_850k, | |
dplyr::select(all_dmp_df, rowname, hg38_gene, hg38_pos, hg38_refgene_group), by=c("rn"="rowname")) %>% | |
filter(hg38_gene %in% desired_genes) | |
subset_heatmap_df <- arrange(subset_heatmap_df, hg38_pos) | |
ordered_sites <- subset_heatmap_df$rn | |
ordered_sites <- c() | |
for (i in 1:length(desired_genes)) { | |
g <- desired_genes[i] | |
o <- gene_order[i] | |
if(o == T) { | |
ordered_sites <- c(ordered_sites, arrange(subset_heatmap_df %>% filter(hg38_gene == g), hg38_pos)$rn) | |
}else { | |
ordered_sites <- c(ordered_sites, arrange(subset_heatmap_df %>% filter(hg38_gene == g), desc(hg38_pos))$rn) | |
} | |
} | |
subset_heatmap_df <- dplyr::select(subset_heatmap_df, -hg38_pos) | |
# sample_clusters_df <- read_csv("saved_sample_clusters.csv", | |
# col_names = c('name','cluster')) %>% arrange(cluster) | |
ordered_sample_names <- (continuous_cluster_df %>% filter(cell_line %in% c("C19XX", "C23XX")) %>% arrange(desc(cell_line), passage))$sample_name | |
ordered_sample_names <- c(ordered_sample_names[12], ordered_sample_names[1:11],ordered_sample_names[13:22]) | |
subset_heatmap_df <- reshape2::melt(subset_heatmap_df, id.vars=c("rn", "hg38_gene")) %>% dplyr::rename(site=rn, sample_name=variable) | |
subset_heatmap_df <- inner_join(subset_heatmap_df, continuous_cluster_df %>% dplyr::select(sample_name, cell_line, passage, sample_cluster=cluster), by=c("sample_name")) %>% | |
filter(cell_line %in% c("C19XX", "C23XX")) | |
convert_int_to_letters <- Vectorize(vectorize.args = "c", | |
FUN = function(c) { | |
switch(c+1, | |
"A", | |
"B", | |
"C", | |
"D", | |
"E", | |
"F", | |
) | |
}) | |
subset_heatmap_df <- subset_heatmap_df %>% mutate(sample_cluster=convert_int_to_letters(sample_cluster), value=as.numeric(value)) | |
subset_heatmap <- ggplot(subset_heatmap_df, aes(x=factor(sample_name, levels = ordered_sample_names), | |
y=factor(site, levels = ordered_sites), fill=value)) + | |
geom_tile() + | |
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") + | |
facet_grid(factor(hg38_gene, levels=desired_genes) ~ sample_cluster,space="free_x", scales = "free", shrink=T, switch = "y") + | |
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", | |
midpoint = .5, | |
limits=c(0,1), | |
breaks=c(0,.5,1), | |
labels=c("unmethylated\n??=0", "50% methylation\n??=0.5", "methylated\n??=1")) + | |
labs(x="Female Samples", | |
y="", | |
fill="DNA Methylation") + | |
theme(axis.text=element_blank(), | |
#axis.text.x=element_text(angle=90), | |
axis.ticks=element_blank(), | |
legend.position="top", | |
legend.justification = "center", | |
# legend.title = element_text(size = 6, vjust = .8), | |
# legend.text = element_text(size = 6), | |
# legend.position="top", | |
# legend.justification = "right", | |
legend.title = element_text(size = 4, vjust = .8), | |
legend.text = element_text(size = 4), | |
legend.key.height = unit(2.5, "mm"), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0.2, "mm"), | |
# strip.background.x = element_rect(color="black"), | |
strip.text.y = element_blank(), | |
# strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
axis.title = element_text(size=7)) | |
ggsave("figs/our_data/xist_firre_heatmap_our_data_450K probes.svg", subset_heatmap, width=15, height = 65, units = "mm") | |
# plot dmp changes with our data | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
our_data_beta_mat_850k <- read_csv("our_data_beta_mat_850k.csv.gz") | |
site_annotation_850k <- read_csv("site_annotation_850k.csv.gz") | |
x_annotation <- site_annotation_850k %>% filter(chr == "chrX") | |
continuous_cluster_df <- read_csv("continuous_cluster_df.csv") | |
our_data_x_chr_df <- our_data_beta_mat_850k %>% | |
filter(rn %in% x_annotation$rowname) %>% | |
reshape2::melt(id.vars=c("rn"), variable.name="sample_name") %>% | |
left_join(x_annotation %>% dplyr::select(rowname, chr, pos, strand, UCSC_RefGene_Group, UCSC_RefGene_Name), by=c("rn"="rowname")) %>% | |
left_join(continuous_cluster_df, by="sample_name") %>% | |
filter(cell_line != "C6XX") | |
c19_data_x_chr_df <- our_data_x_chr_df %>% | |
filter(cell_line == "C19XX") | |
c19_first_passage <- c19_data_x_chr_df %>% | |
filter(passage == min(passage)) %>% | |
dplyr::select(rn, first_passage_value=value) | |
c19_data_x_chr_df <- c19_data_x_chr_df %>% | |
left_join(c19_first_passage, by="rn") %>% | |
group_by(rn) %>% | |
arrange(passage) %>% | |
mutate(last_passage_value=dplyr::lag(value)) %>% | |
ungroup() | |
ggplot(c19_data_x_chr_df, aes(x=pos, y=value-first_passage_value, color=value-first_passage_value<0)) + | |
geom_line() + | |
scale_color_aaas() + | |
theme_bw() + | |
facet_grid(rows=vars(passage)) | |
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz") | |
ggplot(c19_data_x_chr_df %>% filter(value-last_passage_value < -0.1), aes(x=pos, y=value-last_passage_value, color=value-last_passage_value<0)) + | |
geom_vline(data = mean_methylation_df_escape, mapping=aes(xintercept=hg19_pos), color="lightgrey", size=.3) + | |
geom_point(aes(alpha=abs(value-last_passage_value)), size=.7) + | |
scale_color_aaas() + | |
scale_alpha_continuous(range = c(.2,1)) + | |
theme_classic() + | |
facet_grid(rows=vars(passage)) + | |
labs(title="C19XX Progressive Changes") | |
ggsave("figs/our_data/c19_progressive_changes.pdf") | |
ggsave("figs/our_data/c19_progressive_changes_alt.pdf") | |
ggsave("figs/our_data/c19_progressive_changes_alt3.pdf") | |
c23_data_x_chr_df <- our_data_x_chr_df %>% | |
filter(cell_line == "C23XX") | |
c23_data_x_chr_df <- c23_data_x_chr_df %>% | |
group_by(rn) %>% | |
arrange(passage) %>% | |
mutate(last_passage_value=dplyr::lag(value)) %>% | |
ungroup() | |
ggplot(c23_data_x_chr_df %>% filter(abs(value-last_passage_value) >= .05), aes(x=pos, y=value-last_passage_value, color=value-last_passage_value<0)) + | |
geom_line() + | |
scale_color_aaas() + | |
theme_bw() + | |
facet_grid(rows=vars(passage)) + | |
labs(title="C23XX Progressive Changes") | |
ggsave("figs/our_data/c23_progressive_changes.pdf") | |
#our data heatmap | |
our_data_beta_df <- read_csv("our_data_beta_mat_850k.csv.gz") | |
all_dmp_df <- read_csv('D:/pinterlab/x_erosion/all_dmp_df.csv.gz') | |
# do this based on genes and when the promoters change their methylation. | |
x_annotation_df <- read_csv("D:/pinterlab/x_erosion/x_annotation.csv.gz") | |
statistics <- read_csv('D:/pinterlab/x_erosion/variance_statistics_noh9_nooutliers.csv.gz') | |
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] | |
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname) | |
heatmap_df <- inner_join(our_data_beta_df %>% filter(rn %in% x_annotation_df$rowname), | |
dplyr::select(all_dmp_df, rowname, key_transition, passes_threshold, delta_increasing, hg38_refgene_group), by=c("rn"="rowname")) %>% | |
mutate(cluster=case_when(!passes_threshold ~ 10, | |
delta_increasing ~ key_transition - 1, | |
!delta_increasing ~ (key_transition-1)+5)) | |
heatmap_df <- arrange(heatmap_df, desc(passes_threshold), desc(delta_increasing), desc(key_transition)) | |
ordered_sites <- heatmap_df$rn | |
heatmap_df <- dplyr::select(heatmap_df, -key_transition, -passes_threshold, -delta_increasing, -hg38_refgene_group) | |
continous_clusters_df <- read_csv("continuous_cluster_df.csv") %>% | |
filter(cell_line %in% c("C19XX", "C23XX")) | |
ordered_sample_names <- continous_clusters_df %>% arrange(desc(cell_line), passage) %>% pull(sample_name) | |
ordered_sample_names <- c(ordered_sample_names[12], ordered_sample_names[1:11],ordered_sample_names[13:22]) | |
heatmap_df <- reshape2::melt(heatmap_df, id.vars=c("rn", "cluster"), variable.name="sample_name") %>% dplyr::rename(site=rn) %>% | |
right_join(continous_clusters_df %>% dplyr::rename(sample_cluster=cluster), by="sample_name") | |
# only our data | |
sigvar_heatmap_plot_our_data_only <- ggplot(heatmap_df %>% filter(site %in% sig_x_sites & grepl("C19XX|C23XX", sample_name)), aes(x=factor(sample_name, levels = ordered_sample_names), | |
y=factor(site, levels = ordered_sites), fill=value)) + | |
geom_tile() + | |
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){ | |
return(list( | |
"0"="1↑", | |
"1"="2↑", | |
"2"="3↑", | |
"3"="4↑", | |
"4"="5↑", | |
"5"="1↓", | |
"6"="2↓", | |
"7"="3↓", | |
"8"="4↓", | |
"9"="5↓", | |
"10"="~" | |
)[c]) | |
})) + | |
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) + | |
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+ | |
labs(x="Female Samples", | |
y="CpG Sites with Greater Variance in Female Samples", | |
fill="DNA Methylation") + | |
theme(axis.text.y=element_blank(), | |
axis.text.x = element_blank(), #element_text(angle = 90, hjust=1, size=5), | |
axis.ticks=element_blank(), | |
legend.position="top", | |
legend.justification = "center", | |
legend.title = element_text(size = 6, vjust = .75), | |
legend.text = element_text(size = 6), | |
# plot.background = element_rect(fill = "lightgrey"), | |
# panel.border = element_rect(colour = "black", fill=NA, size=1), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0, "mm"), | |
strip.background.x = element_rect(color="black"), | |
strip.background.y = element_blank(), | |
strip.text.x = element_text(size=6), | |
strip.text.y = element_blank(), | |
axis.title.x = element_text(size=8), | |
axis.title.y=element_blank()) | |
ggsave("figs/our_data/our_data_heatmap.svg", sigvar_heatmap_plot_our_data_only, height = 236, width = 15, units = "mm") | |
our_data_dmp_df <- read_csv("our_data_all_dmp_df.csv.gz") | |
threshold <- .05 | |
our_data_dmp_df <- our_data_dmp_df %>% | |
mutate(passes_threshold=case_when(key_transition == 1 ~ transition_1_qval <= threshold, | |
key_transition == 2 ~ transition_2_qval <= threshold)) | |
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz") | |
#hg19_pos was manually annotated for each gene | |
escapee_pos_df <- mean_methylation_df_escape %>% | |
transmute(hg38_gene_name, | |
delta_ipsc=hg19_pos, | |
delta_esc=hg19_pos, | |
delta_H9=hg19_pos) %>% | |
reshape2::melt(id.vars=c("hg38_gene_name"), | |
value.name="TSS", | |
variable.name="type") | |
our_data_dmps <- ggplot(our_data_dmp_df %>% filter(passes_threshold & hg19_chr == "chrX"), aes(x=hg19_pos, y=delta)) + | |
# geom_point(aes(alpha=abs(delta)), size=.4) + | |
geom_segment(mapping=aes(x=hg19_pos,xend=hg19_pos,y=0, yend=delta, color=as.factor(key_transition)), alpha=.1) + | |
geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="darkgrey", size=.3, alpha=.8) + | |
labs(y="", | |
x="X Chr Coordinate (Mb)") + | |
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) + | |
scale_color_manual(limits=c("1", "2", "3", "4", "5"), | |
values=c(pal_d3()(5)), guide=F) + | |
scale_alpha(guide=F)+ | |
# facet_grid(rows=vars(key_transition)) + | |
facet_grid(rows=vars(as.factor(key_transition)), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2")[n]) | |
})) + | |
labs(title="DMP analysis with our samples", | |
color="Delta Increasing") + | |
theme_bw() + | |
base_plot_theme + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3), | |
title = element_blank(), | |
axis.title.y = element_blank(), | |
axis.text.y=element_text(size=5), | |
axis.title.x = element_blank(), | |
axis.text.x = element_blank(), | |
axis.ticks.x = element_blank(), | |
plot.background = element_blank(), | |
panel.spacing.y = unit(.75, "mm")) | |
ggsave("figs/our_data/our_data_dmp_alt.pdf", our_data_dmps, width=174, height = 20, units="mm") | |
# calculate positional correlation between meta-analysis DMP and our DMPs | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
our_data_dmp_df <- read_csv("our_data_all_dmp_df.csv.gz") | |
h3k27me3_peaks <- read_table2("bedgraphs/GSM1528885_H9_XISTpos_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal")) | |
h3k27me3_peaks_chrX <- h3k27me3_peaks %>% filter(chr == "chrX") | |
h3k27me3_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k27me3_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
h3k9_xi_peaks <- read_table2("bedgraphs/GSM1528888_H9_XISTpos_K9me3.peaks.bed.gz", col_names=c("chr", "start", "end", "signal")) | |
h3k9_xi_peaks_chrX <- h3k9_xi_peaks %>% filter(chr == "chrX") | |
h3k9_xi_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k9_xi_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
transition_correlations <- tibble(transition=numeric(), | |
correlation=numeric(), | |
correlation_p_val=numeric(), | |
h3k27me3_correlation=numeric(), | |
h3k27me3_correlation_p_val=numeric(), | |
h3k9me3_correlation=numeric(), | |
h3k9me3_correlation_p_val=numeric()) | |
for (t in 1:2) { | |
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t)) | |
our_bed_df <- our_data_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t)) | |
averaged_distance_metaanalysis <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T) | |
averaged_distance_our_data <- window_average_chrX(bed_file_df=our_bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T) | |
cor_test <- cor.test(averaged_distance_metaanalysis$delta, averaged_distance_our_data$delta) | |
k27_cor_test <- cor.test(abs(averaged_distance_our_data$delta), h3k27me3_peaks_chrX_windowed$delta) | |
k9_cor_test <- cor.test(abs(averaged_distance_our_data$delta), h3k9_xi_peaks_chrX_windowed$delta) | |
transition_correlations <- add_row(transition_correlations, | |
transition=t, | |
correlation=cor_test$estimate, | |
correlation_p_val=cor_test$p.value, | |
h3k27me3_correlation=k27_cor_test$estimate, | |
h3k27me3_correlation_p_val=k27_cor_test$p.value, | |
h3k9me3_correlation=k9_cor_test$estimate, | |
h3k9me3_correlation_p_val=k9_cor_test$p.value) | |
} | |
#histone correlations | |
transition__histone_correlations <- tibble(transition=numeric(), | |
correlation=numeric(), | |
correlation_p_val=numeric()) | |
for (t in 1:2) { | |
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t)) | |
our_bed_df <- our_data_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t) & (rowname %in% all_dmp_df$rowname)) | |
averaged_distance_metaanalysis <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T) | |
averaged_distance_our_data <- window_average_chrX(bed_file_df=our_bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T) | |
cor_test <- cor.test(averaged_distance_metaanalysis$delta, averaged_distance_our_data$delta) | |
transition_correlations <- add_row(transition_correlations, | |
transition=t, | |
correlation=cor_test$estimate, | |
correlation_p_val=cor_test$p.value) | |
} | |
# qPCR data | |
qpcr_dat <- read_csv("e:/pinterlab/x_erosion_data/qpcr_data.csv") | |
ggplot(qpcr_dat %>% mutate(cell_line=case_when(cell_line=="C19" ~ "C19 XIST qPCR", T~"C23 XIST qPCR")), aes(x=factor(passage, levels=c("p8","p11","p12","p13","p15","p17","p19","p20","p22")), y=`%TBP`)) + | |
facet_grid(cols=vars(cell_line), scales = "free_x") + | |
geom_boxplot(size=.2, fill="grey") + | |
geom_point(size=.4) + | |
labs(title="XIST qPCR", | |
x="Passage", | |
y="%TBP") + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text = element_text(size=6, margin=margin(t=1, b=1), face = "bold"), strip.background = element_blank(), plot.title = element_blank()) | |
ggsave("figs/supplementary/supp1_xist_qpcr_boxplot.pdf", width=80, height=30, units="mm") | |
qpcr_dat_col <- qpcr_dat %>% | |
group_by(cell_line, passage) %>% | |
summarise(mean_exp=mean(`%TBP`), | |
sd=sd(`%TBP`)) | |
ggplot(qpcr_dat_col, aes(x=factor(passage, levels=c("p8","p11","p12","p13","p15","p17","p19","p20","p22")), y=mean_exp)) + | |
facet_grid(cols=vars(cell_line), scales = "free_x") + | |
geom_bar(size=.2, stat="identity") + | |
geom_errorbar(aes(ymin=mean_exp-sd, ymax=mean_exp+sd), width=.5, size=.2) + | |
labs(title="XIST qPCR", | |
x="Passage", | |
y="%TBP") + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text = element_text(size=6, margin=margin(t=1, b=1))) | |
ggsave("figs/supplementary/supp1_xist_qpcr_barplot.pdf", width=80, height=40, units="mm") | |
# checking copy number compared to H9s using DNAme | |
# library(stringi) | |
# datasets_df <- read.csv("datasets.csv.gz") %>% filter(Name=="Boscolo") | |
# | |
# for (row in 1:nrow(datasets_df)) { | |
# print(row) | |
# name <- as.character(datasets_df$Name[row]) | |
# basedir <- as.character(datasets_df$Directory[row]) | |
# datatype <- as.character(datasets_df$Type[row]) | |
# sample_metadata_file <- as.character(datasets_df$SampleInfoFile[row]) | |
# array_size <- datasets_df$ArraySize[row] | |
# if (array_size == 450) { | |
# array_type <- "IlluminaHumanMethylation450k" | |
# } else if (array_size == 850) { | |
# array_type <- "IlluminaHumanMethylationEPIC" | |
# } | |
# | |
# | |
# if (datatype == "geo_raw") { | |
# sample_info_df <- read.csv(file.path(basedir, sample_metadata_file), stringsAsFactors = FALSE, row.names = 1) | |
# samples_to_keep <- sample_info_df[sample_info_df$Keep %in% c("TRUE"), ] | |
# raw_filename <- file.path(basedir, as.character(datasets_df$DataFile[row])) | |
# u_name <- as.character(datasets_df$Uname[row]) | |
# m_name <- as.character(datasets_df$Mname[row]) | |
# separator <- stri_unescape_unicode(as.character(datasets_df$Separator[row])) | |
# gmset <- readGEORawFile( | |
# filename = raw_filename, | |
# Uname = u_name, | |
# Mname = m_name, | |
# array = array_type, | |
# sep = separator | |
# ) | |
# gmset <- readGEORawFile( | |
# filename = raw_filename, | |
# Uname = u_name, | |
# Mname = m_name, | |
# array = array_type, | |
# sep = separator, | |
# pData = sample_info_df[sampleNames(gmset), ] | |
# ) | |
# } else if (datatype == "idat") { | |
# targets <- as.data.frame(read.metharray.sheet(basedir, pattern = sample_metadata_file)) | |
# targets_to_keep <- subset(targets, Keep == TRUE) | |
# | |
# RGset <- read.metharray.exp(targets = targets_to_keep) | |
# gmset <- preprocessIllumina(RGset) | |
# gmset <- mapToGenome(gmset) | |
# } | |
# | |
# sampleNames(gmset) <- paste(name, gmset$accession, gmset$cell_type, gmset$cell_line, gmset$gender, | |
# paste("p", gmset$passage, sep = ""), | |
# sep = "_" | |
# ) | |
# | |
# | |
# plot_group <- data.frame(group = name, sample_name = sampleNames(gmset), row.names = "sample_name") | |
# if (row == 1) { | |
# combined_arrays <- gmset | |
# plot_groups_combined <- plot_group | |
# } else { | |
# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k") | |
# plot_groups_combined <- rbind(plot_groups_combined, plot_group) | |
# } | |
# } | |
array_registry_df <- readxl::read_excel("E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx") | |
samples_in_order <- c("ar 52", "ar 54", "ar 56", | |
"ar 58", "ar 60", "ar 62", | |
"ar 64", "ar 66") | |
data_directory <- "E:/pinterlab/x_erosion_data/first_eight_samples" | |
basename_id <- "204668830015" | |
first_batch_samples_df <- array_registry_df %>% | |
filter(`Tube Label` %in% samples_in_order) %>% | |
dplyr::rename(description=`...4`) %>% | |
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], | |
cell_type=str_split(description, " ", simplify = T)[,2], | |
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), | |
Sentrix_Position=sprintf("R0%dC01", 1:8), | |
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), | |
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), | |
Sample_Name=description) | |
#second batch of array | |
samples_in_order <- c("ar 47", "ar 48", "ar72", "ar73", | |
"ar74", "ar75", "ar76", "ar77") | |
data_directory <- "E:/pinterlab/x_erosion_data/second_eight_samples" | |
basename_id <- "204792210038" | |
second_batch_samples_df <- array_registry_df %>% | |
filter(`Tube Label` %in% samples_in_order) %>% | |
dplyr::rename(description=`...4`) %>% | |
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], | |
cell_type=str_split(description, " ", simplify = T)[,2], | |
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), | |
Sentrix_Position=sprintf("R0%dC01", 1:8), | |
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), | |
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), | |
Sample_Name=description) | |
#third batch of array | |
samples_in_order <- c("ar86", "ar87", "ar88", | |
"ar89", "ar90", "ar91", | |
"ar92", "ar93") | |
data_directory <- "E:/pinterlab/x_erosion_data/third_eight_samples" | |
basename_id <- "205003700119" | |
third_batch_samples_df <- array_registry_df %>% | |
filter(`Tube Label` %in% samples_in_order) %>% | |
dplyr::rename(description=`...4`) %>% | |
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], | |
cell_type=str_split(description, " ", simplify = T)[,2], | |
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), | |
Sentrix_Position=sprintf("R0%dC01", 1:8), | |
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), | |
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), | |
Sample_Name=description) | |
continuous_cluster_df <- read_csv("continuous_cluster_df.csv") | |
sample_info_df <- rbind(first_batch_samples_df, second_batch_samples_df, third_batch_samples_df) %>% | |
left_join(continuous_cluster_df, by=c("description"="sample_name", "cell_line", "passage")) | |
#%>% | |
add_row(cell_line="H9", | |
Sample_Name="Daily_H9", | |
Basename="D:/pinterlab/x_erosion/daily_h9_sc/GSM2285135_7970368067_R04C01", | |
basename_nodir="GSM2285135_7970368067_R04C01") | |
RGset <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line != "H9")) | |
mset <- preprocessIllumina(RGset) | |
sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name | |
sampleNames(RGset) <- sample_info_df[match(sampleNames(RGset), sample_info_df$basename_nodir),]$Sample_Name | |
RGset_h9 <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line == "H9")) | |
mset_h9 <- preprocessIllumina(RGset_h9) | |
combined_mset <- combineArrays(mset, mset_h9) | |
# gmset <- mapToGenome(mset) | |
# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k") | |
library(conumee) | |
source("E:/pinterlab/custom_conumee_functions.R") | |
data(exclude_regions) | |
anno <- custom.CNV.create_anno(array_type="450k", exclude_regions=exclude_regions) | |
# sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name | |
cnv_data <- CNV.load(combined_mset) | |
minfi.controls <- (pData(mset)$cell_line == "H9") | |
x <- CNV.fit(cnv_data[sampleNames(combined_mset) == "C19XX iPSC MEFs p19"], cnv_data[pData(combined_mset)$cell_line == "H9"], anno) | |
x <- CNV.bin(x) | |
x <- CNV.segment(x) | |
CNV.genomeplot(x) | |
#output for GEO | |
beta_df <- getBeta(gmset) | |
beta_df <- as.data.frame(beta_df) %>% rownames_to_column(var="rn") | |
detect_p_df <- detectionP(RGset) | |
detect_p_df <- as.data.frame(detect_p_df) %>% rownames_to_column(var="rn") | |
joined_df <- full_join(beta_df, detect_p_df, by="rn", suffix=c("", "pval")) %>% | |
select(ID_REF=rn, | |
`C19XX IPSC MEF p8`, | |
`C19XX IPSC MEF p8pval`, | |
`C19XX iPSC MEF p11`, | |
`C19XX iPSC MEF p11pval`, | |
`C19XX iPSC MEF p13`, | |
`C19XX iPSC MEF p13pval`, | |
`C19XX iPSC MEF p17`, | |
`C19XX iPSC MEF p17pval`, | |
`C19XX iPSC MEFs p19`, | |
`C19XX iPSC MEFs p19pval`, | |
`C19XX iPSC MEFs p21`, | |
`C19XX iPSC MEFs p21pval`, | |
`C19XX iPSC MEFs p22`, | |
`C19XX iPSC MEFs p22pval`, | |
`C19XX iPSC MEFs p28`, | |
`C19XX iPSC MEFs p28pval`, | |
`C19XX iPSC MEFs p29`, | |
`C19XX iPSC MEFs p29pval`, | |
`C19XX iPSC MEFs p30`, | |
`C19XX iPSC MEFs p30pval`, | |
`C19XX iPSC MEFs p32`, | |
`C19XX iPSC MEFs p32pval`, | |
`C23XX iPSC MEF p12`, | |
`C23XX iPSC MEF p12pval`, | |
`C23XX iPSC MEF p15`, | |
`C23XX iPSC MEF p15pval`, | |
`C23XX iPSC MEF p17`, | |
`C23XX iPSC MEF p17pval`, | |
`C23XX iPSC MEF p20`, | |
`C23XX iPSC MEF p20pval`, | |
`C23XX iPSC MEFs p22`, | |
`C23XX iPSC MEFs p22pval`, | |
`C23XX iPSC MEFs p24`, | |
`C23XX iPSC MEFs p24pval`, | |
`C23XX iPSC MEFs p25`, | |
`C23XX iPSC MEFs p25pval`, | |
`C23XX iPSC MEFs p31`, | |
`C23XX iPSC MEFs p31pval`, | |
`C23XX iPSC MEFs p32`, | |
`C23XX iPSC MEFs p32pval`, | |
`C23XX iPSC MEFs p33`, | |
`C23XX iPSC MEFs p33pval`, | |
`C23XX iPSC MEFs p35`, | |
`C23XX iPSC MEFs p35pval`) | |
write_csv(joined_df, "e:/pinterlab/x_erosion_data/geo_joined_sheet.csv") | |