Skip to content
Permalink
2e6bcacfa8
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
1498 lines (1296 sloc) 80.6 KB
library(tidyverse)
library(minfi)
process_array_as_450k <- function(samples_in_order, array_registry_location, data_directory, basename_id) {
array_registry_df <- readxl::read_excel(array_registry_location)
batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)
RGset <- read.metharray.exp(targets = batch_samples_df)
gmset <- preprocessIllumina(RGset)
gmset <- mapToGenome(gmset)
gmset_450k <- convertArray(gmset, outType="IlluminaHumanMethylation450k")
all_beta_mat <- getBeta(gmset_450k)
colnames(all_beta_mat) <- batch_samples_df[match(colnames(all_beta_mat), batch_samples_df$basename_nodir),]$Sample_Name
return(as.data.frame(all_beta_mat) %>% rownames_to_column("rn"))
}
# from email from Bo
samples_in_order_first_batch <- c("ar 52", "ar 54", "ar 56",
"ar 58", "ar 60", "ar 62",
"ar 64", "ar 66")
first_beta_mat <- process_array_as_450k(samples_in_order_first_batch,
"E:/pinterlab/x_erosion_data/Array_registry_update_201013.xlsx",
"E:/pinterlab/x_erosion_data/first_eight_samples",
"204668830015")
write_csv(as.data.frame(all_beta_mat) %>% rownames_to_column("rn"), "E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz")
#second batch of array
samples_in_order_second_batch <- c("ar 47", "ar 48", "ar72", "ar73",
"ar74", "ar75", "ar76", "ar77")
second_beta_mat <- process_array_as_450k(samples_in_order_second_batch,
"E:/pinterlab/x_erosion_data/Array_registry_update_201116.xlsx",
"E:/pinterlab/x_erosion_data/second_eight_samples",
"204792210038")
write_csv(second_beta_mat, "E:/pinterlab/x_erosion_data/second_eight_samples_beta_matrix.csv.gz")
#third batch of array
samples_in_order_third_batch <- c("ar86", "ar87", "ar88",
"ar89", "ar90", "ar91",
"ar92", "ar93")
third_beta_mat <- process_array_as_450k(samples_in_order_third_batch,
"E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx",
"E:/pinterlab/x_erosion_data/third_eight_samples",
"205003700119")
write_csv(third_beta_mat, "E:/pinterlab/x_erosion_data/third_eight_samples_beta_matrix.csv.gz")
first_erosion_clusters_df <- read_csv("first_eight_erosion_cluster_df.csv")
second_erosion_clusters_df <- read_csv("second_eight_erosion_cluster_df.csv")
third_erosion_clusters_df <- read_csv("third_eight_erosion_cluster_df.csv")
combined_table_df <- rbind(first_erosion_clusters_df, second_erosion_clusters_df, third_erosion_clusters_df) %>%
mutate(cell_line=case_when(grepl("C19XX", sample_name) ~ "C19",
grepl("C23XX", sample_name) ~ "C23",
grepl("C6XX", sample_name) ~ "C6"),
passage=as.numeric(stringr::str_split(sample_name, " p", simplify=T)[,2])) %>%
dplyr::arrange(cell_line, passage)
# plot in heatmap
darcy_erosion_clusters_df <- read_csv("E:/pinterlab/darcy_data_xx_xo/methylation_data/erosion_cluster_df.csv") %>%
dplyr::filter(sample_name %in% c("ar34", "ar37", "ar40", "ar41", "ar42")) %>%
dplyr::mutate(sample_name=case_when(sample_name == "ar34" ~ "ar34_C6xx_p7",
sample_name == "ar37" ~ "ar37_C11XX_p7",
sample_name == "ar40" ~ "ar40_C13XX_p10",
sample_name == "ar41" ~ "ar41_C19XX_p9",
sample_name == "ar42" ~ "ar42_C23XX_p10"))
darcy_beta_mat <- read_csv("E:/pinterlab/darcy_data_xx_xo/methylation_data/beta_matrix.csv.gz") %>%
dplyr::select(rn,
ar34_C6xx_p7=ar34,
ar37_C11XX_p7=ar37,
ar40_C13XX_p10=ar40,
ar41_C19XX_p9=ar41,
ar42_C23XX_p10=ar42)
first_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz")
second_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/second_eight_samples_beta_matrix.csv.gz")
third_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/third_eight_samples_beta_matrix.csv.gz")
female_df_no_h9_chrX <- read_csv("D:/pinterlab/x_erosion/female_df_no_h9_chrX.csv.gz")
all_dmp_df <- read_csv('D:/pinterlab/x_erosion/all_dmp_df.csv.gz')
# do this based on genes and when the promoters change their methylation.
x_annotation_df <- read_csv("D:/pinterlab/x_erosion/x_annotation.csv.gz")
statistics <- read_csv('D:/pinterlab/x_erosion/variance_statistics_noh9_nooutliers.csv.gz')
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)]
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname)
heatmap_df <- inner_join(female_df_no_h9_chrX %>% filter(rn %in% x_annotation_df$rowname),
dplyr::select(all_dmp_df, rowname, key_transition, passes_threshold, delta_increasing, hg38_refgene_group), by=c("rn"="rowname"))
heatmap_df <- arrange(heatmap_df, desc(passes_threshold), desc(delta_increasing), desc(key_transition)) %>%
left_join(darcy_beta_mat, by=c("rn")) %>%
left_join(first_eight_beta_mat, by=c("rn")) %>%
left_join(second_eight_beta_mat, by=c("rn")) %>%
left_join(third_eight_beta_mat, by=c("rn"))
ordered_sites <- heatmap_df$rn
heatmap_df[heatmap_df$passes_threshold == FALSE, "cluster"] <- 10
heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == T), "cluster"] <- heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == T), "key_transition"] - 1
heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == F), "cluster"] <- (heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == F), "key_transition"] - 1) + 5
heatmap_df <- dplyr::select(heatmap_df, -key_transition, -passes_threshold, -delta_increasing, -hg38_refgene_group)
sample_clusters_df <- read_csv("D:/pinterlab/x_erosion/saved_sample_clusters.csv",
col_names = c('name','cluster')) %>%
bind_rows(darcy_erosion_clusters_df %>% dplyr::rename(name=sample_name)) %>%
bind_rows(first_erosion_clusters_df %>% dplyr::rename(name=sample_name)) %>%
bind_rows(second_erosion_clusters_df %>% dplyr::rename(name=sample_name)) %>%
bind_rows(third_erosion_clusters_df %>% dplyr::rename(name=sample_name)) %>%
arrange(cluster)
ordered_sample_names <- sample_clusters_df$name
heatmap_df <- reshape2::melt(heatmap_df, id.vars=c("rn", "cluster")) %>% dplyr::rename(site=rn, sample_name=variable)
for (sample_row in 1:nrow(sample_clusters_df)) {
cluster_name <- unlist(sample_clusters_df[sample_row, "name"])
cluster <- unlist(sample_clusters_df[sample_row, "cluster"])
heatmap_df[heatmap_df$sample_name == cluster_name, "sample_cluster"] <- cluster
}
sigvar_heatmap_plot <- ggplot(heatmap_df %>% filter(site %in% sig_x_sites), aes(x=factor(sample_name, levels = ordered_sample_names),
y=factor(site, levels = ordered_sites), fill=value)) +
geom_tile() +
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){
return(list(
"0"="1↑",
"1"="2↑",
"2"="3↑",
"3"="4↑",
"4"="5↑",
"5"="1↓",
"6"="2↓",
"7"="3↓",
"8"="4↓",
"9"="5↓",
"10"="~"
)[c])
})) +
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) +
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\n??=0", "50% methylation\n??=0.5", "methylated\n??=1"))+
labs(x="Female Samples",
y="CpG Sites with Greater Variance in Female Samples",
fill="DNA Methylation") +
theme(axis.text.y=element_blank(),
axis.text.x = element_text(angle = 90, hjust=1, size=5),
axis.ticks.y=element_blank(),
legend.position="top",
legend.justification = "center",
legend.title = element_text(size = 6, vjust = .75),
legend.text = element_text(size = 6),
# plot.background = element_rect(fill = "lightgrey"),
# panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0, "mm"),
strip.background.x = element_rect(color="black"),
strip.background.y = element_rect(fill = NA, colour = "black", size = .7),
strip.text = element_text(size=6),
strip.text.y = element_text(angle=180),
axis.title = element_text(size=8))
# only our data
sigvar_heatmap_plot_our_data_only <- ggplot(heatmap_df %>% filter(site %in% sig_x_sites & grepl("C19XX|C23XX", sample_name)), aes(x=factor(sample_name, levels = ordered_sample_names),
y=factor(site, levels = ordered_sites), fill=value)) +
geom_tile() +
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){
return(list(
"0"="1↑",
"1"="2↑",
"2"="3↑",
"3"="4↑",
"4"="5↑",
"5"="1↓",
"6"="2↓",
"7"="3↓",
"8"="4↓",
"9"="5↓",
"10"="~"
)[c])
})) +
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) +
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+
labs(x="Female Samples",
y="CpG Sites with Greater Variance in Female Samples",
fill="DNA Methylation") +
theme(axis.text.y=element_blank(),
axis.text.x = element_text(angle = 90, hjust=1, size=5),
axis.ticks.y=element_blank(),
legend.position="top",
legend.justification = "center",
legend.title = element_text(size = 6, vjust = .75),
legend.text = element_text(size = 6),
# plot.background = element_rect(fill = "lightgrey"),
# panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0, "mm"),
strip.background.x = element_rect(color="black"),
strip.background.y = element_rect(fill = NA, colour = "black", size = .7),
strip.text = element_text(size=6),
strip.text.y = element_text(angle=180),
axis.title = element_text(size=8))
ggsave("figs/our_data/our_data_only_squished_heatmap.png", sigvar_heatmap_plot_our_data_only, height=245, width=30, units = "mm")
library(radiant.data)
#calculate_continuous_cluster_assignment
first_cluster_dist_df <- read_csv("D:/pinterlab/x_erosion/first_eight_erosion_cluster_distances_df.csv") %>%
dplyr::rename(sample_name=`X1`) %>%
left_join(first_erosion_clusters_df, by="sample_name") %>%
dplyr::mutate(x0=case_when(cluster == 0 ~ `0`,
cluster == 1 ~ `1`,
cluster == 2 ~ `2`,
cluster == 3 ~ `3`,
cluster == 4 ~ `4`,
cluster == 5 ~ `5`),
x1=case_when(cluster == 0 ~ `1`,
cluster == 1 ~ pmin(`2`, `0`),
cluster == 2 ~ pmin(`3`, `1`),
cluster == 3 ~ pmin(`4`, `2`),
cluster == 4 ~ pmin(`5`, `3`),
cluster == 5 ~ `4`),
direction=case_when(cluster == 0 ~ 1,
cluster == 1 ~ c(1, -1)[which.pmin(`2`, `0`)],
cluster == 2 ~ c(1, -1)[which.pmin(`3`, `1`)],
cluster == 3 ~ c(1, -1)[which.pmin(`4`, `2`)],
cluster == 4 ~ c(1, -1)[which.pmin(`5`, `3`)],
cluster == 5 ~ `4`),
delta=-log((x1/x0)-.8335, base=45)+.03,
continuous_cluster=cluster+(direction * delta))
second_cluster_dist_df <- read_csv("D:/pinterlab/x_erosion/second_eight_erosion_cluster_distances_df.csv") %>%
dplyr::rename(sample_name=`X1`) %>%
left_join(second_erosion_clusters_df, by="sample_name") %>%
dplyr::mutate(x0=case_when(cluster == 0 ~ `0`,
cluster == 1 ~ `1`,
cluster == 2 ~ `2`,
cluster == 3 ~ `3`,
cluster == 4 ~ `4`,
cluster == 5 ~ `5`),
x1=case_when(cluster == 0 ~ `1`,
cluster == 1 ~ pmin(`2`, `0`),
cluster == 2 ~ pmin(`3`, `1`),
cluster == 3 ~ pmin(`4`, `2`),
cluster == 4 ~ pmin(`5`, `3`),
cluster == 5 ~ `4`),
direction=case_when(cluster == 0 ~ 1,
cluster == 1 ~ c(1, -1)[which.pmin(`2`, `0`)],
cluster == 2 ~ c(1, -1)[which.pmin(`3`, `1`)],
cluster == 3 ~ c(1, -1)[which.pmin(`4`, `2`)],
cluster == 4 ~ c(1, -1)[which.pmin(`5`, `3`)],
cluster == 5 ~ `4`),
delta=-log((x1/x0)-.8335, base=45)+.03,
continuous_cluster=cluster+(direction * delta))
third_cluster_dist_df <- read_csv("D:/pinterlab/x_erosion/third_eight_erosion_cluster_distances_df.csv") %>%
dplyr::rename(sample_name=`X1`) %>%
left_join(third_erosion_clusters_df, by="sample_name") %>%
dplyr::mutate(x0=case_when(cluster == 0 ~ `0`,
cluster == 1 ~ `1`,
cluster == 2 ~ `2`,
cluster == 3 ~ `3`,
cluster == 4 ~ `4`,
cluster == 5 ~ `5`),
x1=case_when(cluster == 0 ~ `1`,
cluster == 1 ~ pmin(`2`, `0`),
cluster == 2 ~ pmin(`3`, `1`),
cluster == 3 ~ pmin(`4`, `2`),
cluster == 4 ~ pmin(`5`, `3`),
cluster == 5 ~ `4`),
direction=case_when(cluster == 0 ~ 1,
cluster == 1 ~ c(1, -1)[which.pmin(`2`, `0`)],
cluster == 2 ~ c(1, -1)[which.pmin(`3`, `1`)],
cluster == 3 ~ c(1, -1)[which.pmin(`4`, `2`)],
cluster == 4 ~ c(1, -1)[which.pmin(`5`, `3`)],
cluster == 5 ~ `4`),
delta=-log((x1/x0)-.8335, base=45)+.03,
continuous_cluster=cluster+(direction * delta))
continuous_cluster_df <- rbind(first_cluster_dist_df, second_cluster_dist_df, third_cluster_dist_df) %>%
dplyr::mutate(cell_line=str_split(sample_name, " ", simplify = T)[,1],
passage=as.numeric(str_replace(str_split(sample_name, " ", simplify = T)[,4], "p", ""))) %>%
arrange(cell_line, passage) %>%
dplyr::select(sample_name, cell_line, passage, cluster, continuous_cluster)
write_csv(continuous_cluster_df, "continuous_cluster_df.csv")
# y=-log_45(x-.8335)+.03
#do the same on the old darcy data
old_erosion_clusters_df <- read_csv("E:/pinterlab/darcy_data_xx_xo/methylation_data/erosion_cluster_df.csv")
old_cluster_dist_df <- read_csv("D:/pinterlab/x_erosion/old_erosion_cluster_distances_df.csv") %>%
dplyr::rename(sample_name=`X1`) %>%
filter(sample_name %in% c("ar34", "ar37", "ar40", "ar38", "ar39", "ar42", "ar41")) %>%
left_join(old_erosion_clusters_df, by="sample_name") %>%
dplyr::mutate(x0=case_when(cluster == 0 ~ `0`,
cluster == 1 ~ `1`,
cluster == 2 ~ `2`,
cluster == 3 ~ `3`,
cluster == 4 ~ `4`,
cluster == 5 ~ `5`),
x1=case_when(cluster == 0 ~ `1`,
cluster == 1 ~ pmin(`2`, `0`),
cluster == 2 ~ pmin(`3`, `1`),
cluster == 3 ~ pmin(`4`, `2`),
cluster == 4 ~ pmin(`5`, `3`),
cluster == 5 ~ `4`),
direction=case_when(cluster == 0 ~ 1,
cluster == 1 ~ c(1, -1)[which.pmin(`2`, `0`)],
cluster == 2 ~ c(1, -1)[which.pmin(`3`, `1`)],
cluster == 3 ~ c(1, -1)[which.pmin(`4`, `2`)],
cluster == 4 ~ c(1, -1)[which.pmin(`5`, `3`)],
cluster == 5 ~ `4`),
delta=-log((x1/x0)-.8335, base=45)+.03,
delta=case_when(between(delta,0,.5) ~ delta,
delta < 0 ~ 0,
delta > .5 ~ .5),
continuous_cluster=cluster+(direction * delta),
description=case_when(sample_name == "ar34" ~ "C6 XX p7",
sample_name == "ar37" ~ "C11 XX p7",
sample_name == "ar40" ~ "C13 XX p10",
sample_name == "ar38" ~ "XXY p21",
sample_name == "ar39" ~ "H9 p40",
sample_name == "ar41" ~ "C19 p9",
sample_name == "ar42" ~ "C23 XX p10"))
analysis_sample_distances <- read_csv("female_df_no_h9_chrX_erosion_cluster_distances_df.csv")
saved_sample_clusters <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
analysis_sample_distances <- analysis_sample_distances %>%
dplyr::rename(sample_name=`X1`) %>%
left_join(saved_sample_clusters, by="sample_name") %>%
dplyr::mutate(cluster_dist=dplyr::case_when(cluster==0 ~ `0`,
cluster==1 ~ `1`,
cluster==2 ~ `2`,
cluster==3 ~ `3`,
cluster==4 ~ `4`,
cluster==5 ~ `5`),
data_type="analysis_samples")
our_sample_distances <- read_csv("D:/pinterlab/x_erosion/first_eight_erosion_cluster_distances_df.csv") %>%
dplyr::rename(sample_name=`X1`) %>%
left_join(read_csv("first_eight_erosion_cluster_df.csv"), by="sample_name") %>%
rbind(read_csv("D:/pinterlab/x_erosion/second_eight_erosion_cluster_distances_df.csv") %>%
dplyr::rename(sample_name=`X1`) %>%
left_join(read_csv("second_eight_erosion_cluster_df.csv"), by="sample_name")) %>%
dplyr::mutate(cluster_dist=dplyr::case_when(cluster==0 ~ `0`,
cluster==1 ~ `1`,
cluster==2 ~ `2`,
cluster==3 ~ `3`,
cluster==4 ~ `4`,
cluster==5 ~ `5`),
data_type="our_samples")
ggplot(analysis_sample_distances, aes(x=cluster_dist, color=data_type)) +
geom_density() +
geom_vline(data=our_sample_distances, mapping=aes(xintercept=cluster_dist, color=data_type)) +
facet_grid(rows = vars(cluster)) +
labs(title="Distribution of cluster distances for samples") +
theme_bw() +
scale_color_nejm()
ggsave("figs/our_data/cluster_distance_distribution.png")
# add to PCA
female_df_no_h9_chrX <- read_csv('female_df_no_h9_chrX.csv.gz')
female_df_only_h9_chrX <- read_csv("female_sig_x_sites_df_only_h9.csv.gz")
female_beta_df_chrX <- left_join(female_df_no_h9_chrX, female_df_only_h9_chrX, by=c("rn"="rowname"))
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv")
first_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/first_eight_samples_beta_matrix.csv.gz")
second_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/second_eight_samples_beta_matrix.csv.gz")
third_eight_beta_mat <- read_csv("E:/pinterlab/x_erosion_data/third_eight_samples_beta_matrix.csv.gz")
our_data_beta_df <- left_join(first_eight_beta_mat, second_eight_beta_mat, by="rn") %>%
left_join(third_eight_beta_mat, by="rn")
x_annotation_df <- read_csv("D:/pinterlab/x_erosion/x_annotation.csv.gz")
statistics <- read_csv('D:/pinterlab/x_erosion/variance_statistics_noh9_nooutliers.csv.gz')
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)]
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname)
no_na_sigxsites <- na.omit(female_beta_df_chrX %>%
left_join(our_data_beta_df, by="rn") %>%
filter(rn %in% sig_x_sites))$rn
female_sig_sites_pca <- prcomp(t(na.omit(female_beta_df_chrX %>% filter(rn %in% no_na_sigxsites) %>% select(-rn))))
female_sig_sites_pca_df <- rownames_to_column(as.data.frame(female_sig_sites_pca$x), var = "sample_name")
female_sig_sites_pca_df <- left_join(female_sig_sites_pca_df, sample_annotation_df, by = "sample_name")
# female_sig_sites_pca_df <- left_join(female_sig_sites_pca_df, female_avg_x_methylation_df, by = "sample_name")
female_sig_sites_pca_df <- left_join(female_sig_sites_pca_df, tidy(colMeans(female_beta_df_chrX %>% select(-rn))) %>% rename(sample_name=names, meanXMethylation=x), by = "sample_name")
female_no_h9_sig_sites_pov <- sprintf("%2.2f%%", 100* female_sig_sites_pca$sdev^2/sum(female_sig_sites_pca$sdev^2))
female_no_h9_sig_sites_pc1_label <- sprintf("PC1 (%s)", female_no_h9_sig_sites_pov[1])
female_no_h9_sig_sites_pc2_label <- sprintf("PC2 (%s)", female_no_h9_sig_sites_pov[2])
our_data_beta_df_sigxsites <- our_data_beta_df %>%
filter(rn %in% no_na_sigxsites)
our_data_pca <- predict(female_sig_sites_pca, t(our_data_beta_df_sigxsites %>% dplyr::select(-rn)))
clusters_pData <- read.csv("saved_sample_clusters.csv",
header = FALSE,
row.names = 1,
col.names=c('name','cluster'))
female_sig_sites_pca_df$python_clusters <- clusters_pData[female_sig_sites_pca_df$sample_name,]
color_clusters <- clusters_pData
color_clusters$cluster <- as.factor(color_clusters$cluster)
continuous_cluster_df <- read_csv("continuous_cluster_df.csv")
pca_female_no_h9_by_python_6 <- ggplot(female_sig_sites_pca_df, mapping = aes(x=PC1, y=PC2)) +
stat_ellipse(data = female_sig_sites_pca_df %>% filter(cell_line != "WA09"), mapping = aes(x=PC1, y=PC2, fill=as.factor(python_clusters)), alpha=.2, size=.2, geom="polygon") +
geom_line(female_sig_sites_pca_df %>% filter(cell_line %in% c("HDF51IPS3", "HDF51IPS1", "HDF51IPS2", "HDF51IPS11", "HDF51IPS12", "HDF51IPS6", "HDF51IPS14", "WA09") & grepl("Nazor_", sample_name)),
mapping = aes(x=PC1, y=PC2, group=cell_line), size=.3, alpha=.4) +
geom_point(aes(color=as.factor(python_clusters), shape=(cell_line == "WA09")), size=.5) +
# geom_text_repel(female_no_h9_sig_sites_pca_df %>% filter(cell_line %in% correctly_ordered_by_sig_names & grepl("Nazor_", sample_name)), mapping = aes(x=PC1, y=PC2, label=passage), size=2, color="grey", segment.size=.15, min.segment.length=.25, segment.color="darkgrey") +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
theme_bw() +
labs(x=female_no_h9_sig_sites_pc1_label,
y=female_no_h9_sig_sites_pc2_label,
title = "PCA of Female Samples",
color="Cluster")+
base_plot_theme +
scale_color_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
scale_fill_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000", guide=F) +
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(color=guide_legend(nrow=1)) +
theme(legend.key.width = unit(1, units = "mm"),
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) +
scale_x_continuous(limits=c(-11, 20), expand = c(0,0)) +
scale_y_continuous(limits=c(-8,10.5), expand = c(0,0))
ggsave("figs/main/fig1_pca_female_with_h9.pdf", pca_female_no_h9_by_python_6, width = 72, height = 38, units = "mm")
our_data_pca_df <- as.data.frame(our_data_pca) %>%
rownames_to_column(var="sample_name") %>%
left_join(continuous_cluster_df, by="sample_name")
library(ggrepel)
our_sample_pca <- ggplot(our_data_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(data = female_sig_sites_pca_df, mapping = aes(x=PC1, y=PC2, color=as.factor(python_clusters), shape=(cell_line == "WA09")), alpha=.2, size=.4) +
geom_point(aes(color=as.factor(cluster), shape=(cell_line == "WA09")), size=.6) +
stat_ellipse(data = female_sig_sites_pca_df %>% filter(cell_line != "WA09"), mapping = aes(x=PC1, y=PC2, fill=as.factor(python_clusters)), alpha=.2, size=.2, geom="polygon") +
geom_text_repel(aes(label=passage, color=as.factor(cluster)), size=2, min.segment.length = 0, segment.size = .15) +
theme_bw() +
labs(x=female_no_h9_sig_sites_pc1_label,
y=female_no_h9_sig_sites_pc2_label,
title = "PCA of Our Female Samples",
color="Cluster")+
base_plot_theme +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
scale_color_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
scale_fill_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000", guide=F) +
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(color=guide_legend(nrow=1)) +
theme(legend.key.width = unit(1, units = "mm"),
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) +
scale_x_continuous(limits=c(-11, 20)) +
scale_y_continuous(limits=c(-8,10.5))
ggsave("figs/our_data/female-pca-by-cluster-our-data.pdf",plot=our_sample_pca, height = 84, width = 84, units = 'mm')
c19_sample_pca <- ggplot(our_data_pca_df %>% filter(cell_line=="C19XX"), mapping = aes(x=PC1, y=PC2)) +
stat_ellipse(data = female_sig_sites_pca_df %>% filter(cell_line != "WA09"), mapping = aes(x=PC1, y=PC2, fill=as.factor(python_clusters)), alpha=.2, size=.2, geom="polygon") +
geom_point(aes(color=as.factor(cluster), shape=(cell_line == "WA09")), size=.6) +
geom_text_repel(aes(label=passage, color=as.factor(cluster)), size=2, min.segment.length = 0, segment.size = .15, show.legend = F) +
theme_bw() +
labs(x=female_no_h9_sig_sites_pc1_label,
y=female_no_h9_sig_sites_pc2_label,
title = "PCA of C19 Female Samples",
color="Cluster")+
base_plot_theme +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
scale_color_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
scale_fill_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000", guide=F) +
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(color=guide_legend(nrow=1)) +
theme(legend.key.width = unit(1, units = "mm"),
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) +
scale_x_continuous(limits=c(-11, 20), expand = c(0,0)) +
scale_y_continuous(limits=c(-8,10.5), expand = c(0,0))
ggsave("figs/main/fig1_female-pca-by-cluster-c19only.pdf",plot=c19_sample_pca, height = 38, width = 72, units = 'mm')
c23_sample_pca <- ggplot(our_data_pca_df %>% filter(cell_line=="C23XX"), mapping = aes(x=PC1, y=PC2)) +
stat_ellipse(data = female_sig_sites_pca_df %>% filter(cell_line != "WA09"), mapping = aes(x=PC1, y=PC2, fill=as.factor(python_clusters)), alpha=.2, size=.2, geom="polygon") +
geom_point(aes(color=as.factor(cluster), shape=(cell_line == "WA09")), size=.6) +
geom_text_repel(aes(label=passage, color=as.factor(cluster)), size=2, min.segment.length = 0, segment.size = .15, show.legend = F) +
theme_bw() +
labs(x=female_no_h9_sig_sites_pc1_label,
y=female_no_h9_sig_sites_pc2_label,
title = "PCA of C23 Female Samples",
color="Cluster")+
base_plot_theme +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
scale_color_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
scale_fill_manual(limits=as.factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000", guide=F) +
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(color=guide_legend(nrow=1)) +
theme(legend.key.width = unit(1, units = "mm"),
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) +
scale_x_continuous(limits=c(-11, 20), expand = c(0,0)) +
scale_y_continuous(limits=c(-8,10.5), expand = c(0,0))
ggsave("figs/main/fig1_female-pca-by-cluster-c23only.pdf",plot=c23_sample_pca, height = 38, width = 72, units = 'mm')
# percent completion plots
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
demethylation_dmps <- all_dmp_df %>%
filter(delta_increasing == F & passes_threshold == T & hg38_chr == "chrX")
changing_x_dmps <- all_dmp_df %>%
filter(passes_threshold == T & hg38_chr == "chrX")
percent_completion_df <- our_data_beta_df %>%
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>%
right_join(changing_x_dmps %>% select(rowname,
cluster_0_mean,
cluster_1_mean,
cluster_2_mean,
cluster_3_mean,
cluster_4_mean,
cluster_5_mean,
key_transition,
passes_threshold,
delta_increasing,
delta), by=c("rn"="rowname")) %>%
mutate(sample_delta=case_when(key_transition==1 ~ beta_value - cluster_0_mean,
key_transition==2 ~ beta_value - cluster_1_mean,
key_transition==3 ~ beta_value - cluster_2_mean,
key_transition==4 ~ beta_value - cluster_3_mean,
key_transition==5 ~ beta_value - cluster_4_mean),
as_predicted=case_when(delta_increasing ~ sample_delta >= delta*.9,
delta_increasing == F ~ sample_delta <= delta*.9)) %>%
drop_na() %>%
group_by(key_transition, sample_name, delta_increasing) %>%
summarise(percent_as_predicted=sum(as_predicted)/n(), as_predicted=sum(as_predicted), total=n()) %>%
left_join(continuous_cluster_df, by="sample_name")
percent_completion_df_grouped <- our_data_beta_df %>%
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>%
left_join(continuous_cluster_df, by="sample_name") %>%
filter(sample_name != "C6XX") %>%
group_by(cluster, rn) %>%
summarise(our_mean_beta=mean(beta_value)) %>%
mutate(cluster_name=sprintf("our_cluster_%d_mean",cluster)) %>%
reshape2::dcast(rn~cluster_name, value.var="our_mean_beta") %>%
right_join(changing_x_dmps %>% select(rowname,
cluster_0_mean,
cluster_1_mean,
cluster_2_mean,
cluster_3_mean,
cluster_4_mean,
cluster_5_mean,
key_transition,
passes_threshold,
delta_increasing,
delta), by=c("rn"="rowname")) %>%
filter(passes_threshold==T) %>%
mutate(our_delta=case_when(key_transition==1 ~ our_cluster_1_mean-our_cluster_0_mean,
key_transition==2 ~ our_cluster_2_mean-our_cluster_1_mean),
sign_matches=delta*our_delta > 0,
delta_magnitude_matches=case_when(delta_increasing ~ our_delta >= delta*.9,
!delta_increasing ~ our_delta <= delta*.9)) %>%
drop_na() %>%
group_by(key_transition, delta_increasing) %>%
summarise(percent_as_predicted_sign=sum(sign_matches)/n(),
as_predicted_sign=sum(sign_matches),
percent_as_predicted_magnitude=sum(delta_magnitude_matches)/n(),
as_predicted_magnitude=sum(delta_magnitude_matches),
total=n())
#normalize by cluster 0 samples
cluster_0_mean_completion <- percent_completion_df %>%
filter(cluster==0) %>%
group_by(cell_line, delta_increasing, key_transition) %>%
summarise(mean_cluster0_percent=min(percent_as_predicted))
percent_completion_df <- percent_completion_df %>%
left_join(cluster_0_mean_completion, by=c("cell_line", "delta_increasing", "key_transition")) %>%
mutate(cluster_0_adjusted_percent_as_predicted=percent_as_predicted-mean_cluster0_percent)
library(ggsci)
percent_demethylated_plot <- ggplot(percent_completion_df %>% filter(cell_line %in% c("C19XX", "C23XX") & delta_increasing == F), aes(x=passage, y=percent_as_predicted, color=as.factor(key_transition))) +
geom_line() +
geom_point(size=.5) +
facet_grid(rows=vars(cell_line)) +
scale_color_futurama() +
labs(title = "Fraction of Transition Specific DMPs Demethylated",
y="Fraction of Transition-Specific DMPs Demethylated",
x="Passage Number",
color = "Transition") +
theme_bw() +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/our_data/percent_demethylated_plot.pdf", percent_completion_plot, width = 174/3, height=80, units = "mm")
percent_demethylated_plot_cluster0_norm <- ggplot(percent_completion_df %>% filter(cell_line %in% c("C19XX", "C23XX") & delta_increasing == F), aes(x=passage, y=cluster_0_adjusted_percent_as_predicted, color=as.factor(key_transition))) +
geom_line() +
geom_point(size=.5) +
facet_grid(rows=vars(cell_line)) +
scale_color_futurama() +
labs(title = "Fraction of Transition Specific DMPs Demethylated Adjusted to cluster A",
y="Fraction of Transition-Specific DMPs Demethylated",
x="Passage Number",
color = "Transition") +
theme_bw() +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/our_data/percent_demethylated_plot_norm_to_clusterA.pdf", percent_demethylated_plot_cluster0_norm, width = 174/3, height=80, units = "mm")
percent_methylated_plot <- ggplot(percent_completion_df %>% filter(cell_line %in% c("C19XX", "C23XX") & delta_increasing == T), aes(x=passage, y=percent_as_predicted, color=as.factor(key_transition))) +
geom_line() +
geom_point(size=.5) +
facet_grid(cols=vars(cell_line)) +
scale_color_futurama() +
labs(title = "Fraction of Transition Specific DMPs Methylated",
y="Fraction of Transition-Specific DMPs Methylated",
x="Passage Number",
color = "Transition") +
theme_bw() +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"),
)
ggsave("figs/our_data/percent_methylated_plot.pdf", percent_completion_plot, width = 174/3, height=80, units = "mm")
cluster_0_demethylated_probes_c19 <- (our_data_beta_df %>%
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>%
right_join(changing_x_dmps %>% select(rowname,
cluster_0_mean,
cluster_1_mean,
cluster_2_mean,
cluster_3_mean,
cluster_4_mean,
cluster_5_mean,
key_transition,
passes_threshold,
delta_increasing,
delta), by=c("rn"="rowname")) %>%
mutate(sample_delta=case_when(key_transition==1 ~ beta_value - cluster_0_mean,
key_transition==2 ~ beta_value - cluster_1_mean,
key_transition==3 ~ beta_value - cluster_2_mean,
key_transition==4 ~ beta_value - cluster_3_mean,
key_transition==5 ~ beta_value - cluster_4_mean),
as_predicted=case_when(delta_increasing ~ sample_delta >= delta*.9,
delta_increasing == F ~ sample_delta <= delta*.9)) %>%
drop_na() %>%
left_join(continuous_cluster_df, by="sample_name") %>%
filter(cluster == 0 & delta_increasing == F & as_predicted == T & cell_line == "C19XX"))$rn
cluster_0_demethylated_probes_c23 <- (our_data_beta_df %>%
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>%
right_join(changing_x_dmps %>% select(rowname,
cluster_0_mean,
cluster_1_mean,
cluster_2_mean,
cluster_3_mean,
cluster_4_mean,
cluster_5_mean,
key_transition,
passes_threshold,
delta_increasing,
delta), by=c("rn"="rowname")) %>%
mutate(sample_delta=case_when(key_transition==1 ~ beta_value - cluster_0_mean,
key_transition==2 ~ beta_value - cluster_1_mean,
key_transition==3 ~ beta_value - cluster_2_mean,
key_transition==4 ~ beta_value - cluster_3_mean,
key_transition==5 ~ beta_value - cluster_4_mean),
as_predicted=case_when(delta_increasing ~ sample_delta >= delta*.9,
delta_increasing == F ~ sample_delta <= delta*.9)) %>%
drop_na() %>%
left_join(continuous_cluster_df, by="sample_name") %>%
filter(cluster == 0 & delta_increasing == F & as_predicted == T & cell_line == "C23XX"))$rn
percent_completion_df <- our_data_beta_df %>%
reshape2::melt(id.vars="rn", variable.name="sample_name", value.name="beta_value") %>%
right_join(changing_x_dmps %>% select(rowname,
cluster_0_mean,
cluster_1_mean,
cluster_2_mean,
cluster_3_mean,
cluster_4_mean,
cluster_5_mean,
key_transition,
passes_threshold,
delta_increasing,
delta), by=c("rn"="rowname")) %>%
mutate(sample_delta=case_when(key_transition==1 ~ beta_value - cluster_0_mean,
key_transition==2 ~ beta_value - cluster_1_mean,
key_transition==3 ~ beta_value - cluster_2_mean,
key_transition==4 ~ beta_value - cluster_3_mean,
key_transition==5 ~ beta_value - cluster_4_mean),
as_predicted=case_when(delta_increasing == T ~ sample_delta >= delta*.9,
delta_increasing == F ~ sample_delta <= delta*.9)) %>%
drop_na() %>%
left_join(continuous_cluster_df, by="sample_name") %>%
filter((cell_line == "C19XX" & !(rn %in% cluster_0_demethylated_probes_c19)) | (cell_line == "C23XX" & !(rn %in% cluster_0_demethylated_probes_c23))) %>%
group_by(key_transition, sample_name, delta_increasing, cell_line, cluster, passage) %>%
summarise(percent_as_predicted=sum(as_predicted)/n())
percent_demethylated_plot_noclusterAprobes <- ggplot(percent_completion_df %>% filter(cell_line %in% c("C19XX", "C23XX") & delta_increasing == F), aes(x=passage, y=percent_as_predicted, color=as.factor(key_transition))) +
geom_line() +
geom_point(size=.5) +
facet_grid(cols=vars(cell_line)) +
scale_color_futurama() +
labs(title = "Fraction of Transition-Specific DMPs Demethylated",
y="Fraction of DMPs Demethylated",
x="Passage Number",
color = "Transition") +
theme_bw() +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"),
strip.text.x = element_text(size=6, margin=margin(t=1, b=1)),
strip.background.x = element_rect())
ggsave("figs/main/fig1_percent_demethylated_plot_noclusterAdemethylatedprobes.pdf", percent_demethylated_plot_noclusterAprobes, width = 72, height=42, units = "mm")
# run differential methylation analysis
array_registry_df <- readxl::read_excel("E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx")
samples_in_order <- c("ar 52", "ar 54", "ar 56",
"ar 58", "ar 60", "ar 62",
"ar 64", "ar 66")
data_directory <- "E:/pinterlab/x_erosion_data/first_eight_samples"
basename_id <- "204668830015"
first_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)
#second batch of array
samples_in_order <- c("ar 47", "ar 48", "ar72", "ar73",
"ar74", "ar75", "ar76", "ar77")
data_directory <- "E:/pinterlab/x_erosion_data/second_eight_samples"
basename_id <- "204792210038"
second_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)
#third batch of array
samples_in_order <- c("ar86", "ar87", "ar88",
"ar89", "ar90", "ar91",
"ar92", "ar93")
data_directory <- "E:/pinterlab/x_erosion_data/third_eight_samples"
basename_id <- "205003700119"
third_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)
sample_info_df <- rbind(first_batch_samples_df, second_batch_samples_df, third_batch_samples_df) %>%
left_join(continuous_cluster_df, by=c("description"="sample_name", "cell_line", "passage"))
RGset <- read.metharray.exp(targets = sample_info_df)
mset <- preprocessIllumina(RGset)
gmset <- mapToGenome(mset)
beta_mat_850k <- getBeta(gmset)
colnames(beta_mat_850k) <- sample_info_df[match(colnames(beta_mat_850k), sample_info_df$basename_nodir),]$Sample_Name
write_csv(as.data.frame(beta_mat_850k) %>% rownames_to_column(var="rn"), "our_data_beta_mat_850k.csv.gz")
site_annotation_850k <- getAnnotation(gmset)
write_csv(site_annotation_850k %>% as.data.frame() %>% rownames_to_column(), "site_annotation_850k.csv.gz")
our_data_dmp_dir <- 'dmp_between_our_sample_clusters'
dir.create(our_data_dmp_dir)
all_dmp_df <- data.frame()
transition_columns <- c()
our_data_beta_mat_850k <- read_csv("our_data_beta_mat_850k.csv.gz") %>% column_to_rownames(var="rn")
site_annotation_850k <- read_csv("site_annotation_850k.csv.gz")
cluster_means <- tibble(rowname=character())
for (n2 in min(sample_info_df$cluster):(max(sample_info_df$cluster))) {
print(n2)
cluster_samples <- filter(sample_info_df, cluster %in% c(n2))$Sample_Name
mean_df <- as.data.frame(rowMeans(na.omit(our_data_beta_mat_850k[cluster_samples]))) %>%
rownames_to_column() %>%
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2)
cluster_means <- full_join(cluster_means, mean_df, by="rowname")
}
for (n in min(sample_info_df$cluster):(max(sample_info_df$cluster)-1)) {
print(sprintf('comparing clusters %d and %d', n, n+1))
cluster_1_group <- filter(sample_info_df, cluster <= n)$Sample_Name
cluster_2_group <- filter(sample_info_df, cluster > n)$Sample_Name
temp_cluster_data <- sample_info_df #column_to_rownames(clusters_pData, var="name")
temp_cluster_data[temp_cluster_data$Sample_Name %in% cluster_1_group, 'cluster'] <- 0
temp_cluster_data[temp_cluster_data$Sample_Name %in% cluster_2_group, 'cluster'] <- 1
clusters_pair <- temp_cluster_data$Sample_Name
subset_beta <- na.omit(our_data_beta_mat_850k[,clusters_pair])
subset_clusters <- as.matrix(temp_cluster_data %>% dplyr::select(Sample_Name, cluster) %>% column_to_rownames(var="Sample_Name"))
subset_clusters[1] <- as.factor(subset_clusters[1])
# subset_clusters <- as.matrix(temp_cluster_data[clusters_pair,])
# subset_clusters[1] <- as.factor(subset_clusters[1])
# row.names(subset_clusters) <- clusters_pair
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical")
# subset_beta_mat <- as.matrix(subset_beta)
# row.names(subset_beta_mat) <- rownames(subset_beta)
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname")
dmp_df <- left_join(dmp_df, site_annotation_850k %>% select(rowname,
hg19_chr=chr,
hg19_pos=pos,
hg19_gene=UCSC_RefGene_Name,
hg19_refgene_group=UCSC_RefGene_Group), by="rowname")
dmp_df <- left_join(dmp_df, cluster_means, by="rowname")
write.csv(dmp_df,
file = sprintf('%s/dmp_table_%d_%d.csv',our_data_dmp_dir, n, n+1),
quote = FALSE)
if (n + 1 == 1) {
all_dmp_df <- dmp_df %>% select(rowname,
hg19_chr,
hg19_pos,
hg19_gene,
hg19_refgene_group,
matches("cluster_._mean"))
}
all_dmp_df <- full_join(all_dmp_df, dmp_df %>% select(rowname, !!sprintf('transition_%s_qval', n + 1):=qval), by="rowname")
transition_columns <- append(transition_columns, sprintf('transition_%s_qval', n + 1))
}
library(radiant.data)
q_val_threshold <- .01
all_dmp_df <- all_dmp_df %>%
dplyr::mutate(key_transition=case_when(transition_1_qval < transition_2_qval ~ 1,
transition_2_qval < transition_1_qval ~ 2),
passes_threshold=case_when(key_transition==1 ~ transition_1_qval <= q_val_threshold,
key_transition==2 ~ transition_2_qval <= q_val_threshold),
delta=case_when(key_transition==1 ~ cluster_1_mean - cluster_0_mean,
key_transition==2 ~ cluster_2_mean - cluster_1_mean),
delta_increasing=delta>0)
write_csv(all_dmp_df, "our_data_all_dmp_df.csv.gz")
our_data_all_dmp_df <- read_csv("our_data_all_dmp_df.csv.gz")
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
#heatmap of XIST and FIRRE
our_data_beta_mat_850k <- read_csv("our_data_beta_mat_850k.csv.gz")
desired_genes <- c("XIST", "FIRRE")
gene_order <- c(F,T)
subset_heatmap_df <- inner_join(our_data_beta_mat_850k,
dplyr::select(all_dmp_df, rowname, hg38_gene, hg38_pos, hg38_refgene_group), by=c("rn"="rowname")) %>%
filter(hg38_gene %in% desired_genes)
subset_heatmap_df <- arrange(subset_heatmap_df, hg38_pos)
ordered_sites <- subset_heatmap_df$rn
ordered_sites <- c()
for (i in 1:length(desired_genes)) {
g <- desired_genes[i]
o <- gene_order[i]
if(o == T) {
ordered_sites <- c(ordered_sites, arrange(subset_heatmap_df %>% filter(hg38_gene == g), hg38_pos)$rn)
}else {
ordered_sites <- c(ordered_sites, arrange(subset_heatmap_df %>% filter(hg38_gene == g), desc(hg38_pos))$rn)
}
}
subset_heatmap_df <- dplyr::select(subset_heatmap_df, -hg38_pos)
# sample_clusters_df <- read_csv("saved_sample_clusters.csv",
# col_names = c('name','cluster')) %>% arrange(cluster)
ordered_sample_names <- (continuous_cluster_df %>% filter(cell_line %in% c("C19XX", "C23XX")) %>% arrange(desc(cell_line), passage))$sample_name
ordered_sample_names <- c(ordered_sample_names[12], ordered_sample_names[1:11],ordered_sample_names[13:22])
subset_heatmap_df <- reshape2::melt(subset_heatmap_df, id.vars=c("rn", "hg38_gene")) %>% dplyr::rename(site=rn, sample_name=variable)
subset_heatmap_df <- inner_join(subset_heatmap_df, continuous_cluster_df %>% dplyr::select(sample_name, cell_line, passage, sample_cluster=cluster), by=c("sample_name")) %>%
filter(cell_line %in% c("C19XX", "C23XX"))
convert_int_to_letters <- Vectorize(vectorize.args = "c",
FUN = function(c) {
switch(c+1,
"A",
"B",
"C",
"D",
"E",
"F",
)
})
subset_heatmap_df <- subset_heatmap_df %>% mutate(sample_cluster=convert_int_to_letters(sample_cluster), value=as.numeric(value))
subset_heatmap <- ggplot(subset_heatmap_df, aes(x=factor(sample_name, levels = ordered_sample_names),
y=factor(site, levels = ordered_sites), fill=value)) +
geom_tile() +
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") +
facet_grid(factor(hg38_gene, levels=desired_genes) ~ sample_cluster,space="free_x", scales = "free", shrink=T, switch = "y") +
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE",
midpoint = .5,
limits=c(0,1),
breaks=c(0,.5,1),
labels=c("unmethylated\n??=0", "50% methylation\n??=0.5", "methylated\n??=1")) +
labs(x="Female Samples",
y="",
fill="DNA Methylation") +
theme(axis.text=element_blank(),
#axis.text.x=element_text(angle=90),
axis.ticks=element_blank(),
legend.position="top",
legend.justification = "center",
# legend.title = element_text(size = 6, vjust = .8),
# legend.text = element_text(size = 6),
# legend.position="top",
# legend.justification = "right",
legend.title = element_text(size = 4, vjust = .8),
legend.text = element_text(size = 4),
legend.key.height = unit(2.5, "mm"),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0.2, "mm"),
# strip.background.x = element_rect(color="black"),
strip.text.y = element_blank(),
# strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
axis.title = element_text(size=7))
ggsave("figs/our_data/xist_firre_heatmap_our_data_450K probes.svg", subset_heatmap, width=15, height = 65, units = "mm")
# plot dmp changes with our data
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
our_data_beta_mat_850k <- read_csv("our_data_beta_mat_850k.csv.gz")
site_annotation_850k <- read_csv("site_annotation_850k.csv.gz")
x_annotation <- site_annotation_850k %>% filter(chr == "chrX")
continuous_cluster_df <- read_csv("continuous_cluster_df.csv")
our_data_x_chr_df <- our_data_beta_mat_850k %>%
filter(rn %in% x_annotation$rowname) %>%
reshape2::melt(id.vars=c("rn"), variable.name="sample_name") %>%
left_join(x_annotation %>% dplyr::select(rowname, chr, pos, strand, UCSC_RefGene_Group, UCSC_RefGene_Name), by=c("rn"="rowname")) %>%
left_join(continuous_cluster_df, by="sample_name") %>%
filter(cell_line != "C6XX")
c19_data_x_chr_df <- our_data_x_chr_df %>%
filter(cell_line == "C19XX")
c19_first_passage <- c19_data_x_chr_df %>%
filter(passage == min(passage)) %>%
dplyr::select(rn, first_passage_value=value)
c19_data_x_chr_df <- c19_data_x_chr_df %>%
left_join(c19_first_passage, by="rn") %>%
group_by(rn) %>%
arrange(passage) %>%
mutate(last_passage_value=dplyr::lag(value)) %>%
ungroup()
ggplot(c19_data_x_chr_df, aes(x=pos, y=value-first_passage_value, color=value-first_passage_value<0)) +
geom_line() +
scale_color_aaas() +
theme_bw() +
facet_grid(rows=vars(passage))
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz")
ggplot(c19_data_x_chr_df %>% filter(value-last_passage_value < -0.1), aes(x=pos, y=value-last_passage_value, color=value-last_passage_value<0)) +
geom_vline(data = mean_methylation_df_escape, mapping=aes(xintercept=hg19_pos), color="lightgrey", size=.3) +
geom_point(aes(alpha=abs(value-last_passage_value)), size=.7) +
scale_color_aaas() +
scale_alpha_continuous(range = c(.2,1)) +
theme_classic() +
facet_grid(rows=vars(passage)) +
labs(title="C19XX Progressive Changes")
ggsave("figs/our_data/c19_progressive_changes.pdf")
ggsave("figs/our_data/c19_progressive_changes_alt.pdf")
ggsave("figs/our_data/c19_progressive_changes_alt3.pdf")
c23_data_x_chr_df <- our_data_x_chr_df %>%
filter(cell_line == "C23XX")
c23_data_x_chr_df <- c23_data_x_chr_df %>%
group_by(rn) %>%
arrange(passage) %>%
mutate(last_passage_value=dplyr::lag(value)) %>%
ungroup()
ggplot(c23_data_x_chr_df %>% filter(abs(value-last_passage_value) >= .05), aes(x=pos, y=value-last_passage_value, color=value-last_passage_value<0)) +
geom_line() +
scale_color_aaas() +
theme_bw() +
facet_grid(rows=vars(passage)) +
labs(title="C23XX Progressive Changes")
ggsave("figs/our_data/c23_progressive_changes.pdf")
#our data heatmap
our_data_beta_df <- read_csv("our_data_beta_mat_850k.csv.gz")
all_dmp_df <- read_csv('D:/pinterlab/x_erosion/all_dmp_df.csv.gz')
# do this based on genes and when the promoters change their methylation.
x_annotation_df <- read_csv("D:/pinterlab/x_erosion/x_annotation.csv.gz")
statistics <- read_csv('D:/pinterlab/x_erosion/variance_statistics_noh9_nooutliers.csv.gz')
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)]
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname)
heatmap_df <- inner_join(our_data_beta_df %>% filter(rn %in% x_annotation_df$rowname),
dplyr::select(all_dmp_df, rowname, key_transition, passes_threshold, delta_increasing, hg38_refgene_group), by=c("rn"="rowname")) %>%
mutate(cluster=case_when(!passes_threshold ~ 10,
delta_increasing ~ key_transition - 1,
!delta_increasing ~ (key_transition-1)+5))
heatmap_df <- arrange(heatmap_df, desc(passes_threshold), desc(delta_increasing), desc(key_transition))
ordered_sites <- heatmap_df$rn
heatmap_df <- dplyr::select(heatmap_df, -key_transition, -passes_threshold, -delta_increasing, -hg38_refgene_group)
continous_clusters_df <- read_csv("continuous_cluster_df.csv") %>%
filter(cell_line %in% c("C19XX", "C23XX"))
ordered_sample_names <- continous_clusters_df %>% arrange(desc(cell_line), passage) %>% pull(sample_name)
ordered_sample_names <- c(ordered_sample_names[12], ordered_sample_names[1:11],ordered_sample_names[13:22])
heatmap_df <- reshape2::melt(heatmap_df, id.vars=c("rn", "cluster"), variable.name="sample_name") %>% dplyr::rename(site=rn) %>%
right_join(continous_clusters_df %>% dplyr::rename(sample_cluster=cluster), by="sample_name")
# only our data
sigvar_heatmap_plot_our_data_only <- ggplot(heatmap_df %>% filter(site %in% sig_x_sites & grepl("C19XX|C23XX", sample_name)), aes(x=factor(sample_name, levels = ordered_sample_names),
y=factor(site, levels = ordered_sites), fill=value)) +
geom_tile() +
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){
return(list(
"0"="1↑",
"1"="2↑",
"2"="3↑",
"3"="4↑",
"4"="5↑",
"5"="1↓",
"6"="2↓",
"7"="3↓",
"8"="4↓",
"9"="5↓",
"10"="~"
)[c])
})) +
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) +
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+
labs(x="Female Samples",
y="CpG Sites with Greater Variance in Female Samples",
fill="DNA Methylation") +
theme(axis.text.y=element_blank(),
axis.text.x = element_blank(), #element_text(angle = 90, hjust=1, size=5),
axis.ticks=element_blank(),
legend.position="top",
legend.justification = "center",
legend.title = element_text(size = 6, vjust = .75),
legend.text = element_text(size = 6),
# plot.background = element_rect(fill = "lightgrey"),
# panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0, "mm"),
strip.background.x = element_rect(color="black"),
strip.background.y = element_blank(),
strip.text.x = element_text(size=6),
strip.text.y = element_blank(),
axis.title.x = element_text(size=8),
axis.title.y=element_blank())
ggsave("figs/our_data/our_data_heatmap.svg", sigvar_heatmap_plot_our_data_only, height = 236, width = 15, units = "mm")
our_data_dmp_df <- read_csv("our_data_all_dmp_df.csv.gz")
threshold <- .05
our_data_dmp_df <- our_data_dmp_df %>%
mutate(passes_threshold=case_when(key_transition == 1 ~ transition_1_qval <= threshold,
key_transition == 2 ~ transition_2_qval <= threshold))
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz")
#hg19_pos was manually annotated for each gene
escapee_pos_df <- mean_methylation_df_escape %>%
transmute(hg38_gene_name,
delta_ipsc=hg19_pos,
delta_esc=hg19_pos,
delta_H9=hg19_pos) %>%
reshape2::melt(id.vars=c("hg38_gene_name"),
value.name="TSS",
variable.name="type")
our_data_dmps <- ggplot(our_data_dmp_df %>% filter(passes_threshold & hg19_chr == "chrX"), aes(x=hg19_pos, y=delta)) +
# geom_point(aes(alpha=abs(delta)), size=.4) +
geom_segment(mapping=aes(x=hg19_pos,xend=hg19_pos,y=0, yend=delta, color=as.factor(key_transition)), alpha=.1) +
geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="darkgrey", size=.3, alpha=.8) +
labs(y="",
x="X Chr Coordinate (Mb)") +
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) +
scale_color_manual(limits=c("1", "2", "3", "4", "5"),
values=c(pal_d3()(5)), guide=F) +
scale_alpha(guide=F)+
# facet_grid(rows=vars(key_transition)) +
facet_grid(rows=vars(as.factor(key_transition)), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2")[n])
})) +
labs(title="DMP analysis with our samples",
color="Delta Increasing") +
theme_bw() +
base_plot_theme +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3),
title = element_blank(),
axis.title.y = element_blank(),
axis.text.y=element_text(size=5),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.background = element_blank(),
panel.spacing.y = unit(.75, "mm"))
ggsave("figs/our_data/our_data_dmp_alt.pdf", our_data_dmps, width=174, height = 20, units="mm")
# calculate positional correlation between meta-analysis DMP and our DMPs
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
our_data_dmp_df <- read_csv("our_data_all_dmp_df.csv.gz")
h3k27me3_peaks <- read_table2("bedgraphs/GSM1528885_H9_XISTpos_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal"))
h3k27me3_peaks_chrX <- h3k27me3_peaks %>% filter(chr == "chrX")
h3k27me3_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k27me3_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
h3k9_xi_peaks <- read_table2("bedgraphs/GSM1528888_H9_XISTpos_K9me3.peaks.bed.gz", col_names=c("chr", "start", "end", "signal"))
h3k9_xi_peaks_chrX <- h3k9_xi_peaks %>% filter(chr == "chrX")
h3k9_xi_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k9_xi_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
transition_correlations <- tibble(transition=numeric(),
correlation=numeric(),
correlation_p_val=numeric(),
h3k27me3_correlation=numeric(),
h3k27me3_correlation_p_val=numeric(),
h3k9me3_correlation=numeric(),
h3k9me3_correlation_p_val=numeric())
for (t in 1:2) {
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t))
our_bed_df <- our_data_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t))
averaged_distance_metaanalysis <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T)
averaged_distance_our_data <- window_average_chrX(bed_file_df=our_bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T)
cor_test <- cor.test(averaged_distance_metaanalysis$delta, averaged_distance_our_data$delta)
k27_cor_test <- cor.test(abs(averaged_distance_our_data$delta), h3k27me3_peaks_chrX_windowed$delta)
k9_cor_test <- cor.test(abs(averaged_distance_our_data$delta), h3k9_xi_peaks_chrX_windowed$delta)
transition_correlations <- add_row(transition_correlations,
transition=t,
correlation=cor_test$estimate,
correlation_p_val=cor_test$p.value,
h3k27me3_correlation=k27_cor_test$estimate,
h3k27me3_correlation_p_val=k27_cor_test$p.value,
h3k9me3_correlation=k9_cor_test$estimate,
h3k9me3_correlation_p_val=k9_cor_test$p.value)
}
#histone correlations
transition__histone_correlations <- tibble(transition=numeric(),
correlation=numeric(),
correlation_p_val=numeric())
for (t in 1:2) {
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t))
our_bed_df <- our_data_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t) & (rowname %in% all_dmp_df$rowname))
averaged_distance_metaanalysis <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T)
averaged_distance_our_data <- window_average_chrX(bed_file_df=our_bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T)
cor_test <- cor.test(averaged_distance_metaanalysis$delta, averaged_distance_our_data$delta)
transition_correlations <- add_row(transition_correlations,
transition=t,
correlation=cor_test$estimate,
correlation_p_val=cor_test$p.value)
}
# qPCR data
qpcr_dat <- read_csv("e:/pinterlab/x_erosion_data/qpcr_data.csv")
ggplot(qpcr_dat %>% mutate(cell_line=case_when(cell_line=="C19" ~ "C19 XIST qPCR", T~"C23 XIST qPCR")), aes(x=factor(passage, levels=c("p8","p11","p12","p13","p15","p17","p19","p20","p22")), y=`%TBP`)) +
facet_grid(cols=vars(cell_line), scales = "free_x") +
geom_boxplot(size=.2, fill="grey") +
geom_point(size=.4) +
labs(title="XIST qPCR",
x="Passage",
y="%TBP") +
theme_bw() +
base_plot_theme +
theme(strip.text = element_text(size=6, margin=margin(t=1, b=1), face = "bold"), strip.background = element_blank(), plot.title = element_blank())
ggsave("figs/supplementary/supp1_xist_qpcr_boxplot.pdf", width=80, height=30, units="mm")
qpcr_dat_col <- qpcr_dat %>%
group_by(cell_line, passage) %>%
summarise(mean_exp=mean(`%TBP`),
sd=sd(`%TBP`))
ggplot(qpcr_dat_col, aes(x=factor(passage, levels=c("p8","p11","p12","p13","p15","p17","p19","p20","p22")), y=mean_exp)) +
facet_grid(cols=vars(cell_line), scales = "free_x") +
geom_bar(size=.2, stat="identity") +
geom_errorbar(aes(ymin=mean_exp-sd, ymax=mean_exp+sd), width=.5, size=.2) +
labs(title="XIST qPCR",
x="Passage",
y="%TBP") +
theme_bw() +
base_plot_theme +
theme(strip.text = element_text(size=6, margin=margin(t=1, b=1)))
ggsave("figs/supplementary/supp1_xist_qpcr_barplot.pdf", width=80, height=40, units="mm")
# checking copy number compared to H9s using DNAme
# library(stringi)
# datasets_df <- read.csv("datasets.csv.gz") %>% filter(Name=="Boscolo")
#
# for (row in 1:nrow(datasets_df)) {
# print(row)
# name <- as.character(datasets_df$Name[row])
# basedir <- as.character(datasets_df$Directory[row])
# datatype <- as.character(datasets_df$Type[row])
# sample_metadata_file <- as.character(datasets_df$SampleInfoFile[row])
# array_size <- datasets_df$ArraySize[row]
# if (array_size == 450) {
# array_type <- "IlluminaHumanMethylation450k"
# } else if (array_size == 850) {
# array_type <- "IlluminaHumanMethylationEPIC"
# }
#
#
# if (datatype == "geo_raw") {
# sample_info_df <- read.csv(file.path(basedir, sample_metadata_file), stringsAsFactors = FALSE, row.names = 1)
# samples_to_keep <- sample_info_df[sample_info_df$Keep %in% c("TRUE"), ]
# raw_filename <- file.path(basedir, as.character(datasets_df$DataFile[row]))
# u_name <- as.character(datasets_df$Uname[row])
# m_name <- as.character(datasets_df$Mname[row])
# separator <- stri_unescape_unicode(as.character(datasets_df$Separator[row]))
# gmset <- readGEORawFile(
# filename = raw_filename,
# Uname = u_name,
# Mname = m_name,
# array = array_type,
# sep = separator
# )
# gmset <- readGEORawFile(
# filename = raw_filename,
# Uname = u_name,
# Mname = m_name,
# array = array_type,
# sep = separator,
# pData = sample_info_df[sampleNames(gmset), ]
# )
# } else if (datatype == "idat") {
# targets <- as.data.frame(read.metharray.sheet(basedir, pattern = sample_metadata_file))
# targets_to_keep <- subset(targets, Keep == TRUE)
#
# RGset <- read.metharray.exp(targets = targets_to_keep)
# gmset <- preprocessIllumina(RGset)
# gmset <- mapToGenome(gmset)
# }
#
# sampleNames(gmset) <- paste(name, gmset$accession, gmset$cell_type, gmset$cell_line, gmset$gender,
# paste("p", gmset$passage, sep = ""),
# sep = "_"
# )
#
#
# plot_group <- data.frame(group = name, sample_name = sampleNames(gmset), row.names = "sample_name")
# if (row == 1) {
# combined_arrays <- gmset
# plot_groups_combined <- plot_group
# } else {
# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k")
# plot_groups_combined <- rbind(plot_groups_combined, plot_group)
# }
# }
array_registry_df <- readxl::read_excel("E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx")
samples_in_order <- c("ar 52", "ar 54", "ar 56",
"ar 58", "ar 60", "ar 62",
"ar 64", "ar 66")
data_directory <- "E:/pinterlab/x_erosion_data/first_eight_samples"
basename_id <- "204668830015"
first_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)
#second batch of array
samples_in_order <- c("ar 47", "ar 48", "ar72", "ar73",
"ar74", "ar75", "ar76", "ar77")
data_directory <- "E:/pinterlab/x_erosion_data/second_eight_samples"
basename_id <- "204792210038"
second_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)
#third batch of array
samples_in_order <- c("ar86", "ar87", "ar88",
"ar89", "ar90", "ar91",
"ar92", "ar93")
data_directory <- "E:/pinterlab/x_erosion_data/third_eight_samples"
basename_id <- "205003700119"
third_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)
continuous_cluster_df <- read_csv("continuous_cluster_df.csv")
sample_info_df <- rbind(first_batch_samples_df, second_batch_samples_df, third_batch_samples_df) %>%
left_join(continuous_cluster_df, by=c("description"="sample_name", "cell_line", "passage"))
#%>%
add_row(cell_line="H9",
Sample_Name="Daily_H9",
Basename="D:/pinterlab/x_erosion/daily_h9_sc/GSM2285135_7970368067_R04C01",
basename_nodir="GSM2285135_7970368067_R04C01")
RGset <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line != "H9"))
mset <- preprocessIllumina(RGset)
sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name
sampleNames(RGset) <- sample_info_df[match(sampleNames(RGset), sample_info_df$basename_nodir),]$Sample_Name
RGset_h9 <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line == "H9"))
mset_h9 <- preprocessIllumina(RGset_h9)
combined_mset <- combineArrays(mset, mset_h9)
# gmset <- mapToGenome(mset)
# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k")
library(conumee)
source("E:/pinterlab/custom_conumee_functions.R")
data(exclude_regions)
anno <- custom.CNV.create_anno(array_type="450k", exclude_regions=exclude_regions)
# sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name
cnv_data <- CNV.load(combined_mset)
minfi.controls <- (pData(mset)$cell_line == "H9")
x <- CNV.fit(cnv_data[sampleNames(combined_mset) == "C19XX iPSC MEFs p19"], cnv_data[pData(combined_mset)$cell_line == "H9"], anno)
x <- CNV.bin(x)
x <- CNV.segment(x)
CNV.genomeplot(x)
#output for GEO
beta_df <- getBeta(gmset)
beta_df <- as.data.frame(beta_df) %>% rownames_to_column(var="rn")
detect_p_df <- detectionP(RGset)
detect_p_df <- as.data.frame(detect_p_df) %>% rownames_to_column(var="rn")
joined_df <- full_join(beta_df, detect_p_df, by="rn", suffix=c("", "pval")) %>%
select(ID_REF=rn,
`C19XX IPSC MEF p8`,
`C19XX IPSC MEF p8pval`,
`C19XX iPSC MEF p11`,
`C19XX iPSC MEF p11pval`,
`C19XX iPSC MEF p13`,
`C19XX iPSC MEF p13pval`,
`C19XX iPSC MEF p17`,
`C19XX iPSC MEF p17pval`,
`C19XX iPSC MEFs p19`,
`C19XX iPSC MEFs p19pval`,
`C19XX iPSC MEFs p21`,
`C19XX iPSC MEFs p21pval`,
`C19XX iPSC MEFs p22`,
`C19XX iPSC MEFs p22pval`,
`C19XX iPSC MEFs p28`,
`C19XX iPSC MEFs p28pval`,
`C19XX iPSC MEFs p29`,
`C19XX iPSC MEFs p29pval`,
`C19XX iPSC MEFs p30`,
`C19XX iPSC MEFs p30pval`,
`C19XX iPSC MEFs p32`,
`C19XX iPSC MEFs p32pval`,
`C23XX iPSC MEF p12`,
`C23XX iPSC MEF p12pval`,
`C23XX iPSC MEF p15`,
`C23XX iPSC MEF p15pval`,
`C23XX iPSC MEF p17`,
`C23XX iPSC MEF p17pval`,
`C23XX iPSC MEF p20`,
`C23XX iPSC MEF p20pval`,
`C23XX iPSC MEFs p22`,
`C23XX iPSC MEFs p22pval`,
`C23XX iPSC MEFs p24`,
`C23XX iPSC MEFs p24pval`,
`C23XX iPSC MEFs p25`,
`C23XX iPSC MEFs p25pval`,
`C23XX iPSC MEFs p31`,
`C23XX iPSC MEFs p31pval`,
`C23XX iPSC MEFs p32`,
`C23XX iPSC MEFs p32pval`,
`C23XX iPSC MEFs p33`,
`C23XX iPSC MEFs p33pval`,
`C23XX iPSC MEFs p35`,
`C23XX iPSC MEFs p35pval`)
write_csv(joined_df, "e:/pinterlab/x_erosion_data/geo_joined_sheet.csv")