From 2e6bcacfa8019b8db5c6e998e2d17eaf411f94e9 Mon Sep 17 00:00:00 2001 From: somethingp Date: Thu, 25 Mar 2021 20:01:29 -0400 Subject: [PATCH] Updated --- analysis_notebook.Rmd | 57 +++++---- analyze_our_methylation_data.R | 225 +++++++++++++++++++++++++++++++-- 2 files changed, 247 insertions(+), 35 deletions(-) diff --git a/analysis_notebook.Rmd b/analysis_notebook.Rmd index 3af2303..770c072 100644 --- a/analysis_notebook.Rmd +++ b/analysis_notebook.Rmd @@ -422,7 +422,9 @@ pca_female_by_group <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) + title = "Studies", color="Study")+ base_plot_theme + - scale_color_futurama() + + scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828", "GSE34982", "GSE73938"), + values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,3,4)]))+ + # scale_color_futurama() + guides(color=guide_legend(nrow=3,byrow=TRUE)) + theme(legend.key.width = unit(1, units="mm"), legend.key.height = unit(1, units="mm"), @@ -599,7 +601,8 @@ pca_male_by_group <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) + legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")), legend.box.margin = margin(t=-3,l=-5, unit="mm"), legend.spacing.y=unit(.2, "mm")) + - scale_color_futurama() + scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828"), + values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2)])) pca_male_by_methylation <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) + geom_point(aes(color=meanXMethylation), size=.5, shape=20) + @@ -905,7 +908,7 @@ lm_eqn <- function(m) { global_mean_methylation <- ggplot(means, aes(x=sex, group=sex, y=mean_methylation, fill=sex)) + geom_boxplot(show.legend = F, size=.3, outlier.size = .5, outlier.shape = 20) + geom_signif(map_signif_level=TRUE, comparisons = list(c("female", "male")), size=.3, textsize = 2) + - labs(x="Sex", y="Global Mean β-values", title = "Global Mean Methylation") + + labs(x="Sex", y="Global Mean β-values", title = "Global Mean\nMethylation") + theme_bw() + base_plot_theme + theme(legend.position = "none") + @@ -1117,14 +1120,14 @@ ggsave("figs/main/fig1_variance_density_plot.pdf", variance_density_plot, width= sig_diff_x_v_no_x <- ggplot(statistics, aes(y=alpha, x=(Position %in% x_sites), fill=(Position %in% x_sites))) + geom_boxplot(outlier.shape=NA, size=.4) + - labs(title="Brown-Forsythe\nSignificances", - x="", - y="BF-Test (alpha)") + - scale_x_discrete(labels=c("Autosomal\nProbes", "X\nProbes"), limits=c(FALSE, TRUE)) + + labs(title="Brown-Forsythe\np-values", + x="Chromosome", + y="BF p-value") + + scale_x_discrete(labels=c("Autosomal", "X"), limits=c(FALSE, TRUE)) + scale_fill_grey(start=.5, end=.8, guide=F) + theme_bw() + - base_plot_theme + - theme(axis.title.x=element_blank()) + base_plot_theme #+ + # theme(axis.title.x=element_blank()) positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data @@ -1154,17 +1157,20 @@ unsig_female_beta_vec <- as.vector(as.matrix(unsig_female_beta_mat %>% dplyr::se sig_female_beta_vec <- as.vector(as.matrix(sig_female_beta_df %>% dplyr::select(-rn, -Chromosome, -Position, -Gene))) sig_nonsig_dist_plot <- ggplot() + - geom_violin(aes(x="Sig\nProbes", y=sig_female_beta_vec, fill='female'), trim=FALSE, show.legend = F, size=.3) + - geom_violin(aes(x="Non-sig\nProbes", y=unsig_female_beta_vec, fill='male'), trim=FALSE, show.legend = F, size=.3) + - geom_boxplot(aes(x="Sig\nProbes", y=sig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) + - geom_boxplot(aes(x="Non-sig\nProbes", y=unsig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) + - labs(x="", + geom_violin(aes(x="p<.01", y=sig_female_beta_vec, fill='female'), trim=FALSE, show.legend = F, size=.3) + + geom_violin(aes(x="p>.01", y=unsig_female_beta_vec, fill='male'), trim=FALSE, show.legend = F, size=.3) + + geom_boxplot(aes(x="p<.01", y=sig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) + + geom_boxplot(aes(x="p>.01", y=unsig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) + + labs(x="BF p-values", y='DNAme β-value', - title="β-values of sig. and \nnon-sig. variant sites\nin Female hPSCs") + + #title="β-values of sig. and \nnon-sig. variant sites\nin Female hPSCs") + + title="Probe β-values in Female\nhPSCs by Variance") + + scale_x_discrete(limits=c("p>.01", "p<.01")) + theme_bw() + base_plot_theme + theme(legend.position = "none", - axis.title.x=element_blank()) + + # axis.title.x=element_blank() + ) + scale_fill_futurama() # ggsave('figs/beta value distributions - sig-unsig.png') sig_nonsig_dist_plot @@ -1195,7 +1201,7 @@ data_df <- tibble(mean_x = mean_x_methylation, mean_sig_x = mean_sig_x_methylati correlation <- sprintf("corr = %.2f", cor(mean_x_methylation, mean_sig_x_methylation)) female_x_sigsites <- ggplot(data_df, aes(x=mean_x, y=mean_sig_x)) + geom_point(color=sex_colors[1],size=.5, shape=20) + - labs(x="Mean X DNAme", y="Mean Sig X DNAme", title="Female hPSCs\nSig var X-probes") + + labs(x="Mean X DNAme", y="Mean Sig X DNAme", title="Female hPSCs\nBF p<.01 X-probes") + geom_smooth(method='lm', color="black", size=.5) + # annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) + annotate("text", x=.45, y=.45, label=correlation, size=2) + @@ -1205,7 +1211,7 @@ female_x_sigsites <- ggplot(data_df, aes(x=mean_x, y=mean_sig_x)) + correlation <- sprintf("corr = %.2f", cor(mean_x_methylation, mean_nonsig_x_methylation)) female_x_nonsigsites <- ggplot(data_df, aes(x=mean_x, y=mean_nonsig_x)) + geom_point(color=sex_colors[1],size=.5, shape=20) + - labs(x="Mean X Chr DNAme", y="Mean Non-Sig X DNAme", title="Female hPSCs\nNon-Sig var X-probes") + + labs(x="Mean X Chr DNAme", y="Mean Non-Sig X DNAme", title="Female hPSCs\n BF p>.01 X-probes") + geom_smooth(method='lm', color="black", size=.5) + # annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) + annotate("text", x=.55, y=.6, label=correlation, size=2) + @@ -1569,12 +1575,12 @@ pca_female_no_h9_by_python_6 <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2 # geom_text_repel(female_pca_df %>% filter(cell_line %in% correctly_ordered_by_sig_names & grepl("Nazor_", sample_name)), mapping = aes(x=PC1, y=PC2, label=passage), size=2, color="grey", segment.size=.15, min.segment.length=.25, segment.color="darkgrey") + scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) + theme_bw() + - labs(x=female_no_h9_sig_sites_pc1_label, - y=female_no_h9_sig_sites_pc2_label, + labs(x=female_pc1_label, + y=female_pc2_label, title = "PCA of Female Samples", color="Cluster")+ base_plot_theme + - scale_color_manual(limits=c(0,1,2,3,4,5,NA), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") + + scale_color_manual(limits=factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") + # scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + guides(color=guide_legend(nrow=1)) + theme(legend.key.width = unit(1, units = "mm"), @@ -8023,12 +8029,15 @@ table_1_df <- read_csv("table_1_sample_data_info.csv") %>% `Sample Cluster Assignment` == 5 ~ "F")) studies_in_clusters <- ggplot(table_1_df) + - geom_bar(aes(x=cluster, fill=Source)) + + geom_bar(aes(x=cluster, fill=sprintf("%s (%s)", str_replace(Source, " et al,", ","), `GEO Group Accession`))) + labs(title="Studies in Clusters", x="Sample Cluster", - y="Number of Samples") + + y="Number of Samples", + fill="Source") + theme_bw() + - scale_fill_npg() + + scale_fill_manual(limits=c("Banovich, 2018 (GSE110544)", "Nazor, 2012 (GSE31848)", "Butcher, 2016 (GSE59091)", "Nishizawa, 2016 (GSE60924)", "Zdravkovic, 2015 (GSE72923)", "Salomonis, 2016 (GSE85828)", "Takasawa, 2018 (GSE73938)")[c(1,3,2,4,6,7,5)], + values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,4)])[c(1,3,2,4,6,7,5)])+ + # scale_fill_npg() + base_plot_theme + theme(legend.position="right", legend.justification = "left") diff --git a/analyze_our_methylation_data.R b/analyze_our_methylation_data.R index 62d6456..f657f1d 100644 --- a/analyze_our_methylation_data.R +++ b/analyze_our_methylation_data.R @@ -817,17 +817,6 @@ RGset <- read.metharray.exp(targets = sample_info_df) mset <- preprocessIllumina(RGset) gmset <- mapToGenome(mset) -#copy number stuff -library(conumee) -source("E:/pinterlab/custom_conumee_functions.R") -data(exclude_regions) - -anno <- custom.CNV.create_anno(array_type="EPIC", exclude_regions=exclude_regions) - -sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name -cnv_data <- CNV.load(mset) -x <- CNV.fit(cnv_data, anno) - beta_mat_850k <- getBeta(gmset) colnames(beta_mat_850k) <- sample_info_df[match(colnames(beta_mat_850k), sample_info_df$basename_nodir),]$Sample_Name write_csv(as.data.frame(beta_mat_850k) %>% rownames_to_column(var="rn"), "our_data_beta_mat_850k.csv.gz") @@ -1293,3 +1282,217 @@ ggplot(qpcr_dat_col, aes(x=factor(passage, levels=c("p8","p11","p12","p13","p15" theme(strip.text = element_text(size=6, margin=margin(t=1, b=1))) ggsave("figs/supplementary/supp1_xist_qpcr_barplot.pdf", width=80, height=40, units="mm") + + +# checking copy number compared to H9s using DNAme +# library(stringi) +# datasets_df <- read.csv("datasets.csv.gz") %>% filter(Name=="Boscolo") +# +# for (row in 1:nrow(datasets_df)) { +# print(row) +# name <- as.character(datasets_df$Name[row]) +# basedir <- as.character(datasets_df$Directory[row]) +# datatype <- as.character(datasets_df$Type[row]) +# sample_metadata_file <- as.character(datasets_df$SampleInfoFile[row]) +# array_size <- datasets_df$ArraySize[row] +# if (array_size == 450) { +# array_type <- "IlluminaHumanMethylation450k" +# } else if (array_size == 850) { +# array_type <- "IlluminaHumanMethylationEPIC" +# } +# +# +# if (datatype == "geo_raw") { +# sample_info_df <- read.csv(file.path(basedir, sample_metadata_file), stringsAsFactors = FALSE, row.names = 1) +# samples_to_keep <- sample_info_df[sample_info_df$Keep %in% c("TRUE"), ] +# raw_filename <- file.path(basedir, as.character(datasets_df$DataFile[row])) +# u_name <- as.character(datasets_df$Uname[row]) +# m_name <- as.character(datasets_df$Mname[row]) +# separator <- stri_unescape_unicode(as.character(datasets_df$Separator[row])) +# gmset <- readGEORawFile( +# filename = raw_filename, +# Uname = u_name, +# Mname = m_name, +# array = array_type, +# sep = separator +# ) +# gmset <- readGEORawFile( +# filename = raw_filename, +# Uname = u_name, +# Mname = m_name, +# array = array_type, +# sep = separator, +# pData = sample_info_df[sampleNames(gmset), ] +# ) +# } else if (datatype == "idat") { +# targets <- as.data.frame(read.metharray.sheet(basedir, pattern = sample_metadata_file)) +# targets_to_keep <- subset(targets, Keep == TRUE) +# +# RGset <- read.metharray.exp(targets = targets_to_keep) +# gmset <- preprocessIllumina(RGset) +# gmset <- mapToGenome(gmset) +# } +# +# sampleNames(gmset) <- paste(name, gmset$accession, gmset$cell_type, gmset$cell_line, gmset$gender, +# paste("p", gmset$passage, sep = ""), +# sep = "_" +# ) +# +# +# plot_group <- data.frame(group = name, sample_name = sampleNames(gmset), row.names = "sample_name") +# if (row == 1) { +# combined_arrays <- gmset +# plot_groups_combined <- plot_group +# } else { +# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k") +# plot_groups_combined <- rbind(plot_groups_combined, plot_group) +# } +# } + +array_registry_df <- readxl::read_excel("E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx") +samples_in_order <- c("ar 52", "ar 54", "ar 56", + "ar 58", "ar 60", "ar 62", + "ar 64", "ar 66") +data_directory <- "E:/pinterlab/x_erosion_data/first_eight_samples" +basename_id <- "204668830015" +first_batch_samples_df <- array_registry_df %>% + filter(`Tube Label` %in% samples_in_order) %>% + dplyr::rename(description=`...4`) %>% + dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], + cell_type=str_split(description, " ", simplify = T)[,2], + passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), + Sentrix_Position=sprintf("R0%dC01", 1:8), + Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), + basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), + Sample_Name=description) + + +#second batch of array +samples_in_order <- c("ar 47", "ar 48", "ar72", "ar73", + "ar74", "ar75", "ar76", "ar77") +data_directory <- "E:/pinterlab/x_erosion_data/second_eight_samples" +basename_id <- "204792210038" +second_batch_samples_df <- array_registry_df %>% + filter(`Tube Label` %in% samples_in_order) %>% + dplyr::rename(description=`...4`) %>% + dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], + cell_type=str_split(description, " ", simplify = T)[,2], + passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), + Sentrix_Position=sprintf("R0%dC01", 1:8), + Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), + basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), + Sample_Name=description) + +#third batch of array +samples_in_order <- c("ar86", "ar87", "ar88", + "ar89", "ar90", "ar91", + "ar92", "ar93") +data_directory <- "E:/pinterlab/x_erosion_data/third_eight_samples" +basename_id <- "205003700119" +third_batch_samples_df <- array_registry_df %>% + filter(`Tube Label` %in% samples_in_order) %>% + dplyr::rename(description=`...4`) %>% + dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1], + cell_type=str_split(description, " ", simplify = T)[,2], + passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")), + Sentrix_Position=sprintf("R0%dC01", 1:8), + Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position), + basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position), + Sample_Name=description) + +continuous_cluster_df <- read_csv("continuous_cluster_df.csv") + +sample_info_df <- rbind(first_batch_samples_df, second_batch_samples_df, third_batch_samples_df) %>% + left_join(continuous_cluster_df, by=c("description"="sample_name", "cell_line", "passage")) +#%>% + add_row(cell_line="H9", + Sample_Name="Daily_H9", + Basename="D:/pinterlab/x_erosion/daily_h9_sc/GSM2285135_7970368067_R04C01", + basename_nodir="GSM2285135_7970368067_R04C01") + +RGset <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line != "H9")) +mset <- preprocessIllumina(RGset) +sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name +sampleNames(RGset) <- sample_info_df[match(sampleNames(RGset), sample_info_df$basename_nodir),]$Sample_Name + +RGset_h9 <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line == "H9")) +mset_h9 <- preprocessIllumina(RGset_h9) + +combined_mset <- combineArrays(mset, mset_h9) + +# gmset <- mapToGenome(mset) + + +# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k") + +library(conumee) +source("E:/pinterlab/custom_conumee_functions.R") +data(exclude_regions) + +anno <- custom.CNV.create_anno(array_type="450k", exclude_regions=exclude_regions) + +# sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name +cnv_data <- CNV.load(combined_mset) +minfi.controls <- (pData(mset)$cell_line == "H9") +x <- CNV.fit(cnv_data[sampleNames(combined_mset) == "C19XX iPSC MEFs p19"], cnv_data[pData(combined_mset)$cell_line == "H9"], anno) +x <- CNV.bin(x) +x <- CNV.segment(x) + +CNV.genomeplot(x) + +#output for GEO + +beta_df <- getBeta(gmset) +beta_df <- as.data.frame(beta_df) %>% rownames_to_column(var="rn") +detect_p_df <- detectionP(RGset) +detect_p_df <- as.data.frame(detect_p_df) %>% rownames_to_column(var="rn") +joined_df <- full_join(beta_df, detect_p_df, by="rn", suffix=c("", "pval")) %>% + select(ID_REF=rn, + `C19XX IPSC MEF p8`, + `C19XX IPSC MEF p8pval`, + `C19XX iPSC MEF p11`, + `C19XX iPSC MEF p11pval`, + `C19XX iPSC MEF p13`, + `C19XX iPSC MEF p13pval`, + `C19XX iPSC MEF p17`, + `C19XX iPSC MEF p17pval`, + `C19XX iPSC MEFs p19`, + `C19XX iPSC MEFs p19pval`, + `C19XX iPSC MEFs p21`, + `C19XX iPSC MEFs p21pval`, + `C19XX iPSC MEFs p22`, + `C19XX iPSC MEFs p22pval`, + `C19XX iPSC MEFs p28`, + `C19XX iPSC MEFs p28pval`, + `C19XX iPSC MEFs p29`, + `C19XX iPSC MEFs p29pval`, + `C19XX iPSC MEFs p30`, + `C19XX iPSC MEFs p30pval`, + `C19XX iPSC MEFs p32`, + `C19XX iPSC MEFs p32pval`, + `C23XX iPSC MEF p12`, + `C23XX iPSC MEF p12pval`, + `C23XX iPSC MEF p15`, + `C23XX iPSC MEF p15pval`, + `C23XX iPSC MEF p17`, + `C23XX iPSC MEF p17pval`, + `C23XX iPSC MEF p20`, + `C23XX iPSC MEF p20pval`, + `C23XX iPSC MEFs p22`, + `C23XX iPSC MEFs p22pval`, + `C23XX iPSC MEFs p24`, + `C23XX iPSC MEFs p24pval`, + `C23XX iPSC MEFs p25`, + `C23XX iPSC MEFs p25pval`, + `C23XX iPSC MEFs p31`, + `C23XX iPSC MEFs p31pval`, + `C23XX iPSC MEFs p32`, + `C23XX iPSC MEFs p32pval`, + `C23XX iPSC MEFs p33`, + `C23XX iPSC MEFs p33pval`, + `C23XX iPSC MEFs p35`, + `C23XX iPSC MEFs p35pval`) + +write_csv(joined_df, "e:/pinterlab/x_erosion_data/geo_joined_sheet.csv") + +