Updated

sfp-lab · Mar 26, 2021 · 2e6bcac · 2e6bcac
1 parent 7dcc13d
commit 2e6bcac
Show file tree

Hide file tree

Showing 2 changed files with 247 additions and 35 deletions.
diff --git a/analysis_notebook.Rmd b/analysis_notebook.Rmd
@@ -422,7 +422,9 @@ pca_female_by_group <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) +
                              title = "Studies",
                              color="Study")+
                         base_plot_theme +
-                        scale_color_futurama() +
+                        scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828", "GSE34982", "GSE73938"),
+                                           values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,3,4)]))+
+                        # scale_color_futurama() +
                         guides(color=guide_legend(nrow=3,byrow=TRUE)) +
                         theme(legend.key.width = unit(1, units="mm"),
                               legend.key.height = unit(1, units="mm"),
@@ -599,7 +601,8 @@ pca_male_by_group <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) +
                               legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")),
                               legend.box.margin = margin(t=-3,l=-5, unit="mm"),
                               legend.spacing.y=unit(.2, "mm")) +
-                        scale_color_futurama()
+                        scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828"),
+                                           values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2)]))
 
 pca_male_by_methylation <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) +
                               geom_point(aes(color=meanXMethylation), size=.5, shape=20) +
@@ -905,7 +908,7 @@ lm_eqn <- function(m) {
 global_mean_methylation <- ggplot(means, aes(x=sex, group=sex, y=mean_methylation, fill=sex)) +
   geom_boxplot(show.legend = F, size=.3, outlier.size = .5, outlier.shape = 20) +
   geom_signif(map_signif_level=TRUE, comparisons = list(c("female", "male")), size=.3, textsize = 2) +
-  labs(x="Sex", y="Global Mean β-values", title = "Global Mean Methylation") +
+  labs(x="Sex", y="Global Mean β-values", title = "Global Mean\nMethylation") +
   theme_bw() +
   base_plot_theme +
   theme(legend.position = "none") +
@@ -1117,14 +1120,14 @@ ggsave("figs/main/fig1_variance_density_plot.pdf", variance_density_plot, width=
 
 sig_diff_x_v_no_x <- ggplot(statistics, aes(y=alpha, x=(Position %in% x_sites), fill=(Position %in% x_sites))) +
   geom_boxplot(outlier.shape=NA, size=.4) +
-  labs(title="Brown-Forsythe\nSignificances",
-       x="",
-       y="BF-Test (alpha)") +
-  scale_x_discrete(labels=c("Autosomal\nProbes", "X\nProbes"), limits=c(FALSE, TRUE)) +
+  labs(title="Brown-Forsythe\np-values",
+       x="Chromosome",
+       y="BF p-value") +
+  scale_x_discrete(labels=c("Autosomal", "X"), limits=c(FALSE, TRUE)) +
   scale_fill_grey(start=.5, end=.8, guide=F) +
   theme_bw() +
-  base_plot_theme +
-  theme(axis.title.x=element_blank())
+  base_plot_theme #+
+  # theme(axis.title.x=element_blank())
 
 positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data
 
@@ -1154,17 +1157,20 @@ unsig_female_beta_vec <- as.vector(as.matrix(unsig_female_beta_mat %>% dplyr::se
 sig_female_beta_vec <- as.vector(as.matrix(sig_female_beta_df %>% dplyr::select(-rn, -Chromosome, -Position, -Gene)))
 
 sig_nonsig_dist_plot <- ggplot() +
-  geom_violin(aes(x="Sig\nProbes", y=sig_female_beta_vec, fill='female'), trim=FALSE, show.legend = F, size=.3) +
-  geom_violin(aes(x="Non-sig\nProbes", y=unsig_female_beta_vec, fill='male'), trim=FALSE, show.legend = F, size=.3) +
-  geom_boxplot(aes(x="Sig\nProbes", y=sig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
-  geom_boxplot(aes(x="Non-sig\nProbes", y=unsig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
-  labs(x="", 
+  geom_violin(aes(x="p<.01", y=sig_female_beta_vec, fill='female'), trim=FALSE, show.legend = F, size=.3) +
+  geom_violin(aes(x="p>.01", y=unsig_female_beta_vec, fill='male'), trim=FALSE, show.legend = F, size=.3) +
+  geom_boxplot(aes(x="p<.01", y=sig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
+  geom_boxplot(aes(x="p>.01", y=unsig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
+  labs(x="BF p-values", 
        y='DNAme β-value',
-       title="β-values of sig. and \nnon-sig. variant sites\nin Female hPSCs") +
+       #title="β-values of sig. and \nnon-sig. variant sites\nin Female hPSCs") +
+       title="Probe β-values in Female\nhPSCs by Variance") +
+  scale_x_discrete(limits=c("p>.01", "p<.01")) +
   theme_bw() +
   base_plot_theme +
   theme(legend.position = "none",
-  axis.title.x=element_blank()) +
+  # axis.title.x=element_blank()
+  ) +
   scale_fill_futurama()
 # ggsave('figs/beta value distributions - sig-unsig.png')
 sig_nonsig_dist_plot
@@ -1195,7 +1201,7 @@ data_df <- tibble(mean_x = mean_x_methylation, mean_sig_x = mean_sig_x_methylati
 correlation <- sprintf("corr = %.2f",  cor(mean_x_methylation, mean_sig_x_methylation))
 female_x_sigsites <- ggplot(data_df, aes(x=mean_x, y=mean_sig_x)) +
   geom_point(color=sex_colors[1],size=.5, shape=20) +
-  labs(x="Mean X DNAme", y="Mean Sig X DNAme", title="Female hPSCs\nSig var X-probes") +
+  labs(x="Mean X DNAme", y="Mean Sig X DNAme", title="Female hPSCs\nBF p<.01 X-probes") +
   geom_smooth(method='lm', color="black", size=.5) +
   # annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
   annotate("text", x=.45, y=.45, label=correlation, size=2) +
@@ -1205,7 +1211,7 @@ female_x_sigsites <- ggplot(data_df, aes(x=mean_x, y=mean_sig_x)) +
 correlation <- sprintf("corr = %.2f",  cor(mean_x_methylation, mean_nonsig_x_methylation))
 female_x_nonsigsites <- ggplot(data_df, aes(x=mean_x, y=mean_nonsig_x)) +
   geom_point(color=sex_colors[1],size=.5, shape=20) +
-  labs(x="Mean X Chr DNAme", y="Mean Non-Sig X DNAme", title="Female hPSCs\nNon-Sig var X-probes") +
+  labs(x="Mean X Chr DNAme", y="Mean Non-Sig X DNAme", title="Female hPSCs\n BF p>.01 X-probes") +
   geom_smooth(method='lm', color="black", size=.5) +
   # annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
   annotate("text", x=.55, y=.6, label=correlation, size=2) +
@@ -1569,12 +1575,12 @@ pca_female_no_h9_by_python_6 <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2
                         # geom_text_repel(female_pca_df %>% filter(cell_line %in% correctly_ordered_by_sig_names & grepl("Nazor_", sample_name)), mapping = aes(x=PC1, y=PC2, label=passage), size=2, color="grey", segment.size=.15, min.segment.length=.25, segment.color="darkgrey") +
                         scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
                         theme_bw() + 
-                        labs(x=female_no_h9_sig_sites_pc1_label,
-                             y=female_no_h9_sig_sites_pc2_label,
+                        labs(x=female_pc1_label,
+                             y=female_pc2_label,
                              title = "PCA of Female Samples",
                              color="Cluster")+
                         base_plot_theme +
-                        scale_color_manual(limits=c(0,1,2,3,4,5,NA), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
+                        scale_color_manual(limits=factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
                         # scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
                         guides(color=guide_legend(nrow=1)) +
                         theme(legend.key.width = unit(1, units = "mm"),
@@ -8023,12 +8029,15 @@ table_1_df <- read_csv("table_1_sample_data_info.csv") %>%
                                        `Sample Cluster Assignment` == 5 ~ "F"))
 
 studies_in_clusters <- ggplot(table_1_df) +
-  geom_bar(aes(x=cluster, fill=Source)) +
+  geom_bar(aes(x=cluster, fill=sprintf("%s (%s)", str_replace(Source, " et al,", ","), `GEO Group Accession`))) +
   labs(title="Studies in Clusters",
        x="Sample Cluster",
-       y="Number of Samples") +
+       y="Number of Samples",
+       fill="Source") +
   theme_bw() +
-  scale_fill_npg() +
+  scale_fill_manual(limits=c("Banovich, 2018 (GSE110544)", "Nazor, 2012 (GSE31848)", "Butcher, 2016 (GSE59091)", "Nishizawa, 2016 (GSE60924)", "Zdravkovic, 2015 (GSE72923)", "Salomonis, 2016 (GSE85828)", "Takasawa, 2018 (GSE73938)")[c(1,3,2,4,6,7,5)],
+                                           values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,4)])[c(1,3,2,4,6,7,5)])+
+  # scale_fill_npg() +
   base_plot_theme +
   theme(legend.position="right",
         legend.justification = "left")

diff --git a/analyze_our_methylation_data.R b/analyze_our_methylation_data.R
@@ -817,17 +817,6 @@ RGset <- read.metharray.exp(targets = sample_info_df)
 mset <- preprocessIllumina(RGset)
 gmset <- mapToGenome(mset)
 
-#copy number stuff
-library(conumee)
-source("E:/pinterlab/custom_conumee_functions.R")
-data(exclude_regions)
-
-anno <- custom.CNV.create_anno(array_type="EPIC", exclude_regions=exclude_regions)
-
-sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name
-cnv_data <- CNV.load(mset)
-x <- CNV.fit(cnv_data, anno)
-
 beta_mat_850k <- getBeta(gmset)
 colnames(beta_mat_850k) <- sample_info_df[match(colnames(beta_mat_850k), sample_info_df$basename_nodir),]$Sample_Name
 write_csv(as.data.frame(beta_mat_850k) %>% rownames_to_column(var="rn"), "our_data_beta_mat_850k.csv.gz")
@@ -1293,3 +1282,217 @@ ggplot(qpcr_dat_col, aes(x=factor(passage, levels=c("p8","p11","p12","p13","p15"
   theme(strip.text = element_text(size=6, margin=margin(t=1, b=1)))
 
 ggsave("figs/supplementary/supp1_xist_qpcr_barplot.pdf", width=80, height=40, units="mm")
+
+
+# checking copy number compared to H9s using DNAme
+# library(stringi)
+# datasets_df <- read.csv("datasets.csv.gz") %>% filter(Name=="Boscolo")
+# 
+# for (row in 1:nrow(datasets_df)) {
+#   print(row)
+#   name <- as.character(datasets_df$Name[row])
+#   basedir <- as.character(datasets_df$Directory[row])
+#   datatype <- as.character(datasets_df$Type[row])
+#   sample_metadata_file <- as.character(datasets_df$SampleInfoFile[row])
+#   array_size <- datasets_df$ArraySize[row]
+#   if (array_size == 450) {
+#     array_type <- "IlluminaHumanMethylation450k"
+#   } else if (array_size == 850) {
+#     array_type <- "IlluminaHumanMethylationEPIC"
+#   }
+#   
+#   
+#   if (datatype == "geo_raw") {
+#     sample_info_df <- read.csv(file.path(basedir, sample_metadata_file), stringsAsFactors = FALSE, row.names = 1)
+#     samples_to_keep <- sample_info_df[sample_info_df$Keep %in% c("TRUE"), ]
+#     raw_filename <- file.path(basedir, as.character(datasets_df$DataFile[row]))
+#     u_name <- as.character(datasets_df$Uname[row])
+#     m_name <- as.character(datasets_df$Mname[row])
+#     separator <- stri_unescape_unicode(as.character(datasets_df$Separator[row]))
+#     gmset <- readGEORawFile(
+#       filename = raw_filename,
+#       Uname = u_name,
+#       Mname = m_name,
+#       array = array_type,
+#       sep = separator
+#     )
+#     gmset <- readGEORawFile(
+#       filename = raw_filename,
+#       Uname = u_name,
+#       Mname = m_name,
+#       array = array_type,
+#       sep = separator,
+#       pData = sample_info_df[sampleNames(gmset), ]
+#     )
+#   } else if (datatype == "idat") {
+#     targets <- as.data.frame(read.metharray.sheet(basedir, pattern = sample_metadata_file))
+#     targets_to_keep <- subset(targets, Keep == TRUE)
+#     
+#     RGset <- read.metharray.exp(targets = targets_to_keep)
+#     gmset <- preprocessIllumina(RGset)
+#     gmset <- mapToGenome(gmset)
+#   }
+#   
+#   sampleNames(gmset) <- paste(name, gmset$accession, gmset$cell_type, gmset$cell_line, gmset$gender,
+#                               paste("p", gmset$passage, sep = ""),
+#                               sep = "_"
+#   )
+#   
+#   
+#   plot_group <- data.frame(group = name, sample_name = sampleNames(gmset), row.names = "sample_name")
+#   if (row == 1) {
+#     combined_arrays <- gmset
+#     plot_groups_combined <- plot_group
+#   } else {
+#     combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k")
+#     plot_groups_combined <- rbind(plot_groups_combined, plot_group)
+#   }
+# }
+
+array_registry_df <- readxl::read_excel("E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx")
+samples_in_order <- c("ar 52", "ar 54", "ar 56",
+                      "ar 58", "ar 60", "ar 62",
+                      "ar 64", "ar 66")
+data_directory <- "E:/pinterlab/x_erosion_data/first_eight_samples"
+basename_id <- "204668830015"
+first_batch_samples_df <- array_registry_df %>%
+  filter(`Tube Label` %in% samples_in_order) %>%
+  dplyr::rename(description=`...4`) %>%
+  dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
+                cell_type=str_split(description, " ", simplify = T)[,2],
+                passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
+                Sentrix_Position=sprintf("R0%dC01", 1:8),
+                Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
+                basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
+                Sample_Name=description)
+
+
+#second batch of array
+samples_in_order <- c("ar 47", "ar 48", "ar72", "ar73", 
+                      "ar74", "ar75", "ar76", "ar77")
+data_directory <- "E:/pinterlab/x_erosion_data/second_eight_samples"
+basename_id <- "204792210038"
+second_batch_samples_df <- array_registry_df %>%
+  filter(`Tube Label` %in% samples_in_order) %>%
+  dplyr::rename(description=`...4`) %>%
+  dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
+                cell_type=str_split(description, " ", simplify = T)[,2],
+                passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
+                Sentrix_Position=sprintf("R0%dC01", 1:8),
+                Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
+                basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
+                Sample_Name=description)
+
+#third batch of array
+samples_in_order <- c("ar86", "ar87", "ar88",
+                      "ar89", "ar90", "ar91",
+                      "ar92", "ar93")
+data_directory <- "E:/pinterlab/x_erosion_data/third_eight_samples"
+basename_id <- "205003700119"
+third_batch_samples_df <- array_registry_df %>%
+  filter(`Tube Label` %in% samples_in_order) %>%
+  dplyr::rename(description=`...4`) %>%
+  dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
+                cell_type=str_split(description, " ", simplify = T)[,2],
+                passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
+                Sentrix_Position=sprintf("R0%dC01", 1:8),
+                Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
+                basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
+                Sample_Name=description)
+
+continuous_cluster_df <- read_csv("continuous_cluster_df.csv")
+
+sample_info_df <- rbind(first_batch_samples_df, second_batch_samples_df, third_batch_samples_df) %>%
+  left_join(continuous_cluster_df, by=c("description"="sample_name", "cell_line", "passage")) 
+#%>%
+  add_row(cell_line="H9", 
+          Sample_Name="Daily_H9", 
+          Basename="D:/pinterlab/x_erosion/daily_h9_sc/GSM2285135_7970368067_R04C01",
+          basename_nodir="GSM2285135_7970368067_R04C01")
+
+RGset <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line != "H9"))
+mset <- preprocessIllumina(RGset)
+sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name
+sampleNames(RGset) <- sample_info_df[match(sampleNames(RGset), sample_info_df$basename_nodir),]$Sample_Name
+
+RGset_h9 <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line == "H9"))
+mset_h9 <- preprocessIllumina(RGset_h9)
+
+combined_mset <- combineArrays(mset, mset_h9)
+
+# gmset <- mapToGenome(mset)
+
+
+# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k")
+
+library(conumee)
+source("E:/pinterlab/custom_conumee_functions.R")
+data(exclude_regions)
+
+anno <- custom.CNV.create_anno(array_type="450k", exclude_regions=exclude_regions)
+
+# sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name
+cnv_data <- CNV.load(combined_mset)
+minfi.controls <- (pData(mset)$cell_line == "H9")
+x <- CNV.fit(cnv_data[sampleNames(combined_mset) == "C19XX iPSC MEFs p19"], cnv_data[pData(combined_mset)$cell_line == "H9"], anno)
+x <- CNV.bin(x)
+x <- CNV.segment(x)
+
+CNV.genomeplot(x)
+
+#output for GEO
+
+beta_df <- getBeta(gmset)
+beta_df <- as.data.frame(beta_df) %>% rownames_to_column(var="rn")
+detect_p_df <- detectionP(RGset)
+detect_p_df <- as.data.frame(detect_p_df) %>% rownames_to_column(var="rn")
+joined_df <- full_join(beta_df, detect_p_df, by="rn", suffix=c("", "pval")) %>%
+              select(ID_REF=rn,
+                     `C19XX IPSC MEF p8`,
+                     `C19XX IPSC MEF p8pval`,
+                     `C19XX iPSC MEF p11`,
+                     `C19XX iPSC MEF p11pval`,
+                     `C19XX iPSC MEF p13`,
+                     `C19XX iPSC MEF p13pval`,
+                     `C19XX iPSC MEF p17`,
+                     `C19XX iPSC MEF p17pval`,
+                     `C19XX iPSC MEFs p19`,
+                     `C19XX iPSC MEFs p19pval`,
+                     `C19XX iPSC MEFs p21`,
+                     `C19XX iPSC MEFs p21pval`,
+                     `C19XX iPSC MEFs p22`,
+                     `C19XX iPSC MEFs p22pval`,
+                     `C19XX iPSC MEFs p28`,
+                     `C19XX iPSC MEFs p28pval`,
+                     `C19XX iPSC MEFs p29`,
+                     `C19XX iPSC MEFs p29pval`,
+                     `C19XX iPSC MEFs p30`,
+                     `C19XX iPSC MEFs p30pval`,
+                     `C19XX iPSC MEFs p32`,
+                     `C19XX iPSC MEFs p32pval`,
+                     `C23XX iPSC MEF p12`,
+                     `C23XX iPSC MEF p12pval`,
+                     `C23XX iPSC MEF p15`,
+                     `C23XX iPSC MEF p15pval`,
+                     `C23XX iPSC MEF p17`,
+                     `C23XX iPSC MEF p17pval`,
+                     `C23XX iPSC MEF p20`,
+                     `C23XX iPSC MEF p20pval`,
+                     `C23XX iPSC MEFs p22`,
+                     `C23XX iPSC MEFs p22pval`,
+                     `C23XX iPSC MEFs p24`,
+                     `C23XX iPSC MEFs p24pval`,
+                     `C23XX iPSC MEFs p25`,
+                     `C23XX iPSC MEFs p25pval`,
+                     `C23XX iPSC MEFs p31`,
+                     `C23XX iPSC MEFs p31pval`,
+                     `C23XX iPSC MEFs p32`,
+                     `C23XX iPSC MEFs p32pval`,
+                     `C23XX iPSC MEFs p33`,
+                     `C23XX iPSC MEFs p33pval`,
+                     `C23XX iPSC MEFs p35`,
+                     `C23XX iPSC MEFs p35pval`)
+
+write_csv(joined_df, "e:/pinterlab/x_erosion_data/geo_joined_sheet.csv")
+
+