Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Updated
  • Loading branch information
prb11009 committed Mar 26, 2021
1 parent 7dcc13d commit 2e6bcac
Show file tree
Hide file tree
Showing 2 changed files with 247 additions and 35 deletions.
57 changes: 33 additions & 24 deletions analysis_notebook.Rmd
Expand Up @@ -422,7 +422,9 @@ pca_female_by_group <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) +
title = "Studies",
color="Study")+
base_plot_theme +
scale_color_futurama() +
scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828", "GSE34982", "GSE73938"),
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,3,4)]))+
# scale_color_futurama() +
guides(color=guide_legend(nrow=3,byrow=TRUE)) +
theme(legend.key.width = unit(1, units="mm"),
legend.key.height = unit(1, units="mm"),
Expand Down Expand Up @@ -599,7 +601,8 @@ pca_male_by_group <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) +
legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")),
legend.box.margin = margin(t=-3,l=-5, unit="mm"),
legend.spacing.y=unit(.2, "mm")) +
scale_color_futurama()
scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828"),
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2)]))
pca_male_by_methylation <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=meanXMethylation), size=.5, shape=20) +
Expand Down Expand Up @@ -905,7 +908,7 @@ lm_eqn <- function(m) {
global_mean_methylation <- ggplot(means, aes(x=sex, group=sex, y=mean_methylation, fill=sex)) +
geom_boxplot(show.legend = F, size=.3, outlier.size = .5, outlier.shape = 20) +
geom_signif(map_signif_level=TRUE, comparisons = list(c("female", "male")), size=.3, textsize = 2) +
labs(x="Sex", y="Global Mean β-values", title = "Global Mean Methylation") +
labs(x="Sex", y="Global Mean β-values", title = "Global Mean\nMethylation") +
theme_bw() +
base_plot_theme +
theme(legend.position = "none") +
Expand Down Expand Up @@ -1117,14 +1120,14 @@ ggsave("figs/main/fig1_variance_density_plot.pdf", variance_density_plot, width=
sig_diff_x_v_no_x <- ggplot(statistics, aes(y=alpha, x=(Position %in% x_sites), fill=(Position %in% x_sites))) +
geom_boxplot(outlier.shape=NA, size=.4) +
labs(title="Brown-Forsythe\nSignificances",
x="",
y="BF-Test (alpha)") +
scale_x_discrete(labels=c("Autosomal\nProbes", "X\nProbes"), limits=c(FALSE, TRUE)) +
labs(title="Brown-Forsythe\np-values",
x="Chromosome",
y="BF p-value") +
scale_x_discrete(labels=c("Autosomal", "X"), limits=c(FALSE, TRUE)) +
scale_fill_grey(start=.5, end=.8, guide=F) +
theme_bw() +
base_plot_theme +
theme(axis.title.x=element_blank())
base_plot_theme #+
# theme(axis.title.x=element_blank())
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data
Expand Down Expand Up @@ -1154,17 +1157,20 @@ unsig_female_beta_vec <- as.vector(as.matrix(unsig_female_beta_mat %>% dplyr::se
sig_female_beta_vec <- as.vector(as.matrix(sig_female_beta_df %>% dplyr::select(-rn, -Chromosome, -Position, -Gene)))
sig_nonsig_dist_plot <- ggplot() +
geom_violin(aes(x="Sig\nProbes", y=sig_female_beta_vec, fill='female'), trim=FALSE, show.legend = F, size=.3) +
geom_violin(aes(x="Non-sig\nProbes", y=unsig_female_beta_vec, fill='male'), trim=FALSE, show.legend = F, size=.3) +
geom_boxplot(aes(x="Sig\nProbes", y=sig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
geom_boxplot(aes(x="Non-sig\nProbes", y=unsig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
labs(x="",
geom_violin(aes(x="p<.01", y=sig_female_beta_vec, fill='female'), trim=FALSE, show.legend = F, size=.3) +
geom_violin(aes(x="p>.01", y=unsig_female_beta_vec, fill='male'), trim=FALSE, show.legend = F, size=.3) +
geom_boxplot(aes(x="p<.01", y=sig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
geom_boxplot(aes(x="p>.01", y=unsig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
labs(x="BF p-values",
y='DNAme β-value',
title="β-values of sig. and \nnon-sig. variant sites\nin Female hPSCs") +
#title="β-values of sig. and \nnon-sig. variant sites\nin Female hPSCs") +
title="Probe β-values in Female\nhPSCs by Variance") +
scale_x_discrete(limits=c("p>.01", "p<.01")) +
theme_bw() +
base_plot_theme +
theme(legend.position = "none",
axis.title.x=element_blank()) +
# axis.title.x=element_blank()
) +
scale_fill_futurama()
# ggsave('figs/beta value distributions - sig-unsig.png')
sig_nonsig_dist_plot
Expand Down Expand Up @@ -1195,7 +1201,7 @@ data_df <- tibble(mean_x = mean_x_methylation, mean_sig_x = mean_sig_x_methylati
correlation <- sprintf("corr = %.2f", cor(mean_x_methylation, mean_sig_x_methylation))
female_x_sigsites <- ggplot(data_df, aes(x=mean_x, y=mean_sig_x)) +
geom_point(color=sex_colors[1],size=.5, shape=20) +
labs(x="Mean X DNAme", y="Mean Sig X DNAme", title="Female hPSCs\nSig var X-probes") +
labs(x="Mean X DNAme", y="Mean Sig X DNAme", title="Female hPSCs\nBF p<.01 X-probes") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
annotate("text", x=.45, y=.45, label=correlation, size=2) +
Expand All @@ -1205,7 +1211,7 @@ female_x_sigsites <- ggplot(data_df, aes(x=mean_x, y=mean_sig_x)) +
correlation <- sprintf("corr = %.2f", cor(mean_x_methylation, mean_nonsig_x_methylation))
female_x_nonsigsites <- ggplot(data_df, aes(x=mean_x, y=mean_nonsig_x)) +
geom_point(color=sex_colors[1],size=.5, shape=20) +
labs(x="Mean X Chr DNAme", y="Mean Non-Sig X DNAme", title="Female hPSCs\nNon-Sig var X-probes") +
labs(x="Mean X Chr DNAme", y="Mean Non-Sig X DNAme", title="Female hPSCs\n BF p>.01 X-probes") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
annotate("text", x=.55, y=.6, label=correlation, size=2) +
Expand Down Expand Up @@ -1569,12 +1575,12 @@ pca_female_no_h9_by_python_6 <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2
# geom_text_repel(female_pca_df %>% filter(cell_line %in% correctly_ordered_by_sig_names & grepl("Nazor_", sample_name)), mapping = aes(x=PC1, y=PC2, label=passage), size=2, color="grey", segment.size=.15, min.segment.length=.25, segment.color="darkgrey") +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
theme_bw() +
labs(x=female_no_h9_sig_sites_pc1_label,
y=female_no_h9_sig_sites_pc2_label,
labs(x=female_pc1_label,
y=female_pc2_label,
title = "PCA of Female Samples",
color="Cluster")+
base_plot_theme +
scale_color_manual(limits=c(0,1,2,3,4,5,NA), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
scale_color_manual(limits=factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(color=guide_legend(nrow=1)) +
theme(legend.key.width = unit(1, units = "mm"),
Expand Down Expand Up @@ -8023,12 +8029,15 @@ table_1_df <- read_csv("table_1_sample_data_info.csv") %>%
`Sample Cluster Assignment` == 5 ~ "F"))
studies_in_clusters <- ggplot(table_1_df) +
geom_bar(aes(x=cluster, fill=Source)) +
geom_bar(aes(x=cluster, fill=sprintf("%s (%s)", str_replace(Source, " et al,", ","), `GEO Group Accession`))) +
labs(title="Studies in Clusters",
x="Sample Cluster",
y="Number of Samples") +
y="Number of Samples",
fill="Source") +
theme_bw() +
scale_fill_npg() +
scale_fill_manual(limits=c("Banovich, 2018 (GSE110544)", "Nazor, 2012 (GSE31848)", "Butcher, 2016 (GSE59091)", "Nishizawa, 2016 (GSE60924)", "Zdravkovic, 2015 (GSE72923)", "Salomonis, 2016 (GSE85828)", "Takasawa, 2018 (GSE73938)")[c(1,3,2,4,6,7,5)],
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,4)])[c(1,3,2,4,6,7,5)])+
# scale_fill_npg() +
base_plot_theme +
theme(legend.position="right",
legend.justification = "left")
Expand Down
225 changes: 214 additions & 11 deletions analyze_our_methylation_data.R
Expand Up @@ -817,17 +817,6 @@ RGset <- read.metharray.exp(targets = sample_info_df)
mset <- preprocessIllumina(RGset)
gmset <- mapToGenome(mset)

#copy number stuff
library(conumee)
source("E:/pinterlab/custom_conumee_functions.R")
data(exclude_regions)

anno <- custom.CNV.create_anno(array_type="EPIC", exclude_regions=exclude_regions)

sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name
cnv_data <- CNV.load(mset)
x <- CNV.fit(cnv_data, anno)

beta_mat_850k <- getBeta(gmset)
colnames(beta_mat_850k) <- sample_info_df[match(colnames(beta_mat_850k), sample_info_df$basename_nodir),]$Sample_Name
write_csv(as.data.frame(beta_mat_850k) %>% rownames_to_column(var="rn"), "our_data_beta_mat_850k.csv.gz")
Expand Down Expand Up @@ -1293,3 +1282,217 @@ ggplot(qpcr_dat_col, aes(x=factor(passage, levels=c("p8","p11","p12","p13","p15"
theme(strip.text = element_text(size=6, margin=margin(t=1, b=1)))

ggsave("figs/supplementary/supp1_xist_qpcr_barplot.pdf", width=80, height=40, units="mm")


# checking copy number compared to H9s using DNAme
# library(stringi)
# datasets_df <- read.csv("datasets.csv.gz") %>% filter(Name=="Boscolo")
#
# for (row in 1:nrow(datasets_df)) {
# print(row)
# name <- as.character(datasets_df$Name[row])
# basedir <- as.character(datasets_df$Directory[row])
# datatype <- as.character(datasets_df$Type[row])
# sample_metadata_file <- as.character(datasets_df$SampleInfoFile[row])
# array_size <- datasets_df$ArraySize[row]
# if (array_size == 450) {
# array_type <- "IlluminaHumanMethylation450k"
# } else if (array_size == 850) {
# array_type <- "IlluminaHumanMethylationEPIC"
# }
#
#
# if (datatype == "geo_raw") {
# sample_info_df <- read.csv(file.path(basedir, sample_metadata_file), stringsAsFactors = FALSE, row.names = 1)
# samples_to_keep <- sample_info_df[sample_info_df$Keep %in% c("TRUE"), ]
# raw_filename <- file.path(basedir, as.character(datasets_df$DataFile[row]))
# u_name <- as.character(datasets_df$Uname[row])
# m_name <- as.character(datasets_df$Mname[row])
# separator <- stri_unescape_unicode(as.character(datasets_df$Separator[row]))
# gmset <- readGEORawFile(
# filename = raw_filename,
# Uname = u_name,
# Mname = m_name,
# array = array_type,
# sep = separator
# )
# gmset <- readGEORawFile(
# filename = raw_filename,
# Uname = u_name,
# Mname = m_name,
# array = array_type,
# sep = separator,
# pData = sample_info_df[sampleNames(gmset), ]
# )
# } else if (datatype == "idat") {
# targets <- as.data.frame(read.metharray.sheet(basedir, pattern = sample_metadata_file))
# targets_to_keep <- subset(targets, Keep == TRUE)
#
# RGset <- read.metharray.exp(targets = targets_to_keep)
# gmset <- preprocessIllumina(RGset)
# gmset <- mapToGenome(gmset)
# }
#
# sampleNames(gmset) <- paste(name, gmset$accession, gmset$cell_type, gmset$cell_line, gmset$gender,
# paste("p", gmset$passage, sep = ""),
# sep = "_"
# )
#
#
# plot_group <- data.frame(group = name, sample_name = sampleNames(gmset), row.names = "sample_name")
# if (row == 1) {
# combined_arrays <- gmset
# plot_groups_combined <- plot_group
# } else {
# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k")
# plot_groups_combined <- rbind(plot_groups_combined, plot_group)
# }
# }

array_registry_df <- readxl::read_excel("E:/pinterlab/x_erosion_data/Array_Registry_Update210126.xlsx")
samples_in_order <- c("ar 52", "ar 54", "ar 56",
"ar 58", "ar 60", "ar 62",
"ar 64", "ar 66")
data_directory <- "E:/pinterlab/x_erosion_data/first_eight_samples"
basename_id <- "204668830015"
first_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)


#second batch of array
samples_in_order <- c("ar 47", "ar 48", "ar72", "ar73",
"ar74", "ar75", "ar76", "ar77")
data_directory <- "E:/pinterlab/x_erosion_data/second_eight_samples"
basename_id <- "204792210038"
second_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)

#third batch of array
samples_in_order <- c("ar86", "ar87", "ar88",
"ar89", "ar90", "ar91",
"ar92", "ar93")
data_directory <- "E:/pinterlab/x_erosion_data/third_eight_samples"
basename_id <- "205003700119"
third_batch_samples_df <- array_registry_df %>%
filter(`Tube Label` %in% samples_in_order) %>%
dplyr::rename(description=`...4`) %>%
dplyr::mutate(cell_line=str_split(description, " ", simplify = T)[,1],
cell_type=str_split(description, " ", simplify = T)[,2],
passage=as.numeric(str_replace(str_split(description, " ", simplify = T)[,4], "p", "")),
Sentrix_Position=sprintf("R0%dC01", 1:8),
Basename=sprintf("%s/%s_%s", data_directory, basename_id, Sentrix_Position),
basename_nodir=sprintf("%s_%s", basename_id, Sentrix_Position),
Sample_Name=description)

continuous_cluster_df <- read_csv("continuous_cluster_df.csv")

sample_info_df <- rbind(first_batch_samples_df, second_batch_samples_df, third_batch_samples_df) %>%
left_join(continuous_cluster_df, by=c("description"="sample_name", "cell_line", "passage"))
#%>%
add_row(cell_line="H9",
Sample_Name="Daily_H9",
Basename="D:/pinterlab/x_erosion/daily_h9_sc/GSM2285135_7970368067_R04C01",
basename_nodir="GSM2285135_7970368067_R04C01")

RGset <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line != "H9"))
mset <- preprocessIllumina(RGset)
sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name
sampleNames(RGset) <- sample_info_df[match(sampleNames(RGset), sample_info_df$basename_nodir),]$Sample_Name

RGset_h9 <- read.metharray.exp(targets = sample_info_df %>% filter(cell_line == "H9"))
mset_h9 <- preprocessIllumina(RGset_h9)

combined_mset <- combineArrays(mset, mset_h9)

# gmset <- mapToGenome(mset)


# combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k")

library(conumee)
source("E:/pinterlab/custom_conumee_functions.R")
data(exclude_regions)

anno <- custom.CNV.create_anno(array_type="450k", exclude_regions=exclude_regions)

# sampleNames(mset) <- sample_info_df[match(sampleNames(mset), sample_info_df$basename_nodir),]$Sample_Name
cnv_data <- CNV.load(combined_mset)
minfi.controls <- (pData(mset)$cell_line == "H9")
x <- CNV.fit(cnv_data[sampleNames(combined_mset) == "C19XX iPSC MEFs p19"], cnv_data[pData(combined_mset)$cell_line == "H9"], anno)
x <- CNV.bin(x)
x <- CNV.segment(x)

CNV.genomeplot(x)

#output for GEO

beta_df <- getBeta(gmset)
beta_df <- as.data.frame(beta_df) %>% rownames_to_column(var="rn")
detect_p_df <- detectionP(RGset)
detect_p_df <- as.data.frame(detect_p_df) %>% rownames_to_column(var="rn")
joined_df <- full_join(beta_df, detect_p_df, by="rn", suffix=c("", "pval")) %>%
select(ID_REF=rn,
`C19XX IPSC MEF p8`,
`C19XX IPSC MEF p8pval`,
`C19XX iPSC MEF p11`,
`C19XX iPSC MEF p11pval`,
`C19XX iPSC MEF p13`,
`C19XX iPSC MEF p13pval`,
`C19XX iPSC MEF p17`,
`C19XX iPSC MEF p17pval`,
`C19XX iPSC MEFs p19`,
`C19XX iPSC MEFs p19pval`,
`C19XX iPSC MEFs p21`,
`C19XX iPSC MEFs p21pval`,
`C19XX iPSC MEFs p22`,
`C19XX iPSC MEFs p22pval`,
`C19XX iPSC MEFs p28`,
`C19XX iPSC MEFs p28pval`,
`C19XX iPSC MEFs p29`,
`C19XX iPSC MEFs p29pval`,
`C19XX iPSC MEFs p30`,
`C19XX iPSC MEFs p30pval`,
`C19XX iPSC MEFs p32`,
`C19XX iPSC MEFs p32pval`,
`C23XX iPSC MEF p12`,
`C23XX iPSC MEF p12pval`,
`C23XX iPSC MEF p15`,
`C23XX iPSC MEF p15pval`,
`C23XX iPSC MEF p17`,
`C23XX iPSC MEF p17pval`,
`C23XX iPSC MEF p20`,
`C23XX iPSC MEF p20pval`,
`C23XX iPSC MEFs p22`,
`C23XX iPSC MEFs p22pval`,
`C23XX iPSC MEFs p24`,
`C23XX iPSC MEFs p24pval`,
`C23XX iPSC MEFs p25`,
`C23XX iPSC MEFs p25pval`,
`C23XX iPSC MEFs p31`,
`C23XX iPSC MEFs p31pval`,
`C23XX iPSC MEFs p32`,
`C23XX iPSC MEFs p32pval`,
`C23XX iPSC MEFs p33`,
`C23XX iPSC MEFs p33pval`,
`C23XX iPSC MEFs p35`,
`C23XX iPSC MEFs p35pval`)

write_csv(joined_df, "e:/pinterlab/x_erosion_data/geo_joined_sheet.csv")


0 comments on commit 2e6bcac

Please sign in to comment.