Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
---
title: "Methylation Analysis"
output: html_notebook
---
This notebook takes us through the methylation analysis performed in this study.
First we begin by downloading all the data from GEO, and performing some quality assessment and culling of the data.
Below we use the minfi package to download all our datasets of interest, and combine them into the Illumina 450K methylation array format.
```{r}
#---Dependencies installation---#
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("minfi", version = "3.8")
# BiocManager::install("IlluminaHumanMethylation450kanno.ilmn12.hg19")
# BiocManager::install("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
# BiocManager::install('IlluminaHumanMethylationEPICmanifest')
# BiocManager::install('IlluminaHumanMethylation450kmanifest')
# BiocManager::install('tidyverse')
#-------------------------------#
```
```{r}
library(minfi)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
library(stringi)
library(tidyverse)
datasets_df <- read.csv("datasets.csv.gz")
for (row in 1:nrow(datasets_df)) {
print(row)
name <- as.character(datasets_df$Name[row])
basedir <- as.character(datasets_df$Directory[row])
datatype <- as.character(datasets_df$Type[row])
sample_metadata_file <- as.character(datasets_df$SampleInfoFile[row])
array_size <- datasets_df$ArraySize[row]
if (array_size == 450) {
array_type <- "IlluminaHumanMethylation450k"
} else if (array_size == 850) {
array_type <- "IlluminaHumanMethylationEPIC"
}
if (datatype == "geo_raw") {
sample_info_df <- read.csv(file.path(basedir, sample_metadata_file), stringsAsFactors = FALSE, row.names = 1)
samples_to_keep <- sample_info_df[sample_info_df$Keep %in% c("TRUE"), ]
raw_filename <- file.path(basedir, as.character(datasets_df$DataFile[row]))
u_name <- as.character(datasets_df$Uname[row])
m_name <- as.character(datasets_df$Mname[row])
separator <- stri_unescape_unicode(as.character(datasets_df$Separator[row]))
gmset <- readGEORawFile(
filename = raw_filename,
Uname = u_name,
Mname = m_name,
array = array_type,
sep = separator
)
gmset <- readGEORawFile(
filename = raw_filename,
Uname = u_name,
Mname = m_name,
array = array_type,
sep = separator,
pData = sample_info_df[sampleNames(gmset), ]
)
} else if (datatype == "idat") {
targets <- as.data.frame(read.metharray.sheet(basedir, pattern = sample_metadata_file))
targets_to_keep <- subset(targets, Keep == TRUE)
RGset <- read.metharray.exp(targets = targets_to_keep)
gmset <- preprocessIllumina(RGset)
gmset <- mapToGenome(gmset)
}
sampleNames(gmset) <- paste(name, gmset$accession, gmset$cell_type, gmset$cell_line, gmset$gender,
paste("p", gmset$passage, sep = ""),
sep = "_"
)
plot_group <- data.frame(group = name, sample_name = sampleNames(gmset), row.names = "sample_name")
if (row == 1) {
combined_arrays <- gmset
plot_groups_combined <- plot_group
} else {
combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k")
plot_groups_combined <- rbind(plot_groups_combined, plot_group)
}
}
```
## Minfi sex prediction
Now we will use minfi to run sex prediction on the data. We have also manually identified the sex of each cell line using either the paper that the cell lines are coming from, or using Cellosaurus for well established cell lines. We remove any cell lines that have a conflict between the predicted sex, and the manually annotated sex.
```{r}
# remove all cross reactive probes
# site_annotation_df <- getAnnotation(combined_arrays)
cross_reactive_probes <- read_csv("Chen-non-specific-probes-Illumina450k.csv.gz")
probes_to_keep <- !(rownames(combined_arrays) %in% cross_reactive_probes$TargetID)
filtered_combined_arrays <- combined_arrays[probes_to_keep,]
filtered_site_annotation_df <- getAnnotation(filtered_combined_arrays)
## Update the annotation to hg38 for better gene annotation and such
# First we create a fasta file with all the probes
library(seqinr)
# filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
write.fasta(as.list(filtered_site_annotation_df$SourceSeq), filtered_site_annotation_df$Name, "probe_sequences.fa")
```
Now we'll use blat to map the probes to the hg38 genome.
```{bash}
./blat -minIdentity=100 -maxGap=0 genomes/ucsc.hg38.fa probe_sequences.fa hg38_probe_mapping.psl
```
Finally we'll bring it back to R
```{r}
hg38_probe_mapping <- read_tsv("hg38_probe_mapping.psl",
skip=5,
col_names = c("match",
"mis_match",
"rep_match",
"Ns",
"Q_gapcount",
"Q_gapbases",
"T_gapcount",
"T_gapbases",
"strand",
"Q_name",
"Q_size",
"Q_start",
"Q_end",
"T_name",
"T_size",
"T_start",
"T_end",
"blockcount",
"blockSizes",
"qStarts",
"tStarts"))
full_mappings <- hg38_probe_mapping %>% filter(match >= (Q_size-3) & T_gapcount == 0)
uniquely_mapping_probes <- full_mappings %>% filter(!(Q_name %in% Q_name[duplicated(Q_name)]))
filtered_combined_arrays <- combined_arrays[uniquely_mapping_probes$Q_name,]
filtered_site_annotation_df <- getAnnotation(filtered_combined_arrays)
filtered_site_annotation_df$hg38_chromosome <- uniquely_mapping_probes$T_name
filtered_site_annotation_df$hg38_pos <- rowMeans(uniquely_mapping_probes %>% dplyr::select(T_start, T_end))
filtered_site_annotation_df <- rownames_to_column(as.data.frame(filtered_site_annotation_df))
## Annotating the hg38 genes using biomart
library(biomaRt)
library(GenomicRanges)
hg38_mart <- useMart('ensembl', dataset='hsapiens_gene_ensembl') # create a mart object
# in order to prevent timeout, we'll get this 1 chromosome at a time
# hg38_biomart_structure_df <- tibble(external_gene_name=character(),
# description=character(),
# chromosome_name=character(),
# start_position=numeric(),
# end_position=numeric(),
# strand=numeric(),
# ensembl_transcript_id=character(),
# ensembl_gene_id=character(),
# transcription_start_site=numeric(),
# transcript_start=numeric(),
# transcript_end=numeric(),
# `5_utr_start`=numeric(),
# `5_utr_end`=numeric(),
# `3_utr_start`=numeric(),
# `3_utr_end`=numeric())
# for (chrom_name in c(1:22, "X", "Y")){
# hg38_biomart_structure_df <- bind_rows(hg38_biomart_structure_df, getBM(mart=hg38_mart,
# filters = c( "chromosome_name"),
# values = list(chromosome_name=chrom_name),
# attributes=c('external_gene_name', 'description', 'chromosome_name',
# 'start_position', 'end_position', 'strand',
# 'ensembl_transcript_id',
# 'ensembl_gene_id', 'transcription_start_site',
# 'transcript_start', 'transcript_end',
# '5_utr_start', '5_utr_end', '3_utr_start', '3_utr_end')) %>% mutate(chromosome_name=as.character(chromosome_name)))
# }
#
# write_csv(hg38_biomart_structure_df, "hg38_biomart_structure_df.csv")
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv", col_types = cols(
external_gene_name = col_character(),
description = col_character(),
chromosome_name = col_character(),
start_position = col_double(),
end_position = col_double(),
strand = col_double(),
ensembl_transcript_id = col_character(),
ensembl_gene_id = col_character(),
transcription_start_site = col_double(),
transcript_start = col_double(),
transcript_end = col_double(),
`5_utr_start` = col_double(),
`5_utr_end` = col_double(),
`3_utr_start` = col_double(),
`3_utr_end` = col_double()))
hg38_biomart_feature_df <- getBM(mart=hg38_mart,
filters = c("transcript_gencode_basic", "chromosome_name", "transcript_tsl"),
values = list(transcript_gencode_basic=TRUE, chromosome_name=c(1:22, "X", "Y") , transcript_tsl=TRUE),
attributes=c('ensembl_transcript_id', 'ensembl_gene_id',
'chromosome_name', 'start_position', 'end_position',
'transcript_tsl', 'transcript_appris', 'transcript_length'))
write_csv(hg38_biomart_feature_df, "hg38_biomart_feature_df.csv")
hg38_biomart_feature_df <- read_csv("hg38_biomart_feature_df.csv", col_types = cols("chromosome_name"="c"))
# hg38_biomart_df <- full_join(hg38_biomart_feature_df, hg38_biomart_structure_df, by=c("ensembl_transcript_id"))
biomart_genes_grange <- makeGRangesFromDataFrame(hg38_biomart_feature_df %>%
mutate(chrom=paste("chr", chromosome_name, sep = "")) %>%
dplyr::select(chrom, start=start_position, end=end_position))
cpg_probes_grange <- makeGRangesFromDataFrame(filtered_site_annotation_df %>%
mutate(end=hg38_pos) %>%
dplyr::select(chrom=hg38_chromosome, start=hg38_pos, end))
nearest_gene_idx <- nearest(cpg_probes_grange, biomart_genes_grange)
nearest_gene_distances_df <- as_tibble(distanceToNearest(cpg_probes_grange, biomart_genes_grange))
nearest_ensembl_genes <- hg38_biomart_feature_df[nearest_gene_idx,]$ensembl_gene_id
nearest_ensembl_genes[nearest_gene_distances_df$distance > 5000] <- ""
filtered_site_annotation_df$hg38_ensembl_gene_id <- nearest_ensembl_genes
tsl_levels <- unique(filter(hg38_biomart_feature_df, grepl("tsl1", transcript_tsl))$transcript_tsl)
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tsl2", transcript_tsl))$transcript_tsl))
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tsl3", transcript_tsl))$transcript_tsl))
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tsl4", transcript_tsl))$transcript_tsl))
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tsl5", transcript_tsl))$transcript_tsl))
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tslna", transcript_tsl))$transcript_tsl))
appris_levels <- unique(filter(hg38_biomart_feature_df, grepl("principal1", transcript_appris))$transcript_appris)
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("principal2", transcript_appris))$transcript_appris))
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("principal3", transcript_appris))$transcript_appris))
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("principal4", transcript_appris))$transcript_appris))
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("alternative1", transcript_appris))$transcript_appris))
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("alternative2", transcript_appris))$transcript_appris))
appris_levels <- c(appris_levels, "")
# i <- 0
# total <- nrow(filtered_site_annotation_df)
canonical_transcripts <- hg38_biomart_feature_df %>% group_by(ensembl_gene_id) %>%
arrange(factor(transcript_tsl, levels = tsl_levels),
factor(transcript_appris, levels = appris_levels),
desc(transcript_length),
) %>%
filter(row_number()==1)
write_csv(canonical_transcripts, "hg38_canonical_transcripts.csv")
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id) %>%
mutate(chrom=paste("chr", chromosome_name, sep=""))
gene_groups_df <- na.omit(hg38_biomart_structure_df_canonical %>%
mutate(group="5_UTR") %>%
dplyr::select(chrom, start=`5_utr_start`, end=`5_utr_end`, group, ensembl_gene_id, external_gene_name))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
mutate(group="3_UTR") %>%
dplyr::select(chrom, start=`3_utr_start`, end=`3_utr_end`, group, ensembl_gene_id, external_gene_name)))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
filter(strand == 1) %>%
distinct(ensembl_transcript_id, .keep_all = T) %>%
mutate(group="TSS200", start=(transcription_start_site - 200), end=transcription_start_site-1) %>%
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name)))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
filter(strand == 1) %>%
distinct(ensembl_transcript_id, .keep_all = T) %>%
mutate(group="TSS1500", start=(transcription_start_site - 1500), end=(transcription_start_site - 201)) %>%
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name)))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
filter(strand == -1) %>%
distinct(ensembl_transcript_id, .keep_all = T) %>%
mutate(group="TSS200", start=(transcription_start_site+1), end=(transcription_start_site+200)) %>%
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name)))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
filter(strand == -1) %>%
distinct(ensembl_transcript_id, .keep_all = T) %>%
mutate(group="TSS1500", start=(transcription_start_site + 201), end=(transcription_start_site + 1500)) %>%
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name)))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
filter(strand == 1) %>%
distinct(ensembl_transcript_id, .keep_all = T) %>%
mutate(group="OutsideTranscript3", start=(transcript_end+1), end=(transcript_end + 5000)) %>%
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name)))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
filter(strand == -1) %>%
distinct(ensembl_transcript_id, .keep_all = T) %>%
mutate(group="OutsideTranscript3",start=(transcript_start - 5000), end=(transcript_start-1)) %>%
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name)))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
filter(strand == 1) %>%
distinct(ensembl_transcript_id, .keep_all = T) %>%
mutate(group="OutsideTranscript5", start=(transcript_start - 5000), end=(transcript_start -1501)) %>%
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name)))
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>%
filter(strand == -1) %>%
distinct(ensembl_transcript_id, .keep_all = T) %>%
mutate(group="OutsideTranscript5", start=(transcript_end + 1501), end=(transcript_end + 5000)) %>%
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name)))
test_df <- right_join(filtered_site_annotation_df, gene_groups_df, by=c("hg38_ensembl_gene_id"="ensembl_gene_id")) %>%
filter((hg38_pos >= start & hg38_pos <= end))
filtered_site_annotation_df_test <- left_join(filtered_site_annotation_df, test_df %>% dplyr::select(rowname, hg38_gene_group=group), by="rowname")
filtered_site_annotation_df_test <- left_join(filtered_site_annotation_df_test,
hg38_biomart_structure_df_canonical %>% dplyr::select(ensembl_gene_id, hg38_gene_name=external_gene_name) %>% distinct(ensembl_gene_id, .keep_all = T),
by=c("hg38_ensembl_gene_id"="ensembl_gene_id"))
filtered_site_annotation_df_test[
(filtered_site_annotation_df_test$hg38_ensembl_gene_id != "") & is.na(filtered_site_annotation_df_test$hg38_gene_group),
"hg38_gene_group"] <- "Body"
filtered_site_annotation_df <- filtered_site_annotation_df_test
write_csv(filtered_site_annotation_df, "filtered_site_annotation_df.csv")
x_annotation_df <- filtered_site_annotation_df[filtered_site_annotation_df$hg38_chromosome == "chrX", ]
write_csv(x_annotation_df, "x_annotation.csv")
x_sites <- x_annotation_df$Name
# compute predicted sex using getSex function from minfi
sex_prediction <- as.data.frame(getSex(filtered_combined_arrays))[sampleNames(combined_arrays), ]
# get all sample names that have a predicted sex of female, and an annotated sex of female
all_female_sample_names <- sampleNames(filtered_combined_arrays)[(filtered_combined_arrays$Keep %in% c("TRUE")) &
(filtered_combined_arrays$gender == "female") &
(sex_prediction$predictedSex == "F")]
all_male_sample_names <- sampleNames(filtered_combined_arrays)[(filtered_combined_arrays$Keep %in% c("TRUE")) &
(filtered_combined_arrays$gender == "male") &
(sex_prediction$predictedSex == "M")]
all_beta_mat <- getBeta(filtered_combined_arrays)
write_csv(as.data.frame(all_beta_mat) %>% rownames_to_column(), "all_beta_mat.csv.gz")
# get a matrix of the beta values for all the female samples we will keep
female_beta_mat <- na.omit(all_beta_mat[, all_female_sample_names])
write_csv(as.data.frame(female_beta_mat) %>% rownames_to_column(), "female_beta_mat.csv.gz")
# get a matrix of the beta values for all the male samples we will keep
male_beta_mat <- na.omit(all_beta_mat[, all_male_sample_names])
write_csv(as.data.frame(male_beta_mat) %>% rownames_to_column(), "male_beta_mat.csv.gz")
sample_annotation_df <- as.data.frame(pData(filtered_combined_arrays))
sample_annotation_df[c(all_female_sample_names, all_male_sample_names), "gender_matches"] <- TRUE
sample_annotation_df <- rownames_to_column(sample_annotation_df, var = "sample_name")
write.csv(sample_annotation_df, file="sample_data_combined_all_info.csv")
```
Next we will look at the female and male samples to identify outliers, and remove them.
```{r}
extrafont::loadfonts(device="win")
library(ggplot2)
library(ggfortify)
library(grid)
library(gridExtra)
library(ggsci)
library(viridis)
# create a PCA plot of the female samples, and color by different attributes.
female_all_x_df <- female_beta_mat[rownames(female_beta_mat) %in% x_annotation_df$Name,]
female_avg_x_methylation_df <- data.frame(meanXMethylation=colMeans(na.omit(female_all_x_df)))
female_avg_x_methylation_df <- rownames_to_column(female_avg_x_methylation_df, var = "sample_name")
female_pca <- prcomp(t(na.omit(female_all_x_df)))
female_pca_df <- rownames_to_column(as.data.frame(female_pca$x), var = "sample_name")
female_pca_df <- left_join(female_pca_df, sample_annotation_df, by = "sample_name")
female_pca_df <- left_join(female_pca_df, female_avg_x_methylation_df, by = "sample_name")
female_pov <- sprintf("%2.2f%%", 100* female_pca$sdev^2/sum(female_pca$sdev^2))
female_pc1_label <- sprintf("PC1 (%s)", female_pov[1])
female_pc2_label <- sprintf("PC2 (%s)", female_pov[2])
base_plot_theme <- theme(plot.title = element_text(hjust = 0.5, size=6, margin = margin(b=.5, unit = "mm"), face="bold"),
legend.position="bottom",
legend.justification = "right",
legend.margin = margin(t=0,r=0,b=0,l=0, unit="mm"),
legend.box.margin = margin(t=-3, unit="mm"),
legend.text = element_text(size=5, margin = margin(r=.5, l=0, unit="mm")),
legend.title = element_text(size=5, face="bold", margin = margin(r=1, unit="mm")),
legend.key.width = unit(2, units="mm"),
legend.key.height = unit(2, units="mm"),
legend.key.size = unit(.2, units = "mm"),
axis.title = element_text(size=6, margin=margin(t=0,r=0,b=0,l=0, unit = "mm")),
axis.text = element_text(size = 6),
plot.margin = unit(c(1,1,1,1), units="mm"),
panel.grid.major = element_line(size=.3),
panel.grid.minor = element_line(size=.2),
panel.border = element_rect(size = .3),
axis.ticks = element_line(size = .3))
pca_female_by_group <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=group_accession), size=.5, shape=20) +
theme_bw() +
labs(x=female_pc1_label,
y=female_pc2_label,
title = "Studies",
color="Study")+
base_plot_theme +
scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828", "GSE34982", "GSE73938"),
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,3,4)]))+
# scale_color_futurama() +
guides(color=guide_legend(nrow=3,byrow=TRUE)) +
theme(legend.key.width = unit(1, units="mm"),
legend.key.height = unit(1, units="mm"),
legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")),
legend.box.margin = margin(t=-3,l=-5, unit="mm"),
legend.spacing.y=unit(.2, "mm"))
pca_female_by_methylation <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=meanXMethylation, shape = (cell_line == "WA09")), size=.5) +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
labs(x=female_pc1_label,
y=female_pc2_label,
title = "Mean X Methylation",
color="Methylation")+
theme_bw() +
base_plot_theme +
theme(legend.title = element_text(vjust = .95),
legend.box.margin = margin(t=-3,l=-6, unit="mm"),
legend.key.width = unit(3, units = "mm")) +
scale_color_viridis_c(breaks=c(.4,.5,.6), minor_breaks=waiver())
#add color for H9
pca_female_by_cell_line <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=cell_line, shape = (cell_line == "WA09")), size=.5) +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F)) +
geom_point(data = female_pca_df %>% filter(cell_line == "WA09"),
mapping = aes(x=PC1, y=PC2), size=.5, color="#000000", shape=17) +
labs(x=female_pc1_label,
y=female_pc2_label,
title="Cell Lines",
shape="Cell Line") +
theme_bw() +
base_plot_theme +
scale_color_discrete(guide=F)
pca_female_by_cell_type <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=cell_type), size=.5, shape=20) +
labs(x=female_pc1_label,
y=female_pc2_label,
title = "Cell Type",
color="Cell Type")+
theme_bw() +
base_plot_theme +
scale_color_startrek()
arranged_female_pca <- grid.arrange(pca_female_by_methylation,
pca_female_by_group,
pca_female_by_cell_type,
pca_female_by_cell_line,
top = textGrob("PCA of Female Samples",gp=gpar(fontsize=7, fontface="bold")), nrow=1)
ggsave("figs/supplementary/female_pca.pdf", arranged_female_pca, width = 139.2, height=41, units = "mm")
```
## Removal of overrepresented H9 (a.k.a. WA09) lines from female samples
```{r}
h9_lines <- filter(sample_annotation_df, cell_line == 'WA09' & gender_matches == T)$sample_name
female_all_x_df_only_h9 <- dplyr::select(as.data.frame(female_all_x_df), all_of(h9_lines))
write_csv(female_all_x_df_only_h9, "female_all_x_df_only_h9.csv.gz")
female_sig_x_sites_df_only_h9 <- female_all_x_df_only_h9[sig_x_sites,]
write_csv(female_sig_x_sites_df_only_h9 %>% rownames_to_column(), "female_sig_x_sites_df_only_h9.csv.gz")
female_all_x_df_no_h9 <- dplyr::select(as.data.frame(female_all_x_df), -h9_lines)
female_no_h9_pca <- prcomp(t(na.omit(female_all_x_df_no_h9)))
female_no_h9_pca_df <- rownames_to_column(as.data.frame(female_no_h9_pca$x), var = "sample_name")
female_no_h9_pca_df <- left_join(female_no_h9_pca_df, sample_annotation_df, by = "sample_name")
female_no_h9_pca_df <- left_join(female_no_h9_pca_df, female_avg_x_methylation_df, by = "sample_name")
# female_no_h9_pca_df <- left_join(female_no_h9_pca_df, tidy(colMeans(female_all_x_df_no_h9)) %>% rename(sample_name=names, meanXMethylation=x), by = "sample_name")
female_no_h9_pov <- sprintf("%2.2f%%", 100* female_no_h9_pca$sdev^2/sum(female_no_h9_pca$sdev^2))
female_no_h9_pc1_label <- sprintf("PC1 (%s)", female_no_h9_pov[1])
female_no_h9_pc2_label <- sprintf("PC2 (%s)", female_no_h9_pov[2])
pca_female_no_h9_by_group <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=group_accession), size=.5, shape=20) +
theme_bw() +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title = "Studies",
color="Study")+
base_plot_theme +
theme(legend.key.width = unit(1, units="mm"),
legend.key.height = unit(1, units="mm"),
legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")),
legend.box.margin = margin(t=-3,l=-5, unit="mm"),
legend.spacing.y=unit(.2, "mm")) +
scale_color_futurama()
pca_female_no_h9_by_methylation <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=meanXMethylation), size=.5, shape=20) +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title = "Mean X Methylation",
color="Methylation")+
theme_bw() +
base_plot_theme +
theme(legend.title = element_text(vjust = .95, margin=margin(l=-3, unit="mm")),
legend.box.margin = margin(t=-3,l=-10, unit="mm"),
legend.key.width = unit(3, units = "mm")) +
scale_color_viridis_c(breaks=c(.4,.5,.6), minor_breaks=waiver())
pca_female_no_h9_by_cell_line <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=cell_line, shape = (cell_line == "WA09")), size=.5) +
scale_shape_manual(values = c(16, 17), labels=c("Other", "H9")) +
geom_point(data = female_no_h9_pca_df %>% filter(cell_line == "WA09"),
mapping = aes(x=PC1, y=PC2), size=.5, color="#000000", shape=17) +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title="Cell Lines",
shape="Cell Line") +
theme_bw() +
base_plot_theme +
scale_color_discrete(guide=F)
pca_female_no_h9_by_cell_type <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=cell_type), size=.5, shape=20) +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title = "Cell Type",
color="Cell Type")+
theme_bw() +
base_plot_theme +
scale_color_startrek()
arranged_female_no_h9_pca <- grid.arrange(pca_female_no_h9_by_methylation,
pca_female_no_h9_by_group,
pca_female_no_h9_by_cell_type,
pca_female_no_h9_by_cell_line,
top = textGrob("PCA of Female Samples - No H9",gp=gpar(fontsize=6)))
ggsave("figs/supplementary/female_no_h9_pca.pdf",arranged_female_no_h9_pca, width = 80, height=85, units = 'mm')
arranged_female_no_h9_pca <- grid.arrange(pca_female_no_h9_by_methylation,
top = textGrob("Female - No H9", gp=gpar(fontsize=7, fontface="bold")))
ggsave("figs/supplementary/female_no_h9_pca_methylation.svg",arranged_female_no_h9_pca, width = 34.8, height=41, units = 'mm')
```
## Create PCA Plots of the male samples
```{r,dpi=600, fig.width=7, fig.height=7}
male_all_x_df <- as.data.frame(male_beta_mat[rownames(male_beta_mat) %in% x_annotation_df$Name,])
male_avg_x_methylation_df <- data.frame(meanXMethylation=colMeans(na.omit(male_all_x_df)))
male_avg_x_methylation_df <- rownames_to_column(male_avg_x_methylation_df, var = "sample_name")
male_pca <- prcomp(t(na.omit(male_all_x_df)))
male_pca_df <- rownames_to_column(as.data.frame(male_pca$x), var = "sample_name")
male_pca_df <- left_join(male_pca_df, sample_annotation_df, by = "sample_name")
male_pca_df <- left_join(male_pca_df, male_avg_x_methylation_df, by = "sample_name")
male_pov <- sprintf("%2.2f%%", 100* male_pca$sdev^2/sum(male_pca$sdev^2))
male_pc1_label <- sprintf("PC1 (%s)", male_pov[1])
male_pc2_label <- sprintf("PC2 (%s)", male_pov[2])
pca_male_by_group <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=group_accession), size=.5, shape=20) +
theme_bw() +
labs(x=male_pc1_label,
y=male_pc2_label,
title = "Studies",
color="Study")+
base_plot_theme +
guides(color=guide_legend(nrow=2,byrow=TRUE)) +
theme(legend.key.width = unit(1, units="mm"),
legend.key.height = unit(1, units="mm"),
legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")),
legend.box.margin = margin(t=-3,l=-5, unit="mm"),
legend.spacing.y=unit(.2, "mm")) +
scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828"),
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2)]))
pca_male_by_methylation <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=meanXMethylation), size=.5, shape=20) +
labs(x=male_pc1_label,
y=male_pc2_label,
title = "Mean X Methylation",
color="Methylation")+
theme_bw() +
base_plot_theme +
theme(legend.title = element_text(vjust = .95),
legend.box.margin = margin(t=-3,l=-6, unit="mm"),
legend.key.width = unit(3, units = "mm")) +
scale_color_viridis_c()
pca_male_by_cell_line <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=cell_line), size=.5, shape=20) +
labs(x=male_pc1_label,
y=male_pc2_label,
title="Cell Lines",
shape="Cell Line") +
theme_bw() +
base_plot_theme +
scale_color_discrete(guide=F)
pca_male_by_cell_type <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=cell_type), size=.5, shape=20) +
labs(x=male_pc1_label,
y=male_pc2_label,
title = "Cell Type",
color="Cell Type")+
theme_bw() +
base_plot_theme +
scale_color_startrek()
arranged_male_pca <- grid.arrange(pca_male_by_methylation,
pca_male_by_group,
pca_male_by_cell_type,
pca_male_by_cell_line,
top = textGrob("PCA of Male Samples", gp=gpar(fontsize=7, fontface="bold")), nrow=1)
ggsave("figs/supplementary/male_pca.pdf", arranged_male_pca, width = 139.2, height=41, units = 'mm')
```
## Identification and removal of male outliers
```{r}
male_outlier_sample_names <- (male_pca_df %>% filter(PC1 > 20))$sample_name
male_no_outliers_all_x_df <- as.data.frame(male_beta_mat[rownames(male_beta_mat) %in% x_annotation_df$Name,])
male_no_outliers_all_x_df <- dplyr::select(male_all_x_df, -male_outlier_sample_names)
male_no_outliers_pca <- prcomp(t(na.omit(male_no_outliers_all_x_df)))
male_no_outliers_pca_df <- rownames_to_column(as.data.frame(male_no_outliers_pca$x), var = "sample_name")
male_no_outliers_pca_df <- left_join(male_no_outliers_pca_df, sample_annotation_df, by = "sample_name")
male_no_outliers_pca_df <- left_join(male_no_outliers_pca_df, male_avg_x_methylation_df, by = "sample_name")
male_no_outliers_pov <- sprintf("%2.2f%%", 100* male_no_outliers_pca$sdev^2/sum(male_no_outliers_pca$sdev^2))
male_no_outliers_pc1_label <- sprintf("PC1 (%s)", male_no_outliers_pov[1])
male_no_outliers_pc2_label <- sprintf("PC2 (%s)", male_no_outliers_pov[2])
pca_male_no_outliers_by_group <- ggplot(male_no_outliers_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=group_accession), size=.5, shape=20) +
theme_bw() +
labs(x=male_no_outliers_pc1_label,
y=male_no_outliers_pc2_label,
title = "Studies",
color="Study")+
base_plot_theme +
scale_color_futurama()
pca_male_no_outliers_by_methylation <- ggplot(male_no_outliers_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=meanXMethylation), size=.5, shape=20) +
labs(x=male_no_outliers_pc1_label,
y=male_no_outliers_pc2_label,
title = "Mean X Methylation",
color="Methylation")+
theme_bw() +
base_plot_theme +
theme(legend.title = element_text(vjust = .95),
legend.box.margin = margin(t=-3,l=-6, unit="mm"),
legend.key.width = unit(3, units = "mm")) +
scale_color_viridis_c()
pca_male_no_outliers_by_cell_line <- ggplot(male_no_outliers_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=cell_line), size=.5, shape=20) +
labs(x=male_no_outliers_pc1_label,
y=male_no_outliers_pc2_label,
title="Cell Lines",
shape="Cell Line") +
theme_bw() +
base_plot_theme +
scale_color_discrete(guide=F)
pca_male_no_outliers_by_cell_type <- ggplot(male_no_outliers_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=cell_type), size=.5, shape=20) +
labs(x=male_no_outliers_pc1_label,
y=male_no_outliers_pc2_label,
title = "Cell Type",
color="Cell Type")+
theme_bw() +
base_plot_theme +
scale_color_startrek()
arranged_male_no_outliers_pca <- grid.arrange(pca_male_no_outliers_by_methylation,
pca_male_no_outliers_by_group,
pca_male_no_outliers_by_cell_type,
pca_male_no_outliers_by_cell_line,
top = textGrob("PCA of Male Samples - No outliers",gp=gpar(fontsize=6)))
ggsave("figs/supplementary/male_no_outliers_pca.pdf", arranged_male_no_outliers_pca, width = 80, height=85, units = 'mm')
arranged_male_no_outliers_pca <- grid.arrange(pca_male_no_outliers_by_methylation,
top = textGrob("Male - No outliers",gp=gpar(fontsize=7, fontface="bold")))
ggsave("figs/supplementary/male_no_outliers_pca_meth.svg", arranged_male_no_outliers_pca, width = 34.8, height=40, units = 'mm')
```
```{r}
library(data.table)
# have to use fwrite from package data.table because write.csv couldn't handle the large matrix. This will save the data to a file.
female_df_no_h9 <- as.data.frame(female_beta_mat)
female_df_no_h9 <- dplyr::select(female_df_no_h9, -h9_lines)
female_df_no_h9_chrX <- female_df_no_h9[x_annotation_df$Name,]
female_df_no_h9 <- as.matrix(female_df_no_h9)
female_df_no_h9_chrX <- as.matrix(female_df_no_h9_only_x)
fwrite(as.data.table(na.omit(female_df_no_h9),
keep.rownames=TRUE,
na.rm=TRUE),
file='female_df_no_h9.csv',
row.names = FALSE)
fwrite(as.data.table(na.omit(female_df_no_h9_chrX),
keep.rownames=TRUE,
na.rm=TRUE),
file='female_df_no_h9_chrX.csv',
row.names = FALSE)
male_df_no_outliers <- as.data.frame(male_beta_mat)
male_df_no_outliers <- dplyr::select(male_df_no_outliers, -male_outlier_sample_names)
male_df_no_outliers <- as.matrix(male_df_no_outliers)
fwrite(as.data.table(na.omit(male_df_no_outliers), keep.rownames=TRUE, na.rm=TRUE),
file='male_df_no_outliers.csv',
row.names = FALSE)
```
Now we look at the distribution of values in female and male samples.
```{r}
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz")
male_df_no_outliers <- read_csv("male_df_no_outliers.csv.gz")
x_annotation_df <- read_csv("x_annotation.csv.gz")
male_df_no_outliers_chrX <- male_df_no_outliers %>% filter(rn %in% x_annotation_df$rowname)
xchr_beta_dist <- ggplot() +
geom_violin(aes(x="female", y=as_vector(female_df_no_h9_chrX %>% select(-rn)), fill='female'), trim=FALSE, show.legend = F, size=.3) +
geom_violin(aes(x="male", y=as_vector(male_df_no_outliers_chrX %>% select(-rn)), fill='male'), trim=FALSE, show.legend = F, size=.3) +
geom_boxplot(aes(x="female", y=as_vector(female_df_no_h9_chrX %>% select(-rn))), width=0.025, size=.3) +
geom_boxplot(aes(x="male", y=as_vector(male_df_no_outliers_chrX %>% select(-rn))), width=0.025, size=.3) +
labs(x="Sex",
y='β-value',
title="X-Chr CpG β-Values") +
theme_bw() +
base_plot_theme +
scale_fill_npg()
ggsave('figs/main/fig1_xchr_beta_value_distributions.pdf',xchr_beta_dist, width = 36, height = 40, units = 'mm')
xchr_beta_dist
```
## Comparing female lines to male lines
Here we compare the female and male lines to observe the differences between them.
```{r}
library(broom)
# base_plot_theme <- theme(plot.title = element_text(hjust = 0.5, size=9),
# legend.position="bottom",
# legend.justification = "right",
# legend.title = element_text(size = 6, margin = margin(r=2, unit="mm")),
# legend.text = element_text(size = 6, margin = margin(r=2, unit="mm")),
# legend.key.height = unit(2, "mm"),
# legend.key.width = unit(1, "mm"),
# legend.margin = margin(t=0,r=0,b=0,l=0, unit="mm"),
# axis.title = element_text(size=8),
# axis.text = element_text(size = 6))
# all_beta_mat_subset <- all_beta_mat[x_annotation_df$Name, setdiff(c(colnames(female_all_x_df_no_h9), colnames(male_no_outliers_all_x_df)), c('Chromosome', 'Gene', 'Position'))]
male_female_beta_mat <- full_join(female_df_no_h9_chrX, male_df_no_outliers_chrX, by="rn") %>% select(-rn)
female_male_avg_x_methylation_df <- colMeans(na.omit(male_female_beta_mat)) %>%
tidy() %>%
dplyr::rename(sample_name=names, meanXMethylation=x)
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv")
female_male_pca <- prcomp(t(na.omit(male_female_beta_mat)))
female_male_pca_df <- rownames_to_column(as.data.frame(female_male_pca$x), var = "sample_name")
female_male_pca_df <- left_join(female_male_pca_df, sample_annotation_df, by = "sample_name")
female_male_pca_df <- left_join(female_male_pca_df, female_male_avg_x_methylation_df, by = "sample_name")
female_male_pov <- sprintf("%2.2f%%", 100* female_male_pca$sdev^2/sum(female_male_pca$sdev^2))
female_male_pc1_label <- sprintf("PC1 (%s)", female_male_pov[1])
female_male_pc2_label <- sprintf("PC2 (%s)", female_male_pov[2])
female_male_pca_plot <- ggplot(female_male_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=gender), size=.5, shape=20) +
labs(x=female_male_pc1_label,
y=female_male_pc2_label,
title = "PCA of Female and Male",
color="Sex")+
theme_bw() +
base_plot_theme +
scale_color_npg()
ggsave("figs/main/fig1_female_male_pca.pdf", female_male_pca_plot, height = 40, width = 36, units = 'mm')
female_male_pca_plot
```
## Looking at global methylation patterns
```{r}
library(tidyverse)
library(ggsignif)
library(ggsci)
library(viridis)
library(grid)
library(gridExtra)
female_df_no_h9 <- read_csv("female_df_no_h9.csv.gz")
male_df_no_outliers <- read_csv("male_df_no_outliers.csv.gz")
x_annotation_df <- read_csv("x_annotation.csv.gz")
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz")
sex_chromosome_sites <- (filtered_site_annotation_df %>% filter(hg38_chromosome %in% c("chrX", "chrY")))$rowname
female_means <- colMeans(female_df_no_h9 %>% dplyr::select(-rn))
female_x_chr_means <- colMeans(female_df_no_h9 %>% filter(rn %in% x_annotation_df$rowname) %>% dplyr::select(-rn))
xist_sites <- (x_annotation_df %>% filter(grepl("XIST",UCSC_RefGene_Name)))$rowname
female_xist_means <- colMeans(female_df_no_h9 %>% filter(rn %in% xist_sites) %>% dplyr::select(-rn))
female_autosome_means <- colMeans(female_df_no_h9 %>% filter(!(rn %in% sex_chromosome_sites)) %>% dplyr::select(-rn))
male_means <- colMeans(male_df_no_outliers %>% dplyr::select(-rn))
male_x_chr_means <- colMeans(male_df_no_outliers %>% filter(rn %in% x_annotation_df$rowname) %>% dplyr::select(-rn))
male_xist_means <- colMeans(male_df_no_outliers %>% filter(rn %in% xist_sites) %>% dplyr::select(-rn))
male_autosome_means <- colMeans(male_df_no_outliers %>% filter(!(rn %in% sex_chromosome_sites)) %>% dplyr::select(-rn))
means <- tibble(
sex=character(),
sample_name=character(),
mean_methylation=numeric(),
mean_x_methylation=numeric(),
mean_xist_methylation=numeric(),
mean_autosome_methylation=numeric()
)
for (sample in names(female_means)){
means <- add_row(means, sex="female",
sample_name=sample,
mean_methylation=female_means[sample],
mean_x_methylation=female_x_chr_means[sample],
mean_xist_methylation=female_xist_means[sample],
mean_autosome_methylation=female_autosome_means[sample])
}
for (sample in names(male_means)){
means <- add_row(means, sex="male",
sample_name=sample,
mean_methylation=male_means[sample],
mean_x_methylation=male_x_chr_means[sample],
mean_xist_methylation=male_xist_means[sample],
mean_autosome_methylation=male_autosome_means[sample])
}
lm_eqn <- function(m) {
# Displays regression line equation and R^2 value on plot
# Usage:
# p + annotate("text", x=25, y=300, label=lm_eqn(lm(y ~ x, df)), parse=TRUE)
l <- list(a = format(coef(m)[1], digits = 2),
b = format(abs(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3));
if (coef(m)[2] >= 0) {
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
} else {
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
}
as.character(as.expression(eq));
}
global_mean_methylation <- ggplot(means, aes(x=sex, group=sex, y=mean_methylation, fill=sex)) +
geom_boxplot(show.legend = F, size=.3, outlier.size = .5, outlier.shape = 20) +
geom_signif(map_signif_level=TRUE, comparisons = list(c("female", "male")), size=.3, textsize = 2) +
labs(x="Sex", y="Global Mean β-values", title = "Global Mean\nMethylation") +
theme_bw() +
base_plot_theme +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0.4,0.65)) +
scale_fill_npg()
autosomal_mean_methylation <- ggplot(means, aes(x=sex, group=sex, y=mean_autosome_methylation, fill=sex)) +
geom_boxplot(show.legend = F, size=.3, outlier.size = .5, outlier.shape = 20) +
geom_signif(map_signif_level=TRUE, comparisons = list(c("female", "male")), size=.3, textsize = 2) +
labs(x="Sex", y="Autosomal Mean Methylation β-value", title = "Autosomal Mean Methylation By Sex") +
theme_bw() +
base_plot_theme +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0.4,0.65)) +
scale_fill_npg()
# ggsave("figs/supplementary/Male vs Female Global Methylation.svg", units = "mm", width = 84, height = 84)
female_means <- means %>% filter(sex=="female")
sex_colors <- pal_npg("nrc")(2)
correlation <- sprintf("corr = %.2f", cor(female_means$mean_x_methylation, female_means$mean_methylation))
female_x_global <- ggplot(female_means, aes(x=mean_x_methylation, y=mean_methylation)) +
geom_point(color=sex_colors[1],size=.7) +
labs(x="Mean X Chr Methylation", y="Mean Global Methylation", title="Female Samples - X Chr vs Global Meth") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
annotate("text", x=.55, y=.45, label=correlation, size=3) +
theme_bw() +
base_plot_theme
# ggsave("figs/Female Samples - X Chr vs Global Meth.png")
correlation <- sprintf("corr = %.2f", cor(female_means$mean_x_methylation, female_means$mean_autosome_methylation))
female_x_autosome <- ggplot(female_means, aes(x=mean_x_methylation, y=mean_autosome_methylation)) +
geom_point(color=sex_colors[1],size=.7) +
labs(x="Mean X Chr Methylation", y="Mean Autosomal Methylation", title="Female Samples - X Chr vs Autosomal Meth") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
annotate("text", x=.55, y=.45, label=correlation, size=3) +
theme_bw() +
base_plot_theme
# ggsave("figs/Female Samples - X Chr vs Autosomal Meth.svg", height=55, width=84, units = "mm")
correlation <- sprintf("corr = %.2f", cor(female_means$mean_x_methylation, female_means$mean_xist_methylation))
female_x_xist <- ggplot(means %>% filter(sex=="female"), aes(x=mean_x_methylation, y=mean_xist_methylation)) +
geom_point(color=sex_colors[1],size=.5, shape=20) +
labs(x="Mean X Chr Methylation", y="Mean XIST Methylation", title="Female Samples - X Chr vs XIST Meth") +
# geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.43, y=.77, label=lm_eqn(lm(mean_xist_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
# annotate("text", x=.43, y=.75, label=correlation, size=3) +
theme_bw() +
base_plot_theme
ggsave("figs/main/fig1_female_x_v_xist.svg",female_x_xist, height=60, width=58, units = "mm")
correlation <- sprintf("corr = %.2f", cor(female_means$mean_methylation, female_means$mean_xist_methylation))
female_xist_global <- ggplot(means %>% filter(sex=="female"), aes(x=mean_xist_methylation, y=mean_methylation)) +
geom_point(color=sex_colors[1],size=.7) +
labs(x="Mean XIST Methylation", y="Mean Global Methylation", title="Female Samples - XIST vs Global Meth")+
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.73, y=.48, label=lm_eqn(lm(mean_methylation ~ mean_xist_methylation, female_means)), parse=TRUE) +
annotate("text", x=.73, y=.47, label=correlation, size=3) +
theme_bw() +
base_plot_theme
# ggsave("figs/Female Samples - XIST vs Global Meth.png")
correlation <- sprintf("corr = %.2f", cor(female_means$mean_autosome_methylation, female_means$mean_xist_methylation))
female_xist_autosome <- ggplot(means %>% filter(sex=="female"), aes(x=mean_xist_methylation, y=mean_autosome_methylation)) +
geom_point(color=sex_colors[1],size=.7) +
labs(x="Mean XIST Methylation", y="Mean Autosomal Methylation", title="Female Samples - XIST vs Autosomal Meth")+
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.73, y=.48, label=lm_eqn(lm(mean_autosome_methylation ~ mean_xist_methylation, female_means)), parse=TRUE) +
annotate("text", x=.73, y=.47, label=correlation, size=3) +
theme_bw() +
base_plot_theme
# ggsave("figs/Female Samples - XIST vs Autosomal Meth.png")
male_means <- means %>% filter(sex=="male")
correlation <- sprintf("corr = %.2f", cor(male_means$mean_x_methylation, male_means$mean_methylation))
male_x_global <- ggplot(male_means, aes(x=mean_x_methylation, y=mean_methylation)) +
geom_point(color=sex_colors[2],size=.7) +
labs(x="Mean X Chr Methylation", y="Mean Global Methylation", title="Male Samples - X Chr vs Global Meth") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.46, y=.49, label=lm_eqn(lm(mean_methylation ~ mean_x_methylation, male_means)), parse=TRUE) +
annotate("text", x=.46, y=.48, label=correlation, size=3) +
theme_bw() +
base_plot_theme
# ggsave("figs/Male Samples - X Chr vs Global Meth.png")
correlation <- sprintf("corr = %.2f", cor(male_means$mean_x_methylation, male_means$mean_autosome_methylation))
male_x_autosome <- ggplot(male_means, aes(x=mean_x_methylation, y=mean_autosome_methylation)) +
geom_point(color=sex_colors[2],size=.7) +
labs(x="Mean X Chr Methylation", y="Mean Autosomal Methylation", title="Male Samples - X Chr vs Autosomal Meth") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.46, y=.49, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, male_means)), parse=TRUE) +
annotate("text", x=.46, y=.48, label=correlation, size=3)+
theme_bw() +
base_plot_theme
# ggsave("figs/Male Samples - X Chr vs Autosomal Meth.svg", height=55, width=84, units = "mm")
correlation <- sprintf("corr = %.2f", cor(male_means$mean_x_methylation, male_means$mean_xist_methylation))
male_x_xist <- ggplot(means %>% filter(sex=="male"), aes(x=mean_x_methylation, y=mean_xist_methylation)) +
geom_point(color=sex_colors[2],size=.7) +
labs(x="Mean X Chr Methylation", y="Mean XIST Methylation", title="Male Samples - X Chr vs XIST Meth") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.41, y=.92, label=lm_eqn(lm(mean_xist_methylation ~ mean_x_methylation, male_means)), parse=TRUE) +
annotate("text", x=.41, y=.91, label=correlation, size=3) +
theme_bw() +
base_plot_theme
# ggsave("figs/Male Samples - X Chr vs XIST Meth.png")
correlation <- sprintf("corr = %.2f", cor(male_means$mean_methylation, male_means$mean_xist_methylation))
male_xist_global <- ggplot(means %>% filter(sex=="male"), aes(x=mean_xist_methylation, y=mean_methylation)) +
geom_point(color=sex_colors[2],size=.7) +
labs(x="Mean XIST Methylation", y="Mean Global Methylation", title="Male Samples - XIST vs Global Meth")+
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.90, y=.475, label=lm_eqn(lm(mean_methylation ~ mean_xist_methylation, male_means)), parse=TRUE) +
annotate("text", x=.90, y=.465, label=correlation, size=3) +
theme_bw() +
base_plot_theme
# ggsave("figs/Male Samples - XIST vs Global Meth.png")
correlation <- sprintf("corr = %.2f", cor(male_means$mean_autosome_methylation, male_means$mean_xist_methylation))
male_xist_autosome <- ggplot(means %>% filter(sex=="male"), aes(x=mean_xist_methylation, y=mean_autosome_methylation)) +
geom_point(color=sex_colors[2],size=.7) +
labs(x="Mean XIST Methylation", y="Mean Autosomal Methylation", title="Male Samples - XIST vs Autosomal Meth")+
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.90, y=.48, label=lm_eqn(lm(mean_autosome_methylation ~ mean_xist_methylation, male_means)), parse=TRUE) +
annotate("text", x=.90, y=.47, label=correlation, size=3) +
theme_bw() +
base_plot_theme
# ggsave("figs/Male Samples - XIST vs Autosomal Meth.png")
supp_5 <- grid.arrange(female_x_global, male_x_global,
female_xist_global, male_xist_global,
female_xist_autosome, male_xist_autosome,
male_x_autosome, male_x_xist, nrow=4, padding=unit(4, "mm"))
ggsave("figs/supplementary/supp_5.pdf",supp_5, width=174, height=236, unit="mm")
```
## Choosing sites that are more variant in the female lines
We run the Brown-Forsythe test using a custom program written in GoLang due to the advantages in its speed.
First we'll save our data from R into CSVs that can be read by the GoLang program.
Now we run the stats-runner-for-beta-matrix program written in GoLang on these CSV files, and get an output of variance_statistics_noh9_nooutliers.csv. This will calculate the variances and perform the brown-forsythe test.
```{bash}
./stats-runner-for-beta-matrix/stats-runner-for-beta-matrix -bf -a "female_df_no_h9.csv" -b "male_df_no_outliers.csv" -o "variance_statistics_noh9_nooutliers"
```
Now we can bring the variance statistics back into R, and continue working with it.
```{r}
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv.gz')
x_sites <- x_annotation_df$rowname
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz")
sex_chromosome_sites <- (filtered_site_annotation_df %>% filter(hg38_chromosome %in% c("chrX", "chrY")))$rowname
variance_density_plot_autosome <- ggplot(statistics[!(statistics$Position %in% sex_chromosome_sites),]) +
geom_density(aes(x=femaleVariance, color="female", fill="female"), trim=F, size=.3, alpha=.2) +
geom_density(aes(x=maleVariance, color="male", fill="male"), trim=F, size=.3, alpha=.2) +
labs(title="Autosomal CpG β Variance",
x="β-Value Variance Across Samples",
y="Density") +
theme_bw() +
base_plot_theme +
scale_color_npg(name="Sex") +
scale_fill_npg(name="Sex")
variance_ks <- ks.test(ggplot_build(variance_density_plot_autosome)$data[[1]]$y, ggplot_build(variance_density_plot_autosome)$data[[2]]$y)
ks_dist <- sprintf("K.S. Distance = %.2f", as.numeric(variance_ks$statistic))
variance_density_plot_autosome <- variance_density_plot_autosome +
annotate("text", x=.26, y=12, label=ks_dist, size=2)
ggsave("figs/main/fig1_variance_density_plot_autosome.pdf", variance_density_plot_autosome, width=36, height=40, units="mm")
# ks.test(filter(statistics, !(Position %in% sex_chromosome_sites))$femaleVariance, filter(statistics, !(Position %in% sex_chromosome_sites))$maleVariance)
ggsave("figs/main/fig1a.svg",variance_density_plot_autosome, width = 86.999, height = 78.666, units = "mm" )
variance_density_plot <- ggplot(statistics[statistics$Position %in% x_sites,]) +
geom_density(aes(x=femaleVariance, color="female", fill="female"), trim=F, size=.3, alpha=.2) +
geom_density(aes(x=maleVariance, color="male", fill="male"), trim=F, size=.3, alpha=.2) +
labs(title="X-Chr CpG β Variance",
x="β-Value Variance Across Samples",
y="Density") +
theme_bw() +
base_plot_theme +
scale_color_npg(name="Sex") +
scale_fill_npg(name="Sex")
variance_ks <- ks.test(ggplot_build(variance_density_plot)$data[[1]]$y, ggplot_build(variance_density_plot)$data[[2]]$y)
ks_dist <- sprintf("K.S. Distance = %.2f*", as.numeric(variance_ks$statistic))
variance_density_plot <- variance_density_plot +
annotate("text", x=.26, y=12, label=ks_dist, size=2)
ggsave("figs/main/fig1_variance_density_plot.pdf", variance_density_plot, width=36, height=40, units="mm")
# ks.test(filter(statistics, (Position %in% x_sites))$femaleVariance, filter(statistics, !(Position %in% sex_chromosome_sites))$maleVariance)
# ggsave("figs/x-chr-cpg-variance.pdf", height = 84, width = 84, units = 'mm')
sig_diff_x_v_no_x <- ggplot(statistics, aes(y=alpha, x=(Position %in% x_sites), fill=(Position %in% x_sites))) +
geom_boxplot(outlier.shape=NA, size=.4) +
labs(title="Brown-Forsythe\np-values",
x="Chromosome",
y="BF p-value") +
scale_x_discrete(labels=c("Autosomal", "X"), limits=c(FALSE, TRUE)) +
scale_fill_grey(start=.5, end=.8, guide=F) +
theme_bw() +
base_plot_theme #+
# theme(axis.title.x=element_blank())
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data
sig_x_sites <- intersect(positions_to_keep, x_sites)
print(length(sig_x_sites))
sig_annotation <- x_annotation_df[x_annotation_df$Name %in% sig_x_sites,]
sig_female_beta_df <- female_df_no_h9 %>% filter(rn %in% sig_annotation$Name)
all_colnames <- colnames(sig_female_beta_df %>% select(-rn))
sig_female_beta_df$Gene <- sig_annotation$hg38_gene_name
sig_female_beta_df$Chromosome <- sig_annotation$hg38_chromosome
sig_female_beta_df$Position <- sig_annotation$hg38_pos
sig_female_beta_df <- sig_female_beta_df[c('rn','Chromosome', 'Position', 'Gene', all_colnames)]
write.csv(sig_female_beta_df, file='sig_female_beta_df_no_h9.csv')
```
## Beta value distributions after site selection
```{r}
unsig_female_beta_mat <- female_df_no_h9 %>% filter(rn %in% x_annotation_df$Name & !(rn %in% sig_annotation$Name))
unsig_female_beta_vec <- as.vector(as.matrix(unsig_female_beta_mat %>% dplyr::select(-rn)))
sig_female_beta_vec <- as.vector(as.matrix(sig_female_beta_df %>% dplyr::select(-rn, -Chromosome, -Position, -Gene)))
sig_nonsig_dist_plot <- ggplot() +
geom_violin(aes(x="p<.01", y=sig_female_beta_vec, fill='female'), trim=FALSE, show.legend = F, size=.3) +
geom_violin(aes(x="p>.01", y=unsig_female_beta_vec, fill='male'), trim=FALSE, show.legend = F, size=.3) +
geom_boxplot(aes(x="p<.01", y=sig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
geom_boxplot(aes(x="p>.01", y=unsig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) +
labs(x="BF p-values",
y='DNAme β-value',
#title="β-values of sig. and \nnon-sig. variant sites\nin Female hPSCs") +
title="Probe β-values in Female\nhPSCs by Variance") +
scale_x_discrete(limits=c("p>.01", "p<.01")) +
theme_bw() +
base_plot_theme +
theme(legend.position = "none",
# axis.title.x=element_blank()
) +
scale_fill_futurama()
# ggsave('figs/beta value distributions - sig-unsig.png')
sig_nonsig_dist_plot
```
## Calculating PPVs and correlation to avg X chromosome methylation
```{r}
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz")
female_df_no_h9 <- read_csv("female_df_no_h9.csv")
mean_x_methylation <- colMeans(female_df_no_h9_chrX %>% dplyr::select(-rn))
mean_sig_x_methylation <- colMeans(female_df_no_h9_chrX %>% filter(rn %in% sig_x_sites) %>% dplyr::select(-rn))
mean_nonsig_x_methylation <- colMeans(female_df_no_h9_chrX %>% filter((rn %in% x_sites) & !(rn %in% sig_x_sites)) %>% dplyr::select(-rn))
mean_genome_methylation <- colMeans(female_df_no_h9 %>% dplyr::select(-rn))
xist_sites <- (x_annotation_df %>% filter(hg38_gene_name == "XIST"))$rowname
mean_xist_methylation <- colMeans(female_df_no_h9 %>% filter(rn %in% xist_sites) %>% dplyr::select(-rn))
cor.test(mean_x_methylation, mean_sig_x_methylation)
cor.test(mean_x_methylation, mean_genome_methylation)
cor.test(mean_x_methylation, mean_xist_methylation)
cor.test(mean_sig_x_methylation, mean_genome_methylation)
cor.test(mean_sig_x_methylation, mean_xist_methylation)
data_df <- tibble(mean_x = mean_x_methylation, mean_sig_x = mean_sig_x_methylation, mean_nonsig_x = mean_nonsig_x_methylation)
correlation <- sprintf("corr = %.2f", cor(mean_x_methylation, mean_sig_x_methylation))
female_x_sigsites <- ggplot(data_df, aes(x=mean_x, y=mean_sig_x)) +
geom_point(color=sex_colors[1],size=.5, shape=20) +
labs(x="Mean X DNAme", y="Mean Sig X DNAme", title="Female hPSCs\nBF p<.01 X-probes") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
annotate("text", x=.45, y=.45, label=correlation, size=2) +
theme_bw() +
base_plot_theme
correlation <- sprintf("corr = %.2f", cor(mean_x_methylation, mean_nonsig_x_methylation))
female_x_nonsigsites <- ggplot(data_df, aes(x=mean_x, y=mean_nonsig_x)) +
geom_point(color=sex_colors[1],size=.5, shape=20) +
labs(x="Mean X Chr DNAme", y="Mean Non-Sig X DNAme", title="Female hPSCs\n BF p>.01 X-probes") +
geom_smooth(method='lm', color="black", size=.5) +
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) +
annotate("text", x=.55, y=.6, label=correlation, size=2) +
theme_bw() +
base_plot_theme
supp_6 <- grid.arrange(global_mean_methylation,
sig_diff_x_v_no_x, sig_nonsig_dist_plot,
female_x_nonsigsites, female_x_sigsites, nrow = 1)
ggsave("figs/supplementary/supp_1_f_j.pdf", supp_6, width = 174, height=40, units = "mm")
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv")
# Trying to evaluate a pseudo-ppv based on passage numbers
female_sample_annotation_df <- sample_annotation_df %>% filter(sample_name %in% names(mean_x_methylation))
# female_sample_annotation_df <- rownames_to_column(female_sample_annotation_df, var="sample_name")
# cell_line_counts <- filter(female_sample_annotation_df, !is.na(passage)) %>% dplyr::count(cell_line)
duplicated_cell_lines <- unique(filter(female_sample_annotation_df, !is.na(passage) & duplicated(cell_line))$cell_line)
total_pairs <- 0
correctly_ordered_by_x <- 0
correctly_ordered_by_x_names <- c()
correctly_ordered_by_sig <- 0
correctly_ordered_by_sig_names <- c()
for(duplicated_cell_line in duplicated_cell_lines) {
cell_line_annotation_df <- female_sample_annotation_df %>% filter(!is.na(passage) & cell_line == duplicated_cell_line) %>% dplyr::arrange(passage)
total_pairs <- total_pairs + 1
early_sample <- cell_line_annotation_df$sample_name[1]
late_sample <- cell_line_annotation_df$sample_name[2]
early_x_meth <- mean_x_methylation[early_sample]
late_x_meth <- mean_x_methylation[late_sample]
if (late_x_meth <= early_x_meth) {
correctly_ordered_by_x <- correctly_ordered_by_x + 1
correctly_ordered_by_x_names <- append(correctly_ordered_by_x_names, duplicated_cell_line)
}
early_sig_meth <- mean_sig_x_methylation[early_sample]
late_sig_meth <- mean_sig_x_methylation[late_sample]
if (late_sig_meth <= early_sig_meth) {
correctly_ordered_by_sig <- correctly_ordered_by_sig + 1
correctly_ordered_by_sig_names <- append(correctly_ordered_by_sig_names, duplicated_cell_line)
}
# cell_line_annotation_df
}
```
Now we will switch to python because of it's machine/statistical learning packages.
---------------------Insert python notebook here----------------------------------------------------
```{python}
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
female_sig_sites_nowa09_df = pd.read_csv("sig_female_beta_df_no_h9.csv", index_col=[0,1,2,3,4])
female_all_chrX_nowa09_df = pd.read_csv("female_df_no_h9_chrX.csv", index_col=[0])
df_to_plot = female_sig_sites_nowa09_df
suffix = '_nowa09'
columns_to_keep = df_to_plot.columns.tolist()
# print(columns_to_keep)
sample_data = pd.read_csv("sample_data_combined_all_info.csv", index_col=1)
# print(sample_data.index)
sample_data_subset = sample_data.loc[df_to_plot.columns.tolist(), :]
print(sample_data_subset['cell_line'])
cell_lines = sample_data_subset['cell_line']
counts = cell_lines.value_counts()
cell_line_weights = 1 / cell_lines.map(counts)
mean_methylation = pd.DataFrame(female_all_chrX_nowa09_df.mean())
mean_methylation = MinMaxScaler(feature_range=(0, 5)).fit_transform(mean_methylation)
print(cell_line_weights)
## Getting the optimal number of clusters using elbow method
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
km = KMeans(n_clusters=k, n_init=100, max_iter=1000)
km = km.fit(df_to_plot.T, sample_weight=cell_line_weights)
Sum_of_squared_distances.append(km.inertia_)
print(Sum_of_squared_distances)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
```
```{python}
from sklearn.decomposition import PCA
km4 = KMeans(n_clusters=4, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights)
km5 = KMeans(n_clusters=5, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights)
km6 = KMeans(n_clusters=6, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights)
km7 = KMeans(n_clusters=7, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights)
km8 = KMeans(n_clusters=8, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights)
pca_embedding = PCA(n_components=2).fit_transform(df_to_plot.T)
f, axarr = plt.subplots(2, 3)
f.set_size_inches(15,10)
axarr[0, 0].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette("Blues")[int(x)] for x in mean_methylation])
axarr[0, 0].set_title('Average X methylation')
axarr[0, 1].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km4])
axarr[0, 1].set_title('4 clusters')
axarr[0, 2].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km5])
axarr[0, 2].set_title('5 clusters')
axarr[1, 0].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km6])
axarr[1, 0].set_title('6 clusters')
axarr[1, 1].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km7])
axarr[1, 1].set_title('7 clusters')
axarr[1, 2].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km8])
axarr[1, 2].set_title('8 clusters')
plt.show()
# save to replicate plot in R
clusters_df = pd.DataFrame([km4, km5, km6, km7, km8], index=["km4", "km5", "km6", "km7", "km8"], columns=df_to_plot.columns)
clusters_df.T.to_csv('saved_sample_clusters_4_8.csv', header=False)
```
```{python}
# sample_data = pd.read_csv("sample_data_combined.csv", index_col=0)
import numpy as np
n_clusters = 6
def get_ordered_clusters(data,
n_clusters=None,
mean_methylation=None,
sample_data=None,
n_init=100,
max_iter=1000,
clusters=None):
#get weights by frequency of cell_line in the dataset
if clusters is None:
if sample_data is None:
clusters = KMeans(n_clusters=n_clusters,
n_init=n_init,
max_iter=max_iter,n_jobs=-1,init='random').fit_predict(data.T)
else:
sample_data_subset = sample_data.loc[data.columns.tolist(), :]
cell_lines = sample_data_subset['cell_line']
counts = cell_lines.value_counts()
cell_line_weights = 1 / cell_lines.map(counts)
clusters = KMeans(n_clusters=n_clusters,
n_init=n_init,
max_iter=max_iter,n_jobs=-1,init='random').fit_predict(
data.T, sample_weight=cell_line_weights)
cluster_names_dict = {}
if mean_methylation is None:
mean_methylation = pd.DataFrame(data.mean())
mean_methylation = MinMaxScaler(
feature_range=(0, 5)).fit_transform(mean_methylation)
for cluster in list(set(clusters)):
cluster_names_dict[cluster] = np.mean(
mean_methylation[clusters == cluster])
sorted_x = sorted(cluster_names_dict.items(), key=lambda kv: kv[1])
sorted_x.reverse()
cluster_names_dict_fixed = {}
clusters_fixed = [0 for x in clusters]
for i, sorted_tup in enumerate(sorted_x):
for j, clust in enumerate(clusters):
if clust == sorted_tup[0]:
clusters_fixed[j] = i
if i == 0:
cluster_names_dict_fixed[i] = 'Non-Eroded'
elif i == len(sorted_x) - 1:
cluster_names_dict_fixed[i] = 'Eroded'
else:
cluster_names_dict_fixed[i] = sorted_tup[1]
clusters = np.array(clusters_fixed)
cluster_names_dict = cluster_names_dict_fixed
return clusters, cluster_names_dict
clusters, cluster_names_dict = get_ordered_clusters(
data=df_to_plot,
n_clusters=n_clusters,
mean_methylation=mean_methylation,
sample_data=sample_data)
clusters_row = pd.Series(clusters, name='clusters', index=df_to_plot.columns)
print(clusters_row)
clusters_row.to_csv('saved_sample_clusters_test.csv', header=False)
```
## remake the python plots in R
```{r}
elbow_method_df <- tibble(num_clusters=1:14, sum_of_squared_distances=c(21544.753987410935, 14657.4276351863, 11426.322690540565, 10171.239511057178, 9652.956939046413, 9267.650214615307, 8963.094498025255, 8733.35142099591, 8459.750435732954, 8198.846796977472, 8059.111714376631, 7933.215394915591, 7696.179270524132, 7473.282326589432))
elbow_plot <- ggplot(elbow_method_df, aes(x=num_clusters, y=sum_of_squared_distances)) +
geom_line() +
theme_bw() +
base_plot_theme +
labs(title = "Elbow Method for\nOptimal k", x="k", y="Sum of squared distances")
cluster_testing_df <- read_csv("saved_sample_clusters_4_8.csv.gz",
col_names = c("sample_name", "km4", "km5", "km6", "km7", "km8"))
cluster_testing_df <- column_to_rownames(cluster_testing_df, var="sample_name")
cluster_plots <- c()
female_df_no_h9_chrX <- read_csv('female_df_no_h9_chrX.csv.gz')
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv")
female_no_h9_sig_sites_pca <- prcomp(t(na.omit(female_df_no_h9_chrX %>% filter(rn %in% sig_x_sites) %>% select(-rn))))
female_no_h9_sig_sites_pca_df <- rownames_to_column(as.data.frame(female_no_h9_sig_sites_pca$x), var = "sample_name")
female_no_h9_sig_sites_pca_df <- left_join(female_no_h9_sig_sites_pca_df, sample_annotation_df, by = "sample_name")
# female_no_h9_sig_sites_pca_df <- left_join(female_no_h9_sig_sites_pca_df, female_avg_x_methylation_df, by = "sample_name")
female_no_h9_sig_sites_pca_df <- left_join(female_no_h9_sig_sites_pca_df, tidy(colMeans(female_df_no_h9_chrX %>% select(-rn))) %>% rename(sample_name=names, meanXMethylation=x), by = "sample_name")
female_no_h9_sig_sites_pov <- sprintf("%2.2f%%", 100* female_no_h9_sig_sites_pca$sdev^2/sum(female_no_h9_sig_sites_pca$sdev^2))
female_no_h9_sig_sites_pc1_label <- sprintf("PC1 (%s)", female_no_h9_sig_sites_pov[1])
female_no_h9_sig_sites_pc2_label <- sprintf("PC2 (%s)", female_no_h9_sig_sites_pov[2])
female_no_h9_pca_df <- female_no_h9_sig_sites_pca_df
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km4
km4_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) +
theme_bw() +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title = "4 clusters",
color="Cluster")+
base_plot_theme +
scale_color_brewer(type="qual", palette = "Dark2", guide=F)
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km5
km5_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) +
theme_bw() +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title = "5 clusters",
color="Cluster")+
base_plot_theme +
scale_color_brewer(type="qual", palette = "Dark2", guide=F)
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km6
km6_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) +
theme_bw() +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title = "6 clusters",
color="Cluster")+
base_plot_theme +
scale_color_brewer(type="qual", palette = "Dark2", guide=F)
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km7
km7_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) +
theme_bw() +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title = "7 clusters",
color="Cluster")+
base_plot_theme +
scale_color_brewer(type="qual", palette = "Dark2", guide=F)
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km8
km8_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) +
theme_bw() +
labs(x=female_no_h9_pc1_label,
y=female_no_h9_pc2_label,
title = "8 clusters",
color="Cluster")+
base_plot_theme +
scale_color_brewer(type="qual", palette = "Dark2", guide=F)
kmeans_plots <- grid.arrange(elbow_plot,
km4_plot,
km5_plot,
km6_plot,
km7_plot,
km8_plot,
top = textGrob("Deciding optimal number of clusters",gp=gpar(fontsize=7, fontface="bold")), nrow=1)
ggsave("figs/supplementary/kmeans_plots.pdf", kmeans_plots, width = 174, height=40, units = 'mm')
```
## Bring in sample clusters from python analysis
```{r}
clusters_pData <- read.csv("saved_sample_clusters.csv",
header = FALSE,
row.names = 1,
col.names=c('name','cluster'))
# clusters_pData <- read.csv("saved_sample_clusters_test.csv",
# header = FALSE,
# row.names = 1,
# col.names=c('name','cluster'))
female_no_h9_pca_df$python_clusters <- clusters_pData[female_no_h9_pca_df$sample_name,]
female_no_h9_sig_sites_pca_df$python_clusters <- clusters_pData[female_no_h9_sig_sites_pca_df$sample_name,]
color_clusters <- clusters_pData
color_clusters$cluster <- as.factor(color_clusters$cluster)
# check where the samples clusters fall in the PCA
pca_female_no_h9_by_python_6 <- ggplot(female_no_h9_sig_sites_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_point(aes(color=as.factor(python_clusters), shape=(cell_line == "WA09")), size=.5) +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
theme_bw() +
labs(x=female_no_h9_sig_sites_pc1_label,
y=female_no_h9_sig_sites_pc2_label,
title = "PCA of Female Samples",
color="Cluster")+
base_plot_theme +
scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(color=guide_legend(nrow=1)) +
theme(legend.key.width = unit(1, units = "mm"),
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm")))
ggsave("figs/main/fig1_pca_female.pdf", pca_female_no_h9_by_python_6, height = 47, width=87, units = "mm")
# female_pca <- prcomp(t(na.omit(female_all_x_df)))
# female_pca_df <- rownames_to_column(as.data.frame(female_pca$x), var = "sample_name")
# female_pca_df <- left_join(female_pca_df, sample_annotation_df, by = "sample_name")
# female_pca_df <- left_join(female_pca_df, female_avg_x_methylation_df, by = "sample_name")
female_pca_df$python_clusters <- clusters_pData[female_pca_df$sample_name,]
library(ggrepel)
pca_female_no_h9_by_python_6 <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) +
geom_line(female_pca_df %>% filter(cell_line %in% c("HDF51IPS3", "HDF51IPS1", "HDF51IPS2", "HDF51IPS11", "HDF51IPS12", "HDF51IPS6", "HDF51IPS14", "WA09") & grepl("Nazor_", sample_name)),
mapping = aes(x=PC1, y=PC2, group=cell_line), size=.3, alpha=.4) +
geom_point(aes(color=as.factor(python_clusters), shape=(cell_line == "WA09")), size=.5) +
# geom_text_repel(female_pca_df %>% filter(cell_line %in% correctly_ordered_by_sig_names & grepl("Nazor_", sample_name)), mapping = aes(x=PC1, y=PC2, label=passage), size=2, color="grey", segment.size=.15, min.segment.length=.25, segment.color="darkgrey") +
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) +
theme_bw() +
labs(x=female_pc1_label,
y=female_pc2_label,
title = "PCA of Female Samples",
color="Cluster")+
base_plot_theme +
scale_color_manual(limits=factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") +
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(color=guide_legend(nrow=1)) +
theme(legend.key.width = unit(1, units = "mm"),
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm")))
ggsave("figs/main/fig1_pca_female_with_h9.pdf", pca_female_no_h9_by_python_6, height = 69.3, width=87, units = "mm")
fig_1 <- grid.arrange(female_x_autosome, female_x_xist,
xchr_beta_dist, female_male_pca_plot,
pca_female_no_h9_by_methylation, pca_female_no_h9_by_python_6, nrow=3)
fig_1 <- grid.arrange(female_x_xist, variance_density_plot,
xchr_beta_dist, female_male_pca_plot,
pca_female_no_h9_by_methylation, pca_female_no_h9_by_python_6, nrow=3)
ggsave("figs/main/fig1.svg", fig_1, height = 236, width=174, units = "mm")
ggsave("figs/female-pca-by-cluster.pdf", height = 84, width = 84, units = 'mm')
```
## Look at the pairs in relation to the clusters
```{r}
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
ordered_cell_lines_df <- filter(female_sample_annotation_df, !is.na(passage) & cell_line %in% correctly_ordered_by_sig_names) %>%
left_join(cluster_tib, by="sample_name")
```
## Differentially Methylated Probes Analysis
Now that we have our sample clusters from the python kmeans analysis, we will now focus on clustering our CpG probes based on when they change their methylation status.
```{r}
library(tidyverse)
library(minfi)
clusters_pData <- read_csv("saved_sample_clusters.csv", col_names = c("name", "cluster"))
dir.create('dmp_between_sample_clusters')
all_dmp_df <- data.frame()
transition_columns <- c()
female_df_no_h9 <- read_csv("female_df_no_h9.csv") %>% column_to_rownames(var="rn")
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
filtered_site_annotation_df_temp <- column_to_rownames(filtered_site_annotation_df, var="rowname")
cluster_means <- tibble(rowname=character())
for (n2 in min(clusters_pData$cluster):(max(clusters_pData$cluster))) {
print(n2)
cluster_samples <- filter(clusters_pData, cluster %in% c(n2))$name
mean_df <- as.data.frame(rowMeans(na.omit(female_df_no_h9[cluster_samples]))) %>%
rownames_to_column() %>%
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2)
cluster_means <- full_join(cluster_means, mean_df, by="rowname")
}
for (n in min(clusters_pData$cluster):(max(clusters_pData$cluster)-1)) {
print(sprintf('comparing clusters %d and %d', n, n+1))
cluster_1_group <- filter(clusters_pData, cluster <= n)$name
cluster_2_group <- filter(clusters_pData, cluster > n)$name
temp_cluster_data <- clusters_pData #column_to_rownames(clusters_pData, var="name")
temp_cluster_data[temp_cluster_data$name %in% cluster_1_group, 'cluster'] <- 0
temp_cluster_data[temp_cluster_data$name %in% cluster_2_group, 'cluster'] <- 1
clusters_pair <- temp_cluster_data$name
subset_beta <- na.omit(female_df_no_h9[,clusters_pair])
subset_clusters <- as.matrix(temp_cluster_data %>% column_to_rownames(var="name"))
subset_clusters[1] <- as.factor(subset_clusters[1])
# subset_clusters <- as.matrix(temp_cluster_data[clusters_pair,])
# subset_clusters[1] <- as.factor(subset_clusters[1])
# row.names(subset_clusters) <- clusters_pair
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical")
# subset_beta_mat <- as.matrix(subset_beta)
# row.names(subset_beta_mat) <- rownames(subset_beta)
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname")
dmp_df <- left_join(dmp_df, filtered_site_annotation_df %>% select(rowname,
hg19_chr=chr,
hg19_pos=pos,
hg19_gene=UCSC_RefGene_Name,
hg19_refgene_group=UCSC_RefGene_Group,
hg38_chr=hg38_chromosome,
hg38_pos=hg38_pos,
hg38_gene=hg38_gene_name,
hg38_refgene_group=hg38_gene_group), by="rowname")
dmp_df <- left_join(dmp_df, cluster_means, by="rowname")
write.csv(dmp_df,
file = sprintf('dmp_between_sample_clusters/dmp_table_%d_%d.csv', n, n+1),
quote = FALSE)
if (n + 1 == 1) {
all_dmp_df <- dmp_df %>% select(rowname,
hg19_chr,
hg19_pos,
hg19_gene,
hg19_refgene_group,
hg38_chr,
hg38_pos,
hg38_gene,
hg38_refgene_group,
matches("cluster_._mean"))
}
all_dmp_df <- full_join(all_dmp_df, dmp_df %>% select(rowname, !!sprintf('transition_%s_qval', n + 1):=qval), by="rowname")
transition_columns <- append(transition_columns, sprintf('transition_%s_qval', n + 1))
}
all_dmp_df$key_transition <- colnames(all_dmp_df[transition_columns])[apply(all_dmp_df[transition_columns], 1, which.min)]
all_dmp_df$key_transition <- as.numeric(unlist(as.data.frame(t(as.data.frame(strsplit(all_dmp_df$key_transition, '_'))))[2]))
for (row in 1:nrow(all_dmp_df)) {
transitions <- c()
transition_deltas <- c()
for (transition in 1:5) {
qval_col <- sprintf("transition_%d_qval", transition)
if (all_dmp_df[row, qval_col] <= 0.1) {
transitions <- append(transitions, transition)
col_1 <- sprintf("cluster_%d_mean", transition-1)
col_2 <- sprintf("cluster_%d_mean", transition)
transition_deltas <- append(transition_deltas, all_dmp_df[row, col_2] - all_dmp_df[row, col_1])
}
}
alt_key_transition <- NA
if (length(transitions) > 0) {
alt_key_transition <- transitions[which.max(abs(transition_deltas))]
all_dmp_df[row, "alt_key_transition"] <- alt_key_transition
all_dmp_df[row, "alt_delta"] <- transition_deltas[which.max(abs(transition_deltas))]
}
}
q_val_threshold <- .001
all_dmp_df$passes_threshold <- apply(all_dmp_df[transition_columns], 1, min) <= q_val_threshold
all_dmp_df$passes_stringent_threshold <- apply(all_dmp_df[transition_columns], 1, min) <= 1e-10
all_dmp_df$delta <- apply(all_dmp_df, 1, function(r) {
col_1_name <- sprintf('cluster_%d_mean', as.numeric(r['key_transition']) - 1)
col_2_name <- sprintf('cluster_%d_mean', as.numeric(r['key_transition']))
return(as.numeric(r[col_2_name]) - as.numeric(r[col_1_name]))
})
all_dmp_df$delta_increasing <- apply(all_dmp_df, 1, function(r) {
col_1_name <- sprintf('cluster_%d_mean', as.numeric(r['key_transition']) - 1)
col_2_name <- sprintf('cluster_%d_mean', as.numeric(r['key_transition']))
return(as.numeric(r[col_2_name]) - as.numeric(r[col_1_name]) >= 0)
})
write_csv(all_dmp_df, "all_dmp_df.csv")
```
## Compare cluster A to H9
```{r}
female_df <- read_csv("female_beta_mat.csv.gz") %>% column_to_rownames(var="rowname")
clusters_pData <- read_csv("saved_sample_clusters.csv", col_names = c("name", "cluster"))
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz")
filtered_site_annotation_df_temp <- column_to_rownames(filtered_site_annotation_df, var="rowname")
print("comparing cluster A and H9")
cluster_1_group <- filter(clusters_pData, cluster == 0)$name
cluster_a_data <- filter(clusters_pData, cluster == 0) #column_to_rownames(clusters_pData, var="name")
h9_lines <- filter(sample_annotation_df, cell_line == 'WA09' & gender_matches == T)$sample_name
temp_cluster_data <- rbind(cluster_a_data, tibble(name=h9_lines, cluster=1))
subset_beta <- na.omit(female_df[,c(cluster_1_group, h9_lines)])
subset_clusters <- as.matrix(temp_cluster_data %>% column_to_rownames(var="name"))
subset_clusters[1] <- as.factor(subset_clusters[1])
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical")
# subset_beta_mat <- as.matrix(subset_beta)
# row.names(subset_beta_mat) <- rownames(subset_beta)
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname")
dmp_df <- left_join(dmp_df, filtered_site_annotation_df %>% select(rowname,
hg19_chr=chr,
hg19_pos=pos,
hg19_gene=UCSC_RefGene_Name,
hg19_refgene_group=UCSC_RefGene_Group,
hg38_chr=hg38_chromosome,
hg38_pos=hg38_pos,
hg38_gene=hg38_gene_name,
hg38_refgene_group=hg38_gene_group), by="rowname")
cluster_means <- tibble(rowname=character())
for (n2 in min(temp_cluster_data$cluster):(max(temp_cluster_data$cluster))) {
print(n2)
cluster_samples <- filter(temp_cluster_data, cluster %in% c(n2))$name
mean_df <- as.data.frame(rowMeans(na.omit(female_df[cluster_samples]))) %>%
rownames_to_column() %>%
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2)
cluster_means <- full_join(cluster_means, mean_df, by="rowname")
}
dmp_df <- left_join(dmp_df, cluster_means %>% rename(H9_mean=cluster_1_mean), by="rowname")
dmp_df <- dmp_df %>% mutate(passes_threshold=qval <= q_val_threshold,
passes_stringent_threshold=qval <= 1e-10)
write_csv(dmp_df, "dmp_table_clusterA_H9.csv.gz")
write(bedgraph_header(name="eroding_delta_H9",
description=sprintf("eroding deltas from H9 transition 1Mb windowed")),
"bedgraphs/windowed_eroding_delta_tran_H9.bedgraph")
bed_df <- dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T))
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1)
cluster_col_1 <- "H9_mean"
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0])
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), "bedgraphs/windowed_eroding_delta_tran_H9.bedgraph", col_names = F, append=TRUE)
#non cluster A iPSCs
ipsc_lines <- filter(sample_annotation_df, cell_type == "iPSC")$sample_name
non_a_ipsc_lines <- filter(clusters_pData, cluster != 0 & name %in% ipsc_lines)$name
temp_cluster_data <- rbind(cluster_a_data, tibble(name=non_a_ipsc_lines, cluster=1))
subset_beta <- na.omit(female_df[,c(cluster_1_group, non_a_ipsc_lines)])
subset_clusters <- as.matrix(temp_cluster_data %>% column_to_rownames(var="name"))
subset_clusters[1] <- as.factor(subset_clusters[1])
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical")
# subset_beta_mat <- as.matrix(subset_beta)
# row.names(subset_beta_mat) <- rownames(subset_beta)
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname")
dmp_df <- left_join(dmp_df, filtered_site_annotation_df %>% select(rowname,
hg19_chr=chr,
hg19_pos=pos,
hg19_gene=UCSC_RefGene_Name,
hg19_refgene_group=UCSC_RefGene_Group,
hg38_chr=hg38_chromosome,
hg38_pos=hg38_pos,
hg38_gene=hg38_gene_name,
hg38_refgene_group=hg38_gene_group), by="rowname")
cluster_means <- tibble(rowname=character())
for (n2 in min(temp_cluster_data$cluster):(max(temp_cluster_data$cluster))) {
print(n2)
cluster_samples <- filter(temp_cluster_data, cluster %in% c(n2))$name
mean_df <- as.data.frame(rowMeans(na.omit(female_df[cluster_samples]))) %>%
rownames_to_column() %>%
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2)
cluster_means <- full_join(cluster_means, mean_df, by="rowname")
}
dmp_df <- left_join(dmp_df, cluster_means %>% rename(non_A_iPSC_mean=cluster_1_mean), by="rowname")
dmp_df <- dmp_df %>% mutate(passes_threshold=qval <= q_val_threshold,
passes_stringent_threshold=qval <= 1e-10)
write_csv(dmp_df, "dmp_table_clusterA_non_A_iPSC.csv.gz")
write(bedgraph_header(name="eroding_delta_iPSC",
description=sprintf("eroding deltas from iPSC transition 1Mb windowed")),
"bedgraphs/windowed_eroding_delta_tran_iPSC.bedgraph")
bed_df <- dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T))
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1)
cluster_col_1 <- "non_A_iPSC_mean"
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0])
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), "bedgraphs/windowed_eroding_delta_tran_iPSC.bedgraph", col_names = F, append=TRUE)
#non cluster A ESCs
esc_lines <- filter(sample_annotation_df, cell_type == "ESC")$sample_name
non_a_esc_lines <- filter(clusters_pData, cluster != 0 & name %in% esc_lines)$name
temp_cluster_data <- rbind(cluster_a_data, tibble(name=non_a_esc_lines, cluster=1))
subset_beta <- na.omit(female_df[,c(cluster_1_group, non_a_esc_lines)])
subset_clusters <- as.matrix(temp_cluster_data %>% column_to_rownames(var="name"))
subset_clusters[1] <- as.factor(subset_clusters[1])
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical")
# subset_beta_mat <- as.matrix(subset_beta)
# row.names(subset_beta_mat) <- rownames(subset_beta)
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname")
dmp_df <- left_join(dmp_df, filtered_site_annotation_df %>% select(rowname,
hg19_chr=chr,
hg19_pos=pos,
hg19_gene=UCSC_RefGene_Name,
hg19_refgene_group=UCSC_RefGene_Group,
hg38_chr=hg38_chromosome,
hg38_pos=hg38_pos,
hg38_gene=hg38_gene_name,
hg38_refgene_group=hg38_gene_group), by="rowname")
cluster_means <- tibble(rowname=character())
for (n2 in min(temp_cluster_data$cluster):(max(temp_cluster_data$cluster))) {
print(n2)
cluster_samples <- filter(temp_cluster_data, cluster %in% c(n2))$name
mean_df <- as.data.frame(rowMeans(na.omit(female_df[cluster_samples]))) %>%
rownames_to_column() %>%
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2)
cluster_means <- full_join(cluster_means, mean_df, by="rowname")
}
dmp_df <- left_join(dmp_df, cluster_means %>% rename(non_A_ESC_mean=cluster_1_mean), by="rowname")
dmp_df <- dmp_df %>% mutate(passes_threshold=qval <= q_val_threshold,
passes_stringent_threshold=qval <= 1e-10)
write_csv(dmp_df, "dmp_table_clusterA_non_A_ESC.csv.gz")
write(bedgraph_header(name="eroding_delta_ESC",
description=sprintf("eroding deltas from ESC transition 1Mb windowed")),
"bedgraphs/windowed_eroding_delta_tran_ESC.bedgraph")
bed_df <- dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T))
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1)
cluster_col_1 <- "non_A_ESC_mean"
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0])
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), "bedgraphs/windowed_eroding_delta_tran_ESC.bedgraph", col_names = F, append=TRUE)
# calculating histone correlations
h3k27me3_peaks <- read_table2("bedgraphs/GSM1528885_H9_XISTpos_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal"))
h3k27me3_peaks_chrX <- h3k27me3_peaks %>% filter(chr == "chrX")
h3k27me3_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k27me3_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
h3k9_xi_peaks <- read_table2("bedgraphs/GSM1528888_H9_XISTpos_K9me3.peaks.bed.gz", col_names=c("chr", "start", "end", "signal"))
h3k9_xi_peaks_chrX <- h3k9_xi_peaks %>% filter(chr == "chrX")
h3k9_xi_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k9_xi_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
histone_mark_correlations <- tibble(group=character(),
h3k27me3_correlation=numeric(),
h3k27me3_correlation_p_val=numeric(),
h3k9me3_correlation=numeric(),
h3k9me3_correlation_p_val=numeric())
for (g in c("H9", "iPSC", "ESC")) {
if (g == "H9") {
bed_df <- read_csv(sprintf("dmp_table_clusterA_%s.csv.gz", g)) %>%
filter((hg19_chr == "chrX") & (passes_threshold == T))
bed_df <- bed_df %>%
mutate(delta=H9_mean-cluster_0_mean)
}else if (g == "iPSC") {
bed_df <- read_csv(sprintf("dmp_table_clusterA_non_A_%s.csv.gz", g)) %>%
filter((hg19_chr == "chrX") & (passes_threshold == T))
bed_df <- bed_df %>%
mutate(delta=non_A_iPSC_mean-cluster_0_mean)
}else if (g == "ESC") {
bed_df <- read_csv(sprintf("dmp_table_clusterA_non_A_%s.csv.gz", g)) %>%
filter((hg19_chr == "chrX") & (passes_threshold == T))
bed_df <- bed_df %>%
mutate(delta=non_A_ESC_mean-cluster_0_mean)
}
# bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t))
averaged_distance <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T)
k27_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_peaks_chrX_windowed$delta)
k9_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xi_peaks_chrX_windowed$delta) # h3k9me3_signal_chrX$signal)
# k27_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_xe_signal_chrX$signal)
# k9_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xe_peaks_windowed$delta)
histone_mark_correlations <- add_row(histone_mark_correlations,
group=g,
h3k27me3_correlation=k27_cor_test$estimate,
h3k27me3_correlation_p_val=k27_cor_test$p.value,
h3k9me3_correlation=k9_cor_test$estimate,
h3k9me3_correlation_p_val=k9_cor_test$p.value)
# h3k27me3_xe_correlation=k27_xe_cor_test$estimate,
# h3k9me3_xe_correlation=k9_xe_cor_test$estimate)
}
histone_cor_df <- reshape2::melt(histone_mark_correlations %>%
dplyr::select(transition,
h3k27me3_correlation,
h3k9me3_correlation),
id.vars=c("transition"))
histone_cor_df$variable <- factor(histone_cor_df$variable, levels=c("h3k9me3_correlation", "h3k27me3_correlation"))
histon_cor_plot <- ggplot(histone_cor_df) +
geom_bar(aes(x=transition, y=value, fill=variable), stat="identity", position=position_dodge()) +
theme_bw() +
base_plot_theme +
coord_flip() +
facet_grid(rows=vars(sprintf("Tran %d", transition)), scales="free_y")+
labs(title="Correlation of DMPs With Histone Meth.",
x="",
y="Pearson Correlation",
fill="Histone Modification") +
scale_fill_manual(limits=c("h3k27me3_correlation", "h3k9me3_correlation"),
labels=c("H3K27me3", "H3K9me3"),
values = c(pal_jama()(1), "#8C564B"),
guide=F) +
scale_x_continuous(labels = NULL) +
scale_y_continuous(limits=c(-.13,.33)) +
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
panel.grid.minor.y = element_blank(),
axis.ticks.length.y = unit(0, "mm"),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
panel.spacing.y = unit(.75, "mm"))
```
## Make heatmap in R
```{r}
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz")
all_dmp_df <- read_csv('all_dmp_df.csv.gz')
# do this based on genes and when the promoters change their methylation.
x_annotation_df <- read_csv("x_annotation.csv.gz")
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv.gz')
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)]
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname)
heatmap_df <- inner_join(female_df_no_h9_chrX %>% filter(rn %in% x_annotation_df$rowname),
dplyr::select(all_dmp_df, rowname, key_transition, passes_threshold, delta_increasing, hg38_refgene_group), by=c("rn"="rowname"))
heatmap_df <- arrange(heatmap_df, desc(passes_threshold), desc(delta_increasing), desc(key_transition))
ordered_sites <- heatmap_df$rn
heatmap_df[heatmap_df$passes_threshold == FALSE, "cluster"] <- 10
heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == T), "cluster"] <- heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == T), "key_transition"] - 1
heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == F), "cluster"] <- (heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == F), "key_transition"] - 1) + 5
barplot_df <- heatmap_df %>% dplyr::select(rn, cluster, hg38_refgene_group, delta_increasing)
barplot_df[is.na(barplot_df$hg38_refgene_group), "hg38_refgene_group"] <- "No Gene"
barplot_df <- barplot_df %>% dplyr::mutate(formatted_cluster=factor(c(
"1↑",
"2↑",
"3↑",
"4↑",
"5↑",
"1↓",
"2↓",
"3↓",
"4↓",
"5↓",
"~"
)[cluster+1], levels=rev(c(
"1↑",
"2↑",
"3↑",
"4↑",
"5↑",
"1↓",
"2↓",
"3↓",
"4↓",
"5↓",
"~"
))))
cpg_composition_plot <- ggplot(barplot_df %>% filter(rn %in% sig_x_sites),
aes(x=formatted_cluster,
y=(..count..),
fill=factor(hg38_refgene_group,
levels = rev(c("OutsideTranscript5",
"TSS1500",
"TSS200",
"5_UTR",
"Body",
"3_UTR",
"OutsideTranscript3",
"No Gene"))))) +
geom_bar(position="fill") +
scale_fill_manual(values=c("#BBBBBB", viridisLite::viridis(n=8)[2:8]),
limits=rev(c("OutsideTranscript5",
"TSS1500",
"TSS200",
"5_UTR",
"Body",
"3_UTR",
"OutsideTranscript3",
"No Gene"))) +
geom_text(data=barplot_df %>% filter(rn %in% sig_x_sites), mapping=aes(x=formatted_cluster, label=(..count..), fill=NA),
stat="count", position=position_fill(vjust=1.05), size=1.5) +
coord_flip() +
scale_y_continuous(labels = scales::percent_format())+
labs(x="CpG DMP Cluster",
y="Percent of Probes",
title="Composition of CpG Clusters") +
theme_bw() +
base_plot_theme +
guides(fill=guide_legend(nrow=3, reverse=TRUE, title="")) +
theme(legend.box.margin = margin(l=-10, unit="mm"),
legend.text = element_text(margin=margin(l=-.7, unit="mm")))
ggsave("figs/supplementary/supp_2_cpg_composition_prop.pdf",cpg_composition_plot, width = 48, height = 50, units = "mm")
formatted_barplot_df <- tibble(cluster=numeric(),
num_sites=numeric(),
group=character())
for (c in 0:10) {
total_sites <- nrow(barplot_df %>% filter((cluster == c)))
num_outside_5 <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "OutsideTranscript5"))
num_tss1500 <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "TSS1500"))
num_tss200 <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "TSS200"))
num_5utr <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "5_UTR"))
num_body <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "Body"))
num_3utr <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "3_UTR"))
num_outside_3 <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "OutsideTranscript3"))
num_no_gene <- nrow(barplot_df %>% filter((cluster == c) & is.na(hg38_refgene_group)))
# num_other <- total_sites - (num_tss1500 + num_tss200 + num_5utr + num_exon1 + num_body + num_3utr)
formatted_barplot_df <- add_row(formatted_barplot_df,
cluster = c,
num_sites = num_outside_5,
group = "OutsideTranscript5'")
formatted_barplot_df <- add_row(formatted_barplot_df,
cluster = c,
num_sites = num_tss1500,cpg_composition_plot
group = "TSS1500")
formatted_barplot_df <- add_row(formatted_barplot_df,
cluster = c,
num_sites = num_tss200,
group = "TSS200")
formatted_barplot_df <- add_row(formatted_barplot_df,
cluster = c,
num_sites = num_5utr,
group = "5'UTR")
formatted_barplot_df <- add_row(formatted_barplot_df,
cluster = c,
num_sites = num_body,
group = "Body")
formatted_barplot_df <- add_row(formatted_barplot_df,
cluster = c,
num_sites = num_3utr,
group = "3'UTR")
formatted_barplot_df <- add_row(formatted_barplot_df,
cluster = c,
num_sites = num_outside_3,
group = "OutsideTranscript3'")
formatted_barplot_df <- add_row(formatted_barplot_df,
cluster = c,
num_sites = num_no_gene,
group = "No Gene")
}
set_panel_size <- function(p=NULL, g=ggplotGrob(p),
margin = unit(1,"mm"),
height=unit(15, "mm")){
panels <- grep("panel", g$layout$name)
panel_index_w<- unique(g$layout$l[panels])
panel_index_h<- unique(g$layout$t[panels])
nw <- length(panel_index_w)
nh <- length(panel_index_h)
if(getRversion() < "3.3.0"){
# the following conversion is necessary
# because there is no `[<-`.unit method
# so promoting to unit.list allows standard list indexing
# g$widths <- grid:::unit.list(g$widths)
g$heights <- grid:::unit.list(g$heights)
# g$widths[panel_index_w] <- rep(list(width), nw)
g$heights[panel_index_h] <- rep(list(height), nh)
} else {
# g$widths[panel_index_w] <- rep(width, nw)
g$heights[panel_index_h] <- rep(height, nh)
}
return(g)
}
formatted_barplot_df <- formatted_barplot_df %>% mutate(formatted_cluster=factor(c(
"1↑",
"2↑",
"3↑",
"4↑",
"5↑",
"1↓",
"2↓",
"3↓",
"4↓",
"5↓",
"↕"
)[cluster+1], levels=rev(c(
"1↑",
"2↑",
"3↑",
"4↑",
"5↑",
"1↓",
"2↓",
"3↓",
"4↓",
"5↓",
"↕"
))))
p <- ggplot(data=formatted_barplot_df, aes(x=formatted_cluster, y=num_sites, #(..num_sites..)/sum(..num_sites..),
fill=factor(group, levels = rev(c("OutsideTranscript5'",
"TSS1500",
"TSS200",
"5'UTR",
"Body",
"3'UTR",
"OutsideTranscript3'",
"No Gene"))))) +
geom_bar(stat="identity") +
coord_flip() +
scale_fill_manual(values=c("#BBBBBB", viridisLite::viridis(n=8)[2:8])) +
# scale_fill_brewer(type = "qual", palette = 8) +
# scale_y_continuous(expand = c(0, 0)) +
labs(x="CpG DMP Cluster",
y="Number of Probes",
title="Composition of CpG Clusters") +
theme_bw() +
base_plot_theme +
guides(fill=guide_legend(nrow=1, reverse=TRUE, title="")) +
theme(legend.key.width = unit(2, "mm"),
legend.text = element_text(size = 5),
axis.text = element_text(size=5))
# grid::grid.draw(set_panel_size(p))
ggsave("figs/supplementary/supp_2_cpg_composition_temp.svg",p, width = 86, height = 86, units = "mm")
heatmap_df <- dplyr::select(heatmap_df, -key_transition, -passes_threshold, -delta_increasing, -hg38_refgene_group)
sample_clusters_df <- read_csv("saved_sample_clusters.csv",
col_names = c('name','cluster')) %>% arrange(cluster)
ordered_sample_names <- sample_clusters_df$name
heatmap_df <- reshape2::melt(heatmap_df, id.vars=c("rn", "cluster")) %>% dplyr::rename(site=rn, sample_name=variable)
for (sample_row in 1:nrow(sample_clusters_df)) {
cluster_name <- unlist(sample_clusters_df[sample_row, "name"])
cluster <- unlist(sample_clusters_df[sample_row, "cluster"])
heatmap_df[heatmap_df$sample_name == cluster_name, "sample_cluster"] <- cluster
}
library(grid)
sigvar_heatmap_plot <- ggplot(heatmap_df %>% filter(site %in% sig_x_sites), aes(x=factor(sample_name, levels = ordered_sample_names),
y=factor(site, levels = ordered_sites), fill=value)) +
geom_tile() +
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){
return(list(
"0"="1↑",
"1"="2↑",
"2"="3↑",
"3"="4↑",
"4"="5↑",
"5"="1↓",
"6"="2↓",
"7"="3↓",
"8"="4↓",
"9"="5↓",
"10"="↕"
)[c])
})) +
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) +
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+
labs(x="Female Samples",
y="CpG Sites with Greater Variance in Female Samples",
fill="DNA Methylation") +
theme(axis.text=element_blank(),
axis.ticks=element_blank(),
legend.position="top",
legend.justification = "center",
legend.title = element_text(size = 6, vjust = .75),
legend.text = element_text(size = 6),
# plot.background = element_rect(fill = "lightgrey"),
# panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0, "mm"),
strip.background.x = element_rect(color="black"),
strip.background.y = element_rect(fill = NA, colour = "black", size = .7),
strip.text = element_text(size=6),
strip.text.y = element_text(angle=180),
axis.title = element_text(size=8))
sigvar_heatmap_plot + theme(axis.text.x=element_text(angle = 90, size=7))
# all_x_heatmap_plot <- ggplot(heatmap_df, aes(x=factor(sample_name, levels = ordered_sample_names),
# y=factor(site, levels = ordered_sites), fill=value)) +
# geom_tile() +
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") +
# # facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) +
# scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+
# labs(x="Female Samples",
# y="CpG Sites with Greater Variance in Female Samples",
# fill="DNA Methylation") +
# theme(axis.text=element_blank(),
# axis.ticks=element_blank(),
# legend.position="top",
# legend.justification = "center",
# legend.title = element_text(size = 6, vjust = .75),
# legend.text = element_text(size = 6),
# # plot.background = element_rect(fill = "lightgrey"),
# # panel.border = element_rect(colour = "black", fill=NA, size=1),
# plot.margin = margin(r=0),
# panel.spacing.y = unit(2, "mm"),
# panel.spacing.x = unit(0, "mm"),
# strip.background.x = element_rect(color="black"),
# strip.background.y = element_rect(fill = NA, colour = "black", size = .7),
# strip.text = element_text(size=6),
# strip.text.y = element_text(angle=180),
# axis.title = element_text(size=8))
ggsave("figs/heatmap.pdf",set_panel_size(heatmap_plot), width = 85, height = 230, units = "mm")
all_x_heatmap_plot <- ggplot(heatmap_df, aes(x=factor(sample_name, levels = ordered_sample_names),
y=factor(site, levels = ordered_sites), fill=value)) +
geom_tile() +
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){
return(list(
"0"="1↑",
"1"="2↑",
"2"="3↑",
"3"="4↑",
"4"="5↑",
"5"="1↓",
"6"="2↓",
"7"="3↓",
"8"="4↓",
"9"="5↓",
"10"="↕"
)[c])
})) +
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) +
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+
labs(x="Female Samples",
y="CpG Sites with Greater Variance in Female Samples",
fill="DNA Methylation") +
theme(axis.text=element_blank(),
axis.ticks=element_blank(),
legend.position="top",
legend.justification = "center",
legend.title = element_text(size = 6, vjust = .75),
legend.text = element_text(size = 6),
# plot.background = element_rect(fill = "lightgrey"),
# panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0, "mm"),
strip.background.x = element_rect(color="black"),
strip.background.y = element_rect(fill = NA, colour = "black", size = .7),
strip.text = element_text(size=6),
strip.text.y = element_text(angle=180),
axis.title = element_text(size=8))
ggsave("figs/heatmap_all_x.png",all_x_heatmap_plot, width = 85, height = 230, units = "mm")
library(purrr)
all_dmp_df$key_qval <- unlist(pmap(all_dmp_df %>% dplyr::select(matches(glob2rx("transition_*_qval|^key_transition"))),
function(transition_1_qval, transition_2_qval, transition_3_qval, transition_4_qval, transition_5_qval, key_transition) {
# print(transition_1_qval)
switch (key_transition,
"1" = return(transition_1_qval),
"2" = return(transition_2_qval),
"3" = return(transition_3_qval),
"4" = return(transition_4_qval),
"5" = return(transition_5_qval)
)
}))
# volcano plot for all the sites (pick their key_transition for the log_qval)
all_chr_volcano <- ggplot(all_dmp_df, aes(delta, -log10(key_qval))) +
geom_point(aes(color = (hg38_chr=="chrX")), alpha = .4) +
labs(x=sprintf('Change in DNA Methylation'),
y='-log10(q-value)',
title="Most significant q-value per probe from DMP Analysis") +
# scale_x_continuous(breaks = c(-.5,0,.5)) +
theme_bw() +
base_plot_theme +
scale_color_jama(name="", limits=c(FALSE, TRUE), labels = c("Not X Chr", "X Chr"))
library(ggsignif)
beta_delta_barplots <- ggplot(all_dmp_df, aes(x=hg38_chr=="chrX", y= delta, color=hg38_chr=="chrX")) +
geom_boxplot(outlier.shape = NA) +
geom_signif(map_signif_level=T, comparisons = list(c(1,2)), color="black", y_position = .25) +
scale_y_continuous(limits = c(-.5,.3)) +
scale_x_discrete(breaks=c(FALSE, TRUE), labels=c("Not X Chr", "X Chr")) +
facet_grid(.~ delta_increasing ,space="free", shrink=T, labeller = labeller(delta_increasing=function(delta_increasing){
return(list("TRUE"="Probes Gaining Methylation",
"FALSE"="Probes Losing Methylation"
)[delta_increasing])
})) +
labs(x="",
y="Change in DNA Methylation",
title="Difference in change in DNA methylation")+
theme_bw() +
base_plot_theme +
theme(strip.text.x = element_text(size=6),axis.title.x = element_blank())+
scale_color_jama(name="", limits=c(FALSE, TRUE), labels = c("Not X Chr", "X Chr"))
library(gridExtra)
# xist_expr_plt comes from expression_analysis_notebook.Rmd
xist_expr_plot <- xist_expr_plt #+
# scale_x_discrete(breaks=0:5, labels = c("A", "B", "C", "D", "E", "F")) +
# labs(x="Sample Cluster") +
# scale_fill_grey(start=0.9, end=.3, guide=F)+
# theme_bw() +
# base_plot_theme
fig_2 <- arrangeGrob(sigvar_heatmap_plot, all_chr_volcano, beta_delta_barplots, xist_expr_plot, ncol=3, nrow=3,
widths = c(.5,5.5,6), heights = c(4,3.5,4.5),
layout_matrix=rbind(c(NA,1,4),
c(NA,1,3),
c(NA,1,2)))
ggsave("figs/main/fig_2a.pdf", sigvar_heatmap_plot, width = 85, height = 236, units = "mm")
ggsave("figs/main/fig_2_xist_expr.pdf", xist_expr_plot, width = 43, height = 64, units = "mm")
ggsave("figs/main/fig_2c.pdf", beta_delta_barplots, width = 85, height = (236)*(3.5/12), units = "mm")
ggsave("figs/main/fig_2d.pdf", all_chr_volcano, width = 85, height = (236)*(4.5/12), units = "mm")
ggsave("figs/main/fig_2.png", fig_2, width = 174, height=236,units = "mm", dpi=1000)
ggsave("figs/main/fig_2.pdf", fig_2, width = 174, height=236,units = "mm")
```
```{r}
cluster_0_samples <- (sample_clusters_df %>% filter(cluster == 0))$name
heatmap(as.matrix(female_df_no_h9_chrX %>% select(!!(sample_clusters_df %>% filter(cluster == 0))$name)))
cluster_0_outliers <- cluster_0_samples[grepl("hSF6", cluster_0_samples)]
female_df_no_h9 <- read_csv("female_df_no_h9.csv")
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
female_df_no_h9_autosome <- female_df_no_h9 %>% filter((rn %in% filter(filtered_site_annotation_df, !(hg38_chromosome %in% c("chrX", "chrY")))$rowname))
mean_autosome_methylation <- tidy(colMeans(female_df_no_h9_autosome %>% select(-rn)))
mean_autosome_methylation <- left_join(mean_autosome_methylation, sample_clusters_df, by=c("names"="name"))
mean_autosome_methylation[mean_autosome_methylation["names"] == cluster_0_outliers, "cluster"] <- 0.5
ggplot(mean_autosome_methylation, aes(x=cluster, y=x, group=cluster)) +
geom_boxplot() +
labs(y="Mean autosomal methylation",
x="Sample Cluster",
title="Mean autosomal methylation in female samples") +
scale_x_discrete(limits=c(0,0.5,1,2,3,4,5), labels=c("A", "A-outliers", "B", "C", "D", "E", "F"))
```
## Transition 1 in more detail (volcano plots and such)
```{r}
library(ggrepel)
library(gridExtra)
make_volcano_plot_modified <- function(dmp_df_temp, diff_cutoff_high = NA, diff_cutoff_low = NA, log10_qval_cutoff = 3, cluster_1_col, cluster_2_col) {
# dmp_df_temp <- all_dmp_df
dmp_df_temp$delta <- as.numeric(unlist(dmp_df_temp[cluster_2_col])) - as.numeric(unlist(dmp_df_temp[cluster_1_col]))
dmp_df_temp$log10qval <- -log10(dmp_df_temp$qval)
if (is.na(diff_cutoff_high)) {
max_delta <- max(dmp_df_temp[dmp_df_temp$log10qval > log10_qval_cutoff,]$delta)
mild.threshold.upper = (7.5 * max_delta / 10)
}else {
mild.threshold.upper <- diff_cutoff_high
}
if (is.na(diff_cutoff_low)) {
min_delta <- min(dmp_df_temp[dmp_df_temp$log10qval > log10_qval_cutoff,]$delta)
mild.threshold.lower = (8.5 * min_delta / 10)
}else {
mild.threshold.lower <- diff_cutoff_low
}
dmp_df_temp[is.na(dmp_df_temp$hg38_gene),]$hg38_gene <- as.character(dmp_df_temp[is.na(dmp_df_temp$hg38_gene),]$hg38_pos)
dmp_df_temp <- filter(dmp_df_temp, hg38_chr == "chrX")
label_rows <- ((! between(as_vector(dmp_df_temp$delta), mild.threshold.lower, mild.threshold.upper)) &
(dmp_df_temp$log10qval > log10_qval_cutoff)) | (dmp_df_temp$hg38_gene == "XIST")
x_chromosome_length <- 155270560
chunk_size <- 8e6
chr_color_df <- tibble(start=numeric(),
end=numeric(),
pos_chunk=numeric())
for (n in 1:ceiling(x_chromosome_length/chunk_size)) {
min <- (n-1) * chunk_size
max <- (n) * chunk_size
dmp_df_temp[(dmp_df_temp$hg38_pos > min) & (dmp_df_temp$hg38_pos <= max), "pos_chunk"] <- n
if (max > x_chromosome_length) {
max <- x_chromosome_length
}
chr_color_df <- add_row(chr_color_df,
start=min,
end=max,
pos_chunk=n)
}
show_legend = F
volcano_plot <- ggplot(dmp_df_temp, aes(delta, log10qval)) +
geom_point(aes(color = as_factor(pos_chunk)), alpha = .4, show.legend = F, size=.3) +
geom_text_repel(data = dmp_df_temp[label_rows,],
aes(delta, log10qval, label=hg38_gene, color = as.factor(pos_chunk)), show.legend = F, size=2, segment.size = .2) +
labs(x=sprintf('%s - %s', cluster_2_col, cluster_1_col),
y='-log10(q-value)',
title=sprintf('Transition from cluster %s to cluster %s',
unlist(strsplit(cluster_1_col, '_'))[2],
fmt = unlist(strsplit(cluster_2_col, '_'))[2])) +
theme_bw() +
base_plot_theme
position_plot <- ggplot(dmp_df_temp, aes(hg38_pos, log10qval)) +
geom_line(aes(color = as.factor(pos_chunk)), show.legend = T) +
labs(x="X Chr Coordinate",
y="log10(qval)",
title="Transition 1 DMP q-values") +
theme_bw() +
base_plot_theme +
theme(plot.title = element_text(hjust = 0.5, size=8),
axis.title = element_text(size=7),
axis.text = element_text(size = 6))
chr_plot <- ggplot(chr_color_df) +
geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=1, fill=as.factor(pos_chunk)), show.legend=F) +
theme_bw() +
base_plot_theme +
scale_y_continuous(labels = NULL, expand = c(0,0), breaks = NULL, minor_breaks = NULL) +
scale_x_continuous(limits=c(0, x_chromosome_length), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0))
return(list(volcano_plot=volcano_plot, position_plot=position_plot, chr_plot=chr_plot))
}
modified_volcano <- make_volcano_plot_modified(all_dmp_df %>% filter(hg38_chr == "chrX") %>% dplyr::rename(qval=transition_1_qval), cluster_1_col = sprintf('cluster_%d_mean', 0),
cluster_2_col = sprintf('cluster_%d_mean', 1),
diff_cutoff_high = .17, diff_cutoff_low = -.8, log10_qval_cutoff = 25)
modified_volcano$volcano_plot <- modified_volcano$volcano_plot +
labs(title="X Chromosome DNAme Changes in Transition 1",
x="Transition 1 Delta ([cluster B mean] - [cluster A mean])") +
scale_x_continuous(limits = c(-.5, .5), breaks = c(-.1*(4:0),.1*(1:4)), expand = c(0,0)) +
scale_y_continuous(limits=c(0,90), expand=c(0,0))
ggsave("figs/main/fig2c_volcano_plot.svg", modified_volcano$volcano_plot, width=85, height=32.614, units = "mm")
ggsave("figs/main/fig2c_volcano_plot.pdf", modified_volcano$volcano_plot, width = 87.4, height = 32.613, units = "mm")
chr_plot <- modified_volcano$chr_plot +
theme(axis.title = element_blank(),
plot.margin = unit(c(0,0,0,0), "mm"))
ggsave("figs/main/fig2clegend.svg", chr_plot, width = 76.786, height=6, units = "mm")
```
## Make subset heatmap with only genes of interest
```{r}
library(grid)
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz")
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
make_subset_heatmap <- function(desired_genes, gene_order=NA) {
subset_heatmap_df <- inner_join(female_df_no_h9_chrX,
dplyr::select(all_dmp_df, rowname, hg38_gene, hg38_pos), by=c("rn"="rowname")) %>%
filter(hg38_gene %in% desired_genes)
subset_heatmap_df <- arrange(subset_heatmap_df, hg38_pos)
ordered_sites <- subset_heatmap_df$rn
if (!is.na(gene_order)) {
ordered_sites <- c()
for (i in 1:length(desired_genes)) {
g <- desired_genes[i]
o <- gene_order[i]
if(o == T) {
ordered_sites <- c(ordered_sites, arrange(subset_heatmap_df %>% filter(hg38_gene == g), hg38_pos)$rn)
}else {
ordered_sites <- c(ordered_sites, arrange(subset_heatmap_df %>% filter(hg38_gene == g), desc(hg38_pos))$rn)
}
}
}
subset_heatmap_df <- dplyr::select(subset_heatmap_df, -hg38_pos)
sample_clusters_df <- read_csv("saved_sample_clusters.csv",
col_names = c('name','cluster')) %>% arrange(cluster)
ordered_sample_names <- sample_clusters_df$name
subset_heatmap_df <- reshape2::melt(subset_heatmap_df, id.vars=c("rn", "hg38_gene")) %>% dplyr::rename(site=rn, sample_name=variable)
subset_heatmap_df <- inner_join(subset_heatmap_df, sample_clusters_df, by=c("sample_name"="name")) %>% dplyr::rename(sample_cluster="cluster")
convert_int_to_letters <- Vectorize(vectorize.args = "c",
FUN = function(c) {
switch(c+1,
"A",
"B",
"C",
"D",
"E",
"F",
)
})
subset_heatmap_df <- subset_heatmap_df %>% mutate(sample_cluster=convert_int_to_letters(sample_cluster))
subset_heatmap <- ggplot(subset_heatmap_df, aes(x=factor(sample_name, levels = ordered_sample_names),
y=factor(site, levels = ordered_sites), fill=value)) +
geom_tile() +
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") +
facet_grid(factor(hg38_gene, levels=desired_genes) ~ sample_cluster,space="free_x", scales = "free", shrink=T, switch = "y") +
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE",
midpoint = .5,
limits=c(0,1),
breaks=c(0,.5,1),
labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) +
labs(x="Female Samples",
y="",
fill="DNA Methylation") +
theme(axis.text=element_blank(),
axis.ticks=element_blank(),
legend.position="top",
legend.justification = "center",
# legend.title = element_text(size = 6, vjust = .8),
# legend.text = element_text(size = 6),
# legend.position="top",
# legend.justification = "right",
legend.title = element_text(size = 4, vjust = .8),
legend.text = element_text(size = 4),
legend.key.height = unit(2.5, "mm"),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0.2, "mm"),
# strip.background.x = element_rect(color="black"),
strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
axis.title = element_text(size=7))
return(subset_heatmap)
}
long_arm_heatmap <- make_subset_heatmap(c("XIST", "FIRRE"), gene_order=c(F,T)) + theme(legend.position="bottom", legend.justification = "right", legend.box.margin = margin(t=-4,r=3, unit="mm"))
ggsave("figs/main/fig2_xist_firre_heatmap.pdf", long_arm_heatmap, width=85, height = 65, units = "mm")
fig_3 <- arrangeGrob(modified_volcano$volcano_plot, modified_volcano$position_plot, long_arm_heatmap, layout_matrix = cbind(c(1,2), c(3,3)), widths = c(70.5, 63.5))
ggsave("figs/main/fig3.svg", fig_3, width = 134, height = 118, units = "mm")
update_geom_defaults("boxplot",
list(size=.3,
outlier.shape=NA))
update_geom_defaults("point",
list(shape=20,
size=.5))
dummy <- data.frame(cluster=c("A", "A"),
expression=c(20, 12),
hgnc_symbol=c("XIST", "FIRRE"))
xist_expression <- plot_gene_expression(c("XIST", "FIRRE"), multiple=T) +
theme_bw() +
geom_signif(comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5)),step_increase=.12, margin_top=-.4, tip_length = 0, size=.3, textsize = 2, test="wilcox.test") +
base_plot_theme +
geom_blank(data=dummy) +
theme(strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3))
ggsave("figs/main/fig_2_xist_expr.pdf", xist_expression, width = 43.9, height = 64, units = "mm")
firre_expression <- plot_gene_expression("FIRRE") +
theme_bw() +
base_plot_theme
ggsave("figs/main/fig_2_firre_expr.pdf", firre_expression, width = 43.9, height = 32, units = "mm")
meth_array_density <- stat_bin(all_dmp_df %>% filter(hg38_chr == "chrX"), mapping=aes(x=hg38_pos, y = (..count..)/sum(..count..)), stat="bin", bins = 100)
meth_array_density <- ggplot_build(ggplot() + meth_array_density)$data[[1]] %>% arrange(x)
non_sig_density <- stat_bin(all_dmp_df %>% filter(hg38_chr == "chrX" & (passes_threshold == F)),
mapping = aes(x=hg38_pos, y = (..count..)/sum(..count..)), breaks = meth_array_density$xmin)
non_sig_density <- ggplot_build(ggplot() + non_sig_density)$data[[1]] %>% arrange(x)
ks.test(meth_array_density$y, non_sig_density$y)
tran_1_density <- stat_bin(all_dmp_df %>% filter(hg38_chr == "chrX" & (key_transition == 1) & (passes_threshold == T)),
mapping = aes(x=hg38_pos, y = (..count..)/sum(..count..)), breaks = meth_array_density$xmin)
tran_1_density <- ggplot_build(ggplot() + tran_1_density)$data[[1]] %>% arrange(x)
ks.test(meth_array_density$y, tran_1_density$y)
transition_1_not_random_plot <- ggplot() +
geom_area(all_dmp_df %>% filter(hg38_chr == "chrX"), mapping=aes(x=hg38_pos, y = (..count..)/sum(..count..)), stat="bin", bins = 100, fill="grey") +
geom_freqpoly(all_dmp_df %>% filter(hg38_chr == "chrX" & (passes_threshold == F)),
mapping = aes(x=hg38_pos, y = (..count..)/sum(..count..), color="non-significant"),
stat="bin", bins = 100, size=.3) +
geom_freqpoly(all_dmp_df %>% filter(hg38_chr == "chrX" & (key_transition == 1) & (passes_threshold == T)),
mapping = aes(x=hg38_pos, y = (..count..)/sum(..count..), color="transition_1"),
stat="bin", bins = 100, size=.3) +
theme_bw() +
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
labs(x="X Chr Coordinate (Mb)",
y="Percent of Probes",
title = "Probes Changing in Transition 1 vs Probes Not Changing in XCE") +
scale_color_manual( name="Probe Type",
labels=c("1↑ & 1↓", "~"),
limits=c("transition_1", "non-significant"),
values = c(pal_d3("category10")(1), "#000000"),
guide=F) +
base_plot_theme
ggsave("figs/main/fig2b.pdf", transition_1_not_random_plot, width = 87.4, height = 32.613, units = "mm")
# one more plot about the centers, and the origins of the early transitions. using the xist correlations
```
## Look at what is moving with XIST to try to pinpoint early changes
```{r}
library(tidyverse)
library(ggplot2)
all_dmp_df <- read_csv("all_dmp_df.csv")
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv")
female_df_no_h9 <- read_csv("female_df_no_h9.csv")
x_annotation_df <- read_csv("x_annotation.csv")
sample_clusters_df <- read_csv("saved_sample_clusters.csv",
col_names = c('name','cluster'))
clutser_0_samples <- (sample_clusters_df %>% filter(cluster == 0))$name
female_df_no_h9_chrX <- female_df_no_h9_chrX %>% select(rn, !!clutser_0_samples)
female_df_no_h9 <- female_df_no_h9 %>% select(rn, !!clutser_0_samples)
xist_sites_of_interest <- c("cg03554089", "cg12653510", "cg05533223", "cg11717280", "cg20698282")
# xist_sites_of_interest <- filter(all_dmp_df, hg38_gene == "XIST" & (hg38_refgene_group %in% c("TSS200")))$rowname
# female_df_no_h9 <- rownames_to_column(female_df_no_h9, var="rn")
mean_interesting_xist_sites <- colMeans(female_df_no_h9_chrX %>% filter(rn %in% xist_sites_of_interest) %>% dplyr::select(-rn)) /
colMeans(female_df_no_h9 %>% filter(!(rn %in% x_annotation_df$rowname)) %>% dplyr::select(-rn))
# xist_cpg2_sites <- c("cg03554089", "cg12653510", "cg05533223")
# mean_xist_cpg2_sites <- colMeans(female_df_no_h9_chrX %>% filter(rn %in% xist_cpg2_sites) %>% dplyr::select(-rn))
#
# ggplot() +
# geom_point(aes(x=mean_interesting_xist_sites, mean_xist_cpg2_sites))
mean_interesting_xist_sites_df <- as.data.frame(mean_interesting_xist_sites) %>%
rownames_to_column(var="sample_name") %>%
dplyr::rename(xist_methylation=2)
# quants <- quantile(mean_interesting_xist_sites)
quants <- c(0,0,0,0,0)
quants[c(1,5)] <- range(mean_interesting_xist_sites)
quants[3] <- mean(mean_interesting_xist_sites)
quants[2] <- mean(quants[c(1,3)])
quants[4] <- mean(quants[c(5,3)])
# quants <- c(0.5470616,0.641993,0.7369243,.825,0.9267870)
sorted_sample_names <- names(sort(mean_interesting_xist_sites))
quant_1_samples <- names(mean_interesting_xist_sites[mean_interesting_xist_sites < quants[2]])
quant_2_samples <- names(mean_interesting_xist_sites[(mean_interesting_xist_sites >= quants[2]) &
(mean_interesting_xist_sites < quants[3])])
quant_3_samples <- names(mean_interesting_xist_sites[(mean_interesting_xist_sites >= quants[3]) &
(mean_interesting_xist_sites < quants[4])])
quant_4_samples <- names(mean_interesting_xist_sites[mean_interesting_xist_sites >= quants[4]])
correlating_probes_df <- female_df_no_h9_chrX
autosome_mean <- colMeans(female_df_no_h9 %>% filter(!(rn %in% x_annotation_df$rowname)) %>% dplyr::select(-rn))
for (sample_name in names(autosome_mean)) {
correlating_probes_df[sample_name] <- correlating_probes_df[sample_name] / autosome_mean[sample_name]
}
correlating_probes_df$correlation_to_xist <- as.numeric(cor(t(dplyr::select(correlating_probes_df, -rn)), mean_interesting_xist_sites))
correlating_probes_df$correlation_to_xist_quant_1 <- as.numeric(cor(t(dplyr::select(correlating_probes_df, !!quant_1_samples)),
mean_interesting_xist_sites[quant_1_samples]))
correlating_probes_df$correlation_to_xist_quant_2 <- as.numeric(cor(t(dplyr::select(correlating_probes_df, !!quant_2_samples)),
mean_interesting_xist_sites[quant_2_samples]))
correlating_probes_df$correlation_to_xist_quant_3 <- as.numeric(cor(t(dplyr::select(correlating_probes_df, !!quant_3_samples)),
mean_interesting_xist_sites[quant_3_samples]))
correlating_probes_df$correlation_to_xist_quant_4 <- as.numeric(cor(t(dplyr::select(correlating_probes_df, !!quant_4_samples)),
mean_interesting_xist_sites[quant_4_samples]))
correlating_probes_df <- left_join(correlating_probes_df, x_annotation_df %>% select(rowname, hg38_pos, hg38_gene_name), by=c("rn"="rowname"))
data_df <- reshape2::melt(correlating_probes_df %>%
select(rn, hg38_pos, hg38_gene_name,
!!sorted_sample_names), id.vars=c("rn", "hg38_pos", "hg38_gene_name")) %>%
rename(sample_name = variable)
data_df <- left_join(data_df, mean_interesting_xist_sites_df, by=c("sample_name"="sample_name"))
# TODO: get rid of probes that don't change much
data_df_top <- data_df %>% group_by(rn) %>% arrange(xist_methylation) %>% top_n(5) %>%
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE)))
data_df_bottom <- data_df %>% group_by(rn) %>% arrange(xist_methylation) %>% top_n(-5) %>%
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE)))
data_df_top <- left_join(data_df_top, data_df_bottom, by="rn", suffix=c(".start", ".end"))
probes_that_are_changing <- dplyr::filter(data_df_top, mean.end - mean.start >= .3)$rn
ggplot(data_df %>% filter(rn %in% probes_that_are_changing), aes(x=xist_methylation, y=value)) +
geom_smooth(aes(color=rn, alpha=.2), show.legend = F, alpha=.2, se=F, span=.2, size=.2)
# normalize to 0 to .5
library(scales)
#average by gene
normalized_correlating_probes_df <- correlating_probes_df %>% group_by(hg38_gene_name) %>% summarise_at(vars(hg38_pos, !!sorted_sample_names), funs(mean)) %>% ungroup() %>% column_to_rownames(var="hg38_pos") %>% select(-hg38_gene_name)
normalized_correlating_probes_df <- correlating_probes_df %>% select(rn, !!sorted_sample_names) %>% column_to_rownames(var="rn")
normalized_correlating_probes_df <- t(apply(normalized_correlating_probes_df, 1, rescale, to=c(0,.5)))
normalized_correlating_probes_df <- as.data.frame(normalized_correlating_probes_df) %>% rownames_to_column(var="rn")
normalized_correlating_probes_df <- left_join(normalized_correlating_probes_df, x_annotation_df %>% select(rowname, hg38_pos, hg38_gene_name), by=c("rn"="rowname"))
data_norm_df <- reshape2::melt(normalized_correlating_probes_df %>%
select(hg38_pos=rn,
!!sorted_sample_names), id.vars=c("hg38_pos")) %>%
rename(sample_name = variable)
data_norm_df <- reshape2::melt(normalized_correlating_probes_df %>%
select(rn, hg38_pos, hg38_gene_name,
!!sorted_sample_names), id.vars=c("rn", "hg38_pos", "hg38_gene_name")) %>%
rename(sample_name = variable)
data_norm_df <- left_join(data_norm_df, mean_interesting_xist_sites_df, by=c("sample_name"="sample_name"))
data_norm_df <- data_norm_df %>% filter(xist_methylation <= 1.7)
ggplot(data_norm_df %>% filter(rn %in% probes_that_are_changing), aes(x=xist_methylation, y=value)) +
geom_smooth(aes(color=rn, alpha=.2), show.legend = F, alpha=.2, se=F, span=.2, size=.2)
# where does the probe lose .2 from it's starting mean
data_norm_df_top <- data_norm_df %>% group_by(rn) %>% arrange(xist_methylation) %>% top_n(5) %>%
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE)))
data_norm_df_bottom <- data_norm_df %>% group_by(rn) %>% arrange(xist_methylation) %>% top_n(-5) %>%
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE)))
data_norm_df_top <- left_join(data_norm_df_top, data_norm_df_bottom, by="rn", suffix=c("_start", "_end"))
data_norm_df_top <- data_norm_df_top %>% dplyr::mutate(mean_threshold = (mean_start *.8))
xist_ranges <- (100:199)/100
i <- 1
data_norm_df$passing_fraction <- 0
for(xist_min in xist_ranges) {
print(i)
data_norm_df_range <- data_norm_df %>% filter((rn %in% probes_that_are_changing) &
(xist_methylation >= xist_min) &
(xist_methylation < (xist_min + .05)))
if(nrow(data_norm_df_range) > 0) {
data_norm_df_range_mean <- data_norm_df_range %>% group_by(rn) %>%
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE)))
data_norm_df_range_mean <- left_join(data_norm_df_range_mean, data_norm_df_top %>% dplyr::select(rn, mean_threshold), by="rn")
probes_passing_threshold <- (data_norm_df_range_mean %>% filter(mean <= mean_threshold))$rn
if (length(probes_passing_threshold) > 0 & sum((data_norm_df$rn %in% probes_passing_threshold) & (data_norm_df$passing_fraction == 0)) > 0) {
data_norm_df[(data_norm_df$rn %in% probes_passing_threshold) & (data_norm_df$passing_fraction == 0),]$passing_fraction <- i
}
}
i <- i + 1
}
ggplot(data_norm_df %>% filter(rn %in% probes_that_are_changing), aes(x=xist_methylation, y=value)) +
geom_smooth(aes(color=as.factor(passing_fraction)), show.legend = T, se=F, span=.2, size=.5) + scale_color_viridis_d()
ggplot(data_norm_df %>% filter((rn %in% probes_that_are_changing) & (passing_fraction > 1) & (passing_fraction <= 40)) %>% group_by(rn) %>% filter(row_number() == 1) %>% ungroup()) +
geom_freqpoly(aes(hg38_pos), bins=500)
# fitting sigmoid curve models
# sigmoidal_formula <- value ~ L / 1+exp(-k*(xist_expr - x0))
# L / 1+exp(-k*(xist_methylation - x0)
# test_df <- data_norm_df %>% filter(rn == "cg00650640")
# x <- test_df$xist_methylation
# y <- test_df$value
# fit <- nls(y ~ SSlogis(x, Asym, xmid, scal), data = data.frame(x, y))
# xmid_estimate <- fit$m$getPars()[2]
get_midpoint_estimate <- function(test_df) {
x <- test_df$xist_methylation
y <- test_df$value
fit <- NA
try(fit <- nls(y ~ SSlogis(x, Asym, xmid, scal), data = data.frame(x, y)))
if (length(fit) == 1 & is.na(fit)) {
return(NA)
}else {
xmid_estimate <- as.numeric(unlist(fit$m$getPars()[2]))
}
xmid_estimate <- as.numeric(unlist(fit$m$getPars()[2]))
# print(xmid_estimate)
return(xmid_estimate)
}
mods <- data_norm_df %>%
group_by(rn) %>%
do(xmid = get_midpoint_estimate(.))
mods <- mods %>% mutate(fixed_xmid = unlist(xmid))
ggplot(mods %>% filter(rn %in% probes_that_are_changing)) +
geom_freqpoly(aes(fixed_xmid), bins=500) +
scale_x_continuous(limit = c(1,2)) +
geom_vline(xintercept = 1.54, color="green") +
labs(title="xmid distribution from sigmoid curve fit")
early_probes_blue <- (mods %>% filter(fixed_xmid < 1.55))$hg38_pos
View(data_norm_df %>% filter(rn %in% early_probes_blue) %>% group_by(rn) %>% filter(row_number() == 1) %>% ungroup() %>% dplyr::select(rn, hg38_pos, hg38_gene_name))
# look at the distribution of xmids along the X chromosome
dat_df <- data_norm_df # %>% group_by(rn) %>% filter(row_number() == 1) %>% ungroup()
dat_df <- left_join(dat_df, mods %>% select(hg38_pos, fixed_xmid), by="hg38_pos")
ggplot(dat_df %>% filter(hg38_pos %in% early_probes_blue), aes(x=as.numeric(hg38_pos), y=fixed_xmid)) +
geom_smooth(span=.1) +
geom_point() +
scale_y_continuous(limits = c(1,2)) +
scale_x_continuous(breaks = (1:14)*1e7) +
labs(title="x mid along x chromosome - smooth")
data_df <- reshape2::melt(correlating_probes_df %>%
filter( abs(correlation_to_xist) >= .5) %>%
select(rn,
correlation_to_xist_quant_1,
correlation_to_xist_quant_2,
correlation_to_xist_quant_3,
correlation_to_xist_quant_4), id.vars=c("rn"))
data_df <- left_join(data_df, x_annotation_df %>% select(rowname, hg38_pos, hg38_gene_name), by=c("rn"="rowname"))
xist_correlating_probes <- filter(correlating_probes_df, abs(correlation_to_xist_quant_3) >= .5)$rn
ggplot(data_df) +
geom_boxplot(aes(x=variable, y=abs(value)), show.legend=F)
ggplot(data_df) +
geom_point(aes(x=hg38_pos, y=value, color=variable), show.legend=T, alpha=.5)
ggplot(correlating_probes_df) +
geom_line(aes(x=hg38_pos, y=abs(correlation_to_xist)))
ggplot() +
geom_line(data = all_dmp_df %>% filter(hg38_chr == "chrX"), aes(x=hg38_pos, y=1/(transition_1_qval)), color="blue", alpha=.3) +
geom_line(data=correlating_probes_df, aes(x=hg38_pos, y=(abs(correlation_to_xist_quant_1))), color="red", alpha=.3)
xist_corr <- correlating_probes_df$correlation_to_xist
names(xist_corr) <- correlating_probes_df$rn
tran_1_qval <- (all_dmp_df %>% filter(hg38_chr == "chrX"))$transition_1_qval
names(tran_1_qval) <- (all_dmp_df %>% filter(hg38_chr == "chrX"))$hg38
cor.test(abs(xist_corr), tran_1_qval[names(xist_corr)], method = "kendall")
data_df <- left_join(all_dmp_df %>% filter(hg38_chr == "chrX") %>% select(rowname, hg38_pos, transition_1_qval, hg38_gene),
correlating_probes_df %>% select(rn, correlation_to_xist, correlation_to_xist_quant_1,
correlation_to_xist_quant_2, correlation_to_xist_quant_3,
correlation_to_xist_quant_4),
by=c("rowname"="rn"))
x_chromosome_length <- 155270560
chunk_size <- 8e6
for (n in 1:ceiling(x_chromosome_length/chunk_size)) {
min <- (n-1) * chunk_size
max <- (n) * chunk_size
data_df[(data_df$hg38_pos > min) & (data_df$hg38_pos <= max), "pos_chunk"] <- n
}
ggplot(data_df) +
geom_point(aes(x=abs(correlation_to_xist), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) +
labs(title="transition 1 qval vs correlation to xist")
cor.test(abs(data_df$correlation_to_xist), data_df$transition_1_qval, method = "spearman")
ggplot(data_df) +
geom_point(aes(x=abs(correlation_to_xist_quant_1), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) +
labs(title="transition 1 qval vs correlation to xist quant 1")
ggplot(data_df) +
geom_point(aes(x=abs(correlation_to_xist_quant_2), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) +
labs(title="transition 1 qval vs correlation to xist quant 2")
ggplot(data_df) +
geom_point(aes(x=abs(correlation_to_xist_quant_3), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) +
labs(title="transition 1 qval vs correlation to xist quant 3")
ggplot(data_df) +
geom_point(aes(x=abs(correlation_to_xist_quant_4), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) +
labs(title="transition 1 qval vs correlation to xist quant 4")
data_df <- left_join(all_dmp_df %>% filter(hg38_chr == "chrX") %>% select(rowname, hg38_gene, hg38_pos, hg38_refgene_group),
correlating_probes_df %>% select(rn, corr_3=correlation_to_xist_quant_3, corr_2=correlation_to_xist_quant_2),
by=c("rowname"="rn"))
ggplot(data_df) +
geom_smooth(aes(x=hg38_pos, y=abs(corr_3)))
data_df$pseudo_position <- order(order(data_df$hg38_pos))
data_df_filtered <- data_df %>% group_by(hg38_gene) %>% dplyr::arrange(desc(corr_3)) %>% filter(row_number() <= 3) %>% ungroup()
cutoffs <- c()
num_passings <- c()
for(c in 1:90) {
cutoff <- c/100
pseudo_positions_above_cutoff <- filter(data_df_filtered, abs(corr_3) >= cutoff)$pseudo_position
num_passings <- c(num_passings, length(pseudo_positions_above_cutoff))
cutoffs <- c(cutoffs, cutoff)
}
cutoff_df <- tibble(cutoff=cutoffs, num_passing=num_passings)
ggplot(cutoff_df, aes(x=cutoff, y=num_passing)) + geom_line() + labs(title = "optimizing cutoff")
cutoff <- .4
pseudo_positions_above_cutoff <- filter(data_df_filtered, abs(corr_3) >= cutoff)$pseudo_position
# library(BAMMtools)
# jbreaks <- getJenksBreaks(pseudo_positions_above_cutoff, 5)
k_min <- 1
k_max <- 15
wss <- sapply(k_min:k_max,
function(k){kmeans(pseudo_positions_above_cutoff, k, nstart=50,iter.max = 15 )$tot.withinss})
elbow_df <- tibble(num_clusters = k_min:k_max, wcss = wss)
ggplot(elbow_df, aes(x=num_clusters, y=wcss)) +
geom_point() +
geom_line() +
labs(x="Number of clusters K",
y="Total within-clusters sum of squares",
title = "Optimal k-means clusters")
# pick k=4
k=5
kmeans <- kmeans(pseudo_positions_above_cutoff, k, nstart=50, iter.max = 15)
center_genes <- c()
center_positions <- c()
for(c in kmeans$centers) {
rounded_c <- round(c)
center_genes <- c(center_genes, unlist(filter(data_df, pseudo_position == rounded_c)$hg38_gene))
center_positions <- c(center_positions, unlist(filter(data_df, pseudo_position == rounded_c)$hg38_pos))
}
print(center_positions/1e6)
positions_above_cutoff <- filter(data_df_filtered, abs(corr_3) >= cutoff)$hg38_pos
position_cluster_df <- tibble(positions=positions_above_cutoff,
cluster=kmeans$cluster,
correlation_to_xist=filter(data_df_filtered, abs(corr_3) >= cutoff)$corr_3)
library(SDMTools)
centers_df <- position_cluster_df %>% group_by(cluster) %>% summarise(center=weighted.mean(positions, correlation_to_xist), stderr = wt.sd(positions, correlation_to_xist)/sqrt(length(positions)))
write_csv(data_df, "xist_correlation_df.csv")
python_centers <- c(1109.799248,
5935.39440388,
7162.58421653,
4498.5805607)
center_genes <- c()
center_positions <- c()
for(c in python_centers) {
rounded_c <- round(c)
center_genes <- c(center_genes, unlist(filter(data_df, pseudo_position == rounded_c)$hg38_gene))
center_positions <- c(center_positions, unlist(filter(data_df, pseudo_position == rounded_c)$hg38_pos))
}
print(center_positions/1e6)
# get real center based on clusters
positions_above_cutoff <- filter(sample_cluster_0_noh9_chrX_df, abs(correlation_to_xist) >= cutoff)$pos
```
## Bedgraph files for the first transition
```{r}
library(tidyverse)
library(ggplot2)
all_dmp_df <- read_csv('all_dmp_df.csv')
# do this based on genes and when the promoters change their methylation.
x_annotation_df <- read_csv("x_annotation.csv")
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv')
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname)
bedgraph_header <- function(name="", description="") {
return(sprintf('track type=bedGraph name="%s" description="%s" color=1,0,0 altColor=1,0,0 priority=priority autoScale=on alwaysZero=on gridDefault=on maxHeightPixels=100:60:8 visbility=full graphType=bar viewLimits=0:1 yLineMark=0 yLineOnOff=off windowingFunction=mean+whiskers smoothingWindow=8', name, description))
}
# windowing the bed file
window_average_chrX <- function(bed_file_df, window_size = 1000000, step_size = 10000, include_empty=FALSE, calculate_sum=F) {
# bed_file_df <- bed_df
x_chr_size <- 155270560
windowed_bed_df <- tibble(chr = character(),
pos_start = numeric(),
pos_end = numeric(),
delta = numeric())
for (s in 0:floor(155270560 / step_size)) {
start_position <- s * step_size
end_position <- min((start_position + window_size), x_chr_size)
num_rows <- nrow(bed_file_df %>% filter((chr == "chrX") & (pos >= start_position) & (pos <= end_position))) # (abs((!!start_position) - pos) <= (window_size/2))))
if ((num_rows > 0) | (include_empty == T)) {
if (calculate_sum == F) {
window_mean_delta <- mean((bed_file_df %>% filter((chr == "chrX") & (abs(!!start_position - pos) <= (window_size/2))))$delta, na.rm=T)
if(is.na(window_mean_delta)) {
window_mean_delta <- 0
}
windowed_bed_df <- add_row(windowed_bed_df,
chr="chrX",
pos_start=start_position,
pos_end=end_position,
delta=window_mean_delta)
}else{
window_sum_delta <- sum((bed_file_df %>% filter((chr == "chrX") & (abs(!!start_position - pos) <= (window_size/2))))$delta, na.rm=T)
if(is.na(window_sum_delta)) {
window_sum_delta <- 0
}
windowed_bed_df <- add_row(windowed_bed_df,
chr="chrX",
pos_start=start_position,
pos_end=end_position,
delta=window_sum_delta)
}
}
}
return(windowed_bed_df)
}
# for (t in 1:5) {
# write(bedgraph_header(name=sprintf("all_delta_%d", t),
# description=sprintf("all deltas from %d transition 1Mb windowed", t)),
# sprintf("bedgraphs/windowed_all_delta_tran_%d.bedgraph", t))
# bed_df <- all_dmp_df %>% filter(chr == "chrX")
# cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1)
# cluster_col_1 <- sprintf("cluster_%d_mean", t)
# bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0])
#
# bed_df$pos_start <- bed_df$pos - 30
# bed_df$pos_end <- bed_df$pos + 30
#
# write_tsv(window_average_chrX(bed_file_df=bed_df), sprintf("bedgraphs/windowed_all_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE)
# # write_tsv(bed_df[c("chr", "pos_start", "pos_end", "delta")], sprintf("bedgraphs/all_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE)
# }
#
# for (t in 1:5) {
# write(bedgraph_header(name=sprintf("non_eroding_delta_%d", t),
# description=sprintf("non_eroding deltas from %d transition 1Mb windowed", t)),
# sprintf("bedgraphs/windowed_non_eroding_delta_tran_%d.bedgraph", t))
# bed_df <- all_dmp_df %>% filter((chr == "chrX") & (passes_threshold == F))
# cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1)
# cluster_col_1 <- sprintf("cluster_%d_mean", t)
# bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0])
#
# bed_df$pos_start <- bed_df$pos - 30
# bed_df$pos_end <- bed_df$pos + 30
#
# write_tsv(window_average_chrX(bed_file_df=bed_df), sprintf("bedgraphs/windowed_non_eroding_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE)
#
# }
for (t in 1:5) {
write(bedgraph_header(name=sprintf("eroding_delta_%d", t),
description=sprintf("eroding deltas from %d transition 1Mb windowed", t)),
sprintf("bedgraphs/windowed_eroding_delta_tran_%d.bedgraph", t))
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T))
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1)
cluster_col_1 <- sprintf("cluster_%d_mean", t)
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0])
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), sprintf("bedgraphs/windowed_eroding_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE)
}
for (t in 1:5) {
write(bedgraph_header(name=sprintf("noneroding_delta_%d", t),
description=sprintf("noneroding deltas from %d transition 1Mb windowed", t)),
sprintf("bedgraphs/windowed_eroding_delta_tran_%d.bedgraph", t))
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == F))
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1)
cluster_col_1 <- sprintf("cluster_%d_mean", t)
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0])
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), sprintf("bedgraphs/windowed_noneroding_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE)
}
```
## Correlation with histone methylation marks (K27 and K9)
```{r}
library(tidyverse)
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
h3k27me3_peaks <- read_table2("bedgraphs/GSM1528885_H9_XISTpos_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal"))
h3k27me3_peaks_chrX <- h3k27me3_peaks %>% filter(chr == "chrX")
h3k27me3_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k27me3_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
# h3k9me3_signal <- read_tsv("bedgraphs/GSM1528888_H9_XISTpos_H3K9me3.signal.bed", skip=1, col_names=c("chr", "start", "end", "signal"))
# h3k9me3_signal_chrX <- h3k9me3_signal %>% filter(chr == "chrX")
# h3k27me3_xe_signal <- read_table2("bedgraphs/GSM1528886_H9_XISTneg_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal"))
# h3k27me3_xe_signal_chrX <- h3k27me3_signal %>% filter(chr == "chrX")
# h3k27me3_xe_signal_chrX_avg <- window_average_chrX(bed_file_df=h3k27me3_xe_signal_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
#
#
# h3k9me3_xe_signal <- read_tsv("bedgraphs/GSM1528889_H9_XISTneg_H3K9me3.signal.bed", skip=1, col_names=c("chr", "start", "end", "signal"))
# h3k9me3_xe_signal_chrX <- h3k9me3_signal %>% filter(chr == "chrX")
h3k9_xi_peaks <- read_table2("bedgraphs/GSM1528888_H9_XISTpos_K9me3.peaks.bed.gz", col_names=c("chr", "start", "end", "signal"))
h3k9_xi_peaks_chrX <- h3k9_xi_peaks %>% filter(chr == "chrX")
h3k9_xi_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k9_xi_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
# h3k9_xe_peaks <- read_table2("/pinterlab/GSM1528889_H9_XISTneg_K9me3.peaks.bed", col_names=c("chr", "start", "end", "signal"))
# h3k9_xe_peaks_chrX <- h3k9_xe_peaks %>% filter(chr == "chrX")
# h3k9_xe_peaks_windowed <- window_average_chrX(bed_file_df=h3k9_xe_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
histone_mark_correlations <- tibble(transition=numeric(),
h3k27me3_correlation=numeric(),
h3k27me3_correlation_p_val=numeric(),
h3k9me3_correlation=numeric(),
h3k9me3_correlation_p_val=numeric())
for (t in 1:5) {
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t))
averaged_distance <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T)
k27_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_peaks_chrX_windowed$delta)
k9_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xi_peaks_chrX_windowed$delta) # h3k9me3_signal_chrX$signal)
# k27_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_xe_signal_chrX$signal)
# k9_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xe_peaks_windowed$delta)
histone_mark_correlations <- add_row(histone_mark_correlations,
transition=t,
h3k27me3_correlation=k27_cor_test$estimate,
h3k27me3_correlation_p_val=k27_cor_test$p.value,
h3k9me3_correlation=k9_cor_test$estimate,
h3k9me3_correlation_p_val=k9_cor_test$p.value)
# h3k27me3_xe_correlation=k27_xe_cor_test$estimate,
# h3k9me3_xe_correlation=k9_xe_cor_test$estimate)
}
histone_cor_df <- reshape2::melt(histone_mark_correlations %>%
dplyr::select(transition,
h3k27me3_correlation,
h3k9me3_correlation),
id.vars=c("transition"))
histone_cor_df$variable <- factor(histone_cor_df$variable, levels=c("h3k9me3_correlation", "h3k27me3_correlation"))
histon_cor_plot <- ggplot(histone_cor_df) +
geom_bar(aes(x=transition, y=value, fill=variable), stat="identity", position=position_dodge()) +
theme_bw() +
base_plot_theme +
coord_flip() +
facet_grid(rows=vars(sprintf("Tran %d", transition)), scales="free_y")+
labs(title="Correlation of DMPs With Histone Meth.",
x="",
y="Pearson Correlation",
fill="Histone Modification") +
scale_fill_manual(limits=c("h3k27me3_correlation", "h3k9me3_correlation"),
labels=c("H3K27me3", "H3K9me3"),
values = c(pal_jama()(1), "#8C564B"),
guide=F) +
scale_x_continuous(labels = NULL) +
scale_y_continuous(limits=c(-.13,.33)) +
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
panel.grid.minor.y = element_blank(),
axis.ticks.length.y = unit(0, "mm"),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
panel.spacing.y = unit(.75, "mm"))
# theme(legend.text = element_text(size=4),
# legend.title = element_text(size=4,face="bold"),
# legend.key.width = unit(2, units="mm"),
# legend.key.height = unit(3, units="mm"),
# legend.key.size = unit(.2, units = "mm"),
# axis.ticks.length.y = unit(0, "mm"),
# axis.title.y = element_text(size=0, margin = margin(0,0,0,0)),
# axis.text.y = element_text(size=0, margin = margin(0,0,0,0)),
# strip.text = element_text(size=4, margin = margin(.25,0,.25,0, "mm")),
# strip.background = element_rect(size = .3),
# panel.spacing.y = unit(.5, "mm"))
ggsave("figs/main/fig_4_histone_cor_plot.svg", histon_cor_plot, width = 46, height = 40, units = "mm")
ggsave("figs/main/fig_4_histone_cor_plot_no_guide.pdf", histon_cor_plot, width = 46, height = 47.5, units = "mm")
ggsave("figs/main/fig_4_histone_cor_plot_spearman.svg", histon_cor_plot, width = 46, height = 47.5, units = "mm")
```
## Plotting the browser view
```{r}
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
plotting_bedgraph_df <- tibble(chr=character(),
start_pos=numeric(),
end_pos=numeric(),
value_num=numeric(),
type=character())
bedgraphs <- list(H3K27me3="bedgraphs/GSM1528885_H9_XISTpos_H3K27me3.signal.bed.gz",
H3K9me3="bedgraphs/GSM1528888_H9_XISTpos_H3K9me3.signal.bed.gz",
delta_tran_1="bedgraphs/windowed_eroding_delta_tran_1.bedgraph",
delta_tran_2="bedgraphs/windowed_eroding_delta_tran_2.bedgraph",
delta_tran_3="bedgraphs/windowed_eroding_delta_tran_3.bedgraph",
delta_tran_4="bedgraphs/windowed_eroding_delta_tran_4.bedgraph",
delta_tran_5="bedgraphs/windowed_eroding_delta_tran_5.bedgraph",
delta_H9="bedgraphs/windowed_eroding_delta_tran_H9.bedgraph")
for (bedname in names(bedgraphs)) {
bed_file <- read_tsv(as.character(bedgraphs[bedname]),
skip = 1,
col_names = c("chr", "start_pos", "end_pos", "value_num"))
bed_file$type <- bedname
bed_file[bed_file$start_pos >= 58100000 & bed_file$end_pos <= 63000000, ]$value_num <- 0
plotting_bedgraph_df <- bind_rows(plotting_bedgraph_df, bed_file)
}
library(scales)
library(facetscales)
scales_y <- list(
H3K27me3 = scale_y_continuous(),
H3K9me3 = scale_y_continuous(),
delta_tran_1 = scale_y_continuous(limits = c(-.5,.5)),
delta_tran_2 = scale_y_continuous(limits = c(-.5,.5)),
delta_tran_3 = scale_y_continuous(limits = c(-.5,.5)),
delta_tran_4 = scale_y_continuous(limits = c(-.5,.5)),
delta_tran_5 = scale_y_continuous(limits = c(-.5,.5)),
delta_H9 = scale_y_continuous(limits = c(-.5,.5))
)
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz")
#hg19_pos was manually annotated for each gene
escapee_pos_df <- mean_methylation_df_escape %>%
transmute(hg38_gene_name,
delta_tran_1=hg19_pos,
delta_tran_2=hg19_pos,
delta_tran_3=hg19_pos,
delta_tran_4=hg19_pos,
delta_tran_5=hg19_pos) %>%
reshape2::melt(id.vars=c("hg38_gene_name"),
value.name="TSS",
variable.name="type")
tran_specific_df <- all_dmp_df %>% filter(passes_threshold==T & hg19_chr =="chrX") %>% select(delta, hg19_pos, key_transition) %>% mutate(type=sprintf("delta_tran_%d", key_transition))
dmp_table_clusterA_H9 <- read_csv("dmp_table_clusterA_H9.csv.gz")
tran_specific_df <- rbind(tran_specific_df, dmp_table_clusterA_H9 %>% filter(passes_threshold==T & hg19_chr == "chrX") %>% mutate(delta=H9_mean-cluster_0_mean, type="delta_H9", key_transition=6) %>% dplyr::select(delta, hg19_pos, key_transition, type))
browser_view <- ggplot(plotting_bedgraph_df %>% filter(chr == "chrX"), mapping=aes(x=start_pos, y = value_num)) +
geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="darkgrey", size=.3, alpha=.8) +
geom_area(mapping = aes(x=start_pos, y = value_num, fill=type), stat = "identity") +
scale_fill_manual(
limits=names(bedgraphs),
values = c(pal_jama()(1), "#8C564B", "#BABABA", "#BABABA", "#BABABA", "#BABABA", "#BABABA", "#BABABA"),
guide=F) +
facet_grid_sc(rows=vars(factor(type, levels = names(bedgraphs))), scales = list(y=scales_y),labeller = labeller(.rows=function(n){
return(list("H3K27me3"="H3K27me3",
"H3K9me3"="H3K9me3",
"delta_tran_1"="Tran. 1",
"delta_tran_2"="Tran. 2",
"delta_tran_3"="Tran. 3",
"delta_tran_4"="Tran. 4",
"delta_tran_5"="Tran. 5",
"delta_H9"="H9")[n])
})) +
geom_segment(data=tran_specific_df, mapping=aes(x=hg19_pos,xend=hg19_pos,y=0, yend=delta, color=type), alpha=.1) +
labs(y="",
x="X Chr Coordinate (Mb)") +
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) +
theme_bw() +
scale_color_manual(limits=c("delta_tran_1", "delta_tran_2", "delta_tran_3", "delta_tran_4", "delta_tran_5", "delta_H9"),
values=c(pal_d3()(5), pal_locuszoom()(4)[4]), guide=F) +
base_plot_theme +
# scale_color_material("cyan") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3),
title = element_blank(),
axis.title.y = element_blank(),
axis.text.y=element_text(size=5))
ggsave("figs/main/fig_4_cumulative_changes_grey_with_H9.pdf", browser_view, width = 171, height = 95, units = "mm")
ggsave("figs/main/fig_4_cumulative_changes_grey.pdf", browser_view, width = 171, height = 85, units = "mm")
```
## Plotting the browser view for H9, iPSC, and ESCs
```{r}
# all_dmp_df <- read_csv("all_dmp_df.csv.gz")
plotting_bedgraph_df <- tibble(chr=character(),
start_pos=numeric(),
end_pos=numeric(),
value_num=numeric(),
type=character())
bedgraphs <- list(H3K27me3="bedgraphs/GSM1528885_H9_XISTpos_H3K27me3.signal.bed.gz",
H3K9me3="bedgraphs/GSM1528888_H9_XISTpos_H3K9me3.signal.bed.gz",
delta_ipsc="bedgraphs/windowed_eroding_delta_tran_iPSC.bedgraph",
delta_esc="bedgraphs/windowed_eroding_delta_tran_ESC.bedgraph",
delta_H9="bedgraphs/windowed_eroding_delta_tran_H9.bedgraph")
for (bedname in names(bedgraphs)) {
bed_file <- read_tsv(as.character(bedgraphs[bedname]),
skip = 1,
col_names = c("chr", "start_pos", "end_pos", "value_num"))
bed_file$type <- bedname
bed_file[bed_file$start_pos >= 58100000 & bed_file$end_pos <= 63000000, ]$value_num <- 0
plotting_bedgraph_df <- bind_rows(plotting_bedgraph_df, bed_file)
}
library(scales)
library(facetscales)
scales_y <- list(
H3K27me3 = scale_y_continuous(),
H3K9me3 = scale_y_continuous(),
delta_ipsc = scale_y_continuous(limits = c(-.5,.5)),
delta_esc = scale_y_continuous(limits = c(-.5,.5)),
delta_H9 = scale_y_continuous(limits = c(-.5,.5))
)
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz")
#hg19_pos was manually annotated for each gene
escapee_pos_df <- mean_methylation_df_escape %>%
transmute(hg38_gene_name,
delta_ipsc=hg19_pos,
delta_esc=hg19_pos,
delta_H9=hg19_pos) %>%
reshape2::melt(id.vars=c("hg38_gene_name"),
value.name="TSS",
variable.name="type")
dmp_table_clusterA_non_A_iPSC <- read_csv("dmp_table_clusterA_non_A_iPSC.csv.gz")
tran_specific_df <- dmp_table_clusterA_non_A_iPSC %>% filter(passes_threshold==T & hg19_chr == "chrX") %>% mutate(delta=non_A_iPSC_mean-cluster_0_mean, type="delta_ipsc") %>% dplyr::select(delta, hg19_pos, type)
dmp_table_clusterA_non_A_ESC <- read_csv("dmp_table_clusterA_non_A_ESC.csv.gz")
tran_specific_df <- rbind(tran_specific_df, dmp_table_clusterA_non_A_ESC %>% filter(passes_threshold==T & hg19_chr == "chrX") %>% mutate(delta=non_A_ESC_mean-cluster_0_mean, type="delta_esc") %>% dplyr::select(delta, hg19_pos, type))
dmp_table_clusterA_H9 <- read_csv("dmp_table_clusterA_H9.csv.gz")
tran_specific_df <- rbind(tran_specific_df, dmp_table_clusterA_H9 %>% filter(passes_threshold==T & hg19_chr == "chrX") %>% mutate(delta=H9_mean-cluster_0_mean, type="delta_H9") %>% dplyr::select(delta, hg19_pos, type))
browser_view_h9 <- ggplot(plotting_bedgraph_df %>% filter(chr == "chrX"), mapping=aes(x=start_pos, y = value_num)) +
geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="darkgrey", size=.3, alpha=.8) +
geom_area(mapping = aes(x=start_pos, y = value_num, fill=type), stat = "identity") +
scale_fill_manual(
limits=names(bedgraphs),
values = c(pal_jama()(1), "#8C564B", "#BABABA", "#BABABA", "#BABABA"),
guide=F) +
facet_grid_sc(rows=vars(factor(type, levels = names(bedgraphs))), scales = list(y=scales_y),labeller = labeller(.rows=function(n){
return(list("H3K27me3"="H3K27me3",
"H3K9me3"="H3K9me3",
"delta_ipsc"="iPSCs",
"delta_esc"="ESCs",
"delta_H9"="H9")[n])
})) +
geom_segment(data=tran_specific_df, mapping=aes(x=hg19_pos,xend=hg19_pos,y=0, yend=delta, color=type), alpha=.1) +
labs(y="",
x="X Chr Coordinate (Mb)",
title="Changes in DNAme by Cell Type") +
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) +
theme_bw() +
scale_color_manual(limits=c("delta_ipsc", "delta_esc", "delta_H9"),
values=c(pal_locuszoom()(3)), guide=F) +
base_plot_theme +
# scale_color_material("cyan") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3),
title = element_blank(),
axis.title.y = element_blank(),
axis.text.y=element_text(size=5),
panel.spacing.y = unit(.75, "mm"))
ggsave("figs/supplementary/supp_2_cumulative_changes_ipsc_esc_h9.pdf", browser_view_h9, width = 174, height = 55.4, units = "mm")
```
## Browser view for noneroding probes
```{r}
non_eroding_plotting_bedgraph_df <- tibble(chr=character(),
start_pos=numeric(),
end_pos=numeric(),
value_num=numeric(),
type=character())
non_eroding_bedgraphs <- list(delta_tran_1="bedgraphs/windowed_noneroding_delta_tran_1.bedgraph",
delta_tran_2="bedgraphs/windowed_noneroding_delta_tran_2.bedgraph",
delta_tran_3="bedgraphs/windowed_noneroding_delta_tran_3.bedgraph",
delta_tran_4="bedgraphs/windowed_noneroding_delta_tran_4.bedgraph",
delta_tran_5="bedgraphs/windowed_noneroding_delta_tran_5.bedgraph")
for (bedname in names(non_eroding_bedgraphs)) {
bed_file <- read_tsv(as.character(non_eroding_bedgraphs[bedname]), skip = 1, col_names = c("chr", "start_pos", "end_pos", "value_num"))
bed_file$type = bedname
bed_file[bed_file$start_pos >= 58100000 & bed_file$end_pos <= 63000000, ]$value_num <- 0
non_eroding_plotting_bedgraph_df <- bind_rows(non_eroding_plotting_bedgraph_df, bed_file)
}
library(scales)
library(facetscales)
scales_y <- list(
delta_tran_1 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)),
delta_tran_2 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)),
delta_tran_3 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)),
delta_tran_4 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)),
delta_tran_5 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5))
)
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz")
#hg19_pos was manually annotated for each gene
escapee_pos_df <- mean_methylation_df_escape %>%
transmute(hg38_gene_name,
delta_tran_1=hg19_pos,
delta_tran_2=hg19_pos,
delta_tran_3=hg19_pos,
delta_tran_4=hg19_pos,
delta_tran_5=hg19_pos) %>%
reshape2::melt(id.vars=c("hg38_gene_name"),
value.name="TSS",
variable.name="type")
non_eroding_plotting_bedgraph_df$type <- factor(non_eroding_plotting_bedgraph_df$type, levels = names(non_eroding_bedgraphs))
browser_view_non_eroding <- ggplot(non_eroding_plotting_bedgraph_df %>% filter(chr == "chrX"), mapping=aes(x=start_pos, y = value_num)) +
# geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="grey", alpha = .6) +
geom_area(mapping = aes(x=start_pos, y = value_num), stat = "identity") +
facet_grid_sc(rows=vars(type), scales = list(y=scales_y), labeller = labeller(type=function(n){
return(list("delta_tran_1"="Tran. 1",
"delta_tran_2"="Tran. 2",
"delta_tran_3"="Tran. 3",
"delta_tran_4"="Tran. 4",
"delta_tran_5"="Tran. 5")[n])
})) +
labs(y="",
x="X Chr Coordinate (Mb)",
title = "Cumulative Changes for Non-eroding Probes") +
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) +
theme_bw() +
base_plot_theme +
scale_color_viridis_d() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.spacing.y = unit(.75, "mm"),
panel.spacing.x = unit(0, "mm"),
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3))
ggsave("figs/supplementary/supp_2_cumulative_changes_non_eroding.pdf", browser_view_non_eroding, width = 86, height = 50, units = "mm")
```
## Get the composition of each of the CpG clusters
```{r}
all_dmp_df <- rownames_to_column(all_dmp_df)
x_annotation_df <- rownames_to_column(as.data.frame(x_annotation_df))
tran_1_inc_annotation_tib <- x_annotation_df %>%
dplyr::filter(Name %in% (dplyr::filter(all_dmp_df, (key_transition == 1) &
(delta_increasing == TRUE) &
(passes_threshold == TRUE))$rowname))
tran_1_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 1) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname)
tran_2_inc_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 2) & (delta_increasing == TRUE) & (passes_threshold == TRUE))$rowname)
tran_2_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 2) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname)
5
tran_3_inc_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 3) & (delta_increasing == TRUE) & (passes_threshold == TRUE))$rowname)
tran_3_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 3) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname)
tran_4_inc_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 4) & (delta_increasing == TRUE) & (passes_threshold == TRUE))$rowname)
tran_4_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 4) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname)
tran_5_inc_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 5) & (delta_increasing == TRUE) & (passes_threshold == TRUE))$rowname)
tran_5_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 5) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname)
refgene_groups <- unlist(str_split(x_annotation_df$UCSC_RefGene_Group, ";"))
refgene_groups_sig <- unlist(str_split(sig_annotation$UCSC_RefGene_Group, ";"))
tran_1_inc_groups_sig <- unlist(str_split(tran_1_inc_annotation_tib$UCSC_RefGene_Group, ";"))
tran_1_dec_groups_sig <- unlist(str_split(tran_1_dec_annotation_tib$UCSC_RefGene_Group, ";"))
tran_2_inc_groups_sig <- unlist(str_split(tran_2_inc_annotation_tib$UCSC_RefGene_Group, ";"))
tran_2_dec_groups_sig <- unlist(str_split(tran_2_dec_annotation_tib$UCSC_RefGene_Group, ";"))
tran_3_inc_groups_sig <- unlist(str_split(tran_3_inc_annotation_tib$UCSC_RefGene_Group, ";"))
tran_3_dec_groups_sig <- unlist(str_split(tran_3_dec_annotation_tib$UCSC_RefGene_Group, ";"))
tran_4_inc_groups_sig <- unlist(str_split(tran_4_inc_annotation_tib$UCSC_RefGene_Group, ";"))
tran_4_dec_groups_sig <- unlist(str_split(tran_4_dec_annotation_tib$UCSC_RefGene_Group, ";"))
tran_5_inc_groups_sig <- unlist(str_split(tran_5_inc_annotation_tib$UCSC_RefGene_Group, ";"))
tran_5_dec_groups_sig <- unlist(str_split(tran_5_dec_annotation_tib$UCSC_RefGene_Group, ";"))
plot_tib <- tibble(category=character(),type=character(), percentage=numeric())
add_to_plot_tib <- function(plot_tib, cat, group) {
group[is.na(group)] <- "NA"
percents <- sapply(unique(group), function(x) sum(x == group, na.rm = T)/length(group))
for (name in names(percents)) {
plot_tib <- add_row(plot_tib, category=cat, type=name, percentage=percents[name])
}
return(plot_tib)
}
plot_tib <- add_to_plot_tib(plot_tib, "before", refgene_groups)
plot_tib <- add_to_plot_tib(plot_tib, "after", refgene_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "1_inc", tran_1_inc_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "1_dec", tran_1_dec_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "2_inc", tran_2_inc_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "2_dec", tran_2_dec_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "3_inc", tran_3_inc_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "3_dec", tran_3_dec_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "4_inc", tran_4_inc_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "4_dec", tran_4_dec_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "5_inc", tran_5_inc_groups_sig)
plot_tib <- add_to_plot_tib(plot_tib, "5_dec", tran_5_dec_groups_sig)
library(scales)
ggplot(plot_tib,aes(x=factor(category, levels=c("before",
"after",
"1_inc",
"1_dec",
"2_inc",
"2_dec",
"3_inc",
"3_dec",
"4_inc",
"4_dec",
"5_inc",
"5_dec")),
y=percentage,
label=paste(round(percentage*100), "%"),
fill=factor(type, levels=c("1stExon","TSS200", "TSS1500", "3'UTR", "5'UTR", "Body", "NA" ))))+
geom_bar(stat="identity") +
geom_text(position = position_stack(vjust = 0.5), size=3) +
scale_y_continuous(labels = percent)+
scale_x_discrete(limits=c("before", "after", "1_inc", "1_dec", "2_inc", "2_dec", "3_inc", "3_dec", "4_inc", "4_dec", "5_inc", "5_dec")) +
labs(x="", y="Percent Composition", fill="CpG Location Relative to Gene", title="Composition of CpG Sites Before/After Variation Selection") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_brewer(palette = 'Pastel2', na.value="grey")
```
## Volcano plots for each transition
```{r}
library(ggrepel)
make_volcano_plot <- function(all_dmp_df, diff_cutoff_high = NA, diff_cutoff_low = NA, log10_qval_cutoff = 3, cluster_1_col, cluster_2_col) {
# dmp_df$mapped_gene <- apply(dmp_df, 1, function (x) as.character(findClosestGene(x['chr'], as.numeric(x['pos']), genome='hg19')$geneName[1]))
dmp_df_temp <- all_dmp_df
dmp_df_temp$delta <- apply(dmp_df_temp, 1, function (x) as.numeric(x[cluster_2_col]) - as.numeric(x[cluster_1_col]))
dmp_df_temp$log10qval <- apply(dmp_df_temp, 1, function (x) -log10(as.numeric(x['qval'])))
lowerq = quantile(dmp_df_temp$delta)[2]
upperq = quantile(dmp_df_temp$delta)[4]
iqr = upperq - lowerq
if (is.na(diff_cutoff_high)) {
max_delta <- max(dmp_df_temp[dmp_df_temp$log10qval > log10_qval_cutoff,]$delta)
pos_range <- max_delta - 0
mild.threshold.upper = (7.5 * pos_range / 10)
}else {
mild.threshold.upper <- diff_cutoff_high
}
if (is.na(diff_cutoff_low)) {
min_delta <- min(dmp_df_temp[dmp_df_temp$log10qval > log10_qval_cutoff,]$delta)
neg_range <- 0 - min_delta
mild.threshold.lower = (-8.5 * neg_range / 10)
}else {
mild.threshold.lower <- diff_cutoff_low
}
# dmp_df_temp$mapped_gene <- ""
label_rows <- ((dmp_df_temp$delta > mild.threshold.upper) | (dmp_df_temp$delta < mild.threshold.lower)) &
(dmp_df_temp$log10qval > log10_qval_cutoff)
# dmp_df_temp[label_rows, 'mapped_gene'] <- apply(dmp_df_temp[label_rows,], 1, function (x) as.character(findClosestGene(x['chr'], as.numeric(x['pos']), genome='hg19')$geneName[1]))
dmp_df_temp[is.na(dmp_df_temp$mapped_gene),]$mapped_gene <- dmp_df_temp[is.na(dmp_df_temp$mapped_gene),]$pos
#fix the ordering of the chromosomes for the color labels
dmp_df_temp$chr <- gsub("chr", "", dmp_df_temp$chr)
dmp_df_temp$chr <- factor(dmp_df_temp$chr, levels = c("1","2","3","4","5",
"6","7","8","9","10",
"11","12","13","14","15",
"16","17","18","19","20",
"21","22","X","Y"))
if (cluster_1_col != 'cluster_0_mean') {
show_legend = F
return(ggplot(dmp_df_temp, aes(delta, log10qval)) +
geom_point(aes(color = chr), alpha = .4, show.legend = show_legend) +
labs(x=sprintf('%s - %s', cluster_2_col, cluster_1_col),
y='-log10(q-value)',
title=sprintf('Transition from cluster %s to cluster %s', unlist(strsplit(cluster_1_col, '_'))[2], unlist(strsplit(cluster_2_col, '_'))[2])) +
theme(plot.title = element_text(hjust = 0.5)))
}else {
show_legend = T
return(ggplot(dmp_df_temp, aes(delta, log10qval)) +
geom_point(aes(color = chr), alpha = .4, show.legend = show_legend, size=.5) +
geom_text_repel(data = dmp_df_temp[((dmp_df_temp$delta > mild.threshold.upper) | (dmp_df_temp$delta < mild.threshold.lower)) &
(dmp_df_temp$log10qval > log10_qval_cutoff),],
aes(label=mapped_gene, color = chr),show.legend = FALSE, size=2) +
labs(x=sprintf('%s - %s', cluster_2_col, cluster_1_col),
y='-log10(q-value)',
title=sprintf('Transition from cluster %s to cluster %s', unlist(strsplit(cluster_1_col, '_'))[2], unlist(strsplit(cluster_2_col, '_'))[2])) +
theme(plot.title = element_text(hjust = 0.5)))
}
# make volcano plot
# ggsave(sprintf('volcano_plots_all_sites/transition_%s_to_%s.png', unlist(strsplit(cluster_1_col, '_'))[2], unlist(strsplit(cluster_2_col, '_'))[2]), dpi=300)
}
p1 <- make_volcano_plot(all_dmp_df, cluster_1_col = sprintf('cluster_%d_mean', 0),
cluster_2_col = sprintf('cluster_%d_mean', 1),
diff_cutoff_high = .21, diff_cutoff_low = -.8, log10_qval_cutoff = 25)
p1 +
labs(x="(cluster 1 mean) - (cluster 0 mean)",
title="CpG Sites In The 1st Transition",
color="Chromosome") +
theme(plot.title = element_text(hjust = 0.5, size=9),
legend.position="bottom",
legend.justification = "right",
legend.title = element_text(size = 6),
legend.text = element_text(size = 6),
legend.key.height = unit(2, "mm"),
legend.key.width = unit(2, "mm"),
legend.margin = margin(t=0,r=0,b=0,l=0, unit="mm"),
axis.title = element_text(size=8),
axis.text = element_text(size = 6)) +
guides(color=guide_legend(nrow=2,byrow=TRUE))
ggsave("figs/volcano_plot_tran_1.pdf", height = 90, width = 85, units = 'mm')
ggsave("figs/volcano_plot_tran_1.png", height = 90, width = 85, units = 'mm', dpi=1000)
for (i in 0:4) {
print(i)
plot <- make_volcano_plot(all_dmp_df, cluster_1_col = sprintf('cluster_%d_mean', i),
cluster_2_col = sprintf('cluster_%d_mean', i+1),
diff_cutoff_high = .21, diff_cutoff_low = -.8, log10_qval_cutoff = 25)
if (i==0){
p1 <- plot
}else if (i==1){
p2 <- plot
}else if (i==2){
p3 <- plot
}else if (i==3){
p4 <- plot
}else if (i==4){
p5 <- plot
}
}
grobbed <- arrangeGrob(p1, p2, p3,p4,p5, layout_matrix = cbind(c(1,1,2,4), c(1,1,3,5)))
ggsave('volcano_plots.png', grobbed, width = 9.2, height = 11.8, units = "in", dpi = 300)
```
## Drawing the CpG islands on XIST, FIRRE, and DXZ4
```{r}
xist_probe_pos <- c(10927,
30555,
30626,
30708,
32133,
32155,
32765,
32777,
33225)
ggplot() +
geom_rect(aes(xmin=0, ymin=1, ymax=2, xmax=32103)) +
geom_rect(aes(xmin=32103, ymin=1, ymax=2, xmax=32103+200, fill="TSS200")) +
geom_rect(aes(xmin=32103+200, ymin=1, ymax=2, xmax=32103+1500, fill="TSS1500")) +
geom_rect(aes(xmin=xist_probe_pos, ymin=1, ymax=2, xmax=xist_probe_pos+100, fill="CpG Probe")) +
scale_y_continuous(limits=c(0,4))
```
## Compiling sites into genes by only looking at promoter regions
```{r}
library(tidyverse)
gene_compiled_tss_sites_df <- tibble(gene=character(),
total_sites=integer(),
transition_1_inc_sites=integer(),
transition_1_dec_sites=integer(),
transition_2_inc_sites=integer(),
transition_2_dec_sites=integer(),
transition_3_inc_sites=integer(),
transition_3_dec_sites=integer(),
transition_4_inc_sites=integer(),
transition_4_dec_sites=integer(),
transition_5_inc_sites=integer(),
transition_5_dec_sites=integer(),
)
for (row_idx in 1:nrow(all_dmp_df)) {
gene <- unlist(all_dmp_df[row_idx,"mapped_gene"])
dist_to_tss <- unlist(all_dmp_df[row_idx,"dist_to_tss"])
delta_increasing <- unlist(all_dmp_df[row_idx,"delta_increasing"])
key_transition <- unlist(all_dmp_df[row_idx,"key_transition"])
alt_key_transition <- unlist(all_dmp_df[row_idx,"alt_key_transition"])
alt_delta <- unlist(all_dmp_df[row_idx,"alt_delta"])
refgene_group <- unlist(all_dmp_df[row_idx, "refgene_group"])
passes_threshold <- unlist(all_dmp_df[row_idx, "passes_threshold"])
if (grepl("TSS1500|TSS200|5'UTR", refgene_group) & (! is.na(gene)) & passes_threshold) {
# if (grepl("TSS1500|TSS200", refgene_group) & (! is.na(gene)) & (! is.na(alt_key_transition))) {
gene_row_idx <- 0
if (!(gene %in% gene_compiled_tss_sites_df$gene)) {
# gene is not in gene_compiled_tss_sites_df, so add a row for it
gene_row_idx <- nrow(gene_compiled_tss_sites_df)+1
gene_compiled_tss_sites_df[gene_row_idx, c("gene")] = gene
gene_compiled_tss_sites_df[is.na(gene_compiled_tss_sites_df)] <- 0
}else {
i <- 1
for (gene_name in gene_compiled_tss_sites_df$gene) {
if ((! is.na(gene_name)) && gene == gene_name) {
gene_row_idx <- i
break
}
i <- i+1
}
}
gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] <- gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] + 1
change <- "dec"
if (delta_increasing) {
# if (alt_delta > 0) {
change <- "inc"
}
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] <-
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] + 1
# gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", alt_key_transition, change)] <-
# gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", alt_key_transition, change)] + 1
}
}
gene_compiled_df <- gene_compiled_tss_sites_df
gene_compiled_df <- add_column(gene_compiled_df, cluster=c(0)*nrow(gene_compiled_df))
# gene_clusters <- tibble(gene=character(), cluster=integer())
for (row_idx in 1:nrow(gene_compiled_df)) {
gene <- unlist(gene_compiled_df[row_idx, "gene"])
total <- unlist(gene_compiled_df[row_idx, "total_sites"])
maximum_transition <- 0
maximum_change <- ""
maximum_count <- 0
for(key_transition in 1:5) {
for (change in c("inc", "dec")) {
transition <- sprintf("transition_%d_%s_sites", key_transition, change)
if (gene_compiled_df[row_idx, transition] > maximum_count) {
maximum_transition <- key_transition
maximum_change <- change
maximum_count <- as.integer(unlist(gene_compiled_df[row_idx, transition]))
}
}
}
cluster <- 10
if (maximum_count / total >= .5) {
if (maximum_change == "dec") {
cluster <- maximum_transition - 1 + 5
}else {
cluster <- maximum_transition - 1
}
}
i <- 1
for (gene_name in gene_compiled_df$gene) {
if ((! is.na(gene_name)) && gene == gene_name) {
gene_row_idx <- i
break
}
i <- i+1
}
gene_compiled_df[i, "cluster"] <- cluster
}
write_csv(gene_compiled_df, "gene_compiled_df.csv")
```
## Defining Escapee Genes
```{r}
library(tidyverse)
escape_expresison_status <- readxl::read_excel("GenesExpressionStatus.xlsx")
known_escapee_genes <- (escape_expresison_status %>% filter(`XCI stat` == "Escape" & `Gene name` %in% all_dmp_df$mapped_gene))$`Gene name`
roc <- tibble(threshold=numeric(),
TPR=numeric(),
FPR=numeric())
for (thresh in seq(.1, 1, .01)) {
calculated_escapees <- all_dmp_df %>% filter((cluster_0_mean <= thresh) & grepl("TSS1500|TSS200", refgene_group) & (chr == "chrX"))
calculated_escapee_genes <- unique(calculated_escapees$mapped_gene)
false_positive <- (length(calculated_escapee_genes) - length(intersect(calculated_escapee_genes, known_escapee_genes))) / length(calculated_escapee_genes)
true_positive_rate <- length(intersect(calculated_escapee_genes, known_escapee_genes)) / length(known_escapee_genes)
roc <- add_row(roc, threshold=thresh, TPR=true_positive_rate, FPR=false_positive)
}
library(ggrepel)
ggplot(roc %>% filter(threshold < .8 & threshold > .1), aes(x=FPR, y=TPR)) +
geom_point() +
geom_line(size = 2, alpha = 0.5)+
labs(title= "ROC curve",
x = "Percent False Positives",
y = "True Positive Rate (Sensitivity)") +
geom_text_repel(data=roc %>% filter(threshold < .8 & threshold >= .3 & TPR >.7 & (FPR) < .70),aes(label=threshold))
ggsave("figs/escapee_roc.png")
ggplot(roc, aes(x=FPR, y=TPR)) +
geom_point() +
geom_line(size = 2, alpha = 0.5)+
labs(title= "ROC curve",
x = "Percent False Positives",
y = "True Positive Rate (Sensitivity)") +
geom_text_repel(data=roc %>% filter(TPR >.1 & FPR < .75),aes(label=threshold))
ggsave("figs/escapee_roc_wide.png")
```
## Sanity check for escapee behavior
```{r}
escape_expresison_status <- readxl::read_excel("GenesExpressionStatus.xlsx")
known_escapee_genes <- (escape_expresison_status %>% filter(`XCI stat` == "Escape" & `Gene name` %in% all_dmp_df$mapped_gene))$`Gene name`
known_variable_genes <- (escape_expresison_status %>% filter(`XCI stat` == "Variable" & `Gene name` %in% all_dmp_df$mapped_gene))$`Gene name`
known_inactive_genes <- (escape_expresison_status %>% filter(`XCI stat` == "Inactive" & `Gene name` %in% all_dmp_df$mapped_gene))$`Gene name`
escape_dmp_df <- all_dmp_df %>% filter(mapped_gene %in% known_escapee_genes)
escape_promoter_methylation <- (escape_dmp_df %>% filter(grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean
escape_body_methylation <- (escape_dmp_df %>% filter(!grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean
variable_dmp_df <- all_dmp_df %>% filter(mapped_gene %in% known_variable_genes)
variable_promoter_methylation <- (variable_dmp_df %>% filter(grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean
variable_body_methylation <- (variable_dmp_df %>% filter(!grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean
inactive_dmp_df <- all_dmp_df %>% filter(mapped_gene %in% known_inactive_genes)
inactive_promoter_methylation <- (inactive_dmp_df %>% filter(grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean
inactive_body_methylation <- (inactive_dmp_df %>% filter(!grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean
ggplot() +
geom_violin(aes(x="Escape Promoter", y=escape_promoter_methylation), fill='lightgreen', trim=FALSE, show.legend = F) +
geom_violin(aes(x="Escape Body", y=escape_body_methylation), fill='darkgreen', trim=FALSE, show.legend = F) +
geom_boxplot(aes(x="Escape Promoter", y=escape_promoter_methylation), width=0.025) +
geom_boxplot(aes(x="Escape Body", y=escape_body_methylation), width=0.025) +
geom_violin(aes(x="Variable Promoter", y=variable_promoter_methylation), fill='lightblue', trim=FALSE, show.legend = F) +
geom_violin(aes(x="Variable Body", y=variable_body_methylation), fill='darkblue', trim=FALSE, show.legend = F) +
geom_boxplot(aes(x="Variable Promoter", y=variable_promoter_methylation), width=0.025) +
geom_boxplot(aes(x="Variable Body", y=variable_body_methylation), width=0.025) +
geom_violin(aes(x="Inactive Promoter", y=inactive_promoter_methylation), fill='pink', trim=FALSE, show.legend = F) +
geom_violin(aes(x="Inactive Body", y=inactive_body_methylation), fill='darkred', trim=FALSE, show.legend = F) +
geom_boxplot(aes(x="Inactive Promoter", y=inactive_promoter_methylation), width=0.025) +
geom_boxplot(aes(x="Inactive Body", y=inactive_body_methylation), width=0.025) +
scale_x_discrete(limits = c("Escape Promoter", "Escape Body", "Variable Promoter", "Variable Body", "Inactive Promoter", "Inactive Body")) +
labs(x="",
y='Methylation Beta',
title="Distribution of Beta Values of Escapee Gene Promoter/Body") +
theme(plot.title = element_text(hjust = 0.5))
ggsave("figs/beta_value_breakdown_by_gene_type.png")
```
## Doing a PCA of probes to identify escapees
```{r}
library(tidyverse)
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz")
male_df_no_outliers <- read_csv("male_df_no_outliers.csv.gz")
x_annotation_df <- read_csv("x_annotation.csv.gz")
male_df_no_outliers_chrX <- male_df_no_outliers %>% filter(rn %in% x_annotation_df$rowname)
sample_cluster_df <- read_csv("saved_sample_clusters.csv",
col_names=c('name','cluster'))
cluster_0_samples <- (sample_cluster_df %>% filter(cluster == 0))$name
female_df_no_h9_chrX_cluster_0 <- female_df_no_h9_chrX %>% dplyr::select(rn, !!cluster_0_samples)
combined_probe_df <- inner_join(female_df_no_h9_chrX_cluster_0, male_df_no_outliers_chrX, by="rn")
library(ggfortify)
library(broom)
mean_methylation_df <- tidy(rowMeans(column_to_rownames(na.omit(female_df_no_h9_chrX_cluster_0), var="rn"))) %>% dplyr::rename(female_mean=x)
mean_methylation_df <- inner_join(mean_methylation_df,
tidy(rowMeans(column_to_rownames(na.omit(male_df_no_outliers_chrX), var="rn"))) %>% dplyr::rename(male_mean=x),
by="names") %>% column_to_rownames(var="names")
autoplot(prcomp(column_to_rownames(combined_probe_df, var="rn")), data=mean_methylation_df, colour="female_mean", size="male_mean")
mean_methylation_df <- rownames_to_column(mean_methylation_df, var="rn")
mean_methylation_df <- left_join(mean_methylation_df,
x_annotation_df %>% dplyr::select(rowname, hg38_chromosome,hg38_gene_group, hg38_gene_name, hg38_ensembl_gene_id, hg38_pos),
by=c("rn"="rowname"))
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz")
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c"))
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id) %>%
mutate(chrom=paste("chr", chromosome_name, sep=""))
tss_df <- hg38_biomart_structure_df_canonical %>% group_by(ensembl_gene_id) %>%
summarise(TSS=dplyr::first(transcription_start_site), strand=dplyr::first(strand))
mean_methylation_df <- left_join(mean_methylation_df, tss_df, by=c("hg38_ensembl_gene_id"="ensembl_gene_id"))
subject_training_set_cotton <- read.csv("xci_subject_genes.csv.gz", header=F)$V1
escapee_training_set_cotton <- c("ACE2",
"CA5B",
"CXorf38",
"DDX3X",
"EIF1AX",
"FAM9C",
"FUNDC1",
"GPR112",
"HDHD1",
"HS6ST2",
"INE2",
"JPX",
"PNMA3",
"PNPLA4",
"RPS4X",
"SHROOM2",
"STS",
"SYAP1",
"TAF7L",
"UBA1",
"VENTXP1",
"VGLL1",
"ZFX",
"ZRSR2")
mean_methylation_df$training_designation = NA
mean_methylation_df[mean_methylation_df$hg38_gene_name %in% subject_training_set_cotton,]$training_designation <- "subject"
mean_methylation_df[mean_methylation_df$hg38_gene_name %in% escapee_training_set_cotton,]$training_designation <- "escape"
ggplot(mean_methylation_df %>% filter((male_mean < .25) & !is.na(training_designation) & strand*(hg38_pos - TSS) >= -1500 & strand*(hg38_pos - TSS) <= 3500), aes(x=strand*(hg38_pos - TSS), y=abs(female_mean - male_mean), color=training_designation)) +
geom_smooth(alpha=.4, span=.1)
plotting_df <- mean_methylation_df %>%
filter((male_mean < .25) & strand*(hg38_pos - TSS) >= -500 & strand*(hg38_pos - TSS) <= 1500) %>%
group_by(hg38_gene_name) %>%
summarise(female_mean=mean(female_mean),
male_mean=mean(male_mean),
training_designation=dplyr::first(training_designation))
plotting_df$sex_delta <- abs(plotting_df$male_mean - plotting_df$female_mean)
training_labeller <- labeller(sex=function(s){
return(list(
female_mean="Female",
male_mean="Male",
sex_delta="Sex Delta"
))
})
plotting_df_2 <- reshape2::melt(plotting_df, vars.name=c("hg38_gene_name", "training_designation"), value.name="DNAm", variable.name=c("sex"))
cotton_training_methylation_plot <- ggplot(plotting_df_2 %>% filter(!is.na(training_designation)), aes(y=DNAm, x=training_designation)) +
facet_grid(cols = vars(sex), labeller = training_labeller) +
geom_jitter(aes(color=sex), shape=20, size=.5) +
geom_boxplot(outlier.shape = NA, fill=NA) +
scale_color_npg(guide=F) +
labs(y="DNAme Averaged by Gene",
x="Training Designation",
title="DNAme of Cotton Training Genes") +
theme_bw() +
base_plot_theme +
theme(strip.text = element_text(size=5, margin = margin(.7,0,.7,0, "mm")),
strip.background = element_rect(size = .3),
axis.text.x=element_text(angle=30, hjust=1, margin=margin(t=.3, unit="mm")))
ggsave("figs/supplementary/sup2_cotton_escape_designation.pdf", cotton_training_methylation_plot, height = 33, width = 43.5, units = "mm")
# ggplot(plotting_df , aes(y=abs(female_mean-male_mean), x=training_designation)) +
# geom_boxplot() +
# geom_jitter()
true_escape_training_set <- (plotting_df %>% filter(female_mean < .25 & training_designation == "escape"))$hg38_gene_name
plotting_df_3 <- plotting_df_2 %>%
mutate(training_designation=replace(training_designation, training_designation=="escape", NA)) %>%
mutate(training_designation=replace(training_designation, hg38_gene_name %in% true_escape_training_set, "escape"))
pb_training_methylation_plot <- ggplot(plotting_df_3 %>% filter(!is.na(training_designation)), aes(y=DNAm, x=training_designation)) +
facet_grid(cols = vars(sex), labeller = training_labeller) +
geom_jitter(aes(color=sex), shape=20, size=.5) +
geom_boxplot(outlier.shape = NA, fill=NA) +
scale_color_npg(guide=F) +
labs(y="DNAme Averaged by Gene",
x="Training Designation",
title="DNAme of Our Training Genes") +
theme_bw() +
base_plot_theme +
theme(strip.text = element_text(size=5, margin = margin(.7,0,.7,0, "mm")),
strip.background = element_rect(size = .3),
axis.text.x=element_text(angle=30, hjust=1, margin=margin(t=.3, unit="mm")))
ggsave("figs/supplementary/sup2_pb_escape_designation.pdf", pb_training_methylation_plot, height = 33, width = 43.5, units = "mm")
training_methylation_df <- mean_methylation_df %>%
filter((male_mean < .25) & (strand*(hg38_pos - TSS) >= -500) & (strand*(hg38_pos - TSS) <= 1500) &
(hg38_gene_name %in% true_escape_training_set))
mean_sex_delta <- mean((training_methylation_df %>% transmute(sex_delta=abs(female_mean - male_mean)))$sex_delta)
sd_sex_delta <- sd((training_methylation_df %>% transmute(sex_delta=abs(female_mean - male_mean)))$sex_delta)
mean_female_mean <- mean(training_methylation_df$female_mean)
sd_female_mean <- sd(training_methylation_df$female_mean)
mean_methylation_df_escape <- mean_methylation_df %>%
filter((male_mean < .25) & (strand*(hg38_pos - TSS) >= -500 & strand*(hg38_pos - TSS) <= 1500)) %>%
group_by(hg38_gene_name) %>%
summarise(female_mean=mean(female_mean),
male_mean=mean(male_mean),
hg38_ensembl_gene_id=dplyr::first(hg38_ensembl_gene_id)) %>%
filter((abs(female_mean - male_mean) < (mean_sex_delta + (3*sd_sex_delta))) &
(female_mean <= mean_female_mean + (3*sd_female_mean)))
mean_methylation_df_escape <- left_join(mean_methylation_df_escape, tss_df, by=c("hg38_ensembl_gene_id"="ensembl_gene_id"))
true_escapees <- unique(mean_methylation_df_escape$hg38_gene_name)
length(true_escapees)
plotting_df_4 <- plotting_df_2 %>%
mutate(training_designation=replace(training_designation, training_designation=="escape", NA)) %>%
mutate(training_designation=replace(training_designation, hg38_gene_name %in% true_escapees, "escape")) %>%
mutate(training_designation=replace(training_designation, !(hg38_gene_name %in% true_escapees), "subject"))
true_escapee_methylation_plot <- ggplot(plotting_df_4 %>% filter(!is.na(training_designation)), aes(y=DNAm, x=training_designation)) +
facet_grid(cols = vars(sex), labeller = training_labeller) +
geom_jitter(aes(color=sex), shape=20, size=.5) +
geom_boxplot(outlier.shape = NA, fill=NA) +
scale_color_npg(guide=F) +
labs(y="DNAme Averaged by Gene",
x="True Designation",
title="DNAme of Designated Genes") +
theme_bw() +
base_plot_theme +
theme(strip.text = element_text(size=5, margin = margin(.7,0,.7,0, "mm")),
strip.background = element_rect(size = .3),
axis.text.x=element_text(angle=30, hjust=1, margin=margin(t=.3, unit="mm")))
ggsave("figs/supplementary/sup2_true_escape.pdf", true_escapee_methylation_plot, height = 33, width = 43.5, units = "mm")
mean_methylation_df <- mutate(mean_methylation_df, is_promoter=strand*(hg38_pos - TSS) >= -500 & strand*(hg38_pos - TSS) <= 1500)
mean_methylation_df <- mean_methylation_df %>% mutate(true_designation=replace(training_designation, hg38_gene_name %in% true_escapees, "Escape"))
mean_methylation_df <- mean_methylation_df %>% mutate(true_designation=replace(true_designation, !(hg38_gene_name %in% true_escapees), "Subject"))
probe_methylation_dist <- ggplot(mean_methylation_df %>% filter(!is.na(hg38_gene_name)), aes(y=female_mean, x=true_designation, fill= (is_promoter))) +
geom_violin(size=.3) +
labs(title="Escape vs Subject Probes",
x="True Designation",
y="DNAme",
fill="Probe Type") +
scale_fill_rickandmorty(labels=c("Body", "Promoter")) +
theme_bw() +
base_plot_theme
ggsave("figs/supplementary/sup2_escape_methylation_dist.pdf", probe_methylation_dist, width = 43.5, height=33, units = "mm")
ggplot(plotting_df , aes(y=female_mean, x=hg38_gene_name %in% true_escapees)) +
geom_boxplot() +
geom_jitter()
ggplot(plotting_df , aes(y=abs(female_mean-male_mean), x=hg38_gene_name %in% true_escapees)) +
geom_boxplot() +
geom_jitter()
escapee_heatmap <- make_subset_heatmap(true_escapees)
write_csv(mean_methylation_df_escape, "mean_methylation_df_escape.csv")
# mean_methylation_df_escape <- left_join(mean_methylation_df_escape,
# x_annotation_df %>% select(rowname, hg38_chromosome,hg38_gene_group, hg38_gene_name, hg38_ensembl_gene_id, hg38_pos),
# by=c("rn"="rowname"))
```
## Check where these clusters are happening/Defining PB Escapees
```{r}
library(tidyverse)
library(ggplot2)
all_dmp_df <- read_csv('all_dmp_df.csv.gz')
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz")
# do this based on genes and when the promoters change their methylation.
x_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz") %>% filter(hg38_chromosome == "chrX")
chr_x_dmp_df_short_arm <- all_dmp_df %>% filter((hg38_chr == "chrX") & (hg38_pos <= 58100000))
short_arm_escapees <- mean_methylation_df_escape %>% filter(TSS <= 58100000)
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) {
pos <- unlist(chr_x_dmp_df_short_arm[row_idx, "hg38_pos"])
min_distance <- min(abs(short_arm_escapees["TSS"] - pos))
# min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos))
chr_x_dmp_df_short_arm[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end)
}
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz")
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c"))
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id &
chromosome_name=="X" &
transcription_start_site <= 58100000) %>%
group_by(ensembl_gene_id) %>%
summarise(transcription_start_site=dplyr::first(transcription_start_site))
n <- 1000 # how many rounds of random choosing I want
mean_min_distance_random_df_short_arm <- tibble(run=numeric(),
key_transition=numeric(),
mean_distance=numeric())
for (i in 1:n) {
random_escapees <- sample(hg38_biomart_structure_df_canonical$ensembl_gene_id, nrow(short_arm_escapees))
random_escapees_df <- hg38_biomart_structure_df_canonical %>% filter(ensembl_gene_id %in% random_escapees)
test_df <- chr_x_dmp_df_short_arm %>% group_by(rowname) %>% summarise(
key_transition=first(key_transition),
passes_threshold=first(passes_threshold),
min_dist_to_random_escapee=min(abs(random_escapees_df$transcription_start_site - hg38_pos))
)
mean_min_distance_random_df_short_arm <- full_join(mean_min_distance_random_df_short_arm, test_df %>% group_by(key_transition) %>%
summarise(mean_distance=median(min_dist_to_random_escapee)) %>% mutate(run=i),
by=c("run", "key_transition", "mean_distance"))
# write_csv(test_df, "short_arm_pb_random_escapee_distances.csv", append=T, col_names=F)
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold"))
}
# permutation 1 p-values
p1_short_arm_pvals <- c()
for (t in 1:5) {
p <- nrow(mean_min_distance_random_df_short_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance <= median((chr_x_dmp_df_short_arm %>% filter(passes_threshold==T &
key_transition==t))$min_dist_to_pb_escapee))))
p2 <- nrow(mean_min_distance_random_df_short_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance >= median((chr_x_dmp_df_short_arm %>% filter(passes_threshold==T &
key_transition==t))$min_dist_to_pb_escapee))))
print(min(p,p2)/1000)
pval <- min(p,p2)/1000
if (pval == 0) {
pval <- "p < .001"
}else {
pval <- sprintf("p = %.3f", pval)
}
p1_short_arm_pvals <- c(p1_short_arm_pvals, pval)
}
# distances_to_random_escapees_short_arm <- read_csv("short_arm_pb_random_escapee_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist"))
# distances_to_random_escapees_short_arm$type <- "random"
n <- 1000 # how many rounds of random choosing I want
mean_min_distance_random_probes_df_short_arm <- tibble(run=numeric(),
key_transition=numeric(),
mean_distance=numeric())
for (i in 1:n) {
for (t in 1:5) {
random_short_arm_probes <- sample(chr_x_dmp_df_short_arm$rowname, nrow(chr_x_dmp_df_short_arm %>% filter(passes_threshold==T & key_transition==t)))
random_short_arm_probes_df <- chr_x_dmp_df_short_arm %>% filter(rowname %in% random_short_arm_probes)
test_df <- random_short_arm_probes_df %>% group_by(rowname) %>% summarise(
key_transition=t,
passes_threshold=T,
min_dist_to_random_escapee=min(abs(short_arm_escapees$TSS - hg38_pos))
)
mean_min_distance_random_probes_df_short_arm <- add_row(mean_min_distance_random_probes_df_short_arm,
run=i,
key_transition=t,
mean_distance=median(test_df$min_dist_to_random_escapee))
# write_csv(test_df, "short_arm_pb_random_probes_distances.csv", append=T, col_names=F)
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold"))
}
}
# permutation 2 p-values
p2_short_arm_pvals <- c()
for (t in 1:5) {
p <- nrow(mean_min_distance_random_probes_df_short_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance <= median((chr_x_dmp_df_short_arm %>% filter(passes_threshold==T &
key_transition==t))$min_dist_to_pb_escapee))))
p2 <- nrow(mean_min_distance_random_probes_df_short_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance >= median((chr_x_dmp_df_short_arm %>% filter(passes_threshold==T &
key_transition==t))$min_dist_to_pb_escapee))))
print(min(p,p2)/1000)
pval <- min(p,p2)/1000
if (pval ==0) {
pval <- "p < .001"
}else {
pval <- sprintf("p = %.3f", pval)
}
p2_short_arm_pvals <- c(p2_short_arm_pvals, pval)
}
# distances_to_random_probes_short_arm <- read_csv("short_arm_pb_random_probes_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist"))
# distances_to_random_probes_short_arm$type <- "random_probes"
p1_shorm_arm_df <- tibble(key_transition = c(1,2,3,4,5),
pval = p1_short_arm_pvals,
x=0,
y=1)
p2_shorm_arm_df <- tibble(key_transition = c(1,2,3,4,5),
pval = p2_short_arm_pvals,
x=3.5e7,
y=1)
short_arm_permutation_plot <- ggplot(mean_min_distance_random_df_short_arm) +
geom_density(aes(mean_distance, y=..scaled..),
n = 1e4,
fill="darkgrey",
color="darkgray",
size=.3,
alpha=.5) +
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df_short_arm,
n = 1e4,
fill="grey",
color="grey",
size=.3,
alpha=.5) +
geom_text(data=p1_shorm_arm_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=-0.1, vjust=0) +
geom_text(data=p2_shorm_arm_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=1.1, vjust=0) +
facet_grid(key_transition~., scales = "free_y", labeller = labeller(key_transition = function(c){
return(list(
"1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 5"
)[c])
})) +
geom_vline(data=(chr_x_dmp_df_short_arm %>% filter(passes_threshold==T) %>% group_by(key_transition) %>% summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(xintercept=mean_distance), size=.3) +
geom_density(data=chr_x_dmp_df_short_arm %>% filter(passes_threshold==T),
mapping=aes(min_dist_to_pb_escapee,
y=..scaled..,
fill=as.factor(key_transition),
color=as.factor(key_transition)),
n = 1e4, alpha=.5, size=.3) +
geom_text(data=(chr_x_dmp_df_short_arm %>%
filter(passes_threshold==T) %>%
group_by(key_transition) %>%
summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(x=mean_distance, y=.5, label=sprintf("%.1f Mb", mean_distance / 1e6)),
# nudge_x=-200,
hjust=1.1,
size=1.2) +
theme_bw() +
# scale_x_continuous(breaks = c(0,.5,1,1.5,2,2.5,3,3.5)*1e7, labels = c(0,.5,1,1.5,2,2.5,3,3.5)*10) +
# scale_x_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8), labels = c(.01,.1,1,10,100)) +
scale_x_sqrt(breaks=c(1e6, 5e6, 1e7, 1.5e7, 2e7, 2.5e7, 3e7), labels=c(1, 5, 10, 15, 20, 25, 30), expand=c(0,0)) +
scale_y_continuous(limits = c(0,1.2), breaks = c(0,.5,1), minor_breaks = c(.25,.75)) +
scale_color_d3(guide=F) +
scale_fill_d3(guide=F) +
# scale_fill_material("teal", name="Transition", guide=F) +
# scale_color_material("teal", name="Transition", guide=F) +
labs(title="Minimum Distance of DMPs to Escapee on Short Arm",
x="Minimum Distance to Gene Set (Mb)",
y="Scaled Density") +
base_plot_theme +
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
panel.spacing.y = unit(.75, "mm"))
ggsave("figs/main/min_dist_escapee_short_arm.pdf", short_arm_permutation_plot, width = 64, height=47.5, units = "mm")
# distances_to_pb_escapees_short_arm <- chr_x_dmp_df_short_arm %>% dplyr::select(rowname, key_transition, passes_threshold, min_dist=min_dist_to_pb_escapee)
# distances_to_pb_escapees_short_arm$type <- "pb_escapee"
# distances_to_pb_escapees_short_arm[distances_to_pb_escapees_short_arm$passes_threshold == F, "type"] <- "pb_escapee_no_thresh"
# all_distances_short_arm <- full_join(distances_to_random_escapees_short_arm, distances_to_pb_escapees_short_arm, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type"))
# all_distances_short_arm <- full_join(all_distances_short_arm, distances_to_random_probes_short_arm, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type"))
# library(ggpubr)
# library(ggsci)
# all_distances_short_arm$key_transition <- sprintf("Tran. %d", all_distances_short_arm$key_transition)
# short_arm_distances_plot <- ggplot(all_distances_short_arm %>% filter(type != "pb_escapee_no_thresh"), aes(x=type, y=min_dist, fill=type)) +
# geom_boxplot(outlier.shape = NA, size=.3, notch = T) +
# # geom_hline(yintercept = mean(filter(all_distances_short_arm, type=="pb_escapee_no_thresh")$min_dist), linetype="dashed", color=pal_futurama()(n=3)[3], show.legend = T, size=.3) +
# stat_compare_means(comparisons = list(c("pb_escapee", "random")), label="p.signif", size=1, tip.length = .02)+
# # scale_y_continuous(breaks = c(0,.5,1,1.5,2,2.5,3,3.5)*1e7, labels = c(0,.5,1,1.5,2,2.5,3,3.5)*10) +
# scale_y_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8)) +
# scale_x_discrete(breaks=c("pb_escapee", "random_probes", "random"), labels=c("", "", "")) +
# facet_grid(~key_transition) +
# scale_fill_futurama(breaks=c("pb_escapee", "random", "random_probes"), labels=c("Short Arm Escapees", "100 Sets of 12 Random Short Arm Genes","100 Sets of Random Probes")) +
# theme_bw() +
# labs(y="Minimum Distance to Gene Set (Mb)",
# x="",
# fill="Gene Set",
# title="Minimum Distance to Escapee on Short Arm") +
# base_plot_theme +
# theme(strip.text = element_text(size=5, margin = margin(.5,0,.5,0, "mm")),
# strip.background = element_rect(size = .3),
# legend.text = element_text(size=4),
# legend.title = element_text(size=4, face="bold"),
# legend.key.width = unit(2, units="mm"),
# legend.key.height = unit(3, units="mm"),
# legend.key.size = unit(.2, units = "mm"),
# axis.title.x = element_text(margin = margin(0,0,0,0), size = 0),
# axis.text.x = element_text(margin = margin(0,0,0,0), size = 0),
# axis.ticks.length.x = unit(0, "mm"))
# ggsave("figs/main/4_short_arm_dist.svg", short_arm_distances_plot, width=64, height=45, units="mm")
```
## Min escapee distance on long arm
```{r}
chr_x_dmp_df_long_arm <- all_dmp_df %>% filter((hg38_chr == "chrX") & (hg38_pos > 58100000))
long_arm_escapees <- mean_methylation_df_escape %>% filter(TSS > 58100000)
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) {
pos <- unlist(chr_x_dmp_df_long_arm[row_idx, "hg38_pos"])
min_distance <- min(abs(long_arm_escapees["TSS"] - pos))
# min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos))
chr_x_dmp_df_long_arm[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end)
}
# canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv")
# hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv", col_types = cols(chromosome_name="c"))
hg38_biomart_structure_df_canonical_long_arm <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id &
chromosome_name=="X" &
transcription_start_site > 58100000) %>%
group_by(ensembl_gene_id) %>%
summarise(transcription_start_site=dplyr::first(transcription_start_site))
n <- 1000 # how many rounds of random choosing I want
mean_min_distance_random_df_long_arm <- tibble(run=numeric(),
key_transition=numeric(),
mean_distance=numeric())
for (i in 1:n) {
random_escapees <- sample(hg38_biomart_structure_df_canonical_long_arm$ensembl_gene_id, nrow(long_arm_escapees))
random_escapees_df <- hg38_biomart_structure_df_canonical_long_arm %>% filter(ensembl_gene_id %in% random_escapees)
test_df <- chr_x_dmp_df_long_arm %>% group_by(rowname) %>% summarise(
key_transition=dplyr::first(key_transition),
passes_threshold=dplyr::first(passes_threshold),
min_dist_to_random_escapee=min(abs(random_escapees_df$transcription_start_site - hg38_pos))
)
mean_min_distance_random_df_long_arm <- full_join(mean_min_distance_random_df_long_arm, test_df %>% group_by(key_transition) %>% summarise(mean_distance=median(min_dist_to_random_escapee)) %>% mutate(run=i), by=c("run", "key_transition", "mean_distance"))
# write_csv(test_df, "long_arm_pb_random_escapee_distances.csv", append=T, col_names=F)
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold"))
}
# permutation 1 p-values
p1_pvals <- c()
for (t in 1:5) {
p <- nrow(mean_min_distance_random_df_long_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance <= median((chr_x_dmp_df_long_arm %>% filter(passes_threshold==T &
key_transition==t))$min_dist_to_pb_escapee))))
p2 <- nrow(mean_min_distance_random_df_long_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance >= median((chr_x_dmp_df_long_arm %>% filter(passes_threshold==T &
key_transition==t))$min_dist_to_pb_escapee))))
print(min(p,p2)/1000)
pval <- min(p,p2)/1000
if (pval == 0) {
pval <- "p < 0.001"
}else {
pval <- sprintf("p = %.3f", pval)
}
p1_pvals <- c(p1_pvals, pval)
}
# distances_to_random_escapees_long_arm <- read_csv("long_arm_pb_random_escapee_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist"))
# distances_to_random_escapees_long_arm$type <- "random"
n <- 1000 # how many rounds of random choosing I want
mean_min_distance_random_probes_df_long_arm <- tibble(run=numeric(),
key_transition=numeric(),
mean_distance=numeric())
for (i in 1:n) {
for (t in 1:5) {
random_long_arm_probes <- sample(chr_x_dmp_df_long_arm$rowname, nrow(chr_x_dmp_df_long_arm %>% filter(passes_threshold==T & key_transition==t)))
random_long_arm_probes_df <- chr_x_dmp_df_long_arm %>% filter(rowname %in% random_long_arm_probes)
test_df <- random_long_arm_probes_df %>% group_by(rowname) %>% summarise(
key_transition=t,
passes_threshold=T,
min_dist_to_random_escapee=min(abs(long_arm_escapees$TSS - hg38_pos))
)
mean_min_distance_random_probes_df_long_arm <- add_row(mean_min_distance_random_probes_df_long_arm,
run=i,
key_transition=t,
mean_distance=median(test_df$min_dist_to_random_escapee))
# write_csv(test_df, "long_arm_pb_random_probes_distances.csv", append=T, col_names=F)
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold"))
}
}
# distances_to_random_probes_long_arm <- read_csv("long_arm_pb_random_probes_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist"))
# distances_to_random_probes_long_arm$type <- "random_probes"
# permutation 2 p-values
p2_pvals <- c()
for (t in 1:5) {
p <- nrow(mean_min_distance_random_probes_df_long_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance <= median((chr_x_dmp_df_long_arm %>% filter(passes_threshold==T &
key_transition==t))$min_dist_to_pb_escapee))))
p2 <- nrow(mean_min_distance_random_probes_df_long_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance >= median((chr_x_dmp_df_long_arm %>% filter(passes_threshold==T &
key_transition==t))$min_dist_to_pb_escapee))))
print(min(p,p2)/1000)
pval <- min(p,p2)/1000
if (pval == 0) {
pval <- "p < 0.001"
}else {
pval <- sprintf("p = %.3f", pval)
}
p2_pvals <- c(p2_pvals, pval)
}
p1_long_arm_df <- tibble(key_transition = c(1,2,3,4,5),
pval = p1_pvals,
x=0,
y=1)
p2_long_arm_df <- tibble(key_transition = c(1,2,3,4,5),
pval = p2_pvals,
x=6.5e7,
y=1)
long_arm_permutation_plot <- ggplot(mean_min_distance_random_df_long_arm) +
geom_density(aes(mean_distance, y=..scaled..), n = 1e4,
fill="darkgrey", color="darkgrey", size=.3, alpha=.5) +
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df_long_arm,
n = 1e4, fill="grey", color="grey", size=.3, alpha=.5) +
facet_grid(key_transition~., scales = "free_y", labeller = labeller(key_transition = function(c){
return(list(
"1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 5"
)[c])
})) +
geom_vline(data=(chr_x_dmp_df_long_arm %>%
filter(passes_threshold==T) %>%
group_by(key_transition) %>%
summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(xintercept=mean_distance),
size=.3) +
geom_density(data=chr_x_dmp_df_long_arm %>% filter(passes_threshold==T),
mapping=aes(min_dist_to_pb_escapee, y=..scaled..,
fill=as.factor(key_transition),
color=as.factor(key_transition)),
n = 1e4, alpha=.5,
size=.3) +
geom_text(data=(chr_x_dmp_df_long_arm %>%
filter(passes_threshold==T) %>%
group_by(key_transition) %>%
summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(x=mean_distance, y=.5, label=sprintf("%.1f Mb", mean_distance / 1e6)),
hjust=1.1,
size=1.2) +
geom_text(data=p1_long_arm_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=-0.1, vjust=0) +
geom_text(data=p2_long_arm_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=1.1, vjust=0) +
theme_bw() +
scale_x_sqrt(breaks=c(1e6, 5e6, 1e7, 1.5e7, 2e7,
3e7, 4e7, 5e7, 6e7),
labels=c(1, 5, 10, 15, 20, 30, 40, 50, 60), expand=c(0,0)) +
# scale_x_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8), labels = c(.01,.1,1,10,100)) +
scale_fill_d3(guide=F) +
scale_color_d3(guide=F) +
labs(title="Minimum Distance of DMPs to Escapee on Long Arm",
x="Minimum Distance to Gene Set (Mb)",
y="Scaled Density") +
base_plot_theme +
scale_y_continuous(limits=c(0,1.2), breaks = c(0,.5,1)) +
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
legend.text = element_text(size=4),
legend.title = element_text(size=4, face="bold"),
legend.key.width = unit(2, units="mm"),
legend.key.height = unit(2, units="mm"),
legend.key.size = unit(.2, units = "mm"),
panel.spacing.y = unit(.75, "mm"))
# axis.title.x = element_text(margin = margin(0,0,0,0), size = 0),
# axis.text.x = element_text(margin = margin(0,0,0,0), size = 0),
# axis.ticks.length.x = unit(0, "mm"))
ggsave("figs/main/min_dist_escapee_long_arm.pdf", long_arm_permutation_plot, width = 64, height=47.5, units = "mm")
distances_to_pb_escapees_long_arm <- chr_x_dmp_df_long_arm %>% dplyr::select(rowname, key_transition, passes_threshold, min_dist=min_dist_to_pb_escapee)
distances_to_pb_escapees_long_arm$type <- "pb_escapee"
distances_to_pb_escapees_long_arm[distances_to_pb_escapees_long_arm$passes_threshold == F, "type"] <- "pb_escapee_no_thresh"
all_distances_long_arm <- full_join(distances_to_random_escapees_long_arm, distances_to_pb_escapees_long_arm, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type"))
all_distances_long_arm <- full_join(all_distances_long_arm, distances_to_random_probes_long_arm, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type"))
library(ggpubr)
library(ggsci)
all_distances_long_arm$key_transition <- sprintf("Tran. %d", all_distances_long_arm$key_transition)
long_arm_escape_plot <- ggplot(all_distances_long_arm %>% filter(type != "pb_escapee_no_thresh"), aes(x=type, y=min_dist, fill=type)) +
geom_boxplot(outlier.shape = NA, size=.3, notch = T) +
stat_compare_means(comparisons = list(c("pb_escapee", "random")), label="p.signif", size=1, tip.length = .02)+#, label.y=2e8, tip.length = .02)+
# scale_y_continuous(limits = c(0,5.5e7), breaks = (0:5)*1e7, labels = (0:5)*10)+#limits = c(5e4,1e8))+#, breaks=c(1e5,1e6,1e7,5e8)) +
scale_y_log10(limits = c(5e4,1e9), breaks=c(1e5,1e6,1e7,1e8, 1e9)) +
scale_x_discrete(breaks=c("pb_escapee", "random", "random_probes"), labels=c("", "", "")) +
# geom_hline(yintercept = mean(filter(all_distances_long_arm, type=="pb_escapee_no_thresh")$min_dist), linetype="dashed", color=pal_futurama()(n=3)[3], size=.3) +
facet_grid(~key_transition) +
scale_fill_futurama(breaks=c("pb_escapee", "random", "random_probes"), labels=c("Long Arm Escapees", "100 Sets of 3 Random Long Arm Genes", "100 Sets of Random Long Arm Probes")) +
theme_bw() +
labs(y="Minimum Distance to Gene Set (Mb)",
x="",
title="Minimum Distance to Escapee on Long Arm",
fill="Gene Set") +
base_plot_theme +
theme(strip.text = element_text(size=5, margin = margin(.5,0,.5,0, "mm")),
strip.background = element_rect(size = .3),
legend.text = element_text(size=4),
legend.title = element_text(size=4, face="bold"),
legend.key.width = unit(2, units="mm"),
legend.key.height = unit(3, units="mm"),
legend.key.size = unit(.2, units = "mm"),
axis.title.x = element_text(margin = margin(0,0,0,0), size = 0),
axis.text.x = element_text(margin = margin(0,0,0,0), size = 0),
axis.ticks.length.x = unit(0, "mm"))
ggsave("figs/main/4_long_arm_dist.svg", long_arm_escape_plot, width=64, height=45, units="mm")
probability_tib_long_arm <- tibble(
min_distance = numeric(),
transition = numeric(),
probability = numeric()
)
for (dist_to_escapee in seq(1, 7e7, 1e6)) {
# dist_to_escapee <- dist_to_escapee * 1e4
sites_within_dist <- chr_x_dmp_df_long_arm %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_threshold == T))
sites_within_dist_par <- chr_x_dmp_df_long_arm %>% filter((((hg19_pos - 2699520) <= dist_to_escapee)) & (passes_threshold == T))
for (t in 1:5) {
prob <- nrow(sites_within_dist %>% filter(key_transition == t & passes_threshold == T)) /
nrow(chr_x_dmp_df_long_arm %>% filter(key_transition == t & passes_threshold == T))
probability_tib_long_arm <- add_row(probability_tib_long_arm,
min_distance = dist_to_escapee,
transition = t,
probability = prob)
}
}
d50_df <- tibble(transition=numeric(),d50=numeric())
for (t in 1:5) {
d50_dist <- as.numeric(chr_x_dmp_df_long_arm %>%
filter(key_transition == t & passes_threshold==T) %>%
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>%
dplyr::select(min_dist_to_pb_escapee))
d50_df <- add_row(d50_df, transition=t, d50=d50_dist)
}
probability_of_erosion_long_arm <- ggplot(na.omit(probability_tib_long_arm), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) +
geom_line() +
labs(x = "Minimum Distance to Escapee (Mb)",
y = "Fraction of Transition Sites",
color = "Transition",
title="Probability of Erosion on Long Arm by\nDistance from Escapee") +
theme_bw() +
scale_color_futurama() +
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/supplementary/sup2_prob_escape_long_arm.pdf", probability_of_erosion_long_arm, width = 174/3, height=40, units = "mm")
```
## Min escapee distance for full chromosome
```{r}
library(tidyverse)
library(ggplot2)
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz")
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c"))
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz")
chr_x_dmp_df <- all_dmp_df %>% filter((hg38_chr == "chrX"))
# long_arm_escapees <- mean_methylation_df_escape %>% filter(TSS > 58100000)
for (row_idx in 1:nrow(chr_x_dmp_df)) {
pos <- unlist(chr_x_dmp_df[row_idx, "hg38_pos"])
min_distance <- min(abs(mean_methylation_df_escape["TSS"] - pos))
# min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos))
chr_x_dmp_df[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end)
}
hg38_biomart_structure_df_canonical_chrX <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id &
chromosome_name=="X") %>%
group_by(ensembl_gene_id) %>%
summarise(transcription_start_site=dplyr::first(transcription_start_site))
n <- 1000 # how many rounds of random choosing I want
mean_min_distance_random_df <- tibble(run=numeric(),
key_transition=numeric(),
mean_distance=numeric())
for (i in 1:n) {
random_escapees <- sample(hg38_biomart_structure_df_canonical_chrX$ensembl_gene_id, nrow(mean_methylation_df_escape))
random_escapees_df <- hg38_biomart_structure_df_canonical_chrX %>% filter(ensembl_gene_id %in% random_escapees)
test_df <- chr_x_dmp_df %>% group_by(rowname) %>% summarise(
key_transition=dplyr::first(key_transition),
passes_threshold=dplyr::first(passes_threshold),
min_dist_to_random_escapee=min(abs(random_escapees_df$transcription_start_site - hg38_pos))
)
# mean_min_distance_random <- c(mean_min_distance_random, mean(test_df$min_dist_to_random_escapee))
mean_min_distance_random_df <- full_join(mean_min_distance_random_df, test_df %>% group_by(key_transition) %>% summarise(mean_distance=median(min_dist_to_random_escapee)) %>% mutate(run=i), by=c("run", "key_transition", "mean_distance"))
# write_csv(test_df, "chr_x_pb_random_escapee_distances.csv", append=T, col_names=F)
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold"))
}
distances_to_random_escapees_chr_x <- read_csv("chr_x_pb_random_escapee_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist"))
distances_to_random_escapees_chr_x$type <- "random"
n <- 1000 # how many rounds of random choosing I want
mean_min_distance_random_probes_df <- tibble(run=numeric(),
key_transition=numeric(),
mean_distance=numeric())
for (i in 1:n) {
for (t in 1:5) {
random_chr_x_probes <- sample(chr_x_dmp_df$rowname, nrow(chr_x_dmp_df %>% filter(passes_threshold==T & key_transition==t)))
random_chr_x_probes_df <- chr_x_dmp_df %>% filter(rowname %in% random_chr_x_probes)
test_df <- random_chr_x_probes_df %>% group_by(rowname) %>% summarise(
key_transition=t,
passes_threshold=T,
min_dist_to_random_escapee=min(abs(mean_methylation_df_escape$TSS - hg38_pos))
)
mean_min_distance_random_probes_df <- add_row(mean_min_distance_random_probes_df,
run=i,
key_transition=t,
mean_distance=median(test_df$min_dist_to_random_escapee))
# write_csv(test_df, "chr_x_pb_random_probes_distances.csv", append=T, col_names=F)
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold"))
}
}
distances_to_random_probes_chr_x <- read_csv("chr_x_pb_random_probes_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist"))
distances_to_random_probes_chr_x$type <- "random_probes"
chrx_permutation_plot <- ggplot(mean_min_distance_random_df) +
geom_density(aes(mean_distance, y=..scaled..), n=1e3, fill="darkgrey", color="darkgrey") +
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df, n = 1e3, fill="lightgrey", color="lightgrey") +
facet_grid(key_transition~., scale="free_y") +
geom_vline(data=(chr_x_dmp_df %>% filter(passes_threshold==T) %>% group_by(key_transition) %>% summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(xintercept=mean_distance, color=factor(key_transition)), linetype="dashed") +
geom_density(data=chr_x_dmp_df %>% filter(passes_threshold==T), mapping=aes(min_dist_to_pb_escapee, y=..scaled.., fill=factor(key_transition), color=factor(key_transition)), n = 1e3, alpha=.5) +
# geom_vline(data=chr_x_dmp_df %>% filter(passes_threshold==T), mapping=aes(xintercept=median(min_dist_to_pb_escapee), color=factor(key_transition)),
# linetype="dashed", size=1) +
theme_bw()
distances_to_pb_escapees_chr_x <- chr_x_dmp_df %>% dplyr::select(rowname, key_transition, passes_threshold, min_dist=min_dist_to_pb_escapee)
distances_to_pb_escapees_chr_x$type <- "pb_escapee"
distances_to_pb_escapees_chr_x[distances_to_pb_escapees_chr_x$passes_threshold == F, "type"] <- "pb_escapee_no_thresh"
all_distances_chr_x <- full_join(distances_to_random_escapees_chr_x, distances_to_pb_escapees_chr_x, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type"))
all_distances_chr_x <- full_join(all_distances_chr_x, distances_to_random_probes_chr_x, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type"))
library(ggpubr)
library(ggsci)
all_distances_chr_x$key_transition <- sprintf("Tran. %d", all_distances_chr_x$key_transition)
chr_x_escape_plot <- ggplot(all_distances_chr_x %>% filter(type != "pb_escapee_no_thresh"), aes(x=type, y=min_dist, fill=type)) +
geom_boxplot(outlier.shape = NA, size=.3, notch = T) +
stat_compare_means(comparisons = list(c("pb_escapee", "random")), label="p.signif", size=1, tip.length = .02)+#, label.y=2e8, tip.length = .02)+
# scale_y_continuous(limits = c(0,5.5e7), breaks = (0:5)*1e7, labels = (0:5)*10)+#limits = c(5e4,1e8))+#, breaks=c(1e5,1e6,1e7,5e8)) +
scale_y_log10(limits = c(1e4,5e8))+#, breaks=c(1e5,1e6,1e7,1e8, 1e9)) +
scale_x_discrete(breaks=c("pb_escapee", "random", "random_probes"), labels=c("", "", "")) +
# geom_hline(yintercept = mean(filter(all_distances_long_arm, type=="pb_escapee_no_thresh")$min_dist), linetype="dashed", color=pal_futurama()(n=3)[3], size=.3) +
facet_grid(~key_transition) +
scale_fill_futurama(breaks=c("pb_escapee", "random", "random_probes"), labels=c("Chr X Escapees", "100 Sets of 15 Random Long Arm Genes", "100 Sets of Random X Chr Probes")) +
theme_bw() +
labs(y="Minimum Distance to Gene Set (Mb)",
x="",
title="Minimum Distance to Escapee on Long Arm",
fill="Gene Set") +
base_plot_theme +
theme(strip.text = element_text(size=5, margin = margin(.5,0,.5,0, "mm")),
strip.background = element_rect(size = .3),
legend.text = element_text(size=4),
legend.title = element_text(size=4, face="bold"),
legend.key.width = unit(2, units="mm"),
legend.key.height = unit(3, units="mm"),
legend.key.size = unit(.2, units = "mm"),
axis.title.x = element_text(margin = margin(0,0,0,0), size = 0),
axis.text.x = element_text(margin = margin(0,0,0,0), size = 0),
axis.ticks.length.x = unit(0, "mm"))
ggsave("figs/main/chr_x_escape_plot.svg", chr_x_escape_plot, width=64, height=45, units="mm")
# Probability of demethylation
probability_tib <- tibble(
min_distance = numeric(),
transition = numeric(),
probability = numeric(),
probability_par = numeric()
)
for (dist_to_escapee in seq(1, 7e7, 1e4)) {
# dist_to_escapee <- dist_to_escapee * 1e4
sites_within_dist <- chr_x_dmp_df %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_threshold == T))
sites_within_dist_par <- chr_x_dmp_df %>% filter((((hg19_pos - 2699520) <= dist_to_escapee)) & (passes_threshold == T))
for (t in 1:5) {
prob <- nrow(sites_within_dist %>% filter(key_transition == t & passes_threshold == T)) /
nrow(chr_x_dmp_df %>% filter(key_transition == t & passes_threshold == T))
prob_par <- nrow(sites_within_dist_par %>% filter(key_transition == t & passes_threshold == T)) /
nrow(chr_x_dmp_df %>% filter(key_transition == t & passes_threshold == T))
probability_tib <- add_row(probability_tib,
min_distance = dist_to_escapee,
transition = t,
probability = prob,
probability_par = prob_par)
}
}
d50_df <- tibble(transition=numeric(),d50=numeric())
for (t in 1:5) {
d50_dist <- as.numeric(chr_x_dmp_df %>%
filter(key_transition == t & passes_threshold==T) %>%
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>%
dplyr::select(min_dist_to_pb_escapee))
d50_df <- add_row(d50_df, transition=t, d50=d50_dist)
}
probability_of_erosion <- ggplot(na.omit(probability_tib), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) +
geom_line() +
labs(x = "Minimum Distance to Escapee (Mb)",
y = "Fraction of Transition Sites",
color = "Transition",
title="Probability of Erosion on Whole\nChromosome by Distance from Escapee") +
theme_bw() +
scale_color_futurama() +
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) +
base_plot_theme
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/supplementary/sup2_prob_escape_whole_x.pdf", probability_of_erosion, width = 58, height=40, units = "mm")
```
## Probability of demethylation based on escapee distance
```{r}
probability_tib <- tibble(
min_distance = numeric(),
transition = numeric(),
probability = numeric(),
probability_par = numeric()
)
for (dist_to_escapee in seq(1, 7e7, 5e6)) {
# dist_to_escapee <- dist_to_escapee * 1e4
sites_within_dist <- chr_x_dmp_df_short_arm %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_threshold == T))
sites_within_dist_par <- chr_x_dmp_df_short_arm %>% filter((((hg19_pos - 2699520) <= dist_to_escapee)) & (passes_threshold == T))
for (t in 1:5) {
prob <- nrow(sites_within_dist %>% filter(key_transition == t & passes_threshold == T)) /
nrow(chr_x_dmp_df_short_arm %>% filter(key_transition == t & passes_threshold == T))
prob_par <- nrow(sites_within_dist_par %>% filter(key_transition == t & passes_threshold == T)) /
nrow(chr_x_dmp_df_short_arm %>% filter(key_transition == t & passes_threshold == T))
probability_tib <- add_row(probability_tib,
min_distance = dist_to_escapee,
transition = t,
probability = prob,
probability_par = prob_par)
}
}
d50_df_short_arm <- tibble(transition=numeric(),d50=numeric())
for (t in 1:5) {
d50_dist <- as.numeric(chr_x_dmp_df_short_arm %>%
filter(key_transition == t & passes_threshold==T) %>%
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>%
dplyr::select(min_dist_to_pb_escapee))
d50_df_short_arm <- add_row(d50_df_short_arm, transition=t, d50=d50_dist)
}
probability_of_erosion_short_arm <- ggplot(na.omit(probability_tib), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) +
geom_line() +
labs(x = "Minimum Distance to Escapee (Mb)",
y = "Fraction of Transition Sites",
color = "Transition",
title="Probability of Erosion on Short Arm by\nDistance from Escapee") +
theme_bw() +
scale_color_futurama() +
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) +
base_plot_theme
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/supplementary/sup2_prob_escape_short_arm.pdf", probability_of_erosion_short_arm, width = 174/3, height=40, units = "mm")
probability_of_erosion_short_arm_par <- ggplot(na.omit(probability_tib), aes(x=min_distance, y=probability_par, group=transition, color=as.factor(transition))) +
geom_line() +
labs(x = "Minimum Distance to PAR (Mb)",
y = "Fraction of Transition Sites",
color = "Transition",
title="Probability of Erosion on Short Arm by\nDistance from PAR") +
theme_bw() +
scale_color_futurama() +
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/supplementary/sup2_prob_escape_short_arm_par.svg", probability_of_erosion_short_arm_par, width = 174/4, height=50, units = "mm")
```
## BElow is an old block that isn't being currently used
```{r}
chr_x_dmp_df_long_arm <- all_dmp_df %>% filter((chr == "chrX") & (pos > 58100000))
long_arm_escapees <- gene_compiled_df %>% filter(gene %in% c("DANT2", "FIRRE"))
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) {
pos <- unlist(chr_x_dmp_df_long_arm[row_idx, "pos"])
min_distance <- min(abs(long_arm_escapees["start_position"] - pos))
min_distance_end <- min(abs(long_arm_escapees["end_position"] - pos))
chr_x_dmp_df_long_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end)
}
ggplot(chr_x_dmp_df_long_arm, aes(x=as.factor(key_transition), y=min_dist_to_escapee, fill=passes_threshold)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
stat_compare_means(aes(group = passes_threshold), label="p.format")+
# geom_point(aes(fill=random), position = position_jitterdodge()) +
# geom_signif(map_signif_level=T,
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
for (row_idx in 1:nrow(chr_x_dmp_df)) {
pos <- unlist(chr_x_dmp_df[row_idx, "pos"])
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% escapee, "start_position"] - pos))
min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "end_position"] - pos))
# min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "start_position"] - pos))
# min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "end_position"] - pos))
chr_x_dmp_df[row_idx, "min_dist_to_cotton_escapee"] <- min(min_positions)
}
all_x_genes <- gene_compiled_df[gene_compiled_df$cluster == 10, "gene"]$gene
distances <- tibble(
key_transition = integer(),
min_distance = numeric(),
delta_increasing = numeric(),
pos = numeric(),
passes_threshold = logical()
)
for (n in 1:100) {
test_set <- sample(all_x_genes, 16)
for (row_idx in 1:nrow(chr_x_dmp_df)) {
pos <- unlist(chr_x_dmp_df[row_idx, "pos"])
key_transition <- unlist(chr_x_dmp_df[row_idx, "key_transition"])
delta_increasing <- unlist(chr_x_dmp_df[row_idx, "delta_increasing"])
passes_threshold <- unlist(chr_x_dmp_df[row_idx, "passes_threshold"])
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% test_set, "start_position"] - pos))
min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% test_set, "end_position"] - pos))
if (is.na(chr_x_dmp_df[row_idx, "min_dist_to_test"])) {
chr_x_dmp_df[row_idx, "min_dist_to_test"] <- 0.0
}
distances <- add_row(distances,
key_transition=key_transition,
min_distance = min(min_distance, min_distance_end),
pos=pos,
delta_increasing=delta_increasing,
passes_threshold=passes_threshold)
}
}
# chr_x_dmp_df["min_dist_to_test"] <- chr_x_dmp_df["min_dist_to_test"] / 100
write_csv(distances, "distances_test.csv")
ggplot(distances %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000 ), aes(x=key_transition, group=key_transition, y=min_distance)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
geom_point() +
geom_signif(map_signif_level=T,
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
distances <- read_csv("distances_test.csv")
all_distances <- tibble(
key_transition = integer(),
min_distance = numeric(),
delta_increasing = numeric(),
pos = numeric(),
passes_threshold = logical(),
random = logical()
)
all_distances <- distances
all_distances["random"] <- "random_escapees"
for (row_idx in 1:nrow(chr_x_dmp_df)) {
random <- "eroding"
if (unlist(chr_x_dmp_df[row_idx, "passes_threshold"]) == F) {
random <- "non-eroding"
}
all_distances <- add_row(all_distances,
key_transition=unlist(chr_x_dmp_df[row_idx, "key_transition"]),
min_distance=unlist(chr_x_dmp_df[row_idx, "min_dist_to_escapee"]),
random=random,
delta_increasing=unlist(chr_x_dmp_df[row_idx, "delta_increasing"]),
pos=unlist(chr_x_dmp_df[row_idx, "pos"]),
passes_threshold=unlist(chr_x_dmp_df[row_idx, "passes_threshold"]))
}
chr_x_dmp_df_short_arm$sig_x_site <- chr_x_dmp_df_short_arm$X1 %in% sig_x_sites
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) {
random <- "eroding"
if (unlist(chr_x_dmp_df_short_arm[row_idx, "passes_threshold"]) == F) {
random <- "non-eroding"
}
all_distances <- add_row(all_distances,
key_transition=unlist(chr_x_dmp_df_short_arm[row_idx, "key_transition"]),
min_distance=unlist(chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"]),
random=random,
delta_increasing=unlist(chr_x_dmp_df_short_arm[row_idx, "delta_increasing"]),
pos=unlist(chr_x_dmp_df_short_arm[row_idx, "pos"]),
passes_threshold=unlist(chr_x_dmp_df_short_arm[row_idx, "passes_threshold"]))
}
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) {
all_distances <- add_row(all_distances,
key_transition=unlist(chr_x_dmp_df_long_arm[row_idx, "key_transition"]),
min_distance=unlist(chr_x_dmp_df_long_arm[row_idx, "min_dist_to_escapee"]),
random=FALSE,
delta_increasing=unlist(chr_x_dmp_df_long_arm[row_idx, "delta_increasing"]),
pos=unlist(chr_x_dmp_df_long_arm[row_idx, "pos"]),
passes_threshold=unlist(chr_x_dmp_df_long_arm[row_idx, "passes_threshold"]))
}
library(ggpubr)
comparisons = list(c("eroding", "non-eroding"), c("eroding", "random_escapees"))
ggplot(all_distances %>% filter(pos <= 58100000), aes(x=random, y=min_distance, fill=random)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
stat_compare_means(comparisons = list(c("eroding", "non-eroding"), c("eroding", "random_escapees")), label="p.format", size=2)+
# geom_point(aes(fill=random), position = position_jitterdodge()) +
# geom_signif(map_signif_level=T,
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10() +
facet_grid(~key_transition) +
scale_fill_futurama() +
theme_bw() +
base_plot_theme +
labs(y="Minimum Distance to Escapee",
x="",
title="Minimum Distance to Escapee on Short Arm") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3))
ggsave("figs/min_dist_short_arm.png", width=190, height=190, units="mm", dpi = 1000)
ggplot(chr_x_dmp_df_short_arm, aes(x=as.factor(key_transition), y=min_dist_to_escapee, fill=sig_x_site)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
stat_compare_means(aes(group = sig_x_site), label="p.format")+
# geom_point(aes(fill=random), position = position_jitterdodge()) +
# geom_signif(map_signif_level=T,
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=key_transition, group=key_transition, y=min_dist_to_cotton_escapee, fill="min_cotton")) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
geom_boxplot(data=distances %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), mapping=aes(x=key_transition, group=key_transition, y=min_distance, fill="min_random")) +
geom_point() +
geom_signif(map_signif_level=T,
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=min_dist_to_cotton_escapee, y=abs(delta))) +
# geom_smooth(aes(color=as.factor(key_transition))) +
# geom_smooth(color="black") +
geom_point(aes(color=as.factor(key_transition))) +
scale_x_log10()
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE), aes(x=min_dist_to_escapee, y=abs(delta))) +
geom_point(aes(color=as.factor(key_transition))) +
scale_x_log10()
# y_position=c(1.2e7,1.21e7,1.21e7,1.21e7)
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=key_transition, group=key_transition, y=min_dist_to_escapee)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
geom_point() +
geom_signif(map_signif_level=T,
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=min_dist_to_escapee, y=abs(delta))) +
# geom_smooth(aes(color=as.factor(key_transition))) +
# geom_smooth(color="black") +
geom_point(aes(color=as.factor(key_transition))) +
scale_x_log10()
for (row_idx in 1:nrow(gene_compiled_df)) {
gene <- unlist(gene_compiled_df[row_idx, "gene"])
total <- unlist(gene_compiled_df[row_idx, "total_sites"])
start_pos <- unlist(gene_compiled_df[row_idx, "start_position"])
cluster <- unlist(gene_compiled_df[row_idx, "cluster"])
if (cluster <= 9) {
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% 11, "start_position"] - start_pos))
gene_compiled_df[row_idx, "min_dist_to_escapee"] <- min_distance
}
}
library(ggsignif)
ggplot(gene_compiled_df %>% filter((chr=="chrX") & (cluster %in% c(5,6,7,8,9)))) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot(aes(x=cluster, group=cluster, y=min_dist_to_escapee)) +
geom_point(aes(x=cluster, y=min_dist_to_escapee)) +
geom_signif(aes(x=cluster, group=cluster, y=min_dist_to_escapee),
map_signif_level=F, comparisons = list(c(5,6), c(5,7), c(5,8), c(5,9))) +
scale_y_log10()
mean_distances <- gene_compiled_df %>% group_by(cluster) %>% summarise(
min_dist_to_escapee = mean(na.omit(min_dist_to_escapee)))
ggplot(gene_compiled_df[(gene_compiled_df$chr == "chrX") & (gene_compiled_df$cluster %in% c(5,6,7,8,9)),]) +
geom_bar()
geom_freqpoly(aes(start_position, colour=as.factor(cluster)), binwidth = 1500000) +
labs(x="Location on X Chr",
y='Frequency Count',
title='Location of X Chromosome CpG sites that are Decreasing in Methylation')
plt3 <- ggplot(all_dmp_df[(all_dmp_df$chr == "chrX") & (all_dmp_df$passes_threshold == F)& (all_dmp_df$delta_increasing == F),]) +
geom_freqpoly(aes(pos),binwidth = 1500000) +
labs(x="Location on X Chr",
y='Frequency Count',
title=sprintf('Location of X Chromosome CpG sites that are Decreasing in Methylation', i-1, i)) +
theme(plot.title = element_text(hjust = 0.5))+
scale_color_brewer(palette='Spectral')
```
```{r old code probably for deletion}
library(tidyverse)
library(ggplot2)
all_dmp_df <- read_csv('dmp_transition_info.csv')
# do this based on genes and when the promoters change their methylation.
x_annotation_df <- read_csv("x_annotation.csv.gz")
statistics <- read.csv('variance_statistics_noh9_nooutliers.csv.gz')
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname)
print(length(sig_x_sites))
sig_annotation <- x_annotation_df[x_annotation_df$Name %in% sig_x_sites,]
x_sig_sites <- intersect(positions_to_keep, x_annotation_df$rowname)
all_dmp_df$escapee <- (all_dmp_df$chr == "chrX") & !(all_dmp_df$X1 %in% x_sig_sites) &
(grepl("TSS1500|TSS200|5'UTR", all_dmp_df$refgene_group) &
(rowMeans(all_dmp_df[c("cluster_0_mean",
"cluster_1_mean",
"cluster_2_mean",
"cluster_3_mean",
"cluster_4_mean",
"cluster_5_mean")]) < .25))
#group by genes
gene_compiled_tss_sites_df <- tibble(gene=character(),
chr=character(),
total_sites=integer(),
total_sites_passing_threshold=integer(),
num_escapees=integer(),
mean_promoter_methylation=numeric(),
transition_1_inc_sites=integer(),
transition_1_dec_sites=integer(),
transition_2_inc_sites=integer(),
transition_2_dec_sites=integer(),
transition_3_inc_sites=integer(),
transition_3_dec_sites=integer(),
transition_4_inc_sites=integer(),
transition_4_dec_sites=integer(),
transition_5_inc_sites=integer(),
transition_5_dec_sites=integer(),
)
for (row_idx in 1:nrow(all_dmp_df)) {
gene <- unlist(all_dmp_df[row_idx, "mapped_gene"])
# genes <- strsplit(unlist(all_dmp_df[row_idx, "gene"]), ";")
refgene_group <- unlist(all_dmp_df[row_idx, "refgene_group"])
chr <- unlist(all_dmp_df[row_idx, "chr"])
dist_to_tss <- unlist(all_dmp_df[row_idx, "dist_to_tss"])
delta_increasing <-
unlist(all_dmp_df[row_idx, "delta_increasing"])
key_transition <- unlist(all_dmp_df[row_idx, "key_transition"])
alt_key_transition <-
unlist(all_dmp_df[row_idx, "alt_key_transition"])
alt_delta <- unlist(all_dmp_df[row_idx, "alt_delta"])
passes_threshold <-
unlist(all_dmp_df[row_idx, "passes_threshold"])
more_variant <-
unlist(all_dmp_df[row_idx, "X1"]) %in% positions_to_keep
escapee <- unlist(all_dmp_df[row_idx, "escapee"])
mean_methylation <- unlist(rowMeans(all_dmp_df[row_idx, c("cluster_0_mean",
"cluster_1_mean",
"cluster_2_mean",
"cluster_3_mean",
"cluster_4_mean",
"cluster_5_mean")]))
if (grepl("TSS1500|TSS200|5'UTR", refgene_group) &
(!is.na(gene))) {
# if (grepl("TSS1500|TSS200", refgene_group) & (! is.na(gene)) & (! is.na(alt_key_transition))) {
gene_row_idx <- 0
if (!(gene %in% gene_compiled_tss_sites_df$gene)) {
# gene is not in gene_compiled_tss_sites_df, so add a row for it
gene_row_idx <- nrow(gene_compiled_tss_sites_df) + 1
gene_compiled_tss_sites_df[gene_row_idx, c("gene")] = gene
gene_compiled_tss_sites_df[gene_row_idx, c("chr")] = chr
gene_compiled_tss_sites_df[is.na(gene_compiled_tss_sites_df)] <-
0
} else {
i <- 1
for (gene_name in gene_compiled_tss_sites_df$gene) {
if ((!is.na(gene_name)) && gene == gene_name) {
gene_row_idx <- i
break
}
i <- i + 1
}
}
gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] <-
gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] + 1
gene_compiled_tss_sites_df[gene_row_idx, "mean_promoter_methylation"] <-
gene_compiled_tss_sites_df[gene_row_idx, "mean_promoter_methylation"] + mean_methylation
change <- "dec"
if (delta_increasing) {
# if (alt_delta > 0) {
change <- "inc"
}
if (escapee == TRUE) {
gene_compiled_tss_sites_df[gene_row_idx, "num_escapees"] <-
gene_compiled_tss_sites_df[gene_row_idx, "num_escapees"] + 1
} else if (passes_threshold) {
gene_compiled_tss_sites_df[gene_row_idx, "total_sites_passing_threshold"] <-
gene_compiled_tss_sites_df[gene_row_idx, "total_sites_passing_threshold"] + 1
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] <-
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] + 1
}
}
}
gene_compiled_tss_sites_df["mean_promoter_methylation"] <- gene_compiled_tss_sites_df["mean_promoter_methylation"] /
gene_compiled_tss_sites_df["total_sites"]
# write_csv(gene_compiled_tss_sites_df, "gene_compiled_tss_sites.csv")
gene_compiled_df <- gene_compiled_tss_sites_df
# add gene positions
library('biomaRt')
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genes <- gene_compiled_df$gene
G_list <- getBM(filters= c("hgnc_symbol"), attributes= c("ensembl_gene_id",
"hgnc_symbol", "chromosome_name", "start_position", "end_position", "strand", "description"),values=genes,mart= mart)
#turn into inner join
gene_compiled_df <- inner_join(gene_compiled_df,G_list,by=c("gene"="hgnc_symbol"))
gene_compiled_df <- add_column(gene_compiled_df, cluster=c(0)*nrow(gene_compiled_df))
# gene_clusters <- tibble(gene=character(), cluster=integer())
for (row_idx in 1:nrow(gene_compiled_df)) {
gene <- unlist(gene_compiled_df[row_idx, "gene"])
chr <- unlist(gene_compiled_df[row_idx, "chr"])
total_sites <- unlist(gene_compiled_df[row_idx, "total_sites"])
total_sites_passing_threshold <- unlist(gene_compiled_df[row_idx, "total_sites_passing_threshold"])
num_escapees <- unlist(gene_compiled_df[row_idx, "num_escapees"])
mean_promoter_methylation <- unlist(gene_compiled_df[row_idx, "mean_promoter_methylation"])
maximum_transition <- 0
maximum_change <- ""
maximum_count <- 0
for(key_transition in 1:5) {
for (change in c("inc", "dec")) {
transition <- sprintf("transition_%d_%s_sites", key_transition, change)
if (gene_compiled_df[row_idx, transition] > maximum_count) {
maximum_transition <- key_transition
maximum_change <- change
maximum_count <- as.integer(unlist(gene_compiled_df[row_idx, transition]))
}
}
}
cluster <- 10
# if (gene %in% known_escapee_genes) {
# cluster <- 11
# }else
if (((num_escapees >= 1) & ((mean_promoter_methylation < .6) & (chr == "chrX"))) & (maximum_count / total_sites < .5)) { #(mean_promoter_methylation <= .25) {
cluster <- 11
}else if ((total_sites_passing_threshold != 0) & (maximum_count / total_sites_passing_threshold >= .5)) {
if (maximum_change == "dec") {
cluster <- maximum_transition - 1 + 5
}else {
cluster <- maximum_transition - 1
}
}
i <- 1
for (gene_name in gene_compiled_df$gene) {
if ((! is.na(gene_name)) && gene == gene_name) {
gene_row_idx <- i
break
}
i <- i+1
}
gene_compiled_df[i, "cluster"] <- cluster
}
plt2 <- ggplot(gene_compiled_df[(gene_compiled_df$chr == "chrX") & (gene_compiled_df$cluster %in% c(5,6,7,8,9) | gene_compiled_df$cluster == 11),]) +
geom_freqpoly(aes(start_position, colour=as.factor(cluster)), bins = 1000) +
labs(x="Location on X Chr",
y='Frequency Count',
title='Location of X Chromosome CpG sites that are Decreasing in Methylation') #+
#geom_freqpoly(data = all_dmp_df[(all_dmp_df$chr == "chrX") & !(all_dmp_df$X1 %in% x_sig_sites) & (all_dmp_df$cluster_0_mean < .125),], mapping=
# aes(pos, colour="escapee", binwidth = 1500000)) +
# theme(plot.title = element_text(hjust = 0.5))+
# scale_color_brewer(palette='Spectral')
chr_x_dmp_df <- all_dmp_df %>% filter((X1 %in% sig_x_sites))
chr_x_dmp_df_short_arm <- all_dmp_df %>% filter((chr == "chrX") & (pos <= 58100000))
short_arm_escapees <- gene_compiled_df %>% filter((cluster == 11) & (start_position <= 58100000))
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) {
pos <- unlist(chr_x_dmp_df_short_arm[row_idx, "pos"])
min_distance <- min(abs(short_arm_escapees["start_position"] - pos))
min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos))
chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end)
}
chr_x_dmp_df_long_arm <- all_dmp_df %>% filter((chr == "chrX") & (all_dmp_df$passes_threshold == T) & (pos >= 58100000))
long_arm_escapees <- gene_compiled_df %>% filter(gene %in% c("DANT2", "FIRRE"))
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) {
pos <- unlist(chr_x_dmp_df_long_arm[row_idx, "pos"])
min_distance <- min(abs(long_arm_escapees["start_position"] - pos))
min_distance_end <- min(abs(long_arm_escapees["end_position"] - pos))
chr_x_dmp_df_long_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end)
}
for (row_idx in 1:nrow(chr_x_dmp_df)) {
pos <- unlist(chr_x_dmp_df[row_idx, "pos"])
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% escapee, "start_position"] - pos))
min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "end_position"] - pos))
# min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "start_position"] - pos))
# min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "end_position"] - pos))
chr_x_dmp_df[row_idx, "min_dist_to_cotton_escapee"] <- min(min_positions)
}
all_x_genes <- gene_compiled_df[gene_compiled_df$cluster == 10, "gene"]$gene
distances <- tibble(
key_transition = integer(),
min_distance = numeric(),
delta_increasing = numeric(),
pos = numeric(),
passes_threshold = logical()
)
for (n in 1:100) {
test_set <- sample(all_x_genes, 16)
for (row_idx in 1:nrow(chr_x_dmp_df)) {
pos <- unlist(chr_x_dmp_df[row_idx, "pos"])
key_transition <- unlist(chr_x_dmp_df[row_idx, "key_transition"])
delta_increasing <- unlist(chr_x_dmp_df[row_idx, "delta_increasing"])
passes_threshold <- unlist(chr_x_dmp_df[row_idx, "passes_threshold"])
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% test_set, "start_position"] - pos))
min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% test_set, "end_position"] - pos))
if (is.na(chr_x_dmp_df[row_idx, "min_dist_to_test"])) {
chr_x_dmp_df[row_idx, "min_dist_to_test"] <- 0.0
}
distances <- add_row(distances,
key_transition=key_transition,
min_distance = min(min_distance, min_distance_end),
pos=pos,
delta_increasing=delta_increasing,
passes_threshold=passes_threshold)
}
}
# chr_x_dmp_df["min_dist_to_test"] <- chr_x_dmp_df["min_dist_to_test"] / 100
write_csv(distances, "distances_test.csv")
ggplot(distances %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000 ), aes(x=key_transition, group=key_transition, y=min_distance)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
geom_point() +
geom_signif(map_signif_level=T,
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
distances <- read_csv("distances_test.csv")
all_distances <- tibble(
key_transition = integer(),
min_distance = numeric(),
delta_increasing = numeric(),
pos = numeric(),
passes_threshold = logical(),
random = logical()
)
all_distances <- distances
all_distances["random"] <- TRUE
for (row_idx in 1:nrow(chr_x_dmp_df)) {
all_distances <- add_row(all_distances,
key_transition=unlist(chr_x_dmp_df[row_idx, "key_transition"]),
min_distance=unlist(chr_x_dmp_df[row_idx, "min_dist_to_escapee"]),
random=FALSE,
delta_increasing=unlist(chr_x_dmp_df[row_idx, "delta_increasing"]),
pos=unlist(chr_x_dmp_df[row_idx, "pos"]),
passes_threshold=unlist(chr_x_dmp_df[row_idx, "passes_threshold"]))
}
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) {
all_distances <- add_row(all_distances,
key_transition=unlist(chr_x_dmp_df_short_arm[row_idx, "key_transition"]),
min_distance=unlist(chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"]),
random=FALSE,
delta_increasing=unlist(chr_x_dmp_df_short_arm[row_idx, "delta_increasing"]),
pos=unlist(chr_x_dmp_df_short_arm[row_idx, "pos"]),
passes_threshold=unlist(chr_x_dmp_df_short_arm[row_idx, "passes_threshold"]))
}
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) {
all_distances <- add_row(all_distances,
key_transition=unlist(chr_x_dmp_df_long_arm[row_idx, "key_transition"]),
min_distance=unlist(chr_x_dmp_df_long_arm[row_idx, "min_dist_to_escapee"]),
random=FALSE,
delta_increasing=unlist(chr_x_dmp_df_long_arm[row_idx, "delta_increasing"]),
pos=unlist(chr_x_dmp_df_long_arm[row_idx, "pos"]),
passes_threshold=unlist(chr_x_dmp_df_long_arm[row_idx, "passes_threshold"]))
}
library(ggpubr)
ggplot(all_distances %>% filter(passes_threshold == TRUE & pos <= 58100000), aes(x=as.factor(key_transition), y=min_distance, fill=random)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
stat_compare_means(aes(group = random), label="p.format")+
# geom_point(aes(fill=random), position = position_jitterdodge()) +
# geom_signif(map_signif_level=T,
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
chr_x_dmp_df_short_arm$sig_x_site <- chr_x_dmp_df_short_arm$X1 %in% sig_x_sites
ggplot(chr_x_dmp_df_short_arm, aes(x=as.factor(key_transition), y=min_dist_to_escapee, fill=sig_x_site)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
stat_compare_means(aes(group = sig_x_site), label="p.format")+
# geom_point(aes(fill=random), position = position_jitterdodge()) +
# geom_signif(map_signif_level=T,
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=key_transition, group=key_transition, y=min_dist_to_cotton_escapee, fill="min_cotton")) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
geom_boxplot(data=distances %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), mapping=aes(x=key_transition, group=key_transition, y=min_distance, fill="min_random")) +
geom_point() +
geom_signif(map_signif_level=T,
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=min_dist_to_cotton_escapee, y=abs(delta))) +
# geom_smooth(aes(color=as.factor(key_transition))) +
# geom_smooth(color="black") +
geom_point(aes(color=as.factor(key_transition))) +
scale_x_log10()
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE), aes(x=min_dist_to_escapee, y=abs(delta))) +
geom_point(aes(color=as.factor(key_transition))) +
scale_x_log10()
# y_position=c(1.2e7,1.21e7,1.21e7,1.21e7)
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=key_transition, group=key_transition, y=min_dist_to_escapee)) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot() +
geom_point() +
geom_signif(map_signif_level=T,
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) +
scale_y_log10()
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=min_dist_to_escapee, y=abs(delta))) +
# geom_smooth(aes(color=as.factor(key_transition))) +
# geom_smooth(color="black") +
geom_point(aes(color=as.factor(key_transition))) +
scale_x_log10()
for (row_idx in 1:nrow(gene_compiled_df)) {
gene <- unlist(gene_compiled_df[row_idx, "gene"])
total <- unlist(gene_compiled_df[row_idx, "total_sites"])
start_pos <- unlist(gene_compiled_df[row_idx, "start_position"])
cluster <- unlist(gene_compiled_df[row_idx, "cluster"])
if (cluster <= 9) {
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% 11, "start_position"] - start_pos))
gene_compiled_df[row_idx, "min_dist_to_escapee"] <- min_distance
}
}
library(ggsignif)
ggplot(gene_compiled_df %>% filter((chr=="chrX") & (cluster %in% c(5,6,7,8,9)))) +
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) +
geom_boxplot(aes(x=cluster, group=cluster, y=min_dist_to_escapee)) +
geom_point(aes(x=cluster, y=min_dist_to_escapee)) +
geom_signif(aes(x=cluster, group=cluster, y=min_dist_to_escapee),
map_signif_level=F, comparisons = list(c(5,6), c(5,7), c(5,8), c(5,9))) +
scale_y_log10()
mean_distances <- gene_compiled_df %>% group_by(cluster) %>% summarise(
min_dist_to_escapee = mean(na.omit(min_dist_to_escapee)))
ggplot(gene_compiled_df[(gene_compiled_df$chr == "chrX") & (gene_compiled_df$cluster %in% c(5,6,7,8,9)),]) +
geom_bar()
geom_freqpoly(aes(start_position, colour=as.factor(cluster)), binwidth = 1500000) +
labs(x="Location on X Chr",
y='Frequency Count',
title='Location of X Chromosome CpG sites that are Decreasing in Methylation')
plt3 <- ggplot(all_dmp_df[(all_dmp_df$chr == "chrX") & (all_dmp_df$passes_threshold == F)& (all_dmp_df$delta_increasing == F),]) +
geom_freqpoly(aes(pos),binwidth = 1500000) +
labs(x="Location on X Chr",
y='Frequency Count',
title=sprintf('Location of X Chromosome CpG sites that are Decreasing in Methylation', i-1, i)) +
theme(plot.title = element_text(hjust = 0.5))+
scale_color_brewer(palette='Spectral')
```
## Probability of erosion based on distance to escapees
```{r}
chr_x_dmp_df_short_arm <- all_dmp_df %>% filter((chr == "chrX") & (pos <= 58100000))
shortarm_escapees <- gene_compiled_df %>% filter((cluster == 11) & (start_position <= 58100000))
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) {
pos <- unlist(chr_x_dmp_df_short_arm[row_idx, "pos"])
min_distance <- min(abs(shortarm_escapees$start_position - pos))
min_distance_end <- min(abs(shortarm_escapees$end_position - pos))
chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end)
q_val_col <- sprintf("transition_%d_qval", unlist( chr_x_dmp_df_short_arm[row_idx, "key_transition"]))
alt_q_val_col <- sprintf("transition_%d_qval", unlist( chr_x_dmp_df_short_arm[row_idx, "alt_key_transition"]))
chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end)
chr_x_dmp_df_short_arm[row_idx, "key_qval"] <- unlist(chr_x_dmp_df_short_arm[row_idx, q_val_col])
if (is.na(unlist( chr_x_dmp_df_short_arm[row_idx, "alt_key_transition"]))) {
chr_x_dmp_df_short_arm[row_idx, "alt_key_qval"] <- NA
}else{
chr_x_dmp_df_short_arm[row_idx, "alt_key_qval"] <- unlist(chr_x_dmp_df_short_arm[row_idx, alt_q_val_col])
}
}
transition_probabilities <- tibble(transition = numeric(),
distance = character(),
probability_of_sig_padj = numeric())
for (t in 1:5) {
for (dist_cutoff in 1:1e8) {
tran_col <- sprintf("transition_%d_qval", t)
#(!! ensym(tran_col) <= .1)
num_changing <- nrow(chr_x_dmp_df_short_arm %>% filter((min_dist_to_escapee <= dist_cutoff) & (!! ensym(tran_col) <= .1)))
num_total <- nrow(chr_x_dmp_df_short_arm %>% filter(min_dist_to_escapee <= dist_cutoff))
transition_probabilities <- add_row(transition_probabilities,
transition = t,
distance = dist_cutoff,
probability_of_sig_padj = num_changing / num_total)
}
}
ggplot(chr_x_dmp_df_short_arm %>% filter(min_dist_to_escapee < 50000 & delta < 0), aes(x = min_dist_to_escapee, y = delta)) +
geom_bin2d() +
facet_grid(key_transition ~ .)
ggplot(chr_x_dmp_df_short_arm, aes(x = min_dist_to_escapee, y = transition_2_qval)) +
geom_bin2d()
geom_line(aes(group = transition, color=as.factor(transition)))
```
## Contiguity search
```{r}
ggplot() +
geom_area(all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == F),
mapping = aes(x=hg38_pos,y = (..count..), fill="not passing threshold"), stat="bin", bins = 100) +
geom_area(all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T),
mapping = aes(x=hg38_pos,y = (..count..), fill="passing threshold"), stat="bin", bins = 100, alpha=.4) +
geom_area(all_dmp_df %>% filter(hg38_chr == "chrX"), mapping=aes(x=hg38_pos, y = (..count..)), stat="bin", bins = 100, fill="grey", alpha=.4) +
theme_bw() +
labs(x="X Chr Coordinate",
y="Percent of Sites",
title = "Probes that start from .4-.6") +
base_plot_theme
composition_passing_threshold <- all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & delta_increasing == F) %>% group_by(hg38_refgene_group) %>% tally() %>% mutate(pct=(100*n)/sum(n))
composition_not_passing_threshold <- all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == F) %>% group_by(hg38_refgene_group) %>% tally() %>% mutate(pct=(100*n)/sum(n))
composition <- full_join(composition_passing_threshold, composition_not_passing_threshold, by="hg38_refgene_group", suffix=c(".pass", ".notpass"))
ggplot() +
geom_area(all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & key_transition <= 2 & grepl("TSS200|5_UTR", hg38_refgene_group)),
mapping = aes(x=hg38_pos,y = (..count..)/sum(..count..), fill="early", color="early"), stat="bin", bins = 100, alpha=.4) +
geom_area(all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & key_transition >= 3 & grepl("TSS200|5_UTR", hg38_refgene_group)),
mapping = aes(x=hg38_pos,y = (..count..)/sum(..count..), fill="late", color="late"), stat="bin", bins = 100, alpha=.4 ) +
theme_bw() +
labs(x="X Chr Coordinate",
y="Percent of Sites",
title = "Probes that start from .4-.6") +
base_plot_theme +
scale_fill_viridis_d() +
scale_color_viridis_d()
early_promoter_probes <- all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & key_transition <= 2 & delta_increasing == F & grepl("TSS200|5_UTR", hg38_refgene_group))
late_promoter_probes <- all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & key_transition >= 3 & delta_increasing == F & grepl("TSS200|5_UTR", hg38_refgene_group))
early_genes_tally <- early_promoter_probes %>% group_by(hg38_gene) %>% tally()
late_genes_tally <- late_promoter_probes %>% group_by(hg38_gene) %>% tally()
early_genes <- unique(early_promoter_probes$hg38_gene)
late_genes <- unique(late_promoter_probes$hg38_gene)
combined_intersecting_tallies <- inner_join(early_genes_tally, late_genes_tally, by="hg38_gene")
```
## Grouping by genes
Making a version of all_dmp_df table, but for genes
```{r}
library(broom)
female_df_no_h9 <- read_csv("female_df_no_h9.csv")
diff_expr_all_df <- read_csv("diff_expr_all_df.csv")
# counts_all_df <- read_csv("counts_all_df.csv")
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv")
batch_corrected_vsd_df <- batch_corrected_vsd_df %>%
dplyr::select(-c(ensembl_id,
ensembl_id_no_version,
end_position,
start_position,
description,
strand,
chromosome_name))
samples_with_expr_data <- intersect(colnames(batch_corrected_vsd_df %>% dplyr::select(-hgnc_symbol)), colnames(female_df_no_h9 %>% dplyr::select(-rn)))
batch_corrected_vsd_df <- batch_corrected_vsd_df %>% dplyr::select(gene_name=hgnc_symbol, !!samples_with_expr_data)
female_df_no_h9_subset <- female_df_no_h9 %>% dplyr::select(rn, !!samples_with_expr_data)
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
female_df_no_h9_subset <- left_join(female_df_no_h9_subset, filtered_site_annotation_df %>%
dplyr::select(rowname, hg38_chromosome, hg38_pos, hg38_gene_group, hg38_gene_name), by=c("rn"="rowname"))
female_df_no_h9_subset_gened <- female_df_no_h9_subset %>%
filter(grepl("TSS1500|TSS200|5_UTR", hg38_gene_group)) %>%
group_by(hg38_gene_name) %>%
summarise_at(samples_with_expr_data,mean)
list_genes <- intersect(female_df_no_h9_subset_gened$hg38_gene_name, batch_corrected_vsd_df$gene_name)
concordance_df <- tibble(gene=character(), corr=numeric(), pval=numeric(), meth_min=numeric(),meth_max=numeric(),
expr_min=numeric(), expr_max=numeric())
for(gene in list_genes) {
gene_meth <- unlist(female_df_no_h9_subset_gened %>% filter(hg38_gene_name == gene) %>% dplyr::select(-hg38_gene_name))
gene_expr <- unlist(batch_corrected_vsd_df %>% filter(gene_name == gene) %>% filter(row_number()==1) %>% dplyr::select(-gene_name))
concordance_df <- add_row(concordance_df,
gene=gene,
corr=as.numeric(cor.test(gene_meth, gene_expr)$estimate),
pval=as.numeric(cor.test(gene_meth, gene_expr)$p.value),
meth_min=range(gene_meth)[1],
meth_max=range(gene_meth)[2],
expr_min=range(gene_expr)[1],
expr_max=range(gene_expr)[2])
}
gened_dmp_df <- all_dmp_df %>%
filter(grepl("TSS1500|TSS200|5_UTR", hg38_refgene_group)) %>%
mutate(tran_1_delta=cluster_1_mean-cluster_0_mean,
tran_2_delta=cluster_2_mean-cluster_1_mean,
tran_3_delta=cluster_3_mean-cluster_2_mean,
tran_4_delta=cluster_4_mean-cluster_3_mean,
tran_5_delta=cluster_5_mean-cluster_4_mean) %>%
group_by(hg38_gene) %>%
summarise(tran_1_mean_delta=mean(tran_1_delta),
tran_2_mean_delta=mean(tran_2_delta),
tran_3_mean_delta=mean(tran_3_delta),
tran_4_mean_delta=mean(tran_4_delta),
tran_5_mean_delta=mean(tran_5_delta),
tran_1_mean_qval=mean(transition_1_qval),
tran_2_mean_qval=mean(transition_2_qval),
tran_3_mean_qval=mean(transition_3_qval),
tran_4_mean_qval=mean(transition_4_qval),
tran_5_mean_qval=mean(transition_5_qval),
mean_delta=mean(delta),
probes_passing_threshold=sum(passes_threshold),
total_probes=n())
concordance_df <- left_join(concordance_df, gened_dmp_df %>% dplyr::select(hg38_gene, methylation_change=mean_delta, probes_passing_threshold, total_probes), by=c("gene"="hg38_gene"))
diff_expr_all_df$key_log2FC <- NA
diff_expr_all_df$first_transition <- NA
diff_expr_all_df$first_log2FC <- NA
for(row in 1:nrow(diff_expr_all_df)) {
key_transition <- as.numeric(unlist(diff_expr_all_df[row, "key_transition"]))
log2fc_col <- sprintf("transition_%d_log2FC", key_transition)
diff_expr_all_df[row, "key_log2FC"] <- as.numeric(unlist(diff_expr_all_df[row, log2fc_col]))
pvals <- unlist(diff_expr_all_df[row, c("transition_1_padj", "transition_2_padj", "transition_3_padj", "transition_4_padj")])
pvals <- pvals < .1
pvals[is.na(pvals)] <- FALSE
first_transition <- as.numeric(which(pvals)[1])
first_delta_col <- sprintf("transition_%d_log2FC", first_transition)
diff_expr_all_df[row, "first_transition"] <- first_transition
if (!is.na(first_transition)) {
diff_expr_all_df[row, "first_log2FC"] <- as.numeric(unlist(diff_expr_all_df[row, first_delta_col]))
}
}
concordance_df <- left_join(concordance_df, diff_expr_all_df %>%
dplyr::select(gene,
chr,
passes_expression_threshold=passes_threshold,
expr_key_transition=key_transition,
expr_key_log2FC=key_log2FC,
expr_first_sig_transition=first_transition,
expr_first_log2FC=first_log2FC),
by=c("gene"))
transition_columns <- c("tran_1_mean_qval", "tran_2_mean_qval", "tran_3_mean_qval", "tran_4_mean_qval", "tran_5_mean_qval")
gened_dmp_df$key_transition <- colnames(gened_dmp_df[transition_columns])[apply(gened_dmp_df[transition_columns], 1, which.min)]
gened_dmp_df$key_transition <- as.numeric(unlist(as.data.frame(t(as.data.frame(strsplit(gened_dmp_df$key_transition, '_'))))[2]))
gened_dmp_df$key_mean_qval <- 0
for (row in 1:nrow(gened_dmp_df)) {
qval_col <- sprintf("tran_%d_mean_qval", as.numeric(unlist(gened_dmp_df[row, "key_transition"])))
gened_dmp_df[row, "key_mean_qval"] <- gened_dmp_df[row, qval_col]
}
```
## Looking for enrichment using clusterProfiler
```{r}
library(tidyverse)
library(clusterProfiler)
library(msigdbr)
library(enrichplot)
library(ggsci)
library('org.Hs.eg.db')
library(ReactomePA)
library(meshes)
library(MeSH.Hsa.eg.db)
library(DOSE)
library(RDAVIDWebService)
library(grid)
library(gridExtra)
library(scales)
msig_df <- msigdbr(species = "Homo sapiens")
msig_t2g <- msig_df %>%
dplyr::select(gs_name, gene_symbol)
concordance_df <- read_csv("concordance_df.csv")
new_concordance_df <- read_csv("new_concordance_df.csv.gz")
grouped_concordance_df_filtered <- read_csv("grouped_concordance_df_filtered.csv") %>% filter(abs(max_tran_log2FC) >= .5)
enrichment_base_theme <- base_plot_theme +
theme(legend.key.width = unit(7, "mm"),
legend.text = element_text(size = 5),
legend.title = element_text(size = 5, face = "bold"))
plot_enrich <- function(df, title_name) {
enrichment_base_theme <- base_plot_theme +
theme(legend.key.width = unit(7, "mm"),
legend.text = element_text(size = 5),
legend.title = element_text(size = 5, face = "bold"))
if(is.null(df) | nrow(as_tibble(df)) == 0){
return(ggplot() + theme_void() + labs(title=title_name))
}else{
df_tib <- as_tibble(df) %>%
group_by(ID) %>%
mutate(gene_ratio_eval=eval(str2expression(GeneRatio))) %>%
ungroup()
df_tib <- df_tib %>%
mutate(wrapped_ID=wrap_format(20)(str_replace_all(ID, "_", " ")))
df_tib_levels <- (df_tib %>% arrange(gene_ratio_eval))$wrapped_ID
return(
dotplot(df) +
labs(title=title_name) +
theme_bw() +
enrichment_base_theme
# ggplot(df_tib) +
# geom_point(aes(x=gene_ratio_eval, y=factor(wrapped_ID, levels=msig_tib_levels), color=p.adjust, size=Count)) +
# scale_color_gsea(reverse=T) +
# labs(x="Gene Ratio",
# y="",
# title=title_name) +
# theme_bw() +
# base_plot_theme +
# theme(axis.title.y = element_blank()) +
# enrichment_base_theme
)
}
}
make_enrichment_plots <- function(background_genes, foreground_genes, plot_title, file_save_name, pvalueCutoff=.05){
background_entrez <- bitr(background_genes, "SYMBOL", "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
if(length(foreground_genes) > 0) {
foreground_entrez <- bitr(foreground_genes, "SYMBOL", "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID
msig_df <- enricher(foreground_genes, universe = background_genes, TERM2GENE = msig_t2g, pvalueCutoff=pvalueCutoff)
msig_tib <- as_tibble(msig_df) %>% mutate(Ontology="MSigDB")
# if(is.null(msig_df)) {
# msig_plt <- ggplot() + theme_void()
# }
# msig_plt <- plot_enrich(msig_df, "MSigDB")
kegg_df <- enrichKEGG(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff)
kegg_tib <- as_tibble(kegg_df) %>% mutate(Ontology="KEGG")
# kegg_plt <- plot_enrich(kegg_df, "KEGG")
go_df <- enrichGO(foreground_entrez, universe = background_entrez, OrgDb = org.Hs.eg.db, pvalueCutoff=pvalueCutoff)
go_tib <- as_tibble(go_df) %>% mutate(Ontology="GO")
# go_plt <- plot_enrich(go_df, "GO")
reactome_df <- enrichPathway(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff)
reactome_tib <- as_tibble(reactome_df) %>% mutate(Ontology="Reactome")
# reactome_plt <- plot_enrich(reactome_df, "Reactome")
mesh_gendoo_df <- enrichMeSH(foreground_entrez,
universe = background_entrez,
MeSHDb = "MeSH.Hsa.eg.db",
database='gendoo',
category = 'C',
minGSSize = 10, pvalueCutoff=pvalueCutoff)
mesh_gendoo_tib <- as_tibble(mesh_gendoo_df) %>% mutate(Ontology="MeSH - Gendoo")
# mesh_gen_plt <- plot_enrich(mesh_gendoo_df, "MeSH - Gendoo")
# mesh_pubmed_df <- enrichMeSH(foreground_entrez, universe = background_entrez, MeSHDb = "MeSH.Hsa.eg.db", database='gene2pubmed', category = 'C')
# mesh_pubmed_tib <- as_tibble(mesh_pubmed_df) %>% mutate(Ontology="MeSH - gene2pubmed")
# mesh_pub_plt <- plot_enrich(mesh_pubmed_df, "MeSH - gene2pubmed")
do_df <- enrichDO(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff)
do_tib <- as_tibble(do_df) %>% mutate(Ontology="Disease Ontology")
# do_plt <- plot_enrich(do_df, "Disease Ontology")
ncg_df <- enrichNCG(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff)
ncg_tib <- as_tibble(ncg_df) %>% mutate(Ontology="NCG")
# ncg_plt <- plot_enrich(ncg_df, "NCG")
dgn_df <- enrichDGN(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff)
dgn_tib <- as_tibble(dgn_df) %>% mutate(Ontology="DGN")
# dgn_plt <- plot_enrich(dgn_df, "DGN")
wpgmtfile <- system.file("wikipathways-20200310-gmt-Homo_sapiens.gmt", package="clusterProfiler")
wp2gene <- read.gmt("wikipathways-20200310-gmt-Homo_sapiens.gmt")
wp2gene <- wp2gene %>% tidyr::separate(term, c("name","version","wpid","org"), "%")
wpid2gene <- wp2gene %>% dplyr::select(wpid, gene) #TERM2GENE
wpid2name <- wp2gene %>% dplyr::select(wpid, name) #TERM2NAME
wiki_df <- enricher(foreground_entrez, universe = background_entrez, TERM2GENE = wpid2gene, TERM2NAME = wpid2name, pvalueCutoff=pvalueCutoff)
wiki_tib <- as_tibble(wiki_df) %>% mutate(Ontology="Wiki Pathways")
super_tib <- bind_rows(msig_tib, kegg_tib, go_tib,
reactome_tib, mesh_gendoo_tib,
do_tib, ncg_tib, dgn_tib, wiki_tib) #mesh_pubmed_tib
super_tib <- super_tib %>%
group_by(Description) %>%
mutate(gene_ratio_eval=eval(str2expression(GeneRatio))) %>%
ungroup()
super_tib <- super_tib %>%
mutate(wrapped_ID=wrap_format(20)(str_replace_all(ID, "_", " ")))
return(super_tib)
# gridded_plts <- arrangeGrob(msig_plt, kegg_plt, go_plt, reactome_plt, mesh_gen_plt, mesh_pub_plt, do_plt, ncg_plt, dgn_plt,
# top = textGrob(plot_title,gp=gpar(fontsize=6)))
# ggsave(file_save_name, gridded_plts, width = 236, height = 236, units = "mm")
}else{
print(sprintf("no genes for transition %d", t))
}
}
# Whole genome
background_genes <- concordance_df$gene
for(t in 1:5) {
foreground_genes <- filter(concordance_df, pval <= .05 & corr < 0 & expr_first_sig_transition == t)$gene
title <- sprintf("Whole Genome Transition %d - %d genes", t, length(foreground_genes))
save_name <- sprintf("figs/enrichment/whole_genome_tran_%d.png",t)
make_enrichment_plots(background_genes, foreground_genes, title, save_name)
}
# X Chr genome
x_background_genes <- filter(concordance_df, chr == "X")$gene
for(t in 1:5) {
x_foreground_genes <- filter(concordance_df, chr == "X" & pval <= .05 & corr < 0 & expr_first_sig_transition == t)$gene
title <- sprintf("X Chr Transition %d - %d genes", t, length(x_foreground_genes))
save_name <- sprintf("figs/enrichment/x_chr_tran_%d.png",t)
make_enrichment_plots(x_background_genes, x_foreground_genes, title, save_name)
}
# Non X Chr genome
nonx_background_genes <- filter(concordance_df, !(chr %in% c("X", "Y")))$gene
for(t in 1:5) {
nonx_foreground_genes <- filter(concordance_df, !(chr %in% c("X", "Y")) & pval <= .05 & corr < 0 & expr_first_sig_transition == t)$gene
title <- sprintf("Non-X Chr Transition %d - %d genes", t, length(nonx_foreground_genes))
save_name <- sprintf("figs/enrichment/nonx_chr_tran_%d.png",t)
make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name)
}
# expression only enrichments
# Whole genome with grouped_concordance_df_filtered
diff_expr_all_df <- read_csv("diff_expr_all_df.csv.gz")
genome_background_genes <- filter(diff_expr_all_df, !(chr %in% c("Y")))$gene
super_tib <- tibble(ID=character(),
Description=character(),
GeneRatio=character(),
BgRatio=character(),
pvalue=numeric(),
p.adjust=numeric(),
qvalue=numeric(),
geneID=character(),
Count=numeric(),
Ontology=character(),
gene_ratio_eval=numeric(),
wrapped_ID=character(),
Transition=numeric())
gene_counts <- tibble(description=character(),
count=numeric())
gene_counts <- add_row(gene_counts,
description="background",
count=length(genome_background_genes))
for(t in 1:5) {
genome_foreground_genes <- filter(diff_expr_all_df, !(chr %in% c("Y")) & key_transition == t & passes_threshold == T)$gene
if(length(genome_foreground_genes) >0){
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(genome_foreground_genes))
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_genome_chr_tran_%d.png",t)
tib <- make_enrichment_plots(genome_background_genes, genome_foreground_genes, title, save_name) %>% mutate(Transition=t)
super_tib <- bind_rows(super_tib, tib)
gene_counts <- add_row(gene_counts,
description=sprintf("foreground tran. %d", t),
count=length(genome_foreground_genes))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
}
}
write_csv(super_tib, "genome_enrichment_expr_genes.csv")
# Non X Chr genome with grouped_concordance_df_filtered
nonx_background_genes <- filter(diff_expr_all_df, !(chr %in% c("X", "Y")))$gene
super_tib <- tibble(ID=character(),
Description=character(),
GeneRatio=character(),
BgRatio=character(),
pvalue=numeric(),
p.adjust=numeric(),
qvalue=numeric(),
geneID=character(),
Count=numeric(),
Ontology=character(),
gene_ratio_eval=numeric(),
wrapped_ID=character(),
Transition=numeric())
gene_counts <- tibble(description=character(),
count=numeric())
gene_counts <- add_row(gene_counts,
description="background",
count=length(nonx_background_genes))
for(t in 1:5) {
nonx_foreground_genes <- filter(diff_expr_all_df, !(chr %in% c("X", "Y")) & key_transition == t & passes_threshold == T)$gene
if(length(nonx_foreground_genes) >0){
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes))
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t)
tib <- make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name) %>% mutate(Transition=t)
super_tib <- bind_rows(super_tib, tib)
gene_counts <- add_row(gene_counts,
description=sprintf("foreground tran. %d", t),
count=length(nonx_foreground_genes))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
}
}
write_csv(super_tib, "non_x_enrichment_expr_genes.csv")
x_background_genes <- filter(diff_expr_all_df, (chr %in% c("X")))$gene
super_tib <- tibble(ID=character(),
Description=character(),
GeneRatio=character(),
BgRatio=character(),
pvalue=numeric(),
p.adjust=numeric(),
qvalue=numeric(),
geneID=character(),
Count=numeric(),
Ontology=character(),
gene_ratio_eval=numeric(),
wrapped_ID=character(),
Transition=numeric())
gene_counts <- tibble(description=character(),
count=numeric())
gene_counts <- add_row(gene_counts,
description="background",
count=length(x_background_genes))
for(t in 1:5) {
x_foreground_genes <- filter(diff_expr_all_df, (chr %in% c("X")) & key_transition == t & passes_threshold == T)$gene
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes))
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t)
tib <- make_enrichment_plots(x_background_genes, x_foreground_genes, title, save_name) %>% mutate(Transition=t)
super_tib <- bind_rows(super_tib, tib)
gene_counts <- add_row(gene_counts,
description=sprintf("foreground tran. %d", t),
count=length(x_foreground_genes))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
}
write_csv(super_tib, "x_enrichment_expr_genes.csv")
# Whole genome with grouped_concordance_df_filtered
genome_background_genes <- filter(new_concordance_df, !(chr %in% c("Y")))$gene
super_tib <- tibble(ID=character(),
Description=character(),
GeneRatio=character(),
BgRatio=character(),
pvalue=numeric(),
p.adjust=numeric(),
qvalue=numeric(),
geneID=character(),
Count=numeric(),
Ontology=character(),
gene_ratio_eval=numeric(),
wrapped_ID=character(),
Transition=numeric())
gene_counts <- tibble(description=character(),
count=numeric())
gene_counts <- add_row(gene_counts,
description="background",
count=length(genome_background_genes))
for(t in 1:5) {
genome_foreground_genes <- filter(new_concordance_df, !(chr %in% c("Y")) & key_transition == t & passes_threshold == T & (atac_concordant == T | meth_concordant == T))$gene
if(length(genome_foreground_genes) >0){
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(genome_foreground_genes))
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_genome_chr_tran_%d.png",t)
tib <- make_enrichment_plots(genome_background_genes, genome_foreground_genes, title, save_name) %>% mutate(Transition=t)
super_tib <- bind_rows(super_tib, tib)
gene_counts <- add_row(gene_counts,
description=sprintf("foreground tran. %d", t),
count=length(genome_foreground_genes))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
}
}
write_csv(super_tib, "genome_enrichment_atac_meth_concordanct_genes.csv")
# Non X Chr genome with grouped_concordance_df_filtered
nonx_background_genes <- filter(new_concordance_df, !(chr %in% c("X", "Y")))$gene
super_tib <- tibble(ID=character(),
Description=character(),
GeneRatio=character(),
BgRatio=character(),
pvalue=numeric(),
p.adjust=numeric(),
qvalue=numeric(),
geneID=character(),
Count=numeric(),
Ontology=character(),
gene_ratio_eval=numeric(),
wrapped_ID=character(),
Transition=numeric())
gene_counts <- tibble(description=character(),
count=numeric())
gene_counts <- add_row(gene_counts,
description="background",
count=length(nonx_background_genes))
cats_of_interest <- c("HALLMARK_OXIDATIVE_PHOSPHORYLATION",
"MOOTHA_MITOCHONDRIA",
"GO_MITOCHONDRION",
"GO_MITOCHONDRIAL_PART",
"MOOTHA_HUMAN_MITODB_6_2002",
"GO_MITOCHONDRION_ORGANIZATION",
"GO_MITOCHONDRIAL_ENVELOPE",
"GO_ENERGY_DERIVATION_BY_OXIDATION_OF_ORGANIC_COMPOUNDS",
"GO_OXIDATION_REDUCTION_PROCESS",
"WONG_MITOCHONDRIA_GENE_MODULE")
for(t in 1:5) {
nonx_foreground_genes <- filter(new_concordance_df, !(chr %in% c("X", "Y")) & key_transition == t & passes_threshold == T & (atac_concordant == T | meth_concordant == T))$gene
if(length(nonx_foreground_genes) >0){
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes))
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t)
tib <- make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name, pvalueCutoff=1) %>% mutate(Transition=t)
super_tib <- bind_rows(super_tib, tib)
gene_counts <- add_row(gene_counts,
description=sprintf("foreground tran. %d", t),
count=length(nonx_foreground_genes))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
}
}
write_csv(super_tib, "non_x_enrichment_atac_meth_concordanct_genes.csv")
write_csv(super_tib, "non_x_enrichment_atac_meth_concordant_no_thresh.csv")
write_csv(super_tib %>% filter(ID %in% cats_of_interest), "non_x_enrichment_atac_meth_concordant_mitochondrial_alone.csv")
saved_tib <- super_tib
super_tib <- super_tib %>%
mutate(wrapped_ID=wrap_format(20)(str_replace_all(sprintf("%s", Description), "_", " ")))
wrapped_ID_levels <- (super_tib %>% arrange(Transition, gene_ratio_eval))$wrapped_ID
enrichment_nonx_plot <- ggplot(super_tib) +
geom_point(aes(x=gene_ratio_eval, y=factor(wrapped_ID, levels=unique(wrapped_ID_levels)), color=p.adjust, size=Count)) +
scale_color_gsea(reverse=T) +
scale_size(range = c(.1,4)) +
labs(x="Gene Ratio",
y="",
title= "Enrichment Analysis for Concordant Genes in Each Transition") +
theme_bw() +
base_plot_theme +
theme(axis.title.y = element_blank()) +
facet_grid( rows=vars(Transition), scales = "free_y", space="free_y", labeller = labeller(.rows=function(n){ #cols=vars(Ontology),
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 5")[as.character(n)])
})) +
scale_x_continuous(expand = c(.05,.05)) +
enrichment_base_theme +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
plot.title = element_blank())
ggsave("figs/enrichment/grouped_concordance_df_filtered_nonx_enrich_alt.svg", width = 38, height = 180, units = "mm")
# Non X Chr genome with expression diff_expr_f_vs_each
diff_expr_f_vs_each <- read_csv("diff_expr_f_vs_each.csv")
nonx_background_genes <- filter(diff_expr_f_vs_each, !(chromosome_name %in% c("X", "Y")))$gene
super_tib <- tibble(ID=character(),
Description=character(),
GeneRatio=character(),
BgRatio=character(),
pvalue=numeric(),
p.adjust=numeric(),
qvalue=numeric(),
geneID=character(),
Count=numeric(),
Ontology=character(),
gene_ratio_eval=numeric(),
wrapped_ID=character(),
Transition=numeric())
gene_counts <- tibble(description=character(),
count=numeric())
gene_counts <- add_row(gene_counts,
description="background",
count=length(nonx_background_genes))
# for(t in 1:5) {
nonx_foreground_genes <- filter(diff_expr_f_vs_each, !(chromosome_name %in% c("X", "Y")) & D_vs_F_padj <= .1)$gene
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes))
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t)
tib <- make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name) %>% mutate(Transition=t)
super_tib <- bind_rows(super_tib, tib)
gene_counts <- add_row(gene_counts,
description=sprintf("foreground tran. %d", t),
count=length(nonx_foreground_genes))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
# }
saved_tib <- super_tib
super_tib <- super_tib %>%
mutate(wrapped_ID=wrap_format(20)(str_replace_all(sprintf("%s", Description), "_", " ")))
wrapped_ID_levels <- (super_tib %>% arrange(Transition, gene_ratio_eval))$wrapped_ID
enrichment_nonx_plot <- ggplot(super_tib) +
geom_point(aes(x=gene_ratio_eval, y=factor(wrapped_ID, levels=unique(wrapped_ID_levels)), color=p.adjust, size=Count)) +
scale_color_gsea(reverse=T) +
scale_size(range = c(.1,4)) +
labs(x="Gene Ratio",
y="",
title= "Enrichment Analysis for Concordant Genes in Each Transition") +
theme_bw() +
base_plot_theme +
theme(axis.title.y = element_blank()) +
facet_grid( rows=vars(Transition), scales = "free_y", space="free_y", labeller = labeller(.rows=function(n){ #cols=vars(Ontology),
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 5")[as.character(n)])
})) +
scale_x_continuous(expand = c(.05,.05)) +
enrichment_base_theme +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
plot.title = element_blank())
x_background_genes <- filter(new_concordance_df, (chr %in% c("X")))$gene
super_tib <- tibble(ID=character(),
Description=character(),
GeneRatio=character(),
BgRatio=character(),
pvalue=numeric(),
p.adjust=numeric(),
qvalue=numeric(),
geneID=character(),
Count=numeric(),
Ontology=character(),
gene_ratio_eval=numeric(),
wrapped_ID=character(),
Transition=numeric())
gene_counts <- tibble(description=character(),
count=numeric())
gene_counts <- add_row(gene_counts,
description="background",
count=length(x_background_genes))
for(t in 1:5) {
x_foreground_genes <- filter(new_concordance_df, (chr %in% c("X")) & key_transition == t & passes_threshold == T & (atac_concordant == T | meth_concordant == T))$gene
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes))
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t)
tib <- make_enrichment_plots(x_background_genes, x_foreground_genes, title, save_name) %>% mutate(Transition=t)
super_tib <- bind_rows(super_tib, tib)
gene_counts <- add_row(gene_counts,
description=sprintf("foreground tran. %d", t),
count=length(x_foreground_genes))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
}
write_csv(super_tib, "x_enrichment_atac_meth_concordanct_genes.csv")
super_tib <- super_tib %>%
mutate(wrapped_ID=wrap_format(20)(str_replace_all(Description, "_", " ")))
super_tib_levels <- (super_tib %>% arrange(gene_ratio_eval))$wrapped_ID
enrichment_x_plot <- ggplot(super_tib) +
geom_point(aes(x=gene_ratio_eval, y=factor(wrapped_ID, levels=super_tib_levels), color=p.adjust, size=Count)) +
scale_color_gsea(reverse=T) +
labs(x="Gene Ratio",
y="",
title= "Enrichment Analysis for Concordant Genes in Each Transition - X Chr",
size="Gene\nCount") +
theme_bw() +
base_plot_theme +
theme(axis.title.y = element_blank()) +
facet_grid(rows=vars(Transition), scales = "free_y", space="free_y", labeller = labeller(.rows=function(n){ #cols=vars(Ontology),
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 5")[as.character(n)])
})) +
scale_x_continuous(expand = c(.05,.05)) +
enrichment_base_theme +
scale_size(range=c(.1,5)) +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
plot.title = element_blank())
ggsave("figs/enrichment/grouped_concordance_df_filtered_x_enrich_alt.svg", enrichment_x_plot, width = 80, height = 120, units = "mm")
## With key transition instead of first transition
# Whole genome
background_genes <- concordance_df$gene
for(t in 1:5) {
foreground_genes <- filter(concordance_df, pval <= .05 & corr < 0 & expr_key_transition == t)$gene
title <- sprintf("Whole Genome Transition %d - %d genes", t, length(foreground_genes))
save_name <- sprintf("figs/enrichment/key_whole_genome_tran_%d.png",t)
make_enrichment_plots(background_genes, foreground_genes, title, save_name)
}
# X Chr genome
x_background_genes <- filter(concordance_df, chr == "X")$gene
for(t in 1:5) {
x_foreground_genes <- filter(concordance_df, chr == "X" & pval <= .05 & corr < 0 & expr_key_transition == t)$gene
title <- sprintf("X Chr Transition %d - %d genes", t, length(x_foreground_genes))
save_name <- sprintf("figs/enrichment/key_x_chr_tran_%d.png",t)
make_enrichment_plots(x_background_genes, x_foreground_genes, title, save_name)
}
# Non X Chr genome
nonx_background_genes <- filter(concordance_df, !(chr %in% c("X", "Y")))$gene
for(t in 1:5) {
nonx_foreground_genes <- filter(concordance_df, !(chr %in% c("X", "Y")) & pval <= .05 & corr < 0 & expr_key_transition == t)$gene
title <- sprintf("Non-X Chr Transition %d - %d genes", t, length(nonx_foreground_genes))
save_name <- sprintf("figs/enrichment/key_nonx_chr_tran_%d.png",t)
make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name)
}
```
## Looking for gene counts in each transition from NCG
```{r}
cancer_genes_df <- read_csv("building_cancer_db/cosmic_cancer_gene_census.csv.gz")
# cancer_genes_df$type <- "unknown"
# cancer_genes_df[grepl("oncogene", cancer_genes_df$description), "type"] <- "oncogene"
# cancer_genes_df[grepl("tumour suppressor gene", cancer_genes_df$description), "type"] <- "tumour suppressor gene"
# grouped_concordance_df_filtered <- read_csv("grouped_concordance_df_filtered.csv.gz") %>% filter(abs(max_tran_log2FC) >= .5)
# concordance_df <- read_csv("concordance_df.csv")
new_concordance_df <- read_csv("new_concordance_df.csv.gz")
cancer_plot_df <- new_concordance_df %>% filter(passes_threshold == T & (atac_concordant | meth_concordant)) #grouped_concordance_df_filtered
cancer_plot_df <- left_join(cancer_plot_df, cancer_genes_df
%>% dplyr::select(`Gene Symbol`, `Role in Cancer`),
by=c("gene"="Gene Symbol")) %>%
filter(!is.na(`Role in Cancer`))
cancer_plot_df$cancer_gene_type <- "Other"
cancer_plot_df[grepl("oncogene", cancer_plot_df$`Role in Cancer`), "cancer_gene_type"] <- "Oncogene"
cancer_plot_df[grepl("TSG", cancer_plot_df$`Role in Cancer`), "cancer_gene_type"] <- "Tumor Suppressor"
cancer_plot_df[grepl("oncogene, TSG", cancer_plot_df$`Role in Cancer`), "cancer_gene_type"] <- "Oncogene + Tumor Suppressor"
cancer_plot_df$max_tran_label <- paste("Tran. ", cancer_plot_df$key_transition)
cancer_plot_df[cancer_plot_df$key_transition %in% c(4,5), "max_tran_label"] <- "Tran. 4/5"
cancer_plot_df <- cancer_plot_df %>% filter(chr != "Y")
cancer_genes_plot <- ggplot(cancer_plot_df, aes(x=(chr=="X"), y=delta, fill=cancer_gene_type)) +
# geom_boxplot(outlier.shape=NA, size=.3) +
geom_hline(yintercept = 0, size=.1)+
ggrepel::geom_text_repel( aes(label=gene, color=cancer_gene_type), size=1.2, position = position_dodge(width=.7), segment.size=.1) +
geom_point(pch = 21, position = position_dodge(width=.7), size=1.1, stroke=.1, alpha=.7) +
# geom_jitter(size=.7) +
facet_grid(cols = vars(max_tran_label)) +
scale_fill_manual(values = pal_simpsons()(5)[c(2,4,5,3)],
limits=c("Oncogene", "Oncogene + Tumor Suppressor", "Tumor Suppressor", "Other")) +
scale_color_manual(values = pal_simpsons()(5)[c(2,4,5,3)],
limits=c("Oncogene", "Oncogene + Tumor Suppressor", "Tumor Suppressor", "Other"),
guide=F) +
scale_x_discrete(breaks=c(F,T), labels=c("Autosomal", "X"), name="Chromosome") +
scale_shape(guide=F) +
labs(title="Cancer Genes Perturbed in Each Transition",
y="log2(Fold Change)",
fill="Role in Cancer") +
theme_bw() +
base_plot_theme +
theme(strip.text.y = element_text(size=5, margin = margin(0.5,0,0.5,0, "mm")),
strip.text.x = element_text(size=5, margin = margin(0.7,0.5,0.7,0.5, "mm")),
strip.background = element_rect(size = .3))
ggsave("figs/main/fig_4_cancer_genes.pdf", cancer_genes_plot, width = 116, height=53.27, units = "mm")
write_csv(cancer_plot_df %>% select(gene, chr, max_tran_log2FC,max_tran_label, cancer_gene_type), "cancer_plot_df.csv")
nonx_background_genes <- filter(concordance_df, !(chr %in% c("X", "Y")))$gene
nonx_gene_counts <- tibble(description=character(),
total=numeric(),
oncogenes=numeric(),
tumor_suppressors=numeric(),
other_cancer_genes=numeric())
nonx_gene_counts <- add_row(nonx_gene_counts,
description="background",
total=length(nonx_background_genes),
oncogenes=nrow(cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) &
`Gene Symbol` %in% nonx_background_genes)),
tumor_suppressors=nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) &
`Gene Symbol` %in% nonx_background_genes)),
other_cancer_genes=nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) &
`Gene Symbol` %in% nonx_background_genes)))
for(t in 1:5) {
nonx_foreground_genes_up <- filter(grouped_concordance_df_filtered, !(chr %in% c("X", "Y")) & max_transition == t & max_tran_log2FC >= 0)$gene
nonx_foreground_genes_down <- filter(grouped_concordance_df_filtered, !(chr %in% c("X", "Y")) & max_transition == t & max_tran_log2FC < 0)$gene
nonx_gene_counts <- add_row(nonx_gene_counts,
description=sprintf("Transition %d", t),
total=length(nonx_foreground_genes),
oncogenes=nrow(cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) &
`Gene Symbol` %in% nonx_foreground_genes_up)),
tumor_suppressors=nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) &
`Gene Symbol` %in% nonx_foreground_genes_up)),
other_cancer_genes=nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) &
`Gene Symbol` %in% nonx_foreground_genes_up)))
nonx_gene_counts <- add_row(nonx_gene_counts,
description=sprintf("Transition %d", t),
total=length(nonx_foreground_genes),
oncogenes=-1*nrow(cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) &
`Gene Symbol` %in% nonx_foreground_genes_down)),
tumor_suppressors=-1*nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) &
`Gene Symbol` %in% nonx_foreground_genes_down)),
other_cancer_genes=-1*nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) &
`Gene Symbol` %in% nonx_foreground_genes_down)))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
}
nonx_gene_counts <- nonx_gene_counts %>% mutate(all_cancer=oncogenes + tumor_suppressors + other_cancer_genes)
nonx_gene_counts$chrom <- "nonX"
x_background_genes <- filter(concordance_df, (chr %in% c("X")))$gene
x_gene_counts <- tibble(description=character(),
total=numeric(),
oncogenes=numeric(),
tumor_suppressors=numeric(),
other_cancer_genes=numeric())
x_gene_counts <- add_row(x_gene_counts,
description="background",
total=length(x_background_genes),
oncogenes=nrow((cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) &
`Gene Symbol` %in% x_background_genes))),
tumor_suppressors=nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) &
`Gene Symbol` %in% x_background_genes)),
other_cancer_genes=nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) &
`Gene Symbol` %in% x_background_genes)))
for(t in 1:5) {
x_foreground_genes_up <- filter(grouped_concordance_df_filtered, (chr %in% c("X")) & max_transition == t & max_tran_log2FC >= 0)$gene
x_foreground_genes_down <- filter(grouped_concordance_df_filtered, (chr %in% c("X")) & max_transition == t & max_tran_log2FC < 0)$gene
x_gene_counts <- add_row(x_gene_counts,
description=sprintf("Transition %d", t),
total=length(x_foreground_genes),
oncogenes=nrow((cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) &
`Gene Symbol` %in% x_foreground_genes_up))),
tumor_suppressors=nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) &
`Gene Symbol` %in% x_foreground_genes_up)),
other_cancer_genes=nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) &
`Gene Symbol` %in% x_foreground_genes_up)))
x_gene_counts <- add_row(x_gene_counts,
description=sprintf("Transition %d", t),
total=length(x_foreground_genes),
oncogenes=-1*nrow((cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) &
`Gene Symbol` %in% x_foreground_genes_down))),
tumor_suppressors=-1*nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) &
`Gene Symbol` %in% x_foreground_genes_down)),
other_cancer_genes=-1*nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) &
`Gene Symbol` %in% x_foreground_genes_down)))
# ggsave(save_name, plt, width = 87, height=59, units = "mm")
}
x_gene_counts <- x_gene_counts %>% mutate(all_cancer=oncogenes + tumor_suppressors + other_cancer_genes)
x_gene_counts$chrom <- "X"
all_counts <- reshape2::melt(nonx_gene_counts %>% select(description, all_cancer, tumor_suppressors,oncogenes,other_cancer_genes, chrom), id.vars=c("description", "chrom"))
all_counts <- full_join(all_counts,
reshape2::melt(x_gene_counts %>% select(description, all_cancer, tumor_suppressors,oncogenes,other_cancer_genes, chrom), id.vars=c("description", "chrom")),
by=c("description", "chrom", "variable", "value"))
cancer_gene_num_plot <- ggplot(all_counts %>% filter(description != "background" & variable != "all_cancer"), aes(x=chrom, y=value, fill=factor(variable, levels = c("oncogenes", "tumor_suppressors", "other_cancer_genes")))) +
geom_bar(stat="identity", position = "stack") +
facet_grid(cols = vars(description)) +
scale_fill_manual(values = pal_rickandmorty()(5)[3:5],
limits=c("oncogenes", "tumor_suppressors", "other_cancer_genes"),
labels=c("Oncogene", "Tumor Suppressor", "Other")) +
geom_hline(yintercept = 0, size=.1)+
theme_bw() +
base_plot_theme +
labs(title="Number of Cancer Genes Perturbed in Each Transition",
y="Number of Cancer Genes",
x="Chromosome",
fill="Type") +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3))
ggsave("figs/main/fig_4_num_cancer_genes.svg", cancer_gene_num_plot, width = 87, height=47, units = "mm")
```
## Getting sequences to look for TF binding enrichment (ELK-1)
```{r}
library(tidyverse)
concordance_df <- read_csv("concordance_df.csv")
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv")
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv", col_types = cols(chromosome_name="c"))
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id) %>%
mutate(chrom=paste("chr", chromosome_name, sep=""))
write_sequence_bed <- function(gene_names, start_buff=-600, end_buff=400) {
structure_subset_df <- hg38_biomart_structure_df_canonical %>%
filter(external_gene_name %in% gene_names) %>%
mutate(seq_start=transcription_start_site-(strand * start_buff),
seq_end=transcription_start_site+(strand * end_buff),
chromosome_name=sprintf("chr%s", chromosome_name)) %>%
select(external_gene_name, chromosome_name, transcription_start_site, strand, seq_start, seq_end) %>%
group_by(external_gene_name) %>%
filter(row_number()==1) %>%
ungroup()
return(structure_subset_df)
}
bg_genes <- (concordance_df %>% filter(!(chr %in% c("X", "Y"))))$gene
bg_bed <- write_sequence_bed(bg_genes)
write_tsv(bg_bed %>% select(chromosome_name, seq_start, seq_end), "meme-suite/autosomal_bg.bed", col_names = F)
```
## Plotting methylation and expression changes along the genome
```{r}
library(tidyverse)
library(ggplot2)
all_dmp_df <- read_csv("all_dmp_df.csv")
chrom_size <- function(chr) {
return(as.numeric(list(
"chr1" = 248956422,
"chr2" = 242193529,
"chr3" = 198295559,
"chr4" = 190214555,
"chr5" = 181538259,
"chr6" = 170805979,
"chr7" = 159345973,
"chr8" = 145138636,
"chr9" = 138394717,
"chr10" = 133797422,
"chr11" = 135086622,
"chr12" = 133275309,
"chr13" = 114364328,
"chr14" = 107043718,
"chr15" = 101991189,
"chr16" = 90338345,
"chr17" = 83257441,
"chr18" = 80373285,
"chr19" = 58617616,
"chr20" = 64444167,
"chr21" = 46709983,
"chr22" = 50818468,
"chrX" = 156040895,
"chrY" = 57227415
)[chr]))
}
window_average_genome <- function(all_dmp_df, window_size = 1000000, step_size = 10000, include_empty=FALSE) {
# bed_file_df <- bed_df
windowed_bed_df <- tibble(chr = character(),
pos_start = numeric(),
pos_end = numeric(),
delta_1 = numeric(),
delta_2 = numeric(),
delta_3 = numeric(),
delta_4 = numeric(),
delta_5 = numeric())
for (chr in c("chr1", "chr2", "chr3", "chr4", "chr5",
"chr6", "chr7", "chr8", "chr9",
"chr10", "chr11", "chr12", "chr13", "chr14",
"chr15", "chr16", "chr17", "chr18", "chr19",
"chr20", "chr21", "chr22", "chrX", "chrY")) {
print(chr)
chr_size <- chrom_size(chr)
for (s in 0:floor(chr_size / step_size)) {
start_position <- s * step_size
end_position <- min((start_position + window_size), chr_size)
# print(chr)
# print(s)
num_rows <- nrow(all_dmp_df %>% filter((hg38_chr == chr) & (hg38_pos >= start_position) & (hg38_pos <= end_position))) # (abs((!!start_position) - pos) <= (window_size/2))))
if ((num_rows > 0) | (include_empty == T)) {
window_mean_delta <- all_dmp_df %>%
filter((hg38_chr == chr) & (abs(!!start_position - hg38_pos) <= (window_size/2))) %>%
summarise(chr=first(hg38_chr),
pos_start=start_position,
pos_end=end_position,
mean_tran_1_delta=mean(cluster_1_mean-cluster_0_mean),
mean_tran_2_delta=mean(cluster_2_mean-cluster_0_mean),
mean_tran_3_delta=mean(cluster_3_mean-cluster_0_mean),
mean_tran_4_delta=mean(cluster_4_mean-cluster_0_mean),
mean_tran_5_delta=mean(cluster_5_mean-cluster_0_mean))
window_mean_delta[is.na(window_mean_delta)] <- 0
write_csv(window_mean_delta, "windowed_all_chr_deltas.csv", col_names = F, append = T)
# windowed_bed_df <- add_row(windowed_bed_df,
# chr=chr,
# pos_start=start_position,
# pos_end=end_position,
# delta_1=as.numeric(window_mean_delta$mean_tran_1_delta),
# delta_2=as.numeric(window_mean_delta$mean_tran_2_delta),
# delta_3=as.numeric(window_mean_delta$mean_tran_3_delta),
# delta_4=as.numeric(window_mean_delta$mean_tran_4_delta),
# delta_5=as.numeric(window_mean_delta$mean_tran_5_delta))
}
}
}
windowed_bed_df <- read_csv("windowed_all_chr_deltas.csv", col_names = c("chr",
"pos_start",
"pos_end",
"mean_tran_1_delta",
"mean_tran_2_delta",
"mean_tran_3_delta",
"mean_tran_4_delta",
"mean_tran_5_delta"))
return(windowed_bed_df)
}
whole_genome_cumulative <- window_average_genome(all_dmp_df %>% filter(passes_threshold == T))
whole_genome_plot_df <- whole_genome_cumulative %>% filter(chr != "0") %>%
reshape2::melt(id.vars=c("chr", "pos_start", "pos_end"))
library(ggridges)
ggplot(whole_genome_plot_df %>% filter(chr != "chrY"), aes(x=pos_start, y=0, group=variable, height=value, fill=variable, color=variable)) +
geom_ridgeline(min_height=-1, alpha=.2) +
facet_grid(rows = vars(factor(chr, levels = c("chr1", "chr2", "chr3", "chr4", "chr5",
"chr6", "chr7", "chr8", "chr9",
"chr10", "chr11", "chr12", "chr13", "chr14",
"chr15", "chr16", "chr17", "chr18", "chr19",
"chr20", "chr21", "chr22", "chrX", "chrY")))) +
scale_fill_viridis_d() +
scale_color_viridis_d()
ggplot(all_dmp_df %>% mutate(tran_1_diff=cluster_1_mean-cluster_0_mean), aes(x=hg38_pos, y=tran_1_diff)) +
geom_area(mapping=aes(fill=tran_1_diff < 0))+
facet_wrap(vars(hg38_chr), shrink = F) +
scale_y_continuous(limits=c(-.5, .5))
ggplot(all_dmp_df %>% mutate(tran_5_diff=cluster_5_mean-cluster_0_mean), aes(x=hg38_pos, y=tran_5_diff)) +
geom_area(mapping=aes(fill=tran_5_diff < 0))+
facet_wrap(vars(hg38_chr), shrink = F)+
scale_y_continuous(limits=c(-.5, .5))
global_meth_df <- all_dmp_df %>% group_by(hg38_chr) %>% summarise(mean_0=mean(cluster_0_mean),
mean_1=mean(cluster_1_mean),
mean_2=mean(cluster_2_mean),
mean_3=mean(cluster_3_mean),
mean_4=mean(cluster_4_mean),
mean_5=mean(cluster_5_mean))
global_meth_df <- reshape2::melt(global_meth_df, id.vars=c("hg38_chr"))
global_meth_df <- reshape2::melt(all_dmp_df, id.vars=c("hg38_chr", "hg38_pos", "rowname"), measure.vars=c("cluster_0_mean",
"cluster_1_mean",
"cluster_2_mean",
"cluster_3_mean",
"cluster_4_mean",
"cluster_5_mean"))
library(ggsignif)
ggplot(global_meth_df %>% filter(hg38_chr != "chrY"), aes(x=variable, y=value)) +
facet_wrap(vars(hg38_chr))+
geom_violin() +
theme_bw() +
scale_y_continuous(limits=c(0,1.2))
scale_fill_uchicago()
geom_signif(comparisons = list(c("cluster_0_mean","cluster_1_mean"),
c("cluster_1_mean","cluster_2_mean"),
c("cluster_2_mean","cluster_3_mean"),
c("cluster_3_mean","cluster_4_mean"),
c("cluster_4_mean","cluster_5_mean")), map_signif_level = T)
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv')
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
statistics_combined_df <- left_join(statistics, filtered_site_annotation_df %>% dplyr::select(rowname,hg38_chromosome, hg38_gene_name, hg38_gene_group), by=c("Position"="rowname"))
interesting_probes <- (statistics_combined_df %>% filter(alpha <= 0.05 & femaleVariance > maleVariance))$Position
ggplot(global_meth_df %>% filter(rowname %in% interesting_probes), aes(x=variable, y=value)) +
facet_wrap(vars(hg38_chr))+
geom_boxplot() +
theme_bw() +
scale_y_continuous(limits=c(0,1.2))
```
## concordance score with expression changes
```{r}
diff_expr_all_df <- read_csv("diff_expr_all_df.csv")
gened_direction_match <- inner_join(gened_dmp_df, diff_expr_all_df, by=c("hg38_gene"="gene")) %>%
mutate(tran_1_direction_match=(tran_1_mean_delta*transition_1_log2FC) < 0,
tran_2_direction_match=(tran_2_mean_delta*transition_2_log2FC) < 0,
tran_3_direction_match=(tran_3_mean_delta*transition_3_log2FC) < 0,
tran_4_direction_match=(tran_4_mean_delta*transition_4_log2FC) < 0,
tran_5_direction_match=(tran_5_mean_delta*transition_5_log2FC) < 0)
gened_direction_match$concordant <- F
for (row in 1:nrow(gened_direction_match)) {
direction_col <- sprintf("tran_%d_direction_match", as.numeric(unlist(gened_direction_match[row, "key_transition.x"])))
gened_direction_match[row, "concordant"] <- gened_direction_match[row, direction_col] & as.numeric(unlist(gened_direction_match[row, "key_mean_qval"])) <= 0.001
}
```
## Exploring global demethylation with DUSP9 demethylation
```{r}
female_df_no_h9 <- read_csv("female_df_no_h9.csv")
male_df_no_outliers <- read_csv("male_df_no_outliers.csv")
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
dusp9_probes <- (filtered_site_annotation_df %>% filter(hg38_gene_name == "DUSP9" & grepl("TSS1500|TSS200|5_UTR", hg38_gene_group)))$rowname
autosome_probes <- (filtered_site_annotation_df %>% filter(!(hg38_chromosome %in% c("chrX", "chrY"))))$rowname
mean_genome_methylation_female <- tidy(colMeans(dplyr::select(female_df_no_h9 %>% filter(rn %in% autosome_probes), -rn)))
mean_dusp9_methylation_female <- tidy(colMeans(dplyr::select(female_df_no_h9 %>% filter(rn %in% dusp9_probes), -rn)))
dusp9_v_genome_female <- full_join(mean_genome_methylation_female %>% rename(genome_methylation=x),
mean_dusp9_methylation_female %>% rename(dusp9_methylation=x), by="names")
dusp9_v_genome_female$sex <- "female"
mean_genome_methylation_male <- tidy(colMeans(dplyr::select(male_df_no_outliers %>% filter(rn %in% autosome_probes), -rn)))
mean_dusp9_methylation_male <- tidy(colMeans(dplyr::select(male_df_no_outliers %>% filter(rn %in% dusp9_probes), -rn)))
dusp9_v_genome_male <- full_join(mean_genome_methylation_male %>% rename(genome_methylation=x),
mean_dusp9_methylation_male %>% rename(dusp9_methylation=x), by="names")
dusp9_v_genome_male$sex <- "male"
dusp9_v_genome_combined <- full_join(dusp9_v_genome_female, dusp9_v_genome_male %>% mutate(dusp9_methylation= dusp9_methylation/ 2), by=c("names", "genome_methylation", "dusp9_methylation", "sex"))
ggplot(dusp9_v_genome_combined, aes(x=dusp9_methylation, y=genome_methylation, color=sex)) +
geom_point() +
theme_bw() +
labs(title="Genome Methylation vs DUSP9 Methylation") +
scale_color_npg()
ggplot(dusp9_v_genome_combined %>% filter(dusp9_methylation < .2)) +
geom_boxplot(aes(x=sex, y=genome_methylation, fill=sex)) +
scale_fill_npg() +
labs(title="Genome Methylation when DUSP9 methylation < .4")
update_geom_defaults("boxplot",
list(size=.3,
outlier.shape=NA))
update_geom_defaults("point",
list(shape=20,
size=.5))
dusp9_expression <- plot_gene_expression("DUSP9") +
scale_y_continuous() +
theme_bw() +
base_plot_theme
elk1_expression <- plot_gene_expression("ELK1") +
scale_y_continuous() +
theme_bw() +
base_plot_theme
dusp9_elk1_expression <- plot_gene_expression(c("DUSP9", "ELK1"), multiple=T) +
theme_bw() +
base_plot_theme +
theme(strip.text.x = element_text(size=5, margin = margin(.5,0,.5,0, "mm")),
strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")))
dusp9_elk1_methylation <- make_subset_heatmap(c("DUSP9", "ELK1")) +
theme(strip.text.x = element_text(size=5, margin = margin(.5,0,.5,0, "mm")),
strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")),
legend.key.height = unit(3, "mm"),
legend.title = element_text(size=5,vjust = .9),
legend.text = element_text(size=4),
legend.box.margin = margin(t=4,b=-4, unit="mm"),
plot.margin = margin(t=-4, unit = "mm"),
axis.title.x = element_text(size=5),
axis.title.y = element_text(size = 0, margin = margin(0,0,0,0)))
arranged_dusp9_elk1 <- grid.arrange(dusp9_elk1_methylation,
dusp9_expression,
elk1_expression,
layout_matrix=rbind(c(1,2), c(1,3)), nrow=2)
ggsave("figs/supplementary/supp_4_dusp9_elk1_meth.pdf", dusp9_elk1_methylation, width = 87, height=58, units = "mm")
ggsave("figs/supplementary/supp_4_dusp9_elk1_expr.pdf", dusp9_elk1_expression, width = 87, height=58, units = "mm")
ggsave("figs/main/fig_4_dusp9_elk1_araf_meth.svg", dusp9_elk1_methylation, width = 76, height=58, units = "mm")
ggsave("figs/main/fig_4_dusp9_elk1_araf_expr.svg", dusp9_elk1_expression, width = 40, height=58, units = "mm")
ggsave("figs/main/fig_4_dusp9_expr.pdf", dusp9_expression, width = 40, height=48, units = "mm")
ggsave("figs/main/fig_4_elk1_expr.pdf", elk1_expression, width = 40, height=29, units = "mm")
dusp9_methylation <- make_subset_heatmap(c("DUSP9")) +
theme(strip.text.x = element_text(size=5, margin = margin(.5,0,.5,0, "mm")),
strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")),
legend.key.height = unit(3, "mm"),
legend.title = element_text(size=5,vjust = .9),
legend.text = element_text(size=4),
legend.box.margin = margin(t=4,b=-4, unit="mm"),
plot.margin = margin(t=-4, unit = "mm"),
axis.title.x = element_text(size=5),
axis.title.y = element_text(size = 0, margin = margin(0,0,0,0)))
ggsave("figs/main/fig_4_dusp9_meth.pdf", dusp9_methylation, width = 76, height=48, units = "mm")
ggsave("figs/main/fig_4_dusp9_expr.pdf", dusp9_expression, width = 40, height=48, units = "mm")
elk1_methylation <- make_subset_heatmap(c("ELK1")) +
theme(strip.text.x = element_text(size=5, margin = margin(.5,0,.5,0, "mm")),
strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")),
legend.key.height = unit(3, "mm"),
legend.title = element_text(size=5,vjust = .9),
legend.text = element_text(size=4),
legend.box.margin = margin(t=4,b=-4, unit="mm"),
plot.margin = margin(t=-4, unit = "mm"),
axis.title.x = element_text(size=5),
axis.title.y = element_text(size = 0, margin = margin(0,0,0,0)))
ggsave("figs/supplementary/supp_4_elk1_meth.pdf", elk1_methylation, width = 87, height=48, units = "mm")
ggsave("figs/supplementary/supp_4_elk1_expr.pdf", elk1_expression, width = 87, height=48, units = "mm")
```
## Global methylation changes across the chromosomes
```{r}
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
plotting_df <- all_dmp_df %>% filter(passes_threshold == T)# & abs(delta) >= .15)
plotting_df$chr_alt <- gsub("chr", "", plotting_df$hg38_chr)
plotting_df$chr_alt <- gsub("X", "23", plotting_df$chr_alt)
plotting_df$chr_alt <- as.numeric(plotting_df$chr_alt)
chromosomes_df <- tibble(chromosome=c(1, 2, 3, 4, 5, 6, 7, 23, 8,
9, 11, 10, 12, 13, 14, 15,
16, 17, 18, 20, 19, 24, 22, 21),
size=c(248956422, 242193529, 198295559, 190214555,
181538259, 170805979, 159345973, 156040895,
145138636, 138394717, 135086622, 133797422,
133275309, 114364328, 107043718, 101991189,
90338345, 83257441, 80373285, 64444167, 58617616,
57227415, 50818468, 46709983)) %>% filter(chromosome != 24)
#alpha=abs(delta)
# tran_1_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 1)) +
# geom_density(aes(x=abs(delta)), fill="black") +
# theme_bw() +
# base_plot_theme +
# scale_y_continuous(breaks = c(0,5,10)) +
# scale_x_continuous(breaks = c(0,.2,.4)) +
# labs(title="Autosomal β Density") +
# theme(axis.ticks.length = unit(.4, "mm"),
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"),
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")),
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")),
# plot.title = element_text(size=3.5))
# ggsave("figs/supplementary/supp_3_tran_1_density.pdf", tran_1_density, width = 18, height=12, units = "mm")
# tran_2_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 2)) +
# geom_density(aes(x=abs(delta)), fill="black") +
# theme_bw() +
# base_plot_theme +
# scale_y_continuous(breaks = c(0,5,10)) +
# scale_x_continuous(breaks = c(0,.2,.4)) +
# labs(title="Autosomal β Density") +
# theme(axis.ticks.length = unit(.4, "mm"),
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"),
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")),
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")),
# plot.title = element_text(size=3.5))
# ggsave("figs/supplementary/supp_3_tran_2_density.pdf", tran_2_density, width = 18, height=12, units = "mm")
# tran_3_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 3)) +
# geom_density(aes(x=abs(delta)), fill="black") +
# theme_bw() +
# base_plot_theme +
# scale_y_continuous(breaks = c(0,7,15)) +
# scale_x_continuous(breaks = c(0,.2,.4)) +
# labs(title="Autosomal β Density") +
# theme(axis.ticks.length = unit(.4, "mm"),
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"),
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")),
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")),
# plot.title = element_text(size=3.5))
# ggsave("figs/supplementary/supp_3_tran_3_density.pdf", tran_3_density, width = 18, height=12, units = "mm")
# tran_4_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 4)) +
# geom_density(aes(x=abs(delta)), fill="black") +
# theme_bw() +
# base_plot_theme +
# scale_y_continuous(breaks = c(0,7,15)) +
# scale_x_continuous(breaks = c(0,.2,.4)) +
# labs(title="Autosomal β Density") +
# theme(axis.ticks.length = unit(.4, "mm"),
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"),
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")),
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")),
# plot.title = element_text(size=3.5))
# ggsave("figs/supplementary/supp_3_tran_4_density.pdf", tran_4_density, width = 18, height=12, units = "mm")
# tran_5_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 5)) +
# geom_density(aes(x=abs(delta)), fill="black") +
# theme_bw() +
# base_plot_theme +
# scale_y_continuous(breaks = c(0,10,20)) +
# scale_x_continuous(breaks = c(0,.2,.4, .6)) +
# labs(title="Autosomal β Density") +
# theme(axis.ticks.length = unit(.4, "mm"),
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"),
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")),
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")),
# plot.title = element_text(size=3.5))
# ggsave("figs/supplementary/supp_3_tran_5_density.pdf", tran_5_density, width = 18, height=12, units = "mm")
global_meth_changes <- ggplot(plotting_df) +
geom_rect(data=chromosomes_df,
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
geom_segment(aes(y=hg38_pos, x=chr_alt-.4, yend=hg38_pos, xend=chr_alt+.4, color=delta_increasing, alpha=abs(delta)), size=.3) +
facet_grid(rows = vars(key_transition), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 5")[n])
})) +
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(.01,.01)) +
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) +
scale_color_aaas(limits=c(F,T),labels=c("Decreasing", "Increasing"), name="Direction of Change") +
labs(x="Chromosome",
y="Position (Mb)",
title="DNAme Changes on all chromosomes") +
theme_bw() +
coord_flip() +
base_plot_theme +
scale_alpha(limits=c(0,.5), range=c(0,.3), oob=scales::squish, breaks=c(0,.25,.5)) +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
legend.box="vertical",
legend.box.just="right",
legend.box.spacing = unit(5, "mm"),
legend.spacing.y=unit(1, "mm"))
ggsave("figs/supplementary/supp_3a.pdf",global_meth_changes, width=174/3, height=236, units="mm")
global_meth_changes_tran45 <- ggplot(plotting_df %>% filter(key_transition >= 4) %>% mutate(key_transition = 5)) +
geom_rect(data=chromosomes_df,
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
geom_segment(aes(y=hg38_pos, x=chr_alt-.4, yend=hg38_pos, xend=chr_alt+.4, color=delta_increasing, alpha=abs(delta)), size=.3) +
facet_grid(rows = vars(key_transition), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4/5",
"5"="Tran. 4/5")[n])
})) +
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(0.01,0.01)) +
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) +
coord_flip() +
scale_color_aaas(limits=c(F,T),labels=c("Decreasing", "Increasing"), name="Direction of Change") +
labs(x="Chromosome",
y="Position (Mb)",
title="DNAme Changes") +
theme_bw() +
base_plot_theme +
scale_alpha(limits=c(0,.5), range=c(0,.3), oob=scales::squish, breaks=c(0,.25,.5)) +
theme(strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")),
strip.text.x = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
legend.box="vertical",
legend.box.just="right",
legend.box.spacing = unit(3.5, "mm"),
legend.spacing.y=unit(.2, "mm"),
axis.text.y=element_text(size=5))
ggsave("figs/main/fig_4_global_changes_meth.pdf", global_meth_changes_tran45, width = 58, height=47, units = "mm")
diff_expr_all_df <- read_csv("diff_expr_all_df.csv.gz")
grouped_concordance_df_filtered <- read_csv("grouped_concordance_df_filtered.csv.gz")
expr_plotting_df <- grouped_concordance_df_filtered # diff_expr_all_df #%>% filter(passes_threshold == T)
expr_plotting_df$chr_alt <- gsub("X", "23", expr_plotting_df$chr)
expr_plotting_df$chr_alt <- as.numeric(expr_plotting_df$chr_alt)
# expr_plotting_df_melt <- expr_plotting_df %>%
# select("chr_alt",
# "ens_id",
# "gene",
# "start_position",
# "end_position",
# "transition_1_log2FC",
# "transition_2_log2FC",
# "transition_3_log2FC",
# "transition_4_log2FC") %>%
# reshape2::melt(id.vars=c("chr_alt",
# "ens_id",
# "gene",
# "start_position",
# "end_position"))
chromosomes_df <- tibble(chromosome=c(1, 2, 3, 4, 5, 6, 7, 23, 8,
9, 11, 10, 12, 13, 14, 15,
16, 17, 18, 20, 19, 24, 22, 21),
size=c(248956422, 242193529, 198295559, 190214555,
181538259, 170805979, 159345973, 156040895,
145138636, 138394717, 135086622, 133797422,
133275309, 114364328, 107043718, 101991189,
90338345, 83257441, 80373285, 64444167, 58617616,
57227415, 50818468, 46709983)) %>% filter(chromosome != 24)
global_expr_changes <- ggplot(expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5)) +
geom_rect(data=chromosomes_df,
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=max_tran_log2FC >= 0), size=.3) +
facet_grid(rows = vars(max_transition), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 5")[n])
})) +
scale_x_continuous(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(.01,.01)) +
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) +
scale_color_aaas(limits=c(T,F),labels=c("Increasing", "Decreasing"), name="Direction of Expression Change") +
labs(x="Chromosome",
y="Position (Mb)",
title="Expression/Methylation Concordant Changes on all chromosomes") +
theme_bw() +
base_plot_theme +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
legend.box.spacing = unit(5, "mm"),
legend.spacing.y=unit(1, "mm"))
ggsave("figs/supplementary/supp_3b.pdf", global_expr_changes, width=85, height=236, units="mm")
expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5) %>%
count(max_transition, max_tran_log2FC >= 0, chr == "X") %>%
arrange(max_transition, desc(`max_tran_log2FC >= 0`)) %>%
View()
global_expr_changes_tran45 <- ggplot(expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5 & max_transition >= 4) %>% mutate(max_transition = 5)) +
geom_rect(data=chromosomes_df,
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=max_tran_log2FC >= 0), size=.3) +
facet_grid(rows = vars(max_transition), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4/5",
"5"="Tran. 4/5")[n])
})) +
scale_x_continuous(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL) +
scale_y_continuous(expand=c(0,0)) +
scale_color_aaas(limits=c(T,F),labels=c("Increasing", "Decreasing"), name="Direction of Expression Change") +
labs(x="Chromosome",
y="Position (Mb)",
title="Expression/Methylation Concordant Changes on all chromosomes") +
theme_bw() +
base_plot_theme +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3))
ggsave("figs/main/fig_4_global_changes_expr.svg", global_expr_changes_tran45, width = 58, height=47, units = "mm")
expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5 & max_transition >= 4) %>% mutate(max_transition = 5) %>%
count(max_transition, max_tran_log2FC >= 0, chr == "X") %>%
View()
# global_expr_changes_tran5 <- ggplot(expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5 & max_transition == 5)) +
# geom_rect(data=chromosomes_df,
# aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
# geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=max_tran_log2FC >= 0), size=.3) +
# facet_grid(rows = vars(max_transition), labeller = labeller(.rows=function(n){
# return(list("1"="Tran. 1",
# "2"="Tran. 2",
# "3"="Tran. 3",
# "4"="Tran. 4",
# "5"="Tran. 5")[n])
# })) +
# scale_x_continuous(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL) +
# scale_y_continuous(expand=c(0,0)) +
# scale_color_aaas(limits=c(T,F),labels=c("Increasing", "Decreasing"), name="Direction of Expression Change") +
# labs(x="Chromosome",
# y="Position (Mb)",
# title="Expression/Methylation Concordant Changes on all chromosomes") +
# theme_bw() +
# base_plot_theme +
# theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
# strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
# strip.background = element_rect(size = .3))
# ggsave("figs/main/fig_4_global_changes.svg", global_expr_changes_tran5, width = 87, height=47, units = "mm")
# global_expr_changes_alt <- ggplot(expr_plotting_df_melt %>% filter(abs(value) >= 1)) +
# geom_rect(data=chromosomes_df,
# aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
# geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=value>0), alpha=.3) +
# facet_grid(rows = vars(variable)) +
# scale_x_continuous(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL) +
# scale_y_continuous(expand=c(0,0)) +
# scale_color_aaas(limits=c(T,F),labels=c("Increasing", "Decreasing"), name="Direction of Change") +
# labs(x="Chromosome",
# y="Position (Mb)",
# title="Expression Changes on all chromosomes") +
# theme_bw() +
# base_plot_theme
```
## Grouping methylation by gene
```{r}
all_dmp_df <- read_csv("all_dmp_df.csv")
concordance_df <- read_csv("concordance_df.csv")
diff_expr_all_df <- read_csv("diff_expr_all_df.csv")
library(tidyr)
grouped_df <- all_dmp_df %>% group_by(hg38_gene) %>%
summarise(hg38_chr=first(hg38_chr),
all_probes=length(key_transition),
transition_1_probes=sum(key_transition == 1 & passes_threshold == T),
transition_1_delta=sum(key_transition == 1 & passes_threshold == T),
transition_2_probes=sum(key_transition == 2 & passes_threshold == T),
transition_3_probes=sum(key_transition == 3 & passes_threshold == T),
transition_4_probes=sum(key_transition == 4 & passes_threshold == T),
transition_5_probes=sum(key_transition == 5 & passes_threshold == T))
gene_dmp_df <- all_dmp_df %>%
group_by(hg38_gene) %>%
summarise(cluster_0_mean_meth=mean(cluster_0_mean),
cluster_1_mean_meth=mean(cluster_1_mean),
cluster_2_mean_meth=mean(cluster_2_mean),
cluster_3_mean_meth=mean(cluster_3_mean),
cluster_4_mean_meth=mean(cluster_4_mean),
cluster_5_mean_meth=mean(cluster_5_mean))
grouped_df_test <- left_join(grouped_df, gene_dmp_df, by=c("hg38_gene"))
grouped_df_promoter <- all_dmp_df %>% group_by(hg38_gene) %>%
filter(hg38_refgene_group %in% c("TSS1500", "TSS200", "5_UTR")) %>%
summarise(hg38_chr=first(hg38_chr),
all_probes=length(key_transition),
transition_1_probes=sum(key_transition == 1 & passes_threshold == T),
transition_1_delta=sum(key_transition == 1 & passes_threshold == T),
transition_2_probes=sum(key_transition == 2 & passes_threshold == T),
transition_3_probes=sum(key_transition == 3 & passes_threshold == T),
transition_4_probes=sum(key_transition == 4 & passes_threshold == T),
transition_5_probes=sum(key_transition == 5 & passes_threshold == T))
gene_dmp_df_promoter <- all_dmp_df %>%
filter(hg38_refgene_group %in% c("TSS1500", "TSS200", "5_UTR")) %>%
group_by(hg38_gene) %>%
summarise(cluster_0_mean_meth=mean(cluster_0_mean),
cluster_1_mean_meth=mean(cluster_1_mean),
cluster_2_mean_meth=mean(cluster_2_mean),
cluster_3_mean_meth=mean(cluster_3_mean),
cluster_4_mean_meth=mean(cluster_4_mean),
cluster_5_mean_meth=mean(cluster_5_mean))
grouped_df_test_promoter <- left_join(grouped_df_promoter, gene_dmp_df_promoter, by=c("hg38_gene"))
eroding_methylation_genes <- (grouped_df_test_promoter %>% filter(hg38_chr == "chrX" & (transition_1_probes+transition_2_probes+transition_3_probes+transition_4_probes+transition_5_probes != 0) &
cluster_0_mean_meth > .3 &
cluster_0_mean_meth < .7))$hg38_gene
concordance_eroding_df <- concordance_df %>% filter(gene %in% eroding_methylation_genes)
melted_group_df <- reshape2::melt(grouped_df %>% select(hg38_gene,
transition_1_probes,
transition_2_probes,
transition_3_probes,
transition_4_probes,
transition_5_probes),
id.vars=c("hg38_gene")) %>%
group_by(hg38_gene) %>%
arrange(desc(value)) %>%
summarise(max_transition=first(variable),
max_probes=first(value)) %>%
mutate(max_transition=as.numeric(gsub("transition_|_probes", "", max_transition)))
grouped_concordance_df <- left_join(concordance_df, grouped_df, by=c("gene"="hg38_gene"))
grouped_concordance_df <- left_join(grouped_concordance_df, melted_group_df, by=c("gene"="hg38_gene"))
grouped_concordance_df <- left_join(grouped_concordance_df, diff_expr_all_df %>%
select(gene,
chr,
start_position,
end_position,
transition_1_log2FC,
transition_2_log2FC,
transition_3_log2FC,
transition_4_log2FC,
transition_1_padj,
transition_2_padj,
transition_3_padj,
transition_4_padj))
grouped_concordance_df_dmp <- left_join(grouped_concordance_df, all_dmp_df %>%
select(hg38_gene,
cluster_0_mean_meth=cluster_0_mean,
cluster_1_mean_meth=cluster_1_mean,
cluster_2_mean_meth=cluster_2_mean,
cluster_3_mean_meth=cluster_3_mean,
cluster_4_mean_meth=cluster_4_mean,
cluster_5_mean_meth=cluster_5_mean,
probe_key_transition=key_transition),
by=c("gene"="hg38_gene")) %>%
filter(max_transition == probe_key_transition) %>%
group_by(gene) %>%
summarise(cluster_0_mean_meth=mean(cluster_0_mean_meth),
cluster_1_mean_meth=mean(cluster_1_mean_meth),
cluster_2_mean_meth=mean(cluster_2_mean_meth),
cluster_3_mean_meth=mean(cluster_3_mean_meth),
cluster_4_mean_meth=mean(cluster_4_mean_meth),
cluster_5_mean_meth=mean(cluster_5_mean_meth))
grouped_concordance_df <- left_join(grouped_concordance_df,
grouped_concordance_df_dmp,
by=c("gene"))
grouped_concordance_df$max_meth_delta <- 0
grouped_concordance_df$max_tran_log2FC <- 0
grouped_concordance_df$max_tran_expr_padj <- 0
for (r in 1:nrow(grouped_concordance_df)) {
max_tran <- as.numeric(unlist(grouped_concordance_df$max_transition[r]))
cluster_a_mean <- as.numeric(unlist(grouped_concordance_df[r,
sprintf("cluster_%d_mean_meth",
max_tran - 1)]))
cluster_b_mean <- as.numeric(unlist(grouped_concordance_df[r,
sprintf("cluster_%d_mean_meth",
max_tran)]))
grouped_concordance_df[r, "max_meth_delta"] <- cluster_b_mean - cluster_a_mean
if (max_tran == 5) {
max_tran_log2FC <- as.numeric(unlist(grouped_concordance_df[r,
sprintf("transition_%d_log2FC",
4)]))
max_tran_expr_padj <- as.numeric(unlist(grouped_concordance_df[r,
sprintf("transition_%d_padj",
4)]))
}else {
max_tran_log2FC <- as.numeric(unlist(grouped_concordance_df[r,
sprintf("transition_%d_log2FC",
max_tran)]))
max_tran_expr_padj <- as.numeric(unlist(grouped_concordance_df[r,
sprintf("transition_%d_padj",
max_tran)]))
}
grouped_concordance_df[r, "max_tran_log2FC"] <- max_tran_log2FC
grouped_concordance_df[r, "max_tran_expr_padj"] <- max_tran_expr_padj
}
write_csv(grouped_concordance_df, "grouped_concordance_df.csv")
grouped_concordance_df_filtered <- grouped_concordance_df %>%
filter(pval <= .1 & corr < 0 & max_probes != 0)
# grouped_concordance_df_filtered_dmp <- left_join(grouped_concordance_df_filtered, all_dmp_df %>%
# select(hg38_gene,
# cluster_0_mean_meth=cluster_0_mean,
# cluster_1_mean_meth=cluster_1_mean,
# cluster_2_mean_meth=cluster_2_mean,
# cluster_3_mean_meth=cluster_3_mean,
# cluster_4_mean_meth=cluster_4_mean,
# cluster_5_mean_meth=cluster_5_mean,
# probe_key_transition=key_transition),
# by=c("gene"="hg38_gene"))
# grouped_concordance_df_filtered_dmp <- grouped_concordance_df_filtered_dmp %>%
# filter(max_transition == probe_key_transition) %>%
# group_by(gene) %>%
# summarise(cluster_0_mean_meth=mean(cluster_0_mean_meth),
# cluster_1_mean_meth=mean(cluster_1_mean_meth),
# cluster_2_mean_meth=mean(cluster_2_mean_meth),
# cluster_3_mean_meth=mean(cluster_3_mean_meth),
# cluster_4_mean_meth=mean(cluster_4_mean_meth),
# cluster_5_mean_meth=mean(cluster_5_mean_meth))
# grouped_concordance_df_filtered <- left_join(grouped_concordance_df_filtered,
# grouped_concordance_df_filtered_dmp,
# by=c("gene"))
# grouped_concordance_df_filtered$max_meth_delta <- 0
# grouped_concordance_df_filtered$max_tran_log2FC <- 0
# for (r in 1:nrow(grouped_concordance_df_filtered)) {
# max_tran <- as.numeric(unlist(grouped_concordance_df_filtered$max_transition[r]))
# cluster_a_mean <- as.numeric(unlist(grouped_concordance_df_filtered[r,
# sprintf("cluster_%d_mean_meth",
# max_tran - 1)]))
# cluster_b_mean <- as.numeric(unlist(grouped_concordance_df_filtered[r,
# sprintf("cluster_%d_mean_meth",
# max_tran)]))
#
# grouped_concordance_df_filtered[r, "max_meth_delta"] <- cluster_b_mean - cluster_a_mean
#
# max_tran_log2FC <- as.numeric(unlist(grouped_concordance_df_filtered[r,
# sprintf("transition_%d_log2FC",
# max_tran)]))
#
# grouped_concordance_df_filtered[r, "max_tran_log2FC"] <- max_tran_log2FC
# }
write_csv(grouped_concordance_df_filtered, "grouped_concordance_df_filtered.csv")
```
## Global methylation differences between male and female per female cluster
```{r}
female_df_no_h9 <- read_csv("female_df_no_h9.csv.gz")
male_df_no_outliers <- read_csv("male_df_no_outliers.csv.gz")
sample_clusters_df <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
male_df_no_outliers_mean <- male_df_no_outliers %>%
column_to_rownames(var="rn") %>%
rowMeans() %>%
tidy()
all_dmp_df_with_male <- inner_join(all_dmp_df, male_df_no_outliers_mean %>% dplyr::rename(male_mean=x), by=c("rowname"="names"))
plot_df <- all_dmp_df_with_male %>% filter(!(hg38_chr %in% c("chrX", "chrY"))) %>% #& male_mean >= .5) %>%
dplyr::select(rowname, cluster_0_mean, cluster_1_mean, cluster_2_mean, cluster_3_mean, cluster_4_mean, cluster_5_mean, male_mean) %>%
reshape2::melt(id.vars=c("rowname")) %>% dplyr::rename(site=rowname, group=variable, mean_methylation=value)
library(ggpubr)
library(ggsci)
plot_df$cluster <- str_extract(plot_df$group, "[0-5]")
my_comparisons <- list( c("cluster_0_mean", "male_mean", "Cluster A", "Male"),
c("cluster_1_mean", "male_mean", "Cluster B", "Male"),
c("cluster_2_mean", "male_mean", "Cluster C", "Male"),
c("cluster_3_mean", "male_mean", "Cluster D", "Male"),
c("cluster_4_mean", "male_mean", "Cluster E", "Male"),
c("cluster_5_mean", "male_mean", "Cluster F", "Male"))
ks_df <- tibble(group1=character(), group2=character(),p=character(),y.position=numeric())
k <- 0
for (c in my_comparisons) {
test_a <- (plot_df %>% filter(group==c[1]))$mean_methylation
test_b <- (plot_df %>% filter(group==c[2]))$mean_methylation
ks_df <- ks_df %>%
dplyr::add_row(group1=c[1],
group2=c[2],
p=sprintf("%.2f", as.numeric(ks.test(test_a, test_b)$statistic)),
y.position=1.37-k)
k <- k + .07
}
global_methylation_by_cluster_plt <- ggplot(plot_df %>% filter(), aes(x=group, y=mean_methylation, fill=group=="male_mean", alpha=cluster)) +
geom_violin(outlier.shape = NA, size=.3) +
# stat_pvalue_manual(data=ks_df, inherit.aes = FALSE, size=.1, bracket.size=.1, tip.length = 0, label.size = 1, vjust=0, label = "K.S. Distance = {p}") +
# stat_compare_means(aes(x=group, y=mean_methylation, group=group, fill=group), comparisons = my_comparisons, method="t.test", alpha=1) +
geom_text(data=ks_df, inherit.aes=F, mapping=aes(y=.5, x=group1, label=paste("D=",p, sep="")), color=pal_npg()(2)[2], size=1.5, hjust=-.1) +
labs(y="Mean Autosomal DNAme",
x="",
title = "Autosomal DNAme in Female and Male hPSCs")+
theme_bw() +
base_plot_theme +
scale_fill_npg(name="Sex", labels=c("female", "male")) +
scale_alpha_discrete(guide=F) +
scale_x_discrete(breaks=c(sprintf("cluster_%d_mean",0:5),"male_mean"), labels=c(sprintf("Cluster %s", c("A", "B", "C", "D", "E", "F")), "Male"))+
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, margin = margin(b=0, unit="mm")),
axis.title.x=element_text(size=0, margin=margin(0,0,0,0)),
# legend.text = element_text(size=4),
# legend.title = element_text(size=5),
legend.key.width = unit(2, units="mm"),
legend.key.height = unit(2, units="mm"),
legend.key.size = unit(.2, units = "mm")) +
scale_y_continuous()
ggsave("figs/main/fig_4_autosomal_meth.pdf", global_methylation_by_cluster_plt, height = 47, width=58, units="mm")
# only plot for samples with expression
sample_data_combined_df <- read_csv("sample_data_combined_all_info.csv")
female_samples_to_keep <- sample_data_combined_df %>% filter(!is.na(sra_accession) & sample_name %in% sample_clusters_df$sample_name)
female_df_no_h9_with_expr <- female_df_no_h9 %>% select(rn, one_of(female_samples_to_keep$sample_name))
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz")
all_dmp_df_expr_only <- female_df_no_h9_with_expr %>%
reshape2::melt(id.vars=c("rn"), variable.name="sample_name", value.name="beta") %>%
left_join(sample_clusters_df, by="sample_name") %>%
left_join(filtered_site_annotation_df %>% select(rowname, hg38_chromosome), by=c("rn"="rowname"))
global_methylation_by_cluster_plt_expr <- ggplot(all_dmp_df_expr_only %>% filter(!(hg38_chromosome %in% c("chrX", "chrY"))), aes(x=cluster, y=beta, fill=cluster, group=cluster)) +
geom_boxplot(size=.3)
female_df_no_h9_all <- female_df_no_h9 %>% select(rn, one_of(sample_clusters_df$sample_name))
all_dmp_df_all <- female_df_no_h9_all %>%
reshape2::melt(id.vars=c("rn"), variable.name="sample_name", value.name="beta") %>%
left_join(sample_clusters_df, by="sample_name") %>%
left_join(filtered_site_annotation_df %>% select(rowname, hg38_chromosome), by=c("rn"="rowname"))
global_methylation_by_cluster_plt_all <- ggplot(all_dmp_df_all %>% filter(!(hg38_chromosome %in% c("chrX", "chrY"))), aes(x=cluster, y=beta, fill=cluster, group=cluster)) +
geom_boxplot(size=.3)
```
## Generating BED files to look for motifs and term enrichment in the CpG clusters
```{r}
# generate bed files for each transition 500 bp fragments, then run DREME and AME on it
dir.create("x_chr_methylation_sites")
background_df <- all_dmp_df[(all_dmp_df$chr == 'chrX'),]
background_df$start_pos <- format(background_df$pos - 250, scientific = FALSE, trim = T)
background_df$end_pos <- format(background_df$pos + 250, scientific = FALSE, trim = T)
write_tsv(background_df[c('chr', "start_pos", "end_pos")],
sprintf('x_chr_methylation_sites/background_x_chr.bed', i), col_names = F, quote = FALSE)
for (i in 1:5) {
plotable_df_increasing <- all_dmp_df[(all_dmp_df$key_transition == i) & (all_dmp_df$delta_increasing == TRUE) & (all_dmp_df$passes_threshold == T) & (all_dmp_df$chr == 'chrX'),]
plotable_df_decreasing <- all_dmp_df[(all_dmp_df$key_transition == i) & (all_dmp_df$delta_increasing == F) & (all_dmp_df$passes_threshold == T) & (all_dmp_df$chr == 'chrX'),]
plotable_df_increasing$start_pos <- format(plotable_df_increasing$pos - 250, scientific = FALSE, trim = T)
plotable_df_increasing$end_pos <- format(plotable_df_increasing$pos + 250, scientific = FALSE, trim = T)
write_tsv(plotable_df_increasing[c('chr', "start_pos", "end_pos")],
sprintf('x_chr_methylation_sites/transition_%d_increasing_x_chr.bed', i), col_names = F, quote = FALSE)
write_tsv(plotable_df_increasing[plotable_df_increasing$dist_to_tss < 2000,c('chr', "start_pos", "end_pos")],
sprintf('x_chr_methylation_sites/transition_%d_increasing_x_chr_promoters.bed', i), col_names = F, quote = FALSE)
plotable_df_decreasing$start_pos <- format(plotable_df_decreasing$pos - 250, scientific = FALSE, trim = T)
plotable_df_decreasing$end_pos <- format(plotable_df_decreasing$pos + 250, scientific = FALSE, trim = T)
write_tsv(plotable_df_decreasing[c('chr', "start_pos", "end_pos")],
sprintf('x_chr_methylation_sites/transition_%d_decreasing_x_chr.bed', i), col_names = F, quote = FALSE)
write_tsv(plotable_df_decreasing[plotable_df_decreasing$dist_to_tss < 2000,c('chr', "start_pos", "end_pos")],
sprintf('x_chr_methylation_sites/transition_%d_decreasing_x_chr_promoters.bed', i), col_names = F, quote = FALSE)
}
```
## DUSP9 stuff for Stefan
```{r}
library(tidyverse)
library(broom)
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
female_df_no_h9 <- read_csv("female_df_no_h9.csv.gz")
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz")
autosome_probes <- filter(filtered_site_annotation_df, !(hg38_chromosome %in% c("chrX", "chrY")))$rowname
autosome_mean <- tidy(colMeans(female_df_no_h9 %>% filter(rn %in% autosome_probes) %>% dplyr::select(-rn))) %>%
rename(autosome_mean=x)
dusp9_promoter_probes <- filter(filtered_site_annotation_df, (hg38_gene_name %in% c("DUSP9")) & hg38_gene_group %in% c("TSS1500", "TSS200", "5_UTR"))$rowname
dusp9_promoter_mean <- tidy(colMeans(female_df_no_h9 %>% filter(rn %in% dusp9_promoter_probes) %>% dplyr::select(-rn))) %>%
rename(dusp9_promoter_mean=x)
elk1_promoter_probes <- filter(filtered_site_annotation_df, (hg38_gene_name %in% c("ELK1")) & hg38_gene_group %in% c("TSS1500", "TSS200", "5_UTR"))$rowname
elk1_promoter_mean <- tidy(colMeans(female_df_no_h9 %>% filter(rn %in% elk1_promoter_probes) %>% dplyr::select(-rn))) %>%
rename(elk1_promoter_mean=x)
means_df <- full_join(dusp9_promoter_mean, autosome_mean, by="names")
means_df <- full_join(means_df, elk1_promoter_mean, by="names")
sample_clusters_df <- read_csv("saved_sample_clusters.csv",
col_names = c('name','cluster')) %>% arrange(cluster)
means_df <- left_join(means_df, sample_clusters_df, by=c("names"="name"))
cluster_f_data <- means_df %>% filter(cluster == 5)
cluster_f_cor <- cor.test(cluster_f_data$dusp9_promoter_mean,
cluster_f_data$autosome_mean,
method="spearman")
cluster_nonf_data <- means_df %>% filter(cluster < 5)
cluster_nonf_cor <- cor.test(cluster_nonf_data$dusp9_promoter_mean,
cluster_nonf_data$autosome_mean,
method="spearman")
override.shape <- c(21,21,21,21,22,24)
dusp9_autosome_plot <- ggplot(means_df, aes(x=dusp9_promoter_mean, y=autosome_mean)) +
geom_smooth(data=means_df %>% filter(cluster == 5), mapping=aes(color="F"), method="lm", size=.3) +
geom_smooth(data=means_df %>% filter(cluster < 5), mapping=aes(color="A-E"), method="lm", size=.3) +
annotate("text", x=.2,y=.45,label=sprintf("Spearman corr: %.2f\np=%.2e", cluster_f_cor$estimate, cluster_f_cor$p.value), color=pal_aaas()(2)[2], size=1.5) +
annotate("text", x=.4,y=.57,label=sprintf("Spearman corr: %.2f\np=%.2e", cluster_nonf_cor$estimate, cluster_nonf_cor$p.value), color=pal_aaas()(2)[1], size=1.5) +
geom_point(aes(alpha=as.factor(cluster), shape=as.factor(cluster), stroke=cluster >= 4), color="black", size=2, fill="#E64B35FF") +
theme_bw() +
base_plot_theme +
scale_fill_npg(guide=F) +
scale_color_aaas() +
scale_shape_manual(guide = FALSE, values=override.shape) +
scale_alpha_discrete(limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(alpha = guide_legend(override.aes = list(fill, shape=override.shape), nrow=1)) +
labs(title="DUSP9 promoter DNAme vs autosomal DNAme",
fill="Cluster",
alpha="Cluster",
x="DUSP9 Mean Promoter DNAme",
y="Mean Autosomal DNAme",
color="Cluster Regression")
ggsave("figs/supplementary/dusp9_autosome.pdf", dusp9_autosome_plot, width = 174/2, height = 174/2, units="mm")
cor.test(means_df$dusp9_promoter_mean, means_df$autosome_mean)
cor.test(filter(means_df, cluster >= 4)$dusp9_promoter_mean, filter(means_df, cluster >= 4)$autosome_mean)
#Elk1 autosome plot
elk1_autosome_plot <- ggplot(means_df, aes(x=elk1_promoter_mean, y=autosome_mean)) +
geom_smooth(data=means_df %>% filter(cluster == 5), mapping=aes(color="F"), method="lm", size=.3) +
geom_smooth(data=means_df %>% filter(cluster < 5), mapping=aes(color="A-E"), method="lm", size=.3) +
# annotate("text", x=.2,y=.45,label=sprintf("Spearman corr: %.2f\np=%.2e", cluster_f_cor$estimate, cluster_f_cor$p.value), color=pal_aaas()(2)[2], size=1.5) +
# annotate("text", x=.4,y=.57,label=sprintf("Spearman corr: %.2f\np=%.2e", cluster_nonf_cor$estimate, cluster_nonf_cor$p.value), color=pal_aaas()(2)[1], size=1.5) +
geom_point(aes(alpha=as.factor(cluster), shape=as.factor(cluster), stroke=cluster >= 4), color="black", size=2, fill="#E64B35FF") +
theme_bw() +
base_plot_theme +
scale_fill_npg(guide=F) +
scale_color_aaas() +
scale_shape_manual(guide = FALSE, values=override.shape) +
scale_alpha_discrete(limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(alpha = guide_legend(override.aes = list(fill, shape=override.shape), nrow=1)) +
labs(title="DUSP9 promoter methylation vs autosomal methylation",
fill="Cluster",
alpha="Cluster",
x="ELK1 Mean Promoter Methylation",
y="Mean Autosomal Methylation",
color="Cluster Regression")
# DUSP9 expression vs autosomal mean methylation
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv") %>%
dplyr::select(-ensembl_id, -ensembl_id_no_version, -chromosome_name, -start_position, -description, -end_position, -strand)
dusp9_expression <- batch_corrected_vsd_df %>%
filter(hgnc_symbol=="DUSP9") %>%
reshape2::melt(id.vars="hgnc_symbol", variable.name="sample_name", value.name="dusp9_expr") %>%
left_join(means_df, by=c("sample_name"="names"))
cluster_f_data_expr <- dusp9_expression %>% filter(cluster == 5)
cluster_f_cor_expr <- cor.test(cluster_f_data_expr$dusp9_expr,
cluster_f_data_expr$autosome_mean)
cluster_nonf_data_expr <- dusp9_expression %>% filter(cluster < 5)
cluster_nonf_cor_expr <- cor.test(cluster_nonf_data_expr$dusp9_expr,
cluster_nonf_data_expr$autosome_mean)
dusp9_autosome_expr_plot <- ggplot(dusp9_expression, aes(x=dusp9_expr, y=autosome_mean)) +
geom_smooth(data=dusp9_expression %>% filter(cluster == 5), mapping=aes(color="F"), method="lm", size=.3, se=F) +
geom_smooth(data=dusp9_expression %>% filter(cluster < 5), mapping=aes(color="A-E"), method="lm", size=.3, se=F) +
annotate("text", x=6.4,y=.5,label=sprintf("Pearson corr: %.2f\np=%.2e", cluster_f_cor_expr$estimate, cluster_f_cor_expr$p.value), color=pal_aaas()(2)[2], size=1.5) +
annotate("text", x=4.9,y=.535,label=sprintf("Pearson corr: %.2f\np=%.2e", cluster_nonf_cor_expr$estimate, cluster_nonf_cor_expr$p.value), color=pal_aaas()(2)[1], size=1.5) +
geom_point(aes(alpha=as.factor(cluster), shape=as.factor(cluster), stroke=cluster >= 4), color="black", size=2, fill="#E64B35FF") +
theme_bw() +
base_plot_theme +
scale_fill_npg(guide=F) +
scale_color_aaas() +
scale_shape_manual(guide = FALSE, values=override.shape, limits=c(0,1,2,3,4,5)) +
scale_alpha_discrete(limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) +
guides(alpha = guide_legend(override.aes = list(fill, shape=override.shape), nrow=1)) +
labs(title="DUSP9 promoter methylation vs autosomal methylation",
fill="Cluster",
alpha="Cluster",
x="DUSP9 Mean Promoter Methylation",
y="Mean Autosomal Methylation",
color="Cluster Regression")
ggsave("figs/supplementary/dusp9_autosome_expr.svg", dusp9_autosome_expr_plot, width = 174/2, height = 174/2, units="mm")
```
## Heatmap of Kilens genes in cluster F
```{r}
table_3_expression <- read_csv("table_3_expression_all_v_all.csv.gz")
female_df_no_h9 <- read_csv("female_df_no_h9.csv")
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
sample_data_combined_all_info <- read_csv("sample_data_combined_all_info.csv")
kilens_genes <- (table_3_expression %>% filter(!is.na(kielens_log2FC) & `Transition 4/5 p-adj` <= .1 & Chromosome != "X"))$`Gene Symbol`
kilens_probes <- (filtered_site_annotation_df %>% filter(hg38_gene_name %in% kilens_genes & !(hg38_chromosome %in% c("chrX", "chrY"))))$rowname
kilens_heatmap_df <- female_df_no_h9 %>%
filter(rn %in% kilens_probes) %>%
dplyr::select(rn, !!filter(cluster_tib, cluster %in% c(0,5))$sample_name) %>%
reshape2::melt(id.vars="rn", variable.name="sample_name") %>%
left_join(filtered_site_annotation_df %>%
dplyr::select(rowname, hg38_gene_name, hg38_pos),
by=c("rn"="rowname")) %>%
left_join(cluster_tib, by="sample_name") %>%
left_join(sample_data_combined_all_info, by="sample_name")
data <- kilens_heatmap_df %>%
reshape2::dcast(rn ~ sample_name, value.var="value") %>%
column_to_rownames(var="rn")
sample_ord <- hclust( dist(t(data), method = "euclidean"), method = "single" )$order
ordered_sample_names <- rev(colnames(data)[sample_ord])
site_ord <- hclust( dist((data), method = "euclidean"), method = "single" )$order
ordered_sites <- (rownames(data)[site_ord])
ordered_labels_df <- kilens_heatmap_df %>%
group_by(sample_name, group_accession, sra_accession, cluster) %>%
summarise() %>%
arrange(factor(sample_name, levels=ordered_sample_names), sample_name) %>%
mutate(sample_label=paste(group_accession, sra_accession, sep="-"))
library(facetscales)
scales_x <- list(
`0` = scale_x_discrete(limits=filter(ordered_labels_df, cluster==0)$sample_name, labels = filter(ordered_labels_df, cluster==0)$sample_label),
`5` = scale_x_discrete(limits=filter(ordered_labels_df, cluster==5)$sample_name, labels = filter(ordered_labels_df, cluster==5)$sample_label)
)
# library(caret)
# kilens_heatmap_df = preProcess(train, method = "range")
kilens_heatmap <- ggplot(kilens_heatmap_df, aes(x=factor(sample_name, levels=ordered_sample_names),
y=factor(rn, levels=ordered_sites), fill=value)) +
geom_tile() +
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") +
# facet_grid(. ~ cluster, scales = "free", space="free") +
facet_grid_sc(cols = vars(cluster), scales = list(x = scales_x), space="free") +
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE",
midpoint = .5,
limits=c(0,1),
breaks=c(0,.5,1),
labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) +
# scale_x_discrete(limits=ordered_labels_df$sample_name, labels = ordered_labels_df$sample_label) +
labs(x="Female Samples",
y="",
fill="DNA Methylation") +
theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
legend.position="top",
legend.justification = "center",
legend.title = element_text(size = 6, vjust = .75),
legend.text = element_text(size = 6),
# legend.position="top",
# legend.justification = "right",
# legend.title = element_text(size = 6, vjust = 1),
# legend.text = element_text(size = 6),
# legend.key.height = unit(2, "mm"),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0.2, "mm"),
axis.text.x = element_text(size = 9*0.8, angle = 270, hjust = 0, colour = "grey50"),
# strip.background.x = element_rect(color="black"),
strip.text.y = element_text(size=7),
strip.text.x = element_text(size=7),
axis.title = element_text(size=7))
kilens_heatmap_df_gene <- female_df_no_h9 %>%
filter(rn %in% kilens_probes) %>%
dplyr::select(rn, !!filter(cluster_tib, cluster %in% c(0,3,5))$sample_name) %>%
reshape2::melt(id.vars="rn", variable.name="sample_name") %>%
left_join(filtered_site_annotation_df %>%
dplyr::select(rowname, hg38_gene_name, hg38_pos),
by=c("rn"="rowname")) %>%
left_join(cluster_tib, by="sample_name") %>%
left_join(sample_data_combined_all_info, by="sample_name") %>%
group_by(hg38_gene_name, sample_name, cluster, group_accession, sra_accession) %>%
summarise(value=mean(value))
gene_meth_df_kilens <- read_csv("all_dmp_df.csv.gz") %>%
group_by(hg38_gene) %>%
summarise(cluster_0_mean_meth=mean(cluster_0_mean)) %>%
filter(hg38_gene %in% kilens_genes) %>%
dplyr::select(hg38_gene, cluster_0_mean_meth)
kilens_heatmap_df_gene_cluster_3_subtracted <- kilens_heatmap_df_gene %>%
left_join(gene_meth_df_kilens, by=c("hg38_gene_name"="hg38_gene")) %>%
mutate(value=value-cluster_0_mean_meth)
#sign test - how many of these genes are decreasing in methylation
sign_test <- grouped_concordance_df %>%
filter(gene %in% kilens_genes) %>%
mutate(tran_4_5_delta_meth=cluster_5_mean_meth-cluster_3_mean_meth) %>%
count(tran_4_5_delta_meth > 0)
# concordance_df <- read_csv("concordance_df.csv")
data_gene <- kilens_heatmap_df_gene_cluster_3_subtracted %>%
reshape2::dcast(hg38_gene_name ~ sample_name, value.var="value") %>%
column_to_rownames(var="hg38_gene_name")
sample_ord_gene <- hclust( dist(t(data_gene), method = "euclidean"), method = "single" )$order
ordered_sample_names_gene <- rev(colnames(data_gene)[sample_ord_gene])
ordered_labels_df_gene <- kilens_heatmap_df_gene %>%
group_by(sample_name, group_accession, sra_accession, cluster) %>%
summarise() %>%
arrange(factor(sample_name, levels=ordered_sample_names_gene), sample_name) %>%
mutate(sample_label=paste(group_accession, sra_accession, sep="-"))
scales_x_gene <- list(
`0` = scale_x_discrete(limits=filter(ordered_labels_df_gene, cluster==0)$sample_name, labels = filter(ordered_labels_df_gene, cluster==0)$sample_label),
`3` = scale_x_discrete(limits=filter(ordered_labels_df_gene, cluster==3)$sample_name, labels = filter(ordered_labels_df_gene, cluster==3)$sample_label),
`5` = scale_x_discrete(limits=filter(ordered_labels_df_gene, cluster==5)$sample_name, labels = filter(ordered_labels_df_gene, cluster==5)$sample_label)
)
gene_ord <- hclust( dist((data_gene), method = "euclidean"), method = "single" )$order
ordered_genes <- (rownames(data_gene)[gene_ord])
# kilens_heatmap_df_scaled <- kilens_heatmap_df_gene %>%
# group_by(hg38_gene_name) %>%
# mutate(scaled_value=scales::rescale(value, to=c(0,1))) %>%
# ungroup()
# kilens_heatmap_df_scaled <- kilens_heatmap_df_scaled %>%
# mutate(row_type="Genes")
kilens_heatmap_df_samples <- tibble(sample_name=character(), cluster=character(), value=character(), group_accession=character(), hg38_gene_name=character(), has_expression=logical())
for (sample_name in unique(kilens_heatmap_df_gene_cluster_3_subtracted$sample_name)) {
cluster <- as.numeric(unlist(cluster_tib[cluster_tib$sample_name == sample_name, "cluster"]))
group <- as.character(unlist(unique(kilens_heatmap_df_gene_cluster_3_subtracted[kilens_heatmap_df_gene_cluster_3_subtracted$sample_name == sample_name, "group_accession"])))
sra <- as.character(unlist(unique(kilens_heatmap_df_gene_cluster_3_subtracted[kilens_heatmap_df_gene_cluster_3_subtracted$sample_name == sample_name, "sra_accession"])))
kilens_heatmap_df_samples <- kilens_heatmap_df_samples %>%
add_row(sample_name=sample_name,
cluster=cluster,
value=group,
group_accession=group,
# row_type="Sample Group",
has_expression=!is.na(sra),
hg38_gene_name="Sample Group")
}
kilens_heatmap_genes <- ggplot(kilens_heatmap_df_gene_cluster_3_subtracted,
aes(x=factor(sample_name, levels=ordered_sample_names_gene),
y=factor(hg38_gene_name, levels=ordered_genes), fill=value)) +
geom_tile() +
# geom_point(data=kilens_heatmap_df_scaled %>% filter(row_type == "Sample Group"), aes(color=group_accession)) +
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") +
# facet_grid(. ~ cluster, scales = "free", space="free") +
facet_grid_sc(cols = vars(cluster), scales = list(x = scales_x_gene), space="free") +
# scale_fill_material(palette = "deep-purple")+#,
# limits=c(0,1),
# breaks=c(0,.5,1))+#,
# labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) +
scale_fill_gradient2(name="value",
low=pal_aaas()(1), high = pal_aaas()(2)[2],
midpoint = 0,
limits=c(-.25,.25),
breaks=c(-.25,0,.25), oob=scales::squish)+#,
# labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) +
scale_color_npg(name="group_accession") +
labs(x="Female Samples",
y="",
fill="Cluster A Subtracted DNA Methylation") +
theme(axis.text.x = element_text(size = 9*0.8, angle = 270, hjust = 0),
axis.text.y=element_text(size=9*.5),
axis.ticks = element_line(size=.3),
legend.position="top",
legend.justification = "center",
legend.title = element_text(size = 6, vjust = .75),
legend.text = element_text(size = 6),
# legend.position="top",
# legend.justification = "right",
# legend.title = element_text(size = 6, vjust = 1),
# legend.text = element_text(size = 6),
# legend.key.height = unit(2, "mm"),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0.2, "mm"),
# strip.background.x = element_rect(color="black"),
strip.text.y = element_text(size=7),
strip.text.x = element_text(size=7),
axis.title = element_text(size=7))
ggsave("figs/supplementary/kilens_heatmap_gene_dual_color_cluster_A_subtracted.svg", height = 250, width = 174/2, units = "mm")
ggsave("figs/supplementary/kilens_heatmap_gene_dual_color_cluster_A_subtracted_cluster_D.svg", kilens_heatmap_genes, height = 250, width = 174/2, units = "mm")
kilens_heatmap_samples <- ggplot(kilens_heatmap_df_samples,
aes(x=factor(sample_name, levels=ordered_sample_names_gene),
y=factor(hg38_gene_name, levels=ordered_genes), fill=value)) +
geom_tile() +
geom_point(aes(color=has_expression), size=.4) +
# geom_point(data=kilens_heatmap_df_scaled %>% filter(row_type == "Sample Group"), aes(color=group_accession)) +
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") +
# facet_grid(. ~ cluster, scales = "free", space="free") +
facet_grid_sc(cols = vars(cluster), scales = list(x = scales_x_gene), space="free") +
scale_color_manual(values=c("#000000", NA), limits=c(T,F)) +
# scale_fill_material(palette = "deep-purple",
# limits=c(0,1),
# breaks=c(0,.5,1))+#,
# labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) +
scale_fill_npg(name="group_accession") +
labs(x="Female Samples",
y="",
fill="Sample Group") +
theme(axis.text.x = element_text(size = 9*0.8, angle = 270, hjust = 0),
axis.text.y=element_text(size=9*.5),
axis.ticks.y = element_line(size=.3),
legend.position="top",
legend.justification = "center",
legend.title = element_text(size = 6, vjust = .75),
legend.text = element_text(size = 6),
# legend.position="top",
# legend.justification = "right",
# legend.title = element_text(size = 6, vjust = 1),
# legend.text = element_text(size = 6),
# legend.key.height = unit(2, "mm"),
plot.margin = margin(r=0),
panel.spacing.y = unit(2, "mm"),
panel.spacing.x = unit(0.2, "mm"),
# strip.background.x = element_rect(color="black"),
strip.text.y = element_text(size=7),
strip.text.x = element_text(size=7),
axis.title = element_text(size=7))
ggsave("figs/supplementary/kilens_heatmap_gene_labels.svg", height = 70, width = (174/2)- 6.936, units = "mm")
```
## Dotplot of kilens expression vs. our methylation
```{r}
library(tidyverse)
library(ggplot2)
table_3_expression <- read_csv("table_3_expression_all_v_all.csv")
female_df_no_h9 <- read_csv("female_df_no_h9.csv")
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
all_dmp_df <- read_csv("all_dmp_df.csv")
grouped_concordance_df <- read_csv("grouped_concordance_df.csv.gz")
kilens_genes <- (table_3_expression %>%
filter(!is.na(kielens_log2FC) & table_3_expression$`Transition 4/5 p-adj` <= .1))$`Gene Symbol`
# kilens_probes <- (filtered_site_annotation_df %>% filter(hg38_gene_name %in% kilens_genes & !(hg38_chromosome %in% c("chrX", "chrY"))))$rowname
kilens_grouped_concordance_df <- grouped_concordance_df %>%
filter(gene %in% kilens_genes & hg38_chr != "chrX") %>%
left_join(table_3_expression %>% select(`Gene Symbol`, kielens_log2FC, kielens_T2iLGö_mean, `kielens_KSR+FGF2_mean`, `Transition 4/5 log2FoldChange`, `Transition 4/5 p-adj`),
by=c("gene"="Gene Symbol"))
corr_log2fc_meth <- cor.test(kilens_grouped_concordance_df$`Transition 4/5 log2FoldChange`,
kilens_grouped_concordance_df$cluster_5_mean_meth - kilens_grouped_concordance_df$cluster_3_mean_meth,
method = "spearman")
tran_5_plot <- ggplot(kilens_grouped_concordance_df, aes(x=`Transition 4/5 log2FoldChange`, y=cluster_5_mean_meth-cluster_3_mean_meth)) +
geom_point(aes(color=`Transition 4/5 p-adj`), size=.7) +
scale_x_continuous(trans = scales::pseudo_log_trans()) +
scale_y_continuous(trans = scales::pseudo_log_trans(), limits=c(-.2,.01)) +
geom_smooth(method='lm', size=.3) +
scale_color_viridis_c() +
annotate("text", label=sprintf("Spearman Corr: %.2f\np=%.2e",corr_log2fc_meth$estimate, corr_log2fc_meth$p.value), x=4, y=-.02, size=2) +
# geom_text(aes(label=sprintf("Spearman Corr: %.2f\np=%.2e",corr_log2fc_meth$estimate, corr_log2fc_meth$p.value), x=4, y=-.02), size=2) +
theme_bw() +
base_plot_theme +
labs(title="Transition 4/5 DNAme vs Expression for Kilens, et al. 2018 Genes", y="(Cluster F DNAme) - (Cluster D DNAme)", x="Transition 4/5 log2(F.C.)") +
theme(legend.key.width = unit(4, units="mm"))
ggsave("figs/supplementary/kielens_meth_expr_correlation.pdf",tran_5_plot,width = 174/2, height=87, units = "mm")
kilens_melted_df <- kilens_grouped_concordance_df %>%
reshape2::melt()
sample_data_combined_all_info <- read_csv("sample_data_combined_all_info.csv")
expression_samples <- filter(sample_data_combined_all_info, !is.na(sra_accession))$sample_name
samples_without_expression <- setdiff(cluster_tib$sample_name, expression_samples)
kilens_grouped_promoter_df <- female_df_no_h9 %>%
select(rn, !!samples_without_expression) %>%
left_join(filtered_site_annotation_df, by=c("rn"="rowname")) %>%
#filter(hg38_gene_group %in% c("TSS1500", "TSS200", "5_UTR")) %>%
reshape2::melt(id.vars=c("hg38_gene_name", "hg38_chromosome"), measure.vars=samples_without_expression, variable.name="sample_name", value.name="methylation") %>%
group_by(hg38_gene_name, hg38_chromosome, sample_name) %>%
summarise(mean_methylation=mean(methylation)) %>%
left_join(cluster_tib, by="sample_name") %>%
group_by(hg38_gene_name, hg38_chromosome, cluster) %>%
summarise(mean_methylation=mean(mean_methylation)) %>%
reshape2::dcast(hg38_gene_name + hg38_chromosome ~ cluster) %>%
rename(cluster_0_promoter_mean=`0`,
cluster_1_promoter_mean=`1`,
cluster_2_promoter_mean=`2`,
cluster_3_promoter_mean=`3`,
cluster_4_promoter_mean=`4`,
cluster_5_promoter_mean=`5`) %>%
filter(hg38_gene_name %in% kilens_genes & hg38_chromosome != "chrX") %>%
left_join(table_3_expression %>% select(`Gene Symbol`, kielens_log2FC, kielens_T2iLGö_mean, `kielens_KSR+FGF2_mean`, `Transition 4/5 log2FoldChange`, `Transition 4/5 p-adj`),
by=c("hg38_gene_name"="Gene Symbol"))
corr_log2fc_promoter_meth <- cor.test(kilens_grouped_promoter_df$`Transition 4/5 log2FoldChange`,
kilens_grouped_promoter_df$cluster_5_promoter_mean - kilens_grouped_promoter_df$cluster_3_promoter_mean,
method = "spearman")
tran_5_promoter_plot <- ggplot(kilens_grouped_promoter_df, aes(x=`Transition 4/5 log2FoldChange`, y=cluster_5_promoter_mean-cluster_3_promoter_mean)) +
geom_point(aes(color=`Transition 4/5 p-adj`), size=.7) +
scale_x_continuous(trans = scales::pseudo_log_trans()) +
scale_y_continuous(trans = scales::pseudo_log_trans(), limits=c(-.2,.05)) +
geom_smooth(method='lm', size=.3) +
scale_color_viridis_c() +
geom_text(aes(label=sprintf("Spearman Corr: %.2f",corr_log2fc_promoter_meth$estimate), x=-4, y=-.15), size=2) +
theme_bw() +
base_plot_theme +
labs(title="Transition 4/5 Methylation vs Expression for Kilens Genes - No Expression Samples",
y="cluster_5_mean-cluster_3_mean")
ggsave("figs/supplementary/kielens_no_expression_samples_meth_expr_correlation.svg",tran_5_promoter_plot,width = 87, height=87, units = "mm")
```
### Use BEDTOOLS to turn bed files into fasta files, and then run dreme and ame
```{bash}
for filename in x_chr_methylation_sites/*.bed
do
name=$(echo $filename | rev | cut -f 2- -d '.' | rev)
bedtools getfasta -fi motif_search_tools/ucsc.hg19.fa -bed $filename > ${name}.fa
mkdir -p ${name}/{dreme,ame}
dreme -verbosity 1 -oc ${name}/dreme -dna -p ${name}.fa -n background_x_chr.fa -t 18000 -e 0.05 > ${name}/dreme/out.txt 2>&1 & PIDIOS=$!
ame --verbose 3 --oc ${name}/ame --scoring avg --method fisher --hit-lo-fraction 0.25 --evalue-report-threshold 10.0 --control background_x_chr.fa ${name}.fa /mnt/d/pinterlab/meme-suite/meme-5.0.4/motif_databases/EUKARYOTE/jolma2013.meme /mnt/d/pinterlab/meme-suite/meme-5.0.4/motif_databases/JASPAR/JASPAR2018_CORE_vertebrates_non-redundant.meme /mnt/d/pinterlab/meme-suite/meme-5.0.4/motif_databases/MOUSE/uniprobe_mouse.meme > ${name}/ame/out.txt 2>&1 & PIDMIX=$!
wait $PIDIOS
wait $PIDMIX
done
```
### Figure for study breakdown
```{r}
table_1_df <- read_csv("table_1_sample_data_info.csv") %>%
filter(!is.na(`Sample Cluster Assignment`) & Source != "This paper") %>%
mutate(cluster=case_when(`Sample Cluster Assignment` == 0 ~ "A",
`Sample Cluster Assignment` == 1 ~ "B",
`Sample Cluster Assignment` == 2 ~ "C",
`Sample Cluster Assignment` == 3 ~ "D",
`Sample Cluster Assignment` == 4 ~ "E",
`Sample Cluster Assignment` == 5 ~ "F"))
studies_in_clusters <- ggplot(table_1_df) +
geom_bar(aes(x=cluster, fill=sprintf("%s (%s)", str_replace(Source, " et al,", ","), `GEO Group Accession`))) +
labs(title="Studies in Clusters",
x="Sample Cluster",
y="Number of Samples",
fill="Source") +
theme_bw() +
scale_fill_manual(limits=c("Banovich, 2018 (GSE110544)", "Nazor, 2012 (GSE31848)", "Butcher, 2016 (GSE59091)", "Nishizawa, 2016 (GSE60924)", "Zdravkovic, 2015 (GSE72923)", "Salomonis, 2016 (GSE85828)", "Takasawa, 2018 (GSE73938)")[c(1,3,2,4,6,7,5)],
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,4)])[c(1,3,2,4,6,7,5)])+
# scale_fill_npg() +
base_plot_theme +
theme(legend.position="right",
legend.justification = "left")
ggsave("figs/supplementary/supp_1_studies_in_clusters.pdf", studies_in_clusters, width=80, height=30, units="mm")
```