Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
contiguous_x_erosion/analysis_notebook.Rmd
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
8048 lines (6739 sloc)
410 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Methylation Analysis" | |
output: html_notebook | |
--- | |
This notebook takes us through the methylation analysis performed in this study. | |
First we begin by downloading all the data from GEO, and performing some quality assessment and culling of the data. | |
Below we use the minfi package to download all our datasets of interest, and combine them into the Illumina 450K methylation array format. | |
```{r} | |
#---Dependencies installation---# | |
# if (!requireNamespace("BiocManager", quietly = TRUE)) | |
# install.packages("BiocManager") | |
# BiocManager::install("minfi", version = "3.8") | |
# BiocManager::install("IlluminaHumanMethylation450kanno.ilmn12.hg19") | |
# BiocManager::install("IlluminaHumanMethylationEPICanno.ilm10b4.hg19") | |
# BiocManager::install('IlluminaHumanMethylationEPICmanifest') | |
# BiocManager::install('IlluminaHumanMethylation450kmanifest') | |
# BiocManager::install('tidyverse') | |
#-------------------------------# | |
``` | |
```{r} | |
library(minfi) | |
library(IlluminaHumanMethylation450kanno.ilmn12.hg19) | |
library(IlluminaHumanMethylationEPICanno.ilm10b4.hg19) | |
library(stringi) | |
library(tidyverse) | |
datasets_df <- read.csv("datasets.csv.gz") | |
for (row in 1:nrow(datasets_df)) { | |
print(row) | |
name <- as.character(datasets_df$Name[row]) | |
basedir <- as.character(datasets_df$Directory[row]) | |
datatype <- as.character(datasets_df$Type[row]) | |
sample_metadata_file <- as.character(datasets_df$SampleInfoFile[row]) | |
array_size <- datasets_df$ArraySize[row] | |
if (array_size == 450) { | |
array_type <- "IlluminaHumanMethylation450k" | |
} else if (array_size == 850) { | |
array_type <- "IlluminaHumanMethylationEPIC" | |
} | |
if (datatype == "geo_raw") { | |
sample_info_df <- read.csv(file.path(basedir, sample_metadata_file), stringsAsFactors = FALSE, row.names = 1) | |
samples_to_keep <- sample_info_df[sample_info_df$Keep %in% c("TRUE"), ] | |
raw_filename <- file.path(basedir, as.character(datasets_df$DataFile[row])) | |
u_name <- as.character(datasets_df$Uname[row]) | |
m_name <- as.character(datasets_df$Mname[row]) | |
separator <- stri_unescape_unicode(as.character(datasets_df$Separator[row])) | |
gmset <- readGEORawFile( | |
filename = raw_filename, | |
Uname = u_name, | |
Mname = m_name, | |
array = array_type, | |
sep = separator | |
) | |
gmset <- readGEORawFile( | |
filename = raw_filename, | |
Uname = u_name, | |
Mname = m_name, | |
array = array_type, | |
sep = separator, | |
pData = sample_info_df[sampleNames(gmset), ] | |
) | |
} else if (datatype == "idat") { | |
targets <- as.data.frame(read.metharray.sheet(basedir, pattern = sample_metadata_file)) | |
targets_to_keep <- subset(targets, Keep == TRUE) | |
RGset <- read.metharray.exp(targets = targets_to_keep) | |
gmset <- preprocessIllumina(RGset) | |
gmset <- mapToGenome(gmset) | |
} | |
sampleNames(gmset) <- paste(name, gmset$accession, gmset$cell_type, gmset$cell_line, gmset$gender, | |
paste("p", gmset$passage, sep = ""), | |
sep = "_" | |
) | |
plot_group <- data.frame(group = name, sample_name = sampleNames(gmset), row.names = "sample_name") | |
if (row == 1) { | |
combined_arrays <- gmset | |
plot_groups_combined <- plot_group | |
} else { | |
combined_arrays <- combineArrays(combined_arrays, gmset, outType = "IlluminaHumanMethylation450k") | |
plot_groups_combined <- rbind(plot_groups_combined, plot_group) | |
} | |
} | |
``` | |
## Minfi sex prediction | |
Now we will use minfi to run sex prediction on the data. We have also manually identified the sex of each cell line using either the paper that the cell lines are coming from, or using Cellosaurus for well established cell lines. We remove any cell lines that have a conflict between the predicted sex, and the manually annotated sex. | |
```{r} | |
# remove all cross reactive probes | |
# site_annotation_df <- getAnnotation(combined_arrays) | |
cross_reactive_probes <- read_csv("Chen-non-specific-probes-Illumina450k.csv.gz") | |
probes_to_keep <- !(rownames(combined_arrays) %in% cross_reactive_probes$TargetID) | |
filtered_combined_arrays <- combined_arrays[probes_to_keep,] | |
filtered_site_annotation_df <- getAnnotation(filtered_combined_arrays) | |
## Update the annotation to hg38 for better gene annotation and such | |
# First we create a fasta file with all the probes | |
library(seqinr) | |
# filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
write.fasta(as.list(filtered_site_annotation_df$SourceSeq), filtered_site_annotation_df$Name, "probe_sequences.fa") | |
``` | |
Now we'll use blat to map the probes to the hg38 genome. | |
```{bash} | |
./blat -minIdentity=100 -maxGap=0 genomes/ucsc.hg38.fa probe_sequences.fa hg38_probe_mapping.psl | |
``` | |
Finally we'll bring it back to R | |
```{r} | |
hg38_probe_mapping <- read_tsv("hg38_probe_mapping.psl", | |
skip=5, | |
col_names = c("match", | |
"mis_match", | |
"rep_match", | |
"Ns", | |
"Q_gapcount", | |
"Q_gapbases", | |
"T_gapcount", | |
"T_gapbases", | |
"strand", | |
"Q_name", | |
"Q_size", | |
"Q_start", | |
"Q_end", | |
"T_name", | |
"T_size", | |
"T_start", | |
"T_end", | |
"blockcount", | |
"blockSizes", | |
"qStarts", | |
"tStarts")) | |
full_mappings <- hg38_probe_mapping %>% filter(match >= (Q_size-3) & T_gapcount == 0) | |
uniquely_mapping_probes <- full_mappings %>% filter(!(Q_name %in% Q_name[duplicated(Q_name)])) | |
filtered_combined_arrays <- combined_arrays[uniquely_mapping_probes$Q_name,] | |
filtered_site_annotation_df <- getAnnotation(filtered_combined_arrays) | |
filtered_site_annotation_df$hg38_chromosome <- uniquely_mapping_probes$T_name | |
filtered_site_annotation_df$hg38_pos <- rowMeans(uniquely_mapping_probes %>% dplyr::select(T_start, T_end)) | |
filtered_site_annotation_df <- rownames_to_column(as.data.frame(filtered_site_annotation_df)) | |
## Annotating the hg38 genes using biomart | |
library(biomaRt) | |
library(GenomicRanges) | |
hg38_mart <- useMart('ensembl', dataset='hsapiens_gene_ensembl') # create a mart object | |
# in order to prevent timeout, we'll get this 1 chromosome at a time | |
# hg38_biomart_structure_df <- tibble(external_gene_name=character(), | |
# description=character(), | |
# chromosome_name=character(), | |
# start_position=numeric(), | |
# end_position=numeric(), | |
# strand=numeric(), | |
# ensembl_transcript_id=character(), | |
# ensembl_gene_id=character(), | |
# transcription_start_site=numeric(), | |
# transcript_start=numeric(), | |
# transcript_end=numeric(), | |
# `5_utr_start`=numeric(), | |
# `5_utr_end`=numeric(), | |
# `3_utr_start`=numeric(), | |
# `3_utr_end`=numeric()) | |
# for (chrom_name in c(1:22, "X", "Y")){ | |
# hg38_biomart_structure_df <- bind_rows(hg38_biomart_structure_df, getBM(mart=hg38_mart, | |
# filters = c( "chromosome_name"), | |
# values = list(chromosome_name=chrom_name), | |
# attributes=c('external_gene_name', 'description', 'chromosome_name', | |
# 'start_position', 'end_position', 'strand', | |
# 'ensembl_transcript_id', | |
# 'ensembl_gene_id', 'transcription_start_site', | |
# 'transcript_start', 'transcript_end', | |
# '5_utr_start', '5_utr_end', '3_utr_start', '3_utr_end')) %>% mutate(chromosome_name=as.character(chromosome_name))) | |
# } | |
# | |
# write_csv(hg38_biomart_structure_df, "hg38_biomart_structure_df.csv") | |
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv", col_types = cols( | |
external_gene_name = col_character(), | |
description = col_character(), | |
chromosome_name = col_character(), | |
start_position = col_double(), | |
end_position = col_double(), | |
strand = col_double(), | |
ensembl_transcript_id = col_character(), | |
ensembl_gene_id = col_character(), | |
transcription_start_site = col_double(), | |
transcript_start = col_double(), | |
transcript_end = col_double(), | |
`5_utr_start` = col_double(), | |
`5_utr_end` = col_double(), | |
`3_utr_start` = col_double(), | |
`3_utr_end` = col_double())) | |
hg38_biomart_feature_df <- getBM(mart=hg38_mart, | |
filters = c("transcript_gencode_basic", "chromosome_name", "transcript_tsl"), | |
values = list(transcript_gencode_basic=TRUE, chromosome_name=c(1:22, "X", "Y") , transcript_tsl=TRUE), | |
attributes=c('ensembl_transcript_id', 'ensembl_gene_id', | |
'chromosome_name', 'start_position', 'end_position', | |
'transcript_tsl', 'transcript_appris', 'transcript_length')) | |
write_csv(hg38_biomart_feature_df, "hg38_biomart_feature_df.csv") | |
hg38_biomart_feature_df <- read_csv("hg38_biomart_feature_df.csv", col_types = cols("chromosome_name"="c")) | |
# hg38_biomart_df <- full_join(hg38_biomart_feature_df, hg38_biomart_structure_df, by=c("ensembl_transcript_id")) | |
biomart_genes_grange <- makeGRangesFromDataFrame(hg38_biomart_feature_df %>% | |
mutate(chrom=paste("chr", chromosome_name, sep = "")) %>% | |
dplyr::select(chrom, start=start_position, end=end_position)) | |
cpg_probes_grange <- makeGRangesFromDataFrame(filtered_site_annotation_df %>% | |
mutate(end=hg38_pos) %>% | |
dplyr::select(chrom=hg38_chromosome, start=hg38_pos, end)) | |
nearest_gene_idx <- nearest(cpg_probes_grange, biomart_genes_grange) | |
nearest_gene_distances_df <- as_tibble(distanceToNearest(cpg_probes_grange, biomart_genes_grange)) | |
nearest_ensembl_genes <- hg38_biomart_feature_df[nearest_gene_idx,]$ensembl_gene_id | |
nearest_ensembl_genes[nearest_gene_distances_df$distance > 5000] <- "" | |
filtered_site_annotation_df$hg38_ensembl_gene_id <- nearest_ensembl_genes | |
tsl_levels <- unique(filter(hg38_biomart_feature_df, grepl("tsl1", transcript_tsl))$transcript_tsl) | |
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tsl2", transcript_tsl))$transcript_tsl)) | |
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tsl3", transcript_tsl))$transcript_tsl)) | |
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tsl4", transcript_tsl))$transcript_tsl)) | |
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tsl5", transcript_tsl))$transcript_tsl)) | |
tsl_levels <- c(tsl_levels, unique(filter(hg38_biomart_feature_df, grepl("tslna", transcript_tsl))$transcript_tsl)) | |
appris_levels <- unique(filter(hg38_biomart_feature_df, grepl("principal1", transcript_appris))$transcript_appris) | |
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("principal2", transcript_appris))$transcript_appris)) | |
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("principal3", transcript_appris))$transcript_appris)) | |
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("principal4", transcript_appris))$transcript_appris)) | |
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("alternative1", transcript_appris))$transcript_appris)) | |
appris_levels <- c(appris_levels, unique(filter(hg38_biomart_feature_df, grepl("alternative2", transcript_appris))$transcript_appris)) | |
appris_levels <- c(appris_levels, "") | |
# i <- 0 | |
# total <- nrow(filtered_site_annotation_df) | |
canonical_transcripts <- hg38_biomart_feature_df %>% group_by(ensembl_gene_id) %>% | |
arrange(factor(transcript_tsl, levels = tsl_levels), | |
factor(transcript_appris, levels = appris_levels), | |
desc(transcript_length), | |
) %>% | |
filter(row_number()==1) | |
write_csv(canonical_transcripts, "hg38_canonical_transcripts.csv") | |
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id) %>% | |
mutate(chrom=paste("chr", chromosome_name, sep="")) | |
gene_groups_df <- na.omit(hg38_biomart_structure_df_canonical %>% | |
mutate(group="5_UTR") %>% | |
dplyr::select(chrom, start=`5_utr_start`, end=`5_utr_end`, group, ensembl_gene_id, external_gene_name)) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
mutate(group="3_UTR") %>% | |
dplyr::select(chrom, start=`3_utr_start`, end=`3_utr_end`, group, ensembl_gene_id, external_gene_name))) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
filter(strand == 1) %>% | |
distinct(ensembl_transcript_id, .keep_all = T) %>% | |
mutate(group="TSS200", start=(transcription_start_site - 200), end=transcription_start_site-1) %>% | |
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name))) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
filter(strand == 1) %>% | |
distinct(ensembl_transcript_id, .keep_all = T) %>% | |
mutate(group="TSS1500", start=(transcription_start_site - 1500), end=(transcription_start_site - 201)) %>% | |
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name))) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
filter(strand == -1) %>% | |
distinct(ensembl_transcript_id, .keep_all = T) %>% | |
mutate(group="TSS200", start=(transcription_start_site+1), end=(transcription_start_site+200)) %>% | |
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name))) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
filter(strand == -1) %>% | |
distinct(ensembl_transcript_id, .keep_all = T) %>% | |
mutate(group="TSS1500", start=(transcription_start_site + 201), end=(transcription_start_site + 1500)) %>% | |
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name))) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
filter(strand == 1) %>% | |
distinct(ensembl_transcript_id, .keep_all = T) %>% | |
mutate(group="OutsideTranscript3", start=(transcript_end+1), end=(transcript_end + 5000)) %>% | |
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name))) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
filter(strand == -1) %>% | |
distinct(ensembl_transcript_id, .keep_all = T) %>% | |
mutate(group="OutsideTranscript3",start=(transcript_start - 5000), end=(transcript_start-1)) %>% | |
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name))) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
filter(strand == 1) %>% | |
distinct(ensembl_transcript_id, .keep_all = T) %>% | |
mutate(group="OutsideTranscript5", start=(transcript_start - 5000), end=(transcript_start -1501)) %>% | |
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name))) | |
gene_groups_df <- bind_rows(gene_groups_df, na.omit(hg38_biomart_structure_df_canonical %>% | |
filter(strand == -1) %>% | |
distinct(ensembl_transcript_id, .keep_all = T) %>% | |
mutate(group="OutsideTranscript5", start=(transcript_end + 1501), end=(transcript_end + 5000)) %>% | |
dplyr::select(chrom, start, end, group, ensembl_gene_id, external_gene_name))) | |
test_df <- right_join(filtered_site_annotation_df, gene_groups_df, by=c("hg38_ensembl_gene_id"="ensembl_gene_id")) %>% | |
filter((hg38_pos >= start & hg38_pos <= end)) | |
filtered_site_annotation_df_test <- left_join(filtered_site_annotation_df, test_df %>% dplyr::select(rowname, hg38_gene_group=group), by="rowname") | |
filtered_site_annotation_df_test <- left_join(filtered_site_annotation_df_test, | |
hg38_biomart_structure_df_canonical %>% dplyr::select(ensembl_gene_id, hg38_gene_name=external_gene_name) %>% distinct(ensembl_gene_id, .keep_all = T), | |
by=c("hg38_ensembl_gene_id"="ensembl_gene_id")) | |
filtered_site_annotation_df_test[ | |
(filtered_site_annotation_df_test$hg38_ensembl_gene_id != "") & is.na(filtered_site_annotation_df_test$hg38_gene_group), | |
"hg38_gene_group"] <- "Body" | |
filtered_site_annotation_df <- filtered_site_annotation_df_test | |
write_csv(filtered_site_annotation_df, "filtered_site_annotation_df.csv") | |
x_annotation_df <- filtered_site_annotation_df[filtered_site_annotation_df$hg38_chromosome == "chrX", ] | |
write_csv(x_annotation_df, "x_annotation.csv") | |
x_sites <- x_annotation_df$Name | |
# compute predicted sex using getSex function from minfi | |
sex_prediction <- as.data.frame(getSex(filtered_combined_arrays))[sampleNames(combined_arrays), ] | |
# get all sample names that have a predicted sex of female, and an annotated sex of female | |
all_female_sample_names <- sampleNames(filtered_combined_arrays)[(filtered_combined_arrays$Keep %in% c("TRUE")) & | |
(filtered_combined_arrays$gender == "female") & | |
(sex_prediction$predictedSex == "F")] | |
all_male_sample_names <- sampleNames(filtered_combined_arrays)[(filtered_combined_arrays$Keep %in% c("TRUE")) & | |
(filtered_combined_arrays$gender == "male") & | |
(sex_prediction$predictedSex == "M")] | |
all_beta_mat <- getBeta(filtered_combined_arrays) | |
write_csv(as.data.frame(all_beta_mat) %>% rownames_to_column(), "all_beta_mat.csv.gz") | |
# get a matrix of the beta values for all the female samples we will keep | |
female_beta_mat <- na.omit(all_beta_mat[, all_female_sample_names]) | |
write_csv(as.data.frame(female_beta_mat) %>% rownames_to_column(), "female_beta_mat.csv.gz") | |
# get a matrix of the beta values for all the male samples we will keep | |
male_beta_mat <- na.omit(all_beta_mat[, all_male_sample_names]) | |
write_csv(as.data.frame(male_beta_mat) %>% rownames_to_column(), "male_beta_mat.csv.gz") | |
sample_annotation_df <- as.data.frame(pData(filtered_combined_arrays)) | |
sample_annotation_df[c(all_female_sample_names, all_male_sample_names), "gender_matches"] <- TRUE | |
sample_annotation_df <- rownames_to_column(sample_annotation_df, var = "sample_name") | |
write.csv(sample_annotation_df, file="sample_data_combined_all_info.csv") | |
``` | |
Next we will look at the female and male samples to identify outliers, and remove them. | |
```{r} | |
extrafont::loadfonts(device="win") | |
library(ggplot2) | |
library(ggfortify) | |
library(grid) | |
library(gridExtra) | |
library(ggsci) | |
library(viridis) | |
# create a PCA plot of the female samples, and color by different attributes. | |
female_all_x_df <- female_beta_mat[rownames(female_beta_mat) %in% x_annotation_df$Name,] | |
female_avg_x_methylation_df <- data.frame(meanXMethylation=colMeans(na.omit(female_all_x_df))) | |
female_avg_x_methylation_df <- rownames_to_column(female_avg_x_methylation_df, var = "sample_name") | |
female_pca <- prcomp(t(na.omit(female_all_x_df))) | |
female_pca_df <- rownames_to_column(as.data.frame(female_pca$x), var = "sample_name") | |
female_pca_df <- left_join(female_pca_df, sample_annotation_df, by = "sample_name") | |
female_pca_df <- left_join(female_pca_df, female_avg_x_methylation_df, by = "sample_name") | |
female_pov <- sprintf("%2.2f%%", 100* female_pca$sdev^2/sum(female_pca$sdev^2)) | |
female_pc1_label <- sprintf("PC1 (%s)", female_pov[1]) | |
female_pc2_label <- sprintf("PC2 (%s)", female_pov[2]) | |
base_plot_theme <- theme(plot.title = element_text(hjust = 0.5, size=6, margin = margin(b=.5, unit = "mm"), face="bold"), | |
legend.position="bottom", | |
legend.justification = "right", | |
legend.margin = margin(t=0,r=0,b=0,l=0, unit="mm"), | |
legend.box.margin = margin(t=-3, unit="mm"), | |
legend.text = element_text(size=5, margin = margin(r=.5, l=0, unit="mm")), | |
legend.title = element_text(size=5, face="bold", margin = margin(r=1, unit="mm")), | |
legend.key.width = unit(2, units="mm"), | |
legend.key.height = unit(2, units="mm"), | |
legend.key.size = unit(.2, units = "mm"), | |
axis.title = element_text(size=6, margin=margin(t=0,r=0,b=0,l=0, unit = "mm")), | |
axis.text = element_text(size = 6), | |
plot.margin = unit(c(1,1,1,1), units="mm"), | |
panel.grid.major = element_line(size=.3), | |
panel.grid.minor = element_line(size=.2), | |
panel.border = element_rect(size = .3), | |
axis.ticks = element_line(size = .3)) | |
pca_female_by_group <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=group_accession), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=female_pc1_label, | |
y=female_pc2_label, | |
title = "Studies", | |
color="Study")+ | |
base_plot_theme + | |
scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828", "GSE34982", "GSE73938"), | |
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,3,4)]))+ | |
# scale_color_futurama() + | |
guides(color=guide_legend(nrow=3,byrow=TRUE)) + | |
theme(legend.key.width = unit(1, units="mm"), | |
legend.key.height = unit(1, units="mm"), | |
legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")), | |
legend.box.margin = margin(t=-3,l=-5, unit="mm"), | |
legend.spacing.y=unit(.2, "mm")) | |
pca_female_by_methylation <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=meanXMethylation, shape = (cell_line == "WA09")), size=.5) + | |
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) + | |
labs(x=female_pc1_label, | |
y=female_pc2_label, | |
title = "Mean X Methylation", | |
color="Methylation")+ | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.title = element_text(vjust = .95), | |
legend.box.margin = margin(t=-3,l=-6, unit="mm"), | |
legend.key.width = unit(3, units = "mm")) + | |
scale_color_viridis_c(breaks=c(.4,.5,.6), minor_breaks=waiver()) | |
#add color for H9 | |
pca_female_by_cell_line <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=cell_line, shape = (cell_line == "WA09")), size=.5) + | |
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F)) + | |
geom_point(data = female_pca_df %>% filter(cell_line == "WA09"), | |
mapping = aes(x=PC1, y=PC2), size=.5, color="#000000", shape=17) + | |
labs(x=female_pc1_label, | |
y=female_pc2_label, | |
title="Cell Lines", | |
shape="Cell Line") + | |
theme_bw() + | |
base_plot_theme + | |
scale_color_discrete(guide=F) | |
pca_female_by_cell_type <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=cell_type), size=.5, shape=20) + | |
labs(x=female_pc1_label, | |
y=female_pc2_label, | |
title = "Cell Type", | |
color="Cell Type")+ | |
theme_bw() + | |
base_plot_theme + | |
scale_color_startrek() | |
arranged_female_pca <- grid.arrange(pca_female_by_methylation, | |
pca_female_by_group, | |
pca_female_by_cell_type, | |
pca_female_by_cell_line, | |
top = textGrob("PCA of Female Samples",gp=gpar(fontsize=7, fontface="bold")), nrow=1) | |
ggsave("figs/supplementary/female_pca.pdf", arranged_female_pca, width = 139.2, height=41, units = "mm") | |
``` | |
## Removal of overrepresented H9 (a.k.a. WA09) lines from female samples | |
```{r} | |
h9_lines <- filter(sample_annotation_df, cell_line == 'WA09' & gender_matches == T)$sample_name | |
female_all_x_df_only_h9 <- dplyr::select(as.data.frame(female_all_x_df), all_of(h9_lines)) | |
write_csv(female_all_x_df_only_h9, "female_all_x_df_only_h9.csv.gz") | |
female_sig_x_sites_df_only_h9 <- female_all_x_df_only_h9[sig_x_sites,] | |
write_csv(female_sig_x_sites_df_only_h9 %>% rownames_to_column(), "female_sig_x_sites_df_only_h9.csv.gz") | |
female_all_x_df_no_h9 <- dplyr::select(as.data.frame(female_all_x_df), -h9_lines) | |
female_no_h9_pca <- prcomp(t(na.omit(female_all_x_df_no_h9))) | |
female_no_h9_pca_df <- rownames_to_column(as.data.frame(female_no_h9_pca$x), var = "sample_name") | |
female_no_h9_pca_df <- left_join(female_no_h9_pca_df, sample_annotation_df, by = "sample_name") | |
female_no_h9_pca_df <- left_join(female_no_h9_pca_df, female_avg_x_methylation_df, by = "sample_name") | |
# female_no_h9_pca_df <- left_join(female_no_h9_pca_df, tidy(colMeans(female_all_x_df_no_h9)) %>% rename(sample_name=names, meanXMethylation=x), by = "sample_name") | |
female_no_h9_pov <- sprintf("%2.2f%%", 100* female_no_h9_pca$sdev^2/sum(female_no_h9_pca$sdev^2)) | |
female_no_h9_pc1_label <- sprintf("PC1 (%s)", female_no_h9_pov[1]) | |
female_no_h9_pc2_label <- sprintf("PC2 (%s)", female_no_h9_pov[2]) | |
pca_female_no_h9_by_group <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=group_accession), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title = "Studies", | |
color="Study")+ | |
base_plot_theme + | |
theme(legend.key.width = unit(1, units="mm"), | |
legend.key.height = unit(1, units="mm"), | |
legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")), | |
legend.box.margin = margin(t=-3,l=-5, unit="mm"), | |
legend.spacing.y=unit(.2, "mm")) + | |
scale_color_futurama() | |
pca_female_no_h9_by_methylation <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=meanXMethylation), size=.5, shape=20) + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title = "Mean X Methylation", | |
color="Methylation")+ | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.title = element_text(vjust = .95, margin=margin(l=-3, unit="mm")), | |
legend.box.margin = margin(t=-3,l=-10, unit="mm"), | |
legend.key.width = unit(3, units = "mm")) + | |
scale_color_viridis_c(breaks=c(.4,.5,.6), minor_breaks=waiver()) | |
pca_female_no_h9_by_cell_line <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=cell_line, shape = (cell_line == "WA09")), size=.5) + | |
scale_shape_manual(values = c(16, 17), labels=c("Other", "H9")) + | |
geom_point(data = female_no_h9_pca_df %>% filter(cell_line == "WA09"), | |
mapping = aes(x=PC1, y=PC2), size=.5, color="#000000", shape=17) + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title="Cell Lines", | |
shape="Cell Line") + | |
theme_bw() + | |
base_plot_theme + | |
scale_color_discrete(guide=F) | |
pca_female_no_h9_by_cell_type <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=cell_type), size=.5, shape=20) + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title = "Cell Type", | |
color="Cell Type")+ | |
theme_bw() + | |
base_plot_theme + | |
scale_color_startrek() | |
arranged_female_no_h9_pca <- grid.arrange(pca_female_no_h9_by_methylation, | |
pca_female_no_h9_by_group, | |
pca_female_no_h9_by_cell_type, | |
pca_female_no_h9_by_cell_line, | |
top = textGrob("PCA of Female Samples - No H9",gp=gpar(fontsize=6))) | |
ggsave("figs/supplementary/female_no_h9_pca.pdf",arranged_female_no_h9_pca, width = 80, height=85, units = 'mm') | |
arranged_female_no_h9_pca <- grid.arrange(pca_female_no_h9_by_methylation, | |
top = textGrob("Female - No H9", gp=gpar(fontsize=7, fontface="bold"))) | |
ggsave("figs/supplementary/female_no_h9_pca_methylation.svg",arranged_female_no_h9_pca, width = 34.8, height=41, units = 'mm') | |
``` | |
## Create PCA Plots of the male samples | |
```{r,dpi=600, fig.width=7, fig.height=7} | |
male_all_x_df <- as.data.frame(male_beta_mat[rownames(male_beta_mat) %in% x_annotation_df$Name,]) | |
male_avg_x_methylation_df <- data.frame(meanXMethylation=colMeans(na.omit(male_all_x_df))) | |
male_avg_x_methylation_df <- rownames_to_column(male_avg_x_methylation_df, var = "sample_name") | |
male_pca <- prcomp(t(na.omit(male_all_x_df))) | |
male_pca_df <- rownames_to_column(as.data.frame(male_pca$x), var = "sample_name") | |
male_pca_df <- left_join(male_pca_df, sample_annotation_df, by = "sample_name") | |
male_pca_df <- left_join(male_pca_df, male_avg_x_methylation_df, by = "sample_name") | |
male_pov <- sprintf("%2.2f%%", 100* male_pca$sdev^2/sum(male_pca$sdev^2)) | |
male_pc1_label <- sprintf("PC1 (%s)", male_pov[1]) | |
male_pc2_label <- sprintf("PC2 (%s)", male_pov[2]) | |
pca_male_by_group <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=group_accession), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=male_pc1_label, | |
y=male_pc2_label, | |
title = "Studies", | |
color="Study")+ | |
base_plot_theme + | |
guides(color=guide_legend(nrow=2,byrow=TRUE)) + | |
theme(legend.key.width = unit(1, units="mm"), | |
legend.key.height = unit(1, units="mm"), | |
legend.text = element_text(size=4, margin = margin(r=.5, l=-1, unit="mm")), | |
legend.box.margin = margin(t=-3,l=-5, unit="mm"), | |
legend.spacing.y=unit(.2, "mm")) + | |
scale_color_manual(limits=c("GSE110544", "GSE31848", "GSE59091", "GSE60924", "GSE72923", "GSE85828"), | |
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2)])) | |
pca_male_by_methylation <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=meanXMethylation), size=.5, shape=20) + | |
labs(x=male_pc1_label, | |
y=male_pc2_label, | |
title = "Mean X Methylation", | |
color="Methylation")+ | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.title = element_text(vjust = .95), | |
legend.box.margin = margin(t=-3,l=-6, unit="mm"), | |
legend.key.width = unit(3, units = "mm")) + | |
scale_color_viridis_c() | |
pca_male_by_cell_line <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=cell_line), size=.5, shape=20) + | |
labs(x=male_pc1_label, | |
y=male_pc2_label, | |
title="Cell Lines", | |
shape="Cell Line") + | |
theme_bw() + | |
base_plot_theme + | |
scale_color_discrete(guide=F) | |
pca_male_by_cell_type <- ggplot(male_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=cell_type), size=.5, shape=20) + | |
labs(x=male_pc1_label, | |
y=male_pc2_label, | |
title = "Cell Type", | |
color="Cell Type")+ | |
theme_bw() + | |
base_plot_theme + | |
scale_color_startrek() | |
arranged_male_pca <- grid.arrange(pca_male_by_methylation, | |
pca_male_by_group, | |
pca_male_by_cell_type, | |
pca_male_by_cell_line, | |
top = textGrob("PCA of Male Samples", gp=gpar(fontsize=7, fontface="bold")), nrow=1) | |
ggsave("figs/supplementary/male_pca.pdf", arranged_male_pca, width = 139.2, height=41, units = 'mm') | |
``` | |
## Identification and removal of male outliers | |
```{r} | |
male_outlier_sample_names <- (male_pca_df %>% filter(PC1 > 20))$sample_name | |
male_no_outliers_all_x_df <- as.data.frame(male_beta_mat[rownames(male_beta_mat) %in% x_annotation_df$Name,]) | |
male_no_outliers_all_x_df <- dplyr::select(male_all_x_df, -male_outlier_sample_names) | |
male_no_outliers_pca <- prcomp(t(na.omit(male_no_outliers_all_x_df))) | |
male_no_outliers_pca_df <- rownames_to_column(as.data.frame(male_no_outliers_pca$x), var = "sample_name") | |
male_no_outliers_pca_df <- left_join(male_no_outliers_pca_df, sample_annotation_df, by = "sample_name") | |
male_no_outliers_pca_df <- left_join(male_no_outliers_pca_df, male_avg_x_methylation_df, by = "sample_name") | |
male_no_outliers_pov <- sprintf("%2.2f%%", 100* male_no_outliers_pca$sdev^2/sum(male_no_outliers_pca$sdev^2)) | |
male_no_outliers_pc1_label <- sprintf("PC1 (%s)", male_no_outliers_pov[1]) | |
male_no_outliers_pc2_label <- sprintf("PC2 (%s)", male_no_outliers_pov[2]) | |
pca_male_no_outliers_by_group <- ggplot(male_no_outliers_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=group_accession), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=male_no_outliers_pc1_label, | |
y=male_no_outliers_pc2_label, | |
title = "Studies", | |
color="Study")+ | |
base_plot_theme + | |
scale_color_futurama() | |
pca_male_no_outliers_by_methylation <- ggplot(male_no_outliers_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=meanXMethylation), size=.5, shape=20) + | |
labs(x=male_no_outliers_pc1_label, | |
y=male_no_outliers_pc2_label, | |
title = "Mean X Methylation", | |
color="Methylation")+ | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.title = element_text(vjust = .95), | |
legend.box.margin = margin(t=-3,l=-6, unit="mm"), | |
legend.key.width = unit(3, units = "mm")) + | |
scale_color_viridis_c() | |
pca_male_no_outliers_by_cell_line <- ggplot(male_no_outliers_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=cell_line), size=.5, shape=20) + | |
labs(x=male_no_outliers_pc1_label, | |
y=male_no_outliers_pc2_label, | |
title="Cell Lines", | |
shape="Cell Line") + | |
theme_bw() + | |
base_plot_theme + | |
scale_color_discrete(guide=F) | |
pca_male_no_outliers_by_cell_type <- ggplot(male_no_outliers_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=cell_type), size=.5, shape=20) + | |
labs(x=male_no_outliers_pc1_label, | |
y=male_no_outliers_pc2_label, | |
title = "Cell Type", | |
color="Cell Type")+ | |
theme_bw() + | |
base_plot_theme + | |
scale_color_startrek() | |
arranged_male_no_outliers_pca <- grid.arrange(pca_male_no_outliers_by_methylation, | |
pca_male_no_outliers_by_group, | |
pca_male_no_outliers_by_cell_type, | |
pca_male_no_outliers_by_cell_line, | |
top = textGrob("PCA of Male Samples - No outliers",gp=gpar(fontsize=6))) | |
ggsave("figs/supplementary/male_no_outliers_pca.pdf", arranged_male_no_outliers_pca, width = 80, height=85, units = 'mm') | |
arranged_male_no_outliers_pca <- grid.arrange(pca_male_no_outliers_by_methylation, | |
top = textGrob("Male - No outliers",gp=gpar(fontsize=7, fontface="bold"))) | |
ggsave("figs/supplementary/male_no_outliers_pca_meth.svg", arranged_male_no_outliers_pca, width = 34.8, height=40, units = 'mm') | |
``` | |
```{r} | |
library(data.table) | |
# have to use fwrite from package data.table because write.csv couldn't handle the large matrix. This will save the data to a file. | |
female_df_no_h9 <- as.data.frame(female_beta_mat) | |
female_df_no_h9 <- dplyr::select(female_df_no_h9, -h9_lines) | |
female_df_no_h9_chrX <- female_df_no_h9[x_annotation_df$Name,] | |
female_df_no_h9 <- as.matrix(female_df_no_h9) | |
female_df_no_h9_chrX <- as.matrix(female_df_no_h9_only_x) | |
fwrite(as.data.table(na.omit(female_df_no_h9), | |
keep.rownames=TRUE, | |
na.rm=TRUE), | |
file='female_df_no_h9.csv', | |
row.names = FALSE) | |
fwrite(as.data.table(na.omit(female_df_no_h9_chrX), | |
keep.rownames=TRUE, | |
na.rm=TRUE), | |
file='female_df_no_h9_chrX.csv', | |
row.names = FALSE) | |
male_df_no_outliers <- as.data.frame(male_beta_mat) | |
male_df_no_outliers <- dplyr::select(male_df_no_outliers, -male_outlier_sample_names) | |
male_df_no_outliers <- as.matrix(male_df_no_outliers) | |
fwrite(as.data.table(na.omit(male_df_no_outliers), keep.rownames=TRUE, na.rm=TRUE), | |
file='male_df_no_outliers.csv', | |
row.names = FALSE) | |
``` | |
Now we look at the distribution of values in female and male samples. | |
```{r} | |
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz") | |
male_df_no_outliers <- read_csv("male_df_no_outliers.csv.gz") | |
x_annotation_df <- read_csv("x_annotation.csv.gz") | |
male_df_no_outliers_chrX <- male_df_no_outliers %>% filter(rn %in% x_annotation_df$rowname) | |
xchr_beta_dist <- ggplot() + | |
geom_violin(aes(x="female", y=as_vector(female_df_no_h9_chrX %>% select(-rn)), fill='female'), trim=FALSE, show.legend = F, size=.3) + | |
geom_violin(aes(x="male", y=as_vector(male_df_no_outliers_chrX %>% select(-rn)), fill='male'), trim=FALSE, show.legend = F, size=.3) + | |
geom_boxplot(aes(x="female", y=as_vector(female_df_no_h9_chrX %>% select(-rn))), width=0.025, size=.3) + | |
geom_boxplot(aes(x="male", y=as_vector(male_df_no_outliers_chrX %>% select(-rn))), width=0.025, size=.3) + | |
labs(x="Sex", | |
y='β-value', | |
title="X-Chr CpG β-Values") + | |
theme_bw() + | |
base_plot_theme + | |
scale_fill_npg() | |
ggsave('figs/main/fig1_xchr_beta_value_distributions.pdf',xchr_beta_dist, width = 36, height = 40, units = 'mm') | |
xchr_beta_dist | |
``` | |
## Comparing female lines to male lines | |
Here we compare the female and male lines to observe the differences between them. | |
```{r} | |
library(broom) | |
# base_plot_theme <- theme(plot.title = element_text(hjust = 0.5, size=9), | |
# legend.position="bottom", | |
# legend.justification = "right", | |
# legend.title = element_text(size = 6, margin = margin(r=2, unit="mm")), | |
# legend.text = element_text(size = 6, margin = margin(r=2, unit="mm")), | |
# legend.key.height = unit(2, "mm"), | |
# legend.key.width = unit(1, "mm"), | |
# legend.margin = margin(t=0,r=0,b=0,l=0, unit="mm"), | |
# axis.title = element_text(size=8), | |
# axis.text = element_text(size = 6)) | |
# all_beta_mat_subset <- all_beta_mat[x_annotation_df$Name, setdiff(c(colnames(female_all_x_df_no_h9), colnames(male_no_outliers_all_x_df)), c('Chromosome', 'Gene', 'Position'))] | |
male_female_beta_mat <- full_join(female_df_no_h9_chrX, male_df_no_outliers_chrX, by="rn") %>% select(-rn) | |
female_male_avg_x_methylation_df <- colMeans(na.omit(male_female_beta_mat)) %>% | |
tidy() %>% | |
dplyr::rename(sample_name=names, meanXMethylation=x) | |
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv") | |
female_male_pca <- prcomp(t(na.omit(male_female_beta_mat))) | |
female_male_pca_df <- rownames_to_column(as.data.frame(female_male_pca$x), var = "sample_name") | |
female_male_pca_df <- left_join(female_male_pca_df, sample_annotation_df, by = "sample_name") | |
female_male_pca_df <- left_join(female_male_pca_df, female_male_avg_x_methylation_df, by = "sample_name") | |
female_male_pov <- sprintf("%2.2f%%", 100* female_male_pca$sdev^2/sum(female_male_pca$sdev^2)) | |
female_male_pc1_label <- sprintf("PC1 (%s)", female_male_pov[1]) | |
female_male_pc2_label <- sprintf("PC2 (%s)", female_male_pov[2]) | |
female_male_pca_plot <- ggplot(female_male_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=gender), size=.5, shape=20) + | |
labs(x=female_male_pc1_label, | |
y=female_male_pc2_label, | |
title = "PCA of Female and Male", | |
color="Sex")+ | |
theme_bw() + | |
base_plot_theme + | |
scale_color_npg() | |
ggsave("figs/main/fig1_female_male_pca.pdf", female_male_pca_plot, height = 40, width = 36, units = 'mm') | |
female_male_pca_plot | |
``` | |
## Looking at global methylation patterns | |
```{r} | |
library(tidyverse) | |
library(ggsignif) | |
library(ggsci) | |
library(viridis) | |
library(grid) | |
library(gridExtra) | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv.gz") | |
male_df_no_outliers <- read_csv("male_df_no_outliers.csv.gz") | |
x_annotation_df <- read_csv("x_annotation.csv.gz") | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz") | |
sex_chromosome_sites <- (filtered_site_annotation_df %>% filter(hg38_chromosome %in% c("chrX", "chrY")))$rowname | |
female_means <- colMeans(female_df_no_h9 %>% dplyr::select(-rn)) | |
female_x_chr_means <- colMeans(female_df_no_h9 %>% filter(rn %in% x_annotation_df$rowname) %>% dplyr::select(-rn)) | |
xist_sites <- (x_annotation_df %>% filter(grepl("XIST",UCSC_RefGene_Name)))$rowname | |
female_xist_means <- colMeans(female_df_no_h9 %>% filter(rn %in% xist_sites) %>% dplyr::select(-rn)) | |
female_autosome_means <- colMeans(female_df_no_h9 %>% filter(!(rn %in% sex_chromosome_sites)) %>% dplyr::select(-rn)) | |
male_means <- colMeans(male_df_no_outliers %>% dplyr::select(-rn)) | |
male_x_chr_means <- colMeans(male_df_no_outliers %>% filter(rn %in% x_annotation_df$rowname) %>% dplyr::select(-rn)) | |
male_xist_means <- colMeans(male_df_no_outliers %>% filter(rn %in% xist_sites) %>% dplyr::select(-rn)) | |
male_autosome_means <- colMeans(male_df_no_outliers %>% filter(!(rn %in% sex_chromosome_sites)) %>% dplyr::select(-rn)) | |
means <- tibble( | |
sex=character(), | |
sample_name=character(), | |
mean_methylation=numeric(), | |
mean_x_methylation=numeric(), | |
mean_xist_methylation=numeric(), | |
mean_autosome_methylation=numeric() | |
) | |
for (sample in names(female_means)){ | |
means <- add_row(means, sex="female", | |
sample_name=sample, | |
mean_methylation=female_means[sample], | |
mean_x_methylation=female_x_chr_means[sample], | |
mean_xist_methylation=female_xist_means[sample], | |
mean_autosome_methylation=female_autosome_means[sample]) | |
} | |
for (sample in names(male_means)){ | |
means <- add_row(means, sex="male", | |
sample_name=sample, | |
mean_methylation=male_means[sample], | |
mean_x_methylation=male_x_chr_means[sample], | |
mean_xist_methylation=male_xist_means[sample], | |
mean_autosome_methylation=male_autosome_means[sample]) | |
} | |
lm_eqn <- function(m) { | |
# Displays regression line equation and R^2 value on plot | |
# Usage: | |
# p + annotate("text", x=25, y=300, label=lm_eqn(lm(y ~ x, df)), parse=TRUE) | |
l <- list(a = format(coef(m)[1], digits = 2), | |
b = format(abs(coef(m)[2]), digits = 2), | |
r2 = format(summary(m)$r.squared, digits = 3)); | |
if (coef(m)[2] >= 0) { | |
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l) | |
} else { | |
eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l) | |
} | |
as.character(as.expression(eq)); | |
} | |
global_mean_methylation <- ggplot(means, aes(x=sex, group=sex, y=mean_methylation, fill=sex)) + | |
geom_boxplot(show.legend = F, size=.3, outlier.size = .5, outlier.shape = 20) + | |
geom_signif(map_signif_level=TRUE, comparisons = list(c("female", "male")), size=.3, textsize = 2) + | |
labs(x="Sex", y="Global Mean β-values", title = "Global Mean\nMethylation") + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.position = "none") + | |
scale_y_continuous(limits = c(0.4,0.65)) + | |
scale_fill_npg() | |
autosomal_mean_methylation <- ggplot(means, aes(x=sex, group=sex, y=mean_autosome_methylation, fill=sex)) + | |
geom_boxplot(show.legend = F, size=.3, outlier.size = .5, outlier.shape = 20) + | |
geom_signif(map_signif_level=TRUE, comparisons = list(c("female", "male")), size=.3, textsize = 2) + | |
labs(x="Sex", y="Autosomal Mean Methylation β-value", title = "Autosomal Mean Methylation By Sex") + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.position = "none") + | |
scale_y_continuous(limits = c(0.4,0.65)) + | |
scale_fill_npg() | |
# ggsave("figs/supplementary/Male vs Female Global Methylation.svg", units = "mm", width = 84, height = 84) | |
female_means <- means %>% filter(sex=="female") | |
sex_colors <- pal_npg("nrc")(2) | |
correlation <- sprintf("corr = %.2f", cor(female_means$mean_x_methylation, female_means$mean_methylation)) | |
female_x_global <- ggplot(female_means, aes(x=mean_x_methylation, y=mean_methylation)) + | |
geom_point(color=sex_colors[1],size=.7) + | |
labs(x="Mean X Chr Methylation", y="Mean Global Methylation", title="Female Samples - X Chr vs Global Meth") + | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_methylation ~ mean_x_methylation, female_means)), parse=TRUE) + | |
annotate("text", x=.55, y=.45, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Female Samples - X Chr vs Global Meth.png") | |
correlation <- sprintf("corr = %.2f", cor(female_means$mean_x_methylation, female_means$mean_autosome_methylation)) | |
female_x_autosome <- ggplot(female_means, aes(x=mean_x_methylation, y=mean_autosome_methylation)) + | |
geom_point(color=sex_colors[1],size=.7) + | |
labs(x="Mean X Chr Methylation", y="Mean Autosomal Methylation", title="Female Samples - X Chr vs Autosomal Meth") + | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) + | |
annotate("text", x=.55, y=.45, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Female Samples - X Chr vs Autosomal Meth.svg", height=55, width=84, units = "mm") | |
correlation <- sprintf("corr = %.2f", cor(female_means$mean_x_methylation, female_means$mean_xist_methylation)) | |
female_x_xist <- ggplot(means %>% filter(sex=="female"), aes(x=mean_x_methylation, y=mean_xist_methylation)) + | |
geom_point(color=sex_colors[1],size=.5, shape=20) + | |
labs(x="Mean X Chr Methylation", y="Mean XIST Methylation", title="Female Samples - X Chr vs XIST Meth") + | |
# geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.43, y=.77, label=lm_eqn(lm(mean_xist_methylation ~ mean_x_methylation, female_means)), parse=TRUE) + | |
# annotate("text", x=.43, y=.75, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
ggsave("figs/main/fig1_female_x_v_xist.svg",female_x_xist, height=60, width=58, units = "mm") | |
correlation <- sprintf("corr = %.2f", cor(female_means$mean_methylation, female_means$mean_xist_methylation)) | |
female_xist_global <- ggplot(means %>% filter(sex=="female"), aes(x=mean_xist_methylation, y=mean_methylation)) + | |
geom_point(color=sex_colors[1],size=.7) + | |
labs(x="Mean XIST Methylation", y="Mean Global Methylation", title="Female Samples - XIST vs Global Meth")+ | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.73, y=.48, label=lm_eqn(lm(mean_methylation ~ mean_xist_methylation, female_means)), parse=TRUE) + | |
annotate("text", x=.73, y=.47, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Female Samples - XIST vs Global Meth.png") | |
correlation <- sprintf("corr = %.2f", cor(female_means$mean_autosome_methylation, female_means$mean_xist_methylation)) | |
female_xist_autosome <- ggplot(means %>% filter(sex=="female"), aes(x=mean_xist_methylation, y=mean_autosome_methylation)) + | |
geom_point(color=sex_colors[1],size=.7) + | |
labs(x="Mean XIST Methylation", y="Mean Autosomal Methylation", title="Female Samples - XIST vs Autosomal Meth")+ | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.73, y=.48, label=lm_eqn(lm(mean_autosome_methylation ~ mean_xist_methylation, female_means)), parse=TRUE) + | |
annotate("text", x=.73, y=.47, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Female Samples - XIST vs Autosomal Meth.png") | |
male_means <- means %>% filter(sex=="male") | |
correlation <- sprintf("corr = %.2f", cor(male_means$mean_x_methylation, male_means$mean_methylation)) | |
male_x_global <- ggplot(male_means, aes(x=mean_x_methylation, y=mean_methylation)) + | |
geom_point(color=sex_colors[2],size=.7) + | |
labs(x="Mean X Chr Methylation", y="Mean Global Methylation", title="Male Samples - X Chr vs Global Meth") + | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.46, y=.49, label=lm_eqn(lm(mean_methylation ~ mean_x_methylation, male_means)), parse=TRUE) + | |
annotate("text", x=.46, y=.48, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Male Samples - X Chr vs Global Meth.png") | |
correlation <- sprintf("corr = %.2f", cor(male_means$mean_x_methylation, male_means$mean_autosome_methylation)) | |
male_x_autosome <- ggplot(male_means, aes(x=mean_x_methylation, y=mean_autosome_methylation)) + | |
geom_point(color=sex_colors[2],size=.7) + | |
labs(x="Mean X Chr Methylation", y="Mean Autosomal Methylation", title="Male Samples - X Chr vs Autosomal Meth") + | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.46, y=.49, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, male_means)), parse=TRUE) + | |
annotate("text", x=.46, y=.48, label=correlation, size=3)+ | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Male Samples - X Chr vs Autosomal Meth.svg", height=55, width=84, units = "mm") | |
correlation <- sprintf("corr = %.2f", cor(male_means$mean_x_methylation, male_means$mean_xist_methylation)) | |
male_x_xist <- ggplot(means %>% filter(sex=="male"), aes(x=mean_x_methylation, y=mean_xist_methylation)) + | |
geom_point(color=sex_colors[2],size=.7) + | |
labs(x="Mean X Chr Methylation", y="Mean XIST Methylation", title="Male Samples - X Chr vs XIST Meth") + | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.41, y=.92, label=lm_eqn(lm(mean_xist_methylation ~ mean_x_methylation, male_means)), parse=TRUE) + | |
annotate("text", x=.41, y=.91, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Male Samples - X Chr vs XIST Meth.png") | |
correlation <- sprintf("corr = %.2f", cor(male_means$mean_methylation, male_means$mean_xist_methylation)) | |
male_xist_global <- ggplot(means %>% filter(sex=="male"), aes(x=mean_xist_methylation, y=mean_methylation)) + | |
geom_point(color=sex_colors[2],size=.7) + | |
labs(x="Mean XIST Methylation", y="Mean Global Methylation", title="Male Samples - XIST vs Global Meth")+ | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.90, y=.475, label=lm_eqn(lm(mean_methylation ~ mean_xist_methylation, male_means)), parse=TRUE) + | |
annotate("text", x=.90, y=.465, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Male Samples - XIST vs Global Meth.png") | |
correlation <- sprintf("corr = %.2f", cor(male_means$mean_autosome_methylation, male_means$mean_xist_methylation)) | |
male_xist_autosome <- ggplot(means %>% filter(sex=="male"), aes(x=mean_xist_methylation, y=mean_autosome_methylation)) + | |
geom_point(color=sex_colors[2],size=.7) + | |
labs(x="Mean XIST Methylation", y="Mean Autosomal Methylation", title="Male Samples - XIST vs Autosomal Meth")+ | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.90, y=.48, label=lm_eqn(lm(mean_autosome_methylation ~ mean_xist_methylation, male_means)), parse=TRUE) + | |
annotate("text", x=.90, y=.47, label=correlation, size=3) + | |
theme_bw() + | |
base_plot_theme | |
# ggsave("figs/Male Samples - XIST vs Autosomal Meth.png") | |
supp_5 <- grid.arrange(female_x_global, male_x_global, | |
female_xist_global, male_xist_global, | |
female_xist_autosome, male_xist_autosome, | |
male_x_autosome, male_x_xist, nrow=4, padding=unit(4, "mm")) | |
ggsave("figs/supplementary/supp_5.pdf",supp_5, width=174, height=236, unit="mm") | |
``` | |
## Choosing sites that are more variant in the female lines | |
We run the Brown-Forsythe test using a custom program written in GoLang due to the advantages in its speed. | |
First we'll save our data from R into CSVs that can be read by the GoLang program. | |
Now we run the stats-runner-for-beta-matrix program written in GoLang on these CSV files, and get an output of variance_statistics_noh9_nooutliers.csv. This will calculate the variances and perform the brown-forsythe test. | |
```{bash} | |
./stats-runner-for-beta-matrix/stats-runner-for-beta-matrix -bf -a "female_df_no_h9.csv" -b "male_df_no_outliers.csv" -o "variance_statistics_noh9_nooutliers" | |
``` | |
Now we can bring the variance statistics back into R, and continue working with it. | |
```{r} | |
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv.gz') | |
x_sites <- x_annotation_df$rowname | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz") | |
sex_chromosome_sites <- (filtered_site_annotation_df %>% filter(hg38_chromosome %in% c("chrX", "chrY")))$rowname | |
variance_density_plot_autosome <- ggplot(statistics[!(statistics$Position %in% sex_chromosome_sites),]) + | |
geom_density(aes(x=femaleVariance, color="female", fill="female"), trim=F, size=.3, alpha=.2) + | |
geom_density(aes(x=maleVariance, color="male", fill="male"), trim=F, size=.3, alpha=.2) + | |
labs(title="Autosomal CpG β Variance", | |
x="β-Value Variance Across Samples", | |
y="Density") + | |
theme_bw() + | |
base_plot_theme + | |
scale_color_npg(name="Sex") + | |
scale_fill_npg(name="Sex") | |
variance_ks <- ks.test(ggplot_build(variance_density_plot_autosome)$data[[1]]$y, ggplot_build(variance_density_plot_autosome)$data[[2]]$y) | |
ks_dist <- sprintf("K.S. Distance = %.2f", as.numeric(variance_ks$statistic)) | |
variance_density_plot_autosome <- variance_density_plot_autosome + | |
annotate("text", x=.26, y=12, label=ks_dist, size=2) | |
ggsave("figs/main/fig1_variance_density_plot_autosome.pdf", variance_density_plot_autosome, width=36, height=40, units="mm") | |
# ks.test(filter(statistics, !(Position %in% sex_chromosome_sites))$femaleVariance, filter(statistics, !(Position %in% sex_chromosome_sites))$maleVariance) | |
ggsave("figs/main/fig1a.svg",variance_density_plot_autosome, width = 86.999, height = 78.666, units = "mm" ) | |
variance_density_plot <- ggplot(statistics[statistics$Position %in% x_sites,]) + | |
geom_density(aes(x=femaleVariance, color="female", fill="female"), trim=F, size=.3, alpha=.2) + | |
geom_density(aes(x=maleVariance, color="male", fill="male"), trim=F, size=.3, alpha=.2) + | |
labs(title="X-Chr CpG β Variance", | |
x="β-Value Variance Across Samples", | |
y="Density") + | |
theme_bw() + | |
base_plot_theme + | |
scale_color_npg(name="Sex") + | |
scale_fill_npg(name="Sex") | |
variance_ks <- ks.test(ggplot_build(variance_density_plot)$data[[1]]$y, ggplot_build(variance_density_plot)$data[[2]]$y) | |
ks_dist <- sprintf("K.S. Distance = %.2f*", as.numeric(variance_ks$statistic)) | |
variance_density_plot <- variance_density_plot + | |
annotate("text", x=.26, y=12, label=ks_dist, size=2) | |
ggsave("figs/main/fig1_variance_density_plot.pdf", variance_density_plot, width=36, height=40, units="mm") | |
# ks.test(filter(statistics, (Position %in% x_sites))$femaleVariance, filter(statistics, !(Position %in% sex_chromosome_sites))$maleVariance) | |
# ggsave("figs/x-chr-cpg-variance.pdf", height = 84, width = 84, units = 'mm') | |
sig_diff_x_v_no_x <- ggplot(statistics, aes(y=alpha, x=(Position %in% x_sites), fill=(Position %in% x_sites))) + | |
geom_boxplot(outlier.shape=NA, size=.4) + | |
labs(title="Brown-Forsythe\np-values", | |
x="Chromosome", | |
y="BF p-value") + | |
scale_x_discrete(labels=c("Autosomal", "X"), limits=c(FALSE, TRUE)) + | |
scale_fill_grey(start=.5, end=.8, guide=F) + | |
theme_bw() + | |
base_plot_theme #+ | |
# theme(axis.title.x=element_blank()) | |
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data | |
sig_x_sites <- intersect(positions_to_keep, x_sites) | |
print(length(sig_x_sites)) | |
sig_annotation <- x_annotation_df[x_annotation_df$Name %in% sig_x_sites,] | |
sig_female_beta_df <- female_df_no_h9 %>% filter(rn %in% sig_annotation$Name) | |
all_colnames <- colnames(sig_female_beta_df %>% select(-rn)) | |
sig_female_beta_df$Gene <- sig_annotation$hg38_gene_name | |
sig_female_beta_df$Chromosome <- sig_annotation$hg38_chromosome | |
sig_female_beta_df$Position <- sig_annotation$hg38_pos | |
sig_female_beta_df <- sig_female_beta_df[c('rn','Chromosome', 'Position', 'Gene', all_colnames)] | |
write.csv(sig_female_beta_df, file='sig_female_beta_df_no_h9.csv') | |
``` | |
## Beta value distributions after site selection | |
```{r} | |
unsig_female_beta_mat <- female_df_no_h9 %>% filter(rn %in% x_annotation_df$Name & !(rn %in% sig_annotation$Name)) | |
unsig_female_beta_vec <- as.vector(as.matrix(unsig_female_beta_mat %>% dplyr::select(-rn))) | |
sig_female_beta_vec <- as.vector(as.matrix(sig_female_beta_df %>% dplyr::select(-rn, -Chromosome, -Position, -Gene))) | |
sig_nonsig_dist_plot <- ggplot() + | |
geom_violin(aes(x="p<.01", y=sig_female_beta_vec, fill='female'), trim=FALSE, show.legend = F, size=.3) + | |
geom_violin(aes(x="p>.01", y=unsig_female_beta_vec, fill='male'), trim=FALSE, show.legend = F, size=.3) + | |
geom_boxplot(aes(x="p<.01", y=sig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) + | |
geom_boxplot(aes(x="p>.01", y=unsig_female_beta_vec), width=0.025, size=.3, outlier.shape=20, outlier.size=.4) + | |
labs(x="BF p-values", | |
y='DNAme β-value', | |
#title="β-values of sig. and \nnon-sig. variant sites\nin Female hPSCs") + | |
title="Probe β-values in Female\nhPSCs by Variance") + | |
scale_x_discrete(limits=c("p>.01", "p<.01")) + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.position = "none", | |
# axis.title.x=element_blank() | |
) + | |
scale_fill_futurama() | |
# ggsave('figs/beta value distributions - sig-unsig.png') | |
sig_nonsig_dist_plot | |
``` | |
## Calculating PPVs and correlation to avg X chromosome methylation | |
```{r} | |
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz") | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") | |
mean_x_methylation <- colMeans(female_df_no_h9_chrX %>% dplyr::select(-rn)) | |
mean_sig_x_methylation <- colMeans(female_df_no_h9_chrX %>% filter(rn %in% sig_x_sites) %>% dplyr::select(-rn)) | |
mean_nonsig_x_methylation <- colMeans(female_df_no_h9_chrX %>% filter((rn %in% x_sites) & !(rn %in% sig_x_sites)) %>% dplyr::select(-rn)) | |
mean_genome_methylation <- colMeans(female_df_no_h9 %>% dplyr::select(-rn)) | |
xist_sites <- (x_annotation_df %>% filter(hg38_gene_name == "XIST"))$rowname | |
mean_xist_methylation <- colMeans(female_df_no_h9 %>% filter(rn %in% xist_sites) %>% dplyr::select(-rn)) | |
cor.test(mean_x_methylation, mean_sig_x_methylation) | |
cor.test(mean_x_methylation, mean_genome_methylation) | |
cor.test(mean_x_methylation, mean_xist_methylation) | |
cor.test(mean_sig_x_methylation, mean_genome_methylation) | |
cor.test(mean_sig_x_methylation, mean_xist_methylation) | |
data_df <- tibble(mean_x = mean_x_methylation, mean_sig_x = mean_sig_x_methylation, mean_nonsig_x = mean_nonsig_x_methylation) | |
correlation <- sprintf("corr = %.2f", cor(mean_x_methylation, mean_sig_x_methylation)) | |
female_x_sigsites <- ggplot(data_df, aes(x=mean_x, y=mean_sig_x)) + | |
geom_point(color=sex_colors[1],size=.5, shape=20) + | |
labs(x="Mean X DNAme", y="Mean Sig X DNAme", title="Female hPSCs\nBF p<.01 X-probes") + | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) + | |
annotate("text", x=.45, y=.45, label=correlation, size=2) + | |
theme_bw() + | |
base_plot_theme | |
correlation <- sprintf("corr = %.2f", cor(mean_x_methylation, mean_nonsig_x_methylation)) | |
female_x_nonsigsites <- ggplot(data_df, aes(x=mean_x, y=mean_nonsig_x)) + | |
geom_point(color=sex_colors[1],size=.5, shape=20) + | |
labs(x="Mean X Chr DNAme", y="Mean Non-Sig X DNAme", title="Female hPSCs\n BF p>.01 X-probes") + | |
geom_smooth(method='lm', color="black", size=.5) + | |
# annotate("text", x=.55, y=.47, label=lm_eqn(lm(mean_autosome_methylation ~ mean_x_methylation, female_means)), parse=TRUE) + | |
annotate("text", x=.55, y=.6, label=correlation, size=2) + | |
theme_bw() + | |
base_plot_theme | |
supp_6 <- grid.arrange(global_mean_methylation, | |
sig_diff_x_v_no_x, sig_nonsig_dist_plot, | |
female_x_nonsigsites, female_x_sigsites, nrow = 1) | |
ggsave("figs/supplementary/supp_1_f_j.pdf", supp_6, width = 174, height=40, units = "mm") | |
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv") | |
# Trying to evaluate a pseudo-ppv based on passage numbers | |
female_sample_annotation_df <- sample_annotation_df %>% filter(sample_name %in% names(mean_x_methylation)) | |
# female_sample_annotation_df <- rownames_to_column(female_sample_annotation_df, var="sample_name") | |
# cell_line_counts <- filter(female_sample_annotation_df, !is.na(passage)) %>% dplyr::count(cell_line) | |
duplicated_cell_lines <- unique(filter(female_sample_annotation_df, !is.na(passage) & duplicated(cell_line))$cell_line) | |
total_pairs <- 0 | |
correctly_ordered_by_x <- 0 | |
correctly_ordered_by_x_names <- c() | |
correctly_ordered_by_sig <- 0 | |
correctly_ordered_by_sig_names <- c() | |
for(duplicated_cell_line in duplicated_cell_lines) { | |
cell_line_annotation_df <- female_sample_annotation_df %>% filter(!is.na(passage) & cell_line == duplicated_cell_line) %>% dplyr::arrange(passage) | |
total_pairs <- total_pairs + 1 | |
early_sample <- cell_line_annotation_df$sample_name[1] | |
late_sample <- cell_line_annotation_df$sample_name[2] | |
early_x_meth <- mean_x_methylation[early_sample] | |
late_x_meth <- mean_x_methylation[late_sample] | |
if (late_x_meth <= early_x_meth) { | |
correctly_ordered_by_x <- correctly_ordered_by_x + 1 | |
correctly_ordered_by_x_names <- append(correctly_ordered_by_x_names, duplicated_cell_line) | |
} | |
early_sig_meth <- mean_sig_x_methylation[early_sample] | |
late_sig_meth <- mean_sig_x_methylation[late_sample] | |
if (late_sig_meth <= early_sig_meth) { | |
correctly_ordered_by_sig <- correctly_ordered_by_sig + 1 | |
correctly_ordered_by_sig_names <- append(correctly_ordered_by_sig_names, duplicated_cell_line) | |
} | |
# cell_line_annotation_df | |
} | |
``` | |
Now we will switch to python because of it's machine/statistical learning packages. | |
---------------------Insert python notebook here---------------------------------------------------- | |
```{python} | |
import pandas as pd | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.cluster import KMeans | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
sns.set() | |
female_sig_sites_nowa09_df = pd.read_csv("sig_female_beta_df_no_h9.csv", index_col=[0,1,2,3,4]) | |
female_all_chrX_nowa09_df = pd.read_csv("female_df_no_h9_chrX.csv", index_col=[0]) | |
df_to_plot = female_sig_sites_nowa09_df | |
suffix = '_nowa09' | |
columns_to_keep = df_to_plot.columns.tolist() | |
# print(columns_to_keep) | |
sample_data = pd.read_csv("sample_data_combined_all_info.csv", index_col=1) | |
# print(sample_data.index) | |
sample_data_subset = sample_data.loc[df_to_plot.columns.tolist(), :] | |
print(sample_data_subset['cell_line']) | |
cell_lines = sample_data_subset['cell_line'] | |
counts = cell_lines.value_counts() | |
cell_line_weights = 1 / cell_lines.map(counts) | |
mean_methylation = pd.DataFrame(female_all_chrX_nowa09_df.mean()) | |
mean_methylation = MinMaxScaler(feature_range=(0, 5)).fit_transform(mean_methylation) | |
print(cell_line_weights) | |
## Getting the optimal number of clusters using elbow method | |
Sum_of_squared_distances = [] | |
K = range(1,15) | |
for k in K: | |
km = KMeans(n_clusters=k, n_init=100, max_iter=1000) | |
km = km.fit(df_to_plot.T, sample_weight=cell_line_weights) | |
Sum_of_squared_distances.append(km.inertia_) | |
print(Sum_of_squared_distances) | |
plt.plot(K, Sum_of_squared_distances, 'bx-') | |
plt.xlabel('k') | |
plt.ylabel('Sum_of_squared_distances') | |
plt.title('Elbow Method For Optimal k') | |
plt.show() | |
``` | |
```{python} | |
from sklearn.decomposition import PCA | |
km4 = KMeans(n_clusters=4, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights) | |
km5 = KMeans(n_clusters=5, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights) | |
km6 = KMeans(n_clusters=6, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights) | |
km7 = KMeans(n_clusters=7, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights) | |
km8 = KMeans(n_clusters=8, n_init=100, max_iter=1000).fit_predict(df_to_plot.T, sample_weight=cell_line_weights) | |
pca_embedding = PCA(n_components=2).fit_transform(df_to_plot.T) | |
f, axarr = plt.subplots(2, 3) | |
f.set_size_inches(15,10) | |
axarr[0, 0].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette("Blues")[int(x)] for x in mean_methylation]) | |
axarr[0, 0].set_title('Average X methylation') | |
axarr[0, 1].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km4]) | |
axarr[0, 1].set_title('4 clusters') | |
axarr[0, 2].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km5]) | |
axarr[0, 2].set_title('5 clusters') | |
axarr[1, 0].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km6]) | |
axarr[1, 0].set_title('6 clusters') | |
axarr[1, 1].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km7]) | |
axarr[1, 1].set_title('7 clusters') | |
axarr[1, 2].scatter(pca_embedding[:, 0], pca_embedding[:, 1], c=[sns.color_palette()[x] for x in km8]) | |
axarr[1, 2].set_title('8 clusters') | |
plt.show() | |
# save to replicate plot in R | |
clusters_df = pd.DataFrame([km4, km5, km6, km7, km8], index=["km4", "km5", "km6", "km7", "km8"], columns=df_to_plot.columns) | |
clusters_df.T.to_csv('saved_sample_clusters_4_8.csv', header=False) | |
``` | |
```{python} | |
# sample_data = pd.read_csv("sample_data_combined.csv", index_col=0) | |
import numpy as np | |
n_clusters = 6 | |
def get_ordered_clusters(data, | |
n_clusters=None, | |
mean_methylation=None, | |
sample_data=None, | |
n_init=100, | |
max_iter=1000, | |
clusters=None): | |
#get weights by frequency of cell_line in the dataset | |
if clusters is None: | |
if sample_data is None: | |
clusters = KMeans(n_clusters=n_clusters, | |
n_init=n_init, | |
max_iter=max_iter,n_jobs=-1,init='random').fit_predict(data.T) | |
else: | |
sample_data_subset = sample_data.loc[data.columns.tolist(), :] | |
cell_lines = sample_data_subset['cell_line'] | |
counts = cell_lines.value_counts() | |
cell_line_weights = 1 / cell_lines.map(counts) | |
clusters = KMeans(n_clusters=n_clusters, | |
n_init=n_init, | |
max_iter=max_iter,n_jobs=-1,init='random').fit_predict( | |
data.T, sample_weight=cell_line_weights) | |
cluster_names_dict = {} | |
if mean_methylation is None: | |
mean_methylation = pd.DataFrame(data.mean()) | |
mean_methylation = MinMaxScaler( | |
feature_range=(0, 5)).fit_transform(mean_methylation) | |
for cluster in list(set(clusters)): | |
cluster_names_dict[cluster] = np.mean( | |
mean_methylation[clusters == cluster]) | |
sorted_x = sorted(cluster_names_dict.items(), key=lambda kv: kv[1]) | |
sorted_x.reverse() | |
cluster_names_dict_fixed = {} | |
clusters_fixed = [0 for x in clusters] | |
for i, sorted_tup in enumerate(sorted_x): | |
for j, clust in enumerate(clusters): | |
if clust == sorted_tup[0]: | |
clusters_fixed[j] = i | |
if i == 0: | |
cluster_names_dict_fixed[i] = 'Non-Eroded' | |
elif i == len(sorted_x) - 1: | |
cluster_names_dict_fixed[i] = 'Eroded' | |
else: | |
cluster_names_dict_fixed[i] = sorted_tup[1] | |
clusters = np.array(clusters_fixed) | |
cluster_names_dict = cluster_names_dict_fixed | |
return clusters, cluster_names_dict | |
clusters, cluster_names_dict = get_ordered_clusters( | |
data=df_to_plot, | |
n_clusters=n_clusters, | |
mean_methylation=mean_methylation, | |
sample_data=sample_data) | |
clusters_row = pd.Series(clusters, name='clusters', index=df_to_plot.columns) | |
print(clusters_row) | |
clusters_row.to_csv('saved_sample_clusters_test.csv', header=False) | |
``` | |
## remake the python plots in R | |
```{r} | |
elbow_method_df <- tibble(num_clusters=1:14, sum_of_squared_distances=c(21544.753987410935, 14657.4276351863, 11426.322690540565, 10171.239511057178, 9652.956939046413, 9267.650214615307, 8963.094498025255, 8733.35142099591, 8459.750435732954, 8198.846796977472, 8059.111714376631, 7933.215394915591, 7696.179270524132, 7473.282326589432)) | |
elbow_plot <- ggplot(elbow_method_df, aes(x=num_clusters, y=sum_of_squared_distances)) + | |
geom_line() + | |
theme_bw() + | |
base_plot_theme + | |
labs(title = "Elbow Method for\nOptimal k", x="k", y="Sum of squared distances") | |
cluster_testing_df <- read_csv("saved_sample_clusters_4_8.csv.gz", | |
col_names = c("sample_name", "km4", "km5", "km6", "km7", "km8")) | |
cluster_testing_df <- column_to_rownames(cluster_testing_df, var="sample_name") | |
cluster_plots <- c() | |
female_df_no_h9_chrX <- read_csv('female_df_no_h9_chrX.csv.gz') | |
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv") | |
female_no_h9_sig_sites_pca <- prcomp(t(na.omit(female_df_no_h9_chrX %>% filter(rn %in% sig_x_sites) %>% select(-rn)))) | |
female_no_h9_sig_sites_pca_df <- rownames_to_column(as.data.frame(female_no_h9_sig_sites_pca$x), var = "sample_name") | |
female_no_h9_sig_sites_pca_df <- left_join(female_no_h9_sig_sites_pca_df, sample_annotation_df, by = "sample_name") | |
# female_no_h9_sig_sites_pca_df <- left_join(female_no_h9_sig_sites_pca_df, female_avg_x_methylation_df, by = "sample_name") | |
female_no_h9_sig_sites_pca_df <- left_join(female_no_h9_sig_sites_pca_df, tidy(colMeans(female_df_no_h9_chrX %>% select(-rn))) %>% rename(sample_name=names, meanXMethylation=x), by = "sample_name") | |
female_no_h9_sig_sites_pov <- sprintf("%2.2f%%", 100* female_no_h9_sig_sites_pca$sdev^2/sum(female_no_h9_sig_sites_pca$sdev^2)) | |
female_no_h9_sig_sites_pc1_label <- sprintf("PC1 (%s)", female_no_h9_sig_sites_pov[1]) | |
female_no_h9_sig_sites_pc2_label <- sprintf("PC2 (%s)", female_no_h9_sig_sites_pov[2]) | |
female_no_h9_pca_df <- female_no_h9_sig_sites_pca_df | |
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km4 | |
km4_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title = "4 clusters", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_color_brewer(type="qual", palette = "Dark2", guide=F) | |
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km5 | |
km5_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title = "5 clusters", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_color_brewer(type="qual", palette = "Dark2", guide=F) | |
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km6 | |
km6_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title = "6 clusters", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_color_brewer(type="qual", palette = "Dark2", guide=F) | |
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km7 | |
km7_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title = "7 clusters", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_color_brewer(type="qual", palette = "Dark2", guide=F) | |
female_no_h9_pca_df$python_clusters <- cluster_testing_df[female_no_h9_pca_df$sample_name, ]$km8 | |
km8_plot <- ggplot(female_no_h9_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=as.factor(python_clusters)), size=.5, shape=20) + | |
theme_bw() + | |
labs(x=female_no_h9_pc1_label, | |
y=female_no_h9_pc2_label, | |
title = "8 clusters", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_color_brewer(type="qual", palette = "Dark2", guide=F) | |
kmeans_plots <- grid.arrange(elbow_plot, | |
km4_plot, | |
km5_plot, | |
km6_plot, | |
km7_plot, | |
km8_plot, | |
top = textGrob("Deciding optimal number of clusters",gp=gpar(fontsize=7, fontface="bold")), nrow=1) | |
ggsave("figs/supplementary/kmeans_plots.pdf", kmeans_plots, width = 174, height=40, units = 'mm') | |
``` | |
## Bring in sample clusters from python analysis | |
```{r} | |
clusters_pData <- read.csv("saved_sample_clusters.csv", | |
header = FALSE, | |
row.names = 1, | |
col.names=c('name','cluster')) | |
# clusters_pData <- read.csv("saved_sample_clusters_test.csv", | |
# header = FALSE, | |
# row.names = 1, | |
# col.names=c('name','cluster')) | |
female_no_h9_pca_df$python_clusters <- clusters_pData[female_no_h9_pca_df$sample_name,] | |
female_no_h9_sig_sites_pca_df$python_clusters <- clusters_pData[female_no_h9_sig_sites_pca_df$sample_name,] | |
color_clusters <- clusters_pData | |
color_clusters$cluster <- as.factor(color_clusters$cluster) | |
# check where the samples clusters fall in the PCA | |
pca_female_no_h9_by_python_6 <- ggplot(female_no_h9_sig_sites_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_point(aes(color=as.factor(python_clusters), shape=(cell_line == "WA09")), size=.5) + | |
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) + | |
theme_bw() + | |
labs(x=female_no_h9_sig_sites_pc1_label, | |
y=female_no_h9_sig_sites_pc2_label, | |
title = "PCA of Female Samples", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(color=guide_legend(nrow=1)) + | |
theme(legend.key.width = unit(1, units = "mm"), | |
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) | |
ggsave("figs/main/fig1_pca_female.pdf", pca_female_no_h9_by_python_6, height = 47, width=87, units = "mm") | |
# female_pca <- prcomp(t(na.omit(female_all_x_df))) | |
# female_pca_df <- rownames_to_column(as.data.frame(female_pca$x), var = "sample_name") | |
# female_pca_df <- left_join(female_pca_df, sample_annotation_df, by = "sample_name") | |
# female_pca_df <- left_join(female_pca_df, female_avg_x_methylation_df, by = "sample_name") | |
female_pca_df$python_clusters <- clusters_pData[female_pca_df$sample_name,] | |
library(ggrepel) | |
pca_female_no_h9_by_python_6 <- ggplot(female_pca_df, mapping = aes(x=PC1, y=PC2)) + | |
geom_line(female_pca_df %>% filter(cell_line %in% c("HDF51IPS3", "HDF51IPS1", "HDF51IPS2", "HDF51IPS11", "HDF51IPS12", "HDF51IPS6", "HDF51IPS14", "WA09") & grepl("Nazor_", sample_name)), | |
mapping = aes(x=PC1, y=PC2, group=cell_line), size=.3, alpha=.4) + | |
geom_point(aes(color=as.factor(python_clusters), shape=(cell_line == "WA09")), size=.5) + | |
# geom_text_repel(female_pca_df %>% filter(cell_line %in% correctly_ordered_by_sig_names & grepl("Nazor_", sample_name)), mapping = aes(x=PC1, y=PC2, label=passage), size=2, color="grey", segment.size=.15, min.segment.length=.25, segment.color="darkgrey") + | |
scale_shape_manual(values = c(17, 16), labels=c("H9", "Other"), limits=c(T,F), guide=F) + | |
theme_bw() + | |
labs(x=female_pc1_label, | |
y=female_pc2_label, | |
title = "PCA of Female Samples", | |
color="Cluster")+ | |
base_plot_theme + | |
scale_color_manual(limits=factor(c(0,1,2,3,4,5,NA)), labels=c("A","B","C","D","E","F", "H9"), values = c(RColorBrewer::brewer.pal(6,"Dark2"), "#000000"), na.value="#000000") + | |
# scale_color_brewer(type="qual", palette = "Dark2", limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(color=guide_legend(nrow=1)) + | |
theme(legend.key.width = unit(1, units = "mm"), | |
legend.text = element_text(size=5, margin = margin(r=.5, l=-.7, unit="mm"))) | |
ggsave("figs/main/fig1_pca_female_with_h9.pdf", pca_female_no_h9_by_python_6, height = 69.3, width=87, units = "mm") | |
fig_1 <- grid.arrange(female_x_autosome, female_x_xist, | |
xchr_beta_dist, female_male_pca_plot, | |
pca_female_no_h9_by_methylation, pca_female_no_h9_by_python_6, nrow=3) | |
fig_1 <- grid.arrange(female_x_xist, variance_density_plot, | |
xchr_beta_dist, female_male_pca_plot, | |
pca_female_no_h9_by_methylation, pca_female_no_h9_by_python_6, nrow=3) | |
ggsave("figs/main/fig1.svg", fig_1, height = 236, width=174, units = "mm") | |
ggsave("figs/female-pca-by-cluster.pdf", height = 84, width = 84, units = 'mm') | |
``` | |
## Look at the pairs in relation to the clusters | |
```{r} | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
ordered_cell_lines_df <- filter(female_sample_annotation_df, !is.na(passage) & cell_line %in% correctly_ordered_by_sig_names) %>% | |
left_join(cluster_tib, by="sample_name") | |
``` | |
## Differentially Methylated Probes Analysis | |
Now that we have our sample clusters from the python kmeans analysis, we will now focus on clustering our CpG probes based on when they change their methylation status. | |
```{r} | |
library(tidyverse) | |
library(minfi) | |
clusters_pData <- read_csv("saved_sample_clusters.csv", col_names = c("name", "cluster")) | |
dir.create('dmp_between_sample_clusters') | |
all_dmp_df <- data.frame() | |
transition_columns <- c() | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") %>% column_to_rownames(var="rn") | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
filtered_site_annotation_df_temp <- column_to_rownames(filtered_site_annotation_df, var="rowname") | |
cluster_means <- tibble(rowname=character()) | |
for (n2 in min(clusters_pData$cluster):(max(clusters_pData$cluster))) { | |
print(n2) | |
cluster_samples <- filter(clusters_pData, cluster %in% c(n2))$name | |
mean_df <- as.data.frame(rowMeans(na.omit(female_df_no_h9[cluster_samples]))) %>% | |
rownames_to_column() %>% | |
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2) | |
cluster_means <- full_join(cluster_means, mean_df, by="rowname") | |
} | |
for (n in min(clusters_pData$cluster):(max(clusters_pData$cluster)-1)) { | |
print(sprintf('comparing clusters %d and %d', n, n+1)) | |
cluster_1_group <- filter(clusters_pData, cluster <= n)$name | |
cluster_2_group <- filter(clusters_pData, cluster > n)$name | |
temp_cluster_data <- clusters_pData #column_to_rownames(clusters_pData, var="name") | |
temp_cluster_data[temp_cluster_data$name %in% cluster_1_group, 'cluster'] <- 0 | |
temp_cluster_data[temp_cluster_data$name %in% cluster_2_group, 'cluster'] <- 1 | |
clusters_pair <- temp_cluster_data$name | |
subset_beta <- na.omit(female_df_no_h9[,clusters_pair]) | |
subset_clusters <- as.matrix(temp_cluster_data %>% column_to_rownames(var="name")) | |
subset_clusters[1] <- as.factor(subset_clusters[1]) | |
# subset_clusters <- as.matrix(temp_cluster_data[clusters_pair,]) | |
# subset_clusters[1] <- as.factor(subset_clusters[1]) | |
# row.names(subset_clusters) <- clusters_pair | |
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical") | |
# subset_beta_mat <- as.matrix(subset_beta) | |
# row.names(subset_beta_mat) <- rownames(subset_beta) | |
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname") | |
dmp_df <- left_join(dmp_df, filtered_site_annotation_df %>% select(rowname, | |
hg19_chr=chr, | |
hg19_pos=pos, | |
hg19_gene=UCSC_RefGene_Name, | |
hg19_refgene_group=UCSC_RefGene_Group, | |
hg38_chr=hg38_chromosome, | |
hg38_pos=hg38_pos, | |
hg38_gene=hg38_gene_name, | |
hg38_refgene_group=hg38_gene_group), by="rowname") | |
dmp_df <- left_join(dmp_df, cluster_means, by="rowname") | |
write.csv(dmp_df, | |
file = sprintf('dmp_between_sample_clusters/dmp_table_%d_%d.csv', n, n+1), | |
quote = FALSE) | |
if (n + 1 == 1) { | |
all_dmp_df <- dmp_df %>% select(rowname, | |
hg19_chr, | |
hg19_pos, | |
hg19_gene, | |
hg19_refgene_group, | |
hg38_chr, | |
hg38_pos, | |
hg38_gene, | |
hg38_refgene_group, | |
matches("cluster_._mean")) | |
} | |
all_dmp_df <- full_join(all_dmp_df, dmp_df %>% select(rowname, !!sprintf('transition_%s_qval', n + 1):=qval), by="rowname") | |
transition_columns <- append(transition_columns, sprintf('transition_%s_qval', n + 1)) | |
} | |
all_dmp_df$key_transition <- colnames(all_dmp_df[transition_columns])[apply(all_dmp_df[transition_columns], 1, which.min)] | |
all_dmp_df$key_transition <- as.numeric(unlist(as.data.frame(t(as.data.frame(strsplit(all_dmp_df$key_transition, '_'))))[2])) | |
for (row in 1:nrow(all_dmp_df)) { | |
transitions <- c() | |
transition_deltas <- c() | |
for (transition in 1:5) { | |
qval_col <- sprintf("transition_%d_qval", transition) | |
if (all_dmp_df[row, qval_col] <= 0.1) { | |
transitions <- append(transitions, transition) | |
col_1 <- sprintf("cluster_%d_mean", transition-1) | |
col_2 <- sprintf("cluster_%d_mean", transition) | |
transition_deltas <- append(transition_deltas, all_dmp_df[row, col_2] - all_dmp_df[row, col_1]) | |
} | |
} | |
alt_key_transition <- NA | |
if (length(transitions) > 0) { | |
alt_key_transition <- transitions[which.max(abs(transition_deltas))] | |
all_dmp_df[row, "alt_key_transition"] <- alt_key_transition | |
all_dmp_df[row, "alt_delta"] <- transition_deltas[which.max(abs(transition_deltas))] | |
} | |
} | |
q_val_threshold <- .001 | |
all_dmp_df$passes_threshold <- apply(all_dmp_df[transition_columns], 1, min) <= q_val_threshold | |
all_dmp_df$passes_stringent_threshold <- apply(all_dmp_df[transition_columns], 1, min) <= 1e-10 | |
all_dmp_df$delta <- apply(all_dmp_df, 1, function(r) { | |
col_1_name <- sprintf('cluster_%d_mean', as.numeric(r['key_transition']) - 1) | |
col_2_name <- sprintf('cluster_%d_mean', as.numeric(r['key_transition'])) | |
return(as.numeric(r[col_2_name]) - as.numeric(r[col_1_name])) | |
}) | |
all_dmp_df$delta_increasing <- apply(all_dmp_df, 1, function(r) { | |
col_1_name <- sprintf('cluster_%d_mean', as.numeric(r['key_transition']) - 1) | |
col_2_name <- sprintf('cluster_%d_mean', as.numeric(r['key_transition'])) | |
return(as.numeric(r[col_2_name]) - as.numeric(r[col_1_name]) >= 0) | |
}) | |
write_csv(all_dmp_df, "all_dmp_df.csv") | |
``` | |
## Compare cluster A to H9 | |
```{r} | |
female_df <- read_csv("female_beta_mat.csv.gz") %>% column_to_rownames(var="rowname") | |
clusters_pData <- read_csv("saved_sample_clusters.csv", col_names = c("name", "cluster")) | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz") | |
filtered_site_annotation_df_temp <- column_to_rownames(filtered_site_annotation_df, var="rowname") | |
print("comparing cluster A and H9") | |
cluster_1_group <- filter(clusters_pData, cluster == 0)$name | |
cluster_a_data <- filter(clusters_pData, cluster == 0) #column_to_rownames(clusters_pData, var="name") | |
h9_lines <- filter(sample_annotation_df, cell_line == 'WA09' & gender_matches == T)$sample_name | |
temp_cluster_data <- rbind(cluster_a_data, tibble(name=h9_lines, cluster=1)) | |
subset_beta <- na.omit(female_df[,c(cluster_1_group, h9_lines)]) | |
subset_clusters <- as.matrix(temp_cluster_data %>% column_to_rownames(var="name")) | |
subset_clusters[1] <- as.factor(subset_clusters[1]) | |
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical") | |
# subset_beta_mat <- as.matrix(subset_beta) | |
# row.names(subset_beta_mat) <- rownames(subset_beta) | |
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname") | |
dmp_df <- left_join(dmp_df, filtered_site_annotation_df %>% select(rowname, | |
hg19_chr=chr, | |
hg19_pos=pos, | |
hg19_gene=UCSC_RefGene_Name, | |
hg19_refgene_group=UCSC_RefGene_Group, | |
hg38_chr=hg38_chromosome, | |
hg38_pos=hg38_pos, | |
hg38_gene=hg38_gene_name, | |
hg38_refgene_group=hg38_gene_group), by="rowname") | |
cluster_means <- tibble(rowname=character()) | |
for (n2 in min(temp_cluster_data$cluster):(max(temp_cluster_data$cluster))) { | |
print(n2) | |
cluster_samples <- filter(temp_cluster_data, cluster %in% c(n2))$name | |
mean_df <- as.data.frame(rowMeans(na.omit(female_df[cluster_samples]))) %>% | |
rownames_to_column() %>% | |
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2) | |
cluster_means <- full_join(cluster_means, mean_df, by="rowname") | |
} | |
dmp_df <- left_join(dmp_df, cluster_means %>% rename(H9_mean=cluster_1_mean), by="rowname") | |
dmp_df <- dmp_df %>% mutate(passes_threshold=qval <= q_val_threshold, | |
passes_stringent_threshold=qval <= 1e-10) | |
write_csv(dmp_df, "dmp_table_clusterA_H9.csv.gz") | |
write(bedgraph_header(name="eroding_delta_H9", | |
description=sprintf("eroding deltas from H9 transition 1Mb windowed")), | |
"bedgraphs/windowed_eroding_delta_tran_H9.bedgraph") | |
bed_df <- dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T)) | |
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1) | |
cluster_col_1 <- "H9_mean" | |
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0]) | |
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), "bedgraphs/windowed_eroding_delta_tran_H9.bedgraph", col_names = F, append=TRUE) | |
#non cluster A iPSCs | |
ipsc_lines <- filter(sample_annotation_df, cell_type == "iPSC")$sample_name | |
non_a_ipsc_lines <- filter(clusters_pData, cluster != 0 & name %in% ipsc_lines)$name | |
temp_cluster_data <- rbind(cluster_a_data, tibble(name=non_a_ipsc_lines, cluster=1)) | |
subset_beta <- na.omit(female_df[,c(cluster_1_group, non_a_ipsc_lines)]) | |
subset_clusters <- as.matrix(temp_cluster_data %>% column_to_rownames(var="name")) | |
subset_clusters[1] <- as.factor(subset_clusters[1]) | |
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical") | |
# subset_beta_mat <- as.matrix(subset_beta) | |
# row.names(subset_beta_mat) <- rownames(subset_beta) | |
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname") | |
dmp_df <- left_join(dmp_df, filtered_site_annotation_df %>% select(rowname, | |
hg19_chr=chr, | |
hg19_pos=pos, | |
hg19_gene=UCSC_RefGene_Name, | |
hg19_refgene_group=UCSC_RefGene_Group, | |
hg38_chr=hg38_chromosome, | |
hg38_pos=hg38_pos, | |
hg38_gene=hg38_gene_name, | |
hg38_refgene_group=hg38_gene_group), by="rowname") | |
cluster_means <- tibble(rowname=character()) | |
for (n2 in min(temp_cluster_data$cluster):(max(temp_cluster_data$cluster))) { | |
print(n2) | |
cluster_samples <- filter(temp_cluster_data, cluster %in% c(n2))$name | |
mean_df <- as.data.frame(rowMeans(na.omit(female_df[cluster_samples]))) %>% | |
rownames_to_column() %>% | |
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2) | |
cluster_means <- full_join(cluster_means, mean_df, by="rowname") | |
} | |
dmp_df <- left_join(dmp_df, cluster_means %>% rename(non_A_iPSC_mean=cluster_1_mean), by="rowname") | |
dmp_df <- dmp_df %>% mutate(passes_threshold=qval <= q_val_threshold, | |
passes_stringent_threshold=qval <= 1e-10) | |
write_csv(dmp_df, "dmp_table_clusterA_non_A_iPSC.csv.gz") | |
write(bedgraph_header(name="eroding_delta_iPSC", | |
description=sprintf("eroding deltas from iPSC transition 1Mb windowed")), | |
"bedgraphs/windowed_eroding_delta_tran_iPSC.bedgraph") | |
bed_df <- dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T)) | |
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1) | |
cluster_col_1 <- "non_A_iPSC_mean" | |
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0]) | |
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), "bedgraphs/windowed_eroding_delta_tran_iPSC.bedgraph", col_names = F, append=TRUE) | |
#non cluster A ESCs | |
esc_lines <- filter(sample_annotation_df, cell_type == "ESC")$sample_name | |
non_a_esc_lines <- filter(clusters_pData, cluster != 0 & name %in% esc_lines)$name | |
temp_cluster_data <- rbind(cluster_a_data, tibble(name=non_a_esc_lines, cluster=1)) | |
subset_beta <- na.omit(female_df[,c(cluster_1_group, non_a_esc_lines)]) | |
subset_clusters <- as.matrix(temp_cluster_data %>% column_to_rownames(var="name")) | |
subset_clusters[1] <- as.factor(subset_clusters[1]) | |
dmp <- dmpFinder(as.matrix(subset_beta), pheno = subset_clusters[,1], type = "categorical") | |
# subset_beta_mat <- as.matrix(subset_beta) | |
# row.names(subset_beta_mat) <- rownames(subset_beta) | |
dmp_df <- as.data.frame(dmp) %>% rownames_to_column(var="rowname") | |
dmp_df <- left_join(dmp_df, filtered_site_annotation_df %>% select(rowname, | |
hg19_chr=chr, | |
hg19_pos=pos, | |
hg19_gene=UCSC_RefGene_Name, | |
hg19_refgene_group=UCSC_RefGene_Group, | |
hg38_chr=hg38_chromosome, | |
hg38_pos=hg38_pos, | |
hg38_gene=hg38_gene_name, | |
hg38_refgene_group=hg38_gene_group), by="rowname") | |
cluster_means <- tibble(rowname=character()) | |
for (n2 in min(temp_cluster_data$cluster):(max(temp_cluster_data$cluster))) { | |
print(n2) | |
cluster_samples <- filter(temp_cluster_data, cluster %in% c(n2))$name | |
mean_df <- as.data.frame(rowMeans(na.omit(female_df[cluster_samples]))) %>% | |
rownames_to_column() %>% | |
dplyr::rename(!!sprintf('cluster_%d_mean', n2) := 2) | |
cluster_means <- full_join(cluster_means, mean_df, by="rowname") | |
} | |
dmp_df <- left_join(dmp_df, cluster_means %>% rename(non_A_ESC_mean=cluster_1_mean), by="rowname") | |
dmp_df <- dmp_df %>% mutate(passes_threshold=qval <= q_val_threshold, | |
passes_stringent_threshold=qval <= 1e-10) | |
write_csv(dmp_df, "dmp_table_clusterA_non_A_ESC.csv.gz") | |
write(bedgraph_header(name="eroding_delta_ESC", | |
description=sprintf("eroding deltas from ESC transition 1Mb windowed")), | |
"bedgraphs/windowed_eroding_delta_tran_ESC.bedgraph") | |
bed_df <- dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T)) | |
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1) | |
cluster_col_1 <- "non_A_ESC_mean" | |
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0]) | |
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), "bedgraphs/windowed_eroding_delta_tran_ESC.bedgraph", col_names = F, append=TRUE) | |
# calculating histone correlations | |
h3k27me3_peaks <- read_table2("bedgraphs/GSM1528885_H9_XISTpos_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal")) | |
h3k27me3_peaks_chrX <- h3k27me3_peaks %>% filter(chr == "chrX") | |
h3k27me3_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k27me3_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
h3k9_xi_peaks <- read_table2("bedgraphs/GSM1528888_H9_XISTpos_K9me3.peaks.bed.gz", col_names=c("chr", "start", "end", "signal")) | |
h3k9_xi_peaks_chrX <- h3k9_xi_peaks %>% filter(chr == "chrX") | |
h3k9_xi_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k9_xi_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
histone_mark_correlations <- tibble(group=character(), | |
h3k27me3_correlation=numeric(), | |
h3k27me3_correlation_p_val=numeric(), | |
h3k9me3_correlation=numeric(), | |
h3k9me3_correlation_p_val=numeric()) | |
for (g in c("H9", "iPSC", "ESC")) { | |
if (g == "H9") { | |
bed_df <- read_csv(sprintf("dmp_table_clusterA_%s.csv.gz", g)) %>% | |
filter((hg19_chr == "chrX") & (passes_threshold == T)) | |
bed_df <- bed_df %>% | |
mutate(delta=H9_mean-cluster_0_mean) | |
}else if (g == "iPSC") { | |
bed_df <- read_csv(sprintf("dmp_table_clusterA_non_A_%s.csv.gz", g)) %>% | |
filter((hg19_chr == "chrX") & (passes_threshold == T)) | |
bed_df <- bed_df %>% | |
mutate(delta=non_A_iPSC_mean-cluster_0_mean) | |
}else if (g == "ESC") { | |
bed_df <- read_csv(sprintf("dmp_table_clusterA_non_A_%s.csv.gz", g)) %>% | |
filter((hg19_chr == "chrX") & (passes_threshold == T)) | |
bed_df <- bed_df %>% | |
mutate(delta=non_A_ESC_mean-cluster_0_mean) | |
} | |
# bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t)) | |
averaged_distance <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T) | |
k27_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_peaks_chrX_windowed$delta) | |
k9_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xi_peaks_chrX_windowed$delta) # h3k9me3_signal_chrX$signal) | |
# k27_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_xe_signal_chrX$signal) | |
# k9_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xe_peaks_windowed$delta) | |
histone_mark_correlations <- add_row(histone_mark_correlations, | |
group=g, | |
h3k27me3_correlation=k27_cor_test$estimate, | |
h3k27me3_correlation_p_val=k27_cor_test$p.value, | |
h3k9me3_correlation=k9_cor_test$estimate, | |
h3k9me3_correlation_p_val=k9_cor_test$p.value) | |
# h3k27me3_xe_correlation=k27_xe_cor_test$estimate, | |
# h3k9me3_xe_correlation=k9_xe_cor_test$estimate) | |
} | |
histone_cor_df <- reshape2::melt(histone_mark_correlations %>% | |
dplyr::select(transition, | |
h3k27me3_correlation, | |
h3k9me3_correlation), | |
id.vars=c("transition")) | |
histone_cor_df$variable <- factor(histone_cor_df$variable, levels=c("h3k9me3_correlation", "h3k27me3_correlation")) | |
histon_cor_plot <- ggplot(histone_cor_df) + | |
geom_bar(aes(x=transition, y=value, fill=variable), stat="identity", position=position_dodge()) + | |
theme_bw() + | |
base_plot_theme + | |
coord_flip() + | |
facet_grid(rows=vars(sprintf("Tran %d", transition)), scales="free_y")+ | |
labs(title="Correlation of DMPs With Histone Meth.", | |
x="", | |
y="Pearson Correlation", | |
fill="Histone Modification") + | |
scale_fill_manual(limits=c("h3k27me3_correlation", "h3k9me3_correlation"), | |
labels=c("H3K27me3", "H3K9me3"), | |
values = c(pal_jama()(1), "#8C564B"), | |
guide=F) + | |
scale_x_continuous(labels = NULL) + | |
scale_y_continuous(limits=c(-.13,.33)) + | |
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.grid.minor.y = element_blank(), | |
axis.ticks.length.y = unit(0, "mm"), | |
axis.title.y = element_blank(), | |
axis.text.y = element_blank(), | |
panel.spacing.y = unit(.75, "mm")) | |
``` | |
## Make heatmap in R | |
```{r} | |
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz") | |
all_dmp_df <- read_csv('all_dmp_df.csv.gz') | |
# do this based on genes and when the promoters change their methylation. | |
x_annotation_df <- read_csv("x_annotation.csv.gz") | |
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv.gz') | |
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] | |
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname) | |
heatmap_df <- inner_join(female_df_no_h9_chrX %>% filter(rn %in% x_annotation_df$rowname), | |
dplyr::select(all_dmp_df, rowname, key_transition, passes_threshold, delta_increasing, hg38_refgene_group), by=c("rn"="rowname")) | |
heatmap_df <- arrange(heatmap_df, desc(passes_threshold), desc(delta_increasing), desc(key_transition)) | |
ordered_sites <- heatmap_df$rn | |
heatmap_df[heatmap_df$passes_threshold == FALSE, "cluster"] <- 10 | |
heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == T), "cluster"] <- heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == T), "key_transition"] - 1 | |
heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == F), "cluster"] <- (heatmap_df[(heatmap_df$passes_threshold == TRUE) & (heatmap_df$delta_increasing == F), "key_transition"] - 1) + 5 | |
barplot_df <- heatmap_df %>% dplyr::select(rn, cluster, hg38_refgene_group, delta_increasing) | |
barplot_df[is.na(barplot_df$hg38_refgene_group), "hg38_refgene_group"] <- "No Gene" | |
barplot_df <- barplot_df %>% dplyr::mutate(formatted_cluster=factor(c( | |
"1↑", | |
"2↑", | |
"3↑", | |
"4↑", | |
"5↑", | |
"1↓", | |
"2↓", | |
"3↓", | |
"4↓", | |
"5↓", | |
"~" | |
)[cluster+1], levels=rev(c( | |
"1↑", | |
"2↑", | |
"3↑", | |
"4↑", | |
"5↑", | |
"1↓", | |
"2↓", | |
"3↓", | |
"4↓", | |
"5↓", | |
"~" | |
)))) | |
cpg_composition_plot <- ggplot(barplot_df %>% filter(rn %in% sig_x_sites), | |
aes(x=formatted_cluster, | |
y=(..count..), | |
fill=factor(hg38_refgene_group, | |
levels = rev(c("OutsideTranscript5", | |
"TSS1500", | |
"TSS200", | |
"5_UTR", | |
"Body", | |
"3_UTR", | |
"OutsideTranscript3", | |
"No Gene"))))) + | |
geom_bar(position="fill") + | |
scale_fill_manual(values=c("#BBBBBB", viridisLite::viridis(n=8)[2:8]), | |
limits=rev(c("OutsideTranscript5", | |
"TSS1500", | |
"TSS200", | |
"5_UTR", | |
"Body", | |
"3_UTR", | |
"OutsideTranscript3", | |
"No Gene"))) + | |
geom_text(data=barplot_df %>% filter(rn %in% sig_x_sites), mapping=aes(x=formatted_cluster, label=(..count..), fill=NA), | |
stat="count", position=position_fill(vjust=1.05), size=1.5) + | |
coord_flip() + | |
scale_y_continuous(labels = scales::percent_format())+ | |
labs(x="CpG DMP Cluster", | |
y="Percent of Probes", | |
title="Composition of CpG Clusters") + | |
theme_bw() + | |
base_plot_theme + | |
guides(fill=guide_legend(nrow=3, reverse=TRUE, title="")) + | |
theme(legend.box.margin = margin(l=-10, unit="mm"), | |
legend.text = element_text(margin=margin(l=-.7, unit="mm"))) | |
ggsave("figs/supplementary/supp_2_cpg_composition_prop.pdf",cpg_composition_plot, width = 48, height = 50, units = "mm") | |
formatted_barplot_df <- tibble(cluster=numeric(), | |
num_sites=numeric(), | |
group=character()) | |
for (c in 0:10) { | |
total_sites <- nrow(barplot_df %>% filter((cluster == c))) | |
num_outside_5 <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "OutsideTranscript5")) | |
num_tss1500 <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "TSS1500")) | |
num_tss200 <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "TSS200")) | |
num_5utr <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "5_UTR")) | |
num_body <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "Body")) | |
num_3utr <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "3_UTR")) | |
num_outside_3 <- nrow(barplot_df %>% filter((cluster == c) & hg38_refgene_group == "OutsideTranscript3")) | |
num_no_gene <- nrow(barplot_df %>% filter((cluster == c) & is.na(hg38_refgene_group))) | |
# num_other <- total_sites - (num_tss1500 + num_tss200 + num_5utr + num_exon1 + num_body + num_3utr) | |
formatted_barplot_df <- add_row(formatted_barplot_df, | |
cluster = c, | |
num_sites = num_outside_5, | |
group = "OutsideTranscript5'") | |
formatted_barplot_df <- add_row(formatted_barplot_df, | |
cluster = c, | |
num_sites = num_tss1500,cpg_composition_plot | |
group = "TSS1500") | |
formatted_barplot_df <- add_row(formatted_barplot_df, | |
cluster = c, | |
num_sites = num_tss200, | |
group = "TSS200") | |
formatted_barplot_df <- add_row(formatted_barplot_df, | |
cluster = c, | |
num_sites = num_5utr, | |
group = "5'UTR") | |
formatted_barplot_df <- add_row(formatted_barplot_df, | |
cluster = c, | |
num_sites = num_body, | |
group = "Body") | |
formatted_barplot_df <- add_row(formatted_barplot_df, | |
cluster = c, | |
num_sites = num_3utr, | |
group = "3'UTR") | |
formatted_barplot_df <- add_row(formatted_barplot_df, | |
cluster = c, | |
num_sites = num_outside_3, | |
group = "OutsideTranscript3'") | |
formatted_barplot_df <- add_row(formatted_barplot_df, | |
cluster = c, | |
num_sites = num_no_gene, | |
group = "No Gene") | |
} | |
set_panel_size <- function(p=NULL, g=ggplotGrob(p), | |
margin = unit(1,"mm"), | |
height=unit(15, "mm")){ | |
panels <- grep("panel", g$layout$name) | |
panel_index_w<- unique(g$layout$l[panels]) | |
panel_index_h<- unique(g$layout$t[panels]) | |
nw <- length(panel_index_w) | |
nh <- length(panel_index_h) | |
if(getRversion() < "3.3.0"){ | |
# the following conversion is necessary | |
# because there is no `[<-`.unit method | |
# so promoting to unit.list allows standard list indexing | |
# g$widths <- grid:::unit.list(g$widths) | |
g$heights <- grid:::unit.list(g$heights) | |
# g$widths[panel_index_w] <- rep(list(width), nw) | |
g$heights[panel_index_h] <- rep(list(height), nh) | |
} else { | |
# g$widths[panel_index_w] <- rep(width, nw) | |
g$heights[panel_index_h] <- rep(height, nh) | |
} | |
return(g) | |
} | |
formatted_barplot_df <- formatted_barplot_df %>% mutate(formatted_cluster=factor(c( | |
"1↑", | |
"2↑", | |
"3↑", | |
"4↑", | |
"5↑", | |
"1↓", | |
"2↓", | |
"3↓", | |
"4↓", | |
"5↓", | |
"↕" | |
)[cluster+1], levels=rev(c( | |
"1↑", | |
"2↑", | |
"3↑", | |
"4↑", | |
"5↑", | |
"1↓", | |
"2↓", | |
"3↓", | |
"4↓", | |
"5↓", | |
"↕" | |
)))) | |
p <- ggplot(data=formatted_barplot_df, aes(x=formatted_cluster, y=num_sites, #(..num_sites..)/sum(..num_sites..), | |
fill=factor(group, levels = rev(c("OutsideTranscript5'", | |
"TSS1500", | |
"TSS200", | |
"5'UTR", | |
"Body", | |
"3'UTR", | |
"OutsideTranscript3'", | |
"No Gene"))))) + | |
geom_bar(stat="identity") + | |
coord_flip() + | |
scale_fill_manual(values=c("#BBBBBB", viridisLite::viridis(n=8)[2:8])) + | |
# scale_fill_brewer(type = "qual", palette = 8) + | |
# scale_y_continuous(expand = c(0, 0)) + | |
labs(x="CpG DMP Cluster", | |
y="Number of Probes", | |
title="Composition of CpG Clusters") + | |
theme_bw() + | |
base_plot_theme + | |
guides(fill=guide_legend(nrow=1, reverse=TRUE, title="")) + | |
theme(legend.key.width = unit(2, "mm"), | |
legend.text = element_text(size = 5), | |
axis.text = element_text(size=5)) | |
# grid::grid.draw(set_panel_size(p)) | |
ggsave("figs/supplementary/supp_2_cpg_composition_temp.svg",p, width = 86, height = 86, units = "mm") | |
heatmap_df <- dplyr::select(heatmap_df, -key_transition, -passes_threshold, -delta_increasing, -hg38_refgene_group) | |
sample_clusters_df <- read_csv("saved_sample_clusters.csv", | |
col_names = c('name','cluster')) %>% arrange(cluster) | |
ordered_sample_names <- sample_clusters_df$name | |
heatmap_df <- reshape2::melt(heatmap_df, id.vars=c("rn", "cluster")) %>% dplyr::rename(site=rn, sample_name=variable) | |
for (sample_row in 1:nrow(sample_clusters_df)) { | |
cluster_name <- unlist(sample_clusters_df[sample_row, "name"]) | |
cluster <- unlist(sample_clusters_df[sample_row, "cluster"]) | |
heatmap_df[heatmap_df$sample_name == cluster_name, "sample_cluster"] <- cluster | |
} | |
library(grid) | |
sigvar_heatmap_plot <- ggplot(heatmap_df %>% filter(site %in% sig_x_sites), aes(x=factor(sample_name, levels = ordered_sample_names), | |
y=factor(site, levels = ordered_sites), fill=value)) + | |
geom_tile() + | |
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){ | |
return(list( | |
"0"="1↑", | |
"1"="2↑", | |
"2"="3↑", | |
"3"="4↑", | |
"4"="5↑", | |
"5"="1↓", | |
"6"="2↓", | |
"7"="3↓", | |
"8"="4↓", | |
"9"="5↓", | |
"10"="↕" | |
)[c]) | |
})) + | |
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) + | |
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+ | |
labs(x="Female Samples", | |
y="CpG Sites with Greater Variance in Female Samples", | |
fill="DNA Methylation") + | |
theme(axis.text=element_blank(), | |
axis.ticks=element_blank(), | |
legend.position="top", | |
legend.justification = "center", | |
legend.title = element_text(size = 6, vjust = .75), | |
legend.text = element_text(size = 6), | |
# plot.background = element_rect(fill = "lightgrey"), | |
# panel.border = element_rect(colour = "black", fill=NA, size=1), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0, "mm"), | |
strip.background.x = element_rect(color="black"), | |
strip.background.y = element_rect(fill = NA, colour = "black", size = .7), | |
strip.text = element_text(size=6), | |
strip.text.y = element_text(angle=180), | |
axis.title = element_text(size=8)) | |
sigvar_heatmap_plot + theme(axis.text.x=element_text(angle = 90, size=7)) | |
# all_x_heatmap_plot <- ggplot(heatmap_df, aes(x=factor(sample_name, levels = ordered_sample_names), | |
# y=factor(site, levels = ordered_sites), fill=value)) + | |
# geom_tile() + | |
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") + | |
# # facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) + | |
# scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+ | |
# labs(x="Female Samples", | |
# y="CpG Sites with Greater Variance in Female Samples", | |
# fill="DNA Methylation") + | |
# theme(axis.text=element_blank(), | |
# axis.ticks=element_blank(), | |
# legend.position="top", | |
# legend.justification = "center", | |
# legend.title = element_text(size = 6, vjust = .75), | |
# legend.text = element_text(size = 6), | |
# # plot.background = element_rect(fill = "lightgrey"), | |
# # panel.border = element_rect(colour = "black", fill=NA, size=1), | |
# plot.margin = margin(r=0), | |
# panel.spacing.y = unit(2, "mm"), | |
# panel.spacing.x = unit(0, "mm"), | |
# strip.background.x = element_rect(color="black"), | |
# strip.background.y = element_rect(fill = NA, colour = "black", size = .7), | |
# strip.text = element_text(size=6), | |
# strip.text.y = element_text(angle=180), | |
# axis.title = element_text(size=8)) | |
ggsave("figs/heatmap.pdf",set_panel_size(heatmap_plot), width = 85, height = 230, units = "mm") | |
all_x_heatmap_plot <- ggplot(heatmap_df, aes(x=factor(sample_name, levels = ordered_sample_names), | |
y=factor(site, levels = ordered_sites), fill=value)) + | |
geom_tile() + | |
facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y", labeller = labeller(cluster = function(c){ | |
return(list( | |
"0"="1↑", | |
"1"="2↑", | |
"2"="3↑", | |
"3"="4↑", | |
"4"="5↑", | |
"5"="1↓", | |
"6"="2↓", | |
"7"="3↓", | |
"8"="4↓", | |
"9"="5↓", | |
"10"="↕" | |
)[c]) | |
})) + | |
# facet_grid(cluster ~ sample_cluster,space="free_x", scales = "free", shrink=FALSE) + | |
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", midpoint = .5, breaks=c(0,.5,1), labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1"))+ | |
labs(x="Female Samples", | |
y="CpG Sites with Greater Variance in Female Samples", | |
fill="DNA Methylation") + | |
theme(axis.text=element_blank(), | |
axis.ticks=element_blank(), | |
legend.position="top", | |
legend.justification = "center", | |
legend.title = element_text(size = 6, vjust = .75), | |
legend.text = element_text(size = 6), | |
# plot.background = element_rect(fill = "lightgrey"), | |
# panel.border = element_rect(colour = "black", fill=NA, size=1), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0, "mm"), | |
strip.background.x = element_rect(color="black"), | |
strip.background.y = element_rect(fill = NA, colour = "black", size = .7), | |
strip.text = element_text(size=6), | |
strip.text.y = element_text(angle=180), | |
axis.title = element_text(size=8)) | |
ggsave("figs/heatmap_all_x.png",all_x_heatmap_plot, width = 85, height = 230, units = "mm") | |
library(purrr) | |
all_dmp_df$key_qval <- unlist(pmap(all_dmp_df %>% dplyr::select(matches(glob2rx("transition_*_qval|^key_transition"))), | |
function(transition_1_qval, transition_2_qval, transition_3_qval, transition_4_qval, transition_5_qval, key_transition) { | |
# print(transition_1_qval) | |
switch (key_transition, | |
"1" = return(transition_1_qval), | |
"2" = return(transition_2_qval), | |
"3" = return(transition_3_qval), | |
"4" = return(transition_4_qval), | |
"5" = return(transition_5_qval) | |
) | |
})) | |
# volcano plot for all the sites (pick their key_transition for the log_qval) | |
all_chr_volcano <- ggplot(all_dmp_df, aes(delta, -log10(key_qval))) + | |
geom_point(aes(color = (hg38_chr=="chrX")), alpha = .4) + | |
labs(x=sprintf('Change in DNA Methylation'), | |
y='-log10(q-value)', | |
title="Most significant q-value per probe from DMP Analysis") + | |
# scale_x_continuous(breaks = c(-.5,0,.5)) + | |
theme_bw() + | |
base_plot_theme + | |
scale_color_jama(name="", limits=c(FALSE, TRUE), labels = c("Not X Chr", "X Chr")) | |
library(ggsignif) | |
beta_delta_barplots <- ggplot(all_dmp_df, aes(x=hg38_chr=="chrX", y= delta, color=hg38_chr=="chrX")) + | |
geom_boxplot(outlier.shape = NA) + | |
geom_signif(map_signif_level=T, comparisons = list(c(1,2)), color="black", y_position = .25) + | |
scale_y_continuous(limits = c(-.5,.3)) + | |
scale_x_discrete(breaks=c(FALSE, TRUE), labels=c("Not X Chr", "X Chr")) + | |
facet_grid(.~ delta_increasing ,space="free", shrink=T, labeller = labeller(delta_increasing=function(delta_increasing){ | |
return(list("TRUE"="Probes Gaining Methylation", | |
"FALSE"="Probes Losing Methylation" | |
)[delta_increasing]) | |
})) + | |
labs(x="", | |
y="Change in DNA Methylation", | |
title="Difference in change in DNA methylation")+ | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text.x = element_text(size=6),axis.title.x = element_blank())+ | |
scale_color_jama(name="", limits=c(FALSE, TRUE), labels = c("Not X Chr", "X Chr")) | |
library(gridExtra) | |
# xist_expr_plt comes from expression_analysis_notebook.Rmd | |
xist_expr_plot <- xist_expr_plt #+ | |
# scale_x_discrete(breaks=0:5, labels = c("A", "B", "C", "D", "E", "F")) + | |
# labs(x="Sample Cluster") + | |
# scale_fill_grey(start=0.9, end=.3, guide=F)+ | |
# theme_bw() + | |
# base_plot_theme | |
fig_2 <- arrangeGrob(sigvar_heatmap_plot, all_chr_volcano, beta_delta_barplots, xist_expr_plot, ncol=3, nrow=3, | |
widths = c(.5,5.5,6), heights = c(4,3.5,4.5), | |
layout_matrix=rbind(c(NA,1,4), | |
c(NA,1,3), | |
c(NA,1,2))) | |
ggsave("figs/main/fig_2a.pdf", sigvar_heatmap_plot, width = 85, height = 236, units = "mm") | |
ggsave("figs/main/fig_2_xist_expr.pdf", xist_expr_plot, width = 43, height = 64, units = "mm") | |
ggsave("figs/main/fig_2c.pdf", beta_delta_barplots, width = 85, height = (236)*(3.5/12), units = "mm") | |
ggsave("figs/main/fig_2d.pdf", all_chr_volcano, width = 85, height = (236)*(4.5/12), units = "mm") | |
ggsave("figs/main/fig_2.png", fig_2, width = 174, height=236,units = "mm", dpi=1000) | |
ggsave("figs/main/fig_2.pdf", fig_2, width = 174, height=236,units = "mm") | |
``` | |
```{r} | |
cluster_0_samples <- (sample_clusters_df %>% filter(cluster == 0))$name | |
heatmap(as.matrix(female_df_no_h9_chrX %>% select(!!(sample_clusters_df %>% filter(cluster == 0))$name))) | |
cluster_0_outliers <- cluster_0_samples[grepl("hSF6", cluster_0_samples)] | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
female_df_no_h9_autosome <- female_df_no_h9 %>% filter((rn %in% filter(filtered_site_annotation_df, !(hg38_chromosome %in% c("chrX", "chrY")))$rowname)) | |
mean_autosome_methylation <- tidy(colMeans(female_df_no_h9_autosome %>% select(-rn))) | |
mean_autosome_methylation <- left_join(mean_autosome_methylation, sample_clusters_df, by=c("names"="name")) | |
mean_autosome_methylation[mean_autosome_methylation["names"] == cluster_0_outliers, "cluster"] <- 0.5 | |
ggplot(mean_autosome_methylation, aes(x=cluster, y=x, group=cluster)) + | |
geom_boxplot() + | |
labs(y="Mean autosomal methylation", | |
x="Sample Cluster", | |
title="Mean autosomal methylation in female samples") + | |
scale_x_discrete(limits=c(0,0.5,1,2,3,4,5), labels=c("A", "A-outliers", "B", "C", "D", "E", "F")) | |
``` | |
## Transition 1 in more detail (volcano plots and such) | |
```{r} | |
library(ggrepel) | |
library(gridExtra) | |
make_volcano_plot_modified <- function(dmp_df_temp, diff_cutoff_high = NA, diff_cutoff_low = NA, log10_qval_cutoff = 3, cluster_1_col, cluster_2_col) { | |
# dmp_df_temp <- all_dmp_df | |
dmp_df_temp$delta <- as.numeric(unlist(dmp_df_temp[cluster_2_col])) - as.numeric(unlist(dmp_df_temp[cluster_1_col])) | |
dmp_df_temp$log10qval <- -log10(dmp_df_temp$qval) | |
if (is.na(diff_cutoff_high)) { | |
max_delta <- max(dmp_df_temp[dmp_df_temp$log10qval > log10_qval_cutoff,]$delta) | |
mild.threshold.upper = (7.5 * max_delta / 10) | |
}else { | |
mild.threshold.upper <- diff_cutoff_high | |
} | |
if (is.na(diff_cutoff_low)) { | |
min_delta <- min(dmp_df_temp[dmp_df_temp$log10qval > log10_qval_cutoff,]$delta) | |
mild.threshold.lower = (8.5 * min_delta / 10) | |
}else { | |
mild.threshold.lower <- diff_cutoff_low | |
} | |
dmp_df_temp[is.na(dmp_df_temp$hg38_gene),]$hg38_gene <- as.character(dmp_df_temp[is.na(dmp_df_temp$hg38_gene),]$hg38_pos) | |
dmp_df_temp <- filter(dmp_df_temp, hg38_chr == "chrX") | |
label_rows <- ((! between(as_vector(dmp_df_temp$delta), mild.threshold.lower, mild.threshold.upper)) & | |
(dmp_df_temp$log10qval > log10_qval_cutoff)) | (dmp_df_temp$hg38_gene == "XIST") | |
x_chromosome_length <- 155270560 | |
chunk_size <- 8e6 | |
chr_color_df <- tibble(start=numeric(), | |
end=numeric(), | |
pos_chunk=numeric()) | |
for (n in 1:ceiling(x_chromosome_length/chunk_size)) { | |
min <- (n-1) * chunk_size | |
max <- (n) * chunk_size | |
dmp_df_temp[(dmp_df_temp$hg38_pos > min) & (dmp_df_temp$hg38_pos <= max), "pos_chunk"] <- n | |
if (max > x_chromosome_length) { | |
max <- x_chromosome_length | |
} | |
chr_color_df <- add_row(chr_color_df, | |
start=min, | |
end=max, | |
pos_chunk=n) | |
} | |
show_legend = F | |
volcano_plot <- ggplot(dmp_df_temp, aes(delta, log10qval)) + | |
geom_point(aes(color = as_factor(pos_chunk)), alpha = .4, show.legend = F, size=.3) + | |
geom_text_repel(data = dmp_df_temp[label_rows,], | |
aes(delta, log10qval, label=hg38_gene, color = as.factor(pos_chunk)), show.legend = F, size=2, segment.size = .2) + | |
labs(x=sprintf('%s - %s', cluster_2_col, cluster_1_col), | |
y='-log10(q-value)', | |
title=sprintf('Transition from cluster %s to cluster %s', | |
unlist(strsplit(cluster_1_col, '_'))[2], | |
fmt = unlist(strsplit(cluster_2_col, '_'))[2])) + | |
theme_bw() + | |
base_plot_theme | |
position_plot <- ggplot(dmp_df_temp, aes(hg38_pos, log10qval)) + | |
geom_line(aes(color = as.factor(pos_chunk)), show.legend = T) + | |
labs(x="X Chr Coordinate", | |
y="log10(qval)", | |
title="Transition 1 DMP q-values") + | |
theme_bw() + | |
base_plot_theme + | |
theme(plot.title = element_text(hjust = 0.5, size=8), | |
axis.title = element_text(size=7), | |
axis.text = element_text(size = 6)) | |
chr_plot <- ggplot(chr_color_df) + | |
geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=1, fill=as.factor(pos_chunk)), show.legend=F) + | |
theme_bw() + | |
base_plot_theme + | |
scale_y_continuous(labels = NULL, expand = c(0,0), breaks = NULL, minor_breaks = NULL) + | |
scale_x_continuous(limits=c(0, x_chromosome_length), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) | |
return(list(volcano_plot=volcano_plot, position_plot=position_plot, chr_plot=chr_plot)) | |
} | |
modified_volcano <- make_volcano_plot_modified(all_dmp_df %>% filter(hg38_chr == "chrX") %>% dplyr::rename(qval=transition_1_qval), cluster_1_col = sprintf('cluster_%d_mean', 0), | |
cluster_2_col = sprintf('cluster_%d_mean', 1), | |
diff_cutoff_high = .17, diff_cutoff_low = -.8, log10_qval_cutoff = 25) | |
modified_volcano$volcano_plot <- modified_volcano$volcano_plot + | |
labs(title="X Chromosome DNAme Changes in Transition 1", | |
x="Transition 1 Delta ([cluster B mean] - [cluster A mean])") + | |
scale_x_continuous(limits = c(-.5, .5), breaks = c(-.1*(4:0),.1*(1:4)), expand = c(0,0)) + | |
scale_y_continuous(limits=c(0,90), expand=c(0,0)) | |
ggsave("figs/main/fig2c_volcano_plot.svg", modified_volcano$volcano_plot, width=85, height=32.614, units = "mm") | |
ggsave("figs/main/fig2c_volcano_plot.pdf", modified_volcano$volcano_plot, width = 87.4, height = 32.613, units = "mm") | |
chr_plot <- modified_volcano$chr_plot + | |
theme(axis.title = element_blank(), | |
plot.margin = unit(c(0,0,0,0), "mm")) | |
ggsave("figs/main/fig2clegend.svg", chr_plot, width = 76.786, height=6, units = "mm") | |
``` | |
## Make subset heatmap with only genes of interest | |
```{r} | |
library(grid) | |
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz") | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
make_subset_heatmap <- function(desired_genes, gene_order=NA) { | |
subset_heatmap_df <- inner_join(female_df_no_h9_chrX, | |
dplyr::select(all_dmp_df, rowname, hg38_gene, hg38_pos), by=c("rn"="rowname")) %>% | |
filter(hg38_gene %in% desired_genes) | |
subset_heatmap_df <- arrange(subset_heatmap_df, hg38_pos) | |
ordered_sites <- subset_heatmap_df$rn | |
if (!is.na(gene_order)) { | |
ordered_sites <- c() | |
for (i in 1:length(desired_genes)) { | |
g <- desired_genes[i] | |
o <- gene_order[i] | |
if(o == T) { | |
ordered_sites <- c(ordered_sites, arrange(subset_heatmap_df %>% filter(hg38_gene == g), hg38_pos)$rn) | |
}else { | |
ordered_sites <- c(ordered_sites, arrange(subset_heatmap_df %>% filter(hg38_gene == g), desc(hg38_pos))$rn) | |
} | |
} | |
} | |
subset_heatmap_df <- dplyr::select(subset_heatmap_df, -hg38_pos) | |
sample_clusters_df <- read_csv("saved_sample_clusters.csv", | |
col_names = c('name','cluster')) %>% arrange(cluster) | |
ordered_sample_names <- sample_clusters_df$name | |
subset_heatmap_df <- reshape2::melt(subset_heatmap_df, id.vars=c("rn", "hg38_gene")) %>% dplyr::rename(site=rn, sample_name=variable) | |
subset_heatmap_df <- inner_join(subset_heatmap_df, sample_clusters_df, by=c("sample_name"="name")) %>% dplyr::rename(sample_cluster="cluster") | |
convert_int_to_letters <- Vectorize(vectorize.args = "c", | |
FUN = function(c) { | |
switch(c+1, | |
"A", | |
"B", | |
"C", | |
"D", | |
"E", | |
"F", | |
) | |
}) | |
subset_heatmap_df <- subset_heatmap_df %>% mutate(sample_cluster=convert_int_to_letters(sample_cluster)) | |
subset_heatmap <- ggplot(subset_heatmap_df, aes(x=factor(sample_name, levels = ordered_sample_names), | |
y=factor(site, levels = ordered_sites), fill=value)) + | |
geom_tile() + | |
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") + | |
facet_grid(factor(hg38_gene, levels=desired_genes) ~ sample_cluster,space="free_x", scales = "free", shrink=T, switch = "y") + | |
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", | |
midpoint = .5, | |
limits=c(0,1), | |
breaks=c(0,.5,1), | |
labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) + | |
labs(x="Female Samples", | |
y="", | |
fill="DNA Methylation") + | |
theme(axis.text=element_blank(), | |
axis.ticks=element_blank(), | |
legend.position="top", | |
legend.justification = "center", | |
# legend.title = element_text(size = 6, vjust = .8), | |
# legend.text = element_text(size = 6), | |
# legend.position="top", | |
# legend.justification = "right", | |
legend.title = element_text(size = 4, vjust = .8), | |
legend.text = element_text(size = 4), | |
legend.key.height = unit(2.5, "mm"), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0.2, "mm"), | |
# strip.background.x = element_rect(color="black"), | |
strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
axis.title = element_text(size=7)) | |
return(subset_heatmap) | |
} | |
long_arm_heatmap <- make_subset_heatmap(c("XIST", "FIRRE"), gene_order=c(F,T)) + theme(legend.position="bottom", legend.justification = "right", legend.box.margin = margin(t=-4,r=3, unit="mm")) | |
ggsave("figs/main/fig2_xist_firre_heatmap.pdf", long_arm_heatmap, width=85, height = 65, units = "mm") | |
fig_3 <- arrangeGrob(modified_volcano$volcano_plot, modified_volcano$position_plot, long_arm_heatmap, layout_matrix = cbind(c(1,2), c(3,3)), widths = c(70.5, 63.5)) | |
ggsave("figs/main/fig3.svg", fig_3, width = 134, height = 118, units = "mm") | |
update_geom_defaults("boxplot", | |
list(size=.3, | |
outlier.shape=NA)) | |
update_geom_defaults("point", | |
list(shape=20, | |
size=.5)) | |
dummy <- data.frame(cluster=c("A", "A"), | |
expression=c(20, 12), | |
hgnc_symbol=c("XIST", "FIRRE")) | |
xist_expression <- plot_gene_expression(c("XIST", "FIRRE"), multiple=T) + | |
theme_bw() + | |
geom_signif(comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5)),step_increase=.12, margin_top=-.4, tip_length = 0, size=.3, textsize = 2, test="wilcox.test") + | |
base_plot_theme + | |
geom_blank(data=dummy) + | |
theme(strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3)) | |
ggsave("figs/main/fig_2_xist_expr.pdf", xist_expression, width = 43.9, height = 64, units = "mm") | |
firre_expression <- plot_gene_expression("FIRRE") + | |
theme_bw() + | |
base_plot_theme | |
ggsave("figs/main/fig_2_firre_expr.pdf", firre_expression, width = 43.9, height = 32, units = "mm") | |
meth_array_density <- stat_bin(all_dmp_df %>% filter(hg38_chr == "chrX"), mapping=aes(x=hg38_pos, y = (..count..)/sum(..count..)), stat="bin", bins = 100) | |
meth_array_density <- ggplot_build(ggplot() + meth_array_density)$data[[1]] %>% arrange(x) | |
non_sig_density <- stat_bin(all_dmp_df %>% filter(hg38_chr == "chrX" & (passes_threshold == F)), | |
mapping = aes(x=hg38_pos, y = (..count..)/sum(..count..)), breaks = meth_array_density$xmin) | |
non_sig_density <- ggplot_build(ggplot() + non_sig_density)$data[[1]] %>% arrange(x) | |
ks.test(meth_array_density$y, non_sig_density$y) | |
tran_1_density <- stat_bin(all_dmp_df %>% filter(hg38_chr == "chrX" & (key_transition == 1) & (passes_threshold == T)), | |
mapping = aes(x=hg38_pos, y = (..count..)/sum(..count..)), breaks = meth_array_density$xmin) | |
tran_1_density <- ggplot_build(ggplot() + tran_1_density)$data[[1]] %>% arrange(x) | |
ks.test(meth_array_density$y, tran_1_density$y) | |
transition_1_not_random_plot <- ggplot() + | |
geom_area(all_dmp_df %>% filter(hg38_chr == "chrX"), mapping=aes(x=hg38_pos, y = (..count..)/sum(..count..)), stat="bin", bins = 100, fill="grey") + | |
geom_freqpoly(all_dmp_df %>% filter(hg38_chr == "chrX" & (passes_threshold == F)), | |
mapping = aes(x=hg38_pos, y = (..count..)/sum(..count..), color="non-significant"), | |
stat="bin", bins = 100, size=.3) + | |
geom_freqpoly(all_dmp_df %>% filter(hg38_chr == "chrX" & (key_transition == 1) & (passes_threshold == T)), | |
mapping = aes(x=hg38_pos, y = (..count..)/sum(..count..), color="transition_1"), | |
stat="bin", bins = 100, size=.3) + | |
theme_bw() + | |
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) + | |
scale_y_continuous(expand=c(0,0)) + | |
labs(x="X Chr Coordinate (Mb)", | |
y="Percent of Probes", | |
title = "Probes Changing in Transition 1 vs Probes Not Changing in XCE") + | |
scale_color_manual( name="Probe Type", | |
labels=c("1↑ & 1↓", "~"), | |
limits=c("transition_1", "non-significant"), | |
values = c(pal_d3("category10")(1), "#000000"), | |
guide=F) + | |
base_plot_theme | |
ggsave("figs/main/fig2b.pdf", transition_1_not_random_plot, width = 87.4, height = 32.613, units = "mm") | |
# one more plot about the centers, and the origins of the early transitions. using the xist correlations | |
``` | |
## Look at what is moving with XIST to try to pinpoint early changes | |
```{r} | |
library(tidyverse) | |
library(ggplot2) | |
all_dmp_df <- read_csv("all_dmp_df.csv") | |
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv") | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") | |
x_annotation_df <- read_csv("x_annotation.csv") | |
sample_clusters_df <- read_csv("saved_sample_clusters.csv", | |
col_names = c('name','cluster')) | |
clutser_0_samples <- (sample_clusters_df %>% filter(cluster == 0))$name | |
female_df_no_h9_chrX <- female_df_no_h9_chrX %>% select(rn, !!clutser_0_samples) | |
female_df_no_h9 <- female_df_no_h9 %>% select(rn, !!clutser_0_samples) | |
xist_sites_of_interest <- c("cg03554089", "cg12653510", "cg05533223", "cg11717280", "cg20698282") | |
# xist_sites_of_interest <- filter(all_dmp_df, hg38_gene == "XIST" & (hg38_refgene_group %in% c("TSS200")))$rowname | |
# female_df_no_h9 <- rownames_to_column(female_df_no_h9, var="rn") | |
mean_interesting_xist_sites <- colMeans(female_df_no_h9_chrX %>% filter(rn %in% xist_sites_of_interest) %>% dplyr::select(-rn)) / | |
colMeans(female_df_no_h9 %>% filter(!(rn %in% x_annotation_df$rowname)) %>% dplyr::select(-rn)) | |
# xist_cpg2_sites <- c("cg03554089", "cg12653510", "cg05533223") | |
# mean_xist_cpg2_sites <- colMeans(female_df_no_h9_chrX %>% filter(rn %in% xist_cpg2_sites) %>% dplyr::select(-rn)) | |
# | |
# ggplot() + | |
# geom_point(aes(x=mean_interesting_xist_sites, mean_xist_cpg2_sites)) | |
mean_interesting_xist_sites_df <- as.data.frame(mean_interesting_xist_sites) %>% | |
rownames_to_column(var="sample_name") %>% | |
dplyr::rename(xist_methylation=2) | |
# quants <- quantile(mean_interesting_xist_sites) | |
quants <- c(0,0,0,0,0) | |
quants[c(1,5)] <- range(mean_interesting_xist_sites) | |
quants[3] <- mean(mean_interesting_xist_sites) | |
quants[2] <- mean(quants[c(1,3)]) | |
quants[4] <- mean(quants[c(5,3)]) | |
# quants <- c(0.5470616,0.641993,0.7369243,.825,0.9267870) | |
sorted_sample_names <- names(sort(mean_interesting_xist_sites)) | |
quant_1_samples <- names(mean_interesting_xist_sites[mean_interesting_xist_sites < quants[2]]) | |
quant_2_samples <- names(mean_interesting_xist_sites[(mean_interesting_xist_sites >= quants[2]) & | |
(mean_interesting_xist_sites < quants[3])]) | |
quant_3_samples <- names(mean_interesting_xist_sites[(mean_interesting_xist_sites >= quants[3]) & | |
(mean_interesting_xist_sites < quants[4])]) | |
quant_4_samples <- names(mean_interesting_xist_sites[mean_interesting_xist_sites >= quants[4]]) | |
correlating_probes_df <- female_df_no_h9_chrX | |
autosome_mean <- colMeans(female_df_no_h9 %>% filter(!(rn %in% x_annotation_df$rowname)) %>% dplyr::select(-rn)) | |
for (sample_name in names(autosome_mean)) { | |
correlating_probes_df[sample_name] <- correlating_probes_df[sample_name] / autosome_mean[sample_name] | |
} | |
correlating_probes_df$correlation_to_xist <- as.numeric(cor(t(dplyr::select(correlating_probes_df, -rn)), mean_interesting_xist_sites)) | |
correlating_probes_df$correlation_to_xist_quant_1 <- as.numeric(cor(t(dplyr::select(correlating_probes_df, !!quant_1_samples)), | |
mean_interesting_xist_sites[quant_1_samples])) | |
correlating_probes_df$correlation_to_xist_quant_2 <- as.numeric(cor(t(dplyr::select(correlating_probes_df, !!quant_2_samples)), | |
mean_interesting_xist_sites[quant_2_samples])) | |
correlating_probes_df$correlation_to_xist_quant_3 <- as.numeric(cor(t(dplyr::select(correlating_probes_df, !!quant_3_samples)), | |
mean_interesting_xist_sites[quant_3_samples])) | |
correlating_probes_df$correlation_to_xist_quant_4 <- as.numeric(cor(t(dplyr::select(correlating_probes_df, !!quant_4_samples)), | |
mean_interesting_xist_sites[quant_4_samples])) | |
correlating_probes_df <- left_join(correlating_probes_df, x_annotation_df %>% select(rowname, hg38_pos, hg38_gene_name), by=c("rn"="rowname")) | |
data_df <- reshape2::melt(correlating_probes_df %>% | |
select(rn, hg38_pos, hg38_gene_name, | |
!!sorted_sample_names), id.vars=c("rn", "hg38_pos", "hg38_gene_name")) %>% | |
rename(sample_name = variable) | |
data_df <- left_join(data_df, mean_interesting_xist_sites_df, by=c("sample_name"="sample_name")) | |
# TODO: get rid of probes that don't change much | |
data_df_top <- data_df %>% group_by(rn) %>% arrange(xist_methylation) %>% top_n(5) %>% | |
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE))) | |
data_df_bottom <- data_df %>% group_by(rn) %>% arrange(xist_methylation) %>% top_n(-5) %>% | |
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE))) | |
data_df_top <- left_join(data_df_top, data_df_bottom, by="rn", suffix=c(".start", ".end")) | |
probes_that_are_changing <- dplyr::filter(data_df_top, mean.end - mean.start >= .3)$rn | |
ggplot(data_df %>% filter(rn %in% probes_that_are_changing), aes(x=xist_methylation, y=value)) + | |
geom_smooth(aes(color=rn, alpha=.2), show.legend = F, alpha=.2, se=F, span=.2, size=.2) | |
# normalize to 0 to .5 | |
library(scales) | |
#average by gene | |
normalized_correlating_probes_df <- correlating_probes_df %>% group_by(hg38_gene_name) %>% summarise_at(vars(hg38_pos, !!sorted_sample_names), funs(mean)) %>% ungroup() %>% column_to_rownames(var="hg38_pos") %>% select(-hg38_gene_name) | |
normalized_correlating_probes_df <- correlating_probes_df %>% select(rn, !!sorted_sample_names) %>% column_to_rownames(var="rn") | |
normalized_correlating_probes_df <- t(apply(normalized_correlating_probes_df, 1, rescale, to=c(0,.5))) | |
normalized_correlating_probes_df <- as.data.frame(normalized_correlating_probes_df) %>% rownames_to_column(var="rn") | |
normalized_correlating_probes_df <- left_join(normalized_correlating_probes_df, x_annotation_df %>% select(rowname, hg38_pos, hg38_gene_name), by=c("rn"="rowname")) | |
data_norm_df <- reshape2::melt(normalized_correlating_probes_df %>% | |
select(hg38_pos=rn, | |
!!sorted_sample_names), id.vars=c("hg38_pos")) %>% | |
rename(sample_name = variable) | |
data_norm_df <- reshape2::melt(normalized_correlating_probes_df %>% | |
select(rn, hg38_pos, hg38_gene_name, | |
!!sorted_sample_names), id.vars=c("rn", "hg38_pos", "hg38_gene_name")) %>% | |
rename(sample_name = variable) | |
data_norm_df <- left_join(data_norm_df, mean_interesting_xist_sites_df, by=c("sample_name"="sample_name")) | |
data_norm_df <- data_norm_df %>% filter(xist_methylation <= 1.7) | |
ggplot(data_norm_df %>% filter(rn %in% probes_that_are_changing), aes(x=xist_methylation, y=value)) + | |
geom_smooth(aes(color=rn, alpha=.2), show.legend = F, alpha=.2, se=F, span=.2, size=.2) | |
# where does the probe lose .2 from it's starting mean | |
data_norm_df_top <- data_norm_df %>% group_by(rn) %>% arrange(xist_methylation) %>% top_n(5) %>% | |
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE))) | |
data_norm_df_bottom <- data_norm_df %>% group_by(rn) %>% arrange(xist_methylation) %>% top_n(-5) %>% | |
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE))) | |
data_norm_df_top <- left_join(data_norm_df_top, data_norm_df_bottom, by="rn", suffix=c("_start", "_end")) | |
data_norm_df_top <- data_norm_df_top %>% dplyr::mutate(mean_threshold = (mean_start *.8)) | |
xist_ranges <- (100:199)/100 | |
i <- 1 | |
data_norm_df$passing_fraction <- 0 | |
for(xist_min in xist_ranges) { | |
print(i) | |
data_norm_df_range <- data_norm_df %>% filter((rn %in% probes_that_are_changing) & | |
(xist_methylation >= xist_min) & | |
(xist_methylation < (xist_min + .05))) | |
if(nrow(data_norm_df_range) > 0) { | |
data_norm_df_range_mean <- data_norm_df_range %>% group_by(rn) %>% | |
summarise_at(vars(value), funs(mean = mean(., na.rm = TRUE))) | |
data_norm_df_range_mean <- left_join(data_norm_df_range_mean, data_norm_df_top %>% dplyr::select(rn, mean_threshold), by="rn") | |
probes_passing_threshold <- (data_norm_df_range_mean %>% filter(mean <= mean_threshold))$rn | |
if (length(probes_passing_threshold) > 0 & sum((data_norm_df$rn %in% probes_passing_threshold) & (data_norm_df$passing_fraction == 0)) > 0) { | |
data_norm_df[(data_norm_df$rn %in% probes_passing_threshold) & (data_norm_df$passing_fraction == 0),]$passing_fraction <- i | |
} | |
} | |
i <- i + 1 | |
} | |
ggplot(data_norm_df %>% filter(rn %in% probes_that_are_changing), aes(x=xist_methylation, y=value)) + | |
geom_smooth(aes(color=as.factor(passing_fraction)), show.legend = T, se=F, span=.2, size=.5) + scale_color_viridis_d() | |
ggplot(data_norm_df %>% filter((rn %in% probes_that_are_changing) & (passing_fraction > 1) & (passing_fraction <= 40)) %>% group_by(rn) %>% filter(row_number() == 1) %>% ungroup()) + | |
geom_freqpoly(aes(hg38_pos), bins=500) | |
# fitting sigmoid curve models | |
# sigmoidal_formula <- value ~ L / 1+exp(-k*(xist_expr - x0)) | |
# L / 1+exp(-k*(xist_methylation - x0) | |
# test_df <- data_norm_df %>% filter(rn == "cg00650640") | |
# x <- test_df$xist_methylation | |
# y <- test_df$value | |
# fit <- nls(y ~ SSlogis(x, Asym, xmid, scal), data = data.frame(x, y)) | |
# xmid_estimate <- fit$m$getPars()[2] | |
get_midpoint_estimate <- function(test_df) { | |
x <- test_df$xist_methylation | |
y <- test_df$value | |
fit <- NA | |
try(fit <- nls(y ~ SSlogis(x, Asym, xmid, scal), data = data.frame(x, y))) | |
if (length(fit) == 1 & is.na(fit)) { | |
return(NA) | |
}else { | |
xmid_estimate <- as.numeric(unlist(fit$m$getPars()[2])) | |
} | |
xmid_estimate <- as.numeric(unlist(fit$m$getPars()[2])) | |
# print(xmid_estimate) | |
return(xmid_estimate) | |
} | |
mods <- data_norm_df %>% | |
group_by(rn) %>% | |
do(xmid = get_midpoint_estimate(.)) | |
mods <- mods %>% mutate(fixed_xmid = unlist(xmid)) | |
ggplot(mods %>% filter(rn %in% probes_that_are_changing)) + | |
geom_freqpoly(aes(fixed_xmid), bins=500) + | |
scale_x_continuous(limit = c(1,2)) + | |
geom_vline(xintercept = 1.54, color="green") + | |
labs(title="xmid distribution from sigmoid curve fit") | |
early_probes_blue <- (mods %>% filter(fixed_xmid < 1.55))$hg38_pos | |
View(data_norm_df %>% filter(rn %in% early_probes_blue) %>% group_by(rn) %>% filter(row_number() == 1) %>% ungroup() %>% dplyr::select(rn, hg38_pos, hg38_gene_name)) | |
# look at the distribution of xmids along the X chromosome | |
dat_df <- data_norm_df # %>% group_by(rn) %>% filter(row_number() == 1) %>% ungroup() | |
dat_df <- left_join(dat_df, mods %>% select(hg38_pos, fixed_xmid), by="hg38_pos") | |
ggplot(dat_df %>% filter(hg38_pos %in% early_probes_blue), aes(x=as.numeric(hg38_pos), y=fixed_xmid)) + | |
geom_smooth(span=.1) + | |
geom_point() + | |
scale_y_continuous(limits = c(1,2)) + | |
scale_x_continuous(breaks = (1:14)*1e7) + | |
labs(title="x mid along x chromosome - smooth") | |
data_df <- reshape2::melt(correlating_probes_df %>% | |
filter( abs(correlation_to_xist) >= .5) %>% | |
select(rn, | |
correlation_to_xist_quant_1, | |
correlation_to_xist_quant_2, | |
correlation_to_xist_quant_3, | |
correlation_to_xist_quant_4), id.vars=c("rn")) | |
data_df <- left_join(data_df, x_annotation_df %>% select(rowname, hg38_pos, hg38_gene_name), by=c("rn"="rowname")) | |
xist_correlating_probes <- filter(correlating_probes_df, abs(correlation_to_xist_quant_3) >= .5)$rn | |
ggplot(data_df) + | |
geom_boxplot(aes(x=variable, y=abs(value)), show.legend=F) | |
ggplot(data_df) + | |
geom_point(aes(x=hg38_pos, y=value, color=variable), show.legend=T, alpha=.5) | |
ggplot(correlating_probes_df) + | |
geom_line(aes(x=hg38_pos, y=abs(correlation_to_xist))) | |
ggplot() + | |
geom_line(data = all_dmp_df %>% filter(hg38_chr == "chrX"), aes(x=hg38_pos, y=1/(transition_1_qval)), color="blue", alpha=.3) + | |
geom_line(data=correlating_probes_df, aes(x=hg38_pos, y=(abs(correlation_to_xist_quant_1))), color="red", alpha=.3) | |
xist_corr <- correlating_probes_df$correlation_to_xist | |
names(xist_corr) <- correlating_probes_df$rn | |
tran_1_qval <- (all_dmp_df %>% filter(hg38_chr == "chrX"))$transition_1_qval | |
names(tran_1_qval) <- (all_dmp_df %>% filter(hg38_chr == "chrX"))$hg38 | |
cor.test(abs(xist_corr), tran_1_qval[names(xist_corr)], method = "kendall") | |
data_df <- left_join(all_dmp_df %>% filter(hg38_chr == "chrX") %>% select(rowname, hg38_pos, transition_1_qval, hg38_gene), | |
correlating_probes_df %>% select(rn, correlation_to_xist, correlation_to_xist_quant_1, | |
correlation_to_xist_quant_2, correlation_to_xist_quant_3, | |
correlation_to_xist_quant_4), | |
by=c("rowname"="rn")) | |
x_chromosome_length <- 155270560 | |
chunk_size <- 8e6 | |
for (n in 1:ceiling(x_chromosome_length/chunk_size)) { | |
min <- (n-1) * chunk_size | |
max <- (n) * chunk_size | |
data_df[(data_df$hg38_pos > min) & (data_df$hg38_pos <= max), "pos_chunk"] <- n | |
} | |
ggplot(data_df) + | |
geom_point(aes(x=abs(correlation_to_xist), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) + | |
labs(title="transition 1 qval vs correlation to xist") | |
cor.test(abs(data_df$correlation_to_xist), data_df$transition_1_qval, method = "spearman") | |
ggplot(data_df) + | |
geom_point(aes(x=abs(correlation_to_xist_quant_1), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) + | |
labs(title="transition 1 qval vs correlation to xist quant 1") | |
ggplot(data_df) + | |
geom_point(aes(x=abs(correlation_to_xist_quant_2), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) + | |
labs(title="transition 1 qval vs correlation to xist quant 2") | |
ggplot(data_df) + | |
geom_point(aes(x=abs(correlation_to_xist_quant_3), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) + | |
labs(title="transition 1 qval vs correlation to xist quant 3") | |
ggplot(data_df) + | |
geom_point(aes(x=abs(correlation_to_xist_quant_4), y=-log10(transition_1_qval), color=as.factor(pos_chunk)), show.legend = F) + | |
labs(title="transition 1 qval vs correlation to xist quant 4") | |
data_df <- left_join(all_dmp_df %>% filter(hg38_chr == "chrX") %>% select(rowname, hg38_gene, hg38_pos, hg38_refgene_group), | |
correlating_probes_df %>% select(rn, corr_3=correlation_to_xist_quant_3, corr_2=correlation_to_xist_quant_2), | |
by=c("rowname"="rn")) | |
ggplot(data_df) + | |
geom_smooth(aes(x=hg38_pos, y=abs(corr_3))) | |
data_df$pseudo_position <- order(order(data_df$hg38_pos)) | |
data_df_filtered <- data_df %>% group_by(hg38_gene) %>% dplyr::arrange(desc(corr_3)) %>% filter(row_number() <= 3) %>% ungroup() | |
cutoffs <- c() | |
num_passings <- c() | |
for(c in 1:90) { | |
cutoff <- c/100 | |
pseudo_positions_above_cutoff <- filter(data_df_filtered, abs(corr_3) >= cutoff)$pseudo_position | |
num_passings <- c(num_passings, length(pseudo_positions_above_cutoff)) | |
cutoffs <- c(cutoffs, cutoff) | |
} | |
cutoff_df <- tibble(cutoff=cutoffs, num_passing=num_passings) | |
ggplot(cutoff_df, aes(x=cutoff, y=num_passing)) + geom_line() + labs(title = "optimizing cutoff") | |
cutoff <- .4 | |
pseudo_positions_above_cutoff <- filter(data_df_filtered, abs(corr_3) >= cutoff)$pseudo_position | |
# library(BAMMtools) | |
# jbreaks <- getJenksBreaks(pseudo_positions_above_cutoff, 5) | |
k_min <- 1 | |
k_max <- 15 | |
wss <- sapply(k_min:k_max, | |
function(k){kmeans(pseudo_positions_above_cutoff, k, nstart=50,iter.max = 15 )$tot.withinss}) | |
elbow_df <- tibble(num_clusters = k_min:k_max, wcss = wss) | |
ggplot(elbow_df, aes(x=num_clusters, y=wcss)) + | |
geom_point() + | |
geom_line() + | |
labs(x="Number of clusters K", | |
y="Total within-clusters sum of squares", | |
title = "Optimal k-means clusters") | |
# pick k=4 | |
k=5 | |
kmeans <- kmeans(pseudo_positions_above_cutoff, k, nstart=50, iter.max = 15) | |
center_genes <- c() | |
center_positions <- c() | |
for(c in kmeans$centers) { | |
rounded_c <- round(c) | |
center_genes <- c(center_genes, unlist(filter(data_df, pseudo_position == rounded_c)$hg38_gene)) | |
center_positions <- c(center_positions, unlist(filter(data_df, pseudo_position == rounded_c)$hg38_pos)) | |
} | |
print(center_positions/1e6) | |
positions_above_cutoff <- filter(data_df_filtered, abs(corr_3) >= cutoff)$hg38_pos | |
position_cluster_df <- tibble(positions=positions_above_cutoff, | |
cluster=kmeans$cluster, | |
correlation_to_xist=filter(data_df_filtered, abs(corr_3) >= cutoff)$corr_3) | |
library(SDMTools) | |
centers_df <- position_cluster_df %>% group_by(cluster) %>% summarise(center=weighted.mean(positions, correlation_to_xist), stderr = wt.sd(positions, correlation_to_xist)/sqrt(length(positions))) | |
write_csv(data_df, "xist_correlation_df.csv") | |
python_centers <- c(1109.799248, | |
5935.39440388, | |
7162.58421653, | |
4498.5805607) | |
center_genes <- c() | |
center_positions <- c() | |
for(c in python_centers) { | |
rounded_c <- round(c) | |
center_genes <- c(center_genes, unlist(filter(data_df, pseudo_position == rounded_c)$hg38_gene)) | |
center_positions <- c(center_positions, unlist(filter(data_df, pseudo_position == rounded_c)$hg38_pos)) | |
} | |
print(center_positions/1e6) | |
# get real center based on clusters | |
positions_above_cutoff <- filter(sample_cluster_0_noh9_chrX_df, abs(correlation_to_xist) >= cutoff)$pos | |
``` | |
## Bedgraph files for the first transition | |
```{r} | |
library(tidyverse) | |
library(ggplot2) | |
all_dmp_df <- read_csv('all_dmp_df.csv') | |
# do this based on genes and when the promoters change their methylation. | |
x_annotation_df <- read_csv("x_annotation.csv") | |
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv') | |
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data | |
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname) | |
bedgraph_header <- function(name="", description="") { | |
return(sprintf('track type=bedGraph name="%s" description="%s" color=1,0,0 altColor=1,0,0 priority=priority autoScale=on alwaysZero=on gridDefault=on maxHeightPixels=100:60:8 visbility=full graphType=bar viewLimits=0:1 yLineMark=0 yLineOnOff=off windowingFunction=mean+whiskers smoothingWindow=8', name, description)) | |
} | |
# windowing the bed file | |
window_average_chrX <- function(bed_file_df, window_size = 1000000, step_size = 10000, include_empty=FALSE, calculate_sum=F) { | |
# bed_file_df <- bed_df | |
x_chr_size <- 155270560 | |
windowed_bed_df <- tibble(chr = character(), | |
pos_start = numeric(), | |
pos_end = numeric(), | |
delta = numeric()) | |
for (s in 0:floor(155270560 / step_size)) { | |
start_position <- s * step_size | |
end_position <- min((start_position + window_size), x_chr_size) | |
num_rows <- nrow(bed_file_df %>% filter((chr == "chrX") & (pos >= start_position) & (pos <= end_position))) # (abs((!!start_position) - pos) <= (window_size/2)))) | |
if ((num_rows > 0) | (include_empty == T)) { | |
if (calculate_sum == F) { | |
window_mean_delta <- mean((bed_file_df %>% filter((chr == "chrX") & (abs(!!start_position - pos) <= (window_size/2))))$delta, na.rm=T) | |
if(is.na(window_mean_delta)) { | |
window_mean_delta <- 0 | |
} | |
windowed_bed_df <- add_row(windowed_bed_df, | |
chr="chrX", | |
pos_start=start_position, | |
pos_end=end_position, | |
delta=window_mean_delta) | |
}else{ | |
window_sum_delta <- sum((bed_file_df %>% filter((chr == "chrX") & (abs(!!start_position - pos) <= (window_size/2))))$delta, na.rm=T) | |
if(is.na(window_sum_delta)) { | |
window_sum_delta <- 0 | |
} | |
windowed_bed_df <- add_row(windowed_bed_df, | |
chr="chrX", | |
pos_start=start_position, | |
pos_end=end_position, | |
delta=window_sum_delta) | |
} | |
} | |
} | |
return(windowed_bed_df) | |
} | |
# for (t in 1:5) { | |
# write(bedgraph_header(name=sprintf("all_delta_%d", t), | |
# description=sprintf("all deltas from %d transition 1Mb windowed", t)), | |
# sprintf("bedgraphs/windowed_all_delta_tran_%d.bedgraph", t)) | |
# bed_df <- all_dmp_df %>% filter(chr == "chrX") | |
# cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1) | |
# cluster_col_1 <- sprintf("cluster_%d_mean", t) | |
# bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0]) | |
# | |
# bed_df$pos_start <- bed_df$pos - 30 | |
# bed_df$pos_end <- bed_df$pos + 30 | |
# | |
# write_tsv(window_average_chrX(bed_file_df=bed_df), sprintf("bedgraphs/windowed_all_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE) | |
# # write_tsv(bed_df[c("chr", "pos_start", "pos_end", "delta")], sprintf("bedgraphs/all_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE) | |
# } | |
# | |
# for (t in 1:5) { | |
# write(bedgraph_header(name=sprintf("non_eroding_delta_%d", t), | |
# description=sprintf("non_eroding deltas from %d transition 1Mb windowed", t)), | |
# sprintf("bedgraphs/windowed_non_eroding_delta_tran_%d.bedgraph", t)) | |
# bed_df <- all_dmp_df %>% filter((chr == "chrX") & (passes_threshold == F)) | |
# cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1) | |
# cluster_col_1 <- sprintf("cluster_%d_mean", t) | |
# bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0]) | |
# | |
# bed_df$pos_start <- bed_df$pos - 30 | |
# bed_df$pos_end <- bed_df$pos + 30 | |
# | |
# write_tsv(window_average_chrX(bed_file_df=bed_df), sprintf("bedgraphs/windowed_non_eroding_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE) | |
# | |
# } | |
for (t in 1:5) { | |
write(bedgraph_header(name=sprintf("eroding_delta_%d", t), | |
description=sprintf("eroding deltas from %d transition 1Mb windowed", t)), | |
sprintf("bedgraphs/windowed_eroding_delta_tran_%d.bedgraph", t)) | |
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T)) | |
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1) | |
cluster_col_1 <- sprintf("cluster_%d_mean", t) | |
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0]) | |
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), sprintf("bedgraphs/windowed_eroding_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE) | |
} | |
for (t in 1:5) { | |
write(bedgraph_header(name=sprintf("noneroding_delta_%d", t), | |
description=sprintf("noneroding deltas from %d transition 1Mb windowed", t)), | |
sprintf("bedgraphs/windowed_eroding_delta_tran_%d.bedgraph", t)) | |
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == F)) | |
cluster_col_0 <- "cluster_0_mean" #sprintf("cluster_%d_mean", t-1) | |
cluster_col_1 <- sprintf("cluster_%d_mean", t) | |
bed_df$delta <- unlist(bed_df[cluster_col_1]) - unlist(bed_df[cluster_col_0]) | |
write_tsv(window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos)), sprintf("bedgraphs/windowed_noneroding_delta_tran_%d.bedgraph", t), col_names = F, append=TRUE) | |
} | |
``` | |
## Correlation with histone methylation marks (K27 and K9) | |
```{r} | |
library(tidyverse) | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
h3k27me3_peaks <- read_table2("bedgraphs/GSM1528885_H9_XISTpos_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal")) | |
h3k27me3_peaks_chrX <- h3k27me3_peaks %>% filter(chr == "chrX") | |
h3k27me3_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k27me3_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
# h3k9me3_signal <- read_tsv("bedgraphs/GSM1528888_H9_XISTpos_H3K9me3.signal.bed", skip=1, col_names=c("chr", "start", "end", "signal")) | |
# h3k9me3_signal_chrX <- h3k9me3_signal %>% filter(chr == "chrX") | |
# h3k27me3_xe_signal <- read_table2("bedgraphs/GSM1528886_H9_XISTneg_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal")) | |
# h3k27me3_xe_signal_chrX <- h3k27me3_signal %>% filter(chr == "chrX") | |
# h3k27me3_xe_signal_chrX_avg <- window_average_chrX(bed_file_df=h3k27me3_xe_signal_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
# | |
# | |
# h3k9me3_xe_signal <- read_tsv("bedgraphs/GSM1528889_H9_XISTneg_H3K9me3.signal.bed", skip=1, col_names=c("chr", "start", "end", "signal")) | |
# h3k9me3_xe_signal_chrX <- h3k9me3_signal %>% filter(chr == "chrX") | |
h3k9_xi_peaks <- read_table2("bedgraphs/GSM1528888_H9_XISTpos_K9me3.peaks.bed.gz", col_names=c("chr", "start", "end", "signal")) | |
h3k9_xi_peaks_chrX <- h3k9_xi_peaks %>% filter(chr == "chrX") | |
h3k9_xi_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k9_xi_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
# h3k9_xe_peaks <- read_table2("/pinterlab/GSM1528889_H9_XISTneg_K9me3.peaks.bed", col_names=c("chr", "start", "end", "signal")) | |
# h3k9_xe_peaks_chrX <- h3k9_xe_peaks %>% filter(chr == "chrX") | |
# h3k9_xe_peaks_windowed <- window_average_chrX(bed_file_df=h3k9_xe_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
histone_mark_correlations <- tibble(transition=numeric(), | |
h3k27me3_correlation=numeric(), | |
h3k27me3_correlation_p_val=numeric(), | |
h3k9me3_correlation=numeric(), | |
h3k9me3_correlation_p_val=numeric()) | |
for (t in 1:5) { | |
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t)) | |
averaged_distance <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T) | |
k27_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_peaks_chrX_windowed$delta) | |
k9_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xi_peaks_chrX_windowed$delta) # h3k9me3_signal_chrX$signal) | |
# k27_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_xe_signal_chrX$signal) | |
# k9_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xe_peaks_windowed$delta) | |
histone_mark_correlations <- add_row(histone_mark_correlations, | |
transition=t, | |
h3k27me3_correlation=k27_cor_test$estimate, | |
h3k27me3_correlation_p_val=k27_cor_test$p.value, | |
h3k9me3_correlation=k9_cor_test$estimate, | |
h3k9me3_correlation_p_val=k9_cor_test$p.value) | |
# h3k27me3_xe_correlation=k27_xe_cor_test$estimate, | |
# h3k9me3_xe_correlation=k9_xe_cor_test$estimate) | |
} | |
histone_cor_df <- reshape2::melt(histone_mark_correlations %>% | |
dplyr::select(transition, | |
h3k27me3_correlation, | |
h3k9me3_correlation), | |
id.vars=c("transition")) | |
histone_cor_df$variable <- factor(histone_cor_df$variable, levels=c("h3k9me3_correlation", "h3k27me3_correlation")) | |
histon_cor_plot <- ggplot(histone_cor_df) + | |
geom_bar(aes(x=transition, y=value, fill=variable), stat="identity", position=position_dodge()) + | |
theme_bw() + | |
base_plot_theme + | |
coord_flip() + | |
facet_grid(rows=vars(sprintf("Tran %d", transition)), scales="free_y")+ | |
labs(title="Correlation of DMPs With Histone Meth.", | |
x="", | |
y="Pearson Correlation", | |
fill="Histone Modification") + | |
scale_fill_manual(limits=c("h3k27me3_correlation", "h3k9me3_correlation"), | |
labels=c("H3K27me3", "H3K9me3"), | |
values = c(pal_jama()(1), "#8C564B"), | |
guide=F) + | |
scale_x_continuous(labels = NULL) + | |
scale_y_continuous(limits=c(-.13,.33)) + | |
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.grid.minor.y = element_blank(), | |
axis.ticks.length.y = unit(0, "mm"), | |
axis.title.y = element_blank(), | |
axis.text.y = element_blank(), | |
panel.spacing.y = unit(.75, "mm")) | |
# theme(legend.text = element_text(size=4), | |
# legend.title = element_text(size=4,face="bold"), | |
# legend.key.width = unit(2, units="mm"), | |
# legend.key.height = unit(3, units="mm"), | |
# legend.key.size = unit(.2, units = "mm"), | |
# axis.ticks.length.y = unit(0, "mm"), | |
# axis.title.y = element_text(size=0, margin = margin(0,0,0,0)), | |
# axis.text.y = element_text(size=0, margin = margin(0,0,0,0)), | |
# strip.text = element_text(size=4, margin = margin(.25,0,.25,0, "mm")), | |
# strip.background = element_rect(size = .3), | |
# panel.spacing.y = unit(.5, "mm")) | |
ggsave("figs/main/fig_4_histone_cor_plot.svg", histon_cor_plot, width = 46, height = 40, units = "mm") | |
ggsave("figs/main/fig_4_histone_cor_plot_no_guide.pdf", histon_cor_plot, width = 46, height = 47.5, units = "mm") | |
ggsave("figs/main/fig_4_histone_cor_plot_spearman.svg", histon_cor_plot, width = 46, height = 47.5, units = "mm") | |
``` | |
## Plotting the browser view | |
```{r} | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
plotting_bedgraph_df <- tibble(chr=character(), | |
start_pos=numeric(), | |
end_pos=numeric(), | |
value_num=numeric(), | |
type=character()) | |
bedgraphs <- list(H3K27me3="bedgraphs/GSM1528885_H9_XISTpos_H3K27me3.signal.bed.gz", | |
H3K9me3="bedgraphs/GSM1528888_H9_XISTpos_H3K9me3.signal.bed.gz", | |
delta_tran_1="bedgraphs/windowed_eroding_delta_tran_1.bedgraph", | |
delta_tran_2="bedgraphs/windowed_eroding_delta_tran_2.bedgraph", | |
delta_tran_3="bedgraphs/windowed_eroding_delta_tran_3.bedgraph", | |
delta_tran_4="bedgraphs/windowed_eroding_delta_tran_4.bedgraph", | |
delta_tran_5="bedgraphs/windowed_eroding_delta_tran_5.bedgraph", | |
delta_H9="bedgraphs/windowed_eroding_delta_tran_H9.bedgraph") | |
for (bedname in names(bedgraphs)) { | |
bed_file <- read_tsv(as.character(bedgraphs[bedname]), | |
skip = 1, | |
col_names = c("chr", "start_pos", "end_pos", "value_num")) | |
bed_file$type <- bedname | |
bed_file[bed_file$start_pos >= 58100000 & bed_file$end_pos <= 63000000, ]$value_num <- 0 | |
plotting_bedgraph_df <- bind_rows(plotting_bedgraph_df, bed_file) | |
} | |
library(scales) | |
library(facetscales) | |
scales_y <- list( | |
H3K27me3 = scale_y_continuous(), | |
H3K9me3 = scale_y_continuous(), | |
delta_tran_1 = scale_y_continuous(limits = c(-.5,.5)), | |
delta_tran_2 = scale_y_continuous(limits = c(-.5,.5)), | |
delta_tran_3 = scale_y_continuous(limits = c(-.5,.5)), | |
delta_tran_4 = scale_y_continuous(limits = c(-.5,.5)), | |
delta_tran_5 = scale_y_continuous(limits = c(-.5,.5)), | |
delta_H9 = scale_y_continuous(limits = c(-.5,.5)) | |
) | |
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz") | |
#hg19_pos was manually annotated for each gene | |
escapee_pos_df <- mean_methylation_df_escape %>% | |
transmute(hg38_gene_name, | |
delta_tran_1=hg19_pos, | |
delta_tran_2=hg19_pos, | |
delta_tran_3=hg19_pos, | |
delta_tran_4=hg19_pos, | |
delta_tran_5=hg19_pos) %>% | |
reshape2::melt(id.vars=c("hg38_gene_name"), | |
value.name="TSS", | |
variable.name="type") | |
tran_specific_df <- all_dmp_df %>% filter(passes_threshold==T & hg19_chr =="chrX") %>% select(delta, hg19_pos, key_transition) %>% mutate(type=sprintf("delta_tran_%d", key_transition)) | |
dmp_table_clusterA_H9 <- read_csv("dmp_table_clusterA_H9.csv.gz") | |
tran_specific_df <- rbind(tran_specific_df, dmp_table_clusterA_H9 %>% filter(passes_threshold==T & hg19_chr == "chrX") %>% mutate(delta=H9_mean-cluster_0_mean, type="delta_H9", key_transition=6) %>% dplyr::select(delta, hg19_pos, key_transition, type)) | |
browser_view <- ggplot(plotting_bedgraph_df %>% filter(chr == "chrX"), mapping=aes(x=start_pos, y = value_num)) + | |
geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="darkgrey", size=.3, alpha=.8) + | |
geom_area(mapping = aes(x=start_pos, y = value_num, fill=type), stat = "identity") + | |
scale_fill_manual( | |
limits=names(bedgraphs), | |
values = c(pal_jama()(1), "#8C564B", "#BABABA", "#BABABA", "#BABABA", "#BABABA", "#BABABA", "#BABABA"), | |
guide=F) + | |
facet_grid_sc(rows=vars(factor(type, levels = names(bedgraphs))), scales = list(y=scales_y),labeller = labeller(.rows=function(n){ | |
return(list("H3K27me3"="H3K27me3", | |
"H3K9me3"="H3K9me3", | |
"delta_tran_1"="Tran. 1", | |
"delta_tran_2"="Tran. 2", | |
"delta_tran_3"="Tran. 3", | |
"delta_tran_4"="Tran. 4", | |
"delta_tran_5"="Tran. 5", | |
"delta_H9"="H9")[n]) | |
})) + | |
geom_segment(data=tran_specific_df, mapping=aes(x=hg19_pos,xend=hg19_pos,y=0, yend=delta, color=type), alpha=.1) + | |
labs(y="", | |
x="X Chr Coordinate (Mb)") + | |
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) + | |
theme_bw() + | |
scale_color_manual(limits=c("delta_tran_1", "delta_tran_2", "delta_tran_3", "delta_tran_4", "delta_tran_5", "delta_H9"), | |
values=c(pal_d3()(5), pal_locuszoom()(4)[4]), guide=F) + | |
base_plot_theme + | |
# scale_color_material("cyan") + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3), | |
title = element_blank(), | |
axis.title.y = element_blank(), | |
axis.text.y=element_text(size=5)) | |
ggsave("figs/main/fig_4_cumulative_changes_grey_with_H9.pdf", browser_view, width = 171, height = 95, units = "mm") | |
ggsave("figs/main/fig_4_cumulative_changes_grey.pdf", browser_view, width = 171, height = 85, units = "mm") | |
``` | |
## Plotting the browser view for H9, iPSC, and ESCs | |
```{r} | |
# all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
plotting_bedgraph_df <- tibble(chr=character(), | |
start_pos=numeric(), | |
end_pos=numeric(), | |
value_num=numeric(), | |
type=character()) | |
bedgraphs <- list(H3K27me3="bedgraphs/GSM1528885_H9_XISTpos_H3K27me3.signal.bed.gz", | |
H3K9me3="bedgraphs/GSM1528888_H9_XISTpos_H3K9me3.signal.bed.gz", | |
delta_ipsc="bedgraphs/windowed_eroding_delta_tran_iPSC.bedgraph", | |
delta_esc="bedgraphs/windowed_eroding_delta_tran_ESC.bedgraph", | |
delta_H9="bedgraphs/windowed_eroding_delta_tran_H9.bedgraph") | |
for (bedname in names(bedgraphs)) { | |
bed_file <- read_tsv(as.character(bedgraphs[bedname]), | |
skip = 1, | |
col_names = c("chr", "start_pos", "end_pos", "value_num")) | |
bed_file$type <- bedname | |
bed_file[bed_file$start_pos >= 58100000 & bed_file$end_pos <= 63000000, ]$value_num <- 0 | |
plotting_bedgraph_df <- bind_rows(plotting_bedgraph_df, bed_file) | |
} | |
library(scales) | |
library(facetscales) | |
scales_y <- list( | |
H3K27me3 = scale_y_continuous(), | |
H3K9me3 = scale_y_continuous(), | |
delta_ipsc = scale_y_continuous(limits = c(-.5,.5)), | |
delta_esc = scale_y_continuous(limits = c(-.5,.5)), | |
delta_H9 = scale_y_continuous(limits = c(-.5,.5)) | |
) | |
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz") | |
#hg19_pos was manually annotated for each gene | |
escapee_pos_df <- mean_methylation_df_escape %>% | |
transmute(hg38_gene_name, | |
delta_ipsc=hg19_pos, | |
delta_esc=hg19_pos, | |
delta_H9=hg19_pos) %>% | |
reshape2::melt(id.vars=c("hg38_gene_name"), | |
value.name="TSS", | |
variable.name="type") | |
dmp_table_clusterA_non_A_iPSC <- read_csv("dmp_table_clusterA_non_A_iPSC.csv.gz") | |
tran_specific_df <- dmp_table_clusterA_non_A_iPSC %>% filter(passes_threshold==T & hg19_chr == "chrX") %>% mutate(delta=non_A_iPSC_mean-cluster_0_mean, type="delta_ipsc") %>% dplyr::select(delta, hg19_pos, type) | |
dmp_table_clusterA_non_A_ESC <- read_csv("dmp_table_clusterA_non_A_ESC.csv.gz") | |
tran_specific_df <- rbind(tran_specific_df, dmp_table_clusterA_non_A_ESC %>% filter(passes_threshold==T & hg19_chr == "chrX") %>% mutate(delta=non_A_ESC_mean-cluster_0_mean, type="delta_esc") %>% dplyr::select(delta, hg19_pos, type)) | |
dmp_table_clusterA_H9 <- read_csv("dmp_table_clusterA_H9.csv.gz") | |
tran_specific_df <- rbind(tran_specific_df, dmp_table_clusterA_H9 %>% filter(passes_threshold==T & hg19_chr == "chrX") %>% mutate(delta=H9_mean-cluster_0_mean, type="delta_H9") %>% dplyr::select(delta, hg19_pos, type)) | |
browser_view_h9 <- ggplot(plotting_bedgraph_df %>% filter(chr == "chrX"), mapping=aes(x=start_pos, y = value_num)) + | |
geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="darkgrey", size=.3, alpha=.8) + | |
geom_area(mapping = aes(x=start_pos, y = value_num, fill=type), stat = "identity") + | |
scale_fill_manual( | |
limits=names(bedgraphs), | |
values = c(pal_jama()(1), "#8C564B", "#BABABA", "#BABABA", "#BABABA"), | |
guide=F) + | |
facet_grid_sc(rows=vars(factor(type, levels = names(bedgraphs))), scales = list(y=scales_y),labeller = labeller(.rows=function(n){ | |
return(list("H3K27me3"="H3K27me3", | |
"H3K9me3"="H3K9me3", | |
"delta_ipsc"="iPSCs", | |
"delta_esc"="ESCs", | |
"delta_H9"="H9")[n]) | |
})) + | |
geom_segment(data=tran_specific_df, mapping=aes(x=hg19_pos,xend=hg19_pos,y=0, yend=delta, color=type), alpha=.1) + | |
labs(y="", | |
x="X Chr Coordinate (Mb)", | |
title="Changes in DNAme by Cell Type") + | |
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) + | |
theme_bw() + | |
scale_color_manual(limits=c("delta_ipsc", "delta_esc", "delta_H9"), | |
values=c(pal_locuszoom()(3)), guide=F) + | |
base_plot_theme + | |
# scale_color_material("cyan") + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3), | |
title = element_blank(), | |
axis.title.y = element_blank(), | |
axis.text.y=element_text(size=5), | |
panel.spacing.y = unit(.75, "mm")) | |
ggsave("figs/supplementary/supp_2_cumulative_changes_ipsc_esc_h9.pdf", browser_view_h9, width = 174, height = 55.4, units = "mm") | |
``` | |
## Browser view for noneroding probes | |
```{r} | |
non_eroding_plotting_bedgraph_df <- tibble(chr=character(), | |
start_pos=numeric(), | |
end_pos=numeric(), | |
value_num=numeric(), | |
type=character()) | |
non_eroding_bedgraphs <- list(delta_tran_1="bedgraphs/windowed_noneroding_delta_tran_1.bedgraph", | |
delta_tran_2="bedgraphs/windowed_noneroding_delta_tran_2.bedgraph", | |
delta_tran_3="bedgraphs/windowed_noneroding_delta_tran_3.bedgraph", | |
delta_tran_4="bedgraphs/windowed_noneroding_delta_tran_4.bedgraph", | |
delta_tran_5="bedgraphs/windowed_noneroding_delta_tran_5.bedgraph") | |
for (bedname in names(non_eroding_bedgraphs)) { | |
bed_file <- read_tsv(as.character(non_eroding_bedgraphs[bedname]), skip = 1, col_names = c("chr", "start_pos", "end_pos", "value_num")) | |
bed_file$type = bedname | |
bed_file[bed_file$start_pos >= 58100000 & bed_file$end_pos <= 63000000, ]$value_num <- 0 | |
non_eroding_plotting_bedgraph_df <- bind_rows(non_eroding_plotting_bedgraph_df, bed_file) | |
} | |
library(scales) | |
library(facetscales) | |
scales_y <- list( | |
delta_tran_1 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)), | |
delta_tran_2 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)), | |
delta_tran_3 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)), | |
delta_tran_4 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)), | |
delta_tran_5 = scale_y_continuous(limits = c(-.5,.5), breaks=c(-.5,0,.5)) | |
) | |
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz") | |
#hg19_pos was manually annotated for each gene | |
escapee_pos_df <- mean_methylation_df_escape %>% | |
transmute(hg38_gene_name, | |
delta_tran_1=hg19_pos, | |
delta_tran_2=hg19_pos, | |
delta_tran_3=hg19_pos, | |
delta_tran_4=hg19_pos, | |
delta_tran_5=hg19_pos) %>% | |
reshape2::melt(id.vars=c("hg38_gene_name"), | |
value.name="TSS", | |
variable.name="type") | |
non_eroding_plotting_bedgraph_df$type <- factor(non_eroding_plotting_bedgraph_df$type, levels = names(non_eroding_bedgraphs)) | |
browser_view_non_eroding <- ggplot(non_eroding_plotting_bedgraph_df %>% filter(chr == "chrX"), mapping=aes(x=start_pos, y = value_num)) + | |
# geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="grey", alpha = .6) + | |
geom_area(mapping = aes(x=start_pos, y = value_num), stat = "identity") + | |
facet_grid_sc(rows=vars(type), scales = list(y=scales_y), labeller = labeller(type=function(n){ | |
return(list("delta_tran_1"="Tran. 1", | |
"delta_tran_2"="Tran. 2", | |
"delta_tran_3"="Tran. 3", | |
"delta_tran_4"="Tran. 4", | |
"delta_tran_5"="Tran. 5")[n]) | |
})) + | |
labs(y="", | |
x="X Chr Coordinate (Mb)", | |
title = "Cumulative Changes for Non-eroding Probes") + | |
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) + | |
theme_bw() + | |
base_plot_theme + | |
scale_color_viridis_d() + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
panel.spacing.y = unit(.75, "mm"), | |
panel.spacing.x = unit(0, "mm"), | |
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3)) | |
ggsave("figs/supplementary/supp_2_cumulative_changes_non_eroding.pdf", browser_view_non_eroding, width = 86, height = 50, units = "mm") | |
``` | |
## Get the composition of each of the CpG clusters | |
```{r} | |
all_dmp_df <- rownames_to_column(all_dmp_df) | |
x_annotation_df <- rownames_to_column(as.data.frame(x_annotation_df)) | |
tran_1_inc_annotation_tib <- x_annotation_df %>% | |
dplyr::filter(Name %in% (dplyr::filter(all_dmp_df, (key_transition == 1) & | |
(delta_increasing == TRUE) & | |
(passes_threshold == TRUE))$rowname)) | |
tran_1_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 1) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname) | |
tran_2_inc_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 2) & (delta_increasing == TRUE) & (passes_threshold == TRUE))$rowname) | |
tran_2_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 2) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname) | |
5 | |
tran_3_inc_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 3) & (delta_increasing == TRUE) & (passes_threshold == TRUE))$rowname) | |
tran_3_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 3) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname) | |
tran_4_inc_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 4) & (delta_increasing == TRUE) & (passes_threshold == TRUE))$rowname) | |
tran_4_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 4) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname) | |
tran_5_inc_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 5) & (delta_increasing == TRUE) & (passes_threshold == TRUE))$rowname) | |
tran_5_dec_annotation_tib <- x_annotation_df %>% dplyr::filter(Name %in% dplyr::filter(all_dmp_df, (key_transition == 5) & (delta_increasing == F) & (passes_threshold == TRUE))$rowname) | |
refgene_groups <- unlist(str_split(x_annotation_df$UCSC_RefGene_Group, ";")) | |
refgene_groups_sig <- unlist(str_split(sig_annotation$UCSC_RefGene_Group, ";")) | |
tran_1_inc_groups_sig <- unlist(str_split(tran_1_inc_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_1_dec_groups_sig <- unlist(str_split(tran_1_dec_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_2_inc_groups_sig <- unlist(str_split(tran_2_inc_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_2_dec_groups_sig <- unlist(str_split(tran_2_dec_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_3_inc_groups_sig <- unlist(str_split(tran_3_inc_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_3_dec_groups_sig <- unlist(str_split(tran_3_dec_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_4_inc_groups_sig <- unlist(str_split(tran_4_inc_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_4_dec_groups_sig <- unlist(str_split(tran_4_dec_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_5_inc_groups_sig <- unlist(str_split(tran_5_inc_annotation_tib$UCSC_RefGene_Group, ";")) | |
tran_5_dec_groups_sig <- unlist(str_split(tran_5_dec_annotation_tib$UCSC_RefGene_Group, ";")) | |
plot_tib <- tibble(category=character(),type=character(), percentage=numeric()) | |
add_to_plot_tib <- function(plot_tib, cat, group) { | |
group[is.na(group)] <- "NA" | |
percents <- sapply(unique(group), function(x) sum(x == group, na.rm = T)/length(group)) | |
for (name in names(percents)) { | |
plot_tib <- add_row(plot_tib, category=cat, type=name, percentage=percents[name]) | |
} | |
return(plot_tib) | |
} | |
plot_tib <- add_to_plot_tib(plot_tib, "before", refgene_groups) | |
plot_tib <- add_to_plot_tib(plot_tib, "after", refgene_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "1_inc", tran_1_inc_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "1_dec", tran_1_dec_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "2_inc", tran_2_inc_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "2_dec", tran_2_dec_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "3_inc", tran_3_inc_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "3_dec", tran_3_dec_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "4_inc", tran_4_inc_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "4_dec", tran_4_dec_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "5_inc", tran_5_inc_groups_sig) | |
plot_tib <- add_to_plot_tib(plot_tib, "5_dec", tran_5_dec_groups_sig) | |
library(scales) | |
ggplot(plot_tib,aes(x=factor(category, levels=c("before", | |
"after", | |
"1_inc", | |
"1_dec", | |
"2_inc", | |
"2_dec", | |
"3_inc", | |
"3_dec", | |
"4_inc", | |
"4_dec", | |
"5_inc", | |
"5_dec")), | |
y=percentage, | |
label=paste(round(percentage*100), "%"), | |
fill=factor(type, levels=c("1stExon","TSS200", "TSS1500", "3'UTR", "5'UTR", "Body", "NA" ))))+ | |
geom_bar(stat="identity") + | |
geom_text(position = position_stack(vjust = 0.5), size=3) + | |
scale_y_continuous(labels = percent)+ | |
scale_x_discrete(limits=c("before", "after", "1_inc", "1_dec", "2_inc", "2_dec", "3_inc", "3_dec", "4_inc", "4_dec", "5_inc", "5_dec")) + | |
labs(x="", y="Percent Composition", fill="CpG Location Relative to Gene", title="Composition of CpG Sites Before/After Variation Selection") + | |
theme(plot.title = element_text(hjust = 0.5)) + | |
scale_fill_brewer(palette = 'Pastel2', na.value="grey") | |
``` | |
## Volcano plots for each transition | |
```{r} | |
library(ggrepel) | |
make_volcano_plot <- function(all_dmp_df, diff_cutoff_high = NA, diff_cutoff_low = NA, log10_qval_cutoff = 3, cluster_1_col, cluster_2_col) { | |
# dmp_df$mapped_gene <- apply(dmp_df, 1, function (x) as.character(findClosestGene(x['chr'], as.numeric(x['pos']), genome='hg19')$geneName[1])) | |
dmp_df_temp <- all_dmp_df | |
dmp_df_temp$delta <- apply(dmp_df_temp, 1, function (x) as.numeric(x[cluster_2_col]) - as.numeric(x[cluster_1_col])) | |
dmp_df_temp$log10qval <- apply(dmp_df_temp, 1, function (x) -log10(as.numeric(x['qval']))) | |
lowerq = quantile(dmp_df_temp$delta)[2] | |
upperq = quantile(dmp_df_temp$delta)[4] | |
iqr = upperq - lowerq | |
if (is.na(diff_cutoff_high)) { | |
max_delta <- max(dmp_df_temp[dmp_df_temp$log10qval > log10_qval_cutoff,]$delta) | |
pos_range <- max_delta - 0 | |
mild.threshold.upper = (7.5 * pos_range / 10) | |
}else { | |
mild.threshold.upper <- diff_cutoff_high | |
} | |
if (is.na(diff_cutoff_low)) { | |
min_delta <- min(dmp_df_temp[dmp_df_temp$log10qval > log10_qval_cutoff,]$delta) | |
neg_range <- 0 - min_delta | |
mild.threshold.lower = (-8.5 * neg_range / 10) | |
}else { | |
mild.threshold.lower <- diff_cutoff_low | |
} | |
# dmp_df_temp$mapped_gene <- "" | |
label_rows <- ((dmp_df_temp$delta > mild.threshold.upper) | (dmp_df_temp$delta < mild.threshold.lower)) & | |
(dmp_df_temp$log10qval > log10_qval_cutoff) | |
# dmp_df_temp[label_rows, 'mapped_gene'] <- apply(dmp_df_temp[label_rows,], 1, function (x) as.character(findClosestGene(x['chr'], as.numeric(x['pos']), genome='hg19')$geneName[1])) | |
dmp_df_temp[is.na(dmp_df_temp$mapped_gene),]$mapped_gene <- dmp_df_temp[is.na(dmp_df_temp$mapped_gene),]$pos | |
#fix the ordering of the chromosomes for the color labels | |
dmp_df_temp$chr <- gsub("chr", "", dmp_df_temp$chr) | |
dmp_df_temp$chr <- factor(dmp_df_temp$chr, levels = c("1","2","3","4","5", | |
"6","7","8","9","10", | |
"11","12","13","14","15", | |
"16","17","18","19","20", | |
"21","22","X","Y")) | |
if (cluster_1_col != 'cluster_0_mean') { | |
show_legend = F | |
return(ggplot(dmp_df_temp, aes(delta, log10qval)) + | |
geom_point(aes(color = chr), alpha = .4, show.legend = show_legend) + | |
labs(x=sprintf('%s - %s', cluster_2_col, cluster_1_col), | |
y='-log10(q-value)', | |
title=sprintf('Transition from cluster %s to cluster %s', unlist(strsplit(cluster_1_col, '_'))[2], unlist(strsplit(cluster_2_col, '_'))[2])) + | |
theme(plot.title = element_text(hjust = 0.5))) | |
}else { | |
show_legend = T | |
return(ggplot(dmp_df_temp, aes(delta, log10qval)) + | |
geom_point(aes(color = chr), alpha = .4, show.legend = show_legend, size=.5) + | |
geom_text_repel(data = dmp_df_temp[((dmp_df_temp$delta > mild.threshold.upper) | (dmp_df_temp$delta < mild.threshold.lower)) & | |
(dmp_df_temp$log10qval > log10_qval_cutoff),], | |
aes(label=mapped_gene, color = chr),show.legend = FALSE, size=2) + | |
labs(x=sprintf('%s - %s', cluster_2_col, cluster_1_col), | |
y='-log10(q-value)', | |
title=sprintf('Transition from cluster %s to cluster %s', unlist(strsplit(cluster_1_col, '_'))[2], unlist(strsplit(cluster_2_col, '_'))[2])) + | |
theme(plot.title = element_text(hjust = 0.5))) | |
} | |
# make volcano plot | |
# ggsave(sprintf('volcano_plots_all_sites/transition_%s_to_%s.png', unlist(strsplit(cluster_1_col, '_'))[2], unlist(strsplit(cluster_2_col, '_'))[2]), dpi=300) | |
} | |
p1 <- make_volcano_plot(all_dmp_df, cluster_1_col = sprintf('cluster_%d_mean', 0), | |
cluster_2_col = sprintf('cluster_%d_mean', 1), | |
diff_cutoff_high = .21, diff_cutoff_low = -.8, log10_qval_cutoff = 25) | |
p1 + | |
labs(x="(cluster 1 mean) - (cluster 0 mean)", | |
title="CpG Sites In The 1st Transition", | |
color="Chromosome") + | |
theme(plot.title = element_text(hjust = 0.5, size=9), | |
legend.position="bottom", | |
legend.justification = "right", | |
legend.title = element_text(size = 6), | |
legend.text = element_text(size = 6), | |
legend.key.height = unit(2, "mm"), | |
legend.key.width = unit(2, "mm"), | |
legend.margin = margin(t=0,r=0,b=0,l=0, unit="mm"), | |
axis.title = element_text(size=8), | |
axis.text = element_text(size = 6)) + | |
guides(color=guide_legend(nrow=2,byrow=TRUE)) | |
ggsave("figs/volcano_plot_tran_1.pdf", height = 90, width = 85, units = 'mm') | |
ggsave("figs/volcano_plot_tran_1.png", height = 90, width = 85, units = 'mm', dpi=1000) | |
for (i in 0:4) { | |
print(i) | |
plot <- make_volcano_plot(all_dmp_df, cluster_1_col = sprintf('cluster_%d_mean', i), | |
cluster_2_col = sprintf('cluster_%d_mean', i+1), | |
diff_cutoff_high = .21, diff_cutoff_low = -.8, log10_qval_cutoff = 25) | |
if (i==0){ | |
p1 <- plot | |
}else if (i==1){ | |
p2 <- plot | |
}else if (i==2){ | |
p3 <- plot | |
}else if (i==3){ | |
p4 <- plot | |
}else if (i==4){ | |
p5 <- plot | |
} | |
} | |
grobbed <- arrangeGrob(p1, p2, p3,p4,p5, layout_matrix = cbind(c(1,1,2,4), c(1,1,3,5))) | |
ggsave('volcano_plots.png', grobbed, width = 9.2, height = 11.8, units = "in", dpi = 300) | |
``` | |
## Drawing the CpG islands on XIST, FIRRE, and DXZ4 | |
```{r} | |
xist_probe_pos <- c(10927, | |
30555, | |
30626, | |
30708, | |
32133, | |
32155, | |
32765, | |
32777, | |
33225) | |
ggplot() + | |
geom_rect(aes(xmin=0, ymin=1, ymax=2, xmax=32103)) + | |
geom_rect(aes(xmin=32103, ymin=1, ymax=2, xmax=32103+200, fill="TSS200")) + | |
geom_rect(aes(xmin=32103+200, ymin=1, ymax=2, xmax=32103+1500, fill="TSS1500")) + | |
geom_rect(aes(xmin=xist_probe_pos, ymin=1, ymax=2, xmax=xist_probe_pos+100, fill="CpG Probe")) + | |
scale_y_continuous(limits=c(0,4)) | |
``` | |
## Compiling sites into genes by only looking at promoter regions | |
```{r} | |
library(tidyverse) | |
gene_compiled_tss_sites_df <- tibble(gene=character(), | |
total_sites=integer(), | |
transition_1_inc_sites=integer(), | |
transition_1_dec_sites=integer(), | |
transition_2_inc_sites=integer(), | |
transition_2_dec_sites=integer(), | |
transition_3_inc_sites=integer(), | |
transition_3_dec_sites=integer(), | |
transition_4_inc_sites=integer(), | |
transition_4_dec_sites=integer(), | |
transition_5_inc_sites=integer(), | |
transition_5_dec_sites=integer(), | |
) | |
for (row_idx in 1:nrow(all_dmp_df)) { | |
gene <- unlist(all_dmp_df[row_idx,"mapped_gene"]) | |
dist_to_tss <- unlist(all_dmp_df[row_idx,"dist_to_tss"]) | |
delta_increasing <- unlist(all_dmp_df[row_idx,"delta_increasing"]) | |
key_transition <- unlist(all_dmp_df[row_idx,"key_transition"]) | |
alt_key_transition <- unlist(all_dmp_df[row_idx,"alt_key_transition"]) | |
alt_delta <- unlist(all_dmp_df[row_idx,"alt_delta"]) | |
refgene_group <- unlist(all_dmp_df[row_idx, "refgene_group"]) | |
passes_threshold <- unlist(all_dmp_df[row_idx, "passes_threshold"]) | |
if (grepl("TSS1500|TSS200|5'UTR", refgene_group) & (! is.na(gene)) & passes_threshold) { | |
# if (grepl("TSS1500|TSS200", refgene_group) & (! is.na(gene)) & (! is.na(alt_key_transition))) { | |
gene_row_idx <- 0 | |
if (!(gene %in% gene_compiled_tss_sites_df$gene)) { | |
# gene is not in gene_compiled_tss_sites_df, so add a row for it | |
gene_row_idx <- nrow(gene_compiled_tss_sites_df)+1 | |
gene_compiled_tss_sites_df[gene_row_idx, c("gene")] = gene | |
gene_compiled_tss_sites_df[is.na(gene_compiled_tss_sites_df)] <- 0 | |
}else { | |
i <- 1 | |
for (gene_name in gene_compiled_tss_sites_df$gene) { | |
if ((! is.na(gene_name)) && gene == gene_name) { | |
gene_row_idx <- i | |
break | |
} | |
i <- i+1 | |
} | |
} | |
gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] <- gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] + 1 | |
change <- "dec" | |
if (delta_increasing) { | |
# if (alt_delta > 0) { | |
change <- "inc" | |
} | |
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] <- | |
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] + 1 | |
# gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", alt_key_transition, change)] <- | |
# gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", alt_key_transition, change)] + 1 | |
} | |
} | |
gene_compiled_df <- gene_compiled_tss_sites_df | |
gene_compiled_df <- add_column(gene_compiled_df, cluster=c(0)*nrow(gene_compiled_df)) | |
# gene_clusters <- tibble(gene=character(), cluster=integer()) | |
for (row_idx in 1:nrow(gene_compiled_df)) { | |
gene <- unlist(gene_compiled_df[row_idx, "gene"]) | |
total <- unlist(gene_compiled_df[row_idx, "total_sites"]) | |
maximum_transition <- 0 | |
maximum_change <- "" | |
maximum_count <- 0 | |
for(key_transition in 1:5) { | |
for (change in c("inc", "dec")) { | |
transition <- sprintf("transition_%d_%s_sites", key_transition, change) | |
if (gene_compiled_df[row_idx, transition] > maximum_count) { | |
maximum_transition <- key_transition | |
maximum_change <- change | |
maximum_count <- as.integer(unlist(gene_compiled_df[row_idx, transition])) | |
} | |
} | |
} | |
cluster <- 10 | |
if (maximum_count / total >= .5) { | |
if (maximum_change == "dec") { | |
cluster <- maximum_transition - 1 + 5 | |
}else { | |
cluster <- maximum_transition - 1 | |
} | |
} | |
i <- 1 | |
for (gene_name in gene_compiled_df$gene) { | |
if ((! is.na(gene_name)) && gene == gene_name) { | |
gene_row_idx <- i | |
break | |
} | |
i <- i+1 | |
} | |
gene_compiled_df[i, "cluster"] <- cluster | |
} | |
write_csv(gene_compiled_df, "gene_compiled_df.csv") | |
``` | |
## Defining Escapee Genes | |
```{r} | |
library(tidyverse) | |
escape_expresison_status <- readxl::read_excel("GenesExpressionStatus.xlsx") | |
known_escapee_genes <- (escape_expresison_status %>% filter(`XCI stat` == "Escape" & `Gene name` %in% all_dmp_df$mapped_gene))$`Gene name` | |
roc <- tibble(threshold=numeric(), | |
TPR=numeric(), | |
FPR=numeric()) | |
for (thresh in seq(.1, 1, .01)) { | |
calculated_escapees <- all_dmp_df %>% filter((cluster_0_mean <= thresh) & grepl("TSS1500|TSS200", refgene_group) & (chr == "chrX")) | |
calculated_escapee_genes <- unique(calculated_escapees$mapped_gene) | |
false_positive <- (length(calculated_escapee_genes) - length(intersect(calculated_escapee_genes, known_escapee_genes))) / length(calculated_escapee_genes) | |
true_positive_rate <- length(intersect(calculated_escapee_genes, known_escapee_genes)) / length(known_escapee_genes) | |
roc <- add_row(roc, threshold=thresh, TPR=true_positive_rate, FPR=false_positive) | |
} | |
library(ggrepel) | |
ggplot(roc %>% filter(threshold < .8 & threshold > .1), aes(x=FPR, y=TPR)) + | |
geom_point() + | |
geom_line(size = 2, alpha = 0.5)+ | |
labs(title= "ROC curve", | |
x = "Percent False Positives", | |
y = "True Positive Rate (Sensitivity)") + | |
geom_text_repel(data=roc %>% filter(threshold < .8 & threshold >= .3 & TPR >.7 & (FPR) < .70),aes(label=threshold)) | |
ggsave("figs/escapee_roc.png") | |
ggplot(roc, aes(x=FPR, y=TPR)) + | |
geom_point() + | |
geom_line(size = 2, alpha = 0.5)+ | |
labs(title= "ROC curve", | |
x = "Percent False Positives", | |
y = "True Positive Rate (Sensitivity)") + | |
geom_text_repel(data=roc %>% filter(TPR >.1 & FPR < .75),aes(label=threshold)) | |
ggsave("figs/escapee_roc_wide.png") | |
``` | |
## Sanity check for escapee behavior | |
```{r} | |
escape_expresison_status <- readxl::read_excel("GenesExpressionStatus.xlsx") | |
known_escapee_genes <- (escape_expresison_status %>% filter(`XCI stat` == "Escape" & `Gene name` %in% all_dmp_df$mapped_gene))$`Gene name` | |
known_variable_genes <- (escape_expresison_status %>% filter(`XCI stat` == "Variable" & `Gene name` %in% all_dmp_df$mapped_gene))$`Gene name` | |
known_inactive_genes <- (escape_expresison_status %>% filter(`XCI stat` == "Inactive" & `Gene name` %in% all_dmp_df$mapped_gene))$`Gene name` | |
escape_dmp_df <- all_dmp_df %>% filter(mapped_gene %in% known_escapee_genes) | |
escape_promoter_methylation <- (escape_dmp_df %>% filter(grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean | |
escape_body_methylation <- (escape_dmp_df %>% filter(!grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean | |
variable_dmp_df <- all_dmp_df %>% filter(mapped_gene %in% known_variable_genes) | |
variable_promoter_methylation <- (variable_dmp_df %>% filter(grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean | |
variable_body_methylation <- (variable_dmp_df %>% filter(!grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean | |
inactive_dmp_df <- all_dmp_df %>% filter(mapped_gene %in% known_inactive_genes) | |
inactive_promoter_methylation <- (inactive_dmp_df %>% filter(grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean | |
inactive_body_methylation <- (inactive_dmp_df %>% filter(!grepl("TSS1500|TSS200", refgene_group)) %>% dplyr::select(cluster_0_mean))$cluster_0_mean | |
ggplot() + | |
geom_violin(aes(x="Escape Promoter", y=escape_promoter_methylation), fill='lightgreen', trim=FALSE, show.legend = F) + | |
geom_violin(aes(x="Escape Body", y=escape_body_methylation), fill='darkgreen', trim=FALSE, show.legend = F) + | |
geom_boxplot(aes(x="Escape Promoter", y=escape_promoter_methylation), width=0.025) + | |
geom_boxplot(aes(x="Escape Body", y=escape_body_methylation), width=0.025) + | |
geom_violin(aes(x="Variable Promoter", y=variable_promoter_methylation), fill='lightblue', trim=FALSE, show.legend = F) + | |
geom_violin(aes(x="Variable Body", y=variable_body_methylation), fill='darkblue', trim=FALSE, show.legend = F) + | |
geom_boxplot(aes(x="Variable Promoter", y=variable_promoter_methylation), width=0.025) + | |
geom_boxplot(aes(x="Variable Body", y=variable_body_methylation), width=0.025) + | |
geom_violin(aes(x="Inactive Promoter", y=inactive_promoter_methylation), fill='pink', trim=FALSE, show.legend = F) + | |
geom_violin(aes(x="Inactive Body", y=inactive_body_methylation), fill='darkred', trim=FALSE, show.legend = F) + | |
geom_boxplot(aes(x="Inactive Promoter", y=inactive_promoter_methylation), width=0.025) + | |
geom_boxplot(aes(x="Inactive Body", y=inactive_body_methylation), width=0.025) + | |
scale_x_discrete(limits = c("Escape Promoter", "Escape Body", "Variable Promoter", "Variable Body", "Inactive Promoter", "Inactive Body")) + | |
labs(x="", | |
y='Methylation Beta', | |
title="Distribution of Beta Values of Escapee Gene Promoter/Body") + | |
theme(plot.title = element_text(hjust = 0.5)) | |
ggsave("figs/beta_value_breakdown_by_gene_type.png") | |
``` | |
## Doing a PCA of probes to identify escapees | |
```{r} | |
library(tidyverse) | |
female_df_no_h9_chrX <- read_csv("female_df_no_h9_chrX.csv.gz") | |
male_df_no_outliers <- read_csv("male_df_no_outliers.csv.gz") | |
x_annotation_df <- read_csv("x_annotation.csv.gz") | |
male_df_no_outliers_chrX <- male_df_no_outliers %>% filter(rn %in% x_annotation_df$rowname) | |
sample_cluster_df <- read_csv("saved_sample_clusters.csv", | |
col_names=c('name','cluster')) | |
cluster_0_samples <- (sample_cluster_df %>% filter(cluster == 0))$name | |
female_df_no_h9_chrX_cluster_0 <- female_df_no_h9_chrX %>% dplyr::select(rn, !!cluster_0_samples) | |
combined_probe_df <- inner_join(female_df_no_h9_chrX_cluster_0, male_df_no_outliers_chrX, by="rn") | |
library(ggfortify) | |
library(broom) | |
mean_methylation_df <- tidy(rowMeans(column_to_rownames(na.omit(female_df_no_h9_chrX_cluster_0), var="rn"))) %>% dplyr::rename(female_mean=x) | |
mean_methylation_df <- inner_join(mean_methylation_df, | |
tidy(rowMeans(column_to_rownames(na.omit(male_df_no_outliers_chrX), var="rn"))) %>% dplyr::rename(male_mean=x), | |
by="names") %>% column_to_rownames(var="names") | |
autoplot(prcomp(column_to_rownames(combined_probe_df, var="rn")), data=mean_methylation_df, colour="female_mean", size="male_mean") | |
mean_methylation_df <- rownames_to_column(mean_methylation_df, var="rn") | |
mean_methylation_df <- left_join(mean_methylation_df, | |
x_annotation_df %>% dplyr::select(rowname, hg38_chromosome,hg38_gene_group, hg38_gene_name, hg38_ensembl_gene_id, hg38_pos), | |
by=c("rn"="rowname")) | |
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz") | |
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c")) | |
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id) %>% | |
mutate(chrom=paste("chr", chromosome_name, sep="")) | |
tss_df <- hg38_biomart_structure_df_canonical %>% group_by(ensembl_gene_id) %>% | |
summarise(TSS=dplyr::first(transcription_start_site), strand=dplyr::first(strand)) | |
mean_methylation_df <- left_join(mean_methylation_df, tss_df, by=c("hg38_ensembl_gene_id"="ensembl_gene_id")) | |
subject_training_set_cotton <- read.csv("xci_subject_genes.csv.gz", header=F)$V1 | |
escapee_training_set_cotton <- c("ACE2", | |
"CA5B", | |
"CXorf38", | |
"DDX3X", | |
"EIF1AX", | |
"FAM9C", | |
"FUNDC1", | |
"GPR112", | |
"HDHD1", | |
"HS6ST2", | |
"INE2", | |
"JPX", | |
"PNMA3", | |
"PNPLA4", | |
"RPS4X", | |
"SHROOM2", | |
"STS", | |
"SYAP1", | |
"TAF7L", | |
"UBA1", | |
"VENTXP1", | |
"VGLL1", | |
"ZFX", | |
"ZRSR2") | |
mean_methylation_df$training_designation = NA | |
mean_methylation_df[mean_methylation_df$hg38_gene_name %in% subject_training_set_cotton,]$training_designation <- "subject" | |
mean_methylation_df[mean_methylation_df$hg38_gene_name %in% escapee_training_set_cotton,]$training_designation <- "escape" | |
ggplot(mean_methylation_df %>% filter((male_mean < .25) & !is.na(training_designation) & strand*(hg38_pos - TSS) >= -1500 & strand*(hg38_pos - TSS) <= 3500), aes(x=strand*(hg38_pos - TSS), y=abs(female_mean - male_mean), color=training_designation)) + | |
geom_smooth(alpha=.4, span=.1) | |
plotting_df <- mean_methylation_df %>% | |
filter((male_mean < .25) & strand*(hg38_pos - TSS) >= -500 & strand*(hg38_pos - TSS) <= 1500) %>% | |
group_by(hg38_gene_name) %>% | |
summarise(female_mean=mean(female_mean), | |
male_mean=mean(male_mean), | |
training_designation=dplyr::first(training_designation)) | |
plotting_df$sex_delta <- abs(plotting_df$male_mean - plotting_df$female_mean) | |
training_labeller <- labeller(sex=function(s){ | |
return(list( | |
female_mean="Female", | |
male_mean="Male", | |
sex_delta="Sex Delta" | |
)) | |
}) | |
plotting_df_2 <- reshape2::melt(plotting_df, vars.name=c("hg38_gene_name", "training_designation"), value.name="DNAm", variable.name=c("sex")) | |
cotton_training_methylation_plot <- ggplot(plotting_df_2 %>% filter(!is.na(training_designation)), aes(y=DNAm, x=training_designation)) + | |
facet_grid(cols = vars(sex), labeller = training_labeller) + | |
geom_jitter(aes(color=sex), shape=20, size=.5) + | |
geom_boxplot(outlier.shape = NA, fill=NA) + | |
scale_color_npg(guide=F) + | |
labs(y="DNAme Averaged by Gene", | |
x="Training Designation", | |
title="DNAme of Cotton Training Genes") + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text = element_text(size=5, margin = margin(.7,0,.7,0, "mm")), | |
strip.background = element_rect(size = .3), | |
axis.text.x=element_text(angle=30, hjust=1, margin=margin(t=.3, unit="mm"))) | |
ggsave("figs/supplementary/sup2_cotton_escape_designation.pdf", cotton_training_methylation_plot, height = 33, width = 43.5, units = "mm") | |
# ggplot(plotting_df , aes(y=abs(female_mean-male_mean), x=training_designation)) + | |
# geom_boxplot() + | |
# geom_jitter() | |
true_escape_training_set <- (plotting_df %>% filter(female_mean < .25 & training_designation == "escape"))$hg38_gene_name | |
plotting_df_3 <- plotting_df_2 %>% | |
mutate(training_designation=replace(training_designation, training_designation=="escape", NA)) %>% | |
mutate(training_designation=replace(training_designation, hg38_gene_name %in% true_escape_training_set, "escape")) | |
pb_training_methylation_plot <- ggplot(plotting_df_3 %>% filter(!is.na(training_designation)), aes(y=DNAm, x=training_designation)) + | |
facet_grid(cols = vars(sex), labeller = training_labeller) + | |
geom_jitter(aes(color=sex), shape=20, size=.5) + | |
geom_boxplot(outlier.shape = NA, fill=NA) + | |
scale_color_npg(guide=F) + | |
labs(y="DNAme Averaged by Gene", | |
x="Training Designation", | |
title="DNAme of Our Training Genes") + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text = element_text(size=5, margin = margin(.7,0,.7,0, "mm")), | |
strip.background = element_rect(size = .3), | |
axis.text.x=element_text(angle=30, hjust=1, margin=margin(t=.3, unit="mm"))) | |
ggsave("figs/supplementary/sup2_pb_escape_designation.pdf", pb_training_methylation_plot, height = 33, width = 43.5, units = "mm") | |
training_methylation_df <- mean_methylation_df %>% | |
filter((male_mean < .25) & (strand*(hg38_pos - TSS) >= -500) & (strand*(hg38_pos - TSS) <= 1500) & | |
(hg38_gene_name %in% true_escape_training_set)) | |
mean_sex_delta <- mean((training_methylation_df %>% transmute(sex_delta=abs(female_mean - male_mean)))$sex_delta) | |
sd_sex_delta <- sd((training_methylation_df %>% transmute(sex_delta=abs(female_mean - male_mean)))$sex_delta) | |
mean_female_mean <- mean(training_methylation_df$female_mean) | |
sd_female_mean <- sd(training_methylation_df$female_mean) | |
mean_methylation_df_escape <- mean_methylation_df %>% | |
filter((male_mean < .25) & (strand*(hg38_pos - TSS) >= -500 & strand*(hg38_pos - TSS) <= 1500)) %>% | |
group_by(hg38_gene_name) %>% | |
summarise(female_mean=mean(female_mean), | |
male_mean=mean(male_mean), | |
hg38_ensembl_gene_id=dplyr::first(hg38_ensembl_gene_id)) %>% | |
filter((abs(female_mean - male_mean) < (mean_sex_delta + (3*sd_sex_delta))) & | |
(female_mean <= mean_female_mean + (3*sd_female_mean))) | |
mean_methylation_df_escape <- left_join(mean_methylation_df_escape, tss_df, by=c("hg38_ensembl_gene_id"="ensembl_gene_id")) | |
true_escapees <- unique(mean_methylation_df_escape$hg38_gene_name) | |
length(true_escapees) | |
plotting_df_4 <- plotting_df_2 %>% | |
mutate(training_designation=replace(training_designation, training_designation=="escape", NA)) %>% | |
mutate(training_designation=replace(training_designation, hg38_gene_name %in% true_escapees, "escape")) %>% | |
mutate(training_designation=replace(training_designation, !(hg38_gene_name %in% true_escapees), "subject")) | |
true_escapee_methylation_plot <- ggplot(plotting_df_4 %>% filter(!is.na(training_designation)), aes(y=DNAm, x=training_designation)) + | |
facet_grid(cols = vars(sex), labeller = training_labeller) + | |
geom_jitter(aes(color=sex), shape=20, size=.5) + | |
geom_boxplot(outlier.shape = NA, fill=NA) + | |
scale_color_npg(guide=F) + | |
labs(y="DNAme Averaged by Gene", | |
x="True Designation", | |
title="DNAme of Designated Genes") + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text = element_text(size=5, margin = margin(.7,0,.7,0, "mm")), | |
strip.background = element_rect(size = .3), | |
axis.text.x=element_text(angle=30, hjust=1, margin=margin(t=.3, unit="mm"))) | |
ggsave("figs/supplementary/sup2_true_escape.pdf", true_escapee_methylation_plot, height = 33, width = 43.5, units = "mm") | |
mean_methylation_df <- mutate(mean_methylation_df, is_promoter=strand*(hg38_pos - TSS) >= -500 & strand*(hg38_pos - TSS) <= 1500) | |
mean_methylation_df <- mean_methylation_df %>% mutate(true_designation=replace(training_designation, hg38_gene_name %in% true_escapees, "Escape")) | |
mean_methylation_df <- mean_methylation_df %>% mutate(true_designation=replace(true_designation, !(hg38_gene_name %in% true_escapees), "Subject")) | |
probe_methylation_dist <- ggplot(mean_methylation_df %>% filter(!is.na(hg38_gene_name)), aes(y=female_mean, x=true_designation, fill= (is_promoter))) + | |
geom_violin(size=.3) + | |
labs(title="Escape vs Subject Probes", | |
x="True Designation", | |
y="DNAme", | |
fill="Probe Type") + | |
scale_fill_rickandmorty(labels=c("Body", "Promoter")) + | |
theme_bw() + | |
base_plot_theme | |
ggsave("figs/supplementary/sup2_escape_methylation_dist.pdf", probe_methylation_dist, width = 43.5, height=33, units = "mm") | |
ggplot(plotting_df , aes(y=female_mean, x=hg38_gene_name %in% true_escapees)) + | |
geom_boxplot() + | |
geom_jitter() | |
ggplot(plotting_df , aes(y=abs(female_mean-male_mean), x=hg38_gene_name %in% true_escapees)) + | |
geom_boxplot() + | |
geom_jitter() | |
escapee_heatmap <- make_subset_heatmap(true_escapees) | |
write_csv(mean_methylation_df_escape, "mean_methylation_df_escape.csv") | |
# mean_methylation_df_escape <- left_join(mean_methylation_df_escape, | |
# x_annotation_df %>% select(rowname, hg38_chromosome,hg38_gene_group, hg38_gene_name, hg38_ensembl_gene_id, hg38_pos), | |
# by=c("rn"="rowname")) | |
``` | |
## Check where these clusters are happening/Defining PB Escapees | |
```{r} | |
library(tidyverse) | |
library(ggplot2) | |
all_dmp_df <- read_csv('all_dmp_df.csv.gz') | |
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz") | |
# do this based on genes and when the promoters change their methylation. | |
x_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz") %>% filter(hg38_chromosome == "chrX") | |
chr_x_dmp_df_short_arm <- all_dmp_df %>% filter((hg38_chr == "chrX") & (hg38_pos <= 58100000)) | |
short_arm_escapees <- mean_methylation_df_escape %>% filter(TSS <= 58100000) | |
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) { | |
pos <- unlist(chr_x_dmp_df_short_arm[row_idx, "hg38_pos"]) | |
min_distance <- min(abs(short_arm_escapees["TSS"] - pos)) | |
# min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos)) | |
chr_x_dmp_df_short_arm[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end) | |
} | |
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz") | |
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c")) | |
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id & | |
chromosome_name=="X" & | |
transcription_start_site <= 58100000) %>% | |
group_by(ensembl_gene_id) %>% | |
summarise(transcription_start_site=dplyr::first(transcription_start_site)) | |
n <- 1000 # how many rounds of random choosing I want | |
mean_min_distance_random_df_short_arm <- tibble(run=numeric(), | |
key_transition=numeric(), | |
mean_distance=numeric()) | |
for (i in 1:n) { | |
random_escapees <- sample(hg38_biomart_structure_df_canonical$ensembl_gene_id, nrow(short_arm_escapees)) | |
random_escapees_df <- hg38_biomart_structure_df_canonical %>% filter(ensembl_gene_id %in% random_escapees) | |
test_df <- chr_x_dmp_df_short_arm %>% group_by(rowname) %>% summarise( | |
key_transition=first(key_transition), | |
passes_threshold=first(passes_threshold), | |
min_dist_to_random_escapee=min(abs(random_escapees_df$transcription_start_site - hg38_pos)) | |
) | |
mean_min_distance_random_df_short_arm <- full_join(mean_min_distance_random_df_short_arm, test_df %>% group_by(key_transition) %>% | |
summarise(mean_distance=median(min_dist_to_random_escapee)) %>% mutate(run=i), | |
by=c("run", "key_transition", "mean_distance")) | |
# write_csv(test_df, "short_arm_pb_random_escapee_distances.csv", append=T, col_names=F) | |
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold")) | |
} | |
# permutation 1 p-values | |
p1_short_arm_pvals <- c() | |
for (t in 1:5) { | |
p <- nrow(mean_min_distance_random_df_short_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance <= median((chr_x_dmp_df_short_arm %>% filter(passes_threshold==T & | |
key_transition==t))$min_dist_to_pb_escapee)))) | |
p2 <- nrow(mean_min_distance_random_df_short_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance >= median((chr_x_dmp_df_short_arm %>% filter(passes_threshold==T & | |
key_transition==t))$min_dist_to_pb_escapee)))) | |
print(min(p,p2)/1000) | |
pval <- min(p,p2)/1000 | |
if (pval == 0) { | |
pval <- "p < .001" | |
}else { | |
pval <- sprintf("p = %.3f", pval) | |
} | |
p1_short_arm_pvals <- c(p1_short_arm_pvals, pval) | |
} | |
# distances_to_random_escapees_short_arm <- read_csv("short_arm_pb_random_escapee_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist")) | |
# distances_to_random_escapees_short_arm$type <- "random" | |
n <- 1000 # how many rounds of random choosing I want | |
mean_min_distance_random_probes_df_short_arm <- tibble(run=numeric(), | |
key_transition=numeric(), | |
mean_distance=numeric()) | |
for (i in 1:n) { | |
for (t in 1:5) { | |
random_short_arm_probes <- sample(chr_x_dmp_df_short_arm$rowname, nrow(chr_x_dmp_df_short_arm %>% filter(passes_threshold==T & key_transition==t))) | |
random_short_arm_probes_df <- chr_x_dmp_df_short_arm %>% filter(rowname %in% random_short_arm_probes) | |
test_df <- random_short_arm_probes_df %>% group_by(rowname) %>% summarise( | |
key_transition=t, | |
passes_threshold=T, | |
min_dist_to_random_escapee=min(abs(short_arm_escapees$TSS - hg38_pos)) | |
) | |
mean_min_distance_random_probes_df_short_arm <- add_row(mean_min_distance_random_probes_df_short_arm, | |
run=i, | |
key_transition=t, | |
mean_distance=median(test_df$min_dist_to_random_escapee)) | |
# write_csv(test_df, "short_arm_pb_random_probes_distances.csv", append=T, col_names=F) | |
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold")) | |
} | |
} | |
# permutation 2 p-values | |
p2_short_arm_pvals <- c() | |
for (t in 1:5) { | |
p <- nrow(mean_min_distance_random_probes_df_short_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance <= median((chr_x_dmp_df_short_arm %>% filter(passes_threshold==T & | |
key_transition==t))$min_dist_to_pb_escapee)))) | |
p2 <- nrow(mean_min_distance_random_probes_df_short_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance >= median((chr_x_dmp_df_short_arm %>% filter(passes_threshold==T & | |
key_transition==t))$min_dist_to_pb_escapee)))) | |
print(min(p,p2)/1000) | |
pval <- min(p,p2)/1000 | |
if (pval ==0) { | |
pval <- "p < .001" | |
}else { | |
pval <- sprintf("p = %.3f", pval) | |
} | |
p2_short_arm_pvals <- c(p2_short_arm_pvals, pval) | |
} | |
# distances_to_random_probes_short_arm <- read_csv("short_arm_pb_random_probes_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist")) | |
# distances_to_random_probes_short_arm$type <- "random_probes" | |
p1_shorm_arm_df <- tibble(key_transition = c(1,2,3,4,5), | |
pval = p1_short_arm_pvals, | |
x=0, | |
y=1) | |
p2_shorm_arm_df <- tibble(key_transition = c(1,2,3,4,5), | |
pval = p2_short_arm_pvals, | |
x=3.5e7, | |
y=1) | |
short_arm_permutation_plot <- ggplot(mean_min_distance_random_df_short_arm) + | |
geom_density(aes(mean_distance, y=..scaled..), | |
n = 1e4, | |
fill="darkgrey", | |
color="darkgray", | |
size=.3, | |
alpha=.5) + | |
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df_short_arm, | |
n = 1e4, | |
fill="grey", | |
color="grey", | |
size=.3, | |
alpha=.5) + | |
geom_text(data=p1_shorm_arm_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=-0.1, vjust=0) + | |
geom_text(data=p2_shorm_arm_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=1.1, vjust=0) + | |
facet_grid(key_transition~., scales = "free_y", labeller = labeller(key_transition = function(c){ | |
return(list( | |
"1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 5" | |
)[c]) | |
})) + | |
geom_vline(data=(chr_x_dmp_df_short_arm %>% filter(passes_threshold==T) %>% group_by(key_transition) %>% summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(xintercept=mean_distance), size=.3) + | |
geom_density(data=chr_x_dmp_df_short_arm %>% filter(passes_threshold==T), | |
mapping=aes(min_dist_to_pb_escapee, | |
y=..scaled.., | |
fill=as.factor(key_transition), | |
color=as.factor(key_transition)), | |
n = 1e4, alpha=.5, size=.3) + | |
geom_text(data=(chr_x_dmp_df_short_arm %>% | |
filter(passes_threshold==T) %>% | |
group_by(key_transition) %>% | |
summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(x=mean_distance, y=.5, label=sprintf("%.1f Mb", mean_distance / 1e6)), | |
# nudge_x=-200, | |
hjust=1.1, | |
size=1.2) + | |
theme_bw() + | |
# scale_x_continuous(breaks = c(0,.5,1,1.5,2,2.5,3,3.5)*1e7, labels = c(0,.5,1,1.5,2,2.5,3,3.5)*10) + | |
# scale_x_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8), labels = c(.01,.1,1,10,100)) + | |
scale_x_sqrt(breaks=c(1e6, 5e6, 1e7, 1.5e7, 2e7, 2.5e7, 3e7), labels=c(1, 5, 10, 15, 20, 25, 30), expand=c(0,0)) + | |
scale_y_continuous(limits = c(0,1.2), breaks = c(0,.5,1), minor_breaks = c(.25,.75)) + | |
scale_color_d3(guide=F) + | |
scale_fill_d3(guide=F) + | |
# scale_fill_material("teal", name="Transition", guide=F) + | |
# scale_color_material("teal", name="Transition", guide=F) + | |
labs(title="Minimum Distance of DMPs to Escapee on Short Arm", | |
x="Minimum Distance to Gene Set (Mb)", | |
y="Scaled Density") + | |
base_plot_theme + | |
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.spacing.y = unit(.75, "mm")) | |
ggsave("figs/main/min_dist_escapee_short_arm.pdf", short_arm_permutation_plot, width = 64, height=47.5, units = "mm") | |
# distances_to_pb_escapees_short_arm <- chr_x_dmp_df_short_arm %>% dplyr::select(rowname, key_transition, passes_threshold, min_dist=min_dist_to_pb_escapee) | |
# distances_to_pb_escapees_short_arm$type <- "pb_escapee" | |
# distances_to_pb_escapees_short_arm[distances_to_pb_escapees_short_arm$passes_threshold == F, "type"] <- "pb_escapee_no_thresh" | |
# all_distances_short_arm <- full_join(distances_to_random_escapees_short_arm, distances_to_pb_escapees_short_arm, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type")) | |
# all_distances_short_arm <- full_join(all_distances_short_arm, distances_to_random_probes_short_arm, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type")) | |
# library(ggpubr) | |
# library(ggsci) | |
# all_distances_short_arm$key_transition <- sprintf("Tran. %d", all_distances_short_arm$key_transition) | |
# short_arm_distances_plot <- ggplot(all_distances_short_arm %>% filter(type != "pb_escapee_no_thresh"), aes(x=type, y=min_dist, fill=type)) + | |
# geom_boxplot(outlier.shape = NA, size=.3, notch = T) + | |
# # geom_hline(yintercept = mean(filter(all_distances_short_arm, type=="pb_escapee_no_thresh")$min_dist), linetype="dashed", color=pal_futurama()(n=3)[3], show.legend = T, size=.3) + | |
# stat_compare_means(comparisons = list(c("pb_escapee", "random")), label="p.signif", size=1, tip.length = .02)+ | |
# # scale_y_continuous(breaks = c(0,.5,1,1.5,2,2.5,3,3.5)*1e7, labels = c(0,.5,1,1.5,2,2.5,3,3.5)*10) + | |
# scale_y_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8)) + | |
# scale_x_discrete(breaks=c("pb_escapee", "random_probes", "random"), labels=c("", "", "")) + | |
# facet_grid(~key_transition) + | |
# scale_fill_futurama(breaks=c("pb_escapee", "random", "random_probes"), labels=c("Short Arm Escapees", "100 Sets of 12 Random Short Arm Genes","100 Sets of Random Probes")) + | |
# theme_bw() + | |
# labs(y="Minimum Distance to Gene Set (Mb)", | |
# x="", | |
# fill="Gene Set", | |
# title="Minimum Distance to Escapee on Short Arm") + | |
# base_plot_theme + | |
# theme(strip.text = element_text(size=5, margin = margin(.5,0,.5,0, "mm")), | |
# strip.background = element_rect(size = .3), | |
# legend.text = element_text(size=4), | |
# legend.title = element_text(size=4, face="bold"), | |
# legend.key.width = unit(2, units="mm"), | |
# legend.key.height = unit(3, units="mm"), | |
# legend.key.size = unit(.2, units = "mm"), | |
# axis.title.x = element_text(margin = margin(0,0,0,0), size = 0), | |
# axis.text.x = element_text(margin = margin(0,0,0,0), size = 0), | |
# axis.ticks.length.x = unit(0, "mm")) | |
# ggsave("figs/main/4_short_arm_dist.svg", short_arm_distances_plot, width=64, height=45, units="mm") | |
``` | |
## Min escapee distance on long arm | |
```{r} | |
chr_x_dmp_df_long_arm <- all_dmp_df %>% filter((hg38_chr == "chrX") & (hg38_pos > 58100000)) | |
long_arm_escapees <- mean_methylation_df_escape %>% filter(TSS > 58100000) | |
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) { | |
pos <- unlist(chr_x_dmp_df_long_arm[row_idx, "hg38_pos"]) | |
min_distance <- min(abs(long_arm_escapees["TSS"] - pos)) | |
# min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos)) | |
chr_x_dmp_df_long_arm[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end) | |
} | |
# canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv") | |
# hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv", col_types = cols(chromosome_name="c")) | |
hg38_biomart_structure_df_canonical_long_arm <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id & | |
chromosome_name=="X" & | |
transcription_start_site > 58100000) %>% | |
group_by(ensembl_gene_id) %>% | |
summarise(transcription_start_site=dplyr::first(transcription_start_site)) | |
n <- 1000 # how many rounds of random choosing I want | |
mean_min_distance_random_df_long_arm <- tibble(run=numeric(), | |
key_transition=numeric(), | |
mean_distance=numeric()) | |
for (i in 1:n) { | |
random_escapees <- sample(hg38_biomart_structure_df_canonical_long_arm$ensembl_gene_id, nrow(long_arm_escapees)) | |
random_escapees_df <- hg38_biomart_structure_df_canonical_long_arm %>% filter(ensembl_gene_id %in% random_escapees) | |
test_df <- chr_x_dmp_df_long_arm %>% group_by(rowname) %>% summarise( | |
key_transition=dplyr::first(key_transition), | |
passes_threshold=dplyr::first(passes_threshold), | |
min_dist_to_random_escapee=min(abs(random_escapees_df$transcription_start_site - hg38_pos)) | |
) | |
mean_min_distance_random_df_long_arm <- full_join(mean_min_distance_random_df_long_arm, test_df %>% group_by(key_transition) %>% summarise(mean_distance=median(min_dist_to_random_escapee)) %>% mutate(run=i), by=c("run", "key_transition", "mean_distance")) | |
# write_csv(test_df, "long_arm_pb_random_escapee_distances.csv", append=T, col_names=F) | |
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold")) | |
} | |
# permutation 1 p-values | |
p1_pvals <- c() | |
for (t in 1:5) { | |
p <- nrow(mean_min_distance_random_df_long_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance <= median((chr_x_dmp_df_long_arm %>% filter(passes_threshold==T & | |
key_transition==t))$min_dist_to_pb_escapee)))) | |
p2 <- nrow(mean_min_distance_random_df_long_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance >= median((chr_x_dmp_df_long_arm %>% filter(passes_threshold==T & | |
key_transition==t))$min_dist_to_pb_escapee)))) | |
print(min(p,p2)/1000) | |
pval <- min(p,p2)/1000 | |
if (pval == 0) { | |
pval <- "p < 0.001" | |
}else { | |
pval <- sprintf("p = %.3f", pval) | |
} | |
p1_pvals <- c(p1_pvals, pval) | |
} | |
# distances_to_random_escapees_long_arm <- read_csv("long_arm_pb_random_escapee_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist")) | |
# distances_to_random_escapees_long_arm$type <- "random" | |
n <- 1000 # how many rounds of random choosing I want | |
mean_min_distance_random_probes_df_long_arm <- tibble(run=numeric(), | |
key_transition=numeric(), | |
mean_distance=numeric()) | |
for (i in 1:n) { | |
for (t in 1:5) { | |
random_long_arm_probes <- sample(chr_x_dmp_df_long_arm$rowname, nrow(chr_x_dmp_df_long_arm %>% filter(passes_threshold==T & key_transition==t))) | |
random_long_arm_probes_df <- chr_x_dmp_df_long_arm %>% filter(rowname %in% random_long_arm_probes) | |
test_df <- random_long_arm_probes_df %>% group_by(rowname) %>% summarise( | |
key_transition=t, | |
passes_threshold=T, | |
min_dist_to_random_escapee=min(abs(long_arm_escapees$TSS - hg38_pos)) | |
) | |
mean_min_distance_random_probes_df_long_arm <- add_row(mean_min_distance_random_probes_df_long_arm, | |
run=i, | |
key_transition=t, | |
mean_distance=median(test_df$min_dist_to_random_escapee)) | |
# write_csv(test_df, "long_arm_pb_random_probes_distances.csv", append=T, col_names=F) | |
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold")) | |
} | |
} | |
# distances_to_random_probes_long_arm <- read_csv("long_arm_pb_random_probes_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist")) | |
# distances_to_random_probes_long_arm$type <- "random_probes" | |
# permutation 2 p-values | |
p2_pvals <- c() | |
for (t in 1:5) { | |
p <- nrow(mean_min_distance_random_probes_df_long_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance <= median((chr_x_dmp_df_long_arm %>% filter(passes_threshold==T & | |
key_transition==t))$min_dist_to_pb_escapee)))) | |
p2 <- nrow(mean_min_distance_random_probes_df_long_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance >= median((chr_x_dmp_df_long_arm %>% filter(passes_threshold==T & | |
key_transition==t))$min_dist_to_pb_escapee)))) | |
print(min(p,p2)/1000) | |
pval <- min(p,p2)/1000 | |
if (pval == 0) { | |
pval <- "p < 0.001" | |
}else { | |
pval <- sprintf("p = %.3f", pval) | |
} | |
p2_pvals <- c(p2_pvals, pval) | |
} | |
p1_long_arm_df <- tibble(key_transition = c(1,2,3,4,5), | |
pval = p1_pvals, | |
x=0, | |
y=1) | |
p2_long_arm_df <- tibble(key_transition = c(1,2,3,4,5), | |
pval = p2_pvals, | |
x=6.5e7, | |
y=1) | |
long_arm_permutation_plot <- ggplot(mean_min_distance_random_df_long_arm) + | |
geom_density(aes(mean_distance, y=..scaled..), n = 1e4, | |
fill="darkgrey", color="darkgrey", size=.3, alpha=.5) + | |
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df_long_arm, | |
n = 1e4, fill="grey", color="grey", size=.3, alpha=.5) + | |
facet_grid(key_transition~., scales = "free_y", labeller = labeller(key_transition = function(c){ | |
return(list( | |
"1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 5" | |
)[c]) | |
})) + | |
geom_vline(data=(chr_x_dmp_df_long_arm %>% | |
filter(passes_threshold==T) %>% | |
group_by(key_transition) %>% | |
summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(xintercept=mean_distance), | |
size=.3) + | |
geom_density(data=chr_x_dmp_df_long_arm %>% filter(passes_threshold==T), | |
mapping=aes(min_dist_to_pb_escapee, y=..scaled.., | |
fill=as.factor(key_transition), | |
color=as.factor(key_transition)), | |
n = 1e4, alpha=.5, | |
size=.3) + | |
geom_text(data=(chr_x_dmp_df_long_arm %>% | |
filter(passes_threshold==T) %>% | |
group_by(key_transition) %>% | |
summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(x=mean_distance, y=.5, label=sprintf("%.1f Mb", mean_distance / 1e6)), | |
hjust=1.1, | |
size=1.2) + | |
geom_text(data=p1_long_arm_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=-0.1, vjust=0) + | |
geom_text(data=p2_long_arm_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=1.1, vjust=0) + | |
theme_bw() + | |
scale_x_sqrt(breaks=c(1e6, 5e6, 1e7, 1.5e7, 2e7, | |
3e7, 4e7, 5e7, 6e7), | |
labels=c(1, 5, 10, 15, 20, 30, 40, 50, 60), expand=c(0,0)) + | |
# scale_x_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8), labels = c(.01,.1,1,10,100)) + | |
scale_fill_d3(guide=F) + | |
scale_color_d3(guide=F) + | |
labs(title="Minimum Distance of DMPs to Escapee on Long Arm", | |
x="Minimum Distance to Gene Set (Mb)", | |
y="Scaled Density") + | |
base_plot_theme + | |
scale_y_continuous(limits=c(0,1.2), breaks = c(0,.5,1)) + | |
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
legend.text = element_text(size=4), | |
legend.title = element_text(size=4, face="bold"), | |
legend.key.width = unit(2, units="mm"), | |
legend.key.height = unit(2, units="mm"), | |
legend.key.size = unit(.2, units = "mm"), | |
panel.spacing.y = unit(.75, "mm")) | |
# axis.title.x = element_text(margin = margin(0,0,0,0), size = 0), | |
# axis.text.x = element_text(margin = margin(0,0,0,0), size = 0), | |
# axis.ticks.length.x = unit(0, "mm")) | |
ggsave("figs/main/min_dist_escapee_long_arm.pdf", long_arm_permutation_plot, width = 64, height=47.5, units = "mm") | |
distances_to_pb_escapees_long_arm <- chr_x_dmp_df_long_arm %>% dplyr::select(rowname, key_transition, passes_threshold, min_dist=min_dist_to_pb_escapee) | |
distances_to_pb_escapees_long_arm$type <- "pb_escapee" | |
distances_to_pb_escapees_long_arm[distances_to_pb_escapees_long_arm$passes_threshold == F, "type"] <- "pb_escapee_no_thresh" | |
all_distances_long_arm <- full_join(distances_to_random_escapees_long_arm, distances_to_pb_escapees_long_arm, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type")) | |
all_distances_long_arm <- full_join(all_distances_long_arm, distances_to_random_probes_long_arm, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type")) | |
library(ggpubr) | |
library(ggsci) | |
all_distances_long_arm$key_transition <- sprintf("Tran. %d", all_distances_long_arm$key_transition) | |
long_arm_escape_plot <- ggplot(all_distances_long_arm %>% filter(type != "pb_escapee_no_thresh"), aes(x=type, y=min_dist, fill=type)) + | |
geom_boxplot(outlier.shape = NA, size=.3, notch = T) + | |
stat_compare_means(comparisons = list(c("pb_escapee", "random")), label="p.signif", size=1, tip.length = .02)+#, label.y=2e8, tip.length = .02)+ | |
# scale_y_continuous(limits = c(0,5.5e7), breaks = (0:5)*1e7, labels = (0:5)*10)+#limits = c(5e4,1e8))+#, breaks=c(1e5,1e6,1e7,5e8)) + | |
scale_y_log10(limits = c(5e4,1e9), breaks=c(1e5,1e6,1e7,1e8, 1e9)) + | |
scale_x_discrete(breaks=c("pb_escapee", "random", "random_probes"), labels=c("", "", "")) + | |
# geom_hline(yintercept = mean(filter(all_distances_long_arm, type=="pb_escapee_no_thresh")$min_dist), linetype="dashed", color=pal_futurama()(n=3)[3], size=.3) + | |
facet_grid(~key_transition) + | |
scale_fill_futurama(breaks=c("pb_escapee", "random", "random_probes"), labels=c("Long Arm Escapees", "100 Sets of 3 Random Long Arm Genes", "100 Sets of Random Long Arm Probes")) + | |
theme_bw() + | |
labs(y="Minimum Distance to Gene Set (Mb)", | |
x="", | |
title="Minimum Distance to Escapee on Long Arm", | |
fill="Gene Set") + | |
base_plot_theme + | |
theme(strip.text = element_text(size=5, margin = margin(.5,0,.5,0, "mm")), | |
strip.background = element_rect(size = .3), | |
legend.text = element_text(size=4), | |
legend.title = element_text(size=4, face="bold"), | |
legend.key.width = unit(2, units="mm"), | |
legend.key.height = unit(3, units="mm"), | |
legend.key.size = unit(.2, units = "mm"), | |
axis.title.x = element_text(margin = margin(0,0,0,0), size = 0), | |
axis.text.x = element_text(margin = margin(0,0,0,0), size = 0), | |
axis.ticks.length.x = unit(0, "mm")) | |
ggsave("figs/main/4_long_arm_dist.svg", long_arm_escape_plot, width=64, height=45, units="mm") | |
probability_tib_long_arm <- tibble( | |
min_distance = numeric(), | |
transition = numeric(), | |
probability = numeric() | |
) | |
for (dist_to_escapee in seq(1, 7e7, 1e6)) { | |
# dist_to_escapee <- dist_to_escapee * 1e4 | |
sites_within_dist <- chr_x_dmp_df_long_arm %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_threshold == T)) | |
sites_within_dist_par <- chr_x_dmp_df_long_arm %>% filter((((hg19_pos - 2699520) <= dist_to_escapee)) & (passes_threshold == T)) | |
for (t in 1:5) { | |
prob <- nrow(sites_within_dist %>% filter(key_transition == t & passes_threshold == T)) / | |
nrow(chr_x_dmp_df_long_arm %>% filter(key_transition == t & passes_threshold == T)) | |
probability_tib_long_arm <- add_row(probability_tib_long_arm, | |
min_distance = dist_to_escapee, | |
transition = t, | |
probability = prob) | |
} | |
} | |
d50_df <- tibble(transition=numeric(),d50=numeric()) | |
for (t in 1:5) { | |
d50_dist <- as.numeric(chr_x_dmp_df_long_arm %>% | |
filter(key_transition == t & passes_threshold==T) %>% | |
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>% | |
dplyr::select(min_dist_to_pb_escapee)) | |
d50_df <- add_row(d50_df, transition=t, d50=d50_dist) | |
} | |
probability_of_erosion_long_arm <- ggplot(na.omit(probability_tib_long_arm), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) + | |
geom_line() + | |
labs(x = "Minimum Distance to Escapee (Mb)", | |
y = "Fraction of Transition Sites", | |
color = "Transition", | |
title="Probability of Erosion on Long Arm by\nDistance from Escapee") + | |
theme_bw() + | |
scale_color_futurama() + | |
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/supplementary/sup2_prob_escape_long_arm.pdf", probability_of_erosion_long_arm, width = 174/3, height=40, units = "mm") | |
``` | |
## Min escapee distance for full chromosome | |
```{r} | |
library(tidyverse) | |
library(ggplot2) | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz") | |
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c")) | |
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz") | |
chr_x_dmp_df <- all_dmp_df %>% filter((hg38_chr == "chrX")) | |
# long_arm_escapees <- mean_methylation_df_escape %>% filter(TSS > 58100000) | |
for (row_idx in 1:nrow(chr_x_dmp_df)) { | |
pos <- unlist(chr_x_dmp_df[row_idx, "hg38_pos"]) | |
min_distance <- min(abs(mean_methylation_df_escape["TSS"] - pos)) | |
# min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos)) | |
chr_x_dmp_df[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end) | |
} | |
hg38_biomart_structure_df_canonical_chrX <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id & | |
chromosome_name=="X") %>% | |
group_by(ensembl_gene_id) %>% | |
summarise(transcription_start_site=dplyr::first(transcription_start_site)) | |
n <- 1000 # how many rounds of random choosing I want | |
mean_min_distance_random_df <- tibble(run=numeric(), | |
key_transition=numeric(), | |
mean_distance=numeric()) | |
for (i in 1:n) { | |
random_escapees <- sample(hg38_biomart_structure_df_canonical_chrX$ensembl_gene_id, nrow(mean_methylation_df_escape)) | |
random_escapees_df <- hg38_biomart_structure_df_canonical_chrX %>% filter(ensembl_gene_id %in% random_escapees) | |
test_df <- chr_x_dmp_df %>% group_by(rowname) %>% summarise( | |
key_transition=dplyr::first(key_transition), | |
passes_threshold=dplyr::first(passes_threshold), | |
min_dist_to_random_escapee=min(abs(random_escapees_df$transcription_start_site - hg38_pos)) | |
) | |
# mean_min_distance_random <- c(mean_min_distance_random, mean(test_df$min_dist_to_random_escapee)) | |
mean_min_distance_random_df <- full_join(mean_min_distance_random_df, test_df %>% group_by(key_transition) %>% summarise(mean_distance=median(min_dist_to_random_escapee)) %>% mutate(run=i), by=c("run", "key_transition", "mean_distance")) | |
# write_csv(test_df, "chr_x_pb_random_escapee_distances.csv", append=T, col_names=F) | |
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold")) | |
} | |
distances_to_random_escapees_chr_x <- read_csv("chr_x_pb_random_escapee_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist")) | |
distances_to_random_escapees_chr_x$type <- "random" | |
n <- 1000 # how many rounds of random choosing I want | |
mean_min_distance_random_probes_df <- tibble(run=numeric(), | |
key_transition=numeric(), | |
mean_distance=numeric()) | |
for (i in 1:n) { | |
for (t in 1:5) { | |
random_chr_x_probes <- sample(chr_x_dmp_df$rowname, nrow(chr_x_dmp_df %>% filter(passes_threshold==T & key_transition==t))) | |
random_chr_x_probes_df <- chr_x_dmp_df %>% filter(rowname %in% random_chr_x_probes) | |
test_df <- random_chr_x_probes_df %>% group_by(rowname) %>% summarise( | |
key_transition=t, | |
passes_threshold=T, | |
min_dist_to_random_escapee=min(abs(mean_methylation_df_escape$TSS - hg38_pos)) | |
) | |
mean_min_distance_random_probes_df <- add_row(mean_min_distance_random_probes_df, | |
run=i, | |
key_transition=t, | |
mean_distance=median(test_df$min_dist_to_random_escapee)) | |
# write_csv(test_df, "chr_x_pb_random_probes_distances.csv", append=T, col_names=F) | |
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold")) | |
} | |
} | |
distances_to_random_probes_chr_x <- read_csv("chr_x_pb_random_probes_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist")) | |
distances_to_random_probes_chr_x$type <- "random_probes" | |
chrx_permutation_plot <- ggplot(mean_min_distance_random_df) + | |
geom_density(aes(mean_distance, y=..scaled..), n=1e3, fill="darkgrey", color="darkgrey") + | |
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df, n = 1e3, fill="lightgrey", color="lightgrey") + | |
facet_grid(key_transition~., scale="free_y") + | |
geom_vline(data=(chr_x_dmp_df %>% filter(passes_threshold==T) %>% group_by(key_transition) %>% summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(xintercept=mean_distance, color=factor(key_transition)), linetype="dashed") + | |
geom_density(data=chr_x_dmp_df %>% filter(passes_threshold==T), mapping=aes(min_dist_to_pb_escapee, y=..scaled.., fill=factor(key_transition), color=factor(key_transition)), n = 1e3, alpha=.5) + | |
# geom_vline(data=chr_x_dmp_df %>% filter(passes_threshold==T), mapping=aes(xintercept=median(min_dist_to_pb_escapee), color=factor(key_transition)), | |
# linetype="dashed", size=1) + | |
theme_bw() | |
distances_to_pb_escapees_chr_x <- chr_x_dmp_df %>% dplyr::select(rowname, key_transition, passes_threshold, min_dist=min_dist_to_pb_escapee) | |
distances_to_pb_escapees_chr_x$type <- "pb_escapee" | |
distances_to_pb_escapees_chr_x[distances_to_pb_escapees_chr_x$passes_threshold == F, "type"] <- "pb_escapee_no_thresh" | |
all_distances_chr_x <- full_join(distances_to_random_escapees_chr_x, distances_to_pb_escapees_chr_x, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type")) | |
all_distances_chr_x <- full_join(all_distances_chr_x, distances_to_random_probes_chr_x, by=c("rowname", "key_transition", "passes_threshold", "min_dist", "type")) | |
library(ggpubr) | |
library(ggsci) | |
all_distances_chr_x$key_transition <- sprintf("Tran. %d", all_distances_chr_x$key_transition) | |
chr_x_escape_plot <- ggplot(all_distances_chr_x %>% filter(type != "pb_escapee_no_thresh"), aes(x=type, y=min_dist, fill=type)) + | |
geom_boxplot(outlier.shape = NA, size=.3, notch = T) + | |
stat_compare_means(comparisons = list(c("pb_escapee", "random")), label="p.signif", size=1, tip.length = .02)+#, label.y=2e8, tip.length = .02)+ | |
# scale_y_continuous(limits = c(0,5.5e7), breaks = (0:5)*1e7, labels = (0:5)*10)+#limits = c(5e4,1e8))+#, breaks=c(1e5,1e6,1e7,5e8)) + | |
scale_y_log10(limits = c(1e4,5e8))+#, breaks=c(1e5,1e6,1e7,1e8, 1e9)) + | |
scale_x_discrete(breaks=c("pb_escapee", "random", "random_probes"), labels=c("", "", "")) + | |
# geom_hline(yintercept = mean(filter(all_distances_long_arm, type=="pb_escapee_no_thresh")$min_dist), linetype="dashed", color=pal_futurama()(n=3)[3], size=.3) + | |
facet_grid(~key_transition) + | |
scale_fill_futurama(breaks=c("pb_escapee", "random", "random_probes"), labels=c("Chr X Escapees", "100 Sets of 15 Random Long Arm Genes", "100 Sets of Random X Chr Probes")) + | |
theme_bw() + | |
labs(y="Minimum Distance to Gene Set (Mb)", | |
x="", | |
title="Minimum Distance to Escapee on Long Arm", | |
fill="Gene Set") + | |
base_plot_theme + | |
theme(strip.text = element_text(size=5, margin = margin(.5,0,.5,0, "mm")), | |
strip.background = element_rect(size = .3), | |
legend.text = element_text(size=4), | |
legend.title = element_text(size=4, face="bold"), | |
legend.key.width = unit(2, units="mm"), | |
legend.key.height = unit(3, units="mm"), | |
legend.key.size = unit(.2, units = "mm"), | |
axis.title.x = element_text(margin = margin(0,0,0,0), size = 0), | |
axis.text.x = element_text(margin = margin(0,0,0,0), size = 0), | |
axis.ticks.length.x = unit(0, "mm")) | |
ggsave("figs/main/chr_x_escape_plot.svg", chr_x_escape_plot, width=64, height=45, units="mm") | |
# Probability of demethylation | |
probability_tib <- tibble( | |
min_distance = numeric(), | |
transition = numeric(), | |
probability = numeric(), | |
probability_par = numeric() | |
) | |
for (dist_to_escapee in seq(1, 7e7, 1e4)) { | |
# dist_to_escapee <- dist_to_escapee * 1e4 | |
sites_within_dist <- chr_x_dmp_df %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_threshold == T)) | |
sites_within_dist_par <- chr_x_dmp_df %>% filter((((hg19_pos - 2699520) <= dist_to_escapee)) & (passes_threshold == T)) | |
for (t in 1:5) { | |
prob <- nrow(sites_within_dist %>% filter(key_transition == t & passes_threshold == T)) / | |
nrow(chr_x_dmp_df %>% filter(key_transition == t & passes_threshold == T)) | |
prob_par <- nrow(sites_within_dist_par %>% filter(key_transition == t & passes_threshold == T)) / | |
nrow(chr_x_dmp_df %>% filter(key_transition == t & passes_threshold == T)) | |
probability_tib <- add_row(probability_tib, | |
min_distance = dist_to_escapee, | |
transition = t, | |
probability = prob, | |
probability_par = prob_par) | |
} | |
} | |
d50_df <- tibble(transition=numeric(),d50=numeric()) | |
for (t in 1:5) { | |
d50_dist <- as.numeric(chr_x_dmp_df %>% | |
filter(key_transition == t & passes_threshold==T) %>% | |
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>% | |
dplyr::select(min_dist_to_pb_escapee)) | |
d50_df <- add_row(d50_df, transition=t, d50=d50_dist) | |
} | |
probability_of_erosion <- ggplot(na.omit(probability_tib), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) + | |
geom_line() + | |
labs(x = "Minimum Distance to Escapee (Mb)", | |
y = "Fraction of Transition Sites", | |
color = "Transition", | |
title="Probability of Erosion on Whole\nChromosome by Distance from Escapee") + | |
theme_bw() + | |
scale_color_futurama() + | |
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) + | |
base_plot_theme | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/supplementary/sup2_prob_escape_whole_x.pdf", probability_of_erosion, width = 58, height=40, units = "mm") | |
``` | |
## Probability of demethylation based on escapee distance | |
```{r} | |
probability_tib <- tibble( | |
min_distance = numeric(), | |
transition = numeric(), | |
probability = numeric(), | |
probability_par = numeric() | |
) | |
for (dist_to_escapee in seq(1, 7e7, 5e6)) { | |
# dist_to_escapee <- dist_to_escapee * 1e4 | |
sites_within_dist <- chr_x_dmp_df_short_arm %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_threshold == T)) | |
sites_within_dist_par <- chr_x_dmp_df_short_arm %>% filter((((hg19_pos - 2699520) <= dist_to_escapee)) & (passes_threshold == T)) | |
for (t in 1:5) { | |
prob <- nrow(sites_within_dist %>% filter(key_transition == t & passes_threshold == T)) / | |
nrow(chr_x_dmp_df_short_arm %>% filter(key_transition == t & passes_threshold == T)) | |
prob_par <- nrow(sites_within_dist_par %>% filter(key_transition == t & passes_threshold == T)) / | |
nrow(chr_x_dmp_df_short_arm %>% filter(key_transition == t & passes_threshold == T)) | |
probability_tib <- add_row(probability_tib, | |
min_distance = dist_to_escapee, | |
transition = t, | |
probability = prob, | |
probability_par = prob_par) | |
} | |
} | |
d50_df_short_arm <- tibble(transition=numeric(),d50=numeric()) | |
for (t in 1:5) { | |
d50_dist <- as.numeric(chr_x_dmp_df_short_arm %>% | |
filter(key_transition == t & passes_threshold==T) %>% | |
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>% | |
dplyr::select(min_dist_to_pb_escapee)) | |
d50_df_short_arm <- add_row(d50_df_short_arm, transition=t, d50=d50_dist) | |
} | |
probability_of_erosion_short_arm <- ggplot(na.omit(probability_tib), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) + | |
geom_line() + | |
labs(x = "Minimum Distance to Escapee (Mb)", | |
y = "Fraction of Transition Sites", | |
color = "Transition", | |
title="Probability of Erosion on Short Arm by\nDistance from Escapee") + | |
theme_bw() + | |
scale_color_futurama() + | |
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) + | |
base_plot_theme | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/supplementary/sup2_prob_escape_short_arm.pdf", probability_of_erosion_short_arm, width = 174/3, height=40, units = "mm") | |
probability_of_erosion_short_arm_par <- ggplot(na.omit(probability_tib), aes(x=min_distance, y=probability_par, group=transition, color=as.factor(transition))) + | |
geom_line() + | |
labs(x = "Minimum Distance to PAR (Mb)", | |
y = "Fraction of Transition Sites", | |
color = "Transition", | |
title="Probability of Erosion on Short Arm by\nDistance from PAR") + | |
theme_bw() + | |
scale_color_futurama() + | |
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/supplementary/sup2_prob_escape_short_arm_par.svg", probability_of_erosion_short_arm_par, width = 174/4, height=50, units = "mm") | |
``` | |
## BElow is an old block that isn't being currently used | |
```{r} | |
chr_x_dmp_df_long_arm <- all_dmp_df %>% filter((chr == "chrX") & (pos > 58100000)) | |
long_arm_escapees <- gene_compiled_df %>% filter(gene %in% c("DANT2", "FIRRE")) | |
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) { | |
pos <- unlist(chr_x_dmp_df_long_arm[row_idx, "pos"]) | |
min_distance <- min(abs(long_arm_escapees["start_position"] - pos)) | |
min_distance_end <- min(abs(long_arm_escapees["end_position"] - pos)) | |
chr_x_dmp_df_long_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end) | |
} | |
ggplot(chr_x_dmp_df_long_arm, aes(x=as.factor(key_transition), y=min_dist_to_escapee, fill=passes_threshold)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
stat_compare_means(aes(group = passes_threshold), label="p.format")+ | |
# geom_point(aes(fill=random), position = position_jitterdodge()) + | |
# geom_signif(map_signif_level=T, | |
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
for (row_idx in 1:nrow(chr_x_dmp_df)) { | |
pos <- unlist(chr_x_dmp_df[row_idx, "pos"]) | |
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% escapee, "start_position"] - pos)) | |
min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "end_position"] - pos)) | |
# min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "start_position"] - pos)) | |
# min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "end_position"] - pos)) | |
chr_x_dmp_df[row_idx, "min_dist_to_cotton_escapee"] <- min(min_positions) | |
} | |
all_x_genes <- gene_compiled_df[gene_compiled_df$cluster == 10, "gene"]$gene | |
distances <- tibble( | |
key_transition = integer(), | |
min_distance = numeric(), | |
delta_increasing = numeric(), | |
pos = numeric(), | |
passes_threshold = logical() | |
) | |
for (n in 1:100) { | |
test_set <- sample(all_x_genes, 16) | |
for (row_idx in 1:nrow(chr_x_dmp_df)) { | |
pos <- unlist(chr_x_dmp_df[row_idx, "pos"]) | |
key_transition <- unlist(chr_x_dmp_df[row_idx, "key_transition"]) | |
delta_increasing <- unlist(chr_x_dmp_df[row_idx, "delta_increasing"]) | |
passes_threshold <- unlist(chr_x_dmp_df[row_idx, "passes_threshold"]) | |
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% test_set, "start_position"] - pos)) | |
min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% test_set, "end_position"] - pos)) | |
if (is.na(chr_x_dmp_df[row_idx, "min_dist_to_test"])) { | |
chr_x_dmp_df[row_idx, "min_dist_to_test"] <- 0.0 | |
} | |
distances <- add_row(distances, | |
key_transition=key_transition, | |
min_distance = min(min_distance, min_distance_end), | |
pos=pos, | |
delta_increasing=delta_increasing, | |
passes_threshold=passes_threshold) | |
} | |
} | |
# chr_x_dmp_df["min_dist_to_test"] <- chr_x_dmp_df["min_dist_to_test"] / 100 | |
write_csv(distances, "distances_test.csv") | |
ggplot(distances %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000 ), aes(x=key_transition, group=key_transition, y=min_distance)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
geom_point() + | |
geom_signif(map_signif_level=T, | |
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
distances <- read_csv("distances_test.csv") | |
all_distances <- tibble( | |
key_transition = integer(), | |
min_distance = numeric(), | |
delta_increasing = numeric(), | |
pos = numeric(), | |
passes_threshold = logical(), | |
random = logical() | |
) | |
all_distances <- distances | |
all_distances["random"] <- "random_escapees" | |
for (row_idx in 1:nrow(chr_x_dmp_df)) { | |
random <- "eroding" | |
if (unlist(chr_x_dmp_df[row_idx, "passes_threshold"]) == F) { | |
random <- "non-eroding" | |
} | |
all_distances <- add_row(all_distances, | |
key_transition=unlist(chr_x_dmp_df[row_idx, "key_transition"]), | |
min_distance=unlist(chr_x_dmp_df[row_idx, "min_dist_to_escapee"]), | |
random=random, | |
delta_increasing=unlist(chr_x_dmp_df[row_idx, "delta_increasing"]), | |
pos=unlist(chr_x_dmp_df[row_idx, "pos"]), | |
passes_threshold=unlist(chr_x_dmp_df[row_idx, "passes_threshold"])) | |
} | |
chr_x_dmp_df_short_arm$sig_x_site <- chr_x_dmp_df_short_arm$X1 %in% sig_x_sites | |
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) { | |
random <- "eroding" | |
if (unlist(chr_x_dmp_df_short_arm[row_idx, "passes_threshold"]) == F) { | |
random <- "non-eroding" | |
} | |
all_distances <- add_row(all_distances, | |
key_transition=unlist(chr_x_dmp_df_short_arm[row_idx, "key_transition"]), | |
min_distance=unlist(chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"]), | |
random=random, | |
delta_increasing=unlist(chr_x_dmp_df_short_arm[row_idx, "delta_increasing"]), | |
pos=unlist(chr_x_dmp_df_short_arm[row_idx, "pos"]), | |
passes_threshold=unlist(chr_x_dmp_df_short_arm[row_idx, "passes_threshold"])) | |
} | |
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) { | |
all_distances <- add_row(all_distances, | |
key_transition=unlist(chr_x_dmp_df_long_arm[row_idx, "key_transition"]), | |
min_distance=unlist(chr_x_dmp_df_long_arm[row_idx, "min_dist_to_escapee"]), | |
random=FALSE, | |
delta_increasing=unlist(chr_x_dmp_df_long_arm[row_idx, "delta_increasing"]), | |
pos=unlist(chr_x_dmp_df_long_arm[row_idx, "pos"]), | |
passes_threshold=unlist(chr_x_dmp_df_long_arm[row_idx, "passes_threshold"])) | |
} | |
library(ggpubr) | |
comparisons = list(c("eroding", "non-eroding"), c("eroding", "random_escapees")) | |
ggplot(all_distances %>% filter(pos <= 58100000), aes(x=random, y=min_distance, fill=random)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
stat_compare_means(comparisons = list(c("eroding", "non-eroding"), c("eroding", "random_escapees")), label="p.format", size=2)+ | |
# geom_point(aes(fill=random), position = position_jitterdodge()) + | |
# geom_signif(map_signif_level=T, | |
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() + | |
facet_grid(~key_transition) + | |
scale_fill_futurama() + | |
theme_bw() + | |
base_plot_theme + | |
labs(y="Minimum Distance to Escapee", | |
x="", | |
title="Minimum Distance to Escapee on Short Arm") + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3)) | |
ggsave("figs/min_dist_short_arm.png", width=190, height=190, units="mm", dpi = 1000) | |
ggplot(chr_x_dmp_df_short_arm, aes(x=as.factor(key_transition), y=min_dist_to_escapee, fill=sig_x_site)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
stat_compare_means(aes(group = sig_x_site), label="p.format")+ | |
# geom_point(aes(fill=random), position = position_jitterdodge()) + | |
# geom_signif(map_signif_level=T, | |
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=key_transition, group=key_transition, y=min_dist_to_cotton_escapee, fill="min_cotton")) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
geom_boxplot(data=distances %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), mapping=aes(x=key_transition, group=key_transition, y=min_distance, fill="min_random")) + | |
geom_point() + | |
geom_signif(map_signif_level=T, | |
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=min_dist_to_cotton_escapee, y=abs(delta))) + | |
# geom_smooth(aes(color=as.factor(key_transition))) + | |
# geom_smooth(color="black") + | |
geom_point(aes(color=as.factor(key_transition))) + | |
scale_x_log10() | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE), aes(x=min_dist_to_escapee, y=abs(delta))) + | |
geom_point(aes(color=as.factor(key_transition))) + | |
scale_x_log10() | |
# y_position=c(1.2e7,1.21e7,1.21e7,1.21e7) | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=key_transition, group=key_transition, y=min_dist_to_escapee)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
geom_point() + | |
geom_signif(map_signif_level=T, | |
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=min_dist_to_escapee, y=abs(delta))) + | |
# geom_smooth(aes(color=as.factor(key_transition))) + | |
# geom_smooth(color="black") + | |
geom_point(aes(color=as.factor(key_transition))) + | |
scale_x_log10() | |
for (row_idx in 1:nrow(gene_compiled_df)) { | |
gene <- unlist(gene_compiled_df[row_idx, "gene"]) | |
total <- unlist(gene_compiled_df[row_idx, "total_sites"]) | |
start_pos <- unlist(gene_compiled_df[row_idx, "start_position"]) | |
cluster <- unlist(gene_compiled_df[row_idx, "cluster"]) | |
if (cluster <= 9) { | |
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% 11, "start_position"] - start_pos)) | |
gene_compiled_df[row_idx, "min_dist_to_escapee"] <- min_distance | |
} | |
} | |
library(ggsignif) | |
ggplot(gene_compiled_df %>% filter((chr=="chrX") & (cluster %in% c(5,6,7,8,9)))) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot(aes(x=cluster, group=cluster, y=min_dist_to_escapee)) + | |
geom_point(aes(x=cluster, y=min_dist_to_escapee)) + | |
geom_signif(aes(x=cluster, group=cluster, y=min_dist_to_escapee), | |
map_signif_level=F, comparisons = list(c(5,6), c(5,7), c(5,8), c(5,9))) + | |
scale_y_log10() | |
mean_distances <- gene_compiled_df %>% group_by(cluster) %>% summarise( | |
min_dist_to_escapee = mean(na.omit(min_dist_to_escapee))) | |
ggplot(gene_compiled_df[(gene_compiled_df$chr == "chrX") & (gene_compiled_df$cluster %in% c(5,6,7,8,9)),]) + | |
geom_bar() | |
geom_freqpoly(aes(start_position, colour=as.factor(cluster)), binwidth = 1500000) + | |
labs(x="Location on X Chr", | |
y='Frequency Count', | |
title='Location of X Chromosome CpG sites that are Decreasing in Methylation') | |
plt3 <- ggplot(all_dmp_df[(all_dmp_df$chr == "chrX") & (all_dmp_df$passes_threshold == F)& (all_dmp_df$delta_increasing == F),]) + | |
geom_freqpoly(aes(pos),binwidth = 1500000) + | |
labs(x="Location on X Chr", | |
y='Frequency Count', | |
title=sprintf('Location of X Chromosome CpG sites that are Decreasing in Methylation', i-1, i)) + | |
theme(plot.title = element_text(hjust = 0.5))+ | |
scale_color_brewer(palette='Spectral') | |
``` | |
```{r old code probably for deletion} | |
library(tidyverse) | |
library(ggplot2) | |
all_dmp_df <- read_csv('dmp_transition_info.csv') | |
# do this based on genes and when the promoters change their methylation. | |
x_annotation_df <- read_csv("x_annotation.csv.gz") | |
statistics <- read.csv('variance_statistics_noh9_nooutliers.csv.gz') | |
positions_to_keep <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] # returns the density data | |
sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname) | |
print(length(sig_x_sites)) | |
sig_annotation <- x_annotation_df[x_annotation_df$Name %in% sig_x_sites,] | |
x_sig_sites <- intersect(positions_to_keep, x_annotation_df$rowname) | |
all_dmp_df$escapee <- (all_dmp_df$chr == "chrX") & !(all_dmp_df$X1 %in% x_sig_sites) & | |
(grepl("TSS1500|TSS200|5'UTR", all_dmp_df$refgene_group) & | |
(rowMeans(all_dmp_df[c("cluster_0_mean", | |
"cluster_1_mean", | |
"cluster_2_mean", | |
"cluster_3_mean", | |
"cluster_4_mean", | |
"cluster_5_mean")]) < .25)) | |
#group by genes | |
gene_compiled_tss_sites_df <- tibble(gene=character(), | |
chr=character(), | |
total_sites=integer(), | |
total_sites_passing_threshold=integer(), | |
num_escapees=integer(), | |
mean_promoter_methylation=numeric(), | |
transition_1_inc_sites=integer(), | |
transition_1_dec_sites=integer(), | |
transition_2_inc_sites=integer(), | |
transition_2_dec_sites=integer(), | |
transition_3_inc_sites=integer(), | |
transition_3_dec_sites=integer(), | |
transition_4_inc_sites=integer(), | |
transition_4_dec_sites=integer(), | |
transition_5_inc_sites=integer(), | |
transition_5_dec_sites=integer(), | |
) | |
for (row_idx in 1:nrow(all_dmp_df)) { | |
gene <- unlist(all_dmp_df[row_idx, "mapped_gene"]) | |
# genes <- strsplit(unlist(all_dmp_df[row_idx, "gene"]), ";") | |
refgene_group <- unlist(all_dmp_df[row_idx, "refgene_group"]) | |
chr <- unlist(all_dmp_df[row_idx, "chr"]) | |
dist_to_tss <- unlist(all_dmp_df[row_idx, "dist_to_tss"]) | |
delta_increasing <- | |
unlist(all_dmp_df[row_idx, "delta_increasing"]) | |
key_transition <- unlist(all_dmp_df[row_idx, "key_transition"]) | |
alt_key_transition <- | |
unlist(all_dmp_df[row_idx, "alt_key_transition"]) | |
alt_delta <- unlist(all_dmp_df[row_idx, "alt_delta"]) | |
passes_threshold <- | |
unlist(all_dmp_df[row_idx, "passes_threshold"]) | |
more_variant <- | |
unlist(all_dmp_df[row_idx, "X1"]) %in% positions_to_keep | |
escapee <- unlist(all_dmp_df[row_idx, "escapee"]) | |
mean_methylation <- unlist(rowMeans(all_dmp_df[row_idx, c("cluster_0_mean", | |
"cluster_1_mean", | |
"cluster_2_mean", | |
"cluster_3_mean", | |
"cluster_4_mean", | |
"cluster_5_mean")])) | |
if (grepl("TSS1500|TSS200|5'UTR", refgene_group) & | |
(!is.na(gene))) { | |
# if (grepl("TSS1500|TSS200", refgene_group) & (! is.na(gene)) & (! is.na(alt_key_transition))) { | |
gene_row_idx <- 0 | |
if (!(gene %in% gene_compiled_tss_sites_df$gene)) { | |
# gene is not in gene_compiled_tss_sites_df, so add a row for it | |
gene_row_idx <- nrow(gene_compiled_tss_sites_df) + 1 | |
gene_compiled_tss_sites_df[gene_row_idx, c("gene")] = gene | |
gene_compiled_tss_sites_df[gene_row_idx, c("chr")] = chr | |
gene_compiled_tss_sites_df[is.na(gene_compiled_tss_sites_df)] <- | |
0 | |
} else { | |
i <- 1 | |
for (gene_name in gene_compiled_tss_sites_df$gene) { | |
if ((!is.na(gene_name)) && gene == gene_name) { | |
gene_row_idx <- i | |
break | |
} | |
i <- i + 1 | |
} | |
} | |
gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] <- | |
gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] + 1 | |
gene_compiled_tss_sites_df[gene_row_idx, "mean_promoter_methylation"] <- | |
gene_compiled_tss_sites_df[gene_row_idx, "mean_promoter_methylation"] + mean_methylation | |
change <- "dec" | |
if (delta_increasing) { | |
# if (alt_delta > 0) { | |
change <- "inc" | |
} | |
if (escapee == TRUE) { | |
gene_compiled_tss_sites_df[gene_row_idx, "num_escapees"] <- | |
gene_compiled_tss_sites_df[gene_row_idx, "num_escapees"] + 1 | |
} else if (passes_threshold) { | |
gene_compiled_tss_sites_df[gene_row_idx, "total_sites_passing_threshold"] <- | |
gene_compiled_tss_sites_df[gene_row_idx, "total_sites_passing_threshold"] + 1 | |
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] <- | |
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] + 1 | |
} | |
} | |
} | |
gene_compiled_tss_sites_df["mean_promoter_methylation"] <- gene_compiled_tss_sites_df["mean_promoter_methylation"] / | |
gene_compiled_tss_sites_df["total_sites"] | |
# write_csv(gene_compiled_tss_sites_df, "gene_compiled_tss_sites.csv") | |
gene_compiled_df <- gene_compiled_tss_sites_df | |
# add gene positions | |
library('biomaRt') | |
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) | |
genes <- gene_compiled_df$gene | |
G_list <- getBM(filters= c("hgnc_symbol"), attributes= c("ensembl_gene_id", | |
"hgnc_symbol", "chromosome_name", "start_position", "end_position", "strand", "description"),values=genes,mart= mart) | |
#turn into inner join | |
gene_compiled_df <- inner_join(gene_compiled_df,G_list,by=c("gene"="hgnc_symbol")) | |
gene_compiled_df <- add_column(gene_compiled_df, cluster=c(0)*nrow(gene_compiled_df)) | |
# gene_clusters <- tibble(gene=character(), cluster=integer()) | |
for (row_idx in 1:nrow(gene_compiled_df)) { | |
gene <- unlist(gene_compiled_df[row_idx, "gene"]) | |
chr <- unlist(gene_compiled_df[row_idx, "chr"]) | |
total_sites <- unlist(gene_compiled_df[row_idx, "total_sites"]) | |
total_sites_passing_threshold <- unlist(gene_compiled_df[row_idx, "total_sites_passing_threshold"]) | |
num_escapees <- unlist(gene_compiled_df[row_idx, "num_escapees"]) | |
mean_promoter_methylation <- unlist(gene_compiled_df[row_idx, "mean_promoter_methylation"]) | |
maximum_transition <- 0 | |
maximum_change <- "" | |
maximum_count <- 0 | |
for(key_transition in 1:5) { | |
for (change in c("inc", "dec")) { | |
transition <- sprintf("transition_%d_%s_sites", key_transition, change) | |
if (gene_compiled_df[row_idx, transition] > maximum_count) { | |
maximum_transition <- key_transition | |
maximum_change <- change | |
maximum_count <- as.integer(unlist(gene_compiled_df[row_idx, transition])) | |
} | |
} | |
} | |
cluster <- 10 | |
# if (gene %in% known_escapee_genes) { | |
# cluster <- 11 | |
# }else | |
if (((num_escapees >= 1) & ((mean_promoter_methylation < .6) & (chr == "chrX"))) & (maximum_count / total_sites < .5)) { #(mean_promoter_methylation <= .25) { | |
cluster <- 11 | |
}else if ((total_sites_passing_threshold != 0) & (maximum_count / total_sites_passing_threshold >= .5)) { | |
if (maximum_change == "dec") { | |
cluster <- maximum_transition - 1 + 5 | |
}else { | |
cluster <- maximum_transition - 1 | |
} | |
} | |
i <- 1 | |
for (gene_name in gene_compiled_df$gene) { | |
if ((! is.na(gene_name)) && gene == gene_name) { | |
gene_row_idx <- i | |
break | |
} | |
i <- i+1 | |
} | |
gene_compiled_df[i, "cluster"] <- cluster | |
} | |
plt2 <- ggplot(gene_compiled_df[(gene_compiled_df$chr == "chrX") & (gene_compiled_df$cluster %in% c(5,6,7,8,9) | gene_compiled_df$cluster == 11),]) + | |
geom_freqpoly(aes(start_position, colour=as.factor(cluster)), bins = 1000) + | |
labs(x="Location on X Chr", | |
y='Frequency Count', | |
title='Location of X Chromosome CpG sites that are Decreasing in Methylation') #+ | |
#geom_freqpoly(data = all_dmp_df[(all_dmp_df$chr == "chrX") & !(all_dmp_df$X1 %in% x_sig_sites) & (all_dmp_df$cluster_0_mean < .125),], mapping= | |
# aes(pos, colour="escapee", binwidth = 1500000)) + | |
# theme(plot.title = element_text(hjust = 0.5))+ | |
# scale_color_brewer(palette='Spectral') | |
chr_x_dmp_df <- all_dmp_df %>% filter((X1 %in% sig_x_sites)) | |
chr_x_dmp_df_short_arm <- all_dmp_df %>% filter((chr == "chrX") & (pos <= 58100000)) | |
short_arm_escapees <- gene_compiled_df %>% filter((cluster == 11) & (start_position <= 58100000)) | |
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) { | |
pos <- unlist(chr_x_dmp_df_short_arm[row_idx, "pos"]) | |
min_distance <- min(abs(short_arm_escapees["start_position"] - pos)) | |
min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos)) | |
chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end) | |
} | |
chr_x_dmp_df_long_arm <- all_dmp_df %>% filter((chr == "chrX") & (all_dmp_df$passes_threshold == T) & (pos >= 58100000)) | |
long_arm_escapees <- gene_compiled_df %>% filter(gene %in% c("DANT2", "FIRRE")) | |
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) { | |
pos <- unlist(chr_x_dmp_df_long_arm[row_idx, "pos"]) | |
min_distance <- min(abs(long_arm_escapees["start_position"] - pos)) | |
min_distance_end <- min(abs(long_arm_escapees["end_position"] - pos)) | |
chr_x_dmp_df_long_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end) | |
} | |
for (row_idx in 1:nrow(chr_x_dmp_df)) { | |
pos <- unlist(chr_x_dmp_df[row_idx, "pos"]) | |
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% escapee, "start_position"] - pos)) | |
min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "end_position"] - pos)) | |
# min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "start_position"] - pos)) | |
# min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% cotton_genes, "end_position"] - pos)) | |
chr_x_dmp_df[row_idx, "min_dist_to_cotton_escapee"] <- min(min_positions) | |
} | |
all_x_genes <- gene_compiled_df[gene_compiled_df$cluster == 10, "gene"]$gene | |
distances <- tibble( | |
key_transition = integer(), | |
min_distance = numeric(), | |
delta_increasing = numeric(), | |
pos = numeric(), | |
passes_threshold = logical() | |
) | |
for (n in 1:100) { | |
test_set <- sample(all_x_genes, 16) | |
for (row_idx in 1:nrow(chr_x_dmp_df)) { | |
pos <- unlist(chr_x_dmp_df[row_idx, "pos"]) | |
key_transition <- unlist(chr_x_dmp_df[row_idx, "key_transition"]) | |
delta_increasing <- unlist(chr_x_dmp_df[row_idx, "delta_increasing"]) | |
passes_threshold <- unlist(chr_x_dmp_df[row_idx, "passes_threshold"]) | |
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% test_set, "start_position"] - pos)) | |
min_distance_end <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% test_set, "end_position"] - pos)) | |
if (is.na(chr_x_dmp_df[row_idx, "min_dist_to_test"])) { | |
chr_x_dmp_df[row_idx, "min_dist_to_test"] <- 0.0 | |
} | |
distances <- add_row(distances, | |
key_transition=key_transition, | |
min_distance = min(min_distance, min_distance_end), | |
pos=pos, | |
delta_increasing=delta_increasing, | |
passes_threshold=passes_threshold) | |
} | |
} | |
# chr_x_dmp_df["min_dist_to_test"] <- chr_x_dmp_df["min_dist_to_test"] / 100 | |
write_csv(distances, "distances_test.csv") | |
ggplot(distances %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000 ), aes(x=key_transition, group=key_transition, y=min_distance)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
geom_point() + | |
geom_signif(map_signif_level=T, | |
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
distances <- read_csv("distances_test.csv") | |
all_distances <- tibble( | |
key_transition = integer(), | |
min_distance = numeric(), | |
delta_increasing = numeric(), | |
pos = numeric(), | |
passes_threshold = logical(), | |
random = logical() | |
) | |
all_distances <- distances | |
all_distances["random"] <- TRUE | |
for (row_idx in 1:nrow(chr_x_dmp_df)) { | |
all_distances <- add_row(all_distances, | |
key_transition=unlist(chr_x_dmp_df[row_idx, "key_transition"]), | |
min_distance=unlist(chr_x_dmp_df[row_idx, "min_dist_to_escapee"]), | |
random=FALSE, | |
delta_increasing=unlist(chr_x_dmp_df[row_idx, "delta_increasing"]), | |
pos=unlist(chr_x_dmp_df[row_idx, "pos"]), | |
passes_threshold=unlist(chr_x_dmp_df[row_idx, "passes_threshold"])) | |
} | |
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) { | |
all_distances <- add_row(all_distances, | |
key_transition=unlist(chr_x_dmp_df_short_arm[row_idx, "key_transition"]), | |
min_distance=unlist(chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"]), | |
random=FALSE, | |
delta_increasing=unlist(chr_x_dmp_df_short_arm[row_idx, "delta_increasing"]), | |
pos=unlist(chr_x_dmp_df_short_arm[row_idx, "pos"]), | |
passes_threshold=unlist(chr_x_dmp_df_short_arm[row_idx, "passes_threshold"])) | |
} | |
for (row_idx in 1:nrow(chr_x_dmp_df_long_arm)) { | |
all_distances <- add_row(all_distances, | |
key_transition=unlist(chr_x_dmp_df_long_arm[row_idx, "key_transition"]), | |
min_distance=unlist(chr_x_dmp_df_long_arm[row_idx, "min_dist_to_escapee"]), | |
random=FALSE, | |
delta_increasing=unlist(chr_x_dmp_df_long_arm[row_idx, "delta_increasing"]), | |
pos=unlist(chr_x_dmp_df_long_arm[row_idx, "pos"]), | |
passes_threshold=unlist(chr_x_dmp_df_long_arm[row_idx, "passes_threshold"])) | |
} | |
library(ggpubr) | |
ggplot(all_distances %>% filter(passes_threshold == TRUE & pos <= 58100000), aes(x=as.factor(key_transition), y=min_distance, fill=random)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
stat_compare_means(aes(group = random), label="p.format")+ | |
# geom_point(aes(fill=random), position = position_jitterdodge()) + | |
# geom_signif(map_signif_level=T, | |
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
chr_x_dmp_df_short_arm$sig_x_site <- chr_x_dmp_df_short_arm$X1 %in% sig_x_sites | |
ggplot(chr_x_dmp_df_short_arm, aes(x=as.factor(key_transition), y=min_dist_to_escapee, fill=sig_x_site)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
stat_compare_means(aes(group = sig_x_site), label="p.format")+ | |
# geom_point(aes(fill=random), position = position_jitterdodge()) + | |
# geom_signif(map_signif_level=T, | |
# comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=key_transition, group=key_transition, y=min_dist_to_cotton_escapee, fill="min_cotton")) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
geom_boxplot(data=distances %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), mapping=aes(x=key_transition, group=key_transition, y=min_distance, fill="min_random")) + | |
geom_point() + | |
geom_signif(map_signif_level=T, | |
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=min_dist_to_cotton_escapee, y=abs(delta))) + | |
# geom_smooth(aes(color=as.factor(key_transition))) + | |
# geom_smooth(color="black") + | |
geom_point(aes(color=as.factor(key_transition))) + | |
scale_x_log10() | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE), aes(x=min_dist_to_escapee, y=abs(delta))) + | |
geom_point(aes(color=as.factor(key_transition))) + | |
scale_x_log10() | |
# y_position=c(1.2e7,1.21e7,1.21e7,1.21e7) | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=key_transition, group=key_transition, y=min_dist_to_escapee)) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot() + | |
geom_point() + | |
geom_signif(map_signif_level=T, | |
comparisons = list(c(1,2), c(1,3), c(1,4), c(1,5))) + | |
scale_y_log10() | |
ggplot(chr_x_dmp_df %>% filter(delta_increasing == FALSE & passes_threshold == TRUE & pos <= 58100000), aes(x=min_dist_to_escapee, y=abs(delta))) + | |
# geom_smooth(aes(color=as.factor(key_transition))) + | |
# geom_smooth(color="black") + | |
geom_point(aes(color=as.factor(key_transition))) + | |
scale_x_log10() | |
for (row_idx in 1:nrow(gene_compiled_df)) { | |
gene <- unlist(gene_compiled_df[row_idx, "gene"]) | |
total <- unlist(gene_compiled_df[row_idx, "total_sites"]) | |
start_pos <- unlist(gene_compiled_df[row_idx, "start_position"]) | |
cluster <- unlist(gene_compiled_df[row_idx, "cluster"]) | |
if (cluster <= 9) { | |
min_distance <- min(abs(gene_compiled_df[gene_compiled_df$gene %in% 11, "start_position"] - start_pos)) | |
gene_compiled_df[row_idx, "min_dist_to_escapee"] <- min_distance | |
} | |
} | |
library(ggsignif) | |
ggplot(gene_compiled_df %>% filter((chr=="chrX") & (cluster %in% c(5,6,7,8,9)))) + | |
# geom_violin(aes(x=cluster, group=cluster, y=min_dist_to_escapee, fill=cluster)) + | |
geom_boxplot(aes(x=cluster, group=cluster, y=min_dist_to_escapee)) + | |
geom_point(aes(x=cluster, y=min_dist_to_escapee)) + | |
geom_signif(aes(x=cluster, group=cluster, y=min_dist_to_escapee), | |
map_signif_level=F, comparisons = list(c(5,6), c(5,7), c(5,8), c(5,9))) + | |
scale_y_log10() | |
mean_distances <- gene_compiled_df %>% group_by(cluster) %>% summarise( | |
min_dist_to_escapee = mean(na.omit(min_dist_to_escapee))) | |
ggplot(gene_compiled_df[(gene_compiled_df$chr == "chrX") & (gene_compiled_df$cluster %in% c(5,6,7,8,9)),]) + | |
geom_bar() | |
geom_freqpoly(aes(start_position, colour=as.factor(cluster)), binwidth = 1500000) + | |
labs(x="Location on X Chr", | |
y='Frequency Count', | |
title='Location of X Chromosome CpG sites that are Decreasing in Methylation') | |
plt3 <- ggplot(all_dmp_df[(all_dmp_df$chr == "chrX") & (all_dmp_df$passes_threshold == F)& (all_dmp_df$delta_increasing == F),]) + | |
geom_freqpoly(aes(pos),binwidth = 1500000) + | |
labs(x="Location on X Chr", | |
y='Frequency Count', | |
title=sprintf('Location of X Chromosome CpG sites that are Decreasing in Methylation', i-1, i)) + | |
theme(plot.title = element_text(hjust = 0.5))+ | |
scale_color_brewer(palette='Spectral') | |
``` | |
## Probability of erosion based on distance to escapees | |
```{r} | |
chr_x_dmp_df_short_arm <- all_dmp_df %>% filter((chr == "chrX") & (pos <= 58100000)) | |
shortarm_escapees <- gene_compiled_df %>% filter((cluster == 11) & (start_position <= 58100000)) | |
for (row_idx in 1:nrow(chr_x_dmp_df_short_arm)) { | |
pos <- unlist(chr_x_dmp_df_short_arm[row_idx, "pos"]) | |
min_distance <- min(abs(shortarm_escapees$start_position - pos)) | |
min_distance_end <- min(abs(shortarm_escapees$end_position - pos)) | |
chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end) | |
q_val_col <- sprintf("transition_%d_qval", unlist( chr_x_dmp_df_short_arm[row_idx, "key_transition"])) | |
alt_q_val_col <- sprintf("transition_%d_qval", unlist( chr_x_dmp_df_short_arm[row_idx, "alt_key_transition"])) | |
chr_x_dmp_df_short_arm[row_idx, "min_dist_to_escapee"] <- min(min_distance, min_distance_end) | |
chr_x_dmp_df_short_arm[row_idx, "key_qval"] <- unlist(chr_x_dmp_df_short_arm[row_idx, q_val_col]) | |
if (is.na(unlist( chr_x_dmp_df_short_arm[row_idx, "alt_key_transition"]))) { | |
chr_x_dmp_df_short_arm[row_idx, "alt_key_qval"] <- NA | |
}else{ | |
chr_x_dmp_df_short_arm[row_idx, "alt_key_qval"] <- unlist(chr_x_dmp_df_short_arm[row_idx, alt_q_val_col]) | |
} | |
} | |
transition_probabilities <- tibble(transition = numeric(), | |
distance = character(), | |
probability_of_sig_padj = numeric()) | |
for (t in 1:5) { | |
for (dist_cutoff in 1:1e8) { | |
tran_col <- sprintf("transition_%d_qval", t) | |
#(!! ensym(tran_col) <= .1) | |
num_changing <- nrow(chr_x_dmp_df_short_arm %>% filter((min_dist_to_escapee <= dist_cutoff) & (!! ensym(tran_col) <= .1))) | |
num_total <- nrow(chr_x_dmp_df_short_arm %>% filter(min_dist_to_escapee <= dist_cutoff)) | |
transition_probabilities <- add_row(transition_probabilities, | |
transition = t, | |
distance = dist_cutoff, | |
probability_of_sig_padj = num_changing / num_total) | |
} | |
} | |
ggplot(chr_x_dmp_df_short_arm %>% filter(min_dist_to_escapee < 50000 & delta < 0), aes(x = min_dist_to_escapee, y = delta)) + | |
geom_bin2d() + | |
facet_grid(key_transition ~ .) | |
ggplot(chr_x_dmp_df_short_arm, aes(x = min_dist_to_escapee, y = transition_2_qval)) + | |
geom_bin2d() | |
geom_line(aes(group = transition, color=as.factor(transition))) | |
``` | |
## Contiguity search | |
```{r} | |
ggplot() + | |
geom_area(all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == F), | |
mapping = aes(x=hg38_pos,y = (..count..), fill="not passing threshold"), stat="bin", bins = 100) + | |
geom_area(all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T), | |
mapping = aes(x=hg38_pos,y = (..count..), fill="passing threshold"), stat="bin", bins = 100, alpha=.4) + | |
geom_area(all_dmp_df %>% filter(hg38_chr == "chrX"), mapping=aes(x=hg38_pos, y = (..count..)), stat="bin", bins = 100, fill="grey", alpha=.4) + | |
theme_bw() + | |
labs(x="X Chr Coordinate", | |
y="Percent of Sites", | |
title = "Probes that start from .4-.6") + | |
base_plot_theme | |
composition_passing_threshold <- all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & delta_increasing == F) %>% group_by(hg38_refgene_group) %>% tally() %>% mutate(pct=(100*n)/sum(n)) | |
composition_not_passing_threshold <- all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == F) %>% group_by(hg38_refgene_group) %>% tally() %>% mutate(pct=(100*n)/sum(n)) | |
composition <- full_join(composition_passing_threshold, composition_not_passing_threshold, by="hg38_refgene_group", suffix=c(".pass", ".notpass")) | |
ggplot() + | |
geom_area(all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & key_transition <= 2 & grepl("TSS200|5_UTR", hg38_refgene_group)), | |
mapping = aes(x=hg38_pos,y = (..count..)/sum(..count..), fill="early", color="early"), stat="bin", bins = 100, alpha=.4) + | |
geom_area(all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & key_transition >= 3 & grepl("TSS200|5_UTR", hg38_refgene_group)), | |
mapping = aes(x=hg38_pos,y = (..count..)/sum(..count..), fill="late", color="late"), stat="bin", bins = 100, alpha=.4 ) + | |
theme_bw() + | |
labs(x="X Chr Coordinate", | |
y="Percent of Sites", | |
title = "Probes that start from .4-.6") + | |
base_plot_theme + | |
scale_fill_viridis_d() + | |
scale_color_viridis_d() | |
early_promoter_probes <- all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & key_transition <= 2 & delta_increasing == F & grepl("TSS200|5_UTR", hg38_refgene_group)) | |
late_promoter_probes <- all_dmp_df %>% filter(cluster_0_mean >= .4 & cluster_0_mean <= .6 & hg38_chr == "chrX" & passes_threshold == T & key_transition >= 3 & delta_increasing == F & grepl("TSS200|5_UTR", hg38_refgene_group)) | |
early_genes_tally <- early_promoter_probes %>% group_by(hg38_gene) %>% tally() | |
late_genes_tally <- late_promoter_probes %>% group_by(hg38_gene) %>% tally() | |
early_genes <- unique(early_promoter_probes$hg38_gene) | |
late_genes <- unique(late_promoter_probes$hg38_gene) | |
combined_intersecting_tallies <- inner_join(early_genes_tally, late_genes_tally, by="hg38_gene") | |
``` | |
## Grouping by genes | |
Making a version of all_dmp_df table, but for genes | |
```{r} | |
library(broom) | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") | |
diff_expr_all_df <- read_csv("diff_expr_all_df.csv") | |
# counts_all_df <- read_csv("counts_all_df.csv") | |
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv") | |
batch_corrected_vsd_df <- batch_corrected_vsd_df %>% | |
dplyr::select(-c(ensembl_id, | |
ensembl_id_no_version, | |
end_position, | |
start_position, | |
description, | |
strand, | |
chromosome_name)) | |
samples_with_expr_data <- intersect(colnames(batch_corrected_vsd_df %>% dplyr::select(-hgnc_symbol)), colnames(female_df_no_h9 %>% dplyr::select(-rn))) | |
batch_corrected_vsd_df <- batch_corrected_vsd_df %>% dplyr::select(gene_name=hgnc_symbol, !!samples_with_expr_data) | |
female_df_no_h9_subset <- female_df_no_h9 %>% dplyr::select(rn, !!samples_with_expr_data) | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
female_df_no_h9_subset <- left_join(female_df_no_h9_subset, filtered_site_annotation_df %>% | |
dplyr::select(rowname, hg38_chromosome, hg38_pos, hg38_gene_group, hg38_gene_name), by=c("rn"="rowname")) | |
female_df_no_h9_subset_gened <- female_df_no_h9_subset %>% | |
filter(grepl("TSS1500|TSS200|5_UTR", hg38_gene_group)) %>% | |
group_by(hg38_gene_name) %>% | |
summarise_at(samples_with_expr_data,mean) | |
list_genes <- intersect(female_df_no_h9_subset_gened$hg38_gene_name, batch_corrected_vsd_df$gene_name) | |
concordance_df <- tibble(gene=character(), corr=numeric(), pval=numeric(), meth_min=numeric(),meth_max=numeric(), | |
expr_min=numeric(), expr_max=numeric()) | |
for(gene in list_genes) { | |
gene_meth <- unlist(female_df_no_h9_subset_gened %>% filter(hg38_gene_name == gene) %>% dplyr::select(-hg38_gene_name)) | |
gene_expr <- unlist(batch_corrected_vsd_df %>% filter(gene_name == gene) %>% filter(row_number()==1) %>% dplyr::select(-gene_name)) | |
concordance_df <- add_row(concordance_df, | |
gene=gene, | |
corr=as.numeric(cor.test(gene_meth, gene_expr)$estimate), | |
pval=as.numeric(cor.test(gene_meth, gene_expr)$p.value), | |
meth_min=range(gene_meth)[1], | |
meth_max=range(gene_meth)[2], | |
expr_min=range(gene_expr)[1], | |
expr_max=range(gene_expr)[2]) | |
} | |
gened_dmp_df <- all_dmp_df %>% | |
filter(grepl("TSS1500|TSS200|5_UTR", hg38_refgene_group)) %>% | |
mutate(tran_1_delta=cluster_1_mean-cluster_0_mean, | |
tran_2_delta=cluster_2_mean-cluster_1_mean, | |
tran_3_delta=cluster_3_mean-cluster_2_mean, | |
tran_4_delta=cluster_4_mean-cluster_3_mean, | |
tran_5_delta=cluster_5_mean-cluster_4_mean) %>% | |
group_by(hg38_gene) %>% | |
summarise(tran_1_mean_delta=mean(tran_1_delta), | |
tran_2_mean_delta=mean(tran_2_delta), | |
tran_3_mean_delta=mean(tran_3_delta), | |
tran_4_mean_delta=mean(tran_4_delta), | |
tran_5_mean_delta=mean(tran_5_delta), | |
tran_1_mean_qval=mean(transition_1_qval), | |
tran_2_mean_qval=mean(transition_2_qval), | |
tran_3_mean_qval=mean(transition_3_qval), | |
tran_4_mean_qval=mean(transition_4_qval), | |
tran_5_mean_qval=mean(transition_5_qval), | |
mean_delta=mean(delta), | |
probes_passing_threshold=sum(passes_threshold), | |
total_probes=n()) | |
concordance_df <- left_join(concordance_df, gened_dmp_df %>% dplyr::select(hg38_gene, methylation_change=mean_delta, probes_passing_threshold, total_probes), by=c("gene"="hg38_gene")) | |
diff_expr_all_df$key_log2FC <- NA | |
diff_expr_all_df$first_transition <- NA | |
diff_expr_all_df$first_log2FC <- NA | |
for(row in 1:nrow(diff_expr_all_df)) { | |
key_transition <- as.numeric(unlist(diff_expr_all_df[row, "key_transition"])) | |
log2fc_col <- sprintf("transition_%d_log2FC", key_transition) | |
diff_expr_all_df[row, "key_log2FC"] <- as.numeric(unlist(diff_expr_all_df[row, log2fc_col])) | |
pvals <- unlist(diff_expr_all_df[row, c("transition_1_padj", "transition_2_padj", "transition_3_padj", "transition_4_padj")]) | |
pvals <- pvals < .1 | |
pvals[is.na(pvals)] <- FALSE | |
first_transition <- as.numeric(which(pvals)[1]) | |
first_delta_col <- sprintf("transition_%d_log2FC", first_transition) | |
diff_expr_all_df[row, "first_transition"] <- first_transition | |
if (!is.na(first_transition)) { | |
diff_expr_all_df[row, "first_log2FC"] <- as.numeric(unlist(diff_expr_all_df[row, first_delta_col])) | |
} | |
} | |
concordance_df <- left_join(concordance_df, diff_expr_all_df %>% | |
dplyr::select(gene, | |
chr, | |
passes_expression_threshold=passes_threshold, | |
expr_key_transition=key_transition, | |
expr_key_log2FC=key_log2FC, | |
expr_first_sig_transition=first_transition, | |
expr_first_log2FC=first_log2FC), | |
by=c("gene")) | |
transition_columns <- c("tran_1_mean_qval", "tran_2_mean_qval", "tran_3_mean_qval", "tran_4_mean_qval", "tran_5_mean_qval") | |
gened_dmp_df$key_transition <- colnames(gened_dmp_df[transition_columns])[apply(gened_dmp_df[transition_columns], 1, which.min)] | |
gened_dmp_df$key_transition <- as.numeric(unlist(as.data.frame(t(as.data.frame(strsplit(gened_dmp_df$key_transition, '_'))))[2])) | |
gened_dmp_df$key_mean_qval <- 0 | |
for (row in 1:nrow(gened_dmp_df)) { | |
qval_col <- sprintf("tran_%d_mean_qval", as.numeric(unlist(gened_dmp_df[row, "key_transition"]))) | |
gened_dmp_df[row, "key_mean_qval"] <- gened_dmp_df[row, qval_col] | |
} | |
``` | |
## Looking for enrichment using clusterProfiler | |
```{r} | |
library(tidyverse) | |
library(clusterProfiler) | |
library(msigdbr) | |
library(enrichplot) | |
library(ggsci) | |
library('org.Hs.eg.db') | |
library(ReactomePA) | |
library(meshes) | |
library(MeSH.Hsa.eg.db) | |
library(DOSE) | |
library(RDAVIDWebService) | |
library(grid) | |
library(gridExtra) | |
library(scales) | |
msig_df <- msigdbr(species = "Homo sapiens") | |
msig_t2g <- msig_df %>% | |
dplyr::select(gs_name, gene_symbol) | |
concordance_df <- read_csv("concordance_df.csv") | |
new_concordance_df <- read_csv("new_concordance_df.csv.gz") | |
grouped_concordance_df_filtered <- read_csv("grouped_concordance_df_filtered.csv") %>% filter(abs(max_tran_log2FC) >= .5) | |
enrichment_base_theme <- base_plot_theme + | |
theme(legend.key.width = unit(7, "mm"), | |
legend.text = element_text(size = 5), | |
legend.title = element_text(size = 5, face = "bold")) | |
plot_enrich <- function(df, title_name) { | |
enrichment_base_theme <- base_plot_theme + | |
theme(legend.key.width = unit(7, "mm"), | |
legend.text = element_text(size = 5), | |
legend.title = element_text(size = 5, face = "bold")) | |
if(is.null(df) | nrow(as_tibble(df)) == 0){ | |
return(ggplot() + theme_void() + labs(title=title_name)) | |
}else{ | |
df_tib <- as_tibble(df) %>% | |
group_by(ID) %>% | |
mutate(gene_ratio_eval=eval(str2expression(GeneRatio))) %>% | |
ungroup() | |
df_tib <- df_tib %>% | |
mutate(wrapped_ID=wrap_format(20)(str_replace_all(ID, "_", " "))) | |
df_tib_levels <- (df_tib %>% arrange(gene_ratio_eval))$wrapped_ID | |
return( | |
dotplot(df) + | |
labs(title=title_name) + | |
theme_bw() + | |
enrichment_base_theme | |
# ggplot(df_tib) + | |
# geom_point(aes(x=gene_ratio_eval, y=factor(wrapped_ID, levels=msig_tib_levels), color=p.adjust, size=Count)) + | |
# scale_color_gsea(reverse=T) + | |
# labs(x="Gene Ratio", | |
# y="", | |
# title=title_name) + | |
# theme_bw() + | |
# base_plot_theme + | |
# theme(axis.title.y = element_blank()) + | |
# enrichment_base_theme | |
) | |
} | |
} | |
make_enrichment_plots <- function(background_genes, foreground_genes, plot_title, file_save_name, pvalueCutoff=.05){ | |
background_entrez <- bitr(background_genes, "SYMBOL", "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID | |
if(length(foreground_genes) > 0) { | |
foreground_entrez <- bitr(foreground_genes, "SYMBOL", "ENTREZID", OrgDb = org.Hs.eg.db)$ENTREZID | |
msig_df <- enricher(foreground_genes, universe = background_genes, TERM2GENE = msig_t2g, pvalueCutoff=pvalueCutoff) | |
msig_tib <- as_tibble(msig_df) %>% mutate(Ontology="MSigDB") | |
# if(is.null(msig_df)) { | |
# msig_plt <- ggplot() + theme_void() | |
# } | |
# msig_plt <- plot_enrich(msig_df, "MSigDB") | |
kegg_df <- enrichKEGG(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff) | |
kegg_tib <- as_tibble(kegg_df) %>% mutate(Ontology="KEGG") | |
# kegg_plt <- plot_enrich(kegg_df, "KEGG") | |
go_df <- enrichGO(foreground_entrez, universe = background_entrez, OrgDb = org.Hs.eg.db, pvalueCutoff=pvalueCutoff) | |
go_tib <- as_tibble(go_df) %>% mutate(Ontology="GO") | |
# go_plt <- plot_enrich(go_df, "GO") | |
reactome_df <- enrichPathway(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff) | |
reactome_tib <- as_tibble(reactome_df) %>% mutate(Ontology="Reactome") | |
# reactome_plt <- plot_enrich(reactome_df, "Reactome") | |
mesh_gendoo_df <- enrichMeSH(foreground_entrez, | |
universe = background_entrez, | |
MeSHDb = "MeSH.Hsa.eg.db", | |
database='gendoo', | |
category = 'C', | |
minGSSize = 10, pvalueCutoff=pvalueCutoff) | |
mesh_gendoo_tib <- as_tibble(mesh_gendoo_df) %>% mutate(Ontology="MeSH - Gendoo") | |
# mesh_gen_plt <- plot_enrich(mesh_gendoo_df, "MeSH - Gendoo") | |
# mesh_pubmed_df <- enrichMeSH(foreground_entrez, universe = background_entrez, MeSHDb = "MeSH.Hsa.eg.db", database='gene2pubmed', category = 'C') | |
# mesh_pubmed_tib <- as_tibble(mesh_pubmed_df) %>% mutate(Ontology="MeSH - gene2pubmed") | |
# mesh_pub_plt <- plot_enrich(mesh_pubmed_df, "MeSH - gene2pubmed") | |
do_df <- enrichDO(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff) | |
do_tib <- as_tibble(do_df) %>% mutate(Ontology="Disease Ontology") | |
# do_plt <- plot_enrich(do_df, "Disease Ontology") | |
ncg_df <- enrichNCG(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff) | |
ncg_tib <- as_tibble(ncg_df) %>% mutate(Ontology="NCG") | |
# ncg_plt <- plot_enrich(ncg_df, "NCG") | |
dgn_df <- enrichDGN(foreground_entrez, universe = background_entrez, pvalueCutoff=pvalueCutoff) | |
dgn_tib <- as_tibble(dgn_df) %>% mutate(Ontology="DGN") | |
# dgn_plt <- plot_enrich(dgn_df, "DGN") | |
wpgmtfile <- system.file("wikipathways-20200310-gmt-Homo_sapiens.gmt", package="clusterProfiler") | |
wp2gene <- read.gmt("wikipathways-20200310-gmt-Homo_sapiens.gmt") | |
wp2gene <- wp2gene %>% tidyr::separate(term, c("name","version","wpid","org"), "%") | |
wpid2gene <- wp2gene %>% dplyr::select(wpid, gene) #TERM2GENE | |
wpid2name <- wp2gene %>% dplyr::select(wpid, name) #TERM2NAME | |
wiki_df <- enricher(foreground_entrez, universe = background_entrez, TERM2GENE = wpid2gene, TERM2NAME = wpid2name, pvalueCutoff=pvalueCutoff) | |
wiki_tib <- as_tibble(wiki_df) %>% mutate(Ontology="Wiki Pathways") | |
super_tib <- bind_rows(msig_tib, kegg_tib, go_tib, | |
reactome_tib, mesh_gendoo_tib, | |
do_tib, ncg_tib, dgn_tib, wiki_tib) #mesh_pubmed_tib | |
super_tib <- super_tib %>% | |
group_by(Description) %>% | |
mutate(gene_ratio_eval=eval(str2expression(GeneRatio))) %>% | |
ungroup() | |
super_tib <- super_tib %>% | |
mutate(wrapped_ID=wrap_format(20)(str_replace_all(ID, "_", " "))) | |
return(super_tib) | |
# gridded_plts <- arrangeGrob(msig_plt, kegg_plt, go_plt, reactome_plt, mesh_gen_plt, mesh_pub_plt, do_plt, ncg_plt, dgn_plt, | |
# top = textGrob(plot_title,gp=gpar(fontsize=6))) | |
# ggsave(file_save_name, gridded_plts, width = 236, height = 236, units = "mm") | |
}else{ | |
print(sprintf("no genes for transition %d", t)) | |
} | |
} | |
# Whole genome | |
background_genes <- concordance_df$gene | |
for(t in 1:5) { | |
foreground_genes <- filter(concordance_df, pval <= .05 & corr < 0 & expr_first_sig_transition == t)$gene | |
title <- sprintf("Whole Genome Transition %d - %d genes", t, length(foreground_genes)) | |
save_name <- sprintf("figs/enrichment/whole_genome_tran_%d.png",t) | |
make_enrichment_plots(background_genes, foreground_genes, title, save_name) | |
} | |
# X Chr genome | |
x_background_genes <- filter(concordance_df, chr == "X")$gene | |
for(t in 1:5) { | |
x_foreground_genes <- filter(concordance_df, chr == "X" & pval <= .05 & corr < 0 & expr_first_sig_transition == t)$gene | |
title <- sprintf("X Chr Transition %d - %d genes", t, length(x_foreground_genes)) | |
save_name <- sprintf("figs/enrichment/x_chr_tran_%d.png",t) | |
make_enrichment_plots(x_background_genes, x_foreground_genes, title, save_name) | |
} | |
# Non X Chr genome | |
nonx_background_genes <- filter(concordance_df, !(chr %in% c("X", "Y")))$gene | |
for(t in 1:5) { | |
nonx_foreground_genes <- filter(concordance_df, !(chr %in% c("X", "Y")) & pval <= .05 & corr < 0 & expr_first_sig_transition == t)$gene | |
title <- sprintf("Non-X Chr Transition %d - %d genes", t, length(nonx_foreground_genes)) | |
save_name <- sprintf("figs/enrichment/nonx_chr_tran_%d.png",t) | |
make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name) | |
} | |
# expression only enrichments | |
# Whole genome with grouped_concordance_df_filtered | |
diff_expr_all_df <- read_csv("diff_expr_all_df.csv.gz") | |
genome_background_genes <- filter(diff_expr_all_df, !(chr %in% c("Y")))$gene | |
super_tib <- tibble(ID=character(), | |
Description=character(), | |
GeneRatio=character(), | |
BgRatio=character(), | |
pvalue=numeric(), | |
p.adjust=numeric(), | |
qvalue=numeric(), | |
geneID=character(), | |
Count=numeric(), | |
Ontology=character(), | |
gene_ratio_eval=numeric(), | |
wrapped_ID=character(), | |
Transition=numeric()) | |
gene_counts <- tibble(description=character(), | |
count=numeric()) | |
gene_counts <- add_row(gene_counts, | |
description="background", | |
count=length(genome_background_genes)) | |
for(t in 1:5) { | |
genome_foreground_genes <- filter(diff_expr_all_df, !(chr %in% c("Y")) & key_transition == t & passes_threshold == T)$gene | |
if(length(genome_foreground_genes) >0){ | |
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(genome_foreground_genes)) | |
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_genome_chr_tran_%d.png",t) | |
tib <- make_enrichment_plots(genome_background_genes, genome_foreground_genes, title, save_name) %>% mutate(Transition=t) | |
super_tib <- bind_rows(super_tib, tib) | |
gene_counts <- add_row(gene_counts, | |
description=sprintf("foreground tran. %d", t), | |
count=length(genome_foreground_genes)) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
} | |
} | |
write_csv(super_tib, "genome_enrichment_expr_genes.csv") | |
# Non X Chr genome with grouped_concordance_df_filtered | |
nonx_background_genes <- filter(diff_expr_all_df, !(chr %in% c("X", "Y")))$gene | |
super_tib <- tibble(ID=character(), | |
Description=character(), | |
GeneRatio=character(), | |
BgRatio=character(), | |
pvalue=numeric(), | |
p.adjust=numeric(), | |
qvalue=numeric(), | |
geneID=character(), | |
Count=numeric(), | |
Ontology=character(), | |
gene_ratio_eval=numeric(), | |
wrapped_ID=character(), | |
Transition=numeric()) | |
gene_counts <- tibble(description=character(), | |
count=numeric()) | |
gene_counts <- add_row(gene_counts, | |
description="background", | |
count=length(nonx_background_genes)) | |
for(t in 1:5) { | |
nonx_foreground_genes <- filter(diff_expr_all_df, !(chr %in% c("X", "Y")) & key_transition == t & passes_threshold == T)$gene | |
if(length(nonx_foreground_genes) >0){ | |
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes)) | |
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t) | |
tib <- make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name) %>% mutate(Transition=t) | |
super_tib <- bind_rows(super_tib, tib) | |
gene_counts <- add_row(gene_counts, | |
description=sprintf("foreground tran. %d", t), | |
count=length(nonx_foreground_genes)) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
} | |
} | |
write_csv(super_tib, "non_x_enrichment_expr_genes.csv") | |
x_background_genes <- filter(diff_expr_all_df, (chr %in% c("X")))$gene | |
super_tib <- tibble(ID=character(), | |
Description=character(), | |
GeneRatio=character(), | |
BgRatio=character(), | |
pvalue=numeric(), | |
p.adjust=numeric(), | |
qvalue=numeric(), | |
geneID=character(), | |
Count=numeric(), | |
Ontology=character(), | |
gene_ratio_eval=numeric(), | |
wrapped_ID=character(), | |
Transition=numeric()) | |
gene_counts <- tibble(description=character(), | |
count=numeric()) | |
gene_counts <- add_row(gene_counts, | |
description="background", | |
count=length(x_background_genes)) | |
for(t in 1:5) { | |
x_foreground_genes <- filter(diff_expr_all_df, (chr %in% c("X")) & key_transition == t & passes_threshold == T)$gene | |
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes)) | |
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t) | |
tib <- make_enrichment_plots(x_background_genes, x_foreground_genes, title, save_name) %>% mutate(Transition=t) | |
super_tib <- bind_rows(super_tib, tib) | |
gene_counts <- add_row(gene_counts, | |
description=sprintf("foreground tran. %d", t), | |
count=length(x_foreground_genes)) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
} | |
write_csv(super_tib, "x_enrichment_expr_genes.csv") | |
# Whole genome with grouped_concordance_df_filtered | |
genome_background_genes <- filter(new_concordance_df, !(chr %in% c("Y")))$gene | |
super_tib <- tibble(ID=character(), | |
Description=character(), | |
GeneRatio=character(), | |
BgRatio=character(), | |
pvalue=numeric(), | |
p.adjust=numeric(), | |
qvalue=numeric(), | |
geneID=character(), | |
Count=numeric(), | |
Ontology=character(), | |
gene_ratio_eval=numeric(), | |
wrapped_ID=character(), | |
Transition=numeric()) | |
gene_counts <- tibble(description=character(), | |
count=numeric()) | |
gene_counts <- add_row(gene_counts, | |
description="background", | |
count=length(genome_background_genes)) | |
for(t in 1:5) { | |
genome_foreground_genes <- filter(new_concordance_df, !(chr %in% c("Y")) & key_transition == t & passes_threshold == T & (atac_concordant == T | meth_concordant == T))$gene | |
if(length(genome_foreground_genes) >0){ | |
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(genome_foreground_genes)) | |
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_genome_chr_tran_%d.png",t) | |
tib <- make_enrichment_plots(genome_background_genes, genome_foreground_genes, title, save_name) %>% mutate(Transition=t) | |
super_tib <- bind_rows(super_tib, tib) | |
gene_counts <- add_row(gene_counts, | |
description=sprintf("foreground tran. %d", t), | |
count=length(genome_foreground_genes)) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
} | |
} | |
write_csv(super_tib, "genome_enrichment_atac_meth_concordanct_genes.csv") | |
# Non X Chr genome with grouped_concordance_df_filtered | |
nonx_background_genes <- filter(new_concordance_df, !(chr %in% c("X", "Y")))$gene | |
super_tib <- tibble(ID=character(), | |
Description=character(), | |
GeneRatio=character(), | |
BgRatio=character(), | |
pvalue=numeric(), | |
p.adjust=numeric(), | |
qvalue=numeric(), | |
geneID=character(), | |
Count=numeric(), | |
Ontology=character(), | |
gene_ratio_eval=numeric(), | |
wrapped_ID=character(), | |
Transition=numeric()) | |
gene_counts <- tibble(description=character(), | |
count=numeric()) | |
gene_counts <- add_row(gene_counts, | |
description="background", | |
count=length(nonx_background_genes)) | |
cats_of_interest <- c("HALLMARK_OXIDATIVE_PHOSPHORYLATION", | |
"MOOTHA_MITOCHONDRIA", | |
"GO_MITOCHONDRION", | |
"GO_MITOCHONDRIAL_PART", | |
"MOOTHA_HUMAN_MITODB_6_2002", | |
"GO_MITOCHONDRION_ORGANIZATION", | |
"GO_MITOCHONDRIAL_ENVELOPE", | |
"GO_ENERGY_DERIVATION_BY_OXIDATION_OF_ORGANIC_COMPOUNDS", | |
"GO_OXIDATION_REDUCTION_PROCESS", | |
"WONG_MITOCHONDRIA_GENE_MODULE") | |
for(t in 1:5) { | |
nonx_foreground_genes <- filter(new_concordance_df, !(chr %in% c("X", "Y")) & key_transition == t & passes_threshold == T & (atac_concordant == T | meth_concordant == T))$gene | |
if(length(nonx_foreground_genes) >0){ | |
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes)) | |
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t) | |
tib <- make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name, pvalueCutoff=1) %>% mutate(Transition=t) | |
super_tib <- bind_rows(super_tib, tib) | |
gene_counts <- add_row(gene_counts, | |
description=sprintf("foreground tran. %d", t), | |
count=length(nonx_foreground_genes)) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
} | |
} | |
write_csv(super_tib, "non_x_enrichment_atac_meth_concordanct_genes.csv") | |
write_csv(super_tib, "non_x_enrichment_atac_meth_concordant_no_thresh.csv") | |
write_csv(super_tib %>% filter(ID %in% cats_of_interest), "non_x_enrichment_atac_meth_concordant_mitochondrial_alone.csv") | |
saved_tib <- super_tib | |
super_tib <- super_tib %>% | |
mutate(wrapped_ID=wrap_format(20)(str_replace_all(sprintf("%s", Description), "_", " "))) | |
wrapped_ID_levels <- (super_tib %>% arrange(Transition, gene_ratio_eval))$wrapped_ID | |
enrichment_nonx_plot <- ggplot(super_tib) + | |
geom_point(aes(x=gene_ratio_eval, y=factor(wrapped_ID, levels=unique(wrapped_ID_levels)), color=p.adjust, size=Count)) + | |
scale_color_gsea(reverse=T) + | |
scale_size(range = c(.1,4)) + | |
labs(x="Gene Ratio", | |
y="", | |
title= "Enrichment Analysis for Concordant Genes in Each Transition") + | |
theme_bw() + | |
base_plot_theme + | |
theme(axis.title.y = element_blank()) + | |
facet_grid( rows=vars(Transition), scales = "free_y", space="free_y", labeller = labeller(.rows=function(n){ #cols=vars(Ontology), | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 5")[as.character(n)]) | |
})) + | |
scale_x_continuous(expand = c(.05,.05)) + | |
enrichment_base_theme + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
plot.title = element_blank()) | |
ggsave("figs/enrichment/grouped_concordance_df_filtered_nonx_enrich_alt.svg", width = 38, height = 180, units = "mm") | |
# Non X Chr genome with expression diff_expr_f_vs_each | |
diff_expr_f_vs_each <- read_csv("diff_expr_f_vs_each.csv") | |
nonx_background_genes <- filter(diff_expr_f_vs_each, !(chromosome_name %in% c("X", "Y")))$gene | |
super_tib <- tibble(ID=character(), | |
Description=character(), | |
GeneRatio=character(), | |
BgRatio=character(), | |
pvalue=numeric(), | |
p.adjust=numeric(), | |
qvalue=numeric(), | |
geneID=character(), | |
Count=numeric(), | |
Ontology=character(), | |
gene_ratio_eval=numeric(), | |
wrapped_ID=character(), | |
Transition=numeric()) | |
gene_counts <- tibble(description=character(), | |
count=numeric()) | |
gene_counts <- add_row(gene_counts, | |
description="background", | |
count=length(nonx_background_genes)) | |
# for(t in 1:5) { | |
nonx_foreground_genes <- filter(diff_expr_f_vs_each, !(chromosome_name %in% c("X", "Y")) & D_vs_F_padj <= .1)$gene | |
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes)) | |
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t) | |
tib <- make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name) %>% mutate(Transition=t) | |
super_tib <- bind_rows(super_tib, tib) | |
gene_counts <- add_row(gene_counts, | |
description=sprintf("foreground tran. %d", t), | |
count=length(nonx_foreground_genes)) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
# } | |
saved_tib <- super_tib | |
super_tib <- super_tib %>% | |
mutate(wrapped_ID=wrap_format(20)(str_replace_all(sprintf("%s", Description), "_", " "))) | |
wrapped_ID_levels <- (super_tib %>% arrange(Transition, gene_ratio_eval))$wrapped_ID | |
enrichment_nonx_plot <- ggplot(super_tib) + | |
geom_point(aes(x=gene_ratio_eval, y=factor(wrapped_ID, levels=unique(wrapped_ID_levels)), color=p.adjust, size=Count)) + | |
scale_color_gsea(reverse=T) + | |
scale_size(range = c(.1,4)) + | |
labs(x="Gene Ratio", | |
y="", | |
title= "Enrichment Analysis for Concordant Genes in Each Transition") + | |
theme_bw() + | |
base_plot_theme + | |
theme(axis.title.y = element_blank()) + | |
facet_grid( rows=vars(Transition), scales = "free_y", space="free_y", labeller = labeller(.rows=function(n){ #cols=vars(Ontology), | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 5")[as.character(n)]) | |
})) + | |
scale_x_continuous(expand = c(.05,.05)) + | |
enrichment_base_theme + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
plot.title = element_blank()) | |
x_background_genes <- filter(new_concordance_df, (chr %in% c("X")))$gene | |
super_tib <- tibble(ID=character(), | |
Description=character(), | |
GeneRatio=character(), | |
BgRatio=character(), | |
pvalue=numeric(), | |
p.adjust=numeric(), | |
qvalue=numeric(), | |
geneID=character(), | |
Count=numeric(), | |
Ontology=character(), | |
gene_ratio_eval=numeric(), | |
wrapped_ID=character(), | |
Transition=numeric()) | |
gene_counts <- tibble(description=character(), | |
count=numeric()) | |
gene_counts <- add_row(gene_counts, | |
description="background", | |
count=length(x_background_genes)) | |
for(t in 1:5) { | |
x_foreground_genes <- filter(new_concordance_df, (chr %in% c("X")) & key_transition == t & passes_threshold == T & (atac_concordant == T | meth_concordant == T))$gene | |
# title <- sprintf("Non-X Chr Transition grouped_concordance_df_filtered %d - %d genes", t, length(nonx_foreground_genes)) | |
# save_name <- sprintf("figs/enrichment/grouped_concordance_df_filtered_nonx_chr_tran_%d.png",t) | |
tib <- make_enrichment_plots(x_background_genes, x_foreground_genes, title, save_name) %>% mutate(Transition=t) | |
super_tib <- bind_rows(super_tib, tib) | |
gene_counts <- add_row(gene_counts, | |
description=sprintf("foreground tran. %d", t), | |
count=length(x_foreground_genes)) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
} | |
write_csv(super_tib, "x_enrichment_atac_meth_concordanct_genes.csv") | |
super_tib <- super_tib %>% | |
mutate(wrapped_ID=wrap_format(20)(str_replace_all(Description, "_", " "))) | |
super_tib_levels <- (super_tib %>% arrange(gene_ratio_eval))$wrapped_ID | |
enrichment_x_plot <- ggplot(super_tib) + | |
geom_point(aes(x=gene_ratio_eval, y=factor(wrapped_ID, levels=super_tib_levels), color=p.adjust, size=Count)) + | |
scale_color_gsea(reverse=T) + | |
labs(x="Gene Ratio", | |
y="", | |
title= "Enrichment Analysis for Concordant Genes in Each Transition - X Chr", | |
size="Gene\nCount") + | |
theme_bw() + | |
base_plot_theme + | |
theme(axis.title.y = element_blank()) + | |
facet_grid(rows=vars(Transition), scales = "free_y", space="free_y", labeller = labeller(.rows=function(n){ #cols=vars(Ontology), | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 5")[as.character(n)]) | |
})) + | |
scale_x_continuous(expand = c(.05,.05)) + | |
enrichment_base_theme + | |
scale_size(range=c(.1,5)) + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
plot.title = element_blank()) | |
ggsave("figs/enrichment/grouped_concordance_df_filtered_x_enrich_alt.svg", enrichment_x_plot, width = 80, height = 120, units = "mm") | |
## With key transition instead of first transition | |
# Whole genome | |
background_genes <- concordance_df$gene | |
for(t in 1:5) { | |
foreground_genes <- filter(concordance_df, pval <= .05 & corr < 0 & expr_key_transition == t)$gene | |
title <- sprintf("Whole Genome Transition %d - %d genes", t, length(foreground_genes)) | |
save_name <- sprintf("figs/enrichment/key_whole_genome_tran_%d.png",t) | |
make_enrichment_plots(background_genes, foreground_genes, title, save_name) | |
} | |
# X Chr genome | |
x_background_genes <- filter(concordance_df, chr == "X")$gene | |
for(t in 1:5) { | |
x_foreground_genes <- filter(concordance_df, chr == "X" & pval <= .05 & corr < 0 & expr_key_transition == t)$gene | |
title <- sprintf("X Chr Transition %d - %d genes", t, length(x_foreground_genes)) | |
save_name <- sprintf("figs/enrichment/key_x_chr_tran_%d.png",t) | |
make_enrichment_plots(x_background_genes, x_foreground_genes, title, save_name) | |
} | |
# Non X Chr genome | |
nonx_background_genes <- filter(concordance_df, !(chr %in% c("X", "Y")))$gene | |
for(t in 1:5) { | |
nonx_foreground_genes <- filter(concordance_df, !(chr %in% c("X", "Y")) & pval <= .05 & corr < 0 & expr_key_transition == t)$gene | |
title <- sprintf("Non-X Chr Transition %d - %d genes", t, length(nonx_foreground_genes)) | |
save_name <- sprintf("figs/enrichment/key_nonx_chr_tran_%d.png",t) | |
make_enrichment_plots(nonx_background_genes, nonx_foreground_genes, title, save_name) | |
} | |
``` | |
## Looking for gene counts in each transition from NCG | |
```{r} | |
cancer_genes_df <- read_csv("building_cancer_db/cosmic_cancer_gene_census.csv.gz") | |
# cancer_genes_df$type <- "unknown" | |
# cancer_genes_df[grepl("oncogene", cancer_genes_df$description), "type"] <- "oncogene" | |
# cancer_genes_df[grepl("tumour suppressor gene", cancer_genes_df$description), "type"] <- "tumour suppressor gene" | |
# grouped_concordance_df_filtered <- read_csv("grouped_concordance_df_filtered.csv.gz") %>% filter(abs(max_tran_log2FC) >= .5) | |
# concordance_df <- read_csv("concordance_df.csv") | |
new_concordance_df <- read_csv("new_concordance_df.csv.gz") | |
cancer_plot_df <- new_concordance_df %>% filter(passes_threshold == T & (atac_concordant | meth_concordant)) #grouped_concordance_df_filtered | |
cancer_plot_df <- left_join(cancer_plot_df, cancer_genes_df | |
%>% dplyr::select(`Gene Symbol`, `Role in Cancer`), | |
by=c("gene"="Gene Symbol")) %>% | |
filter(!is.na(`Role in Cancer`)) | |
cancer_plot_df$cancer_gene_type <- "Other" | |
cancer_plot_df[grepl("oncogene", cancer_plot_df$`Role in Cancer`), "cancer_gene_type"] <- "Oncogene" | |
cancer_plot_df[grepl("TSG", cancer_plot_df$`Role in Cancer`), "cancer_gene_type"] <- "Tumor Suppressor" | |
cancer_plot_df[grepl("oncogene, TSG", cancer_plot_df$`Role in Cancer`), "cancer_gene_type"] <- "Oncogene + Tumor Suppressor" | |
cancer_plot_df$max_tran_label <- paste("Tran. ", cancer_plot_df$key_transition) | |
cancer_plot_df[cancer_plot_df$key_transition %in% c(4,5), "max_tran_label"] <- "Tran. 4/5" | |
cancer_plot_df <- cancer_plot_df %>% filter(chr != "Y") | |
cancer_genes_plot <- ggplot(cancer_plot_df, aes(x=(chr=="X"), y=delta, fill=cancer_gene_type)) + | |
# geom_boxplot(outlier.shape=NA, size=.3) + | |
geom_hline(yintercept = 0, size=.1)+ | |
ggrepel::geom_text_repel( aes(label=gene, color=cancer_gene_type), size=1.2, position = position_dodge(width=.7), segment.size=.1) + | |
geom_point(pch = 21, position = position_dodge(width=.7), size=1.1, stroke=.1, alpha=.7) + | |
# geom_jitter(size=.7) + | |
facet_grid(cols = vars(max_tran_label)) + | |
scale_fill_manual(values = pal_simpsons()(5)[c(2,4,5,3)], | |
limits=c("Oncogene", "Oncogene + Tumor Suppressor", "Tumor Suppressor", "Other")) + | |
scale_color_manual(values = pal_simpsons()(5)[c(2,4,5,3)], | |
limits=c("Oncogene", "Oncogene + Tumor Suppressor", "Tumor Suppressor", "Other"), | |
guide=F) + | |
scale_x_discrete(breaks=c(F,T), labels=c("Autosomal", "X"), name="Chromosome") + | |
scale_shape(guide=F) + | |
labs(title="Cancer Genes Perturbed in Each Transition", | |
y="log2(Fold Change)", | |
fill="Role in Cancer") + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text.y = element_text(size=5, margin = margin(0.5,0,0.5,0, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(0.7,0.5,0.7,0.5, "mm")), | |
strip.background = element_rect(size = .3)) | |
ggsave("figs/main/fig_4_cancer_genes.pdf", cancer_genes_plot, width = 116, height=53.27, units = "mm") | |
write_csv(cancer_plot_df %>% select(gene, chr, max_tran_log2FC,max_tran_label, cancer_gene_type), "cancer_plot_df.csv") | |
nonx_background_genes <- filter(concordance_df, !(chr %in% c("X", "Y")))$gene | |
nonx_gene_counts <- tibble(description=character(), | |
total=numeric(), | |
oncogenes=numeric(), | |
tumor_suppressors=numeric(), | |
other_cancer_genes=numeric()) | |
nonx_gene_counts <- add_row(nonx_gene_counts, | |
description="background", | |
total=length(nonx_background_genes), | |
oncogenes=nrow(cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_background_genes)), | |
tumor_suppressors=nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_background_genes)), | |
other_cancer_genes=nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_background_genes))) | |
for(t in 1:5) { | |
nonx_foreground_genes_up <- filter(grouped_concordance_df_filtered, !(chr %in% c("X", "Y")) & max_transition == t & max_tran_log2FC >= 0)$gene | |
nonx_foreground_genes_down <- filter(grouped_concordance_df_filtered, !(chr %in% c("X", "Y")) & max_transition == t & max_tran_log2FC < 0)$gene | |
nonx_gene_counts <- add_row(nonx_gene_counts, | |
description=sprintf("Transition %d", t), | |
total=length(nonx_foreground_genes), | |
oncogenes=nrow(cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_foreground_genes_up)), | |
tumor_suppressors=nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_foreground_genes_up)), | |
other_cancer_genes=nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_foreground_genes_up))) | |
nonx_gene_counts <- add_row(nonx_gene_counts, | |
description=sprintf("Transition %d", t), | |
total=length(nonx_foreground_genes), | |
oncogenes=-1*nrow(cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_foreground_genes_down)), | |
tumor_suppressors=-1*nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_foreground_genes_down)), | |
other_cancer_genes=-1*nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% nonx_foreground_genes_down))) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
} | |
nonx_gene_counts <- nonx_gene_counts %>% mutate(all_cancer=oncogenes + tumor_suppressors + other_cancer_genes) | |
nonx_gene_counts$chrom <- "nonX" | |
x_background_genes <- filter(concordance_df, (chr %in% c("X")))$gene | |
x_gene_counts <- tibble(description=character(), | |
total=numeric(), | |
oncogenes=numeric(), | |
tumor_suppressors=numeric(), | |
other_cancer_genes=numeric()) | |
x_gene_counts <- add_row(x_gene_counts, | |
description="background", | |
total=length(x_background_genes), | |
oncogenes=nrow((cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% x_background_genes))), | |
tumor_suppressors=nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) & | |
`Gene Symbol` %in% x_background_genes)), | |
other_cancer_genes=nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% x_background_genes))) | |
for(t in 1:5) { | |
x_foreground_genes_up <- filter(grouped_concordance_df_filtered, (chr %in% c("X")) & max_transition == t & max_tran_log2FC >= 0)$gene | |
x_foreground_genes_down <- filter(grouped_concordance_df_filtered, (chr %in% c("X")) & max_transition == t & max_tran_log2FC < 0)$gene | |
x_gene_counts <- add_row(x_gene_counts, | |
description=sprintf("Transition %d", t), | |
total=length(x_foreground_genes), | |
oncogenes=nrow((cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% x_foreground_genes_up))), | |
tumor_suppressors=nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) & | |
`Gene Symbol` %in% x_foreground_genes_up)), | |
other_cancer_genes=nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% x_foreground_genes_up))) | |
x_gene_counts <- add_row(x_gene_counts, | |
description=sprintf("Transition %d", t), | |
total=length(x_foreground_genes), | |
oncogenes=-1*nrow((cancer_genes_df %>% filter(grepl("oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% x_foreground_genes_down))), | |
tumor_suppressors=-1*nrow(cancer_genes_df %>% filter(grepl("TSG", `Role in Cancer`) & | |
`Gene Symbol` %in% x_foreground_genes_down)), | |
other_cancer_genes=-1*nrow(cancer_genes_df %>% filter(!grepl("TSG|oncogene", `Role in Cancer`) & | |
`Gene Symbol` %in% x_foreground_genes_down))) | |
# ggsave(save_name, plt, width = 87, height=59, units = "mm") | |
} | |
x_gene_counts <- x_gene_counts %>% mutate(all_cancer=oncogenes + tumor_suppressors + other_cancer_genes) | |
x_gene_counts$chrom <- "X" | |
all_counts <- reshape2::melt(nonx_gene_counts %>% select(description, all_cancer, tumor_suppressors,oncogenes,other_cancer_genes, chrom), id.vars=c("description", "chrom")) | |
all_counts <- full_join(all_counts, | |
reshape2::melt(x_gene_counts %>% select(description, all_cancer, tumor_suppressors,oncogenes,other_cancer_genes, chrom), id.vars=c("description", "chrom")), | |
by=c("description", "chrom", "variable", "value")) | |
cancer_gene_num_plot <- ggplot(all_counts %>% filter(description != "background" & variable != "all_cancer"), aes(x=chrom, y=value, fill=factor(variable, levels = c("oncogenes", "tumor_suppressors", "other_cancer_genes")))) + | |
geom_bar(stat="identity", position = "stack") + | |
facet_grid(cols = vars(description)) + | |
scale_fill_manual(values = pal_rickandmorty()(5)[3:5], | |
limits=c("oncogenes", "tumor_suppressors", "other_cancer_genes"), | |
labels=c("Oncogene", "Tumor Suppressor", "Other")) + | |
geom_hline(yintercept = 0, size=.1)+ | |
theme_bw() + | |
base_plot_theme + | |
labs(title="Number of Cancer Genes Perturbed in Each Transition", | |
y="Number of Cancer Genes", | |
x="Chromosome", | |
fill="Type") + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3)) | |
ggsave("figs/main/fig_4_num_cancer_genes.svg", cancer_gene_num_plot, width = 87, height=47, units = "mm") | |
``` | |
## Getting sequences to look for TF binding enrichment (ELK-1) | |
```{r} | |
library(tidyverse) | |
concordance_df <- read_csv("concordance_df.csv") | |
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv") | |
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv", col_types = cols(chromosome_name="c")) | |
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id) %>% | |
mutate(chrom=paste("chr", chromosome_name, sep="")) | |
write_sequence_bed <- function(gene_names, start_buff=-600, end_buff=400) { | |
structure_subset_df <- hg38_biomart_structure_df_canonical %>% | |
filter(external_gene_name %in% gene_names) %>% | |
mutate(seq_start=transcription_start_site-(strand * start_buff), | |
seq_end=transcription_start_site+(strand * end_buff), | |
chromosome_name=sprintf("chr%s", chromosome_name)) %>% | |
select(external_gene_name, chromosome_name, transcription_start_site, strand, seq_start, seq_end) %>% | |
group_by(external_gene_name) %>% | |
filter(row_number()==1) %>% | |
ungroup() | |
return(structure_subset_df) | |
} | |
bg_genes <- (concordance_df %>% filter(!(chr %in% c("X", "Y"))))$gene | |
bg_bed <- write_sequence_bed(bg_genes) | |
write_tsv(bg_bed %>% select(chromosome_name, seq_start, seq_end), "meme-suite/autosomal_bg.bed", col_names = F) | |
``` | |
## Plotting methylation and expression changes along the genome | |
```{r} | |
library(tidyverse) | |
library(ggplot2) | |
all_dmp_df <- read_csv("all_dmp_df.csv") | |
chrom_size <- function(chr) { | |
return(as.numeric(list( | |
"chr1" = 248956422, | |
"chr2" = 242193529, | |
"chr3" = 198295559, | |
"chr4" = 190214555, | |
"chr5" = 181538259, | |
"chr6" = 170805979, | |
"chr7" = 159345973, | |
"chr8" = 145138636, | |
"chr9" = 138394717, | |
"chr10" = 133797422, | |
"chr11" = 135086622, | |
"chr12" = 133275309, | |
"chr13" = 114364328, | |
"chr14" = 107043718, | |
"chr15" = 101991189, | |
"chr16" = 90338345, | |
"chr17" = 83257441, | |
"chr18" = 80373285, | |
"chr19" = 58617616, | |
"chr20" = 64444167, | |
"chr21" = 46709983, | |
"chr22" = 50818468, | |
"chrX" = 156040895, | |
"chrY" = 57227415 | |
)[chr])) | |
} | |
window_average_genome <- function(all_dmp_df, window_size = 1000000, step_size = 10000, include_empty=FALSE) { | |
# bed_file_df <- bed_df | |
windowed_bed_df <- tibble(chr = character(), | |
pos_start = numeric(), | |
pos_end = numeric(), | |
delta_1 = numeric(), | |
delta_2 = numeric(), | |
delta_3 = numeric(), | |
delta_4 = numeric(), | |
delta_5 = numeric()) | |
for (chr in c("chr1", "chr2", "chr3", "chr4", "chr5", | |
"chr6", "chr7", "chr8", "chr9", | |
"chr10", "chr11", "chr12", "chr13", "chr14", | |
"chr15", "chr16", "chr17", "chr18", "chr19", | |
"chr20", "chr21", "chr22", "chrX", "chrY")) { | |
print(chr) | |
chr_size <- chrom_size(chr) | |
for (s in 0:floor(chr_size / step_size)) { | |
start_position <- s * step_size | |
end_position <- min((start_position + window_size), chr_size) | |
# print(chr) | |
# print(s) | |
num_rows <- nrow(all_dmp_df %>% filter((hg38_chr == chr) & (hg38_pos >= start_position) & (hg38_pos <= end_position))) # (abs((!!start_position) - pos) <= (window_size/2)))) | |
if ((num_rows > 0) | (include_empty == T)) { | |
window_mean_delta <- all_dmp_df %>% | |
filter((hg38_chr == chr) & (abs(!!start_position - hg38_pos) <= (window_size/2))) %>% | |
summarise(chr=first(hg38_chr), | |
pos_start=start_position, | |
pos_end=end_position, | |
mean_tran_1_delta=mean(cluster_1_mean-cluster_0_mean), | |
mean_tran_2_delta=mean(cluster_2_mean-cluster_0_mean), | |
mean_tran_3_delta=mean(cluster_3_mean-cluster_0_mean), | |
mean_tran_4_delta=mean(cluster_4_mean-cluster_0_mean), | |
mean_tran_5_delta=mean(cluster_5_mean-cluster_0_mean)) | |
window_mean_delta[is.na(window_mean_delta)] <- 0 | |
write_csv(window_mean_delta, "windowed_all_chr_deltas.csv", col_names = F, append = T) | |
# windowed_bed_df <- add_row(windowed_bed_df, | |
# chr=chr, | |
# pos_start=start_position, | |
# pos_end=end_position, | |
# delta_1=as.numeric(window_mean_delta$mean_tran_1_delta), | |
# delta_2=as.numeric(window_mean_delta$mean_tran_2_delta), | |
# delta_3=as.numeric(window_mean_delta$mean_tran_3_delta), | |
# delta_4=as.numeric(window_mean_delta$mean_tran_4_delta), | |
# delta_5=as.numeric(window_mean_delta$mean_tran_5_delta)) | |
} | |
} | |
} | |
windowed_bed_df <- read_csv("windowed_all_chr_deltas.csv", col_names = c("chr", | |
"pos_start", | |
"pos_end", | |
"mean_tran_1_delta", | |
"mean_tran_2_delta", | |
"mean_tran_3_delta", | |
"mean_tran_4_delta", | |
"mean_tran_5_delta")) | |
return(windowed_bed_df) | |
} | |
whole_genome_cumulative <- window_average_genome(all_dmp_df %>% filter(passes_threshold == T)) | |
whole_genome_plot_df <- whole_genome_cumulative %>% filter(chr != "0") %>% | |
reshape2::melt(id.vars=c("chr", "pos_start", "pos_end")) | |
library(ggridges) | |
ggplot(whole_genome_plot_df %>% filter(chr != "chrY"), aes(x=pos_start, y=0, group=variable, height=value, fill=variable, color=variable)) + | |
geom_ridgeline(min_height=-1, alpha=.2) + | |
facet_grid(rows = vars(factor(chr, levels = c("chr1", "chr2", "chr3", "chr4", "chr5", | |
"chr6", "chr7", "chr8", "chr9", | |
"chr10", "chr11", "chr12", "chr13", "chr14", | |
"chr15", "chr16", "chr17", "chr18", "chr19", | |
"chr20", "chr21", "chr22", "chrX", "chrY")))) + | |
scale_fill_viridis_d() + | |
scale_color_viridis_d() | |
ggplot(all_dmp_df %>% mutate(tran_1_diff=cluster_1_mean-cluster_0_mean), aes(x=hg38_pos, y=tran_1_diff)) + | |
geom_area(mapping=aes(fill=tran_1_diff < 0))+ | |
facet_wrap(vars(hg38_chr), shrink = F) + | |
scale_y_continuous(limits=c(-.5, .5)) | |
ggplot(all_dmp_df %>% mutate(tran_5_diff=cluster_5_mean-cluster_0_mean), aes(x=hg38_pos, y=tran_5_diff)) + | |
geom_area(mapping=aes(fill=tran_5_diff < 0))+ | |
facet_wrap(vars(hg38_chr), shrink = F)+ | |
scale_y_continuous(limits=c(-.5, .5)) | |
global_meth_df <- all_dmp_df %>% group_by(hg38_chr) %>% summarise(mean_0=mean(cluster_0_mean), | |
mean_1=mean(cluster_1_mean), | |
mean_2=mean(cluster_2_mean), | |
mean_3=mean(cluster_3_mean), | |
mean_4=mean(cluster_4_mean), | |
mean_5=mean(cluster_5_mean)) | |
global_meth_df <- reshape2::melt(global_meth_df, id.vars=c("hg38_chr")) | |
global_meth_df <- reshape2::melt(all_dmp_df, id.vars=c("hg38_chr", "hg38_pos", "rowname"), measure.vars=c("cluster_0_mean", | |
"cluster_1_mean", | |
"cluster_2_mean", | |
"cluster_3_mean", | |
"cluster_4_mean", | |
"cluster_5_mean")) | |
library(ggsignif) | |
ggplot(global_meth_df %>% filter(hg38_chr != "chrY"), aes(x=variable, y=value)) + | |
facet_wrap(vars(hg38_chr))+ | |
geom_violin() + | |
theme_bw() + | |
scale_y_continuous(limits=c(0,1.2)) | |
scale_fill_uchicago() | |
geom_signif(comparisons = list(c("cluster_0_mean","cluster_1_mean"), | |
c("cluster_1_mean","cluster_2_mean"), | |
c("cluster_2_mean","cluster_3_mean"), | |
c("cluster_3_mean","cluster_4_mean"), | |
c("cluster_4_mean","cluster_5_mean")), map_signif_level = T) | |
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv') | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
statistics_combined_df <- left_join(statistics, filtered_site_annotation_df %>% dplyr::select(rowname,hg38_chromosome, hg38_gene_name, hg38_gene_group), by=c("Position"="rowname")) | |
interesting_probes <- (statistics_combined_df %>% filter(alpha <= 0.05 & femaleVariance > maleVariance))$Position | |
ggplot(global_meth_df %>% filter(rowname %in% interesting_probes), aes(x=variable, y=value)) + | |
facet_wrap(vars(hg38_chr))+ | |
geom_boxplot() + | |
theme_bw() + | |
scale_y_continuous(limits=c(0,1.2)) | |
``` | |
## concordance score with expression changes | |
```{r} | |
diff_expr_all_df <- read_csv("diff_expr_all_df.csv") | |
gened_direction_match <- inner_join(gened_dmp_df, diff_expr_all_df, by=c("hg38_gene"="gene")) %>% | |
mutate(tran_1_direction_match=(tran_1_mean_delta*transition_1_log2FC) < 0, | |
tran_2_direction_match=(tran_2_mean_delta*transition_2_log2FC) < 0, | |
tran_3_direction_match=(tran_3_mean_delta*transition_3_log2FC) < 0, | |
tran_4_direction_match=(tran_4_mean_delta*transition_4_log2FC) < 0, | |
tran_5_direction_match=(tran_5_mean_delta*transition_5_log2FC) < 0) | |
gened_direction_match$concordant <- F | |
for (row in 1:nrow(gened_direction_match)) { | |
direction_col <- sprintf("tran_%d_direction_match", as.numeric(unlist(gened_direction_match[row, "key_transition.x"]))) | |
gened_direction_match[row, "concordant"] <- gened_direction_match[row, direction_col] & as.numeric(unlist(gened_direction_match[row, "key_mean_qval"])) <= 0.001 | |
} | |
``` | |
## Exploring global demethylation with DUSP9 demethylation | |
```{r} | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") | |
male_df_no_outliers <- read_csv("male_df_no_outliers.csv") | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
dusp9_probes <- (filtered_site_annotation_df %>% filter(hg38_gene_name == "DUSP9" & grepl("TSS1500|TSS200|5_UTR", hg38_gene_group)))$rowname | |
autosome_probes <- (filtered_site_annotation_df %>% filter(!(hg38_chromosome %in% c("chrX", "chrY"))))$rowname | |
mean_genome_methylation_female <- tidy(colMeans(dplyr::select(female_df_no_h9 %>% filter(rn %in% autosome_probes), -rn))) | |
mean_dusp9_methylation_female <- tidy(colMeans(dplyr::select(female_df_no_h9 %>% filter(rn %in% dusp9_probes), -rn))) | |
dusp9_v_genome_female <- full_join(mean_genome_methylation_female %>% rename(genome_methylation=x), | |
mean_dusp9_methylation_female %>% rename(dusp9_methylation=x), by="names") | |
dusp9_v_genome_female$sex <- "female" | |
mean_genome_methylation_male <- tidy(colMeans(dplyr::select(male_df_no_outliers %>% filter(rn %in% autosome_probes), -rn))) | |
mean_dusp9_methylation_male <- tidy(colMeans(dplyr::select(male_df_no_outliers %>% filter(rn %in% dusp9_probes), -rn))) | |
dusp9_v_genome_male <- full_join(mean_genome_methylation_male %>% rename(genome_methylation=x), | |
mean_dusp9_methylation_male %>% rename(dusp9_methylation=x), by="names") | |
dusp9_v_genome_male$sex <- "male" | |
dusp9_v_genome_combined <- full_join(dusp9_v_genome_female, dusp9_v_genome_male %>% mutate(dusp9_methylation= dusp9_methylation/ 2), by=c("names", "genome_methylation", "dusp9_methylation", "sex")) | |
ggplot(dusp9_v_genome_combined, aes(x=dusp9_methylation, y=genome_methylation, color=sex)) + | |
geom_point() + | |
theme_bw() + | |
labs(title="Genome Methylation vs DUSP9 Methylation") + | |
scale_color_npg() | |
ggplot(dusp9_v_genome_combined %>% filter(dusp9_methylation < .2)) + | |
geom_boxplot(aes(x=sex, y=genome_methylation, fill=sex)) + | |
scale_fill_npg() + | |
labs(title="Genome Methylation when DUSP9 methylation < .4") | |
update_geom_defaults("boxplot", | |
list(size=.3, | |
outlier.shape=NA)) | |
update_geom_defaults("point", | |
list(shape=20, | |
size=.5)) | |
dusp9_expression <- plot_gene_expression("DUSP9") + | |
scale_y_continuous() + | |
theme_bw() + | |
base_plot_theme | |
elk1_expression <- plot_gene_expression("ELK1") + | |
scale_y_continuous() + | |
theme_bw() + | |
base_plot_theme | |
dusp9_elk1_expression <- plot_gene_expression(c("DUSP9", "ELK1"), multiple=T) + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text.x = element_text(size=5, margin = margin(.5,0,.5,0, "mm")), | |
strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm"))) | |
dusp9_elk1_methylation <- make_subset_heatmap(c("DUSP9", "ELK1")) + | |
theme(strip.text.x = element_text(size=5, margin = margin(.5,0,.5,0, "mm")), | |
strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")), | |
legend.key.height = unit(3, "mm"), | |
legend.title = element_text(size=5,vjust = .9), | |
legend.text = element_text(size=4), | |
legend.box.margin = margin(t=4,b=-4, unit="mm"), | |
plot.margin = margin(t=-4, unit = "mm"), | |
axis.title.x = element_text(size=5), | |
axis.title.y = element_text(size = 0, margin = margin(0,0,0,0))) | |
arranged_dusp9_elk1 <- grid.arrange(dusp9_elk1_methylation, | |
dusp9_expression, | |
elk1_expression, | |
layout_matrix=rbind(c(1,2), c(1,3)), nrow=2) | |
ggsave("figs/supplementary/supp_4_dusp9_elk1_meth.pdf", dusp9_elk1_methylation, width = 87, height=58, units = "mm") | |
ggsave("figs/supplementary/supp_4_dusp9_elk1_expr.pdf", dusp9_elk1_expression, width = 87, height=58, units = "mm") | |
ggsave("figs/main/fig_4_dusp9_elk1_araf_meth.svg", dusp9_elk1_methylation, width = 76, height=58, units = "mm") | |
ggsave("figs/main/fig_4_dusp9_elk1_araf_expr.svg", dusp9_elk1_expression, width = 40, height=58, units = "mm") | |
ggsave("figs/main/fig_4_dusp9_expr.pdf", dusp9_expression, width = 40, height=48, units = "mm") | |
ggsave("figs/main/fig_4_elk1_expr.pdf", elk1_expression, width = 40, height=29, units = "mm") | |
dusp9_methylation <- make_subset_heatmap(c("DUSP9")) + | |
theme(strip.text.x = element_text(size=5, margin = margin(.5,0,.5,0, "mm")), | |
strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")), | |
legend.key.height = unit(3, "mm"), | |
legend.title = element_text(size=5,vjust = .9), | |
legend.text = element_text(size=4), | |
legend.box.margin = margin(t=4,b=-4, unit="mm"), | |
plot.margin = margin(t=-4, unit = "mm"), | |
axis.title.x = element_text(size=5), | |
axis.title.y = element_text(size = 0, margin = margin(0,0,0,0))) | |
ggsave("figs/main/fig_4_dusp9_meth.pdf", dusp9_methylation, width = 76, height=48, units = "mm") | |
ggsave("figs/main/fig_4_dusp9_expr.pdf", dusp9_expression, width = 40, height=48, units = "mm") | |
elk1_methylation <- make_subset_heatmap(c("ELK1")) + | |
theme(strip.text.x = element_text(size=5, margin = margin(.5,0,.5,0, "mm")), | |
strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")), | |
legend.key.height = unit(3, "mm"), | |
legend.title = element_text(size=5,vjust = .9), | |
legend.text = element_text(size=4), | |
legend.box.margin = margin(t=4,b=-4, unit="mm"), | |
plot.margin = margin(t=-4, unit = "mm"), | |
axis.title.x = element_text(size=5), | |
axis.title.y = element_text(size = 0, margin = margin(0,0,0,0))) | |
ggsave("figs/supplementary/supp_4_elk1_meth.pdf", elk1_methylation, width = 87, height=48, units = "mm") | |
ggsave("figs/supplementary/supp_4_elk1_expr.pdf", elk1_expression, width = 87, height=48, units = "mm") | |
``` | |
## Global methylation changes across the chromosomes | |
```{r} | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
plotting_df <- all_dmp_df %>% filter(passes_threshold == T)# & abs(delta) >= .15) | |
plotting_df$chr_alt <- gsub("chr", "", plotting_df$hg38_chr) | |
plotting_df$chr_alt <- gsub("X", "23", plotting_df$chr_alt) | |
plotting_df$chr_alt <- as.numeric(plotting_df$chr_alt) | |
chromosomes_df <- tibble(chromosome=c(1, 2, 3, 4, 5, 6, 7, 23, 8, | |
9, 11, 10, 12, 13, 14, 15, | |
16, 17, 18, 20, 19, 24, 22, 21), | |
size=c(248956422, 242193529, 198295559, 190214555, | |
181538259, 170805979, 159345973, 156040895, | |
145138636, 138394717, 135086622, 133797422, | |
133275309, 114364328, 107043718, 101991189, | |
90338345, 83257441, 80373285, 64444167, 58617616, | |
57227415, 50818468, 46709983)) %>% filter(chromosome != 24) | |
#alpha=abs(delta) | |
# tran_1_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 1)) + | |
# geom_density(aes(x=abs(delta)), fill="black") + | |
# theme_bw() + | |
# base_plot_theme + | |
# scale_y_continuous(breaks = c(0,5,10)) + | |
# scale_x_continuous(breaks = c(0,.2,.4)) + | |
# labs(title="Autosomal β Density") + | |
# theme(axis.ticks.length = unit(.4, "mm"), | |
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"), | |
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")), | |
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")), | |
# plot.title = element_text(size=3.5)) | |
# ggsave("figs/supplementary/supp_3_tran_1_density.pdf", tran_1_density, width = 18, height=12, units = "mm") | |
# tran_2_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 2)) + | |
# geom_density(aes(x=abs(delta)), fill="black") + | |
# theme_bw() + | |
# base_plot_theme + | |
# scale_y_continuous(breaks = c(0,5,10)) + | |
# scale_x_continuous(breaks = c(0,.2,.4)) + | |
# labs(title="Autosomal β Density") + | |
# theme(axis.ticks.length = unit(.4, "mm"), | |
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"), | |
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")), | |
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")), | |
# plot.title = element_text(size=3.5)) | |
# ggsave("figs/supplementary/supp_3_tran_2_density.pdf", tran_2_density, width = 18, height=12, units = "mm") | |
# tran_3_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 3)) + | |
# geom_density(aes(x=abs(delta)), fill="black") + | |
# theme_bw() + | |
# base_plot_theme + | |
# scale_y_continuous(breaks = c(0,7,15)) + | |
# scale_x_continuous(breaks = c(0,.2,.4)) + | |
# labs(title="Autosomal β Density") + | |
# theme(axis.ticks.length = unit(.4, "mm"), | |
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"), | |
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")), | |
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")), | |
# plot.title = element_text(size=3.5)) | |
# ggsave("figs/supplementary/supp_3_tran_3_density.pdf", tran_3_density, width = 18, height=12, units = "mm") | |
# tran_4_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 4)) + | |
# geom_density(aes(x=abs(delta)), fill="black") + | |
# theme_bw() + | |
# base_plot_theme + | |
# scale_y_continuous(breaks = c(0,7,15)) + | |
# scale_x_continuous(breaks = c(0,.2,.4)) + | |
# labs(title="Autosomal β Density") + | |
# theme(axis.ticks.length = unit(.4, "mm"), | |
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"), | |
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")), | |
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")), | |
# plot.title = element_text(size=3.5)) | |
# ggsave("figs/supplementary/supp_3_tran_4_density.pdf", tran_4_density, width = 18, height=12, units = "mm") | |
# tran_5_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 5)) + | |
# geom_density(aes(x=abs(delta)), fill="black") + | |
# theme_bw() + | |
# base_plot_theme + | |
# scale_y_continuous(breaks = c(0,10,20)) + | |
# scale_x_continuous(breaks = c(0,.2,.4, .6)) + | |
# labs(title="Autosomal β Density") + | |
# theme(axis.ticks.length = unit(.4, "mm"), | |
# plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"), | |
# axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")), | |
# axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")), | |
# plot.title = element_text(size=3.5)) | |
# ggsave("figs/supplementary/supp_3_tran_5_density.pdf", tran_5_density, width = 18, height=12, units = "mm") | |
global_meth_changes <- ggplot(plotting_df) + | |
geom_rect(data=chromosomes_df, | |
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
geom_segment(aes(y=hg38_pos, x=chr_alt-.4, yend=hg38_pos, xend=chr_alt+.4, color=delta_increasing, alpha=abs(delta)), size=.3) + | |
facet_grid(rows = vars(key_transition), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 5")[n]) | |
})) + | |
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(.01,.01)) + | |
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) + | |
scale_color_aaas(limits=c(F,T),labels=c("Decreasing", "Increasing"), name="Direction of Change") + | |
labs(x="Chromosome", | |
y="Position (Mb)", | |
title="DNAme Changes on all chromosomes") + | |
theme_bw() + | |
coord_flip() + | |
base_plot_theme + | |
scale_alpha(limits=c(0,.5), range=c(0,.3), oob=scales::squish, breaks=c(0,.25,.5)) + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.grid.major.y = element_blank(), | |
panel.grid.minor.y = element_blank(), | |
legend.box="vertical", | |
legend.box.just="right", | |
legend.box.spacing = unit(5, "mm"), | |
legend.spacing.y=unit(1, "mm")) | |
ggsave("figs/supplementary/supp_3a.pdf",global_meth_changes, width=174/3, height=236, units="mm") | |
global_meth_changes_tran45 <- ggplot(plotting_df %>% filter(key_transition >= 4) %>% mutate(key_transition = 5)) + | |
geom_rect(data=chromosomes_df, | |
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
geom_segment(aes(y=hg38_pos, x=chr_alt-.4, yend=hg38_pos, xend=chr_alt+.4, color=delta_increasing, alpha=abs(delta)), size=.3) + | |
facet_grid(rows = vars(key_transition), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4/5", | |
"5"="Tran. 4/5")[n]) | |
})) + | |
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(0.01,0.01)) + | |
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) + | |
coord_flip() + | |
scale_color_aaas(limits=c(F,T),labels=c("Decreasing", "Increasing"), name="Direction of Change") + | |
labs(x="Chromosome", | |
y="Position (Mb)", | |
title="DNAme Changes") + | |
theme_bw() + | |
base_plot_theme + | |
scale_alpha(limits=c(0,.5), range=c(0,.3), oob=scales::squish, breaks=c(0,.25,.5)) + | |
theme(strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.grid.major.y = element_blank(), | |
panel.grid.minor.y = element_blank(), | |
legend.box="vertical", | |
legend.box.just="right", | |
legend.box.spacing = unit(3.5, "mm"), | |
legend.spacing.y=unit(.2, "mm"), | |
axis.text.y=element_text(size=5)) | |
ggsave("figs/main/fig_4_global_changes_meth.pdf", global_meth_changes_tran45, width = 58, height=47, units = "mm") | |
diff_expr_all_df <- read_csv("diff_expr_all_df.csv.gz") | |
grouped_concordance_df_filtered <- read_csv("grouped_concordance_df_filtered.csv.gz") | |
expr_plotting_df <- grouped_concordance_df_filtered # diff_expr_all_df #%>% filter(passes_threshold == T) | |
expr_plotting_df$chr_alt <- gsub("X", "23", expr_plotting_df$chr) | |
expr_plotting_df$chr_alt <- as.numeric(expr_plotting_df$chr_alt) | |
# expr_plotting_df_melt <- expr_plotting_df %>% | |
# select("chr_alt", | |
# "ens_id", | |
# "gene", | |
# "start_position", | |
# "end_position", | |
# "transition_1_log2FC", | |
# "transition_2_log2FC", | |
# "transition_3_log2FC", | |
# "transition_4_log2FC") %>% | |
# reshape2::melt(id.vars=c("chr_alt", | |
# "ens_id", | |
# "gene", | |
# "start_position", | |
# "end_position")) | |
chromosomes_df <- tibble(chromosome=c(1, 2, 3, 4, 5, 6, 7, 23, 8, | |
9, 11, 10, 12, 13, 14, 15, | |
16, 17, 18, 20, 19, 24, 22, 21), | |
size=c(248956422, 242193529, 198295559, 190214555, | |
181538259, 170805979, 159345973, 156040895, | |
145138636, 138394717, 135086622, 133797422, | |
133275309, 114364328, 107043718, 101991189, | |
90338345, 83257441, 80373285, 64444167, 58617616, | |
57227415, 50818468, 46709983)) %>% filter(chromosome != 24) | |
global_expr_changes <- ggplot(expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5)) + | |
geom_rect(data=chromosomes_df, | |
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=max_tran_log2FC >= 0), size=.3) + | |
facet_grid(rows = vars(max_transition), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 5")[n]) | |
})) + | |
scale_x_continuous(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(.01,.01)) + | |
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) + | |
scale_color_aaas(limits=c(T,F),labels=c("Increasing", "Decreasing"), name="Direction of Expression Change") + | |
labs(x="Chromosome", | |
y="Position (Mb)", | |
title="Expression/Methylation Concordant Changes on all chromosomes") + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
legend.box.spacing = unit(5, "mm"), | |
legend.spacing.y=unit(1, "mm")) | |
ggsave("figs/supplementary/supp_3b.pdf", global_expr_changes, width=85, height=236, units="mm") | |
expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5) %>% | |
count(max_transition, max_tran_log2FC >= 0, chr == "X") %>% | |
arrange(max_transition, desc(`max_tran_log2FC >= 0`)) %>% | |
View() | |
global_expr_changes_tran45 <- ggplot(expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5 & max_transition >= 4) %>% mutate(max_transition = 5)) + | |
geom_rect(data=chromosomes_df, | |
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=max_tran_log2FC >= 0), size=.3) + | |
facet_grid(rows = vars(max_transition), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4/5", | |
"5"="Tran. 4/5")[n]) | |
})) + | |
scale_x_continuous(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL) + | |
scale_y_continuous(expand=c(0,0)) + | |
scale_color_aaas(limits=c(T,F),labels=c("Increasing", "Decreasing"), name="Direction of Expression Change") + | |
labs(x="Chromosome", | |
y="Position (Mb)", | |
title="Expression/Methylation Concordant Changes on all chromosomes") + | |
theme_bw() + | |
base_plot_theme + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3)) | |
ggsave("figs/main/fig_4_global_changes_expr.svg", global_expr_changes_tran45, width = 58, height=47, units = "mm") | |
expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5 & max_transition >= 4) %>% mutate(max_transition = 5) %>% | |
count(max_transition, max_tran_log2FC >= 0, chr == "X") %>% | |
View() | |
# global_expr_changes_tran5 <- ggplot(expr_plotting_df %>% filter(abs(max_tran_log2FC) >= .5 & max_transition == 5)) + | |
# geom_rect(data=chromosomes_df, | |
# aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
# geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=max_tran_log2FC >= 0), size=.3) + | |
# facet_grid(rows = vars(max_transition), labeller = labeller(.rows=function(n){ | |
# return(list("1"="Tran. 1", | |
# "2"="Tran. 2", | |
# "3"="Tran. 3", | |
# "4"="Tran. 4", | |
# "5"="Tran. 5")[n]) | |
# })) + | |
# scale_x_continuous(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL) + | |
# scale_y_continuous(expand=c(0,0)) + | |
# scale_color_aaas(limits=c(T,F),labels=c("Increasing", "Decreasing"), name="Direction of Expression Change") + | |
# labs(x="Chromosome", | |
# y="Position (Mb)", | |
# title="Expression/Methylation Concordant Changes on all chromosomes") + | |
# theme_bw() + | |
# base_plot_theme + | |
# theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
# strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
# strip.background = element_rect(size = .3)) | |
# ggsave("figs/main/fig_4_global_changes.svg", global_expr_changes_tran5, width = 87, height=47, units = "mm") | |
# global_expr_changes_alt <- ggplot(expr_plotting_df_melt %>% filter(abs(value) >= 1)) + | |
# geom_rect(data=chromosomes_df, | |
# aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
# geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=value>0), alpha=.3) + | |
# facet_grid(rows = vars(variable)) + | |
# scale_x_continuous(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL) + | |
# scale_y_continuous(expand=c(0,0)) + | |
# scale_color_aaas(limits=c(T,F),labels=c("Increasing", "Decreasing"), name="Direction of Change") + | |
# labs(x="Chromosome", | |
# y="Position (Mb)", | |
# title="Expression Changes on all chromosomes") + | |
# theme_bw() + | |
# base_plot_theme | |
``` | |
## Grouping methylation by gene | |
```{r} | |
all_dmp_df <- read_csv("all_dmp_df.csv") | |
concordance_df <- read_csv("concordance_df.csv") | |
diff_expr_all_df <- read_csv("diff_expr_all_df.csv") | |
library(tidyr) | |
grouped_df <- all_dmp_df %>% group_by(hg38_gene) %>% | |
summarise(hg38_chr=first(hg38_chr), | |
all_probes=length(key_transition), | |
transition_1_probes=sum(key_transition == 1 & passes_threshold == T), | |
transition_1_delta=sum(key_transition == 1 & passes_threshold == T), | |
transition_2_probes=sum(key_transition == 2 & passes_threshold == T), | |
transition_3_probes=sum(key_transition == 3 & passes_threshold == T), | |
transition_4_probes=sum(key_transition == 4 & passes_threshold == T), | |
transition_5_probes=sum(key_transition == 5 & passes_threshold == T)) | |
gene_dmp_df <- all_dmp_df %>% | |
group_by(hg38_gene) %>% | |
summarise(cluster_0_mean_meth=mean(cluster_0_mean), | |
cluster_1_mean_meth=mean(cluster_1_mean), | |
cluster_2_mean_meth=mean(cluster_2_mean), | |
cluster_3_mean_meth=mean(cluster_3_mean), | |
cluster_4_mean_meth=mean(cluster_4_mean), | |
cluster_5_mean_meth=mean(cluster_5_mean)) | |
grouped_df_test <- left_join(grouped_df, gene_dmp_df, by=c("hg38_gene")) | |
grouped_df_promoter <- all_dmp_df %>% group_by(hg38_gene) %>% | |
filter(hg38_refgene_group %in% c("TSS1500", "TSS200", "5_UTR")) %>% | |
summarise(hg38_chr=first(hg38_chr), | |
all_probes=length(key_transition), | |
transition_1_probes=sum(key_transition == 1 & passes_threshold == T), | |
transition_1_delta=sum(key_transition == 1 & passes_threshold == T), | |
transition_2_probes=sum(key_transition == 2 & passes_threshold == T), | |
transition_3_probes=sum(key_transition == 3 & passes_threshold == T), | |
transition_4_probes=sum(key_transition == 4 & passes_threshold == T), | |
transition_5_probes=sum(key_transition == 5 & passes_threshold == T)) | |
gene_dmp_df_promoter <- all_dmp_df %>% | |
filter(hg38_refgene_group %in% c("TSS1500", "TSS200", "5_UTR")) %>% | |
group_by(hg38_gene) %>% | |
summarise(cluster_0_mean_meth=mean(cluster_0_mean), | |
cluster_1_mean_meth=mean(cluster_1_mean), | |
cluster_2_mean_meth=mean(cluster_2_mean), | |
cluster_3_mean_meth=mean(cluster_3_mean), | |
cluster_4_mean_meth=mean(cluster_4_mean), | |
cluster_5_mean_meth=mean(cluster_5_mean)) | |
grouped_df_test_promoter <- left_join(grouped_df_promoter, gene_dmp_df_promoter, by=c("hg38_gene")) | |
eroding_methylation_genes <- (grouped_df_test_promoter %>% filter(hg38_chr == "chrX" & (transition_1_probes+transition_2_probes+transition_3_probes+transition_4_probes+transition_5_probes != 0) & | |
cluster_0_mean_meth > .3 & | |
cluster_0_mean_meth < .7))$hg38_gene | |
concordance_eroding_df <- concordance_df %>% filter(gene %in% eroding_methylation_genes) | |
melted_group_df <- reshape2::melt(grouped_df %>% select(hg38_gene, | |
transition_1_probes, | |
transition_2_probes, | |
transition_3_probes, | |
transition_4_probes, | |
transition_5_probes), | |
id.vars=c("hg38_gene")) %>% | |
group_by(hg38_gene) %>% | |
arrange(desc(value)) %>% | |
summarise(max_transition=first(variable), | |
max_probes=first(value)) %>% | |
mutate(max_transition=as.numeric(gsub("transition_|_probes", "", max_transition))) | |
grouped_concordance_df <- left_join(concordance_df, grouped_df, by=c("gene"="hg38_gene")) | |
grouped_concordance_df <- left_join(grouped_concordance_df, melted_group_df, by=c("gene"="hg38_gene")) | |
grouped_concordance_df <- left_join(grouped_concordance_df, diff_expr_all_df %>% | |
select(gene, | |
chr, | |
start_position, | |
end_position, | |
transition_1_log2FC, | |
transition_2_log2FC, | |
transition_3_log2FC, | |
transition_4_log2FC, | |
transition_1_padj, | |
transition_2_padj, | |
transition_3_padj, | |
transition_4_padj)) | |
grouped_concordance_df_dmp <- left_join(grouped_concordance_df, all_dmp_df %>% | |
select(hg38_gene, | |
cluster_0_mean_meth=cluster_0_mean, | |
cluster_1_mean_meth=cluster_1_mean, | |
cluster_2_mean_meth=cluster_2_mean, | |
cluster_3_mean_meth=cluster_3_mean, | |
cluster_4_mean_meth=cluster_4_mean, | |
cluster_5_mean_meth=cluster_5_mean, | |
probe_key_transition=key_transition), | |
by=c("gene"="hg38_gene")) %>% | |
filter(max_transition == probe_key_transition) %>% | |
group_by(gene) %>% | |
summarise(cluster_0_mean_meth=mean(cluster_0_mean_meth), | |
cluster_1_mean_meth=mean(cluster_1_mean_meth), | |
cluster_2_mean_meth=mean(cluster_2_mean_meth), | |
cluster_3_mean_meth=mean(cluster_3_mean_meth), | |
cluster_4_mean_meth=mean(cluster_4_mean_meth), | |
cluster_5_mean_meth=mean(cluster_5_mean_meth)) | |
grouped_concordance_df <- left_join(grouped_concordance_df, | |
grouped_concordance_df_dmp, | |
by=c("gene")) | |
grouped_concordance_df$max_meth_delta <- 0 | |
grouped_concordance_df$max_tran_log2FC <- 0 | |
grouped_concordance_df$max_tran_expr_padj <- 0 | |
for (r in 1:nrow(grouped_concordance_df)) { | |
max_tran <- as.numeric(unlist(grouped_concordance_df$max_transition[r])) | |
cluster_a_mean <- as.numeric(unlist(grouped_concordance_df[r, | |
sprintf("cluster_%d_mean_meth", | |
max_tran - 1)])) | |
cluster_b_mean <- as.numeric(unlist(grouped_concordance_df[r, | |
sprintf("cluster_%d_mean_meth", | |
max_tran)])) | |
grouped_concordance_df[r, "max_meth_delta"] <- cluster_b_mean - cluster_a_mean | |
if (max_tran == 5) { | |
max_tran_log2FC <- as.numeric(unlist(grouped_concordance_df[r, | |
sprintf("transition_%d_log2FC", | |
4)])) | |
max_tran_expr_padj <- as.numeric(unlist(grouped_concordance_df[r, | |
sprintf("transition_%d_padj", | |
4)])) | |
}else { | |
max_tran_log2FC <- as.numeric(unlist(grouped_concordance_df[r, | |
sprintf("transition_%d_log2FC", | |
max_tran)])) | |
max_tran_expr_padj <- as.numeric(unlist(grouped_concordance_df[r, | |
sprintf("transition_%d_padj", | |
max_tran)])) | |
} | |
grouped_concordance_df[r, "max_tran_log2FC"] <- max_tran_log2FC | |
grouped_concordance_df[r, "max_tran_expr_padj"] <- max_tran_expr_padj | |
} | |
write_csv(grouped_concordance_df, "grouped_concordance_df.csv") | |
grouped_concordance_df_filtered <- grouped_concordance_df %>% | |
filter(pval <= .1 & corr < 0 & max_probes != 0) | |
# grouped_concordance_df_filtered_dmp <- left_join(grouped_concordance_df_filtered, all_dmp_df %>% | |
# select(hg38_gene, | |
# cluster_0_mean_meth=cluster_0_mean, | |
# cluster_1_mean_meth=cluster_1_mean, | |
# cluster_2_mean_meth=cluster_2_mean, | |
# cluster_3_mean_meth=cluster_3_mean, | |
# cluster_4_mean_meth=cluster_4_mean, | |
# cluster_5_mean_meth=cluster_5_mean, | |
# probe_key_transition=key_transition), | |
# by=c("gene"="hg38_gene")) | |
# grouped_concordance_df_filtered_dmp <- grouped_concordance_df_filtered_dmp %>% | |
# filter(max_transition == probe_key_transition) %>% | |
# group_by(gene) %>% | |
# summarise(cluster_0_mean_meth=mean(cluster_0_mean_meth), | |
# cluster_1_mean_meth=mean(cluster_1_mean_meth), | |
# cluster_2_mean_meth=mean(cluster_2_mean_meth), | |
# cluster_3_mean_meth=mean(cluster_3_mean_meth), | |
# cluster_4_mean_meth=mean(cluster_4_mean_meth), | |
# cluster_5_mean_meth=mean(cluster_5_mean_meth)) | |
# grouped_concordance_df_filtered <- left_join(grouped_concordance_df_filtered, | |
# grouped_concordance_df_filtered_dmp, | |
# by=c("gene")) | |
# grouped_concordance_df_filtered$max_meth_delta <- 0 | |
# grouped_concordance_df_filtered$max_tran_log2FC <- 0 | |
# for (r in 1:nrow(grouped_concordance_df_filtered)) { | |
# max_tran <- as.numeric(unlist(grouped_concordance_df_filtered$max_transition[r])) | |
# cluster_a_mean <- as.numeric(unlist(grouped_concordance_df_filtered[r, | |
# sprintf("cluster_%d_mean_meth", | |
# max_tran - 1)])) | |
# cluster_b_mean <- as.numeric(unlist(grouped_concordance_df_filtered[r, | |
# sprintf("cluster_%d_mean_meth", | |
# max_tran)])) | |
# | |
# grouped_concordance_df_filtered[r, "max_meth_delta"] <- cluster_b_mean - cluster_a_mean | |
# | |
# max_tran_log2FC <- as.numeric(unlist(grouped_concordance_df_filtered[r, | |
# sprintf("transition_%d_log2FC", | |
# max_tran)])) | |
# | |
# grouped_concordance_df_filtered[r, "max_tran_log2FC"] <- max_tran_log2FC | |
# } | |
write_csv(grouped_concordance_df_filtered, "grouped_concordance_df_filtered.csv") | |
``` | |
## Global methylation differences between male and female per female cluster | |
```{r} | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv.gz") | |
male_df_no_outliers <- read_csv("male_df_no_outliers.csv.gz") | |
sample_clusters_df <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
male_df_no_outliers_mean <- male_df_no_outliers %>% | |
column_to_rownames(var="rn") %>% | |
rowMeans() %>% | |
tidy() | |
all_dmp_df_with_male <- inner_join(all_dmp_df, male_df_no_outliers_mean %>% dplyr::rename(male_mean=x), by=c("rowname"="names")) | |
plot_df <- all_dmp_df_with_male %>% filter(!(hg38_chr %in% c("chrX", "chrY"))) %>% #& male_mean >= .5) %>% | |
dplyr::select(rowname, cluster_0_mean, cluster_1_mean, cluster_2_mean, cluster_3_mean, cluster_4_mean, cluster_5_mean, male_mean) %>% | |
reshape2::melt(id.vars=c("rowname")) %>% dplyr::rename(site=rowname, group=variable, mean_methylation=value) | |
library(ggpubr) | |
library(ggsci) | |
plot_df$cluster <- str_extract(plot_df$group, "[0-5]") | |
my_comparisons <- list( c("cluster_0_mean", "male_mean", "Cluster A", "Male"), | |
c("cluster_1_mean", "male_mean", "Cluster B", "Male"), | |
c("cluster_2_mean", "male_mean", "Cluster C", "Male"), | |
c("cluster_3_mean", "male_mean", "Cluster D", "Male"), | |
c("cluster_4_mean", "male_mean", "Cluster E", "Male"), | |
c("cluster_5_mean", "male_mean", "Cluster F", "Male")) | |
ks_df <- tibble(group1=character(), group2=character(),p=character(),y.position=numeric()) | |
k <- 0 | |
for (c in my_comparisons) { | |
test_a <- (plot_df %>% filter(group==c[1]))$mean_methylation | |
test_b <- (plot_df %>% filter(group==c[2]))$mean_methylation | |
ks_df <- ks_df %>% | |
dplyr::add_row(group1=c[1], | |
group2=c[2], | |
p=sprintf("%.2f", as.numeric(ks.test(test_a, test_b)$statistic)), | |
y.position=1.37-k) | |
k <- k + .07 | |
} | |
global_methylation_by_cluster_plt <- ggplot(plot_df %>% filter(), aes(x=group, y=mean_methylation, fill=group=="male_mean", alpha=cluster)) + | |
geom_violin(outlier.shape = NA, size=.3) + | |
# stat_pvalue_manual(data=ks_df, inherit.aes = FALSE, size=.1, bracket.size=.1, tip.length = 0, label.size = 1, vjust=0, label = "K.S. Distance = {p}") + | |
# stat_compare_means(aes(x=group, y=mean_methylation, group=group, fill=group), comparisons = my_comparisons, method="t.test", alpha=1) + | |
geom_text(data=ks_df, inherit.aes=F, mapping=aes(y=.5, x=group1, label=paste("D=",p, sep="")), color=pal_npg()(2)[2], size=1.5, hjust=-.1) + | |
labs(y="Mean Autosomal DNAme", | |
x="", | |
title = "Autosomal DNAme in Female and Male hPSCs")+ | |
theme_bw() + | |
base_plot_theme + | |
scale_fill_npg(name="Sex", labels=c("female", "male")) + | |
scale_alpha_discrete(guide=F) + | |
scale_x_discrete(breaks=c(sprintf("cluster_%d_mean",0:5),"male_mean"), labels=c(sprintf("Cluster %s", c("A", "B", "C", "D", "E", "F")), "Male"))+ | |
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, margin = margin(b=0, unit="mm")), | |
axis.title.x=element_text(size=0, margin=margin(0,0,0,0)), | |
# legend.text = element_text(size=4), | |
# legend.title = element_text(size=5), | |
legend.key.width = unit(2, units="mm"), | |
legend.key.height = unit(2, units="mm"), | |
legend.key.size = unit(.2, units = "mm")) + | |
scale_y_continuous() | |
ggsave("figs/main/fig_4_autosomal_meth.pdf", global_methylation_by_cluster_plt, height = 47, width=58, units="mm") | |
# only plot for samples with expression | |
sample_data_combined_df <- read_csv("sample_data_combined_all_info.csv") | |
female_samples_to_keep <- sample_data_combined_df %>% filter(!is.na(sra_accession) & sample_name %in% sample_clusters_df$sample_name) | |
female_df_no_h9_with_expr <- female_df_no_h9 %>% select(rn, one_of(female_samples_to_keep$sample_name)) | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz") | |
all_dmp_df_expr_only <- female_df_no_h9_with_expr %>% | |
reshape2::melt(id.vars=c("rn"), variable.name="sample_name", value.name="beta") %>% | |
left_join(sample_clusters_df, by="sample_name") %>% | |
left_join(filtered_site_annotation_df %>% select(rowname, hg38_chromosome), by=c("rn"="rowname")) | |
global_methylation_by_cluster_plt_expr <- ggplot(all_dmp_df_expr_only %>% filter(!(hg38_chromosome %in% c("chrX", "chrY"))), aes(x=cluster, y=beta, fill=cluster, group=cluster)) + | |
geom_boxplot(size=.3) | |
female_df_no_h9_all <- female_df_no_h9 %>% select(rn, one_of(sample_clusters_df$sample_name)) | |
all_dmp_df_all <- female_df_no_h9_all %>% | |
reshape2::melt(id.vars=c("rn"), variable.name="sample_name", value.name="beta") %>% | |
left_join(sample_clusters_df, by="sample_name") %>% | |
left_join(filtered_site_annotation_df %>% select(rowname, hg38_chromosome), by=c("rn"="rowname")) | |
global_methylation_by_cluster_plt_all <- ggplot(all_dmp_df_all %>% filter(!(hg38_chromosome %in% c("chrX", "chrY"))), aes(x=cluster, y=beta, fill=cluster, group=cluster)) + | |
geom_boxplot(size=.3) | |
``` | |
## Generating BED files to look for motifs and term enrichment in the CpG clusters | |
```{r} | |
# generate bed files for each transition 500 bp fragments, then run DREME and AME on it | |
dir.create("x_chr_methylation_sites") | |
background_df <- all_dmp_df[(all_dmp_df$chr == 'chrX'),] | |
background_df$start_pos <- format(background_df$pos - 250, scientific = FALSE, trim = T) | |
background_df$end_pos <- format(background_df$pos + 250, scientific = FALSE, trim = T) | |
write_tsv(background_df[c('chr', "start_pos", "end_pos")], | |
sprintf('x_chr_methylation_sites/background_x_chr.bed', i), col_names = F, quote = FALSE) | |
for (i in 1:5) { | |
plotable_df_increasing <- all_dmp_df[(all_dmp_df$key_transition == i) & (all_dmp_df$delta_increasing == TRUE) & (all_dmp_df$passes_threshold == T) & (all_dmp_df$chr == 'chrX'),] | |
plotable_df_decreasing <- all_dmp_df[(all_dmp_df$key_transition == i) & (all_dmp_df$delta_increasing == F) & (all_dmp_df$passes_threshold == T) & (all_dmp_df$chr == 'chrX'),] | |
plotable_df_increasing$start_pos <- format(plotable_df_increasing$pos - 250, scientific = FALSE, trim = T) | |
plotable_df_increasing$end_pos <- format(plotable_df_increasing$pos + 250, scientific = FALSE, trim = T) | |
write_tsv(plotable_df_increasing[c('chr', "start_pos", "end_pos")], | |
sprintf('x_chr_methylation_sites/transition_%d_increasing_x_chr.bed', i), col_names = F, quote = FALSE) | |
write_tsv(plotable_df_increasing[plotable_df_increasing$dist_to_tss < 2000,c('chr', "start_pos", "end_pos")], | |
sprintf('x_chr_methylation_sites/transition_%d_increasing_x_chr_promoters.bed', i), col_names = F, quote = FALSE) | |
plotable_df_decreasing$start_pos <- format(plotable_df_decreasing$pos - 250, scientific = FALSE, trim = T) | |
plotable_df_decreasing$end_pos <- format(plotable_df_decreasing$pos + 250, scientific = FALSE, trim = T) | |
write_tsv(plotable_df_decreasing[c('chr', "start_pos", "end_pos")], | |
sprintf('x_chr_methylation_sites/transition_%d_decreasing_x_chr.bed', i), col_names = F, quote = FALSE) | |
write_tsv(plotable_df_decreasing[plotable_df_decreasing$dist_to_tss < 2000,c('chr', "start_pos", "end_pos")], | |
sprintf('x_chr_methylation_sites/transition_%d_decreasing_x_chr_promoters.bed', i), col_names = F, quote = FALSE) | |
} | |
``` | |
## DUSP9 stuff for Stefan | |
```{r} | |
library(tidyverse) | |
library(broom) | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv.gz") | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz") | |
autosome_probes <- filter(filtered_site_annotation_df, !(hg38_chromosome %in% c("chrX", "chrY")))$rowname | |
autosome_mean <- tidy(colMeans(female_df_no_h9 %>% filter(rn %in% autosome_probes) %>% dplyr::select(-rn))) %>% | |
rename(autosome_mean=x) | |
dusp9_promoter_probes <- filter(filtered_site_annotation_df, (hg38_gene_name %in% c("DUSP9")) & hg38_gene_group %in% c("TSS1500", "TSS200", "5_UTR"))$rowname | |
dusp9_promoter_mean <- tidy(colMeans(female_df_no_h9 %>% filter(rn %in% dusp9_promoter_probes) %>% dplyr::select(-rn))) %>% | |
rename(dusp9_promoter_mean=x) | |
elk1_promoter_probes <- filter(filtered_site_annotation_df, (hg38_gene_name %in% c("ELK1")) & hg38_gene_group %in% c("TSS1500", "TSS200", "5_UTR"))$rowname | |
elk1_promoter_mean <- tidy(colMeans(female_df_no_h9 %>% filter(rn %in% elk1_promoter_probes) %>% dplyr::select(-rn))) %>% | |
rename(elk1_promoter_mean=x) | |
means_df <- full_join(dusp9_promoter_mean, autosome_mean, by="names") | |
means_df <- full_join(means_df, elk1_promoter_mean, by="names") | |
sample_clusters_df <- read_csv("saved_sample_clusters.csv", | |
col_names = c('name','cluster')) %>% arrange(cluster) | |
means_df <- left_join(means_df, sample_clusters_df, by=c("names"="name")) | |
cluster_f_data <- means_df %>% filter(cluster == 5) | |
cluster_f_cor <- cor.test(cluster_f_data$dusp9_promoter_mean, | |
cluster_f_data$autosome_mean, | |
method="spearman") | |
cluster_nonf_data <- means_df %>% filter(cluster < 5) | |
cluster_nonf_cor <- cor.test(cluster_nonf_data$dusp9_promoter_mean, | |
cluster_nonf_data$autosome_mean, | |
method="spearman") | |
override.shape <- c(21,21,21,21,22,24) | |
dusp9_autosome_plot <- ggplot(means_df, aes(x=dusp9_promoter_mean, y=autosome_mean)) + | |
geom_smooth(data=means_df %>% filter(cluster == 5), mapping=aes(color="F"), method="lm", size=.3) + | |
geom_smooth(data=means_df %>% filter(cluster < 5), mapping=aes(color="A-E"), method="lm", size=.3) + | |
annotate("text", x=.2,y=.45,label=sprintf("Spearman corr: %.2f\np=%.2e", cluster_f_cor$estimate, cluster_f_cor$p.value), color=pal_aaas()(2)[2], size=1.5) + | |
annotate("text", x=.4,y=.57,label=sprintf("Spearman corr: %.2f\np=%.2e", cluster_nonf_cor$estimate, cluster_nonf_cor$p.value), color=pal_aaas()(2)[1], size=1.5) + | |
geom_point(aes(alpha=as.factor(cluster), shape=as.factor(cluster), stroke=cluster >= 4), color="black", size=2, fill="#E64B35FF") + | |
theme_bw() + | |
base_plot_theme + | |
scale_fill_npg(guide=F) + | |
scale_color_aaas() + | |
scale_shape_manual(guide = FALSE, values=override.shape) + | |
scale_alpha_discrete(limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(alpha = guide_legend(override.aes = list(fill, shape=override.shape), nrow=1)) + | |
labs(title="DUSP9 promoter DNAme vs autosomal DNAme", | |
fill="Cluster", | |
alpha="Cluster", | |
x="DUSP9 Mean Promoter DNAme", | |
y="Mean Autosomal DNAme", | |
color="Cluster Regression") | |
ggsave("figs/supplementary/dusp9_autosome.pdf", dusp9_autosome_plot, width = 174/2, height = 174/2, units="mm") | |
cor.test(means_df$dusp9_promoter_mean, means_df$autosome_mean) | |
cor.test(filter(means_df, cluster >= 4)$dusp9_promoter_mean, filter(means_df, cluster >= 4)$autosome_mean) | |
#Elk1 autosome plot | |
elk1_autosome_plot <- ggplot(means_df, aes(x=elk1_promoter_mean, y=autosome_mean)) + | |
geom_smooth(data=means_df %>% filter(cluster == 5), mapping=aes(color="F"), method="lm", size=.3) + | |
geom_smooth(data=means_df %>% filter(cluster < 5), mapping=aes(color="A-E"), method="lm", size=.3) + | |
# annotate("text", x=.2,y=.45,label=sprintf("Spearman corr: %.2f\np=%.2e", cluster_f_cor$estimate, cluster_f_cor$p.value), color=pal_aaas()(2)[2], size=1.5) + | |
# annotate("text", x=.4,y=.57,label=sprintf("Spearman corr: %.2f\np=%.2e", cluster_nonf_cor$estimate, cluster_nonf_cor$p.value), color=pal_aaas()(2)[1], size=1.5) + | |
geom_point(aes(alpha=as.factor(cluster), shape=as.factor(cluster), stroke=cluster >= 4), color="black", size=2, fill="#E64B35FF") + | |
theme_bw() + | |
base_plot_theme + | |
scale_fill_npg(guide=F) + | |
scale_color_aaas() + | |
scale_shape_manual(guide = FALSE, values=override.shape) + | |
scale_alpha_discrete(limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(alpha = guide_legend(override.aes = list(fill, shape=override.shape), nrow=1)) + | |
labs(title="DUSP9 promoter methylation vs autosomal methylation", | |
fill="Cluster", | |
alpha="Cluster", | |
x="ELK1 Mean Promoter Methylation", | |
y="Mean Autosomal Methylation", | |
color="Cluster Regression") | |
# DUSP9 expression vs autosomal mean methylation | |
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv") %>% | |
dplyr::select(-ensembl_id, -ensembl_id_no_version, -chromosome_name, -start_position, -description, -end_position, -strand) | |
dusp9_expression <- batch_corrected_vsd_df %>% | |
filter(hgnc_symbol=="DUSP9") %>% | |
reshape2::melt(id.vars="hgnc_symbol", variable.name="sample_name", value.name="dusp9_expr") %>% | |
left_join(means_df, by=c("sample_name"="names")) | |
cluster_f_data_expr <- dusp9_expression %>% filter(cluster == 5) | |
cluster_f_cor_expr <- cor.test(cluster_f_data_expr$dusp9_expr, | |
cluster_f_data_expr$autosome_mean) | |
cluster_nonf_data_expr <- dusp9_expression %>% filter(cluster < 5) | |
cluster_nonf_cor_expr <- cor.test(cluster_nonf_data_expr$dusp9_expr, | |
cluster_nonf_data_expr$autosome_mean) | |
dusp9_autosome_expr_plot <- ggplot(dusp9_expression, aes(x=dusp9_expr, y=autosome_mean)) + | |
geom_smooth(data=dusp9_expression %>% filter(cluster == 5), mapping=aes(color="F"), method="lm", size=.3, se=F) + | |
geom_smooth(data=dusp9_expression %>% filter(cluster < 5), mapping=aes(color="A-E"), method="lm", size=.3, se=F) + | |
annotate("text", x=6.4,y=.5,label=sprintf("Pearson corr: %.2f\np=%.2e", cluster_f_cor_expr$estimate, cluster_f_cor_expr$p.value), color=pal_aaas()(2)[2], size=1.5) + | |
annotate("text", x=4.9,y=.535,label=sprintf("Pearson corr: %.2f\np=%.2e", cluster_nonf_cor_expr$estimate, cluster_nonf_cor_expr$p.value), color=pal_aaas()(2)[1], size=1.5) + | |
geom_point(aes(alpha=as.factor(cluster), shape=as.factor(cluster), stroke=cluster >= 4), color="black", size=2, fill="#E64B35FF") + | |
theme_bw() + | |
base_plot_theme + | |
scale_fill_npg(guide=F) + | |
scale_color_aaas() + | |
scale_shape_manual(guide = FALSE, values=override.shape, limits=c(0,1,2,3,4,5)) + | |
scale_alpha_discrete(limits=c(0,1,2,3,4,5), labels=c("A","B","C","D","E","F")) + | |
guides(alpha = guide_legend(override.aes = list(fill, shape=override.shape), nrow=1)) + | |
labs(title="DUSP9 promoter methylation vs autosomal methylation", | |
fill="Cluster", | |
alpha="Cluster", | |
x="DUSP9 Mean Promoter Methylation", | |
y="Mean Autosomal Methylation", | |
color="Cluster Regression") | |
ggsave("figs/supplementary/dusp9_autosome_expr.svg", dusp9_autosome_expr_plot, width = 174/2, height = 174/2, units="mm") | |
``` | |
## Heatmap of Kilens genes in cluster F | |
```{r} | |
table_3_expression <- read_csv("table_3_expression_all_v_all.csv.gz") | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
sample_data_combined_all_info <- read_csv("sample_data_combined_all_info.csv") | |
kilens_genes <- (table_3_expression %>% filter(!is.na(kielens_log2FC) & `Transition 4/5 p-adj` <= .1 & Chromosome != "X"))$`Gene Symbol` | |
kilens_probes <- (filtered_site_annotation_df %>% filter(hg38_gene_name %in% kilens_genes & !(hg38_chromosome %in% c("chrX", "chrY"))))$rowname | |
kilens_heatmap_df <- female_df_no_h9 %>% | |
filter(rn %in% kilens_probes) %>% | |
dplyr::select(rn, !!filter(cluster_tib, cluster %in% c(0,5))$sample_name) %>% | |
reshape2::melt(id.vars="rn", variable.name="sample_name") %>% | |
left_join(filtered_site_annotation_df %>% | |
dplyr::select(rowname, hg38_gene_name, hg38_pos), | |
by=c("rn"="rowname")) %>% | |
left_join(cluster_tib, by="sample_name") %>% | |
left_join(sample_data_combined_all_info, by="sample_name") | |
data <- kilens_heatmap_df %>% | |
reshape2::dcast(rn ~ sample_name, value.var="value") %>% | |
column_to_rownames(var="rn") | |
sample_ord <- hclust( dist(t(data), method = "euclidean"), method = "single" )$order | |
ordered_sample_names <- rev(colnames(data)[sample_ord]) | |
site_ord <- hclust( dist((data), method = "euclidean"), method = "single" )$order | |
ordered_sites <- (rownames(data)[site_ord]) | |
ordered_labels_df <- kilens_heatmap_df %>% | |
group_by(sample_name, group_accession, sra_accession, cluster) %>% | |
summarise() %>% | |
arrange(factor(sample_name, levels=ordered_sample_names), sample_name) %>% | |
mutate(sample_label=paste(group_accession, sra_accession, sep="-")) | |
library(facetscales) | |
scales_x <- list( | |
`0` = scale_x_discrete(limits=filter(ordered_labels_df, cluster==0)$sample_name, labels = filter(ordered_labels_df, cluster==0)$sample_label), | |
`5` = scale_x_discrete(limits=filter(ordered_labels_df, cluster==5)$sample_name, labels = filter(ordered_labels_df, cluster==5)$sample_label) | |
) | |
# library(caret) | |
# kilens_heatmap_df = preProcess(train, method = "range") | |
kilens_heatmap <- ggplot(kilens_heatmap_df, aes(x=factor(sample_name, levels=ordered_sample_names), | |
y=factor(rn, levels=ordered_sites), fill=value)) + | |
geom_tile() + | |
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") + | |
# facet_grid(. ~ cluster, scales = "free", space="free") + | |
facet_grid_sc(cols = vars(cluster), scales = list(x = scales_x), space="free") + | |
scale_fill_gradient2(low="#0bb1a4", high = "#E966FE", | |
midpoint = .5, | |
limits=c(0,1), | |
breaks=c(0,.5,1), | |
labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) + | |
# scale_x_discrete(limits=ordered_labels_df$sample_name, labels = ordered_labels_df$sample_label) + | |
labs(x="Female Samples", | |
y="", | |
fill="DNA Methylation") + | |
theme(axis.text.y=element_blank(), | |
axis.ticks.y=element_blank(), | |
legend.position="top", | |
legend.justification = "center", | |
legend.title = element_text(size = 6, vjust = .75), | |
legend.text = element_text(size = 6), | |
# legend.position="top", | |
# legend.justification = "right", | |
# legend.title = element_text(size = 6, vjust = 1), | |
# legend.text = element_text(size = 6), | |
# legend.key.height = unit(2, "mm"), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0.2, "mm"), | |
axis.text.x = element_text(size = 9*0.8, angle = 270, hjust = 0, colour = "grey50"), | |
# strip.background.x = element_rect(color="black"), | |
strip.text.y = element_text(size=7), | |
strip.text.x = element_text(size=7), | |
axis.title = element_text(size=7)) | |
kilens_heatmap_df_gene <- female_df_no_h9 %>% | |
filter(rn %in% kilens_probes) %>% | |
dplyr::select(rn, !!filter(cluster_tib, cluster %in% c(0,3,5))$sample_name) %>% | |
reshape2::melt(id.vars="rn", variable.name="sample_name") %>% | |
left_join(filtered_site_annotation_df %>% | |
dplyr::select(rowname, hg38_gene_name, hg38_pos), | |
by=c("rn"="rowname")) %>% | |
left_join(cluster_tib, by="sample_name") %>% | |
left_join(sample_data_combined_all_info, by="sample_name") %>% | |
group_by(hg38_gene_name, sample_name, cluster, group_accession, sra_accession) %>% | |
summarise(value=mean(value)) | |
gene_meth_df_kilens <- read_csv("all_dmp_df.csv.gz") %>% | |
group_by(hg38_gene) %>% | |
summarise(cluster_0_mean_meth=mean(cluster_0_mean)) %>% | |
filter(hg38_gene %in% kilens_genes) %>% | |
dplyr::select(hg38_gene, cluster_0_mean_meth) | |
kilens_heatmap_df_gene_cluster_3_subtracted <- kilens_heatmap_df_gene %>% | |
left_join(gene_meth_df_kilens, by=c("hg38_gene_name"="hg38_gene")) %>% | |
mutate(value=value-cluster_0_mean_meth) | |
#sign test - how many of these genes are decreasing in methylation | |
sign_test <- grouped_concordance_df %>% | |
filter(gene %in% kilens_genes) %>% | |
mutate(tran_4_5_delta_meth=cluster_5_mean_meth-cluster_3_mean_meth) %>% | |
count(tran_4_5_delta_meth > 0) | |
# concordance_df <- read_csv("concordance_df.csv") | |
data_gene <- kilens_heatmap_df_gene_cluster_3_subtracted %>% | |
reshape2::dcast(hg38_gene_name ~ sample_name, value.var="value") %>% | |
column_to_rownames(var="hg38_gene_name") | |
sample_ord_gene <- hclust( dist(t(data_gene), method = "euclidean"), method = "single" )$order | |
ordered_sample_names_gene <- rev(colnames(data_gene)[sample_ord_gene]) | |
ordered_labels_df_gene <- kilens_heatmap_df_gene %>% | |
group_by(sample_name, group_accession, sra_accession, cluster) %>% | |
summarise() %>% | |
arrange(factor(sample_name, levels=ordered_sample_names_gene), sample_name) %>% | |
mutate(sample_label=paste(group_accession, sra_accession, sep="-")) | |
scales_x_gene <- list( | |
`0` = scale_x_discrete(limits=filter(ordered_labels_df_gene, cluster==0)$sample_name, labels = filter(ordered_labels_df_gene, cluster==0)$sample_label), | |
`3` = scale_x_discrete(limits=filter(ordered_labels_df_gene, cluster==3)$sample_name, labels = filter(ordered_labels_df_gene, cluster==3)$sample_label), | |
`5` = scale_x_discrete(limits=filter(ordered_labels_df_gene, cluster==5)$sample_name, labels = filter(ordered_labels_df_gene, cluster==5)$sample_label) | |
) | |
gene_ord <- hclust( dist((data_gene), method = "euclidean"), method = "single" )$order | |
ordered_genes <- (rownames(data_gene)[gene_ord]) | |
# kilens_heatmap_df_scaled <- kilens_heatmap_df_gene %>% | |
# group_by(hg38_gene_name) %>% | |
# mutate(scaled_value=scales::rescale(value, to=c(0,1))) %>% | |
# ungroup() | |
# kilens_heatmap_df_scaled <- kilens_heatmap_df_scaled %>% | |
# mutate(row_type="Genes") | |
kilens_heatmap_df_samples <- tibble(sample_name=character(), cluster=character(), value=character(), group_accession=character(), hg38_gene_name=character(), has_expression=logical()) | |
for (sample_name in unique(kilens_heatmap_df_gene_cluster_3_subtracted$sample_name)) { | |
cluster <- as.numeric(unlist(cluster_tib[cluster_tib$sample_name == sample_name, "cluster"])) | |
group <- as.character(unlist(unique(kilens_heatmap_df_gene_cluster_3_subtracted[kilens_heatmap_df_gene_cluster_3_subtracted$sample_name == sample_name, "group_accession"]))) | |
sra <- as.character(unlist(unique(kilens_heatmap_df_gene_cluster_3_subtracted[kilens_heatmap_df_gene_cluster_3_subtracted$sample_name == sample_name, "sra_accession"]))) | |
kilens_heatmap_df_samples <- kilens_heatmap_df_samples %>% | |
add_row(sample_name=sample_name, | |
cluster=cluster, | |
value=group, | |
group_accession=group, | |
# row_type="Sample Group", | |
has_expression=!is.na(sra), | |
hg38_gene_name="Sample Group") | |
} | |
kilens_heatmap_genes <- ggplot(kilens_heatmap_df_gene_cluster_3_subtracted, | |
aes(x=factor(sample_name, levels=ordered_sample_names_gene), | |
y=factor(hg38_gene_name, levels=ordered_genes), fill=value)) + | |
geom_tile() + | |
# geom_point(data=kilens_heatmap_df_scaled %>% filter(row_type == "Sample Group"), aes(color=group_accession)) + | |
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") + | |
# facet_grid(. ~ cluster, scales = "free", space="free") + | |
facet_grid_sc(cols = vars(cluster), scales = list(x = scales_x_gene), space="free") + | |
# scale_fill_material(palette = "deep-purple")+#, | |
# limits=c(0,1), | |
# breaks=c(0,.5,1))+#, | |
# labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) + | |
scale_fill_gradient2(name="value", | |
low=pal_aaas()(1), high = pal_aaas()(2)[2], | |
midpoint = 0, | |
limits=c(-.25,.25), | |
breaks=c(-.25,0,.25), oob=scales::squish)+#, | |
# labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) + | |
scale_color_npg(name="group_accession") + | |
labs(x="Female Samples", | |
y="", | |
fill="Cluster A Subtracted DNA Methylation") + | |
theme(axis.text.x = element_text(size = 9*0.8, angle = 270, hjust = 0), | |
axis.text.y=element_text(size=9*.5), | |
axis.ticks = element_line(size=.3), | |
legend.position="top", | |
legend.justification = "center", | |
legend.title = element_text(size = 6, vjust = .75), | |
legend.text = element_text(size = 6), | |
# legend.position="top", | |
# legend.justification = "right", | |
# legend.title = element_text(size = 6, vjust = 1), | |
# legend.text = element_text(size = 6), | |
# legend.key.height = unit(2, "mm"), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0.2, "mm"), | |
# strip.background.x = element_rect(color="black"), | |
strip.text.y = element_text(size=7), | |
strip.text.x = element_text(size=7), | |
axis.title = element_text(size=7)) | |
ggsave("figs/supplementary/kilens_heatmap_gene_dual_color_cluster_A_subtracted.svg", height = 250, width = 174/2, units = "mm") | |
ggsave("figs/supplementary/kilens_heatmap_gene_dual_color_cluster_A_subtracted_cluster_D.svg", kilens_heatmap_genes, height = 250, width = 174/2, units = "mm") | |
kilens_heatmap_samples <- ggplot(kilens_heatmap_df_samples, | |
aes(x=factor(sample_name, levels=ordered_sample_names_gene), | |
y=factor(hg38_gene_name, levels=ordered_genes), fill=value)) + | |
geom_tile() + | |
geom_point(aes(color=has_expression), size=.4) + | |
# geom_point(data=kilens_heatmap_df_scaled %>% filter(row_type == "Sample Group"), aes(color=group_accession)) + | |
# facet_grid(cluster ~ sample_cluster, scales = "free", space="free", shrink=FALSE, switch = "y") + | |
# facet_grid(. ~ cluster, scales = "free", space="free") + | |
facet_grid_sc(cols = vars(cluster), scales = list(x = scales_x_gene), space="free") + | |
scale_color_manual(values=c("#000000", NA), limits=c(T,F)) + | |
# scale_fill_material(palette = "deep-purple", | |
# limits=c(0,1), | |
# breaks=c(0,.5,1))+#, | |
# labels=c("unmethylated\nβ=0", "50% methylation\nβ=0.5", "methylated\nβ=1")) + | |
scale_fill_npg(name="group_accession") + | |
labs(x="Female Samples", | |
y="", | |
fill="Sample Group") + | |
theme(axis.text.x = element_text(size = 9*0.8, angle = 270, hjust = 0), | |
axis.text.y=element_text(size=9*.5), | |
axis.ticks.y = element_line(size=.3), | |
legend.position="top", | |
legend.justification = "center", | |
legend.title = element_text(size = 6, vjust = .75), | |
legend.text = element_text(size = 6), | |
# legend.position="top", | |
# legend.justification = "right", | |
# legend.title = element_text(size = 6, vjust = 1), | |
# legend.text = element_text(size = 6), | |
# legend.key.height = unit(2, "mm"), | |
plot.margin = margin(r=0), | |
panel.spacing.y = unit(2, "mm"), | |
panel.spacing.x = unit(0.2, "mm"), | |
# strip.background.x = element_rect(color="black"), | |
strip.text.y = element_text(size=7), | |
strip.text.x = element_text(size=7), | |
axis.title = element_text(size=7)) | |
ggsave("figs/supplementary/kilens_heatmap_gene_labels.svg", height = 70, width = (174/2)- 6.936, units = "mm") | |
``` | |
## Dotplot of kilens expression vs. our methylation | |
```{r} | |
library(tidyverse) | |
library(ggplot2) | |
table_3_expression <- read_csv("table_3_expression_all_v_all.csv") | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
all_dmp_df <- read_csv("all_dmp_df.csv") | |
grouped_concordance_df <- read_csv("grouped_concordance_df.csv.gz") | |
kilens_genes <- (table_3_expression %>% | |
filter(!is.na(kielens_log2FC) & table_3_expression$`Transition 4/5 p-adj` <= .1))$`Gene Symbol` | |
# kilens_probes <- (filtered_site_annotation_df %>% filter(hg38_gene_name %in% kilens_genes & !(hg38_chromosome %in% c("chrX", "chrY"))))$rowname | |
kilens_grouped_concordance_df <- grouped_concordance_df %>% | |
filter(gene %in% kilens_genes & hg38_chr != "chrX") %>% | |
left_join(table_3_expression %>% select(`Gene Symbol`, kielens_log2FC, kielens_T2iLGö_mean, `kielens_KSR+FGF2_mean`, `Transition 4/5 log2FoldChange`, `Transition 4/5 p-adj`), | |
by=c("gene"="Gene Symbol")) | |
corr_log2fc_meth <- cor.test(kilens_grouped_concordance_df$`Transition 4/5 log2FoldChange`, | |
kilens_grouped_concordance_df$cluster_5_mean_meth - kilens_grouped_concordance_df$cluster_3_mean_meth, | |
method = "spearman") | |
tran_5_plot <- ggplot(kilens_grouped_concordance_df, aes(x=`Transition 4/5 log2FoldChange`, y=cluster_5_mean_meth-cluster_3_mean_meth)) + | |
geom_point(aes(color=`Transition 4/5 p-adj`), size=.7) + | |
scale_x_continuous(trans = scales::pseudo_log_trans()) + | |
scale_y_continuous(trans = scales::pseudo_log_trans(), limits=c(-.2,.01)) + | |
geom_smooth(method='lm', size=.3) + | |
scale_color_viridis_c() + | |
annotate("text", label=sprintf("Spearman Corr: %.2f\np=%.2e",corr_log2fc_meth$estimate, corr_log2fc_meth$p.value), x=4, y=-.02, size=2) + | |
# geom_text(aes(label=sprintf("Spearman Corr: %.2f\np=%.2e",corr_log2fc_meth$estimate, corr_log2fc_meth$p.value), x=4, y=-.02), size=2) + | |
theme_bw() + | |
base_plot_theme + | |
labs(title="Transition 4/5 DNAme vs Expression for Kilens, et al. 2018 Genes", y="(Cluster F DNAme) - (Cluster D DNAme)", x="Transition 4/5 log2(F.C.)") + | |
theme(legend.key.width = unit(4, units="mm")) | |
ggsave("figs/supplementary/kielens_meth_expr_correlation.pdf",tran_5_plot,width = 174/2, height=87, units = "mm") | |
kilens_melted_df <- kilens_grouped_concordance_df %>% | |
reshape2::melt() | |
sample_data_combined_all_info <- read_csv("sample_data_combined_all_info.csv") | |
expression_samples <- filter(sample_data_combined_all_info, !is.na(sra_accession))$sample_name | |
samples_without_expression <- setdiff(cluster_tib$sample_name, expression_samples) | |
kilens_grouped_promoter_df <- female_df_no_h9 %>% | |
select(rn, !!samples_without_expression) %>% | |
left_join(filtered_site_annotation_df, by=c("rn"="rowname")) %>% | |
#filter(hg38_gene_group %in% c("TSS1500", "TSS200", "5_UTR")) %>% | |
reshape2::melt(id.vars=c("hg38_gene_name", "hg38_chromosome"), measure.vars=samples_without_expression, variable.name="sample_name", value.name="methylation") %>% | |
group_by(hg38_gene_name, hg38_chromosome, sample_name) %>% | |
summarise(mean_methylation=mean(methylation)) %>% | |
left_join(cluster_tib, by="sample_name") %>% | |
group_by(hg38_gene_name, hg38_chromosome, cluster) %>% | |
summarise(mean_methylation=mean(mean_methylation)) %>% | |
reshape2::dcast(hg38_gene_name + hg38_chromosome ~ cluster) %>% | |
rename(cluster_0_promoter_mean=`0`, | |
cluster_1_promoter_mean=`1`, | |
cluster_2_promoter_mean=`2`, | |
cluster_3_promoter_mean=`3`, | |
cluster_4_promoter_mean=`4`, | |
cluster_5_promoter_mean=`5`) %>% | |
filter(hg38_gene_name %in% kilens_genes & hg38_chromosome != "chrX") %>% | |
left_join(table_3_expression %>% select(`Gene Symbol`, kielens_log2FC, kielens_T2iLGö_mean, `kielens_KSR+FGF2_mean`, `Transition 4/5 log2FoldChange`, `Transition 4/5 p-adj`), | |
by=c("hg38_gene_name"="Gene Symbol")) | |
corr_log2fc_promoter_meth <- cor.test(kilens_grouped_promoter_df$`Transition 4/5 log2FoldChange`, | |
kilens_grouped_promoter_df$cluster_5_promoter_mean - kilens_grouped_promoter_df$cluster_3_promoter_mean, | |
method = "spearman") | |
tran_5_promoter_plot <- ggplot(kilens_grouped_promoter_df, aes(x=`Transition 4/5 log2FoldChange`, y=cluster_5_promoter_mean-cluster_3_promoter_mean)) + | |
geom_point(aes(color=`Transition 4/5 p-adj`), size=.7) + | |
scale_x_continuous(trans = scales::pseudo_log_trans()) + | |
scale_y_continuous(trans = scales::pseudo_log_trans(), limits=c(-.2,.05)) + | |
geom_smooth(method='lm', size=.3) + | |
scale_color_viridis_c() + | |
geom_text(aes(label=sprintf("Spearman Corr: %.2f",corr_log2fc_promoter_meth$estimate), x=-4, y=-.15), size=2) + | |
theme_bw() + | |
base_plot_theme + | |
labs(title="Transition 4/5 Methylation vs Expression for Kilens Genes - No Expression Samples", | |
y="cluster_5_mean-cluster_3_mean") | |
ggsave("figs/supplementary/kielens_no_expression_samples_meth_expr_correlation.svg",tran_5_promoter_plot,width = 87, height=87, units = "mm") | |
``` | |
### Use BEDTOOLS to turn bed files into fasta files, and then run dreme and ame | |
```{bash} | |
for filename in x_chr_methylation_sites/*.bed | |
do | |
name=$(echo $filename | rev | cut -f 2- -d '.' | rev) | |
bedtools getfasta -fi motif_search_tools/ucsc.hg19.fa -bed $filename > ${name}.fa | |
mkdir -p ${name}/{dreme,ame} | |
dreme -verbosity 1 -oc ${name}/dreme -dna -p ${name}.fa -n background_x_chr.fa -t 18000 -e 0.05 > ${name}/dreme/out.txt 2>&1 & PIDIOS=$! | |
ame --verbose 3 --oc ${name}/ame --scoring avg --method fisher --hit-lo-fraction 0.25 --evalue-report-threshold 10.0 --control background_x_chr.fa ${name}.fa /mnt/d/pinterlab/meme-suite/meme-5.0.4/motif_databases/EUKARYOTE/jolma2013.meme /mnt/d/pinterlab/meme-suite/meme-5.0.4/motif_databases/JASPAR/JASPAR2018_CORE_vertebrates_non-redundant.meme /mnt/d/pinterlab/meme-suite/meme-5.0.4/motif_databases/MOUSE/uniprobe_mouse.meme > ${name}/ame/out.txt 2>&1 & PIDMIX=$! | |
wait $PIDIOS | |
wait $PIDMIX | |
done | |
``` | |
### Figure for study breakdown | |
```{r} | |
table_1_df <- read_csv("table_1_sample_data_info.csv") %>% | |
filter(!is.na(`Sample Cluster Assignment`) & Source != "This paper") %>% | |
mutate(cluster=case_when(`Sample Cluster Assignment` == 0 ~ "A", | |
`Sample Cluster Assignment` == 1 ~ "B", | |
`Sample Cluster Assignment` == 2 ~ "C", | |
`Sample Cluster Assignment` == 3 ~ "D", | |
`Sample Cluster Assignment` == 4 ~ "E", | |
`Sample Cluster Assignment` == 5 ~ "F")) | |
studies_in_clusters <- ggplot(table_1_df) + | |
geom_bar(aes(x=cluster, fill=sprintf("%s (%s)", str_replace(Source, " et al,", ","), `GEO Group Accession`))) + | |
labs(title="Studies in Clusters", | |
x="Sample Cluster", | |
y="Number of Samples", | |
fill="Source") + | |
theme_bw() + | |
scale_fill_manual(limits=c("Banovich, 2018 (GSE110544)", "Nazor, 2012 (GSE31848)", "Butcher, 2016 (GSE59091)", "Nishizawa, 2016 (GSE60924)", "Zdravkovic, 2015 (GSE72923)", "Salomonis, 2016 (GSE85828)", "Takasawa, 2018 (GSE73938)")[c(1,3,2,4,6,7,5)], | |
values = c(pal_futurama()(4), pal_startrek()(3)[3], pal_rickandmorty()(5)[c(2,4)])[c(1,3,2,4,6,7,5)])+ | |
# scale_fill_npg() + | |
base_plot_theme + | |
theme(legend.position="right", | |
legend.justification = "left") | |
ggsave("figs/supplementary/supp_1_studies_in_clusters.pdf", studies_in_clusters, width=80, height=30, units="mm") | |
``` | |