Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
contiguous_x_erosion/atac_seq_analysis.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
2175 lines (1877 sloc)
111 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(stringr) | |
atac_counts_df <- read_tsv("banovich-atacseq-data/GSE108458_iPSC-ATAC-counts.txt") | |
sample_data_df <- read_csv("sample_data_combined_all_info.csv") | |
cluster_info_df <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
banovich_female_sample_df <- sample_data_df %>% | |
left_join(cluster_info_df, by="sample_name") %>% | |
filter(cluster != "" & !is.na(cluster) & group_accession == "GSE110544") | |
banovich_cell_lines <- intersect(banovich_female_sample_df$cell_line, | |
colnames(atac_counts_df)) | |
female_atac_counts_df <- atac_counts_df %>% | |
select(window, !!banovich_cell_lines) %>% | |
mutate(chr=stringr::str_split(window, "-", simplify = T)[,1], | |
start=stringr::str_split(window, "-", simplify = T)[,2], | |
end=stringr::str_split(window, "-", simplify = T)[,3]) %>% | |
select(-window) | |
library(DESeq2) | |
banovich_sample_sheet <- banovich_female_sample_df %>% | |
dplyr::select(SampleID=cell_line, | |
Condition=cluster) | |
dba_atac <- dba(sampleSheet = banovich_sample_sheet, | |
) | |
# get the atac accessions for alignments | |
atac_accessions <- read_csv("banovich-atacseq-data/SraRunTable.txt") | |
sras_to_keep <- atac_accessions %>% | |
filter(derived_from_cell_line %in% banovich_cell_lines) | |
write_lines(sras_to_keep$Run, "banovich-atacseq-data/atac_sras_to_download.txt") | |
atac_sample_info <- sras_to_keep %>% | |
select(derived_from_cell_line, atac_accession=Run) %>% | |
left_join(banovich_female_sample_df, by=c("derived_from_cell_line"="cell_line")) | |
library(stringr) | |
atac_sample_info$hasXPeaks <- F | |
for (f in Sys.glob("esatac_results/results_txt/*.txt")) { | |
f_df <- read_tsv(f) | |
srr_name <- stringr::str_replace(basename(f), "_1.RPeakAnno.txt", "") | |
chromosome_counts <- f_df %>% | |
dplyr::count(chromatin) %>% | |
reshape2::dcast(. ~ chromatin) %>% | |
select(-.) | |
chromosome_counts$atac_accession <- srr_name | |
if ("chrX" %in% f_df$chromatin) { | |
atac_sample_info[atac_sample_info$atac_accession == srr_name,"hasXPeaks"] <- T | |
} | |
for (chr in colnames(chromosome_counts %>% select(-atac_accession))) { | |
atac_sample_info[atac_sample_info$atac_accession == srr_name, chr] <- chromosome_counts[1, chr] | |
} | |
} | |
write_csv(atac_sample_info %>% | |
dplyr::select(sample_name, | |
atac_accession, | |
group_accession, | |
sex=gender, | |
cluster, | |
accession, | |
cell_type, | |
cell_line=derived_from_cell_line, | |
#hasXPeaks, | |
#!!colnames(chromosome_counts %>% dplyr::select(-atac_accession, -.)) | |
), "atac_sample_info.csv.gz") | |
library(caret) | |
## footprint analysis | |
rds_test <- readRDS("banovich-atacseq-data/footprints/SRR6412411_footprint.rds") | |
all_tfs <- names(rds_test) | |
tf_ks_df <- tibble(tf=character(), | |
srr1=character(), | |
srr2=character(), | |
ks_distance=numeric(), | |
ks_pval=numeric()) | |
footprint_files <- Sys.glob("banovich-atacseq-data/footprints/*.rds") | |
for (i in 1:(length(footprint_files)-1)) { | |
print(i) | |
fp1 <- readRDS(footprint_files[i]) | |
s1 <- str_split(basename(footprint_files[i]), "_", simplify=T)[1] | |
for (j in (i+1):length(footprint_files)) { | |
fp2 <- readRDS(footprint_files[j]) | |
s2 <- str_split(basename(footprint_files[j]), "_", simplify=T)[1] | |
for (tf in all_tfs) { | |
fp_df <- data.frame( | |
d1=as.numeric(unlist(fp1[tf])), | |
d2=as.numeric(unlist(fp2[tf])) | |
) | |
norm_ds <- predict(preProcess(fp_df, method = "range"), fp_df) | |
ks_res <- ks.test(norm_ds$d1, norm_ds$d2) | |
ks_dist <- as.numeric(unlist(ks_res$statistic)) | |
ks_p <- as.numeric(unlist(ks_res$p.value)) | |
tf_ks_df <- add_row(tf_ks_df, | |
tf=tf, | |
srr1=s1, | |
srr2=s2, | |
ks_distance=ks_dist, | |
ks_pval=ks_p) | |
} | |
} | |
} | |
atac_sample_info <- read_csv("atac_sample_info.csv.gz") | |
tf_ks_df <- left_join(tf_ks_df, | |
atac_sample_info %>% | |
select(atac_accession, s1_cluster=cluster), | |
by=c("srr1"="atac_accession")) %>% | |
left_join(atac_sample_info %>% | |
select(atac_accession, s2_cluster=cluster), | |
by=c("srr2"="atac_accession")) | |
write_csv(tf_ks_df, "tf_ks_df.csv.gz") | |
summarized_tf_ks_df <- tf_ks_df %>% | |
group_by(tf, cluster_1 = pmin(s1_cluster, s2_cluster), cluster_2 = pmax(s1_cluster, s2_cluster)) %>% | |
summarise(mean_distance = mean(ks_distance), | |
mean_pval=mean(ks_pval)) | |
ggplot(summarized_tf_ks_df %>% filter(cluster_1 == 0 & mean_pval <= .1)) + | |
geom_density(aes(mean_distance, color=interaction(cluster_1,cluster_2))) | |
tf_ks_df_diff <- summarized_tf_ks_df %>% | |
filter(cluster_1 == 0 & cluster_2 %in% c(0,5)) %>% | |
arrange(cluster_1, cluster_2) %>% | |
group_by(tf) %>% | |
summarise(ks_dist_diff=last(mean_distance)-first(mean_distance), | |
ks_pval_diff=last(mean_pval)-first(mean_pval)) | |
changing_TFs <- (tf_ks_df_diff %>% filter(ks_dist_diff > 0))$tf | |
ggplot(summarized_tf_ks_df %>% filter(cluster_1 == 0 & tf %in% changing_TFs & mean_pval <= .1)) + | |
geom_density(aes(mean_distance, color=interaction(cluster_1,cluster_2))) | |
# creating a clustered footprint map | |
tf <- "KLF5" | |
footprint_df <- tibble( | |
tf=character(), | |
sample_srr=character(), | |
position=numeric(), | |
signal=numeric() | |
) | |
for (footprint_file in footprint_files) { | |
print(footprint_file) | |
fp_rds <- readRDS(footprint_file) | |
s1 <- str_split(basename(footprint_files[i]), "_", simplify=T)[1] | |
for (tf in all_tfs) { | |
tf_footprint <- as.numeric(unlist(fp1[tf])) | |
for (x in 1:length(tf_footprint)) { | |
footprint_df <- add_row(footprint_df, | |
tf=tf, | |
sample_srr=s1, | |
position=x, | |
signal=tf_footprint[x]) | |
} | |
} | |
} | |
# Do differential peak analysis | |
library(DiffBind) | |
atac_sample_info <- atac_sample_info %>% | |
mutate(bamReads=sprintf("banovich-atacseq-data/macs2_peaks/%s_1.nodup.filtered.aligned.bam", atac_accession), | |
Peaks=sprintf("banovich-atacseq-data/macs2_peaks/%s_1_peaks.xls", atac_accession), | |
PeakCaller="macs", | |
PeakFormat="macs") | |
atac_sample_sheet <- atac_sample_info %>% | |
mutate(SampleID=paste(sample_name, cluster, sep = "_")) %>% | |
dplyr::select(SampleID, | |
atac_accession, | |
bamReads, | |
Peaks, | |
PeakCaller, | |
Condition=cluster, | |
PeakFormat) | |
dba_atac <- dba(sampleSheet=atac_sample_sheet, config=data.frame(RunParallel=T, cores=8)) | |
dba_atac <- dba.count(dba_atac, bRemoveDuplicates = F)#, bUseSummarizeOverlaps = T) | |
dba.save(dba_atac, file="dba_atac") | |
dba_atac <- dba.load("dba_atac") | |
for (t in 1:4) { | |
cluster_a <- t-1 | |
cluster_b <- t | |
if (t==4) { | |
cluster_b <- 5 | |
} | |
cluster_a_samples <- (atac_sample_sheet %>% filter(Condition <= cluster_a))$SampleID | |
cluster_b_samples <- (atac_sample_sheet %>% filter(Condition >= cluster_b))$SampleID | |
cluster_a_group <- dba.mask(dba_atac, attribute = DBA_CONDITION, value = 0:(cluster_a), combine="or") | |
cluster_b_group <- dba.mask(dba_atac, attribute = DBA_CONDITION, value = (cluster_a+1):5, combine="or") | |
transition_dba_atac_contrast <- dba.contrast(dba_atac, group1 = cluster_a_group, | |
name1 = sprintf("cluster_%s", paste(c("A","B","C", "D", "F")[(1):(cluster_a+1)], collapse="")), | |
group2 = cluster_b_group, | |
name2 = sprintf("cluster_%s", paste(c("A","B","C", "D", "F")[(cluster_a+2):5], collapse=""))) | |
transition_dba_atac_contrast <- dba.analyze(transition_dba_atac_contrast, method=c(DBA_EDGER, DBA_DESEQ2)) | |
results <- dba.report(transition_dba_atac_contrast, contrast=1, bFlip=T, bCounts=T, th=1, method=DBA_EDGER, bCalled = T) | |
write_csv(as_tibble(results), sprintf("banovich-atacseq-data/macs2_diff_peaks/edger_transition_%d_diff_atac.csv", t)) | |
results_deseq2 <- dba.report(transition_dba_atac_contrast, contrast=1, bFlip=T, bCounts=T, th=1, method=DBA_DESEQ2, bCalled = T) | |
write_csv(as_tibble(results_deseq2), sprintf("banovich-atacseq-data/macs2_diff_peaks/deseq2_transition_%d_diff_atac.csv", t)) | |
} | |
all_edgers <- tibble(seqnames=character(), | |
start=numeric(), | |
end=numeric(), | |
width=numeric()) | |
for (t in 1:4) { | |
transition_df <- read_csv(sprintf("banovich-atacseq-data/macs2_diff_peaks/edger_transition_%d_diff_atac.csv", t)) | |
all_edgers <- full_join(all_edgers, | |
transition_df %>% select( | |
seqnames, | |
start, | |
end, | |
width, | |
!!sprintf("transition_%d_fold", t):=Fold, | |
!!sprintf("transition_%d_pval", t):=p.value, | |
!!sprintf("transition_%d_FDR", t):=FDR | |
), by=c("seqnames", "start", "end", "width")) | |
} | |
all_edgers_test <- all_edgers %>% | |
mutate(most_significant_transition=max.col(-1 * data.frame(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR)), | |
most_significant_fdr=pmin(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR), | |
passes_significance_threashold=pmin(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR) <= .1) %>% | |
mutate(delta=case_when(most_significant_transition == 1 ~transition_1_fold, | |
most_significant_transition == 2 ~transition_2_fold, | |
most_significant_transition == 3 ~transition_3_fold, | |
most_significant_transition == 4 ~transition_4_fold)) %>% | |
mutate(delta_increasing=delta>0) | |
write_csv(all_edgers_test, "all_edgers_test.csv.gz") | |
all_edgers_test %>% | |
group_by(passes_significance_threashold, delta_increasing) %>% | |
summarise(n=n(), mean_delta=mean(delta)) %>% View | |
all_deseqs <- tibble(seqnames=character(), | |
start=numeric(), | |
end=numeric(), | |
width=numeric()) | |
for (t in 1:4) { | |
transition_df <- read_csv(sprintf("banovich-atacseq-data/macs2_diff_peaks/deseq2_transition_%d_diff_atac.csv", t)) | |
all_deseqs <- full_join(all_deseqs, | |
transition_df %>% select( | |
seqnames, | |
start, | |
end, | |
width, | |
!!sprintf("transition_%d_fold", t):=Fold, | |
!!sprintf("transition_%d_pval", t):=p.value, | |
!!sprintf("transition_%d_FDR", t):=FDR | |
), by=c("seqnames", "start", "end", "width")) | |
} | |
all_deseqs_test <- all_deseqs %>% | |
mutate(most_significant_transition=max.col(-1 * data.frame(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR)), | |
most_significant_fdr=pmin(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR), | |
passes_significance_threashold=pmin(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR) <= .1) %>% | |
mutate(delta=case_when(most_significant_transition == 1 ~transition_1_fold, | |
most_significant_transition == 2 ~transition_2_fold, | |
most_significant_transition == 3 ~transition_3_fold, | |
most_significant_transition == 4 ~transition_4_fold)) %>% | |
mutate(delta_increasing=delta>0) | |
write_csv(all_deseqs_test, "all_deseqs_test.csv.gz") | |
all_deseqs_test %>% | |
group_by(passes_significance_threashold, delta_increasing) %>% | |
summarise(n=n(), mean_delta=mean(delta)) %>% View | |
all_edger_passing_threshold <- all_edgers_test %>% filter(passes_significance_threashold == T) | |
all_edger_passing_threshold_hg38 <- read_tsv("all_edger_passing_threshold_hg38.bed", col_names = c("hg38_chr", "hg38_start", "hg38_end")) %>% | |
filter(hg38_start != 50211461 & hg38_start != 149903954) | |
liftover_coords_all_edger <- read_tsv("all_edger_passing_threshold_hg19_liftover.bed", col_names = c("hg19_chr", "hg19_start", "hg19_end")) | |
all_edger_coor_convert_df <- all_edger_passing_threshold_hg38 %>% | |
cbind(liftover_coords_all_edger) | |
all_edger_passing_threshold_hg38 <- all_edger_passing_threshold %>% | |
filter(seqnames == "chrX") %>% | |
right_join(all_edger_coor_convert_df, by=c("seqnames"="hg38_chr", "start"="hg38_start", "end"="hg38_end")) | |
all_edger_passing_threshold_hg38 <- all_edger_passing_threshold_hg38 %>% | |
filter(passes_significance_threashold == T) %>% | |
mutate(hg19_width = hg19_end - hg19_start) | |
all_transitions_atac_windowed <- tibble(chr = character(), | |
pos_start = numeric(), | |
pos_end = numeric(), | |
mean_delta = numeric(), | |
count = numeric(), | |
cumulative_peak_width = numeric(), | |
key_transition=numeric(), | |
delta_increasing=logical()) | |
for (t in 1:4) { | |
for(d in c(T,F)) { | |
tran_windowed <- window_average_genome_atac_transition(all_edger_passing_threshold_hg38 %>% | |
filter(hg19_chr == "chrX" & most_significant_transition == t & delta_increasing == d) %>% | |
select(-seqnames, -start, -end, -width, seqnames=hg19_chr, start=hg19_start, end=hg19_end, width=hg19_width), | |
window_size = 100000, | |
step_size = 50000, | |
include_empty=T) | |
tran_windowed$key_transition <- t | |
tran_windowed$delta_increasing <- d | |
if (d == F) { | |
tran_windowed$cumulative_peak_width <- -1 * tran_windowed$cumulative_peak_width | |
} | |
all_transitions_atac_windowed <- rbind(all_transitions_atac_windowed, tran_windowed) | |
} | |
} | |
library(facetscales) | |
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz") | |
#hg19_pos was manually annotated for each gene | |
escapee_pos_df <- mean_methylation_df_escape %>% | |
transmute(hg38_gene_name, | |
delta_tran_1=hg19_pos, | |
delta_tran_2=hg19_pos, | |
delta_tran_3=hg19_pos, | |
delta_tran_4=hg19_pos, | |
delta_tran_5=hg19_pos) %>% | |
reshape2::melt(id.vars=c("hg38_gene_name"), | |
value.name="TSS", | |
variable.name="type") | |
scales_y <- list( | |
"1" = scale_y_continuous(limits=c(-8000, 8000), expand=c(0,0), breaks=c(-4000,0,4000)), | |
"2" = scale_y_continuous(limits=c(-8000, 8000), expand=c(0,0), breaks=c(-4000,0,4000)), | |
"3" = scale_y_continuous(limits=c(-4000, 4000), expand=c(0,0), breaks=c(-2000,0,2000)), | |
"4" = scale_y_continuous(limits=c(-4000, 4000), expand=c(0,0), breaks=c(-2000,0,2000))) | |
browser_view_atac <- ggplot(all_transitions_atac_windowed, mapping=aes(x=pos_start, y = cumulative_peak_width)) + | |
geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="darkgrey", size=.3, alpha=.8) + | |
facet_grid_sc(rows=vars(factor(key_transition, levels = 1:4)), scales = list(y=scales_y), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4/5")[n]) | |
})) + | |
geom_area(data=all_transitions_atac_windowed %>% filter(delta_increasing == T), mapping=aes(x=pos_start, y = cumulative_peak_width, fill=factor(key_transition)), inherit.aes = F) + | |
geom_area(data=all_transitions_atac_windowed %>% filter(delta_increasing == F), mapping=aes(x=pos_start, y= cumulative_peak_width, fill=factor(key_transition)) , inherit.aes = F) + | |
labs(y="Number of Peaks", | |
x="X Chr Coordinate (Mb)") + | |
scale_y_continuous(limits=c(-8000,8000), breaks = c(-4000,0,4000)) + | |
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) + | |
theme_bw() + | |
scale_fill_manual(limits=c(1,2,3,5,4), | |
values=pal_d3()(5), guide=F) + | |
base_plot_theme + | |
# scale_color_material("cyan") + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3), | |
title = element_blank(), | |
panel.spacing.y = unit(.5, "mm"), | |
axis.title.y = element_blank(), | |
axis.text.y=element_text(size=5)) + | |
labs(title="Significant ATAC-seq Peaks") | |
ggsave("figs/main/fig3_atac_spreading.pdf", browser_view_atac, width = 171, height=45, units = "mm") | |
# atac fold change densities | |
all_edgers_test_filtered <- all_edgers_test %>% filter(passes_significance_threashold == T & seqnames =="chrX") | |
for(t in 1:4) { | |
atac_tran_density <- ggplot(all_edgers_test_filtered %>% filter(most_significant_transition == t)) + | |
geom_density(aes(x=abs(delta)), size=.15) + | |
theme_bw() + | |
base_plot_theme + | |
scale_y_continuous() + | |
scale_x_continuous(breaks = c(0:8)) + | |
labs(y="Density") + | |
theme(axis.ticks.length = unit(.2, "mm"), | |
axis.ticks.x = element_line(size=.15), | |
plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"), | |
axis.title.x = element_blank(), | |
axis.title.y = element_text(size=2, margin = margin(r=.1,b=0,t=0,l=0, unit="mm")), | |
plot.title = element_text(size=3.5), | |
axis.ticks.y = element_blank(), | |
axis.text.y = element_blank(), | |
panel.grid.major =element_blank(), | |
panel.grid.minor =element_blank(), | |
axis.text.x=element_text(size=2, margin = margin(t=.2,b=0,r=0,l=0, unit="mm")), | |
) | |
ggsave(sprintf("figs/main/fig3_atac_inset_density_%d.pdf", t), atac_tran_density, width=6, height=4, units = "mm") | |
} | |
# v2 of log2FC densities | |
atac_all_tran_density <- ggplot(all_edgers_test_filtered) + | |
geom_density(aes(x=abs(delta), color=as.factor(most_significant_transition), group=most_significant_transition), size=.15) + | |
theme_bw() + | |
base_plot_theme + | |
scale_y_continuous(expand = c(0,.05)) + | |
scale_x_continuous(breaks = c(0:8), expand = c(0,0)) + | |
scale_color_manual(limits=c(1,2,3,4), | |
values=pal_d3()(5)[c(1:3,5)], labels=c("1", "2", "3", "4/5"), guide=F) + | |
labs(y="Density", color="Transition", x="ATAC log2(FC)") + | |
theme(axis.ticks.length = unit(.2, "mm"), | |
axis.ticks = element_line(size=.15), | |
plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"), | |
axis.title.x = element_text(size=4, margin = margin(t=.1,b=0,r=0,l=0, unit="mm")), | |
axis.title.y = element_text(size=4, margin = margin(r=.1,b=0,t=0,l=0, unit="mm")), | |
plot.title = element_blank(), | |
# axis.ticks.y = element_blank(), | |
# axis.text.y = element_blank(), | |
panel.grid.major =element_blank(), | |
panel.grid.minor =element_blank(), | |
axis.text.x=element_text(size=4, margin = margin(t=.2,b=0,r=0,l=0, unit="mm")), | |
axis.text.y=element_text(size=2, margin = margin(t=0,b=0,r=0,l=0, unit="mm")), | |
) | |
ggsave("figs/main/fig3_atac_all_tran_density.pdf", atac_all_tran_density, width=26.419, height=13.5, units = "mm") | |
chrom_size_hg38 <- function(chr) { | |
return(as.numeric(list( | |
"chr1" = 248956422, | |
"chr2" = 242193529, | |
"chr3" = 198295559, | |
"chr4" = 190214555, | |
"chr5" = 181538259, | |
"chr6" = 170805979, | |
"chr7" = 159345973, | |
"chr8" = 145138636, | |
"chr9" = 138394717, | |
"chr10" = 133797422, | |
"chr11" = 135086622, | |
"chr12" = 133275309, | |
"chr13" = 114364328, | |
"chr14" = 107043718, | |
"chr15" = 101991189, | |
"chr16" = 90338345, | |
"chr17" = 83257441, | |
"chr18" = 80373285, | |
"chr19" = 58617616, | |
"chr20" = 64444167, | |
"chr21" = 46709983, | |
"chr22" = 50818468, | |
"chrX" = 156040895, | |
"chrY" = 57227415 | |
)[chr])) | |
} | |
chrom_size_hg19 <- function(chr) { | |
return(as.numeric(list( | |
"chr1" = 249250621, | |
"chr2" = 243199373, | |
"chr3" = 198022430, | |
"chr4" = 191154276, | |
"chr5" = 180915260, | |
"chr6" = 171115067, | |
"chr7" = 159138663, | |
"chr8" = 146364022, | |
"chr9" = 141213431, | |
"chr10" = 135534747, | |
"chr11" = 135006516, | |
"chr12" = 133851895, | |
"chr13" = 115169878, | |
"chr14" = 107349540, | |
"chr15" = 102531392, | |
"chr16" = 90354753, | |
"chr17" = 81195210, | |
"chr18" = 78077248, | |
"chr20" = 63025520, | |
"chr19" = 59128983, | |
"chr22" = 51304566, | |
"chr21" = 48129895, | |
"chrX" = 155270560, | |
"chrY" = 59373566 | |
)[chr])) | |
} | |
window_average_genome_atac_transition <- function(edger_results, window_size = 1000000, step_size = 10000, include_empty=FALSE, hg19=T) { | |
# bed_file_df <- bed_df | |
chrom_size <- chrom_size_hg38 | |
if (hg19 == T) { | |
chrom_size <- chrom_size_hg19 | |
} | |
windowed_bed_df <- tibble(chr = character(), | |
pos_start = numeric(), | |
pos_end = numeric(), | |
delta=numeric()) | |
chromosomes <- unique(edger_results$seqnames) | |
for (chr in chromosomes) { | |
print(chr) | |
chr_size <- chrom_size(chr) | |
for (s in 0:floor(chr_size / step_size)) { | |
start_position <- s * step_size | |
end_position <- min((start_position + window_size), chr_size) | |
# print(chr) | |
# print(s) | |
num_rows <- nrow(edger_results %>% | |
filter((seqnames == chr) & ((abs(!!start_position - start) <= (window_size/2)) | | |
(abs(!!start_position - end) <= (window_size/2))))) # (abs((!!start_position) - pos) <= (window_size/2)))) | |
# print(num_rows) | |
# print(start_position) | |
# print(end_position) | |
# print(start) | |
# print(end) | |
if ((num_rows > 0)) { | |
# print(edger_results %>% | |
# filter((seqnames == chr) & (min(abs(!!start_position - start), abs(!!start_position - end)) <= (window_size/2)))) | |
window_mean_delta <- edger_results %>% | |
filter((seqnames == chr) & ((abs(!!start_position - start) <= (window_size/2)) | | |
(abs(!!start_position - end) <= (window_size/2)))) %>% | |
summarise(chr=first(seqnames), | |
pos_start=start_position, | |
pos_end=end_position, | |
mean_delta=mean(delta), | |
count=n(), | |
cumulative_peak_width=sum(width)) | |
window_mean_delta[is.na(window_mean_delta)] <- 0 | |
write_csv(window_mean_delta, "windowed_all_chr_deltas_temp.csv", col_names = F, append = T) | |
# windowed_bed_df <- add_row(windowed_bed_df, | |
# chr=chr, | |
# pos_start=start_position, | |
# pos_end=end_position, | |
# delta_1=as.numeric(window_mean_delta$mean_tran_1_delta), | |
# delta_2=as.numeric(window_mean_delta$mean_tran_2_delta), | |
# delta_3=as.numeric(window_mean_delta$mean_tran_3_delta), | |
# delta_4=as.numeric(window_mean_delta$mean_tran_4_delta), | |
# delta_5=as.numeric(window_mean_delta$mean_tran_5_delta)) | |
}else if(include_empty == T) { | |
window_mean_delta <- tibble(chr=c(first(chr)), | |
pos_start=c(start_position), | |
pos_end=c(end_position), | |
mean_delta=c(0), | |
count=c(0), | |
cumulative_peak_width=c(0)) | |
write_csv(window_mean_delta, "windowed_all_chr_deltas_temp.csv", col_names = F, append = T) | |
} | |
} | |
} | |
windowed_bed_df <- read_csv("windowed_all_chr_deltas_temp.csv", col_names = c("chr", | |
"pos_start", | |
"pos_end", | |
"mean_delta", | |
"count", | |
"cumulative_peak_width")) | |
file.remove("windowed_all_chr_deltas_temp.csv") | |
return(windowed_bed_df) | |
} | |
h3k27me3_peaks <- read_table2("bedgraphs/GSM1528885_H9_XISTpos_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal")) | |
h3k27me3_peaks_chrX <- h3k27me3_peaks %>% filter(chr == "chrX") | |
h3k27me3_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k27me3_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, | |
include_empty = T, calculate_sum = T) | |
h3k9_xi_peaks <- read_table2("bedgraphs/GSM1528888_H9_XISTpos_K9me3.peaks.bed.gz", col_names=c("chr", "start", "end", "signal")) | |
h3k9_xi_peaks_chrX <- h3k9_xi_peaks %>% filter(chr == "chrX") | |
h3k9_xi_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k9_xi_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T) | |
# write_tsv(all_edger_passing_threshold %>% filter(seqnames == "chrX") %>% select(seqnames, start, end), "all_edger_passing_threshold_hg38.bed", col_names = F) | |
all_edger_passing_threshold_hg38 <- read_tsv("all_edger_passing_threshold_hg38.bed", col_names = c("hg38_chr", "hg38_start", "hg38_end")) %>% | |
filter(hg38_start != 50211461 & hg38_start != 149903954) | |
liftover_coords_all_edger <- read_tsv("all_edger_passing_threshold_hg19_liftover.bed", col_names = c("hg19_chr", "hg19_start", "hg19_end")) | |
all_edger_coor_convert_df <- all_edger_passing_threshold_hg38 %>% | |
cbind(liftover_coords_all_edger) | |
all_edger_coor_converted_df <- all_edger_passing_threshold %>% | |
filter(seqnames == "chrX") %>% | |
right_join(all_edger_coor_convert_df, by=c("seqnames"="hg38_chr", "start"="hg38_start", "end"="hg38_end")) | |
all_edger_coor_converted_df <- all_edger_coor_converted_df %>% | |
filter(passes_significance_threashold == T) %>% | |
mutate(hg19_width = hg19_end - hg19_start) | |
histone_mark_correlations_atac <- tibble(transition=numeric(), | |
h3k27me3_correlation=numeric(), | |
h3k27me3_correlation_p_val=numeric(), | |
h3k9me3_correlation=numeric(), | |
h3k9me3_correlation_p_val=numeric(), | |
dmp_correlation=numeric(), | |
dmp_correlation_p_val=numeric()) | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
for (t in 1:4) { | |
tran_windowed <- window_average_genome_atac_transition(all_edger_coor_converted_df %>% | |
filter(hg19_chr == "chrX" & most_significant_transition == t & delta_increasing == T) %>% | |
select(-seqnames, -start, -end, -width, seqnames=hg19_chr, start=hg19_start, end=hg19_end, width=hg19_width), | |
window_size = 100000, | |
step_size = 50000, | |
include_empty=T, | |
hg19 = T) | |
k27_cor_test <- cor.test(abs(tran_windowed$cumulative_peak_width), h3k27me3_peaks_chrX_windowed$delta) | |
k9_cor_test <- cor.test(abs(tran_windowed$cumulative_peak_width), h3k9_xi_peaks_chrX_windowed$delta) # h3k9me3_signal_chrX$signal) | |
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t)) | |
averaged_distance <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T) | |
dmp_cor_test <- cor.test(abs(tran_windowed$cumulative_peak_width), abs(averaged_distance$delta)) | |
# k27_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_xe_signal_chrX$signal) | |
# k9_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xe_peaks_windowed$delta) | |
histone_mark_correlations_atac <- add_row(histone_mark_correlations_atac, | |
transition=t, | |
h3k27me3_correlation=k27_cor_test$estimate, | |
h3k27me3_correlation_p_val=k27_cor_test$p.value, | |
h3k9me3_correlation=k9_cor_test$estimate, | |
h3k9me3_correlation_p_val=k9_cor_test$p.value, | |
dmp_correlation=dmp_cor_test$estimate, | |
dmp_correlation_p_val=dmp_cor_test$p.value) | |
} | |
histone_cor_df <- reshape2::melt(histone_mark_correlations_atac %>% | |
dplyr::select(transition, | |
h3k27me3_correlation, | |
h3k9me3_correlation, | |
dmp_correlation), | |
id.vars=c("transition")) | |
histone_cor_df$variable <- factor(histone_cor_df$variable, levels=c("h3k9me3_correlation", "h3k27me3_correlation", "dmp_correlation")) | |
histon_cor_plot <- ggplot(histone_cor_df %>% filter(variable != "dmp_correlation")) + | |
geom_bar(aes(x=transition, y=value, fill=variable), stat="identity", position=position_dodge()) + | |
theme_bw() + | |
base_plot_theme + | |
coord_flip() + | |
facet_grid(rows=vars(transition), scales="free_y", labeller = labeller(transition = function(c){ | |
return(list( | |
"1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4/5" | |
)[c]) | |
}))+ | |
labs(title="Corr. of ATAC-peaks With Histone Meth.", | |
x="", | |
y="Pearson Correlation", | |
fill="Correlation With: ") + | |
scale_fill_manual(limits=c("h3k27me3_correlation", "h3k9me3_correlation", "dmp_correlation"), | |
labels=c("H3K27me3", "H3K9me3", "DMP"), | |
values = c(pal_jama()(1), "#8C564B", pal_jama()(5)[5]), | |
guide=F) + | |
scale_x_continuous(labels = NULL) + | |
scale_y_continuous(limits=c(-.13,.33)) + | |
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
legend.text = element_text(size=4), | |
legend.title = element_text(size=4, face="bold"), | |
legend.key.width = unit(2, units="mm"), | |
legend.key.height = unit(3, units="mm"), | |
legend.key.size = unit(.2, units = "mm"), | |
panel.grid.minor.y = element_blank(), | |
axis.ticks.length.y = unit(0, "mm"), | |
axis.title.y = element_blank(), | |
axis.text.y = element_blank(), | |
panel.spacing.y = unit(.75, "mm")) | |
ggsave("figs/main/fig_3_atac_histone_cor_plot_pearson.pdf", histon_cor_plot, width = 46, height = 40, units = "mm") | |
## Short arm distances | |
chr_x_atac_peaks_df_short_arm <- all_edgers_test %>% filter((seqnames == "chrX") & (start <= 58100000)) | |
short_arm_escapees <- mean_methylation_df_escape %>% filter(TSS <= 58100000) | |
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz") | |
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c")) | |
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id & | |
chromosome_name=="X" & | |
transcription_start_site <= 58100000) %>% | |
group_by(ensembl_gene_id) %>% | |
summarise(transcription_start_site=dplyr::first(transcription_start_site), | |
end_position=dplyr::first(end_position)) | |
short_arm_escapees_canonical <- hg38_biomart_structure_df_canonical %>% | |
filter(ensembl_gene_id %in% short_arm_escapees$hg38_ensembl_gene_id) | |
for (row_idx in 1:nrow(chr_x_atac_peaks_df_short_arm)) { | |
pos_start <- unlist(chr_x_atac_peaks_df_short_arm[row_idx, "start"]) | |
pos_end <- unlist(chr_x_atac_peaks_df_short_arm[row_idx, "end"]) | |
min_distance <- min(c(abs(short_arm_escapees_canonical["transcription_start_site"] - pos_start), | |
abs(short_arm_escapees_canonical["transcription_start_site"] - pos_end), | |
abs(short_arm_escapees_canonical["end_position"] - pos_start), | |
abs(short_arm_escapees_canonical["end_position"] - pos_end), recursive=T)) | |
#min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos)) | |
chr_x_atac_peaks_df_short_arm[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end) | |
} | |
get_random_escapee_permutation <- function(short_arm_escapees, chr_x_atac_peaks_df_short_arm, hg38_biomart_structure_df_canonical=hg38_biomart_structure_df_canonical, n=1000) { | |
mean_min_distance_random_df_short_arm <- tibble(run=numeric(), | |
key_transition=numeric(), | |
mean_distance=numeric()) | |
for (i in 1:n) { | |
random_escapees <- sample(hg38_biomart_structure_df_canonical$ensembl_gene_id, nrow(short_arm_escapees)) | |
random_escapees_df <- hg38_biomart_structure_df_canonical %>% filter(ensembl_gene_id %in% random_escapees) | |
test_df <- chr_x_atac_peaks_df_short_arm %>% | |
filter(passes_significance_threashold == T & delta_increasing == T) %>% | |
group_by(seqnames, start, end) %>% summarise( | |
key_transition=dplyr::first(most_significant_transition), | |
passes_threshold=dplyr::first(passes_significance_threashold), | |
min_dist_to_random_escapee=min(c(abs(random_escapees_df$transcription_start_site - start), | |
abs(random_escapees_df$transcription_start_site - end), | |
abs(random_escapees_df$end_position - start), | |
abs(random_escapees_df$end_position - end), recursive=T)) | |
) | |
mean_min_distance_random_df_short_arm <- full_join(mean_min_distance_random_df_short_arm, test_df %>% group_by(key_transition) %>% | |
summarise(mean_distance=median(min_dist_to_random_escapee)) %>% mutate(run=i), | |
by=c("run", "key_transition", "mean_distance")) | |
# write_csv(test_df, "short_arm_pb_random_escapee_distances.csv", append=T, col_names=F) | |
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold")) | |
} | |
return(mean_min_distance_random_df_short_arm) | |
} | |
mean_min_distance_random_df_short_arm <- get_random_escapee_permutation(short_arm_escapees, chr_x_atac_peaks_df_short_arm, hg38_biomart_structure_df_canonical = hg38_biomart_structure_df_canonical) | |
# permutation 1 p-values | |
p1_atac_short_pvals <- c() | |
for (t in 1:4) { | |
p <- nrow(mean_min_distance_random_df_short_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance <= median((chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & | |
most_significant_transition==t))$min_dist_to_pb_escapee)))) | |
p2 <- nrow(mean_min_distance_random_df_short_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance >= median((chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & | |
most_significant_transition==t))$min_dist_to_pb_escapee)))) | |
print(min(p,p2)/1000) | |
pval <- min(p,p2)/1000 | |
if(pval == 0) { | |
pval <- "p < 0.001" | |
}else { | |
pval <- sprintf("p = %.3f", pval) | |
} | |
p1_atac_short_pvals <- c(p1_atac_short_pvals, pval) | |
} | |
# distances_to_random_escapees_short_arm <- read_csv("short_arm_pb_random_escapee_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist")) | |
# distances_to_random_escapees_short_arm$type <- "random" | |
get_random_peaks_permutation <- function(short_arm_escapees_canonical, chr_x_atac_peaks_df_short_arm, hg38_biomart_structure_df_canonical=hg38_biomart_structure_df_canonical, n=1000) { | |
mean_min_distance_random_probes_df_short_arm <- tibble(run=numeric(), | |
key_transition=numeric(), | |
mean_distance=numeric()) | |
chr_x_atac_peaks_df_short_arm <- chr_x_atac_peaks_df_short_arm %>% | |
mutate(rowname=paste(seqnames, start, end, sep="_")) | |
for (i in 1:n) { | |
for (t in 1:4) { | |
random_short_arm_probes <- sample(chr_x_atac_peaks_df_short_arm$rowname, nrow(chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & most_significant_transition==t & delta_increasing == T))) | |
random_short_arm_probes_df <- chr_x_atac_peaks_df_short_arm %>% filter(rowname %in% random_short_arm_probes) | |
test_df <- random_short_arm_probes_df %>% group_by(rowname) %>% summarise( | |
key_transition=t, | |
passes_threshold=T, | |
min_dist_to_random_escapee=min(c(abs(short_arm_escapees_canonical$transcription_start_site - start), | |
abs(short_arm_escapees_canonical$transcription_start_site - end), | |
abs(short_arm_escapees_canonical$end_position - start), | |
abs(short_arm_escapees_canonical$end_position - end), recursive=T)) | |
) | |
mean_min_distance_random_probes_df_short_arm <- add_row(mean_min_distance_random_probes_df_short_arm, | |
run=i, | |
key_transition=t, | |
mean_distance=median(test_df$min_dist_to_random_escapee)) | |
# write_csv(test_df, "short_arm_pb_random_probes_distances.csv", append=T, col_names=F) | |
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold")) | |
} | |
} | |
return(mean_min_distance_random_probes_df_short_arm) | |
} | |
mean_min_distance_random_probes_df_short_arm <- get_random_peaks_permutation(short_arm_escapees_canonical, chr_x_atac_peaks_df_short_arm) | |
# permutation 2 p-values | |
p2_atac_short_pvals <- c() | |
for (t in 1:4) { | |
p <- nrow(mean_min_distance_random_probes_df_short_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance <= median((chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & | |
most_significant_transition==t))$min_dist_to_pb_escapee)))) | |
p2 <- nrow(mean_min_distance_random_probes_df_short_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance >= median((chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & | |
most_significant_transition==t))$min_dist_to_pb_escapee)))) | |
print(min(p,p2)/1000) | |
pval <- min(p,p2)/1000 | |
if(pval == 0) { | |
pval <- "p < 0.001" | |
}else { | |
pval <- sprintf("p = %.3f", pval) | |
} | |
p2_atac_short_pvals <- c(p2_atac_short_pvals, pval) | |
} | |
# distances_to_random_probes_short_arm <- read_csv("short_arm_pb_random_probes_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist")) | |
# distances_to_random_probes_short_arm$type <- "random_probes" | |
p1_atac_short_df <- tibble(most_significant_transition = c(1:3,5), | |
pval = p1_atac_short_pvals, | |
x=0, | |
y=1) | |
p2_atac_short_df <- tibble(most_significant_transition = c(1:3,5), | |
pval = p2_atac_short_pvals, | |
x=3.5e7, | |
y=1) | |
chr_x_atac_peaks_df_short_arm[chr_x_atac_peaks_df_short_arm$most_significant_transition==4, "most_significant_transition"] <- 5 | |
short_arm_permutation_plot <- ggplot(data=chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & delta_increasing == T))+#mean_min_distance_random_df_short_arm) + | |
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_df_short_arm, | |
n = 1e4, | |
fill="darkgrey", | |
color="darkgray", | |
size=.3, | |
alpha=.5) + | |
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df_short_arm, | |
n = 1e4, | |
fill="grey", | |
color="grey", | |
size=.3, | |
alpha=.5) + | |
facet_grid(most_significant_transition~., scales = "free_y", labeller = labeller(most_significant_transition = function(c){ | |
return(list( | |
"1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 4/5" | |
)[c]) | |
})) + | |
geom_vline(data=(chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & delta_increasing == T) %>% group_by(most_significant_transition) %>% summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(xintercept=mean_distance), size=.3) + | |
geom_density(data=chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & delta_increasing == T), | |
mapping=aes(min_dist_to_pb_escapee, | |
y=..scaled.., | |
fill=as.factor(most_significant_transition), | |
color=as.factor(most_significant_transition)), | |
n = 1e4, alpha=.5, size=.3) + | |
geom_text(data=(chr_x_atac_peaks_df_short_arm %>% | |
filter(passes_significance_threashold==T & delta_increasing == T) %>% | |
group_by(most_significant_transition) %>% | |
summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(x=mean_distance, y=.5, label=sprintf("%.1f Mb", mean_distance / 1e6)), | |
hjust=1.1, | |
size=1.2) + | |
geom_text(data=p1_atac_short_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=-0.1, vjust=0) + | |
geom_text(data=p2_atac_short_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=1.1, vjust=0) + | |
theme_bw() + | |
# scale_x_continuous(breaks = c(0,.5,1,1.5,2,2.5,3,3.5)*1e7, labels = c(0,.5,1,1.5,2,2.5,3,3.5)*10) + | |
# scale_x_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8), labels = c(.01,.1,1,10,100)) + | |
scale_x_sqrt(breaks=c(1e6, 5e6, 1e7, 1.5e7, 2e7, 2.5e7, 3e7), labels=c(1, 5, 10, 15, 20, 25, 30), expand=c(0,0)) + | |
scale_y_continuous(limits=c(0,1.2), breaks = c(0,.5,1), minor_breaks = c(.25,.75)) + | |
scale_color_d3(guide=F, limits=c("1","2","3","4","5")) + | |
scale_fill_d3(guide=F, limits=c("1","2","3","4","5")) + | |
# scale_fill_material("teal", name="Transition", guide=F) + | |
# scale_color_material("teal", name="Transition", guide=F) + | |
labs(title="Min. Distance of ATAC-peaks to Escapee on Short Arm", | |
x="Minimum Distance to Gene Set (Mb)", | |
y="Scaled Density") + | |
base_plot_theme + | |
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.spacing.y = unit(.75, "mm")) | |
ggsave("figs/main/fig3_min_dist_atac_escapee_short_arm.pdf", short_arm_permutation_plot, width = 64, height=40, units = "mm") | |
## Long arm distances | |
chr_x_atac_peaks_df_long_arm <- all_edgers_test %>% filter((seqnames == "chrX") & (start > 58100000)) | |
long_arm_escapees <- mean_methylation_df_escape %>% filter(TSS > 58100000) | |
hg38_biomart_structure_df_canonical_long_arm <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id & | |
chromosome_name=="X" & | |
transcription_start_site > 58100000) %>% | |
group_by(ensembl_gene_id) %>% | |
summarise(transcription_start_site=dplyr::first(transcription_start_site), | |
end_position=dplyr::first(end_position)) | |
long_arm_escapees_canonical <- hg38_biomart_structure_df_canonical_long_arm %>% | |
filter(ensembl_gene_id %in% long_arm_escapees$hg38_ensembl_gene_id) | |
for (row_idx in 1:nrow(chr_x_atac_peaks_df_long_arm)) { | |
pos_start <- unlist(chr_x_atac_peaks_df_long_arm[row_idx, "start"]) | |
pos_end <- unlist(chr_x_atac_peaks_df_long_arm[row_idx, "end"]) | |
min_distance <- min(c(abs(long_arm_escapees_canonical["transcription_start_site"] - pos_start), | |
abs(long_arm_escapees_canonical["transcription_start_site"] - pos_end), | |
abs(long_arm_escapees_canonical["end_position"] - pos_start), | |
abs(long_arm_escapees_canonical["end_position"] - pos_end), recursive=T)) | |
#min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos)) | |
chr_x_atac_peaks_df_long_arm[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end) | |
} | |
mean_min_distance_random_df_long_arm <- get_random_escapee_permutation(long_arm_escapees, chr_x_atac_peaks_df_long_arm, hg38_biomart_structure_df_canonical = hg38_biomart_structure_df_canonical_long_arm) | |
# permutation 1 p-values | |
p1_atac_long_pvals <- c() | |
for (t in 1:4) { | |
p <- nrow(mean_min_distance_random_df_long_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance <= median((chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & | |
most_significant_transition==t))$min_dist_to_pb_escapee)))) | |
p2 <- nrow(mean_min_distance_random_df_long_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance >= median((chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & | |
most_significant_transition==t))$min_dist_to_pb_escapee)))) | |
print(min(p,p2)/1000) | |
pval <- min(p,p2)/1000 | |
if(pval == 0) { | |
pval <- "p < 0.001" | |
}else { | |
pval <- sprintf("p = %.3f", pval) | |
} | |
p1_atac_long_pvals <- c(p1_atac_long_pvals, pval) | |
} | |
mean_min_distance_random_probes_df_long_arm <- get_random_peaks_permutation(long_arm_escapees_canonical, chr_x_atac_peaks_df_long_arm, hg38_biomart_structure_df_canonical = hg38_biomart_structure_df_canonical_long_arm) | |
# permutation 2 p-values | |
p2_atac_long_pvals <- c() | |
for (t in 1:4) { | |
p <- nrow(mean_min_distance_random_probes_df_long_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance <= median((chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & | |
most_significant_transition==t))$min_dist_to_pb_escapee)))) | |
p2 <- nrow(mean_min_distance_random_probes_df_long_arm %>% | |
arrange(mean_distance) %>% filter(key_transition == t & | |
(mean_distance >= median((chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & | |
most_significant_transition==t))$min_dist_to_pb_escapee)))) | |
print(min(p,p2)/1000) | |
pval <- min(p,p2)/1000 | |
if(pval == 0) { | |
pval <- "p < 0.001" | |
}else { | |
pval <- sprintf("p = %.3f", pval) | |
} | |
p2_atac_long_pvals <- c(p2_atac_long_pvals, pval) | |
} | |
p1_atac_long_df <- tibble(most_significant_transition = c(1:4), | |
pval = p1_atac_long_pvals, | |
x=0, | |
y=1) | |
p2_atac_long_df <- tibble(most_significant_transition = c(1:4), | |
pval = p2_atac_long_pvals, | |
x=6.5e7, | |
y=1) | |
long_arm_permutation_plot_atac_peaks <- ggplot(data=chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & delta_increasing == T))+#mean_min_distance_random_df_short_arm) + | |
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_df_long_arm, | |
n = 1e4, | |
fill="darkgrey", | |
color="darkgray", | |
size=.3, | |
alpha=.5) + | |
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df_long_arm, | |
n = 1e4, | |
fill="grey", | |
color="grey", | |
size=.3, | |
alpha=.5) + | |
facet_grid(most_significant_transition~., scales = "free_y", labeller = labeller(most_significant_transition = function(c){ | |
return(list( | |
"1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4/5", | |
"5"="Tran. 5" | |
)[c]) | |
})) + | |
geom_vline(data=(chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & delta_increasing == T) %>% group_by(most_significant_transition) %>% summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(xintercept=mean_distance), size=.3) + | |
geom_density(data=chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & delta_increasing == T), | |
mapping=aes(min_dist_to_pb_escapee, | |
y=..scaled.., | |
fill=as.factor(most_significant_transition), | |
color=as.factor(most_significant_transition)), | |
n = 1e4, alpha=.5, size=.3) + | |
geom_text(data=(chr_x_atac_peaks_df_long_arm %>% | |
filter(passes_significance_threashold==T & delta_increasing == T) %>% | |
group_by(most_significant_transition) %>% | |
summarise(mean_distance=median(min_dist_to_pb_escapee))), | |
mapping = aes(x=mean_distance, y=.5, label=sprintf("%.1f Mb", mean_distance / 1e6)), | |
hjust=1.1, | |
size=1.2) + | |
geom_text(data=p1_atac_long_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=-0.1, vjust=0) + | |
geom_text(data=p2_atac_long_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=1.1, vjust=0) + | |
theme_bw() + | |
# scale_x_continuous(breaks = c(0,.5,1,1.5,2,2.5,3,3.5)*1e7, labels = c(0,.5,1,1.5,2,2.5,3,3.5)*10) + | |
# scale_x_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8), labels = c(.01,.1,1,10,100)) + | |
scale_x_sqrt(breaks=c(1e6, 5e6, 1e7, 1.5e7, 2e7, | |
2.5e7, 3e7, 3.5e7, 4e7, 4.5e7, 5e7, 5.5e7, 6e7), | |
labels=c(1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60), expand=c(0,0)) + | |
scale_y_continuous(limits=c(0,1.2), breaks = c(0,.5,1), minor_breaks = c(.25,.75)) + | |
scale_color_d3(guide=F, limits=c(1:3,5,4)) + | |
scale_fill_d3(guide=F, limits=c(1:3,5,4)) + | |
# scale_fill_material("teal", name="Transition", guide=F) + | |
# scale_color_material("teal", name="Transition", guide=F) + | |
labs(title="Min. Distance of ATAC-peaks to Escapee on Long Arm", | |
x="Minimum Distance to Gene Set (Mb)", | |
y="Scaled Density") + | |
base_plot_theme + | |
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
legend.key.width = unit(2, units="mm"), | |
legend.key.height = unit(3, units="mm"), | |
legend.key.size = unit(.2, units = "mm"), | |
panel.spacing.y = unit(.75, "mm")) | |
ggsave("figs/main/min_dist_atac_escapee_long_arm.pdf", long_arm_permutation_plot_atac_peaks, width = 64, height=40, units = "mm") | |
# escapee distance cumulative plots | |
probability_tib_short_arm <- tibble( | |
min_distance = numeric(), | |
transition = numeric(), | |
probability = numeric() | |
) | |
for (dist_to_escapee in seq(1, 7e7, 1e6)) { | |
# dist_to_escapee <- dist_to_escapee * 1e4 | |
sites_within_dist <- chr_x_atac_peaks_df_short_arm %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_significance_threashold == T)) | |
for (t in 1:5) { | |
prob <- nrow(sites_within_dist %>% filter(most_significant_transition == t & passes_significance_threashold == T)) / | |
nrow(chr_x_atac_peaks_df_short_arm %>% filter(most_significant_transition == t & passes_significance_threashold == T)) | |
probability_tib_short_arm <- add_row(probability_tib_short_arm, | |
min_distance = dist_to_escapee, | |
transition = t, | |
probability = prob) | |
} | |
} | |
d50_df_short <- tibble(transition=numeric(),d50=numeric()) | |
for (t in 1:5) { | |
d50_dist <- as.numeric(chr_x_atac_peaks_df_short_arm %>% | |
filter(most_significant_transition == t & passes_significance_threashold==T) %>% | |
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>% | |
dplyr::select(min_dist_to_pb_escapee)) | |
d50_df_short <- add_row(d50_df_short, transition=t, d50=d50_dist) | |
} | |
probability_of_atac_peak_short_arm <- ggplot(na.omit(probability_tib_short_arm), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) + | |
geom_line() + | |
labs(x = "Minimum Distance to Escapee (Mb)", | |
y = "Fraction of Transition Sites", | |
color = "Transition", | |
title="Probability of Open Chromatin on Short Arm by\nDistance from Escapee") + | |
theme_bw() + | |
scale_color_futurama(breaks=c(0,1,2,3,5,4), labels=c("0","1", "2", "3", "NA", "4/5")) + | |
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/supplementary/sup2_prob_atac_peak_short_arm.svg", probability_of_atac_peak_short_arm, width = 58, height=40, units = "mm") | |
probability_tib_long_arm <- tibble( | |
min_distance = numeric(), | |
transition = numeric(), | |
probability = numeric() | |
) | |
for (dist_to_escapee in seq(1, 7e7, 1e6)) { | |
# dist_to_escapee <- dist_to_escapee * 1e4 | |
sites_within_dist <- chr_x_atac_peaks_df_long_arm %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_significance_threashold == T)) | |
for (t in 1:5) { | |
prob <- nrow(sites_within_dist %>% filter(most_significant_transition == t & passes_significance_threashold == T)) / | |
nrow(chr_x_atac_peaks_df_long_arm %>% filter(most_significant_transition == t & passes_significance_threashold == T)) | |
probability_tib_long_arm <- add_row(probability_tib_long_arm, | |
min_distance = dist_to_escapee, | |
transition = t, | |
probability = prob) | |
} | |
} | |
d50_df <- tibble(transition=numeric(),d50=numeric()) | |
for (t in 1:5) { | |
d50_dist <- as.numeric(chr_x_atac_peaks_df_long_arm %>% | |
filter(most_significant_transition == t & passes_significance_threashold==T) %>% | |
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>% | |
dplyr::select(min_dist_to_pb_escapee)) | |
d50_df <- add_row(d50_df, transition=t, d50=d50_dist) | |
} | |
probability_of_atac_peaks_long_arm <- ggplot(na.omit(probability_tib_long_arm), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) + | |
geom_line() + | |
labs(x = "Minimum Distance to Escapee (Mb)", | |
y = "Fraction of Transition Sites", | |
color = "Transition", | |
title="Probability of Open Chromatin on Long Arm by\nDistance from Escapee") + | |
theme_bw() + | |
scale_color_futurama(breaks=c(0,1,2,3,5,4), labels=c("0","1", "2", "3", "NA", "4/5")) + | |
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/supplementary/sup2_prob_atac_peak_long_arm.svg", probability_of_atac_peaks_long_arm, width = 58, height=40, units = "mm") | |
#probability of escape for whole x chromosome | |
chr_x_atac_peaks_df_whole_chrom <- all_edgers_test %>% filter((seqnames == "chrX")) | |
whole_chrom_escapees <- mean_methylation_df_escape | |
hg38_biomart_structure_df_canonical_whole_chrom <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id & | |
chromosome_name=="X") %>% | |
group_by(ensembl_gene_id) %>% | |
summarise(transcription_start_site=dplyr::first(transcription_start_site), | |
end_position=dplyr::first(end_position)) | |
whole_chrom_escapees_canonical <- hg38_biomart_structure_df_canonical_whole_chrom %>% | |
filter(ensembl_gene_id %in% whole_chrom_escapees$hg38_ensembl_gene_id) | |
for (row_idx in 1:nrow(chr_x_atac_peaks_df_whole_chrom)) { | |
pos_start <- unlist(chr_x_atac_peaks_df_whole_chrom[row_idx, "start"]) | |
pos_end <- unlist(chr_x_atac_peaks_df_whole_chrom[row_idx, "end"]) | |
min_distance <- min(c(abs(whole_chrom_escapees_canonical["transcription_start_site"] - pos_start), | |
abs(whole_chrom_escapees_canonical["transcription_start_site"] - pos_end), | |
abs(whole_chrom_escapees_canonical["end_position"] - pos_start), | |
abs(whole_chrom_escapees_canonical["end_position"] - pos_end), recursive=T)) | |
#min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos)) | |
chr_x_atac_peaks_df_whole_chrom[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end) | |
} | |
probability_tib_whole_chrom <- tibble( | |
min_distance = numeric(), | |
transition = numeric(), | |
probability = numeric() | |
) | |
for (dist_to_escapee in seq(1, 7e7, 1e6)) { | |
# dist_to_escapee <- dist_to_escapee * 1e4 | |
sites_within_dist <- chr_x_atac_peaks_df_whole_chrom %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_significance_threashold == T)) | |
for (t in 1:5) { | |
prob <- nrow(sites_within_dist %>% filter(most_significant_transition == t & passes_significance_threashold == T)) / | |
nrow(chr_x_atac_peaks_df_whole_chrom %>% filter(most_significant_transition == t & passes_significance_threashold == T)) | |
probability_tib_whole_chrom <- add_row(probability_tib_whole_chrom, | |
min_distance = dist_to_escapee, | |
transition = t, | |
probability = prob) | |
} | |
} | |
probability_of_atac_peaks_whole_chrom <- ggplot(na.omit(probability_tib_whole_chrom), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) + | |
geom_line() + | |
labs(x = "Minimum Distance to Escapee (Mb)", | |
y = "Fraction of Transition Sites", | |
color = "Transition", | |
title="Probability of Open Chromatin on Whole Chromosome by\nDistance from Escapee") + | |
theme_bw() + | |
scale_color_futurama(breaks=c(0,1,2,3,5,4), labels=c("0","1", "2", "3", "NA", "4/5")) + | |
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) + | |
base_plot_theme + | |
theme(legend.title = element_text(size=5, face="bold"), | |
legend.text = element_text(size=5), | |
legend.key.width = unit(2, "mm")) | |
ggsave("figs/supplementary/sup2_prob_atac_peak_whole_chrom.svg", probability_of_atac_peaks_whole_chrom, width = 58, height=40, units = "mm") | |
## Figure for XIST | |
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz") | |
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c")) | |
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id & | |
chromosome_name=="X") %>% | |
group_by(ensembl_gene_id) %>% | |
summarise(external_gene_name=dplyr::first(external_gene_name), | |
transcription_start_site=dplyr::first(transcription_start_site), | |
start_position=dplyr::first(start_position), | |
end_position=dplyr::first(end_position)) | |
xist_cov_start <- 73845000 # as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "XIST"))$start_position[1]) | |
xist_cov_end <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "XIST"))$end_position[1]) + 5000 | |
firre_cov_start <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "FIRRE"))$start_position[1]) | |
firre_cov_end <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "FIRRE"))$end_position[1]) + 5000 | |
xist_peaks <- tibble(sample_atac_srr=character(), | |
chr=character(), | |
start=numeric(), | |
end=numeric(), | |
cluster=numeric(), | |
pileup=numeric(), | |
abs_summit=numeric()) | |
firre_peaks <- tibble(sample_atac_srr=character(), | |
chr=character(), | |
start=numeric(), | |
end=numeric(), | |
cluster=numeric(), | |
pileup=numeric(), | |
abs_summit=numeric()) | |
for (i in 1:nrow(atac_sample_info)) { | |
srr <- as.character(atac_sample_info[i, "atac_accession"]) | |
cluster <- as.numeric(atac_sample_info[i, "cluster"]) | |
peaks_file <- as.character(atac_sample_info[i, "Peaks"]) | |
peaks_df <- read_tsv(peaks_file, comment = "#") | |
# chrX:73,820,656-73,852,723) | |
sample_xist_peaks <- peaks_df %>% | |
filter(chr == "chrX" & | |
start >= 73820656 & | |
end <= 73852723 + 5000) %>% | |
mutate(sample_atac_srr=srr, | |
cluster=cluster) %>% | |
dplyr::select(sample_atac_srr, chr, start, end, cluster, pileup, abs_summit) | |
xist_peaks <- xist_peaks %>% | |
rbind(sample_xist_peaks) | |
# chrX:73,820,656-73,852,723) | |
sample_firre_peaks <- peaks_df %>% | |
filter(chr == "chrX" & | |
end >= firre_cov_start & | |
start <= firre_cov_end) %>% | |
mutate(sample_atac_srr=srr, | |
cluster=cluster) %>% | |
dplyr::select(sample_atac_srr, chr, start, end, cluster, pileup, abs_summit) | |
firre_peaks <- firre_peaks %>% | |
rbind(sample_firre_peaks) | |
} | |
ggplot(xist_peaks) + | |
annotate("rect", xmin=73851080, xmax=73851597, ymin=0, ymax=150, fill="grey") + | |
annotate("rect", xmin=73851995, xmax=73852383, ymin=0, ymax=150, fill="red") + | |
annotate("rect", xmin=73851195, xmax=73851236, ymin=0, ymax=150, fill="blue") + | |
annotate("rect", xmin=73849811, xmax=73849975, ymin=0, ymax=150, fill="green") + | |
geom_vline(xintercept=73852723) + | |
geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=pileup, fill=sample_atac_srr), alpha=.2, color="black") + | |
facet_grid(rows=vars(cluster)) + | |
scale_x_continuous(limits=c(73845000, 73852723+5000)) | |
# plot xist peaks using the read bed files | |
all_reads_df <- tibble(chr=character(), | |
start=numeric(), | |
end=numeric(), | |
score=numeric(), | |
sample=character(), | |
cluster=numeric()) | |
for (i in 1:nrow(atac_sample_info)) { | |
srr <- as.character(atac_sample_info[i, "atac_accession"]) | |
cluster <- as.numeric(atac_sample_info[i, "cluster"]) | |
bedgraph_file <- sprintf("banovich-atacseq-data/macs2_peaks/%s.BedGraph", srr) | |
reads_df <- read_tsv(bedgraph_file, col_names = c("chr", "start", "end", "score")) | |
reads_df <- reads_df %>% | |
filter(chr == "chrX" & | |
((end >= xist_cov_start & start <= xist_cov_end) | | |
(end >= firre_cov_start & start <= firre_cov_end))) | |
reads_df$sample <- srr | |
reads_df$cluster <- cluster | |
all_reads_df <- all_reads_df %>% | |
rbind(reads_df) | |
} | |
write_csv(all_reads_df, "xist_firre_atac_coverage_reads.csv.gz") | |
xist_gene_end <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "XIST"))$end_position[1]) | |
xist_facet_df <- tibble(name=c("XIST", "Minimal Promoter", "CpgII/P2", "YY1", "YY1", "YY1", "YY1", "Exon1"), | |
cluster=6, | |
ymin=0, | |
ymax=.3, | |
xmin=c(xist_cov_start+1500, xist_gene_end, 73851080, 73851489, 73851374, 73851341, 73851205, 73841382), | |
xmax=c(xist_gene_end, 73852964, 73851597, 73851498, 73851383, 73851350, 73851214, 73852753)) | |
xist_tss_df <- tibble(cluster=6, | |
group=c("down_line", "down_line", "left_line", "left_line", "arrow", "arrow", "arrow"), | |
x=c(xist_gene_end, xist_gene_end, xist_gene_end, xist_gene_end-100, xist_gene_end-100, xist_gene_end-100, xist_gene_end-200), | |
y=c(.3, -.1, -.1, -.1, -.05, -.15, -.1)) | |
xist_text_df <- tibble(cluster=6, | |
x=xist_gene_end-350, | |
y=-.1, | |
text="TSS") | |
library(facetscales) | |
scales_y <- list( | |
"0" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"0.5" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"1" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"2" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"3" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"4" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"5" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"6" = scale_y_continuous(limits=c(-.25, .55), expand=c(0,0), breaks=NULL) | |
) | |
significant_diff_peaks_xist_df <- read_csv("all_edgers_test.csv.gz") %>% | |
filter(passes_significance_threashold == T & | |
seqnames == "chrX" & | |
start <= (xist_cov_end-3500) & | |
end >= (xist_cov_start+1500)) %>% | |
mutate(cluster=most_significant_transition - 1) | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz") | |
xist_annotation_df <- filtered_site_annotation_df %>% filter(hg38_chromosome == "chrX" & between(hg38_pos, xist_cov_start, xist_cov_end)) | |
# separate low XIST lines in cluster 0/A | |
all_reads_df <- all_reads_df %>% | |
mutate(cluster=case_when(sample %in% c("SRR6412441", "SRR6412449") ~ .5, | |
T ~ cluster)) | |
xist_atac_coverage_plot <- ggplot(all_reads_df) + | |
geom_vline(data=xist_annotation_df, mapping=aes(xintercept=hg38_pos), alpha=.3, size=.15) + | |
geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=score), alpha=.2, fill="black") + | |
#fill=as.factor(cluster)), | |
geom_segment(data=xist_peaks, aes(x=start, xend=end, y=-.1, yend=-.1, color="no change"), size=.7, alpha=.4) + | |
geom_segment(data=significant_diff_peaks_xist_df, aes(x=start, xend=end, y=-.1, yend=-.1, color=delta_increasing), size=.7, alpha=1) + | |
facet_grid_sc(rows = vars(as.factor(cluster)), space="free_y", scales = list(y=scales_y), labeller = labeller(.rows=function(n){ | |
return(list("0"="Cluster A", | |
"0.5"="Cluster A (XIST-)", | |
"1"="Cluster B", | |
"2"="Cluster C", | |
"3"="Cluster D", | |
"5"="Cluster F", | |
"6"="XIST")[n]) | |
})) + | |
geom_rect(data= xist_facet_df, aes(xmin=xmin, xmax=xmax, fill=name, ymin = ymin, ymax=ymax)) + | |
geom_polygon(data=xist_tss_df, aes(x=x, y=y, group=group)) + | |
geom_path(data=xist_tss_df, aes(x=x, y=y, group=group), size=.3) + | |
geom_text(data=xist_text_df, aes(x=x, y=y, label=text), size=1.2) + | |
theme_bw() + | |
base_plot_theme + | |
scale_fill_manual(values = c("#000000","#000000", "#7AC943", "#FF931E", "#ED1C24"), | |
breaks = c("XIST", "Exon1", "Minimal Promoter", "CpgII/P2", "YY1"), | |
labels = c("XIST", "Exon 1", "Minimal Promoter", "CpgII/P2", "YY1")) + | |
scale_color_manual(values = c("#000000", pal_npg()(2)), | |
breaks = c("no change", "FALSE", "TRUE"), | |
labels=c("Invariant", "Closing", "Opening")) + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3)) + | |
scale_x_continuous(limits=c(xist_cov_start+1500, xist_cov_end-3500), expand=c(0,0)) + | |
labs(title="XIST ATAC Coverage", | |
x="X Chr Coordinate", | |
y="Reads per million", | |
fill="XIST Region", | |
color="ATAC Peak") | |
ggsave("figs/main/fig2_xist_atac_coverage_plot.svg",xist_atac_coverage_plot, width=84, height=80, units = "mm") | |
scales_y_firre_atac <- list( | |
"0" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"1" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"2" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"3" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"4" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"5" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)), | |
"6" = scale_y_continuous(limits=c(-.25, .55), expand=c(0,0), breaks=NULL) | |
) | |
firre_gene_end <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "FIRRE"))$transcription_start_site[1]) | |
firre_repeats <- read_table("firre.fa.out_repeats", | |
col_names = c("SW_score", | |
"perc_div", | |
"perc_del", | |
"perc_ins", | |
"query_sequence", | |
"pos_start", | |
"pos_end", | |
"pos_left", | |
"strand", | |
"matching_repeat", | |
"repeat_class_family", | |
"pos_in_repeat_start", | |
"pos_in_repeat_end", | |
"pos_in_repeat_left", | |
"id", | |
"astericks"), | |
skip = 3) | |
firre_facet_df <- tibble(name=c("FIRRE", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon"), | |
cluster=6, | |
ymin=0, | |
ymax=.3, | |
xmin=c(firre_cov_start+1500, 131830604, 131825365, 131794466, 131794115, 131785258, 131784039, 131777744, 131768696, 131756709, 131755746, 131749458, 131692636), | |
xmax=c(firre_gene_end, 131830292, 131825222, 131794324, 131793994, 131785157, 131783887, 131777623, 131768549, 131756611, 131755598, 131749306, 131688779)) %>% | |
rbind( firre_repeats %>% | |
filter(repeat_class_family=="Unspecified" & !is.na(matching_repeat) & matching_repeat != "") %>% | |
mutate(end=131830862-pos_start+1, start=131830862-pos_end+1, cluster=6, name=matching_repeat, ymin=0, ymax=.3) %>% | |
select(name, cluster, ymin, ymax, xmin=start, xmax=end) %>% | |
filter(xmax >= firre_cov_start & xmin <= firre_cov_end & name %in% c("R=1"))) %>% #,"R=3","R=12","R=15","R=17" | |
mutate(name=factor(name, | |
levels = (c("FIRRE", | |
"Exon", | |
"R=0", | |
"R=1", | |
"R=2", | |
"R=3", | |
"R=4", | |
"R=5", | |
"R=6", | |
"R=7", | |
"R=8", | |
"R=9", | |
"R=10", | |
"R=11", | |
"R=12", | |
"R=13", | |
"R=14", | |
"R=15", | |
"R=16", | |
"R=17")))) | |
# firre_facet_df[firre_facet_df$name == "R5", "ymin"] <- -1 | |
firre_tss_df <- tibble(cluster=6, | |
group=c("down_line", "down_line", "left_line", "left_line", "arrow", "arrow", "arrow"), | |
x=c(firre_gene_end, firre_gene_end, firre_gene_end, firre_gene_end-2000, firre_gene_end-2000, firre_gene_end-2000, firre_gene_end-4000), | |
y=c(.3, -.1, -.1, -.1, -.05, -.15, -.1)) | |
firre_text_df <- tibble(cluster=6, | |
x=firre_gene_end-7000, | |
y=-.1, | |
text="TSS") | |
significant_diff_peaks_firre_df <- read_csv("all_edgers_test.csv.gz") %>% | |
filter(passes_significance_threashold == T & | |
seqnames == "chrX" & | |
start <= (firre_cov_end) & | |
end >= (firre_cov_start)) %>% | |
mutate(cluster=most_significant_transition - 1) | |
firre_annotation_df <- filtered_site_annotation_df %>% filter(hg38_chromosome == "chrX" & between(hg38_pos, firre_cov_start, firre_cov_end)) | |
firre_atac_coverage_plot <- ggplot(all_reads_df) + | |
geom_vline(data=firre_annotation_df, mapping=aes(xintercept=hg38_pos), alpha=.3, size=.15) + | |
geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=score), alpha=.2, fill="black") + | |
geom_segment(data=firre_peaks, aes(x=start, xend=end, y=-.1, yend=-.1, color="no change"), size=.7, alpha=.4) + | |
geom_segment(data=significant_diff_peaks_firre_df, aes(x=start, xend=end, y=-.1, yend=-.1, color=delta_increasing), size=.7, alpha=1) + | |
facet_grid_sc(rows = vars(factor(cluster, levels=c(6,0:3,5))), scales = list(y=scales_y_firre_atac), space="free_y", labeller = labeller(.rows=function(n){ | |
return(list("0"="Cluster A", | |
"1"="Cluster B", | |
"2"="Cluster C", | |
"3"="Cluster D", | |
"5"="Cluster F", | |
"6"="FIRRE")[n]) | |
})) + | |
geom_rect(data= firre_facet_df, aes(xmin=xmin, xmax=xmax, fill=name, ymin = ymin, ymax=ymax)) + | |
geom_polygon(data=firre_tss_df, aes(x=x, y=y, group=group)) + | |
geom_path(data=firre_tss_df, aes(x=x, y=y, group=group), size=.3) + | |
geom_text(data=firre_text_df, aes(x=x, y=y, label=text), size=1.2) + | |
scale_fill_manual(values = c("#0195F8","#000000", "yellow"),# "red", "red", "red", "red","red", "red", "red", "red", "red", "red", "red", "red", "red", "red","red", "red", "red"), | |
breaks = c("FIRRE", | |
"Exon", | |
# "R=0", | |
"R=1", | |
# "R=2", | |
"R=3", | |
# "R=4", | |
# "R=5", | |
# "R=6", | |
# "R=7", | |
# "R=8", | |
# "R=9", | |
# "R=10", | |
# "R=11", | |
"R=12", | |
# "R=13", | |
# "R=14", | |
"R=15", | |
# "R=16", | |
"R=17"), | |
labels = c("Intron", | |
"Exon", | |
"Repeat", | |
"Repeat", | |
"Repeat", | |
"Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
# "Repeat", | |
"Repeat")) + | |
theme_bw() + | |
base_plot_theme + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3)) + | |
scale_color_manual(values = c("#000000", pal_npg()(2)), | |
breaks = c("no change", "FALSE", "TRUE"), | |
labels=c("Invariant", "Closing", "Opening")) + | |
scale_x_continuous(limits=c(firre_cov_start, firre_cov_end), expand=c(0,0)) + | |
labs(title="FIRRE ATAC Coverage", | |
x="X Chr Coordinate", | |
y="Reads per million") | |
ggsave("figs/main/fig2_firre_atac_coverage_plot.svg",firre_atac_coverage_plot, width=84, height=80, units = "mm") | |
heatmap_xist_peaks <- tibble(start=numeric(), | |
end=numeric(), | |
sample=character(), | |
strength=numeric(), | |
cluster=numeric()) | |
for (i in 1:nrow(atac_sample_info)) { | |
srr <- as.character(atac_sample_info[i, "atac_accession"]) | |
cluster <- as.numeric(atac_sample_info[i, "cluster"]) | |
for (x in 1:18) { | |
start_coor <- 73847000+((x-1)*500) | |
end_coor <- 73847000+((x)*500) | |
pileup <- NA | |
if (nrow(xist_peaks %>% | |
filter(sample_atac_srr == srr & | |
start <= end_coor & | |
end >= start_coor)) > 0){ | |
pileup <- as.numeric((xist_peaks %>% | |
filter(sample_atac_srr == srr & | |
start <= end_coor & | |
end >= start_coor))[1,"fold_enrichment"]) | |
# if (pileup < 50) { | |
# pileup <- 50 | |
# }else if (pileup > 100) { | |
# pileup<- 100 | |
# } | |
} | |
heatmap_xist_peaks <- add_row(heatmap_xist_peaks, | |
start=start_coor, | |
end=end_coor, | |
sample=srr, | |
cluster=cluster, | |
strength=pileup) | |
} | |
} | |
ggplot(heatmap_xist_peaks) + | |
geom_tile(aes(x=sample, y=start, fill=strength))+ | |
facet_grid(cols=vars(cluster), scale="free_x") + | |
scale_fill_viridis_c() | |
# Footprint motif analysis | |
# motif_counts_df <- tibble(sample_atac_srr=character(), | |
# motif_tf=character(), | |
# cluster=numeric(), | |
# cou=numeric()) | |
motif_counts_all <- tibble(tf_motif=character()) | |
for (i in 1:nrow(atac_sample_info)) { | |
srr <- as.character(atac_sample_info[i, "atac_accession"]) | |
cluster <- as.numeric(atac_sample_info[i, "cluster"]) | |
motif_file <- sprintf("banovich-atacseq-data/macs2_peaks/hint_atac_output/match/%s_1_mpbs.bed", srr) | |
sample_motifs_df <- read_tsv(motif_file, col_names = c("chr","start","end","tf_motif","score","strand", "empty")) | |
motif_counts <- sample_motifs_df %>% | |
count(tf_motif) %>% | |
rename(!!srr:=n) #%>% | |
#add_row(tf_motif="cluster", | |
# !!srr:=cluster) | |
motif_counts_all <- full_join(motif_counts_all, | |
motif_counts, | |
by=c("tf_motif")) | |
# chrX:73,820,656-73,852,723) | |
} | |
arranged_sample_names <- (atac_sample_info %>% | |
arrange(cluster))$atac_accession | |
motif_counts_all <- motif_counts_all %>% select(tf_motif, !!arranged_sample_names) | |
library(broom) | |
motif_counts_all_norm <- motif_counts_all | |
count_sums_df <- colSums(motif_counts_all %>% select(-tf_motif)) %>% tidy() | |
for (i in 1:nrow(count_sums_df)) { | |
srr <- as.character(count_sums_df[i, "names"]) | |
total <- as.numeric(count_sums_df[i, "x"]) | |
counts_col <- as.numeric(unlist(motif_counts_all_norm[srr])) | |
motif_counts_all_norm <- motif_counts_all_norm %>% | |
mutate(!!srr:=1e6*counts_col/total) | |
} | |
write_csv(motif_counts_all %>% select(tf_motif, !!arranged_sample_names), "motif_counts_all.csv") | |
write_csv(motif_counts_all_norm %>% select(tf_motif, !!arranged_sample_names), "motif_counts_all_norm.csv") | |
tf_anova_pvals <- tibble(tf_motif=character(), pval=numeric()) | |
for (tf in unique(motif_counts_all_norm$tf_motif)) { | |
tf_df <- motif_counts_all_norm %>% | |
filter(tf_motif == tf) | |
motif_counts_all_melted <- tf_df %>% | |
reshape2::melt(id.vars="tf_motif", variable.name="sample_srr", value.name="count") %>% | |
left_join(atac_sample_info %>% select(atac_accession, cluster), by=c("sample_srr"="atac_accession")) %>% | |
filter(cluster %in% c(0:5)) | |
aov_obj <- tidy(aov(count ~ cluster, motif_counts_all_melted)) | |
pval <- as.numeric(unlist(aov_obj[1, "p.value"])) | |
tf_anova_pvals <- add_row(tf_anova_pvals, | |
tf_motif=tf, | |
pval=pval) | |
} | |
sig_tfs <- unlist((tf_anova_pvals %>% filter(pval <= .1))$tf_motif) | |
motif_counts_all_melted_sig <- motif_counts_all_norm %>% | |
reshape2::melt(id.vars="tf_motif", variable.name="sample_srr", value.name="count") %>% | |
left_join(atac_sample_info %>% select(atac_accession, cluster), by=c("sample_srr"="atac_accession")) %>% | |
filter(tf_motif %in% sig_tfs) | |
ggplot(motif_counts_all_melted_sig %>% filter(cluster %in% c(0:5))) + | |
geom_boxplot(aes(x=cluster, y=count, group=cluster, fill=factor(cluster))) + | |
facet_wrap(vars(tf_motif), scales = "free_y") | |
scale_x_continuous(limits=c(73820656, 73852723+5000)) | |
xist_peaks_plot_df <- xist_peaks %>% | |
reshape2::melt(id.vars=c("sample_atac_srr", "cluster", "pileup"), | |
measure.vars=c("start", "end", "abs_summit"), | |
value.name="pos") %>% | |
mutate(pileup=case_when(variable == "start" ~ 0, | |
variable == "end" ~ 0, | |
variable == "abs_summit" ~pileup)) | |
ggplot(xist_peaks_plot_df) + | |
geom_line(aes(x=pos,y=pileup, color=sample_atac_srr), alpha=.2) + | |
facet_grid(rows=vars(cluster)) + | |
scale_x_continuous(limits=c(73820656, 73852723+5000)) | |
#concordance with dmps | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
all_edgers_test <- read_csv("all_edgers_test.csv.gz") | |
coverage_distance_from_dmp_df <- tibble(dist_from_dmp=numeric(), | |
covered_atac_peaks=numeric(), | |
covered_opposite=numeric(), | |
total_atac_peaks=numeric()) | |
library(IRanges) | |
for(d in c(100, 500, 1000, 10e3, 100e3)) { | |
count <- 0 | |
rangesA <- split(IRanges(all_dmp_df$hg38_pos - d, all_dmp_df$hg38_pos + d), all_dmp_df$hg38_chr) | |
rangesB <- split(IRanges(all_edgers_test$start, all_edgers_test$end), all_edgers_test$seqnames) | |
#which rangesB wholly contain at least one rangesA? | |
ov <- sum(as.data.frame(countOverlaps(rangesB, rangesA))$value) | |
rev_ov <- nrow(as.data.frame(subsetByOverlaps(rangesB, rangesA))) | |
coverage_distance_from_dmp_df <- add_row(coverage_distance_from_dmp_df, | |
dist_from_dmp=d, | |
covered_atac_peaks=ov, | |
covered_opposite=rev_ov, | |
total_atac_peaks=nrow(all_edgers_test)) | |
} | |
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz") | |
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c")) | |
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>% | |
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id) %>% | |
group_by(ensembl_gene_id) %>% | |
summarise(external_gene_name=dplyr::first(external_gene_name), | |
description=dplyr::first(description), | |
transcription_start_site=dplyr::first(transcription_start_site), | |
chromosome_name=dplyr::first(chromosome_name), | |
end_position=dplyr::first(end_position), | |
start_position=dplyr::first(start_position), | |
strand=dplyr::first(strand)) %>% | |
mutate(promoter_start=case_when(strand == 1 ~ transcription_start_site-5000, | |
strand == -1 ~ start_position), | |
promoter_end=case_when(strand == 1 ~ end_position, | |
strand == -1 ~ transcription_start_site+5000)) | |
all_edgers_test_2 <- all_edgers_test | |
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df_canonical %>% arrange(end_position - start_position) | |
for (r in 1:nrow(hg38_biomart_structure_df_canonical)) { | |
ensembl_gene_id <- as.character(unlist(hg38_biomart_structure_df_canonical[r, "ensembl_gene_id"])) | |
external_gene_name <- as.character(unlist(hg38_biomart_structure_df_canonical[r, "external_gene_name"])) | |
description <- as.character(unlist(hg38_biomart_structure_df_canonical[r, "description"])) | |
chr <- sprintf("chr%s", as.character(unlist(hg38_biomart_structure_df_canonical[r, "chromosome_name"]))) | |
promoter_start <- as.integer((unlist(hg38_biomart_structure_df_canonical[r, "promoter_start"]))) | |
promoter_end <- as.integer((unlist(hg38_biomart_structure_df_canonical[r, "promoter_end"]))) | |
all_edgers_test_2 <- all_edgers_test_2 %>% | |
mutate(ensembl_id=case_when(start <= promoter_end & end >= promoter_start & seqnames == chr ~ensembl_gene_id, | |
TRUE ~ensembl_id), | |
gene=case_when(start <= promoter_end & end >= promoter_start & seqnames == chr ~external_gene_name, | |
TRUE ~gene), | |
gene_description=case_when(start <= promoter_end & end >= promoter_start & seqnames == chr ~description, | |
TRUE ~gene_description), | |
) | |
} | |
write_csv(all_edgers_test_2, "all_edgers_test_2_full_gene.csv.gz") | |
all_dmp_df <- read_csv("all_dmp_df.csv.gz") | |
library(radiant.data) | |
grouped_dmp_df <- all_dmp_df %>% | |
group_by(hg38_gene) %>% | |
summarise(cluster_0_mean_meth=mean(cluster_0_mean), | |
cluster_1_mean_meth=mean(cluster_1_mean), | |
cluster_2_mean_meth=mean(cluster_2_mean), | |
cluster_3_mean_meth=mean(cluster_3_mean), | |
cluster_4_mean_meth=mean(cluster_4_mean), | |
cluster_5_mean_meth=mean(cluster_5_mean), | |
transition_1_probes=sum(key_transition == 1), | |
transition_2_probes=sum(key_transition == 2), | |
transition_3_probes=sum(key_transition == 3), | |
transition_4_probes=sum(key_transition == 4), | |
transition_5_probes=sum(key_transition == 5), | |
transition_1_probes_passing_increasing=sum(key_transition == 1 & passes_threshold == T & delta_increasing == T), | |
transition_1_probes_passing_decreasing=sum(key_transition == 1 & passes_threshold == T & delta_increasing == F), | |
transition_2_probes_passing_increasing=sum(key_transition == 2 & passes_threshold == T & delta_increasing == T), | |
transition_2_probes_passing_decreasing=sum(key_transition == 2 & passes_threshold == T & delta_increasing == F), | |
transition_3_probes_passing_increasing=sum(key_transition == 3 & passes_threshold == T & delta_increasing == T), | |
transition_3_probes_passing_decreasing=sum(key_transition == 3 & passes_threshold == T & delta_increasing == F), | |
transition_4_probes_passing_increasing=sum(key_transition == 4 & passes_threshold == T & delta_increasing == T), | |
transition_4_probes_passing_decreasing=sum(key_transition == 4 & passes_threshold == T & delta_increasing == F), | |
transition_5_probes_passing_increasing=sum(key_transition == 5 & passes_threshold == T & delta_increasing == T), | |
transition_5_probes_passing_decreasing=sum(key_transition == 5 & passes_threshold == T & delta_increasing == F)) | |
grouped_atac_df <- all_edgers_test_2 %>% | |
filter(passes_significance_threashold == T) %>% | |
group_by(gene) %>% | |
summarise(mean_tran_1_atac_fold=mean(transition_1_fold), | |
mean_tran_2_atac_fold=mean(transition_2_fold), | |
mean_tran_3_atac_fold=mean(transition_3_fold), | |
mean_tran_4_atac_fold=mean(transition_4_fold), | |
mean_tran_1_atac_fdr=mean(transition_1_FDR), | |
mean_tran_2_atac_fdr=mean(transition_2_FDR), | |
mean_tran_3_atac_fdr=mean(transition_3_FDR), | |
mean_tran_4_atac_fdr=mean(transition_4_FDR), | |
atac_chr=dplyr::first(seqnames)) | |
atac_plotting_df <- all_edgers_test %>% filter(passes_significance_threashold == T & seqnames != "chrY") # & abs(delta) >= .5) | |
atac_plotting_df$chr_alt <- gsub("chr", "", atac_plotting_df$seqnames) | |
atac_plotting_df$chr_alt <- gsub("X", "23", atac_plotting_df$chr_alt) | |
atac_plotting_df$chr_alt <- as.numeric(atac_plotting_df$chr_alt) | |
atac_plotting_df <- atac_plotting_df %>% filter(!is.na(chr_alt)) | |
chromosomes_df <- tibble(chromosome=c(1, 2, 3, 4, 5, 6, 7, 23, 8, | |
9, 11, 10, 12, 13, 14, 15, | |
16, 17, 18, 20, 19, 24, 22, 21), | |
size=c(248956422, 242193529, 198295559, 190214555, | |
181538259, 170805979, 159345973, 156040895, | |
145138636, 138394717, 135086622, 133797422, | |
133275309, 114364328, 107043718, 101991189, | |
90338345, 83257441, 80373285, 64444167, 58617616, | |
57227415, 50818468, 46709983)) %>% filter(chromosome != 24) | |
global_atac_changes <- ggplot(atac_plotting_df %>% mutate(most_significant_transition=case_when(most_significant_transition==4~5, | |
T~most_significant_transition)) %>% add_row(most_significant_transition=4)) + | |
geom_rect(data=chromosomes_df, | |
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
# geom_rect(aes(ymin=start, xmin=chr_alt-.4, ymax=end, xmax=chr_alt+.4, color=delta_increasing, alpha=abs(delta)), size=.1) + | |
geom_segment(aes(y=start, x=chr_alt-.4, yend=start, xend=chr_alt+.4, color=delta_increasing, alpha=abs(delta)), size=.3) + | |
facet_grid(rows = vars(most_significant_transition), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="empty", | |
"5"="Tran. 4/5")[n]) | |
})) + | |
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(.01,.01)) + | |
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) + | |
scale_color_aaas(limits=c(F,T),labels=c("Decreasing", "Increasing"), name="Direction of Change") + | |
scale_fill_aaas(limits=c(F,T),labels=c("Decreasing", "Increasing"), name="Direction of Change") + | |
labs(x="Chromosome", | |
y="Position (Mb)", | |
title="Differential ATAC Peaks on all chromosomes") + | |
theme_bw() + | |
base_plot_theme + | |
coord_flip() + | |
scale_alpha(limits=c(0.5,4), oob=scales::squish, name="abs(log2FC)") + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.grid.major.y = element_blank(), | |
panel.grid.minor.y = element_blank(), | |
legend.box="vertical", | |
legend.box.just="right", | |
legend.box.spacing = unit(5, "mm"), | |
legend.spacing.y=unit(1, "mm")) | |
ggsave("figs/supplementary/supp_3_atac_nocutoff_rotated.pdf",global_atac_changes, width=174/3, height=236, units="mm") | |
for(t in 1:4) { | |
tran_5_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 5)) + | |
geom_density(aes(x=abs(delta)), fill="black") + | |
theme_bw() + | |
base_plot_theme + | |
scale_y_continuous(breaks = c(0,10,20)) + | |
scale_x_continuous(breaks = c(0,.2,.4, .6)) + | |
labs(title="Autosomal β Density") + | |
theme(axis.ticks.length = unit(.4, "mm"), | |
plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"), | |
axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")), | |
axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")), | |
plot.title = element_text(size=3.5)) | |
ggsave(sprintf("figs/supplementary/supp_3_tran_%d_density_atac.pdf", t), tran_5_density, width = 18, height=12, units = "mm") | |
} | |
dmp_atac_df <- right_join(grouped_atac_df, grouped_dmp_df, by=c("gene"="hg38_gene")) %>% filter(!is.na(atac_chr)) | |
dmp_atac_df <- dmp_atac_df %>% | |
mutate(max_tran_meth_delta=case_when( max_transition==1 ~ cluster_1_mean_meth - cluster_0_mean_meth, | |
max_transition==2 ~ cluster_2_mean_meth - cluster_1_mean_meth, | |
max_transition==3 ~ cluster_3_mean_meth - cluster_2_mean_meth, | |
max_transition==4 ~ cluster_4_mean_meth - cluster_3_mean_meth, | |
max_transition==5 ~ cluster_5_mean_meth - cluster_4_mean_meth, | |
TRUE ~ 0), | |
tran_1_delta=cluster_1_mean_meth - cluster_0_mean_meth, | |
tran_2_delta=cluster_2_mean_meth - cluster_1_mean_meth, | |
tran_3_delta=cluster_3_mean_meth - cluster_2_mean_meth, | |
tran_45_delta=cluster_5_mean_meth - cluster_3_mean_meth) | |
for (i in 1:nrow(dmp_atac_df)) { | |
gene_meth_deltas <- unlist(dmp_atac_df[i, c("tran_1_delta", "tran_2_delta", "tran_3_delta", "tran_45_delta")]) | |
gene_atac_deltas <- unlist(dmp_atac_df[i, c("mean_tran_1_atac_fold", "mean_tran_2_atac_fold", "mean_tran_3_atac_fold", "mean_tran_4_atac_fold")]) | |
cor_test <- cor.test(gene_meth_deltas, gene_atac_deltas, method="spearman") | |
dmp_atac_df[i, "atac_meth_corr"] <- as.numeric(cor_test$estimate) | |
dmp_atac_df[i, "atac_meth_corr_pval"] <- as.numeric(cor_test$p.value) | |
} | |
concordant_count <- dmp_atac_df %>% | |
filter(max_transition > 0 & | |
atac_meth_corr < 0 & | |
atac_meth_corr_pval <= .1) %>% nrow() | |
library("ChIPseeker") | |
library("TxDb.Hsapiens.UCSC.hg38.knownGene") | |
library(GenomicRanges) | |
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene | |
edgers_peaks_granges <- makeGRangesFromDataFrame(all_edgers_test %>% filter(passes_significance_threashold == T & !(seqnames %in% c("chrX", "chrY")) & abs(delta) >= 1) %>% dplyr::select(chr=seqnames, start, end, length=width)) | |
# edgers_peaks_granges_tran4_dec <- makeGRangesFromDataFrame(all_edgers_test %>% filter(passes_significance_threashold == T & !(seqnames %in% c("chrX", "chrY")) & abs(delta) >= 1 & most_significant_transition == 4 & delta_increasing == F) %>% dplyr::select(chr=seqnames, start, end, length=width)) | |
annotated_peaks <- annotatePeak(edgers_peaks_granges, tssRegion=c(-3000, 3000), TxDb=txdb) %>% as.data.frame() | |
library(biomaRt) | |
hg38_mart <- useMart('ensembl', dataset='hsapiens_gene_ensembl') | |
biomart_genes <- getBM(mart=hg38_mart, | |
filters = c( "ensembl_transcript_id_version"), | |
values = list(ensembl_transcript_id_version=annotated_peaks$transcriptId), | |
attributes=c('external_gene_name', 'description', 'chromosome_name', | |
'ensembl_transcript_id', | |
'ensembl_transcript_id_version', | |
'ensembl_gene_id', | |
'ensembl_gene_id_version')) | |
annotated_peaks <- left_join(annotated_peaks, | |
biomart_genes, | |
by=c("transcriptId"="ensembl_transcript_id_version")) | |
annotated_edgers <- all_edgers_test %>% | |
filter(passes_significance_threashold == T & !(seqnames %in% c("chrX", "chrY")) & abs(delta) >= 1) %>% | |
left_join(annotated_peaks, by=c("seqnames", "start", "end")) | |
annotated_edgers_gene_collated <- annotated_edgers %>% | |
group_by(ensembl_gene_id_version,external_gene_name, most_significant_transition, delta_increasing) %>% | |
summarise(count=n(), | |
mean_delta=mean(delta)) | |
grouped_concordance_df <- read_csv("grouped_concordance_df.csv.gz") | |
grouped_concordance_df$atac_concordant <- F | |
for(r in 1:nrow(grouped_concordance_df)) { | |
gene <- as.character(unlist(grouped_concordance_df[r, "gene"])) | |
chr <- paste("chr", as.character(unlist(grouped_concordance_df[r, "chr"])), sep="") | |
start_position <- as.numeric(unlist(grouped_concordance_df[r, "start_position"])) | |
end_position <- as.numeric(unlist(grouped_concordance_df[r, "end_position"])) | |
expr_key_transition <- as.numeric(unlist(grouped_concordance_df[r, "expr_key_transition"])) | |
expr_key_log2FC <- as.numeric(unlist(grouped_concordance_df[r, "expr_key_log2FC"])) | |
# filtered_atac <- annotated_edgers_gene_collated %>% | |
# filter(external_gene_name == gene & most_significant_transition == expr_key_transition & delta_increasing == (expr_key_log2FC > 0) & count > 0) | |
# grouped_concordance_df[r, "atac_concordant"] <- nrow(filtered_atac) > 0 | |
filtered_atac <- all_edgers_test %>% filter(seqnames == chr & start <= (end_position+1e6) & end >= (start_position-1e6) & passes_significance_threashold == T & most_significant_transition == expr_key_transition) | |
grouped_concordance_df[r, "atac_concordant"] <- nrow(filtered_atac) > 0 | |
} | |
# diff_expr_all_df <- read_csv("diff_expr_all_df.csv.gz") | |
diff_expr_all_df <- read_csv("diff_expr_all_df.csv.gz") | |
new_concordance_df <- diff_expr_all_df | |
new_concordance_df$atac_concordant <- F | |
new_concordance_df$meth_concordant <- F | |
for(r in 1:nrow(new_concordance_df)) { | |
gene <- as.character(unlist(new_concordance_df[r, "gene"])) | |
chr <- paste("chr", as.character(unlist(new_concordance_df[r, "chr"])), sep="") | |
start_position <- as.numeric(unlist(new_concordance_df[r, "start_position"])) | |
end_position <- as.numeric(unlist(new_concordance_df[r, "end_position"])) | |
expr_key_transition <- as.numeric(unlist(new_concordance_df[r, "key_transition"])) | |
expr_log2FC_increasing <- as.logical(unlist(new_concordance_df[r, "positive_delta"])) | |
expr_key_log2FC <- as.numeric(unlist(new_concordance_df[r, sprintf("transition_%d_log2FC", expr_key_transition)])) | |
new_concordance_df[r, "delta"] <- expr_key_log2FC | |
filtered_atac <- all_edgers_test %>% filter(seqnames == chr & start <= (end_position+2e3) & end >= (start_position-2e3) & passes_significance_threashold == T & most_significant_transition == expr_key_transition) | |
new_concordance_df[r, "atac_concordant"] <- nrow(filtered_atac) > 0 | |
filtered_dmp <- grouped_dmp_df %>% filter(hg38_gene == gene) | |
if(nrow(filtered_dmp) > 0 & expr_key_transition < 4) { | |
sig_probes_in_key_transition <- as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_increasing", expr_key_transition)])) | |
if(expr_log2FC_increasing) { | |
sig_probes_in_key_transition <- as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_decreasing", expr_key_transition)])) | |
} | |
new_concordance_df[r, "meth_concordant"] <- sig_probes_in_key_transition > 0 | |
}else if (nrow(filtered_dmp) > 0 & expr_key_transition == 4) { | |
sig_probes_in_key_transition <- as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_increasing", expr_key_transition)])) + | |
as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_increasing", 5)])) | |
if(expr_log2FC_increasing) { | |
sig_probes_in_key_transition <- as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_decreasing", expr_key_transition)])) + | |
as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_decreasing", 5)])) | |
} | |
new_concordance_df[r, "meth_concordant"] <- sig_probes_in_key_transition > 0 | |
} | |
} | |
write_csv(new_concordance_df, "new_concordance_df.csv.gz") | |
write_csv(new_concordance_df %>% dplyr::select(ens_id, gene, chr, start_position, end_position, key_transition, passes_expression_threshold=passes_threshold, expression_increasing=positive_delta, atac_concordant, meth_concordant), "table_4_meth_atac_expr_concordance.csv") | |
# grouped_concordance_df_filtered <- grouped_concordance_df %>% | |
# filter((pval <= .1 & corr < 0 & max_probes != 0) | atac_concordant == T) | |
expr_plotting_df <- new_concordance_df %>% filter(passes_threshold == T & (atac_concordant == T | meth_concordant == T)) # diff_expr_all_df #%>% filter(passes_threshold == T) | |
expr_plotting_df$chr_alt <- gsub("X", "23", expr_plotting_df$chr) | |
expr_plotting_df$chr_alt <- as.numeric(expr_plotting_df$chr_alt) | |
chromosomes_df <- tibble(chromosome=c(1, 2, 3, 4, 5, 6, 7, 23, 8, | |
9, 11, 10, 12, 13, 14, 15, | |
16, 17, 18, 20, 19, 24, 22, 21), | |
size=c(248956422, 242193529, 198295559, 190214555, | |
181538259, 170805979, 159345973, 156040895, | |
145138636, 138394717, 135086622, 133797422, | |
133275309, 114364328, 107043718, 101991189, | |
90338345, 83257441, 80373285, 64444167, 58617616, | |
57227415, 50818468, 46709983)) %>% filter(chromosome != 24) | |
global_expr_changes <- ggplot(expr_plotting_df %>% mutate(key_transition=case_when(key_transition==4~5, | |
T~key_transition)) %>% add_row(key_transition=4)) +# %>% filter(abs(max_tran_log2FC) >= .5)) + | |
geom_rect(data=chromosomes_df, | |
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=delta >= 0, alpha=abs(delta)), size=.3) + | |
facet_grid(rows = vars(key_transition), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4", | |
"5"="Tran. 4/5")[n]) | |
})) + | |
scale_alpha(limits=c(0,.5), oob=scales::squish, breaks=c(0,.25,.5), name="abs(log2FC)") + | |
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(.01,.01)) + | |
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) + | |
scale_color_aaas(limits=c(F, T),labels=c("Decreasing", "Increasing"), name="Direction of Expression Change") + | |
labs(x="Chromosome", | |
y="Position (Mb)", | |
title="Concordant Expression Changes on all chromosomes") + | |
theme_bw() + | |
base_plot_theme + | |
coord_flip() + | |
guides(alpha="legend", colours="legend") + | |
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.grid.major.y = element_blank(), | |
panel.grid.minor.y = element_blank(), | |
legend.box="vertical", | |
legend.box.just="right", | |
legend.box.spacing = unit(5, "mm"), | |
legend.spacing.y=unit(1, "mm")) | |
ggsave("figs/supplementary/supp_3c_alt_meth_dir.pdf", global_expr_changes, width=174/3, height=236, units="mm") | |
global_expr_changes_tran45 <- ggplot(expr_plotting_df %>% filter(key_transition==4)) +# %>% filter(abs(max_tran_log2FC) >= .5)) + | |
geom_rect(data=chromosomes_df, | |
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+ | |
geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=delta >= 0, alpha=abs(delta)), size=.3) + | |
facet_grid(rows = vars(key_transition), labeller = labeller(.rows=function(n){ | |
return(list("1"="Tran. 1", | |
"2"="Tran. 2", | |
"3"="Tran. 3", | |
"4"="Tran. 4/5", | |
"5"="Tran. 4/5")[n]) | |
})) + | |
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(0.01,0.01)) + | |
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) + | |
scale_color_aaas(limits=c(F, T),labels=c("Decreasing", "Increasing"), name="Direction of Expression Change") + | |
scale_alpha(limits=c(0,.5), oob=scales::squish, breaks=c(0,.25,.5), name="abs(log2FC)") + | |
labs(x="Chromosome", | |
y="Position (Mb)", | |
title="Concordant Expression Changes") + | |
theme_bw() + | |
base_plot_theme + | |
coord_flip() + | |
guides(alpha=guide_legend(order = 2), col=guide_legend(order = 1)) + | |
theme(strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")), | |
strip.text.x = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")), | |
strip.background = element_rect(size = .3), | |
panel.grid.major.y = element_blank(), | |
panel.grid.minor.y = element_blank(), | |
legend.box="vertical", | |
legend.box.just="right", | |
legend.box.spacing = unit(3.5, "mm"), | |
legend.spacing.y=unit(.2, "mm"), | |
axis.text.y=element_text(size=5)) | |
ggsave("figs/main/fig_4_global_changes_expr.pdf", global_expr_changes_tran45, width = 58, height=47, units = "mm") | |
# check for overlap with repeats | |
rmsk_df <- read_tsv("atac_rmsk_fischer/filtered_hg38_rmsk.bed", col_names = c("chr", "start", "end", "rep_name", "score", "strand")) | |
all_edgers_test <- read_csv("all_edgers_test.csv.gz") | |
library(GenomicRanges) | |
rmsk_grange <- makeGRangesFromDataFrame(rmsk_df %>% select(-strand), keep.extra.columns = T, ignore.strand = T) | |
sig_edgers_test_grange <- makeGRangesFromDataFrame(all_edgers_test %>% filter(passes_significance_threashold == T), keep.extra.columns = T, ignore.strand = T) | |
overlapping_rmsks <- subsetByOverlaps(rmsk_grange, sig_edgers_test_grange) | |
overlapping_rmsks_df <- as.data.frame(overlapping_rmsks) | |
overlapping_repnames <- unique(overlapping_rmsks_df$rep_name) | |
sorted_chrom_sizes <- read_tsv("atac_rmsk_fischer/hg38.chrom.sizes.sorted.txt", col_names = c("chr", "size")) | |
rmsk_df %>% | |
filter(rep_name %in% overlapping_repnames & chr %in% sorted_chrom_sizes$chr) %>% | |
group_by(rep_name) %>% | |
arrange(chr, start) %>% | |
group_walk(~write_tsv(.x, sprintf("atac_rmsk_fischer/rep_beds/%s_rmsk.bed", .y$rep_name), col_names = F)) | |
all_edgers_test %>% | |
filter(passes_significance_threashold == T & seqnames %in% sorted_chrom_sizes$chr) %>% | |
group_by(delta_increasing, most_significant_transition) %>% | |
arrange(seqnames, start) %>% | |
group_walk(~write_tsv(.x %>% select(seqnames, start, end), sprintf("atac_rmsk_fischer/edgers_atac_beds/T%d_up_%s_edgers.bed", .y$most_significant_transition, .y$delta_increasing), col_names = F)) | |
fisher_output_df <- read_csv("atac_rmsk_fischer/fisher_output.csv") | |
filtered_fisher_df <- fisher_output_df %>% filter(right_pval<.01) %>% | |
mutate(delta_increasing_mod=case_when(delta_increasing==T ~ 1, | |
delta_increasing == F ~ -1)) | |
expression_hervs <- c("LTR1", | |
"LTR2C", | |
"LTR40a", | |
"LTR40b", | |
"LTR5_Hs", | |
"LTR5_Hs", | |
"LTR5", | |
"LTR5_Hs", | |
"LTR5B", | |
"LTR6A", | |
"LTR7", | |
"LTR7Y", | |
"MER50", | |
"LTR1", | |
"LTR2C", | |
"LTR40a", | |
"LTR40b", | |
"LTR5_Hs", | |
"LTR5_Hs", | |
"LTR5", | |
"LTR5_Hs", | |
"LTR5B", | |
"LTR6A", | |
"LTR7", | |
"LTR7Y", | |
"MER50") | |
ltr_pal <- c("#fdfdfd", "#1d1d1d", "#ebce2b", "#702c8c", "#db6917", "#96cde6", "#ba1c30", "#c0bd7f", "#7f7e80", "#5fa641", "#d485b2", "#4277b6", "#df8461", "#463397", "#e1a11a", "#91218c", "#e8e948", "#7e1510", "#92ae31", "#6f340d", "#d32b1e", "#2b3514") | |
plotting_fisher_df <- filtered_fisher_df %>% | |
filter(rep %in% expression_hervs) %>% | |
mutate(sig_marker=case_when(right_pval < .0001 ~ "***", | |
right_pval < .001 ~ "**", | |
right_pval < .01 ~ "*")) %>% | |
arrange(factor(rep, levels = c("LTR5_Hs", "LTR7", "LTR7Y", "LTR1", "LTR2C", "LTR6A", "MER50"))) %>% | |
mutate(rep=factor(rep, levels = c("LTR5_Hs", "LTR7", "LTR7Y", "LTR1", "LTR2C", "LTR6A", "MER50")), | |
lab_ypos = cumsum(delta_increasing_mod*overlaps) - 0.5 * overlaps) | |
library(shadowtext) | |
atac_fisher_plot <- ggplot(plotting_fisher_df, aes(x=transition, y=delta_increasing_mod*overlaps, fill=rep)) + | |
geom_bar(stat="identity") + | |
# geom_text(data=plotting_fisher_df, aes(label = rep),position = position_stack(vjust = 0.5), size = 1.) + | |
geom_text(data=plotting_fisher_df, aes(label = paste(rep,sig_marker, sep="")),position = position_stack(vjust = 0.5), size = 1.3) + | |
scale_fill_manual(values=ltr_pal[c(6,8,10,3,7,22,13)], | |
limits=c("LTR5_Hs", | |
"LTR7", | |
"LTR7Y", | |
"LTR1", | |
"LTR2C", | |
"LTR6A", | |
"MER50"), guide=F) + | |
scale_x_discrete(limits=1:4, labels=c("1", "2", "3", "4/5")) + | |
theme_bw() + | |
base_plot_theme + | |
labs(title="Repeats Overlapped by\nDiff ATAC Peaks", | |
y="Repeat Overlap", | |
x="Transition") | |
ggsave("figs/main/fig4_atac_fisher_plot.pdf", atac_fisher_plot, width=40, height = 61, units = "mm") |