Skip to content
Permalink
2e6bcacfa8
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
2175 lines (1877 sloc) 111 KB
library(tidyverse)
library(stringr)
atac_counts_df <- read_tsv("banovich-atacseq-data/GSE108458_iPSC-ATAC-counts.txt")
sample_data_df <- read_csv("sample_data_combined_all_info.csv")
cluster_info_df <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
banovich_female_sample_df <- sample_data_df %>%
left_join(cluster_info_df, by="sample_name") %>%
filter(cluster != "" & !is.na(cluster) & group_accession == "GSE110544")
banovich_cell_lines <- intersect(banovich_female_sample_df$cell_line,
colnames(atac_counts_df))
female_atac_counts_df <- atac_counts_df %>%
select(window, !!banovich_cell_lines) %>%
mutate(chr=stringr::str_split(window, "-", simplify = T)[,1],
start=stringr::str_split(window, "-", simplify = T)[,2],
end=stringr::str_split(window, "-", simplify = T)[,3]) %>%
select(-window)
library(DESeq2)
banovich_sample_sheet <- banovich_female_sample_df %>%
dplyr::select(SampleID=cell_line,
Condition=cluster)
dba_atac <- dba(sampleSheet = banovich_sample_sheet,
)
# get the atac accessions for alignments
atac_accessions <- read_csv("banovich-atacseq-data/SraRunTable.txt")
sras_to_keep <- atac_accessions %>%
filter(derived_from_cell_line %in% banovich_cell_lines)
write_lines(sras_to_keep$Run, "banovich-atacseq-data/atac_sras_to_download.txt")
atac_sample_info <- sras_to_keep %>%
select(derived_from_cell_line, atac_accession=Run) %>%
left_join(banovich_female_sample_df, by=c("derived_from_cell_line"="cell_line"))
library(stringr)
atac_sample_info$hasXPeaks <- F
for (f in Sys.glob("esatac_results/results_txt/*.txt")) {
f_df <- read_tsv(f)
srr_name <- stringr::str_replace(basename(f), "_1.RPeakAnno.txt", "")
chromosome_counts <- f_df %>%
dplyr::count(chromatin) %>%
reshape2::dcast(. ~ chromatin) %>%
select(-.)
chromosome_counts$atac_accession <- srr_name
if ("chrX" %in% f_df$chromatin) {
atac_sample_info[atac_sample_info$atac_accession == srr_name,"hasXPeaks"] <- T
}
for (chr in colnames(chromosome_counts %>% select(-atac_accession))) {
atac_sample_info[atac_sample_info$atac_accession == srr_name, chr] <- chromosome_counts[1, chr]
}
}
write_csv(atac_sample_info %>%
dplyr::select(sample_name,
atac_accession,
group_accession,
sex=gender,
cluster,
accession,
cell_type,
cell_line=derived_from_cell_line,
#hasXPeaks,
#!!colnames(chromosome_counts %>% dplyr::select(-atac_accession, -.))
), "atac_sample_info.csv.gz")
library(caret)
## footprint analysis
rds_test <- readRDS("banovich-atacseq-data/footprints/SRR6412411_footprint.rds")
all_tfs <- names(rds_test)
tf_ks_df <- tibble(tf=character(),
srr1=character(),
srr2=character(),
ks_distance=numeric(),
ks_pval=numeric())
footprint_files <- Sys.glob("banovich-atacseq-data/footprints/*.rds")
for (i in 1:(length(footprint_files)-1)) {
print(i)
fp1 <- readRDS(footprint_files[i])
s1 <- str_split(basename(footprint_files[i]), "_", simplify=T)[1]
for (j in (i+1):length(footprint_files)) {
fp2 <- readRDS(footprint_files[j])
s2 <- str_split(basename(footprint_files[j]), "_", simplify=T)[1]
for (tf in all_tfs) {
fp_df <- data.frame(
d1=as.numeric(unlist(fp1[tf])),
d2=as.numeric(unlist(fp2[tf]))
)
norm_ds <- predict(preProcess(fp_df, method = "range"), fp_df)
ks_res <- ks.test(norm_ds$d1, norm_ds$d2)
ks_dist <- as.numeric(unlist(ks_res$statistic))
ks_p <- as.numeric(unlist(ks_res$p.value))
tf_ks_df <- add_row(tf_ks_df,
tf=tf,
srr1=s1,
srr2=s2,
ks_distance=ks_dist,
ks_pval=ks_p)
}
}
}
atac_sample_info <- read_csv("atac_sample_info.csv.gz")
tf_ks_df <- left_join(tf_ks_df,
atac_sample_info %>%
select(atac_accession, s1_cluster=cluster),
by=c("srr1"="atac_accession")) %>%
left_join(atac_sample_info %>%
select(atac_accession, s2_cluster=cluster),
by=c("srr2"="atac_accession"))
write_csv(tf_ks_df, "tf_ks_df.csv.gz")
summarized_tf_ks_df <- tf_ks_df %>%
group_by(tf, cluster_1 = pmin(s1_cluster, s2_cluster), cluster_2 = pmax(s1_cluster, s2_cluster)) %>%
summarise(mean_distance = mean(ks_distance),
mean_pval=mean(ks_pval))
ggplot(summarized_tf_ks_df %>% filter(cluster_1 == 0 & mean_pval <= .1)) +
geom_density(aes(mean_distance, color=interaction(cluster_1,cluster_2)))
tf_ks_df_diff <- summarized_tf_ks_df %>%
filter(cluster_1 == 0 & cluster_2 %in% c(0,5)) %>%
arrange(cluster_1, cluster_2) %>%
group_by(tf) %>%
summarise(ks_dist_diff=last(mean_distance)-first(mean_distance),
ks_pval_diff=last(mean_pval)-first(mean_pval))
changing_TFs <- (tf_ks_df_diff %>% filter(ks_dist_diff > 0))$tf
ggplot(summarized_tf_ks_df %>% filter(cluster_1 == 0 & tf %in% changing_TFs & mean_pval <= .1)) +
geom_density(aes(mean_distance, color=interaction(cluster_1,cluster_2)))
# creating a clustered footprint map
tf <- "KLF5"
footprint_df <- tibble(
tf=character(),
sample_srr=character(),
position=numeric(),
signal=numeric()
)
for (footprint_file in footprint_files) {
print(footprint_file)
fp_rds <- readRDS(footprint_file)
s1 <- str_split(basename(footprint_files[i]), "_", simplify=T)[1]
for (tf in all_tfs) {
tf_footprint <- as.numeric(unlist(fp1[tf]))
for (x in 1:length(tf_footprint)) {
footprint_df <- add_row(footprint_df,
tf=tf,
sample_srr=s1,
position=x,
signal=tf_footprint[x])
}
}
}
# Do differential peak analysis
library(DiffBind)
atac_sample_info <- atac_sample_info %>%
mutate(bamReads=sprintf("banovich-atacseq-data/macs2_peaks/%s_1.nodup.filtered.aligned.bam", atac_accession),
Peaks=sprintf("banovich-atacseq-data/macs2_peaks/%s_1_peaks.xls", atac_accession),
PeakCaller="macs",
PeakFormat="macs")
atac_sample_sheet <- atac_sample_info %>%
mutate(SampleID=paste(sample_name, cluster, sep = "_")) %>%
dplyr::select(SampleID,
atac_accession,
bamReads,
Peaks,
PeakCaller,
Condition=cluster,
PeakFormat)
dba_atac <- dba(sampleSheet=atac_sample_sheet, config=data.frame(RunParallel=T, cores=8))
dba_atac <- dba.count(dba_atac, bRemoveDuplicates = F)#, bUseSummarizeOverlaps = T)
dba.save(dba_atac, file="dba_atac")
dba_atac <- dba.load("dba_atac")
for (t in 1:4) {
cluster_a <- t-1
cluster_b <- t
if (t==4) {
cluster_b <- 5
}
cluster_a_samples <- (atac_sample_sheet %>% filter(Condition <= cluster_a))$SampleID
cluster_b_samples <- (atac_sample_sheet %>% filter(Condition >= cluster_b))$SampleID
cluster_a_group <- dba.mask(dba_atac, attribute = DBA_CONDITION, value = 0:(cluster_a), combine="or")
cluster_b_group <- dba.mask(dba_atac, attribute = DBA_CONDITION, value = (cluster_a+1):5, combine="or")
transition_dba_atac_contrast <- dba.contrast(dba_atac, group1 = cluster_a_group,
name1 = sprintf("cluster_%s", paste(c("A","B","C", "D", "F")[(1):(cluster_a+1)], collapse="")),
group2 = cluster_b_group,
name2 = sprintf("cluster_%s", paste(c("A","B","C", "D", "F")[(cluster_a+2):5], collapse="")))
transition_dba_atac_contrast <- dba.analyze(transition_dba_atac_contrast, method=c(DBA_EDGER, DBA_DESEQ2))
results <- dba.report(transition_dba_atac_contrast, contrast=1, bFlip=T, bCounts=T, th=1, method=DBA_EDGER, bCalled = T)
write_csv(as_tibble(results), sprintf("banovich-atacseq-data/macs2_diff_peaks/edger_transition_%d_diff_atac.csv", t))
results_deseq2 <- dba.report(transition_dba_atac_contrast, contrast=1, bFlip=T, bCounts=T, th=1, method=DBA_DESEQ2, bCalled = T)
write_csv(as_tibble(results_deseq2), sprintf("banovich-atacseq-data/macs2_diff_peaks/deseq2_transition_%d_diff_atac.csv", t))
}
all_edgers <- tibble(seqnames=character(),
start=numeric(),
end=numeric(),
width=numeric())
for (t in 1:4) {
transition_df <- read_csv(sprintf("banovich-atacseq-data/macs2_diff_peaks/edger_transition_%d_diff_atac.csv", t))
all_edgers <- full_join(all_edgers,
transition_df %>% select(
seqnames,
start,
end,
width,
!!sprintf("transition_%d_fold", t):=Fold,
!!sprintf("transition_%d_pval", t):=p.value,
!!sprintf("transition_%d_FDR", t):=FDR
), by=c("seqnames", "start", "end", "width"))
}
all_edgers_test <- all_edgers %>%
mutate(most_significant_transition=max.col(-1 * data.frame(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR)),
most_significant_fdr=pmin(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR),
passes_significance_threashold=pmin(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR) <= .1) %>%
mutate(delta=case_when(most_significant_transition == 1 ~transition_1_fold,
most_significant_transition == 2 ~transition_2_fold,
most_significant_transition == 3 ~transition_3_fold,
most_significant_transition == 4 ~transition_4_fold)) %>%
mutate(delta_increasing=delta>0)
write_csv(all_edgers_test, "all_edgers_test.csv.gz")
all_edgers_test %>%
group_by(passes_significance_threashold, delta_increasing) %>%
summarise(n=n(), mean_delta=mean(delta)) %>% View
all_deseqs <- tibble(seqnames=character(),
start=numeric(),
end=numeric(),
width=numeric())
for (t in 1:4) {
transition_df <- read_csv(sprintf("banovich-atacseq-data/macs2_diff_peaks/deseq2_transition_%d_diff_atac.csv", t))
all_deseqs <- full_join(all_deseqs,
transition_df %>% select(
seqnames,
start,
end,
width,
!!sprintf("transition_%d_fold", t):=Fold,
!!sprintf("transition_%d_pval", t):=p.value,
!!sprintf("transition_%d_FDR", t):=FDR
), by=c("seqnames", "start", "end", "width"))
}
all_deseqs_test <- all_deseqs %>%
mutate(most_significant_transition=max.col(-1 * data.frame(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR)),
most_significant_fdr=pmin(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR),
passes_significance_threashold=pmin(transition_1_FDR, transition_2_FDR, transition_3_FDR, transition_4_FDR) <= .1) %>%
mutate(delta=case_when(most_significant_transition == 1 ~transition_1_fold,
most_significant_transition == 2 ~transition_2_fold,
most_significant_transition == 3 ~transition_3_fold,
most_significant_transition == 4 ~transition_4_fold)) %>%
mutate(delta_increasing=delta>0)
write_csv(all_deseqs_test, "all_deseqs_test.csv.gz")
all_deseqs_test %>%
group_by(passes_significance_threashold, delta_increasing) %>%
summarise(n=n(), mean_delta=mean(delta)) %>% View
all_edger_passing_threshold <- all_edgers_test %>% filter(passes_significance_threashold == T)
all_edger_passing_threshold_hg38 <- read_tsv("all_edger_passing_threshold_hg38.bed", col_names = c("hg38_chr", "hg38_start", "hg38_end")) %>%
filter(hg38_start != 50211461 & hg38_start != 149903954)
liftover_coords_all_edger <- read_tsv("all_edger_passing_threshold_hg19_liftover.bed", col_names = c("hg19_chr", "hg19_start", "hg19_end"))
all_edger_coor_convert_df <- all_edger_passing_threshold_hg38 %>%
cbind(liftover_coords_all_edger)
all_edger_passing_threshold_hg38 <- all_edger_passing_threshold %>%
filter(seqnames == "chrX") %>%
right_join(all_edger_coor_convert_df, by=c("seqnames"="hg38_chr", "start"="hg38_start", "end"="hg38_end"))
all_edger_passing_threshold_hg38 <- all_edger_passing_threshold_hg38 %>%
filter(passes_significance_threashold == T) %>%
mutate(hg19_width = hg19_end - hg19_start)
all_transitions_atac_windowed <- tibble(chr = character(),
pos_start = numeric(),
pos_end = numeric(),
mean_delta = numeric(),
count = numeric(),
cumulative_peak_width = numeric(),
key_transition=numeric(),
delta_increasing=logical())
for (t in 1:4) {
for(d in c(T,F)) {
tran_windowed <- window_average_genome_atac_transition(all_edger_passing_threshold_hg38 %>%
filter(hg19_chr == "chrX" & most_significant_transition == t & delta_increasing == d) %>%
select(-seqnames, -start, -end, -width, seqnames=hg19_chr, start=hg19_start, end=hg19_end, width=hg19_width),
window_size = 100000,
step_size = 50000,
include_empty=T)
tran_windowed$key_transition <- t
tran_windowed$delta_increasing <- d
if (d == F) {
tran_windowed$cumulative_peak_width <- -1 * tran_windowed$cumulative_peak_width
}
all_transitions_atac_windowed <- rbind(all_transitions_atac_windowed, tran_windowed)
}
}
library(facetscales)
mean_methylation_df_escape <- read_csv("mean_methylation_df_escape.csv.gz")
#hg19_pos was manually annotated for each gene
escapee_pos_df <- mean_methylation_df_escape %>%
transmute(hg38_gene_name,
delta_tran_1=hg19_pos,
delta_tran_2=hg19_pos,
delta_tran_3=hg19_pos,
delta_tran_4=hg19_pos,
delta_tran_5=hg19_pos) %>%
reshape2::melt(id.vars=c("hg38_gene_name"),
value.name="TSS",
variable.name="type")
scales_y <- list(
"1" = scale_y_continuous(limits=c(-8000, 8000), expand=c(0,0), breaks=c(-4000,0,4000)),
"2" = scale_y_continuous(limits=c(-8000, 8000), expand=c(0,0), breaks=c(-4000,0,4000)),
"3" = scale_y_continuous(limits=c(-4000, 4000), expand=c(0,0), breaks=c(-2000,0,2000)),
"4" = scale_y_continuous(limits=c(-4000, 4000), expand=c(0,0), breaks=c(-2000,0,2000)))
browser_view_atac <- ggplot(all_transitions_atac_windowed, mapping=aes(x=pos_start, y = cumulative_peak_width)) +
geom_vline(data=escapee_pos_df, aes(xintercept=TSS), color="darkgrey", size=.3, alpha=.8) +
facet_grid_sc(rows=vars(factor(key_transition, levels = 1:4)), scales = list(y=scales_y), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4/5")[n])
})) +
geom_area(data=all_transitions_atac_windowed %>% filter(delta_increasing == T), mapping=aes(x=pos_start, y = cumulative_peak_width, fill=factor(key_transition)), inherit.aes = F) +
geom_area(data=all_transitions_atac_windowed %>% filter(delta_increasing == F), mapping=aes(x=pos_start, y= cumulative_peak_width, fill=factor(key_transition)) , inherit.aes = F) +
labs(y="Number of Peaks",
x="X Chr Coordinate (Mb)") +
scale_y_continuous(limits=c(-8000,8000), breaks = c(-4000,0,4000)) +
scale_x_continuous(limits=c(0, 155270560), breaks = seq(0, 155270560, 2e7), labels=c(0, 20, 40, 60, 80, 100, 120, 140), minor_breaks = seq(0, 155270560, 1e7), expand=c(0,0)) +
theme_bw() +
scale_fill_manual(limits=c(1,2,3,5,4),
values=pal_d3()(5), guide=F) +
base_plot_theme +
# scale_color_material("cyan") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3),
title = element_blank(),
panel.spacing.y = unit(.5, "mm"),
axis.title.y = element_blank(),
axis.text.y=element_text(size=5)) +
labs(title="Significant ATAC-seq Peaks")
ggsave("figs/main/fig3_atac_spreading.pdf", browser_view_atac, width = 171, height=45, units = "mm")
# atac fold change densities
all_edgers_test_filtered <- all_edgers_test %>% filter(passes_significance_threashold == T & seqnames =="chrX")
for(t in 1:4) {
atac_tran_density <- ggplot(all_edgers_test_filtered %>% filter(most_significant_transition == t)) +
geom_density(aes(x=abs(delta)), size=.15) +
theme_bw() +
base_plot_theme +
scale_y_continuous() +
scale_x_continuous(breaks = c(0:8)) +
labs(y="Density") +
theme(axis.ticks.length = unit(.2, "mm"),
axis.ticks.x = element_line(size=.15),
plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"),
axis.title.x = element_blank(),
axis.title.y = element_text(size=2, margin = margin(r=.1,b=0,t=0,l=0, unit="mm")),
plot.title = element_text(size=3.5),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
panel.grid.major =element_blank(),
panel.grid.minor =element_blank(),
axis.text.x=element_text(size=2, margin = margin(t=.2,b=0,r=0,l=0, unit="mm")),
)
ggsave(sprintf("figs/main/fig3_atac_inset_density_%d.pdf", t), atac_tran_density, width=6, height=4, units = "mm")
}
# v2 of log2FC densities
atac_all_tran_density <- ggplot(all_edgers_test_filtered) +
geom_density(aes(x=abs(delta), color=as.factor(most_significant_transition), group=most_significant_transition), size=.15) +
theme_bw() +
base_plot_theme +
scale_y_continuous(expand = c(0,.05)) +
scale_x_continuous(breaks = c(0:8), expand = c(0,0)) +
scale_color_manual(limits=c(1,2,3,4),
values=pal_d3()(5)[c(1:3,5)], labels=c("1", "2", "3", "4/5"), guide=F) +
labs(y="Density", color="Transition", x="ATAC log2(FC)") +
theme(axis.ticks.length = unit(.2, "mm"),
axis.ticks = element_line(size=.15),
plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"),
axis.title.x = element_text(size=4, margin = margin(t=.1,b=0,r=0,l=0, unit="mm")),
axis.title.y = element_text(size=4, margin = margin(r=.1,b=0,t=0,l=0, unit="mm")),
plot.title = element_blank(),
# axis.ticks.y = element_blank(),
# axis.text.y = element_blank(),
panel.grid.major =element_blank(),
panel.grid.minor =element_blank(),
axis.text.x=element_text(size=4, margin = margin(t=.2,b=0,r=0,l=0, unit="mm")),
axis.text.y=element_text(size=2, margin = margin(t=0,b=0,r=0,l=0, unit="mm")),
)
ggsave("figs/main/fig3_atac_all_tran_density.pdf", atac_all_tran_density, width=26.419, height=13.5, units = "mm")
chrom_size_hg38 <- function(chr) {
return(as.numeric(list(
"chr1" = 248956422,
"chr2" = 242193529,
"chr3" = 198295559,
"chr4" = 190214555,
"chr5" = 181538259,
"chr6" = 170805979,
"chr7" = 159345973,
"chr8" = 145138636,
"chr9" = 138394717,
"chr10" = 133797422,
"chr11" = 135086622,
"chr12" = 133275309,
"chr13" = 114364328,
"chr14" = 107043718,
"chr15" = 101991189,
"chr16" = 90338345,
"chr17" = 83257441,
"chr18" = 80373285,
"chr19" = 58617616,
"chr20" = 64444167,
"chr21" = 46709983,
"chr22" = 50818468,
"chrX" = 156040895,
"chrY" = 57227415
)[chr]))
}
chrom_size_hg19 <- function(chr) {
return(as.numeric(list(
"chr1" = 249250621,
"chr2" = 243199373,
"chr3" = 198022430,
"chr4" = 191154276,
"chr5" = 180915260,
"chr6" = 171115067,
"chr7" = 159138663,
"chr8" = 146364022,
"chr9" = 141213431,
"chr10" = 135534747,
"chr11" = 135006516,
"chr12" = 133851895,
"chr13" = 115169878,
"chr14" = 107349540,
"chr15" = 102531392,
"chr16" = 90354753,
"chr17" = 81195210,
"chr18" = 78077248,
"chr20" = 63025520,
"chr19" = 59128983,
"chr22" = 51304566,
"chr21" = 48129895,
"chrX" = 155270560,
"chrY" = 59373566
)[chr]))
}
window_average_genome_atac_transition <- function(edger_results, window_size = 1000000, step_size = 10000, include_empty=FALSE, hg19=T) {
# bed_file_df <- bed_df
chrom_size <- chrom_size_hg38
if (hg19 == T) {
chrom_size <- chrom_size_hg19
}
windowed_bed_df <- tibble(chr = character(),
pos_start = numeric(),
pos_end = numeric(),
delta=numeric())
chromosomes <- unique(edger_results$seqnames)
for (chr in chromosomes) {
print(chr)
chr_size <- chrom_size(chr)
for (s in 0:floor(chr_size / step_size)) {
start_position <- s * step_size
end_position <- min((start_position + window_size), chr_size)
# print(chr)
# print(s)
num_rows <- nrow(edger_results %>%
filter((seqnames == chr) & ((abs(!!start_position - start) <= (window_size/2)) |
(abs(!!start_position - end) <= (window_size/2))))) # (abs((!!start_position) - pos) <= (window_size/2))))
# print(num_rows)
# print(start_position)
# print(end_position)
# print(start)
# print(end)
if ((num_rows > 0)) {
# print(edger_results %>%
# filter((seqnames == chr) & (min(abs(!!start_position - start), abs(!!start_position - end)) <= (window_size/2))))
window_mean_delta <- edger_results %>%
filter((seqnames == chr) & ((abs(!!start_position - start) <= (window_size/2)) |
(abs(!!start_position - end) <= (window_size/2)))) %>%
summarise(chr=first(seqnames),
pos_start=start_position,
pos_end=end_position,
mean_delta=mean(delta),
count=n(),
cumulative_peak_width=sum(width))
window_mean_delta[is.na(window_mean_delta)] <- 0
write_csv(window_mean_delta, "windowed_all_chr_deltas_temp.csv", col_names = F, append = T)
# windowed_bed_df <- add_row(windowed_bed_df,
# chr=chr,
# pos_start=start_position,
# pos_end=end_position,
# delta_1=as.numeric(window_mean_delta$mean_tran_1_delta),
# delta_2=as.numeric(window_mean_delta$mean_tran_2_delta),
# delta_3=as.numeric(window_mean_delta$mean_tran_3_delta),
# delta_4=as.numeric(window_mean_delta$mean_tran_4_delta),
# delta_5=as.numeric(window_mean_delta$mean_tran_5_delta))
}else if(include_empty == T) {
window_mean_delta <- tibble(chr=c(first(chr)),
pos_start=c(start_position),
pos_end=c(end_position),
mean_delta=c(0),
count=c(0),
cumulative_peak_width=c(0))
write_csv(window_mean_delta, "windowed_all_chr_deltas_temp.csv", col_names = F, append = T)
}
}
}
windowed_bed_df <- read_csv("windowed_all_chr_deltas_temp.csv", col_names = c("chr",
"pos_start",
"pos_end",
"mean_delta",
"count",
"cumulative_peak_width"))
file.remove("windowed_all_chr_deltas_temp.csv")
return(windowed_bed_df)
}
h3k27me3_peaks <- read_table2("bedgraphs/GSM1528885_H9_XISTpos_K27me3.peaks.bed", col_names=c("chr", "start", "end", "signal"))
h3k27me3_peaks_chrX <- h3k27me3_peaks %>% filter(chr == "chrX")
h3k27me3_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k27me3_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000,
include_empty = T, calculate_sum = T)
h3k9_xi_peaks <- read_table2("bedgraphs/GSM1528888_H9_XISTpos_K9me3.peaks.bed.gz", col_names=c("chr", "start", "end", "signal"))
h3k9_xi_peaks_chrX <- h3k9_xi_peaks %>% filter(chr == "chrX")
h3k9_xi_peaks_chrX_windowed <- window_average_chrX(bed_file_df=h3k9_xi_peaks_chrX %>% rename(pos=start, delta=signal), window_size = 100000, step_size = 50000, include_empty = T, calculate_sum = T)
# write_tsv(all_edger_passing_threshold %>% filter(seqnames == "chrX") %>% select(seqnames, start, end), "all_edger_passing_threshold_hg38.bed", col_names = F)
all_edger_passing_threshold_hg38 <- read_tsv("all_edger_passing_threshold_hg38.bed", col_names = c("hg38_chr", "hg38_start", "hg38_end")) %>%
filter(hg38_start != 50211461 & hg38_start != 149903954)
liftover_coords_all_edger <- read_tsv("all_edger_passing_threshold_hg19_liftover.bed", col_names = c("hg19_chr", "hg19_start", "hg19_end"))
all_edger_coor_convert_df <- all_edger_passing_threshold_hg38 %>%
cbind(liftover_coords_all_edger)
all_edger_coor_converted_df <- all_edger_passing_threshold %>%
filter(seqnames == "chrX") %>%
right_join(all_edger_coor_convert_df, by=c("seqnames"="hg38_chr", "start"="hg38_start", "end"="hg38_end"))
all_edger_coor_converted_df <- all_edger_coor_converted_df %>%
filter(passes_significance_threashold == T) %>%
mutate(hg19_width = hg19_end - hg19_start)
histone_mark_correlations_atac <- tibble(transition=numeric(),
h3k27me3_correlation=numeric(),
h3k27me3_correlation_p_val=numeric(),
h3k9me3_correlation=numeric(),
h3k9me3_correlation_p_val=numeric(),
dmp_correlation=numeric(),
dmp_correlation_p_val=numeric())
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
for (t in 1:4) {
tran_windowed <- window_average_genome_atac_transition(all_edger_coor_converted_df %>%
filter(hg19_chr == "chrX" & most_significant_transition == t & delta_increasing == T) %>%
select(-seqnames, -start, -end, -width, seqnames=hg19_chr, start=hg19_start, end=hg19_end, width=hg19_width),
window_size = 100000,
step_size = 50000,
include_empty=T,
hg19 = T)
k27_cor_test <- cor.test(abs(tran_windowed$cumulative_peak_width), h3k27me3_peaks_chrX_windowed$delta)
k9_cor_test <- cor.test(abs(tran_windowed$cumulative_peak_width), h3k9_xi_peaks_chrX_windowed$delta) # h3k9me3_signal_chrX$signal)
bed_df <- all_dmp_df %>% filter((hg19_chr == "chrX") & (passes_threshold == T) & (key_transition == t))
averaged_distance <- window_average_chrX(bed_file_df=bed_df %>% rename(chr=hg19_chr, pos=hg19_pos), window_size = 100000, step_size = 50000, include_empty = T)
dmp_cor_test <- cor.test(abs(tran_windowed$cumulative_peak_width), abs(averaged_distance$delta))
# k27_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k27me3_xe_signal_chrX$signal)
# k9_xe_cor_test <- cor.test(abs(averaged_distance$delta), h3k9_xe_peaks_windowed$delta)
histone_mark_correlations_atac <- add_row(histone_mark_correlations_atac,
transition=t,
h3k27me3_correlation=k27_cor_test$estimate,
h3k27me3_correlation_p_val=k27_cor_test$p.value,
h3k9me3_correlation=k9_cor_test$estimate,
h3k9me3_correlation_p_val=k9_cor_test$p.value,
dmp_correlation=dmp_cor_test$estimate,
dmp_correlation_p_val=dmp_cor_test$p.value)
}
histone_cor_df <- reshape2::melt(histone_mark_correlations_atac %>%
dplyr::select(transition,
h3k27me3_correlation,
h3k9me3_correlation,
dmp_correlation),
id.vars=c("transition"))
histone_cor_df$variable <- factor(histone_cor_df$variable, levels=c("h3k9me3_correlation", "h3k27me3_correlation", "dmp_correlation"))
histon_cor_plot <- ggplot(histone_cor_df %>% filter(variable != "dmp_correlation")) +
geom_bar(aes(x=transition, y=value, fill=variable), stat="identity", position=position_dodge()) +
theme_bw() +
base_plot_theme +
coord_flip() +
facet_grid(rows=vars(transition), scales="free_y", labeller = labeller(transition = function(c){
return(list(
"1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4/5"
)[c])
}))+
labs(title="Corr. of ATAC-peaks With Histone Meth.",
x="",
y="Pearson Correlation",
fill="Correlation With: ") +
scale_fill_manual(limits=c("h3k27me3_correlation", "h3k9me3_correlation", "dmp_correlation"),
labels=c("H3K27me3", "H3K9me3", "DMP"),
values = c(pal_jama()(1), "#8C564B", pal_jama()(5)[5]),
guide=F) +
scale_x_continuous(labels = NULL) +
scale_y_continuous(limits=c(-.13,.33)) +
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
legend.text = element_text(size=4),
legend.title = element_text(size=4, face="bold"),
legend.key.width = unit(2, units="mm"),
legend.key.height = unit(3, units="mm"),
legend.key.size = unit(.2, units = "mm"),
panel.grid.minor.y = element_blank(),
axis.ticks.length.y = unit(0, "mm"),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
panel.spacing.y = unit(.75, "mm"))
ggsave("figs/main/fig_3_atac_histone_cor_plot_pearson.pdf", histon_cor_plot, width = 46, height = 40, units = "mm")
## Short arm distances
chr_x_atac_peaks_df_short_arm <- all_edgers_test %>% filter((seqnames == "chrX") & (start <= 58100000))
short_arm_escapees <- mean_methylation_df_escape %>% filter(TSS <= 58100000)
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz")
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c"))
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id &
chromosome_name=="X" &
transcription_start_site <= 58100000) %>%
group_by(ensembl_gene_id) %>%
summarise(transcription_start_site=dplyr::first(transcription_start_site),
end_position=dplyr::first(end_position))
short_arm_escapees_canonical <- hg38_biomart_structure_df_canonical %>%
filter(ensembl_gene_id %in% short_arm_escapees$hg38_ensembl_gene_id)
for (row_idx in 1:nrow(chr_x_atac_peaks_df_short_arm)) {
pos_start <- unlist(chr_x_atac_peaks_df_short_arm[row_idx, "start"])
pos_end <- unlist(chr_x_atac_peaks_df_short_arm[row_idx, "end"])
min_distance <- min(c(abs(short_arm_escapees_canonical["transcription_start_site"] - pos_start),
abs(short_arm_escapees_canonical["transcription_start_site"] - pos_end),
abs(short_arm_escapees_canonical["end_position"] - pos_start),
abs(short_arm_escapees_canonical["end_position"] - pos_end), recursive=T))
#min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos))
chr_x_atac_peaks_df_short_arm[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end)
}
get_random_escapee_permutation <- function(short_arm_escapees, chr_x_atac_peaks_df_short_arm, hg38_biomart_structure_df_canonical=hg38_biomart_structure_df_canonical, n=1000) {
mean_min_distance_random_df_short_arm <- tibble(run=numeric(),
key_transition=numeric(),
mean_distance=numeric())
for (i in 1:n) {
random_escapees <- sample(hg38_biomart_structure_df_canonical$ensembl_gene_id, nrow(short_arm_escapees))
random_escapees_df <- hg38_biomart_structure_df_canonical %>% filter(ensembl_gene_id %in% random_escapees)
test_df <- chr_x_atac_peaks_df_short_arm %>%
filter(passes_significance_threashold == T & delta_increasing == T) %>%
group_by(seqnames, start, end) %>% summarise(
key_transition=dplyr::first(most_significant_transition),
passes_threshold=dplyr::first(passes_significance_threashold),
min_dist_to_random_escapee=min(c(abs(random_escapees_df$transcription_start_site - start),
abs(random_escapees_df$transcription_start_site - end),
abs(random_escapees_df$end_position - start),
abs(random_escapees_df$end_position - end), recursive=T))
)
mean_min_distance_random_df_short_arm <- full_join(mean_min_distance_random_df_short_arm, test_df %>% group_by(key_transition) %>%
summarise(mean_distance=median(min_dist_to_random_escapee)) %>% mutate(run=i),
by=c("run", "key_transition", "mean_distance"))
# write_csv(test_df, "short_arm_pb_random_escapee_distances.csv", append=T, col_names=F)
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold"))
}
return(mean_min_distance_random_df_short_arm)
}
mean_min_distance_random_df_short_arm <- get_random_escapee_permutation(short_arm_escapees, chr_x_atac_peaks_df_short_arm, hg38_biomart_structure_df_canonical = hg38_biomart_structure_df_canonical)
# permutation 1 p-values
p1_atac_short_pvals <- c()
for (t in 1:4) {
p <- nrow(mean_min_distance_random_df_short_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance <= median((chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T &
most_significant_transition==t))$min_dist_to_pb_escapee))))
p2 <- nrow(mean_min_distance_random_df_short_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance >= median((chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T &
most_significant_transition==t))$min_dist_to_pb_escapee))))
print(min(p,p2)/1000)
pval <- min(p,p2)/1000
if(pval == 0) {
pval <- "p < 0.001"
}else {
pval <- sprintf("p = %.3f", pval)
}
p1_atac_short_pvals <- c(p1_atac_short_pvals, pval)
}
# distances_to_random_escapees_short_arm <- read_csv("short_arm_pb_random_escapee_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist"))
# distances_to_random_escapees_short_arm$type <- "random"
get_random_peaks_permutation <- function(short_arm_escapees_canonical, chr_x_atac_peaks_df_short_arm, hg38_biomart_structure_df_canonical=hg38_biomart_structure_df_canonical, n=1000) {
mean_min_distance_random_probes_df_short_arm <- tibble(run=numeric(),
key_transition=numeric(),
mean_distance=numeric())
chr_x_atac_peaks_df_short_arm <- chr_x_atac_peaks_df_short_arm %>%
mutate(rowname=paste(seqnames, start, end, sep="_"))
for (i in 1:n) {
for (t in 1:4) {
random_short_arm_probes <- sample(chr_x_atac_peaks_df_short_arm$rowname, nrow(chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & most_significant_transition==t & delta_increasing == T)))
random_short_arm_probes_df <- chr_x_atac_peaks_df_short_arm %>% filter(rowname %in% random_short_arm_probes)
test_df <- random_short_arm_probes_df %>% group_by(rowname) %>% summarise(
key_transition=t,
passes_threshold=T,
min_dist_to_random_escapee=min(c(abs(short_arm_escapees_canonical$transcription_start_site - start),
abs(short_arm_escapees_canonical$transcription_start_site - end),
abs(short_arm_escapees_canonical$end_position - start),
abs(short_arm_escapees_canonical$end_position - end), recursive=T))
)
mean_min_distance_random_probes_df_short_arm <- add_row(mean_min_distance_random_probes_df_short_arm,
run=i,
key_transition=t,
mean_distance=median(test_df$min_dist_to_random_escapee))
# write_csv(test_df, "short_arm_pb_random_probes_distances.csv", append=T, col_names=F)
# distances_to_random_escapees <- full_join(distances_to_random_escapees, test_df, by=c("key_transition", "min_dist_to_random_escapee", "passes_threshold"))
}
}
return(mean_min_distance_random_probes_df_short_arm)
}
mean_min_distance_random_probes_df_short_arm <- get_random_peaks_permutation(short_arm_escapees_canonical, chr_x_atac_peaks_df_short_arm)
# permutation 2 p-values
p2_atac_short_pvals <- c()
for (t in 1:4) {
p <- nrow(mean_min_distance_random_probes_df_short_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance <= median((chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T &
most_significant_transition==t))$min_dist_to_pb_escapee))))
p2 <- nrow(mean_min_distance_random_probes_df_short_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance >= median((chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T &
most_significant_transition==t))$min_dist_to_pb_escapee))))
print(min(p,p2)/1000)
pval <- min(p,p2)/1000
if(pval == 0) {
pval <- "p < 0.001"
}else {
pval <- sprintf("p = %.3f", pval)
}
p2_atac_short_pvals <- c(p2_atac_short_pvals, pval)
}
# distances_to_random_probes_short_arm <- read_csv("short_arm_pb_random_probes_distances.csv", col_names = c("rowname","key_transition","passes_threshold", "min_dist"))
# distances_to_random_probes_short_arm$type <- "random_probes"
p1_atac_short_df <- tibble(most_significant_transition = c(1:3,5),
pval = p1_atac_short_pvals,
x=0,
y=1)
p2_atac_short_df <- tibble(most_significant_transition = c(1:3,5),
pval = p2_atac_short_pvals,
x=3.5e7,
y=1)
chr_x_atac_peaks_df_short_arm[chr_x_atac_peaks_df_short_arm$most_significant_transition==4, "most_significant_transition"] <- 5
short_arm_permutation_plot <- ggplot(data=chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & delta_increasing == T))+#mean_min_distance_random_df_short_arm) +
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_df_short_arm,
n = 1e4,
fill="darkgrey",
color="darkgray",
size=.3,
alpha=.5) +
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df_short_arm,
n = 1e4,
fill="grey",
color="grey",
size=.3,
alpha=.5) +
facet_grid(most_significant_transition~., scales = "free_y", labeller = labeller(most_significant_transition = function(c){
return(list(
"1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 4/5"
)[c])
})) +
geom_vline(data=(chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & delta_increasing == T) %>% group_by(most_significant_transition) %>% summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(xintercept=mean_distance), size=.3) +
geom_density(data=chr_x_atac_peaks_df_short_arm %>% filter(passes_significance_threashold==T & delta_increasing == T),
mapping=aes(min_dist_to_pb_escapee,
y=..scaled..,
fill=as.factor(most_significant_transition),
color=as.factor(most_significant_transition)),
n = 1e4, alpha=.5, size=.3) +
geom_text(data=(chr_x_atac_peaks_df_short_arm %>%
filter(passes_significance_threashold==T & delta_increasing == T) %>%
group_by(most_significant_transition) %>%
summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(x=mean_distance, y=.5, label=sprintf("%.1f Mb", mean_distance / 1e6)),
hjust=1.1,
size=1.2) +
geom_text(data=p1_atac_short_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=-0.1, vjust=0) +
geom_text(data=p2_atac_short_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=1.1, vjust=0) +
theme_bw() +
# scale_x_continuous(breaks = c(0,.5,1,1.5,2,2.5,3,3.5)*1e7, labels = c(0,.5,1,1.5,2,2.5,3,3.5)*10) +
# scale_x_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8), labels = c(.01,.1,1,10,100)) +
scale_x_sqrt(breaks=c(1e6, 5e6, 1e7, 1.5e7, 2e7, 2.5e7, 3e7), labels=c(1, 5, 10, 15, 20, 25, 30), expand=c(0,0)) +
scale_y_continuous(limits=c(0,1.2), breaks = c(0,.5,1), minor_breaks = c(.25,.75)) +
scale_color_d3(guide=F, limits=c("1","2","3","4","5")) +
scale_fill_d3(guide=F, limits=c("1","2","3","4","5")) +
# scale_fill_material("teal", name="Transition", guide=F) +
# scale_color_material("teal", name="Transition", guide=F) +
labs(title="Min. Distance of ATAC-peaks to Escapee on Short Arm",
x="Minimum Distance to Gene Set (Mb)",
y="Scaled Density") +
base_plot_theme +
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
panel.spacing.y = unit(.75, "mm"))
ggsave("figs/main/fig3_min_dist_atac_escapee_short_arm.pdf", short_arm_permutation_plot, width = 64, height=40, units = "mm")
## Long arm distances
chr_x_atac_peaks_df_long_arm <- all_edgers_test %>% filter((seqnames == "chrX") & (start > 58100000))
long_arm_escapees <- mean_methylation_df_escape %>% filter(TSS > 58100000)
hg38_biomart_structure_df_canonical_long_arm <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id &
chromosome_name=="X" &
transcription_start_site > 58100000) %>%
group_by(ensembl_gene_id) %>%
summarise(transcription_start_site=dplyr::first(transcription_start_site),
end_position=dplyr::first(end_position))
long_arm_escapees_canonical <- hg38_biomart_structure_df_canonical_long_arm %>%
filter(ensembl_gene_id %in% long_arm_escapees$hg38_ensembl_gene_id)
for (row_idx in 1:nrow(chr_x_atac_peaks_df_long_arm)) {
pos_start <- unlist(chr_x_atac_peaks_df_long_arm[row_idx, "start"])
pos_end <- unlist(chr_x_atac_peaks_df_long_arm[row_idx, "end"])
min_distance <- min(c(abs(long_arm_escapees_canonical["transcription_start_site"] - pos_start),
abs(long_arm_escapees_canonical["transcription_start_site"] - pos_end),
abs(long_arm_escapees_canonical["end_position"] - pos_start),
abs(long_arm_escapees_canonical["end_position"] - pos_end), recursive=T))
#min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos))
chr_x_atac_peaks_df_long_arm[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end)
}
mean_min_distance_random_df_long_arm <- get_random_escapee_permutation(long_arm_escapees, chr_x_atac_peaks_df_long_arm, hg38_biomart_structure_df_canonical = hg38_biomart_structure_df_canonical_long_arm)
# permutation 1 p-values
p1_atac_long_pvals <- c()
for (t in 1:4) {
p <- nrow(mean_min_distance_random_df_long_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance <= median((chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T &
most_significant_transition==t))$min_dist_to_pb_escapee))))
p2 <- nrow(mean_min_distance_random_df_long_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance >= median((chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T &
most_significant_transition==t))$min_dist_to_pb_escapee))))
print(min(p,p2)/1000)
pval <- min(p,p2)/1000
if(pval == 0) {
pval <- "p < 0.001"
}else {
pval <- sprintf("p = %.3f", pval)
}
p1_atac_long_pvals <- c(p1_atac_long_pvals, pval)
}
mean_min_distance_random_probes_df_long_arm <- get_random_peaks_permutation(long_arm_escapees_canonical, chr_x_atac_peaks_df_long_arm, hg38_biomart_structure_df_canonical = hg38_biomart_structure_df_canonical_long_arm)
# permutation 2 p-values
p2_atac_long_pvals <- c()
for (t in 1:4) {
p <- nrow(mean_min_distance_random_probes_df_long_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance <= median((chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T &
most_significant_transition==t))$min_dist_to_pb_escapee))))
p2 <- nrow(mean_min_distance_random_probes_df_long_arm %>%
arrange(mean_distance) %>% filter(key_transition == t &
(mean_distance >= median((chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T &
most_significant_transition==t))$min_dist_to_pb_escapee))))
print(min(p,p2)/1000)
pval <- min(p,p2)/1000
if(pval == 0) {
pval <- "p < 0.001"
}else {
pval <- sprintf("p = %.3f", pval)
}
p2_atac_long_pvals <- c(p2_atac_long_pvals, pval)
}
p1_atac_long_df <- tibble(most_significant_transition = c(1:4),
pval = p1_atac_long_pvals,
x=0,
y=1)
p2_atac_long_df <- tibble(most_significant_transition = c(1:4),
pval = p2_atac_long_pvals,
x=6.5e7,
y=1)
long_arm_permutation_plot_atac_peaks <- ggplot(data=chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & delta_increasing == T))+#mean_min_distance_random_df_short_arm) +
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_df_long_arm,
n = 1e4,
fill="darkgrey",
color="darkgray",
size=.3,
alpha=.5) +
geom_density(aes(mean_distance, y=..scaled..), data=mean_min_distance_random_probes_df_long_arm,
n = 1e4,
fill="grey",
color="grey",
size=.3,
alpha=.5) +
facet_grid(most_significant_transition~., scales = "free_y", labeller = labeller(most_significant_transition = function(c){
return(list(
"1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4/5",
"5"="Tran. 5"
)[c])
})) +
geom_vline(data=(chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & delta_increasing == T) %>% group_by(most_significant_transition) %>% summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(xintercept=mean_distance), size=.3) +
geom_density(data=chr_x_atac_peaks_df_long_arm %>% filter(passes_significance_threashold==T & delta_increasing == T),
mapping=aes(min_dist_to_pb_escapee,
y=..scaled..,
fill=as.factor(most_significant_transition),
color=as.factor(most_significant_transition)),
n = 1e4, alpha=.5, size=.3) +
geom_text(data=(chr_x_atac_peaks_df_long_arm %>%
filter(passes_significance_threashold==T & delta_increasing == T) %>%
group_by(most_significant_transition) %>%
summarise(mean_distance=median(min_dist_to_pb_escapee))),
mapping = aes(x=mean_distance, y=.5, label=sprintf("%.1f Mb", mean_distance / 1e6)),
hjust=1.1,
size=1.2) +
geom_text(data=p1_atac_long_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=-0.1, vjust=0) +
geom_text(data=p2_atac_long_df, mapping=aes(x=x, y=y, label=pval), inherit.aes = F, size=1.2, hjust=1.1, vjust=0) +
theme_bw() +
# scale_x_continuous(breaks = c(0,.5,1,1.5,2,2.5,3,3.5)*1e7, labels = c(0,.5,1,1.5,2,2.5,3,3.5)*10) +
# scale_x_log10(limits = c(1e4,1e8), breaks=c(1e4, 1e5,1e6,1e7,1e8), labels = c(.01,.1,1,10,100)) +
scale_x_sqrt(breaks=c(1e6, 5e6, 1e7, 1.5e7, 2e7,
2.5e7, 3e7, 3.5e7, 4e7, 4.5e7, 5e7, 5.5e7, 6e7),
labels=c(1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60), expand=c(0,0)) +
scale_y_continuous(limits=c(0,1.2), breaks = c(0,.5,1), minor_breaks = c(.25,.75)) +
scale_color_d3(guide=F, limits=c(1:3,5,4)) +
scale_fill_d3(guide=F, limits=c(1:3,5,4)) +
# scale_fill_material("teal", name="Transition", guide=F) +
# scale_color_material("teal", name="Transition", guide=F) +
labs(title="Min. Distance of ATAC-peaks to Escapee on Long Arm",
x="Minimum Distance to Gene Set (Mb)",
y="Scaled Density") +
base_plot_theme +
theme(strip.text = element_text(size=5, margin = margin(0,.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
legend.key.width = unit(2, units="mm"),
legend.key.height = unit(3, units="mm"),
legend.key.size = unit(.2, units = "mm"),
panel.spacing.y = unit(.75, "mm"))
ggsave("figs/main/min_dist_atac_escapee_long_arm.pdf", long_arm_permutation_plot_atac_peaks, width = 64, height=40, units = "mm")
# escapee distance cumulative plots
probability_tib_short_arm <- tibble(
min_distance = numeric(),
transition = numeric(),
probability = numeric()
)
for (dist_to_escapee in seq(1, 7e7, 1e6)) {
# dist_to_escapee <- dist_to_escapee * 1e4
sites_within_dist <- chr_x_atac_peaks_df_short_arm %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_significance_threashold == T))
for (t in 1:5) {
prob <- nrow(sites_within_dist %>% filter(most_significant_transition == t & passes_significance_threashold == T)) /
nrow(chr_x_atac_peaks_df_short_arm %>% filter(most_significant_transition == t & passes_significance_threashold == T))
probability_tib_short_arm <- add_row(probability_tib_short_arm,
min_distance = dist_to_escapee,
transition = t,
probability = prob)
}
}
d50_df_short <- tibble(transition=numeric(),d50=numeric())
for (t in 1:5) {
d50_dist <- as.numeric(chr_x_atac_peaks_df_short_arm %>%
filter(most_significant_transition == t & passes_significance_threashold==T) %>%
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>%
dplyr::select(min_dist_to_pb_escapee))
d50_df_short <- add_row(d50_df_short, transition=t, d50=d50_dist)
}
probability_of_atac_peak_short_arm <- ggplot(na.omit(probability_tib_short_arm), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) +
geom_line() +
labs(x = "Minimum Distance to Escapee (Mb)",
y = "Fraction of Transition Sites",
color = "Transition",
title="Probability of Open Chromatin on Short Arm by\nDistance from Escapee") +
theme_bw() +
scale_color_futurama(breaks=c(0,1,2,3,5,4), labels=c("0","1", "2", "3", "NA", "4/5")) +
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/supplementary/sup2_prob_atac_peak_short_arm.svg", probability_of_atac_peak_short_arm, width = 58, height=40, units = "mm")
probability_tib_long_arm <- tibble(
min_distance = numeric(),
transition = numeric(),
probability = numeric()
)
for (dist_to_escapee in seq(1, 7e7, 1e6)) {
# dist_to_escapee <- dist_to_escapee * 1e4
sites_within_dist <- chr_x_atac_peaks_df_long_arm %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_significance_threashold == T))
for (t in 1:5) {
prob <- nrow(sites_within_dist %>% filter(most_significant_transition == t & passes_significance_threashold == T)) /
nrow(chr_x_atac_peaks_df_long_arm %>% filter(most_significant_transition == t & passes_significance_threashold == T))
probability_tib_long_arm <- add_row(probability_tib_long_arm,
min_distance = dist_to_escapee,
transition = t,
probability = prob)
}
}
d50_df <- tibble(transition=numeric(),d50=numeric())
for (t in 1:5) {
d50_dist <- as.numeric(chr_x_atac_peaks_df_long_arm %>%
filter(most_significant_transition == t & passes_significance_threashold==T) %>%
arrange(min_dist_to_pb_escapee) %>% filter(row_number()==ceiling(n()/2)) %>%
dplyr::select(min_dist_to_pb_escapee))
d50_df <- add_row(d50_df, transition=t, d50=d50_dist)
}
probability_of_atac_peaks_long_arm <- ggplot(na.omit(probability_tib_long_arm), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) +
geom_line() +
labs(x = "Minimum Distance to Escapee (Mb)",
y = "Fraction of Transition Sites",
color = "Transition",
title="Probability of Open Chromatin on Long Arm by\nDistance from Escapee") +
theme_bw() +
scale_color_futurama(breaks=c(0,1,2,3,5,4), labels=c("0","1", "2", "3", "NA", "4/5")) +
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/supplementary/sup2_prob_atac_peak_long_arm.svg", probability_of_atac_peaks_long_arm, width = 58, height=40, units = "mm")
#probability of escape for whole x chromosome
chr_x_atac_peaks_df_whole_chrom <- all_edgers_test %>% filter((seqnames == "chrX"))
whole_chrom_escapees <- mean_methylation_df_escape
hg38_biomart_structure_df_canonical_whole_chrom <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id &
chromosome_name=="X") %>%
group_by(ensembl_gene_id) %>%
summarise(transcription_start_site=dplyr::first(transcription_start_site),
end_position=dplyr::first(end_position))
whole_chrom_escapees_canonical <- hg38_biomart_structure_df_canonical_whole_chrom %>%
filter(ensembl_gene_id %in% whole_chrom_escapees$hg38_ensembl_gene_id)
for (row_idx in 1:nrow(chr_x_atac_peaks_df_whole_chrom)) {
pos_start <- unlist(chr_x_atac_peaks_df_whole_chrom[row_idx, "start"])
pos_end <- unlist(chr_x_atac_peaks_df_whole_chrom[row_idx, "end"])
min_distance <- min(c(abs(whole_chrom_escapees_canonical["transcription_start_site"] - pos_start),
abs(whole_chrom_escapees_canonical["transcription_start_site"] - pos_end),
abs(whole_chrom_escapees_canonical["end_position"] - pos_start),
abs(whole_chrom_escapees_canonical["end_position"] - pos_end), recursive=T))
#min_distance_end <- min(abs(short_arm_escapees["end_position"] - pos))
chr_x_atac_peaks_df_whole_chrom[row_idx, "min_dist_to_pb_escapee"] <- min_distance #min(min_distance, min_distance_end)
}
probability_tib_whole_chrom <- tibble(
min_distance = numeric(),
transition = numeric(),
probability = numeric()
)
for (dist_to_escapee in seq(1, 7e7, 1e6)) {
# dist_to_escapee <- dist_to_escapee * 1e4
sites_within_dist <- chr_x_atac_peaks_df_whole_chrom %>% filter(((min_dist_to_pb_escapee <= dist_to_escapee)) & (passes_significance_threashold == T))
for (t in 1:5) {
prob <- nrow(sites_within_dist %>% filter(most_significant_transition == t & passes_significance_threashold == T)) /
nrow(chr_x_atac_peaks_df_whole_chrom %>% filter(most_significant_transition == t & passes_significance_threashold == T))
probability_tib_whole_chrom <- add_row(probability_tib_whole_chrom,
min_distance = dist_to_escapee,
transition = t,
probability = prob)
}
}
probability_of_atac_peaks_whole_chrom <- ggplot(na.omit(probability_tib_whole_chrom), aes(x=min_distance, y=probability, group=transition, color=as.factor(transition))) +
geom_line() +
labs(x = "Minimum Distance to Escapee (Mb)",
y = "Fraction of Transition Sites",
color = "Transition",
title="Probability of Open Chromatin on Whole Chromosome by\nDistance from Escapee") +
theme_bw() +
scale_color_futurama(breaks=c(0,1,2,3,5,4), labels=c("0","1", "2", "3", "NA", "4/5")) +
scale_x_continuous(breaks = (0:6)*1e7, labels = (0:6)*10) +
base_plot_theme +
theme(legend.title = element_text(size=5, face="bold"),
legend.text = element_text(size=5),
legend.key.width = unit(2, "mm"))
ggsave("figs/supplementary/sup2_prob_atac_peak_whole_chrom.svg", probability_of_atac_peaks_whole_chrom, width = 58, height=40, units = "mm")
## Figure for XIST
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz")
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c"))
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id &
chromosome_name=="X") %>%
group_by(ensembl_gene_id) %>%
summarise(external_gene_name=dplyr::first(external_gene_name),
transcription_start_site=dplyr::first(transcription_start_site),
start_position=dplyr::first(start_position),
end_position=dplyr::first(end_position))
xist_cov_start <- 73845000 # as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "XIST"))$start_position[1])
xist_cov_end <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "XIST"))$end_position[1]) + 5000
firre_cov_start <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "FIRRE"))$start_position[1])
firre_cov_end <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "FIRRE"))$end_position[1]) + 5000
xist_peaks <- tibble(sample_atac_srr=character(),
chr=character(),
start=numeric(),
end=numeric(),
cluster=numeric(),
pileup=numeric(),
abs_summit=numeric())
firre_peaks <- tibble(sample_atac_srr=character(),
chr=character(),
start=numeric(),
end=numeric(),
cluster=numeric(),
pileup=numeric(),
abs_summit=numeric())
for (i in 1:nrow(atac_sample_info)) {
srr <- as.character(atac_sample_info[i, "atac_accession"])
cluster <- as.numeric(atac_sample_info[i, "cluster"])
peaks_file <- as.character(atac_sample_info[i, "Peaks"])
peaks_df <- read_tsv(peaks_file, comment = "#")
# chrX:73,820,656-73,852,723)
sample_xist_peaks <- peaks_df %>%
filter(chr == "chrX" &
start >= 73820656 &
end <= 73852723 + 5000) %>%
mutate(sample_atac_srr=srr,
cluster=cluster) %>%
dplyr::select(sample_atac_srr, chr, start, end, cluster, pileup, abs_summit)
xist_peaks <- xist_peaks %>%
rbind(sample_xist_peaks)
# chrX:73,820,656-73,852,723)
sample_firre_peaks <- peaks_df %>%
filter(chr == "chrX" &
end >= firre_cov_start &
start <= firre_cov_end) %>%
mutate(sample_atac_srr=srr,
cluster=cluster) %>%
dplyr::select(sample_atac_srr, chr, start, end, cluster, pileup, abs_summit)
firre_peaks <- firre_peaks %>%
rbind(sample_firre_peaks)
}
ggplot(xist_peaks) +
annotate("rect", xmin=73851080, xmax=73851597, ymin=0, ymax=150, fill="grey") +
annotate("rect", xmin=73851995, xmax=73852383, ymin=0, ymax=150, fill="red") +
annotate("rect", xmin=73851195, xmax=73851236, ymin=0, ymax=150, fill="blue") +
annotate("rect", xmin=73849811, xmax=73849975, ymin=0, ymax=150, fill="green") +
geom_vline(xintercept=73852723) +
geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=pileup, fill=sample_atac_srr), alpha=.2, color="black") +
facet_grid(rows=vars(cluster)) +
scale_x_continuous(limits=c(73845000, 73852723+5000))
# plot xist peaks using the read bed files
all_reads_df <- tibble(chr=character(),
start=numeric(),
end=numeric(),
score=numeric(),
sample=character(),
cluster=numeric())
for (i in 1:nrow(atac_sample_info)) {
srr <- as.character(atac_sample_info[i, "atac_accession"])
cluster <- as.numeric(atac_sample_info[i, "cluster"])
bedgraph_file <- sprintf("banovich-atacseq-data/macs2_peaks/%s.BedGraph", srr)
reads_df <- read_tsv(bedgraph_file, col_names = c("chr", "start", "end", "score"))
reads_df <- reads_df %>%
filter(chr == "chrX" &
((end >= xist_cov_start & start <= xist_cov_end) |
(end >= firre_cov_start & start <= firre_cov_end)))
reads_df$sample <- srr
reads_df$cluster <- cluster
all_reads_df <- all_reads_df %>%
rbind(reads_df)
}
write_csv(all_reads_df, "xist_firre_atac_coverage_reads.csv.gz")
xist_gene_end <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "XIST"))$end_position[1])
xist_facet_df <- tibble(name=c("XIST", "Minimal Promoter", "CpgII/P2", "YY1", "YY1", "YY1", "YY1", "Exon1"),
cluster=6,
ymin=0,
ymax=.3,
xmin=c(xist_cov_start+1500, xist_gene_end, 73851080, 73851489, 73851374, 73851341, 73851205, 73841382),
xmax=c(xist_gene_end, 73852964, 73851597, 73851498, 73851383, 73851350, 73851214, 73852753))
xist_tss_df <- tibble(cluster=6,
group=c("down_line", "down_line", "left_line", "left_line", "arrow", "arrow", "arrow"),
x=c(xist_gene_end, xist_gene_end, xist_gene_end, xist_gene_end-100, xist_gene_end-100, xist_gene_end-100, xist_gene_end-200),
y=c(.3, -.1, -.1, -.1, -.05, -.15, -.1))
xist_text_df <- tibble(cluster=6,
x=xist_gene_end-350,
y=-.1,
text="TSS")
library(facetscales)
scales_y <- list(
"0" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"0.5" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"1" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"2" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"3" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"4" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"5" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"6" = scale_y_continuous(limits=c(-.25, .55), expand=c(0,0), breaks=NULL)
)
significant_diff_peaks_xist_df <- read_csv("all_edgers_test.csv.gz") %>%
filter(passes_significance_threashold == T &
seqnames == "chrX" &
start <= (xist_cov_end-3500) &
end >= (xist_cov_start+1500)) %>%
mutate(cluster=most_significant_transition - 1)
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv.gz")
xist_annotation_df <- filtered_site_annotation_df %>% filter(hg38_chromosome == "chrX" & between(hg38_pos, xist_cov_start, xist_cov_end))
# separate low XIST lines in cluster 0/A
all_reads_df <- all_reads_df %>%
mutate(cluster=case_when(sample %in% c("SRR6412441", "SRR6412449") ~ .5,
T ~ cluster))
xist_atac_coverage_plot <- ggplot(all_reads_df) +
geom_vline(data=xist_annotation_df, mapping=aes(xintercept=hg38_pos), alpha=.3, size=.15) +
geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=score), alpha=.2, fill="black") +
#fill=as.factor(cluster)),
geom_segment(data=xist_peaks, aes(x=start, xend=end, y=-.1, yend=-.1, color="no change"), size=.7, alpha=.4) +
geom_segment(data=significant_diff_peaks_xist_df, aes(x=start, xend=end, y=-.1, yend=-.1, color=delta_increasing), size=.7, alpha=1) +
facet_grid_sc(rows = vars(as.factor(cluster)), space="free_y", scales = list(y=scales_y), labeller = labeller(.rows=function(n){
return(list("0"="Cluster A",
"0.5"="Cluster A (XIST-)",
"1"="Cluster B",
"2"="Cluster C",
"3"="Cluster D",
"5"="Cluster F",
"6"="XIST")[n])
})) +
geom_rect(data= xist_facet_df, aes(xmin=xmin, xmax=xmax, fill=name, ymin = ymin, ymax=ymax)) +
geom_polygon(data=xist_tss_df, aes(x=x, y=y, group=group)) +
geom_path(data=xist_tss_df, aes(x=x, y=y, group=group), size=.3) +
geom_text(data=xist_text_df, aes(x=x, y=y, label=text), size=1.2) +
theme_bw() +
base_plot_theme +
scale_fill_manual(values = c("#000000","#000000", "#7AC943", "#FF931E", "#ED1C24"),
breaks = c("XIST", "Exon1", "Minimal Promoter", "CpgII/P2", "YY1"),
labels = c("XIST", "Exon 1", "Minimal Promoter", "CpgII/P2", "YY1")) +
scale_color_manual(values = c("#000000", pal_npg()(2)),
breaks = c("no change", "FALSE", "TRUE"),
labels=c("Invariant", "Closing", "Opening")) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3)) +
scale_x_continuous(limits=c(xist_cov_start+1500, xist_cov_end-3500), expand=c(0,0)) +
labs(title="XIST ATAC Coverage",
x="X Chr Coordinate",
y="Reads per million",
fill="XIST Region",
color="ATAC Peak")
ggsave("figs/main/fig2_xist_atac_coverage_plot.svg",xist_atac_coverage_plot, width=84, height=80, units = "mm")
scales_y_firre_atac <- list(
"0" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"1" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"2" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"3" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"4" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"5" = scale_y_continuous(limits=c(-.2, 1.2), expand=c(0,0), breaks=c(0,.5,1)),
"6" = scale_y_continuous(limits=c(-.25, .55), expand=c(0,0), breaks=NULL)
)
firre_gene_end <- as.numeric((hg38_biomart_structure_df_canonical %>% filter(external_gene_name == "FIRRE"))$transcription_start_site[1])
firre_repeats <- read_table("firre.fa.out_repeats",
col_names = c("SW_score",
"perc_div",
"perc_del",
"perc_ins",
"query_sequence",
"pos_start",
"pos_end",
"pos_left",
"strand",
"matching_repeat",
"repeat_class_family",
"pos_in_repeat_start",
"pos_in_repeat_end",
"pos_in_repeat_left",
"id",
"astericks"),
skip = 3)
firre_facet_df <- tibble(name=c("FIRRE", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon", "Exon"),
cluster=6,
ymin=0,
ymax=.3,
xmin=c(firre_cov_start+1500, 131830604, 131825365, 131794466, 131794115, 131785258, 131784039, 131777744, 131768696, 131756709, 131755746, 131749458, 131692636),
xmax=c(firre_gene_end, 131830292, 131825222, 131794324, 131793994, 131785157, 131783887, 131777623, 131768549, 131756611, 131755598, 131749306, 131688779)) %>%
rbind( firre_repeats %>%
filter(repeat_class_family=="Unspecified" & !is.na(matching_repeat) & matching_repeat != "") %>%
mutate(end=131830862-pos_start+1, start=131830862-pos_end+1, cluster=6, name=matching_repeat, ymin=0, ymax=.3) %>%
select(name, cluster, ymin, ymax, xmin=start, xmax=end) %>%
filter(xmax >= firre_cov_start & xmin <= firre_cov_end & name %in% c("R=1"))) %>% #,"R=3","R=12","R=15","R=17"
mutate(name=factor(name,
levels = (c("FIRRE",
"Exon",
"R=0",
"R=1",
"R=2",
"R=3",
"R=4",
"R=5",
"R=6",
"R=7",
"R=8",
"R=9",
"R=10",
"R=11",
"R=12",
"R=13",
"R=14",
"R=15",
"R=16",
"R=17"))))
# firre_facet_df[firre_facet_df$name == "R5", "ymin"] <- -1
firre_tss_df <- tibble(cluster=6,
group=c("down_line", "down_line", "left_line", "left_line", "arrow", "arrow", "arrow"),
x=c(firre_gene_end, firre_gene_end, firre_gene_end, firre_gene_end-2000, firre_gene_end-2000, firre_gene_end-2000, firre_gene_end-4000),
y=c(.3, -.1, -.1, -.1, -.05, -.15, -.1))
firre_text_df <- tibble(cluster=6,
x=firre_gene_end-7000,
y=-.1,
text="TSS")
significant_diff_peaks_firre_df <- read_csv("all_edgers_test.csv.gz") %>%
filter(passes_significance_threashold == T &
seqnames == "chrX" &
start <= (firre_cov_end) &
end >= (firre_cov_start)) %>%
mutate(cluster=most_significant_transition - 1)
firre_annotation_df <- filtered_site_annotation_df %>% filter(hg38_chromosome == "chrX" & between(hg38_pos, firre_cov_start, firre_cov_end))
firre_atac_coverage_plot <- ggplot(all_reads_df) +
geom_vline(data=firre_annotation_df, mapping=aes(xintercept=hg38_pos), alpha=.3, size=.15) +
geom_rect(aes(xmin=start, xmax=end, ymin=0, ymax=score), alpha=.2, fill="black") +
geom_segment(data=firre_peaks, aes(x=start, xend=end, y=-.1, yend=-.1, color="no change"), size=.7, alpha=.4) +
geom_segment(data=significant_diff_peaks_firre_df, aes(x=start, xend=end, y=-.1, yend=-.1, color=delta_increasing), size=.7, alpha=1) +
facet_grid_sc(rows = vars(factor(cluster, levels=c(6,0:3,5))), scales = list(y=scales_y_firre_atac), space="free_y", labeller = labeller(.rows=function(n){
return(list("0"="Cluster A",
"1"="Cluster B",
"2"="Cluster C",
"3"="Cluster D",
"5"="Cluster F",
"6"="FIRRE")[n])
})) +
geom_rect(data= firre_facet_df, aes(xmin=xmin, xmax=xmax, fill=name, ymin = ymin, ymax=ymax)) +
geom_polygon(data=firre_tss_df, aes(x=x, y=y, group=group)) +
geom_path(data=firre_tss_df, aes(x=x, y=y, group=group), size=.3) +
geom_text(data=firre_text_df, aes(x=x, y=y, label=text), size=1.2) +
scale_fill_manual(values = c("#0195F8","#000000", "yellow"),# "red", "red", "red", "red","red", "red", "red", "red", "red", "red", "red", "red", "red", "red","red", "red", "red"),
breaks = c("FIRRE",
"Exon",
# "R=0",
"R=1",
# "R=2",
"R=3",
# "R=4",
# "R=5",
# "R=6",
# "R=7",
# "R=8",
# "R=9",
# "R=10",
# "R=11",
"R=12",
# "R=13",
# "R=14",
"R=15",
# "R=16",
"R=17"),
labels = c("Intron",
"Exon",
"Repeat",
"Repeat",
"Repeat",
"Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
# "Repeat",
"Repeat")) +
theme_bw() +
base_plot_theme +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3)) +
scale_color_manual(values = c("#000000", pal_npg()(2)),
breaks = c("no change", "FALSE", "TRUE"),
labels=c("Invariant", "Closing", "Opening")) +
scale_x_continuous(limits=c(firre_cov_start, firre_cov_end), expand=c(0,0)) +
labs(title="FIRRE ATAC Coverage",
x="X Chr Coordinate",
y="Reads per million")
ggsave("figs/main/fig2_firre_atac_coverage_plot.svg",firre_atac_coverage_plot, width=84, height=80, units = "mm")
heatmap_xist_peaks <- tibble(start=numeric(),
end=numeric(),
sample=character(),
strength=numeric(),
cluster=numeric())
for (i in 1:nrow(atac_sample_info)) {
srr <- as.character(atac_sample_info[i, "atac_accession"])
cluster <- as.numeric(atac_sample_info[i, "cluster"])
for (x in 1:18) {
start_coor <- 73847000+((x-1)*500)
end_coor <- 73847000+((x)*500)
pileup <- NA
if (nrow(xist_peaks %>%
filter(sample_atac_srr == srr &
start <= end_coor &
end >= start_coor)) > 0){
pileup <- as.numeric((xist_peaks %>%
filter(sample_atac_srr == srr &
start <= end_coor &
end >= start_coor))[1,"fold_enrichment"])
# if (pileup < 50) {
# pileup <- 50
# }else if (pileup > 100) {
# pileup<- 100
# }
}
heatmap_xist_peaks <- add_row(heatmap_xist_peaks,
start=start_coor,
end=end_coor,
sample=srr,
cluster=cluster,
strength=pileup)
}
}
ggplot(heatmap_xist_peaks) +
geom_tile(aes(x=sample, y=start, fill=strength))+
facet_grid(cols=vars(cluster), scale="free_x") +
scale_fill_viridis_c()
# Footprint motif analysis
# motif_counts_df <- tibble(sample_atac_srr=character(),
# motif_tf=character(),
# cluster=numeric(),
# cou=numeric())
motif_counts_all <- tibble(tf_motif=character())
for (i in 1:nrow(atac_sample_info)) {
srr <- as.character(atac_sample_info[i, "atac_accession"])
cluster <- as.numeric(atac_sample_info[i, "cluster"])
motif_file <- sprintf("banovich-atacseq-data/macs2_peaks/hint_atac_output/match/%s_1_mpbs.bed", srr)
sample_motifs_df <- read_tsv(motif_file, col_names = c("chr","start","end","tf_motif","score","strand", "empty"))
motif_counts <- sample_motifs_df %>%
count(tf_motif) %>%
rename(!!srr:=n) #%>%
#add_row(tf_motif="cluster",
# !!srr:=cluster)
motif_counts_all <- full_join(motif_counts_all,
motif_counts,
by=c("tf_motif"))
# chrX:73,820,656-73,852,723)
}
arranged_sample_names <- (atac_sample_info %>%
arrange(cluster))$atac_accession
motif_counts_all <- motif_counts_all %>% select(tf_motif, !!arranged_sample_names)
library(broom)
motif_counts_all_norm <- motif_counts_all
count_sums_df <- colSums(motif_counts_all %>% select(-tf_motif)) %>% tidy()
for (i in 1:nrow(count_sums_df)) {
srr <- as.character(count_sums_df[i, "names"])
total <- as.numeric(count_sums_df[i, "x"])
counts_col <- as.numeric(unlist(motif_counts_all_norm[srr]))
motif_counts_all_norm <- motif_counts_all_norm %>%
mutate(!!srr:=1e6*counts_col/total)
}
write_csv(motif_counts_all %>% select(tf_motif, !!arranged_sample_names), "motif_counts_all.csv")
write_csv(motif_counts_all_norm %>% select(tf_motif, !!arranged_sample_names), "motif_counts_all_norm.csv")
tf_anova_pvals <- tibble(tf_motif=character(), pval=numeric())
for (tf in unique(motif_counts_all_norm$tf_motif)) {
tf_df <- motif_counts_all_norm %>%
filter(tf_motif == tf)
motif_counts_all_melted <- tf_df %>%
reshape2::melt(id.vars="tf_motif", variable.name="sample_srr", value.name="count") %>%
left_join(atac_sample_info %>% select(atac_accession, cluster), by=c("sample_srr"="atac_accession")) %>%
filter(cluster %in% c(0:5))
aov_obj <- tidy(aov(count ~ cluster, motif_counts_all_melted))
pval <- as.numeric(unlist(aov_obj[1, "p.value"]))
tf_anova_pvals <- add_row(tf_anova_pvals,
tf_motif=tf,
pval=pval)
}
sig_tfs <- unlist((tf_anova_pvals %>% filter(pval <= .1))$tf_motif)
motif_counts_all_melted_sig <- motif_counts_all_norm %>%
reshape2::melt(id.vars="tf_motif", variable.name="sample_srr", value.name="count") %>%
left_join(atac_sample_info %>% select(atac_accession, cluster), by=c("sample_srr"="atac_accession")) %>%
filter(tf_motif %in% sig_tfs)
ggplot(motif_counts_all_melted_sig %>% filter(cluster %in% c(0:5))) +
geom_boxplot(aes(x=cluster, y=count, group=cluster, fill=factor(cluster))) +
facet_wrap(vars(tf_motif), scales = "free_y")
scale_x_continuous(limits=c(73820656, 73852723+5000))
xist_peaks_plot_df <- xist_peaks %>%
reshape2::melt(id.vars=c("sample_atac_srr", "cluster", "pileup"),
measure.vars=c("start", "end", "abs_summit"),
value.name="pos") %>%
mutate(pileup=case_when(variable == "start" ~ 0,
variable == "end" ~ 0,
variable == "abs_summit" ~pileup))
ggplot(xist_peaks_plot_df) +
geom_line(aes(x=pos,y=pileup, color=sample_atac_srr), alpha=.2) +
facet_grid(rows=vars(cluster)) +
scale_x_continuous(limits=c(73820656, 73852723+5000))
#concordance with dmps
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
all_edgers_test <- read_csv("all_edgers_test.csv.gz")
coverage_distance_from_dmp_df <- tibble(dist_from_dmp=numeric(),
covered_atac_peaks=numeric(),
covered_opposite=numeric(),
total_atac_peaks=numeric())
library(IRanges)
for(d in c(100, 500, 1000, 10e3, 100e3)) {
count <- 0
rangesA <- split(IRanges(all_dmp_df$hg38_pos - d, all_dmp_df$hg38_pos + d), all_dmp_df$hg38_chr)
rangesB <- split(IRanges(all_edgers_test$start, all_edgers_test$end), all_edgers_test$seqnames)
#which rangesB wholly contain at least one rangesA?
ov <- sum(as.data.frame(countOverlaps(rangesB, rangesA))$value)
rev_ov <- nrow(as.data.frame(subsetByOverlaps(rangesB, rangesA)))
coverage_distance_from_dmp_df <- add_row(coverage_distance_from_dmp_df,
dist_from_dmp=d,
covered_atac_peaks=ov,
covered_opposite=rev_ov,
total_atac_peaks=nrow(all_edgers_test))
}
canonical_transcripts <- read_csv("hg38_canonical_transcripts.csv.gz")
hg38_biomart_structure_df <- read_csv("hg38_biomart_structure_df.csv.gz", col_types = cols(chromosome_name="c"))
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df %>%
filter(ensembl_transcript_id %in% canonical_transcripts$ensembl_transcript_id) %>%
group_by(ensembl_gene_id) %>%
summarise(external_gene_name=dplyr::first(external_gene_name),
description=dplyr::first(description),
transcription_start_site=dplyr::first(transcription_start_site),
chromosome_name=dplyr::first(chromosome_name),
end_position=dplyr::first(end_position),
start_position=dplyr::first(start_position),
strand=dplyr::first(strand)) %>%
mutate(promoter_start=case_when(strand == 1 ~ transcription_start_site-5000,
strand == -1 ~ start_position),
promoter_end=case_when(strand == 1 ~ end_position,
strand == -1 ~ transcription_start_site+5000))
all_edgers_test_2 <- all_edgers_test
hg38_biomart_structure_df_canonical <- hg38_biomart_structure_df_canonical %>% arrange(end_position - start_position)
for (r in 1:nrow(hg38_biomart_structure_df_canonical)) {
ensembl_gene_id <- as.character(unlist(hg38_biomart_structure_df_canonical[r, "ensembl_gene_id"]))
external_gene_name <- as.character(unlist(hg38_biomart_structure_df_canonical[r, "external_gene_name"]))
description <- as.character(unlist(hg38_biomart_structure_df_canonical[r, "description"]))
chr <- sprintf("chr%s", as.character(unlist(hg38_biomart_structure_df_canonical[r, "chromosome_name"])))
promoter_start <- as.integer((unlist(hg38_biomart_structure_df_canonical[r, "promoter_start"])))
promoter_end <- as.integer((unlist(hg38_biomart_structure_df_canonical[r, "promoter_end"])))
all_edgers_test_2 <- all_edgers_test_2 %>%
mutate(ensembl_id=case_when(start <= promoter_end & end >= promoter_start & seqnames == chr ~ensembl_gene_id,
TRUE ~ensembl_id),
gene=case_when(start <= promoter_end & end >= promoter_start & seqnames == chr ~external_gene_name,
TRUE ~gene),
gene_description=case_when(start <= promoter_end & end >= promoter_start & seqnames == chr ~description,
TRUE ~gene_description),
)
}
write_csv(all_edgers_test_2, "all_edgers_test_2_full_gene.csv.gz")
all_dmp_df <- read_csv("all_dmp_df.csv.gz")
library(radiant.data)
grouped_dmp_df <- all_dmp_df %>%
group_by(hg38_gene) %>%
summarise(cluster_0_mean_meth=mean(cluster_0_mean),
cluster_1_mean_meth=mean(cluster_1_mean),
cluster_2_mean_meth=mean(cluster_2_mean),
cluster_3_mean_meth=mean(cluster_3_mean),
cluster_4_mean_meth=mean(cluster_4_mean),
cluster_5_mean_meth=mean(cluster_5_mean),
transition_1_probes=sum(key_transition == 1),
transition_2_probes=sum(key_transition == 2),
transition_3_probes=sum(key_transition == 3),
transition_4_probes=sum(key_transition == 4),
transition_5_probes=sum(key_transition == 5),
transition_1_probes_passing_increasing=sum(key_transition == 1 & passes_threshold == T & delta_increasing == T),
transition_1_probes_passing_decreasing=sum(key_transition == 1 & passes_threshold == T & delta_increasing == F),
transition_2_probes_passing_increasing=sum(key_transition == 2 & passes_threshold == T & delta_increasing == T),
transition_2_probes_passing_decreasing=sum(key_transition == 2 & passes_threshold == T & delta_increasing == F),
transition_3_probes_passing_increasing=sum(key_transition == 3 & passes_threshold == T & delta_increasing == T),
transition_3_probes_passing_decreasing=sum(key_transition == 3 & passes_threshold == T & delta_increasing == F),
transition_4_probes_passing_increasing=sum(key_transition == 4 & passes_threshold == T & delta_increasing == T),
transition_4_probes_passing_decreasing=sum(key_transition == 4 & passes_threshold == T & delta_increasing == F),
transition_5_probes_passing_increasing=sum(key_transition == 5 & passes_threshold == T & delta_increasing == T),
transition_5_probes_passing_decreasing=sum(key_transition == 5 & passes_threshold == T & delta_increasing == F))
grouped_atac_df <- all_edgers_test_2 %>%
filter(passes_significance_threashold == T) %>%
group_by(gene) %>%
summarise(mean_tran_1_atac_fold=mean(transition_1_fold),
mean_tran_2_atac_fold=mean(transition_2_fold),
mean_tran_3_atac_fold=mean(transition_3_fold),
mean_tran_4_atac_fold=mean(transition_4_fold),
mean_tran_1_atac_fdr=mean(transition_1_FDR),
mean_tran_2_atac_fdr=mean(transition_2_FDR),
mean_tran_3_atac_fdr=mean(transition_3_FDR),
mean_tran_4_atac_fdr=mean(transition_4_FDR),
atac_chr=dplyr::first(seqnames))
atac_plotting_df <- all_edgers_test %>% filter(passes_significance_threashold == T & seqnames != "chrY") # & abs(delta) >= .5)
atac_plotting_df$chr_alt <- gsub("chr", "", atac_plotting_df$seqnames)
atac_plotting_df$chr_alt <- gsub("X", "23", atac_plotting_df$chr_alt)
atac_plotting_df$chr_alt <- as.numeric(atac_plotting_df$chr_alt)
atac_plotting_df <- atac_plotting_df %>% filter(!is.na(chr_alt))
chromosomes_df <- tibble(chromosome=c(1, 2, 3, 4, 5, 6, 7, 23, 8,
9, 11, 10, 12, 13, 14, 15,
16, 17, 18, 20, 19, 24, 22, 21),
size=c(248956422, 242193529, 198295559, 190214555,
181538259, 170805979, 159345973, 156040895,
145138636, 138394717, 135086622, 133797422,
133275309, 114364328, 107043718, 101991189,
90338345, 83257441, 80373285, 64444167, 58617616,
57227415, 50818468, 46709983)) %>% filter(chromosome != 24)
global_atac_changes <- ggplot(atac_plotting_df %>% mutate(most_significant_transition=case_when(most_significant_transition==4~5,
T~most_significant_transition)) %>% add_row(most_significant_transition=4)) +
geom_rect(data=chromosomes_df,
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
# geom_rect(aes(ymin=start, xmin=chr_alt-.4, ymax=end, xmax=chr_alt+.4, color=delta_increasing, alpha=abs(delta)), size=.1) +
geom_segment(aes(y=start, x=chr_alt-.4, yend=start, xend=chr_alt+.4, color=delta_increasing, alpha=abs(delta)), size=.3) +
facet_grid(rows = vars(most_significant_transition), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="empty",
"5"="Tran. 4/5")[n])
})) +
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(.01,.01)) +
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) +
scale_color_aaas(limits=c(F,T),labels=c("Decreasing", "Increasing"), name="Direction of Change") +
scale_fill_aaas(limits=c(F,T),labels=c("Decreasing", "Increasing"), name="Direction of Change") +
labs(x="Chromosome",
y="Position (Mb)",
title="Differential ATAC Peaks on all chromosomes") +
theme_bw() +
base_plot_theme +
coord_flip() +
scale_alpha(limits=c(0.5,4), oob=scales::squish, name="abs(log2FC)") +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
legend.box="vertical",
legend.box.just="right",
legend.box.spacing = unit(5, "mm"),
legend.spacing.y=unit(1, "mm"))
ggsave("figs/supplementary/supp_3_atac_nocutoff_rotated.pdf",global_atac_changes, width=174/3, height=236, units="mm")
for(t in 1:4) {
tran_5_density <- ggplot(plotting_df %>% filter(chr_alt != 23 & key_transition == 5)) +
geom_density(aes(x=abs(delta)), fill="black") +
theme_bw() +
base_plot_theme +
scale_y_continuous(breaks = c(0,10,20)) +
scale_x_continuous(breaks = c(0,.2,.4, .6)) +
labs(title="Autosomal β Density") +
theme(axis.ticks.length = unit(.4, "mm"),
plot.margin = margin(l=0,b=0,t=0.5,r=0.4, "mm"),
axis.title.y = element_text(size=3, margin = margin(t = 0, r = -0.2, b = 0, l = 0, "mm")),
axis.title.x = element_text(size=3,margin = margin(t = -0.2, r = 0, b = 0, l = 0, "mm")),
plot.title = element_text(size=3.5))
ggsave(sprintf("figs/supplementary/supp_3_tran_%d_density_atac.pdf", t), tran_5_density, width = 18, height=12, units = "mm")
}
dmp_atac_df <- right_join(grouped_atac_df, grouped_dmp_df, by=c("gene"="hg38_gene")) %>% filter(!is.na(atac_chr))
dmp_atac_df <- dmp_atac_df %>%
mutate(max_tran_meth_delta=case_when( max_transition==1 ~ cluster_1_mean_meth - cluster_0_mean_meth,
max_transition==2 ~ cluster_2_mean_meth - cluster_1_mean_meth,
max_transition==3 ~ cluster_3_mean_meth - cluster_2_mean_meth,
max_transition==4 ~ cluster_4_mean_meth - cluster_3_mean_meth,
max_transition==5 ~ cluster_5_mean_meth - cluster_4_mean_meth,
TRUE ~ 0),
tran_1_delta=cluster_1_mean_meth - cluster_0_mean_meth,
tran_2_delta=cluster_2_mean_meth - cluster_1_mean_meth,
tran_3_delta=cluster_3_mean_meth - cluster_2_mean_meth,
tran_45_delta=cluster_5_mean_meth - cluster_3_mean_meth)
for (i in 1:nrow(dmp_atac_df)) {
gene_meth_deltas <- unlist(dmp_atac_df[i, c("tran_1_delta", "tran_2_delta", "tran_3_delta", "tran_45_delta")])
gene_atac_deltas <- unlist(dmp_atac_df[i, c("mean_tran_1_atac_fold", "mean_tran_2_atac_fold", "mean_tran_3_atac_fold", "mean_tran_4_atac_fold")])
cor_test <- cor.test(gene_meth_deltas, gene_atac_deltas, method="spearman")
dmp_atac_df[i, "atac_meth_corr"] <- as.numeric(cor_test$estimate)
dmp_atac_df[i, "atac_meth_corr_pval"] <- as.numeric(cor_test$p.value)
}
concordant_count <- dmp_atac_df %>%
filter(max_transition > 0 &
atac_meth_corr < 0 &
atac_meth_corr_pval <= .1) %>% nrow()
library("ChIPseeker")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
library(GenomicRanges)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
edgers_peaks_granges <- makeGRangesFromDataFrame(all_edgers_test %>% filter(passes_significance_threashold == T & !(seqnames %in% c("chrX", "chrY")) & abs(delta) >= 1) %>% dplyr::select(chr=seqnames, start, end, length=width))
# edgers_peaks_granges_tran4_dec <- makeGRangesFromDataFrame(all_edgers_test %>% filter(passes_significance_threashold == T & !(seqnames %in% c("chrX", "chrY")) & abs(delta) >= 1 & most_significant_transition == 4 & delta_increasing == F) %>% dplyr::select(chr=seqnames, start, end, length=width))
annotated_peaks <- annotatePeak(edgers_peaks_granges, tssRegion=c(-3000, 3000), TxDb=txdb) %>% as.data.frame()
library(biomaRt)
hg38_mart <- useMart('ensembl', dataset='hsapiens_gene_ensembl')
biomart_genes <- getBM(mart=hg38_mart,
filters = c( "ensembl_transcript_id_version"),
values = list(ensembl_transcript_id_version=annotated_peaks$transcriptId),
attributes=c('external_gene_name', 'description', 'chromosome_name',
'ensembl_transcript_id',
'ensembl_transcript_id_version',
'ensembl_gene_id',
'ensembl_gene_id_version'))
annotated_peaks <- left_join(annotated_peaks,
biomart_genes,
by=c("transcriptId"="ensembl_transcript_id_version"))
annotated_edgers <- all_edgers_test %>%
filter(passes_significance_threashold == T & !(seqnames %in% c("chrX", "chrY")) & abs(delta) >= 1) %>%
left_join(annotated_peaks, by=c("seqnames", "start", "end"))
annotated_edgers_gene_collated <- annotated_edgers %>%
group_by(ensembl_gene_id_version,external_gene_name, most_significant_transition, delta_increasing) %>%
summarise(count=n(),
mean_delta=mean(delta))
grouped_concordance_df <- read_csv("grouped_concordance_df.csv.gz")
grouped_concordance_df$atac_concordant <- F
for(r in 1:nrow(grouped_concordance_df)) {
gene <- as.character(unlist(grouped_concordance_df[r, "gene"]))
chr <- paste("chr", as.character(unlist(grouped_concordance_df[r, "chr"])), sep="")
start_position <- as.numeric(unlist(grouped_concordance_df[r, "start_position"]))
end_position <- as.numeric(unlist(grouped_concordance_df[r, "end_position"]))
expr_key_transition <- as.numeric(unlist(grouped_concordance_df[r, "expr_key_transition"]))
expr_key_log2FC <- as.numeric(unlist(grouped_concordance_df[r, "expr_key_log2FC"]))
# filtered_atac <- annotated_edgers_gene_collated %>%
# filter(external_gene_name == gene & most_significant_transition == expr_key_transition & delta_increasing == (expr_key_log2FC > 0) & count > 0)
# grouped_concordance_df[r, "atac_concordant"] <- nrow(filtered_atac) > 0
filtered_atac <- all_edgers_test %>% filter(seqnames == chr & start <= (end_position+1e6) & end >= (start_position-1e6) & passes_significance_threashold == T & most_significant_transition == expr_key_transition)
grouped_concordance_df[r, "atac_concordant"] <- nrow(filtered_atac) > 0
}
# diff_expr_all_df <- read_csv("diff_expr_all_df.csv.gz")
diff_expr_all_df <- read_csv("diff_expr_all_df.csv.gz")
new_concordance_df <- diff_expr_all_df
new_concordance_df$atac_concordant <- F
new_concordance_df$meth_concordant <- F
for(r in 1:nrow(new_concordance_df)) {
gene <- as.character(unlist(new_concordance_df[r, "gene"]))
chr <- paste("chr", as.character(unlist(new_concordance_df[r, "chr"])), sep="")
start_position <- as.numeric(unlist(new_concordance_df[r, "start_position"]))
end_position <- as.numeric(unlist(new_concordance_df[r, "end_position"]))
expr_key_transition <- as.numeric(unlist(new_concordance_df[r, "key_transition"]))
expr_log2FC_increasing <- as.logical(unlist(new_concordance_df[r, "positive_delta"]))
expr_key_log2FC <- as.numeric(unlist(new_concordance_df[r, sprintf("transition_%d_log2FC", expr_key_transition)]))
new_concordance_df[r, "delta"] <- expr_key_log2FC
filtered_atac <- all_edgers_test %>% filter(seqnames == chr & start <= (end_position+2e3) & end >= (start_position-2e3) & passes_significance_threashold == T & most_significant_transition == expr_key_transition)
new_concordance_df[r, "atac_concordant"] <- nrow(filtered_atac) > 0
filtered_dmp <- grouped_dmp_df %>% filter(hg38_gene == gene)
if(nrow(filtered_dmp) > 0 & expr_key_transition < 4) {
sig_probes_in_key_transition <- as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_increasing", expr_key_transition)]))
if(expr_log2FC_increasing) {
sig_probes_in_key_transition <- as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_decreasing", expr_key_transition)]))
}
new_concordance_df[r, "meth_concordant"] <- sig_probes_in_key_transition > 0
}else if (nrow(filtered_dmp) > 0 & expr_key_transition == 4) {
sig_probes_in_key_transition <- as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_increasing", expr_key_transition)])) +
as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_increasing", 5)]))
if(expr_log2FC_increasing) {
sig_probes_in_key_transition <- as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_decreasing", expr_key_transition)])) +
as.numeric(unlist(filtered_dmp[1, sprintf("transition_%d_probes_passing_decreasing", 5)]))
}
new_concordance_df[r, "meth_concordant"] <- sig_probes_in_key_transition > 0
}
}
write_csv(new_concordance_df, "new_concordance_df.csv.gz")
write_csv(new_concordance_df %>% dplyr::select(ens_id, gene, chr, start_position, end_position, key_transition, passes_expression_threshold=passes_threshold, expression_increasing=positive_delta, atac_concordant, meth_concordant), "table_4_meth_atac_expr_concordance.csv")
# grouped_concordance_df_filtered <- grouped_concordance_df %>%
# filter((pval <= .1 & corr < 0 & max_probes != 0) | atac_concordant == T)
expr_plotting_df <- new_concordance_df %>% filter(passes_threshold == T & (atac_concordant == T | meth_concordant == T)) # diff_expr_all_df #%>% filter(passes_threshold == T)
expr_plotting_df$chr_alt <- gsub("X", "23", expr_plotting_df$chr)
expr_plotting_df$chr_alt <- as.numeric(expr_plotting_df$chr_alt)
chromosomes_df <- tibble(chromosome=c(1, 2, 3, 4, 5, 6, 7, 23, 8,
9, 11, 10, 12, 13, 14, 15,
16, 17, 18, 20, 19, 24, 22, 21),
size=c(248956422, 242193529, 198295559, 190214555,
181538259, 170805979, 159345973, 156040895,
145138636, 138394717, 135086622, 133797422,
133275309, 114364328, 107043718, 101991189,
90338345, 83257441, 80373285, 64444167, 58617616,
57227415, 50818468, 46709983)) %>% filter(chromosome != 24)
global_expr_changes <- ggplot(expr_plotting_df %>% mutate(key_transition=case_when(key_transition==4~5,
T~key_transition)) %>% add_row(key_transition=4)) +# %>% filter(abs(max_tran_log2FC) >= .5)) +
geom_rect(data=chromosomes_df,
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=delta >= 0, alpha=abs(delta)), size=.3) +
facet_grid(rows = vars(key_transition), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4",
"5"="Tran. 4/5")[n])
})) +
scale_alpha(limits=c(0,.5), oob=scales::squish, breaks=c(0,.25,.5), name="abs(log2FC)") +
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(.01,.01)) +
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) +
scale_color_aaas(limits=c(F, T),labels=c("Decreasing", "Increasing"), name="Direction of Expression Change") +
labs(x="Chromosome",
y="Position (Mb)",
title="Concordant Expression Changes on all chromosomes") +
theme_bw() +
base_plot_theme +
coord_flip() +
guides(alpha="legend", colours="legend") +
theme(strip.text.y = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.text.x = element_text(size=5, margin = margin(1,1.5,1,1.5, "mm")),
strip.background = element_rect(size = .3),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
legend.box="vertical",
legend.box.just="right",
legend.box.spacing = unit(5, "mm"),
legend.spacing.y=unit(1, "mm"))
ggsave("figs/supplementary/supp_3c_alt_meth_dir.pdf", global_expr_changes, width=174/3, height=236, units="mm")
global_expr_changes_tran45 <- ggplot(expr_plotting_df %>% filter(key_transition==4)) +# %>% filter(abs(max_tran_log2FC) >= .5)) +
geom_rect(data=chromosomes_df,
aes(xmin=chromosome-.4, xmax=chromosome+.4, ymin=0, ymax=size), fill="lightgrey")+
geom_segment(aes(y=start_position, x=chr_alt-.4, yend=start_position, xend=chr_alt+.4, color=delta >= 0, alpha=abs(delta)), size=.3) +
facet_grid(rows = vars(key_transition), labeller = labeller(.rows=function(n){
return(list("1"="Tran. 1",
"2"="Tran. 2",
"3"="Tran. 3",
"4"="Tran. 4/5",
"5"="Tran. 4/5")[n])
})) +
scale_x_reverse(breaks = 1:23, labels = c(1:22, "X"), minor_breaks = NULL, expand=c(0.01,0.01)) +
scale_y_continuous(expand=c(0,0), breaks = c(0,50e6,100e6,150e6,200e6,250e6), labels=c(0,50,100,150,200,250)) +
scale_color_aaas(limits=c(F, T),labels=c("Decreasing", "Increasing"), name="Direction of Expression Change") +
scale_alpha(limits=c(0,.5), oob=scales::squish, breaks=c(0,.25,.5), name="abs(log2FC)") +
labs(x="Chromosome",
y="Position (Mb)",
title="Concordant Expression Changes") +
theme_bw() +
base_plot_theme +
coord_flip() +
guides(alpha=guide_legend(order = 2), col=guide_legend(order = 1)) +
theme(strip.text.y = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")),
strip.text.x = element_text(size=5, margin = margin(0,0.5,0,.5, "mm")),
strip.background = element_rect(size = .3),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
legend.box="vertical",
legend.box.just="right",
legend.box.spacing = unit(3.5, "mm"),
legend.spacing.y=unit(.2, "mm"),
axis.text.y=element_text(size=5))
ggsave("figs/main/fig_4_global_changes_expr.pdf", global_expr_changes_tran45, width = 58, height=47, units = "mm")
# check for overlap with repeats
rmsk_df <- read_tsv("atac_rmsk_fischer/filtered_hg38_rmsk.bed", col_names = c("chr", "start", "end", "rep_name", "score", "strand"))
all_edgers_test <- read_csv("all_edgers_test.csv.gz")
library(GenomicRanges)
rmsk_grange <- makeGRangesFromDataFrame(rmsk_df %>% select(-strand), keep.extra.columns = T, ignore.strand = T)
sig_edgers_test_grange <- makeGRangesFromDataFrame(all_edgers_test %>% filter(passes_significance_threashold == T), keep.extra.columns = T, ignore.strand = T)
overlapping_rmsks <- subsetByOverlaps(rmsk_grange, sig_edgers_test_grange)
overlapping_rmsks_df <- as.data.frame(overlapping_rmsks)
overlapping_repnames <- unique(overlapping_rmsks_df$rep_name)
sorted_chrom_sizes <- read_tsv("atac_rmsk_fischer/hg38.chrom.sizes.sorted.txt", col_names = c("chr", "size"))
rmsk_df %>%
filter(rep_name %in% overlapping_repnames & chr %in% sorted_chrom_sizes$chr) %>%
group_by(rep_name) %>%
arrange(chr, start) %>%
group_walk(~write_tsv(.x, sprintf("atac_rmsk_fischer/rep_beds/%s_rmsk.bed", .y$rep_name), col_names = F))
all_edgers_test %>%
filter(passes_significance_threashold == T & seqnames %in% sorted_chrom_sizes$chr) %>%
group_by(delta_increasing, most_significant_transition) %>%
arrange(seqnames, start) %>%
group_walk(~write_tsv(.x %>% select(seqnames, start, end), sprintf("atac_rmsk_fischer/edgers_atac_beds/T%d_up_%s_edgers.bed", .y$most_significant_transition, .y$delta_increasing), col_names = F))
fisher_output_df <- read_csv("atac_rmsk_fischer/fisher_output.csv")
filtered_fisher_df <- fisher_output_df %>% filter(right_pval<.01) %>%
mutate(delta_increasing_mod=case_when(delta_increasing==T ~ 1,
delta_increasing == F ~ -1))
expression_hervs <- c("LTR1",
"LTR2C",
"LTR40a",
"LTR40b",
"LTR5_Hs",
"LTR5_Hs",
"LTR5",
"LTR5_Hs",
"LTR5B",
"LTR6A",
"LTR7",
"LTR7Y",
"MER50",
"LTR1",
"LTR2C",
"LTR40a",
"LTR40b",
"LTR5_Hs",
"LTR5_Hs",
"LTR5",
"LTR5_Hs",
"LTR5B",
"LTR6A",
"LTR7",
"LTR7Y",
"MER50")
ltr_pal <- c("#fdfdfd", "#1d1d1d", "#ebce2b", "#702c8c", "#db6917", "#96cde6", "#ba1c30", "#c0bd7f", "#7f7e80", "#5fa641", "#d485b2", "#4277b6", "#df8461", "#463397", "#e1a11a", "#91218c", "#e8e948", "#7e1510", "#92ae31", "#6f340d", "#d32b1e", "#2b3514")
plotting_fisher_df <- filtered_fisher_df %>%
filter(rep %in% expression_hervs) %>%
mutate(sig_marker=case_when(right_pval < .0001 ~ "***",
right_pval < .001 ~ "**",
right_pval < .01 ~ "*")) %>%
arrange(factor(rep, levels = c("LTR5_Hs", "LTR7", "LTR7Y", "LTR1", "LTR2C", "LTR6A", "MER50"))) %>%
mutate(rep=factor(rep, levels = c("LTR5_Hs", "LTR7", "LTR7Y", "LTR1", "LTR2C", "LTR6A", "MER50")),
lab_ypos = cumsum(delta_increasing_mod*overlaps) - 0.5 * overlaps)
library(shadowtext)
atac_fisher_plot <- ggplot(plotting_fisher_df, aes(x=transition, y=delta_increasing_mod*overlaps, fill=rep)) +
geom_bar(stat="identity") +
# geom_text(data=plotting_fisher_df, aes(label = rep),position = position_stack(vjust = 0.5), size = 1.) +
geom_text(data=plotting_fisher_df, aes(label = paste(rep,sig_marker, sep="")),position = position_stack(vjust = 0.5), size = 1.3) +
scale_fill_manual(values=ltr_pal[c(6,8,10,3,7,22,13)],
limits=c("LTR5_Hs",
"LTR7",
"LTR7Y",
"LTR1",
"LTR2C",
"LTR6A",
"MER50"), guide=F) +
scale_x_discrete(limits=1:4, labels=c("1", "2", "3", "4/5")) +
theme_bw() +
base_plot_theme +
labs(title="Repeats Overlapped by\nDiff ATAC Peaks",
y="Repeat Overlap",
x="Transition")
ggsave("figs/main/fig4_atac_fisher_plot.pdf", atac_fisher_plot, width=40, height = 61, units = "mm")