# make_figures.R


## ---------------------------
##
## Purpose of script: Generate figures for thesis
##
## Author: Prakhar Bansal
##
## Date Created: 2022-03-27
##
## Copyright (c) Prakhar Bansal, 2022
## Email: pbansal@uchc.edu
##
## ---------------------------
##
## Notes:
##
##
## ---------------------------

## set working directory
# Relative paths below (outputs/, input_data/, revision_figs/, ...) are resolved
# against the folder that holds this script. The RStudio API is only reachable
# from inside an RStudio session, so elsewhere (e.g. Rscript) the call is
# skipped and the caller's working directory is kept instead of aborting.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getSourceEditorContext()$path))
}

## ---------------------------

## load up the packages we will need:  (uncomment as required)
# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later at the first unresolved function.
library(tidyverse)
# Default ggplot2 theme for every figure produced below: classic theme with
# 9 pt sans text, bold centred titles, compact legends, no axis line and a
# thin black panel border.
# NOTE(review): ggplot2 >= 3.4 prefers `linewidth` over `size` for
# element_line()/element_rect(); left as-is to avoid raising the minimum
# ggplot2 version.
theme_set(
  theme_classic() + 
    theme(
      text=element_text(size=9, family="sans"),
      plot.title=element_text(hjust=0.5, size=10, family = "sans", face="bold"),
      axis.title = element_text(size=9, face="bold"),
      axis.text = element_text(size=9),
      legend.spacing.x = unit(0, "cm"),
      legend.spacing.y = unit(1, "mm"),
      legend.key.height = unit(4, "mm"),
      legend.margin = margin(0,0,0,0),
      axis.line = element_line(size=0),
      strip.background = element_blank(),
      strip.text = element_text(margin = margin(1,1,1,1, unit="mm")),
      panel.border = element_rect(color="black", fill=NA, size=.5)
    )
)
library(ggsci)
library(ggdendro)
library(ComplexHeatmap)
library(rstatix)
library(ggpubr)
## ---------------------------

# Run configuration: the analysis output folder to read from, plus the
# comparison names and cutoffs used by the figure sections.
output_dir <- "outputs/2024-02-17 21-39-53.641348 timepoint+dox_status+effective_genotype"
diff_expr_compare_to <- "T21_vs_D21"
diff_expr_comparisons <- c("dox_effect", "dT21XIST_vs_T21", "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21")
diff_expr_combined_padj_cutoff <- .1
gsea_compare_to <- "T21_vs_D21"
gsea_comparisons <- c("dox_effect", "dT21XIST_vs_T21", "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21")
chr21_changes_comparisons <- c("dT21XIST_vs_T21", "wdT21XIST_vs_T21")
gsea_combined_padj_cutoff <- .2
lol_cutoff_primary <- .1
lol_cutoff_overlap <- .1

# Figures and tables go into per-run folders named after the run
# (output_dir without its leading "outputs/").
run_label <- sub("outputs/", "", output_dir, fixed = TRUE)
fig_output_dir <- file.path("revision_figs", run_label)
table_output_dir <- file.path("revision_tables", run_label)

# recursive = TRUE: the parent folders (revision_figs/, revision_tables/) may
# not exist yet, in which case a plain dir.create() fails and every later
# ggsave()/write_csv() errors. showWarnings = FALSE: re-running the script
# with the folders already present is expected, not a problem.
dir.create(fig_output_dir, recursive = TRUE, showWarnings = FALSE)
dir.create(table_output_dir, recursive = TRUE, showWarnings = FALSE)

# Input tables: gene annotation, sample sheet and batch-corrected VST counts.
gene_info_df <- read_csv("input_data/gene_info_df.csv.gz")
sample_sheet_df <- read_csv(paste0(output_dir, "/tables/sample_sheet_df.csv.gz"))
batch_corrected_vsd_df <- read_csv(paste0(output_dir, "/tables/batch_corrected_vsd_salmon_df.csv.gz"))

# The folder of the neural run is stored as the "neural_output_dir" row of
# this run's params table; take the first matching value.
neural_output_dir <- (read_csv(paste0(output_dir, "/params.csv.gz")) |>
                        filter(parameter == "neural_output_dir") |>
                        pull(value))[1]
neural_results_nonrtta_T21_vs_D21 <- read_csv(paste0(neural_output_dir, "/tables/results_neural_t21_d21.csv.gz"))
neural_combined_results_df <- read_csv(paste0(neural_output_dir, "/tables/combined_results_df.csv.gz"))

# This run's results merged with the neural results and gene annotation.
# The results table is read once, here (an earlier duplicate read whose result
# was immediately overwritten has been dropped).
# After the full join, genes missing from one side are treated as
# "not significant, no change": missing p-values become 1, missing fold
# changes and test statistics become 0.
# NOTE(review): full_join() has no `by`, so it joins on every column name the
# two tables share — confirm that is only the gene identifier.
combined_results_df <- read_csv(paste0(output_dir, "/tables/combined_results_df.csv.gz")) |>
  full_join(neural_combined_results_df) |>
  mutate(
    across(starts_with(c("padj", "pvalue")), \(x) if_else(is.na(x), 1, x)),
    across(starts_with(c("log2FoldChange", "stat")), \(x) if_else(is.na(x), 0, x))
  ) |>
  left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version"))

write_csv(combined_results_df, paste0(table_output_dir, "/deseq2_results.csv"))

# Data S1: gene annotation plus the result columns of the listed comparisons,
# with internal comparison names rewritten to their published names.
data_s1_suffixes <- c(
  "T21_vs_D21", "D21_vs_T21", "dT21XIST_vs_T21",
  "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21",
  "neural_t21_d21", "neural_t21xistdox_t21", "neural_t21xistwd_t21", "neural_t21xistdoxcombined_t21"
)
# Applied in this order to every column name.
data_s1_renames <- c(
  "dwdT21SIT_vs_T21" = "combinedT21XIST_vs_T21",
  "neural_t21_d21" = "neural_T21_vs_D21",
  "neural_t21xistdox_t21" = "neural_dT21XIST_vs_T21",
  "neural_t21xistwd_t21" = "neural_wdT21XIST_vs_T21",
  "neural_t21xistdoxcombined_t21" = "neural_combinedT21XIST_vs_T21"
)

data_s1_df <- dplyr::select(
  combined_results_df,
  ensembl_id, hgnc_symbol, chromosome_name, start_position, gene_biotype,
  ends_with(data_s1_suffixes)
)
data_s1_df <- rename_with(data_s1_df, \(col_name) str_replace_all(col_name, data_s1_renames))
write_csv(data_s1_df, paste0(table_output_dir, "/Data_S1_bulk_deseq2.csv.gz"))

######################## Diff Expr 198-1 and 198-2 #############################

# Fitted DESeq2 object saved by the analysis run, and its variance-stabilised
# counts. The function is namespace-qualified because DESeq2 is not attached
# by the library() calls at the top of this script.
dds <- readRDS(sprintf("%s/tables/dds.rds", output_dir))
vsd <- DESeq2::varianceStabilizingTransformation(dds, blind = FALSE)

# message() always reaches the console; a bare top-level sprintf() only
# returns the string and shows nothing when the script is source()d.
message("Outputting DESeq2 Results...")

# One entry per comparison: two sample-selection expressions (kept as strings)
# followed by a third string that is handed to write_results() unchanged.
diff_exprs <- list(
  "1981_vs_1982" = c(
    'cell_line == "198_1"',
    'cell_line == "198_2"',
    "timepoint|dox_status")
)

# NOTE(review): neither write_results() nor `outdir` is defined in the visible
# part of this script (only `output_dir` is) — confirm both are provided
# elsewhere, or whether `output_dir` was intended here.
for (de_name in names(diff_exprs)) {
  de_spec <- diff_exprs[[de_name]]
  write_results(de_name,
                de_spec[1],
                de_spec[2],
                dds,
                outdir,
                de_spec[3])
}

# MA plot between D21 and T21XIST
vsd_salmon_df <- read_csv(paste0(output_dir, "/tables/vsd_salmon_df.csv.gz"))

# One row per gene x sample, annotated with the sample sheet.
vsd_salmon_annotated_df <- left_join(
  pivot_longer(vsd_salmon_df,
               cols = -ensembl_id,
               names_to = "sample_id",
               values_to = "vst_count"),
  sample_sheet_df,
  by = "sample_id"
)

# Per-gene mean VST count over D21 samples and T21XIST samples whose
# dox_status is anything other than "no".
mean_d21_t21xist_vst_df <- vsd_salmon_annotated_df |>
  filter(genotype == "D21" | (genotype == "T21XIST" & dox_status != "no")) |>
  group_by(ensembl_id) |>
  summarise(mean_vst_count = mean(vst_count))


# Long-format VST counts (one row per gene x sample) with sample metadata,
# chromosome and the group label used by the ANOVAs below. T21XIST samples
# count as "T21XIST" only when dox_status is not "no"; all other T21XIST
# samples get no group and never enter a comparison.
anova_long_df <- vsd_salmon_df |>
  pivot_longer(cols = -ensembl_id,
               names_to = "sample_id",
               values_to = "vst_count") |>
  left_join(sample_sheet_df, by = "sample_id") |>
  left_join(
    gene_info_df |> dplyr::select(ensembl_id = ensembl_gene_id_version, chromosome_name),
    by = "ensembl_id"
  ) |>
  mutate(group = case_when(
    genotype == "D21" ~ "D21",
    genotype == "T21" ~ "T21",
    genotype == "T21XIST" & dox_status != "no" ~ "T21XIST"
  ))

# One-way ANOVA of VST count on group, pooled over all genes, restricted to
# the samples of the groups named in `anova_groups`.
# With include_chr21 = FALSE, chr21 genes (and genes without a chromosome
# annotation, which the `!=` filter also drops) are excluded first.
# Returns the broom::tidy() table of the fit.
tidy_group_anova <- function(anova_groups, include_chr21 = TRUE) {
  anova_df <- anova_long_df |> filter(group %in% anova_groups)
  if (!include_chr21) {
    anova_df <- anova_df |> filter(chromosome_name != "21")
  }
  aov(vst_count ~ group, data = anova_df) |>
    broom::tidy()
}

# The `group` term rows of the three pairwise ANOVAs, labelled by comparison.
collect_anova_terms <- function(d21_t21xist_df, t21_t21xist_df, t21_d21_df) {
  bind_rows(
    d21_t21xist_df |> mutate(comparison = "T21XIST vs D21"),
    t21_t21xist_df |> mutate(comparison = "T21XIST vs T21"),
    t21_d21_df |> mutate(comparison = "T21 vs D21")
  ) |>
    filter(term == "group")
}

# Bar chart of the ANOVA statistic per comparison with the p-value above
# each bar; `y_max` is the upper limit of the y axis.
plot_anova_statistic <- function(anova_terms_df, plot_title, y_max) {
  ggplot(anova_terms_df, aes(x = comparison, y = statistic)) +
    geom_col() +
    labs(title = plot_title, x = "Groups Compared") +
    geom_text(aes(label = sprintf("p = %.1g", p.value)), vjust = -0.2, size = 3) +
    scale_y_continuous(limits = c(0, y_max)) +
    theme(
      axis.text.x = element_text(angle = 30, hjust = 1),
      axis.title.x = element_blank()
    )
}

# All genes, chr21 included (same objects and names as before).
aov_d21_t21xist_df <- tidy_group_anova(c("D21", "T21XIST"))
aov_t21_d21_df <- tidy_group_anova(c("D21", "T21"))
aov_t21_t21xist_df <- tidy_group_anova(c("T21", "T21XIST"))

anova_withchr21_plot <- plot_anova_statistic(
  collect_anova_terms(aov_d21_t21xist_df, aov_t21_t21xist_df, aov_t21_d21_df),
  plot_title = "ANOVA Test Statistic w/ chr21",
  y_max = 2.4
)
ggsave(sprintf("%s/anova_statistic_withchr21.pdf", fig_output_dir),
       plot = anova_withchr21_plot, width = 2, height = 2)

# chr21 excluded. Previously the with-chr21 plot was written to both file
# names, so anova_statistic.pdf did not show what its name says. Title and
# y limit are the alternative settings that had been left commented out.
anova_nochr21_plot <- plot_anova_statistic(
  collect_anova_terms(
    tidy_group_anova(c("D21", "T21XIST"), include_chr21 = FALSE),
    tidy_group_anova(c("T21", "T21XIST"), include_chr21 = FALSE),
    tidy_group_anova(c("D21", "T21"), include_chr21 = FALSE)
  ),
  plot_title = "ANOVA Test Statistic",
  y_max = 1
)
ggsave(sprintf("%s/anova_statistic.pdf", fig_output_dir),
       plot = anova_nochr21_plot, width = 2, height = 2)


# Interactive MA plot (mean VST count vs log2 fold change) for
# dwdT21SIT_vs_D21, protein-coding genes only, coloured by direction at
# padj <= .1. The `name` and `biotype` aesthetics are not drawn; they are
# extra mappings carried along with each point.
# plotly and dplyr are namespace-qualified: plotly is not attached by this
# script, and the rest of the file calls dplyr::select() explicitly.
plotly::ggplotly(
  combined_results_df |>
    # filter(!(gene_biotype %in% c("rRNA", "TEC")) & !is.na(gene_biotype))|>
    # %in% is never NA, so genes without a biotype are dropped here as well.
    filter(gene_biotype %in% c("protein_coding")) |>
    dplyr::select(ensembl_id, chromosome_name, hgnc_symbol, gene_biotype,
                  ends_with("_dwdT21SIT_vs_D21")) |>
    left_join(mean_d21_t21xist_vst_df, by = "ensembl_id") |>
    mutate(colorby = case_when(
      padj_dwdT21SIT_vs_D21 > .1 ~ "nonDE",
      padj_dwdT21SIT_vs_D21 <= .1 & log2FoldChange_dwdT21SIT_vs_D21 > 0 ~ "upReg",
      padj_dwdT21SIT_vs_D21 <= .1 & log2FoldChange_dwdT21SIT_vs_D21 < 0 ~ "downReg"
    )) |>
    ggplot(aes(x = mean_vst_count, y = log2FoldChange_dwdT21SIT_vs_D21,
               color = colorby, name = ensembl_id, biotype = gene_biotype)) +
    geom_point() +
    scale_color_manual(values = c("blue", "grey", "red"),
                       limits = c("downReg", "nonDE", "upReg"))
)

# MA plot between D21 and T21
# Re-read so this section can be run on its own.
vsd_salmon_df <- read_csv(paste0(output_dir, "/tables/vsd_salmon_df.csv.gz"))

# Per-gene mean VST count over D21 and T21 samples.
# NOTE(review): this overwrites the D21/T21XIST means stored under the same
# name by the previous section; the name is kept in case later code reads it.
mean_d21_t21xist_vst_df <- vsd_salmon_df |> 
  pivot_longer(cols = -ensembl_id,
               names_to = "sample_id",
               values_to = "vst_count") |> 
  left_join(sample_sheet_df, by="sample_id") |> 
  filter(genotype == "D21" | (genotype == "T21")) |> 
  group_by(ensembl_id) |> 
  summarise(mean_vst_count = mean(vst_count))

# Interactive MA plot for T21_vs_D21, protein-coding genes only, coloured by
# direction at padj <= .1. plotly and dplyr are namespace-qualified: plotly is
# not attached by this script, and the rest of the file calls dplyr::select()
# explicitly.
plotly::ggplotly(
  combined_results_df |>
    # filter(!(gene_biotype %in% c("rRNA", "TEC")) & !is.na(gene_biotype))|>
    # %in% is never NA, so genes without a biotype are dropped here as well.
    filter(gene_biotype %in% c("protein_coding")) |>
    dplyr::select(ensembl_id, chromosome_name, hgnc_symbol, gene_biotype,
                  ends_with("_T21_vs_D21")) |>
    left_join(mean_d21_t21xist_vst_df, by = "ensembl_id") |>
    mutate(colorby = case_when(
      padj_T21_vs_D21 > .1 ~ "nonDE",
      padj_T21_vs_D21 <= .1 & log2FoldChange_T21_vs_D21 > 0 ~ "upReg",
      padj_T21_vs_D21 <= .1 & log2FoldChange_T21_vs_D21 < 0 ~ "downReg"
    )) |>
    ggplot(aes(x = mean_vst_count, y = log2FoldChange_T21_vs_D21,
               color = colorby, name = ensembl_id, biotype = gene_biotype)) +
    geom_point() +
    scale_color_manual(values = c("blue", "grey", "red"),
                       limits = c("downReg", "nonDE", "upReg"))
)

################################################################################

# Gene counts per comparison, broken down by chr21 membership, sign of the
# log2 fold change and significance at padj <= .01.
diff_expr_count_suffixes <- c(
  "D21_vs_T21", "dT21XIST_vs_T21", "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21",
  "T21_vs_D21", "dT21XIST_vs_D21", "wdT21XIST_vs_D21", "dwdT21SIT_vs_D21"
)

# Reshape to one row per gene x comparison: the text before the first "_" of
# a result column is the value type, the rest is the comparison name.
diff_expr_count_long_df <- combined_results_df |>
  pivot_longer(
    cols = c(tidyselect::ends_with(diff_expr_count_suffixes)),
    names_to = c("value_type", "comparison"),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)"
  ) |>
  pivot_wider(
    id_cols = c(ensembl_id, comparison),
    names_from = value_type,
    values_from = value
  ) |>
  left_join(
    dplyr::select(gene_info_df, ensembl_gene_id_version, chromosome_name),
    by = c("ensembl_id" = "ensembl_gene_id_version")
  ) |>
  # Missing padj counts as not significant; a missing chromosome becomes the
  # literal string "NA" so the chr21 test below is never NA.
  mutate(
    padj = replace_na(padj, 1),
    chromosome_name = replace_na(chromosome_name, "NA")
  )

diff_expr_count_long_df |>
  dplyr::count(comparison, chromosome_name == "21", log2FoldChange > 0, padj <= .01) |>
  pivot_wider(names_from = comparison, values_from = n) |>
  arrange(`padj <= 0.01`, desc(`chromosome_name == "21"`), desc(`log2FoldChange > 0`)) |>
  dplyr::select(
    `padj <= 0.01`, `chromosome_name == "21"`, `log2FoldChange > 0`,
    T21_vs_D21, dT21XIST_vs_D21, dwdT21SIT_vs_D21,
    dT21XIST_vs_T21, dwdT21SIT_vs_T21
  ) |>
  write_csv("revision_figs/diff_expr_counts.csv")


# Number of significant genes (padj <= .1) per comparison, split by chr21
# membership and by sign of the log2 fold change; long format for plotting.
updown_suffixes <- c(
  "dT21XIST_vs_T21", "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21",
  "T21_vs_D21"
)

# One row per gene x comparison (value type = text before the first "_").
updown_long_df <- combined_results_df |>
  pivot_longer(
    cols = c(tidyselect::ends_with(updown_suffixes)),
    names_to = c("value_type", "comparison"),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)"
  ) |>
  pivot_wider(
    id_cols = c(ensembl_id, comparison),
    names_from = value_type,
    values_from = value
  ) |>
  left_join(
    dplyr::select(gene_info_df, ensembl_gene_id_version, chromosome_name),
    by = c("ensembl_id" = "ensembl_gene_id_version")
  ) |>
  mutate(
    padj = replace_na(padj, 1),
    chromosome_name = replace_na(chromosome_name, "NA")
  )

# Count, then spread by comparison to fix the comparison set and row order.
updown_wide_df <- updown_long_df |>
  dplyr::count(comparison, chromosome_name == "21", log2FoldChange > 0, padj <= .1) |>
  pivot_wider(names_from = comparison, values_from = n) |>
  arrange(`padj <= 0.1`, desc(`chromosome_name == "21"`), desc(`log2FoldChange > 0`)) |>
  dplyr::select(
    `padj <= 0.1`,
    `chromosome_name == "21"`,
    `log2FoldChange > 0`,
    T21_vs_D21, dT21XIST_vs_T21, wdT21XIST_vs_T21, dwdT21SIT_vs_T21
  )

# Back to long format, significant rows only, with a readable chr21 label.
gene_updown_df <- updown_wide_df |>
  pivot_longer(
    cols = -c(`padj <= 0.1`, `chromosome_name == "21"`, `log2FoldChange > 0`),
    names_to = "comparison",
    values_to = "num_genes"
  ) |>
  filter(`padj <= 0.1`) |>
  mutate(`chromosome_name == "21"` = if_else(
    `chromosome_name == "21"` %in% TRUE, "chr21", "non chr21"
  ))

# Two-sided binomial test (null p = 0.5) of up- vs down-regulated gene counts
# for every comparison x chr21 group.
# The `TRUE` / `FALSE` columns hold the number of genes with log2FoldChange > 0
# and <= 0 respectively.
binom_test_data_df <- gene_updown_df |> 
  pivot_wider(names_from = `log2FoldChange > 0`, values_from = "num_genes") |> 
  mutate(
    # A direction with no genes arrives as NA (that combination never occurred
    # in the count table); it means zero genes, and binom.test() rejects NA.
    across(c(`TRUE`, `FALSE`), \(n_genes) replace_na(n_genes, 0L)),
    total = `TRUE` + `FALSE`,
    binom_pval = purrr::map2_dbl(
      `TRUE`, total,
      \(n_up, n_total) {
        # binom.test() needs at least one trial; nothing to test otherwise.
        if (n_total == 0) {
          return(NA_real_)
        }
        binom.test(x = n_up, n = n_total, alternative = "two.sided")$p.value
      }
    )
  )

# Dodged bars of up/down gene counts per comparison, one facet row per chr21
# group, with the binomial-test p-value printed above each pair of bars.
binom_label_df <- mutate(binom_test_data_df, max_count = pmax(`TRUE`, `FALSE`))
updown_comparison_order <- c("T21_vs_D21", "dT21XIST_vs_T21", "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21")
updown_comparison_labels <- c("T21 vs D21", "+dox vs T21", "w/d vs T21", "combined vs T21")

ggplot(gene_updown_df) +
  geom_col(
    mapping = aes(x = comparison, y = num_genes, fill = `log2FoldChange > 0`),
    position = position_dodge()
  ) +
  facet_grid(rows = vars(`chromosome_name == "21"`), scales = "free_y") +
  # Label sits at 1.2x the taller bar of each pair.
  geom_text(
    data = binom_label_df,
    mapping = aes(x = comparison, y = max_count * 1.2,
                  label = sprintf("%.2g", binom_pval), vjust = 1),
    size = 3
  ) +
  scale_fill_aaas() +
  scale_x_discrete(limits = updown_comparison_order, labels = updown_comparison_labels) +
  theme(axis.title.x = element_blank()) +
  labs(title = "Differential Expression by Chromosome", y = "# Genes")

# Saves the plot built just above (ggsave() defaults to the last plot).
ggsave(sprintf("%s/diff_expr_by_chr.pdf", fig_output_dir), width = 4, height = 3)


# Density of padj for T21XIST vs T21 and T21XIST vs D21
# (the dwdT21SIT_* comparisons), one facet row for chr21 and one for all
# other genes. Built without an x scale so the same plot can be written on
# both a log10 and a linear axis.
padj_density_plot <- combined_results_df |> 
  pivot_longer(
    cols = c(tidyselect::ends_with(c(
      "D21_vs_T21", "dT21XIST_vs_T21", "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21",
      "T21_vs_D21", "dT21XIST_vs_D21", "wdT21XIST_vs_D21", "dwdT21SIT_vs_D21"))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |> 
  pivot_wider(id_cols = c(ensembl_id, comparison),
              names_from = value_type,
              values_from = value) |> 
  left_join(gene_info_df |> dplyr::select(ensembl_gene_id_version, chromosome_name), 
            by=c("ensembl_id"="ensembl_gene_id_version")) |> 
  # Missing padj counts as not significant; a missing chromosome becomes the
  # literal string "NA" and therefore lands in the "non chr21" facet.
  mutate(padj = case_when(is.na(padj) ~ 1,
                          TRUE ~ padj),
         chromosome_name = case_when(is.na(chromosome_name) ~ "NA",
                                     TRUE ~ chromosome_name),
         is21 = case_when(chromosome_name == "21" ~ "chr21",
                          TRUE ~ "non chr21")) |> 
  filter(comparison %in% c("dwdT21SIT_vs_T21", "dwdT21SIT_vs_D21")) |> 
  ggplot() +
  geom_density(aes(x=padj, color = comparison)) +
  facet_grid(rows = vars(is21)) +
  scale_color_aaas(breaks = c("dwdT21SIT_vs_T21", "dwdT21SIT_vs_D21"), labels = c("T21XIST vs T21", "T21XIST vs D21")) +
  labs(title="Distribution of padj")

# Log10 x axis; values below 1e-4 are squished onto the axis limit.
ggsave(sprintf("%s/2_dist_padj.png", fig_output_dir),
       plot = padj_density_plot +
         scale_x_log10(limits=c(1e-4, 1e0), oob = scales::squish))
# Linear x axis (ggplot2 default). Previously this file was a second copy of
# the log10 plot.
ggsave(sprintf("%s/2_dist_padj_linear.png", fig_output_dir),
       plot = padj_density_plot)

# Density of log2 fold change among significant genes (padj <= .01) for the
# two dwdT21SIT_* comparisons, faceted into chr21 and non-chr21 genes.
log2fc_density_suffixes <- c(
  "D21_vs_T21", "dT21XIST_vs_T21", "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21",
  "T21_vs_D21", "dT21XIST_vs_D21", "wdT21XIST_vs_D21", "dwdT21SIT_vs_D21"
)
log2fc_density_comparisons <- c("dwdT21SIT_vs_T21", "dwdT21SIT_vs_D21")

# One row per gene x comparison (value type = text before the first "_").
log2fc_density_df <- combined_results_df |>
  pivot_longer(
    cols = c(tidyselect::ends_with(log2fc_density_suffixes)),
    names_to = c("value_type", "comparison"),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)"
  ) |>
  pivot_wider(
    id_cols = c(ensembl_id, comparison),
    names_from = value_type,
    values_from = value
  ) |>
  left_join(
    dplyr::select(gene_info_df, ensembl_gene_id_version, chromosome_name),
    by = c("ensembl_id" = "ensembl_gene_id_version")
  ) |>
  mutate(
    padj = replace_na(padj, 1),
    chromosome_name = replace_na(chromosome_name, "NA"),
    is21 = if_else(chromosome_name == "21", "chr21", "non chr21")
  ) |>
  filter(comparison %in% log2fc_density_comparisons & padj <= .01)

# Ticks every 0.2 on both sides of zero; values beyond +/-1.5 are squished
# onto the axis limits.
log2fc_tick_steps <- seq(0, 15, by = 2)

ggplot(log2fc_density_df) +
  geom_density(aes(x = log2FoldChange, color = comparison)) +
  facet_grid(rows = vars(is21)) +
  geom_vline(xintercept = 0) +
  scale_color_aaas(
    breaks = log2fc_density_comparisons,
    labels = c("T21XIST vs T21", "T21XIST vs D21")
  ) +
  scale_x_continuous(
    limits = c(-1.5, 1.5),
    breaks = c(log2fc_tick_steps / 10, log2fc_tick_steps / -10),
    oob = scales::squish
  ) +
  labs(title = "Distribution of significant log2FC (padj <= .01)")
ggsave(sprintf("%s/2_dist_log2fc_01.png", fig_output_dir))

# Copy the neural run's gsea_*csv.gz tables next to this run's tables
# (overwriting older copies) so that all GSEA results are read from one folder.
neural_tables_dir <- sprintf("%s/tables", neural_output_dir)
# "+" and "," in the folder name are backslash-escaped before the name is
# used inside the glob.
neural_dir_escaped <- str_replace_all(neural_output_dir, "\\+", "\\\\+")
neural_dir_escaped <- str_replace_all(neural_dir_escaped, "\\,", "\\\\,")
neural_gsea_files <- fs::dir_ls(
  path = neural_tables_dir,
  glob = sprintf("%s/tables/gsea_*csv.gz", neural_dir_escaped)
)
file.copy(neural_gsea_files,
          sprintf("%s/tables", output_dir),
          overwrite = TRUE)

# All gsea_*csv.gz tables of this run (including the neural ones copied in
# above). "+" and "," in the folder name are backslash-escaped for the glob.
gsea_files <- fs::dir_ls(
  path = sprintf("%s/tables", output_dir),
  glob = sprintf(
    "%s/tables/gsea_*csv.gz",
    output_dir |>
      str_replace_all("\\+", "\\\\+") |>
      str_replace_all("\\,", "\\\\,")
  )
)
print(gsea_files)

# Extract one percentage from the `leading_edge` strings, which are split on
# ", " into three fields ("tags=..%", "list=..%", "signal=..%"), and return it
# as a fraction. `position` is the field index, `label` its "name=" prefix.
leading_edge_fraction <- function(leading_edge, position, label) {
  field <- str_split(leading_edge, ", ", simplify = TRUE)[, position]
  as.numeric(str_replace_all(field, sprintf("%s=|%%", label), "")) / 100
}

# One row per gene set (ID) with one column per statistic x comparison.
combined_enrichment_df <- read_csv(gsea_files,
                                   id = "origin_file") |>
  mutate(
    # Comparison name = file name without the "gsea_" prefix and ".csv.gz"
    # suffix. Working on basename() avoids having to regex-escape the folder.
    comparison = str_remove_all(basename(origin_file), "^gsea_|\\.csv\\.gz$"),
    leadingEdgeTags = leading_edge_fraction(leading_edge, 1, "tags"),
    leadingEdgeList = leading_edge_fraction(leading_edge, 2, "list"),
    leadingEdgeSignal = leading_edge_fraction(leading_edge, 3, "signal"),
    # core_enrichment is a "/"-separated gene list, so genes = separators + 1.
    # (str_count() on the str_split() list did not count genes.)
    numCoreEnrichment = str_count(core_enrichment, "/") + 1
  ) |>
  pivot_wider(id_cols = ID, names_from = comparison, values_from = c(NES, p.adjust, qvalue, enrichmentScore, setSize, leadingEdgeTags, leadingEdgeList, leadingEdgeSignal, numCoreEnrichment)) |>
  # Gene sets absent from a comparison count as not enriched there:
  # p.adjust / qvalue 1, NES / enrichment score 0. The widened columns are
  # named qvalue_<comparison>, so the prefix is "qvalue" ("qvalues" matched
  # nothing and left those NAs in place).
  mutate(
    across(starts_with(c("p.adjust", "qvalue")), \(x) if_else(is.na(x), 1, x)),
    across(starts_with(c("NES", "enrichmentScore")), \(x) if_else(is.na(x), 0, x))
  ) |>
  # left_join(term_clusters_df, by = c("ID" = "term_name")) |>
  # left_join(advanced_terms_clusters_df |> dplyr::select(clustered, top_words)) |>
  # Coarse category from keywords in the gene-set ID; the first match wins.
  dplyr::mutate(category = dplyr::case_when(
    str_detect(ID, "STRESS") ~ "stress",
    str_detect(ID, "ENDOPLASMIC_RETICULUM") ~ "ER",
    str_detect(ID, "chr21") ~ "chr21",
    str_detect(ID, "TRANSLATION") ~ "translation",
    str_detect(ID, "RIBOSOM") ~ "ribosome",
    str_detect(ID, "SERINE") & (!str_detect(ID, "SERINE_THREONINE")) ~ "serine",
    str_detect(ID, "MITOCH") ~ "mitochondria",
    str_detect(ID, "NRF2") ~ "stress",
    # str_detect(ID, "FOLATE") ~ "Folate",
    # str_detect(ID, "MTORC1") ~ "MTORC1",
    str_detect(ID, "APOPTO") | str_detect(ID, "PROGRAMMED_CELL_DEATH") ~ "apoptosis",
    # str_detect(top_words, "EXTRACELLULAR|MATRIX") ~ "ECM",
    TRUE ~ "other"
  ))

write_csv(combined_enrichment_df, paste0(table_output_dir, "/bulk_gsea_results.csv"))

# Data S2: gene-set ID and category plus the GSEA columns of the listed
# comparisons, with internal comparison names rewritten to published names.
data_s2_suffixes <- c(
  "T21_vs_D21", "D21_vs_T21", "dT21XIST_vs_T21",
  "wdT21XIST_vs_T21", "dwdT21SIT_vs_T21",
  "neural_t21_d21", "neural_t21xistdox_t21", "neural_t21xistwd_t21", "neural_t21xistdoxcombined_t21"
)
# Applied in this order to every column name.
data_s2_renames <- c(
  "dwdT21SIT_vs_T21" = "combinedT21XIST_vs_T21",
  "neural_t21_d21" = "neural_T21_vs_D21",
  "neural_t21xistdox_t21" = "neural_dT21XIST_vs_T21",
  "neural_t21xistwd_t21" = "neural_wdT21XIST_vs_T21",
  "neural_t21xistdoxcombined_t21" = "neural_combinedT21XIST_vs_T21"
)

data_s2_df <- dplyr::select(combined_enrichment_df, ID, category, ends_with(data_s2_suffixes))
data_s2_df <- rename_with(data_s2_df, \(col_name) str_replace_all(col_name, data_s2_renames))
write_csv(data_s2_df, paste0(table_output_dir, "/Data_S2_bulk_gsea.csv.gz"))

## PCA & dendro
# Genes x samples matrix of batch-corrected VST counts; the PCA treats
# samples as observations, hence the transpose.
batch_corrected_vsd <- column_to_rownames(batch_corrected_vsd_df, var = "ensembl_id")
pca <- prcomp(t(batch_corrected_vsd))

# Percent of total variance carried by each principal component.
pca_pct_var <- pca$sdev^2 / sum(pca$sdev^2) * 100

# Short display names for the cell lines.
pca_cell_line_labels <- c(
  "rtTA_XIST_c1" = "c1",
  "rtTA_XIST_c3" = "c3",
  "rtTA_XIST_c4" = "c4",
  "rtTA_XIST_c7" = "c7",
  "198_1" = "198-1",
  "198_2" = "198-2",
  "198_5" = "198-5"
)

# Sample scores joined with the sample sheet. `designation` is derived from
# the original cell_line values, so it is computed before they are relabelled.
pca_scores_df <- as.data.frame(pca$x) |>
  rownames_to_column(var = "sample_id") |>
  left_join(sample_sheet_df, by = "sample_id") |>
  mutate(
    designation = case_when(
      cell_line %in% c("198_1", "198_2") ~ "Euploid",
      cell_line == "198_5" ~ "T21",
      dox_status == "no" ~ "T21",
      dox_status == "yes" ~ "Euploid",
      dox_status == "withdrawal" ~ "w/d"
    ),
    cell_line = factor(
      unname(pca_cell_line_labels[cell_line]),
      levels = c("c1", "c3", "c4", "c7", "198-1", "198-2", "198-5")
    )
  )

ggplot(pca_scores_df, aes(x = PC1, y = PC2, color = designation, shape = cell_line)) +
  geom_point(size = 1.5) +
  scale_color_manual(values = pal_d3()(5)[3:5], name = "Designation") +
  scale_shape_manual(
    limits = c("c1", "c4", "c7", "198-1", "198-2", "198-5"),
    values = c(15, 16, 17, 0, 1, 2),
    name = "Cell Line"
  ) +
  # geom_label(aes(label = sample_id)) +
  labs(
    x = sprintf("PC%d (%.2f%%)", 1, pca_pct_var[1]),
    y = sprintf("PC%d (%.2f%%)", 2, pca_pct_var[2]),
    title = "PCA"
  ) +
  theme(legend.box.margin = margin(l = -3, r = -2, unit = "mm"))
ggsave(sprintf("%s/pca.pdf", fig_output_dir), width = 3.12, height = 2.3)

# Average-linkage clustering of samples on the batch-corrected VST counts.
sample_distances <- dist(t(batch_corrected_vsd))
sampleTree <- hclust(sample_distances, method = "average")
# ggdendrogram(sampleTree) + labs(title = "WGCNA Sample Dendrogram")

# Segment and label coordinates of the tree; the leaf labels are sample ids
# and get the sample-sheet annotations attached.
sample_dend_data <- dendro_data(sampleTree, type = "rectangle")
sample_dend_annotation_df <- dplyr::select(
  sample_sheet_df, sample_id, dox_status, genotype, effective_ploidy
)
sample_dend_data$labels <- left_join(
  sample_dend_data$labels,
  sample_dend_annotation_df,
  by = c("label" = "sample_id")
)

# Dendrogram with three rows of annotation tiles under the leaves
# (y = -6, -9, -12) and a caption for each row at x = 50.
ggplot() +
  geom_segment(
    data = sample_dend_data$segments,
    mapping = aes(x = x, y = y, xend = xend, yend = yend)
  ) +
  geom_text(
    data = sample_dend_data$labels,
    mapping = aes(x = x, y = y, label = label),
    angle = 90, size = 3
  ) +
  geom_tile(
    data = sample_dend_data$labels,
    mapping = aes(x = x, y = -6, fill = effective_ploidy), height = 2
  ) +
  annotate(geom = "text", x = 50, y = -6, label = "Effective Ploidy", vjust = .5) +
  scale_x_continuous(limits = c(0, 60)) +
  scale_fill_npg() +
  geom_tile(
    data = sample_dend_data$labels,
    mapping = aes(x = x, y = -9, fill = genotype), height = 2
  ) +
  annotate(geom = "text", x = 50, y = -9, label = "Genotype", hjust = 0) +
  geom_tile(
    data = sample_dend_data$labels,
    mapping = aes(x = x, y = -12, fill = dox_status), height = 2
  ) +
  annotate(geom = "text", x = 50, y = -12, label = "Dox Status", hjust = 0)


## DESeq2 Results Summary

# Reads diff_exprs.csv.gz from the run's output folder (presumably the list of
# differential-expression setups used by the analysis run — verify).
# NOTE(review): setups_df is not referenced again in the visible part of this
# script; confirm it is still needed.
setups_df <- read_csv(paste0(output_dir, "/diff_exprs.csv.gz")) 


## Chr21 Changes

# Batch-corrected VST counts, one row per gene x sample, with gene and sample
# annotation attached.
gene_counts_df <- batch_corrected_vsd_df |>
  pivot_longer(-ensembl_id, names_to = "sample_id", values_to = "vsd_count") |>
  left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  left_join(sample_sheet_df)

# Display labels for dox status and clone.
xist_dox_labels <- c("yes" = "+dox", "no" = "-dox", "withdrawal" = "w/d")
xist_clone_labels <- c(
  "rtTA_XIST_c1" = "c1",
  "rtTA_XIST_c3" = "c3",
  "rtTA_XIST_c4" = "c4",
  "rtTA_XIST_c7" = "c7"
)
# Timepoint labels in chronological order; each carries a hard-coded number
# in parentheses.
xist_timepoint_order <- c("3wk (6)", "6wk (9)", "9wk (8)",
                          "14wk (6)", "17wk (6)", "20wk (9)")

# XIST counts of the T21XIST samples, relabelled for plotting. The timepoint
# factor uses reversed levels and the rows are sorted by it, which fixes the
# order in which the points are drawn.
xist_counts_df <- gene_counts_df |>
  filter(hgnc_symbol == "XIST" & genotype == "T21XIST") |>
  mutate(
    dox_status = factor(unname(xist_dox_labels[dox_status]),
                        levels = c("-dox", "+dox", "w/d")),
    clone = unname(xist_clone_labels[cell_line]),
    timepoint = case_when(
      timepoint == "14wk" ~ "14wk (6)",
      timepoint == "17wk" ~ "17wk (6)",
      timepoint == "20wk" ~ "20wk (9)",
      timepoint == "3wk" ~ "3wk (6)",
      timepoint == "6wk" ~ "6wk (9)",
      timepoint == "9wk" ~ "9wk (8)",
      TRUE ~ as.character(timepoint)
    ),
    timepoint = factor(timepoint, levels = rev(xist_timepoint_order))
  ) |>
  arrange(timepoint)

# Per-clone violins with the individual samples on top, one panel per dox
# status, points coloured by timepoint.
ggplot(xist_counts_df, aes(y = vsd_count, x = clone, color = timepoint)) +
  facet_grid(cols = vars(dox_status)) +
  geom_violin(color = "lightgrey") +
  ggbeeswarm::geom_quasirandom(size = 1) +
  scale_color_manual(
    values = c(pal_aaas()(4), pal_d3()(2)[2], pal_cosmic()(4)[4]),
    name = "Timepoint",
    limits = xist_timepoint_order
  ) +
  # scale_color_d3(name = "Timepoint") +
  theme(axis.title.x = element_blank()) +
  labs(
    title = "XIST Expression",
    y = "VST Count"
  )
ggsave(sprintf("%s/xist_expression.pdf", fig_output_dir), width = 3, height = 1.7)


# using complexheatmap to add more sample and gene annotations
# chr21 genes only, one row per gene x sample. Within each gene:
#   mean_1985_count = mean VST count of the samples whose `Cell Line` is "198-5"
#   row_norm_count  = VST count minus that mean
chr21_rownorm_vsd <- batch_corrected_vsd_df |>
    pivot_longer(cols = -ensembl_id, names_to = "sample_id", values_to = "vst_count") |>
    left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
    left_join(sample_sheet_df, by = c("sample_id")) |>
    filter(chromosome_name == "21") |>
    group_by(ensembl_id) |>
    mutate(
      # Plain subsetting replaces the deprecated cur_data() and computes the
      # reference mean once. %in% treats a missing `Cell Line` as "not 198-5",
      # as the filter() it replaces did.
      mean_1985_count = mean(vst_count[`Cell Line` %in% "198-5"]),
      row_norm_count = vst_count - mean_1985_count
    ) |>
    ungroup() |>
    # Keep the column order (row_norm_count, then mean_1985_count).
    relocate(mean_1985_count, .after = row_norm_count) |>
    mutate(dox_status = factor(dox_status, levels = c("no", "yes", "withdrawal")))

# Heatmap input: drop rRNA and TEC genes and collapse the biotypes into the
# row groups of the heatmap. Biotypes outside the three factor levels become
# NA in the factor.
hm_pseudogene_biotypes <- c("processed_pseudogene", "transcribed_processed_pseudogene")
chr21_rownorm_hm_df <- chr21_rownorm_vsd |>
  # mean_1985_count>8 &
  filter(!gene_biotype %in% c("rRNA", "TEC")) |>
  mutate(
    gene_biotype = case_when(
      gene_biotype == "protein_coding" ~ "Protein\nCoding",
      gene_biotype %in% hm_pseudogene_biotypes ~ "Pseudo-\ngene",
      TRUE ~ gene_biotype
    ),
    gene_biotype = factor(
      gene_biotype,
      levels = c("Protein\nCoding", "lncRNA", "Pseudo-\ngene")
    )
  )

# Genes x samples matrix of the row-normalised counts, gene ids as row names.
ipsc_rownorm_wide_df <- pivot_wider(
  chr21_rownorm_hm_df,
  id_cols = ensembl_id,
  names_from = sample_id,
  values_from = row_norm_count
)
ipsc_rownorm_mat <- column_to_rownames(ipsc_rownorm_wide_df, var = "ensembl_id")
# Colour ramp for the per-gene mean count of the 198-5 samples (8 -> 16).
t21count_col_fun <- circlize::colorRamp2(c(8,16), c("#ffffff", "#7105E0"))

# Row annotation: one mean_1985_count value per gene, named by gene id.
gene_mean_t21_counts <- chr21_rownorm_hm_df |>
  dplyr::select(ensembl_id, mean_1985_count) |>
  dplyr::distinct() |>
  deframe()
gene_annotation <- rowAnnotation(
  `Mean T21 Count` = anno_simple(
    x = gene_mean_t21_counts,
    col = t21count_col_fun,
    simple_anno_size = unit(3, "mm")
  ),
  annotation_name_gp = gpar(fontsize = 7)
)

# anno_simple() annotations get their legend built by hand.
t21count_legend_ticks <- c(8, 10, 12, 14, 16)
lgd_t21count = Legend(
  title = "Mean T21\nCount",
  col_fun = t21count_col_fun,
  at = t21count_legend_ticks,
  labels = t21count_legend_ticks,
  direction = "horizontal",
  labels_gp = gpar(fontsize = 7),
  title_gp = gpar(fontsize = 7),
  grid_height = unit(2, "mm")
)

# Column annotation values, one per sample in order of first appearance.
# "rtTA-XIST" in the cell-line names is displayed as "T21XIST".
sample_cell_lines <- chr21_rownorm_hm_df |>
  dplyr::select(sample_id, `Cell Line`) |>
  dplyr::distinct() |>
  mutate(`Cell Line` = str_replace(`Cell Line`, "rtTA-XIST", "T21XIST")) |>
  column_to_rownames(var = "sample_id") |>
  deframe()
sample_batches <- chr21_rownorm_hm_df |>
  dplyr::select(sample_id, `timepoint`) |>
  dplyr::distinct() |>
  column_to_rownames(var = "sample_id") |>
  deframe() |>
  factor(levels = c("3wk", "6wk", "9wk", "14wk", "17wk", "20wk"))

sample_annotation_colors <- list(
  `Cell Line` = c("198-1"="#1697a6", "198-2"="#0e606b", "198-5"="#ffc24b",
                  "T21XIST c1"="#fff4f1", "T21XIST c4"="#ffb3ae", "T21XIST c7"="#f47068"),
  `Batch` = c("3wk"="#faedcd", "6wk"="#e7c8a0", "9wk"="#d4a373",
              "14wk"="#a1c349", "17wk"="#87a330", "20wk"="#6a8532")
)

sample_annotation <- columnAnnotation(
  `Cell Line` = sample_cell_lines,
  `Batch` = sample_batches,
  simple_anno_size = unit(3, "mm"),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = 7),
    title_gp = gpar(fontsize = 7),
    grid_height = unit(2, "mm"),
    grid_width = unit(2, "mm")
  ),
  col = sample_annotation_colors,
  annotation_name_gp = gpar(fontsize = 7)
)

# Rows ordered by decreasing mean 198-5 count.
hm_row_order <- chr21_rownorm_hm_df |>
  dplyr::select(ensembl_id, mean_1985_count) |>
  deframe() |>
  sort() |>
  names() |>
  unique() |>
  rev()

# Rows are split by (collapsed) gene biotype.
hm_row_split <- chr21_rownorm_hm_df |>
  dplyr::select(ensembl_id, gene_biotype) |>
  distinct() |>
  deframe()

# Columns are split into T21XIST dox groups; every other sample is grouped
# under its genotype.
hm_column_split <- chr21_rownorm_hm_df |>
  mutate(split_label = case_when(
    genotype == "T21XIST" & dox_status == "yes" ~ "+dox",
    genotype == "T21XIST" & dox_status == "no" ~ "-dox",
    genotype == "T21XIST" & dox_status == "withdrawal" ~ "w/d",
    TRUE ~ genotype
  )) |>
  dplyr::select(sample_id, split_label) |>
  distinct() |>
  deframe() |>
  factor(levels = c("T21", "-dox", "+dox", "w/d", "D21"))

# Diverging colour scale centred on 0 (no difference from the 198-5 mean).
hm_value_col_fun <- circlize::colorRamp2(c(-1, 0, 1), c("#3784AF", "#f7f7f7", "#D34B16"))
# col=circlize::colorRamp2(c(-1, 0, 1), c("#0571B0", "#f7f7f7", "#CA0020")),

ipsc_hm <- Heatmap(
  ipsc_rownorm_mat,
  col = hm_value_col_fun,
  name = "Row Norm\nVST Count",
  border = TRUE,
  cluster_rows = FALSE,
  row_order = hm_row_order,
  row_split = hm_row_split,
  row_title_side = "right",
  row_gap = unit(1, "mm"),
  show_row_names = FALSE,
  row_title_gp = gpar(fontsize = 7),
  row_title_rot = 45,
  column_title_gp = gpar(fontsize = 7),
  column_names_gp = gpar(fontsize = 7),
  left_annotation = gene_annotation,
  show_column_dend = FALSE,
  column_split = hm_column_split,
  cluster_column_slices = FALSE,
  column_gap = unit(1, "mm"),
  show_column_names = FALSE,
  bottom_annotation = sample_annotation,
  heatmap_legend_param = list(
    direction = "horizontal",
    labels_gp = gpar(fontsize = 7),
    title_gp = gpar(fontsize = 7),
    grid_height = unit(2, "mm")
  )
)

# Draw into a PDF with all legends merged at the bottom.
pdf(sprintf("%s/expression_relative_to_1985.pdf", fig_output_dir), width = 3.5, height = 4.7)
draw(
  ipsc_hm,
  annotation_legend_list = list(lgd_t21count),
  merge_legend = TRUE,
  heatmap_legend_side = "bottom",
  annotation_legend_side = "bottom"
)
dev.off()

# Comparisons (each relative to T21) shown in the chr21 dosage-correction plots.
chr21_changes_comparisons <- c("D21_vs_T21", "dT21XIST_vs_T21", "wdT21XIST_vs_T21")

# Every <statistic>_<comparison> column name, statistic varying fastest.
# outer() replaces purrr::cross() + purrr::lift(), both deprecated as of
# purrr 1.0.0, and yields the names in the same order.
chr21_changes_comparison_cols <- as.vector(outer(
  c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"),
  chr21_changes_comparisons,
  paste0
))

# Long-format chr21 results: one row per gene x comparison with the statistics
# spread back into columns, plus a `color_label` category used for colouring.
chr21_changes_df <- combined_results_df |>
  # left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  dplyr::filter(chromosome_name == "21") |>
  pivot_longer(
    cols = ends_with(chr21_changes_comparison_cols),
    # "<statistic>_<comparison>": the split happens at the first underscore
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  dplyr::mutate(
    # Categories are tested in order, so the tighter fold-change band wins.
    # The bands are additive on the linear scale: FC within 2/3 +/- .1 or .15.
    color_label = case_when(
      padj <= .1 & dplyr::between(log2FoldChange, log2((2 / 3) - .1), log2((2 / 3) + .1)) ~ "padj≤.1 & FC=(2/3)±.1",
      padj <= .1 & dplyr::between(log2FoldChange, log2((2 / 3) - .15), log2((2 / 3) + .15)) ~ "padj≤.1 & FC=(2/3)±.15",
      padj <= .1 ~ "padj≤.1",
      TRUE ~ "padj>.1"
    )
  )

# Extent of each chr21 arm from the UCSC hg38 cytoband table. The arm is taken
# as the first character of the band name (presumably "p"/"q" -- confirm
# against the downloaded table). Requires network access.
chr21_arms <- read_tsv(
  "http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cytoBand.txt.gz",
  col_names = c("chrom", "chromStart", "chromEnd", "name", "gieStain")
) |>
  filter(chrom == "chr21") |>
  mutate(arm = substring(name, 1, 1)) |>
  group_by(chrom, arm) |>
  summarise(
    start_pos = min(chromStart),
    end_pos = max(chromEnd),
    len = end_pos - start_pos,
    # return an ungrouped table instead of one still grouped by `chrom`
    .groups = "drop"
  )

# Exploratory check (printed, not stored): among chr21 genes starting before
# 12 Mb, show the one(s) with the smallest end_position.
# slice_min() names the ordering column explicitly; the superseded top_n(-1)
# picked the last column (end_position) implicitly.
chr21_changes_df |>
  dplyr::select(ensembl_id, start_position, end_position) |>
  distinct() |>
  filter(start_position < 12000000) |>
  arrange(start_position) |>
  slice_min(end_position, n = 1)

# chr21 dosage-correction scatter: per-gene log2 fold change with genes ordered
# by start position, one facet row per comparison. Colour shows the
# `color_label` category; point area shows baseMean (squished at 10000).
chr21_changes_df |>
  mutate(
    comparison = factor(
      case_when(
        comparison == "D21_vs_T21" ~ "D21",
        comparison == "dT21XIST_vs_T21" ~ "+dox",
        comparison == "wdT21XIST_vs_T21" ~ "w/d",
        TRUE ~ comparison
      ),
      levels = c("D21", "+dox", "w/d")
    )
  ) |>
  ggplot(aes(
    x = fct_reorder(ensembl_id, start_position),
    y = log2FoldChange,
    color = color_label,
    size = baseMean
  )) +
  geom_point(alpha = .5) +
  # dashed reference line at log2(2/3); solid line at no change
  geom_hline(yintercept = log2(2 / 3), linetype = "dashed", size = .3) +
  geom_hline(yintercept = 0, linetype = "solid") +
  # vertical marker placed at one specific gene on the discrete x axis
  geom_vline(xintercept = "ENSG00000157540.22", linetype = "dotdash", size = .3) +
  # geom_vline(xintercept = "ENSG00000277117.5", linetype = "dotted", size=.8) +
  facet_grid(rows = vars(comparison)) +
  scale_color_manual(
    values = c(pal_d3()(3)[c(1, 3, 2)], pal_uchicago()(2)[2]),
    limits = c("padj≤.1 & FC=(2/3)±.1", "padj≤.1 & FC=(2/3)±.15", "padj≤.1", "padj>.1")
  ) +
  scale_size_area(
    limits = c(NA, 10000),
    oob = scales::squish,
    max_size = 4,
    breaks = c(1, 10, 100, 1000, 10000),
    guide = "none"
  ) +
  scale_y_continuous(breaks = c(-1, 0, 1)) +
  coord_cartesian(ylim = c(-1.5, 1.5)) +
  labs(
    title = "chr21 Dosage Correction",
    color = "Gene Category",
    size = "Base Mean Expression",
    x = "chr21 Genes (ordered by position)",
    y = "log2(FC)"
  ) +
  guides(color = guide_legend(ncol = 2, label.position = "right", label.hjust = .1)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "top",
    # legend.position = "right",
    legend.box.margin = margin(t = -5, b = -9),
    legend.spacing.y = unit(1, "mm")
    # legend.spacing.x = unit(3, "mm"),
    # legend.margin = margin(3,3,3,3, "mm"),
    # legend.box.background = element_rect(color = "black", size=.5)
  )

# The same plot is written under two file names; cairo_pdf is used as the
# device for both.
ggsave(
  sprintf("%s/hsa21_dosage_correction.pdf", fig_output_dir),
  width = 3.5, height = 2.75, device = cairo_pdf
)
ggsave(
  sprintf("%s/hsa21_dosage_correction_basemeanscale.pdf", fig_output_dir),
  width = 3.5, height = 2.75, device = cairo_pdf
)

# Table of gene counts per comparison and colour category, split into
# baseMean >= 1000 versus < 1000 (a missing baseMean is counted as 0).
chr21_changes_df |>
  mutate(
    comparison = factor(
      case_when(
        comparison == "D21_vs_T21" ~ "D21",
        comparison == "dT21XIST_vs_T21" ~ "+dox",
        comparison == "wdT21XIST_vs_T21" ~ "w/d",
        TRUE ~ comparison
      ),
      levels = c("D21", "+dox", "w/d")
    ),
    baseMean = dplyr::coalesce(baseMean, 0)
  ) |>
  dplyr::count(comparison, color_label, baseMean >= 1000) |>
  # the logical count column becomes the two columns `TRUE` and `FALSE`
  pivot_wider(names_from = `baseMean >= 1000`, values_from = n) |>
  dplyr::rename(
    "baseMean>=1000" = `TRUE`,
    "baseMean<1000" = `FALSE`
  ) |>
  write_csv(
    sprintf("%s/ipsc_hsa21_dosage_correction_counts.csv", table_output_dir),
    quote = "all"
  )
# From here on only the +dox and w/d comparisons against T21 are used.
chr21_changes_comparisons <- c("dT21XIST_vs_T21", "wdT21XIST_vs_T21")

# Density of chr21 log2 fold changes versus T21 for genes with
# baseMean >= 1000, one curve per comparison (+dox, w/d, combined).
# Shaded bands mark fold changes within 10% (inner band) and 15% (outer bands)
# of 2/3 on a multiplicative scale.
combined_results_df |>
  # left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  dplyr::filter(chromosome_name == "21") |>
  pivot_longer(
    # outer() builds every <statistic>_<comparison> column name; it replaces
    # purrr::cross() + purrr::lift(), both deprecated as of purrr 1.0.0.
    cols = ends_with(as.vector(outer(
      c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"),
      c(chr21_changes_comparisons, "dwdT21SIT_vs_T21"),
      paste0
    ))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(comparison = case_when(
    comparison == "dT21XIST_vs_T21" ~ "+dox",
    comparison == "wdT21XIST_vs_T21" ~ "w/d",
    comparison == "dwdT21SIT_vs_T21" ~ "combined",
    TRUE ~ comparison
  )) |>
  # filter(padj <= .1) |>
  filter(baseMean >= 1000) |>
  ggplot() +
  geom_vline(xintercept = log2(.9 * (2 / 3)), color = pal_d3()(3)[1], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(1.1 * (2 / 3)), color = pal_d3()(3)[1], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(.85 * (2 / 3)), color = pal_d3()(3)[3], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(1.15 * (2 / 3)), color = pal_d3()(3)[3], linetype = "dotted", size = .3) +
  annotate("rect", xmin = log2(.9 * (2 / 3)), xmax = log2(1.1 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[1], alpha = .3) +
  annotate("rect", xmin = log2(.85 * (2 / 3)), xmax = log2(.9 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[3], alpha = .3) +
  annotate("rect", xmin = log2(1.1 * (2 / 3)), xmax = log2(1.15 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[3], alpha = .3) +
  geom_density(aes(x = log2FoldChange, color = comparison)) +
  scale_color_manual(limits = c("+dox", "combined", "w/d"), values = pal_d3()(5)[c(4, 2, 5)]) +
  geom_vline(xintercept = log2(2 / 3), linetype = "dashed", color = "black", size = .3) +
  geom_vline(xintercept = 0, linetype = "solid", color = "black", size = .3) +
  annotate("text", x = -1.2, y = 2, label = "log2(2/3)", color = "black", size = 2) +
  coord_cartesian(xlim = c(-1.5, 1.5)) +
  labs(y = "Density", x = "log2(FC)", title = "chr21 Log2(FC) (baseMean >= 1000)") +
  theme(legend.position = c(.8, .5))

ggsave(sprintf("%s/hsa21_log2fc_1000.pdf", fig_output_dir), width = 3.5, height = 1.5)
# ggsave(sprintf("%s/hsa21_log2fc.pdf", fig_output_dir), width = 3.25, height = 1.5)

# Printed summary: number of distinct chr21 genes with padj <= .1 in each
# comparison against T21 (+dox, w/d, combined).
combined_results_df |>
  # left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  dplyr::filter(chromosome_name == "21") |>
  pivot_longer(
    # outer() builds every <statistic>_<comparison> column name; it replaces
    # purrr::cross() + purrr::lift(), both deprecated as of purrr 1.0.0.
    cols = ends_with(as.vector(outer(
      c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"),
      c(chr21_changes_comparisons, "dwdT21SIT_vs_T21"),
      paste0
    ))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(comparison = case_when(
    comparison == "dT21XIST_vs_T21" ~ "+dox",
    comparison == "wdT21XIST_vs_T21" ~ "w/d",
    comparison == "dwdT21SIT_vs_T21" ~ "combined",
    TRUE ~ comparison
  )) |>
  filter(padj <= .1) |>
  group_by(comparison) |>
  summarise(num_genes = n_distinct(ensembl_id))

# Density of chr21 log2 fold changes relative to D21 for genes with
# baseMean >= 1000, one curve per comparison (+dox, w/d, combined).
# Only the zero line is drawn; the 2/3 reference lines and bands are not used
# for this D21-baseline version.
combined_results_df |>
  dplyr::filter(chromosome_name == "21") |>
  pivot_longer(
    # outer() builds every <statistic>_<comparison> column name; it replaces
    # purrr::cross() + purrr::lift(), both deprecated as of purrr 1.0.0.
    cols = ends_with(as.vector(outer(
      c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"),
      c("dT21XIST_vs_D21", "wdT21XIST_vs_D21", "dwdT21SIT_vs_D21"),
      paste0
    ))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(comparison = case_when(
    comparison == "dT21XIST_vs_D21" ~ "+dox",
    comparison == "wdT21XIST_vs_D21" ~ "w/d",
    comparison == "dwdT21SIT_vs_D21" ~ "combined",
    TRUE ~ comparison
  )) |>
  # filter(padj <= .1) |>
  filter(baseMean >= 1000) |>
  ggplot() +
  geom_density(aes(x = log2FoldChange, color = comparison)) +
  scale_color_manual(limits = c("+dox", "combined", "w/d"), values = pal_d3()(5)[c(4, 2, 5)]) +
  geom_vline(xintercept = 0, linetype = "solid", color = "black", size = .3) +
  coord_cartesian(xlim = c(-1.5, 1.5)) +
  labs(y = "Density", x = "log2(FC)", title = "chr21 Log2(FC) from D21\niPSCs (baseMean >= 1000)") +
  theme(legend.position = c(.8, .7))
ggsave(sprintf("%s/hsa21_log2fc_d21_1000.pdf", fig_output_dir), width = 3.5, height = 2)


# Density of chr21 log2 fold changes versus T21, restricted to genes with
# padj <= .1, one curve per comparison. The plot is only displayed; no
# ggsave() call follows it.
# Shaded bands mark fold changes within 10% (inner band) and 15% (outer bands)
# of 2/3 on a multiplicative scale.
combined_results_df |>
  # left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  dplyr::filter(chromosome_name == "21") |>
  pivot_longer(
    # outer() builds every <statistic>_<comparison> column name; it replaces
    # purrr::cross() + purrr::lift(), both deprecated as of purrr 1.0.0.
    cols = ends_with(as.vector(outer(
      c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"),
      c(chr21_changes_comparisons, "dwdT21SIT_vs_T21"),
      paste0
    ))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(comparison = case_when(
    comparison == "dT21XIST_vs_T21" ~ "+dox",
    comparison == "wdT21XIST_vs_T21" ~ "w/d",
    comparison == "dwdT21SIT_vs_T21" ~ "combined",
    TRUE ~ comparison
  )) |>
  filter(padj <= .1) |>
  ggplot() +
  geom_vline(xintercept = log2(.9 * (2 / 3)), color = pal_d3()(3)[1], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(1.1 * (2 / 3)), color = pal_d3()(3)[1], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(.85 * (2 / 3)), color = pal_d3()(3)[3], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(1.15 * (2 / 3)), color = pal_d3()(3)[3], linetype = "dotted", size = .3) +
  annotate("rect", xmin = log2(.9 * (2 / 3)), xmax = log2(1.1 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[1], alpha = .3) +
  annotate("rect", xmin = log2(.85 * (2 / 3)), xmax = log2(.9 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[3], alpha = .3) +
  annotate("rect", xmin = log2(1.1 * (2 / 3)), xmax = log2(1.15 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[3], alpha = .3) +
  geom_density(aes(x = log2FoldChange, color = comparison)) +
  scale_color_manual(limits = c("+dox", "combined", "w/d"), values = pal_d3()(5)[c(4, 2, 5)]) +
  geom_vline(xintercept = log2(2 / 3), linetype = "dashed", color = "black", size = .3) +
  geom_vline(xintercept = 0, linetype = "solid", color = "black", size = .3) +
  annotate("text", x = -1.2, y = 2, label = "log2(2/3)", color = "black", size = 2) +
  coord_cartesian(xlim = c(-1.5, 1.5)) +
  labs(y = "Density", x = "log2(FC)", title = "HSA21 Significant Log2(FC)") +
  theme(legend.position = c(.8, .7))


# Density of per-gene differences in mean VST count between each dox-withdrawal
# group and T21, for chr21 genes. Colour = wd_group, linetype = dox_group,
# both derived from the timepoint.
batch_corrected_vsd_df |>
  pivot_longer(
    cols = -ensembl_id,
    names_to = "sample_id",
    values_to = "vst_count"
  ) |>
  left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  dplyr::filter(chromosome_name == "21") |>
  left_join(sample_sheet_df, by = "sample_id") |>
  # D21/T21 samples form one group each; T21XIST samples are grouped by
  # "<dox_status>_<timepoint>"
  mutate(
    sample_group = case_when(
      genotype %in% c("D21", "T21") ~ genotype,
      genotype == "T21XIST" ~ paste(dox_status, timepoint, sep = "_")
    )
  ) |>
  group_by(ensembl_id, sample_group) |>
  summarise(mean_vst = mean(vst_count)) |>
  ungroup() |>
  # wide then long again so every row carries the T21 mean next to its own mean
  pivot_wider(
    id_cols = "ensembl_id",
    names_from = "sample_group",
    values_from = "mean_vst"
  ) |>
  pivot_longer(
    cols = -c(ensembl_id, T21),
    names_to = "sample_group",
    values_to = "mean_vst"
  ) |>
  mutate(
    diff_from_t21 = mean_vst - T21,
    dox_status = str_extract(sample_group, "D21|no|yes|withdrawal"),
    timepoint = str_extract(sample_group, "3wk|6wk|9wk|14wk|17wk|20wk")
  ) |>
  filter(dox_status == "withdrawal") |>
  mutate(
    # a missing timepoint falls into "3w", as does every non-matching one
    dox_group = if_else(
      str_detect(timepoint, "14wk|17wk|20wk"), "2m", "3w",
      missing = "3w"
    ),
    wd_group = case_when(
      timepoint %in% c("6wk", "17wk") ~ "3wk",
      timepoint %in% c("9wk", "20wk") ~ "6wk"
    )
  ) |>
  ggplot() +
  geom_density(
    aes(x = diff_from_t21, color = wd_group, linetype = dox_group),
    alpha = .7
  ) +
  geom_vline(xintercept = 0, linetype = "solid", color = "black", size = .3) +
  scale_color_manual(values = pal_simpsons()(5)[c(2, 5)]) +
  scale_linetype_manual(limits = c("3w", "2m"), values = c("solid", "dashed")) +
  coord_cartesian(xlim = c(-1, 1)) +
  labs(y = "Density", x = "VST Diff", title = "HSA21 Mean VST diff from T21") +
  theme(legend.position = c(.8, .57))

ggsave(
  sprintf("%s/hsa21_withdrawal_vst_diff.pdf", fig_output_dir),
  width = 3.25, height = 2
)

# only looking at allelic escapees
potential_escapees <- c("ENSG00000142166.13", "ENSG00000142192.21", "ENSG00000154734.16", "ENSG00000157557.13", "ENSG00000159110.21", "ENSG00000182362.14", "ENSG00000243927.6",  "ENSG00000249624.10")

# Same layout as the chr21 dosage-correction scatter, but only the genes in
# `potential_escapees` are drawn in colour and labelled with their HGNC symbol.
# The comparison relabelling is done once on the plot data; layers that need a
# subset derive it from the plot data through a function-valued `data`
# argument instead of repeating the whole pipeline.
chr21_changes_df |>
  mutate(
    comparison = factor(
      case_when(
        comparison == "D21_vs_T21" ~ "D21",
        comparison == "dT21XIST_vs_T21" ~ "+dox",
        comparison == "wdT21XIST_vs_T21" ~ "w/d",
        TRUE ~ comparison
      ),
      levels = c("D21", "+dox", "w/d")
    )
  ) |>
  ggplot(aes(
    x = fct_reorder(ensembl_id, start_position),
    y = log2FoldChange,
    color = color_label,
    size = baseMean
  )) +
  facet_grid(rows = vars(comparison)) +
  # every gene is drawn in white, so all genes still take part in the x axis
  geom_point(color = "white") +
  geom_point(
    data = \(d) dplyr::filter(d, ensembl_id %in% potential_escapees),
    alpha = .5
  ) +
  scale_color_manual(
    values = c(pal_d3()(3)[c(1, 3, 2)], pal_uchicago()(2)[2]),
    limits = c("padj≤.1 & FC=(2/3)±.1", "padj≤.1 & FC=(2/3)±.15", "padj≤.1", "padj>.1"),
    guide = "none"
  ) +
  scale_size_area(limits = c(NA, 10000), oob = scales::squish, guide = "none", max_size = 4) +
  labs(
    title = "chr21 Dosage Correction - Allelic Escapees",
    color = "Gene Category",
    size = "Base Mean Expression",
    x = "chr21 Genes (ordered by position)",
    y = "log2(FC)"
  ) +
  ggrepel::geom_text_repel(
    data = \(d) dplyr::filter(d, ensembl_id %in% potential_escapees),
    mapping = aes(label = hgnc_symbol),
    size = 2,
    min.segment.length = 0
  ) +
  geom_hline(yintercept = log2(2 / 3), linetype = "dashed", size = .3) +
  geom_hline(yintercept = 0, linetype = "solid") +
  geom_vline(xintercept = "ENSG00000157540.22", linetype = "dotdash", size = .3) +
  # guides(color=guide_legend(ncol=2, label.position = "right", label.hjust = .1)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "top",
    # legend.spacing.x = unit(3, "mm"),
    # legend.margin = margin(3,3,3,3, "mm"),
    legend.box.background = element_rect(color = "black", size = 1)
  ) +
  coord_cartesian(ylim = c(-1.5, 1.5))
ggsave(sprintf("%s/hsa21_dosage_correction_allelic_escapees.pdf", fig_output_dir), width = 3.5, height = 2.28)
# ggsave(sprintf("%s/hsa21_dosage_correction_allelic_escapees.png", fig_output_dir), width = 3.25, height = 1.97, dpi=300)
    
# Rebuild the <statistic>_<comparison> column names for the current
# `chr21_changes_comparisons` (statistic varying fastest). outer() replaces
# purrr::cross() + purrr::lift(), both deprecated as of purrr 1.0.0.
chr21_changes_comparison_cols <- as.vector(outer(
  c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"),
  chr21_changes_comparisons,
  paste0
))
  
# Per-chromosome share of genes called gain / no change / loss
# (padj < .1 and sign of log2FoldChange), faceted by comparison:
# T21/D21 and T21XIST/T21.
combined_results_df |>
  # left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  # Converting to a factor fixes the chromosome order on the x axis; anything
  # outside chromosomes 1-22, X and Y becomes NA and is dropped.
  mutate(chromosome_name = factor(
    chromosome_name,
    levels = c(
      "1", "2", "3", "4", "5", "6", "7",
      "8", "9", "10", "11", "12", "13",
      "14", "15", "16", "17", "18", "19",
      "20", "21", "22", "X", "Y"
    )
  )) |>
  filter(!is.na(chromosome_name)) |>
  pivot_longer(
    cols = c(starts_with(c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  filter(comparison %in% c("dwdT21SIT_vs_T21", "T21_vs_D21")) |>
  mutate(
    # genes with a missing padj or log2FoldChange fall into "nochange"
    category = factor(
      case_when(
        padj < .1 & log2FoldChange > 0 ~ "gain",
        padj < .1 & log2FoldChange < 0 ~ "loss",
        TRUE ~ "nochange"
      ),
      levels = c("gain", "nochange", "loss")
    )
  ) |>
  dplyr::count(chromosome_name, category, comparison) |>
  mutate(
    comparison = case_when(
      comparison == "dwdT21SIT_vs_T21" ~ "T21XIST/T21",
      comparison == "T21_vs_D21" ~ "T21/D21"
    ),
    comparison = factor(comparison, levels = c("T21/D21", "T21XIST/T21"))
  ) |>
  ggplot() +
  # position_fill() rescales each bar to 1, so the y axis is a proportion
  geom_col(
    aes(x = chromosome_name, y = n, fill = category),
    position = position_fill()
  ) +
  facet_grid(rows = vars(comparison)) +
  geom_hline(yintercept = .5, linetype = "dashed", size = .3) +
  scale_fill_uchicago(
    limits = c("gain", "nochange", "loss"),
    labels = c("Inc", "~0", "Dec")
  ) +
  labs(
    title = "Global Changes in Gene Expression - iPSCs",
    fill = "Direction",
    x = "Chromosome",
    y = "% Genes"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 8),
    plot.margin = margin(t = .5, 0, 0, 0, "mm"),
    legend.margin = margin(l = -3, unit = "mm"),
    axis.title.x = element_blank()
  )
# Saved under its own file name: the single-comparison plot later in the script
# writes global_changes_in_gene_expression.pdf, which previously overwrote
# this figure.
ggsave(sprintf("%s/global_changes_in_gene_expression_ipsc.pdf", fig_output_dir), width = 4, height = 2.3)

# Per-chromosome share of genes called gain / no change / loss
# (padj < .1 and sign of log2FoldChange) for the dT21XIST_vs_T21 comparison.
combined_results_df |>
  # left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  # Converting to a factor fixes the chromosome order on the x axis; anything
  # outside chromosomes 1-22, X and Y becomes NA and is dropped.
  mutate(chromosome_name = factor(
    chromosome_name,
    levels = c(
      "1", "2", "3", "4", "5", "6", "7",
      "8", "9", "10", "11", "12", "13",
      "14", "15", "16", "17", "18", "19",
      "20", "21", "22", "X", "Y"
    )
  )) |>
  filter(!is.na(chromosome_name)) |>
  pivot_longer(
    cols = c(starts_with(c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  filter(comparison %in% c("dT21XIST_vs_T21")) |>
  mutate(
    # genes with a missing padj or log2FoldChange fall into "nochange"
    category = factor(
      case_when(
        padj < .1 & log2FoldChange > 0 ~ "gain",
        padj < .1 & log2FoldChange < 0 ~ "loss",
        TRUE ~ "nochange"
      ),
      levels = c("gain", "nochange", "loss")
    )
  ) |>
  dplyr::count(chromosome_name, category, comparison) |>
  mutate(comparison = case_when(
    comparison == "dT21XIST_vs_T21" ~ "+dox",
    comparison == "wdT21XIST_vs_T21" ~ "w/d"
  )) |>
  ggplot() +
  # position_fill() rescales each bar to 1, so the y axis is a proportion
  geom_col(
    aes(x = chromosome_name, y = n, fill = category),
    position = position_fill()
  ) +
  # facet_grid(rows = vars(comparison)) +
  geom_hline(yintercept = .5, linetype = "dashed", size = .3) +
  scale_fill_uchicago(
    limits = c("gain", "nochange", "loss"),
    labels = c("Inc", "~0", "Dec")
  ) +
  labs(
    title = "Global Changes in Gene Expression",
    fill = "Direction",
    x = "Chromosome",
    y = "% Genes"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 8),
    plot.margin = margin(t = .5, 0, 0, 0, "mm"),
    legend.margin = margin(l = -3, unit = "mm"),
    axis.title.x = element_blank()
  )
ggsave(
  sprintf("%s/global_changes_in_gene_expression.pdf", fig_output_dir),
  width = 4, height = 1.3
)
   ## Diff. Expression Overlaps


# Density of chr21 log2 fold changes versus T21 in the neural comparisons,
# for genes with baseMean >= 1000, one curve per comparison.
# Shaded bands mark fold changes within 10% (inner band) and 15% (outer bands)
# of 2/3 on a multiplicative scale.
combined_results_df |>
  # left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  dplyr::filter(chromosome_name == "21") |>
  pivot_longer(
    # outer() builds every <statistic>_<comparison> column name; it replaces
    # purrr::cross() + purrr::lift(), both deprecated as of purrr 1.0.0.
    cols = ends_with(as.vector(outer(
      c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"),
      c(
        "neural_t21xistdoxcombined_t21",
        "neural_t21xistdox_t21",
        "neural_t21xistwd_t21"
      ),
      paste0
    ))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(comparison = case_when(
    comparison == "neural_t21xistdox_t21" ~ "+dox",
    comparison == "neural_t21xistwd_t21" ~ "w/d",
    comparison == "neural_t21xistdoxcombined_t21" ~ "combined",
    TRUE ~ comparison
  )) |>
  filter(baseMean >= 1000) |>
  ggplot() +
  geom_vline(xintercept = log2(.9 * (2 / 3)), color = pal_d3()(3)[1], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(1.1 * (2 / 3)), color = pal_d3()(3)[1], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(.85 * (2 / 3)), color = pal_d3()(3)[3], linetype = "dotted", size = .3) +
  geom_vline(xintercept = log2(1.15 * (2 / 3)), color = pal_d3()(3)[3], linetype = "dotted", size = .3) +
  annotate("rect", xmin = log2(.9 * (2 / 3)), xmax = log2(1.1 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[1], alpha = .3) +
  annotate("rect", xmin = log2(.85 * (2 / 3)), xmax = log2(.9 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[3], alpha = .3) +
  annotate("rect", xmin = log2(1.1 * (2 / 3)), xmax = log2(1.15 * (2 / 3)), ymin = -Inf, ymax = Inf, fill = pal_d3()(3)[3], alpha = .3) +
  geom_density(aes(x = log2FoldChange, color = comparison)) +
  scale_color_manual(limits = c("+dox", "combined", "w/d"), values = pal_d3()(5)[c(4, 2, 5)]) +
  geom_vline(xintercept = log2(2 / 3), linetype = "dashed", color = "black", size = .3) +
  geom_vline(xintercept = 0, linetype = "solid", color = "black", size = .3) +
  annotate("text", x = -1, y = .75, label = "log2(2/3)", color = "black", size = 2) +
  coord_cartesian(xlim = c(-1.5, 1.5)) +
  labs(y = "Density", x = "log2(FC)", title = "Neural chr21 Log2(FC) (baseMean >= 1000)") +
  theme(legend.position = c(.8, .7))
ggsave(sprintf("%s/chr21_dist_neural_1000.pdf", fig_output_dir), width = 3.5, height = 1.5)

# Density of chr21 log2 fold changes relative to D21 in the neural
# comparisons, for genes with baseMean >= 1000, one curve per comparison.
combined_results_df |>
  # left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  dplyr::filter(chromosome_name == "21") |>
  pivot_longer(
    # outer() builds every <statistic>_<comparison> column name; it replaces
    # purrr::cross() + purrr::lift(), both deprecated as of purrr 1.0.0.
    cols = ends_with(as.vector(outer(
      c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"),
      c(
        "neural_d21_t21xistdoxcombined",
        "neural_d21_t21xistdox",
        "neural_d21_t21xistwd"
      ),
      paste0
    ))),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
    names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(comparison = case_when(
    comparison == "neural_d21_t21xistdox" ~ "+dox",
    comparison == "neural_d21_t21xistwd" ~ "w/d",
    comparison == "neural_d21_t21xistdoxcombined" ~ "combined",
    TRUE ~ comparison
  )) |>
  filter(baseMean >= 1000) |>
  ggplot() +
  # The fold change is sign-flipped before plotting.
  # NOTE(review): presumably because these columns are D21 relative to
  # T21XIST while the plot shows the reverse -- confirm the contrast direction.
  geom_density(aes(x = -1 * log2FoldChange, color = comparison)) +
  scale_color_manual(limits = c("+dox", "combined", "w/d"), values = pal_d3()(5)[c(4, 2, 5)]) +
  geom_vline(xintercept = 0, linetype = "solid", color = "black", size = .3) +
  coord_cartesian(xlim = c(-1.5, 1.5)) +
  labs(y = "Density", x = "log2(FC)", title = "chr21 Log2(FC) from D21\nNeural (baseMean >= 1000)") +
  theme(legend.position = c(.8, .7))
ggsave(sprintf("%s/chr21_dist_neural_d21_1000.pdf", fig_output_dir), width = 3.5, height = 2)

# Load the neural VST table and sample sheet from `neural_output_dir`.
# NOTE(review): despite its name, `batch_corrected_vsd_df_neural` is currently
# read from vsd_salmon_df.csv.gz; the batch-corrected file is commented out.
# batch_corrected_vsd_df_neural <- read_csv(paste0(neural_output_dir, "/tables/batch_corrected_vsd_salmon_df.csv.gz"))
batch_corrected_vsd_df_neural <-
  paste0(neural_output_dir, "/tables/vsd_salmon_df.csv.gz") |>
  read_csv()
sample_sheet_df_neural <-
  paste0(neural_output_dir, "/tables/sample_sheet_df.csv.gz") |>
  read_csv()

# Export the sample metadata columns for the iPSC and the neural samples.
sample_sheet_df |>
  dplyr::select(sample_id, timepoint, dox_status, genotype, cell_line) |>
  write_csv(paste0(table_output_dir, "/bulk_ipsc_rnaseq_samples.csv"))

sample_sheet_df_neural |>
  dplyr::select(sample_id, cell_line, neural_diff, genotype) |>
  write_csv(paste0(table_output_dir, "/bulk_neural_rnaseq_samples.csv"))

# Long-format VST counts of chr21 genes in the neural samples. For every gene,
# `mean_1985_count` is the mean VST count over the 198_5 samples and
# `row_norm_count` is the sample's VST count minus that mean.
chr21_rownorm_vsd <- batch_corrected_vsd_df_neural |>
  pivot_longer(cols = -ensembl_id, names_to = "sample_id", values_to = "vst_count") |>
  left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) |>
  left_join(sample_sheet_df_neural, by = c("sample_id")) |>
  filter(chromosome_name == "21") |>
  group_by(ensembl_id) |>
  mutate(
    # Subset the group's vectors directly instead of going through
    # cur_data(), which is deprecated since dplyr 1.1.0. `%in%` treats a
    # missing cell_line as "not 198_5", exactly as filter() did.
    row_norm_count = vst_count - mean(vst_count[cell_line %in% "198_5"]),
    mean_1985_count = mean(vst_count[cell_line %in% "198_5"])
  ) |>
  ungroup()


# complexheatmap for neurons
# Heatmap input: rRNA and TEC genes are removed and the remaining biotypes
# are collapsed to three display labels. Biotypes outside those three levels
# become NA in the factor.
chr21_rownorm_hm_df <- chr21_rownorm_vsd |>
  # an additional `mean_1985_count > 8` filter is currently disabled
  filter(!(gene_biotype %in% c("rRNA", "TEC"))) |>
  mutate(
    gene_biotype = factor(
      case_when(
        gene_biotype == "protein_coding" ~ "Protein\nCoding",
        gene_biotype %in% c(
          "processed_pseudogene",
          "transcribed_processed_pseudogene"
        ) ~ "Pseudo-\ngene",
        TRUE ~ gene_biotype
      ),
      levels = c("Protein\nCoding", "lncRNA", "Pseudo-\ngene")
    )
  )

# Gene x sample table of row-normalised counts, with ensembl_id as row names.
neural_rownorm_mat <- chr21_rownorm_hm_df |>
  pivot_wider(
    id_cols = ensembl_id,
    names_from = sample_id,
    values_from = row_norm_count
  ) |>
  column_to_rownames(var = "ensembl_id")
# Colour ramp for the per-gene mean count annotation: white at 8, purple at 16.
t21count_col_fun <- circlize::colorRamp2(c(8, 16), c("#ffffff", "#7105E0"))

# Row annotation showing each gene's `mean_1985_count`.
gene_annotation <- rowAnnotation(
  `Mean T21 Count` = anno_simple(
    x = chr21_rownorm_hm_df |>
      dplyr::select(ensembl_id, mean_1985_count) |>
      dplyr::distinct() |>
      deframe(),
    col = t21count_col_fun,
    simple_anno_size = unit(3, "mm")
  ),
  annotation_name_gp = gpar(fontsize = 7)
)

# Legend for the annotation above, built by hand and passed to draw()
# separately.
lgd_t21count <- Legend(
  title = "Mean T21 Count",
  col_fun = t21count_col_fun,
  at = c(8, 10, 12, 14, 16),
  labels = c(8, 10, 12, 14, 16),
  direction = "horizontal",
  title_gp = gpar(fontsize = 7),
  labels_gp = gpar(fontsize = 7),
  grid_height = unit(2, "mm")
)


# Column annotation for the neural heatmap: cell line (relabelled for display)
# and neural differentiation round, one value per distinct sample_id in the
# order the samples first appear in `chr21_rownorm_hm_df`.
sample_annotation <- columnAnnotation(
  `Cell Line` = chr21_rownorm_hm_df |>
    dplyr::select(sample_id, cell_line) |>
    dplyr::distinct() |>
    mutate(
      # cell lines not listed here become NA
      cell_line = case_when(
        cell_line == "198_1" ~ "198-1",
        cell_line == "198_2" ~ "198-2",
        cell_line == "198_5" ~ "198-5",
        cell_line == "C1_8" ~ "T21XIST c1",
        cell_line == "C4_8" ~ "T21XIST c4",
        cell_line == "C7_9" ~ "T21XIST c7"
      )
    ) |>
    # plain unnamed vector of labels
    dplyr::pull(cell_line),
  `Diff Round` = chr21_rownorm_hm_df |>
    dplyr::select(sample_id, neural_diff) |>
    dplyr::distinct() |>
    dplyr::pull(neural_diff) |>
    factor(levels = c("ND6", "ND7", "ND9", "ND11", "ND12", "ND18", "ND20")),
  col = list(
    `Cell Line` = c(
      "198-1" = "#1697a6", "198-2" = "#0e606b", "198-5" = "#ffc24b",
      "T21XIST c1" = "#fff4f1", "T21XIST c4" = "#ffb3ae", "T21XIST c7" = "#f47068"
    ),
    `Diff Round` = c(
      "ND6" = "#ffadad", "ND7" = "#ffd6a5", "ND9" = "#fdffb6",
      "ND11" = "#caffbf", "ND12" = "#9bf6ff", "ND18" = "#a0c4ff",
      "ND20" = "#bdb2ff"
    )
  ),
  simple_anno_size = unit(3, "mm"),
  annotation_name_gp = gpar(fontsize = 7),
  annotation_legend_param = list(
    labels_gp = gpar(fontsize = 7),
    title_gp = gpar(fontsize = 7),
    grid_height = unit(2, "mm"),
    grid_width = unit(2, "mm")
  )
)

# Heatmap of row-normalised VST counts (genes x samples) for the neural
# samples, written to expression_relative_to_1985_neural.pdf.
# Rows are not clustered: they are ordered by mean_1985_count (highest first)
# and split by gene biotype. Columns are split into sample groups
# (T21, -dox, +dox, w/d, D21) in that fixed order.
neural_hm <- Heatmap(neural_rownorm_mat,
                   # Diverging scale: blue at -1, near-white at 0, orange-red at +1.
                   col=circlize::colorRamp2(c(-1, 0, 1), c("#3784AF", "#f7f7f7", "#D34B16")),
                   # col=circlize::colorRamp2(c(-1, 0, 1), c("#0571B0", "#f7f7f7", "#CA0020")),
                   name="Row Norm VST Count",
                   border=T,
                   cluster_rows = F,
                   # Gene ids sorted by ascending mean_1985_count, de-duplicated, then reversed.
                   row_order = chr21_rownorm_hm_df |> dplyr::select(ensembl_id, mean_1985_count) |> deframe() |> sort() |> names() |> unique() |> rev(),
                   # Named vector gene -> biotype label; one heatmap slice per biotype.
                   row_split = chr21_rownorm_hm_df |> 
                     dplyr::select(ensembl_id, gene_biotype) |> 
                     distinct() |> 
                     deframe(),
                   row_title_side = "right",
                   row_gap = unit(1, "mm"),
                   show_row_names = F,
                   row_title_gp = gpar(fontsize = 7),
                   row_title_rot = 45,
                   column_title_gp = gpar(fontsize = 7),
                   column_names_gp = gpar(fontsize = 7),
                   left_annotation = gene_annotation,
                   show_column_dend=F,
                   # Sample group per column. The branches are tried in order:
                   # T21XIST with a non-missing description.y other than "wd" -> "+dox",
                   # T21XIST with dox_status "no" -> "-dox", T21XIST with
                   # description.y "wd" -> "w/d"; everything else keeps its genotype.
                   column_split = chr21_rownorm_hm_df |> 
                     mutate(split_label = case_when(
                       genotype == "T21XIST" & !is.na(description.y) & description.y != "wd" ~ "+dox",
                       genotype == "T21XIST" & dox_status == "no" ~ "-dox",
                       genotype == "T21XIST" & description.y == "wd" ~ "w/d",
                       T ~ genotype)) |> 
                     dplyr::select(sample_id, split_label) |> 
                     distinct() |> 
                     deframe() |> 
                     factor(levels=c("T21", "-dox", "+dox", "w/d", "D21")),
                   # Keep the column slices in the factor-level order given above.
                   cluster_column_slices=F,
                   column_gap = unit(1, "mm"),
                   show_column_names = F,
                   bottom_annotation = sample_annotation,
                   heatmap_legend_param = list(direction = "horizontal", 
                                               labels_gp=gpar(fontsize=7), 
                                               title_gp=gpar(fontsize=7),
                                               grid_height = unit(2, "mm")))
# Render to PDF; the hand-built "Mean T21 Count" legend is added next to the
# automatic legends and everything is placed below the heatmap.
pdf(sprintf("%s/expression_relative_to_1985_neural.pdf", fig_output_dir), width = 3.5, height = 4.7)
draw(neural_hm, annotation_legend_list = list(lgd_t21count), 
     merge_legend=T, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
dev.off()          

# Column names "<statistic>_<comparison>" for every combination of the five
# statistics and the three neural comparisons: cross() enumerates the pairs
# and lift(paste0) pastes each pair together.
chr21_changes_comparison_cols <- purrr::cross(list(
  c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"), 
  c("neural_t21_d21", "neural_t21xistwd_t21", "neural_t21xistdox_t21")
)) |> map(lift(paste0)) |> unlist()

# Chromosome-21 genes in long form: one row per gene and comparison, with the
# statistics (baseMean, log2FoldChange, pvalue, padj, stat) as columns.
# names_pattern splits each column name at its first underscore into the
# statistic ("value_type") and the comparison name.
chr21_changes_df <- combined_results_df |>
  dplyr::filter(chromosome_name == "21") |>
  pivot_longer(
    cols = c(ends_with(chr21_changes_comparison_cols)),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  dplyr::mutate(
    # log2FoldChange = log2FoldChange_dox_t21xist_vs_1985,
    # padj = padj_dox_t21xist_vs_1985,
    # baseMean = baseMean_dox_t21xist_vs_1985,
    # Colour category, first match wins: significant (padj <= .1) with a fold
    # change within 3/2 +/- .1, then within 3/2 +/- .15, then any significant
    # gene; everything else (including missing padj) is "padj>.1".
    color_label = case_when(
      padj <= .1 & dplyr::between(log2FoldChange, log2((3 / 2) - .1), log2((3 / 2) + .1)) ~ "padj≤.1 & FC=(3/2)±.1",
      padj <= .1 & dplyr::between(log2FoldChange, log2((3 / 2) - .15), log2((3 / 2)+.15)) ~ "padj≤.1 & FC=(3/2)±.15",
      padj <= .1 ~ "padj≤.1",
      T ~ "padj>.1"
    )
  )

# HSA21 dosage plot: log2 fold change of every chromosome-21 gene, ordered
# along the chromosome by start_position, with one facet row per neural
# comparison. Point colour is the significance / fold-change category and
# point area the base mean expression (squished at 10000).
ggplot(
  chr21_changes_df |> 
    mutate(
      # chr21_changes_df holds the neural_* comparison names (the suffixes of
      # the columns it was pivoted from), so those are the keys to relabel.
      # The former iPSC keys ("dT21XIST_vs_T21", "wdT21XIST_vs_T21") never
      # matched, which left the raw column suffixes as facet labels.
      comparison = case_when(
        comparison == "neural_t21xistdox_t21" ~ "+dox",
        comparison == "neural_t21xistwd_t21" ~ "w/d",
        TRUE ~ comparison
      ),
      # Keep the facet rows in their previous order: the remaining
      # (unrelabelled) comparison first, then +dox, then w/d.
      comparison = fct_relevel(comparison, "+dox", "w/d", after = Inf)
    ),
  aes(x = fct_reorder(ensembl_id, start_position), y = log2FoldChange, color = color_label, size = baseMean)
) +
  facet_grid(rows = vars(comparison)) +
  
  scale_color_manual(
    values = c(pal_d3()(3)[c(1, 3, 2)], pal_uchicago()(2)[2]), limits = c("padj≤.1 & FC=(3/2)±.1", "padj≤.1 & FC=(3/2)±.15", "padj≤.1", "padj>.1")
  ) +
  scale_size_area(limits = c(NA, 10000), oob = scales::squish, guide = "none", max_size = 4) +
  labs(
    title = "Neural HSA21 Excess Dosage",
    color = "Gene Category",
    size = "Base Mean Expression",
    x = "HSA21 Genes",
    y = "log2(FC)"
  ) +
  # Reference lines: fold changes of 3/2 and 2/3 (dashed) and no change (solid).
  geom_hline(yintercept = log2(3 / 2), linetype = "dashed", size = .3) +
  geom_hline(yintercept = log2(2 / 3), linetype = "dashed", size = .3) +
  geom_hline(yintercept = 0, linetype = "solid") +
  # Vertical marker at the position of one specific gene on the discrete x axis.
  geom_vline(xintercept = "ENSG00000157540.22", linetype = "dotdash", size = .3) +
  geom_point(alpha = .5) +
  guides(color=guide_legend(ncol=2, label.position = "right", label.hjust = .1)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "top",
    # legend.spacing.x = unit(3, "mm"),
    # legend.margin = margin(3,3,3,3, "mm"),
    legend.box.background = element_rect(color = "black", size=.5)
  ) +
  # Zoom only; genes outside +/-1.5 stay in the data but are not drawn in view.
  coord_cartesian(ylim = c(-1.5, 1.5))
ggsave(sprintf("%s/hsa21_dosage_correction_neural.pdf", fig_output_dir), width = 3.25, height = 2)

# Density of the chromosome-21 log2 fold changes, one curve per comparison
# (raw comparison names; the relabelling and per-comparison facets that were
# once applied here are disabled). Vertical guides mark no change (solid) and
# fold changes of 3/2 and 2/3 (dashed). The x range is a zoom, not a filter.
ggplot(chr21_changes_df, aes(x = log2FoldChange, colour = comparison)) +
  geom_density() +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = log2(c(3 / 2, 2 / 3)), linetype = "dashed") +
  coord_cartesian(xlim = c(-2, 2)) +
  labs(
    x = "log2(FC)",
    title = "Log2(FC) Neural T21/D21"
  )
ggsave(
  sprintf("%s/log2fc_dist_neural.pdf", fig_output_dir),
  width = 3,
  height = 1.7
)


# Combine two equal-length vectors of p-values element by element.
#
# For every position i, metap::sumlog() is applied to the pair (x[i], y[i])
# and the combined p-value (the `p` element of its result) is kept.
#
# Arguments:
#   x, y  numeric vectors of p-values; they must have the same length.
# Returns:
#   A numeric vector of combined p-values with the same length as `x`
#   (numeric(0) for empty input).
# Errors:
#   Stops with "lengths of lists must be the same" when the lengths differ.
vectorized_sumlog <- function(x, y) {
  if (length(x) != length(y)) {
    stop("lengths of lists must be the same")
  }
  # seq_along() is safe for empty input (1:length(x) would iterate over 1, 0),
  # and vapply() fills a pre-sized result instead of growing it with c().
  vapply(
    seq_along(x),
    function(i) as.numeric(metap::sumlog(c(x[i], y[i]))$p),
    numeric(1)
  )
}
  
# Scatter plot: log2 fold change of each comparison (x) against the reference
# comparison named by diff_expr_compare_to (y), one facet per comparison.
# Names of the reference columns used on the y axis and for the combined p-value.
compared_to_log2FC <- sprintf("log2FoldChange_%s", diff_expr_compare_to)
compared_to_padj <- sprintf("padj_%s", diff_expr_compare_to)
# compared_from_suffixes <- sprintf("%s_%s", c("baseMean", "log2FoldChange", "pvalue", "padj", "stat"), diff_expr_comparisons)
# Overrides the diff_expr_comparisons set at the top of the script.
diff_expr_comparisons <- c("dox_effect", "dwdT21SIT_vs_T21")
# All "<statistic>_<comparison>" column names for the selected comparisons.
compared_from_suffixes <- purrr::cross(list(
  c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"), 
  diff_expr_comparisons
  )) |> map(lift(paste0)) |> unlist()
        
  
# Long form (one row per gene and comparison), then a combined p-value per row
# from the reference padj and the comparison's padj via vectorized_sumlog().
# In the layers below, `. %>% filter(...)` is a function that ggplot2 applies
# to the plot data for that layer only: grey points for combined_pval > .1,
# coloured points, regression line and Spearman correlation for <= .1.
combined_results_df |>
      filter(hgnc_symbol != "XIST") |>
      pivot_longer(
          cols = c(ends_with(compared_from_suffixes)),
          names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
      ) |>
      pivot_wider(names_from = "value_type", values_from = "value") |>
      dplyr::mutate(comparison = case_when(comparison == "dox_effect" ~ "Dox Effect",
                                           comparison == "dwdT21SIT_vs_T21" ~ "XIST Correction")) |> 
      dplyr::mutate(combined_pval := vectorized_sumlog(!!as.name(compared_to_padj), padj)) %>%
      # filter(combined_pval <= diff_expr_combined_padj_cutoff) |>
      ggplot(aes(x = log2FoldChange, y = !!as.name(compared_to_log2FC))) +
      facet_grid(cols = vars(comparison)) +
      # Colour encodes -log10(combined_pval), clamped to the range 1..5.
      scale_color_viridis_c(limits = c(1, 5), oob = scales::squish) +
      geom_hline(yintercept = 0, size = .3) +
      geom_vline(xintercept = 0, size = .3) +
      geom_point(data = . %>% filter(combined_pval > .1),color="lightgrey", size = 1.5, alpha = .5) +
      geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
      # geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
      # geom_smooth(method = "lm", color = "black", size = .3, linetype = "dashed") +
      geom_smooth(data = . %>% filter(combined_pval <= .1), method = "lm", color = "black", size = .3, linetype = "dashed") +
      # Zoom only: points outside +/-5 still contribute to the fit and correlation.
      coord_cartesian(xlim = c(-5, 5), ylim = c(-5, 5)) +
      # ggpubr::stat_cor(method = "spearman", label.x = -4, label.y = 4, size = 3) +
      ggpubr::stat_cor(data = . %>% filter(combined_pval <= .1), method = "spearman", label.x = -4, label.y = 4, size = 3) +
      labs(
          y = "log2FC(T21 vs. D21)",
          x = "log2FC",
          title = "Correlation with T21-driven Gene Expression",
          color = "-log10\n(c.padj)"
      ) +
      theme(
          legend.position = "top",
          legend.margin = margin(l = -5, b=-3, unit = "mm"),
          legend.title = element_text(margin = margin(r = 2, unit = "mm")),
          legend.key.height = unit(3, "mm"),
          legend.key.width = unit(3.5, "mm")
                )
# ggsave() is called without a plot argument, so both calls below write the
# plot built above, under two file names.
# ggsave(sprintf("%s/rescue_of_t21_dysregulated_gene_expression.pdf", fig_output_dir), width=3, height = 4.125)
ggsave(sprintf("%s/rescue_of_t21_ipsc.pdf", fig_output_dir), width=4.1, height = 3.3)
ggsave(sprintf("%s/rescue_of_t21_ipsc_sigcoor.pdf", fig_output_dir), width=4.1, height = 3.3)
# ggsave(sprintf("%s/rescue_of_t21_dysregulated_gene_expression_horizontal.pdf", fig_output_dir), width=4, height = 4.125-.5)
# ggsave(sprintf("%s/rescue_of_t21_dysregulated_gene_expression_horizontal_sigcoor.pdf", fig_output_dir), width=4, height = 4.125-.5)
# ggsave(sprintf("%s/rescue_of_t21_dysregulated_gene_expression_vertical.pdf", fig_output_dir), width=3, height = 4.125)
# ggsave(sprintf("%s/rescue_of_t21_dysregulated_gene_expression_vertical_sigcoor.pdf", fig_output_dir), width=3, height = 4.125)

# Same scatter plot as the rescue figure above, but with chromosome-21 genes
# removed and a third comparison (dwdT21SIT_vs_D21) added.
compared_to_log2FC <- sprintf("log2FoldChange_%s", diff_expr_compare_to)
compared_to_padj <- sprintf("padj_%s", diff_expr_compare_to)
# compared_from_suffixes <- sprintf("%s_%s", c("baseMean", "log2FoldChange", "pvalue", "padj", "stat"), diff_expr_comparisons)
diff_expr_comparisons <- c("dox_effect", "dwdT21SIT_vs_T21", "dwdT21SIT_vs_D21")
# All "<statistic>_<comparison>" column names for the selected comparisons.
compared_from_suffixes <- purrr::cross(list(
  c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"), 
  diff_expr_comparisons
)) |> map(lift(paste0)) |> unlist()

# XIST and chromosome-21 genes are dropped before reshaping, so the identical
# conditions repeated inside the geom_smooth()/stat_cor() filters below do not
# remove anything further.
combined_results_df |>
  filter(hgnc_symbol != "XIST" & chromosome_name != "21") |>
  pivot_longer(
    cols = c(ends_with(compared_from_suffixes)),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  # Display labels; the factor levels fix the left-to-right facet order.
  dplyr::mutate(comparison = factor(case_when(comparison == "dox_effect" ~ "Dox Effect",
                                       comparison == "dwdT21SIT_vs_T21" ~ "T21XIST/T21",
                                       comparison == "dwdT21SIT_vs_D21" ~ "T21XIST/D21"),
                                    levels = c("Dox Effect", "T21XIST/T21","T21XIST/D21"))) |> 
  # Combined p-value per row from the reference padj and the comparison's padj.
  dplyr::mutate(combined_pval := vectorized_sumlog(!!as.name(compared_to_padj), padj)) %>%
  # filter(combined_pval <= diff_expr_combined_padj_cutoff) |>
  ggplot(aes(x = log2FoldChange, y = !!as.name(compared_to_log2FC))) +
  facet_grid(cols = vars(comparison)) +
  scale_color_viridis_c(limits = c(1, 5), oob = scales::squish) +
  geom_hline(yintercept = 0, size = .3) +
  geom_vline(xintercept = 0, size = .3) +
  # `. %>% filter(...)` is a function applied to the plot data for that layer.
  geom_point(data = . %>% filter(combined_pval > .1),color="lightgrey", size = 1.5, alpha = .5) +
  geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
  # geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
  geom_smooth(data = . %>% filter(combined_pval <= .1 & hgnc_symbol != "XIST" & chromosome_name != "21"), method = "lm", color = "black", size = .3, linetype = "dashed") +
  # geom_smooth(method = "lm", color = "black", size = .3, linetype = "dashed") +
  coord_cartesian(xlim = c(-5, 5), ylim = c(-5, 5)) +
  ggpubr::stat_cor(data = . %>% filter(combined_pval <= .1 & hgnc_symbol != "XIST" & chromosome_name != "21"), method = "spearman", label.x = -4, label.y = 4, size = 3) +
  # ggpubr::stat_cor(method = "spearman", label.x = -4, label.y = 4, size = 3) +
  labs(
    y = "log2FC(T21 vs. D21)",
    x = "log2FC",
    title = "Correlation with T21-driven Gene Expression\nexcluding Chr21",
    color = "-log10\n(c.padj)"
  ) +
  theme(
    legend.position = "top",
    legend.margin = margin(l = -5, b=-3, unit = "mm"),
    legend.title = element_text(margin = margin(r = 2, unit = "mm")),
    legend.key.height = unit(3, "mm"),
    legend.key.width = unit(3.5, "mm")
  )
# All three calls save the plot built above (no plot argument is given); only
# the file name and, for the last one, the page size differ.
ggsave(sprintf("%s/rescue_of_t21_ipsc_nochr21.pdf", fig_output_dir), width=3.55, height = 3.3)
ggsave(sprintf("%s/rescue_of_t21_ipsc_nochr21_witht21xistd21.pdf", fig_output_dir), width=3.55, height = 3.3)
ggsave(sprintf("%s/rescue_of_t21_ipsc_nochr21_witht21xistd21_sigcoor.pdf", fig_output_dir), width=4.5, height = 2.7)


# Scatter plot of the neural T21/D21 log2 fold changes (x) against the
# reference comparison named by diff_expr_compare_to (y, labelled as iPSC).
compared_to_log2FC <- sprintf("log2FoldChange_%s", diff_expr_compare_to)
compared_to_padj <- sprintf("padj_%s", diff_expr_compare_to)
# compared_from_suffixes <- sprintf("%s_%s", c("baseMean", "log2FoldChange", "pvalue", "padj", "stat"), diff_expr_comparisons)
# NOTE(review): other sections spell this comparison in lower case
# ("neural_t21_d21"). ends_with() ignores case by default, so the columns are
# still selected, but if the real column names are lower case the `comparison`
# value will not equal "neural_T21_D21" and the relabel below yields NA.
# That is harmless here only because the facet is disabled - confirm.
diff_expr_comparisons <- c("neural_T21_D21")
compared_from_suffixes <- purrr::cross(list(
  c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"), 
  diff_expr_comparisons
)) |> map(lift(paste0)) |> unlist()


# Long form, combined p-value per gene (reference padj + neural padj), then
# grey points for combined_pval > .1 and coloured points, regression line and
# Spearman correlation for combined_pval <= .1.
combined_results_df |>
  filter(hgnc_symbol != "XIST") |>
  pivot_longer(
    cols = c(ends_with(compared_from_suffixes)),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  dplyr::mutate(comparison = case_when(comparison == "neural_T21_D21" ~ "Neural T21/D21")) |> 
  dplyr::mutate(combined_pval := vectorized_sumlog(!!as.name(compared_to_padj), padj)) %>%
  # filter(combined_pval <= diff_expr_combined_padj_cutoff) |>
  ggplot(aes(x = log2FoldChange, y = !!as.name(compared_to_log2FC))) +
  # facet_grid(cols = vars(comparison)) +
  scale_color_viridis_c(limits = c(1, 5), oob = scales::squish) +
  geom_hline(yintercept = 0, size = .3) +
  geom_vline(xintercept = 0, size = .3) +
  # `. %>% filter(...)` is a function applied to the plot data for that layer.
  geom_point(data = . %>% filter(combined_pval > .1),color="lightgrey", size = 1.5, alpha = .5) +
  geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
  # geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
  # geom_smooth( method = "lm", color = "black", size = .3, linetype = "dashed") +
  geom_smooth(data = . %>% filter(combined_pval <= .1), method = "lm", color = "black", size = .3, linetype = "dashed") +
  coord_cartesian(xlim = c(-5, 5), ylim = c(-5, 5)) +
  # ggpubr::stat_cor( method = "spearman", label.x = -4, label.y = 4, size = 3) +
  ggpubr::stat_cor(data = . %>% filter(combined_pval <= .1), method = "spearman", label.x = -4, label.y = 4, size = 3) +
  labs(
    y = "log2FC(iPSC T21 vs. D21)",
    x = "log2FC(Neural T21 vs. D21)",
    title = "iPSC and Neural T21 Overlap",
    color = "-log10\n(c.padj)"
  ) +
  theme(
    legend.position = "top",
    legend.margin = margin(l = -5, b=-3, unit = "mm"),
    legend.title = element_text(margin = margin(r = 2, unit = "mm")),
    legend.key.height = unit(3, "mm"),
    legend.key.width = unit(3.5, "mm")
  )
# Both calls save the plot built above under two file names.
ggsave(sprintf("%s/neural_ipsc_gene_overlap.pdf", fig_output_dir), width=3, height = 3.1)
ggsave(sprintf("%s/neural_ipsc_gene_overlap_sigcoor.pdf", fig_output_dir), width=3, height = 3.1)

# Scatter plot for the neural data: log2 fold change of the combined
# T21XIST-vs-T21 comparison (x) against neural T21 vs D21 (y).
compared_to_log2FC <- sprintf("log2FoldChange_%s", "neural_t21_d21")
compared_to_padj <- sprintf("padj_%s", "neural_t21_d21")
# compared_from_suffixes <- sprintf("%s_%s", c("baseMean", "log2FoldChange", "pvalue", "padj", "stat"), diff_expr_comparisons)
diff_expr_comparisons <- c("neural_t21xistdoxcombined_t21")
compared_from_suffixes <- purrr::cross(list(
  c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"), 
  diff_expr_comparisons
)) |> map(lift(paste0)) |> unlist()


combined_results_df |>
  filter(hgnc_symbol != "XIST") |>
  pivot_longer(
    cols = c(ends_with(compared_from_suffixes)),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  # NOTE(review): neither key below equals the only selected comparison
  # ("neural_t21xistdoxcombined_t21"), so `comparison` ends up NA on every row.
  # Nothing downstream uses it while the facet stays disabled - confirm whether
  # a label for the combined comparison was intended.
  dplyr::mutate(comparison = case_when(comparison == "neural_t21xistdox_t21" ~ "+Dox",
                                       comparison == "neural_t21xistwd_t21" ~ "w/d")) |> 
  # Combined p-value per gene from the neural T21/D21 padj and this comparison's padj.
  dplyr::mutate(combined_pval := vectorized_sumlog(!!as.name(compared_to_padj), padj)) %>%
  # filter(combined_pval <= diff_expr_combined_padj_cutoff) |>
  ggplot(aes(x = log2FoldChange, y = !!as.name(compared_to_log2FC))) +
  # facet_grid(cols = vars(comparison)) +
  scale_color_viridis_c(limits = c(1, 5), oob = scales::squish) +
  geom_hline(yintercept = 0, size = .3) +
  geom_vline(xintercept = 0, size = .3) +
  # `. %>% filter(...)` is a function applied to the plot data for that layer.
  geom_point(data = . %>% filter(combined_pval > .1),color="lightgrey", size = 1.5, alpha = .5) +
  geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
  # geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
  # geom_smooth(method = "lm", color = "black", size = .3, linetype = "dashed") +
  geom_smooth(data = . %>% filter(combined_pval <= .1), method = "lm", color = "black", size = .3, linetype = "dashed") +
  # Wider view (+/-10) than the iPSC versions of this plot (+/-5).
  coord_cartesian(xlim = c(-10, 10), ylim = c(-10, 10)) +
  # ggpubr::stat_cor(method = "spearman", label.x = -4, label.y = 4, size = 3) +
  ggpubr::stat_cor(data = . %>% filter(combined_pval <= .1), method = "spearman", label.x = -9, label.y = 6, size = 3) +
  labs(
    y = "log2FC(Neural T21 vs. D21)",
    x = "log2FC (Neural T21XIST vs T21)",
    title = "Correlation with T21-driven Expression - Neural",
    color = "-log10\n(c.padj)"
  ) +
  theme(
    legend.position = "top",
    legend.margin = margin(l = -5, b=-3, unit = "mm"),
    legend.title = element_text(margin = margin(r = 2, unit = "mm")),
    legend.key.height = unit(3, "mm"),
    legend.key.width = unit(3.5, "mm")
  )
# ggsave(sprintf("%s/rescue_of_t21_neural.pdf", fig_output_dir), width=3, height = 3.3)
ggsave(sprintf("%s/rescue_of_t21_neural_sigcoor.pdf", fig_output_dir), width=3.5, height = 2.4)

# Copy of the rescue scatter plot that is built but never saved (no ggsave()
# follows it).
# NOTE(review): nothing is reassigned before this pipeline, so it reuses
# compared_to_log2FC, compared_to_padj and compared_from_suffixes from the
# neural section directly above. With those values the "dox_effect" /
# "dwdT21SIT_vs_T21" keys below never match (comparison is NA on every row,
# and the facet is built on that column), and the y axis shows the neural
# T21/D21 fold change although it is labelled "T21 vs. D21". This looks like a
# stale copy - confirm whether it should reset the iPSC settings or be removed.
combined_results_df |>
      filter(hgnc_symbol != "XIST") |>
      pivot_longer(
          cols = c(ends_with(compared_from_suffixes)),
          names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
      ) |>
      pivot_wider(names_from = "value_type", values_from = "value") |>
      dplyr::mutate(comparison = case_when(comparison == "dox_effect" ~ "Dox Effect",
                                           comparison == "dwdT21SIT_vs_T21" ~ "XIST Correction")) |> 
      dplyr::mutate(combined_pval := vectorized_sumlog(!!as.name(compared_to_padj), padj)) %>%
      # filter(combined_pval <= diff_expr_combined_padj_cutoff) |>
      ggplot(aes(x = log2FoldChange, y = !!as.name(compared_to_log2FC))) +
      facet_grid(cols = vars(comparison)) +
      scale_color_viridis_c(limits = c(1, 5), oob = scales::squish) +
      geom_hline(yintercept = 0, size = .3) +
      geom_vline(xintercept = 0, size = .3) +
      # `. %>% filter(...)` is a function applied to the plot data for that layer.
      geom_point(data = . %>% filter(combined_pval > .1),color="lightgrey", size = 1.5, alpha = .5) +
      geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
      # geom_point(data = . %>% filter(combined_pval <= .1), mapping = aes(color = -log10(combined_pval)), size = 1.5, alpha = .5) +
      # geom_smooth(method = "lm", color = "black", size = .3, linetype = "dashed") +
      geom_smooth(data = . %>% filter(combined_pval <= .1), method = "lm", color = "black", size = .3, linetype = "dashed") +
      coord_cartesian(xlim = c(-5, 5), ylim = c(-5, 5)) +
      # ggpubr::stat_cor(method = "spearman", label.x = -4, label.y = 4, size = 3) +
      ggpubr::stat_cor(data = . %>% filter(combined_pval <= .1), method = "spearman", label.x = -4, label.y = 4, size = 3) +
      labs(
          y = "log2FC(T21 vs. D21)",
          x = "log2FC",
          title = "Rescue of T21 Dysregulation",
          color = "-log10\n(c.padj)"
      ) +
      theme(
          legend.position = "top",
          legend.margin = margin(l = -5, b=-3, unit = "mm"),
          legend.title = element_text(margin = margin(r = 2, unit = "mm")),
          legend.key.height = unit(3, "mm"),
          legend.key.width = unit(3.5, "mm")
                )


# Density (scaled to counts) of the absolute log2 fold changes of significant
# genes (padj <= .1), one curve per comparison: T21/D21, XIST Correction and
# Dox Effect.
# All "<statistic>_<comparison>" column names for the three comparisons.
compared_from_suffixes <- purrr::cross(list(
  c("baseMean_", "log2FoldChange_", "pvalue_", "padj_", "stat_"), 
  c("dox_effect", "dwdT21SIT_vs_T21", "T21_vs_D21")
)) |> map(lift(paste0)) |> unlist()

combined_results_df |>
  filter(hgnc_symbol != "XIST") |>
  pivot_longer(
    cols = c(ends_with(compared_from_suffixes)),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  dplyr::mutate(comparison = case_when(comparison == "dox_effect" ~ "Dox Effect",
                                       comparison == "dwdT21SIT_vs_T21" ~ "XIST Correction",
                                       comparison == "T21_vs_D21" ~ "T21/D21"
                                       )) |> 
  # Factor levels fix the legend and drawing order.
  dplyr::mutate(comparison = factor(comparison, levels = c("T21/D21", "XIST Correction", "Dox Effect"))) %>%
  filter(padj <= .1 ) |>
  # NOTE(review): the x axis shows abs(log2FoldChange) but is labelled "log2FC".
  ggplot(aes(x = abs(log2FoldChange), color=comparison)) +
  # Filled curves first; only the two comparisons listed in scale_fill_manual()
  # receive a fill colour. after_stat(count) replaces the deprecated
  # `..count..` notation and plots the same computed variable.
  geom_density(aes(y = after_stat(count), fill=comparison)) +
  scale_fill_manual(limits=c("XIST Correction", "Dox Effect"), values = c("lightgrey", "white"), guide="none") +
  scale_color_aaas() +
  # Outlines drawn again on top so that no fill hides another curve's line.
  geom_density(aes(y = after_stat(count))) +
  coord_cartesian(xlim=c(0, 1)) +
  # Values above 1 are squished onto 1, so the last tick stands for ">= 1".
  # The label previously read "???1.00", a mis-encoded "≥".
  scale_x_continuous(limits = c(0, 1), oob = scales::squish, expand = c(0,0),
                     breaks = c(0,.25,.5,.75,1),
                     labels = c("0.00","0.25", "0.50", "0.75", "≥1.00")) +
  scale_y_continuous(expand=c(0,0))+
  labs(
    y = "scaled densities",
    x = "log2FC",
    title = "Significant Fold-Changes",
    color = "Diff. Expr"
  ) +
  theme(
    legend.position = "top",
    legend.margin = margin(l = -5, b=-3, unit = "mm"),
    legend.title = element_blank(),
    legend.key.height = unit(3, "mm"),
    legend.key.width = unit(3.5, "mm")
  )
ggsave(sprintf("%s/sig_fold_changes.pdf", fig_output_dir), width = 2.5, height = 2.8)
## GSEA Overlaps

# GSEA term categories in legend order, and the colour for each one. The two
# vectors are matched by position (first term -> first colour, and so on).
# Disabled entries, kept for reference: the categories "chr21", "ECM" and
# "Folate", and the colours "#ff6c92" (head of the list) and "#000000" (tail).
term_order <- c(
  "apoptosis", "serine", "stress", "ER",
  "mitochondria", "ribosome", "translation"
)

term_colors <- c(
  "#8e000c", "#da5c29", "#f5cf87", "#a90aff",
  "#017c2f", "#00208a", "#fe85f3"
)
  

# GSEA scatter plot: NES of the dwdT21SIT_vs_T21 comparison (x) against NES of
# the reference comparison named by gsea_compare_to (y), one point per term.
compared_to_NES <- sprintf("NES_%s", gsea_compare_to)
compared_to_padj <- sprintf("p.adjust_%s", gsea_compare_to)
# compared_from_suffixes <- sprintf("_%s", gsea_comparisons)
# Overrides the gsea_comparisons set at the top of the script.
gsea_comparisons <- c("dwdT21SIT_vs_T21")
# All "<statistic>_<comparison>" column names for the selected comparison.
compared_from_suffixes <- purrr::cross(list(
  c("NES_", "p.adjust_", "qvalues_", "enrichmentScore_", "setSize_", "leadingEdgeTags_", "leadingEdgeList_", "leadingEdgeSignal_", "numCoreEnrichment_"), 
  gsea_comparisons
  )) |> map(lift(paste0)) |> unlist()

gsea_combined_pval_cutoff <- gsea_combined_padj_cutoff

# Long form, combined p-value per term (reference p.adjust + comparison
# p.adjust via vectorized_sumlog()); only terms at or below the cutoff and
# outside the "chr21" category are plotted.
combined_enrichment_df |>
    # filter(hgnc_symbol != "XIST") |>
    pivot_longer(
        cols = c(ends_with(compared_from_suffixes)),
        names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
    ) |>
    pivot_wider(names_from = "value_type", values_from = "value") |>
    mutate(combined_pval = vectorized_sumlog(!!as.name(compared_to_padj), p.adjust)) |>
    filter(combined_pval <= gsea_combined_pval_cutoff & category != "chr21") |>
    # mutate(comparison = case_when(
    #     comparison == "wd_nd" ~ "w/d",
    #     comparison == "wd_nd_powered" ~ "w/d-p",
    #     comparison == "d_t21xist_t21" ~ "+dox",
    #     comparison == "dox" ~ "Dox Effect",
    #     comparison == "neural_t21_d21" ~ "Neural T21/D21"
    # )) %>%
    # NOTE(review): `ID` is not a standard ggplot2 aesthetic; presumably it is
    # carried along for interactive tooltips - confirm.
    ggplot(aes(x = NES, y = !!as.name(compared_to_NES), size = -log10(combined_pval), color = category, ID = ID)) +
    # facet_grid(cols = vars(factor(comparison, levels = gsea_comparisons))) +
    # "other" terms are drawn first so that the categorised terms sit on top.
    geom_point(data = . %>% filter(category == "other"), alpha = .5) +
    geom_point(data = . %>% filter(category != "other"), alpha = .7) +
    labs(
        title = "Rescue of Acute Enrichment Terms",
        y = "NES (iPSC T21/D21)", #sprintf("NES (%s)", gsea_compare_to),
        x = "NES (iPSC T21XIST/T21)",
        color = "Category"
    ) +
    # Category colours follow term_order/term_colors, with "other" in light grey.
    scale_color_manual(
        limits = c(term_order, "other"),
        values = c(term_colors, "lightgrey")
    ) +
    scale_size_area(max_size = 3.5, breaks = c(1, 2, 5, 10), name="-log10(c.padj)") +
    geom_vline(xintercept = 0, size = .3) +
    geom_hline(yintercept = 0, size = .3) +
  theme(legend.box = "horizontal")
# Both calls save the plot built above; the second uses a wider page.
ggsave(sprintf("%s/acute_enrrichment_rescue.pdf", fig_output_dir), width = 3, height = 2.3)
ggsave(sprintf("%s/acute_enrrichment_rescue_cpval guide.pdf", fig_output_dir), width = 4, height = 2.3)

# Neural counterpart of the GSEA rescue plot: NES of the combined neural
# T21XIST-vs-T21 comparison (x) against NES of neural T21 vs D21 (y).
compared_to_NES <- sprintf("NES_%s", "neural_t21_d21")
compared_to_padj <- sprintf("p.adjust_%s", "neural_t21_d21")
# compared_from_suffixes <- sprintf("_%s", gsea_comparisons)
gsea_comparisons <- c("neural_t21xistdoxcombined_t21")
# All "<statistic>_<comparison>" column names for the selected comparison.
compared_from_suffixes <- purrr::cross(list(
  c("NES_", "p.adjust_", "qvalues_", "enrichmentScore_", "setSize_", "leadingEdgeTags_", "leadingEdgeList_", "leadingEdgeSignal_", "numCoreEnrichment_"), 
  gsea_comparisons
)) |> map(lift(paste0)) |> unlist()

gsea_combined_pval_cutoff <- gsea_combined_padj_cutoff

# Long form, combined p-value per term (neural T21/D21 p.adjust + comparison
# p.adjust); only terms at or below the cutoff and outside the "chr21"
# category are plotted.
combined_enrichment_df |>
  # filter(hgnc_symbol != "XIST") |>
  pivot_longer(
    cols = c(ends_with(compared_from_suffixes)),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(combined_pval = vectorized_sumlog(!!as.name(compared_to_padj), p.adjust)) |>
  filter(combined_pval <= gsea_combined_pval_cutoff & category != "chr21") |>
  # mutate(comparison = case_when(
  #     comparison == "wd_nd" ~ "w/d",
  #     comparison == "wd_nd_powered" ~ "w/d-p",
  #     comparison == "d_t21xist_t21" ~ "+dox",
  #     comparison == "dox" ~ "Dox Effect",
  #     comparison == "neural_t21_d21" ~ "Neural T21/D21"
  # )) %>%
  # NOTE(review): `ID` is not a standard ggplot2 aesthetic; presumably it is
  # carried along for interactive tooltips - confirm.
  ggplot(aes(x = NES, y = !!as.name(compared_to_NES), size = -log10(combined_pval), color = category, ID = ID)) +
  # facet_grid(cols = vars(factor(comparison, levels = gsea_comparisons))) +
  # "other" terms are drawn first so that the categorised terms sit on top.
  geom_point(data = . %>% filter(category == "other"), alpha = .5) +
  geom_point(data = . %>% filter(category != "other"), alpha = .7) +
  labs(
    title = "Neural Rescue of Acute Enrichment Terms",
    y = "NES (Neural T21/D21)", #sprintf("NES (%s)", gsea_compare_to),
    x = "NES (Neural XIST/T21)",
    color = "Category"
  ) +
  # Category colours follow term_order/term_colors, with "other" in light grey.
  scale_color_manual(
    limits = c(term_order, "other"),
    values = c(term_colors, "lightgrey")
  ) +
  scale_size_area(max_size = 3.5, breaks = c(1, 2, 5, 10), name="-log10(c.padj)") +
  geom_vline(xintercept = 0, size = .3) +
  geom_hline(yintercept = 0, size = .3) +  
theme(legend.box = "vertical",
      legend.box.margin = margin(l=-5),
      legend.spacing.y = unit(.5, "mm"))
ggsave(sprintf("%s/acute_enrrichment_rescue_neural_2.pdf", fig_output_dir), width = 3.6, height = 2.5)

# Settings for the faceted iPSC-vs-neural GSEA plots that follow.
# Three assignments that used to open this section were removed because each
# was overwritten before being read: compared_to_NES / compared_to_padj built
# from gsea_compare_to (replaced by the literal versions at the end of this
# section) and a compared_from_suffixes built from the previous section's
# gsea_comparisons (replaced by the cross product below).

# Neural comparisons to show, in facet order.
gsea_comparisons <- c("neural_t21_d21", 
                      "neural_t21xistwd_t21", 
                      "neural_t21xistdox_t21",
                      "neural_t21xistdoxcombined_t21")
# All "<statistic>_<comparison>" column names for those comparisons.
compared_from_suffixes <- purrr::cross(list(
  c("NES_", "p.adjust_", "qvalues_", "enrichmentScore_", "setSize_", "leadingEdgeTags_", "leadingEdgeList_", "leadingEdgeSignal_", "numCoreEnrichment_"), 
  gsea_comparisons
)) |> map(lift(paste0)) |> unlist()

# Combined p-value threshold for terms to be plotted.
gsea_combined_pval_cutoff <- .2

# Reference comparison (y axis, and one half of the combined p-value).
compared_to_NES <- sprintf("NES_%s", "T21_vs_D21")
compared_to_padj <- sprintf("p.adjust_%s",  "T21_vs_D21")
# Faceted GSEA scatter plot: NES of each neural comparison in gsea_comparisons
# (x, one facet per comparison) against the reference NES column named by
# compared_to_NES (y).
# Columns are selected with ends_with(gsea_comparisons), i.e. every column
# whose name ends in one of the comparison names, rather than the explicit
# "<statistic>_<comparison>" list used in the sections above.
combined_enrichment_df |>
  # filter(hgnc_symbol != "XIST") |>
  pivot_longer(
    cols = c(ends_with(gsea_comparisons
      # purrr::cross(list(
      # c("NES_", "p.adjust_", "qvalues_", "enrichmentScore_", "setSize_", "leadingEdgeTags_", "leadingEdgeList_", "leadingEdgeSignal_", "numCoreEnrichment_"), 
      # gsea_comparisons
    # )
    # ) |> map(lift(paste0)) |> unlist()
    )),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  # Combined p-value per term and comparison; keep terms at or below the
  # cutoff and outside the "chr21" category.
  mutate(combined_pval = vectorized_sumlog(!!as.name(compared_to_padj), p.adjust)) |>
  filter(combined_pval <= gsea_combined_pval_cutoff & category != "chr21") |>
  # mutate(comparison = case_when(
  #     comparison == "wd_nd" ~ "w/d",
  #     comparison == "wd_nd_powered" ~ "w/d-p",
  #     comparison == "d_t21xist_t21" ~ "+dox",
  #     comparison == "dox" ~ "Dox Effect",
  #     comparison == "neural_t21_d21" ~ "Neural T21/D21"
  # )) %>%
  # NOTE(review): `ID` is not a standard ggplot2 aesthetic; presumably it is
  # carried along for interactive tooltips - confirm.
  ggplot(aes(x = NES, y = !!as.name(compared_to_NES), size = -log10(combined_pval), color = category, ID = ID)) +
  # Facets follow the order of gsea_comparisons.
  facet_grid(cols = vars(factor(comparison, levels = gsea_comparisons))) +
  # "other" terms are drawn first so that the categorised terms sit on top.
  geom_point(data = . %>% filter(category == "other"), alpha = .5) +
  geom_point(data = . %>% filter(category != "other"), alpha = .7) +
  # The x label is not set here, so the axis keeps the default taken from the mapping.
  labs(
    title = "iPSC and Neural Term Overlap",
    y = "NES (iPSC T21/D21)", #sprintf("NES (%s)", gsea_compare_to),
    # x = "NES (Neural T21/D21)",
    color = "Category"
  ) +
  scale_color_manual(
    limits = c(term_order, "other"),
    values = c(term_colors, "lightgrey")
  ) +
  scale_size_area(max_size = 3.5, guide = "none") +
  geom_vline(xintercept = 0, size = .3) +
  geom_hline(yintercept = 0, size = .3)  
# ggsave(sprintf("%s/acute_enrrichment_ipsc_neural.pdf", fig_output_dir), width = 3.25, height = 2.5)
ggsave(sprintf("%s/acute_enrrichment_ipsc_neural.pdf", fig_output_dir), width = 8, height = 2.5)

# Faceted GSEA scatter plot for the neural data: NES of each neural
# T21XIST-vs-T21 comparison (x, one facet per comparison) against NES of
# neural T21 vs D21 (y). Saved as neural_rescue.pdf.
compared_to_NES <- sprintf("NES_%s", "neural_t21_d21")
compared_to_padj <- sprintf("p.adjust_%s",  "neural_t21_d21")
combined_enrichment_df |>
  # filter(hgnc_symbol != "XIST") |>
  # Select every column whose name ends in one of the three comparison names.
  pivot_longer(
    cols = c(ends_with(c("neural_t21xistwd_t21", 
                         "neural_t21xistdox_t21",
                         "neural_t21xistdoxcombined_t21")
                       # purrr::cross(list(
                       # c("NES_", "p.adjust_", "qvalues_", "enrichmentScore_", "setSize_", "leadingEdgeTags_", "leadingEdgeList_", "leadingEdgeSignal_", "numCoreEnrichment_"), 
                       # gsea_comparisons
                       # )
                       # ) |> map(lift(paste0)) |> unlist()
    )),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  # Combined p-value per term and comparison (neural T21/D21 p.adjust +
  # comparison p.adjust); keep terms at or below the cutoff and outside the
  # "chr21" category.
  mutate(combined_pval = vectorized_sumlog(!!as.name(compared_to_padj), p.adjust)) |>
  filter(combined_pval <= gsea_combined_pval_cutoff & category != "chr21") |>
  # mutate(comparison = case_when(
  #     comparison == "wd_nd" ~ "w/d",
  #     comparison == "wd_nd_powered" ~ "w/d-p",
  #     comparison == "d_t21xist_t21" ~ "+dox",
  #     comparison == "dox" ~ "Dox Effect",
  #     comparison == "neural_t21_d21" ~ "Neural T21/D21"
  # )) %>%
  # NOTE(review): `ID` is not a standard ggplot2 aesthetic; presumably it is
  # carried along for interactive tooltips - confirm.
  ggplot(aes(x = NES, y = !!as.name(compared_to_NES), size = -log10(combined_pval), color = category, ID = ID)) +
  # Facet order follows gsea_comparisons as currently defined in the script.
  facet_grid(cols = vars(factor(comparison, levels = gsea_comparisons))) +
  # "other" terms are drawn first so that the categorised terms sit on top.
  geom_point(data = . %>% filter(category == "other"), alpha = .5) +
  geom_point(data = . %>% filter(category != "other"), alpha = .7) +
  # The y axis is NES_neural_t21_d21, so the title and y label name the neural
  # reference. They previously carried the iPSC wording copied from the
  # iPSC/neural overlap plot.
  labs(
    title = "Neural Rescue of Acute Enrichment Terms",
    y = "NES (Neural T21/D21)",
    # x = "NES (Neural T21/D21)",
    color = "Category"
  ) +
  scale_color_manual(
    limits = c(term_order, "other"),
    values = c(term_colors, "lightgrey")
  ) +
  scale_size_area(max_size = 3.5, guide = "none") +
  geom_vline(xintercept = 0, size = .3) +
  geom_hline(yintercept = 0, size = .3)  
# ggsave(sprintf("%s/acute_enrrichment_ipsc_neural.pdf", fig_output_dir), width = 3.25, height = 2.5)
ggsave(sprintf("%s/neural_rescue.pdf", fig_output_dir), width = 8, height = 2.5)

## Term selection: iPSC rescue (dwdT21SIT vs. T21) against T21 vs. D21 ------
## Keeps, per category, the three terms furthest from the origin among those
## whose NES has opposite signs in the two comparisons (quadrants 2 and 4).
## Ribosome terms whose ID contains "PRERIBOSOME" get their own category.
gsea_combined_pval_cutoff <- .1
selected_terms <- combined_enrichment_df |>
  # After this reshape the un-suffixed columns (NES, p.adjust, ...) hold the
  # dwdT21SIT_vs_T21 values.
  pivot_longer(
    cols = c(ends_with("_dwdT21SIT_vs_T21")),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(combined_pval = vectorized_sumlog(p.adjust_T21_vs_D21, p.adjust)) |>
  filter(combined_pval <= gsea_combined_pval_cutoff & category != "chr21") |>
  mutate(
    # TRUE (not the reassignable shorthand T) as the catch-all condition.
    category = case_when(
      category == "ribosome" & str_detect(ID, "PRERIBOSOME") ~ "preribosome",
      TRUE ~ category
    ),
    # Quadrant of (NES, NES_T21_vs_D21); rows with a zero or missing NES match
    # no branch and get NA.
    quadrant = case_when(
      NES > 0 & NES_T21_vs_D21 > 0 ~ 1,
      NES < 0 & NES_T21_vs_D21 > 0 ~ 2,
      NES < 0 & NES_T21_vs_D21 < 0 ~ 3,
      NES > 0 & NES_T21_vs_D21 < 0 ~ 4
    )
  ) |>
  filter(category != "other" & quadrant %in% c(2, 4)) |>
  mutate(dist_from_center = sqrt(NES^2 + NES_T21_vs_D21^2)) |>
  group_by(category) |>
  # arrange() ignores the grouping, so this is a global sort; row_number() is
  # then evaluated within each category.
  arrange(desc(dist_from_center)) |>
  filter(row_number() <= 3) |>
  ungroup() |>
  dplyr::select(ID, dist_from_center, category2 = category)

## Term selection across iPSC and neural data ------------------------------
## Keeps, per category, the three terms furthest from the origin among those
## that move the same way in both T21 vs. D21 comparisons (iPSC and neural)
## and the opposite way in the iPSC rescue (dwdT21SIT vs. T21).
selected_terms2 <- combined_enrichment_df |>
  # Combine the three adjusted p-values by nesting the pairwise helper.
  mutate(combined_pval = vectorized_sumlog(vectorized_sumlog(`p.adjust_neural_t21_d21`, `p.adjust_T21_vs_D21`), `p.adjust_dwdT21SIT_vs_T21`)) |>
  filter(combined_pval <= gsea_combined_pval_cutoff & category != "chr21") |>
  mutate(
    quadrant = case_when(
      NES_T21_vs_D21 > 0 & NES_neural_t21_d21 > 0 & NES_dwdT21SIT_vs_T21 < 0 ~ 2,
      NES_T21_vs_D21 < 0 & NES_neural_t21_d21 < 0 & NES_dwdT21SIT_vs_T21 > 0 ~ 4,
      # TRUE (not the reassignable shorthand T) as the catch-all condition.
      TRUE ~ 0
    )
  ) |>
  filter(category != "other" & quadrant %in% c(2, 4)) |>
  mutate(dist_from_center = sqrt(NES_T21_vs_D21^2 + NES_dwdT21SIT_vs_T21^2 + NES_neural_t21_d21^2)) |>
  group_by(category) |>
  # arrange() ignores the grouping, so this is a global sort; row_number() is
  # then evaluated within each category.
  arrange(desc(dist_from_center)) |>
  filter(row_number() <= 3) |>
  ungroup() |>
  dplyr::select(ID, dist_from_center, category2 = category)

## Top iPSC terms: per category, the three terms furthest from the origin
## whose NES in dwdT21SIT vs. T21 has the opposite sign to T21 vs. D21.
selected_terms_ipsc <- combined_enrichment_df |>
  # Un-suffixed columns (NES, p.adjust, ...) hold the dwdT21SIT_vs_T21 values
  # after this reshape.
  pivot_longer(
    cols = ends_with("_dwdT21SIT_vs_T21"),
    names_to = c("value_type", "comparison"),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)"
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(combined_pval = vectorized_sumlog(p.adjust_T21_vs_D21, p.adjust)) |>
  filter(
    combined_pval <= gsea_combined_pval_cutoff,
    category != "chr21",
    category != "other",
    # Opposite, non-zero signs: quadrants 2 and 4 of the NES-vs-NES plane.
    sign(NES) * sign(NES_T21_vs_D21) == -1
  ) |>
  mutate(dist_from_center = sqrt(NES^2 + NES_T21_vs_D21^2)) |>
  # Sort globally, then keep the first three rows seen in each category.
  arrange(desc(dist_from_center)) |>
  group_by(category) |>
  filter(row_number() <= 3) |>
  ungroup() |>
  dplyr::select(ID, dist_from_center, category2 = category)

## Top neural terms: per category, the three terms furthest from the origin
## whose NES in the combined neural T21XIST comparison has the opposite sign
## to neural T21 vs. D21.
selected_terms_neural <- combined_enrichment_df |>
  # Un-suffixed columns (NES, p.adjust, ...) hold the
  # neural_t21xistdoxcombined_t21 values after this reshape.
  pivot_longer(
    cols = ends_with("_neural_t21xistdoxcombined_t21"),
    names_to = c("value_type", "comparison"),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)"
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  mutate(combined_pval = vectorized_sumlog(p.adjust_neural_t21_d21, p.adjust)) |>
  filter(
    combined_pval <= gsea_combined_pval_cutoff,
    category != "chr21",
    category != "other",
    # Opposite, non-zero signs: quadrants 2 and 4 of the NES-vs-NES plane.
    sign(NES) * sign(NES_neural_t21_d21) == -1
  ) |>
  mutate(dist_from_center = sqrt(NES^2 + NES_neural_t21_d21^2)) |>
  # Sort globally, then keep the first three rows seen in each category.
  arrange(desc(dist_from_center)) |>
  group_by(category) |>
  filter(row_number() <= 3) |>
  ungroup() |>
  dplyr::select(ID, dist_from_center, category2 = category)

library(ggh4x)

## Dot plot of the `selected_terms2` terms across iPSC and neural comparisons.
## Colour encodes NES, point area encodes -log10(p.adjust); terms are faceted
## by category.

# Comparison key -> axis label, in left-to-right plotting order. Keeping key
# and label side by side guarantees every column gets its own label (the
# previous positional `labels` vector was in a different order than `limits`,
# so three of the four columns were mislabelled).
enriched_pathway_comparisons <- c(
  "neural_t21_d21" = "Neural T21 vs. D21",
  "T21_vs_D21" = "iPSC T21 vs. D21",
  "dwdT21SIT_vs_T21" = "iPSC T21XIST vs. T21",
  "neural_t21xistdoxcombined_t21" = "Neural T21XIST vs. T21"
)

combined_enrichment_df |>
  inner_join(selected_terms2, by = "ID") |>
  dplyr::select(-starts_with("combined_pval_")) |>
  # One row per term x comparison, one column per value type.
  pivot_longer(
    cols = starts_with(c("NES_", "p.adjust_", "qvalues_", "enrichmentScore_", "setSize_", "leadingEdgeTags_", "leadingEdgeList_", "leadingEdgeSignal_", "numCoreEnrichment_")),
    names_pattern = "([A-Za-z\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  filter(comparison %in% names(enriched_pathway_comparisons)) |>
  mutate(
    # Shorten term names for the axis. A named vector is applied pattern by
    # pattern in the order given, so later entries see the output of earlier
    # ones (e.g. "COTRANSAL" only exists after "TRANSLATION" -> "TRANS").
    ID = str_replace_all(ID, c(
      "_" = " ",
      "PROCESS" = "PROC",
      "POSITIVE" = "(+)",
      "REGULATION" = "REG",
      "FAMILY" = "FAM",
      "OXIDATIVE" = "OXI",
      "LOCALIZATION" = "LOCAL",
      "MITOCHONDRIAL" = "MITO",
      "CYTOCHROME C" = "CYT C",
      "DEPENDENT" = "DEP",
      "TRANSLATION" = "TRANS",
      "CHEMICAL" = "CHEM",
      "RIBOSOME" = "RIBO",
      "PROTEIN" = "PROT",
      "ESTABLISHMENT" = "ESTAB",
      "TRANSCRIPTIONAL" = "TRANSCRIP",
      "TARGETING" = "TARGET",
      # Strip the gene-set collection prefix. "GOMF " now carries its trailing
      # space like the others, so no leading blank is left behind.
      "GOBP |WP |GOCC |REACTOME |GOMF " = "",
      "RIBOSOMAL" = "RIBO",
      "CYTOPLASMIC|CYTOSOLIC" = "CYTO",
      "ENDOPLASMIC RETICULUM" = "ER",
      "EUKARYOTIC" = "EUKAR",
      "EXTRINSIC" = "EXT",
      "TARGET" = "TRGT",
      "MEMBRANE" = "MEM",
      "COMPONENT" = "COMP",
      "PHYTO" = "PHYT",
      "AMINO ACID" = "AA",
      "SMOOTH MUSCLE CELL" = "SMC",
      "NRF2 TRANSCRIP" = "NRF2",
      "APOPTOTIC" = "APOP",
      "DISEASES" = "DIS",
      "PROGRAMMED" = "PROG",
      "BIOSYNTHETIC" = "BIOSYNTH",
      "ACTIVITY" = "ACT",
      "SENESCENCE" = "SENSNCE",
      "COTRANSAL" = "COTRANS",
      "SMALL" = "SML",
      "SUBUNIT" = "SUB",
      "PRECURSOR" = "PRECUR",
      "ACTIVATION" = "ACTIV",
      "APOPTOSIS" = "APOP",
      "PATHWAY" = "PATH",
      "SIGNALING" = "SIG",
      "RESPONSE" = "RESP",
      "MULTICELLULAR" = "MULTICELL",
      "RESPIRATORY" = "RESP",
      "RELATED" = "REL",
      "INDUCER" = "IND",
      "ENDOPEPTIDASE" = "ENDOPEP",
      "ORGANISMAL" = "ORGNSML",
      "NETWORK" = "NTWRK",
      "ALTERED" = "ALT",
      "OVARIAN" = "OVRIN",
      "CANCER" = "CNCR"
    ))
  ) |>
  ggplot(aes(x = comparison, y = fct_reorder(ID, dplyr::desc(dist_from_center)), color = NES, size = -log10(p.adjust))) +
  geom_point() +
  ggh4x::facet_grid2(
    rows = vars(factor(category2,
      levels = c("apoptosis", "serine", "stress", "ER", "ribosome", "mitochondria", "translation")
    )),
    scales = "free", space = "free", strip = ggh4x::strip_nested(clip = "off")
  ) +
  scale_color_gradient2(high = scales::muted("red"), low = scales::muted("blue")) +
  # Column order is unchanged from the previous version; only the labels are
  # now tied to the right comparison.
  scale_x_discrete(
    limits = names(enriched_pathway_comparisons),
    labels = unname(enriched_pathway_comparisons)
  ) +
  theme(
    axis.text.x = element_text(angle = 65, hjust = 1),
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 6),
    axis.title.y = element_blank(),
    legend.position = c(-1.5, -.1),
    legend.box = "horizontal",
    legend.margin = margin(r = 2, unit = "mm"),
    legend.key.width = unit(2, units = "mm"),
    plot.margin = margin(r = -2, unit = "mm"),
    plot.title = element_text(hjust = 1.8),
    strip.text.y = element_text(angle = 45, hjust = 0, vjust = .5)
  ) +
  scale_size_area(max_size = 4) +
  labs(title = "Enriched Pathways")
ggsave(sprintf("%s/enriched_pathways_with_neural.pdf", fig_output_dir), width = 3.9, height = 6)

## Dot plot of the `selected_terms_ipsc` terms for the two iPSC comparisons.
## Colour encodes NES, point area encodes -log10(p.adjust); terms are faceted
## by category.
combined_enrichment_df |>
  inner_join(selected_terms_ipsc, by = "ID") |>
  dplyr::select(-starts_with("combined_pval_")) |>
  # One row per term x comparison, one column per value type.
  pivot_longer(
    cols = starts_with(c("NES_", "p.adjust_", "qvalues_", "enrichmentScore_", "setSize_", "leadingEdgeTags_", "leadingEdgeList_", "leadingEdgeSignal_", "numCoreEnrichment_")),
    names_pattern = "([A-Za-z\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  # Keep only the comparisons that are actually drawn. The neural comparisons
  # were previously kept here and then discarded by the x-axis limits, which
  # let their (invisible) NES and p-values stretch the colour and size scales
  # and produced removed-row warnings.
  filter(comparison %in% c(
    "T21_vs_D21",
    "dwdT21SIT_vs_T21"
  )) |>
  mutate(
    # Shorten term names for the axis. A named vector is applied pattern by
    # pattern in the order given, so later entries see the output of earlier
    # ones (e.g. "COTRANSAL" only exists after "TRANSLATION" -> "TRANS").
    ID = str_replace_all(ID, c(
      "_" = " ",
      "PROCESS" = "PROC",
      "POSITIVE" = "(+)",
      "REGULATION" = "REG",
      "FAMILY" = "FAM",
      "OXIDATIVE" = "OXI",
      "LOCALIZATION" = "LOCAL",
      "MITOCHONDRIAL" = "MITO",
      "CYTOCHROME C" = "CYT C",
      "DEPENDENT" = "DEP",
      "TRANSLATION" = "TRANS",
      "CHEMICAL" = "CHEM",
      "RIBOSOME" = "RIBO",
      "PROTEIN" = "PROT",
      "ESTABLISHMENT" = "ESTAB",
      "TRANSCRIPTIONAL" = "TRANSCRIP",
      "TARGETING" = "TARGET",
      "GOBP |WP |GOCC |REACTOME " = "",
      "RIBOSOMAL" = "RIBO",
      "CYTOPLASMIC|CYTOSOLIC" = "CYTO",
      "ENDOPLASMIC RETICULUM" = "ER",
      "EUKARYOTIC" = "EUKAR",
      "EXTRINSIC" = "EXT",
      "TARGET" = "TRGT",
      "MEMBRANE" = "MEM",
      "COMPONENT" = "COMP",
      "PHYTO" = "PHYT",
      "AMINO ACID" = "AA",
      "SMOOTH MUSCLE CELL" = "SMC",
      "NRF2 TRANSCRIP" = "NRF2",
      "APOPTOTIC" = "APOP",
      "DISEASES" = "DIS",
      "PROGRAMMED" = "PROG",
      "BIOSYNTHETIC" = "BIOSYNTH",
      "ACTIVITY" = "ACT",
      "SENESCENCE" = "SENSNCE",
      "COTRANSAL" = "COTRANS",
      "SMALL" = "SML",
      "SUBUNIT" = "SUB",
      "PRECURSOR" = "PRECUR",
      "ACTIVATION" = "ACTIV",
      "RESPIRATORY" = "RESP",
      "RESPONSE" = "RESP"
    ))
  ) |>
  ggplot(aes(x = comparison, y = fct_reorder(ID, dplyr::desc(dist_from_center)), color = NES, size = -log10(p.adjust))) +
  geom_point() +
  ggh4x::facet_grid2(
    rows = vars(factor(category2,
      levels = c("apoptosis", "serine", "stress", "ER", "ribosome", "mitochondria", "translation")
    )),
    scales = "free", space = "free", strip = ggh4x::strip_nested(clip = "off")
  ) +
  scale_color_gradient2(high = scales::muted("red"), low = scales::muted("blue")) +
  # Labels are positional: they must stay in the same order as `limits`.
  scale_x_discrete(
    limits = c(
      "T21_vs_D21",
      "dwdT21SIT_vs_T21"
    ),
    labels = c("T21 vs. D21", "T21XIST vs. T21")
  ) +
  scale_size_area(max_size = 4) +
  labs(title = "Top GSEA Terms - iPSCs") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 6),
    axis.title.y = element_blank(),
    legend.position = c(-3, -.1),
    legend.box = "horizontal",
    legend.margin = margin(r = 2, unit = "mm"),
    legend.key.width = unit(2, units = "mm"),
    plot.margin = margin(r = -2, unit = "mm"),
    plot.title = element_text(hjust = 1.8),
    strip.text.y = element_text(angle = 45, hjust = 0, vjust = .5)
  )
ggsave(sprintf("%s/enriched_pathways_ipsc.pdf", fig_output_dir), width = 3.15, height = 5.5)

## Dot plot of the `selected_terms_neural` terms for the two neural
## comparisons. Colour encodes NES, point area encodes -log10(p.adjust); terms
## are faceted by category. Saved through cairo_pdf.
combined_enrichment_df |>
  inner_join(selected_terms_neural, by = "ID") |>
  dplyr::select(-starts_with("combined_pval_")) |>
  # One row per term x comparison, one column per value type.
  pivot_longer(
    cols = starts_with(c("NES_", "p.adjust_", "qvalues_", "enrichmentScore_", "setSize_", "leadingEdgeTags_", "leadingEdgeList_", "leadingEdgeSignal_", "numCoreEnrichment_")),
    names_to = c("value_type", "comparison"),
    names_pattern = "([A-Za-z\\.]*)_(.*)"
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  filter(comparison %in% c("neural_t21_d21", "neural_t21xistdoxcombined_t21")) |>
  mutate(
    # Axis-label abbreviations. The named vector is applied pattern by pattern
    # in the order written, so each entry operates on the result of the ones
    # before it (the two full-name MITOCHONDRIAL TRANSLATION entries must run
    # before the collection prefixes are stripped).
    ID = str_replace_all(ID, c(
      "_" = " ",
      "REACTOME MITOCHONDRIAL TRANSLATION" = "RCTME MTO TRANS",
      "GOBP MITOCHONDRIAL TRANSLATION" = "GO MTO TRANS",
      "GOBP |WP |GOCC |REACTOME " = "",
      "PROCESS" = "PROC",
      "POSITIVE" = "(+)",
      "REGULATION" = "REG",
      "FAMILY" = "FAM",
      "OXIDATIVE" = "OXI",
      "LOCALIZATION" = "LOCAL",
      "MITOCHONDRIAL" = "MITO",
      "CYTOCHROME C" = "CYT C",
      "DEPENDENT" = "DEP",
      "TRANSLATION" = "TRANS",
      "CHEMICAL" = "CHEM",
      "RIBOSOME" = "RIBO",
      "PROTEIN" = "PROT",
      "ESTABLISHMENT" = "ESTAB",
      "TRANSCRIPTIONAL" = "TRANSCRIP",
      "TARGETING" = "TARGET",
      "RIBOSOMAL" = "RIBO",
      "CYTOPLASMIC|CYTOSOLIC" = "CYTO",
      "ENDOPLASMIC RETICULUM" = "ER",
      "EUKARYOTIC" = "EUKAR",
      "EXTRINSIC" = "EXT",
      "TARGET" = "TRGT",
      "MEMBRANE" = "MEM",
      "COMPONENT" = "COMP",
      "PHYTO" = "PHYT",
      "AMINO ACID" = "AA",
      "SMOOTH MUSCLE CELL" = "SMC",
      "NRF2 TRANSCRIP" = "NRF2",
      "APOPTOTIC" = "APOP",
      "DISEASES" = "DIS",
      "PROGRAMMED" = "PROG",
      "BIOSYNTHETIC" = "BIOSYNTH",
      "ACTIVITY" = "ACT",
      "SENESCENCE" = "SENSNCE",
      "COTRANSAL" = "COTRANS",
      "SMALL" = "SML",
      "SUBUNIT" = "SUB",
      "PRECURSOR" = "PRECUR",
      "ACTIVATION" = "ACTIV",
      "RESPIRATORY" = "RESP",
      "RESPONSE" = "RESP",
      "TRANSCRIPTION" = "TRNCRIP",
      "TEMPLATED" = "TMPLT",
      "INITIATION" = "INIT",
      "PHOSPHORYLATION" = "PHOS",
      "ALPHA" = "α",
      "INHIBITOR" = "INHIB",
      "ELECTRON TRANSPORT CHAIN" = "E.T.C.",
      "SYSTEM" = "SYS",
      "MITOCHONDRIA" = "MITO",
      "INFLAMMATORY" = "INFLAM",
      "EFFECTS" = "EFF.",
      "PATHWAY" = "PATH"
    ))
  ) |>
  ggplot(
    aes(
      x = comparison,
      y = fct_reorder(ID, dplyr::desc(dist_from_center)),
      color = NES,
      size = -log10(p.adjust)
    )
  ) +
  geom_point() +
  ggh4x::facet_grid2(
    rows = vars(
      factor(
        category2,
        levels = c("apoptosis", "serine", "stress", "ER", "ribosome", "mitochondria", "translation")
      )
    ),
    scales = "free",
    space = "free",
    strip = ggh4x::strip_nested(clip = "off")
  ) +
  scale_color_gradient2(high = scales::muted("red"), low = scales::muted("blue")) +
  # Labels are positional: they must stay in the same order as `limits`.
  scale_x_discrete(
    limits = c("neural_t21_d21", "neural_t21xistdoxcombined_t21"),
    labels = c("T21 vs. D21", "T21XIST vs. T21")
  ) +
  scale_size_area(max_size = 4) +
  labs(title = "Top GSEA Terms - Neural") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 6),
    axis.title.y = element_blank(),
    legend.position = c(-2.6, 0),
    legend.box = "horizontal",
    legend.margin = margin(r = 2, unit = "mm"),
    legend.key.width = unit(2, units = "mm"),
    plot.margin = margin(r = -2, unit = "mm"),
    plot.title = element_text(hjust = 1.8),
    strip.text.y = element_text(angle = 45, hjust = 0, vjust = .5)
  )
ggsave(
  sprintf("%s/enriched_pathways_neural.pdf", fig_output_dir),
  width = 3.6, height = 4.8, device = cairo_pdf
)

## Lollipop Plots
## One lollipop per term x comparison: stem length is the signed
## -log10(p.adjust), point area is the core-enrichment gene ratio, colour is
## NES. Terms are chosen by their significance in `gsea_compare_to`.

gsea_compare_to <- "T21_vs_D21"
gsea_comparisons <- c("neural_t21_d21", "neural_t21xistdoxcombined_t21", "dwdT21SIT_vs_T21", "dox_effect")
compared_to_NES <- sprintf("NES_%s", gsea_compare_to)
compared_to_padj <- sprintf("p.adjust_%s", gsea_compare_to)
compared_from_suffixes <- sprintf("_%s", c(gsea_compare_to, gsea_comparisons))
gsea_lollipop_cutoff_1 <- lol_cutoff_primary # applied to the reference comparison
gsea_lollipop_cutoff_2 <- .2 # applied to every plotted comparison

combined_enrichment_df |>
  mutate(
    # Collection prefix = the capital letters before the first underscore;
    # anything outside the known collections is lumped into "other".
    ontology = str_extract(ID, "[A-Z]+(?=_)"),
    ontology = case_when(
      is.na(ontology) | !(ontology %in% c("GOBP", "GOCC", "GOMF", "HALLMARK", "KEGG", "REACTOME", "WP")) ~ "other",
      TRUE ~ ontology
    )
  ) |>
  filter(
    ontology != "other",
    !!as.name(compared_to_padj) <= gsea_lollipop_cutoff_1
  ) |>
  mutate(
    # Shorten term names for the axis. A named vector is applied pattern by
    # pattern in the order given, so later entries see the output of earlier
    # ones (e.g. "COTRANSAL" only exists after "TRANSLATION" -> "TRANS").
    ID = str_replace_all(ID, c(
      "_" = " ",
      "PROCESS" = "PROC",
      "POSITIVE" = "(+)",
      "REGULATION" = "REG",
      "FAMILY" = "FAM",
      "OXIDATIVE" = "OXI",
      "LOCALIZATION" = "LOCAL",
      "MITOCHONDRIAL" = "MITO",
      "CYTOCHROME C" = "CYT C",
      "DEPENDENT" = "DEP",
      "TRANSLATION" = "TRANS",
      "CHEMICAL" = "CHEM",
      "RIBOSOME" = "RIBO",
      "PROTEIN" = "PROT",
      "ESTABLISHMENT" = "ESTAB",
      "TRANSCRIPTIONAL" = "TRANSCRIP",
      "TARGETING" = "TARGET",
      "GOBP |WP |GOCC |REACTOME |HALLMARK |KEGG " = "",
      "RIBOSOMAL" = "RIBO",
      "CYTOPLASMIC|CYTOSOLIC" = "CYTO",
      "ENDOPLASMIC RETICULUM" = "ER",
      "EUKARYOTIC" = "EUKAR",
      "EXTRINSIC" = "EXT",
      "TARGET" = "TRGT",
      "MEMBRANE" = "MEM",
      "COMPONENT" = "COMP",
      "PHYTO" = "PHYT",
      "AMINO ACID" = "AA",
      "SMOOTH MUSCLE CELL" = "SMC",
      "NRF2 TRANSCRIP" = "NRF2",
      "APOPTOTIC" = "APOP",
      "DISEASES" = "DIS",
      "PROGRAMMED" = "PROG",
      "BIOSYNTHETIC" = "BIOSYNTH",
      "ACTIVITY" = "ACT",
      "SENESCENCE" = "SENSNCE",
      "COTRANSAL" = "COTRANS",
      "SMALL" = "SML",
      "SUBUNIT" = "SUB",
      "PRECURSOR" = "PRECUR",
      "ACTIVATION" = "ACTIV",
      "HEPATOCELLULAR" = "HEPATOCELL",
      "TRANSITION" = "TRANS",
      "EXTRACELLULAR" = "EXTRACELL"
    ))
  ) |>
  # Order terms by signed significance in the reference comparison; this must
  # happen before the reshape while the suffixed reference columns still exist.
  dplyr::mutate(ID = fct_reorder(ID, -log10(!!as.name(compared_to_padj)) * sign(!!as.name(compared_to_NES)), .desc = TRUE)) |>
  pivot_longer(
    cols = c(ends_with(compared_from_suffixes)),
    names_pattern = "([A-Za-z0-9\\.]*)_(.*)", names_to = c("value_type", "comparison")
  ) |>
  pivot_wider(names_from = "value_type", values_from = "value") |>
  dplyr::mutate(
    direction = case_when(
      sign(NES) == -1 ~ "supressed",
      TRUE ~ "activated"
    ),
    GeneRatio = numCoreEnrichment / setSize,
    comparison = factor(comparison, levels = c(gsea_compare_to, gsea_comparisons)),
    signed_log10padj = -log10(p.adjust) * sign(NES)
  ) |>
  filter(p.adjust <= gsea_lollipop_cutoff_2 &
    comparison %in% c(gsea_compare_to, gsea_comparisons) &
    ontology %in% c("WP", "KEGG", "HALLMARK")) |>
  # Human-readable facet titles, in the order the panels should appear.
  mutate(
    comparison = case_when(
      comparison == "T21_vs_D21" ~ "iPSC T21",
      comparison == "dwdT21SIT_vs_T21" ~ "iPSC Rescue",
      comparison == "neural_t21_d21" ~ "Neural T21",
      comparison == "neural_t21xistdoxcombined_t21" ~ "Neural Rescue",
      comparison == "dox_effect" ~ "iPSC Dox"
    ),
    comparison = factor(comparison, levels = c("iPSC T21", "iPSC Rescue", "iPSC Dox", "Neural T21", "Neural Rescue"))
  ) |>
  ggplot() +
  geom_segment(aes(x = 0, y = ID, xend = signed_log10padj, yend = ID), size = .3) +
  geom_point(aes(x = signed_log10padj, y = ID, size = GeneRatio, color = NES)) +
  scale_color_gradient2(midpoint = 0, low = "blue", high = "red") +
  ggh4x::facet_grid2(cols = vars(comparison), rows = vars(ontology), scales = "free", space = "free_y", strip = ggh4x::strip_nested(clip = "off")) +
  # `labels` spelled out in full rather than relying on partial matching.
  scale_y_discrete(labels = function(x) abbreviate(x, minlength = 70, dot = TRUE)) +
  geom_vline(xintercept = 0, size = .5) +
  theme(
    axis.text.x = element_text(angle = 65, hjust = 1),
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 8),
    axis.ticks.y = element_blank(),
    strip.text.y = element_text(angle = 45, hjust = 0, vjust = .5),
    legend.position = "top",
    legend.title = element_text(margin = margin(r = 2, unit = "mm"), hjust = .5),
    legend.key.height = unit(3, "mm"),
    legend.key.width = unit(3, "mm"),
    legend.margin = margin(r = 2, l = 2, unit = "mm")
  ) +
  scale_size_area(max_size = 3) +
  labs(
    title = "Top T21 vs D21 GSEA Terms",
    x = "signed\nlog10(p-adj)",
    color = "NES"
  )
ggsave(sprintf("%s/lollipop.pdf", fig_output_dir), width = 7.4, height = 7.8)


## Make lineage score card for iPSCs -----------------------
## Box plots of batch-corrected VST values for the scorecard genes, one panel
## per cell line (rows) and lineage (columns).

# Attach gene symbols to the expression table (join columns are inferred).
vsd_df <- left_join(
  batch_corrected_vsd_df,
  dplyr::select(gene_info_df, ensembl_id = ensembl_gene_id_version, gene_name = hgnc_symbol)
)

thermo_lineage_genes <- read_csv("../Thermo_Lineage_Scorecard_genes.csv")

# Long format: one row per scorecard gene x sample, with sample annotations.
lineage_gene_counts_df <- vsd_df |>
  inner_join(thermo_lineage_genes, by = c("gene_name" = "symbol")) |>
  reshape2::melt(
    id.vars = c("gene_name", "Lineage"),
    measure.vars = sample_sheet_df$sample_id,
    variable.name = "sample_id"
  ) |>
  left_join(sample_sheet_df)

# Genes sorted by ascending mean expression; used as the x-axis order.
ordered_genes <- lineage_gene_counts_df |>
  group_by(gene_name) |>
  summarise(mean_expr = mean(value)) |>
  arrange(mean_expr) |>
  pull(gene_name)

lineage_gene_counts_df |>
  mutate(`Cell Line` = str_replace(`Cell Line`, "rtTA-XIST", "T21XIST")) |>
  ggplot(
    aes(
      x = factor(gene_name, levels = ordered_genes),
      y = value,
      fill = Lineage,
      color = Lineage
    )
  ) +
  geom_boxplot() +
  labs(title = "Lineage Scorecard for iPSCs", x = "Genes", y = "VST Counts") +
  scale_fill_npg(guide = "none") +
  scale_color_npg(guide = "none") +
  facet_grid(
    rows = vars(`Cell Line`),
    cols = vars(Lineage),
    scales = "free",
    space = "free"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1, size = 7))
ggsave(sprintf("%s/lineage_scorecard.pdf", fig_output_dir), width = 7.25, height = 4.8)