differential_expression.R

## ---------------------------
##
## Script name: Transcript level differential expression
##
## Purpose of script: Using salmon transcript counts, perform differential gene and transcript expression
##
## Author: Prakhar Bansal
##
## Date Created: 2021-08-24
##
## Copyright (c) Prakhar Bansal, 2021
## Email: pbansal@uchc.edu
##
## ---------------------------
##
## Notes:
##
##
## ---------------------------

## set working directory
setwd(dirname(rstudioapi::getSourceEditorContext()$path))

## ---------------------------

# remotes::install_version("Rttf2pt1", version = "1.3.8")
# extrafont::font_import()

## load up the packages we will need:  (uncomment as required)
# library(showtext)
# font_add("Arial", "arial.ttf", bold='arialbd.ttf', italic='ariali.ttf', bolditalic='arialbi.ttf')
# showtext_auto()
require(tidyverse)
theme_set(
  theme_classic() +
    theme(
      text = element_text(size = 9, family = "sans"),
      plot.title = element_text(hjust = 0.5, size = 10, family = "sans", face = "bold"),
      axis.title = element_text(size = 9, face = "bold"),
      axis.text = element_text(size = 9),
      legend.spacing.x = unit(0, "cm"),
      legend.spacing.y = unit(2, "mm"),
      legend.key.height = unit(4, "mm"),
      legend.margin = margin(0, 0, 0, 0),
      axis.line = element_line(size = 0),
      strip.background = element_blank(),
      strip.text = element_text(margin = margin(1, 1, 1, 1, unit = "mm")),
      panel.border = element_rect(color = "black", fill = NA, size = .5)
    )
)
require(ggsci)
require(tximeta)
require(DESeq2)
require(clusterProfiler)
require(msigdbr)
require(metap)
require(patchwork)
require(ggdendro)

## --------------------------------------------------------

msig_df <- msigdbr(species = "Homo sapiens")
m_t2g <- msig_df |>
  filter(gs_cat == "H" |
           gs_cat == "C1" |
           (gs_cat == "C2" & str_detect(gs_subcat, "CP:")) |
           # (gs_cat == "C3" & str_detect(gs_subcat, "TFT:")) |
           (gs_cat == "C5" & str_detect(gs_subcat, "GO:"))) |>
  dplyr::select(gs_name, entrez_gene)

contrast_builder <- function(dds_obj, numerator_sample_ids, denominator_sample_ids, covariates_to_ignore, .id_col = "sample_id", .ignore_SVs = T) {
  # get the model matrix
  mod_mat <- model.matrix(design(dds_obj), colData(dds_obj))
  
  # calculate the vector of coefficient weights in the numerator and denominator
  numerator_coefs <- colMeans(mod_mat[dds_obj[[.id_col]] %in% numerator_sample_ids, ])
  denominator_coefs <- colMeans(mod_mat[dds_obj[[.id_col]] %in% denominator_sample_ids, ])
  
  # The contrast we are interested in is the difference between the two
  contrast_coefs <- numerator_coefs - denominator_coefs
  
  cov_ignore_regex <- covariates_to_ignore #(covariates_to_ignore |> toString() |> str_replace_all("~, ", "") |> str_split(" \\+ "))[[1]] |> 
    #paste(collapse = "|")
  
  for(coef_name in names(contrast_coefs)) {
    if (str_starts(coef_name, cov_ignore_regex)) {
      contrast_coefs[coef_name] <- 0
    }else if (.ignore_SVs == T & str_detect(coef_name, "SV[0-9]+")) {
      print(coef_name)
      contrast_coefs[coef_name] <- 0
    }
  }
  
  return(contrast_coefs)
}

write_results <- function(result_name, filter_top, filter_bottom, dds_obj, outdir, covariates_to_ignore) {
  print("in write results...")
  print(result_name)
  top_sample_ids <- colData(dds_obj) |>
    as.data.frame() |>
    filter(eval(rlang::parse_expr(filter_top))) |>
    pull(sample_id)
  print(top_sample_ids)
  bottom_sample_ids <- colData(dds_obj) |>
    as.data.frame() |>
    filter(eval(rlang::parse_expr(filter_bottom))) |>
    pull(sample_id)
  print(bottom_sample_ids)
  
  res_obj <- results(dds_obj, contrast = contrast_builder(dds_obj, top_sample_ids, bottom_sample_ids, covariates_to_ignore))
  summary(res_obj)
  
  write_csv(
    res_obj |>
      as.data.frame() |>
      rownames_to_column(var = "ensembl_id"),
    sprintf("%s/tables/results_%s.csv.gz", outdir, result_name)
  )
}

run_neural_pipeline <- function(sample_sheet_df,
                                deseq2_design,
                                limma_covariates,
                                limma_design,
                                sva_mod,
                                sva_mod0,
                                diff_exprs = list(
                                  "neural_t21_d21" = c(
                                    "genotype == 'T21'",
                                    "genotype == 'D21'",
                                    "neural_diff|dox_status"),
                                  "neural_d21_t21xistdox" = c(
                                    "genotype == 'T21XIST' & dox_status != 'no'",
                                    "(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'",
                                    "neural_diff|dox_status"),
                                  "neural_d21_t21xistdox" = c(
                                    "genotype == 'D21'",
                                    "genotype == 'T21XIST' & dox_status != 'no'",
                                    "neural_diff"))) {
  
  outdir <- sprintf(
    "outputs/neural_%s %s",
    str_replace(Sys.time(), " [A-Z]+", "") |> str_replace_all(":", "-"),
    deseq2_design |> toString() |> str_replace_all("~,| ", "") |> str_replace_all(":", "-") |> str_replace_all("\\*", "_x_")
  )
  
  dir.create(sprintf("%s/%s", outdir, "figures"), recursive = T)
  dir.create(sprintf("%s/%s", outdir, "tables"), recursive = T)
  
  write_csv(sample_sheet_df, paste0(outdir, "/tables/sample_sheet_df.csv.gz"))
  
  sum_exp <- tximeta(sample_sheet_df)

  gene_sum_exp <- summarizeToGene(sum_exp)
  print("before deseq2 here")
  dds <- DESeqDataSet(gene_sum_exp,
                      design = deseq2_design
  )
  dds <- estimateSizeFactors(dds)
  nc <- counts(dds, normalized = TRUE)
  filter <- rowSums(nc >= 20) >= ceiling(ncol(nc) / 10)
  
  dds <- dds[filter, ]
  
  dds <- estimateSizeFactors(dds)
  dds <- DESeq(dds, modelMatrixType = "standard")
  dds <- dds[which(mcols(dds)$betaConv), ]
  
  saveRDS(dds, paste0(outdir, "/tables/dds.rds"))
  
  vsd <- varianceStabilizingTransformation(dds, blind = F)
  
  vsd_salmon_df <- assay(vsd) |>
    as.data.frame() |>
    rownames_to_column(var = "ensembl_id")
  
  pca_expr_data <- vsd_salmon_df |>
    column_to_rownames(var = "ensembl_id")
  
  batch_corrected_vsd <- limma::removeBatchEffect(pca_expr_data,
                                                  covariates = model.matrix(limma_covariates, data = colData(dds)),
                                                  design = model.matrix(limma_design, data = colData(dds))
  )
  
  resultsNames(dds)
  
  write_csv(vsd_salmon_df, sprintf("%s/tables/vsd_salmon_df.csv.gz", outdir))
  write_csv(batch_corrected_vsd |> as.data.frame() |> rownames_to_column(var = "ensembl_id"), sprintf("%s/tables/batch_corrected_vsd_salmon_df.csv.gz", outdir))
  
  gene_info_df <- read_csv("input_data/gene_info_df.csv.gz")
  
  for (de_name in names(diff_exprs)) {
    print(de_name)
    write_results(de_name,
                  diff_exprs[[de_name]][1],
                  diff_exprs[[de_name]][2],
                  dds,
                  outdir,
                  diff_exprs[[de_name]][3])
  }
  
  results_files <- fs::dir_ls(
    path = sprintf("%s/tables", outdir),
    glob = sprintf(
      "%s/tables/results_*csv.gz",
      outdir |>
        str_replace_all("\\+", "\\\\+") |>
        str_replace_all("\\,", "\\\\,")
    )
  )
  combined_results_df <- read_csv(results_files,
                                  id = "origin_file") |>
    mutate(comparison = str_replace_all(
      origin_file,
      sprintf(
        "(%s/tables/results_)|(.csv.gz)",
        outdir |>
          str_replace_all("\\+", "\\\\+") |>
          str_replace_all("\\(", "\\\\(") |>
          str_replace_all("\\)", "\\\\)") |>
          str_replace_all("\\,", "\\\\,")
      ),
      ""
    )) |>
    pivot_wider(
      id_cols = ensembl_id,
      names_from = comparison,
      values_from = c(baseMean, log2FoldChange, pvalue, padj, stat)
    )
  write_csv(combined_results_df,
            sprintf("%s/tables/combined_results_df.csv.gz", outdir))
  
  
  sprintf("Running GSEA Enrichments...\n")
  combined_results_df <- combined_results_df |> 
    left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version"))
  # run gsea for all comparisons
  all_comparisons <- combined_results_df |>
    filter(hgnc_symbol != "XIST") |>
    pivot_longer(
      cols = c(starts_with("log2FoldChange_")),
      names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
      names_to = c("value_type", "comparison")
    ) |>
    pull(comparison) |>
    unique()
  
  for (comparison in all_comparisons) {
    ordered_genes <- combined_results_df |>
      filter(!is.na(entrezgene_id)) |>
      arrange(desc(!!as.name(sprintf(
        "stat_%s", comparison
      )))) |>
      dplyr::select(entrezgene_id,!!sprintf("stat_%s", comparison)) |>
      deframe()
    
    gsea_result <-
      GSEA(ordered_genes,
           TERM2GENE = m_t2g,
           pvalueCutoff = 1)
    write_csv(
      gsea_result |> as.data.frame(),
      sprintf("%s/tables/gsea_%s.csv.gz", outdir, comparison)
    )
  }
  
  gsea_files <- fs::dir_ls(
    path = sprintf("%s/tables", outdir),
    glob = sprintf(
      "%s/tables/gsea_*csv.gz",
      outdir |>
        str_replace_all("\\+", "\\\\+") |>
        str_replace_all("\\,", "\\\\,")
    )
  )
  print(gsea_files)
  combined_enrichment_df <- read_csv(gsea_files,
                                     id = "origin_file") |>
    mutate(comparison = str_replace_all(
      origin_file,
      sprintf(
        "(%s/tables/gsea_)|(.csv.gz)",
        outdir |>
          str_replace_all("\\+", "\\\\+") |>
          str_replace_all("\\(", "\\\\(") |>
          str_replace_all("\\)", "\\\\)") |>
          str_replace_all("\\,", "\\\\,")
      ),
      ""
    )) |>
    mutate(
      leadingEdgeTags = as.numeric(str_replace_all(
        str_split(leading_edge, ", ", simplify = T)[, 1], "tags=|%", ""
      )) / 100,
      leadingEdgeList = as.numeric(str_replace_all(
        str_split(leading_edge, ", ", simplify = T)[, 2], "list=|%", ""
      )) / 100,
      leadingEdgeSignal = as.numeric(str_replace_all(
        str_split(leading_edge, ", ", simplify = T)[, 3], "signal=|%", ""
      )) / 100,
      numCoreEnrichment = str_count(str_split(core_enrichment, "/")) + 1
    ) |>
    pivot_wider(
      id_cols = ID,
      names_from = comparison,
      values_from = c(
        NES,
        p.adjust,
        qvalue,
        enrichmentScore,
        setSize,
        leadingEdgeTags,
        leadingEdgeList,
        leadingEdgeSignal,
        numCoreEnrichment
      )
    ) |>
    mutate_at(vars(starts_with("p.adjust")), list( ~ if_else(is.na(.), 1, .))) |>
    mutate_at(vars(starts_with("qvalue")), list( ~ if_else(is.na(.), 1, .))) |>
    mutate_at(vars(starts_with("NES")), list( ~ if_else(is.na(.), 0, .))) |>
    mutate_at(vars(starts_with("enrichmentScore")), list( ~ if_else(is.na(.), 0, .))) |>
    dplyr::mutate(
      category = dplyr::case_when(
        str_detect(ID, "STRESS") ~ "stress",
        str_detect(ID, "ENDOPLASMIC_RETICULUM") ~ "ER",
        str_detect(ID, "chr21") ~ "chr21",
        str_detect(ID, "TRANSLATION") ~ "translation",
        str_detect(ID, "RIBOSOM") ~ "ribosome",
        str_detect(ID, "SERINE") &
          (!str_detect(ID, "SERINE_THREONINE")) ~ "serine",
        str_detect(ID, "MITOCH") ~ "mitochondria",
        str_detect(ID, "NRF2") ~ "stress",
        str_detect(ID, "APOPTO") |
          str_detect(ID, "PROGRAMMED_CELL_DEATH") ~ "apoptosis",
        T ~ "other"
      )
    )
  print("saving combined enrichment df")
  write_csv(combined_enrichment_df,
            paste0(outdir, "/tables/combined_enrichment_df.csv.gz"))
  
  rmarkdown::render("summarize_neural_output.Rmd", output_file = paste0(outdir, "/neural_summary.html"), params = list(
    output_dir = outdir,
    samples_to_exclude = samples_to_exclude |> toString(),
    deseq2_design = deseq2_design |> toString(),
    limma_covariates = limma_covariates  |> toString(),
    limma_design = limma_design |> toString(),
    sva_mod = sva_mod |> toString(),
    sva_mod0 = sva_mod0 |> toString(),
    use_SV = use_SV,
    num_SV = num_SV
  ))
}

run_pipeline <- function(sample_sheet_df,
                         deseq2_design,
                         limma_covariates,
                         limma_design,
                         sva_mod,
                         sva_mod0,
                         diff_exprs = list(
                           "rtta_wd_vs_nd" = c(
                             'has_transgene & dox_status == "withdrawal"',
                             'has_transgene & dox_status == "no"'),
                           "rtta_wd_vs_nd_powered" = c(
                             'has_transgene & dox_status == "withdrawal"',
                             '(has_transgene | cell_line == "198_5") & dox_status == "no"'),
                           "rtta_d_vs_nd" = c(
                             'has_transgene & dox_status == "yes"',
                             'has_transgene & dox_status == "no"'),
                           "nonrtta_t21_vs_d21" = c(
                             'genotype == "T21" & dox_status != "yes"',
                             'genotype == "D21" & dox_status != "yes"'),
                           "nonrtta_t21_vs_d21_pow" = c(
                             '(genotype == "T21" & dox_status != "yes") | (genotype == "T21XIST" & dox_status == "no")',
                             'genotype == "D21" & dox_status != "yes"'),
                           "dox_t21xist_vs_1985" = c(
                             'genotype == "T21XIST" & dox_status == "yes"',
                             'genotype == "T21" & dox_status == "yes"'),
                           "nonrtta_d_vs_nd" = c(
                             '!has_transgene & dox_status == "yes"',
                             '!has_transgene & dox_status != "yes"')
                         ),
                         neural_output_dir = NULL,
                         chr21_changes_comparisons = c("rtta_wd_vs_nd_powered", "dox_t21xist_vs_1985"),
                         diff_expr_compare_to = "nonrtta_t21_vs_d21_pow",
                         diff_expr_comparisons = c(
                           "neural_t21_d21",
                           "rtta_d_vs_nd",
                           "dox_t21xist_vs_1985",
                           "rtta_wd_vs_nd",
                           "rtta_wd_vs_nd_powered"
                         ),
                         diff_expr_combined_padj_cutoff = .1,
                         gsea_compare_to = "nonrtta_t21_vs_d21_pow",
                         gsea_comparisons = c(
                           "neural_t21_d21",
                           "rtta_d_vs_nd",
                           "dox_t21xist_vs_1985",
                           "rtta_wd_vs_nd",
                           "rtta_wd_vs_nd_powered"
                         ),
                         gsea_combined_padj_cutoff = .2,
                         lol_cutoff_primary = .1,
                         lol_cutoff_overlap = .1,
                         generate_individual_figures = F,
                         generate_summary_report = T) {
  outdir <- sprintf(
    "outputs/%s %s",
    str_replace(Sys.time(), " [A-Z]+", "") |> str_replace_all(":", "-"),
    deseq2_design |> toString() |> str_replace_all("~,| ", "") |> str_replace_all(":", "-") |> str_replace_all("\\*", "_x_")
  )
  
  dir.create(sprintf("%s/%s", outdir, "figures"), recursive = T)
  dir.create(sprintf("%s/%s", outdir, "tables"), recursive = T)
  
  
  ## Import the counts using tximeta
  
  write_csv(sample_sheet_df,
            paste0(outdir, "/tables/sample_sheet_df.csv.gz"))
  
  sum_exp <- tximeta(sample_sheet_df)
  
  gene_sum_exp <- summarizeToGene(sum_exp)
  
  dds <- DESeqDataSet(gene_sum_exp,
                      design = deseq2_design)
  dds <- estimateSizeFactors(dds)
  nc <- counts(dds, normalized = TRUE)
  filter <- rowSums(nc >= 20) >= ceiling(ncol(nc) / 10)
  
  dds <- dds[filter,]
  
  dds <- estimateSizeFactors(dds)
  dds <- DESeq(dds, modelMatrixType = "standard")
  dds <- dds[which(mcols(dds)$betaConv),]
  
  saveRDS(dds, paste0(outdir, "/tables/dds.rds"))
  
  vsd <- varianceStabilizingTransformation(dds, blind = F)
  
  vsd_salmon_df <- assay(vsd) |>
    as.data.frame() |>
    rownames_to_column(var = "ensembl_id")
  
  pca_expr_data <- vsd_salmon_df |>
    column_to_rownames(var = "ensembl_id")
  
  batch_corrected_vsd <- limma::removeBatchEffect(
    pca_expr_data,
    covariates = model.matrix(limma_covariates, data = colData(dds)),
    design = model.matrix(limma_design, data = colData(dds))
  )
  
  # results
  resultsNames(dds)
  
  write_csv(vsd_salmon_df,
            sprintf("%s/tables/vsd_salmon_df.csv.gz", outdir))
  write_csv(
    batch_corrected_vsd |> as.data.frame() |> rownames_to_column(var = "ensembl_id"),
    sprintf("%s/tables/batch_corrected_vsd_salmon_df.csv.gz", outdir)
  )
  
  gene_info_df <- read_csv("input_data/gene_info_df.csv.gz")
  sprintf("Outputting DESeq2 Results...\n")
  
  
  for (de_name in names(diff_exprs)) {
      write_results(de_name,
                    diff_exprs[[de_name]][1],
                    diff_exprs[[de_name]][2],
                    dds,
                    outdir,
                    diff_exprs[[de_name]][3])
    
    
  }
  
  results_files <- fs::dir_ls(
    path = sprintf("%s/tables", outdir),
    glob = sprintf(
      "%s/tables/results_*csv.gz",
      outdir |>
        str_replace_all("\\+", "\\\\+") |>
        str_replace_all("\\,", "\\\\,")
    )
  )
  combined_results_df <- read_csv(results_files,
                                  id = "origin_file") |>
    mutate(comparison = str_replace_all(
      origin_file,
      sprintf(
        "(%s/tables/results_)|(.csv.gz)",
        outdir |>
          str_replace_all("\\+", "\\\\+") |>
          str_replace_all("\\(", "\\\\(") |>
          str_replace_all("\\)", "\\\\)") |>
          str_replace_all("\\,", "\\\\,")
      ),
      ""
    )) |>
    pivot_wider(
      id_cols = ensembl_id,
      names_from = comparison,
      values_from = c(baseMean, log2FoldChange, pvalue, padj, stat)
    )
  write_csv(combined_results_df,
            sprintf("%s/tables/combined_results_df.csv.gz", outdir))
  
  ## Gene Overlap Figures ---------------
  sprintf("Reading in neural enrichments...%s\n", neural_output_dir)
  
  if (is.null(neural_output_dir)) {
    neural_results_nonrtta_T21_vs_D21 <-
      read_csv("input_data/neural_results_nonrtta_T21_vs_D21.csv.gz")
  } else {
    neural_results_nonrtta_T21_vs_D21 <-
      read_csv(paste0(
        neural_output_dir,
        "/tables/results_neural_t21_d21.csv.gz"
      ))
  }
  
  combined_results_df <-
    read_csv(paste0(outdir, "/tables/combined_results_df.csv.gz")) |>
    inner_join(
      neural_results_nonrtta_T21_vs_D21 |>
        dplyr::select(
          ensembl_id,
          log2FoldChange_neural_t21_d21 = log2FoldChange,
          pvalue_neural_t21_d21 = pvalue,
          padj_neural_t21_d21 = padj,
          stat_neural_t21_d21 = stat
        )
    ) |>
    mutate_at(vars(starts_with("padj")), list( ~ if_else(is.na(.), 1, .))) |>
    mutate_at(vars(starts_with("pvalue")), list( ~ if_else(is.na(.), 1, .))) |>
    mutate_at(vars(starts_with("log2FoldChange")), list( ~ if_else(is.na(.), 0, .))) |>
    mutate_at(vars(starts_with("stat")), list( ~ if_else(is.na(.), 0, .))) |>
    left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version"))
  
  vectorized_sumlog <- function(x, y) {
    if (length(x) != length(y)) {
      stop("lengths of lists must be the same")
    }
    combined_pvals <- c()
    for (i in 1:length(x)) {
      combined_pvals <-
        c(combined_pvals, (metap::sumlog(c(x[i], y[i])))$p)
    }
    
    return(combined_pvals)
  }
  
  # GSEA Enrichments ----------------------------
  sprintf("Running GSEA Enrichments...\n")
  
  # run gsea for all comparisons
  all_comparisons <- combined_results_df |>
    filter(hgnc_symbol != "XIST") |>
    pivot_longer(
      cols = c(starts_with("log2FoldChange_")),
      names_pattern = "([A-Za-z0-9\\.]*)_(.*)",
      names_to = c("value_type", "comparison")
    ) |>
    pull(comparison) |>
    unique()
  
  for (comparison in all_comparisons) {
    ordered_genes <- combined_results_df |>
      filter(!is.na(entrezgene_id)) |>
      arrange(desc(!!as.name(sprintf(
        "stat_%s", comparison
      )))) |>
      dplyr::select(entrezgene_id,!!sprintf("stat_%s", comparison)) |>
      deframe()
    
    gsea_result <-
      GSEA(ordered_genes,
           TERM2GENE = m_t2g,
           pvalueCutoff = 1)
    write_csv(
      gsea_result |> as.data.frame(),
      sprintf("%s/tables/gsea_%s.csv.gz", outdir, comparison)
    )
  }
  
  gsea_files <- fs::dir_ls(
    path = sprintf("%s/tables", outdir),
    glob = sprintf(
      "%s/tables/gsea_*csv.gz",
      outdir |>
        str_replace_all("\\+", "\\\\+") |>
        str_replace_all("\\,", "\\\\,")
    )
  )
  print(gsea_files)
  combined_enrichment_df <- read_csv(gsea_files,
                                     id = "origin_file") |>
    mutate(comparison = str_replace_all(
      origin_file,
      sprintf(
        "(%s/tables/gsea_)|(.csv.gz)",
        outdir |>
          str_replace_all("\\+", "\\\\+") |>
          str_replace_all("\\(", "\\\\(") |>
          str_replace_all("\\)", "\\\\)") |>
          str_replace_all("\\,", "\\\\,")
      ),
      ""
    )) |>
    mutate(
      leadingEdgeTags = as.numeric(str_replace_all(
        str_split(leading_edge, ", ", simplify = T)[, 1], "tags=|%", ""
      )) / 100,
      leadingEdgeList = as.numeric(str_replace_all(
        str_split(leading_edge, ", ", simplify = T)[, 2], "list=|%", ""
      )) / 100,
      leadingEdgeSignal = as.numeric(str_replace_all(
        str_split(leading_edge, ", ", simplify = T)[, 3], "signal=|%", ""
      )) / 100,
      numCoreEnrichment = str_count(str_split(core_enrichment, "/")) + 1
    ) |>
    pivot_wider(
      id_cols = ID,
      names_from = comparison,
      values_from = c(
        NES,
        p.adjust,
        qvalue,
        enrichmentScore,
        setSize,
        leadingEdgeTags,
        leadingEdgeList,
        leadingEdgeSignal,
        numCoreEnrichment
      )
    ) |>
    mutate_at(vars(starts_with("p.adjust")), list( ~ if_else(is.na(.), 1, .))) |>
    mutate_at(vars(starts_with("qvalue")), list( ~ if_else(is.na(.), 1, .))) |>
    mutate_at(vars(starts_with("NES")), list( ~ if_else(is.na(.), 0, .))) |>
    mutate_at(vars(starts_with("enrichmentScore")), list( ~ if_else(is.na(.), 0, .))) |>
    dplyr::mutate(
      category = dplyr::case_when(
        str_detect(ID, "STRESS") ~ "stress",
        str_detect(ID, "ENDOPLASMIC_RETICULUM") ~ "ER",
        str_detect(ID, "chr21") ~ "chr21",
        str_detect(ID, "TRANSLATION") ~ "translation",
        str_detect(ID, "RIBOSOM") ~ "ribosome",
        str_detect(ID, "SERINE") &
          (!str_detect(ID, "SERINE_THREONINE")) ~ "serine",
        str_detect(ID, "MITOCH") ~ "mitochondria",
        str_detect(ID, "NRF2") ~ "stress",
        # str_detect(ID, "FOLATE") ~ "Folate",
        # str_detect(ID, "MTORC1") ~ "MTORC1",
        str_detect(ID, "APOPTO") |
          str_detect(ID, "PROGRAMMED_CELL_DEATH") ~ "apoptosis",
        # str_detect(top_words, "EXTRACELLULAR|MATRIX") ~ "ECM",
        T ~ "other"
      )
    )
  print("saving combined enrichment df")
  write_csv(combined_enrichment_df,
            paste0(outdir, "/tables/combined_enrichment_df.csv.gz"))
  
  tibble(
    parameter = c(
      "deseq2_design",
      "limma_covariates",
      "limma_design",
      "sva_mod",
      "sva_mod0",
      "neural_output_dir"
    ),
    value = c(
      deseq2_design |> toString(),
      limma_covariates  |> toString(),
      limma_design |> toString(),
      sva_mod |> toString(),
      sva_mod0 |> toString(),
      neural_output_dir
    )
  ) |>
    write_csv(paste0(outdir, "/params.csv.gz"))
  
  diff_expr_df <- tibble(name = character(),
                         numerator = character(),
                         denomenator = character())
  for (de_name in names(diff_exprs)) {
    diff_expr_df <- add_row(
      diff_expr_df,
      name = de_name,
      numerator = diff_exprs[[de_name]][1],
      denomenator = diff_exprs[[de_name]][2]
    )
  }
  write_csv(diff_expr_df, paste0(outdir, "/diff_exprs.csv.gz"))
  
  generate_report(
    paste0(getwd(), "/", outdir),
    chr21_changes_comparisons = chr21_changes_comparisons,
    diff_expr_compare_to = diff_expr_compare_to,
    diff_expr_comparisons = diff_expr_comparisons,
    diff_expr_combined_padj_cutoff = diff_expr_combined_padj_cutoff,
    gsea_compare_to = gsea_compare_to,
    gsea_comparisons = gsea_comparisons,
    gsea_combined_padj_cutoff = gsea_combined_padj_cutoff,
    lol_cutoff_primary = lol_cutoff_primary,
    lol_cutoff_overlap = lol_cutoff_overlap
  )
}

generate_report <- function(outdir,
                            chr21_changes_comparisons = c("rtta_wd_vs_nd_powered", "dox_t21xist_vs_1985"),
                            diff_expr_compare_to = "nonrtta_t21_vs_d21_pow",
                            diff_expr_comparisons = c(
                              "neural_t21_d21",
                              "rtta_d_vs_nd",
                              "dox_t21xist_vs_1985",
                              "rtta_wd_vs_nd",
                              "rtta_wd_vs_nd_powered"
                              ),
                            diff_expr_combined_padj_cutoff = .1,
                            gsea_compare_to = "nonrtta_t21_vs_d21_pow",
                            gsea_comparisons = c(
                              "neural_t21_d21",
                              "rtta_d_vs_nd",
                              "dox_t21xist_vs_1985",
                              "rtta_wd_vs_nd",
                              "rtta_wd_vs_nd_powered"
                            ),
                            gsea_combined_padj_cutoff = .2,
                            lol_cutoff_primary = .1,
                            lol_cutoff_overlap = .1) {
  
  output_filename <- "/summary.html"
  
  rmarkdown::render("summarize_output2.Rmd", output_file = paste0(outdir, output_filename), params = list(
    output_dir = outdir,
    chr21_changes_comparisons = chr21_changes_comparisons,
    diff_expr_compare_to = diff_expr_compare_to,
    diff_expr_comparisons = diff_expr_comparisons,
    diff_expr_combined_padj_cutoff = diff_expr_combined_padj_cutoff,
    gsea_compare_to = gsea_compare_to,
    gsea_comparisons = gsea_comparisons,
    gsea_combined_padj_cutoff = gsea_combined_padj_cutoff,
    lol_cutoff_primary = lol_cutoff_primary,
    lol_cutoff_overlap = lol_cutoff_overlap
  ))

}

neural_sample_sheet_df <- read_csv("input_data/bulkrna_neural_sample_sheet.csv") |> 
  mutate(
    names=sample_id,
    files=sprintf("input_data/salmon_quantification_v38_gibbs/%s_quant/quant.sf", sample_id)
  )

run_neural_pipeline(
  neural_sample_sheet_df,
  deseq2_design = ~ geno_dox,
  limma_covariates = ~ neural_diff,
  limma_design = ~ genotype,
  sva_mod = ~ neural_diff + genotype,
  sva_mod0 = ~ neural_diff,
  diff_exprs = list(
    "neural_t21_d21" = c(
      "(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'",
      "genotype == 'D21'",
      "1"),
    "neural_t21xistdoxcombined_t21" = c(
      "genotype == 'T21XIST' & dox_status != 'no'",
      "(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'",
      "1"),
    "neural_t21xistdox_t21" = c(
      "genotype == 'T21XIST' & dox_status != 'no' & dox_status != 'wd'",
      "(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'",
      "1"),
    "neural_t21xistwd_t21" = c(
      "genotype == 'T21XIST' & dox_status != 'no' & dox_status == 'wd'",
      "(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'",
      "1"),
    "neural_d21_t21xistdoxcombined" = c(
      "genotype == 'D21'",
      "genotype == 'T21XIST' & dox_status != 'no'",
      "1"),
    "neural_d21_t21xistdox" = c(
      "genotype == 'D21'",
      "genotype == 'T21XIST' & dox_status != 'no' & dox_status != 'wd'",
      "1"),
    "neural_d21_t21xistwd" = c(
      "genotype == 'D21'",
      "genotype == 'T21XIST' & dox_status != 'no' & dox_status == 'wd'",
      "1"))
)

ipsc_sample_sheet_df <- read_csv("input_data/bulkrna_neural_sample_sheet.csv") |> 
  mutate(
    names = sample_id,
    files = sprintf("input_data/salmon_quantification_v38_gibbs/%s_quant/quant.sf", sample_id)
  )

# iPSC differential expression
run_pipeline(
  sample_sheet_df = ipsc_sample_sheet_df,
  deseq2_design = ~ timepoint + dox_status + effective_genotype,
  limma_covariates = ~ timepoint + dox_status,
  limma_design = ~ effective_genotype,
  sva_mod = ~ timepoint + dox_status + effective_genotype,
  sva_mod0 = ~ timepoint + dox_status,
  neural_output_dir = "outputs/neural_2024-02-17 20-39-53.641348 geno_dox", # from the output of run_neural_pipeline above
  diff_exprs = list(
    "wdT21XIST_vs_T21" = c(
      'genotype == "T21XIST" & dox_status == "withdrawal"',
      '(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")',
      "timepoint|dox_status"),
    "dT21XIST_vs_T21" = c(
      'genotype == "T21XIST" & dox_status == "yes"',
      '(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")',
      "timepoint|dox_status"),
    "dwdT21SIT_vs_T21" = c(
      'genotype == "T21XIST" & dox_status != "no"',
      '(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")',
      "timepoint|dox_status"
    ),
    "wdT21XIST_vs_D21" = c(
      'genotype == "T21XIST" & dox_status == "withdrawal"',
      'genotype == "D21"',
      "timepoint|dox_status"),
    "dT21XIST_vs_D21" = c(
      'genotype == "T21XIST" & dox_status == "yes"',
      'genotype == "D21"',
      "timepoint|dox_status"),
    "dwdT21SIT_vs_D21" = c(
      'genotype == "T21XIST" & dox_status != "no"',
      'genotype == "D21"',
      "timepoint|dox_status"
    ),
    "T21_vs_D21" = c(
      '(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")',
      'genotype == "D21"',
      "timepoint|dox_status"),
    "D21_vs_T21" = c(
      'genotype == "D21"',
      '(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")',
      "timepoint|dox_status"),
    "dox_effect" = c(
      'genotype %in% c("T21", "D21") & dox_status == "yes"',
      'genotype %in% c("T21", "D21") & dox_status != "yes"',
      "timepoint|effective_genotype")
  ),
  chr21_changes_comparisons = c("wdT21XIST_vs_T21", "dT21XIST_vs_T21"),
  diff_expr_compare_to="T21_vs_D21",
  diff_expr_comparisons = c(
    "neural_t21_d21",
    "neural_t21xistdoxcombined_t21",
    "wdT21XIST_vs_T21",
    "dT21XIST_vs_T21",
    "dox_effect"
  ),
  gsea_compare_to = "T21_vs_D21",
  gsea_comparisons = c(
    "neural_t21_d21",
    "neural_t21xistdoxcombined_t21",
    "wdT21XIST_vs_T21",
    "dT21XIST_vs_T21",
    "dox_effect"
  )
)