Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
T21-XIST/differential_expression.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
872 lines (789 sloc)
30.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## --------------------------- | |
## | |
## Script name: Transcript level differential expression | |
## | |
## Purpose of script: Using salmon transcript counts, perform differential gene and transcript expression | |
## | |
## Author: Prakhar Bansal | |
## | |
## Date Created: 2021-08-24 | |
## | |
## Copyright (c) Prakhar Bansal, 2021 | |
## Email: pbansal@uchc.edu | |
## | |
## --------------------------- | |
## | |
## Notes: | |
## | |
## | |
## --------------------------- | |
## set working directory | |
setwd(dirname(rstudioapi::getSourceEditorContext()$path)) | |
## --------------------------- | |
# remotes::install_version("Rttf2pt1", version = "1.3.8") | |
# extrafont::font_import() | |
## load up the packages we will need: (uncomment as required) | |
# library(showtext) | |
# font_add("Arial", "arial.ttf", bold='arialbd.ttf', italic='ariali.ttf', bolditalic='arialbi.ttf') | |
# showtext_auto() | |
require(tidyverse) | |
theme_set( | |
theme_classic() + | |
theme( | |
text = element_text(size = 9, family = "sans"), | |
plot.title = element_text(hjust = 0.5, size = 10, family = "sans", face = "bold"), | |
axis.title = element_text(size = 9, face = "bold"), | |
axis.text = element_text(size = 9), | |
legend.spacing.x = unit(0, "cm"), | |
legend.spacing.y = unit(2, "mm"), | |
legend.key.height = unit(4, "mm"), | |
legend.margin = margin(0, 0, 0, 0), | |
axis.line = element_line(size = 0), | |
strip.background = element_blank(), | |
strip.text = element_text(margin = margin(1, 1, 1, 1, unit = "mm")), | |
panel.border = element_rect(color = "black", fill = NA, size = .5) | |
) | |
) | |
require(ggsci) | |
require(tximeta) | |
require(DESeq2) | |
require(clusterProfiler) | |
require(msigdbr) | |
require(metap) | |
require(patchwork) | |
require(ggdendro) | |
## -------------------------------------------------------- | |
msig_df <- msigdbr(species = "Homo sapiens") | |
m_t2g <- msig_df |> | |
filter(gs_cat == "H" | | |
gs_cat == "C1" | | |
(gs_cat == "C2" & str_detect(gs_subcat, "CP:")) | | |
# (gs_cat == "C3" & str_detect(gs_subcat, "TFT:")) | | |
(gs_cat == "C5" & str_detect(gs_subcat, "GO:"))) |> | |
dplyr::select(gs_name, entrez_gene) | |
contrast_builder <- function(dds_obj, numerator_sample_ids, denominator_sample_ids, covariates_to_ignore, .id_col = "sample_id", .ignore_SVs = T) { | |
# get the model matrix | |
mod_mat <- model.matrix(design(dds_obj), colData(dds_obj)) | |
# calculate the vector of coefficient weights in the numerator and denominator | |
numerator_coefs <- colMeans(mod_mat[dds_obj[[.id_col]] %in% numerator_sample_ids, ]) | |
denominator_coefs <- colMeans(mod_mat[dds_obj[[.id_col]] %in% denominator_sample_ids, ]) | |
# The contrast we are interested in is the difference between the two | |
contrast_coefs <- numerator_coefs - denominator_coefs | |
cov_ignore_regex <- covariates_to_ignore #(covariates_to_ignore |> toString() |> str_replace_all("~, ", "") |> str_split(" \\+ "))[[1]] |> | |
#paste(collapse = "|") | |
for(coef_name in names(contrast_coefs)) { | |
if (str_starts(coef_name, cov_ignore_regex)) { | |
contrast_coefs[coef_name] <- 0 | |
}else if (.ignore_SVs == T & str_detect(coef_name, "SV[0-9]+")) { | |
print(coef_name) | |
contrast_coefs[coef_name] <- 0 | |
} | |
} | |
return(contrast_coefs) | |
} | |
write_results <- function(result_name, filter_top, filter_bottom, dds_obj, outdir, covariates_to_ignore) { | |
print("in write results...") | |
print(result_name) | |
top_sample_ids <- colData(dds_obj) |> | |
as.data.frame() |> | |
filter(eval(rlang::parse_expr(filter_top))) |> | |
pull(sample_id) | |
print(top_sample_ids) | |
bottom_sample_ids <- colData(dds_obj) |> | |
as.data.frame() |> | |
filter(eval(rlang::parse_expr(filter_bottom))) |> | |
pull(sample_id) | |
print(bottom_sample_ids) | |
res_obj <- results(dds_obj, contrast = contrast_builder(dds_obj, top_sample_ids, bottom_sample_ids, covariates_to_ignore)) | |
summary(res_obj) | |
write_csv( | |
res_obj |> | |
as.data.frame() |> | |
rownames_to_column(var = "ensembl_id"), | |
sprintf("%s/tables/results_%s.csv.gz", outdir, result_name) | |
) | |
} | |
run_neural_pipeline <- function(sample_sheet_df, | |
deseq2_design, | |
limma_covariates, | |
limma_design, | |
sva_mod, | |
sva_mod0, | |
diff_exprs = list( | |
"neural_t21_d21" = c( | |
"genotype == 'T21'", | |
"genotype == 'D21'", | |
"neural_diff|dox_status"), | |
"neural_d21_t21xistdox" = c( | |
"genotype == 'T21XIST' & dox_status != 'no'", | |
"(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'", | |
"neural_diff|dox_status"), | |
"neural_d21_t21xistdox" = c( | |
"genotype == 'D21'", | |
"genotype == 'T21XIST' & dox_status != 'no'", | |
"neural_diff"))) { | |
outdir <- sprintf( | |
"outputs/neural_%s %s", | |
str_replace(Sys.time(), " [A-Z]+", "") |> str_replace_all(":", "-"), | |
deseq2_design |> toString() |> str_replace_all("~,| ", "") |> str_replace_all(":", "-") |> str_replace_all("\\*", "_x_") | |
) | |
dir.create(sprintf("%s/%s", outdir, "figures"), recursive = T) | |
dir.create(sprintf("%s/%s", outdir, "tables"), recursive = T) | |
write_csv(sample_sheet_df, paste0(outdir, "/tables/sample_sheet_df.csv.gz")) | |
sum_exp <- tximeta(sample_sheet_df) | |
gene_sum_exp <- summarizeToGene(sum_exp) | |
print("before deseq2 here") | |
dds <- DESeqDataSet(gene_sum_exp, | |
design = deseq2_design | |
) | |
dds <- estimateSizeFactors(dds) | |
nc <- counts(dds, normalized = TRUE) | |
filter <- rowSums(nc >= 20) >= ceiling(ncol(nc) / 10) | |
dds <- dds[filter, ] | |
dds <- estimateSizeFactors(dds) | |
dds <- DESeq(dds, modelMatrixType = "standard") | |
dds <- dds[which(mcols(dds)$betaConv), ] | |
saveRDS(dds, paste0(outdir, "/tables/dds.rds")) | |
vsd <- varianceStabilizingTransformation(dds, blind = F) | |
vsd_salmon_df <- assay(vsd) |> | |
as.data.frame() |> | |
rownames_to_column(var = "ensembl_id") | |
pca_expr_data <- vsd_salmon_df |> | |
column_to_rownames(var = "ensembl_id") | |
batch_corrected_vsd <- limma::removeBatchEffect(pca_expr_data, | |
covariates = model.matrix(limma_covariates, data = colData(dds)), | |
design = model.matrix(limma_design, data = colData(dds)) | |
) | |
resultsNames(dds) | |
write_csv(vsd_salmon_df, sprintf("%s/tables/vsd_salmon_df.csv.gz", outdir)) | |
write_csv(batch_corrected_vsd |> as.data.frame() |> rownames_to_column(var = "ensembl_id"), sprintf("%s/tables/batch_corrected_vsd_salmon_df.csv.gz", outdir)) | |
gene_info_df <- read_csv("input_data/gene_info_df.csv.gz") | |
for (de_name in names(diff_exprs)) { | |
print(de_name) | |
write_results(de_name, | |
diff_exprs[[de_name]][1], | |
diff_exprs[[de_name]][2], | |
dds, | |
outdir, | |
diff_exprs[[de_name]][3]) | |
} | |
results_files <- fs::dir_ls( | |
path = sprintf("%s/tables", outdir), | |
glob = sprintf( | |
"%s/tables/results_*csv.gz", | |
outdir |> | |
str_replace_all("\\+", "\\\\+") |> | |
str_replace_all("\\,", "\\\\,") | |
) | |
) | |
combined_results_df <- read_csv(results_files, | |
id = "origin_file") |> | |
mutate(comparison = str_replace_all( | |
origin_file, | |
sprintf( | |
"(%s/tables/results_)|(.csv.gz)", | |
outdir |> | |
str_replace_all("\\+", "\\\\+") |> | |
str_replace_all("\\(", "\\\\(") |> | |
str_replace_all("\\)", "\\\\)") |> | |
str_replace_all("\\,", "\\\\,") | |
), | |
"" | |
)) |> | |
pivot_wider( | |
id_cols = ensembl_id, | |
names_from = comparison, | |
values_from = c(baseMean, log2FoldChange, pvalue, padj, stat) | |
) | |
write_csv(combined_results_df, | |
sprintf("%s/tables/combined_results_df.csv.gz", outdir)) | |
sprintf("Running GSEA Enrichments...\n") | |
combined_results_df <- combined_results_df |> | |
left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) | |
# run gsea for all comparisons | |
all_comparisons <- combined_results_df |> | |
filter(hgnc_symbol != "XIST") |> | |
pivot_longer( | |
cols = c(starts_with("log2FoldChange_")), | |
names_pattern = "([A-Za-z0-9\\.]*)_(.*)", | |
names_to = c("value_type", "comparison") | |
) |> | |
pull(comparison) |> | |
unique() | |
for (comparison in all_comparisons) { | |
ordered_genes <- combined_results_df |> | |
filter(!is.na(entrezgene_id)) |> | |
arrange(desc(!!as.name(sprintf( | |
"stat_%s", comparison | |
)))) |> | |
dplyr::select(entrezgene_id,!!sprintf("stat_%s", comparison)) |> | |
deframe() | |
gsea_result <- | |
GSEA(ordered_genes, | |
TERM2GENE = m_t2g, | |
pvalueCutoff = 1) | |
write_csv( | |
gsea_result |> as.data.frame(), | |
sprintf("%s/tables/gsea_%s.csv.gz", outdir, comparison) | |
) | |
} | |
gsea_files <- fs::dir_ls( | |
path = sprintf("%s/tables", outdir), | |
glob = sprintf( | |
"%s/tables/gsea_*csv.gz", | |
outdir |> | |
str_replace_all("\\+", "\\\\+") |> | |
str_replace_all("\\,", "\\\\,") | |
) | |
) | |
print(gsea_files) | |
combined_enrichment_df <- read_csv(gsea_files, | |
id = "origin_file") |> | |
mutate(comparison = str_replace_all( | |
origin_file, | |
sprintf( | |
"(%s/tables/gsea_)|(.csv.gz)", | |
outdir |> | |
str_replace_all("\\+", "\\\\+") |> | |
str_replace_all("\\(", "\\\\(") |> | |
str_replace_all("\\)", "\\\\)") |> | |
str_replace_all("\\,", "\\\\,") | |
), | |
"" | |
)) |> | |
mutate( | |
leadingEdgeTags = as.numeric(str_replace_all( | |
str_split(leading_edge, ", ", simplify = T)[, 1], "tags=|%", "" | |
)) / 100, | |
leadingEdgeList = as.numeric(str_replace_all( | |
str_split(leading_edge, ", ", simplify = T)[, 2], "list=|%", "" | |
)) / 100, | |
leadingEdgeSignal = as.numeric(str_replace_all( | |
str_split(leading_edge, ", ", simplify = T)[, 3], "signal=|%", "" | |
)) / 100, | |
numCoreEnrichment = str_count(str_split(core_enrichment, "/")) + 1 | |
) |> | |
pivot_wider( | |
id_cols = ID, | |
names_from = comparison, | |
values_from = c( | |
NES, | |
p.adjust, | |
qvalue, | |
enrichmentScore, | |
setSize, | |
leadingEdgeTags, | |
leadingEdgeList, | |
leadingEdgeSignal, | |
numCoreEnrichment | |
) | |
) |> | |
mutate_at(vars(starts_with("p.adjust")), list( ~ if_else(is.na(.), 1, .))) |> | |
mutate_at(vars(starts_with("qvalue")), list( ~ if_else(is.na(.), 1, .))) |> | |
mutate_at(vars(starts_with("NES")), list( ~ if_else(is.na(.), 0, .))) |> | |
mutate_at(vars(starts_with("enrichmentScore")), list( ~ if_else(is.na(.), 0, .))) |> | |
dplyr::mutate( | |
category = dplyr::case_when( | |
str_detect(ID, "STRESS") ~ "stress", | |
str_detect(ID, "ENDOPLASMIC_RETICULUM") ~ "ER", | |
str_detect(ID, "chr21") ~ "chr21", | |
str_detect(ID, "TRANSLATION") ~ "translation", | |
str_detect(ID, "RIBOSOM") ~ "ribosome", | |
str_detect(ID, "SERINE") & | |
(!str_detect(ID, "SERINE_THREONINE")) ~ "serine", | |
str_detect(ID, "MITOCH") ~ "mitochondria", | |
str_detect(ID, "NRF2") ~ "stress", | |
str_detect(ID, "APOPTO") | | |
str_detect(ID, "PROGRAMMED_CELL_DEATH") ~ "apoptosis", | |
T ~ "other" | |
) | |
) | |
print("saving combined enrichment df") | |
write_csv(combined_enrichment_df, | |
paste0(outdir, "/tables/combined_enrichment_df.csv.gz")) | |
rmarkdown::render("summarize_neural_output.Rmd", output_file = paste0(outdir, "/neural_summary.html"), params = list( | |
output_dir = outdir, | |
samples_to_exclude = samples_to_exclude |> toString(), | |
deseq2_design = deseq2_design |> toString(), | |
limma_covariates = limma_covariates |> toString(), | |
limma_design = limma_design |> toString(), | |
sva_mod = sva_mod |> toString(), | |
sva_mod0 = sva_mod0 |> toString(), | |
use_SV = use_SV, | |
num_SV = num_SV | |
)) | |
} | |
run_pipeline <- function(sample_sheet_df, | |
deseq2_design, | |
limma_covariates, | |
limma_design, | |
sva_mod, | |
sva_mod0, | |
diff_exprs = list( | |
"rtta_wd_vs_nd" = c( | |
'has_transgene & dox_status == "withdrawal"', | |
'has_transgene & dox_status == "no"'), | |
"rtta_wd_vs_nd_powered" = c( | |
'has_transgene & dox_status == "withdrawal"', | |
'(has_transgene | cell_line == "198_5") & dox_status == "no"'), | |
"rtta_d_vs_nd" = c( | |
'has_transgene & dox_status == "yes"', | |
'has_transgene & dox_status == "no"'), | |
"nonrtta_t21_vs_d21" = c( | |
'genotype == "T21" & dox_status != "yes"', | |
'genotype == "D21" & dox_status != "yes"'), | |
"nonrtta_t21_vs_d21_pow" = c( | |
'(genotype == "T21" & dox_status != "yes") | (genotype == "T21XIST" & dox_status == "no")', | |
'genotype == "D21" & dox_status != "yes"'), | |
"dox_t21xist_vs_1985" = c( | |
'genotype == "T21XIST" & dox_status == "yes"', | |
'genotype == "T21" & dox_status == "yes"'), | |
"nonrtta_d_vs_nd" = c( | |
'!has_transgene & dox_status == "yes"', | |
'!has_transgene & dox_status != "yes"') | |
), | |
neural_output_dir = NULL, | |
chr21_changes_comparisons = c("rtta_wd_vs_nd_powered", "dox_t21xist_vs_1985"), | |
diff_expr_compare_to = "nonrtta_t21_vs_d21_pow", | |
diff_expr_comparisons = c( | |
"neural_t21_d21", | |
"rtta_d_vs_nd", | |
"dox_t21xist_vs_1985", | |
"rtta_wd_vs_nd", | |
"rtta_wd_vs_nd_powered" | |
), | |
diff_expr_combined_padj_cutoff = .1, | |
gsea_compare_to = "nonrtta_t21_vs_d21_pow", | |
gsea_comparisons = c( | |
"neural_t21_d21", | |
"rtta_d_vs_nd", | |
"dox_t21xist_vs_1985", | |
"rtta_wd_vs_nd", | |
"rtta_wd_vs_nd_powered" | |
), | |
gsea_combined_padj_cutoff = .2, | |
lol_cutoff_primary = .1, | |
lol_cutoff_overlap = .1, | |
generate_individual_figures = F, | |
generate_summary_report = T) { | |
outdir <- sprintf( | |
"outputs/%s %s", | |
str_replace(Sys.time(), " [A-Z]+", "") |> str_replace_all(":", "-"), | |
deseq2_design |> toString() |> str_replace_all("~,| ", "") |> str_replace_all(":", "-") |> str_replace_all("\\*", "_x_") | |
) | |
dir.create(sprintf("%s/%s", outdir, "figures"), recursive = T) | |
dir.create(sprintf("%s/%s", outdir, "tables"), recursive = T) | |
## Import the counts using tximeta | |
write_csv(sample_sheet_df, | |
paste0(outdir, "/tables/sample_sheet_df.csv.gz")) | |
sum_exp <- tximeta(sample_sheet_df) | |
gene_sum_exp <- summarizeToGene(sum_exp) | |
dds <- DESeqDataSet(gene_sum_exp, | |
design = deseq2_design) | |
dds <- estimateSizeFactors(dds) | |
nc <- counts(dds, normalized = TRUE) | |
filter <- rowSums(nc >= 20) >= ceiling(ncol(nc) / 10) | |
dds <- dds[filter,] | |
dds <- estimateSizeFactors(dds) | |
dds <- DESeq(dds, modelMatrixType = "standard") | |
dds <- dds[which(mcols(dds)$betaConv),] | |
saveRDS(dds, paste0(outdir, "/tables/dds.rds")) | |
vsd <- varianceStabilizingTransformation(dds, blind = F) | |
vsd_salmon_df <- assay(vsd) |> | |
as.data.frame() |> | |
rownames_to_column(var = "ensembl_id") | |
pca_expr_data <- vsd_salmon_df |> | |
column_to_rownames(var = "ensembl_id") | |
batch_corrected_vsd <- limma::removeBatchEffect( | |
pca_expr_data, | |
covariates = model.matrix(limma_covariates, data = colData(dds)), | |
design = model.matrix(limma_design, data = colData(dds)) | |
) | |
# results | |
resultsNames(dds) | |
write_csv(vsd_salmon_df, | |
sprintf("%s/tables/vsd_salmon_df.csv.gz", outdir)) | |
write_csv( | |
batch_corrected_vsd |> as.data.frame() |> rownames_to_column(var = "ensembl_id"), | |
sprintf("%s/tables/batch_corrected_vsd_salmon_df.csv.gz", outdir) | |
) | |
gene_info_df <- read_csv("input_data/gene_info_df.csv.gz") | |
sprintf("Outputting DESeq2 Results...\n") | |
for (de_name in names(diff_exprs)) { | |
write_results(de_name, | |
diff_exprs[[de_name]][1], | |
diff_exprs[[de_name]][2], | |
dds, | |
outdir, | |
diff_exprs[[de_name]][3]) | |
} | |
results_files <- fs::dir_ls( | |
path = sprintf("%s/tables", outdir), | |
glob = sprintf( | |
"%s/tables/results_*csv.gz", | |
outdir |> | |
str_replace_all("\\+", "\\\\+") |> | |
str_replace_all("\\,", "\\\\,") | |
) | |
) | |
combined_results_df <- read_csv(results_files, | |
id = "origin_file") |> | |
mutate(comparison = str_replace_all( | |
origin_file, | |
sprintf( | |
"(%s/tables/results_)|(.csv.gz)", | |
outdir |> | |
str_replace_all("\\+", "\\\\+") |> | |
str_replace_all("\\(", "\\\\(") |> | |
str_replace_all("\\)", "\\\\)") |> | |
str_replace_all("\\,", "\\\\,") | |
), | |
"" | |
)) |> | |
pivot_wider( | |
id_cols = ensembl_id, | |
names_from = comparison, | |
values_from = c(baseMean, log2FoldChange, pvalue, padj, stat) | |
) | |
write_csv(combined_results_df, | |
sprintf("%s/tables/combined_results_df.csv.gz", outdir)) | |
## Gene Overlap Figures --------------- | |
sprintf("Reading in neural enrichments...%s\n", neural_output_dir) | |
if (is.null(neural_output_dir)) { | |
neural_results_nonrtta_T21_vs_D21 <- | |
read_csv("input_data/neural_results_nonrtta_T21_vs_D21.csv.gz") | |
} else { | |
neural_results_nonrtta_T21_vs_D21 <- | |
read_csv(paste0( | |
neural_output_dir, | |
"/tables/results_neural_t21_d21.csv.gz" | |
)) | |
} | |
combined_results_df <- | |
read_csv(paste0(outdir, "/tables/combined_results_df.csv.gz")) |> | |
inner_join( | |
neural_results_nonrtta_T21_vs_D21 |> | |
dplyr::select( | |
ensembl_id, | |
log2FoldChange_neural_t21_d21 = log2FoldChange, | |
pvalue_neural_t21_d21 = pvalue, | |
padj_neural_t21_d21 = padj, | |
stat_neural_t21_d21 = stat | |
) | |
) |> | |
mutate_at(vars(starts_with("padj")), list( ~ if_else(is.na(.), 1, .))) |> | |
mutate_at(vars(starts_with("pvalue")), list( ~ if_else(is.na(.), 1, .))) |> | |
mutate_at(vars(starts_with("log2FoldChange")), list( ~ if_else(is.na(.), 0, .))) |> | |
mutate_at(vars(starts_with("stat")), list( ~ if_else(is.na(.), 0, .))) |> | |
left_join(gene_info_df, by = c("ensembl_id" = "ensembl_gene_id_version")) | |
vectorized_sumlog <- function(x, y) { | |
if (length(x) != length(y)) { | |
stop("lengths of lists must be the same") | |
} | |
combined_pvals <- c() | |
for (i in 1:length(x)) { | |
combined_pvals <- | |
c(combined_pvals, (metap::sumlog(c(x[i], y[i])))$p) | |
} | |
return(combined_pvals) | |
} | |
# GSEA Enrichments ---------------------------- | |
sprintf("Running GSEA Enrichments...\n") | |
# run gsea for all comparisons | |
all_comparisons <- combined_results_df |> | |
filter(hgnc_symbol != "XIST") |> | |
pivot_longer( | |
cols = c(starts_with("log2FoldChange_")), | |
names_pattern = "([A-Za-z0-9\\.]*)_(.*)", | |
names_to = c("value_type", "comparison") | |
) |> | |
pull(comparison) |> | |
unique() | |
for (comparison in all_comparisons) { | |
ordered_genes <- combined_results_df |> | |
filter(!is.na(entrezgene_id)) |> | |
arrange(desc(!!as.name(sprintf( | |
"stat_%s", comparison | |
)))) |> | |
dplyr::select(entrezgene_id,!!sprintf("stat_%s", comparison)) |> | |
deframe() | |
gsea_result <- | |
GSEA(ordered_genes, | |
TERM2GENE = m_t2g, | |
pvalueCutoff = 1) | |
write_csv( | |
gsea_result |> as.data.frame(), | |
sprintf("%s/tables/gsea_%s.csv.gz", outdir, comparison) | |
) | |
} | |
gsea_files <- fs::dir_ls( | |
path = sprintf("%s/tables", outdir), | |
glob = sprintf( | |
"%s/tables/gsea_*csv.gz", | |
outdir |> | |
str_replace_all("\\+", "\\\\+") |> | |
str_replace_all("\\,", "\\\\,") | |
) | |
) | |
print(gsea_files) | |
combined_enrichment_df <- read_csv(gsea_files, | |
id = "origin_file") |> | |
mutate(comparison = str_replace_all( | |
origin_file, | |
sprintf( | |
"(%s/tables/gsea_)|(.csv.gz)", | |
outdir |> | |
str_replace_all("\\+", "\\\\+") |> | |
str_replace_all("\\(", "\\\\(") |> | |
str_replace_all("\\)", "\\\\)") |> | |
str_replace_all("\\,", "\\\\,") | |
), | |
"" | |
)) |> | |
mutate( | |
leadingEdgeTags = as.numeric(str_replace_all( | |
str_split(leading_edge, ", ", simplify = T)[, 1], "tags=|%", "" | |
)) / 100, | |
leadingEdgeList = as.numeric(str_replace_all( | |
str_split(leading_edge, ", ", simplify = T)[, 2], "list=|%", "" | |
)) / 100, | |
leadingEdgeSignal = as.numeric(str_replace_all( | |
str_split(leading_edge, ", ", simplify = T)[, 3], "signal=|%", "" | |
)) / 100, | |
numCoreEnrichment = str_count(str_split(core_enrichment, "/")) + 1 | |
) |> | |
pivot_wider( | |
id_cols = ID, | |
names_from = comparison, | |
values_from = c( | |
NES, | |
p.adjust, | |
qvalue, | |
enrichmentScore, | |
setSize, | |
leadingEdgeTags, | |
leadingEdgeList, | |
leadingEdgeSignal, | |
numCoreEnrichment | |
) | |
) |> | |
mutate_at(vars(starts_with("p.adjust")), list( ~ if_else(is.na(.), 1, .))) |> | |
mutate_at(vars(starts_with("qvalue")), list( ~ if_else(is.na(.), 1, .))) |> | |
mutate_at(vars(starts_with("NES")), list( ~ if_else(is.na(.), 0, .))) |> | |
mutate_at(vars(starts_with("enrichmentScore")), list( ~ if_else(is.na(.), 0, .))) |> | |
dplyr::mutate( | |
category = dplyr::case_when( | |
str_detect(ID, "STRESS") ~ "stress", | |
str_detect(ID, "ENDOPLASMIC_RETICULUM") ~ "ER", | |
str_detect(ID, "chr21") ~ "chr21", | |
str_detect(ID, "TRANSLATION") ~ "translation", | |
str_detect(ID, "RIBOSOM") ~ "ribosome", | |
str_detect(ID, "SERINE") & | |
(!str_detect(ID, "SERINE_THREONINE")) ~ "serine", | |
str_detect(ID, "MITOCH") ~ "mitochondria", | |
str_detect(ID, "NRF2") ~ "stress", | |
# str_detect(ID, "FOLATE") ~ "Folate", | |
# str_detect(ID, "MTORC1") ~ "MTORC1", | |
str_detect(ID, "APOPTO") | | |
str_detect(ID, "PROGRAMMED_CELL_DEATH") ~ "apoptosis", | |
# str_detect(top_words, "EXTRACELLULAR|MATRIX") ~ "ECM", | |
T ~ "other" | |
) | |
) | |
print("saving combined enrichment df") | |
write_csv(combined_enrichment_df, | |
paste0(outdir, "/tables/combined_enrichment_df.csv.gz")) | |
tibble( | |
parameter = c( | |
"deseq2_design", | |
"limma_covariates", | |
"limma_design", | |
"sva_mod", | |
"sva_mod0", | |
"neural_output_dir" | |
), | |
value = c( | |
deseq2_design |> toString(), | |
limma_covariates |> toString(), | |
limma_design |> toString(), | |
sva_mod |> toString(), | |
sva_mod0 |> toString(), | |
neural_output_dir | |
) | |
) |> | |
write_csv(paste0(outdir, "/params.csv.gz")) | |
diff_expr_df <- tibble(name = character(), | |
numerator = character(), | |
denomenator = character()) | |
for (de_name in names(diff_exprs)) { | |
diff_expr_df <- add_row( | |
diff_expr_df, | |
name = de_name, | |
numerator = diff_exprs[[de_name]][1], | |
denomenator = diff_exprs[[de_name]][2] | |
) | |
} | |
write_csv(diff_expr_df, paste0(outdir, "/diff_exprs.csv.gz")) | |
generate_report( | |
paste0(getwd(), "/", outdir), | |
chr21_changes_comparisons = chr21_changes_comparisons, | |
diff_expr_compare_to = diff_expr_compare_to, | |
diff_expr_comparisons = diff_expr_comparisons, | |
diff_expr_combined_padj_cutoff = diff_expr_combined_padj_cutoff, | |
gsea_compare_to = gsea_compare_to, | |
gsea_comparisons = gsea_comparisons, | |
gsea_combined_padj_cutoff = gsea_combined_padj_cutoff, | |
lol_cutoff_primary = lol_cutoff_primary, | |
lol_cutoff_overlap = lol_cutoff_overlap | |
) | |
} | |
generate_report <- function(outdir, | |
chr21_changes_comparisons = c("rtta_wd_vs_nd_powered", "dox_t21xist_vs_1985"), | |
diff_expr_compare_to = "nonrtta_t21_vs_d21_pow", | |
diff_expr_comparisons = c( | |
"neural_t21_d21", | |
"rtta_d_vs_nd", | |
"dox_t21xist_vs_1985", | |
"rtta_wd_vs_nd", | |
"rtta_wd_vs_nd_powered" | |
), | |
diff_expr_combined_padj_cutoff = .1, | |
gsea_compare_to = "nonrtta_t21_vs_d21_pow", | |
gsea_comparisons = c( | |
"neural_t21_d21", | |
"rtta_d_vs_nd", | |
"dox_t21xist_vs_1985", | |
"rtta_wd_vs_nd", | |
"rtta_wd_vs_nd_powered" | |
), | |
gsea_combined_padj_cutoff = .2, | |
lol_cutoff_primary = .1, | |
lol_cutoff_overlap = .1) { | |
output_filename <- "/summary.html" | |
rmarkdown::render("summarize_output2.Rmd", output_file = paste0(outdir, output_filename), params = list( | |
output_dir = outdir, | |
chr21_changes_comparisons = chr21_changes_comparisons, | |
diff_expr_compare_to = diff_expr_compare_to, | |
diff_expr_comparisons = diff_expr_comparisons, | |
diff_expr_combined_padj_cutoff = diff_expr_combined_padj_cutoff, | |
gsea_compare_to = gsea_compare_to, | |
gsea_comparisons = gsea_comparisons, | |
gsea_combined_padj_cutoff = gsea_combined_padj_cutoff, | |
lol_cutoff_primary = lol_cutoff_primary, | |
lol_cutoff_overlap = lol_cutoff_overlap | |
)) | |
} | |
neural_sample_sheet_df <- read_csv("input_data/bulkrna_neural_sample_sheet.csv") |> | |
mutate( | |
names=sample_id, | |
files=sprintf("input_data/salmon_quantification_v38_gibbs/%s_quant/quant.sf", sample_id) | |
) | |
run_neural_pipeline( | |
neural_sample_sheet_df, | |
deseq2_design = ~ geno_dox, | |
limma_covariates = ~ neural_diff, | |
limma_design = ~ genotype, | |
sva_mod = ~ neural_diff + genotype, | |
sva_mod0 = ~ neural_diff, | |
diff_exprs = list( | |
"neural_t21_d21" = c( | |
"(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'", | |
"genotype == 'D21'", | |
"1"), | |
"neural_t21xistdoxcombined_t21" = c( | |
"genotype == 'T21XIST' & dox_status != 'no'", | |
"(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'", | |
"1"), | |
"neural_t21xistdox_t21" = c( | |
"genotype == 'T21XIST' & dox_status != 'no' & dox_status != 'wd'", | |
"(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'", | |
"1"), | |
"neural_t21xistwd_t21" = c( | |
"genotype == 'T21XIST' & dox_status != 'no' & dox_status == 'wd'", | |
"(genotype == 'T21XIST' & dox_status == 'no') | genotype == 'T21'", | |
"1"), | |
"neural_d21_t21xistdoxcombined" = c( | |
"genotype == 'D21'", | |
"genotype == 'T21XIST' & dox_status != 'no'", | |
"1"), | |
"neural_d21_t21xistdox" = c( | |
"genotype == 'D21'", | |
"genotype == 'T21XIST' & dox_status != 'no' & dox_status != 'wd'", | |
"1"), | |
"neural_d21_t21xistwd" = c( | |
"genotype == 'D21'", | |
"genotype == 'T21XIST' & dox_status != 'no' & dox_status == 'wd'", | |
"1")) | |
) | |
ipsc_sample_sheet_df <- read_csv("input_data/bulkrna_neural_sample_sheet.csv") |> | |
mutate( | |
names = sample_id, | |
files = sprintf("input_data/salmon_quantification_v38_gibbs/%s_quant/quant.sf", sample_id) | |
) | |
# iPSC differential expression | |
run_pipeline( | |
sample_sheet_df = ipsc_sample_sheet_df, | |
deseq2_design = ~ timepoint + dox_status + effective_genotype, | |
limma_covariates = ~ timepoint + dox_status, | |
limma_design = ~ effective_genotype, | |
sva_mod = ~ timepoint + dox_status + effective_genotype, | |
sva_mod0 = ~ timepoint + dox_status, | |
neural_output_dir = "outputs/neural_2024-02-17 20-39-53.641348 geno_dox", # from the output of run_neural_pipeline above | |
diff_exprs = list( | |
"wdT21XIST_vs_T21" = c( | |
'genotype == "T21XIST" & dox_status == "withdrawal"', | |
'(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")', | |
"timepoint|dox_status"), | |
"dT21XIST_vs_T21" = c( | |
'genotype == "T21XIST" & dox_status == "yes"', | |
'(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")', | |
"timepoint|dox_status"), | |
"dwdT21SIT_vs_T21" = c( | |
'genotype == "T21XIST" & dox_status != "no"', | |
'(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")', | |
"timepoint|dox_status" | |
), | |
"wdT21XIST_vs_D21" = c( | |
'genotype == "T21XIST" & dox_status == "withdrawal"', | |
'genotype == "D21"', | |
"timepoint|dox_status"), | |
"dT21XIST_vs_D21" = c( | |
'genotype == "T21XIST" & dox_status == "yes"', | |
'genotype == "D21"', | |
"timepoint|dox_status"), | |
"dwdT21SIT_vs_D21" = c( | |
'genotype == "T21XIST" & dox_status != "no"', | |
'genotype == "D21"', | |
"timepoint|dox_status" | |
), | |
"T21_vs_D21" = c( | |
'(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")', | |
'genotype == "D21"', | |
"timepoint|dox_status"), | |
"D21_vs_T21" = c( | |
'genotype == "D21"', | |
'(genotype == "T21XIST" & dox_status == "no") | (genotype == "T21")', | |
"timepoint|dox_status"), | |
"dox_effect" = c( | |
'genotype %in% c("T21", "D21") & dox_status == "yes"', | |
'genotype %in% c("T21", "D21") & dox_status != "yes"', | |
"timepoint|effective_genotype") | |
), | |
chr21_changes_comparisons = c("wdT21XIST_vs_T21", "dT21XIST_vs_T21"), | |
diff_expr_compare_to="T21_vs_D21", | |
diff_expr_comparisons = c( | |
"neural_t21_d21", | |
"neural_t21xistdoxcombined_t21", | |
"wdT21XIST_vs_T21", | |
"dT21XIST_vs_T21", | |
"dox_effect" | |
), | |
gsea_compare_to = "T21_vs_D21", | |
gsea_comparisons = c( | |
"neural_t21_d21", | |
"neural_t21xistdoxcombined_t21", | |
"wdT21XIST_vs_T21", | |
"dT21XIST_vs_T21", | |
"dox_effect" | |
) | |
) |