Initial Commit

sfp-lab · Feb 18, 2024 · 6b1307e · 6b1307e
commit 6b1307e
Show file tree

Hide file tree

Showing 10 changed files with 5,693 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+input_data/*sample_sheet.csv
+input_data/salmon_quantification_v38_gibbs/*
+external_tables/GSE125837_monolayer_rawcounts.txt.gz
+dna_methylation_arrays/*
+outputs/*
diff --git a/T21_seurat_anchor_SCT_integration_sp.R b/T21_seurat_anchor_SCT_integration_sp.R
diff --git a/allelic_analysis.R b/allelic_analysis.R
@@ -0,0 +1,194 @@
+## ---------------------------
+##
+## Script name: 
+##
+## Purpose of script:
+##
+## Author: Prakhar Bansal
+##
+## Date Created: 2021-09-08
+##
+## Copyright (c) Prakhar Bansal, 2021
+## Email: pbansal@uchc.edu
+##
+## ---------------------------
+##
+## Notes:
+##
+##
+## ---------------------------
+
+## set working directory
+setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
+
+## ---------------------------
+
+## load up the packages we will need:  (uncomment as required)
+require(tidyverse); theme_set(theme_classic() + theme(plot.title=element_text(hjust=0.5)))
+require(ggsci) 
+library(zeallot)
+
+## ---------------------------
+
+output_dir <- "outputs/2024-02-17 21-39-53.641348 timepoint+dox_status+effective_genotype"
+sample_sheet_df <- read_csv(paste0(output_dir, "/tables/sample_sheet_df.csv.gz")) |> 
+  mutate(cell_type = "ipscs")
+neural_output_dir <- paste0("../salmon_pipeline_with_stefan/",
+                            (read_csv(paste0(output_dir, "/params.csv.gz")) |>
+                        filter(parameter == "neural_output_dir") |>
+                        pull(value))[1])
+sample_sheet_df_neural <- read_csv(paste0(neural_output_dir, "/tables/sample_sheet_df.csv.gz")) |> 
+  mutate(
+    cell_line = str_replace_all(cell_line, "C(?=(1|4|7))", "rtTA_XIST_c"),
+    `Cell Line` = str_replace_all(cell_line, "_c", " c") |> 
+      str_replace_all("_", "-"),
+    dox_status = case_when(dox_status == "wd" ~ "withdrawal",
+                           T ~ dox_status),
+    has_transgene = case_when(cell_line %in% c("198_1", "198_5") ~ F,
+                              T ~ T)
+    )|> 
+  mutate(cell_type = "neurons")
+gene_info_df <- read_csv("tables/gene_info_df.csv.gz")
+
+comb_sample_sheet_df <- bind_rows(sample_sheet_df, sample_sheet_df_neural)
+
+chr21_allelic_counts_df <- read_tsv("phaser_output_all_chr21/combined_bams_gene_ae.txt",
+                                    col_types = cols(contig=col_character(),
+                                                     variants=col_character())) |> 
+  filter(contig == "21" &
+           totalCount > 0) |> 
+  mutate(sample_id=str_extract(bam, "d[0-9]*")) |> 
+  inner_join(sample_sheet_df)
+write_csv(chr21_allelic_counts_df, "tables/chr21_allelic_counts_df.csv.gz")
+
+chr21_allelic_counts_jan2024_df <- read_tsv("phaser_output_Jan2024/combined_bams_gene_ae.txt",
+                                    col_types = cols(contig=col_character(),
+                                                     variants=col_character())) |> 
+  filter(contig == "21" &
+           totalCount > 0) |> 
+  mutate(sample_id=str_extract(bam, "d[0-9]*")) |> 
+  right_join(comb_sample_sheet_df)
+write_csv(chr21_allelic_counts_df, "tables/chr21_allelic_counts_df_jan2024.csv.gz")
+
+make_allelic_plot <- function(chr21_allelic_counts_df, allele_designation_df_byt21=F, keep_cell_type = "ipscs") {
+
+  chr21_allelic_counts_df <- chr21_allelic_counts_df |> 
+    filter(cell_type == keep_cell_type)
+
+  print(chr21_allelic_counts_df$sample_id |> unique())
+
+  if(length(allele_designation_df_byt21) == 1 && allele_designation_df_byt21 == FALSE) {
+    allele_designation_df_byt21 <- chr21_allelic_counts_df |> 
+      filter(`Cell Line` %in% c("198-1", "198-2", "198-5"),
+             totalCount > 6) |> 
+      pivot_longer(cols=c(aCount, bCount), names_to = "allele", values_to = "count") |> 
+      group_by(name, allele, genotype) |> 
+      summarise(mean_ratio=mean(count/totalCount)) |> 
+      ungroup() |> 
+      tidyr::pivot_wider(names_from = c("genotype", "allele"), values_from = "mean_ratio") |> 
+      na.omit() |> 
+      mutate(a_allele_diff=T21_aCount-D21_aCount,
+             b_allele_diff=T21_bCount-D21_bCount,
+             paternal_allele=case_when(abs(a_allele_diff) < .1 & abs(b_allele_diff) < .1 ~ "nodiff",
+                                       a_allele_diff>0 & b_allele_diff < 0 ~ "bCount",
+                                       a_allele_diff<0 & b_allele_diff > 0 ~ "aCount",
+                                       T ~ "undetermined"),
+             maternal_allele=case_when(paternal_allele == "aCount" ~ "bCount",
+                                       paternal_allele == "bCount" ~ "aCount")) |>
+      filter(paternal_allele %in% c("aCount", "bCount")) |>
+      pivot_longer(cols = c(paternal_allele, maternal_allele), names_to = "allele_type", values_to = "allele")
+  }
+
+  print("ENSG00000154645.14" %in% allele_designation_df_byt21$name)
+
+  rtta_xist_ae_plot_df <- chr21_allelic_counts_df |> 
+    filter(!is.na(`Cell Line`) & totalCount >= 10) |> 
+    mutate(dox_status=str_replace(dox_status, "withdrawal", "w/d"),
+           sample_type=case_when(has_transgene == T & dox_status == "no" ~ "T21XIST\n-dox",
+                                 has_transgene == T & dox_status == "yes" ~ "T21XIST\n+dox",
+                                 has_transgene == T & dox_status == "w/d" & timepoint == "6wk" ~ "T21XIST\n3w/d",
+                                 has_transgene == T & dox_status == "w/d" & timepoint == "17wk" ~ "T21XIST\n3w/d",
+                                 has_transgene == T & dox_status == "w/d" & timepoint == "9wk" ~ "T21XIST\n6w/d",
+                                 has_transgene == T & dox_status == "w/d" & timepoint == "20wk" ~ "T21XIST\n6w/d",
+                                 has_transgene == T & dox_status == "w/d" & is.na(timepoint) ~ "T21XIST\nw/d",
+                                 cell_line == "198_5" ~ "T21",
+                                 cell_line %in% c("198_1", "198_2") ~ "D21",
+                                 T ~ cell_line)) |> 
+    pivot_longer(cols=c(aCount, bCount), names_to = "allele", values_to = "count") |> 
+    mutate(ratio=count/totalCount) |> 
+    inner_join(allele_designation_df_byt21, by=c("name", "allele")) |> 
+    group_by(sample_type, name, start, allele_type) |> 
+    summarise(mean_count=mean(count),
+              mean_ratio=mean(ratio),
+              totalCount=dplyr::first(totalCount)) |> 
+    ungroup()
+
+  ensembl_ids_to_keep <- rtta_xist_ae_plot_df |>
+    left_join(gene_info_df, by=c("name"="ensembl_gene_id_version")) |> 
+    filter(allele_type=="paternal_allele" &
+             gene_biotype %in% c("protein_coding")) |> 
+    pivot_wider(id_cols=c(name), names_from=sample_type, values_from=mean_ratio) |> 
+    na.exclude() |> 
+
+    # dplyr::filter(`pb_no` < .9 & `pb_no` > .1) |> 
+    pull(name)
+
+  genes_sorted_by_position <- rtta_xist_ae_plot_df |> 
+    filter(name %in% ensembl_ids_to_keep) |>
+    arrange(start) |> 
+    pull(name) |> 
+    unique()
+
+  #potential escapees
+  potential_escapees <- rtta_xist_ae_plot_df |> 
+    filter(sample_type == "T21XIST\n+dox" &
+             name %in% ensembl_ids_to_keep &
+             allele_type == "paternal_allele" &
+             mean_ratio >= 1/6) |> 
+    left_join(gene_info_df, by=c("name"="ensembl_gene_id_version")) |> 
+    pull(name)
+
+  plt <- ggplot(rtta_xist_ae_plot_df |> 
+           filter(name %in% ensembl_ids_to_keep) |>
+           mutate(sample_type1 = case_when(str_detect(sample_type, "T21XIST") ~ "T21XIST",
+                                           T ~ sample_type),
+                  sample_type2 = case_when(!str_detect(sample_type, "T21XIST") ~ "",
+                                           T ~ str_extract(sample_type, "\\-dox|\\+dox|3w\\/d|6w\\/d|w\\/d"))) |> 
+           left_join(gene_info_df, by=c("name"="ensembl_gene_id_version")), 
+         aes(x=name, y=mean_ratio, fill=allele_type, totalCount=totalCount, gene_name=hgnc_symbol)) +
+    geom_col(position=position_fill(), width=.8) +
+    scale_x_discrete(limits=genes_sorted_by_position) +
+    scale_fill_d3(limits=c("maternal_allele", "paternal_allele"), labels=c("M", "P")) +
+    ggh4x::facet_nested(rows = vars(factor(sample_type1, levels = c("D21", "T21", "T21XIST")), 
+                                    factor(sample_type2, levels = c("","-dox", "+dox", "3w/d", "6w/d", "w/d"))),
+                        strip = ggh4x::strip_nested(clip = "off", size = "variable"),
+                        nest_line = element_line(linetype = 1)) +
+    #strip=ggh4x::strip_nested(clip="off")
+    # facet_grid(rows = ) +
+    scale_y_continuous(breaks = c(0,.5,1)) +
+    geom_hline(yintercept=.5, size=.5) +
+    geom_hline(yintercept=1/3, linetype="dashed", size=.3) +
+    labs(title="HSA21 Protein Coding Gene Allelic Expression",
+         x="Gene",
+         y="Fraction of Expression",
+         fill="Allele") +
+    theme(axis.ticks.x = element_blank(),
+          axis.text.x = element_blank(),
+          legend.position = "top",
+          legend.title = element_text(margin = margin(r=3, unit="mm")),
+          legend.text = element_text(margin = margin(r=2, unit="mm")),
+          strip.text.y = element_text(angle=0)
+          # text = element_text(size=7),
+          # axis.title = element_text(size=7),
+          # plot.title = element_text(size=7, face="bold", margin=margin(t=2, 0)),
+          # axis.text.y = element_text(size=5),
+          # panel.spacing = unit(.2, "mm"),
+          # strip.text = element_text(size=12, margin=margin(t=0,r=1,b=0,l=1)),
+          # strip.background = element_rect(size = .3),
+          # plot.margin = margin(0.2)
+    )
+
+  return(list(plt, potential_escapees, allele_designation_df_byt21))
+}
+
+c(allelic_plt, potential_escapees, allele_designation_df) %<-% make_allelic_plot(chr21_allelic_counts_df)