allelic_analysis.R

## ---------------------------
##
## Script name: 
##
## Purpose of script:
##
## Author: Prakhar Bansal
##
## Date Created: 2021-09-08
##
## Copyright (c) Prakhar Bansal, 2021
## Email: pbansal@uchc.edu
##
## ---------------------------
##
## Notes:
##
##
## ---------------------------

## set working directory
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

## ---------------------------

## load up the packages we will need:  (uncomment as required)
require(tidyverse); theme_set(theme_classic() + theme(plot.title=element_text(hjust=0.5)))
require(ggsci) 
library(zeallot)

## ---------------------------

output_dir <- "outputs/2024-02-17 21-39-53.641348 timepoint+dox_status+effective_genotype"
sample_sheet_df <- read_csv(paste0(output_dir, "/tables/sample_sheet_df.csv.gz")) |> 
  mutate(cell_type = "ipscs")
neural_output_dir <- paste0("../salmon_pipeline_with_stefan/",
                            (read_csv(paste0(output_dir, "/params.csv.gz")) |>
                        filter(parameter == "neural_output_dir") |>
                        pull(value))[1])
sample_sheet_df_neural <- read_csv(paste0(neural_output_dir, "/tables/sample_sheet_df.csv.gz")) |> 
  mutate(
    cell_line = str_replace_all(cell_line, "C(?=(1|4|7))", "rtTA_XIST_c"),
    `Cell Line` = str_replace_all(cell_line, "_c", " c") |> 
      str_replace_all("_", "-"),
    dox_status = case_when(dox_status == "wd" ~ "withdrawal",
                           T ~ dox_status),
    has_transgene = case_when(cell_line %in% c("198_1", "198_5") ~ F,
                              T ~ T)
    )|> 
  mutate(cell_type = "neurons")
gene_info_df <- read_csv("tables/gene_info_df.csv.gz")

comb_sample_sheet_df <- bind_rows(sample_sheet_df, sample_sheet_df_neural)

chr21_allelic_counts_df <- read_tsv("phaser_output_all_chr21/combined_bams_gene_ae.txt",
                                    col_types = cols(contig=col_character(),
                                                     variants=col_character())) |> 
  filter(contig == "21" &
           totalCount > 0) |> 
  mutate(sample_id=str_extract(bam, "d[0-9]*")) |> 
  inner_join(sample_sheet_df)
write_csv(chr21_allelic_counts_df, "tables/chr21_allelic_counts_df.csv.gz")

chr21_allelic_counts_jan2024_df <- read_tsv("phaser_output_Jan2024/combined_bams_gene_ae.txt",
                                    col_types = cols(contig=col_character(),
                                                     variants=col_character())) |> 
  filter(contig == "21" &
           totalCount > 0) |> 
  mutate(sample_id=str_extract(bam, "d[0-9]*")) |> 
  right_join(comb_sample_sheet_df)
write_csv(chr21_allelic_counts_df, "tables/chr21_allelic_counts_df_jan2024.csv.gz")

make_allelic_plot <- function(chr21_allelic_counts_df, allele_designation_df_byt21=F, keep_cell_type = "ipscs") {
  
  chr21_allelic_counts_df <- chr21_allelic_counts_df |> 
    filter(cell_type == keep_cell_type)
  
  print(chr21_allelic_counts_df$sample_id |> unique())
  
  if(length(allele_designation_df_byt21) == 1 && allele_designation_df_byt21 == FALSE) {
    allele_designation_df_byt21 <- chr21_allelic_counts_df |> 
      filter(`Cell Line` %in% c("198-1", "198-2", "198-5"),
             totalCount > 6) |> 
      pivot_longer(cols=c(aCount, bCount), names_to = "allele", values_to = "count") |> 
      group_by(name, allele, genotype) |> 
      summarise(mean_ratio=mean(count/totalCount)) |> 
      ungroup() |> 
      tidyr::pivot_wider(names_from = c("genotype", "allele"), values_from = "mean_ratio") |> 
      na.omit() |> 
      mutate(a_allele_diff=T21_aCount-D21_aCount,
             b_allele_diff=T21_bCount-D21_bCount,
             paternal_allele=case_when(abs(a_allele_diff) < .1 & abs(b_allele_diff) < .1 ~ "nodiff",
                                       a_allele_diff>0 & b_allele_diff < 0 ~ "bCount",
                                       a_allele_diff<0 & b_allele_diff > 0 ~ "aCount",
                                       T ~ "undetermined"),
             maternal_allele=case_when(paternal_allele == "aCount" ~ "bCount",
                                       paternal_allele == "bCount" ~ "aCount")) |>
      filter(paternal_allele %in% c("aCount", "bCount")) |>
      pivot_longer(cols = c(paternal_allele, maternal_allele), names_to = "allele_type", values_to = "allele")
  }
  
  print("ENSG00000154645.14" %in% allele_designation_df_byt21$name)
  
  rtta_xist_ae_plot_df <- chr21_allelic_counts_df |> 
    filter(!is.na(`Cell Line`) & totalCount >= 10) |> 
    mutate(dox_status=str_replace(dox_status, "withdrawal", "w/d"),
           sample_type=case_when(has_transgene == T & dox_status == "no" ~ "T21XIST\n-dox",
                                 has_transgene == T & dox_status == "yes" ~ "T21XIST\n+dox",
                                 has_transgene == T & dox_status == "w/d" & timepoint == "6wk" ~ "T21XIST\n3w/d",
                                 has_transgene == T & dox_status == "w/d" & timepoint == "17wk" ~ "T21XIST\n3w/d",
                                 has_transgene == T & dox_status == "w/d" & timepoint == "9wk" ~ "T21XIST\n6w/d",
                                 has_transgene == T & dox_status == "w/d" & timepoint == "20wk" ~ "T21XIST\n6w/d",
                                 has_transgene == T & dox_status == "w/d" & is.na(timepoint) ~ "T21XIST\nw/d",
                                 cell_line == "198_5" ~ "T21",
                                 cell_line %in% c("198_1", "198_2") ~ "D21",
                                 T ~ cell_line)) |> 
    pivot_longer(cols=c(aCount, bCount), names_to = "allele", values_to = "count") |> 
    mutate(ratio=count/totalCount) |> 
    inner_join(allele_designation_df_byt21, by=c("name", "allele")) |> 
    group_by(sample_type, name, start, allele_type) |> 
    summarise(mean_count=mean(count),
              mean_ratio=mean(ratio),
              totalCount=dplyr::first(totalCount)) |> 
    ungroup()
  
  ensembl_ids_to_keep <- rtta_xist_ae_plot_df |>
    left_join(gene_info_df, by=c("name"="ensembl_gene_id_version")) |> 
    filter(allele_type=="paternal_allele" &
             gene_biotype %in% c("protein_coding")) |> 
    pivot_wider(id_cols=c(name), names_from=sample_type, values_from=mean_ratio) |> 
    na.exclude() |> 
    
    # dplyr::filter(`pb_no` < .9 & `pb_no` > .1) |> 
    pull(name)
  
  genes_sorted_by_position <- rtta_xist_ae_plot_df |> 
    filter(name %in% ensembl_ids_to_keep) |>
    arrange(start) |> 
    pull(name) |> 
    unique()
  
  #potential escapees
  potential_escapees <- rtta_xist_ae_plot_df |> 
    filter(sample_type == "T21XIST\n+dox" &
             name %in% ensembl_ids_to_keep &
             allele_type == "paternal_allele" &
             mean_ratio >= 1/6) |> 
    left_join(gene_info_df, by=c("name"="ensembl_gene_id_version")) |> 
    pull(name)
  
  plt <- ggplot(rtta_xist_ae_plot_df |> 
           filter(name %in% ensembl_ids_to_keep) |>
           mutate(sample_type1 = case_when(str_detect(sample_type, "T21XIST") ~ "T21XIST",
                                           T ~ sample_type),
                  sample_type2 = case_when(!str_detect(sample_type, "T21XIST") ~ "",
                                           T ~ str_extract(sample_type, "\\-dox|\\+dox|3w\\/d|6w\\/d|w\\/d"))) |> 
           left_join(gene_info_df, by=c("name"="ensembl_gene_id_version")), 
         aes(x=name, y=mean_ratio, fill=allele_type, totalCount=totalCount, gene_name=hgnc_symbol)) +
    geom_col(position=position_fill(), width=.8) +
    scale_x_discrete(limits=genes_sorted_by_position) +
    scale_fill_d3(limits=c("maternal_allele", "paternal_allele"), labels=c("M", "P")) +
    ggh4x::facet_nested(rows = vars(factor(sample_type1, levels = c("D21", "T21", "T21XIST")), 
                                    factor(sample_type2, levels = c("","-dox", "+dox", "3w/d", "6w/d", "w/d"))),
                        strip = ggh4x::strip_nested(clip = "off", size = "variable"),
                        nest_line = element_line(linetype = 1)) +
    #strip=ggh4x::strip_nested(clip="off")
    # facet_grid(rows = ) +
    scale_y_continuous(breaks = c(0,.5,1)) +
    geom_hline(yintercept=.5, size=.5) +
    geom_hline(yintercept=1/3, linetype="dashed", size=.3) +
    labs(title="HSA21 Protein Coding Gene Allelic Expression",
         x="Gene",
         y="Fraction of Expression",
         fill="Allele") +
    theme(axis.ticks.x = element_blank(),
          axis.text.x = element_blank(),
          legend.position = "top",
          legend.title = element_text(margin = margin(r=3, unit="mm")),
          legend.text = element_text(margin = margin(r=2, unit="mm")),
          strip.text.y = element_text(angle=0)
          # text = element_text(size=7),
          # axis.title = element_text(size=7),
          # plot.title = element_text(size=7, face="bold", margin=margin(t=2, 0)),
          # axis.text.y = element_text(size=5),
          # panel.spacing = unit(.2, "mm"),
          # strip.text = element_text(size=12, margin=margin(t=0,r=1,b=0,l=1)),
          # strip.background = element_rect(size = .3),
          # plot.margin = margin(0.2)
    )
  
  return(list(plt, potential_escapees, allele_designation_df_byt21))
}

c(allelic_plt, potential_escapees, allele_designation_df) %<-% make_allelic_plot(chr21_allelic_counts_df)