# T21_seurat_anchor_SCT_integration_sp.R


## set working directory
# NOTE(review): rstudioapi::getSourceEditorContext() presumably requires an
# active RStudio session; confirm before running this script via Rscript.
setwd(dirname(unlist(rstudioapi::getSourceEditorContext()['path'])))

## ---------------------------

## load up the packages we will need:  (uncomment as required)
# Packages are attached with library() rather than require(): require() only
# returns FALSE (with a warning) when a package is missing, so the script
# would continue and fail later with a much less obvious error.
library(tidyverse)

# Default ggplot2 theme for the plots created after this point: small sans
# text, thin black panel border, no axis line, tight legend spacing.
theme_set(
  theme_classic() +
    theme(
      text = element_text(size = 6, family = "sans"),
      plot.title = element_text(hjust = 0.5, size = 8, family = "sans", face = "bold"),
      axis.title = element_text(size = 7, face = "bold"),
      axis.text = element_text(size = 6),
      legend.spacing.x = unit(0, "cm"),
      legend.spacing.y = unit(2, "mm"),
      legend.key.height = unit(4, "mm"),
      legend.margin = margin(0, 0, 0, 0),
      axis.line = element_line(size = 0),
      strip.background = element_blank(),
      strip.text = element_text(margin = margin(1, 1, 1, 1, unit = "mm")),
      panel.border = element_rect(color = "black", fill = NA, size = .5)
    )
)
library(ggsci)
#library(scales)
library(ggthemes)
library(Seurat)
library(SeuratData)
library(patchwork)
library(HGNChelper)
library(openxlsx)
## ---------------------------
# Gene annotation table (read from a gzip-compressed CSV).
gene_info_df <- read_csv("external_tables/gene_info_df.csv.gz")

# Adding the data to Seurat
mat_path <- "input_data/parsebio_scrnaseq/real_outputs/combined/all-well/DGE_filtered"
mat <- ReadParseBio(mat_path)

# Check to see if empty gene names are present, add name if so.
table(rownames(mat) == "")
rownames(mat)[rownames(mat) == ""] <- "unknown"

# Read in cell meta data (first column becomes the row names)
cell_meta <- read.csv(file.path(mat_path, "cell_metadata.csv"), row.names = 1)

# Create object
# The thresholds must be passed as `min.features` / `min.cells`. The earlier
# spellings (`min_genes`, `min_cells`, `names.feild`) are not parameters of
# CreateSeuratObject(), so the intended filters were never applied.
# `names.field` is left at its default because orig.ident and the active
# identities are overwritten explicitly just below.
t21neu <- CreateSeuratObject(mat, min.features = 100, min.cells = 100,
                             meta.data = cell_meta)
# Setting our initial cell class to a single type; this will change after clustering.
t21neu@meta.data$orig.ident <- factor(rep("t21neu", nrow(t21neu@meta.data)))
Idents(t21neu) <- t21neu@meta.data$orig.ident

# Persist the unfiltered (pre-QC) object, then reload it under the name that
# the downstream code reads (t21neu_pre).
saveRDS(t21neu, "tables/seurat_obj_before_QC.RDS")


t21neu_pre <- readRDS("tables/seurat_obj_before_QC.RDS")


library(clusterProfiler)
library(msigdbr)

# Build a two-column (gs_name, gene_symbol) table of human MSigDB gene sets.
# Kept collections: H and C1 in full, the "CP:" sub-collections of C2 and the
# "HPO" sub-collection of C5. C3 (TFT), C4 and C8 are deliberately left out.
msig_df <- msigdbr(species = "Homo sapiens")
msig_terms <- msig_df |>
  filter(
    gs_cat %in% c("H", "C1") |
      (gs_cat == "C2" & str_detect(gs_subcat, "^CP:")) |
      (gs_cat == "C5" & str_detect(gs_subcat, "^HPO"))
  ) |>
  dplyr::select(gs_name, gene_symbol)

# The full MSigDB table is no longer needed once the subset is extracted.
rm(msig_df)


# Run configuration handed to fx.master().
cust.name   <- "fin13"       # prefix of the output directory name
vc.regress  <- "MT"          # variable(s) regressed out; also part of the output directory name
min.xist    <- 2             # XIST count threshold
RNAvSCT     <- "integrated"  # assay name
vc.tissue   <- "Burke"       # tissue / gene-set label
collections <- c("WP", "REACTOME", "KEGG", "HALLMARK", "BIOCARTA", "PID")

# NOTE(review): fx.master is defined further down in this file, after this
# call. Running the file top to bottom in a fresh session will fail here
# unless the definition has already been evaluated -- consider moving this
# call below the function definition.
system.time(
  fx.master(
    cust.name = cust.name,
    vc.regress = vc.regress,
    min.xist = min.xist,
    RNAvSCT = RNAvSCT,
    vc.tissue = vc.tissue,
    collections = collections
  )
)

fx.master<-function(cust.name, vc.regress, min.xist, RNAvSCT, vc.tissue, collections)
{

outdir<-paste0(c(cust.name, vc.tissue, RNAvSCT,"with",vc.regress), collapse = "_")
dir.create(outdir)
dir.create(paste0(outdir,"/tables"))

vc.nameBYchr<-setNames(gene_info_df$hgnc_symbol, gene_info_df$chromosome_name)
ls.nameBYchr<-split(vc.nameBYchr, names(vc.nameBYchr))

ls.countsBYchr<-lapply(ls.nameBYchr,function(X) {
  fraction <- Matrix::colSums(t21neu_pre[as.vector(X), ])/Matrix::colSums(t21neu_pre)
})

df.CHRfrac<-as.data.frame(ls.countsBYchr)

rb.genes <- rownames(t21neu_pre)[grep("^RP[SL]",rownames(t21neu_pre))]
rpsl<- Matrix::colSums(t21neu_pre[rb.genes, ])/Matrix::colSums(t21neu_pre)
rn5.genes <- rownames(t21neu_pre)[grep("^RNA5S",rownames(t21neu_pre))]
rn5s<- Matrix::colSums(t21neu_pre[rn5.genes, ])/Matrix::colSums(t21neu_pre)

df.neu<-df.CHRfrac |> rownames_to_column("cell_id") |>
  left_join(t21neu_pre@meta.data |> rownames_to_column("cell_id"), by="cell_id") |>
  left_join(cbind(rpsl, rn5s) |> as.data.frame() |> rownames_to_column("cell_id"), by="cell_id") |>
  column_to_rownames("cell_id")

t21neu_pre@meta.data<- df.neu |> mutate(dox = str_detect(sample, "_dox")) |>
  mutate(sample = case_when(str_detect(sample, "198_1") ~ "D21a",
                            str_detect(sample, "198_2") ~ "D21b",
                            str_detect(sample, "nodox") ~ "T21",
                            str_detect(sample, "d0") ~ ".day0",
                            str_detect(sample, "pD") ~ "pDAPT")) |>
 mutate(mcount_ratio = mread_count/tscp_count)

t21neu_removed <- subset(t21neu_pre, subset = nFeature_RNA > 7500 | nFeature_RNA < 1500 | nCount_RNA < 2500 | MT > 0.1)
t21neu <-subset(t21neu_pre, subset = nFeature_RNA <= 7500 & nFeature_RNA >= 1500 & nCount_RNA >= 2500 & MT <= 0.1)

tb.CHR<-pivot_longer(t21neu_pre@meta.data |> rownames_to_column("cell_id"),
                     c(colnames(df.CHRfrac),"rpsl", "rn5s", "gene_count", "tscp_count", "mcount_ratio"))
ggplot(tb.CHR, aes(x=value, color = sample)) + stat_ecdf(geom = "step") +
  facet_wrap(vars(name), scales="free_x")
ggsave("count_ecdf.pdf", path=outdir, height = 5, width = 5, units="in")

ggplot(t21neu_removed@meta.data, aes(x=MT, fill=sample)) +
  geom_histogram() + facet_wrap(vars(sample), scales="free") +
  geom_vline(xintercept = 0.1)
ggsave("MT_removed.pdf", path=outdir, height = 3, width = 3.6, units="in")

ggplot(t21neu_removed@meta.data, aes(x=gene_count, fill=sample)) +
  geom_histogram() + facet_wrap(vars(sample), scales="free") +
  geom_vline(xintercept = c(7500))
ggsave("gene_count_removed.pdf", path=outdir, height = 3, width = 3.6, units="in")

ggplot(t21neu_removed@meta.data, aes(x=tscp_count, fill=sample)) +
  geom_histogram() + facet_wrap(vars(sample), scales="free_y") +
  geom_vline(xintercept = c(2500))
ggsave("tscp_count_removed.pdf", path=outdir, height = 3, width = 3.6, units="in")

ggplot(t21neu_pre@meta.data, aes(y=tscp_count, x=sample, color = sample)) +
  geom_boxplot(varwidth = T) +
  stat_summary(fun.data = function(x) {
    data.frame(label=signif(quantile(x, prob=c(0.1, 0.2, .3,.4,.5,.6,.7,.8,.9), type=1),4), y=c(seq(1,9)*(-3000)))
  }, geom = "text",
  size=3) + geom_hline(yintercept = c(3200))
ggsave("tscp_count_dis.pdf", path=outdir, height = 3, width =3.6, units="in")

rm(tb.CHR)
rm(vc.nameBYchr)
rm(ls.nameBYchr)
gc()
#based on:
#https://satijalab.org/seurat/articles/sctransform_v2_vignette.html
#https://github.com/satijalab/seurat/issues/5879

t21neu_list <- SplitObject(t21neu, split.by = "sample")

t21neu_list <- lapply(t21neu_list, FUN = function(x) {
  x <- NormalizeData(x, normalization.method = "RC", scale.factor = 1e6)
  x<-CellCycleScoring(x, s.features = cc.genes.updated.2019$s.genes,
                      g2m.features = cc.genes.updated.2019$g2m.genes,
                      set.ident = F)
  x@meta.data<-x@meta.data |> mutate(CCdiff = (S.Score-G2M.Score))
  #x<-FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
  #x<-ScaleData(x, vars.to.regress = vc.regress)
  x<-SCTransform(x, vst.flavor = "v2", vars.to.regress = vc.regress, return.only.var.genes =F, verbose = F)
})

features <- SelectIntegrationFeatures(object.list = t21neu_list, nfeatures = 3000)
t21neu_list <- PrepSCTIntegration(object.list = t21neu_list, anchor.features = features)

t21neu_anchors <- FindIntegrationAnchors(object.list = t21neu_list, normalization.method = "SCT",
                                         anchor.features = features,
                      k.filter = min(t21neu@meta.data$sample |> table()))
t21neu_combined <- IntegrateData(anchorset = t21neu_anchors, normalization.method = "SCT")

t21neu_combined  <- RunPCA(t21neu_combined, verbose = FALSE, features = features)
t21neu_combined  <- RunUMAP(t21neu_combined, reduction = "pca", dims = 1:30)
t21neu_combined <- FindNeighbors(t21neu_combined, reduction = "pca", dims = 1:30)
t21neu_combined <- FindClusters(t21neu_combined, resolution = 0.5)

rm(t21neu_pre)
rm(t21neu_removed)
rm(t21neu)
rm(t21neu_list)
rm(t21neu_anchors)
gc()

# Determine metrics to plot present in t21neu_combined@meta.data
require(cowplot)
metrics <-  c("gene_count", "tscp_count", "mcount_ratio", "S.Score", "G2M.Score", "CCdiff",
              "MT", "rn5s", "rpsl")

# Extract the UMAP coordinates for each cell and include information about the metrics to plot
qc_data <- FetchData(t21neu_combined,
                     vars = c(metrics, "ident", "UMAP_1", "UMAP_2"))

# Adding cluster label to center of cluster on UMAP
umap_label <- FetchData(t21neu_combined,
                        vars = c("ident", "UMAP_1", "UMAP_2"))  %>%
  as.data.frame() %>%
  group_by(ident) %>%
  summarise(x=mean(UMAP_1), y=mean(UMAP_2))

# Plot a UMAP plot for each metric
map(metrics, function(qc){
  ggplot(qc_data,
         aes(UMAP_1, UMAP_2)) +
    geom_point(aes_string(color=qc),
               alpha = 0.7) +
    scale_color_gradient(guide = "none",
                         low = "grey90",
                         high = "blue")  +
    geom_text(data=umap_label,
              aes(label=ident, x, y)) +
    ggtitle(qc)
}) %>% plot_grid(plotlist = .)
ggsave("cowplot.pdf", path=outdir, width=3.6, height=3.6, units="in")


p1 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "sample")
p2 <- DimPlot(t21neu_combined, reduction = "umap", label = TRUE, repel = TRUE)
p1 + p2

DimPlot(t21neu_combined, reduction = "umap", split.by = "sample") +
  theme(legend.position = "bottom") + scale_color_brewer(palette = "Paired")
ggsave("umap_bysample.pdf", path=outdir, width=7.2, height=4, units="in")

source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/auto_detect_tissue_type.R")

# get cell-type-specific gene sets from our in-built database (DB)
# gs_list <- gene_sets_prepare("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_short.xlsx", "Brain") # e.g. Immune system, Liver, Pancreas, Kidney, Eye, Brain

# Burke custom gene set
t21neu_combined <- PrepSCTFindMarkers(t21neu_combined)
DefaultAssay(t21neu_combined)<-RNAvSCT

tissue_guess = auto_detect_tissue_type(path_to_db_file = "../scRNAseq/external_tables/ScTypeDB_full.xlsx",
                                       seuratObject = t21neu_combined, scaled = T,
                                       assay = RNAvSCT)
print(tissue_guess)

tissue_type = vc.tissue
gs_list <- gene_sets_prepare("../scRNAseq/external_tables/ScTypeDB_short.xlsx", tissue_type)

# assign cell types
es.max <- sctype_score(scRNAseqData = t21neu_combined[[DefaultAssay(t21neu_combined)]]@scale.data,
                       scaled = TRUE,
                       gs = gs_list$gs_positive,
                       gs2 = gs_list$gs_negative)

# merge by cluster
cL_results <- do.call("rbind", lapply(unique(t21neu_combined@meta.data$seurat_clusters), function(cl){
  es.max.cl = sort(rowSums(es.max[ ,rownames(t21neu_combined@meta.data[t21neu_combined@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
  head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(t21neu_combined@meta.data$seurat_clusters==cl)), 10)
}))
write_csv(cL_results, paste0(outdir,"/tables/cL_results.csv"))
sctype_scores <- cL_results |> arrange(desc(scores)) |>  group_by(cluster) |> top_n(1, scores)

# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] <- "Unknown"
print(sctype_scores[,1:3])

t21neu_combined@meta.data$customclassif <- ""
for(j in unique(sctype_scores$cluster)){
  cl_type <- sctype_scores[sctype_scores$cluster==j,];
  t21neu_combined@meta.data$customclassif[t21neu_combined@meta.data$seurat_clusters == j] <- as.character(cl_type$type[1])
}

xist_threshold <- min.xist
t21neu_combined@meta.data<- t21neu_combined@meta.data |>
  mutate(simplified_celltype = #"neuron") #customclassif) |>
           case_when(#customclassif %in% c("Endothelial cells","Neuroblasts") ~ "Unknown",
             #customclassif %in% c("Immature neurons", "Neuroblasts") ~ "neuron",
             #                   #customclassif %in% c("Tanycytes", "Non myelinating Schwann cells") ~ "glia",
             (str_detect(customclassif, "euron") |
                str_detect(customclassif, "Fetal_quiescent"))
             ~ "Neurons",
             (str_detect(customclassif, "stem") |
                str_detect(customclassif, "ndothelial") |
                str_detect(customclassif, "Neuroepithelial") |
                str_detect(customclassif, "Neural Progenitor") |
                str_detect(customclassif, "NPC") |
                str_detect(customclassif, "Fetal_replicating") |
                str_detect(customclassif, "Radial")) ~ "NPCs",
             (#str_detect(customclassif, "Tanycytes") |
               #str_detect(customclassif, "Oligodendro") |
                #str_detect(customclassif, "OPC") |
                str_detect(customclassif, "Astrocyte"))~ "Astrocytes",
             T ~ "Other")) |>
  rownames_to_column("cell_id") |>
  left_join(t21neu_combined$RNA@counts["XIST",] |>
           enframe(name="cell_id", value="XIST_counts")) |>
  mutate(XIST_status = case_when(XIST_counts >= xist_threshold ~ T,
                                 XIST_counts == 0 ~ F,
                                 T ~ NA)) |>
  column_to_rownames("cell_id")

sample.cols<-setNames(c("#E69F00", "#56B4E9", "#0072B2", "#D55E00", "#CC79A7"),
                      nm = names(t21neu_combined$sample |> table()))

p1 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "XIST_status", shuffle = F, pt.size = 0.1) +
  theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8))  +
  scale_color_jco()
p2 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "Phase", shuffle = T, pt.size = 0.1) +
  theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8))  +
  scale_color_jama()
p3 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "simplified_celltype", shuffle = T, pt.size = 0.1) +
  theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8))  +
  scale_color_tron()
p4 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "customclassif", shuffle = T, pt.size = 0.1) +
  theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8))  +
  scale_color_brewer(palette = "Set2")
p5 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "sample", shuffle = T, pt.size = 0.1) +
  scale_color_manual(values = sample.cols) +
  theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8))
p6<-DimPlot(t21neu_combined, reduction = "umap", label = F, shuffle = T, pt.size = 0.1) +
  theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8))  +
  scale_color_brewer(palette = "Paired")
#p1 + p5 + p6 + p4 + p3 + p2
p6 + p4 +
p2 + p3 +
p1 + p5 + plot_layout(ncol = 2)
ggsave("umaps.pdf", path=outdir, width=3.6, height=6.5, units="in")

b1<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |>
  filter(!is.na(XIST_status)) |>
    group_by(sample, simplified_celltype, XIST_status) |>
    tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio = signif(100*n/sum,2)) |> ungroup() |>
  mutate(label_pos = case_when(XIST_status ~ (ratio-3), T ~ (-50))) |>
  ggplot(aes(x = sample, y=ratio, fill = XIST_status, label = ratio)) + geom_bar(stat = "identity") +
  geom_text(aes(y = label_pos, label = ratio), size=2) +
  facet_wrap(vars(simplified_celltype)) + scale_fill_jco() + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) +
  theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") +
  theme(legend.position = "top", legend.title=element_blank(), axis.title = element_blank(), legend.key.height = unit(0.1, 'in'))
  #ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype",
  #          label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill())
#ggsave("XIST_bysample.pdf", path=outdir, width=1.8, height=2.4, units="in")

b2<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |>
  filter(!is.na(simplified_celltype)) |>
  group_by(sample, customclassif, simplified_celltype) |>
  tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio = round(100*n/sum,digits = 0)) |> ungroup() |>
  mutate(label_pos = case_when(XIST_status ~ (ratio-3), T ~ (-10))) |>
  ggplot(aes(x = sample, y=ratio, fill = customclassif)) + geom_bar(stat = "identity") +# geom_text(aes(y = label_pos, label = ratio), size=2) +
  facet_wrap(vars(simplified_celltype)) + scale_fill_brewer(palette = "Set2") + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) +
  theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") +
  theme(legend.position = "top", legend.direction = "horizontal", legend.title=element_blank(),legend.key.height = unit(0.1, 'in'))
#ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype",
#          label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill())
#ggsave("Customclassif_bysample.pdf", path=outdir, width=1.8, height=2.57, units="in")

b3<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |>
  filter(!is.na(Phase)) |>
  group_by(sample, Phase, simplified_celltype) |>
  tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio_label = signif(100*n/sum,digits =2), ratio = 100*n/sum) |> ungroup() |>
  mutate(label_pos = case_when(Phase == "G1" ~ 95, T ~ (-10))) |>
  ggplot(aes(x = sample, y=ratio, fill = Phase)) + geom_bar(stat = "identity") + geom_text(aes(y = label_pos, label = ratio_label, color="white"), size=2) +
  facet_wrap(vars(simplified_celltype)) + scale_fill_jama() + scale_color_manual(values = "white") + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) +
  theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") +
  theme(legend.position = "top", legend.title=element_blank(), axis.title = element_blank(), legend.key.height = unit(0.1, 'in'))
#ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype",
#          label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill())
#ggsave("Phase_bysample.pdf", path=outdir, width=1.8, height=2.4, units="in")


ggsave(b2+b3+b1, filename = "supp_plots.pdf",path = outdir, height = 2.5, width = 5.4, units="in")
#DimPlot(t21neu_combined, reduction = "umap", split.by = "sample", group.by = "customclassif")
DimPlot(t21neu_combined, reduction = "umap", split.by = "sample", group.by = "simplified_celltype", shuffle=T) +
  theme(legend.position = "bottom", text = element_text(size = 8)) + scale_color_tron()
ggsave("umap_bysample.pdf", path=outdir, width=3.6, height=3, units="in")

# add allelic data to the metadata
allelic_df <-read_csv("../parsebio_scRNAseq/tables/indv_cells_gene_ae_filtered_paternal_withneu2.csv.gz") |>
  filter(cell_id %in% rownames(t21neu_combined@meta.data))
            #substr(rownames(t21neu_combined@meta.data),start = 1,nchar(rownames(t21neu_combined@meta.data))-4)))

patallele_fix<-allelic_df |>
  mutate(karyo = case_when(str_detect(cell_line, "198") ~ "d21",
                                                       T ~ "t21")) |>
  group_by(karyo, name) |>
  summarise(A = sum(aCount), B = sum(bCount),
            pat_bulk = unique(bulk_pat)) |>
  pivot_wider(id_cols = c(name, pat_bulk), names_from = karyo, values_from = c(A, B)) |>
  replace_na(list(A_d21 = 0, B_d21 = 0, A_t21 = 0, B_t21 = 0)) |>
  mutate(d21_rA =  A_d21/( A_d21 +  B_d21),
         d21_rB =  B_d21/( A_d21 +  B_d21),
         t21_rA =  A_t21/( A_t21 +  B_t21),
         t21_rB =  B_t21/( A_t21 +  B_t21)) |>
  mutate(a_allele_diff=t21_rA-d21_rA,
         b_allele_diff=t21_rB-d21_rB,
         pat_sn = case_when(abs(a_allele_diff) < .05 & abs(b_allele_diff) < .05 ~ "nodiff",
                                   a_allele_diff > 0 & b_allele_diff < 0 ~ "bCount",
                                   a_allele_diff < 0 & b_allele_diff > 0 ~ "aCount",
                                   T ~ "undetermined"))
patallele_fix |> dplyr::select(pat_bulk, pat_sn) |> table(useNA = "always")

patallele_fix2<-patallele_fix |>
  mutate(pat_sn = case_when(!is.na(pat_bulk) & pat_sn %in% c("nodiff","undetermined") ~  pat_bulk,
                          is.na(pat_bulk) & pat_sn %in% c("nodiff","undetermined") ~ "drop",
                          pat_sn == pat_bulk ~ pat_sn,
                          !is.na(pat_bulk) & pat_sn != pat_bulk ~ pat_sn,
                          is.na(pat_bulk) & pat_sn %in% c("aCount","bCount") ~ pat_sn,
                          T ~ "unresolved"
                         )) |>
  mutate(mat_sn=case_when(pat_sn == "aCount" ~ "bCount",
                          pat_sn == "bCount" ~ "aCount",
                          T ~ "drop"))

patallele_fix2 |> filter(pat_sn != "drop") |> dplyr::select(pat_bulk, pat_sn) |> table(useNA = "always")

allelic_df2 <-allelic_df |>
  left_join(patallele_fix2 |>
              pivot_longer(cols = c(pat_sn), names_to = "allele_type", values_to = "allele") |>
              filter(allele_type == "pat_sn")
                                      , by = "name")

pat_ar_df <- allelic_df2 |>
 filter(
    !is.na(cell_line) & !is.na(allele)
  ) |>
  left_join(t21neu_combined@meta.data |> rownames_to_column("cell_id")) |>
  mutate(pat_count = case_when(
      allele == 'aCount' ~ aCount,
      allele == 'bCount' ~ bCount
    ),
  mat_count = totalCount - pat_count
  # cell_id = case_when(
  #    str_detect(bam, "2k") ~ sprintf("%s__s1", cell),
  #    str_detect(bam, "5k") ~ sprintf("%s__s2", cell)
  #  )
  ) |>
  group_by(cell_line, cell_id) |>
  summarise(
    sample = unique(sample),
    XIST_status = unique(XIST_status),
    simplified_celltype = unique(simplified_celltype),
    nvar = median(n_variants, na.rm = T),
    total = sum(totalCount, na.rm = T),
    sumPat = sum(pat_count),
    sumMat = sum(mat_count),
    perc_paternal = sum(pat_count, na.rm = T) / sum(totalCount, na.rm = T),
    perc_paternal2 = sum(pat_count/totalCount, na.rm = T ),
    pat_to_mat = sum(pat_count, na.rm = T) / sum(mat_count, na.rm = T),
    pat_to_mat2 = mean(pat_count/mat_count, na.rm = T)
  ) |>
  ungroup()


require(ggpubr)

pat_ar_df2<-pat_ar_df |>
  filter(!(str_detect(sample, "D21") & XIST_status) & !(sample == ".day0" & !XIST_status)) |>
  mutate(sample = case_when(#sample %in% c("D21a","D21b") ~ "D21",
                            sample == "pDAPT" & !XIST_status ~ "T21_pD-",
                            sample == "pDAPT" & XIST_status ~ "T21_pD+",
                            sample == ".day0" & !XIST_status ~ "T21_d0-",
                            sample == ".day0" & XIST_status ~ "T21_d0+",
                            sample == "T21" ~ "T21",
                            T ~ sample)) |>
  filter(total > 5 & !is.na(simplified_celltype)) |> filter(!is.na(XIST_status)) |>
  mutate(sample = case_when(sample == "D21" ~ "euploid", T ~ sample)) |>
  mutate(XIST_status = case_when(sample %in% c("D21a","D21b") ~ "absent", T ~ as.character(XIST_status)))

allelic_plot<-ggplot(pat_ar_df2 #|> filter(simplified_celltype != "Other")
                     , aes(x=sample, y=perc_paternal)) +
  ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
  geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
  scale_color_jama() +
  stat_summary(fun.data = function(x) {
    data.frame(label=signif(median(x),2), y=-0.1)},
  geom = "text", size=2) +
  stat_compare_means(ref.group = "T21", method = "wilcox",
                     #method.args = list(alternative="less"),
                     label.y = 0.9, hide.ns = T, label = "p.format", size=2) +
  theme(axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
          legend.position = "top") +
  labs(
    title="Paternal chr21 allele expression",
    y="Lesser allele fraction",
    color="XIST",
    fill="XIST"
  ) +geom_hline(yintercept = c(0.33,0.5), alpha = 0.1)  +
  scale_x_discrete(position = "top") + ylim(c(-0.1,1)) #+ facet_grid(rows = vars(simplified_celltype), scales = "free")
ggsave("allelic_all.pdf", path=outdir, width=3.6, height = 1.8, units="in")


ggplot(pat_ar_df2, aes(x=sample, y=perc_paternal)) +
  ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
  geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
  scale_color_jama() +
  stat_summary(fun.data = function(x) {
    data.frame(label=signif(median(x),2), y=-0.1)},
    geom = "text", size=2) +
  stat_compare_means(ref.group = "T21", method = "wilcox",
                     #method.args = list(alternative="less"),
                     label.y = 0.9, hide.ns = T, label = "p.format", size=2) +
  theme(axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "top") +
  labs(
    title="Paternal chr21 allele expression",
    y="Lesser allele fraction",
    color="XIST",
    fill="XIST"
  ) +geom_hline(yintercept = c(0.33,0.5), alpha = 0.1)  +
  scale_x_discrete(position = "top") + ylim(c(-0.1,1)) + facet_grid(rows = vars(simplified_celltype), scales = "free")
ggsave("allelic_by_celltype.pdf", path=outdir, width=2.5, height = 3.6, units="in")

rm(allelic_df)
rm(allelic_df2)
gc()


##ECDF plots### the combined_output_df(2) is a large file to hold in memory -
# maybe there's a more lightweight container for this

rm(combined_output_df)
gc()

combined_output_df <- t21neu_combined[["RNA"]]@data |>
  as.data.frame() |>
  rownames_to_column(var="gene_name") |>
  tibble() |>
  pivot_longer(cols = -gene_name, names_to = "cell_id", values_to = "expr_val") |>
  left_join(t21neu_combined$sample |> enframe(name="cell_id", value = "sample_id"),
            by="cell_id") |>
  left_join(
  gene_info_df |> dplyr::select(-c(description,wikigene_name)) |>
  rename(gene_name = "hgnc_symbol") |> filter(!is.na(gene_name) & !duplicated(gene_name)) |>
  mutate(gene_id = str_replace(ensembl_gene_id_version, "\\.[0-9]+", "")),
  by= "gene_name")

gc()

#mean non21 expr by cell - should rename this instead of "autosome" to include X (& Y)
mean_autosome_expression_df <- combined_output_df |>
  filter(!(chromosome_name %in% c(#"X", "Y",
    "MT",
    NA, "21"))) |>
  #filter(expr_val > 0) |>
  group_by(cell_id) |>
  summarise(mean_a_expr = mean(expr_val), countA = n())

#each chr21 gene by cell
chr21gene_bycell <- combined_output_df |>
  filter(chromosome_name %in% c("21")) |>
  dplyr::rename(expr21 = "expr_val") |>
  #filter(expr21 > 0) |>
  left_join(mean_autosome_expression_df, by="cell_id") #|>
#group_by(cell_id) |>
#summarise(mean21 = median(expr21), count21 = n())

rm(combined_output_df)
gc()

###   BY CELL APPROACH
fx_ecdf <- function(x) {
  # Empirical cumulative distribution function of `x`, evaluated at every
  # element of `x`.
  #
  # Args:
  #   x: an atomic vector (numeric in this script); may contain NA.
  # Returns:
  #   An unnamed numeric vector, same length and order as `x`, holding for
  #   each element the fraction of non-missing values that are <= it. Tied
  #   values share the same (highest) cumulative fraction. NA inputs yield NA.
  #
  # The denominator is the number of non-missing observations, so the largest
  # value always maps to 1 even when `x` contains NAs (dividing by length(x)
  # would cap the curve below 1 in that case).
  n_obs <- sum(!is.na(x))
  # With ties.method = "max" the rank of a value equals the number of
  # observations less than or equal to it, i.e. the cumulative count.
  cum_count <- unname(rank(x, na.last = "keep", ties.method = "max"))
  cum_count / n_obs
}


t21_d21_cell_ratio <- chr21gene_bycell |>
  left_join(t21neu_combined@meta.data |>
              as.data.frame() |>
              rownames_to_column(var="cell_id") |>
              as_tibble() |>
            dplyr::select(cell_id, simplified_celltype, XIST_status, XIST_counts),
            by = "cell_id") |>
  group_by(sample_id, simplified_celltype, cell_id, XIST_status) |>
  summarise(mean21 = mean(expr21), count = n(), mean_a_expr = unique(mean_a_expr)) |>
  mutate(ratio = mean21/mean_a_expr) |> filter(!(sample_id == ".day0" & !XIST_status))


tb.bycell<-t21_d21_cell_ratio |> filter(!(is.na(XIST_status) | (XIST_status & str_detect(sample_id, "[DT]21")))) |>
  mutate(sample = case_when(#sample_id %in% c("D21a","D21b") ~ "D21",
    sample_id == "pDAPT" & !XIST_status ~ "T21_pD-",
    sample_id == "pDAPT" & XIST_status ~ "T21_pD+",
    sample_id == ".day0" & !XIST_status ~ "T21_d0-",
    sample_id == ".day0" & XIST_status ~ "T21_d0+",
    sample_id == "T21" ~ "T21",
    T ~ sample_id)) |>
  mutate(XIST_status = case_when(!XIST_status & str_detect(sample_id, "D21") ~ "absent",
                                 T ~ as.character(XIST_status)))

bp.percell<-ggplot(tb.bycell |> filter(simplified_celltype != "Other"), aes(x=sample, y=ratio, fill = XIST_status)) +
  ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
  geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
  scale_color_jama() +
  stat_summary(fun.data = function(x) {
    data.frame(label=signif(median(x),2), y=0)},
  geom = "text", size=2) +
  stat_compare_means(ref.group = "T21", method = "wilcox",
                            method.args = list(alternative="less"),
                            label.y = 2.2, hide.ns = T, label = "p.format", size=2) +

  theme( axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
      legend.position = "top"
  ) +
  labs(
   title="chr21 dosage by cell",
   y ="chr21/genome_mean ratio",
    color="XIST",
    fill="XIST"
  ) + scale_x_discrete(position = "top") # + facet_grid(rows = vars(simplified_celltype), scales = "free", space = "free")
ggsave(bp.percell, filename = "boxplot_bycell2.pdf",  path=outdir, width =  3.6, height = 1.8, units = "in")

ggplot(tb.bycell, aes(x=sample, y=ratio, fill = XIST_status)) +
  ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
  geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
  scale_color_jama() +
  stat_summary(fun.data = function(x) {
    data.frame(label=signif(median(x),2), y=0)},
    geom = "text", size=2) +
  stat_compare_means(ref.group = "T21", method = "wilcox", label.x.npc = "center",
                     method.args = list(alternative="less"), label.y.npc = 0.9,
                     hide.ns = T, label = "p.signif", size=3) +

  theme( axis.ticks.x = element_blank(),
         axis.title.x = element_blank(),
         legend.position = "top"
  ) +
  labs(
    title="chr21 dosage by cell",
    y ="chr21/genome_mean ratio",
    color="XIST",
    fill="XIST"
  ) + scale_x_discrete(position = "top") + facet_grid(rows = vars(simplified_celltype), scales = "free")
ggsave(filename = "boxplot_bycell.pdf",  path=outdir, width = 2.5, height = 4, units = "in")


require(NSM3)
test_cell<-t21_d21_cell_ratio |> ungroup() |> filter(!(sample_id == ".day0" & !XIST_status)) |>

  group_by(sample_id, simplified_celltype, XIST_status) |>
  add_tally() |> #filter(n>=10) |>
  arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status)
                           #  & simplified_celltype != "Other"
                           ) |>
  group_by(sample_id, simplified_celltype, XIST_status) |>
  mutate(eCDF = fx_ecdf(ratio)) |>
  mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |>
  mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |>
  ungroup()


# One-sided two-sample Kolmogorov-Smirnov tests on the per-cell chr21 ratios,
# run separately for each simplified cell type. Every non-reference
# sample_id x XIST_status group is compared with the reference sample
# (`refratio`); duplicated ratios are removed from both samples before testing.
#
# ks.test() is evaluated once per group and both the p-value (ksp) and the
# statistic (ksd) are read from that single result. The previous code called it
# twice and passed a misspelled `exaxt = T` in the second call, which ks.test()
# silently absorbed through `...`.
refratio <- "T21"

testref <- test_cell |> group_by(sample_id) |>
  filter(simplified_celltype == "Neurons" & sample_id == refratio)
tested <- test_cell |> group_by(sample_id, XIST_status) |>
  filter(simplified_celltype == "Neurons" & sample_id != refratio)
ks.neurons <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

testref <- test_cell |> group_by(sample_id) |>
  filter(simplified_celltype == "Astrocytes" & sample_id == refratio)
tested <- test_cell |> group_by(sample_id, XIST_status) |>
  filter(simplified_celltype == "Astrocytes" & sample_id != refratio)
ks.glia <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

testref <- test_cell |> group_by(sample_id) |>
  filter(simplified_celltype == "NPCs" & sample_id == refratio)
tested <- test_cell |> group_by(sample_id, XIST_status) |>
  filter(simplified_celltype == "NPCs" & sample_id != refratio)
ks.rgc <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

testref <- test_cell |> group_by(sample_id) |>
  filter(simplified_celltype == "Other" & sample_id == refratio)
tested <- test_cell |> group_by(sample_id, XIST_status) |>
  filter(simplified_celltype == "Other" & sample_id != refratio)
ks.unk <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

# Stack the per-cell-type results; the list names become `simplified_celltype`.
ks21_bycell<-bind_rows(list("Neurons" = ks.neurons, "Astrocytes" = ks.glia, "NPCs" = ks.rgc, "Other" = ks.unk), .id = 'simplified_celltype')
# Annotation table for the eCDF plot: tests with p <= 0.05 only, rounded to two
# significant digits, placed at x = 0.75 and stacked downwards within each
# cell type (y = 0.9, 0.8, ...).
df_anno_cell<-ks21_bycell |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |> ungroup() |>
 group_by(simplified_celltype) |> mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> add_column(x = 0.75) |> mutate(y = 1-row_number()/10)

# eCDF of the per-cell chr21 ratio, one facet row per cell type. Only groups
# with more than `min.cells` cells are drawn, and the confidence ribbon is kept
# for the reference sample only (other rows get NA bounds).
min.cells <- 4
gg.ecdf_bycell <- test_cell |>
  group_by(sample_id, simplified_celltype, XIST_status) |>
  add_tally(name = "cells") |>
  filter(cells > min.cells) |>
  ungroup() |>
  mutate(
    upperCI = case_when(sample_id == refratio ~ upperCI, TRUE ~ NA),
    lowerCI = case_when(sample_id == refratio ~ lowerCI, TRUE ~ NA)
  ) |>
  ggplot(aes(x = ratio, y = eCDF, colour = sample_id, linetype = XIST_status)) +
  geom_step() +
  # KS p-values prepared in df_anno_cell
  geom_text(data = df_anno_cell, aes(x = x, y = y, label = ksp), size = 2) +
  geom_hline(yintercept = 0.5, alpha = 0.2) +
  geom_vline(xintercept = c(1, 1.5), alpha = 0.2) +
  geom_ribbon(
    aes(ymin = lowerCI, ymax = upperCI, fill = sample_id),
    alpha = 0.2,
    color = NA
  ) +
  facet_grid(rows = vars(simplified_celltype)) +
  xlim(0.5, 2) +
  theme(legend.position = "right")
ggsave(
  gg.ecdf_bycell,
  filename = "ecdf_bycell.pdf",
  path = outdir,
  width = 2.4,
  height = 3.6,
  units = "in"
)

# Same per-cell eCDF / confidence-band table as test_cell, but pooled across
# cell types: groups are sample_id x XIST_status only and cells labelled
# "Other" are dropped.
test_cell2<-t21_d21_cell_ratio |> ungroup() |>
  filter(!(sample_id == ".day0" & !XIST_status)) |>
  group_by(sample_id, XIST_status) |>
  # add_tally() adds the per-group row count as column `n` (kept in the output).
  add_tally() |> #filter(n>=10) |>
  # Ascending ratio order is required by the sort() calls on the CI vectors below.
  arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status)
                             & simplified_celltype != "Other"
                           ) |>
  group_by(sample_id, XIST_status) |>
  # fx_ecdf() is defined elsewhere in this file.
  mutate(eCDF = fx_ecdf(ratio)) |>
  # rbind(ratio, ci)[2, ] extracts the CI values laid out against `ratio`
  # (recycled by R if shorter); they are then sorted ascending.
  mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |>
  mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |>
  ungroup()


# Pooled (all cell types except "Other") one-sided KS tests of every
# non-reference sample_id x XIST_status group against the reference sample.
# ks.test() is run once per group; p-value and statistic are read from the same
# result. The earlier second call carried a misspelled `exaxt` argument that
# ks.test() silently ignored.
refratio <- "T21"
testref <- test_cell2 |> group_by(sample_id) |> filter(sample_id == refratio)
tested <- test_cell2 |> group_by(sample_id, XIST_status) |> filter(sample_id != refratio)
ks.all <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

# Build the table the annotation step reads from the result just computed,
# mirroring how ks21_bygene2 is assembled in the by-gene section.
# NOTE(review): ks21_bycell2 was previously read here without being derived
# from this ks.all -- confirm no other definition was intended.
ks21_bycell2 <- bind_rows(list("all" = ks.all), .id = 'simplified_celltype')

# Annotation rows for the pooled eCDF plot: p <= 0.05 only, two significant
# digits, stacked downwards from y = 0.9 at x = 0.75.
df_anno_cell2<-ks21_bycell2 |> ungroup() |> filter(ksp <= 0.05) |> arrange(desc(ksd)) |>
  mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |>
  add_column(x = 0.75) |> mutate(y = 1-row_number()/10)

# Pooled eCDF of the per-cell chr21 ratio (no cell-type facets). Groups need
# more than `min.cells` cells; the confidence ribbon is shown for the reference
# sample only.
gg.ecdf_bycell2 <- test_cell2 |>
  group_by(sample_id, XIST_status) |>
  add_tally(name = "cells") |>
  filter(cells > min.cells) |>
  ungroup() |>
  mutate(
    upperCI = case_when(sample_id == refratio ~ upperCI, TRUE ~ NA),
    lowerCI = case_when(sample_id == refratio ~ lowerCI, TRUE ~ NA)
  ) |>
  ggplot(aes(x = ratio, y = eCDF, colour = sample_id, linetype = XIST_status)) +
  geom_step() +
  geom_text(data = df_anno_cell2, aes(x = x, y = y, label = ksp), size = 3) +
  geom_hline(yintercept = 0.5, alpha = 0.2) +
  geom_vline(xintercept = c(1, 1.5), alpha = 0.2) +
  geom_ribbon(
    aes(ymin = lowerCI, ymax = upperCI, fill = sample_id),
    alpha = 0.2,
    color = NA
  ) +
  xlim(0.5, 2) +
  theme(legend.position = "right")
ggsave(
  gg.ecdf_bycell2,
  filename = "ecdf_bycell2.pdf",
  path = outdir,
  width = 3.2,
  height = 1.45,
  units = "in"
)


###   BY GENE APPROACH
# Mean expression of each chr21 gene per sample_id and simplified cell type
# (genes whose mean is 0 are dropped), spread to one column per sample. The
# D21a and D21b columns are averaged into `mean21_euploid`, and `t21_ratio` is
# the T21 column divided by that average.
chr21_mean_euploid_gene <- chr21gene_bycell |>
  left_join(
    t21neu_combined@meta.data |>
      as.data.frame() |>
      rownames_to_column(var = "cell_id") |>
      as_tibble() |>
      dplyr::select(cell_id, simplified_celltype),
    by = "cell_id"
  ) |>
  group_by(
    sample_id, simplified_celltype,
    gene_id, gene_name, chromosome_name, start_position
  ) |>
  summarise(mean21 = mean(expr21)) |>
  filter(mean21 > 0) |>
  pivot_wider(
    id_cols = c(simplified_celltype, gene_id, gene_name, chromosome_name, start_position),
    names_from = sample_id,
    values_from = mean21
  ) |>
  mutate(
    mean21_euploid = rowMeans(cbind(D21a, D21b)),
    t21_ratio = T21 / mean21_euploid
  ) |>
  ungroup() |>
  dplyr::select(simplified_celltype, gene_id, mean21_euploid, t21_ratio)

require(NSM3)
# Per-gene mean chr21 expression by sample, cell type and XIST status (groups
# need a positive mean and more than 4 cells), spread to one mean/count column
# per sample and divided by the euploid baseline from chr21_mean_euploid_gene.
t21_d21_gene_ratio <- chr21gene_bycell |>
  left_join(
    t21neu_combined@meta.data |>
      as.data.frame() |>
      rownames_to_column(var = "cell_id") |>
      as_tibble() |>
      dplyr::select(cell_id, simplified_celltype, XIST_status),
    by = "cell_id"
  ) |>
  group_by(
    sample_id, simplified_celltype,
    gene_id, gene_name, chromosome_name, start_position,
    XIST_status
  ) |>
  summarise(mean21 = mean(expr21), count = n()) |>
  filter(mean21 > 0) |>
  filter(count > 4) |>
  pivot_wider(
    id_cols = c(simplified_celltype, gene_id, gene_name, chromosome_name, start_position, XIST_status),
    names_from = sample_id,
    values_from = c(mean21, count)
  ) |>
  left_join(
    chr21_mean_euploid_gene,
    by = c("simplified_celltype", "gene_id")
  ) |>
  mutate(
    # each sample's mean relative to the euploid baseline
    t21_d21_ratio = mean21_T21 / mean21_euploid,
    t21_expr = mean21_T21,
    pD_d21_ratio = mean21_pDAPT / mean21_euploid,
    pD_expr = mean21_pDAPT,
    d0_d21_ratio = mean21_.day0 / mean21_euploid,
    d0_expr = mean21_.day0,
    # the two euploid samples against their own average
    euploid1_ratio = mean21_D21a / mean21_euploid,
    euploid2_ratio = mean21_D21b / mean21_euploid,
    t21_pD_ratio = t21_expr / pD_expr
  )
#write_csv(t21_d21_ratio_df, "tables/t21_d21_ratio_df.csv.gz")
#
# gene_order <- t21_d21_gene_ratio |>
#   filter(!is.infinite(pD_d21_ratio) & !is.nan(pD_d21_ratio) &
#            simplified_celltype == "neuron") |>
#   arrange(pD_d21_ratio) |>
#   pull(gene_name)

# Long-format table of per-gene ratios for the boxplots: one row per
# cell type x gene x XIST status x sample, with display labels for `sample`.
tb.bygene <- t21_d21_gene_ratio |>
  ungroup() |>
  filter(!is.na(XIST_status)) |>
  dplyr::select(
    simplified_celltype, XIST_status, gene_id,
    t21_d21_ratio, pD_d21_ratio, d0_d21_ratio,
    euploid1_ratio, euploid2_ratio
  ) |>
  # strip the ratio suffixes so the column names become the sample keys
  rename_with(\(nm) gsub("_d21_ratio", "", nm), .cols = ends_with("d21_ratio")) |>
  rename_with(\(nm) gsub("_ratio", "", nm), .cols = ends_with("_ratio")) |>
  pivot_longer(
    cols = -c(simplified_celltype, gene_id, XIST_status),
    names_to = "sample",
    values_to = "ratio"
  ) |>
  # euploid samples are labelled "absent" regardless of their XIST call
  mutate(
    XIST_status = case_when(
      str_detect(sample, "euploid") ~ "absent",
      TRUE ~ as.character(XIST_status)
    )
  ) |>
  filter(!is.na(ratio)) |>
  mutate(
    sample = case_when(
      sample == "euploid1" ~ "D21a",
      sample == "euploid2" ~ "D21b",
      sample == "pD" & XIST_status == "FALSE" ~ "T21_pD-",
      sample == "pD" & XIST_status == "TRUE" ~ "T21_pD+",
      sample == "d0" & XIST_status == "FALSE" ~ "skip",
      sample == "d0" & XIST_status == "TRUE" ~ "T21_d0+",
      sample == "t21" ~ "T21",
      TRUE ~ sample
    )
  ) |>
  # XIST-negative day-0 rows were tagged "skip" above and are removed here
  filter(sample != "skip")

# chr21 dosage per gene, all cell types except "Other" pooled in one panel.
bp.pergene <- tb.bygene |>
  filter(simplified_celltype != "Other") |>
  ggplot(aes(x = sample, y = ratio, fill = XIST_status)) +
  ggbeeswarm::geom_quasirandom(
    aes(color = XIST_status),
    alpha = .5,
    size = .5
  ) +
  geom_boxplot(fill = NA, notch = TRUE, outlier.shape = NA, size = .3) +
  scale_color_jama() +
  coord_cartesian(ylim = c(-0.1, 4)) +
  #stat_summary(fun = median, geom = "point", shape=23, size=1, alpha=1, aes(fill=XIST_status)) +
  # label each group with its median (2 significant digits) at y = -0.1
  stat_summary(
    fun.data = function(x) {
      data.frame(label = signif(median(x), 2), y = -0.1)
    },
    geom = "text",
    size = 2
  ) +
  # one-sided Wilcoxon comparison of every group against the T21 group
  stat_compare_means(
    ref.group = "T21",
    method = "wilcox",
    method.args = list(alternative = "less"),
    label.y = 3.5,
    hide.ns = TRUE,
    label = "p.format",
    size = 2
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "top"
  ) +
  labs(
    title = "chr21 dosage by gene",
    y = "chr21_gene/genome ratio",
    color = "XIST",
    fill = "XIST"
  ) +
  scale_x_discrete(position = "top") #+  facet_grid(rows = vars(simplified_celltype), scales = "free", space = "free")
ggsave(
  bp.pergene,
  filename = "boxplot_bygene2.pdf",
  path = outdir,
  width = 3.6,
  height = 1.8,
  units = "in"
)

# chr21 dosage per gene, one facet row per simplified cell type (including
# "Other"). The plot is not assigned; ggsave() below is called without `plot`,
# so it writes the most recently built ggplot.
ggplot(tb.bygene, aes(x = sample, y = ratio, fill = XIST_status)) +
  ggbeeswarm::geom_quasirandom(
    aes(color = XIST_status),
    alpha = .5,
    size = .5
  ) +
  geom_boxplot(fill = NA, notch = TRUE, outlier.shape = NA, size = .3) +
  scale_color_jama() +
  coord_cartesian(ylim = c(-0.1, 4)) +
  #stat_summary(fun = median, geom = "point", shape=23, size=1, alpha=1, aes(fill=XIST_status)) +
  # label each group with its median (2 significant digits) at y = 0
  stat_summary(
    fun.data = function(x) {
      data.frame(label = signif(median(x), 2), y = -0)
    },
    geom = "text",
    size = 2
  ) +
  # one-sided Wilcoxon comparison of every group against the T21 group
  stat_compare_means(
    ref.group = "T21",
    method = "wilcox",
    method.args = list(alternative = "less"),
    label.y = 3.5,
    hide.ns = TRUE,
    label = "p.signif",
    size = 3
  ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "top"
  ) +
  labs(
    title = "chr21 dosage by gene",
    y = "chr21_gene/genome ratio",
    color = "XIST",
    fill = "XIST"
  ) +
  scale_x_discrete(position = "top") +
  facet_grid(rows = vars(simplified_celltype), scales = "free")
ggsave(
  filename = "boxplot_bygene.pdf",
  path = outdir,
  width = 2.5,
  height = 4,
  units = "in"
)


# Per-gene ratio table for the eCDF plots, one row per gene x ratio type, with
# an empirical CDF and confidence band computed within each
# ratio_type x simplified_celltype x XIST_status group.
test<-t21_d21_gene_ratio |>
  pivot_longer(cols = c(t21_d21_ratio,
                        pD_d21_ratio,
                        d0_d21_ratio,
                        euploid1_ratio,
                        euploid2_ratio), names_to = "ratio_type", values_to = "ratio") |>
  # Ascending ratio order is relied on by the sort() calls on the CI vectors
  # below. Non-finite/missing ratios, missing XIST calls and XIST-negative
  # day-0 rows are removed.
  arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status) &
                            !(ratio_type == "d0_d21_ratio" & !XIST_status)
                           ) |>
  # Keep the text before the first "_" (e.g. "t21", "pD", "euploid1") ...
  mutate(ratio_type = str_split_i(ratio_type, "_", 1)) |>
  # ... and map it to the sample labels used elsewhere in the script. There is
  # no default branch, so any other value becomes NA.
  mutate(ratio_type = case_when(ratio_type == "t21" ~ "T21",
                                ratio_type == "d0" ~ ".day0",
                                ratio_type == "euploid1" ~ "D21a",
                                ratio_type == "euploid2" ~ "D21b",
                                ratio_type == "pD" ~ "pDAPT"
  )) |>
  group_by(ratio_type, simplified_celltype, XIST_status) |>
  # NOTE(review): select() runs on a grouped tibble here; dplyr is expected to
  # retain the grouping columns in addition to the three listed -- confirm.
  dplyr::select(ratio, ratio_type, gene_name) |>
  #test_new<- test |> group_modify(test %>% mutate(interp_dens = calc_density(test$ratio)))
  # fx_ecdf() is defined elsewhere in this file.
  mutate(eCDF = fx_ecdf(ratio)) |>
  # rbind(ratio, ci)[2, ] extracts the CI values laid out against `ratio`
  # (recycled by R if shorter); they are then sorted ascending.
  mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |>
  mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |>
  ungroup()

# One-sided two-sample Kolmogorov-Smirnov tests on the per-gene ratios, run
# separately for each simplified cell type. Every non-reference
# ratio_type x XIST_status group is compared with the reference ratio type
# (`refratio`); duplicated ratios are removed from both samples before testing.
#
# ks.test() is evaluated once per group and both the p-value (ksp) and the
# statistic (ksd) are read from that single result. The previous code called it
# twice and passed a misspelled `exaxt = T` in the second call, which ks.test()
# silently absorbed through `...`.
refratio <- "T21"

testref <- test |> group_by(ratio_type) |>
  filter(simplified_celltype == "Neurons" & ratio_type == refratio)
tested <- test |> group_by(ratio_type, XIST_status) |>
  filter(simplified_celltype == "Neurons" & ratio_type != refratio)
ks.neurons <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

testref <- test |> group_by(ratio_type) |>
  filter(simplified_celltype == "Astrocytes" & ratio_type == refratio)
tested <- test |> group_by(ratio_type, XIST_status) |>
  filter(simplified_celltype == "Astrocytes" & ratio_type != refratio)
ks.glia <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

testref <- test |> group_by(ratio_type) |>
  filter(simplified_celltype == "NPCs" & ratio_type == refratio)
tested <- test |> group_by(ratio_type, XIST_status) |>
  filter(simplified_celltype == "NPCs" & ratio_type != refratio)
ks.rgc <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

testref <- test |> group_by(ratio_type) |>
  filter(simplified_celltype == "Other" & ratio_type == refratio)
tested <- test |> group_by(ratio_type, XIST_status) |>
  filter(simplified_celltype == "Other" & ratio_type != refratio)
ks.unk <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

# Stack the per-cell-type results; the list names become `simplified_celltype`.
ks21_bygene<-bind_rows(list("Neurons" = ks.neurons,
                            "Astrocytes" = ks.glia,
                            "Other" = ks.unk,
                            "NPCs" = ks.rgc), .id = 'simplified_celltype')
# Annotation table for the eCDF plot: tests with p <= 0.05 only, rounded to two
# significant digits, placed at x = 0.75 and stacked downwards within each
# cell type (y = 0.9, 0.8, ...).
df_anno<-ks21_bygene |> ungroup() |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |>
  mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> group_by(simplified_celltype) |>
  add_column(x = 0.75) |> mutate(y = 1-row_number()/10)

# eCDF of the per-gene ratio, one facet row per cell type. The confidence
# ribbon is kept for the reference ratio type only (other rows get NA bounds).
gg.ecdf_bygene <- test |> # filter(ratio_type != "d0_d21_ratio") |>
  mutate(
    upperCI = case_when(ratio_type == refratio ~ upperCI, TRUE ~ NA),
    lowerCI = case_when(ratio_type == refratio ~ lowerCI, TRUE ~ NA)
  ) |>
  ggplot(aes(x = ratio, y = eCDF, colour = ratio_type, linetype = XIST_status)) +
  geom_step() +
  # KS p-values prepared in df_anno
  geom_text(data = df_anno, aes(x = x, y = y, label = ksp), size = 2) +
  geom_hline(yintercept = 0.5, alpha = 0.2) +
  geom_vline(xintercept = c(1, 1.5), alpha = 0.2) +
  geom_ribbon(
    aes(ymin = lowerCI, ymax = upperCI, fill = ratio_type),
    alpha = 0.2,
    color = NA
  ) +
  facet_grid(rows = vars(simplified_celltype)) +
  xlim(0.5, 2) +
  theme(legend.position = "right")
ggsave(
  gg.ecdf_bygene,
  filename = "ecdf_bygene.pdf",
  path = outdir,
  width = 2.4,
  height = 3.6,
  units = "in"
)


# Same per-gene eCDF / confidence-band table as `test`, but pooled across cell
# types: rows labelled "Other" are dropped and groups are
# ratio_type x XIST_status only.
test2<-t21_d21_gene_ratio |>
  pivot_longer(cols = c(t21_d21_ratio,
                        pD_d21_ratio,
                        d0_d21_ratio,
                        euploid1_ratio,
                        euploid2_ratio), names_to = "ratio_type", values_to = "ratio") |>
  # Ascending ratio order is relied on by the sort() calls on the CI vectors
  # below. XIST-negative day-0 rows are excluded.
  arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status) &
                             simplified_celltype != "Other" & !(ratio_type == "d0_d21_ratio" & !XIST_status)
                           ) |>
  # Keep the text before the first "_" and map it to the sample labels used
  # elsewhere; the case_when() has no default, so other values become NA.
  mutate(ratio_type = str_split_i(ratio_type, "_", 1)) |>
  mutate(ratio_type = case_when(ratio_type == "t21" ~ "T21",
                                ratio_type == "d0" ~ ".day0",
                                ratio_type == "euploid1" ~ "D21a",
                                ratio_type == "euploid2" ~ "D21b",
                                ratio_type == "pD" ~ "pDAPT"
  )) |>
  group_by(ratio_type, XIST_status) |>
  # NOTE(review): select() on a grouped tibble is expected to retain the
  # grouping columns as well -- confirm.
  dplyr::select(ratio, ratio_type, gene_name) |>
  # fx_ecdf() is defined elsewhere in this file.
  mutate(eCDF = fx_ecdf(ratio)) |>
  # rbind(ratio, ci)[2, ] extracts the CI values laid out against `ratio`
  # (recycled by R if shorter); they are then sorted ascending.
  mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |>
  mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |>
  ungroup()

# Pooled (all cell types except "Other") one-sided KS tests of every
# non-reference ratio_type x XIST_status group against the reference ratio
# type. ks.test() is run once per group; p-value and statistic are read from
# the same result. The earlier second call carried a misspelled `exaxt`
# argument that ks.test() silently ignored.
refratio <- "T21"
testref <- test2 |> group_by(ratio_type) |> filter(ratio_type == refratio)
tested <- test2 |> group_by(ratio_type, XIST_status) |> filter(ratio_type != refratio)
ks.all <- tested |>
  summarise(ks = list(ks.test(unique(testref$ratio), unique(ratio),
                              exact = TRUE, alternative = "less"))) |>
  mutate(ksp = vapply(ks, function(k) k$p.value, numeric(1)),
         ksd = vapply(ks, function(k) unname(k$statistic), numeric(1))) |>
  dplyr::select(-ks)

# Single-entry list so the table carries the same `simplified_celltype` id
# column as the per-cell-type version.
ks21_bygene2<-bind_rows(list(#"neuron" = ks.neurons,
  #"glia" = ks.glia,
  "all" = ks.all), .id = 'simplified_celltype')
#df_anno2<-ks21_bygene2 |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |>
#  mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> add_column(x = 0.5) |> mutate(y = 1-row_number()/10)

# Annotation rows for the pooled eCDF plot: p <= 0.05 only, two significant
# digits, stacked downwards from y = 0.9 at x = 0.75.
df_anno2<-ks21_bygene2 |> ungroup() |> filter(ksp <= 0.05) |> arrange(desc(ksd)) |>
  mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |>
  add_column(x = 0.75) |> mutate(y = 1-row_number()/10)


# Pooled eCDF of the per-gene ratio (no cell-type facets); the confidence
# ribbon is shown for the reference ratio type only.
gg.ecdf_bygene2 <- test2 |>
  mutate(
    upperCI = case_when(ratio_type == refratio ~ upperCI, TRUE ~ NA),
    lowerCI = case_when(ratio_type == refratio ~ lowerCI, TRUE ~ NA)
  ) |>
  ggplot(aes(x = ratio, y = eCDF, colour = ratio_type, linetype = XIST_status)) +
  geom_step() +
  geom_text(data = df_anno2, aes(x = x, y = y, label = ksp), size = 3) +
  geom_hline(yintercept = 0.5, alpha = 0.2) +
  geom_vline(xintercept = c(1, 1.5), alpha = 0.2) +
  geom_ribbon(
    aes(ymin = lowerCI, ymax = upperCI, fill = ratio_type),
    alpha = 0.2,
    color = NA
  ) +
  xlim(0.5, 2) #+ theme(legend.position = "none")
ggsave(
  gg.ecdf_bygene2,
  filename = "ecdf_bygene2.pdf",
  path = outdir,
  width = 3.2,
  height = 1.45,
  units = "in"
)


# Drop the per-cell chr21 expression table from the workspace; the by-gene
# section above is its last use in this part of the script.
rm(chr21gene_bycell)

### DIFF expression
# Run PrepSCTFindMarkers() on the combined object and keep the returned object
# for the FindMarkers() calls below (those calls pass recorrect_umi = F).
t21neu_combined <- PrepSCTFindMarkers(t21neu_combined)

# Differential expression between two groups with Seurat::FindMarkers().
#
# slott, assayy : slot and assay handed to FindMarkers(); also combined into
#                 the `testname` column as "<assay>_<slot>".
# seuratt       : Seurat object; its default assay is set to `assayy` on the
#                 local copy only.
# ident1/ident2 : group labels. With by_celltype they are suffixed with
#                 "_<celltype>" and looked up in `diff_expr_celltype`,
#                 otherwise they are looked up in `diff_expr`.
# testt         : test.use for FindMarkers(). The signed `test` score is
#                 sign(avg_log2FC) * power, so the result must contain a
#                 `power` column.
# by_celltype   : FALSE -> one comparison stored under "none";
#                 otherwise one comparison per unique simplified_celltype.
#
# Returns a named list of data frames sorted by decreasing `test`, each
# annotated with gene coordinates from the global `gene_info_df`.
fx.diffroc <- function(slott, assayy, seuratt, ident1, ident2, testt = "roc", by_celltype = FALSE) {
  DefaultAssay(seuratt) <- assayy

  # Run one contrast and attach the bookkeeping / annotation columns.
  run_contrast <- function(group_a, group_b, grouping_column, contrast_label) {
    markers <- FindMarkers(
      seuratt,
      test.use = testt,
      slot = slott,
      assay = assayy,
      ident.1 = group_a,
      ident.2 = group_b,
      group.by = grouping_column,
      recorrect_umi = FALSE,
      logfc.threshold = 0
    )
    markers |>
      mutate(
        testname = paste0(assayy, "_", slott),
        contrast = contrast_label,
        test = (sign(avg_log2FC) * power)
      ) |>
      arrange(desc(test)) |>
      rownames_to_column("gene_symbol") |>
      left_join(
        gene_info_df |>
          dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype),
        by = c("gene_symbol" = "hgnc_symbol")
      )
  }

  resultt <- list()
  if (by_celltype == FALSE) {
    resultt[["none"]] <- run_contrast(
      ident1,
      ident2,
      "diff_expr",
      sprintf("%s_%s", ident1, ident2)
    )
  } else {
    celltypes <- unique(seuratt@meta.data$simplified_celltype)
    for (celltype in celltypes) {
      resultt[[celltype]] <- run_contrast(
        sprintf("%s_%s", ident1, celltype),
        sprintf("%s_%s", ident2, celltype),
        "diff_expr_celltype",
        sprintf("%s_%s_%s", celltype, ident1, ident2)
      )
    }
  }
  return(resultt)
}


# Variant of fx.diffroc that reads the group labels from `diff_expr2` /
# `diff_expr_celltype2` instead of `diff_expr` / `diff_expr_celltype`.
#
# slott, assayy : slot and assay handed to FindMarkers(); also combined into
#                 the `testname` column as "<assay>_<slot>".
# seuratt       : Seurat object; its default assay is set to `assayy` on the
#                 local copy only.
# ident1/ident2 : group labels; with by_celltype they are suffixed with
#                 "_<celltype>".
# testt         : test.use for FindMarkers(). The signed `test` score is
#                 sign(avg_log2FC) * power, so the result must contain a
#                 `power` column.
# by_celltype   : FALSE -> one comparison stored under "none";
#                 otherwise one comparison per unique simplified_celltype.
#
# Returns a named list of data frames sorted by decreasing `test`, each
# annotated with gene coordinates from the global `gene_info_df`.
fx.diffroc2 <- function(slott, assayy, seuratt, ident1, ident2, testt = "roc", by_celltype = FALSE) {
  DefaultAssay(seuratt) <- assayy

  # Run one contrast and attach the bookkeeping / annotation columns.
  run_contrast <- function(group_a, group_b, grouping_column, contrast_label) {
    markers <- FindMarkers(
      seuratt,
      test.use = testt,
      slot = slott,
      assay = assayy,
      ident.1 = group_a,
      ident.2 = group_b,
      group.by = grouping_column,
      recorrect_umi = FALSE,
      logfc.threshold = 0
    )
    markers |>
      mutate(
        testname = paste0(assayy, "_", slott),
        contrast = contrast_label,
        test = (sign(avg_log2FC) * power)
      ) |>
      arrange(desc(test)) |>
      rownames_to_column("gene_symbol") |>
      left_join(
        gene_info_df |>
          dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype),
        by = c("gene_symbol" = "hgnc_symbol")
      )
  }

  resultt <- list()
  if (by_celltype == FALSE) {
    resultt[["none"]] <- run_contrast(
      ident1,
      ident2,
      "diff_expr2",
      sprintf("%s_%s", ident1, ident2)
    )
  } else {
    celltypes <- unique(seuratt@meta.data$simplified_celltype)
    for (celltype in celltypes) {
      resultt[[celltype]] <- run_contrast(
        sprintf("%s_%s", ident1, celltype),
        sprintf("%s_%s", ident2, celltype),
        "diff_expr_celltype2",
        sprintf("%s_%s_%s", celltype, ident1, ident2)
      )
    }
  }
  return(resultt)
}

# Group label per cell for the differential-expression contrasts, from sample
# and XIST status. pDAPT cells are split by XIST status; any cell matching no
# rule (including a missing XIST call) is labelled "left".
t21neu_combined@meta.data$diff_expr <- with(
  t21neu_combined@meta.data,
  case_when(
    sample == "pDAPT" & XIST_status ~ "pDxist",
    sample %in% c("pDAPT") & !XIST_status ~ "pDno",
    str_detect(sample, "D21") & !XIST_status ~ "d21",
    sample == "T21" & !XIST_status ~ "t21",
    sample == ".day0" & XIST_status ~ "d0",
    TRUE ~ "left"
  )
)


# Alternative group label per cell: XIST-negative cells from both the T21 and
# pDAPT samples are pooled under "t21". Cells matching no rule are "left".
t21neu_combined@meta.data$diff_expr2 <- with(
  t21neu_combined@meta.data,
  case_when(
    sample == "pDAPT" & XIST_status ~ "pDxist",
    sample %in% c("T21", "pDAPT") & !XIST_status ~ "t21",
    str_detect(sample, "D21") & !XIST_status ~ "d21",
    #sample == "T21" & !XIST_status ~ "t21",
    sample == ".day0" & XIST_status ~ "d0",
    TRUE ~ "left"
  )
)

# Cell-type-resolved group label: the same rules as `diff_expr`, with
# "_<simplified_celltype>" appended to every label (including "left").
t21neu_combined@meta.data$diff_expr_celltype <- with(
  t21neu_combined@meta.data,
  case_when(
    sample == "pDAPT" & XIST_status ~
      sprintf("%s_%s", "pDxist", simplified_celltype),
    sample %in% c("pDAPT") & !XIST_status ~
      sprintf("%s_%s", "pDno", simplified_celltype),
    str_detect(sample, "D21") & !XIST_status ~
      sprintf("%s_%s", "d21", simplified_celltype),
    sample == "T21" & !XIST_status ~
      sprintf("%s_%s", "t21", simplified_celltype),
    sample == ".day0" & XIST_status ~
      sprintf("%s_%s", "d0", simplified_celltype),
    TRUE ~
      sprintf("%s_%s", "left", simplified_celltype)
  )
)

# Cell-type-resolved version of `diff_expr2`: XIST-negative T21 and pDAPT cells
# share the "t21" label, and "_<simplified_celltype>" is appended to every
# label (including "left").
t21neu_combined@meta.data$diff_expr_celltype2 <- with(
  t21neu_combined@meta.data,
  case_when(
    sample == "pDAPT" & XIST_status ~
      sprintf("%s_%s", "pDxist", simplified_celltype),
    sample %in% c("T21", "pDAPT") & !XIST_status ~
      sprintf("%s_%s", "t21", simplified_celltype),
    str_detect(sample, "D21") & !XIST_status ~
      sprintf("%s_%s", "d21", simplified_celltype),
    #sample == "T21" & !XIST_status ~ sprintf("%s_%s", "t21", simplified_celltype),
    sample == ".day0" & XIST_status ~
      sprintf("%s_%s", "d0", simplified_celltype),
    TRUE ~
      sprintf("%s_%s", "left", simplified_celltype)
  )
)

# Horizontal bar plot of the signed `test` score for chr21 genes.
#
# df : either a single result data frame, or a named list of result data
#      frames (as returned by fx.diffroc / fx.diffroc2). Each table needs the
#      columns chromosome_name, test, gene_symbol and testname.
#
# A single data frame is faceted by `testname`. A list is stacked with the list
# names stored in `comparison_celltype` and faceted by testname (rows) x
# comparison_celltype (columns). Rows with a non-finite or zero `test` are
# dropped. Returns the ggplot object.
#
# The dispatch uses is.data.frame(): typeof() reports "list" for data frames
# too, so a typeof()-based check sent single tables down the list branch, where
# their columns were iterated as if they were result tables.
make_plot <- function(df) {
  if (is.data.frame(df)) {
    return(
      df |>
        filter(chromosome_name == "21" & is.finite(test) & test != 0) |>
        ggplot(aes(x = fct_reorder(gene_symbol, test), y = test, fill = testname)) +
        facet_wrap("testname", scales = "free") +
        geom_hline(yintercept = c(-0.1, 0.1)) +
        geom_bar(position = "stack", stat = "identity") +
        coord_flip()
    )
  }

  # Tag every table with its list name, then stack them in one step instead of
  # growing a data frame inside a loop.
  comb_df <- bind_rows(
    lapply(names(df), function(celltype) {
      df[[celltype]] |>
        mutate(comparison_celltype = celltype)
    })
  )

  comb_df |>
    filter(chromosome_name == "21" & is.finite(test) & test != 0) |>
    ggplot(aes(x = fct_reorder(gene_symbol, test), y = test, fill = testname)) +
    facet_grid(rows = vars(testname), cols = vars(comparison_celltype)) +
    # facet_wrap("testname", scales = "free") +
    geom_hline(yintercept = c(-0.1, 0.1)) +
    geom_bar(position = "stack", stat = "identity") +
    coord_flip()
}

## separate
# Per-cell-type contrasts on the SCT "data" slot using the `diff_expr_celltype`
# labels (T21 and pDAPT XIST-negative cells kept as separate groups).
SCT_data_t21_ct <- fx.diffroc(
  "data", "SCT", t21neu_combined,
  ident1 = "d21", ident2 = "t21", by_celltype = TRUE
)

SCT_data_pD_ct <- fx.diffroc(
  "data", "SCT", t21neu_combined,
  ident1 = "pDxist", ident2 = "pDno", by_celltype = TRUE
)

SCT_data_d0_ct <- fx.diffroc(
  "data", "SCT", t21neu_combined,
  ident1 = "d0", ident2 = "pDno", by_celltype = TRUE
)


##pD and t21 combined to t21
# Same three contrasts, but with the `diff_expr_celltype2` labels, where
# XIST-negative pDAPT and T21 cells share the "t21" group.
SCT_data_t21_ct2 <- fx.diffroc2(
  "data", "SCT", t21neu_combined,
  ident1 = "d21", ident2 = "t21", by_celltype = TRUE
)

SCT_data_pD_ct2 <- fx.diffroc2(
  "data", "SCT", t21neu_combined,
  ident1 = "pDxist", ident2 = "t21", by_celltype = TRUE
)

SCT_data_d0_ct2 <- fx.diffroc2(
  "data", "SCT", t21neu_combined,
  ident1 = "d0", ident2 = "t21", by_celltype = TRUE
)


# Stack a named list of data frames into one table, recording each element's
# list name in a new `comparison_celltype` column. Returns an empty tibble when
# the list has no names.
make_comb_df <- function(df) {
  append_table <- function(stacked, celltype) {
    rbind(
      stacked,
      mutate(df[[celltype]], comparison_celltype = celltype)
    )
  }

  # Fold over the names in order, starting from an empty tibble.
  Reduce(append_table, names(df), tibble())
}

require(scales)
# Draw and save five summary figures for one differential-expression table.
#
# label     : tag inserted into every output file name.
# allsubset : stacked result table; the code reads the columns contrast,
#             contrast3, chromosome_name, testname, avg_log2FC, power,
#             gene_symbol, comparison_celltype and TSS.
# chr       : chromosome to focus on (compared against chromosome_name).
# testn     : value of `testname` to keep; also the file-name prefix.
# hght      : height in inches of the first three figures.
#
# Files written under the global `outdir`:
#   <testn>_<label>_box.pdf, _pos.pdf, _dist.pdf, _chr21vseuploid.pdf and
#   _allvsd21.pdf.
# Every ggsave() call omits `plot`, so each one saves the plot built by the
# statement immediately before it.
fx.chr_plots<-function(label, allsubset, chr, testn, hght){

  # Collapse the contrast codes into three display groups; anything else
  # becomes "left".
  allsubset<- allsubset |>
  mutate(contrast3 = case_when(contrast3 == "d21_t21" ~ "euploid",
                               contrast3 == "d21_pDno" ~ "euploid",
                               contrast3 == "d0_t21" ~ "day0",
                               contrast3 == "d0_pDno" ~ "day0",
                               contrast3 == "pDxist_t21" ~ "pDAPT",
                               contrast3 == "pDxist_pDno" ~ "pDAPT",
                               T ~ "left"))

# Per-gene, per-cell-type log2FC of the euploid contrast, used as the baseline
# column `avg_euploid` in the joins below.
d21_normalized<-allsubset |> filter(chromosome_name == chr & testname == testn & contrast3=="euploid") |>
  rename(avg_euploid = "avg_log2FC") |> dplyr::select(gene_symbol, avg_euploid, comparison_celltype)

# Figure 1: boxplots of avg_log2FC per contrast group, faceted by cell type.
# left_join() has no `by`, so it joins on the columns the two tables share
# (d21_normalized carries gene_symbol and comparison_celltype besides
# avg_euploid). Genes without a euploid baseline are dropped.
allsubset |> filter(chromosome_name == chr & testname == testn) |>
    left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |>
  filter(!is.na(avg_euploid)) |>
    ggplot(aes(x=contrast3, y = avg_log2FC, color=contrast3)) +
    geom_boxplot(notch=T) +
    facet_grid(rows = vars(comparison_celltype)) +
    stat_summary(fun.data = function(x) {
      data.frame(label=signif(median(x),2), y=-0.9)}, geom = "text", color="black", size=2) +
    stat_compare_means(ref.group = "euploid", method = "wilcox",
                       label.y = 0.9, hide.ns = F, label = "p.format", size=2) +
    coord_cartesian(ylim = c(-1,1)) + geom_hline(yintercept = c(0)) +
    theme(legend.position = "top") + ggtitle("Differential to T21") +
    theme(text = element_text(size = 6), legend.text = element_text(size = 6))
  ggsave(filename = paste(testn, label, "box.pdf", sep = "_"), path=outdir, width = 1.8, height = hght, units = "in")

  # Figure 2: smoothed avg_log2FC along the chromosome (TSS in Mb).
  # NOTE(review): the dotted line at 37.55 Mb presumably marks the XIST
  # position referred to in the next figure's title -- confirm.
  allsubset |> filter(chromosome_name == chr & testname == testn)  |>
    left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |>
    filter(!is.na(diff_log2FC)) |> mutate(chr21_pos = TSS/1000000) |>
    ggplot(aes(x=chr21_pos, y = avg_log2FC, color=contrast3)) +# geom_point(alpha=0.2) +
    geom_smooth(aes(fill = contrast3)) +
    facet_grid(rows = vars(comparison_celltype)) +
    coord_cartesian(ylim = c(-0.6,0.2)) + geom_hline(yintercept = 0) + geom_vline(xintercept = 37.550000, linetype=3) +
    theme(legend.position = "top") + ggtitle("Differential to T21 by chromosomal position") +
    theme(text = element_text(size = 6), legend.text = element_text(size = 6))
  ggsave(filename = paste(testn, label, "pos.pdf", sep = "_"), path=outdir, width = 2.4, height = hght, units = "in")

  # Figure 3: avg_log2FC against absolute distance (Mb) of the TSS from
  # position 37,550,000, for the non-euploid groups, with a regression line and
  # Pearson correlation per facet.
  allsubset |> filter(chromosome_name == chr & testname == testn)  |>
    left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |>
    filter(!is.na(diff_log2FC) & contrast3 != "euploid") |>
    mutate(distance=abs(37550000-TSS)/1000000) |>
    ggscatter(x="distance", y = "avg_log2FC", color="contrast3", palette= hue_pal()(3)[c(1,3)], add = "reg.line", alpha = 0.2, size = 2,
              cor.coef = T, cor.coef.size = 2,conf.int = T, ggtheme = theme_classic(), #add.params = list(color="black"),
              cor.coeff.args = list(method = "pearson", label.x = 0, label.y = 0.1, digits = 2)) +
    facet_grid(rows = vars(comparison_celltype), cols = vars(contrast3)) +
    coord_cartesian(ylim = c(-0.6,0.2)) + geom_hline(yintercept = 0) +
    theme(legend.position = "none") + ggtitle("Differential to T21 by XIST distance") +
    theme(text = element_text(size = 6), legend.text = element_text(size = 6), axis.title = element_text(size = 7, face = "bold"),
          axis.text = element_text(size = 6)) #+ theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6))
  ggsave(filename = paste(testn, label, "dist.pdf", sep = "_"), path=outdir, width = 2.4, height = hght, units = "in")


# Figure 4: for genes on `chr`, log2FC of the day0 and pDAPT groups against the
# euploid log2FC. The table is spread to one avgL2FC/power column pair per
# group, restricted to genes present in all three groups, then reshaped so that
# each non-euploid group is a `correction` facet. Only the first row per gene
# within each contrast is kept.
allsubset |> filter(chromosome_name == chr & testname == testn) |>
  group_by(contrast) |> filter(!duplicated(gene_symbol)) |>  ungroup() |> rename(avgL2FC = "avg_log2FC") |>
  pivot_wider(id_cols =c(gene_symbol, comparison_celltype), names_from = contrast3, values_from = c(avgL2FC, power)) |>
    filter(!is.na(avgL2FC_euploid) & !is.na(avgL2FC_day0) & !is.na(avgL2FC_pDAPT)) |>
    pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid),  names_to = c("Var", ".value"), names_sep="_") |>
    pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, Var), names_to = "correction", values_to = "value") |>
  pivot_wider(id_cols = c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, correction), names_from = Var, values_from = value) |>
 mutate(test = sign(avgL2FC)*power, test_euploid = sign(avgL2FC_euploid)*power_euploid) |>
  #filter(power_euploid >= 0.1) |>
  ggscatter(x="avgL2FC_euploid", y = "avgL2FC", #size="power_euploid",
            color= "power", alpha = "power_euploid",
              add.params = list(color = "black", size = 0.5), font.label = c(7, "plain", "black"), add = "reg.line", cor.coef.size =2,
              cor.coef = T, cor.method = "pearson", cor.coef.coord = c(-2.5,2), ggtheme = theme_classic()) +
    gradient_color(rev(viridis::viridis(11))) + #coord_cartesian(xlim = c(-limit, limit), ylim = c(-limit,limit)) +
    facet_grid(cols=vars(correction), rows=vars(comparison_celltype), scales = "free") + theme(text = element_text(size=8))
  ggsave(filename = paste(testn, label, "chr21vseuploid.pdf", sep = "_"), path=outdir, width = 3.6, height = 3.6, units = "in")

  # Figure 5: the same comparison for genes NOT on `chr`, additionally
  # restricted to genes with power_euploid >= 0.1.
  allsubset |> filter(chromosome_name != chr & testname == testn) |>
    group_by(contrast) |> filter(!duplicated(gene_symbol)) |>  ungroup() |> rename(avgL2FC = "avg_log2FC") |>
    pivot_wider(id_cols =c(gene_symbol, comparison_celltype), names_from = contrast3, values_from = c(avgL2FC, power)) |>
    filter(!is.na(avgL2FC_euploid) & !is.na(avgL2FC_day0) & !is.na(avgL2FC_pDAPT)) |>
    pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid),  names_to = c("Var", ".value"), names_sep="_") |>
    pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, Var), names_to = "correction", values_to = "value") |>
    pivot_wider(id_cols = c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, correction), names_from = Var, values_from = value) |>
    mutate(test = sign(avgL2FC)*power, test_euploid = sign(avgL2FC_euploid)*power_euploid) |>
    filter(power_euploid >= 0.1) |>
    ggscatter(x="avgL2FC_euploid", y = "avgL2FC", #size="power_euploid",
              color= "power", alpha = "power_euploid",
              add.params = list(color = "black", size = 0.5), font.label = c(7, "plain", "black"), add = "reg.line", cor.coef.size =2,
              cor.coef = T, cor.method = "pearson", cor.coef.coord = c(-2.5,2), ggtheme = theme_classic()) +
   # scale_color_gradient2(low = "blue", high = "red") +
    gradient_color(rev(viridis::viridis(11))) + #coord_cartesian(xlim = c(-limit, limit), ylim = c(-limit,limit)) +
    facet_grid(cols=vars(correction), rows=vars(comparison_celltype), scales = "free") + theme(text = element_text(size=8))
  ggsave(filename = paste(testn, label, "allvsd21.pdf", sep = "_"), path=outdir, width = 3.6, height = 3.6, units = "in")

}


# Combine the three per-cell-type result lists (separate-group labelling) into
# one table, add the TSS (start_position when strand == 1, end_position
# otherwise) and two contrast keys parsed from `contrast`, then export the
# table and draw the chr21 figures.
SCT <- list(SCT_data_t21_ct, SCT_data_d0_ct, SCT_data_pD_ct) |>
  lapply(make_comb_df) |>
  bind_rows() |>
  mutate(
    TSS = case_when(strand == 1 ~ start_position, TRUE ~ end_position),
    contrast2 = paste0(testname, "_", str_extract(contrast, "(d21|pDxist|d0)_.*")),
    contrast3 = str_extract(contrast, "(d21|pDxist|d0)_(t21|pDno)")
  )
write_csv(SCT, paste0(outdir, "/tables/SCT.csv.gz"))
fx.chr_plots("ct", SCT, 21, "SCT_data", 3.2)


# Same assembly for the pooled-"t21" labelling: combine the three result lists,
# add TSS and the parsed contrast keys, export the table and draw the chr21
# figures.
SCT2 <- list(SCT_data_t21_ct2, SCT_data_d0_ct2, SCT_data_pD_ct2) |>
  lapply(make_comb_df) |>
  bind_rows() |>
  mutate(
    TSS = case_when(strand == 1 ~ start_position, TRUE ~ end_position),
    contrast2 = paste0(testname, "_", str_extract(contrast, "(d21|pDxist|d0)_.*")),
    contrast3 = str_extract(contrast, "(d21|pDxist|d0)_(t21|pDno)")
  )
write_csv(SCT2, paste0(outdir, "/tables/SCT2.csv.gz"))
fx.chr_plots("ct2", SCT2, 21, "SCT_data", 3.2)


#GSEA

require(metap)
# Combine two p-value vectors position by position.
#
# For every index i the pair c(x[i], y[i]) is passed to metap::sumlog() and
# the `p` element of its result is kept.
#
# Arguments:
#   x, y  vectors of p-values; must have the same length.
# Returns:
#   A numeric vector with one combined p-value per position; numeric(0) when
#   the inputs are empty.
# Errors:
#   Stops when length(x) != length(y).
vectorized_sumlog <- function(x, y) {
  if (length(x) != length(y)) {
    stop("lengths of lists must be the same")
  }
  # seq_along() instead of 1:length(x): with empty input 1:0 would iterate
  # over c(1, 0) and call sumlog() on missing values instead of returning an
  # empty result (this matters when a zero-row table is passed via mutate()).
  vapply(
    seq_along(x),
    function(i) metap::sumlog(c(x[i], y[i]))$p,
    numeric(1)
  )
}


# Label helper: first piece of an ID.
# The first "_" of the ID is turned into ":", the string is split on ":" and
# the first piece is upper-cased and passed through abbreviate() with
# minlength = 25. Vectorize() applies the function to each ID separately.
fx.labels1 <- Vectorize(function(X) {
  X |>
    str_replace("_", ":") |>
    strsplit("[:]") |>
    vapply(getElement, FUN.VALUE = character(1), name = 1) |>
    toupper() |>
    abbreviate(minlength = 25, dot = TRUE)
})

# Label helper: second piece of an ID.
# The first "_" of the ID is turned into ":", the string is split on ":" and
# the second piece is upper-cased, stripped of its remaining underscores and
# passed through abbreviate() with minlength = 25. getElement() fails when
# there is no second piece. Vectorize() applies the function to each ID
# separately.
fx.labels2 <- Vectorize(function(X) {
  X |>
    str_replace("_", ":") |>
    strsplit("[:]") |>
    vapply(getElement, FUN.VALUE = character(1), name = 2) |>
    toupper() |>
    str_replace_all("_", "") |>
    abbreviate(minlength = 25, dot = TRUE)
})

# Label helper: whole ID with every underscore removed, passed through
# abbreviate() with minlength = 25 (case is left unchanged). Vectorize()
# applies the function to each ID separately.
fx.labels3 <- Vectorize(function(X) {
  X |>
    str_replace_all("_", "") |>
    abbreviate(minlength = 25, dot = TRUE)
})


# Draw two PDF figures from a table of GSEA results and save them under the
# global `outdir`:
#   1. "<...>scatter.pdf": NES of the "t21" comparison (x) against NES of the
#      other comparisons (y), one facet column per comparison and one facet
#      row per cell type, restricted to gene sets with qvalues_t21 <= qcut.
#   2. "<...>gp.pdf": segment-and-point chart of NES per gene set for the
#      gene sets selected below.
#
# Arguments:
#   gsea_resultt    table with (at least) the columns data_type, ID, cell_type,
#                   comparison, NES, qvalues and p.adjust. The expression the
#                   caller passes here is deparsed into the output file names.
#                   The code refers to the comparison values "t21", "pD" and
#                   "d0" by name.
#   typee           value of data_type to keep.
#   slcut           cutoff applied to the pairwise combined p-values
#                   (vectorized_sumlog() of the p.adjust columns).
#   qcut            cutoff applied to the q-values.
#   excluded        category dropped from the second figure. With "nocat" all
#                   collections present in the data are used for selection;
#                   with any other value the second figure is faceted by cell
#                   type (rows) and comparison (columns).
#   vc.collections  collections to consider when selecting gene sets (ignored
#                   when excluded == "nocat").
#   hght            height, in inches, of the scatter PDF.
# Returns:
#   The value of the final ggsave() call.
fx.gpfx <- function(gsea_resultt, typee, slcut, qcut, excluded, vc.collections, hght) {
  # Text of the caller's first argument; reused in both file names.
  result_label <- deparse(substitute(gsea_resultt))

  # Keep one data type, tag each gene set with a coarse category based on
  # its ID (first matching rule wins) and derive its collection label.
  gsea_with_cat <- gsea_resultt |>
    filter(data_type == typee) |>
    dplyr::mutate(category = dplyr::case_when(
      str_detect(ID, "_STRESS") ~ "stress",
      str_detect(ID, "ENDOPLASM") ~ "ER",
      str_detect(ID, "chr21") ~ "chr21",
      str_detect(ID, "TRANSLATION") ~ "translation",
      str_detect(ID, "RIBOSOM") ~ "translation",
      str_detect(ID, "SERINE") & (!str_detect(ID, "SERINE_THREONINE")) ~ "serine",
      str_detect(ID, "MITOC") ~ "mitochondria",
      str_detect(ID, "NRF2") ~ "stress",
      str_detect(ID, "APOPTO") | str_detect(ID, "PROGRAMMED_CELL_DEATH") ~ "apoptosis",
      TRUE ~ "nocat"
    )) |>
    mutate(Collection = case_when(str_count(ID, "_") == 0 ~ ID,
                                  TRUE ~ fx.labels1(ID)))

  if (excluded == "nocat") {
    vc.collections <- gsea_with_cat |> pull(Collection) |> unique()
  }

  # One row per (ID, cell_type) present in all three comparisons, with the
  # pairwise combined p-values; keep rows whose d0 or t21 q-value passes qcut.
  categories_to_view <- gsea_with_cat |>
    filter(Collection %in% vc.collections | str_detect(ID, "^chr21")) |>
    group_by(ID, cell_type) |>
    mutate(min_q_val = min(qvalues, na.rm = FALSE),
           ncount = n()) |>
    filter(ncount == 3) |>
    pivot_wider(id_cols = c(ID, cell_type, min_q_val), names_from = c(comparison),
                values_from = c(NES, qvalues, p.adjust)) |>
    ungroup() |>
    mutate(pD_t21 = vectorized_sumlog(p.adjust_pD, p.adjust_t21),
           d0_t21 = vectorized_sumlog(p.adjust_d0, p.adjust_t21),
           pD_d0 = vectorized_sumlog(p.adjust_d0, p.adjust_pD)) |>
    filter(qvalues_d0 <= qcut | qvalues_t21 <= qcut)

  # IDs per cell type for which at least one combined p-value passes slcut.
  selected <- categories_to_view |>
    filter(pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)
  ids_for <- function(ct) selected$ID[selected$cell_type %in% ct]
  vc.neurons <- ids_for("Neurons")
  vc.Astrocytes <- ids_for("Astrocytes")
  vc.NPCs <- ids_for("NPCs")
  vc.Other <- ids_for("Other")

  # Figure 1: the t21 columns stay fixed while the remaining comparisons are
  # reshaped into a `correction` column with their own NES / qvalues.
  scatter_plot <- gsea_with_cat |>
    pivot_wider(id_cols = c(ID, cell_type, category), names_from = comparison,
                values_from = c(NES, qvalues)) |>
    filter(qvalues_t21 <= qcut) |>
    pivot_longer(-c(ID, cell_type, category, qvalues_t21, NES_t21),
                 names_to = c("Var", ".value"), names_sep = "_") |>
    pivot_longer(-c(ID, cell_type, category, qvalues_t21, NES_t21, Var),
                 names_to = "correction", values_to = "value") |>
    pivot_wider(id_cols = c(ID, cell_type, category, qvalues_t21, NES_t21, correction),
                names_from = Var, values_from = value) |>
    mutate(log10q = pmin(-log(qvalues, 10), 3),
           log10_q_euploid = pmin(-log(qvalues_t21, 10), 3)) |>
    ggscatter(x = "NES_t21", y = "NES", color = "qvalues", alpha = "log10_q_euploid",
              add.params = list(color = "black", size = 0.5),
              font.label = c(8, "plain", "black"), add = "reg.line",
              cor.coef.size = 2, cor.coef = TRUE, cor.method = "pearson",
              cor.coef.coord = c(-2.5, 0), ggtheme = theme_classic()) +
    gradient_color(viridis::plasma(11)) +
    facet_grid(cols = vars(correction), rows = vars(cell_type), scales = "free") +
    theme(text = element_text(size = 8))
  # NOTE(review): paste0(..., collapse = "_") puts no separator between the
  # pieces (collapse only joins vector elements); sep = "_" via paste() was
  # probably intended. Left as is so existing output file names do not change.
  ggsave(paste0(result_label, typee, qcut, excluded, slcut, "scatter.pdf", collapse = "_"),
         plot = scatter_plot, path = outdir, width = 3.6, height = hght, units = "in")

  # Rows for figure 2. The cell-type alternatives are parenthesised as a
  # group: `&` binds tighter than `|`, so without the parentheses the
  # `category != excluded` test only applied to the Astrocytes alternative.
  gsea_with_cat <- gsea_with_cat |>
    filter(category != excluded &
             ((cell_type == "Astrocytes" & ID %in% vc.Astrocytes) |
                (cell_type == "Neurons" & ID %in% vc.neurons) |
                (cell_type == "NPCs" & ID %in% vc.NPCs) |
                (cell_type == "Other" & ID %in% vc.Other)))

  # Facet rows come from `category` and facet columns from `cell_type`; in
  # this mode they are re-pointed at cell type and comparison respectively.
  if (excluded != "nocat") {
    gsea_with_cat$category <- gsea_with_cat$cell_type
    gsea_with_cat$cell_type <- gsea_with_cat$comparison
  }

  # Figure 2: the "t21" comparison is shown as "euploid"; IDs without an
  # underscore get a "CHR_" prefix so fx.labels2() finds a second piece.
  lollipop_plot <- gsea_with_cat |>
    mutate(comparison = case_when(comparison == "t21" ~ "euploid", TRUE ~ comparison)) |>
    mutate(cell_type = case_when(cell_type == "t21" ~ "euploid", TRUE ~ cell_type)) |>
    mutate(ID = case_when(str_count(ID, "_") == 0 ~ paste0("CHR_", ID), TRUE ~ ID)) |>
    mutate(ID = fx.labels2(ID)) |>
    mutate(Score = -log10(qvalues) * sign(NES)) |>
    mutate(Score2 = pmin(-log10(qvalues), 3)) |>
    mutate(ID = fct_reorder(ID, NES)) |>
    ggplot(aes(x = NES, y = ID)) +
    geom_segment(aes(x = 0, y = ID, yend = ID, xend = NES), color = "black", size = .4) +
    geom_point(aes(color = as.factor(sign(NES)), alpha = Score2)) +
    scale_fill_aaas() +
    scale_color_aaas() +
    facet_grid(rows = vars(category), cols = vars(cell_type), scales = "free",
               space = "free", drop = TRUE) +
    geom_vline(xintercept = c(0), alpha = 0.5) +
    theme(legend.position = "right") +
    theme(axis.text.y = element_text(size = 6), axis.title.y = element_blank()) +
    scale_x_continuous(expand = c(.1, .1))
  # Same file-name construction as above (see NOTE); size is fixed here.
  ggsave(paste0(result_label, typee, qcut, excluded, slcut, "gp.pdf", collapse = "_"),
         plot = lollipop_plot, path = outdir, width = 7.2, height = 6.8, units = "in")
}


# Run GSEA() for every comparison ("t21", "pD", "d0") and every cell type of
# the "_ct" inputs, then write all results as one table.
# Per-run results are collected in a list and bound once at the end instead
# of growing a table with rbind() inside the loop.
gsea_results_SCT <- list()
for (data_type in c("data")) {
  for (comparison_name in c("t21", "pD", "d0")) {
    print(comparison_name)
    # Input object is looked up by name, e.g. SCT_data_t21_ct.
    test_df <- make_comb_df(get(sprintf("SCT_%s_%s_ct", data_type, comparison_name))) |>
      group_by(comparison_celltype) |>
      arrange(desc(test))

    # One named vector (gene_symbol -> test) per cell type, in the row order
    # produced by arrange(desc(test)) above.
    genes <- split(test_df |> ungroup() |> dplyr::select(gene_symbol, test) |> deframe(),
                   test_df$comparison_celltype)

    for (celltype in names(genes)) {
      gc()
      gl <- genes[[celltype]]
      set.seed(42)
      gsea_result <- GSEA(gl, TERM2GENE = msig_terms, pvalueCutoff = 1,
                          pAdjustMethod = "BH", by = "fgsea", seed = TRUE, eps = 1e-50)

      gsea_results_SCT[[length(gsea_results_SCT) + 1L]] <- gsea_result |>
        as_tibble() |>
        mutate(comparison = comparison_name,
               data_type = data_type,
               cell_type = celltype)
    }
  }
}
gsea_results_SCT <- bind_rows(gsea_results_SCT)
gc()
write_csv(gsea_results_SCT, paste0(outdir, "/tables/gsea_SCT_celltype.csv.gz"))


# Run GSEA() for every comparison ("t21", "pD", "d0") and every cell type of
# the "_ct2" inputs, then write all results as one table.
# Per-run results are collected in a list and bound once at the end instead
# of growing a table with rbind() inside the loop.
gsea_results_SCT2 <- list()
for (data_type in c("data")) {
  for (comparison_name in c("t21", "pD", "d0")) {
    print(comparison_name)
    # Input object is looked up by name, e.g. SCT_data_t21_ct2.
    test_df <- make_comb_df(get(sprintf("SCT_%s_%s_ct2", data_type, comparison_name))) |>
      group_by(comparison_celltype) |>
      arrange(desc(test))

    # One named vector (gene_symbol -> test) per cell type, in the row order
    # produced by arrange(desc(test)) above.
    genes <- split(test_df |> ungroup() |> dplyr::select(gene_symbol, test) |> deframe(),
                   test_df$comparison_celltype)

    for (celltype in names(genes)) {
      gc()
      gl <- genes[[celltype]]
      set.seed(42)
      gsea_result <- GSEA(gl, TERM2GENE = msig_terms, pvalueCutoff = 1,
                          pAdjustMethod = "BH", by = "fgsea", seed = TRUE, eps = 1e-50)

      gsea_results_SCT2[[length(gsea_results_SCT2) + 1L]] <- gsea_result |>
        as_tibble() |>
        mutate(comparison = comparison_name,
               data_type = data_type,
               cell_type = celltype)
    }
  }
}
gsea_results_SCT2 <- bind_rows(gsea_results_SCT2)
gc()
write_csv(gsea_results_SCT2, paste0(outdir, "/tables/gsea_SCT_celltype2.csv.gz"))


# Settings handed to fx.gpfx() for both GSEA result tables.
qcut <- 0.1      # q-value cutoff
slcut <- 0.005   # cutoff for the combined p-values
# excluded <- "nocat"
excluded <- "_"
vc.collections <- collections

# The first argument is passed as a bare variable name on purpose: fx.gpfx()
# deparses that expression to build its output file names.
fx.gpfx(gsea_results_SCT, typee = "data", slcut = slcut, qcut = qcut,
        excluded = excluded, vc.collections = vc.collections, hght = 3.6)
fx.gpfx(gsea_results_SCT2, typee = "data", slcut = slcut, qcut = qcut,
        excluded = excluded, vc.collections = vc.collections, hght = 3.6)

}