Skip to content
Permalink
6b380cfa28
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
1595 lines (1386 sloc) 75.1 KB
## set working directory
setwd(dirname(unlist(rstudioapi::getSourceEditorContext()['path'])))
## ---------------------------
## load up the packages we will need: (uncomment as required)
require(tidyverse);
theme_set(
theme_classic() +
theme(
text = element_text(size = 6, family = "sans",),
plot.title = element_text(hjust = 0.5, size = 8, family = "sans", face = "bold"),
axis.title = element_text(size = 7, face = "bold"),
axis.text = element_text(size = 6),
legend.spacing.x = unit(0, "cm"),
legend.spacing.y = unit(2, "mm"),
legend.key.height = unit(4, "mm"),
legend.margin = margin(0, 0, 0, 0),
axis.line = element_line(size = 0),
strip.background = element_blank(),
strip.text = element_text(margin = margin(1, 1, 1, 1, unit = "mm")),
panel.border = element_rect(color = "black", fill = NA, size = .5)
)
)
require(ggsci)
#require(scales)
require(ggthemes)
library(Seurat)
library(SeuratData)
library(patchwork)
library(HGNChelper)
library(openxlsx)
## ---------------------------
gene_info_df <- read_csv("external_tables/gene_info_df.csv.gz")
# Adding the data to Seurat
mat_path <- "input_data/parsebio_scrnaseq/real_outputs/combined/all-well/DGE_filtered"
mat <- ReadParseBio(mat_path)
# Check to see if empty gene names are present, add name if so.
table(rownames(mat) == "")
rownames(mat)[rownames(mat) == ""] <- "unknown"
# Read in cell meta data
cell_meta <- read.csv(paste0(mat_path, "/cell_metadata.csv"), row.names = 1)
# Create object
t21neu <- CreateSeuratObject(mat, min_genes = 100, min_cells = 100,
names.feild = 0, meta.data = cell_meta)
#Setting our initial cell class to a single type, this will changer after clustering.
t21neu@meta.data$orig.ident <- factor(rep("t21neu", nrow(t21neu@meta.data)))
Idents(t21neu) <- t21neu@meta.data$orig.ident
saveRDS(t21neu, "tables/seurat_obj_before_QC.RDS")
t21neu_pre <- readRDS("tables/seurat_obj_before_QC.RDS")
library(clusterProfiler)
library(msigdbr)
msig_df <- msigdbr(species = "Homo sapiens")
msig_terms <- msig_df %>%
filter(gs_cat == "H" |
gs_cat == "C1" |
(gs_cat == "C2" & str_detect(gs_subcat, "^CP:")) |
#(gs_cat == "C3" & str_detect(gs_subcat, "^TFT:")) |
#gs_cat == "C4" |
(gs_cat == "C5" & str_detect(gs_subcat, "^HPO"))# | gs_cat == "C8"
) %>%
dplyr::select(gs_name, gene_symbol)
rm(msig_df)
cust.name<-"fin13"
vc.regress<-c("MT")
min.xist<-2
RNAvSCT<-"integrated"
vc.tissue<-"Burke"
collections<-c("WP","REACTOME","KEGG", "HALLMARK", "BIOCARTA", "PID")
system.time(fx.master(cust.name, vc.regress, min.xist, RNAvSCT, vc.tissue, collections))
fx.master<-function(cust.name, vc.regress, min.xist, RNAvSCT, vc.tissue, collections)
{
outdir<-paste0(c(cust.name, vc.tissue, RNAvSCT,"with",vc.regress), collapse = "_")
dir.create(outdir)
dir.create(paste0(outdir,"/tables"))
vc.nameBYchr<-setNames(gene_info_df$hgnc_symbol, gene_info_df$chromosome_name)
ls.nameBYchr<-split(vc.nameBYchr, names(vc.nameBYchr))
ls.countsBYchr<-lapply(ls.nameBYchr,function(X) {
fraction <- Matrix::colSums(t21neu_pre[as.vector(X), ])/Matrix::colSums(t21neu_pre)
})
df.CHRfrac<-as.data.frame(ls.countsBYchr)
rb.genes <- rownames(t21neu_pre)[grep("^RP[SL]",rownames(t21neu_pre))]
rpsl<- Matrix::colSums(t21neu_pre[rb.genes, ])/Matrix::colSums(t21neu_pre)
rn5.genes <- rownames(t21neu_pre)[grep("^RNA5S",rownames(t21neu_pre))]
rn5s<- Matrix::colSums(t21neu_pre[rn5.genes, ])/Matrix::colSums(t21neu_pre)
df.neu<-df.CHRfrac |> rownames_to_column("cell_id") |>
left_join(t21neu_pre@meta.data |> rownames_to_column("cell_id"), by="cell_id") |>
left_join(cbind(rpsl, rn5s) |> as.data.frame() |> rownames_to_column("cell_id"), by="cell_id") |>
column_to_rownames("cell_id")
t21neu_pre@meta.data<- df.neu |> mutate(dox = str_detect(sample, "_dox")) |>
mutate(sample = case_when(str_detect(sample, "198_1") ~ "D21a",
str_detect(sample, "198_2") ~ "D21b",
str_detect(sample, "nodox") ~ "T21",
str_detect(sample, "d0") ~ ".day0",
str_detect(sample, "pD") ~ "pDAPT")) |>
mutate(mcount_ratio = mread_count/tscp_count)
t21neu_removed <- subset(t21neu_pre, subset = nFeature_RNA > 7500 | nFeature_RNA < 1500 | nCount_RNA < 2500 | MT > 0.1)
t21neu <-subset(t21neu_pre, subset = nFeature_RNA <= 7500 & nFeature_RNA >= 1500 & nCount_RNA >= 2500 & MT <= 0.1)
tb.CHR<-pivot_longer(t21neu_pre@meta.data |> rownames_to_column("cell_id"),
c(colnames(df.CHRfrac),"rpsl", "rn5s", "gene_count", "tscp_count", "mcount_ratio"))
ggplot(tb.CHR, aes(x=value, color = sample)) + stat_ecdf(geom = "step") +
facet_wrap(vars(name), scales="free_x")
ggsave("count_ecdf.pdf", path=outdir, height = 5, width = 5, units="in")
ggplot(t21neu_removed@meta.data, aes(x=MT, fill=sample)) +
geom_histogram() + facet_wrap(vars(sample), scales="free") +
geom_vline(xintercept = 0.1)
ggsave("MT_removed.pdf", path=outdir, height = 3, width = 3.6, units="in")
ggplot(t21neu_removed@meta.data, aes(x=gene_count, fill=sample)) +
geom_histogram() + facet_wrap(vars(sample), scales="free") +
geom_vline(xintercept = c(7500))
ggsave("gene_count_removed.pdf", path=outdir, height = 3, width = 3.6, units="in")
ggplot(t21neu_removed@meta.data, aes(x=tscp_count, fill=sample)) +
geom_histogram() + facet_wrap(vars(sample), scales="free_y") +
geom_vline(xintercept = c(2500))
ggsave("tscp_count_removed.pdf", path=outdir, height = 3, width = 3.6, units="in")
ggplot(t21neu_pre@meta.data, aes(y=tscp_count, x=sample, color = sample)) +
geom_boxplot(varwidth = T) +
stat_summary(fun.data = function(x) {
data.frame(label=signif(quantile(x, prob=c(0.1, 0.2, .3,.4,.5,.6,.7,.8,.9), type=1),4), y=c(seq(1,9)*(-3000)))
}, geom = "text",
size=3) + geom_hline(yintercept = c(3200))
ggsave("tscp_count_dis.pdf", path=outdir, height = 3, width =3.6, units="in")
rm(tb.CHR)
rm(vc.nameBYchr)
rm(ls.nameBYchr)
gc()
#based on:
#https://satijalab.org/seurat/articles/sctransform_v2_vignette.html
#https://github.com/satijalab/seurat/issues/5879
t21neu_list <- SplitObject(t21neu, split.by = "sample")
t21neu_list <- lapply(t21neu_list, FUN = function(x) {
x <- NormalizeData(x, normalization.method = "RC", scale.factor = 1e6)
x<-CellCycleScoring(x, s.features = cc.genes.updated.2019$s.genes,
g2m.features = cc.genes.updated.2019$g2m.genes,
set.ident = F)
x@meta.data<-x@meta.data |> mutate(CCdiff = (S.Score-G2M.Score))
#x<-FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
#x<-ScaleData(x, vars.to.regress = vc.regress)
x<-SCTransform(x, vst.flavor = "v2", vars.to.regress = vc.regress, return.only.var.genes =F, verbose = F)
})
features <- SelectIntegrationFeatures(object.list = t21neu_list, nfeatures = 3000)
t21neu_list <- PrepSCTIntegration(object.list = t21neu_list, anchor.features = features)
t21neu_anchors <- FindIntegrationAnchors(object.list = t21neu_list, normalization.method = "SCT",
anchor.features = features,
k.filter = min(t21neu@meta.data$sample |> table()))
t21neu_combined <- IntegrateData(anchorset = t21neu_anchors, normalization.method = "SCT")
t21neu_combined <- RunPCA(t21neu_combined, verbose = FALSE, features = features)
t21neu_combined <- RunUMAP(t21neu_combined, reduction = "pca", dims = 1:30)
t21neu_combined <- FindNeighbors(t21neu_combined, reduction = "pca", dims = 1:30)
t21neu_combined <- FindClusters(t21neu_combined, resolution = 0.5)
rm(t21neu_pre)
rm(t21neu_removed)
rm(t21neu)
rm(t21neu_list)
rm(t21neu_anchors)
gc()
# Determine metrics to plot present in seurat_control@meta.data
require(cowplot)
metrics <- c("gene_count", "tscp_count", "mcount_ratio", "S.Score", "G2M.Score", "CCdiff",
"MT", "rn5s", "rpsl")
# Extract the UMAP coordinates for each cell and include information about the metrics to plot
qc_data <- FetchData(t21neu_combined,
vars = c(metrics, "ident", "UMAP_1", "UMAP_2"))
# Adding cluster label to center of cluster on UMAP
umap_label <- FetchData(t21neu_combined,
vars = c("ident", "UMAP_1", "UMAP_2")) %>%
as.data.frame() %>%
group_by(ident) %>%
summarise(x=mean(UMAP_1), y=mean(UMAP_2))
# Plot a UMAP plot for each metric
map(metrics, function(qc){
ggplot(qc_data,
aes(UMAP_1, UMAP_2)) +
geom_point(aes_string(color=qc),
alpha = 0.7) +
scale_color_gradient(guide = "none",
low = "grey90",
high = "blue") +
geom_text(data=umap_label,
aes(label=ident, x, y)) +
ggtitle(qc)
}) %>% plot_grid(plotlist = .)
ggsave("cowplot.pdf", path=outdir, width=3.6, height=3.6, units="in")
p1 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "sample")
p2 <- DimPlot(t21neu_combined, reduction = "umap", label = TRUE, repel = TRUE)
p1 + p2
DimPlot(t21neu_combined, reduction = "umap", split.by = "sample") +
theme(legend.position = "bottom") + scale_color_brewer(palette = "Paired")
ggsave("umap_bysample.pdf", path=outdir, width=7.2, height=4, units="in")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/auto_detect_tissue_type.R")
# get cell-type-specific gene sets from our in-built database (DB)
# gs_list <- gene_sets_prepare("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_short.xlsx", "Brain") # e.g. Immune system, Liver, Pancreas, Kidney, Eye, Brain
# Burke custom gene set
t21neu_combined <- PrepSCTFindMarkers(t21neu_combined)
DefaultAssay(t21neu_combined)<-RNAvSCT
tissue_guess = auto_detect_tissue_type(path_to_db_file = "../scRNAseq/external_tables/ScTypeDB_full.xlsx",
seuratObject = t21neu_combined, scaled = T,
assay = RNAvSCT)
print(tissue_guess)
tissue_type = vc.tissue
gs_list <- gene_sets_prepare("../scRNAseq/external_tables/ScTypeDB_short.xlsx", tissue_type)
# assign cell types
es.max <- sctype_score(scRNAseqData = t21neu_combined[[DefaultAssay(t21neu_combined)]]@scale.data,
scaled = TRUE,
gs = gs_list$gs_positive,
gs2 = gs_list$gs_negative)
# merge by cluster
cL_results <- do.call("rbind", lapply(unique(t21neu_combined@meta.data$seurat_clusters), function(cl){
es.max.cl = sort(rowSums(es.max[ ,rownames(t21neu_combined@meta.data[t21neu_combined@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(t21neu_combined@meta.data$seurat_clusters==cl)), 10)
}))
write_csv(cL_results, paste0(outdir,"/tables/cL_results.csv"))
sctype_scores <- cL_results |> arrange(desc(scores)) |> group_by(cluster) |> top_n(1, scores)
# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] <- "Unknown"
print(sctype_scores[,1:3])
t21neu_combined@meta.data$customclassif <- ""
for(j in unique(sctype_scores$cluster)){
cl_type <- sctype_scores[sctype_scores$cluster==j,];
t21neu_combined@meta.data$customclassif[t21neu_combined@meta.data$seurat_clusters == j] <- as.character(cl_type$type[1])
}
xist_threshold <- min.xist
t21neu_combined@meta.data<- t21neu_combined@meta.data |>
mutate(simplified_celltype = #"neuron") #customclassif) |>
case_when(#customclassif %in% c("Endothelial cells","Neuroblasts") ~ "Unknown",
#customclassif %in% c("Immature neurons", "Neuroblasts") ~ "neuron",
# #customclassif %in% c("Tanycytes", "Non myelinating Schwann cells") ~ "glia",
(str_detect(customclassif, "euron") |
str_detect(customclassif, "Fetal_quiescent"))
~ "Neurons",
(str_detect(customclassif, "stem") |
str_detect(customclassif, "ndothelial") |
str_detect(customclassif, "Neuroepithelial") |
str_detect(customclassif, "Neural Progenitor") |
str_detect(customclassif, "NPC") |
str_detect(customclassif, "Fetal_replicating") |
str_detect(customclassif, "Radial")) ~ "NPCs",
(#str_detect(customclassif, "Tanycytes") |
#str_detect(customclassif, "Oligodendro") |
#str_detect(customclassif, "OPC") |
str_detect(customclassif, "Astrocyte"))~ "Astrocytes",
T ~ "Other")) |>
rownames_to_column("cell_id") |>
left_join(t21neu_combined$RNA@counts["XIST",] |>
enframe(name="cell_id", value="XIST_counts")) |>
mutate(XIST_status = case_when(XIST_counts >= xist_threshold ~ T,
XIST_counts == 0 ~ F,
T ~ NA)) |>
column_to_rownames("cell_id")
sample.cols<-setNames(c("#E69F00", "#56B4E9", "#0072B2", "#D55E00", "#CC79A7"),
nm = names(t21neu_combined$sample |> table()))
p1 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "XIST_status", shuffle = F, pt.size = 0.1) +
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) +
scale_color_jco()
p2 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "Phase", shuffle = T, pt.size = 0.1) +
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) +
scale_color_jama()
p3 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "simplified_celltype", shuffle = T, pt.size = 0.1) +
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) +
scale_color_tron()
p4 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "customclassif", shuffle = T, pt.size = 0.1) +
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) +
scale_color_brewer(palette = "Set2")
p5 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "sample", shuffle = T, pt.size = 0.1) +
scale_color_manual(values = sample.cols) +
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8))
p6<-DimPlot(t21neu_combined, reduction = "umap", label = F, shuffle = T, pt.size = 0.1) +
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) +
scale_color_brewer(palette = "Paired")
#p1 + p5 + p6 + p4 + p3 + p2
p6 + p4 +
p2 + p3 +
p1 + p5 + plot_layout(ncol = 2)
ggsave("umaps.pdf", path=outdir, width=3.6, height=6.5, units="in")
b1<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |>
filter(!is.na(XIST_status)) |>
group_by(sample, simplified_celltype, XIST_status) |>
tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio = signif(100*n/sum,2)) |> ungroup() |>
mutate(label_pos = case_when(XIST_status ~ (ratio-3), T ~ (-50))) |>
ggplot(aes(x = sample, y=ratio, fill = XIST_status, label = ratio)) + geom_bar(stat = "identity") +
geom_text(aes(y = label_pos, label = ratio), size=2) +
facet_wrap(vars(simplified_celltype)) + scale_fill_jco() + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) +
theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) +
theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") +
theme(legend.position = "top", legend.title=element_blank(), axis.title = element_blank(), legend.key.height = unit(0.1, 'in'))
#ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype",
# label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill())
#ggsave("XIST_bysample.pdf", path=outdir, width=1.8, height=2.4, units="in")
b2<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |>
filter(!is.na(simplified_celltype)) |>
group_by(sample, customclassif, simplified_celltype) |>
tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio = round(100*n/sum,digits = 0)) |> ungroup() |>
mutate(label_pos = case_when(XIST_status ~ (ratio-3), T ~ (-10))) |>
ggplot(aes(x = sample, y=ratio, fill = customclassif)) + geom_bar(stat = "identity") +# geom_text(aes(y = label_pos, label = ratio), size=2) +
facet_wrap(vars(simplified_celltype)) + scale_fill_brewer(palette = "Set2") + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) +
theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) +
theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") +
theme(legend.position = "top", legend.direction = "horizontal", legend.title=element_blank(),legend.key.height = unit(0.1, 'in'))
#ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype",
# label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill())
#ggsave("Customclassif_bysample.pdf", path=outdir, width=1.8, height=2.57, units="in")
b3<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |>
filter(!is.na(Phase)) |>
group_by(sample, Phase, simplified_celltype) |>
tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio_label = signif(100*n/sum,digits =2), ratio = 100*n/sum) |> ungroup() |>
mutate(label_pos = case_when(Phase == "G1" ~ 95, T ~ (-10))) |>
ggplot(aes(x = sample, y=ratio, fill = Phase)) + geom_bar(stat = "identity") + geom_text(aes(y = label_pos, label = ratio_label, color="white"), size=2) +
facet_wrap(vars(simplified_celltype)) + scale_fill_jama() + scale_color_manual(values = "white") + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) +
theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) +
theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") +
theme(legend.position = "top", legend.title=element_blank(), axis.title = element_blank(), legend.key.height = unit(0.1, 'in'))
#ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype",
# label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill())
#ggsave("Phase_bysample.pdf", path=outdir, width=1.8, height=2.4, units="in")
ggsave(b2+b3+b1, filename = "supp_plots.pdf",path = outdir, height = 2.5, width = 5.4, units="in")
#DimPlot(t21neu_combined, reduction = "umap", split.by = "sample", group.by = "customclassif")
DimPlot(t21neu_combined, reduction = "umap", split.by = "sample", group.by = "simplified_celltype", shuffle=T) +
theme(legend.position = "bottom", text = element_text(size = 8)) + scale_color_tron()
ggsave("umap_bysample.pdf", path=outdir, width=3.6, height=3, units="in")
# add allelic data to the metadata
allelic_df <-read_csv("../parsebio_scRNAseq/tables/indv_cells_gene_ae_filtered_paternal_withneu2.csv.gz") |>
filter(cell_id %in% rownames(t21neu_combined@meta.data))
#substr(rownames(t21neu_combined@meta.data),start = 1,nchar(rownames(t21neu_combined@meta.data))-4)))
patallele_fix<-allelic_df |>
mutate(karyo = case_when(str_detect(cell_line, "198") ~ "d21",
T ~ "t21")) |>
group_by(karyo, name) |>
summarise(A = sum(aCount), B = sum(bCount),
pat_bulk = unique(bulk_pat)) |>
pivot_wider(id_cols = c(name, pat_bulk), names_from = karyo, values_from = c(A, B)) |>
replace_na(list(A_d21 = 0, B_d21 = 0, A_t21 = 0, B_t21 = 0)) |>
mutate(d21_rA = A_d21/( A_d21 + B_d21),
d21_rB = B_d21/( A_d21 + B_d21),
t21_rA = A_t21/( A_t21 + B_t21),
t21_rB = B_t21/( A_t21 + B_t21)) |>
mutate(a_allele_diff=t21_rA-d21_rA,
b_allele_diff=t21_rB-d21_rB,
pat_sn = case_when(abs(a_allele_diff) < .05 & abs(b_allele_diff) < .05 ~ "nodiff",
a_allele_diff > 0 & b_allele_diff < 0 ~ "bCount",
a_allele_diff < 0 & b_allele_diff > 0 ~ "aCount",
T ~ "undetermined"))
patallele_fix |> dplyr::select(pat_bulk, pat_sn) |> table(useNA = "always")
patallele_fix2<-patallele_fix |>
mutate(pat_sn = case_when(!is.na(pat_bulk) & pat_sn %in% c("nodiff","undetermined") ~ pat_bulk,
is.na(pat_bulk) & pat_sn %in% c("nodiff","undetermined") ~ "drop",
pat_sn == pat_bulk ~ pat_sn,
!is.na(pat_bulk) & pat_sn != pat_bulk ~ pat_sn,
is.na(pat_bulk) & pat_sn %in% c("aCount","bCount") ~ pat_sn,
T ~ "unresolved"
)) |>
mutate(mat_sn=case_when(pat_sn == "aCount" ~ "bCount",
pat_sn == "bCount" ~ "aCount",
T ~ "drop"))
patallele_fix2 |> filter(pat_sn != "drop") |> dplyr::select(pat_bulk, pat_sn) |> table(useNA = "always")
allelic_df2 <-allelic_df |>
left_join(patallele_fix2 |>
pivot_longer(cols = c(pat_sn), names_to = "allele_type", values_to = "allele") |>
filter(allele_type == "pat_sn")
, by = "name")
pat_ar_df <- allelic_df2 |>
filter(
!is.na(cell_line) & !is.na(allele)
) |>
left_join(t21neu_combined@meta.data |> rownames_to_column("cell_id")) |>
mutate(pat_count = case_when(
allele == 'aCount' ~ aCount,
allele == 'bCount' ~ bCount
),
mat_count = totalCount - pat_count
# cell_id = case_when(
# str_detect(bam, "2k") ~ sprintf("%s__s1", cell),
# str_detect(bam, "5k") ~ sprintf("%s__s2", cell)
# )
) |>
group_by(cell_line, cell_id) |>
summarise(
sample = unique(sample),
XIST_status = unique(XIST_status),
simplified_celltype = unique(simplified_celltype),
nvar = median(n_variants, na.rm = T),
total = sum(totalCount, na.rm = T),
sumPat = sum(pat_count),
sumMat = sum(mat_count),
perc_paternal = sum(pat_count, na.rm = T) / sum(totalCount, na.rm = T),
perc_paternal2 = sum(pat_count/totalCount, na.rm = T ),
pat_to_mat = sum(pat_count, na.rm = T) / sum(mat_count, na.rm = T),
pat_to_mat2 = mean(pat_count/mat_count, na.rm = T)
) |>
ungroup()
require(ggpubr)
pat_ar_df2<-pat_ar_df |>
filter(!(str_detect(sample, "D21") & XIST_status) & !(sample == ".day0" & !XIST_status)) |>
mutate(sample = case_when(#sample %in% c("D21a","D21b") ~ "D21",
sample == "pDAPT" & !XIST_status ~ "T21_pD-",
sample == "pDAPT" & XIST_status ~ "T21_pD+",
sample == ".day0" & !XIST_status ~ "T21_d0-",
sample == ".day0" & XIST_status ~ "T21_d0+",
sample == "T21" ~ "T21",
T ~ sample)) |>
filter(total > 5 & !is.na(simplified_celltype)) |> filter(!is.na(XIST_status)) |>
mutate(sample = case_when(sample == "D21" ~ "euploid", T ~ sample)) |>
mutate(XIST_status = case_when(sample %in% c("D21a","D21b") ~ "absent", T ~ as.character(XIST_status)))
allelic_plot<-ggplot(pat_ar_df2 #|> filter(simplified_celltype != "Other")
, aes(x=sample, y=perc_paternal)) +
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
scale_color_jama() +
stat_summary(fun.data = function(x) {
data.frame(label=signif(median(x),2), y=-0.1)},
geom = "text", size=2) +
stat_compare_means(ref.group = "T21", method = "wilcox",
#method.args = list(alternative="less"),
label.y = 0.9, hide.ns = T, label = "p.format", size=2) +
theme(axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "top") +
labs(
title="Paternal chr21 allele expression",
y="Lesser allele fraction",
color="XIST",
fill="XIST"
) +geom_hline(yintercept = c(0.33,0.5), alpha = 0.1) +
scale_x_discrete(position = "top") + ylim(c(-0.1,1)) #+ facet_grid(rows = vars(simplified_celltype), scales = "free")
ggsave("allelic_all.pdf", path=outdir, width=3.6, height = 1.8, units="in")
ggplot(pat_ar_df2, aes(x=sample, y=perc_paternal)) +
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
scale_color_jama() +
stat_summary(fun.data = function(x) {
data.frame(label=signif(median(x),2), y=-0.1)},
geom = "text", size=2) +
stat_compare_means(ref.group = "T21", method = "wilcox",
#method.args = list(alternative="less"),
label.y = 0.9, hide.ns = T, label = "p.format", size=2) +
theme(axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "top") +
labs(
title="Paternal chr21 allele expression",
y="Lesser allele fraction",
color="XIST",
fill="XIST"
) +geom_hline(yintercept = c(0.33,0.5), alpha = 0.1) +
scale_x_discrete(position = "top") + ylim(c(-0.1,1)) + facet_grid(rows = vars(simplified_celltype), scales = "free")
ggsave("allelic_by_celltype.pdf", path=outdir, width=2.5, height = 3.6, units="in")
rm(allelic_df)
rm(allelic_df2)
gc()
##ECDF plots### the combined_output_df(2) is a large file to hold in memory -
# maybe there's a more lightweight container for this
rm(combined_output_df)
gc()
combined_output_df <- t21neu_combined[["RNA"]]@data |>
as.data.frame() |>
rownames_to_column(var="gene_name") |>
tibble() |>
pivot_longer(cols = -gene_name, names_to = "cell_id", values_to = "expr_val") |>
left_join(t21neu_combined$sample |> enframe(name="cell_id", value = "sample_id"),
by="cell_id") |>
left_join(
gene_info_df |> dplyr::select(-c(description,wikigene_name)) |>
rename(gene_name = "hgnc_symbol") |> filter(!is.na(gene_name) & !duplicated(gene_name)) |>
mutate(gene_id = str_replace(ensembl_gene_id_version, "\\.[0-9]+", "")),
by= "gene_name")
gc()
#mean non21 expr by cell - should rename this instead of "autosome" to include X (& Y)
mean_autosome_expression_df <- combined_output_df |>
filter(!(chromosome_name %in% c(#"X", "Y",
"MT",
NA, "21"))) |>
#filter(expr_val > 0) |>
group_by(cell_id) |>
summarise(mean_a_expr = mean(expr_val), countA = n())
#each chr21 gene by cell
chr21gene_bycell <- combined_output_df |>
filter(chromosome_name %in% c("21")) |>
dplyr::rename(expr21 = "expr_val") |>
#filter(expr21 > 0) |>
left_join(mean_autosome_expression_df, by="cell_id") #|>
#group_by(cell_id) |>
#summarise(mean21 = median(expr21), count21 = n())
rm(combined_output_df)
gc()
### BY CELL APPROACH
fx_ecdf <- function(x) {
# Sort the input vector in ascending order
sorted_x <- sort(x)
# Calculate the unique values and their frequencies in the input vector
unique_x <- unique(sorted_x)
counts <- tabulate(match(sorted_x, unique_x))
# Calculate the empirical cumulative density for each unique value
ecdf <- cumsum(counts)/length(x)
# Create a vector of empirical cumulative densities corresponding to the original input vector
output_ecdf <- ecdf[match(x, unique_x)]
# Return the vector of empirical cumulative densities
return(output_ecdf)
}
t21_d21_cell_ratio <- chr21gene_bycell |>
left_join(t21neu_combined@meta.data |>
as.data.frame() |>
rownames_to_column(var="cell_id") |>
as_tibble() |>
dplyr::select(cell_id, simplified_celltype, XIST_status, XIST_counts),
by = "cell_id") |>
group_by(sample_id, simplified_celltype, cell_id, XIST_status) |>
summarise(mean21 = mean(expr21), count = n(), mean_a_expr = unique(mean_a_expr)) |>
mutate(ratio = mean21/mean_a_expr) |> filter(!(sample_id == ".day0" & !XIST_status))
tb.bycell<-t21_d21_cell_ratio |> filter(!(is.na(XIST_status) | (XIST_status & str_detect(sample_id, "[DT]21")))) |>
mutate(sample = case_when(#sample_id %in% c("D21a","D21b") ~ "D21",
sample_id == "pDAPT" & !XIST_status ~ "T21_pD-",
sample_id == "pDAPT" & XIST_status ~ "T21_pD+",
sample_id == ".day0" & !XIST_status ~ "T21_d0-",
sample_id == ".day0" & XIST_status ~ "T21_d0+",
sample_id == "T21" ~ "T21",
T ~ sample_id)) |>
mutate(XIST_status = case_when(!XIST_status & str_detect(sample_id, "D21") ~ "absent",
T ~ as.character(XIST_status)))
bp.percell<-ggplot(tb.bycell |> filter(simplified_celltype != "Other"), aes(x=sample, y=ratio, fill = XIST_status)) +
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
scale_color_jama() +
stat_summary(fun.data = function(x) {
data.frame(label=signif(median(x),2), y=0)},
geom = "text", size=2) +
stat_compare_means(ref.group = "T21", method = "wilcox",
method.args = list(alternative="less"),
label.y = 2.2, hide.ns = T, label = "p.format", size=2) +
theme( axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "top"
) +
labs(
title="chr21 dosage by cell",
y ="chr21/genome_mean ratio",
color="XIST",
fill="XIST"
) + scale_x_discrete(position = "top") # + facet_grid(rows = vars(simplified_celltype), scales = "free", space = "free")
ggsave(bp.percell, filename = "boxplot_bycell2.pdf", path=outdir, width = 3.6, height = 1.8, units = "in")
ggplot(tb.bycell, aes(x=sample, y=ratio, fill = XIST_status)) +
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
scale_color_jama() +
stat_summary(fun.data = function(x) {
data.frame(label=signif(median(x),2), y=0)},
geom = "text", size=2) +
stat_compare_means(ref.group = "T21", method = "wilcox", label.x.npc = "center",
method.args = list(alternative="less"), label.y.npc = 0.9,
hide.ns = T, label = "p.signif", size=3) +
theme( axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "top"
) +
labs(
title="chr21 dosage by cell",
y ="chr21/genome_mean ratio",
color="XIST",
fill="XIST"
) + scale_x_discrete(position = "top") + facet_grid(rows = vars(simplified_celltype), scales = "free")
ggsave(filename = "boxplot_bycell.pdf", path=outdir, width = 2.5, height = 4, units = "in")
require(NSM3)
test_cell<-t21_d21_cell_ratio |> ungroup() |> filter(!(sample_id == ".day0" & !XIST_status)) |>
group_by(sample_id, simplified_celltype, XIST_status) |>
add_tally() |> #filter(n>=10) |>
arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status)
# & simplified_celltype != "Other"
) |>
group_by(sample_id, simplified_celltype, XIST_status) |>
mutate(eCDF = fx_ecdf(ratio)) |>
mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |>
mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |>
ungroup()
refratio <-"T21"
testref <- test_cell |> group_by(sample_id) |> filter(simplified_celltype == "Neurons" & sample_id == refratio)
tested <- test_cell |> group_by(sample_id, XIST_status) |> filter(simplified_celltype == "Neurons" & sample_id != refratio)
ks.neurons<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
testref <- test_cell |> group_by(sample_id) |> filter(simplified_celltype == "Astrocytes" & sample_id == refratio)
tested <- test_cell |> group_by(sample_id, XIST_status) |> filter(simplified_celltype == "Astrocytes" & sample_id != refratio)
ks.glia<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
testref <- test_cell |> group_by(sample_id) |> filter(simplified_celltype == "NPCs" & sample_id == refratio)
tested <- test_cell |> group_by(sample_id, XIST_status) |> filter(simplified_celltype == "NPCs" & sample_id != refratio)
ks.rgc<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
testref <- test_cell |> group_by(sample_id) |> filter(simplified_celltype == "Other" & sample_id == refratio)
tested <- test_cell |> group_by(sample_id, XIST_status) |> filter(simplified_celltype == "Other" & sample_id != refratio)
ks.unk<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
ks21_bycell<-bind_rows(list("Neurons" = ks.neurons, "Astrocytes" = ks.glia, "NPCs" = ks.rgc, "Other" = ks.unk), .id = 'simplified_celltype')
df_anno_cell<-ks21_bycell |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |> ungroup() |>
group_by(simplified_celltype) |> mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> add_column(x = 0.75) |> mutate(y = 1-row_number()/10)
min.cells<-4
gg.ecdf_bycell<-ggplot(test_cell |> group_by(sample_id, simplified_celltype, XIST_status) |>
add_tally(name = "cells") |> filter(cells>min.cells) |> ungroup() |>
mutate(upperCI = case_when(sample_id == refratio ~ upperCI, T ~ NA)) |>
mutate(lowerCI = case_when(sample_id == refratio ~ lowerCI, T ~ NA))
, aes(x=ratio, y=eCDF, colour=sample_id, linetype= XIST_status)) +
geom_step() + geom_text(data = df_anno_cell, aes(x = x, y = y, label = ksp), size =2) +
geom_hline(yintercept = 0.5, alpha = 0.2) + geom_vline(xintercept = c(1,1.5), alpha =0.2) +
geom_ribbon(aes(ymin=lowerCI, ymax=upperCI, fill = sample_id),
alpha=0.2, color = NA) +
facet_grid(rows = vars(simplified_celltype)) + xlim(0.5,2) + theme(legend.position = "right")
ggsave(gg.ecdf_bycell, filename = "ecdf_bycell.pdf", path=outdir, width = 2.4, height = 3.6, units = "in")
test_cell2<-t21_d21_cell_ratio |> ungroup() |>
filter(!(sample_id == ".day0" & !XIST_status)) |>
group_by(sample_id, XIST_status) |>
add_tally() |> #filter(n>=10) |>
arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status)
& simplified_celltype != "Other"
) |>
group_by(sample_id, XIST_status) |>
mutate(eCDF = fx_ecdf(ratio)) |>
mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |>
mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |>
ungroup()
refratio <-"T21"
testref <- test_cell2 |> group_by(sample_id) |> filter(sample_id == refratio)
tested <- test_cell2 |> group_by(sample_id, XIST_status) |> filter(sample_id != refratio)
ks.all<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
df_anno_cell2<-ks21_bycell2 |> ungroup() |> filter(ksp <= 0.05) |> arrange(desc(ksd)) |>
mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |>
add_column(x = 0.75) |> mutate(y = 1-row_number()/10)
gg.ecdf_bycell2<-ggplot(test_cell2 |> group_by(sample_id, XIST_status) |>
add_tally(name = "cells") |> filter(cells>min.cells) |> ungroup() |>
mutate(upperCI = case_when(sample_id == refratio ~ upperCI, T ~ NA)) |>
mutate(lowerCI = case_when(sample_id == refratio ~ lowerCI, T ~ NA))
, aes(x=ratio, y=eCDF, colour=sample_id, linetype= XIST_status)) +
geom_step() + geom_text(data = df_anno_cell2, aes(x = x, y = y, label = ksp), size =3) +
geom_hline(yintercept = 0.5, alpha = 0.2) + geom_vline(xintercept = c(1,1.5), alpha =0.2) +
geom_ribbon(aes(ymin=lowerCI, ymax=upperCI, fill = sample_id),
alpha=0.2, color = NA) + xlim(0.5,2) + theme(legend.position = "right")
ggsave(gg.ecdf_bycell2, filename = "ecdf_bycell2.pdf", path=outdir, width = 3.2, height = 1.45, units = "in")
### BY GENE APPROACH
#mean euploid expression for each chr21 gene averaged over all cells of celltype in sample_id
chr21_mean_euploid_gene <- chr21gene_bycell |>
left_join(t21neu_combined@meta.data |>
as.data.frame() |>
rownames_to_column(var="cell_id") |>
as_tibble() |>
dplyr::select(cell_id, simplified_celltype),
by = "cell_id") |>
group_by(sample_id, simplified_celltype, gene_id, gene_name, chromosome_name, start_position) |>
summarise(mean21 = mean(expr21)) |> filter(mean21 > 0) |>
pivot_wider(id_cols = c(simplified_celltype, gene_id, gene_name, chromosome_name, start_position), names_from = sample_id, values_from = mean21) |>
mutate(mean21_euploid = rowMeans(cbind(D21a, D21b))) |>
mutate(t21_ratio = T21/mean21_euploid) |>
ungroup() |>
dplyr::select(simplified_celltype, gene_id, mean21_euploid, t21_ratio)
require(NSM3)
t21_d21_gene_ratio <- chr21gene_bycell |>
left_join(t21neu_combined@meta.data |>
as.data.frame() |>
rownames_to_column(var="cell_id") |>
as_tibble() |>
dplyr::select(cell_id, simplified_celltype, XIST_status),
by = "cell_id") |>
group_by(sample_id, simplified_celltype, gene_id, gene_name, chromosome_name, start_position, XIST_status) |>
summarise(mean21 = mean(expr21), count = n()) |> filter(mean21 > 0) |>
filter(count > 4) |>
pivot_wider(id_cols = c(simplified_celltype, gene_id, gene_name, chromosome_name, start_position, XIST_status), names_from = sample_id, values_from = c(mean21, count)) |>
left_join(chr21_mean_euploid_gene,
by=c("simplified_celltype", "gene_id")) |>
mutate(t21_d21_ratio = mean21_T21/mean21_euploid,
t21_expr = mean21_T21,
pD_d21_ratio = mean21_pDAPT/mean21_euploid,
pD_expr = mean21_pDAPT,
d0_d21_ratio = mean21_.day0/mean21_euploid,
d0_expr = mean21_.day0) |>
mutate(euploid1_ratio = mean21_D21a/mean21_euploid,
euploid2_ratio = mean21_D21b/mean21_euploid,
t21_pD_ratio = t21_expr/pD_expr)
#write_csv(t21_d21_ratio_df, "tables/t21_d21_ratio_df.csv.gz")
#
# gene_order <- t21_d21_gene_ratio |>
# filter(!is.infinite(pD_d21_ratio) & !is.nan(pD_d21_ratio) &
# simplified_celltype == "neuron") |>
# arrange(pD_d21_ratio) |>
# pull(gene_name)
tb.bygene<-t21_d21_gene_ratio |> ungroup() |> filter(!is.na(XIST_status)) |>
dplyr::select(simplified_celltype, XIST_status, gene_id,
t21_d21_ratio,
pD_d21_ratio,
d0_d21_ratio,
euploid1_ratio,
euploid2_ratio) |>
rename_with(~gsub("_d21_ratio", "", .x), .cols=ends_with("d21_ratio")) |>
rename_with(~gsub("_ratio", "", .x), .cols=ends_with("_ratio")) |>
pivot_longer(cols = -c(simplified_celltype,
gene_id,
XIST_status), names_to = "sample", values_to = "ratio") |>
mutate(XIST_status = case_when(str_detect(sample, "euploid") ~ "absent",
T ~ as.character(XIST_status))) |> filter(!is.na(ratio)) |>
mutate(sample = case_when(
sample == "euploid1" ~ "D21a",
sample == "euploid2" ~ "D21b",
sample == "pD" & XIST_status == "FALSE" ~ "T21_pD-",
sample == "pD" & XIST_status == "TRUE" ~ "T21_pD+",
sample == "d0" & XIST_status == "FALSE" ~ "skip",
sample == "d0" &XIST_status == "TRUE" ~ "T21_d0+",
sample == "t21" ~ "T21",
T ~ sample)) |> filter(sample != "skip")
bp.pergene<-ggplot(tb.bygene |> filter(simplified_celltype != "Other"),
aes(x=sample, y=ratio, fill = XIST_status)) +
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
scale_color_jama() + coord_cartesian(ylim = c(-0.1,4)) +
#stat_summary(fun = median, geom = "point", shape=23, size=1, alpha=1, aes(fill=XIST_status)) +
stat_summary(fun.data = function(x) {
data.frame(label=signif(median(x),2), y=-0.1)
},
geom = "text",
size=2) +
stat_compare_means(ref.group = "T21", method = "wilcox",
method.args = list(alternative="less"),
label.y = 3.5, hide.ns = T, label = "p.format", size=2) +
theme(
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "top"
) +
labs(
title="chr21 dosage by gene",
y ="chr21_gene/genome ratio",
color="XIST",
fill="XIST"
) +
scale_x_discrete(position = "top") #+ facet_grid(rows = vars(simplified_celltype), scales = "free", space = "free")
ggsave(bp.pergene, filename = "boxplot_bygene2.pdf", path=outdir, width = 3.6, height = 1.8, units = "in")
ggplot(tb.bygene,
aes(x=sample, y=ratio, fill = XIST_status)) +
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) +
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) +
scale_color_jama() + coord_cartesian(ylim = c(-0.1,4)) +
#stat_summary(fun = median, geom = "point", shape=23, size=1, alpha=1, aes(fill=XIST_status)) +
stat_summary(fun.data = function(x) {
data.frame(label=signif(median(x),2), y=-0)
},
geom = "text",
size=2) +
stat_compare_means(ref.group = "T21", method = "wilcox",
method.args = list(alternative="less"), label.y = 3.5,
hide.ns = T, label = "p.signif", size=3) +
theme(
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "top"
) +
labs(
title="chr21 dosage by gene",
y ="chr21_gene/genome ratio",
color="XIST",
fill="XIST"
) +
scale_x_discrete(position = "top") + facet_grid(rows = vars(simplified_celltype), scales = "free")
ggsave(filename = "boxplot_bygene.pdf", path=outdir, width = 2.5, height = 4, units = "in")
test<-t21_d21_gene_ratio |>
pivot_longer(cols = c(t21_d21_ratio,
pD_d21_ratio,
d0_d21_ratio,
euploid1_ratio,
euploid2_ratio), names_to = "ratio_type", values_to = "ratio") |>
arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status) &
!(ratio_type == "d0_d21_ratio" & !XIST_status)
) |>
mutate(ratio_type = str_split_i(ratio_type, "_", 1)) |>
mutate(ratio_type = case_when(ratio_type == "t21" ~ "T21",
ratio_type == "d0" ~ ".day0",
ratio_type == "euploid1" ~ "D21a",
ratio_type == "euploid2" ~ "D21b",
ratio_type == "pD" ~ "pDAPT"
)) |>
group_by(ratio_type, simplified_celltype, XIST_status) |>
dplyr::select(ratio, ratio_type, gene_name) |>
#test_new<- test |> group_modify(test %>% mutate(interp_dens = calc_density(test$ratio)))
mutate(eCDF = fx_ecdf(ratio)) |>
mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |>
mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |>
ungroup()
refratio <-"T21"
testref <- test |> group_by(ratio_type) |> filter(simplified_celltype == "Neurons" & ratio_type == refratio)
tested <- test |> group_by(ratio_type, XIST_status) |> filter(simplified_celltype == "Neurons" & ratio_type != refratio)
ks.neurons<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
testref <- test |> group_by(ratio_type) |> filter(simplified_celltype == "Astrocytes" & ratio_type == refratio)
tested <- test |> group_by(ratio_type, XIST_status) |> filter(simplified_celltype == "Astrocytes" & ratio_type != refratio)
ks.glia<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
testref <- test |> group_by(ratio_type) |> filter(simplified_celltype == "NPCs" & ratio_type == refratio)
tested <- test |> group_by(ratio_type, XIST_status) |> filter(simplified_celltype == "NPCs" & ratio_type != refratio)
ks.rgc<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
testref <- test |> group_by(ratio_type) |> filter(simplified_celltype == "Other" & ratio_type == refratio)
tested <- test |> group_by(ratio_type, XIST_status) |> filter(simplified_celltype == "Other" & ratio_type != refratio)
ks.unk<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
ks21_bygene<-bind_rows(list("Neurons" = ks.neurons,
"Astrocytes" = ks.glia,
"Other" = ks.unk,
"NPCs" = ks.rgc), .id = 'simplified_celltype')
df_anno<-ks21_bygene |> ungroup() |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |>
mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> group_by(simplified_celltype) |>
add_column(x = 0.75) |> mutate(y = 1-row_number()/10)
gg.ecdf_bygene<-ggplot(test |> # filter(ratio_type != "d0_d21_ratio") |>
mutate(upperCI = case_when(ratio_type == refratio ~ upperCI, T ~ NA)) |>
mutate(lowerCI = case_when(ratio_type == refratio ~ lowerCI, T ~ NA))
, aes(x=ratio, y=eCDF, colour=ratio_type, linetype= XIST_status)) +
geom_step() + geom_text(data = df_anno, aes(x = x, y = y, label = ksp), size = 2) +
geom_hline(yintercept = 0.5, alpha = 0.2) + geom_vline(xintercept = c(1,1.5), alpha =0.2) +
geom_ribbon(aes(ymin=lowerCI, ymax=upperCI, fill = ratio_type),
alpha=0.2, color = NA) +
facet_grid(rows = vars(simplified_celltype)) + xlim(0.5,2) + theme(legend.position = "right")
ggsave(gg.ecdf_bygene, filename = "ecdf_bygene.pdf", path=outdir, width = 2.4, height = 3.6, units = "in")
test2<-t21_d21_gene_ratio |>
pivot_longer(cols = c(t21_d21_ratio,
pD_d21_ratio,
d0_d21_ratio,
euploid1_ratio,
euploid2_ratio), names_to = "ratio_type", values_to = "ratio") |>
arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status) &
simplified_celltype != "Other" & !(ratio_type == "d0_d21_ratio" & !XIST_status)
) |>
mutate(ratio_type = str_split_i(ratio_type, "_", 1)) |>
mutate(ratio_type = case_when(ratio_type == "t21" ~ "T21",
ratio_type == "d0" ~ ".day0",
ratio_type == "euploid1" ~ "D21a",
ratio_type == "euploid2" ~ "D21b",
ratio_type == "pD" ~ "pDAPT"
)) |>
group_by(ratio_type, XIST_status) |>
dplyr::select(ratio, ratio_type, gene_name) |>
mutate(eCDF = fx_ecdf(ratio)) |>
mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |>
mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |>
ungroup()
refratio <-"T21"
testref <- test2 |> group_by(ratio_type) |> filter(ratio_type == refratio)
tested <- test2 |> group_by(ratio_type, XIST_status) |> filter(ratio_type != refratio)
ks.all<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value,
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic)
ks21_bygene2<-bind_rows(list(#"neuron" = ks.neurons,
#"glia" = ks.glia,
"all" = ks.all), .id = 'simplified_celltype')
#df_anno2<-ks21_bygene2 |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |>
# mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> add_column(x = 0.5) |> mutate(y = 1-row_number()/10)
df_anno2<-ks21_bygene2 |> ungroup() |> filter(ksp <= 0.05) |> arrange(desc(ksd)) |>
mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |>
add_column(x = 0.75) |> mutate(y = 1-row_number()/10)
gg.ecdf_bygene2<-ggplot(test2 |>
mutate(upperCI = case_when(ratio_type == refratio ~ upperCI, T ~ NA)) |>
mutate(lowerCI = case_when(ratio_type == refratio ~ lowerCI, T ~ NA))
, aes(x=ratio, y=eCDF, colour=ratio_type, linetype= XIST_status)) +
geom_step() + geom_text(data = df_anno2, aes(x = x, y = y, label = ksp), size = 3) +
geom_hline(yintercept = 0.5, alpha = 0.2) + geom_vline(xintercept = c(1,1.5), alpha =0.2) +
geom_ribbon(aes(ymin=lowerCI, ymax=upperCI, fill = ratio_type),
alpha=0.2, color = NA) + xlim(0.5,2) #+ theme(legend.position = "none")
ggsave(gg.ecdf_bygene2, filename = "ecdf_bygene2.pdf", path=outdir, width = 3.2, height = 1.45, units = "in")
rm(chr21gene_bycell)
### DIFF expression
t21neu_combined <- PrepSCTFindMarkers(t21neu_combined)
fx.diffroc<-function(slott, assayy, seuratt, ident1, ident2, testt="roc", by_celltype=F){
DefaultAssay(seuratt) <- assayy
if (by_celltype == F) {
resultt<-list()
resultt[["none"]]<-FindMarkers(
seuratt,
test.use = testt,
slot = slott,
assay=assayy,
ident.1 = ident1,
ident.2 = ident2,
group.by = 'diff_expr',
recorrect_umi = F,
logfc.threshold = 0) |>
mutate(
testname = paste0(assayy, "_", slott),
contrast = sprintf("%s_%s", ident1, ident2),
test = (sign(avg_log2FC) * power)
) |>
arrange(desc(test)) |>
rownames_to_column("gene_symbol") |>
left_join(gene_info_df |> dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype),
by=c("gene_symbol"="hgnc_symbol"))
}else {
celltypes <- unique(seuratt@meta.data$simplified_celltype)
resultt <- list()
for (celltype in celltypes) {
resultt[[celltype]] <- FindMarkers(
seuratt,
test.use = testt,
slot = slott,
assay=assayy,
ident.1 = sprintf("%s_%s", ident1, celltype),
ident.2 = sprintf("%s_%s", ident2, celltype),
group.by = 'diff_expr_celltype',
recorrect_umi = F,
logfc.threshold = 0) |>
mutate(
testname = paste0(assayy, "_", slott),
contrast = sprintf("%s_%s_%s", celltype, ident1, ident2),
test = (sign(avg_log2FC) * power)
) |>
arrange(desc(test)) |>
rownames_to_column("gene_symbol") |>
left_join(gene_info_df |> dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype),
by=c("gene_symbol"="hgnc_symbol"))
}
}
return(resultt)
}
fx.diffroc2<-function(slott, assayy, seuratt, ident1, ident2, testt="roc", by_celltype=F){
DefaultAssay(seuratt) <- assayy
if (by_celltype == F) {
resultt<-list()
resultt[["none"]]<-FindMarkers(
seuratt,
test.use = testt,
slot = slott,
assay=assayy,
ident.1 = ident1,
ident.2 = ident2,
group.by = 'diff_expr2',
recorrect_umi = F,
logfc.threshold = 0) |>
mutate(
testname = paste0(assayy, "_", slott),
contrast = sprintf("%s_%s", ident1, ident2),
test = (sign(avg_log2FC) * power)
) |>
arrange(desc(test)) |>
rownames_to_column("gene_symbol") |>
left_join(gene_info_df |> dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype),
by=c("gene_symbol"="hgnc_symbol"))
}else {
celltypes <- unique(seuratt@meta.data$simplified_celltype)
resultt <- list()
for (celltype in celltypes) {
resultt[[celltype]] <- FindMarkers(
seuratt,
test.use = testt,
slot = slott,
assay=assayy,
ident.1 = sprintf("%s_%s", ident1, celltype),
ident.2 = sprintf("%s_%s", ident2, celltype),
group.by = 'diff_expr_celltype2',
recorrect_umi = F,
logfc.threshold = 0) |>
mutate(
testname = paste0(assayy, "_", slott),
contrast = sprintf("%s_%s_%s", celltype, ident1, ident2),
test = (sign(avg_log2FC) * power)
) |>
arrange(desc(test)) |>
rownames_to_column("gene_symbol") |>
left_join(gene_info_df |> dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype),
by=c("gene_symbol"="hgnc_symbol"))
}
}
return(resultt)
}
t21neu_combined@meta.data$diff_expr <- t21neu_combined@meta.data |>
as.data.frame() |>
rownames_to_column(var = "cell_id") |>
mutate(
Diff_expr = case_when(
sample == "pDAPT" & XIST_status ~ "pDxist",
sample %in% c("pDAPT") & !XIST_status ~ "pDno",
str_detect(sample, "D21") & !XIST_status ~ "d21",
sample == "T21" & !XIST_status ~ "t21",
sample == ".day0" & XIST_status ~ "d0",
T~"left"
)) |>
pull(Diff_expr)
t21neu_combined@meta.data$diff_expr2 <- t21neu_combined@meta.data |>
as.data.frame() |>
rownames_to_column(var = "cell_id") |>
mutate(
Diff_expr = case_when(
sample == "pDAPT" & XIST_status ~ "pDxist",
sample %in% c("T21","pDAPT") & !XIST_status ~ "t21",
str_detect(sample, "D21") & !XIST_status ~ "d21",
#sample == "T21" & !XIST_status ~ "t21",
sample == ".day0" & XIST_status ~ "d0",
T~"left"
)) |>
pull(Diff_expr)
t21neu_combined@meta.data$diff_expr_celltype <- t21neu_combined@meta.data |>
as.data.frame() |>
rownames_to_column(var = "cell_id") |>
mutate(
Diff_expr = case_when(
sample == "pDAPT" & XIST_status ~ sprintf("%s_%s", "pDxist", simplified_celltype),
sample %in% c("pDAPT") & !XIST_status ~ sprintf("%s_%s", "pDno", simplified_celltype),
str_detect(sample, "D21") & !XIST_status ~ sprintf("%s_%s", "d21", simplified_celltype),
sample == "T21" & !XIST_status ~ sprintf("%s_%s", "t21", simplified_celltype),
sample == ".day0" & XIST_status ~ sprintf("%s_%s", "d0", simplified_celltype),
T~sprintf("%s_%s", "left", simplified_celltype)
)) |>
pull(Diff_expr)
t21neu_combined@meta.data$diff_expr_celltype2 <- t21neu_combined@meta.data |>
as.data.frame() |>
rownames_to_column(var = "cell_id") |>
mutate(
Diff_expr = case_when(
sample == "pDAPT" & XIST_status ~ sprintf("%s_%s", "pDxist", simplified_celltype),
sample %in% c("T21","pDAPT") & !XIST_status ~ sprintf("%s_%s", "t21", simplified_celltype),
str_detect(sample, "D21") & !XIST_status ~ sprintf("%s_%s", "d21", simplified_celltype),
#sample == "T21" & !XIST_status ~ sprintf("%s_%s", "t21", simplified_celltype),
sample == ".day0" & XIST_status ~ sprintf("%s_%s", "d0", simplified_celltype),
T~sprintf("%s_%s", "left", simplified_celltype)
)) |>
pull(Diff_expr)
make_plot <- function(df) {
if (typeof(df) == "list") {
comb_df <- tibble()
for (celltype in names(df)) {
comb_df <- rbind(
comb_df,
df[[celltype]] |>
mutate(comparison_celltype = celltype)
)
}
return(
comb_df |>
filter(chromosome_name == "21" & is.finite(test) & test != 0) |>
ggplot(aes(x = fct_reorder(gene_symbol, test), y=test, fill=testname)) +
facet_grid(rows = vars(testname), cols = vars(comparison_celltype)) +
# facet_wrap("testname", scales = "free") +
geom_hline(yintercept = c(-0.1,0.1)) +
geom_bar(position="stack", stat="identity") +
coord_flip()
)
}else {
return(
df |>
filter(chromosome_name == "21" & is.finite(test) & test != 0) |>
ggplot(aes(x = fct_reorder(gene_symbol, test), y=test, fill=testname)) +
facet_wrap("testname", scales = "free") +
geom_hline(yintercept = c(-0.1,0.1)) +
geom_bar(position="stack", stat="identity") +
coord_flip()
)
}
}
## separate
SCT_data_t21_ct <- fx.diffroc(
slott = "data",
assayy = "SCT",
seuratt = t21neu_combined,
ident1 = "d21",
ident2 = "t21",
by_celltype = T
)
SCT_data_pD_ct <- fx.diffroc(
slott = "data",
assayy = "SCT",
seuratt = t21neu_combined,
ident1 = "pDxist",
ident2 = "pDno",
by_celltype = T
)
SCT_data_d0_ct <- fx.diffroc(
slott = "data",
assayy = "SCT",
seuratt = t21neu_combined,
ident1 = "d0",
ident2 = "pDno",
by_celltype = T
)
##pD and t21 combined to t21
SCT_data_t21_ct2 <- fx.diffroc2(
slott = "data",
assayy = "SCT",
seuratt = t21neu_combined,
ident1 = "d21",
ident2 = "t21",
by_celltype = T
)
SCT_data_pD_ct2 <- fx.diffroc2(
slott = "data",
assayy = "SCT",
seuratt = t21neu_combined,
ident1 = "pDxist",
ident2 = "t21",
by_celltype = T
)
SCT_data_d0_ct2 <- fx.diffroc2(
slott = "data",
assayy = "SCT",
seuratt = t21neu_combined,
ident1 = "d0",
ident2 = "t21",
by_celltype = T
)
make_comb_df <- function(df) {
comb_df <- tibble()
for (celltype in names(df)) {
comb_df <- rbind(
comb_df,
df[[celltype]] |>
mutate(comparison_celltype = celltype)
)
}
return(comb_df)
}
require(scales)
fx.chr_plots<-function(label, allsubset, chr, testn, hght){
allsubset<- allsubset |>
mutate(contrast3 = case_when(contrast3 == "d21_t21" ~ "euploid",
contrast3 == "d21_pDno" ~ "euploid",
contrast3 == "d0_t21" ~ "day0",
contrast3 == "d0_pDno" ~ "day0",
contrast3 == "pDxist_t21" ~ "pDAPT",
contrast3 == "pDxist_pDno" ~ "pDAPT",
T ~ "left"))
d21_normalized<-allsubset |> filter(chromosome_name == chr & testname == testn & contrast3=="euploid") |>
rename(avg_euploid = "avg_log2FC") |> dplyr::select(gene_symbol, avg_euploid, comparison_celltype)
allsubset |> filter(chromosome_name == chr & testname == testn) |>
left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |>
filter(!is.na(avg_euploid)) |>
ggplot(aes(x=contrast3, y = avg_log2FC, color=contrast3)) +
geom_boxplot(notch=T) +
facet_grid(rows = vars(comparison_celltype)) +
stat_summary(fun.data = function(x) {
data.frame(label=signif(median(x),2), y=-0.9)}, geom = "text", color="black", size=2) +
stat_compare_means(ref.group = "euploid", method = "wilcox",
label.y = 0.9, hide.ns = F, label = "p.format", size=2) +
coord_cartesian(ylim = c(-1,1)) + geom_hline(yintercept = c(0)) +
theme(legend.position = "top") + ggtitle("Differential to T21") +
theme(text = element_text(size = 6), legend.text = element_text(size = 6))
ggsave(filename = paste(testn, label, "box.pdf", sep = "_"), path=outdir, width = 1.8, height = hght, units = "in")
allsubset |> filter(chromosome_name == chr & testname == testn) |>
left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |>
filter(!is.na(diff_log2FC)) |> mutate(chr21_pos = TSS/1000000) |>
ggplot(aes(x=chr21_pos, y = avg_log2FC, color=contrast3)) +# geom_point(alpha=0.2) +
geom_smooth(aes(fill = contrast3)) +
facet_grid(rows = vars(comparison_celltype)) +
coord_cartesian(ylim = c(-0.6,0.2)) + geom_hline(yintercept = 0) + geom_vline(xintercept = 37.550000, linetype=3) +
theme(legend.position = "top") + ggtitle("Differential to T21 by chromosomal position") +
theme(text = element_text(size = 6), legend.text = element_text(size = 6))
ggsave(filename = paste(testn, label, "pos.pdf", sep = "_"), path=outdir, width = 2.4, height = hght, units = "in")
allsubset |> filter(chromosome_name == chr & testname == testn) |>
left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |>
filter(!is.na(diff_log2FC) & contrast3 != "euploid") |>
mutate(distance=abs(37550000-TSS)/1000000) |>
ggscatter(x="distance", y = "avg_log2FC", color="contrast3", palette= hue_pal()(3)[c(1,3)], add = "reg.line", alpha = 0.2, size = 2,
cor.coef = T, cor.coef.size = 2,conf.int = T, ggtheme = theme_classic(), #add.params = list(color="black"),
cor.coeff.args = list(method = "pearson", label.x = 0, label.y = 0.1, digits = 2)) +
facet_grid(rows = vars(comparison_celltype), cols = vars(contrast3)) +
coord_cartesian(ylim = c(-0.6,0.2)) + geom_hline(yintercept = 0) +
theme(legend.position = "none") + ggtitle("Differential to T21 by XIST distance") +
theme(text = element_text(size = 6), legend.text = element_text(size = 6), axis.title = element_text(size = 7, face = "bold"),
axis.text = element_text(size = 6)) #+ theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6))
ggsave(filename = paste(testn, label, "dist.pdf", sep = "_"), path=outdir, width = 2.4, height = hght, units = "in")
allsubset |> filter(chromosome_name == chr & testname == testn) |>
group_by(contrast) |> filter(!duplicated(gene_symbol)) |> ungroup() |> rename(avgL2FC = "avg_log2FC") |>
pivot_wider(id_cols =c(gene_symbol, comparison_celltype), names_from = contrast3, values_from = c(avgL2FC, power)) |>
filter(!is.na(avgL2FC_euploid) & !is.na(avgL2FC_day0) & !is.na(avgL2FC_pDAPT)) |>
pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid), names_to = c("Var", ".value"), names_sep="_") |>
pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, Var), names_to = "correction", values_to = "value") |>
pivot_wider(id_cols = c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, correction), names_from = Var, values_from = value) |>
mutate(test = sign(avgL2FC)*power, test_euploid = sign(avgL2FC_euploid)*power_euploid) |>
#filter(power_euploid >= 0.1) |>
ggscatter(x="avgL2FC_euploid", y = "avgL2FC", #size="power_euploid",
color= "power", alpha = "power_euploid",
add.params = list(color = "black", size = 0.5), font.label = c(7, "plain", "black"), add = "reg.line", cor.coef.size =2,
cor.coef = T, cor.method = "pearson", cor.coef.coord = c(-2.5,2), ggtheme = theme_classic()) +
gradient_color(rev(viridis::viridis(11))) + #coord_cartesian(xlim = c(-limit, limit), ylim = c(-limit,limit)) +
facet_grid(cols=vars(correction), rows=vars(comparison_celltype), scales = "free") + theme(text = element_text(size=8))
ggsave(filename = paste(testn, label, "chr21vseuploid.pdf", sep = "_"), path=outdir, width = 3.6, height = 3.6, units = "in")
allsubset |> filter(chromosome_name != chr & testname == testn) |>
group_by(contrast) |> filter(!duplicated(gene_symbol)) |> ungroup() |> rename(avgL2FC = "avg_log2FC") |>
pivot_wider(id_cols =c(gene_symbol, comparison_celltype), names_from = contrast3, values_from = c(avgL2FC, power)) |>
filter(!is.na(avgL2FC_euploid) & !is.na(avgL2FC_day0) & !is.na(avgL2FC_pDAPT)) |>
pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid), names_to = c("Var", ".value"), names_sep="_") |>
pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, Var), names_to = "correction", values_to = "value") |>
pivot_wider(id_cols = c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, correction), names_from = Var, values_from = value) |>
mutate(test = sign(avgL2FC)*power, test_euploid = sign(avgL2FC_euploid)*power_euploid) |>
filter(power_euploid >= 0.1) |>
ggscatter(x="avgL2FC_euploid", y = "avgL2FC", #size="power_euploid",
color= "power", alpha = "power_euploid",
add.params = list(color = "black", size = 0.5), font.label = c(7, "plain", "black"), add = "reg.line", cor.coef.size =2,
cor.coef = T, cor.method = "pearson", cor.coef.coord = c(-2.5,2), ggtheme = theme_classic()) +
# scale_color_gradient2(low = "blue", high = "red") +
gradient_color(rev(viridis::viridis(11))) + #coord_cartesian(xlim = c(-limit, limit), ylim = c(-limit,limit)) +
facet_grid(cols=vars(correction), rows=vars(comparison_celltype), scales = "free") + theme(text = element_text(size=8))
ggsave(filename = paste(testn, label, "allvsd21.pdf", sep = "_"), path=outdir, width = 3.6, height = 3.6, units = "in")
}
SCT<- bind_rows(list(make_comb_df(SCT_data_t21_ct),
make_comb_df(SCT_data_d0_ct),
make_comb_df(SCT_data_pD_ct))) |>
mutate(TSS = case_when(strand == 1 ~ start_position, T ~ end_position)) |>
mutate(contrast2 = paste0(testname,"_", str_extract(contrast, "(d21|pDxist|d0)_.*"))) |>
mutate(contrast3 = str_extract(contrast, "(d21|pDxist|d0)_(t21|pDno)"))
write_csv(SCT, paste0(outdir,"/tables/SCT.csv.gz"))
fx.chr_plots("ct", SCT, 21, "SCT_data", 3.2)
SCT2<- bind_rows(list(make_comb_df(SCT_data_t21_ct2),
make_comb_df(SCT_data_d0_ct2),
make_comb_df(SCT_data_pD_ct2))) |>
mutate(TSS = case_when(strand == 1 ~ start_position, T ~ end_position)) |>
mutate(contrast2 = paste0(testname,"_", str_extract(contrast, "(d21|pDxist|d0)_.*"))) |>
mutate(contrast3 = str_extract(contrast, "(d21|pDxist|d0)_(t21|pDno)"))
write_csv(SCT2, paste0(outdir,"/tables/SCT2.csv.gz"))
fx.chr_plots("ct2", SCT2, 21, "SCT_data", 3.2)
#GSEA
require(metap)
vectorized_sumlog <- function(x, y) {
if (length(x) != length(y)) {
stop("lengths of lists must be the same")
}
combined_pvals <- c()
for (i in 1:length(x)) {
combined_pvals <- c(combined_pvals, (metap::sumlog(c(x[i], y[i])))$p)
}
return(combined_pvals)
}
fx.labels1<- Vectorize(function(X) {
s <- toupper(sapply(strsplit(str_replace(X, "_", ":"), "[:]"), getElement, 1 ))
abbreviate(s, minlength = 25, dot = T)
})
fx.labels2<- Vectorize(function(X) {
s <- toupper(sapply(strsplit(str_replace(X, "_", ":"), "[:]"), getElement, 2))
s<-str_replace_all(s, "_", "")
abbreviate(s, minlength = 25, dot = T)
})
fx.labels3<- Vectorize(function(X) {
#s <- toupper(sapply(strsplit(str_replace(X, "_", ":"), "[:]"), getElement, 2))
s<-str_replace_all(X, "_", "")
abbreviate(s, minlength = 25, dot = T)
})
fx.gpfx<-function(gsea_resultt, typee, slcut, qcut, excluded, vc.collections, hght)
{
gsea_with_cat <- gsea_resultt|>
filter(data_type == typee) |>
dplyr::mutate(category = dplyr::case_when(
str_detect(ID, "_STRESS") ~ "stress",
str_detect(ID, "ENDOPLASM") ~ "ER",
str_detect(ID, "chr21") ~ "chr21",
str_detect(ID, "TRANSLATION") ~ "translation",
str_detect(ID, "RIBOSOM") ~ "translation",
str_detect(ID, "SERINE") & (!str_detect(ID, "SERINE_THREONINE")) ~ "serine",
str_detect(ID, "MITOC") ~ "mitochondria",
str_detect(ID, "NRF2") ~ "stress",
str_detect(ID, "APOPTO") | str_detect(ID, "PROGRAMMED_CELL_DEATH") ~ "apoptosis",
T ~ "nocat"
))
gsea_with_cat <- gsea_with_cat |>
mutate(Collection = case_when(str_count(ID,"_") == 0 ~ ID,
T~fx.labels1(ID)))
if(excluded == "nocat"){vc.collections<-gsea_with_cat |> pull(Collection) |> unique()}
if(excluded != "nocat"){vc.collections<-vc.collections}
categories_to_view<-gsea_with_cat |>
filter(Collection %in% vc.collections | str_detect(ID, "^chr21")) |>
group_by(ID, cell_type) |>
mutate(min_q_val = min(qvalues, na.rm=F),
mean_NES = mean(NES, na.rm = F),
ncount = n()) |>
filter(ncount == 3) |>
pivot_wider(id_cols = c(ID, cell_type, min_q_val), names_from = c(comparison), values_from = c(NES, qvalues, p.adjust)) |>
#filter(min_q_val <= qcut)|>
ungroup()
#if(excluded == "nocat"){categories_to_view$mean_q_val_t21<-qcut}
categories_to_view<-categories_to_view |> #dplyr::select(-c(names(which(duplicated(t(categories_to_view)))))) |>
mutate(pD_t21 = vectorized_sumlog(p.adjust_pD, p.adjust_t21),
d0_t21 = vectorized_sumlog(p.adjust_d0, p.adjust_t21),
pD_d0 = vectorized_sumlog(p.adjust_d0, p.adjust_pD)) |> filter(qvalues_d0 <= qcut | qvalues_t21 <= qcut)
vc.neurons<-categories_to_view |> filter(cell_type == "Neurons" & (pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)) |> pull(ID)
vc.Astrocytes<-categories_to_view |> filter(cell_type == "Astrocytes" & (pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)) |> pull(ID)
vc.NPCs<-categories_to_view |> filter(cell_type == "NPCs" & (pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)) |> pull(ID)
vc.Other<-categories_to_view |> filter(cell_type == "Other" & (pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)) |> pull(ID)
gsea_with_cat |> # filter((cell_type == "Astrocytes" & ID %in% vc.Astrocytes) |
# (cell_type == "Neurons" & ID %in% vc.neurons) |
# (cell_type == "NPCs" & ID %in% vc.NPCs) |
# (cell_type == "Other" & ID %in% vc.Other)) |>
pivot_wider(id_cols =c(ID, cell_type, category), names_from = comparison, values_from = c(NES, qvalues)) |>
filter(qvalues_t21 <= qcut) |>
pivot_longer(-c(ID, cell_type, category, qvalues_t21, NES_t21), names_to = c("Var", ".value"), names_sep="_") |>
pivot_longer(-c(ID, cell_type, category, qvalues_t21, NES_t21, Var), names_to = "correction", values_to = "value") |>
pivot_wider(id_cols = c(ID, cell_type, category, qvalues_t21, NES_t21, correction), names_from = Var, values_from = value) |>
mutate(log10q = pmin(-log(qvalues,10),3), log10_q_euploid = pmin(-log(qvalues_t21,10),3)) |>
ggscatter(x="NES_t21", y = "NES", color= "qvalues", alpha = "log10_q_euploid", #alpha = 0.5,
add.params = list(color = "black", size = 0.5), font.label = c(8, "plain", "black"), add = "reg.line",
cor.coef.size =2, cor.coef = T, cor.method = "pearson", cor.coef.coord = c(-2.5,0), ggtheme = theme_classic()) +
gradient_color(viridis::plasma(11)) + #coord_cartesian(xlim = c(-limit, limit), ylim = c(-limit,limit)) +
facet_grid(cols=vars(correction), rows=vars(cell_type), scales = "free") + theme(text = element_text(size=8))
#ggsave(filename = paste(testn, label, "gsea_all.pdf", sep = "_"), path=outdir, width = 4, height = hght, units = "in")
ggsave(paste0(deparse(substitute(gsea_resultt)),typee,qcut,excluded,slcut,"scatter.pdf",collapse="_"), path=outdir, width=3.6, height=hght, units="in")
gsea_with_cat<-gsea_with_cat |>
# pivot_wider(id_cols =c(ID, cell_type, category), names_from = comparison, values_from = c(NES, qvalues)) |>
#filter(qvalues_t21 <= qcut & |>
filter( category != excluded & #ID %in% c(vc.neurons, vc.Astrocytes, vc.NPCs, vc.Other))
(cell_type == "Astrocytes" & ID %in% vc.Astrocytes) |
(cell_type == "Neurons" & ID %in% vc.neurons) |
(cell_type == "NPCs" & ID %in% vc.NPCs) |
(cell_type == "Other" & ID %in% vc.Other))
if(excluded != "nocat")
{
gsea_with_cat$category<-gsea_with_cat$cell_type
gsea_with_cat$cell_type<-gsea_with_cat$comparison
# gsea_with_cat<-gsea_with_cat |> filter(qvalues <= qcut)
}
gsea_with_cat |> mutate(comparison = case_when(comparison == "t21" ~ "euploid", T ~ comparison)) |>
mutate(cell_type = case_when(cell_type == "t21" ~ "euploid", T ~ cell_type)) |>
mutate(ID = case_when(str_count(ID,"_") == 0 ~ paste0("CHR_",ID), T~ID)) |>
mutate(ID = fx.labels2(ID)) |>
mutate(Score = -log10(qvalues)*sign(NES)) |>
mutate(Score2 = pmin(-log10(qvalues),3)) |>
mutate(ID = fct_reorder(ID, NES)) |>
ggplot(aes(x=NES, y=ID)) +
geom_segment(aes(x=0, y=ID, yend = ID, xend=NES), color="black", size=.4) +
geom_point(aes(color=as.factor(sign(NES)), alpha = Score2)) +
#scale_color_gradient2(low="blue",mid = "white", high="red") +
scale_fill_aaas() +
scale_color_aaas() +
facet_grid(rows=vars(category), cols = vars(cell_type), scales="free", space="free", drop = T) +
geom_vline(xintercept = c(0), alpha=0.5) +
theme(legend.position = "right") +
#theme(axis.text.y = element_blank()) +
theme(axis.text.y = element_text(size = 6), axis.title.y = element_blank()) +
scale_x_continuous(expand = c(.1,.1))
ggsave(paste0(deparse(substitute(gsea_resultt)),typee,qcut,excluded,slcut,"gp.pdf",collapse="_"), path=outdir, width=7.2, height=6.8, units="in")
}
gsea_results_SCT <- tibble()
for(data_type in c("data")) {
for (comparison_name in c("t21", "pD", "d0")) {
print(comparison_name)
test_df <- make_comb_df(eval(as.name(sprintf("SCT_%s_%s_ct", data_type, comparison_name)))) |>
#test_df <- SCT |> filter(str_detect(contrast3, comparison_name) & str_detect(testname, data_type)) |>
group_by(comparison_celltype) |>
arrange(desc(test))
genes <- split(test_df |> ungroup() |> dplyr::select(gene_symbol, test) |> deframe(), test_df$comparison_celltype)
for (celltype in names(genes)) {
gc()
gl <- genes[[celltype]]
set.seed(42)
gsea_result <- GSEA(gl, TERM2GENE = msig_terms, pvalueCutoff = 1,
pAdjustMethod = "BH", by="fgsea", seed=T, eps=1e-50)
gsea_results_SCT <- gsea_results_SCT |>
rbind(
gsea_result |>
as_tibble() |>
mutate(comparison = comparison_name,
data_type = data_type,
cell_type = celltype)
)
}
}
}
gc()
write_csv(gsea_results_SCT, paste0(outdir,"/tables/gsea_SCT_celltype.csv.gz"))
gsea_results_SCT2 <- tibble()
for(data_type in c("data")) {
for (comparison_name in c("t21", "pD", "d0")) {
print(comparison_name)
test_df <- make_comb_df(eval(as.name(sprintf("SCT_%s_%s_ct2", data_type, comparison_name)))) |>
#test_df <- SCT2 |> filter(str_detect(contrast3, comparison_name) & str_detect(testname, data_type)) |>
group_by(comparison_celltype) |>
arrange(desc(test))
genes <- split(test_df |> ungroup() |> dplyr::select(gene_symbol, test) |> deframe(), test_df$comparison_celltype)
for (celltype in names(genes)) {
gc()
gl <- genes[[celltype]]
set.seed(42)
gsea_result <- GSEA(gl, TERM2GENE = msig_terms, pvalueCutoff = 1,
pAdjustMethod = "BH", by="fgsea", seed=T, eps=1e-50)
gsea_results_SCT2 <- gsea_results_SCT2 |>
rbind(
gsea_result |>
as_tibble() |>
mutate(comparison = comparison_name,
data_type = data_type,
cell_type = celltype)
)
}
}
}
gc()
write_csv(gsea_results_SCT2, paste0(outdir,"/tables/gsea_SCT_celltype2.csv.gz"))
qcut<-0.1
slcut<-0.005
# excluded<-"nocat"
excluded<-"_"
vc.collections<-collections
fx.gpfx(gsea_results_SCT, "data", slcut, qcut, excluded, vc.collections, 3.6)
fx.gpfx(gsea_results_SCT2, "data", slcut, qcut, excluded, vc.collections, 3.6)
}