Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
T21-XIST/T21_seurat_anchor_SCT_integration_sp.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1595 lines (1386 sloc)
75.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## set working directory | |
setwd(dirname(unlist(rstudioapi::getSourceEditorContext()['path']))) | |
## --------------------------- | |
## load up the packages we will need: (uncomment as required) | |
require(tidyverse); | |
theme_set( | |
theme_classic() + | |
theme( | |
text = element_text(size = 6, family = "sans",), | |
plot.title = element_text(hjust = 0.5, size = 8, family = "sans", face = "bold"), | |
axis.title = element_text(size = 7, face = "bold"), | |
axis.text = element_text(size = 6), | |
legend.spacing.x = unit(0, "cm"), | |
legend.spacing.y = unit(2, "mm"), | |
legend.key.height = unit(4, "mm"), | |
legend.margin = margin(0, 0, 0, 0), | |
axis.line = element_line(size = 0), | |
strip.background = element_blank(), | |
strip.text = element_text(margin = margin(1, 1, 1, 1, unit = "mm")), | |
panel.border = element_rect(color = "black", fill = NA, size = .5) | |
) | |
) | |
require(ggsci) | |
#require(scales) | |
require(ggthemes) | |
library(Seurat) | |
library(SeuratData) | |
library(patchwork) | |
library(HGNChelper) | |
library(openxlsx) | |
## --------------------------- | |
gene_info_df <- read_csv("external_tables/gene_info_df.csv.gz") | |
# Adding the data to Seurat | |
mat_path <- "input_data/parsebio_scrnaseq/real_outputs/combined/all-well/DGE_filtered" | |
mat <- ReadParseBio(mat_path) | |
# Check to see if empty gene names are present, add name if so. | |
table(rownames(mat) == "") | |
rownames(mat)[rownames(mat) == ""] <- "unknown" | |
# Read in cell meta data | |
cell_meta <- read.csv(paste0(mat_path, "/cell_metadata.csv"), row.names = 1) | |
# Create object | |
t21neu <- CreateSeuratObject(mat, min_genes = 100, min_cells = 100, | |
names.feild = 0, meta.data = cell_meta) | |
#Setting our initial cell class to a single type, this will changer after clustering. | |
t21neu@meta.data$orig.ident <- factor(rep("t21neu", nrow(t21neu@meta.data))) | |
Idents(t21neu) <- t21neu@meta.data$orig.ident | |
saveRDS(t21neu, "tables/seurat_obj_before_QC.RDS") | |
t21neu_pre <- readRDS("tables/seurat_obj_before_QC.RDS") | |
library(clusterProfiler) | |
library(msigdbr) | |
msig_df <- msigdbr(species = "Homo sapiens") | |
msig_terms <- msig_df %>% | |
filter(gs_cat == "H" | | |
gs_cat == "C1" | | |
(gs_cat == "C2" & str_detect(gs_subcat, "^CP:")) | | |
#(gs_cat == "C3" & str_detect(gs_subcat, "^TFT:")) | | |
#gs_cat == "C4" | | |
(gs_cat == "C5" & str_detect(gs_subcat, "^HPO"))# | gs_cat == "C8" | |
) %>% | |
dplyr::select(gs_name, gene_symbol) | |
rm(msig_df) | |
cust.name<-"fin13" | |
vc.regress<-c("MT") | |
min.xist<-2 | |
RNAvSCT<-"integrated" | |
vc.tissue<-"Burke" | |
collections<-c("WP","REACTOME","KEGG", "HALLMARK", "BIOCARTA", "PID") | |
system.time(fx.master(cust.name, vc.regress, min.xist, RNAvSCT, vc.tissue, collections)) | |
fx.master<-function(cust.name, vc.regress, min.xist, RNAvSCT, vc.tissue, collections) | |
{ | |
outdir<-paste0(c(cust.name, vc.tissue, RNAvSCT,"with",vc.regress), collapse = "_") | |
dir.create(outdir) | |
dir.create(paste0(outdir,"/tables")) | |
vc.nameBYchr<-setNames(gene_info_df$hgnc_symbol, gene_info_df$chromosome_name) | |
ls.nameBYchr<-split(vc.nameBYchr, names(vc.nameBYchr)) | |
ls.countsBYchr<-lapply(ls.nameBYchr,function(X) { | |
fraction <- Matrix::colSums(t21neu_pre[as.vector(X), ])/Matrix::colSums(t21neu_pre) | |
}) | |
df.CHRfrac<-as.data.frame(ls.countsBYchr) | |
rb.genes <- rownames(t21neu_pre)[grep("^RP[SL]",rownames(t21neu_pre))] | |
rpsl<- Matrix::colSums(t21neu_pre[rb.genes, ])/Matrix::colSums(t21neu_pre) | |
rn5.genes <- rownames(t21neu_pre)[grep("^RNA5S",rownames(t21neu_pre))] | |
rn5s<- Matrix::colSums(t21neu_pre[rn5.genes, ])/Matrix::colSums(t21neu_pre) | |
df.neu<-df.CHRfrac |> rownames_to_column("cell_id") |> | |
left_join(t21neu_pre@meta.data |> rownames_to_column("cell_id"), by="cell_id") |> | |
left_join(cbind(rpsl, rn5s) |> as.data.frame() |> rownames_to_column("cell_id"), by="cell_id") |> | |
column_to_rownames("cell_id") | |
t21neu_pre@meta.data<- df.neu |> mutate(dox = str_detect(sample, "_dox")) |> | |
mutate(sample = case_when(str_detect(sample, "198_1") ~ "D21a", | |
str_detect(sample, "198_2") ~ "D21b", | |
str_detect(sample, "nodox") ~ "T21", | |
str_detect(sample, "d0") ~ ".day0", | |
str_detect(sample, "pD") ~ "pDAPT")) |> | |
mutate(mcount_ratio = mread_count/tscp_count) | |
t21neu_removed <- subset(t21neu_pre, subset = nFeature_RNA > 7500 | nFeature_RNA < 1500 | nCount_RNA < 2500 | MT > 0.1) | |
t21neu <-subset(t21neu_pre, subset = nFeature_RNA <= 7500 & nFeature_RNA >= 1500 & nCount_RNA >= 2500 & MT <= 0.1) | |
tb.CHR<-pivot_longer(t21neu_pre@meta.data |> rownames_to_column("cell_id"), | |
c(colnames(df.CHRfrac),"rpsl", "rn5s", "gene_count", "tscp_count", "mcount_ratio")) | |
ggplot(tb.CHR, aes(x=value, color = sample)) + stat_ecdf(geom = "step") + | |
facet_wrap(vars(name), scales="free_x") | |
ggsave("count_ecdf.pdf", path=outdir, height = 5, width = 5, units="in") | |
ggplot(t21neu_removed@meta.data, aes(x=MT, fill=sample)) + | |
geom_histogram() + facet_wrap(vars(sample), scales="free") + | |
geom_vline(xintercept = 0.1) | |
ggsave("MT_removed.pdf", path=outdir, height = 3, width = 3.6, units="in") | |
ggplot(t21neu_removed@meta.data, aes(x=gene_count, fill=sample)) + | |
geom_histogram() + facet_wrap(vars(sample), scales="free") + | |
geom_vline(xintercept = c(7500)) | |
ggsave("gene_count_removed.pdf", path=outdir, height = 3, width = 3.6, units="in") | |
ggplot(t21neu_removed@meta.data, aes(x=tscp_count, fill=sample)) + | |
geom_histogram() + facet_wrap(vars(sample), scales="free_y") + | |
geom_vline(xintercept = c(2500)) | |
ggsave("tscp_count_removed.pdf", path=outdir, height = 3, width = 3.6, units="in") | |
ggplot(t21neu_pre@meta.data, aes(y=tscp_count, x=sample, color = sample)) + | |
geom_boxplot(varwidth = T) + | |
stat_summary(fun.data = function(x) { | |
data.frame(label=signif(quantile(x, prob=c(0.1, 0.2, .3,.4,.5,.6,.7,.8,.9), type=1),4), y=c(seq(1,9)*(-3000))) | |
}, geom = "text", | |
size=3) + geom_hline(yintercept = c(3200)) | |
ggsave("tscp_count_dis.pdf", path=outdir, height = 3, width =3.6, units="in") | |
rm(tb.CHR) | |
rm(vc.nameBYchr) | |
rm(ls.nameBYchr) | |
gc() | |
#based on: | |
#https://satijalab.org/seurat/articles/sctransform_v2_vignette.html | |
#https://github.com/satijalab/seurat/issues/5879 | |
t21neu_list <- SplitObject(t21neu, split.by = "sample") | |
t21neu_list <- lapply(t21neu_list, FUN = function(x) { | |
x <- NormalizeData(x, normalization.method = "RC", scale.factor = 1e6) | |
x<-CellCycleScoring(x, s.features = cc.genes.updated.2019$s.genes, | |
g2m.features = cc.genes.updated.2019$g2m.genes, | |
set.ident = F) | |
x@meta.data<-x@meta.data |> mutate(CCdiff = (S.Score-G2M.Score)) | |
#x<-FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000) | |
#x<-ScaleData(x, vars.to.regress = vc.regress) | |
x<-SCTransform(x, vst.flavor = "v2", vars.to.regress = vc.regress, return.only.var.genes =F, verbose = F) | |
}) | |
features <- SelectIntegrationFeatures(object.list = t21neu_list, nfeatures = 3000) | |
t21neu_list <- PrepSCTIntegration(object.list = t21neu_list, anchor.features = features) | |
t21neu_anchors <- FindIntegrationAnchors(object.list = t21neu_list, normalization.method = "SCT", | |
anchor.features = features, | |
k.filter = min(t21neu@meta.data$sample |> table())) | |
t21neu_combined <- IntegrateData(anchorset = t21neu_anchors, normalization.method = "SCT") | |
t21neu_combined <- RunPCA(t21neu_combined, verbose = FALSE, features = features) | |
t21neu_combined <- RunUMAP(t21neu_combined, reduction = "pca", dims = 1:30) | |
t21neu_combined <- FindNeighbors(t21neu_combined, reduction = "pca", dims = 1:30) | |
t21neu_combined <- FindClusters(t21neu_combined, resolution = 0.5) | |
rm(t21neu_pre) | |
rm(t21neu_removed) | |
rm(t21neu) | |
rm(t21neu_list) | |
rm(t21neu_anchors) | |
gc() | |
# Determine metrics to plot present in seurat_control@meta.data | |
require(cowplot) | |
metrics <- c("gene_count", "tscp_count", "mcount_ratio", "S.Score", "G2M.Score", "CCdiff", | |
"MT", "rn5s", "rpsl") | |
# Extract the UMAP coordinates for each cell and include information about the metrics to plot | |
qc_data <- FetchData(t21neu_combined, | |
vars = c(metrics, "ident", "UMAP_1", "UMAP_2")) | |
# Adding cluster label to center of cluster on UMAP | |
umap_label <- FetchData(t21neu_combined, | |
vars = c("ident", "UMAP_1", "UMAP_2")) %>% | |
as.data.frame() %>% | |
group_by(ident) %>% | |
summarise(x=mean(UMAP_1), y=mean(UMAP_2)) | |
# Plot a UMAP plot for each metric | |
map(metrics, function(qc){ | |
ggplot(qc_data, | |
aes(UMAP_1, UMAP_2)) + | |
geom_point(aes_string(color=qc), | |
alpha = 0.7) + | |
scale_color_gradient(guide = "none", | |
low = "grey90", | |
high = "blue") + | |
geom_text(data=umap_label, | |
aes(label=ident, x, y)) + | |
ggtitle(qc) | |
}) %>% plot_grid(plotlist = .) | |
ggsave("cowplot.pdf", path=outdir, width=3.6, height=3.6, units="in") | |
p1 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "sample") | |
p2 <- DimPlot(t21neu_combined, reduction = "umap", label = TRUE, repel = TRUE) | |
p1 + p2 | |
DimPlot(t21neu_combined, reduction = "umap", split.by = "sample") + | |
theme(legend.position = "bottom") + scale_color_brewer(palette = "Paired") | |
ggsave("umap_bysample.pdf", path=outdir, width=7.2, height=4, units="in") | |
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R") | |
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R") | |
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/auto_detect_tissue_type.R") | |
# get cell-type-specific gene sets from our in-built database (DB) | |
# gs_list <- gene_sets_prepare("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_short.xlsx", "Brain") # e.g. Immune system, Liver, Pancreas, Kidney, Eye, Brain | |
# Burke custom gene set | |
t21neu_combined <- PrepSCTFindMarkers(t21neu_combined) | |
DefaultAssay(t21neu_combined)<-RNAvSCT | |
tissue_guess = auto_detect_tissue_type(path_to_db_file = "../scRNAseq/external_tables/ScTypeDB_full.xlsx", | |
seuratObject = t21neu_combined, scaled = T, | |
assay = RNAvSCT) | |
print(tissue_guess) | |
tissue_type = vc.tissue | |
gs_list <- gene_sets_prepare("../scRNAseq/external_tables/ScTypeDB_short.xlsx", tissue_type) | |
# assign cell types | |
es.max <- sctype_score(scRNAseqData = t21neu_combined[[DefaultAssay(t21neu_combined)]]@scale.data, | |
scaled = TRUE, | |
gs = gs_list$gs_positive, | |
gs2 = gs_list$gs_negative) | |
# merge by cluster | |
cL_results <- do.call("rbind", lapply(unique(t21neu_combined@meta.data$seurat_clusters), function(cl){ | |
es.max.cl = sort(rowSums(es.max[ ,rownames(t21neu_combined@meta.data[t21neu_combined@meta.data$seurat_clusters==cl, ])]), decreasing = !0) | |
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(t21neu_combined@meta.data$seurat_clusters==cl)), 10) | |
})) | |
write_csv(cL_results, paste0(outdir,"/tables/cL_results.csv")) | |
sctype_scores <- cL_results |> arrange(desc(scores)) |> group_by(cluster) |> top_n(1, scores) | |
# set low-confident (low ScType score) clusters to "unknown" | |
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] <- "Unknown" | |
print(sctype_scores[,1:3]) | |
t21neu_combined@meta.data$customclassif <- "" | |
for(j in unique(sctype_scores$cluster)){ | |
cl_type <- sctype_scores[sctype_scores$cluster==j,]; | |
t21neu_combined@meta.data$customclassif[t21neu_combined@meta.data$seurat_clusters == j] <- as.character(cl_type$type[1]) | |
} | |
xist_threshold <- min.xist | |
t21neu_combined@meta.data<- t21neu_combined@meta.data |> | |
mutate(simplified_celltype = #"neuron") #customclassif) |> | |
case_when(#customclassif %in% c("Endothelial cells","Neuroblasts") ~ "Unknown", | |
#customclassif %in% c("Immature neurons", "Neuroblasts") ~ "neuron", | |
# #customclassif %in% c("Tanycytes", "Non myelinating Schwann cells") ~ "glia", | |
(str_detect(customclassif, "euron") | | |
str_detect(customclassif, "Fetal_quiescent")) | |
~ "Neurons", | |
(str_detect(customclassif, "stem") | | |
str_detect(customclassif, "ndothelial") | | |
str_detect(customclassif, "Neuroepithelial") | | |
str_detect(customclassif, "Neural Progenitor") | | |
str_detect(customclassif, "NPC") | | |
str_detect(customclassif, "Fetal_replicating") | | |
str_detect(customclassif, "Radial")) ~ "NPCs", | |
(#str_detect(customclassif, "Tanycytes") | | |
#str_detect(customclassif, "Oligodendro") | | |
#str_detect(customclassif, "OPC") | | |
str_detect(customclassif, "Astrocyte"))~ "Astrocytes", | |
T ~ "Other")) |> | |
rownames_to_column("cell_id") |> | |
left_join(t21neu_combined$RNA@counts["XIST",] |> | |
enframe(name="cell_id", value="XIST_counts")) |> | |
mutate(XIST_status = case_when(XIST_counts >= xist_threshold ~ T, | |
XIST_counts == 0 ~ F, | |
T ~ NA)) |> | |
column_to_rownames("cell_id") | |
sample.cols<-setNames(c("#E69F00", "#56B4E9", "#0072B2", "#D55E00", "#CC79A7"), | |
nm = names(t21neu_combined$sample |> table())) | |
p1 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "XIST_status", shuffle = F, pt.size = 0.1) + | |
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) + | |
scale_color_jco() | |
p2 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "Phase", shuffle = T, pt.size = 0.1) + | |
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) + | |
scale_color_jama() | |
p3 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "simplified_celltype", shuffle = T, pt.size = 0.1) + | |
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) + | |
scale_color_tron() | |
p4 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "customclassif", shuffle = T, pt.size = 0.1) + | |
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) + | |
scale_color_brewer(palette = "Set2") | |
p5 <- DimPlot(t21neu_combined, reduction = "umap", group.by = "sample", shuffle = T, pt.size = 0.1) + | |
scale_color_manual(values = sample.cols) + | |
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) | |
p6<-DimPlot(t21neu_combined, reduction = "umap", label = F, shuffle = T, pt.size = 0.1) + | |
theme(legend.position = "top", text = element_text(size = 8), axis.title = element_text(size=8)) + | |
scale_color_brewer(palette = "Paired") | |
#p1 + p5 + p6 + p4 + p3 + p2 | |
p6 + p4 + | |
p2 + p3 + | |
p1 + p5 + plot_layout(ncol = 2) | |
ggsave("umaps.pdf", path=outdir, width=3.6, height=6.5, units="in") | |
b1<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |> | |
filter(!is.na(XIST_status)) |> | |
group_by(sample, simplified_celltype, XIST_status) |> | |
tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio = signif(100*n/sum,2)) |> ungroup() |> | |
mutate(label_pos = case_when(XIST_status ~ (ratio-3), T ~ (-50))) |> | |
ggplot(aes(x = sample, y=ratio, fill = XIST_status, label = ratio)) + geom_bar(stat = "identity") + | |
geom_text(aes(y = label_pos, label = ratio), size=2) + | |
facet_wrap(vars(simplified_celltype)) + scale_fill_jco() + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) + | |
theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) + | |
theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") + | |
theme(legend.position = "top", legend.title=element_blank(), axis.title = element_blank(), legend.key.height = unit(0.1, 'in')) | |
#ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype", | |
# label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill()) | |
#ggsave("XIST_bysample.pdf", path=outdir, width=1.8, height=2.4, units="in") | |
b2<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |> | |
filter(!is.na(simplified_celltype)) |> | |
group_by(sample, customclassif, simplified_celltype) |> | |
tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio = round(100*n/sum,digits = 0)) |> ungroup() |> | |
mutate(label_pos = case_when(XIST_status ~ (ratio-3), T ~ (-10))) |> | |
ggplot(aes(x = sample, y=ratio, fill = customclassif)) + geom_bar(stat = "identity") +# geom_text(aes(y = label_pos, label = ratio), size=2) + | |
facet_wrap(vars(simplified_celltype)) + scale_fill_brewer(palette = "Set2") + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) + | |
theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) + | |
theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") + | |
theme(legend.position = "top", legend.direction = "horizontal", legend.title=element_blank(),legend.key.height = unit(0.1, 'in')) | |
#ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype", | |
# label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill()) | |
#ggsave("Customclassif_bysample.pdf", path=outdir, width=1.8, height=2.57, units="in") | |
b3<-t21neu_combined@meta.data |> mutate(sample = case_when(sample == ".day0" ~ "day0", T ~ sample)) |> | |
filter(!is.na(Phase)) |> | |
group_by(sample, Phase, simplified_celltype) |> | |
tally() |> group_by(sample, simplified_celltype) |> mutate(sum = sum(n)) |> mutate(ratio_label = signif(100*n/sum,digits =2), ratio = 100*n/sum) |> ungroup() |> | |
mutate(label_pos = case_when(Phase == "G1" ~ 95, T ~ (-10))) |> | |
ggplot(aes(x = sample, y=ratio, fill = Phase)) + geom_bar(stat = "identity") + geom_text(aes(y = label_pos, label = ratio_label, color="white"), size=2) + | |
facet_wrap(vars(simplified_celltype)) + scale_fill_jama() + scale_color_manual(values = "white") + coord_cartesian(ylim=c(0,100)) + theme(text=element_text(size=6)) + | |
theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) + | |
theme(axis.text.y = element_text(size = 6), axis.title.x = element_blank()) + labs(y="% nuclei") + | |
theme(legend.position = "top", legend.title=element_blank(), axis.title = element_blank(), legend.key.height = unit(0.1, 'in')) | |
#ggbarplot("sample", "ratio", color = "white", fill = "XIST_status", ggtheme = theme_classic(), facet.by = "simplified_celltype", | |
# label = T, lab.pos = "in", lab.col = "black", palette = "jco", position = position_fill()) | |
#ggsave("Phase_bysample.pdf", path=outdir, width=1.8, height=2.4, units="in") | |
ggsave(b2+b3+b1, filename = "supp_plots.pdf",path = outdir, height = 2.5, width = 5.4, units="in") | |
#DimPlot(t21neu_combined, reduction = "umap", split.by = "sample", group.by = "customclassif") | |
DimPlot(t21neu_combined, reduction = "umap", split.by = "sample", group.by = "simplified_celltype", shuffle=T) + | |
theme(legend.position = "bottom", text = element_text(size = 8)) + scale_color_tron() | |
ggsave("umap_bysample.pdf", path=outdir, width=3.6, height=3, units="in") | |
# add allelic data to the metadata | |
allelic_df <-read_csv("../parsebio_scRNAseq/tables/indv_cells_gene_ae_filtered_paternal_withneu2.csv.gz") |> | |
filter(cell_id %in% rownames(t21neu_combined@meta.data)) | |
#substr(rownames(t21neu_combined@meta.data),start = 1,nchar(rownames(t21neu_combined@meta.data))-4))) | |
patallele_fix<-allelic_df |> | |
mutate(karyo = case_when(str_detect(cell_line, "198") ~ "d21", | |
T ~ "t21")) |> | |
group_by(karyo, name) |> | |
summarise(A = sum(aCount), B = sum(bCount), | |
pat_bulk = unique(bulk_pat)) |> | |
pivot_wider(id_cols = c(name, pat_bulk), names_from = karyo, values_from = c(A, B)) |> | |
replace_na(list(A_d21 = 0, B_d21 = 0, A_t21 = 0, B_t21 = 0)) |> | |
mutate(d21_rA = A_d21/( A_d21 + B_d21), | |
d21_rB = B_d21/( A_d21 + B_d21), | |
t21_rA = A_t21/( A_t21 + B_t21), | |
t21_rB = B_t21/( A_t21 + B_t21)) |> | |
mutate(a_allele_diff=t21_rA-d21_rA, | |
b_allele_diff=t21_rB-d21_rB, | |
pat_sn = case_when(abs(a_allele_diff) < .05 & abs(b_allele_diff) < .05 ~ "nodiff", | |
a_allele_diff > 0 & b_allele_diff < 0 ~ "bCount", | |
a_allele_diff < 0 & b_allele_diff > 0 ~ "aCount", | |
T ~ "undetermined")) | |
patallele_fix |> dplyr::select(pat_bulk, pat_sn) |> table(useNA = "always") | |
patallele_fix2<-patallele_fix |> | |
mutate(pat_sn = case_when(!is.na(pat_bulk) & pat_sn %in% c("nodiff","undetermined") ~ pat_bulk, | |
is.na(pat_bulk) & pat_sn %in% c("nodiff","undetermined") ~ "drop", | |
pat_sn == pat_bulk ~ pat_sn, | |
!is.na(pat_bulk) & pat_sn != pat_bulk ~ pat_sn, | |
is.na(pat_bulk) & pat_sn %in% c("aCount","bCount") ~ pat_sn, | |
T ~ "unresolved" | |
)) |> | |
mutate(mat_sn=case_when(pat_sn == "aCount" ~ "bCount", | |
pat_sn == "bCount" ~ "aCount", | |
T ~ "drop")) | |
patallele_fix2 |> filter(pat_sn != "drop") |> dplyr::select(pat_bulk, pat_sn) |> table(useNA = "always") | |
allelic_df2 <-allelic_df |> | |
left_join(patallele_fix2 |> | |
pivot_longer(cols = c(pat_sn), names_to = "allele_type", values_to = "allele") |> | |
filter(allele_type == "pat_sn") | |
, by = "name") | |
pat_ar_df <- allelic_df2 |> | |
filter( | |
!is.na(cell_line) & !is.na(allele) | |
) |> | |
left_join(t21neu_combined@meta.data |> rownames_to_column("cell_id")) |> | |
mutate(pat_count = case_when( | |
allele == 'aCount' ~ aCount, | |
allele == 'bCount' ~ bCount | |
), | |
mat_count = totalCount - pat_count | |
# cell_id = case_when( | |
# str_detect(bam, "2k") ~ sprintf("%s__s1", cell), | |
# str_detect(bam, "5k") ~ sprintf("%s__s2", cell) | |
# ) | |
) |> | |
group_by(cell_line, cell_id) |> | |
summarise( | |
sample = unique(sample), | |
XIST_status = unique(XIST_status), | |
simplified_celltype = unique(simplified_celltype), | |
nvar = median(n_variants, na.rm = T), | |
total = sum(totalCount, na.rm = T), | |
sumPat = sum(pat_count), | |
sumMat = sum(mat_count), | |
perc_paternal = sum(pat_count, na.rm = T) / sum(totalCount, na.rm = T), | |
perc_paternal2 = sum(pat_count/totalCount, na.rm = T ), | |
pat_to_mat = sum(pat_count, na.rm = T) / sum(mat_count, na.rm = T), | |
pat_to_mat2 = mean(pat_count/mat_count, na.rm = T) | |
) |> | |
ungroup() | |
require(ggpubr) | |
pat_ar_df2<-pat_ar_df |> | |
filter(!(str_detect(sample, "D21") & XIST_status) & !(sample == ".day0" & !XIST_status)) |> | |
mutate(sample = case_when(#sample %in% c("D21a","D21b") ~ "D21", | |
sample == "pDAPT" & !XIST_status ~ "T21_pD-", | |
sample == "pDAPT" & XIST_status ~ "T21_pD+", | |
sample == ".day0" & !XIST_status ~ "T21_d0-", | |
sample == ".day0" & XIST_status ~ "T21_d0+", | |
sample == "T21" ~ "T21", | |
T ~ sample)) |> | |
filter(total > 5 & !is.na(simplified_celltype)) |> filter(!is.na(XIST_status)) |> | |
mutate(sample = case_when(sample == "D21" ~ "euploid", T ~ sample)) |> | |
mutate(XIST_status = case_when(sample %in% c("D21a","D21b") ~ "absent", T ~ as.character(XIST_status))) | |
allelic_plot<-ggplot(pat_ar_df2 #|> filter(simplified_celltype != "Other") | |
, aes(x=sample, y=perc_paternal)) + | |
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) + | |
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) + | |
scale_color_jama() + | |
stat_summary(fun.data = function(x) { | |
data.frame(label=signif(median(x),2), y=-0.1)}, | |
geom = "text", size=2) + | |
stat_compare_means(ref.group = "T21", method = "wilcox", | |
#method.args = list(alternative="less"), | |
label.y = 0.9, hide.ns = T, label = "p.format", size=2) + | |
theme(axis.ticks.x = element_blank(), | |
axis.title.x = element_blank(), | |
legend.position = "top") + | |
labs( | |
title="Paternal chr21 allele expression", | |
y="Lesser allele fraction", | |
color="XIST", | |
fill="XIST" | |
) +geom_hline(yintercept = c(0.33,0.5), alpha = 0.1) + | |
scale_x_discrete(position = "top") + ylim(c(-0.1,1)) #+ facet_grid(rows = vars(simplified_celltype), scales = "free") | |
ggsave("allelic_all.pdf", path=outdir, width=3.6, height = 1.8, units="in") | |
ggplot(pat_ar_df2, aes(x=sample, y=perc_paternal)) + | |
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) + | |
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) + | |
scale_color_jama() + | |
stat_summary(fun.data = function(x) { | |
data.frame(label=signif(median(x),2), y=-0.1)}, | |
geom = "text", size=2) + | |
stat_compare_means(ref.group = "T21", method = "wilcox", | |
#method.args = list(alternative="less"), | |
label.y = 0.9, hide.ns = T, label = "p.format", size=2) + | |
theme(axis.ticks.x = element_blank(), | |
axis.title.x = element_blank(), | |
legend.position = "top") + | |
labs( | |
title="Paternal chr21 allele expression", | |
y="Lesser allele fraction", | |
color="XIST", | |
fill="XIST" | |
) +geom_hline(yintercept = c(0.33,0.5), alpha = 0.1) + | |
scale_x_discrete(position = "top") + ylim(c(-0.1,1)) + facet_grid(rows = vars(simplified_celltype), scales = "free") | |
ggsave("allelic_by_celltype.pdf", path=outdir, width=2.5, height = 3.6, units="in") | |
rm(allelic_df) | |
rm(allelic_df2) | |
gc() | |
##ECDF plots### the combined_output_df(2) is a large file to hold in memory - | |
# maybe there's a more lightweight container for this | |
rm(combined_output_df) | |
gc() | |
combined_output_df <- t21neu_combined[["RNA"]]@data |> | |
as.data.frame() |> | |
rownames_to_column(var="gene_name") |> | |
tibble() |> | |
pivot_longer(cols = -gene_name, names_to = "cell_id", values_to = "expr_val") |> | |
left_join(t21neu_combined$sample |> enframe(name="cell_id", value = "sample_id"), | |
by="cell_id") |> | |
left_join( | |
gene_info_df |> dplyr::select(-c(description,wikigene_name)) |> | |
rename(gene_name = "hgnc_symbol") |> filter(!is.na(gene_name) & !duplicated(gene_name)) |> | |
mutate(gene_id = str_replace(ensembl_gene_id_version, "\\.[0-9]+", "")), | |
by= "gene_name") | |
gc() | |
#mean non21 expr by cell - should rename this instead of "autosome" to include X (& Y) | |
mean_autosome_expression_df <- combined_output_df |> | |
filter(!(chromosome_name %in% c(#"X", "Y", | |
"MT", | |
NA, "21"))) |> | |
#filter(expr_val > 0) |> | |
group_by(cell_id) |> | |
summarise(mean_a_expr = mean(expr_val), countA = n()) | |
#each chr21 gene by cell | |
chr21gene_bycell <- combined_output_df |> | |
filter(chromosome_name %in% c("21")) |> | |
dplyr::rename(expr21 = "expr_val") |> | |
#filter(expr21 > 0) |> | |
left_join(mean_autosome_expression_df, by="cell_id") #|> | |
#group_by(cell_id) |> | |
#summarise(mean21 = median(expr21), count21 = n()) | |
rm(combined_output_df) | |
gc() | |
### BY CELL APPROACH | |
fx_ecdf <- function(x) { | |
# Sort the input vector in ascending order | |
sorted_x <- sort(x) | |
# Calculate the unique values and their frequencies in the input vector | |
unique_x <- unique(sorted_x) | |
counts <- tabulate(match(sorted_x, unique_x)) | |
# Calculate the empirical cumulative density for each unique value | |
ecdf <- cumsum(counts)/length(x) | |
# Create a vector of empirical cumulative densities corresponding to the original input vector | |
output_ecdf <- ecdf[match(x, unique_x)] | |
# Return the vector of empirical cumulative densities | |
return(output_ecdf) | |
} | |
t21_d21_cell_ratio <- chr21gene_bycell |> | |
left_join(t21neu_combined@meta.data |> | |
as.data.frame() |> | |
rownames_to_column(var="cell_id") |> | |
as_tibble() |> | |
dplyr::select(cell_id, simplified_celltype, XIST_status, XIST_counts), | |
by = "cell_id") |> | |
group_by(sample_id, simplified_celltype, cell_id, XIST_status) |> | |
summarise(mean21 = mean(expr21), count = n(), mean_a_expr = unique(mean_a_expr)) |> | |
mutate(ratio = mean21/mean_a_expr) |> filter(!(sample_id == ".day0" & !XIST_status)) | |
tb.bycell<-t21_d21_cell_ratio |> filter(!(is.na(XIST_status) | (XIST_status & str_detect(sample_id, "[DT]21")))) |> | |
mutate(sample = case_when(#sample_id %in% c("D21a","D21b") ~ "D21", | |
sample_id == "pDAPT" & !XIST_status ~ "T21_pD-", | |
sample_id == "pDAPT" & XIST_status ~ "T21_pD+", | |
sample_id == ".day0" & !XIST_status ~ "T21_d0-", | |
sample_id == ".day0" & XIST_status ~ "T21_d0+", | |
sample_id == "T21" ~ "T21", | |
T ~ sample_id)) |> | |
mutate(XIST_status = case_when(!XIST_status & str_detect(sample_id, "D21") ~ "absent", | |
T ~ as.character(XIST_status))) | |
bp.percell<-ggplot(tb.bycell |> filter(simplified_celltype != "Other"), aes(x=sample, y=ratio, fill = XIST_status)) + | |
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) + | |
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) + | |
scale_color_jama() + | |
stat_summary(fun.data = function(x) { | |
data.frame(label=signif(median(x),2), y=0)}, | |
geom = "text", size=2) + | |
stat_compare_means(ref.group = "T21", method = "wilcox", | |
method.args = list(alternative="less"), | |
label.y = 2.2, hide.ns = T, label = "p.format", size=2) + | |
theme( axis.ticks.x = element_blank(), | |
axis.title.x = element_blank(), | |
legend.position = "top" | |
) + | |
labs( | |
title="chr21 dosage by cell", | |
y ="chr21/genome_mean ratio", | |
color="XIST", | |
fill="XIST" | |
) + scale_x_discrete(position = "top") # + facet_grid(rows = vars(simplified_celltype), scales = "free", space = "free") | |
ggsave(bp.percell, filename = "boxplot_bycell2.pdf", path=outdir, width = 3.6, height = 1.8, units = "in") | |
ggplot(tb.bycell, aes(x=sample, y=ratio, fill = XIST_status)) + | |
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) + | |
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) + | |
scale_color_jama() + | |
stat_summary(fun.data = function(x) { | |
data.frame(label=signif(median(x),2), y=0)}, | |
geom = "text", size=2) + | |
stat_compare_means(ref.group = "T21", method = "wilcox", label.x.npc = "center", | |
method.args = list(alternative="less"), label.y.npc = 0.9, | |
hide.ns = T, label = "p.signif", size=3) + | |
theme( axis.ticks.x = element_blank(), | |
axis.title.x = element_blank(), | |
legend.position = "top" | |
) + | |
labs( | |
title="chr21 dosage by cell", | |
y ="chr21/genome_mean ratio", | |
color="XIST", | |
fill="XIST" | |
) + scale_x_discrete(position = "top") + facet_grid(rows = vars(simplified_celltype), scales = "free") | |
ggsave(filename = "boxplot_bycell.pdf", path=outdir, width = 2.5, height = 4, units = "in") | |
require(NSM3) | |
test_cell<-t21_d21_cell_ratio |> ungroup() |> filter(!(sample_id == ".day0" & !XIST_status)) |> | |
group_by(sample_id, simplified_celltype, XIST_status) |> | |
add_tally() |> #filter(n>=10) |> | |
arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status) | |
# & simplified_celltype != "Other" | |
) |> | |
group_by(sample_id, simplified_celltype, XIST_status) |> | |
mutate(eCDF = fx_ecdf(ratio)) |> | |
mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |> | |
mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |> | |
ungroup() | |
refratio <-"T21" | |
testref <- test_cell |> group_by(sample_id) |> filter(simplified_celltype == "Neurons" & sample_id == refratio) | |
tested <- test_cell |> group_by(sample_id, XIST_status) |> filter(simplified_celltype == "Neurons" & sample_id != refratio) | |
ks.neurons<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
testref <- test_cell |> group_by(sample_id) |> filter(simplified_celltype == "Astrocytes" & sample_id == refratio) | |
tested <- test_cell |> group_by(sample_id, XIST_status) |> filter(simplified_celltype == "Astrocytes" & sample_id != refratio) | |
ks.glia<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
testref <- test_cell |> group_by(sample_id) |> filter(simplified_celltype == "NPCs" & sample_id == refratio) | |
tested <- test_cell |> group_by(sample_id, XIST_status) |> filter(simplified_celltype == "NPCs" & sample_id != refratio) | |
ks.rgc<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
testref <- test_cell |> group_by(sample_id) |> filter(simplified_celltype == "Other" & sample_id == refratio) | |
tested <- test_cell |> group_by(sample_id, XIST_status) |> filter(simplified_celltype == "Other" & sample_id != refratio) | |
ks.unk<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
ks21_bycell<-bind_rows(list("Neurons" = ks.neurons, "Astrocytes" = ks.glia, "NPCs" = ks.rgc, "Other" = ks.unk), .id = 'simplified_celltype') | |
df_anno_cell<-ks21_bycell |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |> ungroup() |> | |
group_by(simplified_celltype) |> mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> add_column(x = 0.75) |> mutate(y = 1-row_number()/10) | |
min.cells<-4 | |
gg.ecdf_bycell<-ggplot(test_cell |> group_by(sample_id, simplified_celltype, XIST_status) |> | |
add_tally(name = "cells") |> filter(cells>min.cells) |> ungroup() |> | |
mutate(upperCI = case_when(sample_id == refratio ~ upperCI, T ~ NA)) |> | |
mutate(lowerCI = case_when(sample_id == refratio ~ lowerCI, T ~ NA)) | |
, aes(x=ratio, y=eCDF, colour=sample_id, linetype= XIST_status)) + | |
geom_step() + geom_text(data = df_anno_cell, aes(x = x, y = y, label = ksp), size =2) + | |
geom_hline(yintercept = 0.5, alpha = 0.2) + geom_vline(xintercept = c(1,1.5), alpha =0.2) + | |
geom_ribbon(aes(ymin=lowerCI, ymax=upperCI, fill = sample_id), | |
alpha=0.2, color = NA) + | |
facet_grid(rows = vars(simplified_celltype)) + xlim(0.5,2) + theme(legend.position = "right") | |
ggsave(gg.ecdf_bycell, filename = "ecdf_bycell.pdf", path=outdir, width = 2.4, height = 3.6, units = "in") | |
test_cell2<-t21_d21_cell_ratio |> ungroup() |> | |
filter(!(sample_id == ".day0" & !XIST_status)) |> | |
group_by(sample_id, XIST_status) |> | |
add_tally() |> #filter(n>=10) |> | |
arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status) | |
& simplified_celltype != "Other" | |
) |> | |
group_by(sample_id, XIST_status) |> | |
mutate(eCDF = fx_ecdf(ratio)) |> | |
mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |> | |
mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |> | |
ungroup() | |
refratio <-"T21" | |
testref <- test_cell2 |> group_by(sample_id) |> filter(sample_id == refratio) | |
tested <- test_cell2 |> group_by(sample_id, XIST_status) |> filter(sample_id != refratio) | |
ks.all<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
df_anno_cell2<-ks21_bycell2 |> ungroup() |> filter(ksp <= 0.05) |> arrange(desc(ksd)) |> | |
mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> | |
add_column(x = 0.75) |> mutate(y = 1-row_number()/10) | |
gg.ecdf_bycell2<-ggplot(test_cell2 |> group_by(sample_id, XIST_status) |> | |
add_tally(name = "cells") |> filter(cells>min.cells) |> ungroup() |> | |
mutate(upperCI = case_when(sample_id == refratio ~ upperCI, T ~ NA)) |> | |
mutate(lowerCI = case_when(sample_id == refratio ~ lowerCI, T ~ NA)) | |
, aes(x=ratio, y=eCDF, colour=sample_id, linetype= XIST_status)) + | |
geom_step() + geom_text(data = df_anno_cell2, aes(x = x, y = y, label = ksp), size =3) + | |
geom_hline(yintercept = 0.5, alpha = 0.2) + geom_vline(xintercept = c(1,1.5), alpha =0.2) + | |
geom_ribbon(aes(ymin=lowerCI, ymax=upperCI, fill = sample_id), | |
alpha=0.2, color = NA) + xlim(0.5,2) + theme(legend.position = "right") | |
ggsave(gg.ecdf_bycell2, filename = "ecdf_bycell2.pdf", path=outdir, width = 3.2, height = 1.45, units = "in") | |
### BY GENE APPROACH | |
#mean euploid expression for each chr21 gene averaged over all cells of celltype in sample_id | |
chr21_mean_euploid_gene <- chr21gene_bycell |> | |
left_join(t21neu_combined@meta.data |> | |
as.data.frame() |> | |
rownames_to_column(var="cell_id") |> | |
as_tibble() |> | |
dplyr::select(cell_id, simplified_celltype), | |
by = "cell_id") |> | |
group_by(sample_id, simplified_celltype, gene_id, gene_name, chromosome_name, start_position) |> | |
summarise(mean21 = mean(expr21)) |> filter(mean21 > 0) |> | |
pivot_wider(id_cols = c(simplified_celltype, gene_id, gene_name, chromosome_name, start_position), names_from = sample_id, values_from = mean21) |> | |
mutate(mean21_euploid = rowMeans(cbind(D21a, D21b))) |> | |
mutate(t21_ratio = T21/mean21_euploid) |> | |
ungroup() |> | |
dplyr::select(simplified_celltype, gene_id, mean21_euploid, t21_ratio) | |
require(NSM3) | |
t21_d21_gene_ratio <- chr21gene_bycell |> | |
left_join(t21neu_combined@meta.data |> | |
as.data.frame() |> | |
rownames_to_column(var="cell_id") |> | |
as_tibble() |> | |
dplyr::select(cell_id, simplified_celltype, XIST_status), | |
by = "cell_id") |> | |
group_by(sample_id, simplified_celltype, gene_id, gene_name, chromosome_name, start_position, XIST_status) |> | |
summarise(mean21 = mean(expr21), count = n()) |> filter(mean21 > 0) |> | |
filter(count > 4) |> | |
pivot_wider(id_cols = c(simplified_celltype, gene_id, gene_name, chromosome_name, start_position, XIST_status), names_from = sample_id, values_from = c(mean21, count)) |> | |
left_join(chr21_mean_euploid_gene, | |
by=c("simplified_celltype", "gene_id")) |> | |
mutate(t21_d21_ratio = mean21_T21/mean21_euploid, | |
t21_expr = mean21_T21, | |
pD_d21_ratio = mean21_pDAPT/mean21_euploid, | |
pD_expr = mean21_pDAPT, | |
d0_d21_ratio = mean21_.day0/mean21_euploid, | |
d0_expr = mean21_.day0) |> | |
mutate(euploid1_ratio = mean21_D21a/mean21_euploid, | |
euploid2_ratio = mean21_D21b/mean21_euploid, | |
t21_pD_ratio = t21_expr/pD_expr) | |
#write_csv(t21_d21_ratio_df, "tables/t21_d21_ratio_df.csv.gz") | |
# | |
# gene_order <- t21_d21_gene_ratio |> | |
# filter(!is.infinite(pD_d21_ratio) & !is.nan(pD_d21_ratio) & | |
# simplified_celltype == "neuron") |> | |
# arrange(pD_d21_ratio) |> | |
# pull(gene_name) | |
tb.bygene<-t21_d21_gene_ratio |> ungroup() |> filter(!is.na(XIST_status)) |> | |
dplyr::select(simplified_celltype, XIST_status, gene_id, | |
t21_d21_ratio, | |
pD_d21_ratio, | |
d0_d21_ratio, | |
euploid1_ratio, | |
euploid2_ratio) |> | |
rename_with(~gsub("_d21_ratio", "", .x), .cols=ends_with("d21_ratio")) |> | |
rename_with(~gsub("_ratio", "", .x), .cols=ends_with("_ratio")) |> | |
pivot_longer(cols = -c(simplified_celltype, | |
gene_id, | |
XIST_status), names_to = "sample", values_to = "ratio") |> | |
mutate(XIST_status = case_when(str_detect(sample, "euploid") ~ "absent", | |
T ~ as.character(XIST_status))) |> filter(!is.na(ratio)) |> | |
mutate(sample = case_when( | |
sample == "euploid1" ~ "D21a", | |
sample == "euploid2" ~ "D21b", | |
sample == "pD" & XIST_status == "FALSE" ~ "T21_pD-", | |
sample == "pD" & XIST_status == "TRUE" ~ "T21_pD+", | |
sample == "d0" & XIST_status == "FALSE" ~ "skip", | |
sample == "d0" &XIST_status == "TRUE" ~ "T21_d0+", | |
sample == "t21" ~ "T21", | |
T ~ sample)) |> filter(sample != "skip") | |
bp.pergene<-ggplot(tb.bygene |> filter(simplified_celltype != "Other"), | |
aes(x=sample, y=ratio, fill = XIST_status)) + | |
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) + | |
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) + | |
scale_color_jama() + coord_cartesian(ylim = c(-0.1,4)) + | |
#stat_summary(fun = median, geom = "point", shape=23, size=1, alpha=1, aes(fill=XIST_status)) + | |
stat_summary(fun.data = function(x) { | |
data.frame(label=signif(median(x),2), y=-0.1) | |
}, | |
geom = "text", | |
size=2) + | |
stat_compare_means(ref.group = "T21", method = "wilcox", | |
method.args = list(alternative="less"), | |
label.y = 3.5, hide.ns = T, label = "p.format", size=2) + | |
theme( | |
axis.ticks.x = element_blank(), | |
axis.title.x = element_blank(), | |
legend.position = "top" | |
) + | |
labs( | |
title="chr21 dosage by gene", | |
y ="chr21_gene/genome ratio", | |
color="XIST", | |
fill="XIST" | |
) + | |
scale_x_discrete(position = "top") #+ facet_grid(rows = vars(simplified_celltype), scales = "free", space = "free") | |
ggsave(bp.pergene, filename = "boxplot_bygene2.pdf", path=outdir, width = 3.6, height = 1.8, units = "in") | |
ggplot(tb.bygene, | |
aes(x=sample, y=ratio, fill = XIST_status)) + | |
ggbeeswarm::geom_quasirandom(aes(color = XIST_status), alpha = .5, size=.5) + | |
geom_boxplot(fill = NA, notch = T, outlier.shape = NA, size=.3) + | |
scale_color_jama() + coord_cartesian(ylim = c(-0.1,4)) + | |
#stat_summary(fun = median, geom = "point", shape=23, size=1, alpha=1, aes(fill=XIST_status)) + | |
stat_summary(fun.data = function(x) { | |
data.frame(label=signif(median(x),2), y=-0) | |
}, | |
geom = "text", | |
size=2) + | |
stat_compare_means(ref.group = "T21", method = "wilcox", | |
method.args = list(alternative="less"), label.y = 3.5, | |
hide.ns = T, label = "p.signif", size=3) + | |
theme( | |
axis.ticks.x = element_blank(), | |
axis.title.x = element_blank(), | |
legend.position = "top" | |
) + | |
labs( | |
title="chr21 dosage by gene", | |
y ="chr21_gene/genome ratio", | |
color="XIST", | |
fill="XIST" | |
) + | |
scale_x_discrete(position = "top") + facet_grid(rows = vars(simplified_celltype), scales = "free") | |
ggsave(filename = "boxplot_bygene.pdf", path=outdir, width = 2.5, height = 4, units = "in") | |
test<-t21_d21_gene_ratio |> | |
pivot_longer(cols = c(t21_d21_ratio, | |
pD_d21_ratio, | |
d0_d21_ratio, | |
euploid1_ratio, | |
euploid2_ratio), names_to = "ratio_type", values_to = "ratio") |> | |
arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status) & | |
!(ratio_type == "d0_d21_ratio" & !XIST_status) | |
) |> | |
mutate(ratio_type = str_split_i(ratio_type, "_", 1)) |> | |
mutate(ratio_type = case_when(ratio_type == "t21" ~ "T21", | |
ratio_type == "d0" ~ ".day0", | |
ratio_type == "euploid1" ~ "D21a", | |
ratio_type == "euploid2" ~ "D21b", | |
ratio_type == "pD" ~ "pDAPT" | |
)) |> | |
group_by(ratio_type, simplified_celltype, XIST_status) |> | |
dplyr::select(ratio, ratio_type, gene_name) |> | |
#test_new<- test |> group_modify(test %>% mutate(interp_dens = calc_density(test$ratio))) | |
mutate(eCDF = fx_ecdf(ratio)) |> | |
mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |> | |
mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |> | |
ungroup() | |
refratio <-"T21" | |
testref <- test |> group_by(ratio_type) |> filter(simplified_celltype == "Neurons" & ratio_type == refratio) | |
tested <- test |> group_by(ratio_type, XIST_status) |> filter(simplified_celltype == "Neurons" & ratio_type != refratio) | |
ks.neurons<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
testref <- test |> group_by(ratio_type) |> filter(simplified_celltype == "Astrocytes" & ratio_type == refratio) | |
tested <- test |> group_by(ratio_type, XIST_status) |> filter(simplified_celltype == "Astrocytes" & ratio_type != refratio) | |
ks.glia<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
testref <- test |> group_by(ratio_type) |> filter(simplified_celltype == "NPCs" & ratio_type == refratio) | |
tested <- test |> group_by(ratio_type, XIST_status) |> filter(simplified_celltype == "NPCs" & ratio_type != refratio) | |
ks.rgc<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
testref <- test |> group_by(ratio_type) |> filter(simplified_celltype == "Other" & ratio_type == refratio) | |
tested <- test |> group_by(ratio_type, XIST_status) |> filter(simplified_celltype == "Other" & ratio_type != refratio) | |
ks.unk<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
ks21_bygene<-bind_rows(list("Neurons" = ks.neurons, | |
"Astrocytes" = ks.glia, | |
"Other" = ks.unk, | |
"NPCs" = ks.rgc), .id = 'simplified_celltype') | |
df_anno<-ks21_bygene |> ungroup() |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |> | |
mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> group_by(simplified_celltype) |> | |
add_column(x = 0.75) |> mutate(y = 1-row_number()/10) | |
gg.ecdf_bygene<-ggplot(test |> # filter(ratio_type != "d0_d21_ratio") |> | |
mutate(upperCI = case_when(ratio_type == refratio ~ upperCI, T ~ NA)) |> | |
mutate(lowerCI = case_when(ratio_type == refratio ~ lowerCI, T ~ NA)) | |
, aes(x=ratio, y=eCDF, colour=ratio_type, linetype= XIST_status)) + | |
geom_step() + geom_text(data = df_anno, aes(x = x, y = y, label = ksp), size = 2) + | |
geom_hline(yintercept = 0.5, alpha = 0.2) + geom_vline(xintercept = c(1,1.5), alpha =0.2) + | |
geom_ribbon(aes(ymin=lowerCI, ymax=upperCI, fill = ratio_type), | |
alpha=0.2, color = NA) + | |
facet_grid(rows = vars(simplified_celltype)) + xlim(0.5,2) + theme(legend.position = "right") | |
ggsave(gg.ecdf_bygene, filename = "ecdf_bygene.pdf", path=outdir, width = 2.4, height = 3.6, units = "in") | |
test2<-t21_d21_gene_ratio |> | |
pivot_longer(cols = c(t21_d21_ratio, | |
pD_d21_ratio, | |
d0_d21_ratio, | |
euploid1_ratio, | |
euploid2_ratio), names_to = "ratio_type", values_to = "ratio") |> | |
arrange(ratio) |> filter(!is.infinite(ratio) & !is.na(ratio) & !is.na(XIST_status) & | |
simplified_celltype != "Other" & !(ratio_type == "d0_d21_ratio" & !XIST_status) | |
) |> | |
mutate(ratio_type = str_split_i(ratio_type, "_", 1)) |> | |
mutate(ratio_type = case_when(ratio_type == "t21" ~ "T21", | |
ratio_type == "d0" ~ ".day0", | |
ratio_type == "euploid1" ~ "D21a", | |
ratio_type == "euploid2" ~ "D21b", | |
ratio_type == "pD" ~ "pDAPT" | |
)) |> | |
group_by(ratio_type, XIST_status) |> | |
dplyr::select(ratio, ratio_type, gene_name) |> | |
mutate(eCDF = fx_ecdf(ratio)) |> | |
mutate(upperCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$upper)[2,])) |> | |
mutate(lowerCI = sort(rbind(ratio, ecdf.ks.CI(ratio)$lower)[2,])) |> | |
ungroup() | |
refratio <-"T21" | |
testref <- test2 |> group_by(ratio_type) |> filter(ratio_type == refratio) | |
tested <- test2 |> group_by(ratio_type, XIST_status) |> filter(ratio_type != refratio) | |
ks.all<-tested |> summarise(ksp = ks.test(unique(testref$ratio), unique(ratio), exact = T, alternative = "l")$p.value, | |
ksd = ks.test(unique(testref$ratio), unique(ratio), exaxt = T, alternative = "l")$statistic) | |
ks21_bygene2<-bind_rows(list(#"neuron" = ks.neurons, | |
#"glia" = ks.glia, | |
"all" = ks.all), .id = 'simplified_celltype') | |
#df_anno2<-ks21_bygene2 |> filter(ksp <= 0.05) |> arrange(simplified_celltype, desc(ksd)) |> | |
# mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> add_column(x = 0.5) |> mutate(y = 1-row_number()/10) | |
df_anno2<-ks21_bygene2 |> ungroup() |> filter(ksp <= 0.05) |> arrange(desc(ksd)) |> | |
mutate(ksp = signif(ksp, 2), ksd = signif(ksd, 2)) |> | |
add_column(x = 0.75) |> mutate(y = 1-row_number()/10) | |
gg.ecdf_bygene2<-ggplot(test2 |> | |
mutate(upperCI = case_when(ratio_type == refratio ~ upperCI, T ~ NA)) |> | |
mutate(lowerCI = case_when(ratio_type == refratio ~ lowerCI, T ~ NA)) | |
, aes(x=ratio, y=eCDF, colour=ratio_type, linetype= XIST_status)) + | |
geom_step() + geom_text(data = df_anno2, aes(x = x, y = y, label = ksp), size = 3) + | |
geom_hline(yintercept = 0.5, alpha = 0.2) + geom_vline(xintercept = c(1,1.5), alpha =0.2) + | |
geom_ribbon(aes(ymin=lowerCI, ymax=upperCI, fill = ratio_type), | |
alpha=0.2, color = NA) + xlim(0.5,2) #+ theme(legend.position = "none") | |
ggsave(gg.ecdf_bygene2, filename = "ecdf_bygene2.pdf", path=outdir, width = 3.2, height = 1.45, units = "in") | |
rm(chr21gene_bycell) | |
### DIFF expression | |
t21neu_combined <- PrepSCTFindMarkers(t21neu_combined) | |
fx.diffroc<-function(slott, assayy, seuratt, ident1, ident2, testt="roc", by_celltype=F){ | |
DefaultAssay(seuratt) <- assayy | |
if (by_celltype == F) { | |
resultt<-list() | |
resultt[["none"]]<-FindMarkers( | |
seuratt, | |
test.use = testt, | |
slot = slott, | |
assay=assayy, | |
ident.1 = ident1, | |
ident.2 = ident2, | |
group.by = 'diff_expr', | |
recorrect_umi = F, | |
logfc.threshold = 0) |> | |
mutate( | |
testname = paste0(assayy, "_", slott), | |
contrast = sprintf("%s_%s", ident1, ident2), | |
test = (sign(avg_log2FC) * power) | |
) |> | |
arrange(desc(test)) |> | |
rownames_to_column("gene_symbol") |> | |
left_join(gene_info_df |> dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype), | |
by=c("gene_symbol"="hgnc_symbol")) | |
}else { | |
celltypes <- unique(seuratt@meta.data$simplified_celltype) | |
resultt <- list() | |
for (celltype in celltypes) { | |
resultt[[celltype]] <- FindMarkers( | |
seuratt, | |
test.use = testt, | |
slot = slott, | |
assay=assayy, | |
ident.1 = sprintf("%s_%s", ident1, celltype), | |
ident.2 = sprintf("%s_%s", ident2, celltype), | |
group.by = 'diff_expr_celltype', | |
recorrect_umi = F, | |
logfc.threshold = 0) |> | |
mutate( | |
testname = paste0(assayy, "_", slott), | |
contrast = sprintf("%s_%s_%s", celltype, ident1, ident2), | |
test = (sign(avg_log2FC) * power) | |
) |> | |
arrange(desc(test)) |> | |
rownames_to_column("gene_symbol") |> | |
left_join(gene_info_df |> dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype), | |
by=c("gene_symbol"="hgnc_symbol")) | |
} | |
} | |
return(resultt) | |
} | |
fx.diffroc2<-function(slott, assayy, seuratt, ident1, ident2, testt="roc", by_celltype=F){ | |
DefaultAssay(seuratt) <- assayy | |
if (by_celltype == F) { | |
resultt<-list() | |
resultt[["none"]]<-FindMarkers( | |
seuratt, | |
test.use = testt, | |
slot = slott, | |
assay=assayy, | |
ident.1 = ident1, | |
ident.2 = ident2, | |
group.by = 'diff_expr2', | |
recorrect_umi = F, | |
logfc.threshold = 0) |> | |
mutate( | |
testname = paste0(assayy, "_", slott), | |
contrast = sprintf("%s_%s", ident1, ident2), | |
test = (sign(avg_log2FC) * power) | |
) |> | |
arrange(desc(test)) |> | |
rownames_to_column("gene_symbol") |> | |
left_join(gene_info_df |> dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype), | |
by=c("gene_symbol"="hgnc_symbol")) | |
}else { | |
celltypes <- unique(seuratt@meta.data$simplified_celltype) | |
resultt <- list() | |
for (celltype in celltypes) { | |
resultt[[celltype]] <- FindMarkers( | |
seuratt, | |
test.use = testt, | |
slot = slott, | |
assay=assayy, | |
ident.1 = sprintf("%s_%s", ident1, celltype), | |
ident.2 = sprintf("%s_%s", ident2, celltype), | |
group.by = 'diff_expr_celltype2', | |
recorrect_umi = F, | |
logfc.threshold = 0) |> | |
mutate( | |
testname = paste0(assayy, "_", slott), | |
contrast = sprintf("%s_%s_%s", celltype, ident1, ident2), | |
test = (sign(avg_log2FC) * power) | |
) |> | |
arrange(desc(test)) |> | |
rownames_to_column("gene_symbol") |> | |
left_join(gene_info_df |> dplyr::select(hgnc_symbol, chromosome_name, strand, start_position, end_position, gene_biotype), | |
by=c("gene_symbol"="hgnc_symbol")) | |
} | |
} | |
return(resultt) | |
} | |
t21neu_combined@meta.data$diff_expr <- t21neu_combined@meta.data |> | |
as.data.frame() |> | |
rownames_to_column(var = "cell_id") |> | |
mutate( | |
Diff_expr = case_when( | |
sample == "pDAPT" & XIST_status ~ "pDxist", | |
sample %in% c("pDAPT") & !XIST_status ~ "pDno", | |
str_detect(sample, "D21") & !XIST_status ~ "d21", | |
sample == "T21" & !XIST_status ~ "t21", | |
sample == ".day0" & XIST_status ~ "d0", | |
T~"left" | |
)) |> | |
pull(Diff_expr) | |
t21neu_combined@meta.data$diff_expr2 <- t21neu_combined@meta.data |> | |
as.data.frame() |> | |
rownames_to_column(var = "cell_id") |> | |
mutate( | |
Diff_expr = case_when( | |
sample == "pDAPT" & XIST_status ~ "pDxist", | |
sample %in% c("T21","pDAPT") & !XIST_status ~ "t21", | |
str_detect(sample, "D21") & !XIST_status ~ "d21", | |
#sample == "T21" & !XIST_status ~ "t21", | |
sample == ".day0" & XIST_status ~ "d0", | |
T~"left" | |
)) |> | |
pull(Diff_expr) | |
t21neu_combined@meta.data$diff_expr_celltype <- t21neu_combined@meta.data |> | |
as.data.frame() |> | |
rownames_to_column(var = "cell_id") |> | |
mutate( | |
Diff_expr = case_when( | |
sample == "pDAPT" & XIST_status ~ sprintf("%s_%s", "pDxist", simplified_celltype), | |
sample %in% c("pDAPT") & !XIST_status ~ sprintf("%s_%s", "pDno", simplified_celltype), | |
str_detect(sample, "D21") & !XIST_status ~ sprintf("%s_%s", "d21", simplified_celltype), | |
sample == "T21" & !XIST_status ~ sprintf("%s_%s", "t21", simplified_celltype), | |
sample == ".day0" & XIST_status ~ sprintf("%s_%s", "d0", simplified_celltype), | |
T~sprintf("%s_%s", "left", simplified_celltype) | |
)) |> | |
pull(Diff_expr) | |
t21neu_combined@meta.data$diff_expr_celltype2 <- t21neu_combined@meta.data |> | |
as.data.frame() |> | |
rownames_to_column(var = "cell_id") |> | |
mutate( | |
Diff_expr = case_when( | |
sample == "pDAPT" & XIST_status ~ sprintf("%s_%s", "pDxist", simplified_celltype), | |
sample %in% c("T21","pDAPT") & !XIST_status ~ sprintf("%s_%s", "t21", simplified_celltype), | |
str_detect(sample, "D21") & !XIST_status ~ sprintf("%s_%s", "d21", simplified_celltype), | |
#sample == "T21" & !XIST_status ~ sprintf("%s_%s", "t21", simplified_celltype), | |
sample == ".day0" & XIST_status ~ sprintf("%s_%s", "d0", simplified_celltype), | |
T~sprintf("%s_%s", "left", simplified_celltype) | |
)) |> | |
pull(Diff_expr) | |
make_plot <- function(df) { | |
if (typeof(df) == "list") { | |
comb_df <- tibble() | |
for (celltype in names(df)) { | |
comb_df <- rbind( | |
comb_df, | |
df[[celltype]] |> | |
mutate(comparison_celltype = celltype) | |
) | |
} | |
return( | |
comb_df |> | |
filter(chromosome_name == "21" & is.finite(test) & test != 0) |> | |
ggplot(aes(x = fct_reorder(gene_symbol, test), y=test, fill=testname)) + | |
facet_grid(rows = vars(testname), cols = vars(comparison_celltype)) + | |
# facet_wrap("testname", scales = "free") + | |
geom_hline(yintercept = c(-0.1,0.1)) + | |
geom_bar(position="stack", stat="identity") + | |
coord_flip() | |
) | |
}else { | |
return( | |
df |> | |
filter(chromosome_name == "21" & is.finite(test) & test != 0) |> | |
ggplot(aes(x = fct_reorder(gene_symbol, test), y=test, fill=testname)) + | |
facet_wrap("testname", scales = "free") + | |
geom_hline(yintercept = c(-0.1,0.1)) + | |
geom_bar(position="stack", stat="identity") + | |
coord_flip() | |
) | |
} | |
} | |
## separate | |
SCT_data_t21_ct <- fx.diffroc( | |
slott = "data", | |
assayy = "SCT", | |
seuratt = t21neu_combined, | |
ident1 = "d21", | |
ident2 = "t21", | |
by_celltype = T | |
) | |
SCT_data_pD_ct <- fx.diffroc( | |
slott = "data", | |
assayy = "SCT", | |
seuratt = t21neu_combined, | |
ident1 = "pDxist", | |
ident2 = "pDno", | |
by_celltype = T | |
) | |
SCT_data_d0_ct <- fx.diffroc( | |
slott = "data", | |
assayy = "SCT", | |
seuratt = t21neu_combined, | |
ident1 = "d0", | |
ident2 = "pDno", | |
by_celltype = T | |
) | |
##pD and t21 combined to t21 | |
SCT_data_t21_ct2 <- fx.diffroc2( | |
slott = "data", | |
assayy = "SCT", | |
seuratt = t21neu_combined, | |
ident1 = "d21", | |
ident2 = "t21", | |
by_celltype = T | |
) | |
SCT_data_pD_ct2 <- fx.diffroc2( | |
slott = "data", | |
assayy = "SCT", | |
seuratt = t21neu_combined, | |
ident1 = "pDxist", | |
ident2 = "t21", | |
by_celltype = T | |
) | |
SCT_data_d0_ct2 <- fx.diffroc2( | |
slott = "data", | |
assayy = "SCT", | |
seuratt = t21neu_combined, | |
ident1 = "d0", | |
ident2 = "t21", | |
by_celltype = T | |
) | |
make_comb_df <- function(df) { | |
comb_df <- tibble() | |
for (celltype in names(df)) { | |
comb_df <- rbind( | |
comb_df, | |
df[[celltype]] |> | |
mutate(comparison_celltype = celltype) | |
) | |
} | |
return(comb_df) | |
} | |
require(scales) | |
fx.chr_plots<-function(label, allsubset, chr, testn, hght){ | |
allsubset<- allsubset |> | |
mutate(contrast3 = case_when(contrast3 == "d21_t21" ~ "euploid", | |
contrast3 == "d21_pDno" ~ "euploid", | |
contrast3 == "d0_t21" ~ "day0", | |
contrast3 == "d0_pDno" ~ "day0", | |
contrast3 == "pDxist_t21" ~ "pDAPT", | |
contrast3 == "pDxist_pDno" ~ "pDAPT", | |
T ~ "left")) | |
d21_normalized<-allsubset |> filter(chromosome_name == chr & testname == testn & contrast3=="euploid") |> | |
rename(avg_euploid = "avg_log2FC") |> dplyr::select(gene_symbol, avg_euploid, comparison_celltype) | |
allsubset |> filter(chromosome_name == chr & testname == testn) |> | |
left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |> | |
filter(!is.na(avg_euploid)) |> | |
ggplot(aes(x=contrast3, y = avg_log2FC, color=contrast3)) + | |
geom_boxplot(notch=T) + | |
facet_grid(rows = vars(comparison_celltype)) + | |
stat_summary(fun.data = function(x) { | |
data.frame(label=signif(median(x),2), y=-0.9)}, geom = "text", color="black", size=2) + | |
stat_compare_means(ref.group = "euploid", method = "wilcox", | |
label.y = 0.9, hide.ns = F, label = "p.format", size=2) + | |
coord_cartesian(ylim = c(-1,1)) + geom_hline(yintercept = c(0)) + | |
theme(legend.position = "top") + ggtitle("Differential to T21") + | |
theme(text = element_text(size = 6), legend.text = element_text(size = 6)) | |
ggsave(filename = paste(testn, label, "box.pdf", sep = "_"), path=outdir, width = 1.8, height = hght, units = "in") | |
allsubset |> filter(chromosome_name == chr & testname == testn) |> | |
left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |> | |
filter(!is.na(diff_log2FC)) |> mutate(chr21_pos = TSS/1000000) |> | |
ggplot(aes(x=chr21_pos, y = avg_log2FC, color=contrast3)) +# geom_point(alpha=0.2) + | |
geom_smooth(aes(fill = contrast3)) + | |
facet_grid(rows = vars(comparison_celltype)) + | |
coord_cartesian(ylim = c(-0.6,0.2)) + geom_hline(yintercept = 0) + geom_vline(xintercept = 37.550000, linetype=3) + | |
theme(legend.position = "top") + ggtitle("Differential to T21 by chromosomal position") + | |
theme(text = element_text(size = 6), legend.text = element_text(size = 6)) | |
ggsave(filename = paste(testn, label, "pos.pdf", sep = "_"), path=outdir, width = 2.4, height = hght, units = "in") | |
allsubset |> filter(chromosome_name == chr & testname == testn) |> | |
left_join(d21_normalized) |> rowwise() |> mutate(diff_log2FC = avg_log2FC-avg_euploid) |> | |
filter(!is.na(diff_log2FC) & contrast3 != "euploid") |> | |
mutate(distance=abs(37550000-TSS)/1000000) |> | |
ggscatter(x="distance", y = "avg_log2FC", color="contrast3", palette= hue_pal()(3)[c(1,3)], add = "reg.line", alpha = 0.2, size = 2, | |
cor.coef = T, cor.coef.size = 2,conf.int = T, ggtheme = theme_classic(), #add.params = list(color="black"), | |
cor.coeff.args = list(method = "pearson", label.x = 0, label.y = 0.1, digits = 2)) + | |
facet_grid(rows = vars(comparison_celltype), cols = vars(contrast3)) + | |
coord_cartesian(ylim = c(-0.6,0.2)) + geom_hline(yintercept = 0) + | |
theme(legend.position = "none") + ggtitle("Differential to T21 by XIST distance") + | |
theme(text = element_text(size = 6), legend.text = element_text(size = 6), axis.title = element_text(size = 7, face = "bold"), | |
axis.text = element_text(size = 6)) #+ theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) | |
ggsave(filename = paste(testn, label, "dist.pdf", sep = "_"), path=outdir, width = 2.4, height = hght, units = "in") | |
allsubset |> filter(chromosome_name == chr & testname == testn) |> | |
group_by(contrast) |> filter(!duplicated(gene_symbol)) |> ungroup() |> rename(avgL2FC = "avg_log2FC") |> | |
pivot_wider(id_cols =c(gene_symbol, comparison_celltype), names_from = contrast3, values_from = c(avgL2FC, power)) |> | |
filter(!is.na(avgL2FC_euploid) & !is.na(avgL2FC_day0) & !is.na(avgL2FC_pDAPT)) |> | |
pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid), names_to = c("Var", ".value"), names_sep="_") |> | |
pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, Var), names_to = "correction", values_to = "value") |> | |
pivot_wider(id_cols = c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, correction), names_from = Var, values_from = value) |> | |
mutate(test = sign(avgL2FC)*power, test_euploid = sign(avgL2FC_euploid)*power_euploid) |> | |
#filter(power_euploid >= 0.1) |> | |
ggscatter(x="avgL2FC_euploid", y = "avgL2FC", #size="power_euploid", | |
color= "power", alpha = "power_euploid", | |
add.params = list(color = "black", size = 0.5), font.label = c(7, "plain", "black"), add = "reg.line", cor.coef.size =2, | |
cor.coef = T, cor.method = "pearson", cor.coef.coord = c(-2.5,2), ggtheme = theme_classic()) + | |
gradient_color(rev(viridis::viridis(11))) + #coord_cartesian(xlim = c(-limit, limit), ylim = c(-limit,limit)) + | |
facet_grid(cols=vars(correction), rows=vars(comparison_celltype), scales = "free") + theme(text = element_text(size=8)) | |
ggsave(filename = paste(testn, label, "chr21vseuploid.pdf", sep = "_"), path=outdir, width = 3.6, height = 3.6, units = "in") | |
allsubset |> filter(chromosome_name != chr & testname == testn) |> | |
group_by(contrast) |> filter(!duplicated(gene_symbol)) |> ungroup() |> rename(avgL2FC = "avg_log2FC") |> | |
pivot_wider(id_cols =c(gene_symbol, comparison_celltype), names_from = contrast3, values_from = c(avgL2FC, power)) |> | |
filter(!is.na(avgL2FC_euploid) & !is.na(avgL2FC_day0) & !is.na(avgL2FC_pDAPT)) |> | |
pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid), names_to = c("Var", ".value"), names_sep="_") |> | |
pivot_longer(-c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, Var), names_to = "correction", values_to = "value") |> | |
pivot_wider(id_cols = c(gene_symbol, comparison_celltype, avgL2FC_euploid, power_euploid, correction), names_from = Var, values_from = value) |> | |
mutate(test = sign(avgL2FC)*power, test_euploid = sign(avgL2FC_euploid)*power_euploid) |> | |
filter(power_euploid >= 0.1) |> | |
ggscatter(x="avgL2FC_euploid", y = "avgL2FC", #size="power_euploid", | |
color= "power", alpha = "power_euploid", | |
add.params = list(color = "black", size = 0.5), font.label = c(7, "plain", "black"), add = "reg.line", cor.coef.size =2, | |
cor.coef = T, cor.method = "pearson", cor.coef.coord = c(-2.5,2), ggtheme = theme_classic()) + | |
# scale_color_gradient2(low = "blue", high = "red") + | |
gradient_color(rev(viridis::viridis(11))) + #coord_cartesian(xlim = c(-limit, limit), ylim = c(-limit,limit)) + | |
facet_grid(cols=vars(correction), rows=vars(comparison_celltype), scales = "free") + theme(text = element_text(size=8)) | |
ggsave(filename = paste(testn, label, "allvsd21.pdf", sep = "_"), path=outdir, width = 3.6, height = 3.6, units = "in") | |
} | |
SCT<- bind_rows(list(make_comb_df(SCT_data_t21_ct), | |
make_comb_df(SCT_data_d0_ct), | |
make_comb_df(SCT_data_pD_ct))) |> | |
mutate(TSS = case_when(strand == 1 ~ start_position, T ~ end_position)) |> | |
mutate(contrast2 = paste0(testname,"_", str_extract(contrast, "(d21|pDxist|d0)_.*"))) |> | |
mutate(contrast3 = str_extract(contrast, "(d21|pDxist|d0)_(t21|pDno)")) | |
write_csv(SCT, paste0(outdir,"/tables/SCT.csv.gz")) | |
fx.chr_plots("ct", SCT, 21, "SCT_data", 3.2) | |
SCT2<- bind_rows(list(make_comb_df(SCT_data_t21_ct2), | |
make_comb_df(SCT_data_d0_ct2), | |
make_comb_df(SCT_data_pD_ct2))) |> | |
mutate(TSS = case_when(strand == 1 ~ start_position, T ~ end_position)) |> | |
mutate(contrast2 = paste0(testname,"_", str_extract(contrast, "(d21|pDxist|d0)_.*"))) |> | |
mutate(contrast3 = str_extract(contrast, "(d21|pDxist|d0)_(t21|pDno)")) | |
write_csv(SCT2, paste0(outdir,"/tables/SCT2.csv.gz")) | |
fx.chr_plots("ct2", SCT2, 21, "SCT_data", 3.2) | |
#GSEA | |
require(metap) | |
vectorized_sumlog <- function(x, y) { | |
if (length(x) != length(y)) { | |
stop("lengths of lists must be the same") | |
} | |
combined_pvals <- c() | |
for (i in 1:length(x)) { | |
combined_pvals <- c(combined_pvals, (metap::sumlog(c(x[i], y[i])))$p) | |
} | |
return(combined_pvals) | |
} | |
fx.labels1<- Vectorize(function(X) { | |
s <- toupper(sapply(strsplit(str_replace(X, "_", ":"), "[:]"), getElement, 1 )) | |
abbreviate(s, minlength = 25, dot = T) | |
}) | |
fx.labels2<- Vectorize(function(X) { | |
s <- toupper(sapply(strsplit(str_replace(X, "_", ":"), "[:]"), getElement, 2)) | |
s<-str_replace_all(s, "_", "") | |
abbreviate(s, minlength = 25, dot = T) | |
}) | |
fx.labels3<- Vectorize(function(X) { | |
#s <- toupper(sapply(strsplit(str_replace(X, "_", ":"), "[:]"), getElement, 2)) | |
s<-str_replace_all(X, "_", "") | |
abbreviate(s, minlength = 25, dot = T) | |
}) | |
fx.gpfx<-function(gsea_resultt, typee, slcut, qcut, excluded, vc.collections, hght) | |
{ | |
gsea_with_cat <- gsea_resultt|> | |
filter(data_type == typee) |> | |
dplyr::mutate(category = dplyr::case_when( | |
str_detect(ID, "_STRESS") ~ "stress", | |
str_detect(ID, "ENDOPLASM") ~ "ER", | |
str_detect(ID, "chr21") ~ "chr21", | |
str_detect(ID, "TRANSLATION") ~ "translation", | |
str_detect(ID, "RIBOSOM") ~ "translation", | |
str_detect(ID, "SERINE") & (!str_detect(ID, "SERINE_THREONINE")) ~ "serine", | |
str_detect(ID, "MITOC") ~ "mitochondria", | |
str_detect(ID, "NRF2") ~ "stress", | |
str_detect(ID, "APOPTO") | str_detect(ID, "PROGRAMMED_CELL_DEATH") ~ "apoptosis", | |
T ~ "nocat" | |
)) | |
gsea_with_cat <- gsea_with_cat |> | |
mutate(Collection = case_when(str_count(ID,"_") == 0 ~ ID, | |
T~fx.labels1(ID))) | |
if(excluded == "nocat"){vc.collections<-gsea_with_cat |> pull(Collection) |> unique()} | |
if(excluded != "nocat"){vc.collections<-vc.collections} | |
categories_to_view<-gsea_with_cat |> | |
filter(Collection %in% vc.collections | str_detect(ID, "^chr21")) |> | |
group_by(ID, cell_type) |> | |
mutate(min_q_val = min(qvalues, na.rm=F), | |
mean_NES = mean(NES, na.rm = F), | |
ncount = n()) |> | |
filter(ncount == 3) |> | |
pivot_wider(id_cols = c(ID, cell_type, min_q_val), names_from = c(comparison), values_from = c(NES, qvalues, p.adjust)) |> | |
#filter(min_q_val <= qcut)|> | |
ungroup() | |
#if(excluded == "nocat"){categories_to_view$mean_q_val_t21<-qcut} | |
categories_to_view<-categories_to_view |> #dplyr::select(-c(names(which(duplicated(t(categories_to_view)))))) |> | |
mutate(pD_t21 = vectorized_sumlog(p.adjust_pD, p.adjust_t21), | |
d0_t21 = vectorized_sumlog(p.adjust_d0, p.adjust_t21), | |
pD_d0 = vectorized_sumlog(p.adjust_d0, p.adjust_pD)) |> filter(qvalues_d0 <= qcut | qvalues_t21 <= qcut) | |
vc.neurons<-categories_to_view |> filter(cell_type == "Neurons" & (pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)) |> pull(ID) | |
vc.Astrocytes<-categories_to_view |> filter(cell_type == "Astrocytes" & (pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)) |> pull(ID) | |
vc.NPCs<-categories_to_view |> filter(cell_type == "NPCs" & (pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)) |> pull(ID) | |
vc.Other<-categories_to_view |> filter(cell_type == "Other" & (pD_d0 <= slcut | pD_t21 <= slcut | d0_t21 <= slcut)) |> pull(ID) | |
gsea_with_cat |> # filter((cell_type == "Astrocytes" & ID %in% vc.Astrocytes) | | |
# (cell_type == "Neurons" & ID %in% vc.neurons) | | |
# (cell_type == "NPCs" & ID %in% vc.NPCs) | | |
# (cell_type == "Other" & ID %in% vc.Other)) |> | |
pivot_wider(id_cols =c(ID, cell_type, category), names_from = comparison, values_from = c(NES, qvalues)) |> | |
filter(qvalues_t21 <= qcut) |> | |
pivot_longer(-c(ID, cell_type, category, qvalues_t21, NES_t21), names_to = c("Var", ".value"), names_sep="_") |> | |
pivot_longer(-c(ID, cell_type, category, qvalues_t21, NES_t21, Var), names_to = "correction", values_to = "value") |> | |
pivot_wider(id_cols = c(ID, cell_type, category, qvalues_t21, NES_t21, correction), names_from = Var, values_from = value) |> | |
mutate(log10q = pmin(-log(qvalues,10),3), log10_q_euploid = pmin(-log(qvalues_t21,10),3)) |> | |
ggscatter(x="NES_t21", y = "NES", color= "qvalues", alpha = "log10_q_euploid", #alpha = 0.5, | |
add.params = list(color = "black", size = 0.5), font.label = c(8, "plain", "black"), add = "reg.line", | |
cor.coef.size =2, cor.coef = T, cor.method = "pearson", cor.coef.coord = c(-2.5,0), ggtheme = theme_classic()) + | |
gradient_color(viridis::plasma(11)) + #coord_cartesian(xlim = c(-limit, limit), ylim = c(-limit,limit)) + | |
facet_grid(cols=vars(correction), rows=vars(cell_type), scales = "free") + theme(text = element_text(size=8)) | |
#ggsave(filename = paste(testn, label, "gsea_all.pdf", sep = "_"), path=outdir, width = 4, height = hght, units = "in") | |
ggsave(paste0(deparse(substitute(gsea_resultt)),typee,qcut,excluded,slcut,"scatter.pdf",collapse="_"), path=outdir, width=3.6, height=hght, units="in") | |
gsea_with_cat<-gsea_with_cat |> | |
# pivot_wider(id_cols =c(ID, cell_type, category), names_from = comparison, values_from = c(NES, qvalues)) |> | |
#filter(qvalues_t21 <= qcut & |> | |
filter( category != excluded & #ID %in% c(vc.neurons, vc.Astrocytes, vc.NPCs, vc.Other)) | |
(cell_type == "Astrocytes" & ID %in% vc.Astrocytes) | | |
(cell_type == "Neurons" & ID %in% vc.neurons) | | |
(cell_type == "NPCs" & ID %in% vc.NPCs) | | |
(cell_type == "Other" & ID %in% vc.Other)) | |
if(excluded != "nocat") | |
{ | |
gsea_with_cat$category<-gsea_with_cat$cell_type | |
gsea_with_cat$cell_type<-gsea_with_cat$comparison | |
# gsea_with_cat<-gsea_with_cat |> filter(qvalues <= qcut) | |
} | |
gsea_with_cat |> mutate(comparison = case_when(comparison == "t21" ~ "euploid", T ~ comparison)) |> | |
mutate(cell_type = case_when(cell_type == "t21" ~ "euploid", T ~ cell_type)) |> | |
mutate(ID = case_when(str_count(ID,"_") == 0 ~ paste0("CHR_",ID), T~ID)) |> | |
mutate(ID = fx.labels2(ID)) |> | |
mutate(Score = -log10(qvalues)*sign(NES)) |> | |
mutate(Score2 = pmin(-log10(qvalues),3)) |> | |
mutate(ID = fct_reorder(ID, NES)) |> | |
ggplot(aes(x=NES, y=ID)) + | |
geom_segment(aes(x=0, y=ID, yend = ID, xend=NES), color="black", size=.4) + | |
geom_point(aes(color=as.factor(sign(NES)), alpha = Score2)) + | |
#scale_color_gradient2(low="blue",mid = "white", high="red") + | |
scale_fill_aaas() + | |
scale_color_aaas() + | |
facet_grid(rows=vars(category), cols = vars(cell_type), scales="free", space="free", drop = T) + | |
geom_vline(xintercept = c(0), alpha=0.5) + | |
theme(legend.position = "right") + | |
#theme(axis.text.y = element_blank()) + | |
theme(axis.text.y = element_text(size = 6), axis.title.y = element_blank()) + | |
scale_x_continuous(expand = c(.1,.1)) | |
ggsave(paste0(deparse(substitute(gsea_resultt)),typee,qcut,excluded,slcut,"gp.pdf",collapse="_"), path=outdir, width=7.2, height=6.8, units="in") | |
} | |
gsea_results_SCT <- tibble() | |
for(data_type in c("data")) { | |
for (comparison_name in c("t21", "pD", "d0")) { | |
print(comparison_name) | |
test_df <- make_comb_df(eval(as.name(sprintf("SCT_%s_%s_ct", data_type, comparison_name)))) |> | |
#test_df <- SCT |> filter(str_detect(contrast3, comparison_name) & str_detect(testname, data_type)) |> | |
group_by(comparison_celltype) |> | |
arrange(desc(test)) | |
genes <- split(test_df |> ungroup() |> dplyr::select(gene_symbol, test) |> deframe(), test_df$comparison_celltype) | |
for (celltype in names(genes)) { | |
gc() | |
gl <- genes[[celltype]] | |
set.seed(42) | |
gsea_result <- GSEA(gl, TERM2GENE = msig_terms, pvalueCutoff = 1, | |
pAdjustMethod = "BH", by="fgsea", seed=T, eps=1e-50) | |
gsea_results_SCT <- gsea_results_SCT |> | |
rbind( | |
gsea_result |> | |
as_tibble() |> | |
mutate(comparison = comparison_name, | |
data_type = data_type, | |
cell_type = celltype) | |
) | |
} | |
} | |
} | |
gc() | |
write_csv(gsea_results_SCT, paste0(outdir,"/tables/gsea_SCT_celltype.csv.gz")) | |
gsea_results_SCT2 <- tibble() | |
for(data_type in c("data")) { | |
for (comparison_name in c("t21", "pD", "d0")) { | |
print(comparison_name) | |
test_df <- make_comb_df(eval(as.name(sprintf("SCT_%s_%s_ct2", data_type, comparison_name)))) |> | |
#test_df <- SCT2 |> filter(str_detect(contrast3, comparison_name) & str_detect(testname, data_type)) |> | |
group_by(comparison_celltype) |> | |
arrange(desc(test)) | |
genes <- split(test_df |> ungroup() |> dplyr::select(gene_symbol, test) |> deframe(), test_df$comparison_celltype) | |
for (celltype in names(genes)) { | |
gc() | |
gl <- genes[[celltype]] | |
set.seed(42) | |
gsea_result <- GSEA(gl, TERM2GENE = msig_terms, pvalueCutoff = 1, | |
pAdjustMethod = "BH", by="fgsea", seed=T, eps=1e-50) | |
gsea_results_SCT2 <- gsea_results_SCT2 |> | |
rbind( | |
gsea_result |> | |
as_tibble() |> | |
mutate(comparison = comparison_name, | |
data_type = data_type, | |
cell_type = celltype) | |
) | |
} | |
} | |
} | |
gc() | |
write_csv(gsea_results_SCT2, paste0(outdir,"/tables/gsea_SCT_celltype2.csv.gz")) | |
qcut<-0.1 | |
slcut<-0.005 | |
# excluded<-"nocat" | |
excluded<-"_" | |
vc.collections<-collections | |
fx.gpfx(gsea_results_SCT, "data", slcut, qcut, excluded, vc.collections, 3.6) | |
fx.gpfx(gsea_results_SCT2, "data", slcut, qcut, excluded, vc.collections, 3.6) | |
} |