Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
contiguous_x_erosion/expression_analysis_notebook.Rmd
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1789 lines (1458 sloc)
82 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Expression Analysis" | |
output: html_notebook | |
--- | |
# Expression Analysis | |
## Collecting RNA-seq expression data | |
We aligned the fastq files using STAR to hg38, then used HTseq-count to calculate the expression counts. Now we will use DESeq2 package to calculate which genes are differentially expressed in all the different sample clusters we have. | |
```{r} | |
library("DESeq2") | |
library('biomaRt') | |
library("TxDb.Hsapiens.UCSC.hg38.knownGene") | |
library("tidyverse") | |
library("org.Hs.eg.db") | |
diff_expr_all_df <- NA | |
directory <- "htseq_counts" | |
count_file_suffix <- "Aligned.out.sorted.bam.htseq_counts" | |
sampleFiles <- grep(count_file_suffix,list.files(directory),value=TRUE) | |
rna_seq_accessions_tib <- read_csv("rna_seq_accessions_tib.csv.gz") | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
cluster_tib <- column_to_rownames(cluster_tib, "sample_name") | |
seq_type_df <- read_csv("sample_seq_type.csv.gz") | |
sample_info_df <- read_csv("sample_data_combined_all_info.csv") | |
sample_file_to_keep <- c() | |
sample_names <- c() | |
sampleCondition <- c() | |
sample_seq_type <- c() | |
sample_group <- c() | |
xist_expression <- c() | |
all_htseq_counts_df <- tibble(gene_id=character(),gene_name=character()) | |
for(filename in sampleFiles) { | |
sra_acc <-sub(count_file_suffix, "", filename) | |
if(sra_acc == "SRR4027227") { | |
print("I'm getting the import file") | |
} | |
if (sra_acc %in% rna_seq_accessions_tib$sra_accession) { | |
sample_name <- (filter(rna_seq_accessions_tib, sra_accession == sra_acc))$SampleName | |
cluster <- cluster_tib[sample_name, 1] | |
print(cluster) | |
if(is.na(cluster) == F) { | |
sample_file_to_keep <- c(sample_file_to_keep, filename) | |
sample_names <- c(sample_names, sample_name) | |
sampleCondition <- c(sampleCondition, cluster) | |
sample_seq_type <- c(sample_seq_type, | |
as.character(seq_type_df[seq_type_df$`Sample Name` == sample_name, "end_type"][1])) | |
sample_group <- c(sample_group, | |
as.character(sample_info_df[sample_info_df$sample_name == sample_name, "group_accession"][1])) | |
counts_tib <- read_tsv(paste(directory, "/", filename, sep=""), col_names = c('id', 'name', 'level')) | |
all_htseq_counts_df <- full_join(all_htseq_counts_df, dplyr::select(counts_tib, id,name, !! sample_name := level), by=c("gene_id"="id", "gene_name"="name")) | |
} | |
} | |
} | |
sampleTable <- data.frame(sampleName = sample_names, | |
fileName = sample_file_to_keep, | |
cluster = factor(sampleCondition), | |
seq_type = sample_seq_type, | |
group = sample_group) | |
ddsHTSeq_all <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, | |
directory = directory, | |
design= ~ seq_type + group + cluster) | |
keep <- rowSums(counts(ddsHTSeq_all)) >= 10 | |
ddsHTSeq_all <- ddsHTSeq_all[keep,] | |
dds_all <- DESeq(ddsHTSeq_all) | |
ntd <- normTransform(dds_all) | |
vsd <- vst(dds_all, blind=FALSE) | |
select <- order(rowMeans(counts(dds_all,normalized=TRUE)), | |
decreasing=TRUE)[1:20] | |
df <- as.data.frame(colData(dds_all)[,c("group","seq_type", "cluster")]) | |
library("pheatmap") | |
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE, | |
cluster_cols=FALSE, annotation_col=df) | |
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE, | |
cluster_cols=FALSE, annotation_col=df) | |
sampleDists <- dist(t(assay(vsd))) | |
library("RColorBrewer") | |
sampleDistMatrix <- as.matrix(sampleDists) | |
rownames(sampleDistMatrix) <- paste(vsd$cluster, vsd$seq_type, vsd$group, sep="-") | |
colnames(sampleDistMatrix) <- NULL | |
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) | |
pheatmap(sampleDistMatrix, | |
clustering_distance_rows=sampleDists, | |
clustering_distance_cols=sampleDists, | |
col=colors) | |
plotPCA(vsd, intgroup=c("group")) | |
library(limma) | |
library(ggfortify) | |
plotPCA(removeBatchEffect(assay(vsd), batch = vsd$group, batch2 = vsd$seq_type), intgroup=c("group")) | |
batch_corrected_vsd <- t(removeBatchEffect(assay(vsd), batch = vsd$group, batch2 = vsd$seq_type)) | |
data_vsd <- as.data.frame(batch_corrected_vsd) | |
data_vsd$cluster <- factor(vsd$cluster) | |
data_vsd$group <- vsd$group | |
data_vsd$seq_type <- vsd$seq_type | |
autoplot(prcomp(batch_corrected_vsd), data = data_vsd, colour="group") | |
batch_corrected_vsd_df <- as.data.frame(removeBatchEffect(assay(vsd), batch = vsd$group, batch2 = vsd$seq_type)) %>% | |
rownames_to_column("ensembl_id") %>% | |
mutate(ensembl_id_no_version=gsub("\\..*", "", ensembl_id)) | |
get_genename_from_ensembl <- function(ensembl_ids) { | |
# mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) | |
mart <- useEnsembl(biomart = "ensembl", | |
dataset = "hsapiens_gene_ensembl", | |
mirror = "useast") | |
G_list <- getBM(filters= "ensembl_gene_id", | |
attributes= c("ensembl_gene_id", | |
"hgnc_symbol", | |
"chromosome_name", | |
"start_position", | |
"end_position", | |
"strand", | |
"description", | |
"wikigene_name"), | |
values=ensembl_ids, | |
mart= mart) | |
G_list[G_list$hgnc_symbol == "", "hgnc_symbol"] <- G_list[G_list$hgnc_symbol == "", "wikigene_name"] | |
return(G_list %>% dplyr::select(-wikigene_name)) | |
} | |
genes <- batch_corrected_vsd_df$ensembl_id_no_version | |
G_list <- get_genename_from_ensembl(genes) | |
batch_corrected_vsd_df <- batch_corrected_vsd_df %>% | |
left_join(G_list, by=c("ensembl_id_no_version"="ensembl_gene_id")) | |
write_csv(batch_corrected_vsd_df, "batch_corrected_vsd_df.csv") | |
# cluster F vs A,B,C,D individually | |
res <- results(dds_all, contrast = c("cluster", "5", "0"), tidy=T) | |
new_res_tib <- as.data.frame(res) %>% | |
as_tibble() %>% | |
dplyr::rename( | |
ensembl_id=row, | |
tran_1_log2FC=log2FoldChange, | |
tran_1_lfcSE=lfcSE, | |
tran_1_stat=stat, | |
tran_1_pvalue=pvalue, | |
tran_1_padj=padj) | |
for (clust in c("1", "2", "3")) { | |
res2 <- results(dds_all, contrast = c("cluster", "5", clust), tidy=T) | |
new_res_tib <- new_res_tib %>% | |
left_join( | |
as.data.frame(res2) %>% | |
as_tibble() %>% | |
dplyr::rename( | |
ensembl_id=row, | |
!!sprintf("%s_vs_F_log2FC", clust):=log2FoldChange, | |
!!sprintf("%s_vs_F_lfcSE", clust):=lfcSE, | |
!!sprintf("%s_vs_F_stat", clust):=stat, | |
!!sprintf("%s_vs_F_pvalue", clust):=pvalue, | |
!!sprintf("%s_vs_F_padj", clust):=padj) %>% | |
dplyr::select(-baseMean), | |
by="ensembl_id") | |
} | |
new_res_tib$EnsGeneNoVer <- gsub("\\..*", "", new_res_tib$ensembl_id) | |
genes <- new_res_tib$EnsGeneNoVer | |
G_list <- get_genename_from_ensembl(genes) | |
new_res_tib <- full_join(new_res_tib, G_list, by=c("EnsGeneNoVer"="ensembl_gene_id")) | |
write_csv(new_res_tib %>% dplyr::select(EnsGeneNoVer, gene=hgnc_symbol, chromosome_name, start_position, end_position, strand, description, ends_with("log2FC"), ends_with("padj")), "diff_expr_f_vs_each.csv") | |
# Cluster A vs cluster F: | |
res <- results(dds_all, contrast = c("cluster", "5", "0"), tidy=T) | |
new_res_tib <- res %>% | |
as_tibble() %>% | |
dplyr::rename( | |
ensembl_id=row) | |
write_csv(new_res_tib, "cluster_F_vs_A_expr.csv") | |
res <- results(dds_all, contrast = c("cluster", "3", "0"), tidy=T) | |
new_res_tib <- res %>% | |
as_tibble() %>% | |
dplyr::rename( | |
ensembl_id=row) | |
write_csv(new_res_tib, "cluster_D_vs_A_expr.csv") | |
res <- results(dds_all, contrast = c("cluster", "1", "0"), tidy=T) | |
new_res_tib <- as.data.frame(res) %>% | |
as_tibble() %>% | |
dplyr::rename( | |
ensembl_id=row, | |
tran_1_log2FC=log2FoldChange, | |
tran_1_lfcSE=lfcSE, | |
tran_1_stat=stat, | |
tran_1_pvalue=pvalue, | |
tran_1_padj=padj) | |
res2 <- results(dds_all, contrast = c("cluster", "2", "1"), tidy=T) | |
new_res_tib <- new_res_tib %>% | |
left_join( | |
as.data.frame(res2) %>% | |
as_tibble() %>% | |
dplyr::rename( | |
ensembl_id=row, | |
tran_2_log2FC=log2FoldChange, | |
tran_2_lfcSE=lfcSE, | |
tran_2_stat=stat, | |
tran_2_pvalue=pvalue, | |
tran_2_padj=padj) %>% | |
dplyr::select(-baseMean), | |
by="ensembl_id") | |
res2 <- results(dds_all, contrast = c("cluster", "3", "2"), tidy=T) | |
new_res_tib <- new_res_tib %>% | |
left_join( | |
as.data.frame(res2) %>% | |
as_tibble() %>% | |
dplyr::rename(ensembl_id=row, | |
tran_3_log2FC=log2FoldChange, | |
tran_3_lfcSE=lfcSE, | |
tran_3_stat=stat, | |
tran_3_pvalue=pvalue, | |
tran_3_padj=padj) %>% | |
dplyr::select(-baseMean), | |
by="ensembl_id") | |
res2 <- results(dds_all, contrast = c("cluster", "5", "3"), tidy=T) | |
new_res_tib <- new_res_tib %>% | |
left_join( | |
as.data.frame(res2) %>% | |
as_tibble() %>% | |
dplyr::rename(ensembl_id=row, | |
tran_45_log2FC=log2FoldChange, | |
tran_45_lfcSE=lfcSE, | |
tran_45_stat=stat, | |
tran_45_pvalue=pvalue, | |
tran_45_padj=padj) %>% | |
dplyr::select(-baseMean), | |
by="ensembl_id") | |
new_res_tib$EnsGeneNoVer <- gsub("\\..*", "", new_res_tib$ensembl_id) | |
genes <- new_res_tib$EnsGeneNoVer | |
G_list <- get_genename_from_ensembl(genes) | |
new_res_tib <- full_join(new_res_tib, G_list, by=c("EnsGeneNoVer"="ensembl_gene_id")) | |
write_csv(new_res_tib, "pairwise_differential_expression.csv") | |
gene_changes <- tibble(transition=c("1","2","3","4/5"), | |
all_upchanges=c(new_res_tib %>% filter(tran_1_pvalue <= .05 & tran_1_log2FC > 0) %>% nrow(), | |
new_res_tib %>% filter(tran_2_pvalue <= .05 & tran_2_log2FC > 0) %>% nrow(), | |
new_res_tib %>% filter(tran_3_pvalue <= .05 & tran_3_log2FC > 0) %>% nrow(), | |
new_res_tib %>% filter(tran_45_pvalue <= .05 & tran_45_log2FC > 0) %>% nrow()), | |
all_downchanges=c(new_res_tib %>% filter(tran_1_pvalue <= .05 & tran_1_log2FC < 0) %>% nrow(), | |
new_res_tib %>% filter(tran_2_pvalue <= .05 & tran_2_log2FC < 0) %>% nrow(), | |
new_res_tib %>% filter(tran_3_pvalue <= .05 & tran_3_log2FC < 0) %>% nrow(), | |
new_res_tib %>% filter(tran_45_pvalue <= .05 & tran_45_log2FC < 0) %>% nrow()), | |
x_upchanges=c(new_res_tib %>% filter(tran_1_pvalue <= .05 & tran_1_log2FC > 0 & chromosome_name == "X") %>% nrow(), | |
new_res_tib %>% filter(tran_2_pvalue <= .05 & tran_2_log2FC > 0 & chromosome_name == "X") %>% nrow(), | |
new_res_tib %>% filter(tran_3_pvalue <= .05 & tran_3_log2FC > 0 & chromosome_name == "X") %>% nrow(), | |
new_res_tib %>% filter(tran_45_pvalue <= .05 & tran_45_log2FC > 0 & chromosome_name == "X") %>% nrow()), | |
x_downchanges=c(new_res_tib %>% filter(tran_1_pvalue <= .05 & tran_1_log2FC < 0 & chromosome_name == "X") %>% nrow(), | |
new_res_tib %>% filter(tran_2_pvalue <= .05 & tran_2_log2FC < 0 & chromosome_name == "X") %>% nrow(), | |
new_res_tib %>% filter(tran_3_pvalue <= .05 & tran_3_log2FC < 0 & chromosome_name == "X") %>% nrow(), | |
new_res_tib %>% filter(tran_45_pvalue <= .05 & tran_45_log2FC < 0 & chromosome_name == "X") %>% nrow())) | |
# sampleTable <- data.frame(sampleName = sample_names, | |
# fileName = sample_file_to_keep, | |
# condition = sampleCondition) | |
sampleTable <- data.frame(sampleName = sample_names, | |
fileName = sample_file_to_keep, | |
cluster = sampleCondition, | |
seq_type = sample_seq_type, | |
group = sample_group) | |
diff_expr_all_df <- NA | |
for (cluster in 1:4) { | |
cluster_0 <- as.numeric(cluster - 1) | |
cluster_1 <- as.numeric(cluster) | |
if (cluster == 4) { | |
cluster_1 <- 5 | |
} | |
sampleTableFiltered <- sampleTable #%>% filter(cluster >= cluster_0)# %in% c(cluster_0, cluster_1)) | |
sampleTableFiltered[sampleTableFiltered$cluster <= cluster_0, "cluster"] <- cluster_0 | |
sampleTableFiltered[sampleTableFiltered$cluster > cluster_0, "cluster"] <- cluster_1 | |
sampleTableFiltered$cluster <- factor(sampleTableFiltered$cluster) | |
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTableFiltered, | |
directory = directory, | |
design= ~ group + seq_type + cluster) | |
keep <- rowSums(counts(ddsHTSeq)) >= 10 | |
ddsHTSeq_filtered <- ddsHTSeq[keep,] | |
dds <- DESeq(ddsHTSeq_filtered) | |
res <- results(dds) | |
res_tib <- as_tibble(res, rownames = NA) | |
res_tib <- rownames_to_column(res_tib, "EnsGene") | |
transition_fold_change <- sprintf("transition_%s_log2FC", cluster) | |
transition_padj <- sprintf("transition_%s_padj", cluster) | |
if (is.na(diff_expr_all_df)) { | |
res_tib$EnsGeneNoVer <- gsub("\\..*","",res_tib$EnsGene) | |
genes <- res_tib$EnsGeneNoVer | |
G_list <- get_genename_from_ensembl(genes) | |
res_tib_named <- full_join(res_tib,G_list,by=c("EnsGeneNoVer"="ensembl_gene_id")) | |
diff_expr_all_df <- dplyr::select(res_tib_named, | |
ens_id=EnsGene, | |
gene=hgnc_symbol, | |
chr=chromosome_name, | |
start_position, | |
end_position, strand, | |
!!transition_fold_change := log2FoldChange, | |
!!transition_padj := padj) | |
}else { | |
temp_df <- dplyr::select(res_tib, ens_id=EnsGene, | |
!!transition_fold_change := log2FoldChange, | |
!!transition_padj := padj) | |
diff_expr_all_df <- full_join(diff_expr_all_df, temp_df, by="ens_id") | |
} | |
} | |
temp_df <- dplyr::select(diff_expr_all_df, ends_with("padj")) | |
temp_df[is.na(temp_df)] <- 1 | |
diff_expr_all_df$key_transition <- as.integer(unlist(apply(temp_df, 1, which.min))) | |
diff_expr_all_df$passes_threshold <- FALSE | |
diff_expr_all_df$positive_delta <- FALSE | |
for (row_idx in 1:nrow(diff_expr_all_df)) { | |
sig_transition <- sprintf("transition_%d_padj", unlist(diff_expr_all_df[row_idx, "key_transition"])) | |
sig_log2FC <- sprintf("transition_%d_log2FC", unlist(diff_expr_all_df[row_idx, "key_transition"])) | |
diff_expr_all_df[row_idx, "positive_delta"] <- as.logical(unlist(diff_expr_all_df[row_idx, sig_log2FC] > 0)) | |
if ((! is.na(diff_expr_all_df[row_idx, sig_transition])) & unlist(diff_expr_all_df[row_idx, sig_transition]) <= .1) { | |
diff_expr_all_df[row_idx, "passes_threshold"] <- TRUE | |
} | |
} | |
# diff_expr_all_df_old <- read_csv("diff_expr_all_df_one_v_many.csv") | |
# diff_expr_all_df <- left_join(diff_expr_all_df, diff_expr_all_df_old %>% dplyr::select(ens_id, | |
# gene, | |
# chr, | |
# start_position, | |
# end_position, | |
# strand), by="ens_id") | |
diff_expr_all_df <- diff_expr_all_df %>% filter((! is.na(chr)) & (chr != "MT" & chr != "Y")) | |
write_csv(diff_expr_all_df, "diff_expr_all_df.csv") | |
gene_counts_df <- diff_expr_all_df %>% filter(passes_threshold == T) %>% count(key_transition, positive_delta) | |
# positions <- diff_expr_all_df %>% dplyr::select(chrom=chr, start=start_position, end=end_position, strand) | |
# positions$chrom <- paste("chr", positions$chrom, sep="") | |
# positions[positions$strand == 1, "strand"] <- "+" | |
# positions[positions$strand == -1, "strand"] <- "-" | |
# positions <- positions %>% makeGRangesFromDataFrame | |
# mapped_tbl <- nearest(positions, genes(TxDb.Hsapiens.UCSC.hg38.knownGene)) | |
# gene_id_df <- as.data.frame(org.Hs.egSYMBOL) %>% column_to_rownames('gene_id') | |
# diff_expr_all_df$closest_gene <- gene_id_df[genes(TxDb.Hsapiens.UCSC.hg38.knownGene)[mapped_tbl,]$gene_id,] | |
``` | |
## Looking at XIST Expression | |
A function to look at expression of specific genes based on the clusters. We primarily use it to check XIST expression | |
```{r} | |
library("ggplot2") | |
sample_table_all <- sampleTable | |
sample_table_all$condition <- factor(sample_table_all$condition) | |
ddsHTSeq_all <- DESeqDataSetFromHTSeqCount(sampleTable = sample_table_all, | |
directory = directory, | |
design= ~ condition) | |
keep <- rowSums(counts(ddsHTSeq_all)) >= 10 | |
ddsHTSeq_all_filtered <- ddsHTSeq_all[keep,] | |
dds_all <- DESeq(ddsHTSeq_all_filtered) | |
library(stringr) | |
counts_all <- counts(dds_all, normalized=TRUE) | |
counts_all_df <- as.data.frame(counts_all) %>% rownames_to_column(var="gene_id") | |
counts_all_df <- counts_all_df %>% mutate(gene_id_nover = str_replace(gene_id, "\\.[0-9]*", "")) | |
sample_table_all <- column_to_rownames(sample_table_all, var="sampleName") | |
hg38_mart <- useMart('ensembl', dataset='hsapiens_gene_ensembl') | |
hg38_biomart_gene_df <- getBM(mart=hg38_mart, | |
attributes=c( 'ensembl_gene_id', 'hgnc_symbol')) | |
counts_all_df <- left_join(counts_all_df, hg38_biomart_gene_df %>% | |
dplyr::select(ensembl_gene_id, gene_name=hgnc_symbol), by=c("gene_id_nover"="ensembl_gene_id")) %>% | |
dplyr::select(-gene_id_nover, -gene_id) | |
write_csv(counts_all_df, "counts_all_df.csv") | |
counts_all_df <- read_csv("counts_all_df.csv") | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
count_cluster_means <- counts_all_df %>% mutate( | |
cluster_A_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 0 & sample_name %in% colnames(counts_all_df))$sample_name))), | |
cluster_B_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 1 & sample_name %in% colnames(counts_all_df))$sample_name))), | |
cluster_C_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 2 & sample_name %in% colnames(counts_all_df))$sample_name))), | |
cluster_D_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 3 & sample_name %in% colnames(counts_all_df))$sample_name))), | |
cluster_F_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 5 & sample_name %in% colnames(counts_all_df))$sample_name))) | |
) %>% | |
select(gene_name, cluster_A_mean, cluster_B_mean, cluster_C_mean, cluster_D_mean, cluster_F_mean) | |
write_csv(count_cluster_means, "counts_all_by_cluster.csv") | |
cluster_tib <- column_to_rownames(cluster_tib, "sample_name") | |
ggplot(count_cluster_means %>% reshape2::melt(id.vars="gene_name")) + geom_violin(aes(x=gene_name, y=value, fill=variable)) | |
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv") | |
batch_corrected_cluster_means <- batch_corrected_vsd_df %>% mutate( | |
cluster_A_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 0 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))), | |
cluster_B_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 1 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))), | |
cluster_C_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 2 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))), | |
cluster_D_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 3 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))), | |
cluster_F_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 5 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))) | |
) %>% | |
dplyr::select(ensembl_id, hgnc_symbol=hgnc_symbol.y, cluster_A_mean, cluster_B_mean, cluster_C_mean, cluster_D_mean, cluster_F_mean) | |
write_csv(batch_corrected_cluster_means, "batch_corrected_cluster_means.csv") | |
batch_corrected_cluster_means <- read_csv("batch_corrected_cluster_means.csv") | |
table_3_expression <- left_join(table_3_expression, batch_corrected_cluster_means, by=c("ens_id"="ensembl_id")) | |
write_csv(table_3_expression %>% dplyr::select(`Ensembl ID`=ens_id, | |
`Gene Symbol` = gene, | |
`Chromosome` = chr, | |
`Start Position`=start_position, | |
`End Position`=end_position, | |
`Strand`=strand, | |
`Cluster A Mean`=cluster_A_mean, | |
`Cluster B Mean`=cluster_B_mean, | |
`Cluster C Mean`=cluster_C_mean, | |
`Cluster D Mean`=cluster_D_mean, | |
`Cluster F Mean`=cluster_F_mean, | |
`Transition 1 log2FoldChange`=transition_1_log2FC, | |
`Transition 1 p-adj`=transition_1_padj, | |
`Transition 2 log2FoldChange`=transition_2_log2FC, | |
`Transition 2 p-adj`=transition_2_padj, | |
`Transition 3 log2FoldChange`=transition_3_log2FC, | |
`Transition 3 p-adj`=transition_3_padj, | |
`Transition 4/5 log2FoldChange`=transition_4_log2FC, | |
`Transition 4/5 p-adj`=transition_4_padj, | |
`Most Significant Transition`=key_transition, | |
`Passes Significance Threshold`=passes_threshold, | |
`Positive log2FC`=positive_delta, | |
kielens_log2FC, | |
kielens_padj, | |
`kielens_T2iLGö_mean`=`T2iLGö`, | |
`kielens_KSR+FGF2_mean`=`KSR+FGF2`), | |
"table_3_expression_all_v_all.csv") | |
table_2_expr <- full_join(diff_expr_all_df, batch_corrected_cluster_means, by=c("ens_id"="ensembl_id")) | |
write_csv(table_2_expr %>% select(`Ensembl ID`=ens_id, | |
`Gene Symbol` = hgnc_symbol, | |
`Chromosome` = chr, | |
`Start Position`=start_position, | |
`End Position`=end_position, | |
`Strand`=strand, | |
`Cluster A Mean`=cluster_A_mean, | |
`Cluster B Mean`=cluster_B_mean, | |
`Cluster C Mean`=cluster_C_mean, | |
`Cluster D Mean`=cluster_D_mean, | |
`Cluster F Mean`=cluster_F_mean, | |
`Transition 1 log2FoldChange`=transition_1_log2FC, | |
`Transition 1 p-adj`=transition_1_padj, | |
`Transition 2 log2FoldChange`=transition_2_log2FC, | |
`Transition 2 p-adj`=transition_2_padj, | |
`Transition 3 log2FoldChange`=transition_3_log2FC, | |
`Transition 3 p-adj`=transition_3_padj, | |
`Transition 4/5 log2FoldChange`=transition_4_log2FC, | |
`Transition 4/5 p-adj`=transition_4_padj, | |
`Most Significant Transition`=key_transition, | |
`Passes Significance Threshold`=passes_threshold, | |
`Positive log2FC`=positive_delta), | |
"table_3_expression_all_v_all.csv") | |
ggplot(batch_corrected_cluster_means %>% reshape2::melt(id.vars="hgnc_symbol")) + geom_violin(aes(x=variable, y=value, color=variable)) | |
ggplot(batch_corrected_cluster_means %>% reshape2::melt(id.vars="hgnc_symbol")) + geom_density(aes(value, color=variable)) | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
plot_gene_expression <- function(gene_name, hgnc_symbol=T, multiple=F, batch_corrected_vsd_df=NA, show_significance=F) { | |
if(is.na(batch_corrected_vsd_df)) { | |
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv.gz") | |
} | |
# gene_id <- as.character(unlist(hg38_biomart_gene_df %>% dplyr::filter(hgnc_symbol == gene_name) %>% dplyr::select(ensembl_gene_id))) | |
# print(gene_id) | |
# gene_expression <- plotCounts(dds_all, gene=gene_id, intgroup="condition", | |
# returnData=TRUE) | |
# gene_expression <- reshape2::melt(filter(counts_all_df, gene_name == gene_name) %>% dplyr::select(-gene_name)) | |
# gene_expression <- tibble(expression = as.numeric(filter(counts_all_df, gene_name == !!gene_name) %>% dplyr::select(-gene_name)), cluster= as.character(cluster_tib[colnames(counts_all_df %>% dplyr::select(-gene_name)), "cluster"])) | |
sample_names <- colnames(batch_corrected_vsd_df %>% | |
dplyr::select(-c(hgnc_symbol, | |
ensembl_id, | |
ensembl_id_no_version, | |
end_position, | |
start_position, | |
description, | |
strand, | |
chromosome_name))) | |
gene_expression <- batch_corrected_vsd_df %>% | |
filter(hgnc_symbol %in% !!gene_name) %>% | |
reshape2::melt(id.vars=c("hgnc_symbol"), measure.vars=sample_names, variable.name="sample_name", value.name="expression") %>% | |
left_join(cluster_tib, by="sample_name") %>% | |
mutate(cluster=as.character(cluster)) | |
if (hgnc_symbol == F) { | |
gene_expression <- batch_corrected_vsd_df %>% | |
filter(ensembl_id_no_version %in% !!gene_name) %>% | |
reshape2::melt(id.vars=c("hgnc_symbol"), measure.vars=sample_names, variable.name="sample_name", value.name="expression") %>% | |
left_join(cluster_tib, by="sample_name") | |
} | |
expr_plot <- ggplot() | |
if (multiple == T) { | |
expr_plot <- ggplot(gene_expression, aes(x=cluster, y=expression)) + | |
geom_boxplot(aes(alpha=cluster, fill="female", group=factor(cluster)),outlier.shape = NA, size=.3) + | |
geom_point(position=position_jitter(w=0.1,h=0), size=.5, shape=20) + | |
facet_grid(rows = vars(factor(hgnc_symbol, levels=gene_name)), scale="free") + | |
# scale_y_log10() + | |
labs(x="Cluster", | |
y='Log2 Counts (VST+Batch Corrected)', | |
title="Expression") + | |
theme(plot.title = element_text(hjust = 0.5)) + | |
theme_bw() + | |
scale_fill_npg(guide=F) + | |
scale_alpha_discrete(guide=F) + | |
scale_x_discrete(limits=c("0","1","2","3","5"), labels=c("A", "B", "C", "D", "F")) + | |
theme(strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")), | |
strip.background = element_rect(size = .3)) | |
}else { | |
expr_plot <- ggplot(gene_expression, aes(x=cluster, y=expression)) + | |
geom_boxplot(aes(alpha=cluster, fill="female"),outlier.shape = NA, size=.3) + | |
geom_point(position=position_jitter(w=0.1,h=0), size=.5, shape=20) + | |
# scale_y_log10() + | |
labs(x="Cluster", | |
y='Log2 Counts (VST+Batch Corrected)', | |
title=paste(gene_name, "Expression", sep = " ")) + | |
theme(plot.title = element_text(hjust = 0.5)) + | |
theme_bw() + | |
scale_fill_npg(guide=F) + | |
scale_alpha_discrete(guide=F) + | |
scale_x_discrete(limits=c("0","1","2","3","5"), labels=c("A", "B", "C", "D", "F")) | |
} | |
return(expr_plot) | |
} | |
xist_expr_plt <- plot_gene_expression("XIST") | |
ggsave("figs/main/fig_2e.svg", xist_expr_plt + base_plot_theme, width = 85, height = 35, units = "mm") | |
xist_expr_plt | |
plot_gene_expression("FIRRE") | |
rex_genes <- c("ZFP42", | |
"RLIM", | |
"YY1", | |
"POU5F1", | |
"NANOG", | |
"SOX2", | |
"KLF4", | |
"MYC") | |
rex1_expr <- plot_gene_expression("ZFP42") | |
for (g in rex_genes) { | |
plt <- plot_gene_expression(g) | |
ggsave(sprintf("figs/rex1_genes/%s.svg", g), plt + base_plot_theme, width = 85, height = 60.646, units = "mm") | |
} | |
#rex1 expression without outliers | |
cluster_tib <- column_to_rownames(cluster_tib, "sample_name") | |
gene_expression <- tibble(expression = as.numeric(filter(batch_corrected_vsd_df, hgnc_symbol == "ZFP42") %>% | |
dplyr::select(-c(hgnc_symbol, | |
ensembl_id, | |
ensembl_id_no_version, | |
end_position, | |
start_position, | |
description, | |
strand, | |
chromosome_name))), | |
cluster= as.character(cluster_tib[colnames(batch_corrected_vsd_df %>% | |
dplyr::select(-c(hgnc_symbol, | |
ensembl_id, | |
ensembl_id_no_version, | |
end_position, | |
start_position, | |
description, | |
strand, | |
chromosome_name))), "cluster"])) | |
rex1_expr <- ggplot(gene_expression %>% filter(expression > 7), aes(x=cluster, y=expression)) + | |
geom_boxplot(aes(alpha=cluster, fill="female"),outlier.shape = NA, size=.3) + | |
geom_point(position=position_jitter(w=0.1,h=0), size=.5, shape=20) + | |
labs(x="Cluster", | |
y='Log2 Counts\n(VST+Batch Corrected)', | |
title=paste("REX1/ZFP42", "Expression", sep = " ")) + | |
theme(plot.title = element_text(hjust = 0.5)) + | |
theme_bw() + | |
scale_fill_npg(guide=F) + | |
scale_alpha_discrete(guide=F) + | |
scale_x_discrete(limits=c("0","1","2","3","5"), labels=c("A", "B", "C", "D", "F")) + | |
base_plot_theme | |
ggsave("figs/supplementary/rex1_nooutliers.pdf", rex1_expr, width = 40, height = 35, units = "mm") | |
klf4_expr <- plot_gene_expression("KLF4") + labs( y='Log2 Counts\n(VST+Batch Corrected)') + base_plot_theme | |
ggsave("figs/supplementary/klf4_expr.pdf", klf4_expr, width = 40, height = 35, units = "mm") | |
# expression for only banovich samples: | |
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv.gz") | |
banovich_batch_corrected_vsd_df <- batch_corrected_vsd_df %>% | |
dplyr::select(ensembl_id, ensembl_id_no_version,hgnc_symbol,description,strand,end_position,start_position,chromosome_name, starts_with("Banovich_")) | |
banovich_xist_expr_plt <- plot_gene_expression("XIST", batch_corrected_vsd_df=banovich_batch_corrected_vsd_df) + labs(title="ATAC-only samples XIST Expr") | |
banovich_firre_expr_plt <- plot_gene_expression("FIRRE", batch_corrected_vsd_df=banovich_batch_corrected_vsd_df) + labs(title="ATAC-only samples FIRRE Expr") | |
ggsave("figs/review images/fig_2d_atac_only.png", banovich_xist_expr_plt + base_plot_theme, width = 43, height = 64,units = "mm") | |
ggsave("figs/review images/fig_2e_atac_only.png", banovich_firre_expr_plt + base_plot_theme, width = 43, height = 64, units = "mm") | |
``` | |
## Looking for pluripotency genes | |
```{r} | |
pluripotency_genes <- c( | |
"DPPA3", | |
"PRDM14", | |
"ZFP42", | |
"KLF4", | |
"UHRF1", | |
"FOXD3", | |
"POU5F1", | |
"SALL4", | |
"MYC", | |
"GDF3", | |
"FGF2", | |
"DNMT3A", | |
"DNMT3B", | |
"GAL", | |
"LIN28A", | |
"DPPA4", | |
"TDGF1", | |
"TNFRSF8", | |
"TET1", | |
"TET2", | |
"NANOG", | |
"ALPL", | |
"E2F1", | |
"PODXL", | |
"TERT", | |
"UTF1" | |
) | |
plot_df <- counts_all_df %>% | |
filter(gene_name %in% pluripotency_genes) %>% | |
reshape2::melt(id.vars=c("gene_name")) | |
plot_df <- left_join(plot_df, rownames_to_column(cluster_tib, "sample_name"), by=c("variable"="sample_name")) | |
ggplot(plot_df, aes(y=value, x=cluster)) + | |
geom_boxplot(aes(fill=as.factor(cluster)), alpha=.2) + | |
geom_smooth() + | |
scale_fill_aaas() + | |
facet_wrap(facets = vars(gene_name), scales="free_y") | |
# kielens | |
# continuous_expr_change <- read_csv("continuous_expr_change.csv") | |
# continous_expr_no_F <- read_csv("diff_expr_continuous_no_cluster_F.csv") | |
# kielens_genes <- read_tsv("random_checks/kielens_genes.csv") | |
kielens_genes <- read_tsv("random_checks/all_kielens_genes.csv") %>% | |
dplyr::select(gene=`Gene Symbol`, | |
kielens_log2FC=`Log2(Fold-Change)`, | |
kielens_padj=`Adjusted p-value`) | |
# diff_expr_all_df <- read_csv("diff_expr_all_df.csv") | |
# table_3_expression <- diff_expr_all_df | |
table_3_expression <- read_csv("table_3_expression_all_v_all.csv.gz") | |
# kielens_genes_2 <- read_tsv("random_checks/kielens_2.csv") | |
# kielens_genes <- kielens_genes_2 %>% rename(gene=gene_symbol, log2FC=kielen_log2FC) | |
kielens_common_genes <- intersect(table_3_expression$`Gene Symbol`, kielens_genes$gene) | |
# kielen_expr_change_df <- continuous_expr_change %>% | |
# filter(hgnc_symbol %in% kielens_genes$gene) | |
# | |
# kielens_genes <- kielens_genes %>% rename(kielen_log2FC=log2FC) | |
kielen_expr_change_df <- inner_join(table_3_expression, kielens_genes, by=c("Gene Symbol"="gene")) | |
table_3_expression <- left_join(table_3_expression, kielens_genes, by=c("Gene Symbol"="gene")) | |
# add kielens counts | |
kielens_cell_lines <- read_csv("random_checks/kielens_cell_lines.csv") | |
kielens_expression_data <- read_csv("random_checks/kielens_counts.csv") | |
library(broom) | |
expression_cell_lines <- kielens_expression_data %>% | |
dplyr::select(-`Gene Symbol`, -overlap, -`Log2(Fold-Change)`, -`Adjusted p-value`, -`False discovery Rate`) %>% | |
colnames() %>% | |
tidy() %>% | |
mutate(cell_line=str_replace(x, "_P.*", ""), | |
passage=str_replace(x, ".*_P", "")) %>% | |
left_join(kielens_cell_lines, by=c("cell_line")) | |
kielens_expression_counts_df <- kielens_expression_data %>% | |
dplyr::select(-overlap, -`Adjusted p-value`, -`False discovery Rate`) %>% | |
reshape2::melt(id.vars=c("Gene Symbol", "Log2(Fold-Change)"), value.name="expression_count", variable.name="cell_line_name") %>% | |
left_join(expression_cell_lines, by=c("cell_line_name"="x")) | |
kielens_expression_counts_df <- kielens_expression_counts_df %>% | |
filter(growth_media %in% c("T2iLGö", "KSR+FGF2")) %>% | |
group_by(growth_media, `Gene Symbol`) %>% | |
summarise(mean_expression=mean(expression_count)) | |
kielens_expression_counts_df_casted <- reshape2::dcast(kielens_expression_counts_df, `Gene Symbol` ~ growth_media) | |
table_3_expression <- left_join(table_3_expression, kielens_expression_counts_df_casted, by=c("Gene Symbol")) | |
cluster_F_vs_A_expr <- read_csv("cluster_F_vs_A_expr.csv") %>% | |
dplyr::select(ensembl_id, | |
`log2(F/A)`=log2FoldChange, | |
`padj (F/A)`=padj) | |
kielen_expr_change_df <- left_join(kielen_expr_change_df, cluster_F_vs_A_expr, by=c("Ensembl ID"="ensembl_id")) | |
cluster_D_vs_A_expr <- read_csv("cluster_D_vs_A_expr.csv") %>% | |
dplyr::select(ensembl_id, | |
`log2(D/A)`=log2FoldChange, | |
`padj (D/A)`=padj) | |
kielen_expr_change_df <- left_join(kielen_expr_change_df, cluster_D_vs_A_expr, by=c("Ensembl ID"="ensembl_id")) | |
temp_df <- table_3_expression %>% filter(`Transition 4/5 p-adj` <= .1 & Chromosome != "X") | |
#%>% mutate(`F_minus_A`=`Cluster F Mean`-`Cluster A Mean`) | |
corr_log2fc <- cor.test(temp_df$kielens_log2FC, | |
temp_df$`Transition 4/5 log2FoldChange`, | |
method = "spearman") | |
kielens_log2_plot <- ggplot(table_3_expression %>% filter(`Transition 4/5 p-adj` <= .1 & Chromosome != "X"), | |
aes(x=kielens_log2FC, y=`Transition 4/5 log2FoldChange`, color=`Transition 4/5 p-adj`)) + | |
geom_point(size=.7) + | |
scale_color_viridis_c() + | |
geom_text(label=sprintf("Spearman Corr: %.2f",corr_log2fc$estimate), x=-1, y=3.5, size=1.4, fontface = "plain", color="black") + | |
scale_y_continuous(limits = c(-2,5)) + | |
labs(title="Kilens, et al. 2018 vs Tran. 4/5", | |
color="Tran. 4/5 p-adj.", | |
x="Kilens, et al. 2018 log2(F.C.)", | |
y="Transition 4/5 log2(F.C.)") + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.key.width = unit(4, units="mm"), | |
legend.title=element_text(vjust=.95), | |
legend.box.margin=margin(l=-6,t=-3, unit="mm")) | |
ggsave("figs/main/fig4_kilens_log2_tran4_correlation.pdf",kielens_log2_plot,width = 57.855, height=58, units = "mm") | |
#%>% mutate(`F_minus_A`=`Cluster F Mean`-`Cluster A Mean`) | |
corr_counts <- cor.test(temp_df$kielens_T2iLGö_mean - temp_df$`kielens_KSR+FGF2_mean`, | |
temp_df$`Cluster F Mean`-temp_df$`Cluster A Mean`, | |
method = "spearman") | |
kielens_counts_plot <- ggplot(table_3_expression %>% filter(`Transition 4/5 p-adj` <= .1 & Chromosome != "X"), | |
aes(x=kielens_T2iLGö_mean - `kielens_KSR+FGF2_mean` , y=`Cluster F Mean`-`Cluster A Mean`, color=`Transition 4/5 p-adj`)) + | |
geom_point(size=.7) + | |
scale_color_viridis_c() + | |
scale_x_continuous(limits=c(-250,500), trans = scales::pseudo_log_trans()) + | |
scale_y_continuous(limits=c(-2,3)) + | |
geom_text(aes(label=sprintf("Spearman Corr: %.2f",corr_counts$estimate), x=-50, y=2), size=2) + | |
labs(title="Kielens vs Transiton 4/5 By Counts", | |
color="Transition 4/5 p-adjusted", | |
x="Kielens Naive - Primed Counts", | |
y="Cluster F - Cluster A Log2 Counts (VST-Batch Corrected)") + | |
theme_bw() + | |
base_plot_theme + | |
theme(legend.key.width = unit(4, units="mm")) | |
ggsave("figs/supplementary/kielens_counts_clusterF_correlation.svg",kielens_counts_plot,width = 87, height=87, units = "mm") | |
kielens_barplot_df <- table_3_expression %>% | |
filter(`Transition 4/5 p-adj` <= .1 & !is.na(`kielens_log2FC`)) %>% | |
arrange(desc(`kielens_log2FC`)) %>% | |
dplyr::select(`Gene Symbol`, `Transition 4/5 p-adj`, `Chromosome`, `kielens_log2FC`, `Transition 4/5 log2FoldChange`) %>% | |
reshape2::melt(id.vars=c("Gene Symbol", "Transition 4/5 p-adj", "Chromosome")) | |
positive_genes <- (kielens_barplot_df %>% filter(variable == "kielens_log2FC" & value > 0))$`Gene Symbol` | |
negative_genes <- (kielens_barplot_df %>% filter(variable == "kielens_log2FC" & value < 0))$`Gene Symbol` | |
kielens_barplot_labels_df_pos <- kielens_barplot_df %>% | |
filter(Chromosome != "X" & `Gene Symbol` %in% positive_genes) %>% | |
group_by(`Gene Symbol`) %>% | |
summarise(pos=max(abs(value))*sign(prod(value))) | |
kielens_barplot_labels_df_neg <- kielens_barplot_df %>% | |
filter(Chromosome != "X" & `Gene Symbol` %in% negative_genes) %>% | |
group_by(`Gene Symbol`) %>% | |
summarise(pos=-1*max(abs(value))*sign(prod(value))) | |
kielens_barplot <- ggplot(kielens_barplot_df %>% filter(Chromosome != "X")) + | |
geom_bar(aes(x=factor(`Gene Symbol`, levels=rev(unique(kielens_barplot_df$`Gene Symbol`))), y=value, fill=variable), | |
stat="identity", | |
position=position_dodge()) + | |
geom_text(data= kielens_barplot_labels_df_pos, | |
aes(x=factor(`Gene Symbol`, levels=rev(unique(kielens_barplot_df$`Gene Symbol`))), | |
label=`Gene Symbol`, | |
y=pos), hjust=0, nudge_x = 0, nudge_y = .5, size=1.2) + | |
geom_text(data= kielens_barplot_labels_df_neg, | |
aes(x=factor(`Gene Symbol`, levels=rev(unique(kielens_barplot_df$`Gene Symbol`))), | |
label=`Gene Symbol`, | |
y=pos), hjust=1,nudge_x = 0, nudge_y = -.5, size=1.2) + | |
coord_flip() + | |
scale_fill_uchicago(limits=c("Transition 4/5 log2FoldChange", "kielens_log2FC"), | |
labels=c("Transition 4/5", "Kilens")) + | |
labs(x="Genes", y="log2(Fold-Change)", title="Fold Change of Kilens, et al. 2018 Genes", fill="Dataset") + | |
theme_bw() + | |
base_plot_theme + | |
scale_x_discrete(breaks = NULL) + | |
scale_y_continuous(limits=c(-7.5,10)) | |
ggsave("figs/main/fig4_kilens_barplot.svg",kielens_barplot,width = 58, height=174, units = "mm") | |
#kielens samples heatmap, and our female samples heatmap using the expression data | |
kielens_cell_lines <- read_csv("random_checks/kielens_cell_lines.csv") | |
kielens_expression_data <- read_csv("random_checks/kielens_counts.csv") | |
kielens_cell_lines <- kielens_cell_lines %>% filter(growth_media %in% c("KSR+FGF2", "T2iLGö")) | |
kielens_expression_data_filtered <- kielens_expression_data %>% | |
select(`Gene Symbol`, matches(paste(sprintf("%s.*", kielens_cell_lines$cell_line), collapse="|"))) | |
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv") | |
batch_corrected_vsd_df <- batch_corrected_vsd_df %>% | |
select(-c(ensembl_id, | |
ensembl_id_no_version, | |
end_position, | |
start_position, | |
description, | |
strand)) | |
kielens_and_cluster_samples <- inner_join(batch_corrected_vsd_df, kielens_expression_data_filtered, by=c("hgnc_symbol"="Gene Symbol")) | |
kielens_and_cluster_samples_heatmap_df <- kielens_and_cluster_samples %>% | |
reshape2::melt(id.vars=c("hgnc_symbol", "chromosome_name")) | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
kielens_and_cluster_samples_heatmap_df <- kielens_and_cluster_samples_heatmap_df %>% | |
left_join(cluster_tib, by=c("variable"="sample_name")) | |
naive_cell_lines <- filter(kielens_cell_lines, growth_media == "T2iLGö")$cell_line | |
primed_cell_lines <- filter(kielens_cell_lines, growth_media == "KSR+FGF2")$cell_line | |
kielens_and_cluster_samples_heatmap_df[grepl(paste(sprintf("%s.*", primed_cell_lines), collapse="|"), kielens_and_cluster_samples_heatmap_df$variable), "cluster"] <- "KSR+FGF2" | |
kielens_and_cluster_samples_heatmap_df[grepl(paste(sprintf("^%s", naive_cell_lines), collapse="|"), kielens_and_cluster_samples_heatmap_df$variable), "cluster"] <- "T2iLGö" | |
gene_order <- unique(kielens_barplot_df$`Gene Symbol`) | |
kielens_heatmap <- ggplot(kielens_and_cluster_samples_heatmap_df %>% filter(chromosome_name != "X", !is.na(hgnc_symbol))) + | |
facet_grid(cols=vars(cluster), scales="free_x") + | |
geom_tile(mapping=aes(x=variable, y=factor(hgnc_symbol, levels = gene_order), fill=value)) | |
ggplot(kielen_expr_change_df %>% filter(`padj (F/A)` <= .1), aes(x=kielens_log2FC, y=`Cluster F Mean`-`Cluster A Mean`, color=`padj (F/A)`)) + | |
geom_point() + | |
scale_color_viridis_c() + | |
scale_y_continuous(trans="pseudo_log", limits = c(-2.5,2.5)) + | |
labs(title="kielens_log2FC vs `Cluster F Mean`-`Cluster A Mean`") | |
cor.test(temp_df$kielens_log2FC, | |
temp_df$F_minus_A, | |
method = "spearman") | |
ggplot(kielen_expr_change_df %>% filter(`padj (D/A)` <= .1), aes(x=kielens_log2FC, y=`log2(D/A)`, color=`padj (D/A)`)) + | |
geom_point() + | |
scale_color_viridis_c() + | |
labs(title="kielens_log2FC vs log2(clusterD/A)") | |
ggplot(kielen_expr_change_df %>% filter(`padj (D/A)` <= .1), aes(x=kielens_log2FC, y=`Cluster D Mean`-`Cluster A Mean`, color=`padj (D/A)`)) + | |
geom_point() + | |
scale_color_viridis_c() + | |
scale_y_continuous(trans="pseudo_log") + | |
labs(title="kielens_log2FC vs `Cluster D Mean`-`Cluster A Mean`") | |
# look for progressive selection of naive vs primed genes | |
pairwise_differential_expression <- read_csv("pairwise_differential_expression.csv") | |
naive_kielens_genes <- kielens_genes %>% filter(kielens_log2FC > 0) | |
plot_df <- pairwise_differential_expression %>% | |
filter(hgnc_symbol %in% naive_kielens_genes$gene) %>% | |
dplyr::select(hgnc_symbol, | |
tran_1_log2FC, | |
tran_2_log2FC, | |
tran_3_log2FC, | |
tran_45_log2FC) %>% | |
reshape2::melt(id.vars="hgnc_symbol") | |
ggplot(plot_df, aes(x=variable, y=value)) + | |
geom_violin() + | |
labs(title="Changes in naive genes (kielens_log2FC > 0)") | |
plot_df <- kielen_expr_change_df %>% | |
filter(`Gene Symbol` %in% naive_kielens_genes$gene) %>% | |
dplyr::select(`Gene Symbol`, | |
`Cluster A Mean`, | |
`Cluster B Mean`, | |
`Cluster C Mean`, | |
`Cluster D Mean`, | |
`Cluster F Mean`) %>% | |
reshape2::melt(id.vars="Gene Symbol") | |
ggplot(plot_df, aes(x=variable, y=value)) + | |
geom_violin() + | |
labs(title="Counts of naive genes (kielens_log2FC > 0)") | |
table_3_expression_naive_genes <- kielen_expr_change_df %>% filter(`Gene Symbol` %in% naive_kielens_genes$gene) | |
table_3_expression_naive_genes <- table_3_expression_naive_genes %>% | |
dplyr::transmute(`padj (F/A)`=`padj (F/A)`, | |
`Gene Symbol`=`Gene Symbol`, | |
cluster_A_change = 0, | |
cluster_B_change = `Cluster B Mean` - `Cluster A Mean`, | |
cluster_C_change = `Cluster C Mean` - `Cluster A Mean`, | |
cluster_D_change = `Cluster D Mean` - `Cluster A Mean`, | |
cluster_F_change = `Cluster F Mean` - `Cluster A Mean`) %>% | |
reshape2::melt(id.vars=c("Gene Symbol", "padj (F/A)")) | |
ggplot(table_3_expression_naive_genes %>% filter(`padj (F/A)` <= .1), aes(x=variable, y=value, group=`Gene Symbol`, color=`padj (F/A)`)) + | |
geom_line() + | |
scale_color_viridis_c() | |
primed_kielens_genes <- kielens_genes %>% filter(kielens_log2FC < 0) | |
table_3_expression_primed_genes <- kielen_expr_change_df %>% filter(`Gene Symbol` %in% primed_kielens_genes$gene) | |
table_3_expression_primed_genes <- table_3_expression_primed_genes %>% | |
dplyr::transmute(`padj (F/A)`=`padj (F/A)`, | |
`Gene Symbol`=`Gene Symbol`, | |
cluster_A_change = 0, | |
cluster_B_change = `Cluster B Mean` - `Cluster A Mean`, | |
cluster_C_change = `Cluster C Mean` - `Cluster A Mean`, | |
cluster_D_change = `Cluster D Mean` - `Cluster A Mean`, | |
cluster_F_change = `Cluster F Mean` - `Cluster A Mean`) %>% | |
reshape2::melt(id.vars=c("Gene Symbol", "padj (F/A)")) | |
ggplot(table_3_expression_primed_genes %>% filter(`padj (F/A)` <= .1), aes(x=variable, y=value, group=`Gene Symbol`, color=`padj (F/A)`)) + | |
geom_line() + | |
scale_color_viridis_c() | |
table_3_expression <- left_join(table_3_expression, cluster_F_vs_A_expr, by=c("Ensembl ID"="ensembl_id")) | |
table_3_expression_non_prime_non_naive_genes <- table_3_expression %>% filter(!(`Gene Symbol` %in% c(naive_kielens_genes$gene, primed_kielens_genes$gene))) | |
table_3_expression_non_prime_non_naive_genes <- table_3_expression_non_prime_non_naive_genes %>% | |
dplyr::transmute(`padj (F/A)`=`padj (F/A)`, | |
`Gene Symbol`=`Gene Symbol`, | |
cluster_A_change = 0, | |
cluster_B_change = `Cluster B Mean` - `Cluster A Mean`, | |
cluster_C_change = `Cluster C Mean` - `Cluster A Mean`, | |
cluster_D_change = `Cluster D Mean` - `Cluster A Mean`, | |
cluster_F_change = `Cluster F Mean` - `Cluster A Mean`) %>% | |
reshape2::melt(id.vars=c("Gene Symbol", "padj (F/A)")) | |
non_prime_plot <- ggplot(table_3_expression_non_prime_non_naive_genes %>% filter(`padj (F/A)` <= .1), aes(x=variable, y=value, group=`Gene Symbol`, color=`padj (F/A)`)) + | |
geom_line() + | |
scale_color_viridis_c() | |
ggsave("figs/non_prime_non_naive.png", non_prime_plot) | |
primed_plot_df <- pairwise_differential_expression %>% | |
filter(hgnc_symbol %in% primed_kielens_genes$gene) %>% | |
dplyr::select(hgnc_symbol, | |
tran_1_log2FC, | |
tran_2_log2FC, | |
tran_3_log2FC, | |
tran_45_log2FC) %>% | |
reshape2::melt(id.vars="hgnc_symbol") | |
ggplot(primed_plot_df, aes(x=variable, y=value)) + | |
geom_violin() + | |
labs(title="Changes in primed genes (kielens_log2FC > 0)") | |
plot_df <- kielen_expr_change_df %>% | |
filter(`Gene Symbol` %in% primed_kielens_genes$gene) %>% | |
dplyr::select(`Gene Symbol`, | |
`Cluster A Mean`, | |
`Cluster B Mean`, | |
`Cluster C Mean`, | |
`Cluster D Mean`, | |
`Cluster F Mean`) %>% | |
reshape2::melt(id.vars="Gene Symbol") | |
ggplot(plot_df, aes(x=variable, y=value)) + | |
geom_violin() + | |
labs(title="Counts of primed genes (kielens_log2FC > 0)") | |
ggplot(kielen_expr_change_df, aes(x=kielen_log2FC, y=-log10(padj))) + | |
geom_point() + | |
labs(title="kielen_log2FC vs continuous diff exp padj") | |
ggplot(kielen_expr_change_df, aes(x=kielen_log2FC, y=log2FoldChange)) + | |
geom_point() + | |
labs(title="kielen_log2FC vs continuous diff exp log2FC") | |
kielen_expr_change_df <- left_join(kielen_expr_change_df, count_cluster_means, by=c("hgnc_symbol"="gene_name")) | |
kielen_counts_df <- reshape2::melt(kielen_expr_change_df %>% filter(padj <= .1) %>% select(hgnc_symbol, chromosome_name, kielen_log2FC, !!sprintf("cluster_%s_mean", c("A","B","C","D","F"))), id.vars=c("hgnc_symbol", "chromosome_name", "kielen_log2FC")) | |
ggplot(kielen_counts_df %>% filter(chromosome_name != "X"), aes(x=kielen_log2FC, y=value, group=hgnc_symbol)) + | |
# geom_point(aes(size=kielen_log2FC)) + | |
geom_point(aes(color=kielen_log2FC), alpha=.1) + | |
facet_grid(rows = vars(variable)) + | |
# scale_y_log10() + | |
scale_y_continuous(limits = c(0,5e3)) + | |
scale_color_viridis_c() | |
kielens_cell_lines <- read_csv("random_checks/kielens_cell_lines.csv") | |
kielens_expression_data <- read_csv("random_checks/kielens_counts.csv") | |
library(broom) | |
expression_cell_lines <- kielens_expression_data %>% | |
dplyr::select(-`Gene Symbol`, -overlap, -`Log2(Fold-Change)`, -`Adjusted p-value`, -`False discovery Rate`) %>% | |
colnames() %>% | |
tidy() %>% | |
mutate(cell_line=str_replace(x, "_P.*", ""), | |
passage=str_replace(x, ".*_P", "")) %>% | |
left_join(kielens_cell_lines, by=c("cell_line")) | |
kielens_expression_counts_df <- kielens_expression_data %>% | |
dplyr::select(-overlap, -`Adjusted p-value`, -`False discovery Rate`) %>% | |
reshape2::melt(id.vars=c("Gene Symbol", "Log2(Fold-Change)"), value.name="expression_count", variable.name="cell_line_name") %>% | |
left_join(expression_cell_lines, by=c("cell_line_name"="x")) | |
counts_all_by_cluster <- read_csv("counts_all_by_cluster.csv") | |
kielens_expression_counts_df_naive <- kielens_expression_counts_df %>% | |
filter(growth_media == "T2iLGö") %>% | |
group_by(`Gene Symbol`) %>% | |
summarise(mean_naive_expression=mean(expression_count), | |
kielens_log2FC=dplyr::first(`Log2(Fold-Change)`)) %>% | |
left_join(counts_all_by_cluster, by=c("Gene Symbol"="gene_name")) %>% | |
# reshape2::melt(id.vars=c("Gene Symbol", "mean_naive_expression"), variable.name="cluster", value.name="cluster_mean") %>% | |
left_join(continuous_expr_change %>% dplyr::select(hgnc_symbol, chromosome_name, padj), by=c("Gene Symbol"="hgnc_symbol") ) | |
kielens_expression_counts_df_naive <- left_join(kielens_expression_counts_df_naive, | |
continous_expr_no_F %>% dplyr::select(hgnc_symbol, no_F_padj=padj, no_F_log2FC=log2FoldChange), | |
by=c("Gene Symbol"="hgnc_symbol")) | |
ggplot(kielens_expression_counts_df_naive %>% filter(padj < .1 & chromosome_name != "X"), aes(x=mean_naive_expression, y=cluster_mean)) + | |
geom_point() + | |
facet_grid(rows = vars(cluster)) + | |
labs(title="Naive cells - T2iLGö") | |
cor.test((kielens_expression_counts_df_naive %>% | |
filter(no_F_padj.x < .1 & chromosome_name != "X") %>% | |
mutate(diff=mean_naive_expression-mean_primed_expression))$diff, | |
(kielens_expression_counts_df_naive %>% | |
filter(no_F_padj.x < .1 & chromosome_name != "X") %>% | |
mutate(diff=cluster_D_mean-cluster_A_mean))$diff, method="spearman") | |
cor.test((kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X") %>% | |
mutate(diff=mean_naive_expression-mean_primed_expression))$diff, | |
(kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X") %>% | |
mutate(diff=cluster_D_mean-cluster_A_mean))$diff, method="spearman") | |
cor.test((kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X") %>% | |
mutate(diff=mean_naive_expression-mean_primed_expression))$diff, | |
(kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X") %>% | |
mutate(diff=cluster_F_mean-cluster_A_mean))$diff, method="spearman") | |
cor.test((kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X") %>% | |
mutate(diff=mean_naive_expression-mean_primed_expression))$diff, | |
(kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X") %>% | |
mutate(diff=cluster_D_mean-cluster_A_mean))$diff) | |
cor.test((kielens_expression_counts_df_naive %>% | |
filter(no_F_padj < .1 & chromosome_name != "X"))$kielens_log2FC, | |
(kielens_expression_counts_df_naive %>% | |
filter(no_F_padj < .1 & chromosome_name != "X"))$no_F_log2FC, method="spearman") | |
cor.test((kielens_expression_counts_df_naive %>% | |
filter(no_F_padj < .1 & chromosome_name != "X"))$kielens_log2FC, | |
(kielens_expression_counts_df_naive %>% | |
filter(no_F_padj < .1 & chromosome_name != "X"))$log2FC, method="spearman") | |
kielens_expression_counts_df_primed <- kielens_expression_counts_df %>% | |
filter(growth_media %in% c("KSR+FGF2")) %>% | |
group_by(`Gene Symbol`) %>% | |
summarise(mean_primed_expression=mean(expression_count)) %>% | |
left_join(counts_all_by_cluster, by=c("Gene Symbol"="gene_name")) %>% | |
# reshape2::melt(id.vars=c("Gene Symbol", "mean_primed_expression"), variable.name="cluster", value.name="cluster_mean")%>% | |
left_join(continuous_expr_change %>% dplyr::select(hgnc_symbol, chromosome_name, padj), by=c("Gene Symbol"="hgnc_symbol") ) | |
ggplot(kielens_expression_counts_df_primed %>% filter(padj < .1 & chromosome_name != "X"), aes(x=mean_primed_expression, y=cluster_mean)) + | |
geom_point() + | |
facet_grid(rows = vars(cluster)) + | |
labs(title="Primed cells - KSR+FGF2") | |
kielens_expression_counts_df_naive <- left_join(kielens_expression_counts_df_naive, kielens_expression_counts_df_primed %>% dplyr::select(`Gene Symbol`, mean_primed_expression), by="Gene Symbol") | |
kielens_expression_counts_df_naive <- left_join(kielens_expression_counts_df_naive, | |
continous_expr_no_F %>% dplyr::select(hgnc_symbol, no_F_padj=padj, no_F_log2FC=log2FoldChange), | |
by=c("Gene Symbol"="hgnc_symbol")) | |
library(scales) | |
ggplot(kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X"), aes(x=mean_naive_expression-mean_primed_expression, y=cluster_D_mean-cluster_A_mean)) + | |
geom_point(aes(color=padj)) + | |
labs(title="naive-primed and clusterD-clusterA") + | |
scale_color_viridis_c() + | |
scale_x_continuous(trans = pseudo_log_trans(base = 2)) + | |
scale_y_continuous(trans = pseudo_log_trans(base = 2)) | |
ggplot(kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X"), aes(x=mean_naive_expression-mean_primed_expression, y=cluster_F_mean-cluster_A_mean)) + | |
geom_point(aes(color=padj)) + | |
labs(title="naive-primed and clusterF-clusterA") + | |
scale_color_viridis_c() + | |
scale_x_continuous(trans = pseudo_log_trans(base = 2)) + | |
scale_y_continuous(trans = pseudo_log_trans(base = 2)) | |
ggplot(kielens_expression_counts_df_naive %>% | |
filter(no_F_padj < .1 & chromosome_name != "X"), | |
aes(x=log2(mean_naive_expression/mean_primed_expression), | |
y=no_F_log2FC)) + | |
geom_point(aes(color=no_F_padj)) + | |
labs(title="log2(naive/primed) and log2(clusterD/clusterA)") + | |
scale_color_viridis_c() | |
ggplot(kielens_expression_counts_df_naive %>% | |
filter(padj < .1 & chromosome_name != "X"), | |
aes(x=log2(mean_naive_expression/mean_primed_expression), | |
y=log2(cluster_F_mean/cluster_A_mean))) + | |
geom_point(aes(color=padj)) + | |
labs(title="log2(naive/primed) and log2(clusterF/clusterA)") + | |
scale_color_viridis_c() | |
``` | |
## Methylation PCA Validation that cluster F expression represents entire cluster | |
```{r} | |
female_df_no_h9 <- read_csv("female_df_no_h9.csv") | |
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv") | |
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv") | |
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv') | |
sigvar_probes <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)] | |
# sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname) | |
cluster_F_vs_A_expr <- read_csv("cluster_F_vs_A_expr.csv") %>% | |
dplyr::select(ensembl_id, | |
`log2(F/A)`=log2FoldChange, | |
`padj (F/A)`=padj) | |
cluster_F_vs_A_expr$ensembl_id_no_ver <- gsub("\\..*", "", cluster_F_vs_A_expr$ensembl_id) | |
cutoff_1_gene_names <- get_genename_from_ensembl(cluster_F_vs_A_expr$ensembl_id_no_ver) | |
cluster_F_vs_A_expr <- left_join(cluster_F_vs_A_expr, cutoff_1_gene_names, by=c("ensembl_id_no_ver" = "ensembl_gene_id")) | |
cutoff_1_genes <- filter(cluster_F_vs_A_expr, `padj (F/A)` < .1) | |
cutoff_1_probes <- filtered_site_annotation_df %>% filter(hg38_gene_name %in% cutoff_1_genes$hgnc_symbol) | |
female_df_no_h9_cutoff_1 <- female_df_no_h9 %>% filter(rn %in% cutoff_1_probes$rowname) | |
female_df_no_h9_cutoff_1_mat <- t(as.matrix(female_df_no_h9_cutoff_1 %>% column_to_rownames(var="rn"))) | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
cluster_tib <- left_join(cluster_tib, | |
sample_annotation_df %>% | |
dplyr::select(sample_name, sra_accession), | |
by="sample_name") | |
cluster_tib <- column_to_rownames(cluster_tib, "sample_name") | |
library(ggfortify) | |
cluster_tib$have_expression <- !is.na(cluster_tib$sra_accession) | |
cluster_tib$cluster <- factor(cluster_tib$cluster, levels = c(0,1,2,3,4,5)) | |
autoplot(prcomp(female_df_no_h9_cutoff_1_mat), data=cluster_tib, colour="cluster", shape="have_expression") | |
female_df_no_h9_cutoff_1_sigvar <- female_df_no_h9 %>% filter(rn %in% cutoff_1_probes$rowname & rn %in% sigvar_probes) | |
female_df_no_h9_cutoff_1_sigvar_mat <- t(as.matrix(female_df_no_h9_cutoff_1_sigvar %>% column_to_rownames(var="rn"))) | |
autoplot(prcomp(female_df_no_h9_cutoff_1_sigvar_mat), | |
data=cluster_tib, | |
colour="cluster", | |
shape="have_expression", | |
x=1, | |
y=2) | |
x_probes <- filtered_site_annotation_df %>% filter(hg38_chromosome == "chrX") | |
female_df_no_h9_cutoff_1_sigvar_no_x <- female_df_no_h9 %>% filter(rn %in% cutoff_1_probes$rowname & rn %in% sigvar_probes & !(rn %in% x_probes$rowname)) | |
female_df_no_h9_cutoff_1_sigvar_no_x_mat <- t(as.matrix(female_df_no_h9_cutoff_1_sigvar_no_x %>% column_to_rownames(var="rn"))) | |
autoplot(prcomp(female_df_no_h9_cutoff_1_sigvar_no_x_mat), | |
data=cluster_tib, | |
colour="cluster", | |
shape="have_expression", | |
x=1, | |
y=2) | |
female_df_no_h9_cutoff_1_no_x <- female_df_no_h9 %>% filter(rn %in% cutoff_1_probes$rowname & !(rn %in% x_probes$rowname)) | |
female_df_no_h9_cutoff_1_no_x_mat <- t(as.matrix(female_df_no_h9_cutoff_1_no_x %>% column_to_rownames(var="rn"))) | |
cluster_tib$clutser_F = cluster_tib$cluster == 5 | |
autoplot(prcomp(female_df_no_h9_cutoff_1_no_x_mat), | |
data=cluster_tib, | |
colour="clutser_F", | |
shape="have_expression", | |
x=1, | |
y=2) | |
cluster_a_f_samples <- cluster_tib %>% filter(cluster %in% c(0,5)) | |
heatmap_mat <- t(female_df_no_h9_cutoff_1_no_x_mat[cluster_a_f_samples$sample_name,]) | |
library(heatmap3) | |
heatmap3(heatmap_mat) | |
``` | |
## Looking for ERV representation in clusters | |
```{r} | |
library(tidyverse) | |
library(stringr) | |
# parsing GTF file | |
journal_gtf_df <- read_tsv("journal_gtf_summary.tsv") | |
journal_gtf_df_ltrs <- journal_gtf_df %>% | |
filter(geneRegion == "ltr") %>% | |
group_by(locus) %>% | |
summarise(ltr_rep_name=paste(unique(repName), collapse=",")) | |
journal_gtf_df_internal <- journal_gtf_df %>% | |
filter(geneRegion == "internal") %>% | |
group_by(locus) %>% | |
summarise(internal_rep_name=unique(repName)) | |
journal_gtf_df_formatted <- full_join(journal_gtf_df_ltrs, journal_gtf_df_internal, by="locus") | |
write_csv(journal_gtf_df_formatted, "journal_gtf_df_formatted.csv") | |
library(DESeq2) | |
all_erv_counts <- read_csv("all_erv_counts_df.csv") | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
seq_type_df <- read_csv("sample_seq_type.csv") | |
sample_info_df <- read_csv("sample_data_combined_all_info.csv") | |
sample_info_df <- left_join(sample_info_df, seq_type_df, by=c("sample_name"="Sample Name")) %>% | |
left_join(cluster_tib, by="sample_name") | |
sample_info_df_filtered <- sample_info_df %>% filter(gender=="female" & !is.na(sra_accession) & !is.na(cluster)) | |
sample_annotation_df <- tibble(sample_name=sample_info_df_filtered$sra_accession, | |
cluster=as.factor(sample_info_df_filtered$cluster), | |
group=sample_info_df_filtered$group_accession, | |
seq_type=sample_info_df_filtered$end_type) | |
sample_annotation_df <- column_to_rownames(sample_annotation_df, var="sample_name") | |
all_erv_counts <- column_to_rownames(all_erv_counts, var="transcript") | |
all_erv_counts[is.na(all_erv_counts)] <- 0 | |
dds <- DESeqDataSetFromMatrix(countData = all_erv_counts[rownames(sample_annotation_df)], | |
colData = sample_annotation_df, | |
design = ~ seq_type + group + cluster) | |
keep <- rowSums(counts(dds)) >= 10 | |
dds <- dds[keep,] | |
dds <- DESeq(dds) | |
erv_vsd <- vst(dds, blind = F) | |
library(limma) | |
erv_batch_corrected_vsd_df <- as.data.frame(removeBatchEffect(assay(erv_vsd), | |
batch = erv_vsd$group, batch2 = erv_vsd$seq_type)) %>% | |
rownames_to_column("erv_id") | |
# genes <- batch_corrected_vsd_df$ensembl_id_no_version | |
# G_list <- get_genename_from_ensembl(genes) | |
# batch_corrected_vsd_df <- batch_corrected_vsd_df %>% | |
# left_join(G_list, by=c("ensembl_id_no_version"="ensembl_gene_id")) | |
write_csv(erv_batch_corrected_vsd_df, "erv_batch_corrected_vsd_df.csv") | |
# sampleTable <- data.frame(sampleName = sample_names, | |
# fileName = sample_file_to_keep, | |
# cluster = sampleCondition, | |
# seq_type = sample_seq_type, | |
# group = sample_group) | |
sample_annotation_df <- tibble(sample_name=sample_info_df_filtered$sra_accession, | |
cluster=sample_info_df_filtered$cluster, | |
group=sample_info_df_filtered$group_accession, | |
seq_type=sample_info_df_filtered$end_type) | |
diff_expr_all_df <- tibble(erv_id=character()) | |
for (cluster in 1:4) { | |
cluster_0 <- as.numeric(cluster - 1) | |
cluster_1 <- as.numeric(cluster) | |
if (cluster == 4) { | |
cluster_1 <- 5 | |
} | |
sampleTableFiltered <- sample_annotation_df %>% column_to_rownames(var="sample_name") #%>% filter(cluster >= cluster_0)# %in% c(cluster_0, cluster_1)) | |
sampleTableFiltered[sampleTableFiltered$cluster <= cluster_0, "cluster"] <- cluster_0 | |
sampleTableFiltered[sampleTableFiltered$cluster > cluster_0, "cluster"] <- cluster_1 | |
sampleTableFiltered$cluster <- factor(sampleTableFiltered$cluster) | |
dds <- DESeqDataSetFromMatrix(countData = all_erv_counts[rownames(sampleTableFiltered)], | |
colData = sampleTableFiltered, | |
design = ~ seq_type + group + cluster) | |
keep <- rowSums(counts(dds)) >= 10 | |
dds <- dds[keep,] | |
dds <- DESeq(dds) | |
res <- results(dds, tidy=T) %>% dplyr::rename(erv_id=row) | |
# res_tib <- as_tibble(res, rownames = NA) | |
# res_tib <- rownames_to_column(res_tib, "EnsGene") | |
transition_fold_change <- sprintf("transition_%s_log2FC", cluster) | |
transition_padj <- sprintf("transition_%s_padj", cluster) | |
temp_df <- dplyr::select(res, erv_id, | |
!!transition_fold_change := log2FoldChange, | |
!!transition_padj := padj) | |
diff_expr_all_df <- full_join(diff_expr_all_df, temp_df, by="erv_id") | |
} | |
temp_df <- dplyr::select(diff_expr_all_df, ends_with("padj")) | |
temp_df[is.na(temp_df)] <- 1 | |
diff_expr_all_df$key_transition <- as.integer(unlist(apply(temp_df, 1, which.min))) | |
diff_expr_all_df$passes_threshold <- FALSE | |
diff_expr_all_df$positive_delta <- FALSE | |
for (row_idx in 1:nrow(diff_expr_all_df)) { | |
sig_transition <- sprintf("transition_%d_padj", unlist(diff_expr_all_df[row_idx, "key_transition"])) | |
sig_log2FC <- sprintf("transition_%d_log2FC", unlist(diff_expr_all_df[row_idx, "key_transition"])) | |
diff_expr_all_df[row_idx, "positive_delta"] <- as.logical(unlist(diff_expr_all_df[row_idx, sig_log2FC] > 0)) | |
if ((! is.na(diff_expr_all_df[row_idx, sig_transition])) & unlist(diff_expr_all_df[row_idx, sig_transition]) <= .1) { | |
diff_expr_all_df[row_idx, "passes_threshold"] <- TRUE | |
} | |
} | |
write_csv(diff_expr_all_df, "erv_diff_expr_all_df.csv") | |
tran_4_ervs <- (diff_expr_all_df %>% filter(transition_4_padj < .1 & transition_4_log2FC < 0))$erv_id | |
plot_df <- erv_batch_corrected_vsd_df %>% | |
filter(erv_id %in% tran_4_ervs) %>% | |
reshape2::melt(id.vars="erv_id", variable.name="sample_name", value.name="batch_corrected_expression") %>% | |
left_join(sample_annotation_df, by="sample_name") | |
ggplot(plot_df, aes(x=erv_id, fill=as.factor(cluster), y=batch_corrected_expression)) + | |
geom_bar(stat="identity", position = position_dodge()) | |
erv_gtf <- read_csv("erv_gtf.csv") | |
diff_expr_all_df <- left_join(diff_expr_all_df, erv_gtf, by="erv_id") | |
write_csv(diff_expr_all_df, "erv_diff_expr_all_df.csv") | |
sample_info_df <- read_csv("sample_data_combined_all_info.csv") | |
sample_info_df <- sample_info_df %>% | |
left_join(cluster_tib, by="sample_name") | |
sample_info_df_filtered <- sample_info_df %>% filter(gender=="female" & !is.na(sra_accession) & !is.na(cluster)) | |
sample_annotation_df <- tibble(sample_name=sample_info_df_filtered$sra_accession, | |
cluster=as.factor(sample_info_df_filtered$cluster), | |
group=sample_info_df_filtered$group_accession) | |
erv_batch_corrected_vsd_df <- read_csv("erv_batch_corrected_vsd_df.csv.gz") | |
erv_diff_expr_all_df <- read_csv("erv_diff_expr_all_df.csv.gz") | |
cluster_erv_means <- erv_batch_corrected_vsd_df %>% | |
reshape2::melt(id.vars="erv_id", variable.name="sample_name", value.name="batch_corrected_expression") %>% | |
left_join(sample_annotation_df, by="sample_name") %>% | |
group_by(erv_id, cluster) %>% | |
summarise(mean_count=mean(batch_corrected_expression)) %>% | |
reshape2::dcast(erv_id ~ cluster) %>% | |
dplyr::rename(cluster_A_mean=`0`, | |
cluster_B_mean=`1`, | |
cluster_C_mean=`2`, | |
cluster_D_mean=`3`, | |
cluster_F_mean=`5`) | |
erv_diff_expr_all_df <- read_csv("erv_diff_expr_all_df.csv.gz") | |
erv_diff_expr_all_df <- full_join(erv_diff_expr_all_df, cluster_erv_means, by="erv_id") | |
# write_csv(erv_diff_expr_all_df %>% dplyr::select(erv_id,chr,start,end,ends_with("_mean"), starts_with("transition"), key_transition, passes_threshold, positive_delta), "formatted_erv_diff_expr_all_df.csv") | |
journal_gtf_df_formatted <- read_csv("journal_gtf_df_formatted.csv.gz") | |
erv_diff_expr_all_df <- left_join(erv_diff_expr_all_df, journal_gtf_df_formatted, by=c("erv_id"="locus")) | |
erv_plot_df <- erv_diff_expr_all_df %>% | |
mutate(cluster_A_diff=0, | |
cluster_B_diff=cluster_B_mean-cluster_A_mean, | |
cluster_C_diff=cluster_C_mean-cluster_A_mean, | |
cluster_D_diff=cluster_D_mean-cluster_A_mean, | |
cluster_F_diff=cluster_F_mean-cluster_A_mean) %>% | |
reshape2::melt(id.vars=c("erv_id", "ltr_rep_name", "internal_rep_name", "transition_4_padj", "chr"), | |
measure.vars=c("cluster_A_diff", "cluster_B_diff", "cluster_C_diff", "cluster_D_diff", "cluster_F_diff")) | |
ltr_counts <- erv_plot_df %>% | |
filter(transition_4_padj <= .1 & chr != "chrX") %>% | |
count(ltr_rep_name) %>% | |
mutate(n=n/5) %>% | |
rename(!!'transition_4_padj <= .1 & chr != "chrX" counts':=n) %>% | |
left_join(erv_diff_expr_all_df %>% | |
count(ltr_rep_name) %>% | |
rename(total=n), | |
by="ltr_rep_name") %>% | |
mutate(hypergeometric_pval=phyper(`transition_4_padj <= .1 & chr != "chrX" counts` - 1, | |
total, | |
sum(total)-total, | |
sum(`transition_4_padj <= .1 & chr != "chrX" counts`), | |
lower.tail = F)) | |
write_csv(ltr_counts, "ltr_counts.csv") | |
erv_plot_df <- left_join(erv_plot_df, ltr_counts, by="ltr_rep_name") | |
ltr_pal <- c("#fdfdfd", "#1d1d1d", "#ebce2b", "#702c8c", "#db6917", "#96cde6", "#ba1c30", "#c0bd7f", "#7f7e80", "#5fa641", "#d485b2", "#4277b6", "#df8461", "#463397", "#e1a11a", "#91218c", "#e8e948", "#7e1510", "#92ae31", "#6f340d", "#d32b1e", "#2b3514") | |
ltr_plot <- ggplot(erv_plot_df %>% filter(transition_4_padj <= .1 & chr != "chrX" & hypergeometric_pval > .1), aes(x=variable, y=value, group=erv_id)) + | |
geom_line(color="grey", size=.3, alpha=.5) + | |
geom_line(data=erv_plot_df %>% filter(transition_4_padj <= .1 & chr != "chrX" & hypergeometric_pval <= .1 & ltr_rep_name=="LTR7"), aes(color=ltr_rep_name), size=.3, alpha=.7) + | |
geom_line(data=erv_plot_df %>% filter(transition_4_padj <= .1 & chr != "chrX" & hypergeometric_pval <= .1 & ltr_rep_name != "LTR7"), aes(color=ltr_rep_name), size=.3, alpha=1) + | |
scale_color_manual(values=ltr_pal[c(5,6,12,4,8,10,3,7,22,13)], | |
limits=c("LTR40a,LTR40b", | |
"LTR5_Hs", | |
"LTR5_Hs,LTR5", | |
"LTR5_Hs,LTR5B", | |
"LTR7", | |
"LTR7Y", | |
"LTR1", | |
"LTR2C", | |
"LTR6A", | |
"MER50"), | |
labels=c("LTR40a,LTR40b - 1/3*", | |
"LTR5_Hs - 5/37***", | |
"LTR5_Hs,LTR5 - 1/1**", | |
"LTR5_Hs,LTR5B - 1/3*", | |
"LTR7 - 41/865****", | |
"LTR7Y - 5/80*", | |
"LTR1 - 1/2*", | |
"LTR2C - 3/38*", | |
"LTR6A - 2/16*", | |
"MER50 - 4/53*")) + | |
theme_bw() + | |
base_plot_theme + | |
scale_x_discrete(breaks=c("cluster_A_diff", | |
"cluster_B_diff", | |
"cluster_C_diff", | |
"cluster_D_diff", | |
"cluster_F_diff"), | |
labels=c("A", | |
"B", | |
"C", | |
"D", | |
"F"), | |
expand=c(0,0)) + | |
labs(title="Differentially Expressed HERVs in Transition 4/5", | |
y="Cumulative Δ Expr. (Log2 VST+Batch Corr.)", | |
x="Cluster", | |
color="LTR Class") + | |
guides(color = guide_legend(ncol = 1)) + | |
theme(legend.position="right", | |
legend.box.margin = margin(0,0,0,0, unit="mm"), | |
legend.box.spacing = unit(1, "mm")) | |
ggsave("figs/main/fig4_ltr_plot.svg", ltr_plot, height = 61, width=76, units = "mm") | |
``` | |
## Looking for concordance between methylation and expression changes | |
## Seeing if I can find escapee genes by comparing males to females, and looking for similarly expressed genes | |
```{r} | |
library(DESeq2) | |
directory <- "htseq_counts" | |
count_file_suffix <- "Aligned.out.sorted.bam.htseq_counts" | |
sampleFiles <- grep(count_file_suffix,list.files(directory),value=TRUE) | |
rna_seq_accessions_tib <- read_csv("rna_seq_accessions_tib.csv") | |
rna_seq_accessions_tib[grepl("female", rna_seq_accessions_tib$SampleName), "sex"] <- "female" | |
rna_seq_accessions_tib[!grepl("female", rna_seq_accessions_tib$SampleName), "sex"] <- "male" | |
rna_seq_accessions_tib <- rna_seq_accessions_tib %>% filter() | |
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster")) | |
cluster_tib <- column_to_rownames(cluster_tib, "sample_name") | |
sample_file_to_keep <- c() | |
sample_names <- c() | |
sampleCondition <- c() | |
for(filename in sampleFiles) { | |
sra_acc <-sub(count_file_suffix, "", filename) | |
if (sra_acc %in% rna_seq_accessions_tib$sra_accession) { | |
sample_name <- (filter(rna_seq_accessions_tib, sra_accession == sra_acc))$SampleName | |
cluster <- cluster_tib[sample_name, 1] | |
sample_file_to_keep <- c(sample_file_to_keep, filename) | |
sample_names <- c(sample_names, sample_name) | |
sampleCondition <- c(sampleCondition, (filter(rna_seq_accessions_tib, sra_accession == sra_acc))$sex) | |
# counts_tib <- read_tsv(paste(directory, "/", filename, sep=""), col_names = c('id', 'name', 'level')) | |
# all_htseq_counts_df <- full_join(all_htseq_counts_df, dplyr::select(counts_tib, id,name, !! sample_name := level), by=c("gene_id"="id", "gene_name"="name")) | |
} | |
} | |
sample_table_male_female <- data.frame(sampleName = sample_names, | |
fileName = sample_file_to_keep, | |
condition = sampleCondition) | |
sample_table_male_female$condition <- factor(sample_table_male_female$condition) | |
ddsHTSeq_male_female <- DESeqDataSetFromHTSeqCount(sampleTable = sample_table_male_female, | |
directory = directory, | |
design= ~ condition) | |
keep <- rowSums(counts(ddsHTSeq_male_female)) >= 10 | |
ddsHTSeq_male_female_filtered <- ddsHTSeq_male_female[keep,] | |
dds_male_female <- DESeq(ddsHTSeq_male_female_filtered) | |
res_male_female <- results(dds_male_female) | |
res_male_female_tib <- as_tibble(res_male_female, rownames = NA) | |
res_male_female_tib <- rownames_to_column(res_male_female_tib, "EnsGene") | |
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) | |
res_male_female_tib$EnsGeneNoVer <- gsub("\\..*","",res_male_female_tib$EnsGene) | |
genes <- res_male_female_tib$EnsGeneNoVer | |
G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id", | |
"hgnc_symbol", "chromosome_name", "start_position", "end_position", "strand", "description"),values=genes,mart= mart) | |
res_male_female_tib_named <- full_join(res_male_female_tib,G_list,by=c("EnsGeneNoVer"="ensembl_gene_id")) | |
``` | |
## Showing changes for all transitions across the genome | |
```{r} | |
``` | |
## Doing GSEA on expression changes | |
We will generate rank files to run GSEA | |
```{r} | |
library(quantsmooth) | |
scale_function <- function(x) { | |
scaled_x <- x | |
scaled_x[scaled_x > 0] <- scaled_x[scaled_x > 0] / max(scaled_x[scaled_x > 0]) | |
scaled_x[scaled_x < 0] <- -1 * scaled_x[scaled_x < 0] / min(scaled_x[scaled_x < 0]) | |
return(scaled_x) | |
} | |
for (transition in 1:5){ | |
transition_1_rank_df <- diff_expr_all_df %>% filter(gene != "") | |
transition_log2FC_col_name <- sprintf("transition_%d_log2FC", transition) | |
transition_padj_col_name <- sprintf("transition_%d_padj", transition) | |
transition_1_rank_df["rank_score"] <- (transition_1_rank_df[transition_log2FC_col_name]) / (transition_1_rank_df[transition_padj_col_name]) | |
transition_1_rank_df <- na.omit(transition_1_rank_df) | |
transition_1_rank_df[transition_1_rank_df[transition_padj_col_name] > 0.1, "rank_score"] <- 0 | |
transition_1_rank_df$scaled_rank <- format(scale_function(transition_1_rank_df$rank_score), scientific = FALSE) | |
transition_1_rank_df <- arrange(transition_1_rank_df, rank_score) | |
write_tsv(transition_1_rank_df[(transition_1_rank_df$rank_score != 0) & (abs(transition_1_rank_df[transition_log2FC_col_name]) > .8),c("gene", "rank_score")], sprintf("gsea_transition_%d.rnk", transition), col_names = F) | |
write_csv(transition_1_rank_df[transition_1_rank_df["rank_score"] > 0,c("gene", "scaled_rank")], sprintf("enrichr_transition_%d_pos.csv", transition), col_names = F, ) | |
write_csv(transition_1_rank_df[transition_1_rank_df["rank_score"] < 0,c("gene", "scaled_rank")], sprintf("enrichr_transition_%d_neg.csv", transition), col_names = F) | |
test_df <- transition_1_rank_df %>% dplyr::rename(CHR=chr, MapInfo=start_position) | |
test_df <- as.data.frame(test_df) | |
plot.new() | |
chrompos <- prepareGenomePlot( | |
test_df, | |
paintCytobands = TRUE, | |
organism="hsa", | |
sexChromosomes = T, | |
units="hg38") | |
points(chrompos[(test_df$rank_score > 0) ,2], | |
chrompos[(test_df$rank_score > 0) ,1]+0.1, | |
pch="|",col="red") | |
points(chrompos[(test_df$rank_score < 0) ,2], | |
chrompos[(test_df$rank_score < 0) ,1]-0.1, | |
pch="|",col="blue") | |
meth_df <- dmp_info_df %>% filter(alt_key_transition == transition) %>% rename(CHR=chr, MapInfo=pos) | |
meth_df$CHR <- sub("chr", "", meth_df$CHR) | |
meth_df <- as.data.frame(meth_df) | |
meth_chrompos <- prepareGenomePlot( | |
meth_df, | |
paintCytobands = TRUE, | |
organism="hsa", | |
sexChromosomes = T, | |
units="hg19" | |
) | |
points(meth_chrompos[meth_df$delta >= .2,2],meth_chrompos[meth_df$delta >= .2,1]+0.1,pch="|",col="red") | |
points(meth_chrompos[meth_df$delta <= -.2,2],meth_chrompos[meth_df$delta <= -.2,1]-0.1,pch="|",col="blue") | |
points(meth_chrompos[meth_df$alt_delta > 0.2,2],meth_chrompos[meth_df$alt_delta > 0.2,1]+0.1,pch="|",col="red") | |
points(meth_chrompos[meth_df$alt_delta < -0.2,2],meth_chrompos[meth_df$alt_delta < -0.2,1]-0.1,pch="|",col="blue") | |
} | |
``` | |
## Combining Methylation and Expression Data | |
We are interested in finding genes that both change their promoter DNA methylation, and their expression levels in these clusters. | |
```{r} | |
dmp_info_df <- read_csv('dmp_transition_info.csv') | |
gene_compiled_tss_sites_df <- tibble(gene=character(), | |
total_sites=integer(), | |
transition_1_inc_sites=integer(), | |
transition_1_dec_sites=integer(), | |
transition_2_inc_sites=integer(), | |
transition_2_dec_sites=integer(), | |
transition_3_inc_sites=integer(), | |
transition_3_dec_sites=integer(), | |
transition_4_inc_sites=integer(), | |
transition_4_dec_sites=integer(), | |
transition_5_inc_sites=integer(), | |
transition_5_dec_sites=integer(), | |
) | |
for (row_idx in 1:nrow(dmp_info_df)) { | |
gene <- unlist(dmp_info_df[row_idx,"mapped_gene"]) | |
dist_to_tss <- unlist(dmp_info_df[row_idx,"dist_to_tss"]) | |
delta_increasing <- unlist(dmp_info_df[row_idx,"delta_increasing"]) | |
key_transition <- unlist(dmp_info_df[row_idx,"key_transition"]) | |
alt_key_transition <- unlist(dmp_info_df[row_idx,"alt_key_transition"]) | |
alt_delta <- unlist(dmp_info_df[row_idx,"alt_delta"]) | |
refgene_group <- unlist(dmp_info_df[row_idx, "refgene_group"]) | |
passes_threshold <- unlist(dmp_info_df[row_idx, "passes_threshold"]) | |
if (grepl("TSS1500|TSS200|5'UTR", refgene_group) & (! is.na(gene)) & passes_threshold) { | |
# if (grepl("TSS1500|TSS200", refgene_group) & (! is.na(gene)) & (! is.na(alt_key_transition))) { | |
gene_row_idx <- 0 | |
if (!(gene %in% gene_compiled_tss_sites_df$gene)) { | |
# gene is not in gene_compiled_tss_sites_df, so add a row for it | |
gene_row_idx <- nrow(gene_compiled_tss_sites_df)+1 | |
gene_compiled_tss_sites_df[gene_row_idx, c("gene")] = gene | |
gene_compiled_tss_sites_df[is.na(gene_compiled_tss_sites_df)] <- 0 | |
}else { | |
i <- 1 | |
for (gene_name in gene_compiled_tss_sites_df$gene) { | |
if ((! is.na(gene_name)) && gene == gene_name) { | |
gene_row_idx <- i | |
break | |
} | |
i <- i+1 | |
} | |
} | |
gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] <- gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] + 1 | |
change <- "dec" | |
if (delta_increasing) { | |
# if (alt_delta > 0) { | |
change <- "inc" | |
} | |
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] <- | |
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] + 1 | |
# gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", alt_key_transition, change)] <- | |
# gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", alt_key_transition, change)] + 1 | |
} | |
} | |
write_csv(gene_compiled_tss_sites_df, "gene_compiled_tss_sites.csv") | |
gene_compiled_df <- gene_compiled_tss_sites_df | |
gene_compiled_df <- add_column(gene_compiled_df, cluster=c(0)*nrow(gene_compiled_df)) | |
# gene_clusters <- tibble(gene=character(), cluster=integer()) | |
for (row_idx in 1:nrow(gene_compiled_df)) { | |
gene <- unlist(gene_compiled_df[row_idx, "gene"]) | |
total <- unlist(gene_compiled_df[row_idx, "total_sites"]) | |
maximum_transition <- 0 | |
maximum_change <- "" | |
maximum_count <- 0 | |
for(key_transition in 1:5) { | |
for (change in c("inc", "dec")) { | |
transition <- sprintf("transition_%d_%s_sites", key_transition, change) | |
if (gene_compiled_df[row_idx, transition] > maximum_count) { | |
maximum_transition <- key_transition | |
maximum_change <- change | |
maximum_count <- as.integer(unlist(gene_compiled_df[row_idx, transition])) | |
} | |
} | |
} | |
cluster <- 10 | |
if (maximum_count / total >= .5) { | |
if (maximum_change == "dec") { | |
cluster <- maximum_transition - 1 + 5 | |
}else { | |
cluster <- maximum_transition - 1 | |
} | |
} | |
i <- 1 | |
for (gene_name in gene_compiled_df$gene) { | |
if ((! is.na(gene_name)) && gene == gene_name) { | |
gene_row_idx <- i | |
break | |
} | |
i <- i+1 | |
} | |
gene_compiled_df[i, "cluster"] <- cluster | |
} | |
transition_1_methylated_genes <- gene_compiled_df %>% filter(cluster %in% c(0)) | |
transition_1_demethylated_genes <- gene_compiled_df %>% filter(cluster %in% c(5)) | |
transition_1_decreased_expression_genes <- diff_expr_all_df %>% | |
filter((transition_1_padj <= 0.1) & | |
(transition_1_log2FC < 0)& (gene %in% gene_compiled_df$gene)) | |
transition_1_increased_expression_genes <- diff_expr_all_df %>% | |
filter((transition_1_padj <= 0.1) & | |
(transition_1_log2FC > 0) & (gene %in% gene_compiled_df$gene)) | |
transition_1_increased_expression_genes <- diff_expr_all_df %>% | |
filter((transition_3_padj <= 0.1) & | |
(transition_3_log2FC > 0)) | |
common_genes <- intersect(transition_1_increased_expression_genes$gene, transition_1_demethylated_genes$gene) | |
expr_pvals <- column_to_rownames(transition_1_increased_expression_genes, var = "gene")[common_genes, "transition_1_padj"] | |
# meth_pvals <- column_to_rownames(transition_1_demethylated_genes, var = "gene")[common_genes, "transition_1_padj"] | |
transition_1_genes <- tibble( | |
genes=common_genes, | |
rank_score=1/expr_pvals | |
) | |
early_demethylated_genes <- gene_compiled_df %>% filter(cluster %in% c(5,6)) | |
early_methylated_genes <- gene_compiled_df %>% filter(cluster %in% c(0,1)) | |
early_decreased_expression_genes <- diff_expr_all_df %>% filter(((transition_1_padj <= 0.1) & (transition_1_log2FC < 0)) | | |
((transition_2_padj <= 0.1) & (transition_2_log2FC < 0))) | |
early_increased_expression_genes <- diff_expr_all_df %>% filter((transition_1_padj <= 0.1 & transition_1_log2FC > 0) | | |
(transition_2_padj <= 0.1 & transition_2_log2FC > 0)) | |
write_csv(diff_expr_all_df, "deseq_transitions.csv") | |
``` | |