Skip to content
Permalink
2e6bcacfa8
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
1789 lines (1458 sloc) 82 KB
---
title: "Expression Analysis"
output: html_notebook
---
# Expression Analysis
## Collecting RNA-seq expression data
We aligned the fastq files using STAR to hg38, then used HTseq-count to calculate the expression counts. Now we will use DESeq2 package to calculate which genes are differentially expressed in all the different sample clusters we have.
```{r}
library("DESeq2")
library('biomaRt')
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("tidyverse")
library("org.Hs.eg.db")
diff_expr_all_df <- NA
directory <- "htseq_counts"
count_file_suffix <- "Aligned.out.sorted.bam.htseq_counts"
sampleFiles <- grep(count_file_suffix,list.files(directory),value=TRUE)
rna_seq_accessions_tib <- read_csv("rna_seq_accessions_tib.csv.gz")
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
cluster_tib <- column_to_rownames(cluster_tib, "sample_name")
seq_type_df <- read_csv("sample_seq_type.csv.gz")
sample_info_df <- read_csv("sample_data_combined_all_info.csv")
sample_file_to_keep <- c()
sample_names <- c()
sampleCondition <- c()
sample_seq_type <- c()
sample_group <- c()
xist_expression <- c()
all_htseq_counts_df <- tibble(gene_id=character(),gene_name=character())
for(filename in sampleFiles) {
sra_acc <-sub(count_file_suffix, "", filename)
if(sra_acc == "SRR4027227") {
print("I'm getting the import file")
}
if (sra_acc %in% rna_seq_accessions_tib$sra_accession) {
sample_name <- (filter(rna_seq_accessions_tib, sra_accession == sra_acc))$SampleName
cluster <- cluster_tib[sample_name, 1]
print(cluster)
if(is.na(cluster) == F) {
sample_file_to_keep <- c(sample_file_to_keep, filename)
sample_names <- c(sample_names, sample_name)
sampleCondition <- c(sampleCondition, cluster)
sample_seq_type <- c(sample_seq_type,
as.character(seq_type_df[seq_type_df$`Sample Name` == sample_name, "end_type"][1]))
sample_group <- c(sample_group,
as.character(sample_info_df[sample_info_df$sample_name == sample_name, "group_accession"][1]))
counts_tib <- read_tsv(paste(directory, "/", filename, sep=""), col_names = c('id', 'name', 'level'))
all_htseq_counts_df <- full_join(all_htseq_counts_df, dplyr::select(counts_tib, id,name, !! sample_name := level), by=c("gene_id"="id", "gene_name"="name"))
}
}
}
sampleTable <- data.frame(sampleName = sample_names,
fileName = sample_file_to_keep,
cluster = factor(sampleCondition),
seq_type = sample_seq_type,
group = sample_group)
ddsHTSeq_all <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = directory,
design= ~ seq_type + group + cluster)
keep <- rowSums(counts(ddsHTSeq_all)) >= 10
ddsHTSeq_all <- ddsHTSeq_all[keep,]
dds_all <- DESeq(ddsHTSeq_all)
ntd <- normTransform(dds_all)
vsd <- vst(dds_all, blind=FALSE)
select <- order(rowMeans(counts(dds_all,normalized=TRUE)),
decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds_all)[,c("group","seq_type", "cluster")])
library("pheatmap")
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df)
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=df)
sampleDists <- dist(t(assay(vsd)))
library("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$cluster, vsd$seq_type, vsd$group, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
plotPCA(vsd, intgroup=c("group"))
library(limma)
library(ggfortify)
plotPCA(removeBatchEffect(assay(vsd), batch = vsd$group, batch2 = vsd$seq_type), intgroup=c("group"))
batch_corrected_vsd <- t(removeBatchEffect(assay(vsd), batch = vsd$group, batch2 = vsd$seq_type))
data_vsd <- as.data.frame(batch_corrected_vsd)
data_vsd$cluster <- factor(vsd$cluster)
data_vsd$group <- vsd$group
data_vsd$seq_type <- vsd$seq_type
autoplot(prcomp(batch_corrected_vsd), data = data_vsd, colour="group")
batch_corrected_vsd_df <- as.data.frame(removeBatchEffect(assay(vsd), batch = vsd$group, batch2 = vsd$seq_type)) %>%
rownames_to_column("ensembl_id") %>%
mutate(ensembl_id_no_version=gsub("\\..*", "", ensembl_id))
get_genename_from_ensembl <- function(ensembl_ids) {
# mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
mart <- useEnsembl(biomart = "ensembl",
dataset = "hsapiens_gene_ensembl",
mirror = "useast")
G_list <- getBM(filters= "ensembl_gene_id",
attributes= c("ensembl_gene_id",
"hgnc_symbol",
"chromosome_name",
"start_position",
"end_position",
"strand",
"description",
"wikigene_name"),
values=ensembl_ids,
mart= mart)
G_list[G_list$hgnc_symbol == "", "hgnc_symbol"] <- G_list[G_list$hgnc_symbol == "", "wikigene_name"]
return(G_list %>% dplyr::select(-wikigene_name))
}
genes <- batch_corrected_vsd_df$ensembl_id_no_version
G_list <- get_genename_from_ensembl(genes)
batch_corrected_vsd_df <- batch_corrected_vsd_df %>%
left_join(G_list, by=c("ensembl_id_no_version"="ensembl_gene_id"))
write_csv(batch_corrected_vsd_df, "batch_corrected_vsd_df.csv")
# cluster F vs A,B,C,D individually
res <- results(dds_all, contrast = c("cluster", "5", "0"), tidy=T)
new_res_tib <- as.data.frame(res) %>%
as_tibble() %>%
dplyr::rename(
ensembl_id=row,
tran_1_log2FC=log2FoldChange,
tran_1_lfcSE=lfcSE,
tran_1_stat=stat,
tran_1_pvalue=pvalue,
tran_1_padj=padj)
for (clust in c("1", "2", "3")) {
res2 <- results(dds_all, contrast = c("cluster", "5", clust), tidy=T)
new_res_tib <- new_res_tib %>%
left_join(
as.data.frame(res2) %>%
as_tibble() %>%
dplyr::rename(
ensembl_id=row,
!!sprintf("%s_vs_F_log2FC", clust):=log2FoldChange,
!!sprintf("%s_vs_F_lfcSE", clust):=lfcSE,
!!sprintf("%s_vs_F_stat", clust):=stat,
!!sprintf("%s_vs_F_pvalue", clust):=pvalue,
!!sprintf("%s_vs_F_padj", clust):=padj) %>%
dplyr::select(-baseMean),
by="ensembl_id")
}
new_res_tib$EnsGeneNoVer <- gsub("\\..*", "", new_res_tib$ensembl_id)
genes <- new_res_tib$EnsGeneNoVer
G_list <- get_genename_from_ensembl(genes)
new_res_tib <- full_join(new_res_tib, G_list, by=c("EnsGeneNoVer"="ensembl_gene_id"))
write_csv(new_res_tib %>% dplyr::select(EnsGeneNoVer, gene=hgnc_symbol, chromosome_name, start_position, end_position, strand, description, ends_with("log2FC"), ends_with("padj")), "diff_expr_f_vs_each.csv")
# Cluster A vs cluster F:
res <- results(dds_all, contrast = c("cluster", "5", "0"), tidy=T)
new_res_tib <- res %>%
as_tibble() %>%
dplyr::rename(
ensembl_id=row)
write_csv(new_res_tib, "cluster_F_vs_A_expr.csv")
res <- results(dds_all, contrast = c("cluster", "3", "0"), tidy=T)
new_res_tib <- res %>%
as_tibble() %>%
dplyr::rename(
ensembl_id=row)
write_csv(new_res_tib, "cluster_D_vs_A_expr.csv")
res <- results(dds_all, contrast = c("cluster", "1", "0"), tidy=T)
new_res_tib <- as.data.frame(res) %>%
as_tibble() %>%
dplyr::rename(
ensembl_id=row,
tran_1_log2FC=log2FoldChange,
tran_1_lfcSE=lfcSE,
tran_1_stat=stat,
tran_1_pvalue=pvalue,
tran_1_padj=padj)
res2 <- results(dds_all, contrast = c("cluster", "2", "1"), tidy=T)
new_res_tib <- new_res_tib %>%
left_join(
as.data.frame(res2) %>%
as_tibble() %>%
dplyr::rename(
ensembl_id=row,
tran_2_log2FC=log2FoldChange,
tran_2_lfcSE=lfcSE,
tran_2_stat=stat,
tran_2_pvalue=pvalue,
tran_2_padj=padj) %>%
dplyr::select(-baseMean),
by="ensembl_id")
res2 <- results(dds_all, contrast = c("cluster", "3", "2"), tidy=T)
new_res_tib <- new_res_tib %>%
left_join(
as.data.frame(res2) %>%
as_tibble() %>%
dplyr::rename(ensembl_id=row,
tran_3_log2FC=log2FoldChange,
tran_3_lfcSE=lfcSE,
tran_3_stat=stat,
tran_3_pvalue=pvalue,
tran_3_padj=padj) %>%
dplyr::select(-baseMean),
by="ensembl_id")
res2 <- results(dds_all, contrast = c("cluster", "5", "3"), tidy=T)
new_res_tib <- new_res_tib %>%
left_join(
as.data.frame(res2) %>%
as_tibble() %>%
dplyr::rename(ensembl_id=row,
tran_45_log2FC=log2FoldChange,
tran_45_lfcSE=lfcSE,
tran_45_stat=stat,
tran_45_pvalue=pvalue,
tran_45_padj=padj) %>%
dplyr::select(-baseMean),
by="ensembl_id")
new_res_tib$EnsGeneNoVer <- gsub("\\..*", "", new_res_tib$ensembl_id)
genes <- new_res_tib$EnsGeneNoVer
G_list <- get_genename_from_ensembl(genes)
new_res_tib <- full_join(new_res_tib, G_list, by=c("EnsGeneNoVer"="ensembl_gene_id"))
write_csv(new_res_tib, "pairwise_differential_expression.csv")
gene_changes <- tibble(transition=c("1","2","3","4/5"),
all_upchanges=c(new_res_tib %>% filter(tran_1_pvalue <= .05 & tran_1_log2FC > 0) %>% nrow(),
new_res_tib %>% filter(tran_2_pvalue <= .05 & tran_2_log2FC > 0) %>% nrow(),
new_res_tib %>% filter(tran_3_pvalue <= .05 & tran_3_log2FC > 0) %>% nrow(),
new_res_tib %>% filter(tran_45_pvalue <= .05 & tran_45_log2FC > 0) %>% nrow()),
all_downchanges=c(new_res_tib %>% filter(tran_1_pvalue <= .05 & tran_1_log2FC < 0) %>% nrow(),
new_res_tib %>% filter(tran_2_pvalue <= .05 & tran_2_log2FC < 0) %>% nrow(),
new_res_tib %>% filter(tran_3_pvalue <= .05 & tran_3_log2FC < 0) %>% nrow(),
new_res_tib %>% filter(tran_45_pvalue <= .05 & tran_45_log2FC < 0) %>% nrow()),
x_upchanges=c(new_res_tib %>% filter(tran_1_pvalue <= .05 & tran_1_log2FC > 0 & chromosome_name == "X") %>% nrow(),
new_res_tib %>% filter(tran_2_pvalue <= .05 & tran_2_log2FC > 0 & chromosome_name == "X") %>% nrow(),
new_res_tib %>% filter(tran_3_pvalue <= .05 & tran_3_log2FC > 0 & chromosome_name == "X") %>% nrow(),
new_res_tib %>% filter(tran_45_pvalue <= .05 & tran_45_log2FC > 0 & chromosome_name == "X") %>% nrow()),
x_downchanges=c(new_res_tib %>% filter(tran_1_pvalue <= .05 & tran_1_log2FC < 0 & chromosome_name == "X") %>% nrow(),
new_res_tib %>% filter(tran_2_pvalue <= .05 & tran_2_log2FC < 0 & chromosome_name == "X") %>% nrow(),
new_res_tib %>% filter(tran_3_pvalue <= .05 & tran_3_log2FC < 0 & chromosome_name == "X") %>% nrow(),
new_res_tib %>% filter(tran_45_pvalue <= .05 & tran_45_log2FC < 0 & chromosome_name == "X") %>% nrow()))
# sampleTable <- data.frame(sampleName = sample_names,
# fileName = sample_file_to_keep,
# condition = sampleCondition)
sampleTable <- data.frame(sampleName = sample_names,
fileName = sample_file_to_keep,
cluster = sampleCondition,
seq_type = sample_seq_type,
group = sample_group)
diff_expr_all_df <- NA
for (cluster in 1:4) {
cluster_0 <- as.numeric(cluster - 1)
cluster_1 <- as.numeric(cluster)
if (cluster == 4) {
cluster_1 <- 5
}
sampleTableFiltered <- sampleTable #%>% filter(cluster >= cluster_0)# %in% c(cluster_0, cluster_1))
sampleTableFiltered[sampleTableFiltered$cluster <= cluster_0, "cluster"] <- cluster_0
sampleTableFiltered[sampleTableFiltered$cluster > cluster_0, "cluster"] <- cluster_1
sampleTableFiltered$cluster <- factor(sampleTableFiltered$cluster)
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTableFiltered,
directory = directory,
design= ~ group + seq_type + cluster)
keep <- rowSums(counts(ddsHTSeq)) >= 10
ddsHTSeq_filtered <- ddsHTSeq[keep,]
dds <- DESeq(ddsHTSeq_filtered)
res <- results(dds)
res_tib <- as_tibble(res, rownames = NA)
res_tib <- rownames_to_column(res_tib, "EnsGene")
transition_fold_change <- sprintf("transition_%s_log2FC", cluster)
transition_padj <- sprintf("transition_%s_padj", cluster)
if (is.na(diff_expr_all_df)) {
res_tib$EnsGeneNoVer <- gsub("\\..*","",res_tib$EnsGene)
genes <- res_tib$EnsGeneNoVer
G_list <- get_genename_from_ensembl(genes)
res_tib_named <- full_join(res_tib,G_list,by=c("EnsGeneNoVer"="ensembl_gene_id"))
diff_expr_all_df <- dplyr::select(res_tib_named,
ens_id=EnsGene,
gene=hgnc_symbol,
chr=chromosome_name,
start_position,
end_position, strand,
!!transition_fold_change := log2FoldChange,
!!transition_padj := padj)
}else {
temp_df <- dplyr::select(res_tib, ens_id=EnsGene,
!!transition_fold_change := log2FoldChange,
!!transition_padj := padj)
diff_expr_all_df <- full_join(diff_expr_all_df, temp_df, by="ens_id")
}
}
temp_df <- dplyr::select(diff_expr_all_df, ends_with("padj"))
temp_df[is.na(temp_df)] <- 1
diff_expr_all_df$key_transition <- as.integer(unlist(apply(temp_df, 1, which.min)))
diff_expr_all_df$passes_threshold <- FALSE
diff_expr_all_df$positive_delta <- FALSE
for (row_idx in 1:nrow(diff_expr_all_df)) {
sig_transition <- sprintf("transition_%d_padj", unlist(diff_expr_all_df[row_idx, "key_transition"]))
sig_log2FC <- sprintf("transition_%d_log2FC", unlist(diff_expr_all_df[row_idx, "key_transition"]))
diff_expr_all_df[row_idx, "positive_delta"] <- as.logical(unlist(diff_expr_all_df[row_idx, sig_log2FC] > 0))
if ((! is.na(diff_expr_all_df[row_idx, sig_transition])) & unlist(diff_expr_all_df[row_idx, sig_transition]) <= .1) {
diff_expr_all_df[row_idx, "passes_threshold"] <- TRUE
}
}
# diff_expr_all_df_old <- read_csv("diff_expr_all_df_one_v_many.csv")
# diff_expr_all_df <- left_join(diff_expr_all_df, diff_expr_all_df_old %>% dplyr::select(ens_id,
# gene,
# chr,
# start_position,
# end_position,
# strand), by="ens_id")
diff_expr_all_df <- diff_expr_all_df %>% filter((! is.na(chr)) & (chr != "MT" & chr != "Y"))
write_csv(diff_expr_all_df, "diff_expr_all_df.csv")
gene_counts_df <- diff_expr_all_df %>% filter(passes_threshold == T) %>% count(key_transition, positive_delta)
# positions <- diff_expr_all_df %>% dplyr::select(chrom=chr, start=start_position, end=end_position, strand)
# positions$chrom <- paste("chr", positions$chrom, sep="")
# positions[positions$strand == 1, "strand"] <- "+"
# positions[positions$strand == -1, "strand"] <- "-"
# positions <- positions %>% makeGRangesFromDataFrame
# mapped_tbl <- nearest(positions, genes(TxDb.Hsapiens.UCSC.hg38.knownGene))
# gene_id_df <- as.data.frame(org.Hs.egSYMBOL) %>% column_to_rownames('gene_id')
# diff_expr_all_df$closest_gene <- gene_id_df[genes(TxDb.Hsapiens.UCSC.hg38.knownGene)[mapped_tbl,]$gene_id,]
```
## Looking at XIST Expression
A function to look at expression of specific genes based on the clusters. We primarily use it to check XIST expression
```{r}
library("ggplot2")
sample_table_all <- sampleTable
sample_table_all$condition <- factor(sample_table_all$condition)
ddsHTSeq_all <- DESeqDataSetFromHTSeqCount(sampleTable = sample_table_all,
directory = directory,
design= ~ condition)
keep <- rowSums(counts(ddsHTSeq_all)) >= 10
ddsHTSeq_all_filtered <- ddsHTSeq_all[keep,]
dds_all <- DESeq(ddsHTSeq_all_filtered)
library(stringr)
counts_all <- counts(dds_all, normalized=TRUE)
counts_all_df <- as.data.frame(counts_all) %>% rownames_to_column(var="gene_id")
counts_all_df <- counts_all_df %>% mutate(gene_id_nover = str_replace(gene_id, "\\.[0-9]*", ""))
sample_table_all <- column_to_rownames(sample_table_all, var="sampleName")
hg38_mart <- useMart('ensembl', dataset='hsapiens_gene_ensembl')
hg38_biomart_gene_df <- getBM(mart=hg38_mart,
attributes=c( 'ensembl_gene_id', 'hgnc_symbol'))
counts_all_df <- left_join(counts_all_df, hg38_biomart_gene_df %>%
dplyr::select(ensembl_gene_id, gene_name=hgnc_symbol), by=c("gene_id_nover"="ensembl_gene_id")) %>%
dplyr::select(-gene_id_nover, -gene_id)
write_csv(counts_all_df, "counts_all_df.csv")
counts_all_df <- read_csv("counts_all_df.csv")
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
count_cluster_means <- counts_all_df %>% mutate(
cluster_A_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 0 & sample_name %in% colnames(counts_all_df))$sample_name))),
cluster_B_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 1 & sample_name %in% colnames(counts_all_df))$sample_name))),
cluster_C_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 2 & sample_name %in% colnames(counts_all_df))$sample_name))),
cluster_D_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 3 & sample_name %in% colnames(counts_all_df))$sample_name))),
cluster_F_mean = rowMeans(select(., !!(filter(cluster_tib, cluster == 5 & sample_name %in% colnames(counts_all_df))$sample_name)))
) %>%
select(gene_name, cluster_A_mean, cluster_B_mean, cluster_C_mean, cluster_D_mean, cluster_F_mean)
write_csv(count_cluster_means, "counts_all_by_cluster.csv")
cluster_tib <- column_to_rownames(cluster_tib, "sample_name")
ggplot(count_cluster_means %>% reshape2::melt(id.vars="gene_name")) + geom_violin(aes(x=gene_name, y=value, fill=variable))
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv")
batch_corrected_cluster_means <- batch_corrected_vsd_df %>% mutate(
cluster_A_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 0 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))),
cluster_B_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 1 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))),
cluster_C_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 2 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))),
cluster_D_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 3 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name))),
cluster_F_mean = rowMeans(dplyr::select(., !!(filter(cluster_tib, cluster == 5 & sample_name %in% colnames(batch_corrected_vsd_df))$sample_name)))
) %>%
dplyr::select(ensembl_id, hgnc_symbol=hgnc_symbol.y, cluster_A_mean, cluster_B_mean, cluster_C_mean, cluster_D_mean, cluster_F_mean)
write_csv(batch_corrected_cluster_means, "batch_corrected_cluster_means.csv")
batch_corrected_cluster_means <- read_csv("batch_corrected_cluster_means.csv")
table_3_expression <- left_join(table_3_expression, batch_corrected_cluster_means, by=c("ens_id"="ensembl_id"))
write_csv(table_3_expression %>% dplyr::select(`Ensembl ID`=ens_id,
`Gene Symbol` = gene,
`Chromosome` = chr,
`Start Position`=start_position,
`End Position`=end_position,
`Strand`=strand,
`Cluster A Mean`=cluster_A_mean,
`Cluster B Mean`=cluster_B_mean,
`Cluster C Mean`=cluster_C_mean,
`Cluster D Mean`=cluster_D_mean,
`Cluster F Mean`=cluster_F_mean,
`Transition 1 log2FoldChange`=transition_1_log2FC,
`Transition 1 p-adj`=transition_1_padj,
`Transition 2 log2FoldChange`=transition_2_log2FC,
`Transition 2 p-adj`=transition_2_padj,
`Transition 3 log2FoldChange`=transition_3_log2FC,
`Transition 3 p-adj`=transition_3_padj,
`Transition 4/5 log2FoldChange`=transition_4_log2FC,
`Transition 4/5 p-adj`=transition_4_padj,
`Most Significant Transition`=key_transition,
`Passes Significance Threshold`=passes_threshold,
`Positive log2FC`=positive_delta,
kielens_log2FC,
kielens_padj,
`kielens_T2iLGö_mean`=`T2iLGö`,
`kielens_KSR+FGF2_mean`=`KSR+FGF2`),
"table_3_expression_all_v_all.csv")
table_2_expr <- full_join(diff_expr_all_df, batch_corrected_cluster_means, by=c("ens_id"="ensembl_id"))
write_csv(table_2_expr %>% select(`Ensembl ID`=ens_id,
`Gene Symbol` = hgnc_symbol,
`Chromosome` = chr,
`Start Position`=start_position,
`End Position`=end_position,
`Strand`=strand,
`Cluster A Mean`=cluster_A_mean,
`Cluster B Mean`=cluster_B_mean,
`Cluster C Mean`=cluster_C_mean,
`Cluster D Mean`=cluster_D_mean,
`Cluster F Mean`=cluster_F_mean,
`Transition 1 log2FoldChange`=transition_1_log2FC,
`Transition 1 p-adj`=transition_1_padj,
`Transition 2 log2FoldChange`=transition_2_log2FC,
`Transition 2 p-adj`=transition_2_padj,
`Transition 3 log2FoldChange`=transition_3_log2FC,
`Transition 3 p-adj`=transition_3_padj,
`Transition 4/5 log2FoldChange`=transition_4_log2FC,
`Transition 4/5 p-adj`=transition_4_padj,
`Most Significant Transition`=key_transition,
`Passes Significance Threshold`=passes_threshold,
`Positive log2FC`=positive_delta),
"table_3_expression_all_v_all.csv")
ggplot(batch_corrected_cluster_means %>% reshape2::melt(id.vars="hgnc_symbol")) + geom_violin(aes(x=variable, y=value, color=variable))
ggplot(batch_corrected_cluster_means %>% reshape2::melt(id.vars="hgnc_symbol")) + geom_density(aes(value, color=variable))
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
plot_gene_expression <- function(gene_name, hgnc_symbol=T, multiple=F, batch_corrected_vsd_df=NA, show_significance=F) {
if(is.na(batch_corrected_vsd_df)) {
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv.gz")
}
# gene_id <- as.character(unlist(hg38_biomart_gene_df %>% dplyr::filter(hgnc_symbol == gene_name) %>% dplyr::select(ensembl_gene_id)))
# print(gene_id)
# gene_expression <- plotCounts(dds_all, gene=gene_id, intgroup="condition",
# returnData=TRUE)
# gene_expression <- reshape2::melt(filter(counts_all_df, gene_name == gene_name) %>% dplyr::select(-gene_name))
# gene_expression <- tibble(expression = as.numeric(filter(counts_all_df, gene_name == !!gene_name) %>% dplyr::select(-gene_name)), cluster= as.character(cluster_tib[colnames(counts_all_df %>% dplyr::select(-gene_name)), "cluster"]))
sample_names <- colnames(batch_corrected_vsd_df %>%
dplyr::select(-c(hgnc_symbol,
ensembl_id,
ensembl_id_no_version,
end_position,
start_position,
description,
strand,
chromosome_name)))
gene_expression <- batch_corrected_vsd_df %>%
filter(hgnc_symbol %in% !!gene_name) %>%
reshape2::melt(id.vars=c("hgnc_symbol"), measure.vars=sample_names, variable.name="sample_name", value.name="expression") %>%
left_join(cluster_tib, by="sample_name") %>%
mutate(cluster=as.character(cluster))
if (hgnc_symbol == F) {
gene_expression <- batch_corrected_vsd_df %>%
filter(ensembl_id_no_version %in% !!gene_name) %>%
reshape2::melt(id.vars=c("hgnc_symbol"), measure.vars=sample_names, variable.name="sample_name", value.name="expression") %>%
left_join(cluster_tib, by="sample_name")
}
expr_plot <- ggplot()
if (multiple == T) {
expr_plot <- ggplot(gene_expression, aes(x=cluster, y=expression)) +
geom_boxplot(aes(alpha=cluster, fill="female", group=factor(cluster)),outlier.shape = NA, size=.3) +
geom_point(position=position_jitter(w=0.1,h=0), size=.5, shape=20) +
facet_grid(rows = vars(factor(hgnc_symbol, levels=gene_name)), scale="free") +
# scale_y_log10() +
labs(x="Cluster",
y='Log2 Counts (VST+Batch Corrected)',
title="Expression") +
theme(plot.title = element_text(hjust = 0.5)) +
theme_bw() +
scale_fill_npg(guide=F) +
scale_alpha_discrete(guide=F) +
scale_x_discrete(limits=c("0","1","2","3","5"), labels=c("A", "B", "C", "D", "F")) +
theme(strip.text = element_text(size=5, margin = margin(1.5,1,1.5,1, "mm")),
strip.background = element_rect(size = .3))
}else {
expr_plot <- ggplot(gene_expression, aes(x=cluster, y=expression)) +
geom_boxplot(aes(alpha=cluster, fill="female"),outlier.shape = NA, size=.3) +
geom_point(position=position_jitter(w=0.1,h=0), size=.5, shape=20) +
# scale_y_log10() +
labs(x="Cluster",
y='Log2 Counts (VST+Batch Corrected)',
title=paste(gene_name, "Expression", sep = " ")) +
theme(plot.title = element_text(hjust = 0.5)) +
theme_bw() +
scale_fill_npg(guide=F) +
scale_alpha_discrete(guide=F) +
scale_x_discrete(limits=c("0","1","2","3","5"), labels=c("A", "B", "C", "D", "F"))
}
return(expr_plot)
}
xist_expr_plt <- plot_gene_expression("XIST")
ggsave("figs/main/fig_2e.svg", xist_expr_plt + base_plot_theme, width = 85, height = 35, units = "mm")
xist_expr_plt
plot_gene_expression("FIRRE")
rex_genes <- c("ZFP42",
"RLIM",
"YY1",
"POU5F1",
"NANOG",
"SOX2",
"KLF4",
"MYC")
rex1_expr <- plot_gene_expression("ZFP42")
for (g in rex_genes) {
plt <- plot_gene_expression(g)
ggsave(sprintf("figs/rex1_genes/%s.svg", g), plt + base_plot_theme, width = 85, height = 60.646, units = "mm")
}
#rex1 expression without outliers
cluster_tib <- column_to_rownames(cluster_tib, "sample_name")
gene_expression <- tibble(expression = as.numeric(filter(batch_corrected_vsd_df, hgnc_symbol == "ZFP42") %>%
dplyr::select(-c(hgnc_symbol,
ensembl_id,
ensembl_id_no_version,
end_position,
start_position,
description,
strand,
chromosome_name))),
cluster= as.character(cluster_tib[colnames(batch_corrected_vsd_df %>%
dplyr::select(-c(hgnc_symbol,
ensembl_id,
ensembl_id_no_version,
end_position,
start_position,
description,
strand,
chromosome_name))), "cluster"]))
rex1_expr <- ggplot(gene_expression %>% filter(expression > 7), aes(x=cluster, y=expression)) +
geom_boxplot(aes(alpha=cluster, fill="female"),outlier.shape = NA, size=.3) +
geom_point(position=position_jitter(w=0.1,h=0), size=.5, shape=20) +
labs(x="Cluster",
y='Log2 Counts\n(VST+Batch Corrected)',
title=paste("REX1/ZFP42", "Expression", sep = " ")) +
theme(plot.title = element_text(hjust = 0.5)) +
theme_bw() +
scale_fill_npg(guide=F) +
scale_alpha_discrete(guide=F) +
scale_x_discrete(limits=c("0","1","2","3","5"), labels=c("A", "B", "C", "D", "F")) +
base_plot_theme
ggsave("figs/supplementary/rex1_nooutliers.pdf", rex1_expr, width = 40, height = 35, units = "mm")
klf4_expr <- plot_gene_expression("KLF4") + labs( y='Log2 Counts\n(VST+Batch Corrected)') + base_plot_theme
ggsave("figs/supplementary/klf4_expr.pdf", klf4_expr, width = 40, height = 35, units = "mm")
# expression for only banovich samples:
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv.gz")
banovich_batch_corrected_vsd_df <- batch_corrected_vsd_df %>%
dplyr::select(ensembl_id, ensembl_id_no_version,hgnc_symbol,description,strand,end_position,start_position,chromosome_name, starts_with("Banovich_"))
banovich_xist_expr_plt <- plot_gene_expression("XIST", batch_corrected_vsd_df=banovich_batch_corrected_vsd_df) + labs(title="ATAC-only samples XIST Expr")
banovich_firre_expr_plt <- plot_gene_expression("FIRRE", batch_corrected_vsd_df=banovich_batch_corrected_vsd_df) + labs(title="ATAC-only samples FIRRE Expr")
ggsave("figs/review images/fig_2d_atac_only.png", banovich_xist_expr_plt + base_plot_theme, width = 43, height = 64,units = "mm")
ggsave("figs/review images/fig_2e_atac_only.png", banovich_firre_expr_plt + base_plot_theme, width = 43, height = 64, units = "mm")
```
## Looking for pluripotency genes
```{r}
pluripotency_genes <- c(
"DPPA3",
"PRDM14",
"ZFP42",
"KLF4",
"UHRF1",
"FOXD3",
"POU5F1",
"SALL4",
"MYC",
"GDF3",
"FGF2",
"DNMT3A",
"DNMT3B",
"GAL",
"LIN28A",
"DPPA4",
"TDGF1",
"TNFRSF8",
"TET1",
"TET2",
"NANOG",
"ALPL",
"E2F1",
"PODXL",
"TERT",
"UTF1"
)
plot_df <- counts_all_df %>%
filter(gene_name %in% pluripotency_genes) %>%
reshape2::melt(id.vars=c("gene_name"))
plot_df <- left_join(plot_df, rownames_to_column(cluster_tib, "sample_name"), by=c("variable"="sample_name"))
ggplot(plot_df, aes(y=value, x=cluster)) +
geom_boxplot(aes(fill=as.factor(cluster)), alpha=.2) +
geom_smooth() +
scale_fill_aaas() +
facet_wrap(facets = vars(gene_name), scales="free_y")
# kielens
# continuous_expr_change <- read_csv("continuous_expr_change.csv")
# continous_expr_no_F <- read_csv("diff_expr_continuous_no_cluster_F.csv")
# kielens_genes <- read_tsv("random_checks/kielens_genes.csv")
kielens_genes <- read_tsv("random_checks/all_kielens_genes.csv") %>%
dplyr::select(gene=`Gene Symbol`,
kielens_log2FC=`Log2(Fold-Change)`,
kielens_padj=`Adjusted p-value`)
# diff_expr_all_df <- read_csv("diff_expr_all_df.csv")
# table_3_expression <- diff_expr_all_df
table_3_expression <- read_csv("table_3_expression_all_v_all.csv.gz")
# kielens_genes_2 <- read_tsv("random_checks/kielens_2.csv")
# kielens_genes <- kielens_genes_2 %>% rename(gene=gene_symbol, log2FC=kielen_log2FC)
kielens_common_genes <- intersect(table_3_expression$`Gene Symbol`, kielens_genes$gene)
# kielen_expr_change_df <- continuous_expr_change %>%
# filter(hgnc_symbol %in% kielens_genes$gene)
#
# kielens_genes <- kielens_genes %>% rename(kielen_log2FC=log2FC)
kielen_expr_change_df <- inner_join(table_3_expression, kielens_genes, by=c("Gene Symbol"="gene"))
table_3_expression <- left_join(table_3_expression, kielens_genes, by=c("Gene Symbol"="gene"))
# add kielens counts
kielens_cell_lines <- read_csv("random_checks/kielens_cell_lines.csv")
kielens_expression_data <- read_csv("random_checks/kielens_counts.csv")
library(broom)
expression_cell_lines <- kielens_expression_data %>%
dplyr::select(-`Gene Symbol`, -overlap, -`Log2(Fold-Change)`, -`Adjusted p-value`, -`False discovery Rate`) %>%
colnames() %>%
tidy() %>%
mutate(cell_line=str_replace(x, "_P.*", ""),
passage=str_replace(x, ".*_P", "")) %>%
left_join(kielens_cell_lines, by=c("cell_line"))
kielens_expression_counts_df <- kielens_expression_data %>%
dplyr::select(-overlap, -`Adjusted p-value`, -`False discovery Rate`) %>%
reshape2::melt(id.vars=c("Gene Symbol", "Log2(Fold-Change)"), value.name="expression_count", variable.name="cell_line_name") %>%
left_join(expression_cell_lines, by=c("cell_line_name"="x"))
kielens_expression_counts_df <- kielens_expression_counts_df %>%
filter(growth_media %in% c("T2iLGö", "KSR+FGF2")) %>%
group_by(growth_media, `Gene Symbol`) %>%
summarise(mean_expression=mean(expression_count))
kielens_expression_counts_df_casted <- reshape2::dcast(kielens_expression_counts_df, `Gene Symbol` ~ growth_media)
table_3_expression <- left_join(table_3_expression, kielens_expression_counts_df_casted, by=c("Gene Symbol"))
cluster_F_vs_A_expr <- read_csv("cluster_F_vs_A_expr.csv") %>%
dplyr::select(ensembl_id,
`log2(F/A)`=log2FoldChange,
`padj (F/A)`=padj)
kielen_expr_change_df <- left_join(kielen_expr_change_df, cluster_F_vs_A_expr, by=c("Ensembl ID"="ensembl_id"))
cluster_D_vs_A_expr <- read_csv("cluster_D_vs_A_expr.csv") %>%
dplyr::select(ensembl_id,
`log2(D/A)`=log2FoldChange,
`padj (D/A)`=padj)
kielen_expr_change_df <- left_join(kielen_expr_change_df, cluster_D_vs_A_expr, by=c("Ensembl ID"="ensembl_id"))
temp_df <- table_3_expression %>% filter(`Transition 4/5 p-adj` <= .1 & Chromosome != "X")
#%>% mutate(`F_minus_A`=`Cluster F Mean`-`Cluster A Mean`)
corr_log2fc <- cor.test(temp_df$kielens_log2FC,
temp_df$`Transition 4/5 log2FoldChange`,
method = "spearman")
kielens_log2_plot <- ggplot(table_3_expression %>% filter(`Transition 4/5 p-adj` <= .1 & Chromosome != "X"),
aes(x=kielens_log2FC, y=`Transition 4/5 log2FoldChange`, color=`Transition 4/5 p-adj`)) +
geom_point(size=.7) +
scale_color_viridis_c() +
geom_text(label=sprintf("Spearman Corr: %.2f",corr_log2fc$estimate), x=-1, y=3.5, size=1.4, fontface = "plain", color="black") +
scale_y_continuous(limits = c(-2,5)) +
labs(title="Kilens, et al. 2018 vs Tran. 4/5",
color="Tran. 4/5 p-adj.",
x="Kilens, et al. 2018 log2(F.C.)",
y="Transition 4/5 log2(F.C.)") +
theme_bw() +
base_plot_theme +
theme(legend.key.width = unit(4, units="mm"),
legend.title=element_text(vjust=.95),
legend.box.margin=margin(l=-6,t=-3, unit="mm"))
ggsave("figs/main/fig4_kilens_log2_tran4_correlation.pdf",kielens_log2_plot,width = 57.855, height=58, units = "mm")
#%>% mutate(`F_minus_A`=`Cluster F Mean`-`Cluster A Mean`)
corr_counts <- cor.test(temp_df$kielens_T2iLGö_mean - temp_df$`kielens_KSR+FGF2_mean`,
temp_df$`Cluster F Mean`-temp_df$`Cluster A Mean`,
method = "spearman")
kielens_counts_plot <- ggplot(table_3_expression %>% filter(`Transition 4/5 p-adj` <= .1 & Chromosome != "X"),
aes(x=kielens_T2iLGö_mean - `kielens_KSR+FGF2_mean` , y=`Cluster F Mean`-`Cluster A Mean`, color=`Transition 4/5 p-adj`)) +
geom_point(size=.7) +
scale_color_viridis_c() +
scale_x_continuous(limits=c(-250,500), trans = scales::pseudo_log_trans()) +
scale_y_continuous(limits=c(-2,3)) +
geom_text(aes(label=sprintf("Spearman Corr: %.2f",corr_counts$estimate), x=-50, y=2), size=2) +
labs(title="Kielens vs Transiton 4/5 By Counts",
color="Transition 4/5 p-adjusted",
x="Kielens Naive - Primed Counts",
y="Cluster F - Cluster A Log2 Counts (VST-Batch Corrected)") +
theme_bw() +
base_plot_theme +
theme(legend.key.width = unit(4, units="mm"))
ggsave("figs/supplementary/kielens_counts_clusterF_correlation.svg",kielens_counts_plot,width = 87, height=87, units = "mm")
kielens_barplot_df <- table_3_expression %>%
filter(`Transition 4/5 p-adj` <= .1 & !is.na(`kielens_log2FC`)) %>%
arrange(desc(`kielens_log2FC`)) %>%
dplyr::select(`Gene Symbol`, `Transition 4/5 p-adj`, `Chromosome`, `kielens_log2FC`, `Transition 4/5 log2FoldChange`) %>%
reshape2::melt(id.vars=c("Gene Symbol", "Transition 4/5 p-adj", "Chromosome"))
positive_genes <- (kielens_barplot_df %>% filter(variable == "kielens_log2FC" & value > 0))$`Gene Symbol`
negative_genes <- (kielens_barplot_df %>% filter(variable == "kielens_log2FC" & value < 0))$`Gene Symbol`
kielens_barplot_labels_df_pos <- kielens_barplot_df %>%
filter(Chromosome != "X" & `Gene Symbol` %in% positive_genes) %>%
group_by(`Gene Symbol`) %>%
summarise(pos=max(abs(value))*sign(prod(value)))
kielens_barplot_labels_df_neg <- kielens_barplot_df %>%
filter(Chromosome != "X" & `Gene Symbol` %in% negative_genes) %>%
group_by(`Gene Symbol`) %>%
summarise(pos=-1*max(abs(value))*sign(prod(value)))
kielens_barplot <- ggplot(kielens_barplot_df %>% filter(Chromosome != "X")) +
geom_bar(aes(x=factor(`Gene Symbol`, levels=rev(unique(kielens_barplot_df$`Gene Symbol`))), y=value, fill=variable),
stat="identity",
position=position_dodge()) +
geom_text(data= kielens_barplot_labels_df_pos,
aes(x=factor(`Gene Symbol`, levels=rev(unique(kielens_barplot_df$`Gene Symbol`))),
label=`Gene Symbol`,
y=pos), hjust=0, nudge_x = 0, nudge_y = .5, size=1.2) +
geom_text(data= kielens_barplot_labels_df_neg,
aes(x=factor(`Gene Symbol`, levels=rev(unique(kielens_barplot_df$`Gene Symbol`))),
label=`Gene Symbol`,
y=pos), hjust=1,nudge_x = 0, nudge_y = -.5, size=1.2) +
coord_flip() +
scale_fill_uchicago(limits=c("Transition 4/5 log2FoldChange", "kielens_log2FC"),
labels=c("Transition 4/5", "Kilens")) +
labs(x="Genes", y="log2(Fold-Change)", title="Fold Change of Kilens, et al. 2018 Genes", fill="Dataset") +
theme_bw() +
base_plot_theme +
scale_x_discrete(breaks = NULL) +
scale_y_continuous(limits=c(-7.5,10))
ggsave("figs/main/fig4_kilens_barplot.svg",kielens_barplot,width = 58, height=174, units = "mm")
#kielens samples heatmap, and our female samples heatmap using the expression data
kielens_cell_lines <- read_csv("random_checks/kielens_cell_lines.csv")
kielens_expression_data <- read_csv("random_checks/kielens_counts.csv")
kielens_cell_lines <- kielens_cell_lines %>% filter(growth_media %in% c("KSR+FGF2", "T2iLGö"))
kielens_expression_data_filtered <- kielens_expression_data %>%
select(`Gene Symbol`, matches(paste(sprintf("%s.*", kielens_cell_lines$cell_line), collapse="|")))
batch_corrected_vsd_df <- read_csv("batch_corrected_vsd_df.csv")
batch_corrected_vsd_df <- batch_corrected_vsd_df %>%
select(-c(ensembl_id,
ensembl_id_no_version,
end_position,
start_position,
description,
strand))
kielens_and_cluster_samples <- inner_join(batch_corrected_vsd_df, kielens_expression_data_filtered, by=c("hgnc_symbol"="Gene Symbol"))
kielens_and_cluster_samples_heatmap_df <- kielens_and_cluster_samples %>%
reshape2::melt(id.vars=c("hgnc_symbol", "chromosome_name"))
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
kielens_and_cluster_samples_heatmap_df <- kielens_and_cluster_samples_heatmap_df %>%
left_join(cluster_tib, by=c("variable"="sample_name"))
naive_cell_lines <- filter(kielens_cell_lines, growth_media == "T2iLGö")$cell_line
primed_cell_lines <- filter(kielens_cell_lines, growth_media == "KSR+FGF2")$cell_line
kielens_and_cluster_samples_heatmap_df[grepl(paste(sprintf("%s.*", primed_cell_lines), collapse="|"), kielens_and_cluster_samples_heatmap_df$variable), "cluster"] <- "KSR+FGF2"
kielens_and_cluster_samples_heatmap_df[grepl(paste(sprintf("^%s", naive_cell_lines), collapse="|"), kielens_and_cluster_samples_heatmap_df$variable), "cluster"] <- "T2iLGö"
gene_order <- unique(kielens_barplot_df$`Gene Symbol`)
kielens_heatmap <- ggplot(kielens_and_cluster_samples_heatmap_df %>% filter(chromosome_name != "X", !is.na(hgnc_symbol))) +
facet_grid(cols=vars(cluster), scales="free_x") +
geom_tile(mapping=aes(x=variable, y=factor(hgnc_symbol, levels = gene_order), fill=value))
ggplot(kielen_expr_change_df %>% filter(`padj (F/A)` <= .1), aes(x=kielens_log2FC, y=`Cluster F Mean`-`Cluster A Mean`, color=`padj (F/A)`)) +
geom_point() +
scale_color_viridis_c() +
scale_y_continuous(trans="pseudo_log", limits = c(-2.5,2.5)) +
labs(title="kielens_log2FC vs `Cluster F Mean`-`Cluster A Mean`")
cor.test(temp_df$kielens_log2FC,
temp_df$F_minus_A,
method = "spearman")
ggplot(kielen_expr_change_df %>% filter(`padj (D/A)` <= .1), aes(x=kielens_log2FC, y=`log2(D/A)`, color=`padj (D/A)`)) +
geom_point() +
scale_color_viridis_c() +
labs(title="kielens_log2FC vs log2(clusterD/A)")
ggplot(kielen_expr_change_df %>% filter(`padj (D/A)` <= .1), aes(x=kielens_log2FC, y=`Cluster D Mean`-`Cluster A Mean`, color=`padj (D/A)`)) +
geom_point() +
scale_color_viridis_c() +
scale_y_continuous(trans="pseudo_log") +
labs(title="kielens_log2FC vs `Cluster D Mean`-`Cluster A Mean`")
# look for progressive selection of naive vs primed genes
pairwise_differential_expression <- read_csv("pairwise_differential_expression.csv")
naive_kielens_genes <- kielens_genes %>% filter(kielens_log2FC > 0)
plot_df <- pairwise_differential_expression %>%
filter(hgnc_symbol %in% naive_kielens_genes$gene) %>%
dplyr::select(hgnc_symbol,
tran_1_log2FC,
tran_2_log2FC,
tran_3_log2FC,
tran_45_log2FC) %>%
reshape2::melt(id.vars="hgnc_symbol")
ggplot(plot_df, aes(x=variable, y=value)) +
geom_violin() +
labs(title="Changes in naive genes (kielens_log2FC > 0)")
plot_df <- kielen_expr_change_df %>%
filter(`Gene Symbol` %in% naive_kielens_genes$gene) %>%
dplyr::select(`Gene Symbol`,
`Cluster A Mean`,
`Cluster B Mean`,
`Cluster C Mean`,
`Cluster D Mean`,
`Cluster F Mean`) %>%
reshape2::melt(id.vars="Gene Symbol")
ggplot(plot_df, aes(x=variable, y=value)) +
geom_violin() +
labs(title="Counts of naive genes (kielens_log2FC > 0)")
table_3_expression_naive_genes <- kielen_expr_change_df %>% filter(`Gene Symbol` %in% naive_kielens_genes$gene)
table_3_expression_naive_genes <- table_3_expression_naive_genes %>%
dplyr::transmute(`padj (F/A)`=`padj (F/A)`,
`Gene Symbol`=`Gene Symbol`,
cluster_A_change = 0,
cluster_B_change = `Cluster B Mean` - `Cluster A Mean`,
cluster_C_change = `Cluster C Mean` - `Cluster A Mean`,
cluster_D_change = `Cluster D Mean` - `Cluster A Mean`,
cluster_F_change = `Cluster F Mean` - `Cluster A Mean`) %>%
reshape2::melt(id.vars=c("Gene Symbol", "padj (F/A)"))
ggplot(table_3_expression_naive_genes %>% filter(`padj (F/A)` <= .1), aes(x=variable, y=value, group=`Gene Symbol`, color=`padj (F/A)`)) +
geom_line() +
scale_color_viridis_c()
primed_kielens_genes <- kielens_genes %>% filter(kielens_log2FC < 0)
table_3_expression_primed_genes <- kielen_expr_change_df %>% filter(`Gene Symbol` %in% primed_kielens_genes$gene)
table_3_expression_primed_genes <- table_3_expression_primed_genes %>%
dplyr::transmute(`padj (F/A)`=`padj (F/A)`,
`Gene Symbol`=`Gene Symbol`,
cluster_A_change = 0,
cluster_B_change = `Cluster B Mean` - `Cluster A Mean`,
cluster_C_change = `Cluster C Mean` - `Cluster A Mean`,
cluster_D_change = `Cluster D Mean` - `Cluster A Mean`,
cluster_F_change = `Cluster F Mean` - `Cluster A Mean`) %>%
reshape2::melt(id.vars=c("Gene Symbol", "padj (F/A)"))
ggplot(table_3_expression_primed_genes %>% filter(`padj (F/A)` <= .1), aes(x=variable, y=value, group=`Gene Symbol`, color=`padj (F/A)`)) +
geom_line() +
scale_color_viridis_c()
table_3_expression <- left_join(table_3_expression, cluster_F_vs_A_expr, by=c("Ensembl ID"="ensembl_id"))
table_3_expression_non_prime_non_naive_genes <- table_3_expression %>% filter(!(`Gene Symbol` %in% c(naive_kielens_genes$gene, primed_kielens_genes$gene)))
table_3_expression_non_prime_non_naive_genes <- table_3_expression_non_prime_non_naive_genes %>%
dplyr::transmute(`padj (F/A)`=`padj (F/A)`,
`Gene Symbol`=`Gene Symbol`,
cluster_A_change = 0,
cluster_B_change = `Cluster B Mean` - `Cluster A Mean`,
cluster_C_change = `Cluster C Mean` - `Cluster A Mean`,
cluster_D_change = `Cluster D Mean` - `Cluster A Mean`,
cluster_F_change = `Cluster F Mean` - `Cluster A Mean`) %>%
reshape2::melt(id.vars=c("Gene Symbol", "padj (F/A)"))
non_prime_plot <- ggplot(table_3_expression_non_prime_non_naive_genes %>% filter(`padj (F/A)` <= .1), aes(x=variable, y=value, group=`Gene Symbol`, color=`padj (F/A)`)) +
geom_line() +
scale_color_viridis_c()
ggsave("figs/non_prime_non_naive.png", non_prime_plot)
primed_plot_df <- pairwise_differential_expression %>%
filter(hgnc_symbol %in% primed_kielens_genes$gene) %>%
dplyr::select(hgnc_symbol,
tran_1_log2FC,
tran_2_log2FC,
tran_3_log2FC,
tran_45_log2FC) %>%
reshape2::melt(id.vars="hgnc_symbol")
ggplot(primed_plot_df, aes(x=variable, y=value)) +
geom_violin() +
labs(title="Changes in primed genes (kielens_log2FC > 0)")
plot_df <- kielen_expr_change_df %>%
filter(`Gene Symbol` %in% primed_kielens_genes$gene) %>%
dplyr::select(`Gene Symbol`,
`Cluster A Mean`,
`Cluster B Mean`,
`Cluster C Mean`,
`Cluster D Mean`,
`Cluster F Mean`) %>%
reshape2::melt(id.vars="Gene Symbol")
ggplot(plot_df, aes(x=variable, y=value)) +
geom_violin() +
labs(title="Counts of primed genes (kielens_log2FC > 0)")
ggplot(kielen_expr_change_df, aes(x=kielen_log2FC, y=-log10(padj))) +
geom_point() +
labs(title="kielen_log2FC vs continuous diff exp padj")
ggplot(kielen_expr_change_df, aes(x=kielen_log2FC, y=log2FoldChange)) +
geom_point() +
labs(title="kielen_log2FC vs continuous diff exp log2FC")
kielen_expr_change_df <- left_join(kielen_expr_change_df, count_cluster_means, by=c("hgnc_symbol"="gene_name"))
kielen_counts_df <- reshape2::melt(kielen_expr_change_df %>% filter(padj <= .1) %>% select(hgnc_symbol, chromosome_name, kielen_log2FC, !!sprintf("cluster_%s_mean", c("A","B","C","D","F"))), id.vars=c("hgnc_symbol", "chromosome_name", "kielen_log2FC"))
ggplot(kielen_counts_df %>% filter(chromosome_name != "X"), aes(x=kielen_log2FC, y=value, group=hgnc_symbol)) +
# geom_point(aes(size=kielen_log2FC)) +
geom_point(aes(color=kielen_log2FC), alpha=.1) +
facet_grid(rows = vars(variable)) +
# scale_y_log10() +
scale_y_continuous(limits = c(0,5e3)) +
scale_color_viridis_c()
kielens_cell_lines <- read_csv("random_checks/kielens_cell_lines.csv")
kielens_expression_data <- read_csv("random_checks/kielens_counts.csv")
library(broom)
expression_cell_lines <- kielens_expression_data %>%
dplyr::select(-`Gene Symbol`, -overlap, -`Log2(Fold-Change)`, -`Adjusted p-value`, -`False discovery Rate`) %>%
colnames() %>%
tidy() %>%
mutate(cell_line=str_replace(x, "_P.*", ""),
passage=str_replace(x, ".*_P", "")) %>%
left_join(kielens_cell_lines, by=c("cell_line"))
kielens_expression_counts_df <- kielens_expression_data %>%
dplyr::select(-overlap, -`Adjusted p-value`, -`False discovery Rate`) %>%
reshape2::melt(id.vars=c("Gene Symbol", "Log2(Fold-Change)"), value.name="expression_count", variable.name="cell_line_name") %>%
left_join(expression_cell_lines, by=c("cell_line_name"="x"))
counts_all_by_cluster <- read_csv("counts_all_by_cluster.csv")
kielens_expression_counts_df_naive <- kielens_expression_counts_df %>%
filter(growth_media == "T2iLGö") %>%
group_by(`Gene Symbol`) %>%
summarise(mean_naive_expression=mean(expression_count),
kielens_log2FC=dplyr::first(`Log2(Fold-Change)`)) %>%
left_join(counts_all_by_cluster, by=c("Gene Symbol"="gene_name")) %>%
# reshape2::melt(id.vars=c("Gene Symbol", "mean_naive_expression"), variable.name="cluster", value.name="cluster_mean") %>%
left_join(continuous_expr_change %>% dplyr::select(hgnc_symbol, chromosome_name, padj), by=c("Gene Symbol"="hgnc_symbol") )
kielens_expression_counts_df_naive <- left_join(kielens_expression_counts_df_naive,
continous_expr_no_F %>% dplyr::select(hgnc_symbol, no_F_padj=padj, no_F_log2FC=log2FoldChange),
by=c("Gene Symbol"="hgnc_symbol"))
ggplot(kielens_expression_counts_df_naive %>% filter(padj < .1 & chromosome_name != "X"), aes(x=mean_naive_expression, y=cluster_mean)) +
geom_point() +
facet_grid(rows = vars(cluster)) +
labs(title="Naive cells - T2iLGö")
cor.test((kielens_expression_counts_df_naive %>%
filter(no_F_padj.x < .1 & chromosome_name != "X") %>%
mutate(diff=mean_naive_expression-mean_primed_expression))$diff,
(kielens_expression_counts_df_naive %>%
filter(no_F_padj.x < .1 & chromosome_name != "X") %>%
mutate(diff=cluster_D_mean-cluster_A_mean))$diff, method="spearman")
cor.test((kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X") %>%
mutate(diff=mean_naive_expression-mean_primed_expression))$diff,
(kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X") %>%
mutate(diff=cluster_D_mean-cluster_A_mean))$diff, method="spearman")
cor.test((kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X") %>%
mutate(diff=mean_naive_expression-mean_primed_expression))$diff,
(kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X") %>%
mutate(diff=cluster_F_mean-cluster_A_mean))$diff, method="spearman")
cor.test((kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X") %>%
mutate(diff=mean_naive_expression-mean_primed_expression))$diff,
(kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X") %>%
mutate(diff=cluster_D_mean-cluster_A_mean))$diff)
cor.test((kielens_expression_counts_df_naive %>%
filter(no_F_padj < .1 & chromosome_name != "X"))$kielens_log2FC,
(kielens_expression_counts_df_naive %>%
filter(no_F_padj < .1 & chromosome_name != "X"))$no_F_log2FC, method="spearman")
cor.test((kielens_expression_counts_df_naive %>%
filter(no_F_padj < .1 & chromosome_name != "X"))$kielens_log2FC,
(kielens_expression_counts_df_naive %>%
filter(no_F_padj < .1 & chromosome_name != "X"))$log2FC, method="spearman")
kielens_expression_counts_df_primed <- kielens_expression_counts_df %>%
filter(growth_media %in% c("KSR+FGF2")) %>%
group_by(`Gene Symbol`) %>%
summarise(mean_primed_expression=mean(expression_count)) %>%
left_join(counts_all_by_cluster, by=c("Gene Symbol"="gene_name")) %>%
# reshape2::melt(id.vars=c("Gene Symbol", "mean_primed_expression"), variable.name="cluster", value.name="cluster_mean")%>%
left_join(continuous_expr_change %>% dplyr::select(hgnc_symbol, chromosome_name, padj), by=c("Gene Symbol"="hgnc_symbol") )
ggplot(kielens_expression_counts_df_primed %>% filter(padj < .1 & chromosome_name != "X"), aes(x=mean_primed_expression, y=cluster_mean)) +
geom_point() +
facet_grid(rows = vars(cluster)) +
labs(title="Primed cells - KSR+FGF2")
kielens_expression_counts_df_naive <- left_join(kielens_expression_counts_df_naive, kielens_expression_counts_df_primed %>% dplyr::select(`Gene Symbol`, mean_primed_expression), by="Gene Symbol")
kielens_expression_counts_df_naive <- left_join(kielens_expression_counts_df_naive,
continous_expr_no_F %>% dplyr::select(hgnc_symbol, no_F_padj=padj, no_F_log2FC=log2FoldChange),
by=c("Gene Symbol"="hgnc_symbol"))
library(scales)
ggplot(kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X"), aes(x=mean_naive_expression-mean_primed_expression, y=cluster_D_mean-cluster_A_mean)) +
geom_point(aes(color=padj)) +
labs(title="naive-primed and clusterD-clusterA") +
scale_color_viridis_c() +
scale_x_continuous(trans = pseudo_log_trans(base = 2)) +
scale_y_continuous(trans = pseudo_log_trans(base = 2))
ggplot(kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X"), aes(x=mean_naive_expression-mean_primed_expression, y=cluster_F_mean-cluster_A_mean)) +
geom_point(aes(color=padj)) +
labs(title="naive-primed and clusterF-clusterA") +
scale_color_viridis_c() +
scale_x_continuous(trans = pseudo_log_trans(base = 2)) +
scale_y_continuous(trans = pseudo_log_trans(base = 2))
ggplot(kielens_expression_counts_df_naive %>%
filter(no_F_padj < .1 & chromosome_name != "X"),
aes(x=log2(mean_naive_expression/mean_primed_expression),
y=no_F_log2FC)) +
geom_point(aes(color=no_F_padj)) +
labs(title="log2(naive/primed) and log2(clusterD/clusterA)") +
scale_color_viridis_c()
ggplot(kielens_expression_counts_df_naive %>%
filter(padj < .1 & chromosome_name != "X"),
aes(x=log2(mean_naive_expression/mean_primed_expression),
y=log2(cluster_F_mean/cluster_A_mean))) +
geom_point(aes(color=padj)) +
labs(title="log2(naive/primed) and log2(clusterF/clusterA)") +
scale_color_viridis_c()
```
## Methylation PCA Validation that cluster F expression represents entire cluster
```{r}
female_df_no_h9 <- read_csv("female_df_no_h9.csv")
filtered_site_annotation_df <- read_csv("filtered_site_annotation_df.csv")
sample_annotation_df <- read_csv("sample_data_combined_all_info.csv")
statistics <- read_csv('variance_statistics_noh9_nooutliers.csv')
sigvar_probes <- statistics$Position[statistics$alpha <= .01 & (statistics$femaleVariance > statistics$maleVariance)]
# sig_x_sites <- intersect(positions_to_keep, x_annotation_df$rowname)
cluster_F_vs_A_expr <- read_csv("cluster_F_vs_A_expr.csv") %>%
dplyr::select(ensembl_id,
`log2(F/A)`=log2FoldChange,
`padj (F/A)`=padj)
cluster_F_vs_A_expr$ensembl_id_no_ver <- gsub("\\..*", "", cluster_F_vs_A_expr$ensembl_id)
cutoff_1_gene_names <- get_genename_from_ensembl(cluster_F_vs_A_expr$ensembl_id_no_ver)
cluster_F_vs_A_expr <- left_join(cluster_F_vs_A_expr, cutoff_1_gene_names, by=c("ensembl_id_no_ver" = "ensembl_gene_id"))
cutoff_1_genes <- filter(cluster_F_vs_A_expr, `padj (F/A)` < .1)
cutoff_1_probes <- filtered_site_annotation_df %>% filter(hg38_gene_name %in% cutoff_1_genes$hgnc_symbol)
female_df_no_h9_cutoff_1 <- female_df_no_h9 %>% filter(rn %in% cutoff_1_probes$rowname)
female_df_no_h9_cutoff_1_mat <- t(as.matrix(female_df_no_h9_cutoff_1 %>% column_to_rownames(var="rn")))
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
cluster_tib <- left_join(cluster_tib,
sample_annotation_df %>%
dplyr::select(sample_name, sra_accession),
by="sample_name")
cluster_tib <- column_to_rownames(cluster_tib, "sample_name")
library(ggfortify)
cluster_tib$have_expression <- !is.na(cluster_tib$sra_accession)
cluster_tib$cluster <- factor(cluster_tib$cluster, levels = c(0,1,2,3,4,5))
autoplot(prcomp(female_df_no_h9_cutoff_1_mat), data=cluster_tib, colour="cluster", shape="have_expression")
female_df_no_h9_cutoff_1_sigvar <- female_df_no_h9 %>% filter(rn %in% cutoff_1_probes$rowname & rn %in% sigvar_probes)
female_df_no_h9_cutoff_1_sigvar_mat <- t(as.matrix(female_df_no_h9_cutoff_1_sigvar %>% column_to_rownames(var="rn")))
autoplot(prcomp(female_df_no_h9_cutoff_1_sigvar_mat),
data=cluster_tib,
colour="cluster",
shape="have_expression",
x=1,
y=2)
x_probes <- filtered_site_annotation_df %>% filter(hg38_chromosome == "chrX")
female_df_no_h9_cutoff_1_sigvar_no_x <- female_df_no_h9 %>% filter(rn %in% cutoff_1_probes$rowname & rn %in% sigvar_probes & !(rn %in% x_probes$rowname))
female_df_no_h9_cutoff_1_sigvar_no_x_mat <- t(as.matrix(female_df_no_h9_cutoff_1_sigvar_no_x %>% column_to_rownames(var="rn")))
autoplot(prcomp(female_df_no_h9_cutoff_1_sigvar_no_x_mat),
data=cluster_tib,
colour="cluster",
shape="have_expression",
x=1,
y=2)
female_df_no_h9_cutoff_1_no_x <- female_df_no_h9 %>% filter(rn %in% cutoff_1_probes$rowname & !(rn %in% x_probes$rowname))
female_df_no_h9_cutoff_1_no_x_mat <- t(as.matrix(female_df_no_h9_cutoff_1_no_x %>% column_to_rownames(var="rn")))
cluster_tib$clutser_F = cluster_tib$cluster == 5
autoplot(prcomp(female_df_no_h9_cutoff_1_no_x_mat),
data=cluster_tib,
colour="clutser_F",
shape="have_expression",
x=1,
y=2)
cluster_a_f_samples <- cluster_tib %>% filter(cluster %in% c(0,5))
heatmap_mat <- t(female_df_no_h9_cutoff_1_no_x_mat[cluster_a_f_samples$sample_name,])
library(heatmap3)
heatmap3(heatmap_mat)
```
## Looking for ERV representation in clusters
```{r}
library(tidyverse)
library(stringr)
# parsing GTF file
journal_gtf_df <- read_tsv("journal_gtf_summary.tsv")
journal_gtf_df_ltrs <- journal_gtf_df %>%
filter(geneRegion == "ltr") %>%
group_by(locus) %>%
summarise(ltr_rep_name=paste(unique(repName), collapse=","))
journal_gtf_df_internal <- journal_gtf_df %>%
filter(geneRegion == "internal") %>%
group_by(locus) %>%
summarise(internal_rep_name=unique(repName))
journal_gtf_df_formatted <- full_join(journal_gtf_df_ltrs, journal_gtf_df_internal, by="locus")
write_csv(journal_gtf_df_formatted, "journal_gtf_df_formatted.csv")
library(DESeq2)
all_erv_counts <- read_csv("all_erv_counts_df.csv")
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
seq_type_df <- read_csv("sample_seq_type.csv")
sample_info_df <- read_csv("sample_data_combined_all_info.csv")
sample_info_df <- left_join(sample_info_df, seq_type_df, by=c("sample_name"="Sample Name")) %>%
left_join(cluster_tib, by="sample_name")
sample_info_df_filtered <- sample_info_df %>% filter(gender=="female" & !is.na(sra_accession) & !is.na(cluster))
sample_annotation_df <- tibble(sample_name=sample_info_df_filtered$sra_accession,
cluster=as.factor(sample_info_df_filtered$cluster),
group=sample_info_df_filtered$group_accession,
seq_type=sample_info_df_filtered$end_type)
sample_annotation_df <- column_to_rownames(sample_annotation_df, var="sample_name")
all_erv_counts <- column_to_rownames(all_erv_counts, var="transcript")
all_erv_counts[is.na(all_erv_counts)] <- 0
dds <- DESeqDataSetFromMatrix(countData = all_erv_counts[rownames(sample_annotation_df)],
colData = sample_annotation_df,
design = ~ seq_type + group + cluster)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
erv_vsd <- vst(dds, blind = F)
library(limma)
erv_batch_corrected_vsd_df <- as.data.frame(removeBatchEffect(assay(erv_vsd),
batch = erv_vsd$group, batch2 = erv_vsd$seq_type)) %>%
rownames_to_column("erv_id")
# genes <- batch_corrected_vsd_df$ensembl_id_no_version
# G_list <- get_genename_from_ensembl(genes)
# batch_corrected_vsd_df <- batch_corrected_vsd_df %>%
# left_join(G_list, by=c("ensembl_id_no_version"="ensembl_gene_id"))
write_csv(erv_batch_corrected_vsd_df, "erv_batch_corrected_vsd_df.csv")
# sampleTable <- data.frame(sampleName = sample_names,
# fileName = sample_file_to_keep,
# cluster = sampleCondition,
# seq_type = sample_seq_type,
# group = sample_group)
sample_annotation_df <- tibble(sample_name=sample_info_df_filtered$sra_accession,
cluster=sample_info_df_filtered$cluster,
group=sample_info_df_filtered$group_accession,
seq_type=sample_info_df_filtered$end_type)
diff_expr_all_df <- tibble(erv_id=character())
for (cluster in 1:4) {
cluster_0 <- as.numeric(cluster - 1)
cluster_1 <- as.numeric(cluster)
if (cluster == 4) {
cluster_1 <- 5
}
sampleTableFiltered <- sample_annotation_df %>% column_to_rownames(var="sample_name") #%>% filter(cluster >= cluster_0)# %in% c(cluster_0, cluster_1))
sampleTableFiltered[sampleTableFiltered$cluster <= cluster_0, "cluster"] <- cluster_0
sampleTableFiltered[sampleTableFiltered$cluster > cluster_0, "cluster"] <- cluster_1
sampleTableFiltered$cluster <- factor(sampleTableFiltered$cluster)
dds <- DESeqDataSetFromMatrix(countData = all_erv_counts[rownames(sampleTableFiltered)],
colData = sampleTableFiltered,
design = ~ seq_type + group + cluster)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
res <- results(dds, tidy=T) %>% dplyr::rename(erv_id=row)
# res_tib <- as_tibble(res, rownames = NA)
# res_tib <- rownames_to_column(res_tib, "EnsGene")
transition_fold_change <- sprintf("transition_%s_log2FC", cluster)
transition_padj <- sprintf("transition_%s_padj", cluster)
temp_df <- dplyr::select(res, erv_id,
!!transition_fold_change := log2FoldChange,
!!transition_padj := padj)
diff_expr_all_df <- full_join(diff_expr_all_df, temp_df, by="erv_id")
}
temp_df <- dplyr::select(diff_expr_all_df, ends_with("padj"))
temp_df[is.na(temp_df)] <- 1
diff_expr_all_df$key_transition <- as.integer(unlist(apply(temp_df, 1, which.min)))
diff_expr_all_df$passes_threshold <- FALSE
diff_expr_all_df$positive_delta <- FALSE
for (row_idx in 1:nrow(diff_expr_all_df)) {
sig_transition <- sprintf("transition_%d_padj", unlist(diff_expr_all_df[row_idx, "key_transition"]))
sig_log2FC <- sprintf("transition_%d_log2FC", unlist(diff_expr_all_df[row_idx, "key_transition"]))
diff_expr_all_df[row_idx, "positive_delta"] <- as.logical(unlist(diff_expr_all_df[row_idx, sig_log2FC] > 0))
if ((! is.na(diff_expr_all_df[row_idx, sig_transition])) & unlist(diff_expr_all_df[row_idx, sig_transition]) <= .1) {
diff_expr_all_df[row_idx, "passes_threshold"] <- TRUE
}
}
write_csv(diff_expr_all_df, "erv_diff_expr_all_df.csv")
tran_4_ervs <- (diff_expr_all_df %>% filter(transition_4_padj < .1 & transition_4_log2FC < 0))$erv_id
plot_df <- erv_batch_corrected_vsd_df %>%
filter(erv_id %in% tran_4_ervs) %>%
reshape2::melt(id.vars="erv_id", variable.name="sample_name", value.name="batch_corrected_expression") %>%
left_join(sample_annotation_df, by="sample_name")
ggplot(plot_df, aes(x=erv_id, fill=as.factor(cluster), y=batch_corrected_expression)) +
geom_bar(stat="identity", position = position_dodge())
erv_gtf <- read_csv("erv_gtf.csv")
diff_expr_all_df <- left_join(diff_expr_all_df, erv_gtf, by="erv_id")
write_csv(diff_expr_all_df, "erv_diff_expr_all_df.csv")
sample_info_df <- read_csv("sample_data_combined_all_info.csv")
sample_info_df <- sample_info_df %>%
left_join(cluster_tib, by="sample_name")
sample_info_df_filtered <- sample_info_df %>% filter(gender=="female" & !is.na(sra_accession) & !is.na(cluster))
sample_annotation_df <- tibble(sample_name=sample_info_df_filtered$sra_accession,
cluster=as.factor(sample_info_df_filtered$cluster),
group=sample_info_df_filtered$group_accession)
erv_batch_corrected_vsd_df <- read_csv("erv_batch_corrected_vsd_df.csv.gz")
erv_diff_expr_all_df <- read_csv("erv_diff_expr_all_df.csv.gz")
cluster_erv_means <- erv_batch_corrected_vsd_df %>%
reshape2::melt(id.vars="erv_id", variable.name="sample_name", value.name="batch_corrected_expression") %>%
left_join(sample_annotation_df, by="sample_name") %>%
group_by(erv_id, cluster) %>%
summarise(mean_count=mean(batch_corrected_expression)) %>%
reshape2::dcast(erv_id ~ cluster) %>%
dplyr::rename(cluster_A_mean=`0`,
cluster_B_mean=`1`,
cluster_C_mean=`2`,
cluster_D_mean=`3`,
cluster_F_mean=`5`)
erv_diff_expr_all_df <- read_csv("erv_diff_expr_all_df.csv.gz")
erv_diff_expr_all_df <- full_join(erv_diff_expr_all_df, cluster_erv_means, by="erv_id")
# write_csv(erv_diff_expr_all_df %>% dplyr::select(erv_id,chr,start,end,ends_with("_mean"), starts_with("transition"), key_transition, passes_threshold, positive_delta), "formatted_erv_diff_expr_all_df.csv")
journal_gtf_df_formatted <- read_csv("journal_gtf_df_formatted.csv.gz")
erv_diff_expr_all_df <- left_join(erv_diff_expr_all_df, journal_gtf_df_formatted, by=c("erv_id"="locus"))
erv_plot_df <- erv_diff_expr_all_df %>%
mutate(cluster_A_diff=0,
cluster_B_diff=cluster_B_mean-cluster_A_mean,
cluster_C_diff=cluster_C_mean-cluster_A_mean,
cluster_D_diff=cluster_D_mean-cluster_A_mean,
cluster_F_diff=cluster_F_mean-cluster_A_mean) %>%
reshape2::melt(id.vars=c("erv_id", "ltr_rep_name", "internal_rep_name", "transition_4_padj", "chr"),
measure.vars=c("cluster_A_diff", "cluster_B_diff", "cluster_C_diff", "cluster_D_diff", "cluster_F_diff"))
ltr_counts <- erv_plot_df %>%
filter(transition_4_padj <= .1 & chr != "chrX") %>%
count(ltr_rep_name) %>%
mutate(n=n/5) %>%
rename(!!'transition_4_padj <= .1 & chr != "chrX" counts':=n) %>%
left_join(erv_diff_expr_all_df %>%
count(ltr_rep_name) %>%
rename(total=n),
by="ltr_rep_name") %>%
mutate(hypergeometric_pval=phyper(`transition_4_padj <= .1 & chr != "chrX" counts` - 1,
total,
sum(total)-total,
sum(`transition_4_padj <= .1 & chr != "chrX" counts`),
lower.tail = F))
write_csv(ltr_counts, "ltr_counts.csv")
erv_plot_df <- left_join(erv_plot_df, ltr_counts, by="ltr_rep_name")
ltr_pal <- c("#fdfdfd", "#1d1d1d", "#ebce2b", "#702c8c", "#db6917", "#96cde6", "#ba1c30", "#c0bd7f", "#7f7e80", "#5fa641", "#d485b2", "#4277b6", "#df8461", "#463397", "#e1a11a", "#91218c", "#e8e948", "#7e1510", "#92ae31", "#6f340d", "#d32b1e", "#2b3514")
ltr_plot <- ggplot(erv_plot_df %>% filter(transition_4_padj <= .1 & chr != "chrX" & hypergeometric_pval > .1), aes(x=variable, y=value, group=erv_id)) +
geom_line(color="grey", size=.3, alpha=.5) +
geom_line(data=erv_plot_df %>% filter(transition_4_padj <= .1 & chr != "chrX" & hypergeometric_pval <= .1 & ltr_rep_name=="LTR7"), aes(color=ltr_rep_name), size=.3, alpha=.7) +
geom_line(data=erv_plot_df %>% filter(transition_4_padj <= .1 & chr != "chrX" & hypergeometric_pval <= .1 & ltr_rep_name != "LTR7"), aes(color=ltr_rep_name), size=.3, alpha=1) +
scale_color_manual(values=ltr_pal[c(5,6,12,4,8,10,3,7,22,13)],
limits=c("LTR40a,LTR40b",
"LTR5_Hs",
"LTR5_Hs,LTR5",
"LTR5_Hs,LTR5B",
"LTR7",
"LTR7Y",
"LTR1",
"LTR2C",
"LTR6A",
"MER50"),
labels=c("LTR40a,LTR40b - 1/3*",
"LTR5_Hs - 5/37***",
"LTR5_Hs,LTR5 - 1/1**",
"LTR5_Hs,LTR5B - 1/3*",
"LTR7 - 41/865****",
"LTR7Y - 5/80*",
"LTR1 - 1/2*",
"LTR2C - 3/38*",
"LTR6A - 2/16*",
"MER50 - 4/53*")) +
theme_bw() +
base_plot_theme +
scale_x_discrete(breaks=c("cluster_A_diff",
"cluster_B_diff",
"cluster_C_diff",
"cluster_D_diff",
"cluster_F_diff"),
labels=c("A",
"B",
"C",
"D",
"F"),
expand=c(0,0)) +
labs(title="Differentially Expressed HERVs in Transition 4/5",
y="Cumulative Δ Expr. (Log2 VST+Batch Corr.)",
x="Cluster",
color="LTR Class") +
guides(color = guide_legend(ncol = 1)) +
theme(legend.position="right",
legend.box.margin = margin(0,0,0,0, unit="mm"),
legend.box.spacing = unit(1, "mm"))
ggsave("figs/main/fig4_ltr_plot.svg", ltr_plot, height = 61, width=76, units = "mm")
```
## Looking for concordance between methylation and expression changes
## Seeing if I can find escapee genes by comparing males to females, and looking for similarly expressed genes
```{r}
library(DESeq2)
directory <- "htseq_counts"
count_file_suffix <- "Aligned.out.sorted.bam.htseq_counts"
sampleFiles <- grep(count_file_suffix,list.files(directory),value=TRUE)
rna_seq_accessions_tib <- read_csv("rna_seq_accessions_tib.csv")
rna_seq_accessions_tib[grepl("female", rna_seq_accessions_tib$SampleName), "sex"] <- "female"
rna_seq_accessions_tib[!grepl("female", rna_seq_accessions_tib$SampleName), "sex"] <- "male"
rna_seq_accessions_tib <- rna_seq_accessions_tib %>% filter()
cluster_tib <- read_csv("saved_sample_clusters.csv", col_names = c("sample_name", "cluster"))
cluster_tib <- column_to_rownames(cluster_tib, "sample_name")
sample_file_to_keep <- c()
sample_names <- c()
sampleCondition <- c()
for(filename in sampleFiles) {
sra_acc <-sub(count_file_suffix, "", filename)
if (sra_acc %in% rna_seq_accessions_tib$sra_accession) {
sample_name <- (filter(rna_seq_accessions_tib, sra_accession == sra_acc))$SampleName
cluster <- cluster_tib[sample_name, 1]
sample_file_to_keep <- c(sample_file_to_keep, filename)
sample_names <- c(sample_names, sample_name)
sampleCondition <- c(sampleCondition, (filter(rna_seq_accessions_tib, sra_accession == sra_acc))$sex)
# counts_tib <- read_tsv(paste(directory, "/", filename, sep=""), col_names = c('id', 'name', 'level'))
# all_htseq_counts_df <- full_join(all_htseq_counts_df, dplyr::select(counts_tib, id,name, !! sample_name := level), by=c("gene_id"="id", "gene_name"="name"))
}
}
sample_table_male_female <- data.frame(sampleName = sample_names,
fileName = sample_file_to_keep,
condition = sampleCondition)
sample_table_male_female$condition <- factor(sample_table_male_female$condition)
ddsHTSeq_male_female <- DESeqDataSetFromHTSeqCount(sampleTable = sample_table_male_female,
directory = directory,
design= ~ condition)
keep <- rowSums(counts(ddsHTSeq_male_female)) >= 10
ddsHTSeq_male_female_filtered <- ddsHTSeq_male_female[keep,]
dds_male_female <- DESeq(ddsHTSeq_male_female_filtered)
res_male_female <- results(dds_male_female)
res_male_female_tib <- as_tibble(res_male_female, rownames = NA)
res_male_female_tib <- rownames_to_column(res_male_female_tib, "EnsGene")
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
res_male_female_tib$EnsGeneNoVer <- gsub("\\..*","",res_male_female_tib$EnsGene)
genes <- res_male_female_tib$EnsGeneNoVer
G_list <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id",
"hgnc_symbol", "chromosome_name", "start_position", "end_position", "strand", "description"),values=genes,mart= mart)
res_male_female_tib_named <- full_join(res_male_female_tib,G_list,by=c("EnsGeneNoVer"="ensembl_gene_id"))
```
## Showing changes for all transitions across the genome
```{r}
```
## Doing GSEA on expression changes
We will generate rank files to run GSEA
```{r}
library(quantsmooth)
scale_function <- function(x) {
scaled_x <- x
scaled_x[scaled_x > 0] <- scaled_x[scaled_x > 0] / max(scaled_x[scaled_x > 0])
scaled_x[scaled_x < 0] <- -1 * scaled_x[scaled_x < 0] / min(scaled_x[scaled_x < 0])
return(scaled_x)
}
for (transition in 1:5){
transition_1_rank_df <- diff_expr_all_df %>% filter(gene != "")
transition_log2FC_col_name <- sprintf("transition_%d_log2FC", transition)
transition_padj_col_name <- sprintf("transition_%d_padj", transition)
transition_1_rank_df["rank_score"] <- (transition_1_rank_df[transition_log2FC_col_name]) / (transition_1_rank_df[transition_padj_col_name])
transition_1_rank_df <- na.omit(transition_1_rank_df)
transition_1_rank_df[transition_1_rank_df[transition_padj_col_name] > 0.1, "rank_score"] <- 0
transition_1_rank_df$scaled_rank <- format(scale_function(transition_1_rank_df$rank_score), scientific = FALSE)
transition_1_rank_df <- arrange(transition_1_rank_df, rank_score)
write_tsv(transition_1_rank_df[(transition_1_rank_df$rank_score != 0) & (abs(transition_1_rank_df[transition_log2FC_col_name]) > .8),c("gene", "rank_score")], sprintf("gsea_transition_%d.rnk", transition), col_names = F)
write_csv(transition_1_rank_df[transition_1_rank_df["rank_score"] > 0,c("gene", "scaled_rank")], sprintf("enrichr_transition_%d_pos.csv", transition), col_names = F, )
write_csv(transition_1_rank_df[transition_1_rank_df["rank_score"] < 0,c("gene", "scaled_rank")], sprintf("enrichr_transition_%d_neg.csv", transition), col_names = F)
test_df <- transition_1_rank_df %>% dplyr::rename(CHR=chr, MapInfo=start_position)
test_df <- as.data.frame(test_df)
plot.new()
chrompos <- prepareGenomePlot(
test_df,
paintCytobands = TRUE,
organism="hsa",
sexChromosomes = T,
units="hg38")
points(chrompos[(test_df$rank_score > 0) ,2],
chrompos[(test_df$rank_score > 0) ,1]+0.1,
pch="|",col="red")
points(chrompos[(test_df$rank_score < 0) ,2],
chrompos[(test_df$rank_score < 0) ,1]-0.1,
pch="|",col="blue")
meth_df <- dmp_info_df %>% filter(alt_key_transition == transition) %>% rename(CHR=chr, MapInfo=pos)
meth_df$CHR <- sub("chr", "", meth_df$CHR)
meth_df <- as.data.frame(meth_df)
meth_chrompos <- prepareGenomePlot(
meth_df,
paintCytobands = TRUE,
organism="hsa",
sexChromosomes = T,
units="hg19"
)
points(meth_chrompos[meth_df$delta >= .2,2],meth_chrompos[meth_df$delta >= .2,1]+0.1,pch="|",col="red")
points(meth_chrompos[meth_df$delta <= -.2,2],meth_chrompos[meth_df$delta <= -.2,1]-0.1,pch="|",col="blue")
points(meth_chrompos[meth_df$alt_delta > 0.2,2],meth_chrompos[meth_df$alt_delta > 0.2,1]+0.1,pch="|",col="red")
points(meth_chrompos[meth_df$alt_delta < -0.2,2],meth_chrompos[meth_df$alt_delta < -0.2,1]-0.1,pch="|",col="blue")
}
```
## Combining Methylation and Expression Data
We are interested in finding genes that both change their promoter DNA methylation, and their expression levels in these clusters.
```{r}
dmp_info_df <- read_csv('dmp_transition_info.csv')
gene_compiled_tss_sites_df <- tibble(gene=character(),
total_sites=integer(),
transition_1_inc_sites=integer(),
transition_1_dec_sites=integer(),
transition_2_inc_sites=integer(),
transition_2_dec_sites=integer(),
transition_3_inc_sites=integer(),
transition_3_dec_sites=integer(),
transition_4_inc_sites=integer(),
transition_4_dec_sites=integer(),
transition_5_inc_sites=integer(),
transition_5_dec_sites=integer(),
)
for (row_idx in 1:nrow(dmp_info_df)) {
gene <- unlist(dmp_info_df[row_idx,"mapped_gene"])
dist_to_tss <- unlist(dmp_info_df[row_idx,"dist_to_tss"])
delta_increasing <- unlist(dmp_info_df[row_idx,"delta_increasing"])
key_transition <- unlist(dmp_info_df[row_idx,"key_transition"])
alt_key_transition <- unlist(dmp_info_df[row_idx,"alt_key_transition"])
alt_delta <- unlist(dmp_info_df[row_idx,"alt_delta"])
refgene_group <- unlist(dmp_info_df[row_idx, "refgene_group"])
passes_threshold <- unlist(dmp_info_df[row_idx, "passes_threshold"])
if (grepl("TSS1500|TSS200|5'UTR", refgene_group) & (! is.na(gene)) & passes_threshold) {
# if (grepl("TSS1500|TSS200", refgene_group) & (! is.na(gene)) & (! is.na(alt_key_transition))) {
gene_row_idx <- 0
if (!(gene %in% gene_compiled_tss_sites_df$gene)) {
# gene is not in gene_compiled_tss_sites_df, so add a row for it
gene_row_idx <- nrow(gene_compiled_tss_sites_df)+1
gene_compiled_tss_sites_df[gene_row_idx, c("gene")] = gene
gene_compiled_tss_sites_df[is.na(gene_compiled_tss_sites_df)] <- 0
}else {
i <- 1
for (gene_name in gene_compiled_tss_sites_df$gene) {
if ((! is.na(gene_name)) && gene == gene_name) {
gene_row_idx <- i
break
}
i <- i+1
}
}
gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] <- gene_compiled_tss_sites_df[gene_row_idx, "total_sites"] + 1
change <- "dec"
if (delta_increasing) {
# if (alt_delta > 0) {
change <- "inc"
}
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] <-
gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", key_transition, change)] + 1
# gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", alt_key_transition, change)] <-
# gene_compiled_tss_sites_df[gene_row_idx, sprintf("transition_%d_%s_sites", alt_key_transition, change)] + 1
}
}
write_csv(gene_compiled_tss_sites_df, "gene_compiled_tss_sites.csv")
gene_compiled_df <- gene_compiled_tss_sites_df
gene_compiled_df <- add_column(gene_compiled_df, cluster=c(0)*nrow(gene_compiled_df))
# gene_clusters <- tibble(gene=character(), cluster=integer())
for (row_idx in 1:nrow(gene_compiled_df)) {
gene <- unlist(gene_compiled_df[row_idx, "gene"])
total <- unlist(gene_compiled_df[row_idx, "total_sites"])
maximum_transition <- 0
maximum_change <- ""
maximum_count <- 0
for(key_transition in 1:5) {
for (change in c("inc", "dec")) {
transition <- sprintf("transition_%d_%s_sites", key_transition, change)
if (gene_compiled_df[row_idx, transition] > maximum_count) {
maximum_transition <- key_transition
maximum_change <- change
maximum_count <- as.integer(unlist(gene_compiled_df[row_idx, transition]))
}
}
}
cluster <- 10
if (maximum_count / total >= .5) {
if (maximum_change == "dec") {
cluster <- maximum_transition - 1 + 5
}else {
cluster <- maximum_transition - 1
}
}
i <- 1
for (gene_name in gene_compiled_df$gene) {
if ((! is.na(gene_name)) && gene == gene_name) {
gene_row_idx <- i
break
}
i <- i+1
}
gene_compiled_df[i, "cluster"] <- cluster
}
transition_1_methylated_genes <- gene_compiled_df %>% filter(cluster %in% c(0))
transition_1_demethylated_genes <- gene_compiled_df %>% filter(cluster %in% c(5))
transition_1_decreased_expression_genes <- diff_expr_all_df %>%
filter((transition_1_padj <= 0.1) &
(transition_1_log2FC < 0)& (gene %in% gene_compiled_df$gene))
transition_1_increased_expression_genes <- diff_expr_all_df %>%
filter((transition_1_padj <= 0.1) &
(transition_1_log2FC > 0) & (gene %in% gene_compiled_df$gene))
transition_1_increased_expression_genes <- diff_expr_all_df %>%
filter((transition_3_padj <= 0.1) &
(transition_3_log2FC > 0))
common_genes <- intersect(transition_1_increased_expression_genes$gene, transition_1_demethylated_genes$gene)
expr_pvals <- column_to_rownames(transition_1_increased_expression_genes, var = "gene")[common_genes, "transition_1_padj"]
# meth_pvals <- column_to_rownames(transition_1_demethylated_genes, var = "gene")[common_genes, "transition_1_padj"]
transition_1_genes <- tibble(
genes=common_genes,
rank_score=1/expr_pvals
)
early_demethylated_genes <- gene_compiled_df %>% filter(cluster %in% c(5,6))
early_methylated_genes <- gene_compiled_df %>% filter(cluster %in% c(0,1))
early_decreased_expression_genes <- diff_expr_all_df %>% filter(((transition_1_padj <= 0.1) & (transition_1_log2FC < 0)) |
((transition_2_padj <= 0.1) & (transition_2_log2FC < 0)))
early_increased_expression_genes <- diff_expr_all_df %>% filter((transition_1_padj <= 0.1 & transition_1_log2FC > 0) |
(transition_2_padj <= 0.1 & transition_2_log2FC > 0))
write_csv(diff_expr_all_df, "deseq_transitions.csv")
```