Skip to content

Commit

Permalink
ENH: Add statistical tests and plots
Browse files Browse the repository at this point in the history
  • Loading branch information
pan14001 committed Jan 20, 2017
1 parent 9bfa762 commit 90734bf
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 13 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ cellprofiler_all-plates.tar.xz
cellprofiler_all-plates/all-plates.db
z_stack.tar.xz
z_stack/
*.csv
*.pdf
*.png

# Log files from CellProfiler Analyst
*.log
80 changes: 67 additions & 13 deletions analysis-only_overlap.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@ suppressPackageStartupMessages({

## Configuration values.
file_db <- "cellprofiler_all-plates/all-plates.db"
wells_control <- c(
"C03", # Should be 25-30%
"E05", # Should be 25-30%
"E09", # Should be <= 5%
"H21", # Should be 25-30%
"I04", # Should be 25-30%
"L09" # Should be 5-15%
)

wells_control <- data.frame(
well = c("C03", "C22", "D12", "D13", "E05", "E20", "F11", "F14",
"G07", "G18", "H09", "H16", "I09", "I16", "J07", "J18",
"K11", "K14", "L05", "L20", "M12", "M13", "N03", "N22"),
type = "Lacz") %>%
rbind(data.frame(
type = c("BROWN", "CAL1", "FACT", "Rho1", "thread", "water"),
well = c("I04", "E09", "L03", "E16", "L16", "H21")
))

## Database boilerplate.
driver <- dbDriver("SQLite")
Expand Down Expand Up @@ -45,11 +47,63 @@ wells <- merge(cells, select(images, imagenumber, image_metadata_well,
image_metadata_plate)) %>%
replace_na(list(ect_classify_coloc = 0)) %>%
group_by(image_metadata_well, image_metadata_plate) %>%
summarise(n = n(), n_coloc = sum(ect_classify_coloc))
summarise(n = n(), n_coloc = sum(ect_classify_coloc)) %>%
rename(well = image_metadata_well,
plate = image_metadata_plate)

# Check the controls.
controls <- filter(wells,
image_metadata_well %in% wells_control,
image_metadata_plate == "160415_015529-V") %>%
controls_plate1 <- merge(wells, wells_control) %>%
filter(plate == "160415_015529-V") %>%
group_by(type) %>%
summarise(n = sum(n), n_coloc = sum(n_coloc), coloc = n_coloc / n)
print(controls_plate1)

controls_all <- merge(wells, wells_control) %>%
group_by(type) %>%
summarise(n = sum(n), n_coloc = sum(n_coloc), coloc = n_coloc / n)
print(controls_all)

wells_rnai <- data.frame(
well = setdiff(wells$well, wells_control$well),
type = "RNAi")

rnai <- filter(wells, well %in% wells_rnai) %>%
mutate(coloc = n_coloc / n)
print(rnai)

wells_type_all <- rbind(wells_rnai, wells_control)
wells_all <- merge(wells, wells_type_all) %>%
mutate(coloc = n_coloc / n)
print(controls)

## Apply statistical significance test.
##
## Prof. Mellone suggested aggregating all the RNAi well counts in the
## contingency table.

## Fisher's exact test for alternative = "greater".
##
## Extracted the relevant parts of the fisher.test() source code to
## optimize by >1000x per http://adv-r.had.co.nz/Profiling.html#t-test
p_value <- function(x) {
m <- sum(x[, 1L])
n <- sum(x[, 2L])
k <- sum(x[1L, ])
x <- x[1L, 1L]
phyper(x - 1, m, n, k, lower.tail = FALSE)
}
cells <- filter(wells_all, type == "RNAi") %>% select(n) %>% unlist
overlaps <- filter(wells_all, type == "RNAi") %>% select(n_coloc) %>%
unlist
cells_all <- sum(cells)
overlaps_all <- sum(overlaps)
contingency_matrix <- rbind(cells, overlaps,
cells_all, overlaps_all)
## Organize array to have 2x2 margin instead of 1x4.
dim(contingency_matrix) <- c(2, 2, length(contingency_matrix) / 4)
p_values_uncorrected <- apply(contingency_matrix, 3, p_value)
p_values <- p.adjust(p_values_uncorrected, method = "bonferroni")
wells_all$p_value <- NA
wells_all[wells_all$type == "RNAi", "p_value"] <- p_values_uncorrected

write.csv(wells_all %>% arrange(p_value), file = "rnai-p_values.csv",
quote = FALSE, row.names = FALSE)
37 changes: 37 additions & 0 deletions plots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
## See the distribution of % Overlap vs. wells

suppressPackageStartupMessages({
library(ggplot2)
})

wells_all <- read.csv("rnai-p_values.csv") %>%
mutate(significant = p_value < 0.05)
binwidth <- 0.025
## Bring RNAi to the front of the level list.
wells_all$type <- relevel(wells_all$type, "RNAi")

ggplot(filter(wells_all, type == "RNAi"),
aes(coloc, fill = significant)) +
geom_histogram(binwidth = binwidth) +
scale_fill_manual(values = c("grey70", "blue")) +
ggtitle("Histogram of RNAi wells with significant overlap (p-value < 5%)") +
xlab("% colocalization")
ggsave("rnai-hist-signif.png")
ggsave("rnai-hist-signif.pdf")

ggplot(wells_all, aes(coloc, color = type)) +
geom_freqpoly(binwidth = binwidth) +
scale_y_log10() +
facet_wrap(~ type) +
ggtitle("Histogram of all well types") +
xlab("% colocalization")
ggsave("all-hist.png")
ggsave("all-hist.pdf")

ggplot(wells_all, aes(coloc, color = type)) +
geom_density() +
facet_wrap(~ type) +
ggtitle("Density plot of wells to show distribution") +
xlab("% colocalization")
ggsave("all-density.png")
ggsave("all-density.pdf")

0 comments on commit 90734bf

Please sign in to comment.