ENH: Add statistical tests and plots

MelloneLab · Jan 20, 2017 · 90734bf · 90734bf
1 parent 9bfa762
commit 90734bf
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 13 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,9 @@ cellprofiler_all-plates.tar.xz
 cellprofiler_all-plates/all-plates.db
 z_stack.tar.xz
 z_stack/
+*.csv
+*.pdf
+*.png
 
 # Log files from CellProfiler Analyst
 *.log
diff --git a/analysis-only_overlap.R b/analysis-only_overlap.R
@@ -6,14 +6,16 @@ suppressPackageStartupMessages({
 
 ## Configuration values.
 file_db <- "cellprofiler_all-plates/all-plates.db"
-wells_control <- c(
-  "C03", # Should be 25-30%
-  "E05", # Should be 25-30%
-  "E09", # Should be <= 5%
-  "H21", # Should be 25-30%
-  "I04", # Should be 25-30%
-  "L09"  # Should be 5-15%
-)
+
+wells_control <- data.frame(
+    well = c("C03", "C22", "D12", "D13", "E05", "E20", "F11", "F14",
+             "G07", "G18", "H09", "H16", "I09", "I16", "J07", "J18",
+             "K11", "K14", "L05", "L20", "M12", "M13", "N03", "N22"),
+    type = "Lacz") %>%
+    rbind(data.frame(
+        type = c("BROWN", "CAL1", "FACT", "Rho1", "thread", "water"),
+        well = c("I04", "E09", "L03", "E16", "L16", "H21")
+    ))
 
 ## Database boilerplate.
 driver <- dbDriver("SQLite")
@@ -45,11 +47,63 @@ wells <- merge(cells, select(images, imagenumber, image_metadata_well,
                              image_metadata_plate)) %>%
     replace_na(list(ect_classify_coloc = 0)) %>%
     group_by(image_metadata_well, image_metadata_plate) %>%
-    summarise(n = n(), n_coloc = sum(ect_classify_coloc))
+    summarise(n = n(), n_coloc = sum(ect_classify_coloc)) %>%
+    rename(well = image_metadata_well,
+           plate = image_metadata_plate)
 
 # Check the controls.
-controls <- filter(wells,
-                   image_metadata_well %in% wells_control,
-                   image_metadata_plate == "160415_015529-V") %>%
+controls_plate1 <- merge(wells, wells_control) %>%
+    filter(plate == "160415_015529-V") %>%
+    group_by(type) %>%
+    summarise(n = sum(n), n_coloc = sum(n_coloc), coloc = n_coloc / n)
+print(controls_plate1)
+
+controls_all <- merge(wells, wells_control) %>%
+    group_by(type) %>%
+    summarise(n = sum(n), n_coloc = sum(n_coloc), coloc = n_coloc / n)
+print(controls_all)
+
+wells_rnai <- data.frame(
+    well = setdiff(wells$well, wells_control$well),
+    type = "RNAi")
+
+rnai <- filter(wells, well %in% wells_rnai) %>%
+    mutate(coloc = n_coloc / n)
+print(rnai)
+
+wells_type_all <- rbind(wells_rnai, wells_control)
+wells_all <- merge(wells, wells_type_all) %>%
     mutate(coloc = n_coloc / n)
-print(controls)
+
+## Apply statistical significance test.
+## 
+## Prof. Mellone suggested aggregating all the RNAi well counts in the
+## contingency table.
+
+## Fisher's exact test for alternative = "greater".
+##
+## Extracted the relevant parts of the fisher.test() source code to
+## optimize by >1000x per http://adv-r.had.co.nz/Profiling.html#t-test
+p_value <- function(x) {
+    m <- sum(x[, 1L])
+    n <- sum(x[, 2L])
+    k <- sum(x[1L, ])
+    x <- x[1L, 1L]
+    phyper(x - 1, m, n, k, lower.tail = FALSE)
+}
+cells <- filter(wells_all, type == "RNAi") %>% select(n) %>% unlist
+overlaps <- filter(wells_all, type == "RNAi") %>% select(n_coloc) %>%
+    unlist
+cells_all <- sum(cells)
+overlaps_all <- sum(overlaps)
+contingency_matrix <- rbind(cells, overlaps,
+                            cells_all, overlaps_all)
+## Organize array to have 2x2 margin instead of 1x4.
+dim(contingency_matrix) <- c(2, 2, length(contingency_matrix) / 4)
+p_values_uncorrected <- apply(contingency_matrix, 3, p_value)
+p_values <- p.adjust(p_values_uncorrected, method = "bonferroni")
+wells_all$p_value <- NA
+wells_all[wells_all$type == "RNAi", "p_value"] <- p_values_uncorrected
+
+write.csv(wells_all %>% arrange(p_value), file = "rnai-p_values.csv",
+          quote = FALSE, row.names = FALSE)
diff --git a/plots.R b/plots.R
@@ -0,0 +1,37 @@
+## See the distribution of % Overlap vs. wells
+
+suppressPackageStartupMessages({
+    library(ggplot2)
+})
+
+wells_all <- read.csv("rnai-p_values.csv") %>%
+    mutate(significant = p_value < 0.05)
+binwidth <- 0.025
+## Bring RNAi to the front of the level list.
+wells_all$type <- relevel(wells_all$type, "RNAi")
+
+ggplot(filter(wells_all, type == "RNAi"),
+       aes(coloc, fill = significant)) +
+    geom_histogram(binwidth = binwidth) +
+    scale_fill_manual(values = c("grey70", "blue")) +
+    ggtitle("Histogram of RNAi wells with significant overlap (p-value < 5%)") +
+    xlab("% colocalization")
+ggsave("rnai-hist-signif.png")
+ggsave("rnai-hist-signif.pdf")
+
+ggplot(wells_all, aes(coloc, color = type)) +
+    geom_freqpoly(binwidth = binwidth) +
+    scale_y_log10() +
+    facet_wrap(~ type) +
+    ggtitle("Histogram of all well types") +
+    xlab("% colocalization")
+ggsave("all-hist.png")
+ggsave("all-hist.pdf")
+
+ggplot(wells_all, aes(coloc, color = type)) +
+    geom_density() +
+    facet_wrap(~ type) +
+    ggtitle("Density plot of wells to show distribution") +
+    xlab("% colocalization")
+ggsave("all-density.png")
+ggsave("all-density.pdf")