diff --git a/analysis-only_overlap.R b/analysis-only_overlap.R index 78b8ab3..c3e717c 100644 --- a/analysis-only_overlap.R +++ b/analysis-only_overlap.R @@ -1,23 +1,66 @@ suppressPackageStartupMessages({ - library(RSQLite) + library(RSQLite) # TODO: Use dplyr instead of RSQLite. library(tidyr) library(dplyr) + library(readr) + library(readxl) + library(stringr) }) ## Configuration values. file_db <- "cellprofiler_all-plates/all-plates.db" +files_plate <- "plates/DRSC_TF_Library_Distribution.xls" -wells_control <- data.frame( - well = c("C03", "C22", "D12", "D13", "E05", "E20", "F11", "F14", - "G07", "G18", "H09", "H16", "I09", "I16", "J07", "J18", - "K11", "K14", "L05", "L20", "M12", "M13", "N03", "N22"), - type = "Lacz", - control = "negative") %>% - rbind(data.frame( - type = c("BROWN", "CAL1", "FACT", "Rho1", "thread", "water"), - well = c("I04", "E09", "L03", "E16", "L16", "H21"), - control = c(NA, NA, "positive", NA, NA, NA) - )) +## Plate 504 was run at the last minute, and so the controls are not +## reflected in the Excel spreadsheet provided by the core facility. +## Therefore below is the list of added wells, which we will later +## merge with the spreadsheet data: +wells_control_plate504 <- tibble::tribble( + ~well, ~symbol, + ##---|------ + "I04", "BROWN", + "E09", "CAL1", + "L03", "FACT", + "E16", "Rho1", + "L16", "Thread", + "H21", "water") %>% + bind_rows(tibble( + symbol = "LacZ", + well = c("C03", "C22", "D12", "D13", "E05", "E20", "F11", + "F14", "G07", "G18", "H09", "H16", "I09", "I16", + "J07", "J18", "K11", "K14", "L05", "L20", "M12", + "M13", "N03", "N22"))) +## FIXME: Need to confirm what type of control Thread is. +wells_layout <- + read_excel(files_plate, sheet = 3) %>% + setNames(tolower(names(.))) %>% + select(plate, well, `symbol(s)`, `fbgn(s)`, amplicon) %>% + rename(symbol = `symbol(s)`, + fbgn = `fbgn(s)`) + +wells_control_comparison <- merge( + wells_layout %>% filter(well %in% wells_control_plate504$well, + plate %in% c(501:504, 507)) %>% + select(-fbgn, -amplicon) %>% spread(plate, symbol), + wells_control_plate504 %>% rename(`504_new` = symbol)) %>% + .[, order(colnames(.))] %>% select(well, everything()) +write_csv(wells_control_comparison, "plate504-different-controls.csv") + +wells_layout %<>% + bind_rows(mutate(wells_control_plate504, plate = 504)) %>% + arrange(plate, well) %>% + mutate(control = ifelse( + symbol %in% c("CAL1", "FACT", "Rho1"), ## Rho1 gives binucleates + "positive", NA)) %>% + mutate(control = ifelse( + symbol %in% c("LacZ", "Empty", "BROWN", "water"), + "negative", control)) %>% + mutate(control = ifelse( + is.na(control) & is.na(amplicon), + "unknown", control)) + +## List all controls +print(wells_layout %>% filter(!is.na(control)) %>% .$symbol %>% unique) ## Database boilerplate. driver <- dbDriver("SQLite") @@ -51,34 +94,20 @@ wells <- merge(cells, select(images, imagenumber, image_metadata_well, group_by(image_metadata_well, image_metadata_plate) %>% summarise(n = n(), n_coloc = sum(ect_classify_coloc)) %>% rename(well = image_metadata_well, - plate = image_metadata_plate) + plate = image_metadata_plate) %>% + mutate(plate = ifelse(plate == "160415_015529-V", "504", + str_extract(plate, ".{3}"))) %>% + mutate(plate = as.numeric(plate)) # For subsequent merge. # Check the controls. -controls_plate1 <- merge(wells, wells_control) %>% - filter(plate == "160415_015529-V") %>% - group_by(type, control) %>% - summarise(n = sum(n), - n_coloc = sum(n_coloc), - coloc = n_coloc / n) -print(controls_plate1) - -controls_all <- merge(wells, wells_control) %>% - group_by(type, control) %>% +controls_all <- merge(wells, filter(wells_layout, ! is.na(control))) %>% + group_by(symbol, control) %>% summarise(n = sum(n), n_coloc = sum(n_coloc), coloc = n_coloc / n) print(controls_all) -wells_rnai <- data.frame( - well = setdiff(wells$well, wells_control$well), - type = "RNAi", - control = NA) - -rnai <- filter(wells, well %in% wells_rnai$well) %>% - mutate(coloc = n_coloc / n) -print(rnai) - -wells_type_all <- rbind(wells_rnai, wells_control) -wells_all <- merge(wells, wells_type_all) %>% - mutate(coloc = n_coloc / n) +## wells_repeated <- group_by(wells_all, symbol) %>% +## summarise(n = n(), m = mean(p_value), sd = sd(p_value)) %>% +## filter(n > 1) ## Apply statistical significance test. ## @@ -96,11 +125,15 @@ p_value <- function(x) { x <- x[1L, 1L] phyper(x - 1, m, n, k, lower.tail = FALSE) } +wells_all <- merge(wells, wells_layout) %>% + mutate(coloc = n_coloc / n) cells <- filter(wells_all) %>% select(n) %>% unlist overlaps <- filter(wells_all) %>% select(n_coloc) %>% unlist -cells_all <- sum(filter(cells, type == "RNAi")) -overlaps_all <- sum(filter(overlaps, type == "RNAi")) +cells_all <- filter(wells_all, is.na(control)) %>% select(n) %>% + unlist %>% sum +overlaps_all <- filter(wells_all, is.na(control)) %>% + select(n_coloc) %>% unlist %>% sum contingency_matrix <- rbind(cells, overlaps, cells_all, overlaps_all) ## Organize array to have 2x2 margin instead of 1x4. @@ -109,5 +142,10 @@ p_values_uncorrected <- apply(contingency_matrix, 3, p_value) p_values <- p.adjust(p_values_uncorrected, method = "bonferroni") wells_all$p_value <- p_values_uncorrected -write.csv(wells_all %>% arrange(p_value), file = "rnai-p_values.csv", - quote = FALSE, row.names = FALSE) +## Check repeated RNAi wells. +wells_repeated <- group_by(wells_all, symbol) %>% + summarise(n = n(), m = mean(coloc), sd = sd(coloc)) %>% + filter(n > 1) +print(summary(wells_repeated)) + +write_csv(wells_all %>% arrange(p_value), "rnai-p_values.csv") diff --git a/plates/DRSC_TF_Library_Distribution.xls b/plates/DRSC_TF_Library_Distribution.xls new file mode 100755 index 0000000..97686b6 Binary files /dev/null and b/plates/DRSC_TF_Library_Distribution.xls differ