diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..a32bacb --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,4 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^doc$ +^Meta$ diff --git a/.Rproj.user/43F9182D/build_options b/.Rproj.user/43F9182D/build_options new file mode 100644 index 0000000..0f3563e --- /dev/null +++ b/.Rproj.user/43F9182D/build_options @@ -0,0 +1,7 @@ +auto_roxygenize_for_build_and_reload="1" +auto_roxygenize_for_build_package="1" +auto_roxygenize_for_check="1" +live_preview_website="1" +makefile_args="" +preview_website="1" +website_output_format="all" diff --git a/.Rproj.user/43F9182D/console06/INDEX001 b/.Rproj.user/43F9182D/console06/INDEX001 new file mode 100644 index 0000000..4629bea --- /dev/null +++ b/.Rproj.user/43F9182D/console06/INDEX001 @@ -0,0 +1 @@ +[{"allow_restart":true,"alt_buffer":false,"autoclose":1,"buffered_output":"\n\n\n\n\n\n\n\n\n","caption":"Terminal 1","channel_id":"5921","channel_mode":1,"child_procs":false,"cols":92,"cwd":"~/Box Sync/CorNetwork_Project/edgefinder","exit_code":1,"handle":"66F40F68","interaction_mode":2,"max_output_lines":1000,"restarted":false,"rows":13,"shell_type":7,"show_on_output":false,"terminal_sequence":1,"title":"~/Box Sync/CorNetwork_Project/edgefinder","track_env":true,"zombie":false}] \ No newline at end of file diff --git a/.Rproj.user/43F9182D/cpp-definition-cache b/.Rproj.user/43F9182D/cpp-definition-cache new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/.Rproj.user/43F9182D/cpp-definition-cache @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/.Rproj.user/43F9182D/pcs/debug-breakpoints.pper b/.Rproj.user/43F9182D/pcs/debug-breakpoints.pper new file mode 100644 index 0000000..4893a8a --- /dev/null +++ b/.Rproj.user/43F9182D/pcs/debug-breakpoints.pper @@ -0,0 +1,5 @@ +{ + "debugBreakpointsState": { + "breakpoints": [] + } +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/pcs/files-pane.pper b/.Rproj.user/43F9182D/pcs/files-pane.pper new file 
mode 100644 index 0000000..1f20ee2 --- /dev/null +++ b/.Rproj.user/43F9182D/pcs/files-pane.pper @@ -0,0 +1,13 @@ +{ + "path": "~/Dropbox/Packages/edgefinder/doc", + "sortOrder": [ + { + "columnIndex": 4, + "ascending": false + }, + { + "columnIndex": 2, + "ascending": true + } + ] +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/pcs/source-pane.pper b/.Rproj.user/43F9182D/pcs/source-pane.pper new file mode 100644 index 0000000..ddca97d --- /dev/null +++ b/.Rproj.user/43F9182D/pcs/source-pane.pper @@ -0,0 +1,3 @@ +{ + "activeTab": 2 +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/pcs/windowlayoutstate.pper b/.Rproj.user/43F9182D/pcs/windowlayoutstate.pper new file mode 100644 index 0000000..f4fc4e6 --- /dev/null +++ b/.Rproj.user/43F9182D/pcs/windowlayoutstate.pper @@ -0,0 +1,14 @@ +{ + "left": { + "splitterpos": 272, + "topwindowstate": "NORMAL", + "panelheight": 664, + "windowheight": 702 + }, + "right": { + "splitterpos": 426, + "topwindowstate": "NORMAL", + "panelheight": 664, + "windowheight": 702 + } +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/pcs/workbench-pane.pper b/.Rproj.user/43F9182D/pcs/workbench-pane.pper new file mode 100644 index 0000000..f398270 --- /dev/null +++ b/.Rproj.user/43F9182D/pcs/workbench-pane.pper @@ -0,0 +1,5 @@ +{ + "TabSet1": 3, + "TabSet2": 3, + "TabZoom": {} +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/persistent-state b/.Rproj.user/43F9182D/persistent-state new file mode 100644 index 0000000..82465a3 --- /dev/null +++ b/.Rproj.user/43F9182D/persistent-state @@ -0,0 +1,8 @@ +build-last-errors="[]" +build-last-errors-base-dir="~/Dropbox/Packages/edgefinder/" +build-last-outputs="[{\"type\":0,\"output\":\"==> devtools::document(roclets = c('rd', 'collate', 'namespace', 'vignette'))\\n\\n\"},{\"type\":2,\"output\":\"Updating edgefinder documentation\\n\"},{\"type\":2,\"output\":\"Loading edgefinder\\n\"},{\"type\":1,\"output\":\"Writing 
NAMESPACE\\n\"},{\"type\":1,\"output\":\"Writing NAMESPACE\\n\"},{\"type\":2,\"output\":\"Updating vignettes\\n\"},{\"type\":1,\"output\":\"Documentation completed\\n\\n\"},{\"type\":0,\"output\":\"==> R CMD INSTALL --no-multiarch --with-keep.source edgefinder\\n\\n\"},{\"type\":1,\"output\":\"* installing to library ‘/Library/Frameworks/R.framework/Versions/4.0/Resources/library’\\n\"},{\"type\":1,\"output\":\"* installing *source* package ‘edgefinder’ ...\\n\"},{\"type\":1,\"output\":\"** using staged installation\\n\"},{\"type\":1,\"output\":\"** R\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"** data\\n\"},{\"type\":1,\"output\":\"*** moving datasets to lazyload DB\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"** byte-compile and prepare package for lazy loading\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"** help\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"*** installing help indices\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"** building package indices\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"** installing vignettes\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"** testing if installed package can be loaded from temporary location\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"** testing if installed package can be loaded from final location\\n\"},{\"type\":1,\"output\":\"\"},{\"type\":1,\"output\":\"** testing if installed package keeps a record of temporary installation path\\n\"},{\"type\":1,\"output\":\"* DONE (edgefinder)\\n\"},{\"type\":1,\"output\":\"\"}]" +compile_pdf_state="{\"tab_visible\":false,\"running\":false,\"target_file\":\"\",\"output\":\"\",\"errors\":[]}" +files.monitored-path="" 
+find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":false,\"ignoreCase\":false,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOn\":[],\"matchOff\":[],\"replaceMatchOn\":[],\"replaceMatchOff\":[]},\"running\":false,\"replace\":false,\"preview\":false,\"gitFlag\":false,\"replacePattern\":\"\"}" +imageDirtyState="1" +saveActionState="-1" diff --git a/.Rproj.user/43F9182D/rmd-outputs b/.Rproj.user/43F9182D/rmd-outputs new file mode 100644 index 0000000..8d2fdb7 --- /dev/null +++ b/.Rproj.user/43F9182D/rmd-outputs @@ -0,0 +1,8 @@ +/private/var/folders/lw/f9y_rrvs1lncs_8wvgg56kd00000gp/T/RtmpMSNhDi/preview-fe4ece1fb6e.dir/edgefinder.pdf +/private/var/folders/lw/f9y_rrvs1lncs_8wvgg56kd00000gp/T/RtmpMSNhDi/preview-fe4e45da014b.dir/edgefinder.html +/private/var/folders/lw/f9y_rrvs1lncs_8wvgg56kd00000gp/T/RtmpDlrmgt/preview-eb9f7662e2ce.dir/edgefinder.html + + + + + diff --git a/.Rproj.user/43F9182D/saved_source_markers b/.Rproj.user/43F9182D/saved_source_markers new file mode 100644 index 0000000..2b1bef1 --- /dev/null +++ b/.Rproj.user/43F9182D/saved_source_markers @@ -0,0 +1 @@ +{"active_set":"","sets":[]} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/08EF61B6 b/.Rproj.user/43F9182D/sources/prop/08EF61B6 new file mode 100644 index 0000000..48b1b22 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/08EF61B6 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "3,22", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/17C45040 b/.Rproj.user/43F9182D/sources/prop/17C45040 new file mode 100644 index 0000000..b9603bd --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/17C45040 @@ -0,0 +1,4 @@ +{ + "cursorPosition" : "512,45", + "scrollLine" : "501" +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/268C439B b/.Rproj.user/43F9182D/sources/prop/268C439B new file mode 100644 index 0000000..0b69569 --- /dev/null +++ 
b/.Rproj.user/43F9182D/sources/prop/268C439B @@ -0,0 +1,4 @@ +{ + "cursorPosition": "9,17", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/3545E1B5 b/.Rproj.user/43F9182D/sources/prop/3545E1B5 new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/3545E1B5 @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/707DE2BB b/.Rproj.user/43F9182D/sources/prop/707DE2BB new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/707DE2BB @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/73839B89 b/.Rproj.user/43F9182D/sources/prop/73839B89 new file mode 100644 index 0000000..128e931 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/73839B89 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "13,29", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/7FABB90B b/.Rproj.user/43F9182D/sources/prop/7FABB90B new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/7FABB90B @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/8F178DC0 b/.Rproj.user/43F9182D/sources/prop/8F178DC0 new file mode 100644 index 0000000..32e8647 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/8F178DC0 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "428,28", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/C7E78658 b/.Rproj.user/43F9182D/sources/prop/C7E78658 new file mode 100644 index 0000000..c3507bb --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/C7E78658 @@ -0,0 +1,4 @@ +{ + "cursorPosition" : "4,0", + "scrollLine" : "0" +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/D7FD0573 b/.Rproj.user/43F9182D/sources/prop/D7FD0573 new file mode 100644 index 0000000..17d91b8 --- /dev/null +++ 
b/.Rproj.user/43F9182D/sources/prop/D7FD0573 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "11,29", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/FB923355 b/.Rproj.user/43F9182D/sources/prop/FB923355 new file mode 100644 index 0000000..533abe0 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/FB923355 @@ -0,0 +1,5 @@ +{ + "cursorPosition" : "141,25", + "scrollLine" : "134", + "working_dir" : "current" +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/prop/INDEX b/.Rproj.user/43F9182D/sources/prop/INDEX new file mode 100644 index 0000000..a131bfe --- /dev/null +++ b/.Rproj.user/43F9182D/sources/prop/INDEX @@ -0,0 +1,40 @@ +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2FDESCRIPTION="BD6E27DA" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2FNAMESPACE="F5F5A899" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2FR%2Fedgefinder.R="7E5E47A6" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2FDUP.Rd="DCADE622" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2FEM.Rd="D6EB3B12" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2FWT.Rd="88AE8AC5" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2Fedgefinder.Rd="58782F92" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2FplotBitmapCC.Rd="1D2C4151" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2FplotDegCC.Rd="4AE36A33" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2FplotMixture.Rd="E5101845" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2FshortSummary.Rd="7BEE52C9" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fman%2Fsummary.edgefinder.Rd="AE7CC627" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fvignettes%2FExamples.Rmd="9FC8CA06" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fvignettes%2FExamples.html="1BDD16EE" +~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fvignettes%2Fedgefinder.R="D5383BFE" 
+~%2FBox%20Sync%2FCorNetwork_Project%2Fedgefinder%2Fvignettes%2Fedgefinder.Rmd="C6822807" +~%2FBox%2FCorNetwork_Project%2Fedgefinder%2FR%2Fedgefinder.R="17C45040" +~%2FBox%2FCorNetwork_Project%2Fedgefinder%2Fvignettes%2Fedgefinder.Rmd="FB923355" +~%2FDesktop%2FZEPT_A_1341276_SM7185%20(1)%2FR%20code%20CSA%20PTSD%20EJPT%20June%2026%202017.R="8EDF9A5B" +~%2FDropbox%2FCMapCodeWork%2FConeDistance.R="724A7613" +~%2FDropbox%2FCMapCodeWork%2FConeDistance01.R="977F8AD5" +~%2FDropbox%2FGuy.R="BAB987C4" +~%2FDropbox%2FGuy091218.R="FAB15A69" +~%2FDropbox%2FMDS%2Faas%2Fmdsanalysis.r="C7E78658" +~%2FDropbox%2FPackages%2Fedgefinder%2F.Rbuildignore="3545E1B5" +~%2FDropbox%2FPackages%2Fedgefinder%2F.gitignore="707DE2BB" +~%2FDropbox%2FPackages%2Fedgefinder%2FDESCRIPTION="73839B89" +~%2FDropbox%2FPackages%2Fedgefinder%2FNAMESPACE="268C439B" +~%2FDropbox%2FPackages%2Fedgefinder%2FR%2Fedgefinder.R="8F178DC0" +~%2FDropbox%2FPackages%2Fedgefinder%2Fdoc%2Fedgefinder.Rmd="D7FD0573" +~%2FDropbox%2FPackages%2Fedgefinder%2Fvignettes%2Fedgefinder.Rmd="08EF61B6" +~%2FDropbox%2FPackages%2Fedgefinder%2Fvignettes%2Fedgefinder.md="7FABB90B" +~%2FDropbox%2FProjects%2FCorrelationNetworks%2FCNV03.R="B270593A" +~%2FDropbox%2FProjects%2FCorrelationNetworks%2FEdgeFinder%2FCNV_WvsP.R="65139561" +~%2FDropbox%2FProjects%2FCorrelationNetworks%2FEdgeFinder%2FToDo.txt="E7188EA7" +~%2FDropbox%2FProjects%2FGaya%2FProject2%2Friboprof01.Rmd="66D24356" +~%2FDropbox%2FProjects%2FGaya%2FProject2%2Friboprof02.Rmd="3C37A340" +~%2FDropbox%2FProjects%2FGaya%2FProject2%2Frp01.R="559018D3" +~%2FDropbox%2FProjects%2FGaya%2FRP0827.R="9914CB74" +~%2FDropbox%2Fbuild.txt="A7C98F10" diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/3FDFFDAC b/.Rproj.user/43F9182D/sources/s-918F8619/3FDFFDAC new file mode 100644 index 0000000..2be45d0 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/3FDFFDAC @@ -0,0 +1,25 @@ +{ + "id": "3FDFFDAC", + "path": null, + "project_path": null, + "type": "r_source", + "hash": "0", + 
"contents": "", + "dirty": true, + "created": 1562254910191.0, + "source_on_save": false, + "relative_order": 4, + "properties": { + "cursorPosition": "7,18", + "scrollLine": "0", + "tempName": "Untitled1" + }, + "folds": "", + "lastKnownWriteTime": 8243105118349059431, + "encoding": "", + "collab_server": "", + "source_window": "", + "last_content_update": 1562257287832, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/3FDFFDAC-contents b/.Rproj.user/43F9182D/sources/s-918F8619/3FDFFDAC-contents new file mode 100644 index 0000000..a6279fe --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/3FDFFDAC-contents @@ -0,0 +1,9 @@ +sigResid <- sort(union(Sres$rt,Sres$lt)) +G <- Sres$G +tmpmat <- Matrix(0,G, G) +vec <- rep(0, choose(G,2)) +vec[sigResid] <- 1 +tmpmat[upper.tri(tmpmat)] = vec +AdjMat <- tmpmat+t(tmpmat) +AdjMat1 <- getNeighborhood(AdjMat, numSteps=1) +image(AdjMat1$Mat) diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/76CEB453-contents b/.Rproj.user/43F9182D/sources/s-918F8619/76CEB453-contents new file mode 100644 index 0000000..00b31dc --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/76CEB453-contents @@ -0,0 +1,2 @@ +doc +Meta diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/89DD615B-contents b/.Rproj.user/43F9182D/sources/s-918F8619/89DD615B-contents new file mode 100644 index 0000000..a32bacb --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/89DD615B-contents @@ -0,0 +1,4 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^doc$ +^Meta$ diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/A1F1319E b/.Rproj.user/43F9182D/sources/s-918F8619/A1F1319E new file mode 100644 index 0000000..8321412 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/A1F1319E @@ -0,0 +1,24 @@ +{ + "id": "A1F1319E", + "path": "~/Dropbox/Packages/edgefinder/vignettes/edgefinder.Rmd", + "project_path": "vignettes/edgefinder.Rmd", + "type": "r_markdown", + "hash": "0", 
+ "contents": "", + "dirty": false, + "created": 1596070839055.0, + "source_on_save": false, + "relative_order": 3, + "properties": { + "cursorPosition": "3,22", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1596070910, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1596070910163, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/A1F1319E-contents b/.Rproj.user/43F9182D/sources/s-918F8619/A1F1319E-contents new file mode 100644 index 0000000..5a302c3 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/A1F1319E-contents @@ -0,0 +1,350 @@ +--- +title: "edgefinder" +author: "Haim Bar" +date: "`r Sys.Date()`" +output: rmarkdown::pdf_document +vignette: > + %\VignetteIndexEntry{edgefinder} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +The edgefinder package is used to find edges in gene networks using co-expression +data. The input to the program is a normalized expression matrix, with genes (nodes) +in the rows, and samples in the columns. +The program calculates the pair-wise correlations, performs Fisher's Z +transformation, and fits the L2N model to the transformed data. L2N is a mixture +model with three components: the uncorrelated pairs belong to the null component +which is assumed to be normally distributed, and the correlated pairs belong to one +of the two non-null components which are assumed to follow lognormal distributions. + +Typical datasets consist of hundreds, or thousands of genes, and hence a very +large number of pairs. Therefore, edgefinder randomly selects a subset of the pairs (the +default number of pairs is 20,000), fits the L2N model to the subset, and calculates +the component probabilities for *all* possible pairs. 
+Using the posterior probabilities, edgefinder determines which pairs are +highly correlated while controlling the false discovery rate. +Note that edgefinder makes no assumptions about the structure of the network. + +The edgefinder package depends on the 'Matrix' package, to allow for efficient +storage and computation of large co-occurrence matrices. For simulating datasets +we used the 'huge' and 'MASS' packages, but they are not required when +using edgefinder. + +# Real data examples + +We use a publicly available dataset from +https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430 +(Horev G, Ellegood J, Lerch JP, Son YE et al. Dosage-dependent phenotypes in models +of 16p11.2 lesions found in autism. Proc Natl Acad Sci U.S.A. 2011, Oct. +11;108(41):17076-81. PMID: 21969575). + +The dataset contains three groups: wild type (2 copies of 16p11.2), +deletion (1 copy), and duplication (3 copies). +We focus on a subset of 3,454 genes, which were found to be differentially +expressed when comparing the wild-type and duplication groups (using an +FDR threshold of 0.05). We fit the +L2N model to this set of genes in each group, and compare the properties +of the two networks. First, we load the wild-type data (WT). +WT is a matrix with 3454 rows (genes) and 15 columns (samples) from +the wild-type group. + + + +``` +library("edgefinder") +# Wild-type first: +data(WT) +WTres <- edgefinder(WT, ttl = "Wild Type") +shortSummary(WTres) +``` + +The **edgefinder** function fits the L2N model to the data, and plots the fitted mixture distribution: + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTfit.png') +``` + +The function **shortSummary** produces the following output: + +``` +No. nodes = 3,454 +Max no. edges = 5,963,331 +No. edges detected = 80,332 +p1 = 0.0942 +p2 = 0.0185 +Est. FDR <= 0.00997 +``` + +Note that the estimated FDR is calculated based on the fitted L2N model.
+The default FDR threshold used by the edgefinder function is 0.01, and in this case, the +empirical FDR is very close to the level set by the user. If the empirical FDR is too +high, you may increase **LOvals** from its default value (30). This will result in larger +(stricter) thresholds for determining significant correlations, and will decrease the +proportion of false discoveries. +The FDR threshold (the **BHthr** parameter) should be set according to the number of edges. +In this example, the algorithm finds 80,332 edges, and an FDR of 0.01 means that +at most 800 of the detected edges may not be true discoveries. If this number of edges +is too large in the sense that it may affect our inference about the network structure, +or a subsequent gene enrichment analysis, we could lower the FDR threshold. + +The function **graphComponents** finds clusters of genes. To do that, it takes as input an +adjacency (0/1) matrix (e.g. WTres$AdjMat in our example.) To find clusters it first +calculates a centrality for each node, using the formula (type\*CC+1)\*deg where +deg is the degree of the node, and CC is its clustering coefficient (CC). **type** is set +by default to 1. When it is set to 0, the centrality measure is just the degree of +the node. Setting type=1 means that we assign a higher value to nodes that not only have +many neighbors, but the neighbors are highly interconnected. For example, suppose we +have two components with k nodes, one has a star shape, and the other is a complete +graph. With type=0 both graphs will get the same value, but with type=1 the complete +graph will be picked by the algorithm first. +You can also set a minimum centrality value (the parameter **minCtr**) to determine the +smallest possible cluster size. + +The function returns a data frame with the following information about each node: +a label (e.g. 
gene name), degree, clustering coefficient, centrality measure, +cluster number, iscenter (1 if the node was chosen as the cluster's center, 0 otherwise), +the number of edges from the node to nodes in the same cluster, the number of edges +from the node to nodes NOT in the same cluster, and the standardized Manhattan distance +to the central node in the cluster (in terms of the number of neighbors they do not have +in common). + +``` +WTComp <- graphComponents(WTres$AdjMat) +head(WTComp) + + labels degree cc ctr clustNo iscenter intEdges extEdges distCenter +1 1 251 0.5999044 401.5760 1 0 187 64 0.072958888 +2 2 0 0.0000000 0.0000 0 0 0 0 0.000000000 +3 3 202 0.7217378 347.7910 1 0 164 38 0.072090330 +4 4 202 0.5819910 319.5622 4 0 98 104 0.008396063 +5 5 0 0.0000000 0.0000 0 0 0 0 0.000000000 +6 6 9 0.6944444 15.2500 0 0 0 0 0.000000000 +``` + +The function **summarizeClusters** returns summary statistics about each cluster. +It prints the number of nodes, edges, clusters and unclustered nodes to the screen, +and returns a matrix with cluster number, number of nodes in the cluster, +fivenum summary for the degrees of nodes in the cluster, and fivenum summary for +the percentage of edges that are within the cluster.
+ +``` +summtab <- summarizeClusters(WTComp) +head(summtab[,1:7]) +head(summtab[,c(1:2,8:12)]) + +Num of nodes: 3454 +Num of edges: 80332 +Num of clusters: 72 +Num of unclustered nodes: 1837 + + Cluster Nodes degreeMin degreeQ25 degreeMedian degreeQ75 degreeMax +[1,] 1 374 59 222.0 257 299.0 373 +[2,] 2 69 17 96.0 134 164.0 234 +[3,] 3 39 2 53.5 74 122.5 209 +[4,] 4 107 25 108.0 130 155.5 209 +[5,] 5 35 26 58.5 80 109.0 154 +[6,] 6 19 17 45.5 80 108.5 133 + + + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax +[1,] 1 374 0.52073733 0.78536585 0.8452080 0.9083969 1.0000000 +[2,] 2 69 0.07109005 0.23952096 0.3061224 0.4226804 0.8235294 +[3,] 3 39 0.03571429 0.09923455 0.1358025 0.2197585 1.0000000 +[4,] 4 107 0.18750000 0.44693586 0.5555556 0.6298886 0.8529412 +[5,] 5 35 0.10344828 0.21717172 0.2777778 0.3584826 0.7692308 +[6,] 6 19 0.06666667 0.10270206 0.1262136 0.1594156 0.4210526 + +``` + +It can be seen, for example, that cluster 1 has 374 nodes, and most of them have many neighbors +(at least 75% of them have at least 222 edges), and this cluster is very interconnected (at least 75% +of the nodes are mostly connected within the cluster, with at least 78.5% of their edges being inside +the cluster). + +Next, we can visualize clusters using the **plotCluster** function. For example, to plot +clusters 5 and 9 we use the following syntax: + +``` +plotCluster(WTres$AdjMat,5,WTComp) +plotCluster(WTres$AdjMat,9,WTComp) +``` + +The central node is marked by a black circle. The radius of each point corresponds +to its degree. The opacity corresponds to the percentage of edges from the node +that are in the cluster (the darker it is, the larger the percentage of edges is +within the cluster). The distance from the center corresponds to the relative +dissimilarity with the central node. This is computed as the number of neighbors +the node and the central node do not have in common.
+For example, in cluster 9 (right plot) the dark shade of blue of all the nodes +shows that the majority of edges connecting to these nodes are within the cluster. +In contrast, the nodes in cluster 5 (left) have a larger percentage of their neighbors outside the +cluster. + + +```{r echo=FALSE, out.width='45%'} +knitr::include_graphics('./WTcluster5.png') +knitr::include_graphics('./WTcluster9.png') +``` + +Indeed, when we look at the data: +``` +summtab[9,c(1:2,8:12)] + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 9.0000000 108.0000000 0.6857143 0.8768939 0.9301901 0.9657132 1.0000000 +``` +we see that the cluster contains 108 nodes, and the smallest percentage of within-cluster +edges is 68.6%, and for 75% of the nodes, the percentage is greater than 87.6%. This means that +cluster 9 is highly inter-connected, and fairly isolated. + +We can collapse the network data for more compact visualization by defining +a subset in which clusters are represented by their central nodes. The function +**collapsedGraph** returns an adjacency matrix which contains all the unclustered +nodes, and the centers of the clusters. The elements in the matrix contain the +total number of edges in the original graph. That is, the total count of edges +between clusters i and j is stored in the matrix, rather than just 0/1. To convert +it to a 0/1 adjacency matrix we can use the following: +``` +Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0 +``` + +We can use the **igraph** package to visualize the collapsed network. +For example, the following code will produce a network graph containing +all the clusters and unclustered nodes which have at least one neighbor.
+``` +library("igraph") +inc <- which(Matrix::rowSums(Adj1) > 0) +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"), + vertex.label.cex=0.7, vertex.size=0.1, edge.color='lightgreen',asp=1) +``` + +If we want to show only the relationships between clusters, we use the following: +``` +library("igraph") +inc <- which(substr(rownames(Adj1),1,3) == "CLS") +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` +This gives the following graph, where it can be seen that cluster 9 is connected to +clusters 8, 19, 20, 33, and 35. + +```{r echo=FALSE, out.width='70%'} +knitr::include_graphics('./WTclustersCropped.png') +``` + +If we want to create a subset of the original data by taking a representative from each +cluster, we can do the following: + +``` +WTclustered <- WT[union(which(WTComp$iscenter == 1), which(WTComp$clustNo == 0)),] +dim(WTclustered) +[1] 1909 15 +``` + +*Other visualizations:* + +The **plotDegCC** function can be used to plot the degree of nodes versus the +degree times the clustering coefficient of nodes. We can also highlight specific groups. +For example, in the following code we highlight +cluster 1, which, as we've seen before, is large (374 genes) and highly connected: +75% of the nodes have at least 222 neighbors, and most of the connections are within the cluster +(75% of the nodes have at least 78.5% of their neighbors within the cluster). + +``` +plotDegCC(WTres,WTComp,highlightNodes = which(WTComp$clustNo==1)) +``` + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTdg.png') +``` + +The **plotBitmapCC** function is used to show the network as a 0/1 matrix, where a black +dot corresponds to an edge in the graph. Setting **orderByCluster=TRUE** +sorts the nodes by cluster. When set to FALSE, the original order +of the nodes, as it appears in the gene expression file, is preserved.
+We can create the bitmap plot for nodes with degree greater than or equal to +some threshold. For example, **showMinDegree=30** will result in a plot which includes +only nodes which have at least 30 neighbors. + +``` +plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30) +``` + +```{r echo=FALSE, out.width='50%'} +knitr::include_graphics('./WTbitmap.png') +``` + +We repeat the same process with the duplication group. +DUP is a matrix with 3454 rows (genes) and 12 columns (samples). +We only show the collapsed cluster plot, and observe that unlike the WT group, +the network in the DUP group consists of two "super-clusters". + +``` +data("DUP") +DUPres <- edgefinder(DUP, ttl = "Duplication") +DUPComp <- graphComponents(DUPres$AdjMat) +Adj2 <- collapsedGraph(DUPres$AdjMat, DUPComp) > 0 +inc <- which(substr(rownames(Adj2),1,3) == "CLS") +plot(graph.adjacency(Adj2[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` + + +```{r echo=FALSE, out.width='70%'} +knitr::include_graphics('./DUPclustersCropped.png') +``` + + + + +# Simulated data + +The following example shows a simulated dataset with a hub structure, consisting +of 1000 nodes and 50 hubs. The bitmap plot shows the network that was created +by edgefinder. + +``` +library("huge") +library("MASS") +N=200; D=1000 +set.seed(23197) +L = huge.generator(n = N, d = D, graph = "hub", g=50, v = 0.3, u = 0.1) +x = mvrnorm(N, rep(0, D), L$sigma) +``` + +Data generated like this is provided with the package in a dataset called SIM. We perform a similar analysis +and display the bitmap plot, which shows that edgefinder finds the actual network structure overall, with +almost no false discoveries. +We also display the network of cluster 1, which shows that the cluster is how we expected it to be, +with one central node with high degree (the hub gene), with interconnected neighbors each having a smaller degree +than the hub gene.
From the dark shade of blue for each node, we can infer that the nodes are connected +within the cluster but almost no edges to other clusters or nodes. +The smallest percentage of edges within cluster 1 is 66.7%, and at least 75% of the nodes are connected only to nodes within the same cluster. + +``` +data(SIM) +Sres <- edgefinder(SIM, ttl = "Simulation", BHthr=0.05) +plotBitmapCC(Sres$AdjMat,orderByCluster=FALSE) +SIMComp <- graphComponents(Sres$AdjMat) +plotCluster(Sres$AdjMat,1,SIMComp) +sumtab <- summarizeClusters(SIMComp) +sumtab[1,c(1:2,8:12)] + +Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 1 20 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 + +``` + + +```{r echo=FALSE, out.width='45%'} +knitr::include_graphics('./SIMbitmap3.png') +knitr::include_graphics('./SIMcluster1.png') +``` + diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/B1CDF10D-contents b/.Rproj.user/43F9182D/sources/s-918F8619/B1CDF10D-contents new file mode 100644 index 0000000..78c4f02 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/B1CDF10D-contents @@ -0,0 +1,350 @@ +--- +title: "edgefinder" +author: "Haim Bar" +date: "`r Sys.Date()`" +output: rmarkdown::pdf_document +vignette: > + %\VignetteIndexEntry{edgefinder} + %\VignetteEngine{knitr::knitr} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +The edgefinder package is used to find edges in gene networks using co-expression +data. The input to the program is a normalized expression matrix, with genes (nodes) +in the rows, and samples in the columns. +The program calculates the pair-wise correlations, performs Fisher's Z +transformation, and fits the L2N model to the transformed data. 
L2N is a mixture +model with three components: the uncorrelated pairs belong to the null component +which is assumed to be normally distributed, and the correlated pairs belong to one +of the two non-null components which are assumed to follow lognormal distributions. + +Typical datasets consist of hundreds or thousands of genes, and hence a very +large number of pairs. Therefore, edgefinder randomly selects a subset of the pairs (the +default number of pairs is 20,000), fits the L2N model to the subset, and calculates +the component probabilities for *all* possible pairs. +Using the posterior probabilities, edgefinder determines which pairs are +highly correlated while controlling the false discovery rate. +Note that edgefinder makes no assumptions about the structure of the network. + +The edgefinder package depends on the 'Matrix' package, to allow for efficient +storage and computation of large co-occurrence matrices. For simulating datasets +we used the 'huge' and 'MASS' packages, but they are not required when +using edgefinder. + +# Real data examples + +We use a publicly available dataset from +https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430 +(Horev G, Ellegood J, Lerch JP, Son YE et al. Dosage-dependent phenotypes in models +of 16p11.2 lesions found in autism. Proc Natl Acad Sci U.S.A. 2011, Oct. +11;108(41):17076-81. PMID: 21969575). + +The dataset contains three groups: wild type (2 copies of 16p11.2), +deletion (1 copy), and duplication (3 copies). +We focus on a subset of 3,454 genes, which were found to be differentially +expressed when comparing the wild-type and duplication groups (using an +FDR threshold of 0.05). We fit the +L2N model to this set of genes in each group, and compare the properties +of the two networks. First, we load the wild-type data (WT). +WT is a matrix with 3454 rows (genes) and 15 columns (samples) from +the wild-type group.
+ + + +``` +library("edgefinder") +# Wild-type first: +data(WT) +WTres <- edgefinder(WT, ttl = "Wild Type") +shortSummary(WTres) +``` + +The **edgefinder** function fits the L2N model to the data, and plots the fitted mixture distribution: + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTfit.png') +``` + +The function **shortSummary** produces the following output: + +``` +No. nodes = 3,454 +Max no. edges = 5,963,331 +No. edges detected = 80,332 +p1 = 0.0942 +p2 = 0.0185 +Est. FDR <= 0.00997 +``` + +Note that the estimated FDR is calculated based on the fitted L2N model. +The default FDR threshold used by the edgefinder function is 0.01, and in this case, the +empirical FDR is very close to the level set by the user. If the empirical FDR is too +high, you may increase **LOvals** from its default value (30). This will result in larger +(stricter) thresholds for determining significant correlations, and will decrease the +proportion of false discoveries. +The FDR threshold (the **BHthr** parameter) should be set according to the number of edges. +In this example, the algorithm finds 80,332 edges, and an FDR of 0.01 means that +at most 800 of the detected edges may not be true discoveries. If this number of edges +is too large in the sense that it may affect our inference about the network structure, +or a subsequent gene enrichment analysis, we could lower the FDR threshold. + +The function **graphComponents** finds clusters of genes. To do that, it takes as input an +adjacency (0/1) matrix (e.g. WTres$AdjMat in our example.) To find clusters it first +calculates a centrality for each node, using the formula (type\*CC+1)\*deg where +deg is the degree of the node, and CC is its clustering coefficient (CC). **type** is set +by default to 1. When it is set to 0, the centrality measure is just the degree of +the node. 
Setting type=1 means that we assign a higher value to nodes that not only have +many neighbors, but the neighbors are highly interconnected. For example, suppose we +have two components with k nodes, one has a star shape, and the other is a complete +graph. With type=0 both graphs will get the same value, but with type=1 the complete +graph will be picked by the algorithm first. +You can also set a minimum centrality value (the parameter **minCtr**) to determine the +smallest possible cluster size. + +The function returns a data frame with the following information about each node: +a label (e.g. gene name), degree, clustering coefficient, centrality measure, +cluster number, iscenter (1 if the node was chosen as the cluster's center, 0 otherwise), +the number of edges from the node to nodes in the same cluster, the number of edges +from the node to nodes NOT in the same cluster, and the standardized Manhattan distance +to the central node in the cluster (in terms of the number of neighbors they do not have +in common.) + +``` +WTComp <- graphComponents(WTres$AdjMat) +head(WTComp) + + labels degree cc ctr clustNo iscenter intEdges extEdges distCenter +1 1 251 0.5999044 401.5760 1 0 187 64 0.072958888 +2 2 0 0.0000000 0.0000 0 0 0 0 0.000000000 +3 3 202 0.7217378 347.7910 1 0 164 38 0.072090330 +4 4 202 0.5819910 319.5622 4 0 98 104 0.008396063 +5 5 0 0.0000000 0.0000 0 0 0 0 0.000000000 +6 6 9 0.6944444 15.2500 0 0 0 0 0.000000000 +``` + +The function **summarizeClusters** returns summary statistics about each cluster. +It prints the number of nodes, edges, clusters and unclustered nodes to the screen, +and returns a matrix with cluster number, number of nodes in the cluster, +fivenum summary for the degrees of nodes in the cluster, and fivenum summary for +the percentage of edges that are within the cluster.
+ +``` +summtab <- summarizeClusters(WTComp) +head(summtab[,1:7]) +head(summtab[,c(1:2,8:12)]) + +Num of nodes: 3454 +Num of edges: 80332 +Num of clusters: 72 +Num of unclustered nodes: 1837 + + Cluster Nodes degreeMin degreeQ25 degreeMedian degreeQ75 degreeMax +[1,] 1 374 59 222.0 257 299.0 373 +[2,] 2 69 17 96.0 134 164.0 234 +[3,] 3 39 2 53.5 74 122.5 209 +[4,] 4 107 25 108.0 130 155.5 209 +[5,] 5 35 26 58.5 80 109.0 154 +[6,] 6 19 17 45.5 80 108.5 133 + + + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax +[1,] 1 374 0.52073733 0.78536585 0.8452080 0.9083969 1.0000000 +[2,] 2 69 0.07109005 0.23952096 0.3061224 0.4226804 0.8235294 +[3,] 3 39 0.03571429 0.09923455 0.1358025 0.2197585 1.0000000 +[4,] 4 107 0.18750000 0.44693586 0.5555556 0.6298886 0.8529412 +[5,] 5 35 0.10344828 0.21717172 0.2777778 0.3584826 0.7692308 +[6,] 6 19 0.06666667 0.10270206 0.1262136 0.1594156 0.4210526 + +``` + +It can be seen, for example, that cluster 1 has 374 nodes, and most of them have many neighbors +(more than 75% of them have at least 222 edges), and this cluster is very interconnected (at least 75% +of the nodes are mostly connected within the cluster with at least 78.5% of their edges being inside +the cluster). + +Next, we can visualize clusters using the **plotCluster** function. For example, to plot +clusters 5 and 9 we use the following syntax: + +``` +plotCluster(WTres$AdjMat,5,WTComp) +plotCluster(WTres$AdjMat,9,WTComp) +``` + +The central node is marked by a black circle. The radius of each point corresponds +to its degree. The opacity corresponds to the percentage of edges from the node +that is in the cluster (the darker it is, the larger the percentage of edges is +within the cluster.) The distance from the center corresponds to the relative +dissimilarity with the central node. This is computed as the number of neighbors +the node and the central node do not have in common.
+For example, in cluster 9 (right plot) the dark shade of blue of all the nodes +shows that the majority of edges connecting to these nodes are within the cluster. +In contrast, the nodes in cluster 5 (left) have a larger percentage of their neighbors outside the +cluster. + + +```{r echo=FALSE, out.width='45%'} +knitr::include_graphics('./WTcluster5.png') +knitr::include_graphics('./WTcluster9.png') +``` + +Indeed, when we look at the data +``` +summtab[9,c(1:2,8:12)] + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 9.0000000 108.0000000 0.6857143 0.8768939 0.9301901 0.9657132 1.0000000 +``` +We see that the cluster contains 108 nodes, and the smallest percentage of within-cluster +edges is 68.6%, and for 75% of the nodes, the percentage is greater than 87.6%. This means that +cluster 9 is highly inter-connected, and fairly isolated. + +We can collapse the network data for more compact visualization by defining +a subset in which clusters are represented by their central nodes. The function +**collapsedGraph** returns an adjacency matrix which contains all the unclustered +nodes, and the centers of the clusters. The elements in the matrix contain the +total number of edges in the original graph. That is, the total count of edges +between clusters i and j is stored in the matrix, rather than just 0/1. To convert +it to a 0/1 adjacency matrix we can use the following: +``` +Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0 +``` + +We can use the **igraph** package to visualize the collapsed network. +For example, the following code will produce a network graph containing +all the clusters and unclustered nodes which have at least one neighbor.
+``` +library("igraph") +inc <- which(Matrix::rowSums(Adj1) > 0) +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"), + vertex.label.cex=0.7, vertex.size=0.1, edge.color='lightgreen',asp=1) +``` + +If we want to show only the relationships between clusters, we use the following: +``` +library("igraph") +inc <- which(substr(rownames(Adj1),1,3) == "CLS") +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` +This gives the following graph, where it can be seen that cluster 9 is connected to +clusters 8, 19, 20, 33, and 35. + +```{r echo=FALSE, out.width='70%'} +knitr::include_graphics('./WTclustersCropped.png') +``` + +If we want to create a subset of the original data by taking a representative from each +cluster, we can do the following: + +``` +WTclustered <- WT[union(which(WTComp$iscenter == 1), which(WTComp$clustNo == 0)),] +dim(WTclustered) +[1] 1909 15 +``` + +*Other visualizations:* + +The **plotDegCC** function can be used to plot the degree of nodes versus the +degree times the clustering coefficient of nodes. We can also highlight specific groups. +For example, in the following code we highlight +cluster 1, which as we've seen before, is a large (374 genes) and highly connected cluster: +75% of the nodes have at least 222 neighbors, and most of the connections are within the cluster +(75% of the nodes have at least 78.5% of their neighbors within the cluster.) + +``` +plotDegCC(WTres,WTComp,highlightNodes = which(WTComp$clustNo==1)) +``` + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTdg.png') +``` + +The **plotBitmapCC** function is used to show the network as a 0/1 matrix, where a black +dot corresponds to an edge in the graph. Setting **orderByCluster=TRUE** is used to +sort the nodes by clusters. When set to FALSE, the original order +of the nodes as it appears in the gene expression file, is preserved.
+We can create the bitmap plot for nodes with degree greater than or equal to +some threshold. For example, **showMinDegree=30** will result in a plot which includes +only nodes which have at least 30 neighbors. + +``` +plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30) +``` + +```{r echo=FALSE, out.width='50%'} +knitr::include_graphics('./WTbitmap.png') +``` + +We repeat the same process with the duplication group. +DUP is a matrix with 3454 rows (genes) and 12 columns (samples). +We only show the collapsed cluster plot, and observe that unlike the WT group, +the network in the DUP group consists of two "super-clusters". + +``` +data("DUP") +DUPres <- edgefinder(DUP, ttl = "Duplication") +DUPComp <- graphComponents(DUPres$AdjMat) +Adj2 <- collapsedGraph(DUPres$AdjMat, DUPComp) > 0 +inc <- which(substr(rownames(Adj2),1,3) == "CLS") +plot(graph.adjacency(Adj2[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` + + +```{r echo=FALSE, out.width='70%'} +knitr::include_graphics('./DUPclustersCropped.png') +``` + + + + +# Simulated data + +The following example shows a simulated dataset with a hub structure, consisting +of 1000 nodes and 50 hubs. The bitmap plot shows the network that was created +by edgefinder. + +``` +library("huge") +library("MASS") +N=200; D=1000 +set.seed(23197) +L = huge.generator(n = N, d = D, graph = "hub", g=50, v = 0.3, u = 0.1) +x = mvrnorm(N, rep(0, D), L$sigma) +``` + +Data generated like this is provided with the package in a dataset called SIM. We perform a similar analysis +and display the bitmap plot, which shows that edgefinder finds the actual network structure overall, with +almost no false discoveries. +We also display the network of cluster 1, which shows that the cluster is how we expected it to be, +with one central node with high degree (the hub gene), with interconnected neighbors each having a smaller degree +than the hub gene.
From the dark shade of blue for each node, we can infer that the nodes are connected +within the cluster but almost no edges to other clusters or nodes. +The smallest percentage of edges within cluster 1 is 66.7%, and at least 75% of the nodes are connected only to nodes within the same cluster. + +``` +data(SIM) +Sres <- edgefinder(SIM, ttl = "Simulation", BHthr=0.05) +plotBitmapCC(Sres$AdjMat,orderByCluster=FALSE) +SIMComp <- graphComponents(Sres$AdjMat) +plotCluster(Sres$AdjMat,1,SIMComp) +sumtab <- summarizeClusters(SIMComp) +sumtab[1,c(1:2,8:12)] + +Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 1 20 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 + +``` + + +```{r echo=FALSE, out.width='45%'} +knitr::include_graphics('./SIMbitmap3.png') +knitr::include_graphics('./SIMcluster1.png') +``` + diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/E3F75F6E b/.Rproj.user/43F9182D/sources/s-918F8619/E3F75F6E new file mode 100644 index 0000000..d8bca48 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/E3F75F6E @@ -0,0 +1,24 @@ +{ + "id": "E3F75F6E", + "path": "~/Dropbox/Packages/edgefinder/DESCRIPTION", + "project_path": "DESCRIPTION", + "type": "dcf", + "hash": "0", + "contents": "", + "dirty": false, + "created": 1596070787097.0, + "source_on_save": false, + "relative_order": 2, + "properties": { + "cursorPosition": "13,29", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1596066735, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1596066735, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/E3F75F6E-contents b/.Rproj.user/43F9182D/sources/s-918F8619/E3F75F6E-contents new file mode 100644 index 0000000..c13a239 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/E3F75F6E-contents @@ -0,0 +1,14 @@ +Package: edgefinder +Type: Package +Title: Detect Edges in Sparse 
Co-expression Graphs +Version: 0.1.5 +Author: Haim Bar and Seojin Bang +Maintainer: Haim Bar +Description: Finding edges in co-expression graphs, based on "A Mixture Model to Detect Edges in Sparse Co-expression Graphs", Haim Bar and Seojin Bang. See more details in the vignettes. +License: GPL-2 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 7.1.1 +Suggests: knitr, rmarkdown +VignetteBuilder: knitr +Depends: R (>= 3.4.0), Matrix diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/FFA512A5-contents b/.Rproj.user/43F9182D/sources/s-918F8619/FFA512A5-contents new file mode 100644 index 0000000..37654c6 --- /dev/null +++ b/.Rproj.user/43F9182D/sources/s-918F8619/FFA512A5-contents @@ -0,0 +1,17 @@ +# Generated by roxygen2: do not edit by hand + +export(clusteringCoef) +export(collapsedGraph) +export(edgefinder) +export(graphComponents) +export(plotBitmapCC) +export(plotCluster) +export(plotDegCC) +export(plotMixture) +export(shortSummary) +export(shortestPathDistance) +export(summarizeClusters) +import(graphics) +import(stats) +importFrom(Matrix,Matrix) +importFrom(grDevices,rgb) diff --git a/.Rproj.user/43F9182D/sources/s-918F8619/lock_file b/.Rproj.user/43F9182D/sources/s-918F8619/lock_file new file mode 100644 index 0000000..e69de29 diff --git a/.Rproj.user/59ACAE32/build_options b/.Rproj.user/59ACAE32/build_options new file mode 100644 index 0000000..7e9daa4 --- /dev/null +++ b/.Rproj.user/59ACAE32/build_options @@ -0,0 +1,7 @@ +auto_roxygenize_for_build_and_reload="0" +auto_roxygenize_for_build_package="1" +auto_roxygenize_for_check="1" +live_preview_website="1" +makefile_args="" +preview_website="1" +website_output_format="all" diff --git a/.Rproj.user/59ACAE32/cpp-definition-cache b/.Rproj.user/59ACAE32/cpp-definition-cache new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/.Rproj.user/59ACAE32/cpp-definition-cache @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/pcs/debug-breakpoints.pper 
b/.Rproj.user/59ACAE32/pcs/debug-breakpoints.pper new file mode 100644 index 0000000..4893a8a --- /dev/null +++ b/.Rproj.user/59ACAE32/pcs/debug-breakpoints.pper @@ -0,0 +1,5 @@ +{ + "debugBreakpointsState": { + "breakpoints": [] + } +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/pcs/files-pane.pper b/.Rproj.user/59ACAE32/pcs/files-pane.pper new file mode 100644 index 0000000..5da99b2 --- /dev/null +++ b/.Rproj.user/59ACAE32/pcs/files-pane.pper @@ -0,0 +1,9 @@ +{ + "sortOrder": [ + { + "columnIndex": 2, + "ascending": true + } + ], + "path": "~/Dropbox/Packages/edgefinder/vignettes" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/pcs/source-pane.pper b/.Rproj.user/59ACAE32/pcs/source-pane.pper new file mode 100644 index 0000000..28a3c2e --- /dev/null +++ b/.Rproj.user/59ACAE32/pcs/source-pane.pper @@ -0,0 +1,3 @@ +{ + "activeTab": 3 +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/pcs/windowlayoutstate.pper b/.Rproj.user/59ACAE32/pcs/windowlayoutstate.pper new file mode 100644 index 0000000..e03429b --- /dev/null +++ b/.Rproj.user/59ACAE32/pcs/windowlayoutstate.pper @@ -0,0 +1,14 @@ +{ + "left": { + "splitterpos": 351, + "topwindowstate": "NORMAL", + "panelheight": 845, + "windowheight": 883 + }, + "right": { + "splitterpos": 529, + "topwindowstate": "NORMAL", + "panelheight": 845, + "windowheight": 883 + } +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/pcs/workbench-pane.pper b/.Rproj.user/59ACAE32/pcs/workbench-pane.pper new file mode 100644 index 0000000..f398270 --- /dev/null +++ b/.Rproj.user/59ACAE32/pcs/workbench-pane.pper @@ -0,0 +1,5 @@ +{ + "TabSet1": 3, + "TabSet2": 3, + "TabZoom": {} +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/rmd-outputs b/.Rproj.user/59ACAE32/rmd-outputs new file mode 100644 index 0000000..081b6e1 --- /dev/null +++ b/.Rproj.user/59ACAE32/rmd-outputs @@ -0,0 +1,10 @@ +/tmp/RtmpzBAEZc/preview-30b023fcd87.dir/edgefinder.html 
+/tmp/RtmpzBAEZc/preview-30b0cb14dd9.dir/edgefinder.html +/tmp/RtmpzBAEZc/preview-30b02da5dfee.dir/edgefinder.html +/tmp/RtmpzBAEZc/preview-30b02c11b1ae.dir/edgefinder.html +/tmp/RtmpzBAEZc/preview-30b078fb3a4b.dir/edgefinder.html + + + + + diff --git a/.Rproj.user/59ACAE32/saved_source_markers b/.Rproj.user/59ACAE32/saved_source_markers new file mode 100644 index 0000000..2b1bef1 --- /dev/null +++ b/.Rproj.user/59ACAE32/saved_source_markers @@ -0,0 +1 @@ +{"active_set":"","sets":[]} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/per/t/2044BCA7 b/.Rproj.user/59ACAE32/sources/per/t/2044BCA7 new file mode 100644 index 0000000..a15d97e --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/2044BCA7 @@ -0,0 +1,25 @@ +{ + "id": "2044BCA7", + "path": "~/Documents/Projects/edgefinder_ACES/data_source.txt", + "project_path": null, + "type": "text", + "hash": "3682463091", + "contents": "", + "dirty": false, + "created": 1568586771724.0, + "source_on_save": false, + "relative_order": 3, + "properties": { + "cursorPosition": "14,0", + "scrollLine": "0", + "tempName": "Untitled1" + }, + "folds": "", + "lastKnownWriteTime": 1568589152, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1568589152013, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/per/t/2044BCA7-contents b/.Rproj.user/59ACAE32/sources/per/t/2044BCA7-contents new file mode 100644 index 0000000..fff162b --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/2044BCA7-contents @@ -0,0 +1,32 @@ + +Used the ACES data from +Allahyar A, Ubels J, de Ridder J (2019) A data-driven interactome of synergistic genes +improves network-based cancer outcome prediction. PLoS Comput Biol 15(2): e1006657. 
+https://doi.org/10.1371/journal.pcbi.1006657 + +Data available from +https://github.com/UMCUGenetics/SyNet/ +Executed +https://github.com/UMCUGenetics/SyNet/blob/master/Gene_Expression_Datasets/ACES/S01_Collect_Data.m +to obtain the ACES database, which contains 12 data sets, with G=12750, n=1616 + +The original data are obtained via +wget http://ccb.nki.nl/software/aces/ACES.tar.gz +tar -xvzf ACES.tar.gz + +Then, we run the matlab code (on UConn's SkyDrive), which includes some steps in Python. +Note that it required using Python 2.7 + +I created ACES.RData with a gene expression matrix called GE, and a data frame called +patdat with patient data; and also genenames.RData with the 12750 Entrez IDs. + +Then, I ran ACESnetwork.R with fdrlvl=0.01, and sampled 80 from each subtype (Normal, +Her2, LumA, LumB, and Basal.) +See https://www.mayoclinic.org/diseases-conditions/breast-cancer/in-depth/breast-cancer/art-20045654 +for details. +All genes were included. This created the files ResultsSUBTYPE_001.RData + +Post processing and analysis was done with ACES_analysis.R + + + diff --git a/.Rproj.user/59ACAE32/sources/per/t/95EC0C2B b/.Rproj.user/59ACAE32/sources/per/t/95EC0C2B new file mode 100644 index 0000000..b792a5c --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/95EC0C2B @@ -0,0 +1,24 @@ +{ + "id": "95EC0C2B", + "path": "~/Dropbox/Packages/edgefinder/R/edgefinder.R", + "project_path": "R/edgefinder.R", + "type": "r_source", + "hash": "3340883629", + "contents": "", + "dirty": false, + "created": 1596044881899.0, + "source_on_save": false, + "relative_order": 4, + "properties": { + "cursorPosition": "336,6", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1596066691, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1596066691358, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/per/t/95EC0C2B-contents 
b/.Rproj.user/59ACAE32/sources/per/t/95EC0C2B-contents new file mode 100644 index 0000000..083360a --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/95EC0C2B-contents @@ -0,0 +1,704 @@ + +#' Detect edges in co-expression datasets. +#' +#' Fit the L2N model to normalized correlation coefficients between pairs of +#' genes. The mixture model has three component - the null component follows +#' a normal distribution, and the two non-null components follow lognormal +#' distributions. An edge is in the graph if the correlation between the two +#' end-point genes is large enough and determined to be in one of the non-null +#' components. +#' @param Exprs A numeric matrix with normalized gene expression data. Rows +#' correspond to genes, and columns correspond to samples. +#' @param BHthr the Benjamini-Hochberg fasle discovery rate threshold to be +#' used to determine which pairs are strongly correlated. Default=0.01. +#' @param rndseed The random seed used to select a subset of the pairs. +#' @param maxLen The maximum number of pairs that will be randomly selected +#' to fit the L2N model. Default=20000. +#' @param LOvals the maximum log-odds ratio to be used to determine the +#' cut-off points to declare which correlations are significant. +#' The program will check which log-odds ratio (1,2,...,LOvals) results in +#' FDR less than or equal to the user-specified BHthr. Default=30. +#' @param ttl Title for the fitted-model plot. Default="" +#' @param trim Fraction of extreme values to exclude from the fitted-model +#' plot. Default=0 (show all the data). 
+#' @return A list with the following elements +#' \itemize{ +#' \item{G} {The total number of genes.} +#' \item{p1} {The proportion of genes in the right mixture component (positively correlated.)} +#' \item{p2} {The proportion of genes in the left mixture component (negtively correlated.)} +#' \item{p0} {The proportion of genes in the null component (un-correlated.)} +#' \item{m0, m1, m2, s0, s1, s2} {The location and scale parameters of the three mixture components.} +#' \item {thrtable} {A table with 6 columns: posterior probability ratio (ppr) between the non-null components and the null component), the right component cutoff corresponding to the ppr, the left component cutoff, the estimated probability of Type-I errors, the estimated power, the estimated FDR.} +#' \item {LogOddsRatio} {The log-odds ratio that yields FDR closest to the desired level.} +#' \item {fitted} {The fitted model (as returned by the EM function).} +#' \item {rmse} {The root mean-squared error of the fitted model.} +#' \item {rt, lt} {The significant edges (from the right and left mixture components.)} +#' \item {AdjMat} {The (sparse) adjacency matrix with edges corresponding to rt, lt.} +#' } +#' @export +#' @import stats +#' @importFrom Matrix Matrix +#' @importFrom grDevices rgb +#' @examples +#' \donttest{ +#' data(WT) +#' WTres <- edgefinder(WT, ttl = "Wild Type") +#' } +edgefinder <- function(Exprs, BHthr = 0.01, rndseed=112211, + maxLen=20000, LOvals=30, ttl="",trim=0) { + corM <- cor(t(Exprs), use = "pairwise.complete.obs") + N <- ncol(Exprs) + y <- atanh(corM[upper.tri(corM)]) + fix <- which(is.infinite(y)) + if (length(fix) > 0) + y[fix] <- max(abs(y[-fix]))*(1 + runif(length(fix))) + set.seed(rndseed) + sset <- sample(1:length(y),size = min(maxLen,length(y))) + y0 <- y[sset] + fittedL2N <- EM(y0*sqrt(N-3)) + rmseL2N <- GoodnessOfFit(fittedL2N) + plotMixture(fittedL2N,gof=rmseL2N,trim=trim, ttl=ttl) + cat("Calculating the posterior density...\n") + B <- 
posteriorDensityL2N(fittedL2N, y*sqrt(N-3)) + p1L2N <- mean(fittedL2N$b1) + p2L2N <- mean(fittedL2N$b2) + p0L2N <- 1-(p1L2N+p2L2N) + m0L2N <- fittedL2N$theta + m1L2N <- fittedL2N$mu1 + m2L2N <- fittedL2N$mu2 + s0L2N <- fittedL2N$tau + s1L2N <- fittedL2N$s1 + s2L2N <- fittedL2N$s2 + + cat("Calculating the log-odds...\n") + ret <- logoddsValues(fittedL2N$x,m0L2N,s0L2N,m1L2N,s1L2N, + m2L2N,s2L2N,p1L2N,p2L2N,vals=1:LOvals) + if (length(which(ret[,6] < BHthr) > 0)) { + LogOddsRatio <- max(min(which(ret[,6] < BHthr)),2) + } else { + LogOddsRatio <- LOvals + } + RtBFL2N <- which(B[[2]]/B[[1]] > LogOddsRatio) + LtBFL2N <- which(B[[3]]/B[[1]] > LogOddsRatio) + + cat("Calculating the adjacency matrix...\n") + G <- nrow(Exprs) + sigW <- sort(union(RtBFL2N,LtBFL2N)) + tmpmat <- Matrix::Matrix(0,G, G) + vec <- rep(0, choose(G,2)) + vec[sigW] <- 1 + tmpmat[upper.tri(tmpmat)] <- vec + AdjMat <- tmpmat+Matrix::t(tmpmat) + + list(G=G, p1=p1L2N, p2=p2L2N, p0=p0L2N, m0=m0L2N, m1=m1L2N, m2=m2L2N, + s0=s0L2N, s1=s1L2N, s2=s2L2N, thrtable=ret, LogOddsRatio=LogOddsRatio, + fitted=fittedL2N, rmse=rmseL2N, rt=RtBFL2N, lt=LtBFL2N, AdjMat=AdjMat) +} + + +#' Print a short summary of the fitted mixture model. +#' +#' Show the number of nodes, the number of possible and detected edges, the estimated proportion of positively/negatively correlated pairs, and the estimated false discovery rate. +#' @param edgefinderobj The object (list) returned from the edgefinder function. +#' @export +#' @examples +#' \donttest{ +#' data(WT) +#' WTres <- edgefinder(WT, ttl = "Wild Type") +#' shortSummary(WTres) +#' } +shortSummary <- function(edgefinderobj) { + with(edgefinderobj,{ + cat("No. nodes =", prettyNum(G,big.mark = ","),"\n") + cat("Max no. edges =", prettyNum(choose(G, 2),big.mark = ","),"\n") + cat("No. edges detected =", prettyNum(length(union(lt,rt)),big.mark = ","),"\n") + cat("p1 =",format(p1,digits=3),"\n") + cat("p2 =",format(p2,digits=3),"\n") + cat("Est. 
FDR <=", format(thrtable[LogOddsRatio,6],digits=3),"\n") + }) +} + +# The EM algorithm to fit the L2N model. +# +# Fit the L2N model to normalized correlation coefficients between pairs of genes. The mixture model has three component - the null component follows a normal distribution, and the two non-null components follow lognormal distributions. An edge is in the graph if the correlation between the two end-point genes is large enough and determined to be in one of the non-null components. +# @param x A vector of normalized correlation coefficients. +# @param max.it The maximum number of EM algorithm iterations (default=1000). +# @param tol The tolerance level to assess convergence of the EM algorithm (default=1e-12.) +# @return A list of the parameter estimates for the L2N model. +EM <- function(x, max.it=1000, tol=1e-12) { + N <- length(x) + err <- 1 + # initialize the parameter values + adjustMean <- mean(x) # centering the data around the mean + x <- x - adjustMean + # The parameters of the null ditribution, N(theta,tau) : + theta <- mean(x) + tau <- 1 + # The location and scale parameters of the nonnull components: + mu <- abs(quantile(x,c(0.05,.95))) + names(mu) <- c() + sig <- c(1, 1) + # The initial probabilities of the three components: + p0 <- 0.98 + p1 <- 0.01 + p2 <- 0.01 + # Set the initial component indicator variables: + b1 <- rep(0,N) + b2 <- rep(0,N) + m1 <- 0 + m2 <- 0 + ct <- 0 + # Run the EM algorithm until the mixture fits the empirical + # density well (total squared errors < tol) + while (err > tol) { + adjustMean <- adjustMean + theta + x <- x - theta # iteratively center the data, so that the mean of the + # null component ends up being 0 + pos <- which(x > 0) # Fit the nonnull components according to the + neg <- which(x < 0) # sign of x + + d0 <- dnorm(x, theta, tau) # null component is normal + d1 <- dlnorm(x, mu[1], sig[1]) + d2 <- dlnorm(-x, mu[2], sig[2]) + wtsm <- p0*d0 + p1*d1 + p2*d2 # The density of the mixture + b1[-pos] <- 0 + 
b2[-neg] <- 0 + b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull + b2[neg] <- pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull + b0 <- 1 - (b1+b2) # The posterior null probabilities + # Update the component weights: + p0 <- mean(b0) + p1 <- mean(b1) + p2 <- mean(b2) + # Update the null component parameters: + theta <- sum(b0*x)/sum(b0) + tau <- sqrt(sum(b0*(x-theta)^2)/sum(b0)) + d0 <- dnorm(x, theta, tau) + # Update the nonnull (nonnull) components parameters: + if (sum(b1[pos]) < 1e-2) { + mu[1] <- 0 + sig[1] <- 0 + d1 <- rep(0, N) + } else { + mu[1] <- sum(b1[pos]*(log(x[pos])))/sum(b1[pos]) + sig[1] <- sqrt(sum(b1[pos]*(log(x[pos])-mu[1])^2)/sum(b1[pos])) + d1 <- dlnorm(x, mu[1], sig[1]) + } + + if (sum(b2[neg]) < 1e-2) { + mu[2] <- 0 + sig[2] <- 0 + d2 <- rep(0, N) + } else { + mu[2] <- sum(b2[neg]*(log(-x[neg])))/sum(b2[neg]) + sig[2] <- sqrt(sum(b2[neg]*(log(-x[neg])-mu[2])^2)/sum(b2[neg])) + d2 <- dlnorm(-x, mu[2], sig[2]) + } + + # Check convergence + err <- sum((p0*d0 + p1*d1 + p2*d2 - wtsm)^2) + ct <- ct + 1 + if(ct > max.it) + break + } + b1[-pos] <- 0 + b2[-neg] <- 0 + b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull + b2[neg] <- pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull + b0 <- 1 - (b1+b2) # The posterior null probabilities + pvals <- 2*(1-pnorm(abs(x), mean=0, sd=tau)) + bh <- p.adjust(pvals, method="BH") + list(x=x, adjustMean=adjustMean, + theta=theta, tau=tau, + mu1=mu[1], s1=sig[1], + mu2=mu[2], s2=sig[2], + b0=b0, b1=b1, b2=b2, + p.val=pvals, bh=bh, + err=err, its=ct) +} + + +# Calculate the log-odds ratios to determine for each gene, in which +# of the three components in the L2N model, it belongs +logoddsValues <- function(y,theta,tau,mu1,s1,mu2,s2,p1,p2,vals=1:30) { + ret <- matrix(0,nrow=length(vals),ncol=6) + ret[,1] <- vals + p0 <- 1-p1-p2 + xs <- seq(min(y),max(y),length=10000) + pxs <- 
seq(1e-6,max(y),length=10000) + nxs <- seq(min(y),-1e-6,length=10000) + i <- 0 + for (val in vals) { + i <- i + 1 + if (p1 < 1/length(y)) { + ret[i,2] <- Inf + } else { + f <- function(x) { log((p1*dlnorm(x, mu1, s1))/ + (p0*dnorm(x, theta, tau)))-log(val) } + rt <- try(uniroot(f, lower = 1e-6, upper = max(y)), silent = T) + if (class(rt) == "try-error") + ret[i,2] <- Inf + else + ret[i,2] <- rt$root + } + if (p2 < 1/length(y)) { + ret[i,3] <- -Inf + } else { + f <- function(x) { log((p2*dlnorm(-x, mu2, s2))/ + (p0*dnorm(x, theta, tau)))-log(val) } + rt <- try(uniroot(f, lower = min(y), upper = -1e-6), silent = T) + if (class(rt) == "try-error") + ret[i,3] <- -Inf + else + ret[i,3] <- rt$root + } + # type I: + ret[i,4] <- pnorm(ret[i,3], theta, tau) + + 1 - pnorm(ret[i,2], theta, tau) + # "Power": + ret[i,5] <- (p1*(1-plnorm(ret[i,2], mu1, s1)) + + p2*(1-plnorm(-ret[i,3], mu2, s2)))/(p1+p2) + # FDR: + ret[i,6] <- p0*ret[i,4]/(p0*ret[i,4]+ret[i,5]*(p1+p2)) + } + colnames(ret) <- c("ppr","Right","Left","TypeI","Power","FDR") + ret +} + + +# calculate the posterior L2N mixture model density of x, given the parameter +# estimates +posteriorDensityL2N <- function(fit.em, x) { + p0 <- mean(fit.em$b0) + p1 <- mean(fit.em$b1) + p2 <- mean(fit.em$b2) + adjustMean <- fit.em$adjustMean + fit.em$theta + x <- x - fit.em$adjustMean + pos <- which(x > 0) # Fit the nonnull components according to the + neg <- which(x < 0) # sign of x + d0 <- dnorm(x, fit.em$theta, fit.em$tau) # null component is normal + d1 <- dlnorm(x, fit.em$mu1, fit.em$s1) + d2 <- dlnorm(-x, fit.em$mu2, fit.em$s2) + wtsm <- p0*d0 + p1*d1 + p2*d2 + b1 <- rep(0, length(x)) + b2 <- rep(0, length(x)) + b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull + b2[neg] <- pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull + b0 <- 1 - (b1+b2) # The posterior null probabilities + list(b0=b0,b1=b1,b2=b2) +} + + +# Return the estimated density function of the 
# mixture (a weighted sum of the normal null component and the two log-normal
# non-null components), evaluated at x.
#
# fit.em: list with the fitted parameters; this function reads b0, b1, b2
#         (their means are used as the mixing proportions), theta, tau,
#         mu1, s1, mu2 and s2.
# x:      numeric vector of points at which to evaluate the density.
mixtureDensityL2N <- function(fit.em, x) {
  p0 <- mean(fit.em$b0)
  p1 <- mean(fit.em$b1)
  p2 <- mean(fit.em$b2)
  # dlnorm() is 0 for non-positive arguments, so the first log-normal term
  # only contributes for x > 0 and the mirrored one only for x < 0.
  p0 * dnorm(x, fit.em$theta, fit.em$tau) +
    p1 * dlnorm(x, fit.em$mu1, fit.em$s1) +
    p2 * dlnorm(-x, fit.em$mu2, fit.em$s2)
}


# Calculate the root mean squared error of the fitted mixture: the difference
# between a kernel density estimate of fit.em$x and the fitted mixture density
# is squared, weighted by the spacing between consecutive sorted observations,
# summed, and square-rooted.
#
# fit.em:       list with the data (x) and the fitted parameters.
# mixturemodel: accepted for interface compatibility; not used in the body.
GoodnessOfFit <- function(fit.em, mixturemodel = "L2N") {
  x <- sort(fit.em$x)
  if (length(x) > 10000) {
    # Thin to 10000 evenly spaced order statistics. as.integer() truncates,
    # which is what R does when indexing with fractional positions.
    x <- x[as.integer(seq(1, length(x), length.out = 10000))]
  }
  gaps <- diff(x)
  kde <- approxfun(density(x, bw = "SJ"))
  err <- kde(x[-1]) - mixtureDensityL2N(fit.em, x[-1])
  sqrt(sum(gaps * err^2))
}


#' Find clusters, and return node characteristics.
#'
#' Take an adjacency Matrix as input and find clusters. For each node, find the
#' degree and clustering coefficient (CC). Then, calculate a centrality measure
#' (type\*CC+1)\*deg. For type=0, it's just the degree. Note that setting type=1
#' we assign a higher value to nodes that not only have many neighbors, but the
#' neighbors are highly interconnected. For example, suppose we have two
#' components with k nodes, one has a star shape, and the other is a complete
#' graph. With type=0 both graphs will get the same value, but with type=1 the
#' complete graph will be picked by the algorithm first.
#'
#' Clusters are built greedily: the unclustered node with the largest centrality
#' becomes a center, and it forms a cluster with its not-yet-clustered neighbors
#' when that group has more than \code{minCtr} members. The search stops when
#' the largest remaining centrality falls below \code{minCtr}.
#' @param A A square adjacency Matrix (0/1). Base matrices are accepted too.
#' @param minCtr The minimum centrality value to be considered for a cluster center (default=5).
#' @param type Determines how the centrality measure is computed.
#' @return A data frame with the following columns
#' \describe{
#' \item{labels}{Node label (e.g. gene names).}
#' \item{degree}{Node degree.}
#' \item{cc}{Node clustering coefficient.}
#' \item{ctr}{Node centrality measure: (type\*CC+1)\*deg.}
#' \item{clustNo}{Cluster number (0 for unclustered nodes).}
#' \item{iscenter}{1 if the node was chosen as the cluster's center, 0 otherwise.}
#' \item{intEdges}{Number of edges from the node to nodes in the same cluster.}
#' \item{extEdges}{Number of edges from the node to nodes NOT in the same cluster.}
#' \item{distCenter}{Standardized Manhattan distance to the central node.}
#' }
#' @export
#' @examples
#' \donttest{
#' data(SIM)
#' Sres <- edgefinder(SIM, ttl = "hub network")
#' SimComp <- graphComponents(Sres$AdjMat)
#' head(SimComp)
#' }
graphComponents <- function(A, minCtr = 5, type = 1) {
  # The earlier check, stopifnot(grep("Matrix", class(A)) > 0), could never
  # fail: with no match grep() returns integer(0), and stopifnot() accepts a
  # zero-length condition. Validate explicitly, still admitting the base
  # matrices that used to slip through.
  stopifnot(is.matrix(A) || methods::is(A, "Matrix"), nrow(A) == ncol(A))
  Vn <- ncol(A)
  labels <- if (is.null(rownames(A))) seq_len(Vn) else rownames(A)
  deg <- Matrix::rowSums(A)
  CC <- clusteringCoef(A)
  ctrs <- (type * CC + 1) * deg
  clustersInfo <- data.frame(labels = labels, degree = deg, cc = CC, ctr = ctrs,
                             clustNo = rep(0, Vn), iscenter = rep(0, Vn),
                             intEdges = rep(0, Vn), extEdges = rep(0, Vn),
                             distCenter = rep(0, Vn))
  clustNo <- 1
  # Isolated nodes can never join a cluster; treat them as already processed.
  clustered <- which(deg < 1)
  while (length(clustered) < Vn) {
    notInCluster <- setdiff(seq_len(Vn), clustered)
    if (max(ctrs[notInCluster]) < minCtr) {
      return(clustersInfo)
    }
    ctrnode <- notInCluster[which.max(ctrs[notInCluster])]
    # Candidate cluster: the center plus its neighbors that are still free.
    nbrs <- setdiff(sort(c(ctrnode, which(A[ctrnode, ] != 0))), clustered)
    if (length(nbrs) > 1 && length(nbrs) > minCtr) {
      clustersInfo$iscenter[ctrnode] <- 1
      clustersInfo$clustNo[nbrs] <- clustNo
      # drop = FALSE keeps a two-dimensional object even when only a single
      # column lies outside the cluster; otherwise rowSums() fails.
      clustersInfo$intEdges[nbrs] <-
        Matrix::rowSums(A[nbrs, nbrs, drop = FALSE])
      clustersInfo$extEdges[nbrs] <-
        Matrix::rowSums(A[nbrs, -nbrs, drop = FALSE])
      # Share of positions in which a member's row differs from the center's.
      for (node in nbrs) {
        clustersInfo$distCenter[node] <- mean(xor(A[ctrnode, ], A[node, ]))
      }
      clustNo <- clustNo + 1
    } else {
      # Group too small: only the would-be center is marked as processed.
      nbrs <- c()
    }
    clustered <- union(clustered, c(nbrs, ctrnode))
  }
  clustersInfo
}



#' Show cluster characteristics.
#'
#' Takes an object obtained from graphComponents and prints and returns summary statistics.
#' @param clustersInfo Obtained from graphComponents.
#' @return A matrix with one row per cluster (zero rows when no cluster was
#'   found) holding the cluster number, number of nodes, and fivenum summaries
#'   for the degrees of nodes in the cluster and for the proportion of each
#'   node's edges that are within the cluster.
#' @export
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' WTComp <- graphComponents(WTres$AdjMat)
#' (summtab <- summarizeClusters(WTComp))
#' }
summarizeClusters <- function(clustersInfo) {
  # max(0, ...) equals max(clustNo) for valid input and is 0 when there are
  # no rows at all.
  nClusters <- max(0, clustersInfo$clustNo)
  cat("Num of nodes:", nrow(clustersInfo), "\n")
  cat("Num of edges:", sum(clustersInfo$degree) / 2, "\n")
  cat("Num of clusters:", nClusters, "\n")
  cat("Num of unclustered nodes:", length(which(clustersInfo$clustNo == 0)), "\n")
  percentInCluster <- clustersInfo$intEdges / clustersInfo$degree
  percentInCluster[which(clustersInfo$degree == 0)] <- 0
  tab <- matrix(0, nrow = nClusters, ncol = 12)
  colnames(tab) <- c("Cluster", "Nodes", "degreeMin", "degreeQ25", "degreeMedian",
                     "degreeQ75", "degreeMax", "pctInClstMin", "pctInClstQ25",
                     "pctInClstMedian", "pctInClstQ75", "pctInClstMax")
  # seq_len() rather than 1:nClusters: with no clusters the latter iterates
  # over c(1, 0) and indexes a zero-row matrix.
  for (cnum in seq_len(nClusters)) {
    members <- which(clustersInfo$clustNo == cnum)
    tab[cnum, ] <- c(cnum, length(members),
                     fivenum(clustersInfo$degree[members]),
                     fivenum(percentInCluster[members]))
  }
  tab
}


#' Return an adjacency matrix after collapsing clusters into their central nodes.
#'
#' Takes an adjacency Matrix and the object obtained from graphComponents, and replaces every cluster by a single node.
#' @param A An adjacency Matrix.
#' @param clustersInfo Obtained from graphComponents
#' @return A weighted adjacency matrix between clusters and unclustered nodes.
#' @export
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' rownames(WTres$AdjMat) <- rownames(WT)
#' WTComp <- graphComponents(WTres$AdjMat)
#' Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0
#' plotBitmapCC(Adj1,showMinDegree = 2)
#' }
collapsedGraph <- function(A, clustersInfo) {
  # Layout of the result: unclustered nodes first (in their original order),
  # followed by one row/column per cluster, named CLS1, CLS2, ...
  notInCluster <- which(clustersInfo$clustNo == 0)
  nFree <- length(notInCluster)
  nClust <- max(0, clustersInfo$clustNo)
  collDim <- nFree + nClust
  collA <- Matrix::Matrix(0, ncol = collDim, nrow = collDim)
  # seq_len() instead of 1:n so that "no unclustered nodes" and "no clusters"
  # yield empty ranges rather than c(1, 0).
  freeIdx <- seq_len(nFree)
  if (nFree > 0) {
    collA[freeIdx, freeIdx] <- A[notInCluster, notInCluster, drop = FALSE] > 0
  }
  if (length(rownames(A)) != nrow(A)) {
    rownames(A) <- seq_len(nrow(A))
  }
  rownames(collA) <- c(rownames(A)[notInCluster],
                       paste0("CLS", seq_len(nClust)))
  members <- lapply(seq_len(nClust),
                    function(k) which(clustersInfo$clustNo == k))
  for (i in seq_len(nClust)) {
    Ci <- members[[i]]
    if (nFree > 0) {
      # Number of edges between each unclustered node and cluster i.
      # drop = FALSE keeps a matrix when only one unclustered node exists.
      collA[i + nFree, freeIdx] <-
        Matrix::rowSums(A[notInCluster, Ci, drop = FALSE])
    }
    if (i < nClust) {
      for (j in (i + 1):nClust) {
        # Number of edges running between cluster i and cluster j.
        collA[i + nFree, j + nFree] <- sum(A[Ci, members[[j]]])
      }
    }
  }
  # Cluster rows were filled on one side only, so adding the transpose makes
  # them symmetric.
  # NOTE(review): the node-to-node block was filled on both sides above, so
  # its entries come out as 2 rather than 1 here - confirm whether intended.
  collA + Matrix::t(collA)
}


#' Calculate the clustering coefficient of each node.
#'
#' For a node with more than one neighbor this is the number of edges among its
#' neighbors divided by the number of neighbor pairs; nodes with at most one
#' neighbor get 0.
#' @param A an adjacency Matrix (0/1).
#' @return A vector with the clustering coefficient of each node.
#' @export
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' clusteringCoef(WTres$AdjMat)
#' }
#'
clusteringCoef <- function(A) {
  rsum <- Matrix::rowSums(A)
  cc <- rep(0, nrow(A))
  # seq_len() so that a matrix with no rows returns numeric(0) instead of
  # failing on the bogus indices produced by 1:0.
  for (i in seq_len(nrow(A))) {
    if (rsum[i] > 1) {
      nbrs <- which(A[i, ] == 1)
      # sum() counts every edge among the neighbors twice (once per
      # direction), hence the factor 0.5.
      cc[i] <- 0.5 * sum(A[nbrs, nbrs]) / choose(rsum[i], 2)
    }
  }
  cc
}


#' Plot the histogram of the data and the fitted mixture distribution.
#'
#' The function is called by the edgefinder function.
#' @param fit.em The object (list) returned from the EM function with the parameter estimates for the L2N model.
#' @param gof The root mean-squared error of the fitted model (to appear in the title of the plot).
#' @param ttl The title of the plot (default="").
#' @param trim The proportion of extreme values on both sides of the distribution to eliminate from the plot (default=0). This can be useful if a small number of values are so extreme, that the plot shows mostly the tails and a spike in the middle.
#' @export
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' plotMixture(WTres$fitted, WTres$rmse)
#' }
plotMixture <- function(fit.em, gof, ttl = "", trim = 0) {
  x <- fit.em$x
  xlim <- quantile(x, c(trim / 2, 1 - trim / 2))
  # One break per 100 observations, capped at 80. The lower bound of 1 is
  # needed because hist() rejects a break count below 1, which the uncapped
  # formula produced for fewer than 100 observations.
  brks <- max(1, min(80, floor(length(x) / 100)))
  hist(x, freq = FALSE, breaks = brks,
       main = sprintf("%s\nrMSE %2.2f", ttl, gof),
       xlim = xlim, xlab = "x", border = "white", col = "wheat")
  xs <- seq(min(x), max(x), length.out = 1000)
  p0 <- mean(fit.em$b0)
  p1 <- mean(fit.em$b1)
  p2 <- mean(fit.em$b2)
  nullDens <- p0 * dnorm(xs, fit.em$theta, fit.em$tau)
  posDens <- p1 * dlnorm(xs, fit.em$mu1, fit.em$s1)
  # The negative component is a log-normal in -x. Evaluating it on xs itself
  # (instead of plotting dlnorm(xs) against -xs) covers the whole negative
  # range of the data and matches the mixture curve below.
  negDens <- p2 * dlnorm(-xs, fit.em$mu2, fit.em$s2)
  lines(xs, nullDens, col = 2, lwd = 2)
  lines(xs, posDens, col = 3, lwd = 2)
  lines(xs, negDens, col = 3, lwd = 2)
  lines(xs, nullDens + posDens + negDens, lwd = 3, col = 4, lty = 2)
  invisible(NULL)
}


#' Plot the degree of nodes versus the degree times the clustering coefficient.
#'
#' The x-axis represents the number of neighbors of each node, and the y-axis represents the degree multiplied by the clustering coefficient (the proportion of pairs of neighbors which are connected to each other).
#' @param edgefinderobj The object (list) returned by edgefinder.
#' @param clusterInfo obtained from graphComponents. If not provided by the user, it will be computed on the fly.
#' @param highlightNodes A vector of node-numbers which will be shown in red. Default is NULL.
#' @export
#' @import stats graphics
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' WTComp <- graphComponents(WTres$AdjMat)
#' plotDegCC(WTres,WTComp)
#' }
plotDegCC <- function(edgefinderobj, clusterInfo = NULL, highlightNodes = NULL) {
  if (is.null(clusterInfo)) {
    clusterInfo <- graphComponents(edgefinderobj$AdjMat)
  }
  nodeDeg <- clusterInfo$degree
  degTimesCC <- nodeDeg * clusterInfo$cc
  # Axes are suppressed here and drawn explicitly afterwards.
  plot(nodeDeg, degTimesCC, axes = FALSE, xlim = c(0, max(nodeDeg)),
       ylim = c(0, 1.1 * max(degTimesCC)), main = "",
       xlab = bquote("degree"), ylab = bquote("CC*degree"),
       col = "thistle", pch = 24, cex = 0.5)
  axis(1)
  axis(2)
  grid()
  # Reference line y = x, i.e. a clustering coefficient of 1.
  abline(0, 1, col = "seagreen1", lwd = 2)
  if (!is.null(highlightNodes)) {
    points(nodeDeg[highlightNodes], degTimesCC[highlightNodes],
           col = 2, pch = 24, cex = 0.5)
  }
  invisible(NULL)
}


#' Edge-indicator bitmap plot.
#'
#' Plot a bitmap in which a black dot corresponds to a pair of highly correlated genes (an edge in the graph).
#' The default is to show the nodes according to their order in the input.
#' By setting orderByCluster=TRUE (or by supplying clusterInfo), the nodes are instead ordered by cluster number and, within each cluster, by their distance from the center of the cluster.
#' @param AdjMat An adjacency Matrix (0/1).
#' @param clusterInfo obtained from graphComponents. If not provided by the user, it will be computed on the fly.
#' @param orderByCluster If FALSE, show the bitmap in the original node order. If TRUE, show nodes by clusters, and sort by distance from the center of the cluster.
#' @param showMinDegree Non-negative integer indicating the minimum degree of nodes that should be displayed. Default=0 (all nodes).
#' @export
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' WTComp <- graphComponents(WTres$AdjMat)
#' plotBitmapCC(WTres$AdjMat)
#' plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE)
#' plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30)
#' }
plotBitmapCC <- function(AdjMat, clusterInfo=NULL, orderByCluster=FALSE, showMinDegree=0) {
  # Cluster information is computed only when cluster ordering was requested
  # and none was supplied.
  if (is.null(clusterInfo)) {
    if (orderByCluster) {
      clusterInfo <- graphComponents(AdjMat)
    }
  }
  # Having cluster information (supplied or just computed) always means the
  # nodes are re-ordered: by cluster number, then by distance to the center.
  if (!is.null(clusterInfo)) {
    perm <- order(clusterInfo$clustNo, clusterInfo$distCenter)
    AdjMat <- AdjMat[perm, perm]
  }
  # Keep only the nodes whose degree reaches the requested minimum.
  nodeDegree <- Matrix::rowSums(AdjMat)
  keep <- which(nodeDegree >= showMinDegree)
  Matrix::image(AdjMat[keep, keep])
}


#' Plot cluster network
#'
#' Plot a cluster network with all the nodes and edges - the central node is marked by a black circle. The radius of each point corresponds to its degree. The opacity corresponds to the percentage of edges from the node that is in the cluster (the darker it is, the larger the percentage of edges is within the cluster.) The distance from the center corresponds to the relative dissimilarity with the central node. This is computed as the number of neighbors the node and the central node do not have in common.
#' @param AdjMat An adjacency Matrix (0/1).
#' @param clustNo The chosen cluster.
#' @param clusterInfo Obtained from graphComponents.
#' @export
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' WTComp <- graphComponents(WTres$AdjMat)
#' plotCluster(WTres$AdjMat, 5, WTComp)
#' }
plotCluster <- function(AdjMat, clustNo, clusterInfo = NULL) {
  if (is.null(clusterInfo)) {
    clusterInfo <- graphComponents(AdjMat)
  }
  ids <- which(clusterInfo$clustNo == clustNo)
  if (length(ids) == 0) {
    cat("Invalid cluster number\n")
    return(invisible(NULL))
  }
  # drop = FALSE keeps a matrix even for a single-node cluster.
  tmpA <- AdjMat[ids, ids, drop = FALSE]
  info <- clusterInfo[ids, ]
  nNodes <- length(ids)

  # Radius: distance to the center rescaled to 0..10 and rounded. The guards
  # avoid 0/0 when every distance (or every interval index) is zero.
  maxDist <- max(info$distCenter)
  rads <- if (maxDist > 0) {
    round(10 * info$distCenter / maxDist)
  } else {
    rep(0, nNodes)
  }
  # Angle: nodes sharing a radius are spread around the circle, with a
  # per-ring offset so that neighbouring rings do not line up.
  thetas <- rep(0, nNodes)
  intvls <- findInterval(rads, seq(1, 10))
  maxIntvl <- max(1, intvls)
  for (intvl in unique(sort(intvls))) {
    pts <- which(intvls == intvl)
    thetas[pts] <- 3 * intvl * pi / maxIntvl +
      seq(0, 1.9 * pi, length.out = length(pts))
  }
  px <- rads * cos(thetas)
  py <- rads * sin(thetas)

  sizes <- pmax(0.3, info$degree / max(info$degree))
  opacity <- 0.25 + info$intEdges / info$degree
  opacity <- opacity / max(opacity)
  nodeCol <- rgb(red = 0, green = 0, blue = 1, alpha = opacity)

  plot(px, py, cex = sizes * 3, pch = 19, axes = FALSE,
       xlab = "", ylab = "", col = nodeCol)
  # Draw one segment per edge of the cluster. The neighbor positions are
  # found within columns i..nNodes, hence the (i - 1) offset. Previously the
  # neighbor list was computed but ignored and every pair of nodes was joined.
  for (i in seq_len(nNodes - 1)) {
    nbrs <- (i - 1) + which(tmpA[i, i:nNodes] == 1)
    for (j in nbrs) {
      lines(c(px[i], px[j]), c(py[i], py[j]), col = "grey88", lwd = 0.5)
    }
  }
  # Re-draw the nodes so that they sit on top of the edges.
  points(px, py, cex = sizes * 3, pch = 19, col = nodeCol)
  ctr <- which(info$iscenter == 1)
  points(px[ctr], py[ctr], pch = 21, cex = sizes[ctr] * 3,
         col = "black", lwd = 2)
  invisible(NULL)
}


#' Return a Matrix with the shortest path distance between nodes (check up to numSteps.)
#'
#' Starting from the adjacency matrix, repeatedly extend paths by one edge and
#' record, for every pair of distinct nodes, the length of the first path that
#' connects them. Direct neighbors have distance 1; a pair first connected
#' after k additional steps has distance k+1. The diagonal and pairs not
#' reached within numSteps additional steps are 0.
#' @param AdjMat An adjacency Matrix (0/1).
#' @param numSteps The maximum number of additional edges to explore beyond direct neighbors. If numSteps=0, returns the input matrix. numSteps=1 adds neighbors of direct neighbors, etc.
#' @return A Matrix containing the shortest paths between nodes i and j
#' @export
#' @examples
#' \donttest{
#' data(SIM)
#' Sres <- edgefinder(SIM, ttl = "hub network")
#' AdjMat1 <- shortestPathDistance(Sres$AdjMat, numSteps=50)
#' max(AdjMat1)
#' Matrix::image(AdjMat1)
#' }
shortestPathDistance <- function(AdjMat, numSteps = 0) {
  if (numSteps <= 0) {
    return(AdjMat)
  }
  An <- Ap <- minDist <- AdjMat
  # A node is at distance 0 from itself; never overwrite the diagonal.
  offDiag <- diag(ncol(AdjMat)) == 0
  for (i in seq_len(numSteps)) {
    # Ap holds walk counts of length i, so An holds those of length i + 1.
    An <- Ap %*% AdjMat
    # Stop once one more step no longer changes which pairs are connected.
    if (sum((An | Ap) - (An & Ap)) == 0) {
      break
    }
    # Pairs reached now for the first time are i + 1 edges apart (recording
    # i here gave two-step neighbors the same distance as direct neighbors).
    newlyReached <- (An > 0) & (Ap == 0) & (minDist == 0) & offDiag
    minDist[newlyReached] <- i + 1
    Ap <- An
  }
  rownames(minDist) <- colnames(minDist) <- rownames(AdjMat)
  minDist
}


#' Gene Expression data for the WildType group
#'
#' WT is a matrix with normalized gene expression data containing 3454 differentially expressed genes (when compared with the duplication group) from 15 samples (columns) from the wild-type group.
#'
#' @docType data
#' @keywords datasets
#' @name WT
#' @usage data(WT)
#' @format A matrix with 3454 rows and 15 columns
#' @references \url{https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430}
NULL


#' Gene Expression data for the Duplication group
#'
#' DUP is a matrix with normalized gene expression data containing 3454 differentially expressed genes (when compared with wild-type) from 12 samples (columns) from the duplication group.
#'
#' @docType data
#' @keywords datasets
#' @name DUP
#' @usage data(DUP)
#' @format A matrix with 3454 rows and 12 columns.
+#' @references \url{https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430} +NULL + + +#' Simulated gene Expression data using the huge package +#' +#' SIM is a simulated dataset with a hub structure, consisting of 1000 nodes and 50 hubs. +#' +#' @docType data +#' @keywords datasets +#' @name SIM +#' @usage data(SIM) +#' @format A 1000 by 200 matrix, representing 50 hubs +NULL + + diff --git a/.Rproj.user/59ACAE32/sources/per/t/B94384B9 b/.Rproj.user/59ACAE32/sources/per/t/B94384B9 new file mode 100644 index 0000000..caa43b7 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/B94384B9 @@ -0,0 +1,24 @@ +{ + "id": "B94384B9", + "path": "~/Dropbox/Packages/edgefinder/DESCRIPTION", + "project_path": "DESCRIPTION", + "type": "dcf", + "hash": "229925295", + "contents": "", + "dirty": false, + "created": 1596044690350.0, + "source_on_save": false, + "relative_order": 4, + "properties": { + "cursorPosition": "3,14", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1596066735, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1596066735201, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/per/t/B94384B9-contents b/.Rproj.user/59ACAE32/sources/per/t/B94384B9-contents new file mode 100644 index 0000000..c13a239 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/B94384B9-contents @@ -0,0 +1,14 @@ +Package: edgefinder +Type: Package +Title: Detect Edges in Sparse Co-expression Graphs +Version: 0.1.5 +Author: Haim Bar and Seojin Bang +Maintainer: Haim Bar +Description: Finding edges in co-expression graphs, based on "A Mixture Model to Detect Edges in Sparse Co-expression Graphs", Haim Bar and Seojin Bang. See more details in the vignettes.
+License: GPL-2 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 7.1.1 +Suggests: knitr, rmarkdown +VignetteBuilder: knitr +Depends: R (>= 3.4.0), Matrix diff --git a/.Rproj.user/59ACAE32/sources/per/t/D2DA680E b/.Rproj.user/59ACAE32/sources/per/t/D2DA680E new file mode 100644 index 0000000..bdd6820 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/D2DA680E @@ -0,0 +1,24 @@ +{ + "id": "D2DA680E", + "path": "~/Documents/Projects/edgefinder_ACES/ACES_analysis.R", + "project_path": null, + "type": "r_source", + "hash": "1292737346", + "contents": "", + "dirty": false, + "created": 1568589159048.0, + "source_on_save": false, + "relative_order": 3, + "properties": { + "cursorPosition": "153,0", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1568771011, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1568771011644, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/per/t/D2DA680E-contents b/.Rproj.user/59ACAE32/sources/per/t/D2DA680E-contents new file mode 100644 index 0000000..02a3811 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/D2DA680E-contents @@ -0,0 +1,153 @@ +source("~/edgefinder.R") +#load("ACES.RData") +load("genenames.RData") +library("curl") +library("jsonlite") +#dim(GE) +#genenames <- colnames(GE) +#rm(GE) +load("ResultsNormal_001.RData") +SresN <- Sres +load("ResultsHer2_001.RData") +SresH <- Sres +load("ResultsLumA_001.RData") +SresLA <- Sres +load("ResultsLumB_001.RData") +SresLB <- Sres +load("ResultsBasal_001.RData") +SresB <- Sres + +plotDegCC(SresN) +plotDegCC(SresB) +plotDegCC(SresH) +plotDegCC(SresLB) +plotDegCC(SresLA) + +plotBitmapCC(SresN$AdjMat, orderByDegree = TRUE, showMinDegree = 10) +plotBitmapCC(SresB$AdjMat, orderByDegree = TRUE, showMinDegree = 10) +plotBitmapCC(SresH$AdjMat, orderByDegree = TRUE, showMinDegree = 10) +plotBitmapCC(SresLB$AdjMat, orderByDegree = TRUE, showMinDegree = 10) 
+plotBitmapCC(SresLA$AdjMat, orderByDegree = TRUE, showMinDegree = 10) + +Her2Nodes <- exportNodeInfo(SresH$AdjMat) +cl2 <- which(Her2Nodes$Cluster == 2) +cl4 <- which(Her2Nodes$Cluster == 4) +cl5 <- which(Her2Nodes$Cluster == 5) +cl6 <- which(Her2Nodes$Cluster == 6) + + +plotDegCC(SresH, highlightNodes = cl2) +plotDegCC(SresH, highlightNodes = cl5) +plotDegCC(SresH, highlightNodes = cl6) + +CL2 = Her2Nodes[cl2,] +CL5 = Her2Nodes[cl5,] +CL6 = Her2Nodes[cl6,] + +tail(CL2[order(CL2$Degree),],30) +tail(CL6[order(CL6$Degree),],30) + +plot(CL2$Degree,CL2$DegreeNIC) +plot(CL5$Degree,CL5$DegreeNIC) +plot(CL6$Degree,CL6$DegreeNIC) + +plot(CL2$Degree,CL2$CC,ylim=c(0,1)) +plot(CL5$Degree,CL5$CC,ylim=c(0,1)) +plot(CL6$Degree,CL6$CC,ylim=c(0,1)) + +geneIDs2 <- genenames[as.numeric(rownames(CL2[order(CL2$Degree),]))] # 415 (lymph?) +geneIDs6 <- genenames[as.numeric(rownames(CL6[order(CL6$Degree),]))] # 200 (placenta, ovary, endometrium) + +URLtemplate <- "https://www.ncbi.nlm.nih.gov/gene/ENTREZID/?report=expression" +# e.g. 
https://www.ncbi.nlm.nih.gov/gene/1088/?report=expression +# extract var tissues_data = {...} + +sink(file = "URLs2.txt") +for (i in 1:length(geneIDs2)) { + geneID <- gsub("Entrez_","",geneIDs2[i]) + URLtmp <- gsub("ENTREZID",geneID, URLtemplate) + cat(URLtmp,"\n") + # cat("wget ",URLtmp,"|grep \"var tissues_data\"\n") +} +sink() + +sink(file = "URLs6.txt") +for (i in 1:length(geneIDs6)) { + geneID <- gsub("Entrez_","",geneIDs6[i]) + URLtmp <- gsub("ENTREZID",geneID, URLtemplate) + cat(URLtmp,"\n") + # cat("wget ",URLtmp,"|grep \"var tissues_data\"\n") +} +sink() + +# wget -i URLs2.txt -O GenesCluster2 +# grep "tissues_data" GenesCluster2 > GenesCluster2.tissues +# wget -i URLs6.txt -O GenesCluster6 +# grep "tissues_data" GenesCluster6 > GenesCluster6.tissues +# +# var tissues_data = {'adrenal': {'id': '5613_adrenal_PRJEB4337', 'gene': 5613, 'source_name': 'adrenal', 'full_rpkm': 0.903184, 'exp_rpkm': 0.903, 'var': 0.02526453, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835266}, 'appendix': {'id': '5613_appendix_PRJEB4337', 'gene': 5613, 'source_name': 'appendix', 'full_rpkm': 6.08874, 'exp_rpkm': 6.09, 'var': 0.799857, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835267}, 'bone marrow': {'id': '5613_bone marrow_PRJEB4337', 'gene': 5613, 'source_name': 'bone marrow', 'full_rpkm': 3.09866, 'exp_rpkm': 3.1, 'var': 1.632732, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835268}, 'brain': {'id': '5613_brain_PRJEB4337', 'gene': 5613, 'source_name': 'brain', 'full_rpkm': 2.9376, 'exp_rpkm': 2.94, 'var': 5.34003, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835269}, 'colon': {'id': '5613_colon_PRJEB4337', 'gene': 5613, 'source_name': 'colon', 'full_rpkm': 5.00289, 'exp_rpkm': 5.0, 'var': 4.766245, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835270}, 'duodenum': {'id': '5613_duodenum_PRJEB4337', 'gene': 5613, 'source_name': 'duodenum', 'full_rpkm': 2.59066, 'exp_rpkm': 2.59, 'var': 0.389364, 'project_desc': 'PRJEB4337', 
'_version_': 1620403309341835271}, 'endometrium': {'id': '5613_endometrium_PRJEB4337', 'gene': 5613, 'source_name': 'endometrium', 'full_rpkm': 3.00588, 'exp_rpkm': 3.01, 'var': 0.443391, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835272}, 'esophagus': {'id': '5613_esophagus_PRJEB4337', 'gene': 5613, 'source_name': 'esophagus', 'full_rpkm': 6.04169, 'exp_rpkm': 6.04, 'var': 1.539765, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835273}, 'fat': {'id': '5613_fat_PRJEB4337', 'gene': 5613, 'source_name': 'fat', 'full_rpkm': 2.99678, 'exp_rpkm': 3.0, 'var': 0.1530957, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835274}, 'gall bladder': {'id': '5613_gall bladder_PRJEB4337', 'gene': 5613, 'source_name': 'gall bladder', 'full_rpkm': 3.16368, 'exp_rpkm': 3.16, 'var': 0.762264, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835275}, 'heart': {'id': '5613_heart_PRJEB4337', 'gene': 5613, 'source_name': 'heart', 'full_rpkm': 0.938249, 'exp_rpkm': 0.938, 'var': 0.0765732, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835276}, 'kidney': {'id': '5613_kidney_PRJEB4337', 'gene': 5613, 'source_name': 'kidney', 'full_rpkm': 7.2615, 'exp_rpkm': 7.26, 'var': 35.78764, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835277}, 'liver': {'id': '5613_liver_PRJEB4337', 'gene': 5613, 'source_name': 'liver', 'full_rpkm': 0.580494, 'exp_rpkm': 0.58, 'var': 0.00538803, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835278}, 'lung': {'id': '5613_lung_PRJEB4337', 'gene': 5613, 'source_name': 'lung', 'full_rpkm': 4.0049, 'exp_rpkm': 4.0, 'var': 0.267662, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835279}, 'lymph node': {'id': '5613_lymph node_PRJEB4337', 'gene': 5613, 'source_name': 'lymph node', 'full_rpkm': 6.96668, 'exp_rpkm': 6.97, 'var': 4.148825, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835280}, 'ovary': {'id': '5613_ovary_PRJEB4337', 'gene': 5613, 'source_name': 'ovary', 'full_rpkm': 1.35511, 
'exp_rpkm': 1.36, 'var': 0.0706074, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835281}, 'pancreas': {'id': '5613_pancreas_PRJEB4337', 'gene': 5613, 'source_name': 'pancreas', 'full_rpkm': 0.577169, 'exp_rpkm': 0.577, 'var': 0.00141569, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835282}, 'placenta': {'id': '5613_placenta_PRJEB4337', 'gene': 5613, 'source_name': 'placenta', 'full_rpkm': 3.26101, 'exp_rpkm': 3.26, 'var': 2.03578, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835283}, 'prostate': {'id': '5613_prostate_PRJEB4337', 'gene': 5613, 'source_name': 'prostate', 'full_rpkm': 1.9352, 'exp_rpkm': 1.94, 'var': 0.421672, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835284}, 'salivary gland': {'id': '5613_salivary gland_PRJEB4337', 'gene': 5613, 'source_name': 'salivary gland', 'full_rpkm': 1.06906, 'exp_rpkm': 1.07, 'var': 0.2523831, 'project_desc': 'PRJEB4337', '_version_': 1620403309341835285}, 'skin': {'id': '5613_skin_PRJEB4337', 'gene': 5613, 'source_name': 'skin', 'full_rpkm': 5.92819, 'exp_rpkm': 5.93, 'var': 4.49205, 'project_desc': 'PRJEB4337', '_version_': 1620403309342883840}, 'small intestine': {'id': '5613_small intestine_PRJEB4337', 'gene': 5613, 'source_name': 'small intestine', 'full_rpkm': 2.80388, 'exp_rpkm': 2.8, 'var': 0.618324, 'project_desc': 'PRJEB4337', '_version_': 1620403309342883841}, 'spleen': {'id': '5613_spleen_PRJEB4337', 'gene': 5613, 'source_name': 'spleen', 'full_rpkm': 7.0287, 'exp_rpkm': 7.03, 'var': 3.7221, 'project_desc': 'PRJEB4337', '_version_': 1620403309342883842}, 'stomach': {'id': '5613_stomach_PRJEB4337', 'gene': 5613, 'source_name': 'stomach', 'full_rpkm': 1.84125, 'exp_rpkm': 1.84, 'var': 0.0611346, 'project_desc': 'PRJEB4337', '_version_': 1620403309342883843}, 'testis': {'id': '5613_testis_PRJEB4337', 'gene': 5613, 'source_name': 'testis', 'full_rpkm': 5.39896, 'exp_rpkm': 5.4, 'var': 1.186563, 'project_desc': 'PRJEB4337', '_version_': 1620403309342883844}, 'thyroid': {'id': 
'5613_thyroid_PRJEB4337', 'gene': 5613, 'source_name': 'thyroid', 'full_rpkm': 22.9467, 'exp_rpkm': 22.9, 'var': 49.6716, 'project_desc': 'PRJEB4337', '_version_': 1620403309342883845}, 'urinary bladder': {'id': '5613_urinary bladder_PRJEB4337', 'gene': 5613, 'source_name': 'urinary bladder', 'full_rpkm': 3.04352, 'exp_rpkm': 3.04, 'var': 1.019288, 'project_desc': 'PRJEB4337', '_version_': 1620403309342883846}}; + +tissues <- readLines("GenesCluster2.tissues") +RPKM <- matrix(0,nrow = length(tissues), ncol = 1+27*2) +for (tnum in 1:length(tissues)) { + tissue <- gsub("\\s*var tissues_data = \\{\\s*","",tissues[tnum]) + tissue <- gsub("\\};","",tissue) + tissueVector <- unlist(strsplit(tissue, "\\},")) + for (j in 1:length(tissueVector)) { + x <- unlist(strsplit(gsub("\\S+ \\{","",tissueVector[j]), ", ")) + source_name <- unlist(strsplit(tissueVector[j],"': \\{"))[1] + source_name <- gsub(" *'","",source_name) + geneID <- as.numeric(gsub("\\D","",x[2])) + exp_rpkm <- as.numeric(gsub("\\S+: ", "", x[5])) + var_rpkm <- as.numeric(gsub("\\S+: ", "", x[6])) + cat(geneID,source_name,exp_rpkm,sqrt(var_rpkm),"\n") + RPKM[tnum, 1] <- geneID + RPKM[tnum, 2+2*(j-1)] <- exp_rpkm + RPKM[tnum, 3+2*(j-1)] <- sqrt(var_rpkm) + } +} +# genes 56, 223 returned empty arrays +RPKM <- RPKM[-c(56,223),] +rpkm <- RPKM[,seq(2,54,by=2)] +plot(apply(rpkm,2,median)) +rpkmMin <- RPKM[,seq(2,54,by=2)] - RPKM[,seq(3,55,by=2)] +plot(colMeans(rpkmMin)) + + +tissueNames <- rep("", 27) +tissues <- readLines("GenesCluster6.tissues") +RPKM6 <- matrix(0,nrow = length(tissues), ncol = 1+27*2) +for (tnum in 1:length(tissues)) { + tissue <- gsub("\\s*var tissues_data = \\{\\s*","",tissues[tnum]) + tissue <- gsub("\\};","",tissue) + tissueVector <- unlist(strsplit(tissue, "\\},")) + for (j in 1:length(tissueVector)) { + x <- unlist(strsplit(gsub("\\S+ \\{","",tissueVector[j]), ", ")) + source_name <- unlist(strsplit(tissueVector[j],"': \\{"))[1] + source_name <- gsub(" *'","",source_name) + geneID <- 
as.numeric(gsub("\\D","",x[2])) + exp_rpkm <- as.numeric(gsub("\\S+: ", "", x[5])) + var_rpkm <- as.numeric(gsub("\\S+: ", "", x[6])) + cat(geneID,source_name,exp_rpkm,sqrt(var_rpkm),"\n") + RPKM6[tnum, 1] <- geneID + RPKM6[tnum, 2+2*(j-1)] <- exp_rpkm + RPKM6[tnum, 3+2*(j-1)] <- sqrt(var_rpkm) + tissueNames[j] <- source_name + } +} +rpkm6 <- RPKM6[,seq(2,54,by=2)] +plot(apply(rpkm6,2,median)) +rpkm6Min <- RPKM6[,seq(2,54,by=2)] - RPKM6[,seq(3,55,by=2)] +plot(colMeans(rpkm6Min)) + +for (j in 1:27) { + plot(log(1+RPKM6[,2*j]),ylim=c(0,10), main=tissueNames[j]) + abline(h=0,col=2,lwd=2) + hist(log(1+RPKM6[,2*j]), xlim=c(0,10),breaks=20, main=tissueNames[j]) +} + +for (j in 1:27) { + plot(log(1+RPKM[,2*j]),ylim=c(0,10), main=tissueNames[j]) + abline(h=0,col=2,lwd=2) + hist(log(1+RPKM[,2*j]), xlim=c(0,10),breaks=20, main=tissueNames[j]) +} diff --git a/.Rproj.user/59ACAE32/sources/per/t/F7B7B971 b/.Rproj.user/59ACAE32/sources/per/t/F7B7B971 new file mode 100644 index 0000000..c7926f8 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/F7B7B971 @@ -0,0 +1,24 @@ +{ + "id": "F7B7B971", + "path": "~/Dropbox/Packages/edgefinder/vignettes/edgefinder.Rmd", + "project_path": "vignettes/edgefinder.Rmd", + "type": "r_markdown", + "hash": "4068354180", + "contents": "", + "dirty": false, + "created": 1596066698174.0, + "source_on_save": false, + "relative_order": 6, + "properties": { + "cursorPosition": "119,25", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1596066727, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1596066727321, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/per/t/F7B7B971-contents b/.Rproj.user/59ACAE32/sources/per/t/F7B7B971-contents new file mode 100644 index 0000000..c9f8641 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/F7B7B971-contents @@ -0,0 +1,350 @@ +--- +title: "edgefinder" +author: "Haim Bar" 
+date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{edgefinder} + %\VignetteEngine{knitr::knitr} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +The edgefinder package is used to find edges in gene networks using co-expression +data. The input to the program is a normalized expression matrix, with genes (nodes) +in the rows, and samples in the columns. +The program calculates the pair-wise correlations, performs Fisher's Z +transformation, and fits the L2N model to the transformed data. L2N is a mixture +model with three components: the uncorrelated pairs belong to the null component +which is assumed to be normally distributed, and the correlated pairs belong to one +of the two non-null components which are assumed to follow lognormal distributions. + +Typical datasets consist of hundreds, or thousands of genes, and hence a very +large number of pairs. Therefore, edgefinder randomly selects a subset of the pairs (the +default number of pairs is 20,000), fits the L2N model to the subset, and calculates +the component probabilities for *all* possible pairs. +Using the posterior probabilities, edgefinder determines which pairs are +highly correlated while controlling the false discovery rate. +Note that edgefinder makes no assumptions about the structure of the network. + +The edgefinder package depends on the 'Matrix' package, to allow for efficient +storage and computation of large co-occurrence matrices. For simulating datasets +we used the 'huge' and 'MASS' packages, but they are not required when +using edgefinder. + +# Real data examples + +We use a publicly available dataset from +https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430 +(Horev G, Ellegood J, Lerch JP, Son YE et al. Dosage-dependent phenotypes in models +of 16p11.2 lesions found in autism. Proc Natl Acad Sci U.S.A. 2011, Oct. +11;108(41):17076-81. PMID: 21969575). 
+ +The dataset contains three groups: wild type (2 copies of 16p11.2), +deletion (1 copy), and duplication (3 copies). +We focus on a subset of 3,454 genes, which were found to be differentially +expressed when comparing the wild-type and duplication groups (using an +FDR threshold of 0.05). We fit the +L2N model to this set of genes in each group, and compare the properties +of the two networks. First, we load the wild-type data (WT). +WT is a matrix with 3454 rows (genes) and 15 columns (samples) from +the wild-type group. + + + +``` +library("edgefinder") +# Wild-type first: +data(WT) +WTres <- edgefinder(WT, ttl = "Wild Type") +shortSummary(WTres) +``` + +The **edgefinder** function fits the L2N model to the data, and plots the fitted mixture distribution: + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTfit.png') +``` + +The function **shortSummary** produces the following output: + +``` +No. nodes = 3,454 +Max no. edges = 5,963,331 +No. edges detected = 80,332 +p1 = 0.0942 +p2 = 0.0185 +Est. FDR <= 0.00997 +``` + +Note that the estimated FDR is calculated based on the fitted L2N model. +The default FDR threshold used by the edgefinder function is 0.01, and in this case, the +empirical FDR is very close to the level set by the user. If the empirical FDR is too +high, you may increase **LOvals** from its default value (30). This will result in larger +(stricter) thresholds for determining significant correlations, and will decrease the +proportion of false discoveries. +The FDR threshold (the **BHthr** parameter) should be set according to the number of edges. +In this example, the algorithm finds 80,332 edges, and an FDR of 0.01 means that +at most about 800 of the detected edges are expected to be false discoveries. If this number +is too large in the sense that it may affect our inference about the network structure, +or a subsequent gene enrichment analysis, we could lower the FDR threshold. 
+ +The function **graphComponents** finds clusters of genes. To do that, it takes as input an +adjacency (0/1) matrix (e.g. WTres$AdjMat in our example). To find clusters it first +calculates a centrality for each node, using the formula (type\*CC+1)\*deg where +deg is the degree of the node, and CC is its clustering coefficient. **type** is set +by default to 1. When it is set to 0, the centrality measure is just the degree of +the node. Setting type=1 means that we assign a higher value to nodes that not only have +many neighbors, but whose neighbors are also highly interconnected. For example, suppose we +have two components with k nodes, one has a star shape, and the other is a complete +graph. With type=0 both graphs will get the same value, but with type=1 the complete +graph will be picked by the algorithm first. +You can also set a minimum centrality value (the parameter **minCtr**) to determine the +smallest possible cluster size. + +The function returns a data frame with the following information about each node: +a label (e.g. gene name), degree, clustering coefficient, centrality measure, +cluster number, iscenter (1 if the node was chosen as the cluster's center, 0 otherwise), +the number of edges from the node to nodes in the same cluster, the number of edges +from the node to nodes NOT in the same cluster, and the standardized Manhattan distance +to the central node in the cluster (in terms of the number of neighbors they do not have +in common). + +``` +WTComp <- graphComponents(WTres$AdjMat) +head(WTComp) + + labels degree cc ctr clustNo iscenter intEdges extEdges distCenter +1 1 251 0.5999044 401.5760 1 0 187 64 0.072958888 +2 2 0 0.0000000 0.0000 0 0 0 0 0.000000000 +3 3 202 0.7217378 347.7910 1 0 164 38 0.072090330 +4 4 202 0.5819910 319.5622 4 0 98 104 0.008396063 +5 5 0 0.0000000 0.0000 0 0 0 0 0.000000000 +6 6 9 0.6944444 15.2500 0 0 0 0 0.000000000 +``` + +The function **summarizeClusters** returns summary statistics about each cluster. 
+It prints the number of nodes, edges, clusters and unclustered nodes to the screen, +and returns a matrix with cluster number, number of nodes in the cluster, +fivenum summary for the degrees of nodes in the cluster, and fivenum summary for +the percentage of edges that are within the cluster. + +``` +summtab <- summarizeClusters(WTComp) +head(summtab[,1:7]) +head(summtab[,c(1:2,8:12)]) + +Num of nodes: 3454 +Num of edges: 80332 +Num of clusters: 72 +Num of unclustered nodes: 1837 + + Cluster Nodes degreeMin degreeQ25 degreeMedian degreeQ75 degreeMax +[1,] 1 374 59 222.0 257 299.0 373 +[2,] 2 69 17 96.0 134 164.0 234 +[3,] 3 39 2 53.5 74 122.5 209 +[4,] 4 107 25 108.0 130 155.5 209 +[5,] 5 35 26 58.5 80 109.0 154 +[6,] 6 19 17 45.5 80 108.5 133 + + + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax +[1,] 1 374 0.52073733 0.78536585 0.8452080 0.9083969 1.0000000 +[2,] 2 69 0.07109005 0.23952096 0.3061224 0.4226804 0.8235294 +[3,] 3 39 0.03571429 0.09923455 0.1358025 0.2197585 1.0000000 +[4,] 4 107 0.18750000 0.44693586 0.5555556 0.6298886 0.8529412 +[5,] 5 35 0.10344828 0.21717172 0.2777778 0.3584826 0.7692308 +[6,] 6 19 0.06666667 0.10270206 0.1262136 0.1594156 0.4210526 + +``` + +It can be seen, for example, that cluster 1 has 374 nodes, and most of them have many neighbors +(about 75% of them have at least 222 edges), and this cluster is very interconnected (at least 75% +of the nodes are mostly connected within the cluster, with at least 78.5% of their edges being inside +the cluster). + +Next, we can visualize clusters using the **plotCluster** function. For example, to plot +clusters 5 and 9 we use the following syntax: + +``` +plotCluster(WTres$AdjMat,5,WTComp) +plotCluster(WTres$AdjMat,9,WTComp) +``` + +The central node is marked by a black circle. The radius of each point corresponds +to its degree. 
The opacity corresponds to the percentage of edges from the node +that are in the cluster (the darker it is, the larger the percentage of edges +within the cluster). The distance from the center corresponds to the relative +dissimilarity with the central node. This is computed as the number of neighbors +the node and the central node do not have in common. +For example, in cluster 9 (right plot) the dark shade of blue of all the nodes +shows that the majority of edges connecting to these nodes are within the cluster. +In contrast, the nodes in cluster 5 (left) have a larger percentage of their neighbors outside the +cluster. + + +```{r echo=FALSE, out.width='45%'} +knitr::include_graphics('./WTcluster5.png') +knitr::include_graphics('./WTcluster9.png') +``` + +Indeed, when we look at the data: +``` +summtab[9,c(1:2,8:12)] + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 9.0000000 108.0000000 0.6857143 0.8768939 0.9301901 0.9657132 1.0000000 +``` +We see that the cluster contains 108 nodes, and the smallest percentage of within-cluster +edges is 68.6%, and for 75% of the nodes, the percentage is at least 87.7%. This means that +cluster 9 is highly interconnected, and fairly isolated. + +We can collapse the network data for more compact visualization by defining +a subset in which clusters are represented by their central nodes. The function +**collapsedGraph** returns an adjacency matrix which contains all the unclustered +nodes, and the centers of the clusters. The elements in the matrix contain the +total number of edges in the original graph. That is, the total count of edges +between clusters i and j is stored in the matrix, rather than just 0/1. To convert +it to a 0/1 adjacency matrix we can use the following: +``` +Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0 +``` + +We can use the **igraph** package to visualize the collapsed network. 
+For example, the following code will produce a network graph containing +all the clusters and unclustered nodes which have at least one neighbor. +``` +library("igraph") +inc <- which(Matrix::rowSums(Adj1) > 0) +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"), + vertex.label.cex=0.7, vertex.size=0.1, edge.color='lightgreen',asp=1) +``` + +If we want to show only the relationships between clusters, we use the following: +``` +library("igraph") +inc <- which(substr(rownames(Adj1),1,3) == "CLS") +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` +This gives the following graph, where it can be seen that cluster 9 is connected to +clusters 8, 19, 20, 33, and 35. + +```{r echo=FALSE, out.width='70%'} +knitr::include_graphics('./WTclustersCropped.png') +``` + +If we want to create a subset of the original data by taking a representative from each +cluster, we can do the following: + +``` +WTclustered <- WT[union(which(WTComp$iscenter == 1), which(WTComp$clustNo == 0)),] +dim(WTclustered) +[1] 1909 15 +``` + +*Other visualizations:* + +The **plotDegCC** function can be used to plot the degree of nodes versus the +degree times the clustering coefficient of nodes. We can also highlight specific groups. +For example, in the following code we highlight +cluster 1, which, as we've seen before, is large (374 genes) and highly connected: +75% of the nodes have at least 222 neighbors, and most of the connections are within the cluster +(75% of the nodes have at least 78.5% of their neighbors within the cluster). + +``` +plotDegCC(WTres,WTComp,highlightNodes = which(WTComp$clustNo==1)) +``` + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTdg.png') +``` + +The **plotBitmapCC** function is used to show the network as a 0/1 matrix, where a black +dot corresponds to an edge in the graph. Setting **orderByCluster=TRUE** is used to +sort the nodes by cluster. 
When set to FALSE, the original order +of the nodes, as it appears in the gene expression file, is preserved. +We can create the bitmap plot for nodes with degree greater than or equal to +some threshold. For example, **showMinDegree=30** will result in a plot which includes +only nodes which have at least 30 neighbors. + +``` +plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30) +``` + +```{r echo=FALSE, out.width='50%'} +knitr::include_graphics('./WTbitmap.png') +``` + +We repeat the same process with the duplication group. +DUP is a matrix with 3454 rows (genes) and 12 columns (samples). +We only show the collapsed cluster plot, and observe that unlike the WT group, +the network in the DUP group consists of two "super-clusters". + +``` +data("DUP") +DUPres <- edgefinder(DUP, ttl = "Duplication") +DUPComp <- graphComponents(DUPres$AdjMat) +Adj2 <- collapsedGraph(DUPres$AdjMat, DUPComp) > 0 +inc <- which(substr(rownames(Adj2),1,3) == "CLS") +plot(graph.adjacency(Adj2[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` + + +```{r echo=FALSE, out.width='70%'} +knitr::include_graphics('./DUPclustersCropped.png') +``` + + + + +# Simulated data + +The following example shows a simulated dataset with a hub structure, consisting +of 1000 nodes and 50 hubs. The bitmap plot shows the network that was created +by edgefinder. + +``` +library("huge") +library("MASS") +N=200; D=1000 +set.seed(23197) +L = huge.generator(n = N, d = D, graph = "hub", g=50, v = 0.3, u = 0.1) +x = mvrnorm(N, rep(0, D), L$sigma) +``` + +Data generated like this is provided with the package in a dataset called SIM. We perform a similar analysis +and display the bitmap plot, which shows that edgefinder finds the actual network structure overall, with +almost no false discoveries. 
+We also display the network of cluster 1, which shows that the cluster is how we expected it to be, +with one central node with high degree (the hub gene), with interconnected neighbors each having a smaller degree +than the hub gene. From the dark shade of blue for each node, we can infer that the nodes are connected +within the cluster but almost no edges to other clusters or nodes. +The smallest percentage of edges within cluster 1 is 66.7%, and at least 75% of the nodes are connected only to nodes within the same cluster. + +``` +data(SIM) +Sres <- edgefinder(SIM, ttl = "Simulation", BHthr=0.05) +plotBitmapCC(Sres$AdjMat,orderByCluster=FALSE) +SIMComp <- graphComponents(Sres$AdjMat) +plotCluster(Sres$AdjMat,1,SIMComp) +sumtab <- summarizeClusters(SIMComp) +sumtab[1,c(1:2,8:12)] + +Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 1 20 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 + +``` + + +```{r echo=FALSE, out.width='45%'} +knitr::include_graphics('./SIMbitmap3.png') +knitr::include_graphics('./SIMcluster1.png') +``` + diff --git a/.Rproj.user/59ACAE32/sources/per/t/FD7067BA b/.Rproj.user/59ACAE32/sources/per/t/FD7067BA new file mode 100644 index 0000000..4894545 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/FD7067BA @@ -0,0 +1,24 @@ +{ + "id": "FD7067BA", + "path": "~/Dropbox/ef_todo.txt", + "project_path": null, + "type": "text", + "hash": "730158240", + "contents": "", + "dirty": false, + "created": 1596045643231.0, + "source_on_save": false, + "relative_order": 5, + "properties": { + "cursorPosition": "2,0", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1596059243, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1596059243749, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/per/t/FD7067BA-contents b/.Rproj.user/59ACAE32/sources/per/t/FD7067BA-contents new file mode 
100644 index 0000000..1439612 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/per/t/FD7067BA-contents @@ -0,0 +1,15 @@ + +update vignette + exportNodeInfo replace with graphComponenets + remove getNeighborhood + shortestPathDistance - show example? + +github !! + + +split large files +cluster similarity ?? +Hausdorff bw clusters using shortestPathDistance + the greatest of all the distances from a point in one set to the closest + point in the other set. +use plotly ?? pgf? diff --git a/.Rproj.user/59ACAE32/sources/prop/10936438 b/.Rproj.user/59ACAE32/sources/prop/10936438 new file mode 100644 index 0000000..be4de41 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/10936438 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "5,0", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/15134A08 b/.Rproj.user/59ACAE32/sources/prop/15134A08 new file mode 100644 index 0000000..0f9d678 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/15134A08 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "2,0", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/3591249D b/.Rproj.user/59ACAE32/sources/prop/3591249D new file mode 100644 index 0000000..a947b1f --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/3591249D @@ -0,0 +1,4 @@ +{ + "cursorPosition" : "61,29", + "scrollLine" : "45" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/3F1D19A4 b/.Rproj.user/59ACAE32/sources/prop/3F1D19A4 new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/3F1D19A4 @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/74CE1C94 b/.Rproj.user/59ACAE32/sources/prop/74CE1C94 new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/74CE1C94 @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/8306B9C5 
b/.Rproj.user/59ACAE32/sources/prop/8306B9C5 new file mode 100644 index 0000000..0839e7b --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/8306B9C5 @@ -0,0 +1,4 @@ +{ + "cursorPosition" : "6,31", + "scrollLine" : "0" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/97383E84 b/.Rproj.user/59ACAE32/sources/prop/97383E84 new file mode 100644 index 0000000..d6a7d8f --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/97383E84 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "153,0", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/A8050ED4 b/.Rproj.user/59ACAE32/sources/prop/A8050ED4 new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/A8050ED4 @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/A93E5F3F b/.Rproj.user/59ACAE32/sources/prop/A93E5F3F new file mode 100644 index 0000000..f33b676 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/A93E5F3F @@ -0,0 +1,5 @@ +{ + "tempName": "Untitled1", + "cursorPosition": "127,0", + "scrollLine": "102" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/AEFC0747 b/.Rproj.user/59ACAE32/sources/prop/AEFC0747 new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/AEFC0747 @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/AF364F44 b/.Rproj.user/59ACAE32/sources/prop/AF364F44 new file mode 100644 index 0000000..3319d49 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/AF364F44 @@ -0,0 +1,5 @@ +{ + "cursorPosition": "14,0", + "scrollLine": "0", + "tempName": "Untitled1" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/C2D7035F b/.Rproj.user/59ACAE32/sources/prop/C2D7035F new file mode 100644 index 0000000..a1be1af --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/C2D7035F @@ -0,0 +1,4 @@ +{ + "cursorPosition": "3,14", + 
"scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/CD77E297 b/.Rproj.user/59ACAE32/sources/prop/CD77E297 new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/CD77E297 @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/DB526224 b/.Rproj.user/59ACAE32/sources/prop/DB526224 new file mode 100644 index 0000000..cde21a7 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/DB526224 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "336,6", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/DE093683 b/.Rproj.user/59ACAE32/sources/prop/DE093683 new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/DE093683 @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/E6F11122 b/.Rproj.user/59ACAE32/sources/prop/E6F11122 new file mode 100644 index 0000000..4a6d6ab --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/E6F11122 @@ -0,0 +1,4 @@ +{ + "cursorPosition": "119,25", + "scrollLine": "0" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/F05E0165 b/.Rproj.user/59ACAE32/sources/prop/F05E0165 new file mode 100644 index 0000000..1bed898 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/F05E0165 @@ -0,0 +1,5 @@ +{ + "cursorPosition" : "17,16", + "scrollLine" : "0", + "tempName" : "Untitled2" +} \ No newline at end of file diff --git a/.Rproj.user/59ACAE32/sources/prop/INDEX b/.Rproj.user/59ACAE32/sources/prop/INDEX new file mode 100644 index 0000000..c00b3d8 --- /dev/null +++ b/.Rproj.user/59ACAE32/sources/prop/INDEX @@ -0,0 +1,17 @@ +~%2FDocuments%2FProjects%2Fedgefinder_ACES%2FACES_analysis.R="97383E84" +~%2FDocuments%2FProjects%2Fedgefinder_ACES%2FACESnetwork.R="3591249D" +~%2FDocuments%2FProjects%2Fedgefinder_ACES%2Fdata_source.txt="AF364F44" +~%2FDropbox%2FACESnetwork.R="8306B9C5" 
+~%2FDropbox%2FPackages%2Fedgefinder%2FDESCRIPTION="C2D7035F" +~%2FDropbox%2FPackages%2Fedgefinder%2FNAMESPACE="10936438" +~%2FDropbox%2FPackages%2Fedgefinder%2FR%2Fedgefinder.R="DB526224" +~%2FDropbox%2FPackages%2Fedgefinder%2Fman%2FclusteringCoef.Rd="DE093683" +~%2FDropbox%2FPackages%2Fedgefinder%2Fman%2FcollapsedGraph.Rd="AEFC0747" +~%2FDropbox%2FPackages%2Fedgefinder%2Fman%2Fedgefinder.Rd="74CE1C94" +~%2FDropbox%2FPackages%2Fedgefinder%2Fman%2FgraphComponenets.Rd="CD77E297" +~%2FDropbox%2FPackages%2Fedgefinder%2Fman%2FshortSummary.Rd="A8050ED4" +~%2FDropbox%2FPackages%2Fedgefinder%2Fvignettes%2Fedgefinder.Rmd="E6F11122" +~%2FDropbox%2FPackages%2Fedgefinder%2Fvignettes%2Fedgefinder.md="3F1D19A4" +~%2FDropbox%2Fef_todo.txt="15134A08" +~%2FDropbox%2Foldeffunc.R="A93E5F3F" +~%2Fmmm%2FACESplots.R="F05E0165" diff --git a/.Rproj.user/8B8E5067/console06/INDEX001 b/.Rproj.user/8B8E5067/console06/INDEX001 new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/.Rproj.user/8B8E5067/console06/INDEX001 @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/cpp-definition-cache b/.Rproj.user/8B8E5067/cpp-definition-cache new file mode 100644 index 0000000..32960f8 --- /dev/null +++ b/.Rproj.user/8B8E5067/cpp-definition-cache @@ -0,0 +1,2 @@ +[ +] \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/pcs/files-pane.pper b/.Rproj.user/8B8E5067/pcs/files-pane.pper new file mode 100644 index 0000000..91499d6 --- /dev/null +++ b/.Rproj.user/8B8E5067/pcs/files-pane.pper @@ -0,0 +1,13 @@ +{ + "path" : "~/Dropbox (Personal)/Packages/edgefinder/man", + "sortOrder" : [ + { + "ascending" : false, + "columnIndex" : 4 + }, + { + "ascending" : true, + "columnIndex" : 2 + } + ] +} \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/pcs/windowlayoutstate.pper b/.Rproj.user/8B8E5067/pcs/windowlayoutstate.pper new file mode 100644 index 0000000..3dc6337 --- /dev/null +++ b/.Rproj.user/8B8E5067/pcs/windowlayoutstate.pper @@ -0,0 +1,14 @@ +{ + 
"left" : { + "panelheight" : 688, + "splitterpos" : 292, + "topwindowstate" : "NORMAL", + "windowheight" : 727 + }, + "right" : { + "panelheight" : 688, + "splitterpos" : 438, + "topwindowstate" : "NORMAL", + "windowheight" : 727 + } +} \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/pcs/workbench-pane.pper b/.Rproj.user/8B8E5067/pcs/workbench-pane.pper new file mode 100644 index 0000000..f4cc0e3 --- /dev/null +++ b/.Rproj.user/8B8E5067/pcs/workbench-pane.pper @@ -0,0 +1,6 @@ +{ + "TabSet1" : 3, + "TabSet2" : 1, + "TabZoom" : { + } +} \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/persistent-state b/.Rproj.user/8B8E5067/persistent-state new file mode 100644 index 0000000..408f7e4 --- /dev/null +++ b/.Rproj.user/8B8E5067/persistent-state @@ -0,0 +1,8 @@ +build-last-errors="[]" +build-last-errors-base-dir="~/Dropbox (Personal)/Packages/edgefinder/" +build-last-outputs="[{\"output\":\"==> R CMD INSTALL --no-multiarch --with-keep.source edgefinder\\n\\n\",\"type\":0},{\"output\":\"* installing to library ‘/Users/hyb13001/Library/R/3.5/library’\\n\",\"type\":1},{\"output\":\"* installing *source* package ‘edgefinder’ ...\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** R\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** data\\n\",\"type\":1},{\"output\":\"*** moving datasets to lazyload DB\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** byte-compile and prepare package for lazy loading\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** help\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"*** installing help indices\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** building package indices\\n\",\"type\":1},{\"output\":\"** installing vignettes\\n\",\"type\":1},{\"output\":\"** testing if installed package can be loaded\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"* DONE (edgefinder)\\n\",\"type\":1},{\"output\":\"\",\"type\":1}]" 
+compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}" +files.monitored-path="" +find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":false,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}" +imageDirtyState="1" +saveActionState="-1" diff --git a/.Rproj.user/8B8E5067/rmd-outputs b/.Rproj.user/8B8E5067/rmd-outputs new file mode 100644 index 0000000..fae6736 --- /dev/null +++ b/.Rproj.user/8B8E5067/rmd-outputs @@ -0,0 +1,6 @@ +/private/var/folders/h1/ptlb5pbx4rzfq799lrtbmk8406v_dc/T/RtmpKKVrO4/preview-e9b275b6a098.dir/edgefinder.html + + + + + diff --git a/.Rproj.user/8B8E5067/saved_source_markers b/.Rproj.user/8B8E5067/saved_source_markers new file mode 100644 index 0000000..2b1bef1 --- /dev/null +++ b/.Rproj.user/8B8E5067/saved_source_markers @@ -0,0 +1 @@ +{"active_set":"","sets":[]} \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/sources/per/t/6F931C78 b/.Rproj.user/8B8E5067/sources/per/t/6F931C78 new file mode 100644 index 0000000..d6e8321 --- /dev/null +++ b/.Rproj.user/8B8E5067/sources/per/t/6F931C78 @@ -0,0 +1,20 @@ +{ + "collab_server" : "", + "contents" : "", + "created" : 1575306043981.000, + "dirty" : false, + "encoding" : "UTF-8", + "folds" : "", + "hash" : "1999177993", + "id" : "6F931C78", + "lastKnownWriteTime" : 1575306001, + "last_content_update" : 1575306001, + "path" : "~/Dropbox (Personal)/Packages/edgefinder/DESCRIPTION", + "project_path" : "DESCRIPTION", + "properties" : { + }, + "relative_order" : 2, + "source_on_save" : false, + "source_window" : "", + "type" : "dcf" +} \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/sources/per/t/6F931C78-contents b/.Rproj.user/8B8E5067/sources/per/t/6F931C78-contents new file mode 100644 index 0000000..6934f45 --- /dev/null +++ b/.Rproj.user/8B8E5067/sources/per/t/6F931C78-contents @@ -0,0 +1,14 @@ +Package: edgefinder +Type: Package +Title: 
Detect Edges in Sparse Co-expression Graphs +Version: 0.1.3 +Author: Haim Bar and Seojin Bang +Maintainer: Haim Bar +Description: Finding edges in co-expression graphs, based on "A Mixture Model to Detect Edges in Sparse Co-expression Graphs", Haim Bar and Seojin Bang. See more details in the vignettes. +License: GPL-2 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.1.1 +Suggests: knitr, rmarkdown +VignetteBuilder: knitr +Depends: R (>= 3.4.0), Matrix diff --git a/.Rproj.user/8B8E5067/sources/prop/6572C16D b/.Rproj.user/8B8E5067/sources/prop/6572C16D new file mode 100644 index 0000000..ed575fa --- /dev/null +++ b/.Rproj.user/8B8E5067/sources/prop/6572C16D @@ -0,0 +1,4 @@ +{ + "cursorPosition" : "527,2", + "scrollLine" : "513" +} \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/sources/prop/98C8AA5E b/.Rproj.user/8B8E5067/sources/prop/98C8AA5E new file mode 100644 index 0000000..7a73a41 --- /dev/null +++ b/.Rproj.user/8B8E5067/sources/prop/98C8AA5E @@ -0,0 +1,2 @@ +{ +} \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/sources/prop/FDA935E3 b/.Rproj.user/8B8E5067/sources/prop/FDA935E3 new file mode 100644 index 0000000..3807b7a --- /dev/null +++ b/.Rproj.user/8B8E5067/sources/prop/FDA935E3 @@ -0,0 +1,4 @@ +{ + "cursorPosition" : "147,0", + "scrollLine" : "0" +} \ No newline at end of file diff --git a/.Rproj.user/8B8E5067/sources/prop/INDEX b/.Rproj.user/8B8E5067/sources/prop/INDEX new file mode 100644 index 0000000..623bf51 --- /dev/null +++ b/.Rproj.user/8B8E5067/sources/prop/INDEX @@ -0,0 +1,7 @@ +~%2FDropbox%20(Personal)%2FCMapCodeWork%2Fserver.R="F9FC0905" +~%2FDropbox%20(Personal)%2FGuy091218.R="A43E3C30" +~%2FDropbox%20(Personal)%2FPackages%2Fedgefinder%2FDESCRIPTION="98C8AA5E" +~%2FDropbox%20(Personal)%2FPackages%2Fedgefinder%2FNAMESPACE="1378FE07" +~%2FDropbox%20(Personal)%2FPackages%2Fedgefinder%2FR%2Fedgefinder.R="6572C16D" +~%2FDropbox%20(Personal)%2FPackages%2Fedgefinder%2Fvignettes%2Fedgefinder.Rmd="FDA935E3" 
+~%2FDropbox%20(Personal)%2Fedgefinder.html="2AED2535" diff --git a/.Rproj.user/A07CE820/pcs/files-pane.pper b/.Rproj.user/A07CE820/pcs/files-pane.pper new file mode 100644 index 0000000..64a2473 --- /dev/null +++ b/.Rproj.user/A07CE820/pcs/files-pane.pper @@ -0,0 +1,9 @@ +{ + "path" : "C:/Users/Haim/Box/CorNetwork_Project/edgefinder", + "sortOrder" : [ + { + "ascending" : true, + "columnIndex" : 2 + } + ] +} \ No newline at end of file diff --git a/.Rproj.user/A07CE820/persistent-state b/.Rproj.user/A07CE820/persistent-state new file mode 100644 index 0000000..a27ed20 --- /dev/null +++ b/.Rproj.user/A07CE820/persistent-state @@ -0,0 +1,8 @@ +build-last-errors="[]" +build-last-errors-base-dir="C:/Users/Haim/Box/CorNetwork_Project/edgefinder/" +build-last-outputs="[{\"output\":\"==> devtools::document(roclets=c('rd', 'collate', 'namespace', 'vignette'))\\n\\n\",\"type\":0},{\"output\":\"Updating edgefinder documentation\\r\\nLoading edgefinder\\r\\n\",\"type\":2},{\"output\":\"Writing NAMESPACE\\r\\n\",\"type\":1},{\"output\":\"Writing NAMESPACE\\r\\n\",\"type\":1},{\"output\":\"Updating vignettes\\r\\n\",\"type\":2},{\"output\":\"Documentation completed\\n\\n\",\"type\":1},{\"output\":\"==> Rcmd.exe INSTALL --no-multiarch --with-keep.source edgefinder\\n\\n\",\"type\":0},{\"output\":\"* installing to library 'C:/Users/Haim/Documents/R/win-library/3.5'\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"* installing *source* package 'edgefinder' ...\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** R\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** data\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"*** moving datasets to lazyload DB\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** byte-compile and prepare package for lazy loading\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** help\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"*** installing help 
indices\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\" converting help for package 'edgefinder'\\r\\n\",\"type\":1},{\"output\":\" edgefinder html \",\"type\":1},{\"output\":\" finding HTML links ... done\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"\\r\\n\",\"type\":1},{\"output\":\" plotBitmapCC html \\r\\n\",\"type\":1},{\"output\":\" plotDegCC html \\r\\n\",\"type\":1},{\"output\":\" plotMixture html \\r\\n\",\"type\":1},{\"output\":\" shortsummary html \",\"type\":1},{\"output\":\"\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** building package indices\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** installing vignettes\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** testing if installed package can be loaded\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"* DONE (edgefinder)\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"In R CMD INSTALL\\r\\n\",\"type\":1},{\"output\":\"\",\"type\":1}]" +compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}" +files.monitored-path="" +find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":true,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}" +imageDirtyState="1" +saveActionState="-1" diff --git a/.Rproj.user/A07CE820/sources/per/t/3F57D228-contents b/.Rproj.user/A07CE820/sources/per/t/3F57D228-contents new file mode 100644 index 0000000..ac09663 --- /dev/null +++ b/.Rproj.user/A07CE820/sources/per/t/3F57D228-contents @@ -0,0 +1,157 @@ +==> roxygen2::roxygenize('.', roclets=c('rd', 'collate', 'namespace', 'vignette')) + +Writing NAMESPACE +Loading edgefinder +Writing NAMESPACE +Updating vignettes +Documentation completed + +==> Rcmd.exe build edgefinder + +* checking for file 'edgefinder/DESCRIPTION' ... 
OK +* preparing 'edgefinder': +* checking DESCRIPTION meta-information ... OK +* installing the package to build vignettes +* creating vignettes ... OK +* checking for LF line-endings in source and make files and shell scripts +* checking for empty or unneeded directories +* looking to see if a 'data/datalist' file should be added +* building 'edgefinder_0.1.0.tar.gz' + +==> Rcmd.exe check edgefinder_0.1.0.tar.gz + +* using log directory 'C:/Users/Haim/Box/CorNetwork_Project/edgefinder.Rcheck' +* using R version 3.5.1 (2018-07-02) +* using platform: x86_64-w64-mingw32 (64-bit) +* using session charset: ISO8859-1 +* checking for file 'edgefinder/DESCRIPTION' ... OK +* checking extension type ... Package +* this is package 'edgefinder' version '0.1.0' +* package encoding: UTF-8 +* checking package namespace information ... OK +* checking package dependencies ... OK +* checking if this is a source package ... OK +* checking if there is a namespace ... OK +* checking for executable files ... OK +* checking for hidden files and directories ... OK +* checking for portable file names ... OK +* checking whether package 'edgefinder' can be installed ... OK +* checking installed package size ... OK +* checking package directory ... OK +* checking 'build' directory ... OK +* checking DESCRIPTION meta-information ... OK +* checking top-level files ... OK +* checking for left-over files ... OK +* checking index information ... OK +* checking package subdirectories ... OK +* checking R files for non-ASCII characters ... OK +* checking R files for syntax errors ... OK +* checking whether the package can be loaded ... OK +* checking whether the package can be loaded with stated dependencies ... OK +* checking whether the package can be unloaded cleanly ... OK +* checking whether the namespace can be loaded with stated dependencies ... OK +* checking whether the namespace can be unloaded cleanly ... OK +* checking loading without being on the library search path ... 
OK +* checking dependencies in R code ... NOTE +Package in Depends field not imported from: 'Matrix' + These packages need to be imported from (in the NAMESPACE file) + for when this namespace is loaded but not attached. +* checking S3 generic/method consistency ... OK +* checking replacement functions ... OK +* checking foreign function calls ... OK +* checking R code for possible problems ... NOTE +EM: no visible global function definition for 'quantile' +EM: no visible global function definition for 'dnorm' +EM: no visible global function definition for 'dlnorm' +EM: no visible global function definition for 'pnorm' +EM: no visible global function definition for 'p.adjust' +GoodnessOfFit: no visible global function definition for 'approxfun' +GoodnessOfFit: no visible global function definition for 'density' +edgefinder: no visible global function definition for 'cor' +edgefinder: no visible global function definition for 'runif' +logoddsValues : f: no visible global function definition for 'dlnorm' +logoddsValues : f: no visible global function definition for 'dnorm' +logoddsValues: no visible global function definition for 'uniroot' +logoddsValues: no visible global function definition for 'pnorm' +logoddsValues: no visible global function definition for 'plnorm' +mixtureDensityL2N: no visible global function definition for 'dnorm' +mixtureDensityL2N: no visible global function definition for 'dlnorm' +plotDegCC: no visible global function definition for 'Matrix' +plotDegCC: no visible global function definition for 'lm' +plotDegCC: no visible global function definition for 'plot' +plotDegCC: no visible global function definition for 'axis' +plotDegCC: no visible global function definition for 'grid' +plotDegCC: no visible global function definition for 'abline' +plotDegCC: no visible global function definition for 'predict.lm' +plotDegCC: no visible global function definition for 'lines' +plotMixture: no visible global function definition for 'quantile' 
+plotMixture: no visible global function definition for 'hist' +plotMixture: no visible global function definition for 'lines' +plotMixture: no visible global function definition for 'dnorm' +plotMixture: no visible global function definition for 'dlnorm' +posteriorDensityL2N: no visible global function definition for 'dnorm' +posteriorDensityL2N: no visible global function definition for 'dlnorm' +Undefined global functions or variables: + Matrix abline approxfun axis cor density dlnorm dnorm grid hist lines + lm p.adjust plnorm plot pnorm predict.lm quantile runif uniroot +Consider adding + importFrom("graphics", "abline", "axis", "grid", "hist", "lines", + "plot") + importFrom("stats", "approxfun", "cor", "density", "dlnorm", "dnorm", + "lm", "p.adjust", "plnorm", "pnorm", "predict.lm", + "quantile", "runif", "uniroot") +to your NAMESPACE file. +* checking Rd files ... OK +* checking Rd metadata ... OK +* checking Rd cross-references ... OK +* checking for missing documentation entries ... WARNING +Undocumented code objects: + 'DUP' 'WT' +Undocumented data sets: + 'DUP' 'WT' +All user-level objects in a package should have documentation entries. +See chapter 'Writing R documentation files' in the 'Writing R +Extensions' manual. +* checking for code/documentation mismatches ... OK +* checking Rd \usage sections ... WARNING +Undocumented arguments in documentation object 'EM' + 'mixturemodel' 'fixedNullVar' + +Undocumented arguments in documentation object 'plotMixture' + 'xlab' 'mixturemodel' + +Functions with \usage entries need to have the appropriate \alias +entries, and all their arguments documented. +The \usage entries must correspond to syntactically valid R code. +See chapter 'Writing R documentation files' in the 'Writing R +Extensions' manual. +* checking Rd contents ... OK +* checking for unstated dependencies in examples ... OK +* checking contents of 'data' directory ... OK +* checking data for non-ASCII characters ... 
OK +* checking data for ASCII and uncompressed saves ... WARNING + + Note: significantly better compression could be obtained + by using R CMD build --resave-data + old_size new_size compress + DUP.RData 202Kb 135Kb xz + WT.RData 249Kb 164Kb xz +* checking installed files from 'inst/doc' ... OK +* checking files in 'vignettes' ... OK +* checking examples ... OK +* checking for unstated dependencies in vignettes ... OK +* checking package vignettes in 'inst/doc' ... OK +* checking running R code from vignettes ... + 'Examples.Rmd' using 'UTF-8' ... OK + NONE +* checking re-building of vignette outputs ... OK +* checking PDF version of manual ... OK +* DONE +Status: 3 WARNINGs, 2 NOTEs + +See + 'C:/Users/Haim/Box/CorNetwork_Project/edgefinder.Rcheck/00check.log' +for details. + + +R CMD check succeeded diff --git a/.Rproj.user/A07CE820/sources/per/t/417D6893-contents b/.Rproj.user/A07CE820/sources/per/t/417D6893-contents new file mode 100644 index 0000000..5f0ff71 --- /dev/null +++ b/.Rproj.user/A07CE820/sources/per/t/417D6893-contents @@ -0,0 +1,428 @@ + +#' Detect edges in co-expression datasete. +#' +#' Fit the L2N model to normalized correlation coefficients between pairs of genes. The mixture model has three component - the null component follows a normal distribution, and the two non-null components follow lognormal distributions. An edge is in the graph if the correlation between the two end-point genes is large enough and determined to be in one of the non-null components. +#' @param Exprs A numeric matrix with normalized gene expression data. Rows +#' correspond to genes, and columns correspond to samples. +#' @param BHthr the Benjamini-Hochberg fasle discovery rate threshold to be +#' used to determine which pairs are strongly correlated. Default=0.05. +#' @param rndseed The random seed used to select a subset of the pairs. +#' @param maxLen The maximum number of pairs that will be randomly selected +#' to fit the L2N model. Default=20000. 
+#' @param LOvals the maximum log-odds ratio to be used to be used to +#' determine the cut-off points to declare which correlations are significant. +#' The program will check which log-odds ratio (1,2,...,LOvals) results in +#' FDR less than or equal to BHthr. +#' Default=20. +#' @param ttl Title for the fitted-model plot. Default="" +#' @return A list with the following elements +#' \itemize{ +#' \item{G} {The number of genes} +#' \item{p1} {The proportion of genes in the right mixture component (positively correlated.)} +#' \item{p2} {The proportion of genes in the left mixture component (negtively correlated.)} +#' \item{p0} {The proportion of genes in the null component (un-correlated.)} +#' \item{m0, m1, m2, s0, s1, s2} {The location and scale parameters of the three mixture components.} +#' \item {thrtable} {A table with 6 columns: posterior probability ratio (ppr) between the non-null components and the null component), the right component cutoff corresponding to the ppr, the left component cutoff, the estimated probability of Type-I errors, the estimated power, the estimated FDR.} +#' \item {LogOddsRatio} {The log-odds ratio that yields FDR less than or equal to the desired level.} +#' \item {rmse} {The root mean-squared error of the fitted model.} +#' \item {rt, lt} {The significant edges (from the right, and left mixture component.)} +#' } +#' @export +#' @examples +#' \donttest{ +#' data(WT) +#' WTres <- edgefinder(WT, ttl = "Wild Type") +#' } + +edgefinder <- function(Exprs, BHthr = 0.05, + rndseed=112211, maxLen=20000, LOvals=20, ttl="") { + corM <- cor(t(Exprs), use = "pairwise.complete.obs") + N <- ncol(Exprs) + y <- atanh(corM[upper.tri(corM)]) + fix <- which(is.infinite(y)) + if (length(fix) > 0) + y[fix] <- max(abs(y[-fix]))*(1 + runif(length(fix))) + set.seed(rndseed) + sset <- sample(1:length(y),size = min(maxLen,length(y))) + y0 <- y[sset] + fittedL2N <- EM(y0*sqrt(N-3), fixedNullVar = F) + rmseL2N <- GoodnessOfFit(fittedL2N) + 
plotMixture(fittedL2N,gof=rmseL2N,trim=0, ttl=ttl) + B <- posteriorDensityL2N(fittedL2N, y*sqrt(N-3)) + p1L2N <- mean(fittedL2N$b1) + p2L2N <- mean(fittedL2N$b2) + p0L2N <- 1-(p1L2N+p2L2N) + m0L2N = fittedL2N$theta + m1L2N = fittedL2N$mu1 + m2L2N = fittedL2N$mu2 + s0L2N = fittedL2N$tau + s1L2N = fittedL2N$s1 + s2L2N = fittedL2N$s2 + + ret = logoddsValues(fittedL2N$x,m0L2N,s0L2N,m1L2N, + s1L2N,m2L2N,s2L2N,p1L2N,p2L2N, + vals=1:LOvals) + if (length(which(ret[,6] < BHthr) > 0)) { + LogOddsRatio <- max(min(which(ret[,6] < BHthr)),2) + } else { + LogOddsRatio <- LOvals + } + RtBFL2N <- which(B[[2]]/B[[1]] > LogOddsRatio) + LtBFL2N <- which(B[[3]]/B[[1]] > LogOddsRatio) + + list(G=nrow(Exprs), p1=p1L2N, p2=p2L2N, p0=p0L2N, m0=m0L2N, m1=m1L2N, m2=m2L2N, + s0=s0L2N, s1=s1L2N, s2=s2L2N, thrtable=ret, LogOddsRatio=LogOddsRatio, + rmse=rmseL2N, rt=RtBFL2N, lt=LtBFL2N) +} + +#' Print a short summary of the fitted mixture model. +#' +#' Show the number of nodes, the number of possible edges, the proportion of possitively/negatively correlated pairs, the estimated power and false discovery rate. +#' @param edgefinderobj The object (list) returned from the edgefinder function. +#' @export +#' @examples +#' \donttest{ +#' data(WT) +#' WTres <- edgefinder(WT, ttl = "Wild Type") +#' shortSummary(WTres) +#' } +shortSummary <- function(edgefinderobj) { + with(edgefinderobj,{ + cat("No. nodes =", prettyNum(G,big.mark = ","),"\n") + cat("Max no. edges =", prettyNum(choose(G, 2),big.mark = ","),"\n") + cat("p1 =",format(p1,digits=3),"\n") + cat("p2 =",format(p2,digits=3),"\n") + cat("Est. Power =", format(thrtable[LogOddsRatio,5],digits=3),"\n") + cat("Est. FDR =", format(thrtable[LogOddsRatio,6],digits=3),"\n") + }) +} + +# The EM algorithm to fit the L2N model. +# +# Fit the L2N model to normalized correlation coefficients between pairs of genes. 
The mixture model has three component - the null component follows a normal distribution, and the two non-null components follow lognormal distributions. An edge is in the graph if the correlation between the two end-point genes is large enough and determined to be in one of the non-null components. +# @param x A vector of normalized correlation coefficients +# @param max.it The maximum number of EM algorithm iterations (default=1000) +# @param tol The tolerance level to assess convergence of the EM algorithm (default=1e-12.) +# @return A list of the parameter estimates for the L2N model. +# @export +EM <- function(x, max.it=1000, tol=1e-12, mixturemodel= "L2N", + fixedNullVar=FALSE) { + N <- length(x) + err <- 1 + # initialize the parameter values + adjustMean <- mean(x) # centering the data around the mean + x <- x - adjustMean + # The parameters of the null ditribution, N(theta,tau) : + theta <- mean(x) + tau <- 1 + # The location and scale parameters of the nonnull components: + mu <- abs(quantile(x,c(0.05,.95))) + names(mu) <- c() + sig <- c(1, 1) + # The initial probabilities of the three components: + p0 <- 0.98 + p1 <- 0.01 + p2 <- 0.01 + # Set the initial component indicator variables: + b1 <- rep(0,N) + b2 <- rep(0,N) + m1 <- 0 + m2 <- 0 + ct <- 0 + # Run the EM algorithm until the mixture fits the empirical + # density well (total squared errors < tol) + while (err > tol) { + adjustMean <- adjustMean + theta + x <- x - theta # iteratively center the data, so that the mean of the + # null component ends up being 0 + pos <- which(x > 0) # Fit the nonnull components according to the + neg <- which(x < 0) # sign of x + + d0 <- dnorm(x, theta, tau) # null component is normal + d1 <- dlnorm(x, mu[1], sig[1]) + d2 <- dlnorm(-x, mu[2], sig[2]) + wtsm <- p0*d0 + p1*d1 + p2*d2 # The density of the mixture + b1[-pos] <- 0 + b2[-neg] <- 0 + b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull + b2[neg] <- 
pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull + b0 <- 1 - (b1+b2) # The posterior null probabilities + # Update the component weights: + p0 <- mean(b0) + p1 <- mean(b1) + p2 <- mean(b2) + # Update the null component parameters: + theta <- sum(b0*x)/sum(b0) + if (fixedNullVar) + tau <- 1 + else + tau <- sqrt(sum(b0*(x-theta)^2)/sum(b0)) + d0 <- dnorm(x, theta, tau) + # Update the nonnull (nonnull) components parameters: + if (sum(b1[pos]) < 1e-2) { + mu[1] <- 0 + sig[1] <- 0 + d1 <- rep(0, N) + } else { + mu[1] <- sum(b1[pos]*(log(x[pos])))/sum(b1[pos]) + sig[1] <- sqrt(sum(b1[pos]*(log(x[pos])-mu[1])^2)/sum(b1[pos])) + d1 <- dlnorm(x, mu[1], sig[1]) + } + + if (sum(b2[neg]) < 1e-2) { + mu[2] <- 0 + sig[2] <- 0 + d2 <- rep(0, N) + } else { + mu[2] <- sum(b2[neg]*(log(-x[neg])))/sum(b2[neg]) + sig[2] <- sqrt(sum(b2[neg]*(log(-x[neg])-mu[2])^2)/sum(b2[neg])) + d2 <- dlnorm(-x, mu[2], sig[2]) + } + + # Check convergence + err <- sum((p0*d0 + p1*d1 + p2*d2 - wtsm)^2) + ct <- ct + 1 + if(ct > max.it) + break + } + b1[-pos] <- 0 + b2[-neg] <- 0 + b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull + b2[neg] <- pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull + b0 <- 1 - (b1+b2) # The posterior null probabilities + pvals <- 2*(1-pnorm(abs(x), mean=0, sd=tau)) + bh <- p.adjust(pvals, method="BH") + #qvals <- qvalue(pvals)$qvalues + list(x=x, adjustMean=adjustMean, + theta=theta, tau=tau, + mu1=mu[1], s1=sig[1], + mu2=mu[2], s2=sig[2], + b0=b0, b1=b1, b2=b2, + p.val=pvals, bh=bh, + #q.val=qvals, + err=err, its=ct) +} + +# Calculate the log-odds ratios to determine for each gene, in which +# of the three components in the L2N model, it belongs +logoddsValues <- function(y,theta,tau,mu1,s1,mu2,s2,p1,p2, + mixturemodel="L2N",vals=1:10) { + ret = matrix(0,nrow=length(vals),ncol=6) + ret[,1] = vals + p0 = 1-p1-p2 + xs <- seq(min(y),max(y),length=10000) + pxs <- 
seq(1e-6,max(y),length=10000) + nxs <- seq(min(y),-1e-6,length=10000) + i=0 + for (val in vals) { + i = i + 1 + if (p1 < 1/length(y)) { + ret[i,2] <- Inf + } else { + f <- function(x) { log((p1*dlnorm(x, mu1, s1))/ + (p0*dnorm(x, theta, tau)))-log(val) } + rt <- try(uniroot(f, lower = 1e-6, upper = max(y)), silent = T) + if (class(rt) == "try-error") + ret[i,2] = Inf + else + ret[i,2] = rt$root + } + if (p2 < 1/length(y)) { + ret[i,3] <- -Inf + } else { + f <- function(x) { log((p2*dlnorm(-x, mu2, s2))/ + (p0*dnorm(x, theta, tau)))-log(val) } + rt <- try(uniroot(f, lower = min(y), upper = -1e-6), silent = T) + if (class(rt) == "try-error") + ret[i,3] = -Inf + else + ret[i,3] = rt$root + } + # type I: + ret[i,4] = pnorm(ret[i,3], theta, tau) + + 1 - pnorm(ret[i,2], theta, tau) + # Power: + ret[i,5] = (p1*(1-plnorm(ret[i,2], mu1, s1)) + + p2*(1-plnorm(-ret[i,3], mu2, s2)))/(p1+p2) + # FDR: + ret[i,6] <- p0*ret[i,4]/(p0*ret[i,4]+ret[i,5]*(p1+p2)) + } + colnames(ret) <- c("ppr","Right","Left","TypeI","Power","FDR") + ret +} + +# calculate the posterior L2N mixture model density of x, given the parameter +# estimates +posteriorDensityL2N <- function(fit.em, x) { + p0 <- mean(fit.em$b0) + p1 <- mean(fit.em$b1) + p2 <- mean(fit.em$b2) + adjustMean <- fit.em$adjustMean + fit.em$theta + x <- x - fit.em$adjustMean + pos <- which(x > 0) # Fit the nonnull components according to the + neg <- which(x < 0) # sign of x + d0 <- dnorm(x, fit.em$theta, fit.em$tau) # null component is normal + d1 <- dlnorm(x, fit.em$mu1, fit.em$s1) + d2 <- dlnorm(-x, fit.em$mu2, fit.em$s2) + wtsm <- p0*d0 + p1*d1 + p2*d2 + b1 <- rep(0, length(x)) + b2 <- rep(0, length(x)) + b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull + b2[neg] <- pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull + b0 <- 1 - (b1+b2) # The posterior null probabilities + list(b0=b0,b1=b1,b2=b2) +} + +# Return the estimated density function of the mixture 
+mixtureDensityL2N <- function(fit.em, x) { + mean(fit.em$b0)*dnorm(x, fit.em$theta, fit.em$tau) + + mean(fit.em$b1)*dlnorm(x, fit.em$mu1, fit.em$s1) + + mean(fit.em$b2)*dlnorm(-x, fit.em$mu2, fit.em$s2) +} + +# Calculate the root mean squared error of the fitted mixture +GoodnessOfFit <- function(fit.em, mixturemodel="L2N") { + x <- sort(fit.em$x) + if(length(x) > 10000) + x <- x[seq(1,length(x), length=10000)] + diffs <- x[-1] - x[-length(x)] + dnsfn <- approxfun(density(x,bw="SJ")) + return(sqrt(sum((diffs* (dnsfn(x[-1])-mixtureDensityL2N(fit.em,x[-1])) ^2)))) +} + +# create clusters of nodes, based on similarity of their edges +clustNode <- function(A) { + degs <- rowSums(A) + deg1copy <- degs + ord <- rev(order(degs)) + while(max(deg1copy) >= 0) { + maxdeg <- which.max(deg1copy) + nbrs <- which(A[maxdeg,] == 1) + nbrs <- setdiff(nbrs, which(deg1copy < 0)) + deg1copy[maxdeg] <- -length(which(deg1copy < 0)) -1 + if (length(nbrs) == 0) + next + extdeg <- rep(0, length(nbrs)) + for (i in 1:length(nbrs)) { + nbr <- nbrs[i] + extdeg[i] <- length(setdiff(which(A[nbr,] == 1), nbrs)) + # external links + length(setdiff(nbrs, which(A[nbr,] == 0))) # missing internal links + } + deg1copy[nbrs[rev(order(extdeg[nbrs]))]] <- -length(which(deg1copy < 0)) - 1:length(nbrs) + } + deg1copy +} + +# calculate the clustering coefficient of a node +clusteringCoef <- function(A) { + rsum <- rowSums(A) + cc <- rep(0,nrow(A)) + for (i in 1:nrow(A)) { + if (rsum[i] <= 1) + cc[i] <- 0 + else { + nbrs <- which(A[i,] == 1) + At <- A[nbrs, nbrs] + cc[i] <- 0.5*sum(At)/choose(rsum[i],2) + } + } + cc +} + +#' Plot the histogram of the data and the fitted mixture distribution. +#' +#' The function is called by the edgefinder function. +#' @param fit.em The object (list) returned from the EM function with the parameter estimates for the L2N model. +#' @param gof The root mean-squared error of the fitted model (to appear in the title of the plot). 
+#' @param ttl The title of the plot (default=""). +#' @param trim The proportion of extreme values on both sides of the distribution to eliminate from the plot (default=0.01.) This can be useful if a small number of values are so extreme, that the plot shows mostly the tails and a spike in the middle. +#' @export +plotMixture <- function(fit.em, gof, ttl="", xlab="x", trim=0.01, + mixturemodel="L2N") { + xlim <- quantile(fit.em$x, c(trim/2, 1-trim/2)) + brks <- min(80,floor(length(fit.em$x)/100)) + hist(fit.em$x, freq=FALSE, breaks=brks, + main=sprintf("%s\nrMSE %2.2f",ttl, gof), + xlim=xlim,xlab=xlab, border="white", col="wheat") + xs <- seq(min(fit.em$x), max(fit.em$x), length=1000) + p0 <- mean(fit.em$b0) + p1 <- mean(fit.em$b1) + p2 <- mean(fit.em$b2) + lines(xs, p0*dnorm(xs, fit.em$theta, fit.em$tau), col=2, lwd=2) + lines(xs, p1*dlnorm(xs, fit.em$mu1, fit.em$s1), col=3, lwd=2) + lines(-xs, p2*dlnorm(xs, fit.em$mu2, fit.em$s2), col=3, lwd=2) + mxfit <- p0*dnorm(xs,fit.em$theta, fit.em$tau) + + p1*dlnorm(xs, fit.em$mu1, fit.em$s1) + + p2*dlnorm(-xs, fit.em$mu2, fit.em$s2) + lines(xs, mxfit, lwd=3, col=4, lty=2) +} + +#' Plot the degree of nodes versus the degree times the clustering coefficients. +#' +#' The x-axis represents the number of neighbors of each node, and the y-axis represents the proportion of neighbors which are connected to each other. +#' @param edgefinderobj The object (list) returned by edgefinder. 
+#' @export +#' @examples +#' \donttest{ +#' data(WT) +#' WTres <- edgefinder(WT, ttl = "Wild Type") +#' plotDegCC(WTres) +#' } + +plotDegCC <- function(edgefinderobj) { + sigW <- sort(union(edgefinderobj$rt,edgefinderobj$lt)) + G <- edgefinderobj$G + tmpmat <- Matrix(0,G, G) + vec <- rep(0, choose(G,2)) + vec[sigW] <- 1 + tmpmat[upper.tri(tmpmat)] = vec + A0 <- tmpmat+t.data.frame(tmpmat) + cc0 <- clusteringCoef(as.matrix(A0)) + deg0 <- rowSums(as.matrix(A0)) + lm0 <- lm(sqrt(deg0*cc0) ~ sqrt(deg0)) + M <- max(deg0) + plot(deg0, deg0*cc0,axes=F,xlim=c(0,M), + ylim=c(0,M),main="", + xlab=bquote("degree"),ylab=bquote("CC*degree"), + col="thistle",pch=24,cex=0.5); axis(1); axis(2) + grid(); abline(0,1,col="seagreen1", lwd=2) + sq <- seq(0,M,length=length(deg0)) + newdat = data.frame(deg0 = sq) + pred = predict.lm(lm0, newdata=newdat) + lines(sq, pred^2, col="orange",lwd=3,lty=2) +} + +#' Edge-indicator bitmap plot. +#' +#' Plot a bitmap in which a black dot corresponds to a pair of highly correlated genes (an edge in the graph). +#' The default is to show the nodes according to their order in the input. +#' By setting orderByDegree=T as below, it is possible to change the order and cluster them, and show them in increasing degree order (from left to right.) +#' @param edgefinderobj The object (list) returned by edgefinder. 
+#' @export +#' @examples +#' \donttest{ +#' data(WT) +#' WTres <- edgefinder(WT, ttl = "Wild Type") +#' plotBitmapCC(WTres) +#' WTres$orderByDegree=T +#' plotBitmapCC(WTres) +#' } +plotBitmapCC <- function(edgefinderobj) { + with(edgefinderobj,{ + sigW <- sort(union(rt,lt)) + tmpmat <- Matrix(0,G, G) + vec <- rep(0, choose(G,2)) + vec[sigW] <- 1 + tmpmat[upper.tri(tmpmat)] = vec + A0 <- tmpmat+t.data.frame(tmpmat) + deg0copy <- clustNode(as.matrix(A0)) + if ("orderByDegree" %in% ls()) { + if (orderByDegree) + image(A0[order(deg0copy), order(deg0copy)]) + else + image(A0) + } else { + image(A0) + } + }) +} diff --git a/.Rproj.user/A07CE820/sources/per/t/FCB2482F-contents b/.Rproj.user/A07CE820/sources/per/t/FCB2482F-contents new file mode 100644 index 0000000..24ff3fb --- /dev/null +++ b/.Rproj.user/A07CE820/sources/per/t/FCB2482F-contents @@ -0,0 +1,162 @@ +--- +title: "The edgefinder package - examples" +author: "Haim Bar" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{edgefinder} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +The edgefinder package is used to find edges in gene networks using on co-expression +data. The input to the program is a normalized expression matrix, with genes (nodes) +in the rows, and samples in the columns. +The program calculates the pair-wise correlations, performs Fisher's Z +transformation, and fits the L2N model to the transformed data. L2N is a mixture +model with three components: the uncorrelated pairs belong to the null component +which is assumed to be normally distributed, and the correlated pairs belong to one +of the two non-null components which are assumed to follow lognormal distributions. + +Typical datasets consist of hundreds, or thousands of genes, and hence a very +large number of pairs. 
Therefore, edgefinder selects a subset of the pairs (the +default number of pairs is 20,000), fits the L2N model to the subset, and calculates +the component probabilities for *all* possible pairs. +Using the posterior probabilities, edgefinder determines which pairs are +highly correlated while controling the false discovery rate. +Note that edgefinder makes no assumptions about the structure of the network. + +The edgefinder package depends on the 'Matrix' package, to allow for efficient +storage and computation of large co-occurence matrices. For simulating datasets +we used the 'huge' package, but it is not required when using edgefinder. + +# Real data examples + +We use a publiclly available dataset from +https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430 +(Horev G, Ellegood J, Lerch JP, Son YE et al. Dosage-dependent phenotypes in models +of 16p11.2 lesions found in autism. Proc Natl Acad Sci U.S.A. 2011, Oct. +11;108(41):17076-81. PMID: 21969575). + +The dataset contains three groups: wild type (2 copies of 16p11.2), +deletion (1 copy), and duplication (3 copies). +We focus on a subset of 3,454, genes which were found to be differentially +expressed when comparing the wild-type and duplication groups (using an +FDR threshold of 0.05.) +We fit the +L2N model to this set of genes in each group, and compare the properties +of the two networks. First, we load the wild-type data (WT). +WT is a matrix with 3454 rows (genes) and 15 columns (samples) from +the wild-type group. + + + +``` +library("edgefinder") +# Wild-type first: +data(WT) +WTres <- edgefinder(WT, ttl = "Wild Type") +shortSummary(WTres) +``` + +The edgefinder function plots the fitted mixture distribution: + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTfit.png') +``` + +The function shortSummary produces the following output: + +``` +No. nodes = 3,454 +Max no. edges = 5,963,331 +p1 = 0.0523 +p2 = 0.00892 +Est. Power = 0.25 +Est. 
FDR = 0.0472 +``` + +Note that the estimated power and FDR are calculated based on the fitted L2N model. + +The plotDegCC function can be used to plot the degree of nodes versus the +degree times the clustering coefficient of nodes. + +``` +plotDegCC(WTres) +``` + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTdg.png') +``` + +The plotBitmapCC function is used to show the network as a 0/1 matrix, where a black +dot corresponds to an edge in the graph. Setting orderByDegree=T is used to +sort the nodes by clusters (by degree). When set to FALSE, the original order +of the nodes as it appears in the gene expression file, is preserved. + + +``` +WTres$orderByDegree=T +plotBitmapCC(WTres) +``` + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTbitmap.png') +``` + +We repeat the same process with the duplication group. +DUP is a matrix with 3454 rows (genes) and 12 columns (samples). + +``` +data("DUP") +Dres <- edgefinder(DUP, ttl = "Duplication") +shortSummary(Dres) +plotDegCC(Dres) +Dres$orderByDegree=T +plotBitmapCC(Dres) + +No. nodes = 3,454 +Max no. edges = 5,963,331 +p1 = 0.0642 +p2 = 0.00402 +Est. Power = 0.267 +Est. FDR = 0.0488 +``` + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./DUPfit.png') +knitr::include_graphics('./DUPdg.png') +knitr::include_graphics('./DUPbitmap.png') +``` + + +# Simulated data + +The following examples shows a simulated dataset with a hub structure, consisting +of 1000 nodes and 50 hubs. The bitmap plot shows the network that was created +by edgefinder. 
+ +``` +library("huge") +N=200; D=1000 +set.seed(23197) +L = huge.generator(n = N, d = D, graph = "hub", g=50, + v = 0.3, u = 0.1) +x = mvrnorm(N, rep(0, D), L$sigma) +Sres <- edgefinder(t(x), ttl = "Simulation") +Sres$orderByDegree=F +plotBitmapCC(Sres) +``` + +library("huge") + +```{r echo=FALSE, out.width='100%'} +knitr::include_graphics('./SIMbitmap.png') +``` diff --git a/.Rproj.user/A07CE820/sources/prop/INDEX b/.Rproj.user/A07CE820/sources/prop/INDEX new file mode 100644 index 0000000..ccf2fdd --- /dev/null +++ b/.Rproj.user/A07CE820/sources/prop/INDEX @@ -0,0 +1,9 @@ +C%3A%2FUsers%2FHaim%2FBox%2FCorNetwork_Project%2Fedgefinder%2F.Rbuildignore="AC17A43B" +C%3A%2FUsers%2FHaim%2FBox%2FCorNetwork_Project%2Fedgefinder%2FDESCRIPTION="22BE9A1F" +C%3A%2FUsers%2FHaim%2FBox%2FCorNetwork_Project%2Fedgefinder%2FNAMESPACE="AF374B3F" +C%3A%2FUsers%2FHaim%2FBox%2FCorNetwork_Project%2Fedgefinder%2FR%2Fedgefinder.R="B8026A94" +C%3A%2FUsers%2FHaim%2FBox%2FCorNetwork_Project%2Fedgefinder%2Fvignettes%2FExamples.R="4074369A" +C%3A%2FUsers%2FHaim%2FBox%2FCorNetwork_Project%2Fedgefinder%2Fvignettes%2FExamples.Rmd="F6C015EE" +C%3A%2FUsers%2FHaim%2FBox%2FCorNetwork_Project%2Fedgefinder%2Fvignettes%2Fedgefinder.Rmd="F13143D0" +C%3A%2FUsers%2FHaim%2FDesktop%2Fbuild.txt="5F75CF9B" +C%3A%2FUsers%2FHaim%2FDropbox%20(Personal)%2Fbuild.txt="CB82AEA3" diff --git a/.Rproj.user/D1CCA32/pcs/files-pane.pper b/.Rproj.user/D1CCA32/pcs/files-pane.pper new file mode 100644 index 0000000..96c4265 --- /dev/null +++ b/.Rproj.user/D1CCA32/pcs/files-pane.pper @@ -0,0 +1,9 @@ +{ + "path" : "~/Box/CorNetwork_Project/edgefinder", + "sortOrder" : [ + { + "ascending" : true, + "columnIndex" : 2 + } + ] +} \ No newline at end of file diff --git a/.Rproj.user/D1CCA32/sdb/per/t/87D35FB b/.Rproj.user/D1CCA32/sdb/per/t/87D35FB new file mode 100644 index 0000000..cc12dce --- /dev/null +++ b/.Rproj.user/D1CCA32/sdb/per/t/87D35FB @@ -0,0 +1,20 @@ +{ + "collab_server" : "", + "contents" : "\n#' Detect edges in 
co-expression datasete.\n#'\n#' Fit the L2N model to normalized correlation coefficients between pairs of genes. The mixture model has three component - the null component follows a normal distribution, and the two non-null components follow lognormal distributions. An edge is in the graph if the correlation between the two end-point genes is large enough and determined to be in one of the non-null components.\n#' @param Exprs A numeric matrix with normalized gene expression data. Rows\n#' correspond to genes, and columns correspond to samples.\n#' @param BHthr the Benjamini-Hochberg fasle discovery rate threshold to be\n#' used to determine which pairs are strongly correlated. Default=0.05.\n#' @param rndseed The random seed used to select a subset of the pairs.\n#' @param maxLen The maximum number of pairs that will be randomly selected\n#' to fit the L2N model. Default=20000.\n#' @param LOvals the maximum log-odds ratio to be used to be used to\n#' determine the cut-off points to declare which correlations are significant.\n#' The program will check which log-odds ratio (1,2,...,LOvals) results in\n#' FDR less than or equal to BHthr.\n#' Default=20.\n#' @param ttl Title for the fitted-model plot. 
Default=\"\"\n#' @return A list with the following elements\n#' \\itemize{\n#' \\item{G} {The number of genes}\n#' \\item{p1} {The proportion of genes in the right mixture component (positively correlated.)}\n#' \\item{p2} {The proportion of genes in the left mixture component (negtively correlated.)}\n#' \\item{p0} {The proportion of genes in the null component (un-correlated.)}\n#' \\item{m0, m1, m2, s0, s1, s2} {The location and scale parameters of the three mixture components.}\n#' \\item {thrtable} {A table with 6 columns: posterior probability ratio (ppr) between the non-null components and the null component), the right component cutoff corresponding to the ppr, the left component cutoff, the estimated probability of Type-I errors, the estimated power, the estimated FDR.}\n#' \\item {LogOddsRatio} {The log-odds ratio that yields FDR less than or equal to the desired level.}\n#' \\item {rmse} {The root mean-squared error of the fitted model.}\n#' \\item {rt, lt} {The significant edges (from the right, and left mixture component.)}\n#' }\n#' @export\n#' @examples\n#' \\donttest{\n#' data(WT)\n#' WTres <- edgefinder(WT, ttl = \"Wild Type\")\n#' }\n\nedgefinder <- function(Exprs, BHthr = 0.05,\n rndseed=112211, maxLen=20000, LOvals=20, ttl=\"\") {\n corM <- cor(t(Exprs), use = \"pairwise.complete.obs\")\n N <- ncol(Exprs)\n y <- atanh(corM[upper.tri(corM)])\n fix <- which(is.infinite(y))\n if (length(fix) > 0)\n y[fix] <- max(abs(y[-fix]))*(1 + runif(length(fix)))\n set.seed(rndseed)\n sset <- sample(1:length(y),size = min(maxLen,length(y)))\n y0 <- y[sset]\n fittedL2N <- EM(y0*sqrt(N-3), fixedNullVar = F)\n rmseL2N <- GoodnessOfFit(fittedL2N)\n plotMixture(fittedL2N,gof=rmseL2N,trim=0, ttl=ttl)\n B <- posteriorDensityL2N(fittedL2N, y*sqrt(N-3))\n p1L2N <- mean(fittedL2N$b1)\n p2L2N <- mean(fittedL2N$b2)\n p0L2N <- 1-(p1L2N+p2L2N)\n m0L2N = fittedL2N$theta\n m1L2N = fittedL2N$mu1\n m2L2N = fittedL2N$mu2\n s0L2N = fittedL2N$tau\n s1L2N = fittedL2N$s1\n s2L2N = 
fittedL2N$s2\n\n ret = logoddsValues(fittedL2N$x,m0L2N,s0L2N,m1L2N,\n s1L2N,m2L2N,s2L2N,p1L2N,p2L2N,\n vals=1:LOvals)\n if (length(which(ret[,6] < BHthr) > 0)) {\n LogOddsRatio <- max(min(which(ret[,6] < BHthr)),2)\n } else {\n LogOddsRatio <- LOvals\n }\n RtBFL2N <- which(B[[2]]/B[[1]] > LogOddsRatio)\n LtBFL2N <- which(B[[3]]/B[[1]] > LogOddsRatio)\n\n list(G=nrow(Exprs), p1=p1L2N, p2=p2L2N, p0=p0L2N, m0=m0L2N, m1=m1L2N, m2=m2L2N,\n s0=s0L2N, s1=s1L2N, s2=s2L2N, thrtable=ret, LogOddsRatio=LogOddsRatio,\n rmse=rmseL2N, rt=RtBFL2N, lt=LtBFL2N)\n}\n\n#' Print a short summary of the fitted mixture model.\n#'\n#' Show the number of nodes, the number of possible edges, the proportion of possitively/negatively correlated pairs, the estimated power and false discovery rate.\n#' @param edgefinderobj The object (list) returned from the edgefinder function.\n#' @export\n#' @examples\n#' \\donttest{\n#' data(WT)\n#' WTres <- edgefinder(WT, ttl = \"Wild Type\")\n#' shortSummary(WTres)\n#' }\nshortSummary <- function(edgefinderobj) {\n with(edgefinderobj,{\n cat(\"No. nodes =\", prettyNum(G,big.mark = \",\"),\"\\n\")\n cat(\"Max no. edges =\", prettyNum(choose(G, 2),big.mark = \",\"),\"\\n\")\n cat(\"p1 =\",format(p1,digits=3),\"\\n\")\n cat(\"p2 =\",format(p2,digits=3),\"\\n\")\n cat(\"Est. Power =\", format(thrtable[LogOddsRatio,5],digits=3),\"\\n\")\n cat(\"Est. FDR =\", format(thrtable[LogOddsRatio,6],digits=3),\"\\n\")\n })\n}\n\n# The EM algorithm to fit the L2N model.\n#\n# Fit the L2N model to normalized correlation coefficients between pairs of genes. The mixture model has three component - the null component follows a normal distribution, and the two non-null components follow lognormal distributions. 
An edge is in the graph if the correlation between the two end-point genes is large enough and determined to be in one of the non-null components.\n# @param x A vector of normalized correlation coefficients\n# @param max.it The maximum number of EM algorithm iterations (default=1000)\n# @param tol The tolerance level to assess convergence of the EM algorithm (default=1e-12.)\n# @return A list of the parameter estimates for the L2N model.\n# @export\nEM <- function(x, max.it=1000, tol=1e-12, mixturemodel= \"L2N\",\n fixedNullVar=FALSE) {\n N <- length(x)\n err <- 1\n # initialize the parameter values\n adjustMean <- mean(x) # centering the data around the mean\n x <- x - adjustMean\n # The parameters of the null ditribution, N(theta,tau) :\n theta <- mean(x)\n tau <- 1\n # The location and scale parameters of the nonnull components:\n mu <- abs(quantile(x,c(0.05,.95)))\n names(mu) <- c()\n sig <- c(1, 1)\n # The initial probabilities of the three components:\n p0 <- 0.98\n p1 <- 0.01\n p2 <- 0.01\n # Set the initial component indicator variables:\n b1 <- rep(0,N)\n b2 <- rep(0,N)\n m1 <- 0\n m2 <- 0\n ct <- 0\n # Run the EM algorithm until the mixture fits the empirical\n # density well (total squared errors < tol)\n while (err > tol) {\n adjustMean <- adjustMean + theta\n x <- x - theta # iteratively center the data, so that the mean of the\n # null component ends up being 0\n pos <- which(x > 0) # Fit the nonnull components according to the\n neg <- which(x < 0) # sign of x\n\n d0 <- dnorm(x, theta, tau) # null component is normal\n d1 <- dlnorm(x, mu[1], sig[1])\n d2 <- dlnorm(-x, mu[2], sig[2])\n wtsm <- p0*d0 + p1*d1 + p2*d2 # The density of the mixture\n b1[-pos] <- 0\n b2[-neg] <- 0\n b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull\n b2[neg] <- pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull\n b0 <- 1 - (b1+b2) # The posterior null probabilities\n # Update the component weights:\n p0 <- 
mean(b0)\n p1 <- mean(b1)\n p2 <- mean(b2)\n # Update the null component parameters:\n theta <- sum(b0*x)/sum(b0)\n if (fixedNullVar)\n tau <- 1\n else\n tau <- sqrt(sum(b0*(x-theta)^2)/sum(b0))\n d0 <- dnorm(x, theta, tau)\n # Update the nonnull (nonnull) components parameters:\n if (sum(b1[pos]) < 1e-2) {\n mu[1] <- 0\n sig[1] <- 0\n d1 <- rep(0, N)\n } else {\n mu[1] <- sum(b1[pos]*(log(x[pos])))/sum(b1[pos])\n sig[1] <- sqrt(sum(b1[pos]*(log(x[pos])-mu[1])^2)/sum(b1[pos]))\n d1 <- dlnorm(x, mu[1], sig[1])\n }\n\n if (sum(b2[neg]) < 1e-2) {\n mu[2] <- 0\n sig[2] <- 0\n d2 <- rep(0, N)\n } else {\n mu[2] <- sum(b2[neg]*(log(-x[neg])))/sum(b2[neg])\n sig[2] <- sqrt(sum(b2[neg]*(log(-x[neg])-mu[2])^2)/sum(b2[neg]))\n d2 <- dlnorm(-x, mu[2], sig[2])\n }\n\n # Check convergence\n err <- sum((p0*d0 + p1*d1 + p2*d2 - wtsm)^2)\n ct <- ct + 1\n if(ct > max.it)\n break\n }\n b1[-pos] <- 0\n b2[-neg] <- 0\n b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull\n b2[neg] <- pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull\n b0 <- 1 - (b1+b2) # The posterior null probabilities\n pvals <- 2*(1-pnorm(abs(x), mean=0, sd=tau))\n bh <- p.adjust(pvals, method=\"BH\")\n #qvals <- qvalue(pvals)$qvalues\n list(x=x, adjustMean=adjustMean,\n theta=theta, tau=tau,\n mu1=mu[1], s1=sig[1],\n mu2=mu[2], s2=sig[2],\n b0=b0, b1=b1, b2=b2,\n p.val=pvals, bh=bh,\n #q.val=qvals,\n err=err, its=ct)\n}\n\n# Calculate the log-odds ratios to determine for each gene, in which\n# of the three components in the L2N model, it belongs\nlogoddsValues <- function(y,theta,tau,mu1,s1,mu2,s2,p1,p2,\n mixturemodel=\"L2N\",vals=1:10) {\n ret = matrix(0,nrow=length(vals),ncol=6)\n ret[,1] = vals\n p0 = 1-p1-p2\n xs <- seq(min(y),max(y),length=10000)\n pxs <- seq(1e-6,max(y),length=10000)\n nxs <- seq(min(y),-1e-6,length=10000)\n i=0\n for (val in vals) {\n i = i + 1\n if (p1 < 1/length(y)) {\n ret[i,2] <- Inf\n } else {\n f <- function(x) { 
log((p1*dlnorm(x, mu1, s1))/\n (p0*dnorm(x, theta, tau)))-log(val) }\n rt <- try(uniroot(f, lower = 1e-6, upper = max(y)), silent = T)\n if (class(rt) == \"try-error\")\n ret[i,2] = Inf\n else\n ret[i,2] = rt$root\n }\n if (p2 < 1/length(y)) {\n ret[i,3] <- -Inf\n } else {\n f <- function(x) { log((p2*dlnorm(-x, mu2, s2))/\n (p0*dnorm(x, theta, tau)))-log(val) }\n rt <- try(uniroot(f, lower = min(y), upper = -1e-6), silent = T)\n if (class(rt) == \"try-error\")\n ret[i,3] = -Inf\n else\n ret[i,3] = rt$root\n }\n # type I:\n ret[i,4] = pnorm(ret[i,3], theta, tau) +\n 1 - pnorm(ret[i,2], theta, tau)\n # Power:\n ret[i,5] = (p1*(1-plnorm(ret[i,2], mu1, s1)) +\n p2*(1-plnorm(-ret[i,3], mu2, s2)))/(p1+p2)\n # FDR:\n ret[i,6] <- p0*ret[i,4]/(p0*ret[i,4]+ret[i,5]*(p1+p2))\n }\n colnames(ret) <- c(\"ppr\",\"Right\",\"Left\",\"TypeI\",\"Power\",\"FDR\")\n ret\n}\n\n# calculate the posterior L2N mixture model density of x, given the parameter\n# estimates\nposteriorDensityL2N <- function(fit.em, x) {\n p0 <- mean(fit.em$b0)\n p1 <- mean(fit.em$b1)\n p2 <- mean(fit.em$b2)\n adjustMean <- fit.em$adjustMean + fit.em$theta\n x <- x - fit.em$adjustMean\n pos <- which(x > 0) # Fit the nonnull components according to the\n neg <- which(x < 0) # sign of x\n d0 <- dnorm(x, fit.em$theta, fit.em$tau) # null component is normal\n d1 <- dlnorm(x, fit.em$mu1, fit.em$s1)\n d2 <- dlnorm(-x, fit.em$mu2, fit.em$s2)\n wtsm <- p0*d0 + p1*d1 + p2*d2\n b1 <- rep(0, length(x))\n b2 <- rep(0, length(x))\n b1[pos] <- pmin(1,p1*d1[pos]/wtsm[pos]) # Posterior probabilities of the positive nonnull\n b2[neg] <- pmin(1,p2*d2[neg]/wtsm[neg]) # Posterior probabilities of the negative nonnull\n b0 <- 1 - (b1+b2) # The posterior null probabilities\n list(b0=b0,b1=b1,b2=b2)\n}\n\n# Return the estimated density function of the mixture\nmixtureDensityL2N <- function(fit.em, x) {\n mean(fit.em$b0)*dnorm(x, fit.em$theta, fit.em$tau) +\n mean(fit.em$b1)*dlnorm(x, fit.em$mu1, fit.em$s1) +\n 
mean(fit.em$b2)*dlnorm(-x, fit.em$mu2, fit.em$s2)\n}\n\n# Calculate the root mean squared error of the fitted mixture\nGoodnessOfFit <- function(fit.em, mixturemodel=\"L2N\") {\n x <- sort(fit.em$x)\n if(length(x) > 10000)\n x <- x[seq(1,length(x), length=10000)]\n diffs <- x[-1] - x[-length(x)]\n dnsfn <- approxfun(density(x,bw=\"SJ\"))\n return(sqrt(sum((diffs* (dnsfn(x[-1])-mixtureDensityL2N(fit.em,x[-1])) ^2))))\n}\n\n# create clusters of nodes, based on similarity of their edges\nclustNode <- function(A) {\n degs <- rowSums(A)\n deg1copy <- degs\n ord <- rev(order(degs))\n while(max(deg1copy) >= 0) {\n maxdeg <- which.max(deg1copy)\n nbrs <- which(A[maxdeg,] == 1)\n nbrs <- setdiff(nbrs, which(deg1copy < 0))\n deg1copy[maxdeg] <- -length(which(deg1copy < 0)) -1\n if (length(nbrs) == 0)\n next\n extdeg <- rep(0, length(nbrs))\n for (i in 1:length(nbrs)) {\n nbr <- nbrs[i]\n extdeg[i] <- length(setdiff(which(A[nbr,] == 1), nbrs)) + # external links\n length(setdiff(nbrs, which(A[nbr,] == 0))) # missing internal links\n }\n deg1copy[nbrs[rev(order(extdeg[nbrs]))]] <- -length(which(deg1copy < 0)) - 1:length(nbrs)\n }\n deg1copy\n}\n\n# calculate the clustering coefficient of a node\nclusteringCoef <- function(A) {\n rsum <- rowSums(A)\n cc <- rep(0,nrow(A))\n for (i in 1:nrow(A)) {\n if (rsum[i] <= 1)\n cc[i] <- 0\n else {\n nbrs <- which(A[i,] == 1)\n At <- A[nbrs, nbrs]\n cc[i] <- 0.5*sum(At)/choose(rsum[i],2)\n }\n }\n cc\n}\n\n#' Plot the histogram of the data and the fitted mixture distribution.\n#'\n#' The function is called by the edgefinder function.\n#' @param fit.em The object (list) returned from the EM function with the parameter estimates for the L2N model.\n#' @param gof The root mean-squared error of the fitted model (to appear in the title of the plot).\n#' @param ttl The title of the plot (default=\"\").\n#' @param trim The proportion of extreme values on both sides of the distribution to eliminate from the plot (default=0.01.) 
This can be useful if a small number of values are so extreme, that the plot shows mostly the tails and a spike in the middle.\n#' @export\nplotMixture <- function(fit.em, gof, ttl=\"\", xlab=\"x\", trim=0.01,\n mixturemodel=\"L2N\") {\n xlim <- quantile(fit.em$x, c(trim/2, 1-trim/2))\n brks <- min(80,floor(length(fit.em$x)/100))\n hist(fit.em$x, freq=FALSE, breaks=brks,\n main=sprintf(\"%s\\nrMSE %2.2f\",ttl, gof),\n xlim=xlim,xlab=xlab, border=\"white\", col=\"wheat\")\n xs <- seq(min(fit.em$x), max(fit.em$x), length=1000)\n p0 <- mean(fit.em$b0)\n p1 <- mean(fit.em$b1)\n p2 <- mean(fit.em$b2)\n lines(xs, p0*dnorm(xs, fit.em$theta, fit.em$tau), col=2, lwd=2)\n lines(xs, p1*dlnorm(xs, fit.em$mu1, fit.em$s1), col=3, lwd=2)\n lines(-xs, p2*dlnorm(xs, fit.em$mu2, fit.em$s2), col=3, lwd=2)\n mxfit <- p0*dnorm(xs,fit.em$theta, fit.em$tau) +\n p1*dlnorm(xs, fit.em$mu1, fit.em$s1) +\n p2*dlnorm(-xs, fit.em$mu2, fit.em$s2)\n lines(xs, mxfit, lwd=3, col=4, lty=2)\n}\n\n#' Plot the degree of nodes versus the degree times the clustering coefficients.\n#'\n#' The x-axis represents the number of neighbors of each node, and the y-axis represents the proportion of neighbors which are connected to each other.\n#' @param edgefinderobj The object (list) returned by edgefinder.\n#' @export\n#' @examples\n#' \\donttest{\n#' data(WT)\n#' WTres <- edgefinder(WT, ttl = \"Wild Type\")\n#' plotDegCC(WTres)\n#' }\n\nplotDegCC <- function(edgefinderobj) {\n sigW <- sort(union(edgefinderobj$rt,edgefinderobj$lt))\n G <- edgefinderobj$G\n tmpmat <- Matrix(0,G, G)\n vec <- rep(0, choose(G,2))\n vec[sigW] <- 1\n tmpmat[upper.tri(tmpmat)] = vec\n A0 <- tmpmat+t.data.frame(tmpmat)\n cc0 <- clusteringCoef(as.matrix(A0))\n deg0 <- rowSums(as.matrix(A0))\n lm0 <- lm(sqrt(deg0*cc0) ~ sqrt(deg0))\n M <- max(deg0)\n plot(deg0, deg0*cc0,axes=F,xlim=c(0,M),\n ylim=c(0,M),main=\"\",\n xlab=bquote(\"degree\"),ylab=bquote(\"CC*degree\"),\n col=\"thistle\",pch=24,cex=0.5); axis(1); axis(2)\n grid(); 
abline(0,1,col=\"seagreen1\", lwd=2)\n sq <- seq(0,M,length=length(deg0))\n newdat = data.frame(deg0 = sq)\n pred = predict.lm(lm0, newdata=newdat)\n lines(sq, pred^2, col=\"orange\",lwd=3,lty=2)\n}\n\n#' Edge-indicator bitmap plot.\n#'\n#' Plot a bitmap in which a black dot corresponds to a pair of highly correlated genes (an edge in the graph).\n#' The default is to show the nodes according to their order in the input.\n#' By setting orderByDegree=T as below, it is possible to change the order and cluster them, and show them in increasing degree order (from left to right.)\n#' @param edgefinderobj The object (list) returned by edgefinder.\n#' @export\n#' @examples\n#' \\donttest{\n#' data(WT)\n#' WTres <- edgefinder(WT, ttl = \"Wild Type\")\n#' plotBitmapCC(WTres)\n#' WTres$orderByDegree=T\n#' plotBitmapCC(WTres)\n#' }\nplotBitmapCC <- function(edgefinderobj) {\n with(edgefinderobj,{\n sigW <- sort(union(rt,lt))\n tmpmat <- Matrix(0,G, G)\n vec <- rep(0, choose(G,2))\n vec[sigW] <- 1\n tmpmat[upper.tri(tmpmat)] = vec\n A0 <- tmpmat+t.data.frame(tmpmat)\n deg0copy <- clustNode(as.matrix(A0))\n if (\"orderByDegree\" %in% ls()) {\n if (orderByDegree)\n image(A0[order(deg0copy), order(deg0copy)])\n else\n image(A0)\n } else {\n image(A0)\n }\n })\n}\n", + "created" : 1536637894659.000, + "dirty" : false, + "encoding" : "UTF-8", + "folds" : "", + "hash" : "329483066", + "id" : "87D35FB", + "lastKnownWriteTime" : 1536432023, + "last_content_update" : 1536432023, + "path" : "~/Box/CorNetwork_Project/edgefinder/R/edgefinder.R", + "project_path" : "R/edgefinder.R", + "properties" : { + }, + "relative_order" : 1, + "source_on_save" : false, + "source_window" : "", + "type" : "r_source" +} \ No newline at end of file diff --git a/.Rproj.user/D1CCA32/sdb/prop/INDEX b/.Rproj.user/D1CCA32/sdb/prop/INDEX new file mode 100644 index 0000000..47da102 --- /dev/null +++ b/.Rproj.user/D1CCA32/sdb/prop/INDEX @@ -0,0 +1 @@ 
+~%2FBox%2FCorNetwork_Project%2Fedgefinder%2FR%2Fedgefinder.R="36689D63" diff --git a/.Rproj.user/D1CCA32/session-persistent-state b/.Rproj.user/D1CCA32/session-persistent-state new file mode 100644 index 0000000..4027b02 --- /dev/null +++ b/.Rproj.user/D1CCA32/session-persistent-state @@ -0,0 +1 @@ +virtual-session-id="6B3319CC" diff --git a/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D0779B66C/chunks.json b/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D0779B66C/chunks.json new file mode 100644 index 0000000..ef65265 --- /dev/null +++ b/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D0779B66C/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1596066727} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D918F8619/chunks.json b/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D918F8619/chunks.json new file mode 100644 index 0000000..3ff1359 --- /dev/null +++ b/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D918F8619/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1596070501} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D97C01DAA/chunks.json b/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D97C01DAA/chunks.json new file mode 100644 index 0000000..c5038f1 --- /dev/null +++ b/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/43F9182D97C01DAA/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1596069949} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/s/chunks.json b/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/s/chunks.json new file mode 100644 index 0000000..3ff1359 --- /dev/null +++ b/.Rproj.user/shared/notebooks/B27BBBD6-edgefinder/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1596070501} \ No newline at end of file diff --git 
a/.Rproj.user/shared/notebooks/D63BAF88-edgefinder/1/s/chunks.json b/.Rproj.user/shared/notebooks/D63BAF88-edgefinder/1/s/chunks.json new file mode 100644 index 0000000..1d094ee --- /dev/null +++ b/.Rproj.user/shared/notebooks/D63BAF88-edgefinder/1/s/chunks.json @@ -0,0 +1 @@ +{"chunk_definitions":[],"doc_write_time":1596069054} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/patch-chunk-names b/.Rproj.user/shared/notebooks/patch-chunk-names new file mode 100644 index 0000000..e69de29 diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths new file mode 100644 index 0000000..5c8576c --- /dev/null +++ b/.Rproj.user/shared/notebooks/paths @@ -0,0 +1,8 @@ +/Users/haim/Dropbox/Packages/edgefinder/.Rbuildignore="28F9C71D" +/Users/haim/Dropbox/Packages/edgefinder/.gitignore="48AFABFC" +/Users/haim/Dropbox/Packages/edgefinder/DESCRIPTION="3517FB69" +/Users/haim/Dropbox/Packages/edgefinder/NAMESPACE="5E7BF351" +/Users/haim/Dropbox/Packages/edgefinder/R/edgefinder.R="3FE41109" +/Users/haim/Dropbox/Packages/edgefinder/doc/edgefinder.Rmd="D63BAF88" +/Users/haim/Dropbox/Packages/edgefinder/vignettes/edgefinder.Rmd="B27BBBD6" +/Users/haim/Dropbox/Packages/edgefinder/vignettes/edgefinder.md="89E0A151" diff --git a/.Rproj.user/shared/notebooks/paths (CLAS Mac's conflicted copy 2020-07-29) b/.Rproj.user/shared/notebooks/paths (CLAS Mac's conflicted copy 2020-07-29) new file mode 100644 index 0000000..bda2f19 --- /dev/null +++ b/.Rproj.user/shared/notebooks/paths (CLAS Mac's conflicted copy 2020-07-29) @@ -0,0 +1 @@ +/Users/haim/Dropbox/Packages/edgefinder/R/edgefinder.R="D3ED7F56" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9acb1ef --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +doc +Meta +^.*\.Rproj$ +^\.Rproj\.user$ +.DS_Store +.Rhistory +.git diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..c13a239 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,14 @@ +Package: edgefinder +Type: 
Package +Title: Detect Edges in Sparse Co-expression Graphs +Version: 0.1.5 +Author: Haim Bar and Seojin Bang +Maintainer: Haim Bar +Description: Finding edges in co-expression graphs, based on "A Mixture Model to Detect Edges in Sparse Co-expression Graphs", Haim Bar and Seojin Bang. See more details in the vignettes. +License: GPL-2 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 7.1.1 +Suggests: knitr, rmarkdown +VignetteBuilder: knitr +Depends: R (>= 3.4.0), Matrix diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..37654c6 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,17 @@ +# Generated by roxygen2: do not edit by hand + +export(clusteringCoef) +export(collapsedGraph) +export(edgefinder) +export(graphComponents) +export(plotBitmapCC) +export(plotCluster) +export(plotDegCC) +export(plotMixture) +export(shortSummary) +export(shortestPathDistance) +export(summarizeClusters) +import(graphics) +import(stats) +importFrom(Matrix,Matrix) +importFrom(grDevices,rgb) diff --git a/R/edgefinder.R b/R/edgefinder.R new file mode 100644 index 0000000..fe1c191 --- /dev/null +++ b/R/edgefinder.R @@ -0,0 +1,704 @@ + +#' Detect edges in co-expression datasets. +#' +#' Fit the L2N model to normalized correlation coefficients between pairs of +#' genes. The mixture model has three components - the null component follows +#' a normal distribution, and the two non-null components follow lognormal +#' distributions. An edge is in the graph if the correlation between the two +#' end-point genes is large enough and determined to be in one of the non-null +#' components. +#' @param Exprs A numeric matrix with normalized gene expression data. Rows +#' correspond to genes, and columns correspond to samples. +#' @param BHthr the Benjamini-Hochberg false discovery rate threshold to be +#' used to determine which pairs are strongly correlated. Default=0.01. +#' @param rndseed The random seed used to select a subset of the pairs. 
#' @param maxLen The maximum number of pairs that will be randomly selected
#' to fit the L2N model. Default=20000.
#' @param LOvals the maximum log-odds ratio to be used to determine the
#' cut-off points to declare which correlations are significant.
#' The program will check which log-odds ratio (1,2,...,LOvals) results in
#' FDR less than or equal to the user-specified BHthr. Default=30.
#' @param ttl Title for the fitted-model plot. Default=""
#' @param trim Fraction of extreme values to exclude from the fitted-model
#' plot. Default=0 (show all the data).
#' @return A list with the following elements
#' \itemize{
#' \item{G} {The total number of genes.}
#' \item{p1} {The proportion of genes in the right mixture component (positively correlated.)}
#' \item{p2} {The proportion of genes in the left mixture component (negatively correlated.)}
#' \item{p0} {The proportion of genes in the null component (un-correlated.)}
#' \item{m0, m1, m2, s0, s1, s2} {The location and scale parameters of the three mixture components.}
#' \item {thrtable} {A table with 6 columns: posterior probability ratio (ppr) between the non-null components and the null component, the right component cutoff corresponding to the ppr, the left component cutoff, the estimated probability of Type-I errors, the estimated power, the estimated FDR.}
#' \item {LogOddsRatio} {The log-odds ratio that yields FDR closest to the desired level.}
#' \item {fitted} {The fitted model (as returned by the EM function).}
#' \item {rmse} {The root mean-squared error of the fitted model.}
#' \item {rt, lt} {The significant edges (from the right and left mixture components.)}
#' \item {AdjMat} {The (sparse) adjacency matrix with edges corresponding to rt, lt.}
#' }
#' @export
#' @import stats
#' @importFrom Matrix Matrix
#' @importFrom grDevices rgb
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' }
edgefinder <- function(Exprs, BHthr = 0.01, rndseed = 112211,
                       maxLen = 20000, LOvals = 30, ttl = "", trim = 0) {
  # The Fisher-z values are scaled by sqrt(N - 3) below, so at least four
  # samples (columns) are needed for a positive, finite scale.
  stopifnot(ncol(Exprs) > 3)
  corM <- cor(t(Exprs), use = "pairwise.complete.obs")
  N <- ncol(Exprs)
  # Fisher z-transform of the pairwise correlations (upper triangle, taken in
  # column-major order; the adjacency matrix below is filled in the same order).
  y <- atanh(corM[upper.tri(corM)])

  # Seed the RNG before ANY random draw. The subset is drawn first so that it
  # is the same subset of indices the seed has always produced; the jitter
  # applied to infinite values below is now reproducible as well.
  set.seed(rndseed)
  sset <- sample(seq_along(y), size = min(maxLen, length(y)))

  # Correlations of exactly +/-1 map to +/-Inf. Replace them by a finite value
  # just beyond the largest finite |z|, keeping the sign so that perfectly
  # negatively correlated pairs stay on the negative side.
  fix <- which(is.infinite(y))
  if (length(fix) > 0) {
    y[fix] <- sign(y[fix]) * max(abs(y[-fix]), na.rm = TRUE) *
      (1 + runif(length(fix)))
  }

  y0 <- y[sset]
  fittedL2N <- EM(y0 * sqrt(N - 3))
  rmseL2N <- GoodnessOfFit(fittedL2N)
  plotMixture(fittedL2N, gof = rmseL2N, trim = trim, ttl = ttl)
  cat("Calculating the posterior density...\n")
  # Posterior component probabilities for ALL pairs (not just the subset).
  B <- posteriorDensityL2N(fittedL2N, y * sqrt(N - 3))
  p1L2N <- mean(fittedL2N$b1)
  p2L2N <- mean(fittedL2N$b2)
  p0L2N <- 1 - (p1L2N + p2L2N)
  m0L2N <- fittedL2N$theta
  m1L2N <- fittedL2N$mu1
  m2L2N <- fittedL2N$mu2
  s0L2N <- fittedL2N$tau
  s1L2N <- fittedL2N$s1
  s2L2N <- fittedL2N$s2

  cat("Calculating the log-odds...\n")
  ret <- logoddsValues(fittedL2N$x, m0L2N, s0L2N, m1L2N, s1L2N,
                       m2L2N, s2L2N, p1L2N, p2L2N, vals = 1:LOvals)
  # Smallest ratio (but at least 2) whose estimated FDR (column 6) is below
  # BHthr; fall back to the largest ratio if none qualifies. NaN entries in
  # the FDR column are ignored.
  belowThr <- which(ret[, 6] < BHthr)
  if (length(belowThr) > 0) {
    LogOddsRatio <- max(min(belowThr), 2)
  } else {
    LogOddsRatio <- LOvals
  }
  # B is list(b0, b1, b2): compare each non-null posterior with the null one.
  RtBFL2N <- which(B[[2]] / B[[1]] > LogOddsRatio)
  LtBFL2N <- which(B[[3]] / B[[1]] > LogOddsRatio)

  cat("Calculating the adjacency matrix...\n")
  G <- nrow(Exprs)
  sigW <- sort(union(RtBFL2N, LtBFL2N))
  tmpmat <- Matrix::Matrix(0, G, G)
  vec <- rep(0, choose(G, 2))
  vec[sigW] <- 1
  tmpmat[upper.tri(tmpmat)] <- vec
  AdjMat <- tmpmat + Matrix::t(tmpmat)

  list(G = G, p1 = p1L2N, p2 = p2L2N, p0 = p0L2N, m0 = m0L2N, m1 = m1L2N,
       m2 = m2L2N, s0 = s0L2N, s1 = s1L2N, s2 = s2L2N, thrtable = ret,
       LogOddsRatio = LogOddsRatio, fitted = fittedL2N, rmse = rmseL2N,
       rt = RtBFL2N, lt = LtBFL2N, AdjMat = AdjMat)
}


#' Print a short summary of the fitted mixture model.
#'
#' Show the number of nodes, the number of possible and detected edges, the estimated proportion of positively/negatively correlated pairs, and the estimated false discovery rate.
#' @param edgefinderobj The object (list) returned from the edgefinder function.
#' @export
#' @examples
#' \donttest{
#' data(WT)
#' WTres <- edgefinder(WT, ttl = "Wild Type")
#' shortSummary(WTres)
#' }
shortSummary <- function(edgefinderobj) {
  # Small formatters: thousands separators for counts, 3 significant digits
  # for proportions and rates.
  asCount <- function(v) prettyNum(v, big.mark = ",")
  asRate <- function(v) format(v, digits = 3)

  nNodes <- edgefinderobj[["G"]]
  nDetected <- length(union(edgefinderobj[["lt"]], edgefinderobj[["rt"]]))
  # Row of the threshold table that was selected; column 6 holds the FDR.
  chosenRow <- edgefinderobj[["LogOddsRatio"]]
  estFDR <- edgefinderobj[["thrtable"]][chosenRow, 6]

  cat("No. nodes =", asCount(nNodes), "\n")
  cat("Max no. edges =", asCount(choose(nNodes, 2)), "\n")
  cat("No. edges detected =", asCount(nDetected), "\n")
  cat("p1 =", asRate(edgefinderobj[["p1"]]), "\n")
  cat("p2 =", asRate(edgefinderobj[["p2"]]), "\n")
  cat("Est. FDR <=", asRate(estFDR), "\n")
}

# The EM algorithm to fit the L2N model.
#
# Fit the L2N model to normalized correlation coefficients between pairs of genes. The mixture model has three components - the null component follows a normal distribution, and the two non-null components follow lognormal distributions. An edge is in the graph if the correlation between the two end-point genes is large enough and determined to be in one of the non-null components.
# @param x A vector of normalized correlation coefficients.
# @param max.it The maximum number of EM algorithm iterations (default=1000).
# @param tol The tolerance level to assess convergence of the EM algorithm (default=1e-12.)
# @return A list of the parameter estimates for the L2N model.
EM <- function(x, max.it = 1000, tol = 1e-12) {
  # Missing values would silently turn every estimate into NA and then fail
  # in the convergence test; reject them up front with a clear error.
  stopifnot(is.numeric(x), length(x) > 0, !anyNA(x))
  N <- length(x)
  err <- 1
  # initialize the parameter values
  adjustMean <- mean(x) # centering the data around the mean
  x <- x - adjustMean
  # The parameters of the null distribution, N(theta,tau) :
  theta <- mean(x)
  tau <- 1
  # The location and scale parameters of the nonnull components:
  mu <- unname(abs(quantile(x, c(0.05, 0.95))))
  sig <- c(1, 1)
  # The initial probabilities of the three components:
  p0 <- 0.98
  p1 <- 0.01
  p2 <- 0.01
  ct <- 0
  # Iterate until the mixture density stops changing between the start and
  # the end of an iteration (total squared change <= tol), or until the
  # iteration counter exceeds max.it.
  while (err > tol) {
    adjustMean <- adjustMean + theta
    x <- x - theta # iteratively center the data, so that the mean of the
                   # null component ends up being 0
    # Logical masks rather than which(): with an index vector, b[-pos] <- 0
    # zeroes nothing when pos is empty, leaving stale posteriors behind.
    pos <- x > 0 # Fit the nonnull components according to the
    neg <- x < 0 # sign of x

    d0 <- dnorm(x, theta, tau) # null component is normal
    d1 <- dlnorm(x, mu[1], sig[1])
    d2 <- dlnorm(-x, mu[2], sig[2])
    wtsm <- p0 * d0 + p1 * d1 + p2 * d2 # The density of the mixture
    b1 <- numeric(N)
    b2 <- numeric(N)
    b1[pos] <- pmin(1, p1 * d1[pos] / wtsm[pos]) # posterior, positive nonnull
    b2[neg] <- pmin(1, p2 * d2[neg] / wtsm[neg]) # posterior, negative nonnull
    b0 <- 1 - (b1 + b2) # The posterior null probabilities
    # Update the component weights:
    p0 <- mean(b0)
    p1 <- mean(b1)
    p2 <- mean(b2)
    # Update the null component parameters:
    theta <- sum(b0 * x) / sum(b0)
    tau <- sqrt(sum(b0 * (x - theta)^2) / sum(b0))
    d0 <- dnorm(x, theta, tau)
    # Update the nonnull components' parameters. A component whose total
    # posterior mass is negligible is switched off (zero density).
    if (sum(b1[pos]) < 1e-2) {
      mu[1] <- 0
      sig[1] <- 0
      d1 <- rep(0, N)
    } else {
      mu[1] <- sum(b1[pos] * log(x[pos])) / sum(b1[pos])
      sig[1] <- sqrt(sum(b1[pos] * (log(x[pos]) - mu[1])^2) / sum(b1[pos]))
      d1 <- dlnorm(x, mu[1], sig[1])
    }

    if (sum(b2[neg]) < 1e-2) {
      mu[2] <- 0
      sig[2] <- 0
      d2 <- rep(0, N)
    } else {
      mu[2] <- sum(b2[neg] * log(-x[neg])) / sum(b2[neg])
      sig[2] <- sqrt(sum(b2[neg] * (log(-x[neg]) - mu[2])^2) / sum(b2[neg]))
      d2 <- dlnorm(-x, mu[2], sig[2])
    }

    # Check convergence
    err <- sum((p0 * d0 + p1 * d1 + p2 * d2 - wtsm)^2)
    ct <- ct + 1
    if (ct > max.it) {
      break
    }
  }
  # Final posteriors, using the updated component densities.
  b1 <- numeric(N)
  b2 <- numeric(N)
  b1[pos] <- pmin(1, p1 * d1[pos] / wtsm[pos]) # posterior, positive nonnull
  b2[neg] <- pmin(1, p2 * d2[neg] / wtsm[neg]) # posterior, negative nonnull
  b0 <- 1 - (b1 + b2) # The posterior null probabilities
  # Two-sided p-values under the fitted null. The upper tail is requested
  # directly: 1 - pnorm(z) rounds to exactly 0 for large z.
  pvals <- 2 * pnorm(abs(x), mean = 0, sd = tau, lower.tail = FALSE)
  bh <- p.adjust(pvals, method = "BH")
  list(x = x, adjustMean = adjustMean,
       theta = theta, tau = tau,
       mu1 = mu[1], s1 = sig[1],
       mu2 = mu[2], s2 = sig[2],
       b0 = b0, b1 = b1, b2 = b2,
       p.val = pvals, bh = bh,
       err = err, its = ct)
}


# Calculate the log-odds ratios to determine for each gene, in which
# of the three components in the L2N model, it belongs.
#
# For every value in `vals` find the right/left cut-offs at which the ratio of
# the weighted non-null density to the weighted null density equals that
# value, and the Type-I error, power and FDR implied by those cut-offs.
# @param y The (centered) data; only its range and length are used.
# @param theta,tau Mean and standard deviation of the null (normal) component.
# @param mu1,s1,mu2,s2 Log-scale location and scale of the right/left
#   (lognormal) components.
# @param p1,p2 Weights of the right/left components.
# @param vals The ratios to evaluate.
# @return A matrix with one row per ratio and columns
#   ppr, Right, Left, TypeI, Power, FDR.
logoddsValues <- function(y, theta, tau, mu1, s1, mu2, s2, p1, p2, vals = 1:30) {
  ret <- matrix(0, nrow = length(vals), ncol = 6)
  ret[, 1] <- vals
  p0 <- 1 - p1 - p2
  for (i in seq_along(vals)) {
    val <- vals[i]
    # A component expected to hold less than one observation gets an
    # unreachable cut-off; so does a ratio for which no root can be found.
    if (p1 < 1 / length(y)) {
      ret[i, 2] <- Inf
    } else {
      f <- function(x) {
        log((p1 * dlnorm(x, mu1, s1)) / (p0 * dnorm(x, theta, tau))) - log(val)
      }
      rt <- try(uniroot(f, lower = 1e-6, upper = max(y)), silent = TRUE)
      ret[i, 2] <- if (inherits(rt, "try-error")) Inf else rt$root
    }
    if (p2 < 1 / length(y)) {
      ret[i, 3] <- -Inf
    } else {
      f <- function(x) {
        log((p2 * dlnorm(-x, mu2, s2)) / (p0 * dnorm(x, theta, tau))) - log(val)
      }
      rt <- try(uniroot(f, lower = min(y), upper = -1e-6), silent = TRUE)
      ret[i, 3] <- if (inherits(rt, "try-error")) -Inf else rt$root
    }
    # type I:
    ret[i, 4] <- pnorm(ret[i, 3], theta, tau) +
      1 - pnorm(ret[i, 2], theta, tau)
    # "Power":
    ret[i, 5] <- (p1 * (1 - plnorm(ret[i, 2], mu1, s1)) +
                    p2 * (1 - plnorm(-ret[i, 3], mu2, s2))) / (p1 + p2)
    # FDR (NaN when both the Type-I error and the power are 0):
    ret[i, 6] <- p0 * ret[i, 4] / (p0 * ret[i, 4] + ret[i, 5] * (p1 + p2))
  }
  colnames(ret) <- c("ppr", "Right", "Left", "TypeI", "Power", "FDR")
  ret
}


# calculate the posterior L2N mixture model density of x, given the parameter
# estimates
# @param fit.em The list returned by EM.
# @param x New data on the same scale as the data passed to EM; it is shifted
#   by fit.em$adjustMean, the total shift EM applied to its own input.
# @return list(b0, b1, b2): posterior probabilities of the null, positive and
#   negative components for every element of x.
posteriorDensityL2N <- function(fit.em, x) {
  p0 <- mean(fit.em$b0)
  p1 <- mean(fit.em$b1)
  p2 <- mean(fit.em$b2)
  x <- x - fit.em$adjustMean
  pos <- which(x > 0) # Fit the nonnull components according to the
  neg <- which(x < 0) # sign of x
  d0 <- dnorm(x, fit.em$theta, fit.em$tau) # null component is normal
  d1 <- dlnorm(x, fit.em$mu1, fit.em$s1)
  d2 <- dlnorm(-x, fit.em$mu2, fit.em$s2)
  wtsm <- p0 * d0 + p1 * d1 + p2 * d2
  b1 <- rep(0, length(x))
  b2 <- rep(0, length(x))
  b1[pos] <- pmin(1, p1 * d1[pos] / wtsm[pos]) # posterior, positive nonnull
  b2[neg] <- pmin(1, p2 * d2[neg] / wtsm[neg]) # posterior, negative nonnull
  b0 <- 1 - (b1 + b2) # The posterior null probabilities
  list(b0 = b0, b1 = b1, b2 = b2)
}


# Return the estimated density function of the mixture, evaluated at x.
# The component weights are the means of the posterior probabilities.
mixtureDensityL2N <- function(fit.em, x) {
  mean(fit.em$b0) * dnorm(x, fit.em$theta, fit.em$tau) +
    mean(fit.em$b1) * dlnorm(x, fit.em$mu1, fit.em$s1) +
    mean(fit.em$b2) * dlnorm(-x, fit.em$mu2, fit.em$s2)
}


# Calculate the root mean squared error of the fitted mixture: the square root
# of the integrated squared difference between a kernel density estimate of
# the data and the fitted mixture density, on a grid of at most 10000 sorted
# data points. `mixturemodel` is not used by the body; it is kept so existing
# calls keep working.
GoodnessOfFit <- function(fit.em, mixturemodel = "L2N") {
  x <- sort(fit.em$x)
  if (length(x) > 10000) {
    x <- x[seq(1, length(x), length.out = 10000)]
  }
  diffs <- x[-1] - x[-length(x)]
  dnsfn <- approxfun(density(x, bw = "SJ"))
  sqrt(sum(diffs * (dnsfn(x[-1]) - mixtureDensityL2N(fit.em, x[-1]))^2))
}


#' Find clusters, and return node characteristics.
#'
#' Take an adjacency Matrix as input and find clusters. For each node, find the degree and clustering coefficient (CC).
#' Then, calculate a centrality measure (type\*CC+1)\*deg. For type=0, it's just the degree. Note that by setting type=1 we assign a higher value to nodes that not only have many neighbors, but whose neighbors are highly interconnected. For example, suppose we have two components with k nodes, one has a star shape, and the other is a complete graph. With type=0 both graphs will get the same value, but with type=1 the complete graph will be picked by the algorithm first.
#' @param A An adjacency Matrix(0/1).
#' @param minCtr The minimum centrality value to be considered for a cluster center (default=5). A candidate cluster is also kept only if it has more than minCtr nodes.
#' @param type Determines how the centrality measure is computed.
#' @return A data frame with the following columns
#' \itemize{
#' \item{labels} {Node label (e.g. gene names).}
#' \item{degree} {Node degree.}
#' \item{cc} {Node clustering coefficient.}
#' \item{ctr} {Node centrality measure: (type\*CC+1)\*deg.}
#' \item{clustNo} {Cluster number.}
#' \item {iscenter} {1 for the node was chosen as the cluster's center, 0 otherwise.}
#' \item {intEdges} {Number of edges from the node to nodes in the same cluster.}
#' \item {extEdges} {Number of edges from the node to nodes NOT in the same cluster.}
#' \item {distCenter} {Standardized Manhattan distance to the central node.}
#' }
#' @export
#' @examples
#' \donttest{
#' data(SIM)
#' Sres <- edgefinder(SIM, ttl = "hub network")
#' SimComp <- graphComponents(Sres$AdjMat)
#' head(SimComp)
#' }
graphComponents <- function(A, minCtr = 5, type = 1) {
  # The previous check, stopifnot(grep("Matrix", class(A)) > 0), could never
  # fail: grep() returns integer(0) on no match and stopifnot() accepts a
  # zero-length condition. Base matrices therefore always got through and
  # remain accepted; anything else is now rejected explicitly.
  stopifnot(inherits(A, "Matrix") || is.matrix(A))
  Vn <- ncol(A)
  labels <- if (is.null(rownames(A))) seq_len(Vn) else rownames(A)
  deg <- Matrix::rowSums(A)
  CC <- clusteringCoef(A)
  ctrs <- (type * CC + 1) * deg
  clustersInfo <- data.frame(labels = labels, degree = deg, cc = CC, ctr = ctrs,
                             clustNo = rep(0, Vn), iscenter = rep(0, Vn),
                             intEdges = rep(0, Vn), extEdges = rep(0, Vn),
                             distCenter = rep(0, Vn))
  clustNo <- 1
  # Isolated nodes are never clustered; mark them as processed from the start.
  clustered <- which(deg < 1)
  while (length(clustered) < Vn) {
    notInCluster <- setdiff(seq_len(Vn), clustered)
    # Stop once no remaining node is central enough to seed a cluster.
    if (max(ctrs[notInCluster]) < minCtr) {
      return(clustersInfo)
    }
    ctrnode <- notInCluster[which.max(ctrs[notInCluster])]
    ctrRow <- A[ctrnode, ]
    # candidate cluster: the center plus its not-yet-processed neighbors
    nbrs <- setdiff(sort(c(ctrnode, which(ctrRow != 0))), clustered)
    if (length(nbrs) > 1 && length(nbrs) > minCtr) {
      clustersInfo$iscenter[ctrnode] <- 1
      clustersInfo$clustNo[union(ctrnode, nbrs)] <- clustNo
      # drop = FALSE keeps a two-dimensional object even when a single column
      # (or none) is left outside the cluster, which rowSums() requires.
      clustersInfo$intEdges[nbrs] <-
        Matrix::rowSums(A[nbrs, nbrs, drop = FALSE])
      clustersInfo$extEdges[nbrs] <-
        Matrix::rowSums(A[nbrs, -nbrs, drop = FALSE])
      # Fraction of positions at which a member's row differs from the
      # center's row.
      clustersInfo$distCenter[nbrs] <- vapply(
        nbrs,
        function(v) mean(xor(ctrRow, A[v, ])),
        numeric(1)
      )
      clustNo <- clustNo + 1
    } else {
      # Too small to form a cluster: only the center is marked as processed.
      nbrs <- c()
    }
    clustered <- union(clustered, c(nbrs, ctrnode))
  }
  clustersInfo
}



#' Show cluster characteristics.
#'
#' Takes an object obtained from graphComponents and prints and returns summary statistics.
#' @param clustersInfo Obtained from graphComponents.
#' @return A matrix with cluster number, number of nodes, and fivenum summaries for the degrees of nodes in the cluster, and the percentage of edges that are within the cluster.
+#' @export
+#' @examples
+#' \donttest{
+#'    data(WT)
+#'    WTres <- edgefinder(WT, ttl = "Wild Type")
+#'    WTComp <- graphComponents(WTres$AdjMat)
+#'    (summtab <- summarizeClusters(WTComp))
+#' }
+summarizeClusters <- function(clustersInfo) {
+  cat("Num of nodes:", nrow(clustersInfo),"\n")
+  cat("Num of edges:", sum(clustersInfo$degree)/2,"\n")
+  cat("Num of clusters:", max(clustersInfo$clustNo),"\n")
+  cat("Num of unclustered nodes:", length(which(clustersInfo$clustNo == 0)),"\n")
+  percentInCluster <- clustersInfo$intEdges/clustersInfo$degree  # fraction (0-1) of a node's edges that stay inside its cluster
+  percentInCluster[which(clustersInfo$degree == 0)] <- 0  # 0/0 for nodes without edges
+  tab <- matrix(0,nrow=max(clustersInfo$clustNo),ncol=12)
+  for (cnum in seq_len(max(clustersInfo$clustNo))) {  # seq_len: no iteration (and an empty table) when there are no clusters
+    tmpclusterInfo <- clustersInfo[which(clustersInfo$clustNo == cnum),]
+    tab[cnum,] <- c(cnum,nrow(tmpclusterInfo), fivenum(tmpclusterInfo$degree),
+                    fivenum(percentInCluster[which(clustersInfo$clustNo == cnum)]))
+  }
+  colnames(tab) <- c("Cluster","Nodes","degreeMin","degreeQ25","degreeMedian",
+                     "degreeQ75","degreeMax","pctInClstMin","pctInClstQ25",
+                     "pctInClstMedian", "pctInClstQ75","pctInClstMax")
+  tab
+}
+
+
+#' Return an adjacency matrix after collapsing clusters into their central nodes.
+#'
+#' Takes an adjacency Matrix and an object obtained from graphComponents, and merges the nodes of each cluster into a single node.
+#' @param A An adjacency Matrix.
+#' @param clustersInfo Obtained from graphComponents
+#' @return A weighted adjacency matrix between clusters and unclustered nodes.
+#' @export
+#' @examples
+#' \donttest{
+#'    data(WT)
+#'    WTres <- edgefinder(WT, ttl = "Wild Type")
+#'    rownames(WTres$AdjMat) = rownames(WT)
+#'    WTComp <- graphComponents(WTres$AdjMat)
+#'    Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0
+#'    plotBitmapCC(Adj1,showMinDegree = 2)
+#' }
+collapsedGraph <- function(A, clustersInfo) {
+  nClust <- max(clustersInfo$clustNo)
+  notInCluster <- which(clustersInfo$clustNo == 0)
+  nFree <- length(notInCluster)
+  collA <- Matrix::Matrix(0, ncol=nFree+nClust, nrow=nFree+nClust)
+  if (nFree > 0)  # 1:0 would index backwards when every node is clustered
+    collA[seq_len(nFree), seq_len(nFree)] <- A[notInCluster, notInCluster, drop=FALSE]>0
+  if (length(rownames(A)) != nrow(A)) {
+    rownames(A) <- seq_len(nrow(A))
+  }
+  rownames(collA) <- c(rownames(A)[notInCluster],  # paste0("CLS", integer(0)) would yield "CLS", hence the guard
+                       if (nClust > 0) paste0("CLS", seq_len(nClust)))
+  for (i in seq_len(nClust)) {
+    Ci <- which(clustersInfo$clustNo == i)
+    if (nFree > 0)  # drop=FALSE keeps a matrix when a single node (or cluster member) is selected
+      collA[i+nFree, seq_len(nFree)] <- Matrix::rowSums(A[notInCluster, Ci, drop=FALSE])
+    for (j in seq_len(nClust - i) + i) {  # clusters after i; empty for the last cluster
+      Cj <- which(clustersInfo$clustNo == j)
+      collA[i+nFree, j+nFree] <- sum(A[Ci,Cj])
+    }
+  }
+  # NOTE(review): the unclustered block is filled on both triangles, so the sum below gives those edges weight 2 while cluster counts are stored once - confirm this is intended.
+  collA + Matrix::t(collA)
+}
+
+
+#' Calculate the clustering coefficient of each node.
+#'
+#' @param A an adjacency Matrix (0/1).
+#' @return A vector with the clustering coefficient of each node.
+#' @export
+#' @examples
+#' \donttest{
+#'    data(WT)
+#'    WTres <- edgefinder(WT, ttl = "Wild Type")
+#'    clusteringCoef(WTres$AdjMat)
+#' }
+#'
+clusteringCoef <- function(A) {
+  rsum <- Matrix::rowSums(A)
+  cc <- rep(0,nrow(A))
+  for (i in seq_len(nrow(A))) {  # seq_len: a matrix with no rows yields an empty result
+    if (rsum[i] <= 1)
+      cc[i] <- 0
+    else {
+      nbrs <- which(A[i,] == 1)
+      At <- A[nbrs, nbrs]
+      cc[i] <- 0.5*sum(At)/choose(rsum[i],2)  # edges among the neighbors / number of neighbor pairs
+    }
+  }
+  cc
+}
+
+
+#' Plot the histogram of the data and the fitted mixture distribution.
+#'
+#' The function is called by the edgefinder function.
+#' @param fit.em The object (list) returned from the EM function with the parameter estimates for the L2N model.
+#' @param gof The root mean-squared error of the fitted model (to appear in the title of the plot).
+#' @param ttl The title of the plot (default="").
+#' @param trim The proportion of extreme values on both sides of the distribution to eliminate from the plot (default=0.) This can be useful if a small number of values are so extreme, that the plot shows mostly the tails and a spike in the middle. Default=0.
+#' @export
+#' @examples
+#' \donttest{
+#'    data(WT)
+#'    WTres <- edgefinder(WT, ttl = "Wild Type")
+#'    plotMixture(WTres$fitted, WTres$rmse)
+#' }
+plotMixture <- function(fit.em, gof, ttl="", trim=0) {
+  xlim <- quantile(fit.em$x, c(trim/2, 1-trim/2))
+  brks <- min(80,floor(length(fit.em$x)/100))
+  hist(fit.em$x, freq=FALSE, breaks=brks,
+       main=sprintf("%s\nrMSE %2.2f",ttl, gof),
+       xlim=xlim,xlab="x", border="white", col="wheat")
+  xs <- seq(min(fit.em$x), max(fit.em$x), length.out=1000)
+  p0 <- mean(fit.em$b0)  # component weights: averages of fit.em$b0, b1, b2
+  p1 <- mean(fit.em$b1)
+  p2 <- mean(fit.em$b2)
+  lines(xs, p0*dnorm(xs, fit.em$theta, fit.em$tau), col=2, lwd=2)
+  lines(xs, p1*dlnorm(xs, fit.em$mu1, fit.em$s1), col=3, lwd=2)
+  lines(xs, p2*dlnorm(-xs, fit.em$mu2, fit.em$s2), col=3, lwd=2)  # same form as in mxfit, drawn over the whole data range
+  mxfit <- p0*dnorm(xs,fit.em$theta, fit.em$tau) +
+    p1*dlnorm(xs, fit.em$mu1, fit.em$s1) +
+    p2*dlnorm(-xs, fit.em$mu2, fit.em$s2)
+  lines(xs, mxfit, lwd=3, col=4, lty=2)
+}
+
+
+#' Plot the degree of nodes versus the degree times the clustering coefficient.
+#'
+#' The x-axis represents the number of neighbors of each node, and the y-axis represents the degree multiplied by the clustering coefficient (the proportion of pairs of neighbors which are connected to each other).
+#' @param edgefinderobj The object (list) returned by edgefinder.
+#' @param clusterInfo obtained from graphComponents. If not provided by the user, it will be computed on the fly.
+#' @param highlightNodes A vector of node-numbers which will be shown in red. Default is NULL.
+#' @export
+#' @import stats graphics
+#' @examples
+#' \donttest{
+#'    data(WT)
+#'    WTres <- edgefinder(WT, ttl = "Wild Type")
+#'    WTComp <- graphComponents(WTres$AdjMat)
+#'    plotDegCC(WTres,WTComp)
+#' }
+plotDegCC <- function(edgefinderobj, clusterInfo=NULL, highlightNodes=NULL) {
+  if (is.null(clusterInfo))
+    clusterInfo <- graphComponents(edgefinderobj$AdjMat)
+  cc0 <- clusterInfo$cc
+  deg0 <- clusterInfo$degree
+  plot(deg0, deg0*cc0,axes=FALSE,xlim=c(0,max(deg0)),
+       ylim=c(0,1.1*max(deg0*cc0)),main="",
+       xlab=bquote("degree"),ylab=bquote("CC*degree"),
+       col="thistle",pch=24,cex=0.5); axis(1); axis(2)
+  grid(); abline(0,1,col="seagreen1", lwd=2)  # reference line y = x
+  if (!is.null(highlightNodes))
+    points(deg0[highlightNodes],(deg0*cc0)[highlightNodes],col=2,pch=24,cex=0.5)
+}
+
+
+#' Edge-indicator bitmap plot.
+#'
+#' Plot a bitmap in which a black dot corresponds to a pair of highly correlated genes (an edge in the graph).
+#' The default is to show the nodes according to their order in the input.
+#' By setting orderByCluster=TRUE as below, it is possible to change the order so that nodes are grouped by cluster and, within each cluster, sorted by their distance from the center of the cluster.
+#' @param AdjMat An adjacency Matrix (0/1).
+#' @param clusterInfo obtained from graphComponents. If not provided by the user, it will be computed on the fly.
+#' @param orderByCluster If false, show the bitmap in the original node order. If TRUE, show nodes by clusters, and sort by distance from the center of the cluster.
+#' @param showMinDegree Non-negative integer indicating the minimum degree of nodes that should be displayed. Default=0 (all nodes).
+#' @export
+#' @examples
+#' \donttest{
+#'    data(WT)
+#'    WTres <- edgefinder(WT, ttl = "Wild Type")
+#'    WTComp <- graphComponents(WTres$AdjMat)
+#'    plotBitmapCC(WTres$AdjMat)
+#'    plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE)
+#'    plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30)
+#' }
+plotBitmapCC <- function(AdjMat, clusterInfo=NULL, orderByCluster=FALSE, showMinDegree=0) {
+  if(!is.null(clusterInfo))  # supplying clusterInfo switches cluster ordering on
+    orderByCluster <- TRUE
+  if (orderByCluster) {
+    if (is.null(clusterInfo))
+      clusterInfo <- graphComponents(AdjMat)
+    nodeOrder <- order(clusterInfo$clustNo,clusterInfo$distCenter)
+    AdjMat <- AdjMat[nodeOrder, nodeOrder]
+  }
+  showNodes <- which(Matrix::rowSums(AdjMat) >= showMinDegree)
+  Matrix::image(AdjMat[showNodes, showNodes, drop=FALSE])  # drop=FALSE: a single remaining node must stay a matrix
+}
+
+
+#' Plot cluster network
+#'
+#' Plot a cluster network with all the nodes and edges - the central node is marked by a black circle. The radius of each point corresponds to its degree. The opacity corresponds to the percentage of edges from the node that is in the cluster (the darker it is, the larger the percentage of edges is within the cluster.) The distance from the center corresponds to the relative dissimilarity with the central node. This is computed as the number of neighbors the node and the central node do not have in common.
+#' @param AdjMat An adjacency Matrix (0/1).
+#' @param clustNo The chosen cluster.
+#' @param clusterInfo Obtained from graphComponents.
+#' @export
+#' @examples
+#' \donttest{
+#'    data(WT)
+#'    WTres <- edgefinder(WT, ttl = "Wild Type")
+#'    WTComp <- graphComponents(WTres$AdjMat)
+#'    plotCluster(WTres$AdjMat, 5, WTComp)
+#' }
+plotCluster <- function(AdjMat, clustNo, clusterInfo=NULL) {
+  if(is.null(clusterInfo))
+    clusterInfo <- graphComponents(AdjMat)
+  ids <- which(clusterInfo$clustNo == clustNo)
+  if (length(ids) > 0) {
+    tmpA <- AdjMat[ids,ids, drop=FALSE]
+    tmpclusterInfo <- clusterInfo[ids,]
+    rads <- round(10*tmpclusterInfo$distCenter/max(tmpclusterInfo$distCenter))  # radius 0-10 from the relative distance to the center
+    thetas <- rep(0,length(rads))
+    intvls <- findInterval(rads,seq(1,10))
+    for (intvl in unique(sort(intvls))) {
+      pts <- which(intvls == intvl)
+      thetas[pts] <- 3*intvl*pi/max(intvls)+seq(0,1.9*pi,length.out=length(pts))  # spread nodes of one ring around the circle
+    }
+    sizes <- pmax(0.3,tmpclusterInfo$degree/max(tmpclusterInfo$degree))
+    opacity <- 0.25+tmpclusterInfo$intEdges/tmpclusterInfo$degree
+    opacity <- opacity/max(opacity)
+    plot(rads*cos(thetas), rads*sin(thetas),cex=sizes*3, pch=19,axes=FALSE,
+         xlab="",ylab="",col=rgb(red = 0, green = 0, blue = 1, alpha = opacity))
+    for (i in seq_len(ncol(tmpA)-1)) {
+      nbrs <- which(tmpA[i,i:ncol(tmpA)] == 1)
+      for (j in nbrs + i - 1) {  # nbrs is relative to column i; draw only real edges, not every pair
+        lines(c(rads[i]*cos(thetas[i]), rads[j]*cos(thetas[j])),
+              c(rads[i]*sin(thetas[i]), rads[j]*sin(thetas[j])),
+              col="grey88", lwd=0.5)
+      }
+    }
+    points(rads*cos(thetas), rads*sin(thetas),cex=sizes*3, pch=19,
+           col=rgb(red = 0, green = 0, blue = 1, alpha = opacity))  # redraw the nodes on top of the edges
+    ctr <- which(tmpclusterInfo$iscenter==1)
+    points(rads[ctr]*cos(thetas[ctr]), rads[ctr]*sin(thetas[ctr]),pch=21,
+           cex=sizes[ctr]*3, col="black",lwd=2)
+  } else {
+    cat("Invalid cluster number\n")
+  }
+}
+
+
+#' Return a Matrix with the shortest path distance between nodes (check up to numSteps.)
+#'
+#' Return the adjacency matrix AdjMat, extended to connect neighbors up to numSteps away.
+#' @param AdjMat An adjacency Matrix (0/1).
+#' @param numSteps The maximum number of edges between pairs of nodes. If numSteps=0, returns the input matrix.
numSteps=1 adds neighbors of direct neighbors, etc.
+#' @return A Matrix containing the shortest paths between nodes i and j
+#' @export
+#' @examples
+#' \donttest{
+#'    data(SIM)
+#'    Sres <- edgefinder(SIM, ttl = "hub network")
+#'    AdjMat1 <- shortestPathDistance(Sres$AdjMat, numSteps=50)
+#'    max(AdjMat1)
+#'    Matrix::image(AdjMat1)
+#' }
+shortestPathDistance <- function(AdjMat, numSteps=0) {
+  # NOTE(review): closed walks also give nodes that have neighbors a non-zero diagonal entry - confirm whether it should be reset to 0.
+  if (numSteps == 0)
+    return(AdjMat)
+  An <- Ap <- minDist <- AdjMat
+  for (i in seq_len(numSteps)) {
+    An <- Ap%*%AdjMat  # walks of length i+1
+    if (sum((An | Ap) - (An & Ap)) == 0)  # same non-zero pattern as the previous power
+      break
+    minDist[(An > 0) & (Ap == 0) & (minDist == 0)] <- i + 1  # first reached by a walk of i+1 edges (direct neighbors are 1)
+    Ap <- An
+  }
+  rownames(minDist) <- colnames(minDist) <- rownames(AdjMat)
+  minDist
+}
+
+
+#' Gene Expression data for the WildType group
+#'
+#' WT is a matrix with normalized gene expression data containing 3454 differentially expressed genes (when compared with the duplication group) from 15 samples (columns) from the wild-type group.
+#'
+#' @docType data
+#' @keywords datasets
+#' @name WT
+#' @usage data(WT)
+#' @format A matrix with 3454 rows and 15 columns
+#' @references \url{https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430}
+NULL
+
+
+#' Gene Expression data for the Duplication group
+#'
+#' DUP is a matrix with normalized gene expression data containing 3454 differentially expressed genes (when compared with wild-type) from 12 samples (columns) from the duplication group.
+#'
+#' @docType data
+#' @keywords datasets
+#' @name DUP
+#' @usage data(DUP)
+#' @format A matrix with 3454 rows and 12 columns.
+#' @references \url{https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430} +NULL + + +#' Simulated gene Expression data using the huge package +#' +#' SIM is a simulated dataset with a hub structure, consisting of 1000 nodes and 50 hubs +#' +#' @docType data +#' @keywords datasets +#' @name SIM +#' @usage data(SIM) +#' @format A 1000 by 200 matrix, representing 50 hubs +NULL + + diff --git a/data/DUP.RData b/data/DUP.RData new file mode 100644 index 0000000..b9b6cb2 Binary files /dev/null and b/data/DUP.RData differ diff --git a/data/SIM.RData b/data/SIM.RData new file mode 100644 index 0000000..8baedff Binary files /dev/null and b/data/SIM.RData differ diff --git a/data/WT.RData b/data/WT.RData new file mode 100644 index 0000000..c687e95 Binary files /dev/null and b/data/WT.RData differ diff --git a/edgefinder.Rproj b/edgefinder.Rproj new file mode 100644 index 0000000..04f5ba3 --- /dev/null +++ b/edgefinder.Rproj @@ -0,0 +1,22 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageBuildArgs: --resave-data +PackageRoxygenize: rd,collate,namespace,vignette diff --git a/man/DUP.Rd b/man/DUP.Rd new file mode 100644 index 0000000..ddec06f --- /dev/null +++ b/man/DUP.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\docType{data} +\name{DUP} +\alias{DUP} +\title{Gene Expression data for the Duplication group} +\format{ +A matrix with 3454 rows and 12 columns.
+} +\usage{ +data(DUP) +} +\description{ +DUP is a matrix with normalized gene expression data containing 3454 differentially expressed genes (when compared with wild-type) from 12 samples (columns) from the duplication group. +} +\references{ +\url{https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430} +} +\keyword{datasets} diff --git a/man/SIM.Rd b/man/SIM.Rd new file mode 100644 index 0000000..29b67ca --- /dev/null +++ b/man/SIM.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\docType{data} +\name{SIM} +\alias{SIM} +\title{Simulated gene Expression data using the huge package} +\format{ +A 1000 by 200 matrix, representing 50 hubs +} +\usage{ +data(SIM) +} +\description{ +SIM is a a simulated dataset with a hub structure, consisting of 1000 nodes and 50 hubs +} +\keyword{datasets} diff --git a/man/WT.Rd b/man/WT.Rd new file mode 100644 index 0000000..4dd6635 --- /dev/null +++ b/man/WT.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\docType{data} +\name{WT} +\alias{WT} +\title{Gene Expression data for the WildType group} +\format{ +A matrix with 3454 rows and 15 columns +} +\usage{ +data(WT) +} +\description{ +WT is a matrix with normalized gene expression data containing 3454 differentially expressed genes (when compared with the duplication group) from 15 samples (columns) from the wild-type group. 
+} +\references{ +\url{https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430} +} +\keyword{datasets} diff --git a/man/clusteringCoef.Rd b/man/clusteringCoef.Rd new file mode 100644 index 0000000..cf76a6d --- /dev/null +++ b/man/clusteringCoef.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{clusteringCoef} +\alias{clusteringCoef} +\title{Calculate the clustering coefficient of each node.} +\usage{ +clusteringCoef(A) +} +\arguments{ +\item{A}{an adjacency Matrix (0/1).} +} +\value{ +A vector with the clustering coefficient of each node. +} +\description{ +Calculate the clustering coefficient of each node. +} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") + clusteringCoef(WTres$AdjMat) +} + +} diff --git a/man/collapsedGraph.Rd b/man/collapsedGraph.Rd new file mode 100644 index 0000000..22fde89 --- /dev/null +++ b/man/collapsedGraph.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{collapsedGraph} +\alias{collapsedGraph} +\title{Return an adjacency matrix after collapsing clusters into their central nodes.} +\usage{ +collapsedGraph(A, clustersInfo) +} +\arguments{ +\item{A}{An adjacency Matrix.} + +\item{clustersInfo}{Obtained from graphComponents} +} +\value{ +A weighted adjacency matrix between clusters and unclustered nodes. +} +\description{ +Takes an object obtained from graphComponents and prints summary statistics. 
+} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") + rownames(WTres$AdjMat) = rownames(WT) + WTComp <- graphComponents(WTres$AdjMat) + Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0 + plotBitmapCC(Adj1,showMinDegree = 2) +} +} diff --git a/man/edgefinder.Rd b/man/edgefinder.Rd new file mode 100644 index 0000000..db76d21 --- /dev/null +++ b/man/edgefinder.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{edgefinder} +\alias{edgefinder} +\title{Detect edges in co-expression datasets.} +\usage{ +edgefinder( + Exprs, + BHthr = 0.01, + rndseed = 112211, + maxLen = 20000, + LOvals = 30, + ttl = "", + trim = 0 +) +} +\arguments{ +\item{Exprs}{A numeric matrix with normalized gene expression data. Rows +correspond to genes, and columns correspond to samples.} + +\item{BHthr}{the Benjamini-Hochberg false discovery rate threshold to be +used to determine which pairs are strongly correlated. Default=0.01.} + +\item{rndseed}{The random seed used to select a subset of the pairs.} + +\item{maxLen}{The maximum number of pairs that will be randomly selected +to fit the L2N model. Default=20000.} + +\item{LOvals}{the maximum log-odds ratio to be used to determine the +cut-off points to declare which correlations are significant. +The program will check which log-odds ratio (1,2,...,LOvals) results in +FDR less than or equal to the user-specified BHthr. Default=30.} + +\item{ttl}{Title for the fitted-model plot. Default=""} + +\item{trim}{Fraction of extreme values to exclude from the fitted-model +plot.
Default=0 (show all the data).} +} +\value{ +A list with the following elements +\itemize{ +\item{G} {The total number of genes.} +\item{p1} {The proportion of genes in the right mixture component (positively correlated.)} +\item{p2} {The proportion of genes in the left mixture component (negatively correlated.)} +\item{p0} {The proportion of genes in the null component (un-correlated.)} +\item{m0, m1, m2, s0, s1, s2} {The location and scale parameters of the three mixture components.} +\item {thrtable} {A table with 6 columns: posterior probability ratio (ppr) between the non-null components and the null component, the right component cutoff corresponding to the ppr, the left component cutoff, the estimated probability of Type-I errors, the estimated power, the estimated FDR.} +\item {LogOddsRatio} {The log-odds ratio that yields FDR closest to the desired level.} +\item {fitted} {The fitted model (as returned by the EM function).} +\item {rmse} {The root mean-squared error of the fitted model.} +\item {rt, lt} {The significant edges (from the right and left mixture components.)} +\item {AdjMat} {The (sparse) adjacency matrix with edges corresponding to rt, lt.} +} +} +\description{ +Fit the L2N model to normalized correlation coefficients between pairs of +genes. The mixture model has three components - the null component follows +a normal distribution, and the two non-null components follow lognormal +distributions. An edge is in the graph if the correlation between the two +end-point genes is large enough and determined to be in one of the non-null +components.
+} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") +} +} diff --git a/man/graphComponents.Rd b/man/graphComponents.Rd new file mode 100644 index 0000000..041e1d4 --- /dev/null +++ b/man/graphComponents.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{graphComponents} +\alias{graphComponents} +\title{Find clusters, and return node characteristics.} +\usage{ +graphComponents(A, minCtr = 5, type = 1) +} +\arguments{ +\item{A}{An adjacency Matrix(0/1).} + +\item{minCtr}{The minimum centrality value to be considered for a cluster center (default=5).} + +\item{type}{Determines how the centrality measure is computed.} +} +\value{ +A data frame with the following columns +\itemize{ + \item{labels} {Node label (e.g. gene names).} +\item{degree} {Node degree.} +\item{cc} {Node clustering coefficient.} +\item{ctr} {Node centrality measure: (type\*CC+1)\*deg.} +\item{clustNo} {Cluster number.} +\item {iscenter} {1 for the node was chosen as the cluster's center, 0 otherwise.} +\item {intEdges} {Number of edges from the node to nodes in the same cluster.} +\item {extEdges} {Number of edges from the node to nodes NOT in the same cluster.} +\item {distCenter} {Standardized Manhattan distance to the central node.} +} +} +\description{ +Take an adjacency Matrix as input and find clusters. For each node, find the degree and clustering coefficient (CC). Then, calculate a centrality measure (type\*CC+1)\*deg. For type=0, it's just the degree. Note that setting type=1 we assign a higher value to nodes that not only have many neighbors, but the neighbors are highly interconnected. For example, suppose we have two components with k nodes, one has a star shape, and the other is a complete graph. With type=0 both graphs will get the same value, but with type=1 the complete graph will be picked by the algorithm first. 
+} +\examples{ +\donttest{ + data(SIM) + Sres <- edgefinder(SIM, ttl = "hub network") + SimComp <- graphComponents(Sres$AdjMat) + head(SimComp) +} +} diff --git a/man/plotBitmapCC.Rd b/man/plotBitmapCC.Rd new file mode 100644 index 0000000..09d154d --- /dev/null +++ b/man/plotBitmapCC.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{plotBitmapCC} +\alias{plotBitmapCC} +\title{Edge-indicator bitmap plot.} +\usage{ +plotBitmapCC( + AdjMat, + clusterInfo = NULL, + orderByCluster = FALSE, + showMinDegree = 0 +) +} +\arguments{ +\item{AdjMat}{An adjacency Matrix (0/1).} + +\item{clusterInfo}{obtained from graphComponents. If not provided by the user, it will be computed on the fly.} + +\item{orderByCluster}{If false, show the bitmap is the original node order. If TRUE, show nodes by clusters, and sort by distance from the center of the cluster.} + +\item{showMinDegree}{Non-negative integer indicating the minimum degree of nodes that should be displayed. Default=0 (all nodes).} +} +\description{ +Plot a bitmap in which a black dot corresponds to a pair of highly correlated genes (an edge in the graph). +The default is to show the nodes according to their order in the input. +By setting orderByDegree=T as below, it is possible to change the order and cluster them, and show them in increasing degree order (from left to right.) 
+} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") + WTComp <- graphComponents(WTres$AdjMat) + plotBitmapCC(WTres$AdjMat) + plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE) + plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30) +} +} diff --git a/man/plotCluster.Rd b/man/plotCluster.Rd new file mode 100644 index 0000000..b1c949b --- /dev/null +++ b/man/plotCluster.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{plotCluster} +\alias{plotCluster} +\title{Plot cluster network} +\usage{ +plotCluster(AdjMat, clustNo, clusterInfo = NULL) +} +\arguments{ +\item{AdjMat}{An adjacency Matrix (0/1).} + +\item{clustNo}{The chosen cluster.} + +\item{clusterInfo}{Obtained from graphComponents.} +} +\description{ +Plot a cluster network with all the nodes and edges - the central node is marked by a black circle. The radius of each point corresponds to its degree. The opacity corresponds to the percentage of edges from the node that is in the cluster (the darker it is, the larger the percentage of edges is within the cluster.) The distance from the center corresponds to the relative dissimilarity with the central node. This is computed as the number of neighbors the node and the central node do not have in common. 
+} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") + WTComp <- graphComponents(WTres$AdjMat) + plotCluster(WTres$AdjMat, 5, WTComp) +} +} diff --git a/man/plotDegCC.Rd b/man/plotDegCC.Rd new file mode 100644 index 0000000..88506b6 --- /dev/null +++ b/man/plotDegCC.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{plotDegCC} +\alias{plotDegCC} +\title{Plot the degree of nodes versus the degree times the clustering coefficient.} +\usage{ +plotDegCC(edgefinderobj, clusterInfo = NULL, highlightNodes = NULL) +} +\arguments{ +\item{edgefinderobj}{The object (list) returned by edgefinder.} + +\item{clusterInfo}{obtained from graphComponents. If not provided by the user, it will be computed on the fly.} + +\item{highlightNodes}{A vector of node-numbers which will be shown in red. Default is NULL.} +} +\description{ +The x-axis represents the number of neighbors of each node, and the y-axis represents the proportion of neighbors which are connected to each other. 
+} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") + WTComp <- graphComponents(WTres$AdjMat) + plotDegCC(WTres,WTComp) +} +} diff --git a/man/plotMixture.Rd b/man/plotMixture.Rd new file mode 100644 index 0000000..6e5bc0e --- /dev/null +++ b/man/plotMixture.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{plotMixture} +\alias{plotMixture} +\title{Plot the histogram of the data and the fitted mixture distribution.} +\usage{ +plotMixture(fit.em, gof, ttl = "", trim = 0) +} +\arguments{ +\item{fit.em}{The object (list) returned from the EM function with the parameter estimates for the L2N model.} + +\item{gof}{The root mean-squared error of the fitted model (to appear in the title of the plot).} + +\item{ttl}{The title of the plot (default="").} + +\item{trim}{The proportion of extreme values on both sides of the distribution to eliminate from the plot (default=0.) This can be useful if a small number of values are so extreme, that the plot shows mostly the tails and a spike in the middle. Default=0.} +} +\description{ +The function is called by the edgefinder function. +} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") + plotMixture(WTres$fitted, WTres$rmse) +} +} diff --git a/man/shortSummary.Rd b/man/shortSummary.Rd new file mode 100644 index 0000000..5a8e981 --- /dev/null +++ b/man/shortSummary.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{shortSummary} +\alias{shortSummary} +\title{Print a short summary of the fitted mixture model.} +\usage{ +shortSummary(edgefinderobj) +} +\arguments{ +\item{edgefinderobj}{The object (list) returned from the edgefinder function.} +} +\description{ +Show the number of nodes, the number of possible and detected edges, the estimated proportion of positively/negatively correlated pairs, and the estimated false discovery rate. 
+} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") + shortSummary(WTres) +} +} diff --git a/man/shortestPathDistance.Rd b/man/shortestPathDistance.Rd new file mode 100644 index 0000000..1435a30 --- /dev/null +++ b/man/shortestPathDistance.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{shortestPathDistance} +\alias{shortestPathDistance} +\title{Return a Matrix with the shortest path distance between nodes (check up to numSteps.)} +\usage{ +shortestPathDistance(AdjMat, numSteps = 0) +} +\arguments{ +\item{AdjMat}{An adjacency Matrix (0/1).} + +\item{numSteps}{The maximum number of edges between pairs of nodes. If numSteps=0, returns the input matrix. numSteps=1 adds neighbors of direct neighbors, etc.} +} +\value{ +A Matrix containing the shortset paths between nodes i and j +} +\description{ +return the adjacency matrix of expMat connecting neighbors up to numSteps away. +} +\examples{ +\donttest{ + data(SIM) + Sres <- edgefinder(SIM, ttl = "hub network") + AdjMat1 <- shortestPathDistance(Sres$AdjMat, numSteps=50) + max(AdjMat1) + Matrix::image(AdjMat1) +} +} diff --git a/man/summarizeClusters.Rd b/man/summarizeClusters.Rd new file mode 100644 index 0000000..c573f16 --- /dev/null +++ b/man/summarizeClusters.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/edgefinder.R +\name{summarizeClusters} +\alias{summarizeClusters} +\title{Show cluster characteristics.} +\usage{ +summarizeClusters(clustersInfo) +} +\arguments{ +\item{clustersInfo}{Obtained from graphComponents.} +} +\value{ +A matrix with cluster number, number of nodes, and fivenum summaries for the degrees of nodes in the cluster, and the percentage of edges that are within the cluster. +} +\description{ +Takes an object obtained from graphComponents and prints and returns summary statistics. 
+} +\examples{ +\donttest{ + data(WT) + WTres <- edgefinder(WT, ttl = "Wild Type") + WTComp <- graphComponents(WTres$AdjMat) + (summtab <- summarizeClusters(WTComp)) +} +} diff --git a/vignettes/DUPbitmap.png b/vignettes/DUPbitmap.png new file mode 100644 index 0000000..5535c51 Binary files /dev/null and b/vignettes/DUPbitmap.png differ diff --git a/vignettes/DUPclusters.png b/vignettes/DUPclusters.png new file mode 100644 index 0000000..482ae83 Binary files /dev/null and b/vignettes/DUPclusters.png differ diff --git a/vignettes/DUPclustersCropped.png b/vignettes/DUPclustersCropped.png new file mode 100644 index 0000000..e7ad63e Binary files /dev/null and b/vignettes/DUPclustersCropped.png differ diff --git a/vignettes/DUPdg.png b/vignettes/DUPdg.png new file mode 100644 index 0000000..33a96c4 Binary files /dev/null and b/vignettes/DUPdg.png differ diff --git a/vignettes/DUPdgHighlight.png b/vignettes/DUPdgHighlight.png new file mode 100644 index 0000000..756ecf7 Binary files /dev/null and b/vignettes/DUPdgHighlight.png differ diff --git a/vignettes/DUPfit.png b/vignettes/DUPfit.png new file mode 100644 index 0000000..0752eaa Binary files /dev/null and b/vignettes/DUPfit.png differ diff --git a/vignettes/SIMbitmap.png b/vignettes/SIMbitmap.png new file mode 100644 index 0000000..bd2fe70 Binary files /dev/null and b/vignettes/SIMbitmap.png differ diff --git a/vignettes/SIMbitmap2.png b/vignettes/SIMbitmap2.png new file mode 100644 index 0000000..8f8880f Binary files /dev/null and b/vignettes/SIMbitmap2.png differ diff --git a/vignettes/SIMbitmap3.png b/vignettes/SIMbitmap3.png new file mode 100644 index 0000000..d3388d4 Binary files /dev/null and b/vignettes/SIMbitmap3.png differ diff --git a/vignettes/SIMcluster1.png b/vignettes/SIMcluster1.png new file mode 100644 index 0000000..3ae598e Binary files /dev/null and b/vignettes/SIMcluster1.png differ diff --git a/vignettes/WTbitmap.png b/vignettes/WTbitmap.png new file mode 100644 index 0000000..d7aca19 Binary files 
/dev/null and b/vignettes/WTbitmap.png differ diff --git a/vignettes/WTcluster5.png b/vignettes/WTcluster5.png new file mode 100644 index 0000000..7c878d9 Binary files /dev/null and b/vignettes/WTcluster5.png differ diff --git a/vignettes/WTcluster9.png b/vignettes/WTcluster9.png new file mode 100644 index 0000000..8efb0e6 Binary files /dev/null and b/vignettes/WTcluster9.png differ diff --git a/vignettes/WTclusters.png b/vignettes/WTclusters.png new file mode 100644 index 0000000..c2690b3 Binary files /dev/null and b/vignettes/WTclusters.png differ diff --git a/vignettes/WTclustersCropped.png b/vignettes/WTclustersCropped.png new file mode 100644 index 0000000..5224551 Binary files /dev/null and b/vignettes/WTclustersCropped.png differ diff --git a/vignettes/WTdg.png b/vignettes/WTdg.png new file mode 100644 index 0000000..594c8f0 Binary files /dev/null and b/vignettes/WTdg.png differ diff --git a/vignettes/WTfit.png b/vignettes/WTfit.png new file mode 100644 index 0000000..1ba7469 Binary files /dev/null and b/vignettes/WTfit.png differ diff --git a/vignettes/edgefinder.R b/vignettes/edgefinder.R new file mode 100644 index 0000000..d2a9984 --- /dev/null +++ b/vignettes/edgefinder.R @@ -0,0 +1,29 @@ +## ----setup, include = FALSE--------------------------------------------------- +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + +## ----echo=FALSE, out.width='60%'---------------------------------------------- +knitr::include_graphics('./WTfit.png') + +## ----echo=FALSE, out.width='45%'---------------------------------------------- +knitr::include_graphics('./WTcluster5.png') +knitr::include_graphics('./WTcluster9.png') + +## ----echo=FALSE, out.width='70%'---------------------------------------------- +knitr::include_graphics('./WTclustersCropped.png') + +## ----echo=FALSE, out.width='60%'---------------------------------------------- +knitr::include_graphics('./WTdg.png') + +## ----echo=FALSE, 
out.width='50%'---------------------------------------------- +knitr::include_graphics('./WTbitmap.png') + +## ----echo=FALSE, out.width='70%'---------------------------------------------- +knitr::include_graphics('./DUPclustersCropped.png') + +## ----echo=FALSE, out.width='45%'---------------------------------------------- +knitr::include_graphics('./SIMbitmap3.png') +knitr::include_graphics('./SIMcluster1.png') + diff --git a/vignettes/edgefinder.Rmd b/vignettes/edgefinder.Rmd new file mode 100644 index 0000000..5a302c3 --- /dev/null +++ b/vignettes/edgefinder.Rmd @@ -0,0 +1,350 @@ +--- +title: "edgefinder" +author: "Haim Bar" +date: "`r Sys.Date()`" +output: rmarkdown::pdf_document +vignette: > + %\VignetteIndexEntry{edgefinder} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +The edgefinder package is used to find edges in gene networks using co-expression +data. The input to the program is a normalized expression matrix, with genes (nodes) +in the rows, and samples in the columns. +The program calculates the pair-wise correlations, performs Fisher's Z +transformation, and fits the L2N model to the transformed data. L2N is a mixture +model with three components: the uncorrelated pairs belong to the null component +which is assumed to be normally distributed, and the correlated pairs belong to one +of the two non-null components which are assumed to follow lognormal distributions. + +Typical datasets consist of hundreds, or thousands of genes, and hence a very +large number of pairs. Therefore, edgefinder randomly selects a subset of the pairs (the +default number of pairs is 20,000), fits the L2N model to the subset, and calculates +the component probabilities for *all* possible pairs. +Using the posterior probabilities, edgefinder determines which pairs are +highly correlated while controlling the false discovery rate. 
+Note that edgefinder makes no assumptions about the structure of the network. + +The edgefinder package depends on the 'Matrix' package, to allow for efficient +storage and computation of large co-occurrence matrices. For simulating datasets +we used the 'huge' and 'MASS' packages, but they are not required when +using edgefinder. + +# Real data examples + +We use a publicly available dataset from +https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430 +(Horev G, Ellegood J, Lerch JP, Son YE et al. Dosage-dependent phenotypes in models +of 16p11.2 lesions found in autism. Proc Natl Acad Sci U.S.A. 2011, Oct. +11;108(41):17076-81. PMID: 21969575). + +The dataset contains three groups: wild type (2 copies of 16p11.2), +deletion (1 copy), and duplication (3 copies). +We focus on a subset of 3,454, genes which were found to be differentially +expressed when comparing the wild-type and duplication groups (using an +FDR threshold of 0.05.) We fit the +L2N model to this set of genes in each group, and compare the properties +of the two networks. First, we load the wild-type data (WT). +WT is a matrix with 3454 rows (genes) and 15 columns (samples) from +the wild-type group. + + + +``` +library("edgefinder") +# Wild-type first: +data(WT) +WTres <- edgefinder(WT, ttl = "Wild Type") +shortSummary(WTres) +``` + +The **edgefinder** function fits the L2N model to the data, and plots the fitted mixture distribution: + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTfit.png') +``` + +The function **shortSummary** produces the following output: + +``` +No. nodes = 3,454 +Max no. edges = 5,963,331 +No. edges detected = 80,332 +p1 = 0.0942 +p2 = 0.0185 +Est. FDR <= 0.00997 +``` + +Note that the estimated FDR is calculated based on the fitted L2N model. +The default FDR threshold used by the edgefinder function is 0.01, and in this case, the +empirical FDR is very close to the level set by the user. 
If the empirical FDR is too +high, you may increase **LOvals** from its default value (30). This will result in larger +(stricter) thresholds for determining significant correlations, and will decrease the +proportion of false discoveries. +The FDR threshold (the **BHthr** parameter) should be set according to the number of edges. +In this example, the algorithm finds 80,332 edges, and an FDR of 0.01 means that +at most 800 of the detected edges may not be true discoveries. If this number of edges +is too large in the sense that it may affect our inference about the network structure, +or a subsequent gene enrichment analysis, we could lower the FDR threshold. + +The function **graphComponents** finds clusters of genes. To do that, it takes as input an +adjacency (0/1) matrix (e.g. WTres$AdjMat in our example.) To find clusters it first +calculates a centrality for each node, using the formula (type\*CC+1)\*deg where +deg is the degree of the node, and CC is its clustering coefficient (CC). **type** is set +by default to 1. When it is set to 0, the centrality measure is just the degree of +the node. Setting type=1 means that we assign a higher value to nodes that not only have +many neighbors, but the neighbors are highly interconnected. For example, suppose we +have two components with k nodes, one has a star shape, and the other is a complete +graph. With type=0 both graphs will get the same value, but with type=1 the complete +graph will be picked by the algorithm first. +You can also set a minimum centrality value (the parameter **minCtr**) to determine the +smallest possible cluster size. + +The function returns a data frame with the following information about each node: +a label (e.g. 
gene name), degree, clustering coefficient, centrality measure, +cluster number, iscenter (1 for the node was chosen as the cluster's center, 0 otherwise), +the number of edges from the node to nodes in the same cluster the number of edges +from the node to nodes NOT in the same cluster, and the standardized Manhattan distance +to the central node in the cluster (in terms of the number neighbors they do not have +in common.) + +``` +WTComp <- graphComponents(WTres$AdjMat) +head(WTComp) + + labels degree cc ctr clustNo iscenter intEdges extEdges distCenter +1 1 251 0.5999044 401.5760 1 0 187 64 0.072958888 +2 2 0 0.0000000 0.0000 0 0 0 0 0.000000000 +3 3 202 0.7217378 347.7910 1 0 164 38 0.072090330 +4 4 202 0.5819910 319.5622 4 0 98 104 0.008396063 +5 5 0 0.0000000 0.0000 0 0 0 0 0.000000000 +6 6 9 0.6944444 15.2500 0 0 0 0 0.000000000 +``` + +The function **summarizeClusters** returns summary statistics about each cluster. +It prints the number of nodes, edges, clusters and unclustered nodes to the screen, +and returns a matrix with cluster number, number of nodes in the cluster, +fivenum summary for the degrees of nodes in the cluster, and fivenum summary for +the percentage of edges that are within the cluster. 
+ +``` +summtab <- summarizeClusters(WTComp) +head(summtab[,1:7]) +head(summtab[,c(1:2,8:12)]) + +Num of nodes: 3454 +Num of edges: 80332 +Num of clusters: 72 +Num of unclustered nodes: 1837 + + Cluster Nodes degreeMin degreeQ25 degreeMedian degreeQ75 degreeMax +[1,] 1 374 59 222.0 257 299.0 373 +[2,] 2 69 17 96.0 134 164.0 234 +[3,] 3 39 2 53.5 74 122.5 209 +[4,] 4 107 25 108.0 130 155.5 209 +[5,] 5 35 26 58.5 80 109.0 154 +[6,] 6 19 17 45.5 80 108.5 133 + + + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax +[1,] 1 374 0.52073733 0.78536585 0.8452080 0.9083969 1.0000000 +[2,] 2 69 0.07109005 0.23952096 0.3061224 0.4226804 0.8235294 +[3,] 3 39 0.03571429 0.09923455 0.1358025 0.2197585 1.0000000 +[4,] 4 107 0.18750000 0.44693586 0.5555556 0.6298886 0.8529412 +[5,] 5 35 0.10344828 0.21717172 0.2777778 0.3584826 0.7692308 +[6,] 6 19 0.06666667 0.10270206 0.1262136 0.1594156 0.4210526 + +``` + +It can be seen, for example, the cluster 1 has 374 nodes, and most of them have many neighbors +(more than 75% of them have at least 222 edges), and this cluster is very interconnected (at least 75% +of the nodes are mostly connected within the cluster with at least 79% of their edges being inside +the cluster. + +Next, we can visualize clusters using the **plotCluster** function. For example, to plot +clusters 5 and 9 we use the following syntax: + +``` +plotCluster(WTres$AdjMat,5,WTComp) +plotCluster(WTres$AdjMat,5,WTComp) +``` + +The central node is marked by a black circle. The radius of each point corresponds +to its degree. The opacity corresponds to the percentage of edges from the node +that is in the cluster (the darker it is, the larger the percentage of edges is +within the cluster.) The distance from the center corresponds to the relative +dissimilarity with the central node. This is computed as the number of neighbors +the node and the central node do not have in common. 
+For example, in cluster 9 (right plot) the dark shade of blue of all the nodes +shows that the majority of edges connecting to these nodes are within the cluster. +In contrast, the nodes in cluster 4 (left) have a larger percentage of their neighbors outside the +cluster. + + +```{r echo=FALSE, out.width='45%'} +knitr::include_graphics('./WTcluster5.png') +knitr::include_graphics('./WTcluster9.png') +``` + +Indeed, when we look at the data +``` +summtab[9,c(1:2,8:12)] + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 9.0000000 108.0000000 0.6857143 0.8768939 0.9301901 0.9657132 1.0000000 +``` +We see that the cluster contains 108 nodes, and the smallest percentage of within-cluster +edges is 68.5%, and for 75% of the nodes, the percentage is greater than 87.6%. This means that +cluster 9 is highly inter-connected, and fairly isolated. + +We can collapse the network data for more compact visualization by defining +a subset in which clusters are represented by their central nodes. The function +**collapsedGraph** returns an adjacency matrix which contains all the unclustered +nodes, and the centers of the clusters. The elements in the matrix contain the +total number of edges in the original graph. That is, the total count of edges +between clusters i and j is stored in the matrix, rather than just 0/1. To convert +it to a 0/1 adjacency matrix we can use the following: +``` +Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0 +``` + +We can use the **igraph** package to visualize the collapsed network. +For example, the following code will produce a network graph containing +all the clusters and unclustered nodes which have at least one neighbor. 
+``` +library("igraph") +inc <- which(Matrix::rowSums(Adj1) > 0) +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"), + vertex.label.cex=0.7, vertex.size=0.1, edge.color='lightgreen',asp=1) +``` + +If we want to show only the relationships between clusters, we use the following: +``` +library("igraph") +inc <- which(substr(rownames(Adj1),1,3) == "CLS") +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` +This gives the following graph, where it can be seen that cluster 9 is connected to +clusters 8, 19, 20, 33, and 35. + +```{r echo=FALSE, out.width='70%'} +knitr::include_graphics('./WTclustersCropped.png') +``` + +If we want to create a subset of the original data by taking a representative from each +clusters, we can do the following + +``` +WTclustered <- WT[union(which(WTComp$iscenter == 1), which(WTComp$clustNo == 0)),] +dim(WTclustered) +[1] 1909 15 +``` + +*Other visualizations:* + +The **plotDegCC** function can be used to plot the degree of nodes versus the +degree times the clustering coefficient of nodes. We can also highlight specific groups. +For example, in the following code we highlight +cluster 1, which as we've seen before, is a large (374 genes) and highly connected +75% of the nodes have at least 222 neighbors, and most of the connections are within the cluster +(75% of the nodes have at least 78.5% of their neighbors within the cluster.) + +``` +plotDegCC(WTres,WTComp,highlightNodes = which(WTComp$clustNo==1)) +``` + +```{r echo=FALSE, out.width='60%'} +knitr::include_graphics('./WTdg.png') +``` + +The **plotBitmapCC** function is used to show the network as a 0/1 matrix, where a black +dot corresponds to an edge in the graph. Setting **orderByDegree=T** is used to +sort the nodes by clusters. When set to FALSE, the original order +of the nodes as it appears in the gene expression file, is preserved. 
+We can create the bitmap plot for nodes with degree greater than or equal to +some threshold. For example, **showMinDegree=30** will result in a plot which includes +only node which have at least 30 neighbors. + +``` +plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30) +``` + +```{r echo=FALSE, out.width='50%'} +knitr::include_graphics('./WTbitmap.png') +``` + +We repeat the same process with the duplication group. +DUP is a matrix with 3454 rows (genes) and 12 columns (samples). +We only show the collapsed cluster plot, and observe that unlike the WT group, +the network in the DUP group consists of two "super-clusters". + +``` +data("DUP") +DUPres <- edgefinder(DUP, ttl = "Duplication") +DUPComp <- graphComponents(DUPres$AdjMat) +Adj2 <- collapsedGraph(DUPres$AdjMat, DUPComp) > 0 +inc <- which(substr(rownames(Adj2),1,3) == "CLS") +plot(graph.adjacency(Adj2[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` + + +```{r echo=FALSE, out.width='70%'} +knitr::include_graphics('./DUPclustersCropped.png') +``` + + + + +# Simulated data + +The following examples shows a simulated dataset with a hub structure, consisting +of 1000 nodes and 50 hubs. The bitmap plot shows the network that was created +by edgefinder. + +``` +library("huge") +library("MASS") +N=200; D=1000 +set.seed(23197) +L = huge.generator(n = N, d = D, graph = "hub", g=50, v = 0.3, u = 0.1) +x = mvrnorm(N, rep(0, D), L$sigma) +``` + +Data generated like this is provided with the package in a dataset called SIM. We perform similar analysis +and display the bitmap plot, which shows that edgefinder finds the actual network structure overall, with +almost no false discoveries. +We also display the network of cluster 1, which shows that the cluster is how we expected it to be, +with one central node with high degree (the hub gene), with interconnected neighbors each having a smaller degree +than the hub gene. 
From the dark shade of blue for each node, we can infer that the nodes are connected +within the cluster but almost no edges to other clusters or nodes. +The smallest percentage of edges within cluster 1 is 66.7%, and at least 75% of the nodes are connected only to nodes within the same cluster. + +``` +data(SIM) +Sres <- edgefinder(SIM, ttl = "Simulation", BHthr=0.05) +plotBitmapCC(Sres$AdjMat,orderByCluster=FALSE) +SIMComp <- graphComponents(Sres$AdjMat) +plotCluster(Sres$AdjMat,1,SIMComp) +sumtab <- summarizeClusters(SIMComp) +sumtab[1,c(1:2,8:12)] + +Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 1 20 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 + +``` + + +```{r echo=FALSE, out.width='45%'} +knitr::include_graphics('./SIMbitmap3.png') +knitr::include_graphics('./SIMcluster1.png') +``` + diff --git a/vignettes/edgefinder.html b/vignettes/edgefinder.html new file mode 100644 index 0000000..d7620ea --- /dev/null +++ b/vignettes/edgefinder.html @@ -0,0 +1,398 @@ + + + + + + + + + + + + + + + + +edgefinder + + + + + + + + + + + + + + + + + + + + +

edgefinder

+

Haim Bar

+

2020-07-29

+ + + +

The edgefinder package is used to find edges in gene networks using co-expression data. The input to the program is a normalized expression matrix, with genes (nodes) in the rows, and samples in the columns. The program calculates the pair-wise correlations, performs Fisher’s Z transformation, and fits the L2N model to the transformed data. L2N is a mixture model with three components: the uncorrelated pairs belong to the null component which is assumed to be normally distributed, and the correlated pairs belong to one of the two non-null components which are assumed to follow lognormal distributions.

+

Typical datasets consist of hundreds, or thousands of genes, and hence a very large number of pairs. Therefore, edgefinder randomly selects a subset of the pairs (the default number of pairs is 20,000), fits the L2N model to the subset, and calculates the component probabilities for all possible pairs. Using the posterior probabilities, edgefinder determines which pairs are highly correlated while controlling the false discovery rate. Note that edgefinder makes no assumptions about the structure of the network.

+

The edgefinder package depends on the ‘Matrix’ package, to allow for efficient storage and computation of large co-occurrence matrices. For simulating datasets we used the ‘huge’ and ‘MASS’ packages, but they are not required when using edgefinder.

+
+

Real data examples

+

We use a publicly available dataset from https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430 (Horev G, Ellegood J, Lerch JP, Son YE et al. Dosage-dependent phenotypes in models of 16p11.2 lesions found in autism. Proc Natl Acad Sci U.S.A. 2011, Oct.  11;108(41):17076-81. PMID: 21969575).

+

The dataset contains three groups: wild type (2 copies of 16p11.2), deletion (1 copy), and duplication (3 copies). We focus on a subset of 3,454 genes which were found to be differentially expressed when comparing the wild-type and duplication groups (using an FDR threshold of 0.05). We fit the L2N model to this set of genes in each group, and compare the properties of the two networks. First, we load the wild-type data (WT). WT is a matrix with 3454 rows (genes) and 15 columns (samples) from the wild-type group.

+
library("edgefinder")
+# Wild-type first:
+data(WT)
+WTres <- edgefinder(WT, ttl = "Wild Type")
+shortSummary(WTres)
+

The edgefinder function fits the L2N model to the data, and plots the fitted mixture distribution:

+

+

The function shortSummary produces the following output:

+
No. nodes = 3,454 
+Max no. edges = 5,963,331 
+No. edges detected = 80,332 
+p1 = 0.0942 
+p2 = 0.0185 
+Est. FDR <= 0.00997 
+

Note that the estimated FDR is calculated based on the fitted L2N model. The default FDR threshold used by the edgefinder function is 0.01, and in this case, the empirical FDR is very close to the level set by the user. If the empirical FDR is too high, you may increase LOvals from its default value (30). This will result in larger (stricter) thresholds for determining significant correlations, and will decrease the proportion of false discoveries. The FDR threshold (the BHthr parameter) should be set according to the number of edges. In this example, the algorithm finds 80,332 edges, and an FDR of 0.01 means that we expect at most about 800 of the detected edges to be false discoveries. If this number of false edges is too large in the sense that it may affect our inference about the network structure, or a subsequent gene enrichment analysis, we could lower the FDR threshold.

+

The function graphComponents finds clusters of genes. To do that, it takes as input an adjacency (0/1) matrix (e.g. WTres$AdjMat in our example). To find clusters it first calculates a centrality for each node, using the formula (type*CC+1)*deg where deg is the degree of the node, and CC is its clustering coefficient. type is set by default to 1. When it is set to 0, the centrality measure is just the degree of the node. Setting type=1 means that we assign a higher value to nodes that not only have many neighbors, but whose neighbors are also highly interconnected. For example, suppose we have two components with k nodes, one has a star shape, and the other is a complete graph. With type=0 both graphs will get the same value, but with type=1 the complete graph will be picked by the algorithm first. You can also set a minimum centrality value (the parameter minCtr) to determine the smallest possible cluster size.

+

The function returns a data frame with the following information about each node: a label (e.g. gene name), degree, clustering coefficient, centrality measure, cluster number, iscenter (1 if the node was chosen as the cluster’s center, 0 otherwise), the number of edges from the node to nodes in the same cluster, the number of edges from the node to nodes NOT in the same cluster, and the standardized Manhattan distance to the central node in the cluster (in terms of the number of neighbors they do not have in common).

+
WTComp <- graphComponents(WTres$AdjMat)
+head(WTComp)
+
+  labels degree        cc      ctr clustNo iscenter intEdges extEdges  distCenter
+1      1    251 0.5999044 401.5760       1        0      187       64 0.072958888
+2      2      0 0.0000000   0.0000       0        0        0        0 0.000000000
+3      3    202 0.7217378 347.7910       1        0      164       38 0.072090330
+4      4    202 0.5819910 319.5622       4        0       98      104 0.008396063
+5      5      0 0.0000000   0.0000       0        0        0        0 0.000000000
+6      6      9 0.6944444  15.2500       0        0        0        0 0.000000000
+

The function summarizeClusters returns summary statistics about each cluster. It prints the number of nodes, edges, clusters and unclustered nodes to the screen, and returns a matrix with cluster number, number of nodes in the cluster, fivenum summary for the degrees of nodes in the cluster, and fivenum summary for the percentage of edges that are within the cluster.

+
summtab <- summarizeClusters(WTComp)
+head(summtab[,1:7])
+head(summtab[,c(1:2,8:12)])
+
+Num of nodes: 3454 
+Num of edges: 80332 
+Num of clusters: 72 
+Num of unclustered nodes: 1837 
+
+     Cluster Nodes degreeMin degreeQ25 degreeMedian degreeQ75 degreeMax
+[1,]       1   374        59     222.0          257     299.0       373
+[2,]       2    69        17      96.0          134     164.0       234
+[3,]       3    39         2      53.5           74     122.5       209
+[4,]       4   107        25     108.0          130     155.5       209
+[5,]       5    35        26      58.5           80     109.0       154
+[6,]       6    19        17      45.5           80     108.5       133
+
+
+     Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax
+[1,]       1   374   0.52073733   0.78536585       0.8452080    0.9083969    1.0000000
+[2,]       2    69   0.07109005   0.23952096       0.3061224    0.4226804    0.8235294
+[3,]       3    39   0.03571429   0.09923455       0.1358025    0.2197585    1.0000000
+[4,]       4   107   0.18750000   0.44693586       0.5555556    0.6298886    0.8529412
+[5,]       5    35   0.10344828   0.21717172       0.2777778    0.3584826    0.7692308
+[6,]       6    19   0.06666667   0.10270206       0.1262136    0.1594156    0.4210526
+
+

It can be seen, for example, that cluster 1 has 374 nodes, and most of them have many neighbors (more than 75% of them have at least 222 edges). This cluster is also very interconnected: at least 75% of the nodes are mostly connected within the cluster, with at least 79% of their edges being inside the cluster.

+

Next, we can visualize clusters using the plotCluster function. For example, to plot clusters 5 and 9 we use the following syntax:

+
plotCluster(WTres$AdjMat,5,WTComp)
+plotCluster(WTres$AdjMat,9,WTComp)
+

The central node is marked by a black circle. The radius of each point corresponds to its degree. The opacity corresponds to the percentage of edges from the node that is in the cluster (the darker it is, the larger the percentage of edges is within the cluster.) The distance from the center corresponds to the relative dissimilarity with the central node. This is computed as the number of neighbors the node and the central node do not have in common. For example, in cluster 9 (right plot) the dark shade of blue of all the nodes shows that the majority of edges connecting to these nodes are within the cluster. In contrast, the nodes in cluster 5 (left plot) have a larger percentage of their neighbors outside the cluster.

+

+

Indeed, when we look at the data

+
summtab[9,c(1:2,8:12)]
+        Cluster           Nodes    pctInClstMin    pctInClstQ25 pctInClstMedian    pctInClstQ75    pctInClstMax 
+      9.0000000     108.0000000       0.6857143       0.8768939       0.9301901       0.9657132       1.0000000 
+

We see that the cluster contains 108 nodes, and the smallest percentage of within-cluster edges is 68.6%, and for 75% of the nodes, the percentage is greater than 87.7%. This means that cluster 9 is highly inter-connected, and fairly isolated.

+

We can collapse the network data for more compact visualization by defining a subset in which clusters are represented by their central nodes. The function collapsedGraph returns an adjacency matrix which contains all the unclustered nodes, and the centers of the clusters. The elements in the matrix contain the total number of edges in the original graph. That is, the total count of edges between clusters i and j is stored in the matrix, rather than just 0/1. To convert it to a 0/1 adjacency matrix we can use the following:

+
Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0
+

We can use the igraph package to visualize the collapsed network. For example, the following code will produce a network graph containing all the clusters and unclustered nodes which have at least one neighbor.

+
library("igraph")
+inc <- which(Matrix::rowSums(Adj1) > 0)
+plot(graph.adjacency(Adj1[inc,inc], mode="undirected"),
+     vertex.label.cex=0.7, vertex.size=0.1, edge.color='lightgreen',asp=1)
+

If we want to show only the relationships between clusters, we use the following:

+
library("igraph")
+inc <- which(substr(rownames(Adj1),1,3) == "CLS")
+plot(graph.adjacency(Adj1[inc,inc], mode="undirected"),vertex.label.cex=0.7,
+vertex.size=0.1,edge.color='lightgreen', asp=1)
+

This gives the following graph, where it can be seen that cluster 9 is connected to clusters 8, 19, 20, 33, and 35.

+

+

If we want to create a subset of the original data by taking a representative from each cluster, we can do the following:

+
WTclustered <- WT[union(which(WTComp$iscenter == 1), which(WTComp$clustNo == 0)),]
+dim(WTclustered)
+[1] 1909   15
+

Other visualizations:

+

The plotDegCC function can be used to plot the degree of nodes versus the degree times the clustering coefficient of nodes. We can also highlight specific groups. For example, in the following code we highlight cluster 1, which, as we’ve seen before, is large (374 genes) and highly connected: 75% of the nodes have at least 222 neighbors, and most of the connections are within the cluster (75% of the nodes have at least 78.5% of their neighbors within the cluster).

+
plotDegCC(WTres,WTComp,highlightNodes = which(WTComp$clustNo==1))
+

+

The plotBitmapCC function is used to show the network as a 0/1 matrix, where a black dot corresponds to an edge in the graph. Setting orderByCluster=TRUE sorts the nodes by cluster. When it is set to FALSE, the original order of the nodes, as it appears in the gene expression file, is preserved. We can create the bitmap plot for nodes with degree greater than or equal to some threshold. For example, showMinDegree=30 will result in a plot which includes only nodes which have at least 30 neighbors.

+
plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30)
+

+

We repeat the same process with the duplication group. DUP is a matrix with 3454 rows (genes) and 12 columns (samples). We only show the collapsed cluster plot, and observe that unlike the WT group, the network in the DUP group consists of two “super-clusters”.

+
data("DUP")
+DUPres <- edgefinder(DUP, ttl = "Duplication")
+DUPComp <- graphComponents(DUPres$AdjMat)
+Adj2 <- collapsedGraph(DUPres$AdjMat, DUPComp) > 0
+inc <- which(substr(rownames(Adj2),1,3) == "CLS")
+plot(graph.adjacency(Adj2[inc,inc], mode="undirected"),vertex.label.cex=0.7,
+vertex.size=0.1,edge.color='lightgreen', asp=1)
+

+
+
+

Simulated data

+

The following example shows a simulated dataset with a hub structure, consisting of 1000 nodes and 50 hubs. The bitmap plot shows the network that was created by edgefinder.

+
library("huge")
+library("MASS")
+N=200; D=1000
+set.seed(23197)
+L = huge.generator(n = N, d = D, graph = "hub", g=50,  v = 0.3, u = 0.1)
+x = mvrnorm(N, rep(0, D), L$sigma)
+

Data generated like this is provided with the package in a dataset called SIM. We perform a similar analysis and display the bitmap plot, which shows that edgefinder finds the actual network structure overall, with almost no false discoveries. We also display the network of cluster 1, which shows that the cluster is how we expected it to be, with one central node with high degree (the hub gene), with interconnected neighbors each having a smaller degree than the hub gene. From the dark shade of blue for each node, we can infer that the nodes are connected within the cluster but have almost no edges to other clusters or nodes. The smallest percentage of edges within cluster 1 is 66.7%, and at least 75% of the nodes are connected only to nodes within the same cluster.

+
data(SIM)
+Sres <- edgefinder(SIM, ttl = "Simulation", BHthr=0.05)
+plotBitmapCC(Sres$AdjMat,orderByCluster=FALSE)
+SIMComp <- graphComponents(Sres$AdjMat)
+plotCluster(Sres$AdjMat,1,SIMComp)
+sumtab <- summarizeClusters(SIMComp)
+sumtab[1,c(1:2,8:12)]
+
+Cluster Nodes   pctInClstMin  pctInClstQ25 pctInClstMedian   pctInClstQ75  pctInClstMax 
+      1    20      0.6666667     1.0000000       1.0000000      1.0000000     1.0000000 
+
+

+
+ + + + + + + + + + + diff --git a/vignettes/edgefinder.md b/vignettes/edgefinder.md new file mode 100644 index 0000000..4f0e413 --- /dev/null +++ b/vignettes/edgefinder.md @@ -0,0 +1,329 @@ +--- +title: "edgefinder" +author: "Haim Bar" +date: "2020-07-29" +output: rmarkdown::pdf_document +vignette: > + %\VignetteIndexEntry{edgefinder} + %\VignetteEngine{knitr::knitr} + %\VignetteEncoding{UTF-8} +--- + + + +The edgefinder package is used to find edges in gene networks using co-expression +data. The input to the program is a normalized expression matrix, with genes (nodes) +in the rows, and samples in the columns. +The program calculates the pair-wise correlations, performs Fisher's Z +transformation, and fits the L2N model to the transformed data. L2N is a mixture +model with three components: the uncorrelated pairs belong to the null component +which is assumed to be normally distributed, and the correlated pairs belong to one +of the two non-null components which are assumed to follow lognormal distributions. + +Typical datasets consist of hundreds, or thousands of genes, and hence a very +large number of pairs. Therefore, edgefinder randomly selects a subset of the pairs (the +default number of pairs is 20,000), fits the L2N model to the subset, and calculates +the component probabilities for *all* possible pairs. +Using the posterior probabilities, edgefinder determines which pairs are +highly correlated while controlling the false discovery rate. +Note that edgefinder makes no assumptions about the structure of the network. + +The edgefinder package depends on the 'Matrix' package, to allow for efficient +storage and computation of large co-occurrence matrices. For simulating datasets +we used the 'huge' and 'MASS' packages, but they are not required when +using edgefinder. + +# Real data examples + +We use a publicly available dataset from +https://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS4430 +(Horev G, Ellegood J, Lerch JP, Son YE et al. 
Dosage-dependent phenotypes in models +of 16p11.2 lesions found in autism. Proc Natl Acad Sci U.S.A. 2011, Oct. +11;108(41):17076-81. PMID: 21969575). + +The dataset contains three groups: wild type (2 copies of 16p11.2), +deletion (1 copy), and duplication (3 copies). +We focus on a subset of 3,454, genes which were found to be differentially +expressed when comparing the wild-type and duplication groups (using an +FDR threshold of 0.05.) We fit the +L2N model to this set of genes in each group, and compare the properties +of the two networks. First, we load the wild-type data (WT). +WT is a matrix with 3454 rows (genes) and 15 columns (samples) from +the wild-type group. + + + +``` +library("edgefinder") +# Wild-type first: +data(WT) +WTres <- edgefinder(WT, ttl = "Wild Type") +shortSummary(WTres) +``` + +The **edgefinder** function fits the L2N model to the data, and plots the fitted mixture distribution: + +plot of chunk unnamed-chunk-1 + +The function **shortSummary** produces the following output: + +``` +No. nodes = 3,454 +Max no. edges = 5,963,331 +No. edges detected = 80,332 +p1 = 0.0942 +p2 = 0.0185 +Est. FDR <= 0.00997 +``` + +Note that the estimated FDR is calculated based on the fitted L2N model. +The default FDR threshold used by the edgefinder function is 0.01, and in this case, the +empirical FDR is very close to the level set by the user. If the empirical FDR is too +high, you may increase **LOvals** from its default value (30). This will result in larger +(stricter) thresholds for determining significant correlations, and will decrease the +proportion of false discoveries. +The FDR threshold (the **BHthr** parameter) should be set according to the number of edges. +In this example, the algorithm finds 80,332 edges, and an FDR of 0.01 means that +at most 800 of the detected edges may not be true discoveries. 
If this number of edges +is too large in the sense that it may affect our inference about the network structure, +or a subsequent gene enrichment analysis, we could lower the FDR threshold. + +The function **graphComponents** finds clusters of genes. To do that, it takes as input an +adjacency (0/1) matrix (e.g. WTres$AdjMat in our example.) To find clusters it first +calculates a centrality for each node, using the formula (type\*CC+1)\*deg where +deg is the degree of the node, and CC is its clustering coefficient. **type** is set +by default to 1. When it is set to 0, the centrality measure is just the degree of +the node. Setting type=1 means that we assign a higher value to nodes that not only have +many neighbors, but whose neighbors are also highly interconnected. For example, suppose we +have two components with k nodes, one has a star shape, and the other is a complete +graph. With type=0 both graphs will get the same value, but with type=1 the complete +graph will be picked by the algorithm first. +You can also set a minimum centrality value (the parameter **minCtr**) to determine the +smallest possible cluster size. + +The function returns a data frame with the following information about each node: +a label (e.g. gene name), degree, clustering coefficient, centrality measure, +cluster number, iscenter (1 if the node was chosen as the cluster's center, 0 otherwise), +the number of edges from the node to nodes in the same cluster, the number of edges +from the node to nodes NOT in the same cluster, and the standardized Manhattan distance +to the central node in the cluster (in terms of the number of neighbors they do not have +in common.)
+ +``` +WTComp <- graphComponents(WTres$AdjMat) +head(WTComp) + + labels degree cc ctr clustNo iscenter intEdges extEdges distCenter +1 1 251 0.5999044 401.5760 1 0 187 64 0.072958888 +2 2 0 0.0000000 0.0000 0 0 0 0 0.000000000 +3 3 202 0.7217378 347.7910 1 0 164 38 0.072090330 +4 4 202 0.5819910 319.5622 4 0 98 104 0.008396063 +5 5 0 0.0000000 0.0000 0 0 0 0 0.000000000 +6 6 9 0.6944444 15.2500 0 0 0 0 0.000000000 +``` + +The function **summarizeClusters** returns summary statistics about each cluster. +It prints the number of nodes, edges, clusters and unclustered nodes to the screen, +and returns a matrix with cluster number, number of nodes in the cluster, +fivenum summary for the degrees of nodes in the cluster, and fivenum summary for +the percentage of edges that are within the cluster. + +``` +summtab <- summarizeClusters(WTComp) +head(summtab[,1:7]) +head(summtab[,c(1:2,8:12)]) + +Num of nodes: 3454 +Num of edges: 80332 +Num of clusters: 72 +Num of unclustered nodes: 1837 + + Cluster Nodes degreeMin degreeQ25 degreeMedian degreeQ75 degreeMax +[1,] 1 374 59 222.0 257 299.0 373 +[2,] 2 69 17 96.0 134 164.0 234 +[3,] 3 39 2 53.5 74 122.5 209 +[4,] 4 107 25 108.0 130 155.5 209 +[5,] 5 35 26 58.5 80 109.0 154 +[6,] 6 19 17 45.5 80 108.5 133 + + + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax +[1,] 1 374 0.52073733 0.78536585 0.8452080 0.9083969 1.0000000 +[2,] 2 69 0.07109005 0.23952096 0.3061224 0.4226804 0.8235294 +[3,] 3 39 0.03571429 0.09923455 0.1358025 0.2197585 1.0000000 +[4,] 4 107 0.18750000 0.44693586 0.5555556 0.6298886 0.8529412 +[5,] 5 35 0.10344828 0.21717172 0.2777778 0.3584826 0.7692308 +[6,] 6 19 0.06666667 0.10270206 0.1262136 0.1594156 0.4210526 + +``` + +It can be seen, for example, that cluster 1 has 374 nodes, and most of them have many neighbors +(more than 75% of them have at least 222 edges), and this cluster is very interconnected (at least 75% +of the nodes are mostly connected within the cluster
with at least 78.5% of their edges being inside +the cluster.) + +Next, we can visualize clusters using the **plotCluster** function. For example, to plot +clusters 5 and 9 we use the following syntax: + +``` +plotCluster(WTres$AdjMat,5,WTComp) +plotCluster(WTres$AdjMat,9,WTComp) +``` + +The central node is marked by a black circle. The radius of each point corresponds +to its degree. The opacity corresponds to the percentage of edges from the node +that are in the cluster (the darker it is, the larger the percentage of edges is +within the cluster.) The distance from the center corresponds to the relative +dissimilarity with the central node. This is computed as the number of neighbors +the node and the central node do not have in common. +For example, in cluster 9 (right plot) the dark shade of blue of all the nodes +shows that the majority of edges connecting to these nodes are within the cluster. +In contrast, the nodes in cluster 5 (left) have a larger percentage of their neighbors outside the +cluster. + + +plot of chunk unnamed-chunk-2plot of chunk unnamed-chunk-2 + +Indeed, when we look at the data +``` +summtab[9,c(1:2,8:12)] + Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 9.0000000 108.0000000 0.6857143 0.8768939 0.9301901 0.9657132 1.0000000 +``` +We see that the cluster contains 108 nodes, and the smallest percentage of within-cluster +edges is 68.6%, and for 75% of the nodes, the percentage is greater than 87.7%. This means that +cluster 9 is highly inter-connected, and fairly isolated. + +We can collapse the network data for more compact visualization by defining +a subset in which clusters are represented by their central nodes. The function +**collapsedGraph** returns an adjacency matrix which contains all the unclustered +nodes, and the centers of the clusters. The elements in the matrix contain the +total number of edges in the original graph.
That is, the total count of edges +between clusters i and j is stored in the matrix, rather than just 0/1. To convert +it to a 0/1 adjacency matrix we can use the following: +``` +Adj1 <- collapsedGraph(WTres$AdjMat, WTComp) > 0 +``` + +We can use the **igraph** package to visualize the collapsed network. +For example, the following code will produce a network graph containing +all the clusters and unclustered nodes which have at least one neighbor. +``` +library("igraph") +inc <- which(Matrix::rowSums(Adj1) > 0) +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"), + vertex.label.cex=0.7, vertex.size=0.1, edge.color='lightgreen',asp=1) +``` + +If we want to show only the relationships between clusters, we use the following: +``` +library("igraph") +inc <- which(substr(rownames(Adj1),1,3) == "CLS") +plot(graph.adjacency(Adj1[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` +This gives the following graph, where it can be seen that cluster 9 is connected to +clusters 8, 19, 20, 33, and 35. + +plot of chunk unnamed-chunk-3 + +If we want to create a subset of the original data by taking a representative from each +cluster (together with all the unclustered nodes), we can do the following: + +``` +WTclustered <- WT[union(which(WTComp$iscenter == 1), which(WTComp$clustNo == 0)),] +dim(WTclustered) +[1] 1909 15 +``` + +*Other visualizations:* + +The **plotDegCC** function can be used to plot the degree of nodes versus the +degree times the clustering coefficient of nodes. We can also highlight specific groups. +For example, in the following code we highlight +cluster 1, which as we've seen before, is a large (374 genes) and highly connected cluster: +75% of the nodes have at least 222 neighbors, and most of the connections are within the cluster +(75% of the nodes have at least 78.5% of their neighbors within the cluster.)
+ +``` +plotDegCC(WTres,WTComp,highlightNodes = which(WTComp$clustNo==1)) +``` + +plot of chunk unnamed-chunk-4 + +The **plotBitmapCC** function is used to show the network as a 0/1 matrix, where a black +dot corresponds to an edge in the graph. Setting **orderByCluster=TRUE** is used to +sort the nodes by clusters. When set to FALSE, the original order +of the nodes, as it appears in the gene expression file, is preserved. +We can create the bitmap plot for nodes with degree greater than or equal to +some threshold. For example, **showMinDegree=30** will result in a plot which includes +only nodes which have at least 30 neighbors. + +``` +plotBitmapCC(WTres$AdjMat, WTComp, orderByCluster=TRUE, showMinDegree = 30) +``` + +plot of chunk unnamed-chunk-5 + +We repeat the same process with the duplication group. +DUP is a matrix with 3454 rows (genes) and 12 columns (samples). +We only show the collapsed cluster plot, and observe that unlike the WT group, +the network in the DUP group consists of two "super-clusters". + +``` +data("DUP") +DUPres <- edgefinder(DUP, ttl = "Duplication") +DUPComp <- graphComponents(DUPres$AdjMat) +Adj2 <- collapsedGraph(DUPres$AdjMat, DUPComp) > 0 +inc <- which(substr(rownames(Adj2),1,3) == "CLS") +plot(graph.adjacency(Adj2[inc,inc], mode="undirected"),vertex.label.cex=0.7, +vertex.size=0.1,edge.color='lightgreen', asp=1) +``` + + +plot of chunk unnamed-chunk-6 + + + + +# Simulated data + +The following example shows a simulated dataset with a hub structure, consisting +of 1000 nodes and 50 hubs. The bitmap plot shows the network that was created +by edgefinder. + +``` +library("huge") +library("MASS") +N=200; D=1000 +set.seed(23197) +L = huge.generator(n = N, d = D, graph = "hub", g=50, v = 0.3, u = 0.1) +x = mvrnorm(N, rep(0, D), L$sigma) +``` + +Data generated like this is provided with the package in a dataset called SIM.
We perform a similar analysis +and display the bitmap plot, which shows that edgefinder finds the actual network structure overall, with +almost no false discoveries. +We also display the network of cluster 1, which shows that the cluster is how we expected it to be, +with one central node with high degree (the hub gene), with interconnected neighbors each having a smaller degree +than the hub gene. From the dark shade of blue for each node, we can infer that the nodes are connected +within the cluster but have almost no edges to other clusters or nodes. +The smallest percentage of edges within cluster 1 is 66.7%, and at least 75% of the nodes are connected only to nodes within the same cluster. + +``` +data(SIM) +Sres <- edgefinder(SIM, ttl = "Simulation", BHthr=0.05) +plotBitmapCC(Sres$AdjMat,orderByCluster=FALSE) +SIMComp <- graphComponents(Sres$AdjMat) +plotCluster(Sres$AdjMat,1,SIMComp) +sumtab <- summarizeClusters(SIMComp) +sumtab[1,c(1:2,8:12)] + +Cluster Nodes pctInClstMin pctInClstQ25 pctInClstMedian pctInClstQ75 pctInClstMax + 1 20 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 + +``` + + +plot of chunk unnamed-chunk-7plot of chunk unnamed-chunk-7 + diff --git a/vignettes/edgefinder.pdf b/vignettes/edgefinder.pdf new file mode 100644 index 0000000..a24ed99 Binary files /dev/null and b/vignettes/edgefinder.pdf differ