From 0d339968a7c3f9ef56d28af2b6462cfae352f0e4 Mon Sep 17 00:00:00 2001 From: root Date: Sun, 15 Dec 2019 09:23:30 -0500 Subject: [PATCH] The first commit. --- .gitignore | 20 + algos.yaml | 700 ++++++++++++++++++ ann-bench.def | 10 + ann_benchmarks/__init__.py | 2 + ann_benchmarks/algorithms/__init__.py | 0 ann_benchmarks/algorithms/annoy.py | 24 + ann_benchmarks/algorithms/balltree.py | 21 + ann_benchmarks/algorithms/base.py | 27 + ann_benchmarks/algorithms/bruteforce.py | 98 +++ ann_benchmarks/algorithms/datasketch.py | 40 + ann_benchmarks/algorithms/definitions.py | 175 +++++ ann_benchmarks/algorithms/dolphinnpy.py | 30 + ann_benchmarks/algorithms/dummy_algo.py | 24 + ann_benchmarks/algorithms/faiss.py | 71 ++ ann_benchmarks/algorithms/faiss_gpu.py | 56 ++ ann_benchmarks/algorithms/faiss_hnsw.py | 36 + ann_benchmarks/algorithms/flann.py | 24 + ann_benchmarks/algorithms/hdidx.py | 30 + ann_benchmarks/algorithms/hnswlib.py | 33 + ann_benchmarks/algorithms/kdtree.py | 21 + ann_benchmarks/algorithms/kgraph.py | 37 + ann_benchmarks/algorithms/lshf.py | 22 + ann_benchmarks/algorithms/mrpt.py | 31 + ann_benchmarks/algorithms/nearpy.py | 63 ++ ann_benchmarks/algorithms/nmslib.py | 95 +++ ann_benchmarks/algorithms/nmslib_sparse.py | 95 +++ ann_benchmarks/algorithms/onng_ngt.py | 93 +++ ann_benchmarks/algorithms/panng_ngt.py | 66 ++ ann_benchmarks/algorithms/panns.py | 19 + ann_benchmarks/algorithms/pynndescent.py | 35 + ann_benchmarks/algorithms/risc.py | 83 +++ ann_benchmarks/algorithms/rpforest.py | 19 + ann_benchmarks/constants.py | 1 + ann_benchmarks/data.py | 36 + ann_benchmarks/datasets.py | 548 ++++++++++++++ ann_benchmarks/datasets_old.py | 480 ++++++++++++ ann_benchmarks/distance.py | 53 ++ ann_benchmarks/main.py | 198 +++++ ann_benchmarks/plotting/__init__.py | 2 + ann_benchmarks/plotting/metrics.py | 113 +++ ann_benchmarks/plotting/plot_variants.py | 12 + ann_benchmarks/plotting/utils.py | 115 +++ ann_benchmarks/results.py | 77 ++ ann_benchmarks/runner.py | 
305 ++++++++ create_dataset.py | 12 + create_website.py | 213 ++++++ data/.gitignore | 4 + docker-install/Dockerfile | 10 + docker-install/Dockerfile.annoy | 5 + docker-install/Dockerfile.datasketch | 4 + docker-install/Dockerfile.dolphinn | 5 + docker-install/Dockerfile.faiss | 12 + docker-install/Dockerfile.flann | 10 + docker-install/Dockerfile.hdidx | 18 + docker-install/Dockerfile.hnswlib | 10 + docker-install/Dockerfile.kgraph | 11 + docker-install/Dockerfile.mrpt | 6 + docker-install/Dockerfile.nearpy | 5 + docker-install/Dockerfile.ngt | 13 + docker-install/Dockerfile.nmslib | 16 + docker-install/Dockerfile.nmslib-sparse | 17 + docker-install/Dockerfile.panns | 10 + docker-install/Dockerfile.pynndescent | 6 + docker-install/Dockerfile.rpforest | 5 + docker-install/Dockerfile.sklearn | 4 + install.py | 63 ++ run.py | 4 + run.sh | 39 + run_algorithm.py | 3 + running.txt | 6 + singularity-install/ann-bench-datasketch.def | 14 + singularity-install/ann-bench-ngt.def | 22 + singularity-install/ann-bench-nmslib.def | 20 + singularity-install/ann-bench-nmslib3.def | 20 + singularity-install/ann-bench-pynndescent.def | 16 + singularity-install/ann-bench-risc.def | 16 + singularity-install/ann-bench-sklearn.def | 13 + singularity-install/ann-bench.def | 16 + singularity-install/requirements.txt | 10 + singularity-install/run_algorithm.py | 3 + templates/chartjs.template | 102 +++ templates/detail_page.html | 23 + templates/general.html | 58 ++ templates/latex.template | 30 + templates/summary.html | 60 ++ test/__init__.py | 0 test/test-metrics.py | 63 ++ 87 files changed, 5037 insertions(+) create mode 100644 .gitignore create mode 100644 algos.yaml create mode 100644 ann-bench.def create mode 100644 ann_benchmarks/__init__.py create mode 100644 ann_benchmarks/algorithms/__init__.py create mode 100644 ann_benchmarks/algorithms/annoy.py create mode 100644 ann_benchmarks/algorithms/balltree.py create mode 100644 ann_benchmarks/algorithms/base.py create mode 100644 
ann_benchmarks/algorithms/bruteforce.py create mode 100644 ann_benchmarks/algorithms/datasketch.py create mode 100644 ann_benchmarks/algorithms/definitions.py create mode 100644 ann_benchmarks/algorithms/dolphinnpy.py create mode 100644 ann_benchmarks/algorithms/dummy_algo.py create mode 100644 ann_benchmarks/algorithms/faiss.py create mode 100644 ann_benchmarks/algorithms/faiss_gpu.py create mode 100644 ann_benchmarks/algorithms/faiss_hnsw.py create mode 100644 ann_benchmarks/algorithms/flann.py create mode 100644 ann_benchmarks/algorithms/hdidx.py create mode 100644 ann_benchmarks/algorithms/hnswlib.py create mode 100644 ann_benchmarks/algorithms/kdtree.py create mode 100644 ann_benchmarks/algorithms/kgraph.py create mode 100644 ann_benchmarks/algorithms/lshf.py create mode 100644 ann_benchmarks/algorithms/mrpt.py create mode 100644 ann_benchmarks/algorithms/nearpy.py create mode 100644 ann_benchmarks/algorithms/nmslib.py create mode 100644 ann_benchmarks/algorithms/nmslib_sparse.py create mode 100644 ann_benchmarks/algorithms/onng_ngt.py create mode 100644 ann_benchmarks/algorithms/panng_ngt.py create mode 100644 ann_benchmarks/algorithms/panns.py create mode 100644 ann_benchmarks/algorithms/pynndescent.py create mode 100644 ann_benchmarks/algorithms/risc.py create mode 100644 ann_benchmarks/algorithms/rpforest.py create mode 100644 ann_benchmarks/constants.py create mode 100644 ann_benchmarks/data.py create mode 100644 ann_benchmarks/datasets.py create mode 100644 ann_benchmarks/datasets_old.py create mode 100644 ann_benchmarks/distance.py create mode 100644 ann_benchmarks/main.py create mode 100644 ann_benchmarks/plotting/__init__.py create mode 100644 ann_benchmarks/plotting/metrics.py create mode 100644 ann_benchmarks/plotting/plot_variants.py create mode 100644 ann_benchmarks/plotting/utils.py create mode 100644 ann_benchmarks/results.py create mode 100644 ann_benchmarks/runner.py create mode 100644 create_dataset.py create mode 100644 create_website.py 
create mode 100644 data/.gitignore create mode 100644 docker-install/Dockerfile create mode 100644 docker-install/Dockerfile.annoy create mode 100644 docker-install/Dockerfile.datasketch create mode 100644 docker-install/Dockerfile.dolphinn create mode 100644 docker-install/Dockerfile.faiss create mode 100644 docker-install/Dockerfile.flann create mode 100644 docker-install/Dockerfile.hdidx create mode 100644 docker-install/Dockerfile.hnswlib create mode 100644 docker-install/Dockerfile.kgraph create mode 100644 docker-install/Dockerfile.mrpt create mode 100644 docker-install/Dockerfile.nearpy create mode 100644 docker-install/Dockerfile.ngt create mode 100644 docker-install/Dockerfile.nmslib create mode 100644 docker-install/Dockerfile.nmslib-sparse create mode 100644 docker-install/Dockerfile.panns create mode 100644 docker-install/Dockerfile.pynndescent create mode 100644 docker-install/Dockerfile.rpforest create mode 100644 docker-install/Dockerfile.sklearn create mode 100644 install.py create mode 100644 run.py create mode 100644 run.sh create mode 100644 run_algorithm.py create mode 100644 running.txt create mode 100644 singularity-install/ann-bench-datasketch.def create mode 100644 singularity-install/ann-bench-ngt.def create mode 100644 singularity-install/ann-bench-nmslib.def create mode 100644 singularity-install/ann-bench-nmslib3.def create mode 100644 singularity-install/ann-bench-pynndescent.def create mode 100644 singularity-install/ann-bench-risc.def create mode 100644 singularity-install/ann-bench-sklearn.def create mode 100644 singularity-install/ann-bench.def create mode 100644 singularity-install/requirements.txt create mode 100644 singularity-install/run_algorithm.py create mode 100644 templates/chartjs.template create mode 100644 templates/detail_page.html create mode 100644 templates/general.html create mode 100644 templates/latex.template create mode 100644 templates/summary.html create mode 100644 test/__init__.py create mode 100644 
test/test-metrics.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8d62cf6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +*.pyc +*.o +protocol/c/fr-* + +install/*.txt +install/*.yaml +install/lib-*/ + +*.class + +cp-* +*.out +*.log +results +indexes +cpBuildingTime.py +algos_v1.yaml +README.md +note.txt + diff --git a/algos.yaml b/algos.yaml new file mode 100644 index 0000000..048fd6f --- /dev/null +++ b/algos.yaml @@ -0,0 +1,700 @@ +float: + any: + DolphinnPy: + disabled: true + docker-tag: ann-benchmarks-dolphinn # Docker tag + module: ann_benchmarks.algorithms.dolphinnpy # Python class + constructor: DolphinnPy # Python class name + run-groups: + base: + args: [[10, 50, 100, 200, 1000, 2000]] + faiss-lsh: + disabled: true + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss + constructor: FaissLSH + base-args: ["@metric"] + run-groups: + base: + # When @args is a list, the result is the Cartesian product of all of + # the things it contains; entries that aren't a list will be treated + # as lists of length one. + args: [[32, 64, 128, 256, 512, 1024, 2048, 4096]] + # This run group will produce eight algorithm instances: + # FaissLSH(32), FaissLSH(64), and so on up to FaissLSH(4096). 
+ faiss-ivf: + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss + constructor: FaissIVF + base-args: ["@metric"] + run-groups: + base: + args: [[32,64,128,256,512,1024,2048,4096,8192]] + query-args: [[1, 5, 10, 50, 100, 200]] + faiss-gpu: + disabled: true + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss_gpu + constructor: FaissGPU + run-groups: + base: + args: [[400, 1024, 4096, 8192, 16384], + [1, 10, 40, 100, 200]] + hnswlib: + docker-tag: ann-benchmarks-hnswlib + module: ann_benchmarks.algorithms.hnswlib + constructor: HnswLib + base-args: ["@metric"] + run-groups: + M-4: + arg-groups: + - {"M": 4, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-8: + arg-groups: + - {"M": 8, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-12: + arg-groups: + - {"M": 12, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-16: + arg-groups: + - {"M": 16, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-24: + arg-groups: + - {"M": 24, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-36: + arg-groups: + - {"M": 36, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-48: + arg-groups: + - {"M": 48, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-64: + arg-groups: + - {"M": 64, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-96: + arg-groups: + - {"M": 96, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + + hnsw(faiss): + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss_hnsw + constructor: FaissHNSW + base-args: ["@metric"] + run-groups: + M-4: + arg-groups: + - {"M": 4, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-8: + arg-groups: + - {"M": 8, 
"efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-12: + arg-groups: + - {"M": 12, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-16: + arg-groups: + - {"M": 16, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-24: + arg-groups: + - {"M": 24, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-36: + arg-groups: + - {"M": 36, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-48: + arg-groups: + - {"M": 48, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-64: + arg-groups: + - {"M": 64, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-96: + arg-groups: + - {"M": 96, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + + + flann: + docker-tag: ann-benchmarks-flann + module: ann_benchmarks.algorithms.flann + constructor: FLANN + base-args: ["@metric"] + run-groups: + flann: + args: [[0.2, 0.5, 0.7, 0.8, 0.9, 0.95, 0.97]] + panns: + disabled: true + docker-tag: ann-benchmarks-panns + module: ann_benchmarks.algorithms.panns + constructor: PANNS + base-args: ["@metric"] + run-groups: + five-trees: + args: [5, 20] + ten-trees: + args: [10, [10, 50]] + hundred-candidates: + args: [[10, 20, 40], 100] + annoy: + docker-tag: ann-benchmarks-annoy + module: ann_benchmarks.algorithms.annoy + constructor: Annoy + base-args: ["@metric"] + run-groups: + annoy: + args: [[100, 200, 400]] + query-args: [[100, 200, 400, 1000, 2000, 4000, 10000, 20000, 40000, + 100000, 200000, 400000]] + # This run group produces 3 algorithm instances -- Annoy("angular", + # 100), Annoy("angular", 200), and Annoy("angular", 400) -- each of + # which will be used to run 12 different queries. 
+ shidx: + docker-tag: ann-benchmarks-hdidx + module: ann_benchmarks.algorithms.hdidx + constructor: SHIdx + base-args: [] + run-groups: + shidx: + args: [[4, 8, 16, 32, 64, 128, 256]] + nearpy: + disabled: true + docker-tag: ann-benchmarks-nearpy + module: ann_benchmarks.algorithms.nearpy + constructor: NearPy + base-args: ["@metric"] + run-groups: + nearpy: + args: [[10, 12, 14, 16], [5, 10, 20, 40]] + extra: + args: [16, [5, 10, 15, 20, 25, 30, 40]] + bruteforce: + disabled: true + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.bruteforce + constructor: BruteForce + base-args: ["@metric"] + run-groups: + empty: + args: [] + bruteforce-blas: + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.bruteforce + constructor: BruteForceBLAS + base-args: ["@metric"] + run-groups: + empty: + args: [] + dummy-algo-st: + disabled: true + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.dummy_algo + constructor: DummyAlgoSt + base-args: ["@metric"] + run-groups: + empty: + args: [] + dummy-algo-mt: + disabled: true + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.dummy_algo + constructor: DummyAlgoMt + base-args: ["@metric"] + run-groups: + empty: + args: [] + ball: + disabled: true + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.balltree + constructor: BallTree + base-args: ["@metric"] + run-groups: + ball: + args: &treeargs [[10, 20, 40, 100, 200, 400, 1000]] + kd: + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.kdtree + constructor: KDTree + base-args: ["@metric"] + run-groups: + ball: + args: *treeargs + BallTree(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "vptree"] + run-groups: + base: + # When @args is a dictionary, algorithm instances will be generated + # by taking the Cartesian product of all of its values. 
+ arg-groups: + - {"tuneK": 10, "desiredRecall": [0.99, 0.97, 0.95, 0.9, 0.85, 0.8, + 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]} + - False + # This run group produces thirteen algorithm instances: + # NmslibNewIndex("angular", "vptree", {"tuneK": 10, + # "desiredRecall": 0.99}), NmslibNewIndex("angular", "vptree", + # {"tuneK": 10, "desiredRecall": 0.97}), and so on up to + # NmslibNewIndex("angular", "vptree", {"tuneK": 10, "desiredRecall": + # 0.1}). + pynndescent: + docker-tag: ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["@metric"] + run-groups: + pynndescent: + args: [[10, 20, 40, 80], [4, 8], [30]] + query-args: [[1.0, 2.0, 4.0, 8.0]] + NGT-panng: + docker-tag: ann-benchmarks-ngt + module: ann_benchmarks.algorithms.panng_ngt + constructor : PANNG + base-args : ["@metric", "Float"] + run-groups : + panng: + args : [{'edge': 20, 'pathadj': 40, 'searchedge': 60}] + query-args : [[0.6, 0.8, 0.9, 1.0, 1.02, 1.05, 1.1, 1.2]] + NGT-onng: + docker-tag: ann-benchmarks-ngt + module: ann_benchmarks.algorithms.onng_ngt + constructor : ONNG + base-args : ["@metric", "Float", 0.1] + run-groups : + onng: + args : [{'edge': 100, 'outdegree': 10, 'indegree': 120}] + query-args : [[0.6, 0.8, 0.9, 1.0, 1.02, 1.05, 1.1, 1.2]] + mrpt: + docker-tag: ann-benchmarks-mrpt + module: ann_benchmarks.algorithms.mrpt + constructor: MRPT + base-args: ["@metric"] + run-groups: + # See https://github.com/ejaasaari/mrpt-comparison/blob/master/parameters/gist.sh + mrpt: + args: [[5, 25, 100], [1, 2, 4, 8]] + query-args: [[1, 2, 4, 10, 20, 40, 100]] + euclidean: + kgraph: + docker-tag: ann-benchmarks-kgraph + module: ann_benchmarks.algorithms.kgraph + constructor: KGraph + base-args: ["@metric"] + run-groups: + kgraph: + args: [ {'reverse': -1}, True] # XXX: hard-codes save_index as True + query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] + hnsw(nmslib): + docker-tag: ann-benchmarks-nmslib + module: 
ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "hnsw"] + run-groups: + M-32: + # If a run group has an array called @arg-groups instead of one + # called @args, then every element in that array will be separately + # expanded before then taking the Cartesian product of all of those + # expansions. + # + # Yes, this is a bit of a hack, but some constructors are weird. + # (This one used to require that dictionaries be encoded as lists + # of environment variable-style strings -- ["M=32", "post=2", + # "efConstruction=400"] -- which didn't work with this at all...) + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 400} + - False + query-args: [[20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 200, + 300, 400]] + M-20: + arg-groups: + - {"M": 20, "post": 2, "efConstruction": 400} + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120, 200, 400]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 400} + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120]] + M-4: + arg-groups: + - {"M": 4, "post": 0, "efConstruction": 400} + - False + query-args: [[1, 2, 5, 10, 20, 30, 50, 70, 90, 120]] + M-8: + arg-groups: + - {"M": 8, "post": 0, "efConstruction": 400} + - False + query-args: [[1, 2, 5, 10, 20, 30, 50, 70, 90, 120, 160]] + SW-graph(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "sw-graph"] + run-groups: + NN-24: + arg-groups: + - {"NN": 24} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-16: + arg-groups: + - {"NN": 16} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-10: + arg-groups: + - {"NN": 10} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-5: + arg-groups: + - {"NN": 5} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + pynndescent: + docker-tag: 
ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["@metric"] + run-groups: + pynndescent: + args: [[5, 10, 20, 40, 80], [4, 8], [20]] + query-args: [[1.0, 1.5, 2.0, 4.0, 8.0]] + angular: + kgraph: + docker-tag: ann-benchmarks-kgraph + module: ann_benchmarks.algorithms.kgraph + constructor: KGraph + base-args: ["@metric"] + run-groups: + kgraph: + args: [{'reverse': -1, "K": 200, "L": 300, "S": 20}, False] + query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] + hnsw(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "hnsw"] + run-groups: + M-48: + arg-groups: + - {"M": 48, "post": 2, "efConstruction": 800} + - False + query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, + 1400, 1600, 2000]] + M-32: + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 800} + - False + query-args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 200, 300, 400, 600, 700, 800, 1000, 1200, 1400, 1600, 2000]] + M-20: + arg-groups: + - {"M": 20, "post": 0, "efConstruction": 800} + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 800} + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + SW-graph(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "sw-graph"] + run-groups: + NN-30: + arg-groups: + - {"NN": 30} + - False + query-args: [[700, 650, 550, 450, 350, 275, 200, 150, 120, 80, + 50, 30]] + NN-15: + arg-groups: + - {"NN": 15} + - False + query-args: [[80, 50, 30, 20]] + NN-3: + arg-groups: + - {"NN": 3} + - False + query-args: [[120, 80, 60, 40, 20, 10, 8, 4, 2]] + rpforest: + docker-tag: ann-benchmarks-rpforest + module: ann_benchmarks.algorithms.rpforest + constructor: RPForest + run-groups: + 
base: + args: [[3, 10, 40, 100, 400], + [3, 10, 40, 100, 400]] + pynndescent: + docker-tag: ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["@metric"] + run-groups: + pynndescent: + args: [[5, 10, 20, 40, 80, 160], [8], [40]] + query-args: [[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0]] +bit: + hamming: + kgraph: + docker-tag: ann-benchmarks-kgraph + module: ann_benchmarks.algorithms.kgraph + constructor: KGraph + base-args: ["euclidean"] + run-groups: + kgraph: +# args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], +# {'reverse': -1, "K": 200, "L": 300, "S": 20}, False] + args: [{'reverse': -1, "K": 200, "L": 300, "S": 20}, False] + query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] + hnsw(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["euclidean", "hnsw"] + run-groups: + M-48: + arg-groups: + - {"M": 48, "post": 2, "efConstruction": 800} + # Chunjiang added + - False + query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, + 1400, 1600, 2000]] + M-32: + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 800} + # Chunjiang added + - False + query-args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 200, 300, 400, 600, 700, 800, 1000, 1200, 1400, 1600, 2000]] + M-20: + arg-groups: + - {"M": 20, "post": 0, "efConstruction": 800} + # Chunjiang added + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 800} + # Chunjiang added + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + pynndescent: + docker-tag: ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["euclidean"] + run-groups: + pynndescent: + args: [[20, 40, 80, 160, 250], [4], [40]] + query-args: [[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0]] + annoy: + 
docker-tag: ann-benchmarks-annoy + module: ann_benchmarks.algorithms.annoy + constructor: Annoy + base-args: ["@metric"] + run-groups: + annoy: + args: [[100, 200, 400]] + query-args: [[100, 200, 400, 1000, 2000, 4000, 10000, 20000, 40000, + 100000, 200000, 400000]] + # This run group produces 3 algorithm instances -- Annoy("angular", + # 100), Annoy("angular", 200), and Annoy("angular", 400) -- each of + # which will be used to run 12 different queries. + faiss-ivf: + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss + constructor: FaissIVF + base-args: ["euclidean"] + run-groups: + base: + args: [[32,64,128,256,512,1024,2048,4096,8192]] + query-args: [[1, 5, 10, 50, 100, 200]] + jaccard: + Bruteforce: + disabled: false + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.bruteforce + constructor: BruteForceBLAS + base-args: ["@metric"] + run-groups: + base: + args: {} + Balltree(Sklearn): + disabled: false + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.balltree + constructor: BallTree + base-args: ["@metric"] + run-groups: + ball: + args: [[1, 10, 20, 40, 100, 200, 400, 1000]] + VPtree(Nmslib): + disabled: false + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "vptree"] + run-groups: + base: + # When @args is a dictionary, algorithm instances will be generated + # by taking the Cartesian product of all of its values. 
+ arg-groups: + - {"tuneK": 10, "desiredRecall": [0.999, 0.997, 0.995, 0.99, 0.97, 0.95, 0.9, 0.85, 0.8, + 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.02, 0.01]} + - False + Datasketch: + disabled: false + docker-tag: ann-benchmarks-datasketch + module: ann_benchmarks.algorithms.datasketch + constructor: DataSketch + base-args: ["@metric"] + run-groups: + base: + args: [[32, 64, 128, 256, 512, 1024, 2048],[10, 30, 50, 70]] + Hnsw(Nmslib): + disabled: false + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "hnsw"] + run-groups: + M-48: + arg-groups: + - {"M": 48, "post": 2, "efConstruction": 800} + - False + query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, + 1400, 1600, 2000]] + M-32: + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 800} + - False + query-args: [[100, 300, 500, 700, 1000, 1500, 2000]] + M-20: + arg-groups: + - {"M": 20, "post": 0, "efConstruction": 800} + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 800} + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-5: + arg-groups: + - {"M": 5, "post": 0, "efConstruction": 10} + - False + query-args: [[1, 2, 5, 10]] + M-2: + arg-groups: + - {"M": 2, "post": 0, "efConstruction": 1} + - False + query-args: [[1, 2]] + SW-graph(Nmslib): + disabled: false + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "sw-graph"] + run-groups: + NN-48: + arg-groups: + - {"NN": 48} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-24: + arg-groups: + - {"NN": 24} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-16: + arg-groups: + - {"NN": 16} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-10: + arg-groups: + - {"NN": 10} + - False + query-args: 
[[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-5: + arg-groups: + - {"NN": 5} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + NN-2: + arg-groups: + - {"NN": 2} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + NN-1: + arg-groups: + - {"NN": 1} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + Pynndescent: + disabled: false + docker-tag: ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["@metric"] + run-groups: + pynndescent: + args: [[2, 5, 10, 20, 40, 80, 120, 160], [2, 4, 8], [30]] + query-args: [[1.0, 2.0, 4.0, 8.0]] + Onng(Ngt): + disabled: false + docker-tag: ann-benchmarks-ngt + module: ann_benchmarks.algorithms.onng_ngt + constructor: ONNG + base-args: ["@metric", "Byte", 1.0] + run-groups: + onng: + args: [[100, 300, 500, 1000], [10, 30, 50, 100], [10, 30, 50, 120]] + query-args: [[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]] + Panng(Ngt): + disabled: false + docker-tag: ann-benchmarks-ngt + module: ann_benchmarks.algorithms.panng_ngt + constructor: PANNG + base-args: ["@metric", "Byte"] + run-groups: + panng: + args: [[10, 20, 40], [40], [30, 60, 120]] + query-args: [[0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 1.0, 1.02, 1.05, 1.1, 1.2, 1.5, 2.0]] + Risc: + disabled: false + docker-tag: ann-benchmarks-risc + module: ann_benchmarks.algorithms.risc + constructor: Risc + base-args: ["@metric", "Risc"] + run-groups: + empty: + args: [] + + DivideSkip: + disabled: false + docker-tag: ann-benchmarks-risc + module: ann_benchmarks.algorithms.risc + constructor: Risc + base-args: ["@metric", "DivideSkip"] + run-groups: + empty: + args: [] diff --git a/ann-bench.def b/ann-bench.def new file mode 100644 index 0000000..e2d3136 --- /dev/null +++ b/ann-bench.def @@ -0,0 +1,10 @@ +Bootstrap: library +From: ubuntu:16.04 + +%post + apt-get update && apt-get install -y python3-numpy python3-scipy python3-pip build-essential git + pip3 install -r 
requirements.txt + python3 install.py + +%runscript + python3 run.py diff --git a/ann_benchmarks/__init__.py b/ann_benchmarks/__init__.py new file mode 100644 index 0000000..75db8ab --- /dev/null +++ b/ann_benchmarks/__init__.py @@ -0,0 +1,2 @@ +from __future__ import absolute_import +# from ann_benchmarks.main import * diff --git a/ann_benchmarks/algorithms/__init__.py b/ann_benchmarks/algorithms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ann_benchmarks/algorithms/annoy.py b/ann_benchmarks/algorithms/annoy.py new file mode 100644 index 0000000..7f23bbf --- /dev/null +++ b/ann_benchmarks/algorithms/annoy.py @@ -0,0 +1,24 @@ +from __future__ import absolute_import +import annoy +from ann_benchmarks.algorithms.base import BaseANN + +class Annoy(BaseANN): + def __init__(self, metric, n_trees): + self._n_trees = n_trees + self._search_k = None + self._metric = metric + + def fit(self, X): + self._annoy = annoy.AnnoyIndex(X.shape[1], metric=self._metric) + for i, x in enumerate(X): + self._annoy.add_item(i, x.tolist()) + self._annoy.build(self._n_trees) + + def set_query_arguments(self, search_k): + self._search_k = search_k + + def query(self, v, n): + return self._annoy.get_nns_by_vector(v.tolist(), n, self._search_k) + + def __str__(self): + return 'Annoy(n_trees=%d, search_k=%d)' % (self._n_trees, self._search_k) diff --git a/ann_benchmarks/algorithms/balltree.py b/ann_benchmarks/algorithms/balltree.py new file mode 100644 index 0000000..fd60d22 --- /dev/null +++ b/ann_benchmarks/algorithms/balltree.py @@ -0,0 +1,21 @@ +from __future__ import absolute_import +import sklearn.neighbors +import sklearn.preprocessing +from ann_benchmarks.algorithms.base import BaseANN + +class BallTree(BaseANN): + def __init__(self, metric, leaf_size=20): + self._leaf_size = leaf_size + self._metric = metric + self.name = 'BallTree(leaf_size=%d)' % self._leaf_size + + def fit(self, X): + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, 
axis=1, norm='l2') + self._tree = sklearn.neighbors.BallTree(X, leaf_size=self._leaf_size, metric=self._metric) + + def query(self, v, n): + if self._metric == 'angular': + v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + dist, ind = self._tree.query([v], k=n) + return ind[0] diff --git a/ann_benchmarks/algorithms/base.py b/ann_benchmarks/algorithms/base.py new file mode 100644 index 0000000..288564a --- /dev/null +++ b/ann_benchmarks/algorithms/base.py @@ -0,0 +1,27 @@ +from __future__ import absolute_import +import psutil + +class BaseANN(object): + def done(self): + pass + + def get_index_size(self, process): + """Returns the size of the index in kB or -1 if not implemented.""" + return psutil.Process().memory_info().rss / 1024 # return in kB for backwards compatibility + + def fit(self, X): + pass + + def query(self, q, n): + return [] # array of candidate indices + + def batch_query(self, X, n): + self.res = [] + for q in X: + self.res.append(self.query(q, n)) + + def get_batch_results(self): + return self.res + + def __str__(self): + return self.name diff --git a/ann_benchmarks/algorithms/bruteforce.py b/ann_benchmarks/algorithms/bruteforce.py new file mode 100644 index 0000000..afebcb6 --- /dev/null +++ b/ann_benchmarks/algorithms/bruteforce.py @@ -0,0 +1,98 @@ +from __future__ import absolute_import +import numpy +import sklearn.neighbors +from ann_benchmarks.distance import metrics as pd +from ann_benchmarks.algorithms.base import BaseANN +from scipy.sparse import issparse + +class BruteForce(BaseANN): + def __init__(self, metric): + if metric not in ('angular', 'euclidean', 'hamming', 'jaccard'): + raise NotImplementedError("BruteForce doesn't support metric %s" % metric) + self._metric = metric + self.name = 'BruteForce()' + + def fit(self, X): + metric = {'angular': 'cosine', 'euclidean': 'l2', 'hamming': 'hamming', 'jaccard' : 'jaccard'}[self._metric] + self._nbrs = sklearn.neighbors.NearestNeighbors(algorithm='brute', metric=metric) + 
self._nbrs.fit(X) + + def query(self, v, n): + return list(self._nbrs.kneighbors([v], + return_distance = False, n_neighbors = n)[0]) + + def query_with_distances(self, v, n): + (distances, positions) = self._nbrs.kneighbors([v], + return_distance = True, n_neighbors = n) + return zip(list(positions[0]), list(distances[0])) + + +class BruteForceBLAS(BaseANN): + """kNN search that uses a linear scan = brute force.""" + def __init__(self, metric, precision=numpy.float32): + if metric not in ('angular', 'euclidean', 'hamming', 'jaccard'): + raise NotImplementedError("BruteForceBLAS doesn't support metric %s" % metric) + elif metric == 'hamming' and precision != numpy.bool: + raise NotImplementedError("BruteForceBLAS doesn't support precision %s with Hamming distances" % precision) + self._metric = metric + self._precision = precision + self.name = 'BruteForceBLAS()' + + def fit(self, X): + """Initialize the search index.""" + if self._metric == 'angular': + lens = (X ** 2).sum(-1) # precompute (squared) length of each vector + X /= numpy.sqrt(lens)[..., numpy.newaxis] # normalize index vectors to unit length + self.index = numpy.ascontiguousarray(X, dtype=self._precision) + elif self._metric == 'hamming': + # Regarding bitvectors as vectors in l_2 is faster for blas + X = X.astype(numpy.float32) + lens = (X ** 2).sum(-1) # precompute (squared) length of each vector + self.index = numpy.ascontiguousarray(X, dtype=numpy.float32) + self.lengths = numpy.ascontiguousarray(lens, dtype=numpy.float32) + elif self._metric == 'euclidean': + lens = (X ** 2).sum(-1) # precompute (squared) length of each vector + self.index = numpy.ascontiguousarray(X, dtype=self._precision) + self.lengths = numpy.ascontiguousarray(lens, dtype=self._precision) + elif self._metric == 'jaccard': + self.index = X + else: + assert False, "invalid metric" # shouldn't get past the constructor! 
+ + def query(self, v, n): + return [index for index, _ in self.query_with_distances(v, n)] + + def query_with_distances(self, v, n): + """Find indices of `n` most similar vectors from the index to query vector `v`.""" + + if self._metric != 'jaccard': + # use same precision for query as for index + v = numpy.ascontiguousarray(v, dtype = self.index.dtype) + + # HACK we ignore query length as that's a constant not affecting the final ordering + if self._metric == 'angular': + # argmax_a cossim(a, b) = argmax_a dot(a, b) / |a||b| = argmin_a -dot(a, b) + dists = -numpy.dot(self.index, v) + elif self._metric == 'euclidean': + # argmin_a (a - b)^2 = argmin_a a^2 - 2ab + b^2 = argmin_a a^2 - 2ab + dists = self.lengths - 2 * numpy.dot(self.index, v) + elif self._metric == 'hamming': + # Just compute hamming distance using euclidean distance + dists = self.lengths - 2 * numpy.dot(self.index, v) + elif self._metric == 'jaccard': + if issparse(self.index): + dists = [pd[self._metric]['distance'](v, e.toarray()[0]) for e in self.index] + else: + dists = [pd[self._metric]['distance'](v, e) for e in self.index] + else: + assert False, "invalid metric" # shouldn't get past the constructor! 
from __future__ import absolute_import
from datasketch import MinHashLSHForest, MinHash
from ann_benchmarks.algorithms.base import BaseANN

class DataSketch(BaseANN):
    """MinHash LSH Forest over binary (0/1) vectors; Jaccard similarity only."""

    def __init__(self, metric, n_perm, n_rep):
        # BUG FIX: the original test was `metric not in ('jaccard')` —
        # ('jaccard') is just the string 'jaccard', so this performed a
        # substring test (e.g. metric='jac' was accepted). Use a real tuple.
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm  # number of MinHash permutations
        self._n_rep = n_rep    # number of prefix trees in the forest
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        """Index every row of the 2D 0/1 array X, minhashing the set of
        column positions whose value is 1 (modified from the original
        list-of-integer-lists implementation)."""
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i in range(len(X)):
            m = MinHash(num_perm=self._n_perm)
            for j in range(len(X[i])):
                if X[i][j] == 1:
                    m.update(str(j).encode('utf8'))
            self._index.add(str(i), m)
        # the forest must be (re-)indexed after all additions
        self._index.index()

    def query(self, v, n):
        """Return up to n candidate row ids (as ints) for the 0/1 query vector v."""
        m = MinHash(num_perm=self._n_perm)
        for j in range(len(v)):
            if v[j] == 1:
                m.update(str(j).encode('utf8'))
        return map(int, self._index.query(m, n))
100644 index 0000000..7906eae --- /dev/null +++ b/ann_benchmarks/algorithms/definitions.py @@ -0,0 +1,175 @@ +from __future__ import absolute_import +from os import sep as pathsep +import collections +import importlib +import os +import sys +import traceback +import yaml +from enum import Enum +from itertools import product + + +Definition = collections.namedtuple('Definition', ['algorithm', 'constructor', 'module', 'docker_tag', 'arguments', 'query_argument_groups', 'disabled']) + +def get_algorithm_name(name, batch): + if batch: + return name + "-batch" + return name + + +def instantiate_algorithm(definition): + print('Trying to instantiate %s.%s(%s)' % (definition.module, definition.constructor, definition.arguments)) + # special code for Risc + if "Risc" in definition.algorithm: + import sys + sys.path.append('/home/app/risc/Code') + module = importlib.import_module(definition.module) + constructor = getattr(module, definition.constructor) + return constructor(*definition.arguments) + + +class InstantiationStatus(Enum): + AVAILABLE = 0 + NO_CONSTRUCTOR = 1 + NO_MODULE = 2 + + +def algorithm_status(definition): + try: + module = importlib.import_module(definition.module) + if hasattr(module, definition.constructor): + return InstantiationStatus.AVAILABLE + else: + return InstantiationStatus.NO_CONSTRUCTOR + except ImportError: + return InstantiationStatus.NO_MODULE + +def _generate_combinations(args): + if isinstance(args, list): + args = [el if isinstance(el, list) else [el] for el in args] + return [list(x) for x in product(*args)] + elif isinstance(args, dict): + flat = [] + for k, v in args.items(): + if isinstance(v, list): + flat.append([(k, el) for el in v]) + else: + flat.append([(k, v)]) + return [dict(x) for x in product(*flat)] + else: + raise TypeError("No args handling exists for %s" % type(args).__name__) + + +def _substitute_variables(arg, vs): + if isinstance(arg, dict): + return dict([(k, _substitute_variables(v, vs)) for k, v in arg.items()]) + 
elif isinstance(arg, list): + return [_substitute_variables(a, vs) for a in arg] + elif isinstance(arg, str) and arg in vs: + return vs[arg] + else: + return arg + + +def _get_definitions(definition_file): + with open(definition_file, "r") as f: + return yaml.load(f) + + +def list_algorithms(definition_file): + definitions = _get_definitions(definition_file) + + print('The following algorithms are supported...') + for point in definitions: + print('\t... for the point type "%s"...' % point) + for metric in definitions[point]: + print('\t\t... and the distance metric "%s":' % metric) + for algorithm in definitions[point][metric]: + print('\t\t\t%s' % algorithm) + + +def get_unique_algorithms(definition_file): + definitions = _get_definitions(definition_file) + algos = set() + for point in definitions: + for metric in definitions[point]: + for algorithm in definitions[point][metric]: + algos.add(algorithm) + return list(sorted(algos)) + + +def get_definitions(definition_file, dimension, point_type="float", distance_metric="euclidean", count=10): + definitions = _get_definitions(definition_file) + + algorithm_definitions = {} + if "any" in definitions[point_type]: + algorithm_definitions.update(definitions[point_type]["any"]) + algorithm_definitions.update(definitions[point_type][distance_metric]) + + definitions = [] + for (name, algo) in algorithm_definitions.items(): + for k in ['docker-tag', 'module', 'constructor']: + if k not in algo: + raise Exception('algorithm %s does not define a "%s" property' % (name, k)) + + base_args = [] + if "base-args" in algo: + base_args = algo["base-args"] + + for run_group in algo["run-groups"].values(): + if "arg-groups" in run_group: + groups = [] + for arg_group in run_group["arg-groups"]: + if isinstance(arg_group, dict): + # Dictionaries need to be expanded into lists in order + # for the subsequent call to _generate_combinations to + # do the right thing + groups.append(_generate_combinations(arg_group)) + else: + 
groups.append(arg_group) + args = _generate_combinations(groups) + elif "args" in run_group: + args = _generate_combinations(run_group["args"]) + else: + assert False, "? what? %s" % run_group + + if "query-arg-groups" in run_group: + groups = [] + for arg_group in run_group["query-arg-groups"]: + if isinstance(arg_group, dict): + groups.append(_generate_combinations(arg_group)) + else: + groups.append(arg_group) + query_args = _generate_combinations(groups) + elif "query-args" in run_group: + query_args = _generate_combinations(run_group["query-args"]) + else: + query_args = [] + + for arg_group in args: + obj = None + aargs = [] + aargs.extend(base_args) + if isinstance(arg_group, list): + aargs.extend(arg_group) + else: + aargs.append(arg_group) + + vs = { + "@count": count, + "@metric": distance_metric, + "@dimension": dimension + } + aargs = [_substitute_variables(arg, vs) for arg in aargs] + definitions.append(Definition( + algorithm=name, + docker_tag=algo['docker-tag'], + module=algo['module'], + constructor=algo['constructor'], + arguments=aargs, + query_argument_groups=query_args, + disabled=algo.get('disabled', False) + )) + + return definitions diff --git a/ann_benchmarks/algorithms/dolphinnpy.py b/ann_benchmarks/algorithms/dolphinnpy.py new file mode 100644 index 0000000..1090cd0 --- /dev/null +++ b/ann_benchmarks/algorithms/dolphinnpy.py @@ -0,0 +1,30 @@ +from __future__ import absolute_import +import sys +sys.path.append("install/lib-dolphinnpy") +import numpy +import ctypes +from dolphinn import Dolphinn +from utils import findmean, isotropize +from ann_benchmarks.algorithms.base import BaseANN + +class DolphinnPy(BaseANN): + def __init__(self, num_probes): + self.name = 'Dolphinn(num_probes={} )'.format(num_probes) + self.num_probes = num_probes + self.m = 1 + self._index = None + + def fit(self, X): + if X.dtype != numpy.float32: + X = numpy.array(X, dtype=numpy.float32) + d = X.shape[1] + self.m = findmean(X, d, 10) + X = isotropize(X, d, self.m) 
+ hypercube_dim = int(numpy.log2(len(X))) - 2 + self._index = Dolphinn(X, d, hypercube_dim) + + def query(self, v, n): + q = numpy.array([v]) + q = isotropize(q, len(v), self.m) + res = self._index.queries(q, n, self.num_probes) + return res[0] diff --git a/ann_benchmarks/algorithms/dummy_algo.py b/ann_benchmarks/algorithms/dummy_algo.py new file mode 100644 index 0000000..0682b03 --- /dev/null +++ b/ann_benchmarks/algorithms/dummy_algo.py @@ -0,0 +1,24 @@ +from __future__ import absolute_import +import numpy as np +from ann_benchmarks.algorithms.base import BaseANN + +class DummyAlgoMt(BaseANN): + def __init__(self, metric): + self.name = 'DummyAlgoMultiThread' + + def fit(self, X): + self.len=len(X)-1 + + def query(self, v, n): + return np.random.randint(self.len, size=n) + + +class DummyAlgoSt(BaseANN): + def __init__(self, metric): + self.name = 'DummyAlgoSingleThread' + + def fit(self, X): + self.len=len(X)-1 + + def query(self, v, n): + return np.random.randint(self.len, size=n) diff --git a/ann_benchmarks/algorithms/faiss.py b/ann_benchmarks/algorithms/faiss.py new file mode 100644 index 0000000..2f41986 --- /dev/null +++ b/ann_benchmarks/algorithms/faiss.py @@ -0,0 +1,71 @@ +from __future__ import absolute_import +import sys +sys.path.append("install/lib-faiss") +import numpy +import sklearn.preprocessing +import ctypes +import faiss +from ann_benchmarks.algorithms.base import BaseANN + +class Faiss(BaseANN): + def query(self, v, n): + if self._metric == 'angular': + v /= numpy.linalg.norm(v) + D, I = self.index.search(numpy.expand_dims(v,axis=0).astype(numpy.float32), n) + return I[0] + + def batch_query(self, X, n): + if self._metric == 'angular': + X /= numpy.linalg.norm(X) + self.res = self.index.search(X.astype(numpy.float32), n) + + def get_batch_results(self): + D, L = self.res + res = [] + for i in range(len(D)): + r = [] + for l, d in zip(L[i], D[i]): + if l != -1: + r.append(l) + res.append(r) + return res + +class FaissLSH(Faiss): + def 
class FaissIVF(Faiss):
    """Faiss inverted-file (IVF) index with a flat L2 coarse quantizer."""

    def __init__(self, metric, n_list):
        self._metric = metric
        self._n_list = n_list

    def fit(self, X):
        """Train and populate an IVF-flat index over X (L2-normalized first
        when the metric is angular, so L2 search matches cosine ranking)."""
        if self._metric == 'angular':
            X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
        if X.dtype != numpy.float32:
            X = X.astype(numpy.float32)

        dim = X.shape[1]
        self.quantizer = faiss.IndexFlatL2(dim)
        ivf = faiss.IndexIVFFlat(self.quantizer, dim, self._n_list, faiss.METRIC_L2)
        ivf.train(X)
        ivf.add(X)
        self.index = ivf

    def set_query_arguments(self, n_probe):
        """Choose how many inverted lists are scanned per query."""
        self._n_probe = n_probe
        self.index.nprobe = self._n_probe

    def __str__(self):
        return 'FaissIVF(n_list=%d, n_probe=%d)' % (self._n_list, self._n_probe)
# Implementation based on
# https://github.com/facebookresearch/faiss/blob/master/benchs/bench_gpu_sift1m.py
class FaissGPU(BaseANN):
    """GPU IVF-flat index using faiss's StandardGpuResources."""

    def __init__(self, n_bits, n_probes):
        self.name = 'FaissGPU(n_bits={}, n_probes={})'.format(n_bits, n_probes)
        self._n_bits = n_bits
        self._n_probes = n_probes
        self._res = faiss.StandardGpuResources()
        self._index = None

    def fit(self, X):
        """Train and fill a GpuIndexIVFFlat over X (L2 metric)."""
        X = X.astype(numpy.float32)
        self._index = faiss.GpuIndexIVFFlat(self._res, len(X[0]), self._n_bits,
                                            faiss.METRIC_L2)
        self._index.train(X)
        self._index.add(X)
        self._index.setNumProbes(self._n_probes)

    def query(self, v, n):
        return [label for label, _ in self.query_with_distances(v, n)]

    def query_with_distances(self, v, n):
        """Return (label, distance) pairs for the n nearest neighbours of v."""
        q = v.astype(numpy.float32).reshape(1, -1)
        distances, labels = self._index.search(q, n)
        # faiss pads missing neighbours with label -1; drop those entries
        return [(l, d) for l, d in zip(labels[0], distances[0]) if l != -1]

    def batch_query(self, X, n):
        self.res = self._index.search(X.astype(numpy.float32), n)

    def get_batch_results(self):
        """Strip the -1 padding labels from the last batch_query's results."""
        D, L = self.res
        return [[l for l in row if l != -1] for row in L]
class FLANN(BaseANN):
    """Wrapper around pyflann's autotuned index."""

    def __init__(self, metric, target_precision):
        self._target_precision = target_precision
        self._metric = metric
        self.name = 'FLANN(target_precision=%f)' % self._target_precision

    def fit(self, X):
        """Build an autotuned FLANN index (inputs L2-normalized for angular)."""
        self._flann = pyflann.FLANN(target_precision=self._target_precision,
                                    algorithm='autotuned', log_level='info')
        if self._metric == 'angular':
            X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
        self._flann.build_index(X)

    def query(self, v, n):
        """Return the indices of the n nearest neighbours of v."""
        q = v
        if self._metric == 'angular':
            q = sklearn.preprocessing.normalize([q], axis=1, norm='l2')[0]
        if q.dtype != numpy.float32:
            q = q.astype(numpy.float32)
        result = self._flann.nn_index(q, n)
        return result[0][0]
class HnswLib(BaseANN):
    """hnswlib HNSW graph index (cosine or L2 space)."""

    def __init__(self, metric, method_param):
        self.metric = {'angular': 'cosine', 'euclidean': 'l2'}[metric]
        self.method_param = method_param
        self.name = 'hnswlib (%s)' % (self.method_param)

    def fit(self, X):
        """Build the HNSW graph over X with sequential integer labels."""
        dim = len(X[0])
        self.p = hnswlib.Index(space=self.metric, dim=dim)
        self.p.init_index(max_elements=len(X),
                          ef_construction=self.method_param["efConstruction"],
                          M=self.method_param["M"])
        self.p.add_items(np.asarray(X), np.arange(len(X)))
        # single-threaded queries for fair benchmarking
        self.p.set_num_threads(1)

    def set_query_arguments(self, ef):
        """Set the search-time ef parameter (size of the dynamic candidate list)."""
        self.p.set_ef(ef)

    def query(self, v, n):
        """Return the labels of the n nearest neighbours of v."""
        labels, _ = self.p.knn_query(np.expand_dims(v, axis=0), k=n)
        return labels[0]

    def freeIndex(self):
        del self.p
= metric + self.name = 'KDTree(leaf_size=%d)' % self._leaf_size + + def fit(self, X): + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + self._tree = sklearn.neighbors.KDTree(X, leaf_size=self._leaf_size) + + def query(self, v, n): + if self._metric == 'angular': + v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + dist, ind = self._tree.query([v], k=n) + return ind[0] diff --git a/ann_benchmarks/algorithms/kgraph.py b/ann_benchmarks/algorithms/kgraph.py new file mode 100644 index 0000000..ed8db39 --- /dev/null +++ b/ann_benchmarks/algorithms/kgraph.py @@ -0,0 +1,37 @@ +from __future__ import absolute_import +import os +import numpy +import pykgraph +from ann_benchmarks.constants import INDEX_DIR +from ann_benchmarks.algorithms.base import BaseANN + +class KGraph(BaseANN): + def __init__(self, metric, index_params, save_index): + if type(metric) == unicode: + metric = str(metric) + self.name = 'KGraph(%s)' % (metric) + self._metric = metric + self._index_params = index_params + self._save_index = save_index + + def fit(self, X): + if X.dtype != numpy.float32: + X = X.astype(numpy.float32) + self._kgraph = pykgraph.KGraph(X, self._metric) + path = os.path.join(INDEX_DIR, 'kgraph-index-%s' % self._metric) + if os.path.exists(path): + self._kgraph.load(path) + else: + self._kgraph.build(**self._index_params) #iterations=30, L=100, delta=0.002, recall=0.99, K=25) + if not os.path.exists(INDEX_DIR): + os.makedirs(INDEX_DIR) + self._kgraph.save(path) + + def set_query_arguments(self, P): + self._P = P + + def query(self, v, n): + if v.dtype != numpy.float32: + v = v.astype(numpy.float32) + result = self._kgraph.search(numpy.array([v]), K=n, threads=1, P=self._P) + return result[0] diff --git a/ann_benchmarks/algorithms/lshf.py b/ann_benchmarks/algorithms/lshf.py new file mode 100644 index 0000000..1854e7f --- /dev/null +++ b/ann_benchmarks/algorithms/lshf.py @@ -0,0 +1,22 @@ +from __future__ import absolute_import 
+import sklearn.neighbors +import sklearn.preprocessing +from ann_benchmarks.algorithms.base import BaseANN + +class LSHF(BaseANN): + def __init__(self, metric, n_estimators=10, n_candidates=50): + self.name = 'LSHF(n_est=%d, n_cand=%d)' % (n_estimators, n_candidates) + self._metric = metric + self._n_estimators = n_estimators + self._n_candidates = n_candidates + + def fit(self, X): + self._lshf = sklearn.neighbors.LSHForest(n_estimators=self._n_estimators, n_candidates=self._n_candidates) + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + self._lshf.fit(X) + + def query(self, v, n): + if self._metric == 'angular': + v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + return self._lshf.kneighbors([v], return_distance=False, n_neighbors=n)[0] diff --git a/ann_benchmarks/algorithms/mrpt.py b/ann_benchmarks/algorithms/mrpt.py new file mode 100644 index 0000000..e7ffc14 --- /dev/null +++ b/ann_benchmarks/algorithms/mrpt.py @@ -0,0 +1,31 @@ +from __future__ import absolute_import +import numpy +import sklearn.preprocessing +import mrpt +from ann_benchmarks.algorithms.base import BaseANN + +class MRPT(BaseANN): + def __init__(self, metric, n_trees, depth): + self._metric = metric + self._n_trees = n_trees + self._depth = depth + self._votes_required = None + + def fit(self, X): + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + + self._index = mrpt.MRPTIndex(X, depth=self._depth, n_trees=self._n_trees) + self._index.build() + + def set_query_arguments(self, votes_required): + self._votes_required = votes_required + + def query(self, v, n): + if self._metric == 'angular': + v /= numpy.linalg.norm(v) + + return self._index.ann(v, n, votes_required=self._votes_required) + + def __str__(self): + return 'MRPT(n_trees=%d, depth=%d, votes_required=%d)' % (self._n_trees, self._depth, self._votes_required) diff --git a/ann_benchmarks/algorithms/nearpy.py 
b/ann_benchmarks/algorithms/nearpy.py new file mode 100644 index 0000000..318fa2a --- /dev/null +++ b/ann_benchmarks/algorithms/nearpy.py @@ -0,0 +1,63 @@ +from __future__ import absolute_import +import nearpy +from nearpy.filters import NearestFilter +import sklearn.preprocessing +from ann_benchmarks.algorithms.base import BaseANN +import scipy +from scipy.spatial.distance import jaccard + +# Chunjiang modified 0220 +class JaccardDistance(): + """ Jaccard distance """ + + def distance(self, x, y): + """ + Computes distance measure between vectors x and y. Returns float. + """ + return jaccard(x, y) + +class NearPy(BaseANN): + def __init__(self, metric, n_bits, hash_counts): + self._n_bits = n_bits + self._hash_counts = hash_counts + self._metric = metric + self._filter = NearestFilter(10) + self.name = 'NearPy(n_bits=%d, hash_counts=%d)' % (self._n_bits, self._hash_counts) + + def fit(self, X): + hashes = [] + + for k in range(self._hash_counts): + nearpy_rbp = nearpy.hashes.RandomBinaryProjections('rbp_%d' % k, self._n_bits) + hashes.append(nearpy_rbp) + + if self._metric == 'euclidean': + dist = nearpy.distances.EuclideanDistance() + self._nearpy_engine = nearpy.Engine( + X.shape[1], + lshashes=hashes, + distance=dist) + elif self._metric == 'jaccard': # Chunjiang modified 0220 + dist = JaccardDistance() + self._nearpy_engine = nearpy.Engine( + X.shape[1], + lshashes=hashes, + distance=dist) + else: # Default (angular) = Cosine distance + self._nearpy_engine = nearpy.Engine( + X.shape[1], + lshashes=hashes, + vector_filters=[self._filter]) + + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + for i, x in enumerate(X): + self._nearpy_engine.store_vector(x, i) + + def query(self, v, n): + # XXX: This feels like an unpleasant hack, but it's not clear how to do + # better without making changes to NearPy + self._filter.N = n + if self._metric == 'angular': + v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + 
return [y for x, y, z in self._nearpy_engine.neighbours(v)] diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py new file mode 100644 index 0000000..44732b2 --- /dev/null +++ b/ann_benchmarks/algorithms/nmslib.py @@ -0,0 +1,95 @@ +from __future__ import absolute_import +import os +import nmslib +from ann_benchmarks.constants import INDEX_DIR +from ann_benchmarks.algorithms.base import BaseANN +from scipy.sparse import csr_matrix +import numpy + + +class NmslibReuseIndex(BaseANN): + @staticmethod + def encode(d): + return ["%s=%s" % (a, b) for (a, b) in d.items()] + + # For each entry in the sparse matrix, extract a list of IDs and + # convert them to a string. Return a list of such strings. + @staticmethod + def matrToStrArray(sparseMatr): + res = [] + indptr = sparseMatr.indptr + indices = sparseMatr.indices + for row in range(sparseMatr.shape[0]): + arr = [k for k in indices[indptr[row]: indptr[row + 1]]] + arr.sort() + res.append(' '.join([str(k) for k in arr])) + return res + + def __init__(self, metric, method_name, index_param, query_param): + self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric] + self._method_name = method_name + self._save_index = False + self._index_param = NmslibReuseIndex.encode(index_param) + if query_param!=False: + self._query_param = NmslibReuseIndex.encode(query_param) + self.name = 'Nmslib(method_name=%s, index_param=%s, query_param=%s)' % ( + self._method_name, self._index_param, self._query_param) + else: + self._query_param = None + self.name = 'Nmslib(method_name=%s, index_param=%s)' % ( + self._method_name, self._index_param) + + self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, '_'.join(self._index_param))) + + d = os.path.dirname(self._index_name) + if not os.path.exists(d): + os.makedirs(d) + + def fit(self, X): + if self._method_name == 'vptree': + # To avoid this issue: + # terminate called after 
throwing an instance of 'std::runtime_error' + # what(): The data size is too small or the bucket size is too big. Select the parameters so that is NOT less than * 1000 + # Aborted (core dumped) + self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000)) + + # Chunjiang modified it to "if" for jaccard + if self._nmslib_metric == 'jaccard_sparse': + X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X)) + self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING) + self._index.addDataPointBatch(X_trans) + else: + self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name) + self._index.addDataPointBatch(X) + + if os.path.exists(self._index_name): + print('Loading index from file') + self._index.loadIndex(self._index_name) + else: + self._index.createIndex(self._index_param) + if self._save_index: + self._index.saveIndex(self._index_name) + if self._query_param is not None: + self._index.setQueryTimeParams(self._query_param) + + def set_query_arguments(self, ef): + if self._method_name == 'hnsw' or self._method_name == 'sw-graph': + self._index.setQueryTimeParams(["efSearch=%s"%(ef)]) + + def query(self, v, n): + # Chunjiang modified + if self._nmslib_metric == 'jaccard_sparse': + nz = numpy.nonzero(v)[0] + v = ' '.join([str(k) for k in nz]) + ids, distances = self._index.knnQuery(v, n) + return ids + + def batch_query(self, X, n): + # Chunjiang modified + if self._nmslib_metric == 'jaccard_sparse': + X = NmslibReuseIndex.matrToStrArray(csr_matrix(X)) + self.res = self._index.knnQueryBatch(X, n) + + def get_batch_results(self): + return [x for x, _ in self.res] + diff --git a/ann_benchmarks/algorithms/nmslib_sparse.py b/ann_benchmarks/algorithms/nmslib_sparse.py new file mode 100644 index 0000000..58af969 --- /dev/null +++ b/ann_benchmarks/algorithms/nmslib_sparse.py @@ -0,0 +1,95 @@ +from __future__ import absolute_import +import os +import nmslib +from 
class NmslibSparseReuseIndex(BaseANN):
    """nmslib wrapper that feeds sparse (CSR) data as space-separated ID
    strings for the 'jaccard_sparse' space, and reuses a saved index from
    disk when one exists at the derived index path."""

    @staticmethod
    def encode(d):
        """Render a parameter dict as nmslib 'key=value' strings.

        Fix: the original used dict.iteritems(), which only exists on
        Python 2 and raises AttributeError on Python 3.
        """
        return ["%s=%s" % (a, b) for (a, b) in d.items()]

    # For each entry in the sparse matrix, extract a list of IDs and
    # convert them to a string. Return a list of such strings.
    @staticmethod
    def matrToStrArray(sparseMatr):
        res = []
        indptr = sparseMatr.indptr
        indices = sparseMatr.indices
        for row in range(sparseMatr.shape[0]):
            arr = [k for k in indices[indptr[row]: indptr[row + 1]]]
            arr.sort()
            res.append(' '.join([str(k) for k in arr]))
        return res

    def __init__(self, metric, method_name, index_param, query_param):
        # Map benchmark metric names onto nmslib space names; KeyError on
        # unsupported metrics is intentional (fail fast).
        self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric]
        self._method_name = method_name
        self._save_index = False
        self._index_param = NmslibSparseReuseIndex.encode(index_param)
        # query_param may be the literal False to mean "no query-time params".
        if query_param != False:
            self._query_param = NmslibSparseReuseIndex.encode(query_param)
            self.name = 'Nmslib(method_name=%s, index_param=%s, query_param=%s)' % (
                self._method_name, self._index_param, self._query_param)
        else:
            self._query_param = None
            self.name = 'Nmslib(method_name=%s, index_param=%s)' % (
                self._method_name, self._index_param)

        # Index file path encodes method, metric and build parameters so
        # different configurations never collide on disk.
        self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, '_'.join(self._index_param)))

        d = os.path.dirname(self._index_name)
        if not os.path.exists(d):
            os.makedirs(d)

    def fit(self, X):
        """Build (or load from disk) the nmslib index over X."""
        if self._method_name == 'vptree':
            # To avoid this issue:
            # terminate called after throwing an instance of 'std::runtime_error'
            # what(): The data size is too small or the bucket size is too big. Select the parameters so that is NOT less than * 1000
            # Aborted (core dumped)
            self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

        # Chunjiang modified it to "if" for jaccard
        if self._nmslib_metric == 'jaccard_sparse':
            # Sparse/jaccard data is passed as strings of sorted feature IDs.
            X_trans = NmslibSparseReuseIndex.matrToStrArray(X)
            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING)
            self._index.addDataPointBatch(X_trans)
        else:
            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
            self._index.addDataPointBatch(X)

        if os.path.exists(self._index_name):
            print('Loading index from file')
            self._index.loadIndex(self._index_name)
        else:
            self._index.createIndex(self._index_param)
            if self._save_index:
                self._index.saveIndex(self._index_name)
        if self._query_param is not None:
            self._index.setQueryTimeParams(self._query_param)

    def set_query_arguments(self, ef):
        # Only graph-based methods understand efSearch.
        if self._method_name == 'hnsw' or self._method_name == 'sw-graph':
            self._index.setQueryTimeParams(["efSearch=%s" % (ef)])

    def query(self, v, n):
        # Chunjiang modified: dense query vector is converted to the same
        # string-of-IDs representation used at build time.
        if self._nmslib_metric == 'jaccard_sparse':
            nz = numpy.nonzero(v)[0]
            v = ' '.join([str(k) for k in nz])
        ids, distances = self._index.knnQuery(v, n)
        return ids

    def batch_query(self, X, n):
        # Chunjiang modified
        if self._nmslib_metric == 'jaccard_sparse':
            X = NmslibSparseReuseIndex.matrToStrArray(csr_matrix(X))
        self.res = self._index.knnQueryBatch(X, n)

    def get_batch_results(self):
        # knnQueryBatch returns (ids, distances) pairs; keep only ids.
        return [x for x, _ in self.res]
class ONNG(BaseANN):
    """NGT ONNG wrapper: builds an ANNG graph via the `ngt` CLI + ngtpy,
    then degree-adjusts it into an ONNG index, reusing on-disk indexes."""

    def __init__(self, metric, object_type, epsilon, edge, outdegree, indegree):
        # Chunjiang modified 0222
        # Single-letter codes are the `ngt create -D` distance flags.
        metrics = {'euclidean': '2', 'angular': 'C', 'jaccard': 'j'}
        types = {'Float' : 'f', 'Byte' : 'c'}
        self._edge_size = edge # edge_size_for_construction
        self._outdegree = outdegree
        self._indegree = indegree
        self._metric = metrics[metric]
        # Chunjiang modified 0222
        self._object_type = types[object_type]
        self._edge_size_for_search = 0
        self._build_time_limit = 4
        self._epsilon = epsilon
        print('ONNG: edge_size=' + str(self._edge_size))
        print('ONNG: outdegree=' + str(self._outdegree))
        print('ONNG: indegree=' + str(self._indegree))
        print('ONNG: edge_size_for_search=' + str(self._edge_size_for_search))
        print('ONNG: epsilon=' + str(self._epsilon))
        print('ONNG: metric=' + metric)
        print('ONNG: object_type=' + object_type)

    def fit(self, X):
        """Create (or reuse) the ANNG and ONNG indexes under ./indexes and
        open the final ONNG read-only as self.index."""
        print('ONNG: start indexing...')
        dim = X.shape[1]
        print('ONNG: # of data=' + str(X.shape[0]))
        print('ONNG: dimensionality=' + str(dim))
        index_dir = 'indexes'
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)
        # Index paths encode the build parameters so configurations don't collide.
        index = os.path.join(index_dir, 'ONNG-' + str(self._edge_size) + '-' + str(self._outdegree) + '-' + str(self._indegree))
        anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size))
        print('ONNG: index=' + index)
        print('ANNG: index=' + anngIndex)

        if (not os.path.exists(index)) and (not os.path.exists(anngIndex)):
            # Stage 1: build the intermediate ANNG with the ngt CLI, then
            # bulk-insert the vectors through ngtpy.
            print('ONNG: create ANNG')
            t = time.time()
            #'-b500',
            args = ['ngt', 'create', '-it', '-p8', '-b500', '-ga', '-o' + self._object_type, '-D' + self._metric, '-d' + str(dim), '-E' + str(self._edge_size), '-S' + str(self._edge_size_for_search), '-e' + str(self._epsilon), '-P0', '-B30', '-T' + str(self._build_time_limit), anngIndex]
            print(args)
            subprocess.call(args)
            idx = ngtpy.Index(path=anngIndex)
            idx.batch_insert(X, num_threads=24, debug=False)
            idx.save()
            idx.close()
            print('ONNG: ANNG construction time(sec)=' + str(time.time() - t))
        if not os.path.exists(index):
            # Stage 2: reconstruct-graph converts ANNG -> ONNG with the
            # requested out/in degrees.
            # NOTE(review): '-o ' / '-i ' include a trailing space, so each
            # argv token is e.g. '-o 10' — confirm the ngt CLI accepts this.
            print('ONNG: degree adjustment')
            t = time.time()
            args = ['ngt', 'reconstruct-graph', '-mS', '-o ' + str(self._outdegree), '-i ' + str(self._indegree), anngIndex, index]
            subprocess.call(args)
            print('ONNG: degree adjustment time(sec)=' + str(time.time() -t))
        if os.path.exists(index):
            print('ONNG: index already exists! ' + str(index))
            t = time.time()
            self.index = ngtpy.Index(index, read_only=True)
            self.indexName = index
            print('ONNG: open time(sec)=' + str(time.time() - t))
        else:
            print('ONNG: something wrong.')
        print('ONNG: end of fit')

    def set_query_arguments(self, epsilon):
        # Stored epsilon is the user value minus 1.0 (ngtpy convention);
        # the display name adds it back for readability.
        print("ONNG: epsilon=" + str(epsilon))
        self._epsilon = epsilon - 1.0
        self.name = 'ONNG-NGT(%s, %s, %s, %s, %1.3f)' % (self._edge_size, self._outdegree, self._indegree, self._edge_size_for_search, self._epsilon + 1.0)

    def query(self, v, n, rq=False):
        # rq=True switches to range query via searchRange; both paths return
        # ids only (with_distance=False).
        if rq:
            # direct method
            #self.index.set(sys.maxsize, n)
            #n = 0 # then input size 0 to search
            #results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)

            # indirect method
            results = self.index.searchRange(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
        else:
            results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
        return results

    def freeIndex(self):
        # No explicit resource release; ngtpy handles cleanup.
        print('ONNG: free')
pathadj, searchedge): + metrics = {'euclidean': 'L2', 'angular': 'Cosine', 'jaccard': 'Jaccard'} + self._edge_size = edge + self._pathadj_size = pathadj + self._edge_size_for_search = searchedge + self._metric = metrics[metric] + self._object_type = object_type + print('PANNG: edge_size=' + str(self._edge_size)) + print('PANNG: pathadj_size=' + str(self._pathadj_size)) + print('PANNG: edge_size_for_search=' + str(self._edge_size_for_search)) + print('PANNG: metric=' + metric) + print('PANNG: object_type=' + object_type) + + def fit(self, X): + print('PANNG: start indexing...') + dim = len(X[0]) + print('PANNG: # of data=' + str(len(X))) + print('PANNG: Dimensionality=' + str(dim)) + index_dir = 'indexes' + if not os.path.exists(index_dir): + os.makedirs(index_dir) + index = os.path.join(index_dir, 'PANNG-' + str(self._edge_size) + '-' + str(self._pathadj_size)) + print(index) + if os.path.exists(index): + print('PANNG: index already exists! ' + str(index)) + else: + t0 = time.time() + ngtpy.create(path=index, dimension=dim, edge_size_for_creation=self._edge_size, distance_type=self._metric, + object_type=self._object_type) + idx = ngtpy.Index(path=index) + idx.batch_insert(X, num_threads=24, debug=False) + idx.save() + idx.close() + if self._pathadj_size > 0 : + print('PANNG: path adjustment') + args = ['ngt', 'prune', '-s ' + str(self._pathadj_size), index] + subprocess.call(args) + indexingtime = time.time() - t0 + print('PANNG: indexing, adjustment and saving time(sec)=' + str(indexingtime)) + t0 = time.time() + self.index = ngtpy.Index(path=index, read_only=True) + opentime = time.time() - t0 + print('PANNG: open time(sec)=' + str(opentime)) + + def set_query_arguments(self, epsilon): + print("PANNG: epsilon=" + str(epsilon)) + self._epsilon = epsilon - 1.0 + self.name = 'PANNG-NGT(%d, %d, %d, %1.3f)' % (self._edge_size, self._pathadj_size, self._edge_size_for_search, self._epsilon + 1.0) + + def query(self, v, n): + results = self.index.search(v, n, 
self._epsilon, self._edge_size_for_search, with_distance=False) + return results + + def freeIndex(self): + print('PANNG: free') diff --git a/ann_benchmarks/algorithms/panns.py b/ann_benchmarks/algorithms/panns.py new file mode 100644 index 0000000..d42867d --- /dev/null +++ b/ann_benchmarks/algorithms/panns.py @@ -0,0 +1,19 @@ +from __future__ import absolute_import +import panns +from ann_benchmarks.algorithms.base import BaseANN + +class PANNS(BaseANN): + def __init__(self, metric, n_trees, n_candidates): + self._n_trees = n_trees + self._n_candidates = n_candidates + self._metric = metric + self.name = 'PANNS(n_trees=%d, n_cand=%d)' % (self._n_trees, self._n_candidates) + + def fit(self, X): + self._panns = panns.PannsIndex(X.shape[1], metric=self._metric) + for x in X: + self._panns.add_vector(x) + self._panns.build(self._n_trees) + + def query(self, v, n): + return [x for x, y in self._panns.query(v, n)] diff --git a/ann_benchmarks/algorithms/pynndescent.py b/ann_benchmarks/algorithms/pynndescent.py new file mode 100644 index 0000000..190bf3e --- /dev/null +++ b/ann_benchmarks/algorithms/pynndescent.py @@ -0,0 +1,35 @@ +from __future__ import absolute_import +import pynndescent +from ann_benchmarks.algorithms.base import BaseANN + +class PyNNDescent(BaseANN): + def __init__(self, metric, n_neighbors=10, n_trees=8, leaf_size=20): + self._n_neighbors = int(n_neighbors) + self._n_trees = int(n_trees) + self._leaf_size = int(leaf_size) + self._queue_size=None + self._pynnd_metric = {'angular': 'cosine', + 'euclidean': 'euclidean', + 'hamming': 'hamming', + 'jaccard': 'jaccard'}[metric] + + def fit(self, X): + self._index = pynndescent.NNDescent(X, + n_neighbors=self._n_neighbors, + n_trees=self._n_trees, + leaf_size=self._leaf_size, + metric=self._pynnd_metric) + + def set_query_arguments(self, queue_size): + self._queue_size = float(queue_size) + + + def query(self, v, n): + ind, dist = self._index.query(v.reshape(1, -1).astype('float32'), k=n, 
queue_size=self._queue_size) + return ind[0] + + def __str__(self): + return 'PyNNDescent(n_neighbors=%d, n_trees=%d, leaf_size=%d, queue_size=%.2f)' % (self._n_neighbors, + self._n_trees, + self._leaf_size, + self._queue_size) diff --git a/ann_benchmarks/algorithms/risc.py b/ann_benchmarks/algorithms/risc.py new file mode 100644 index 0000000..fe89304 --- /dev/null +++ b/ann_benchmarks/algorithms/risc.py @@ -0,0 +1,83 @@ +from __future__ import absolute_import +import sys +sys.path.append('/risc/Code') +print(sys.path) +import pyrisc +from ann_benchmarks.algorithms.base import BaseANN +from scipy.sparse import csr_matrix +import numpy +import os + +class Risc(BaseANN): + + def __init__(self, metric, method): + if metric != "jaccard": + raise NotImplementedError("BruteForce doesn't support metric %s, only jaccard metric is supported." % metric) + methods = {'Risc': 1, 'Linearscan': 2, 'AOR': 3, 'DivideSkip': 4} + self._metric = metric + self._method = methods[method] + self.name = method + "()" + + def pre_fit(self, X): + def matrToStrArray(sparseMatr): + res = "" + indptr = sparseMatr.indptr + indices = sparseMatr.indices + for row in range(sparseMatr.shape[0]): + arr = [k for k in indices[indptr[row]: indptr[row + 1]]] + arr.sort() + res1 = "{" + ':1 , '.join([str(k) for k in arr]) + ':1}' + res += res1 + "\n" + return res + + # transform data and store in file + data_trans = matrToStrArray(csr_matrix(X)) + # print(data_trans) + text_file = open("train.txt", "w") + text_file.write(data_trans) + text_file.close() + + # call function with file + self._featureId = pyrisc.getFeatureId("train.txt", "features.txt") + self._data = pyrisc.readDatabase("train.txt", self._featureId) + # self._data = pyrisc.readDatabase("train.txt", "features.txt") + + + def fit(self, X): + self._index = pyrisc.getIndex(self._data, self._method) + + def pre_query(self, v, n): + # transform data and store in file + nz = numpy.nonzero(v)[0] + v = '{' + ':1 , '.join([str(k) for k in nz]) + 
':1}\n' + if os.path.isfile("query.txt"): + os.remove("query.txt") + text_file = open("query.txt", "w") + text_file.write(v) + text_file.close() + + # queries = pyrisc.readQueries("train.txt", "query.txt", "features.txt") + queries = pyrisc.readQueries("query.txt", self._featureId) + self._queryFP = pyrisc.dataBinary_getFingerPrint(queries, 0) + + def query(self, v, n): + self._n = n + self._results = pyrisc._experiments_runTopK_inMemory(self._index, self._data, self._queryFP, self._n, self._method) + + def post_query(self): + if os.path.isfile("result.txt"): + os.remove("result.txt") + pyrisc.writeResults("result.txt", self._data, self._results, self._n) + + # read results from output file + result = [] + with open("result.txt", "r") as fp: + line = fp.readline() + while line: + if line.startswith("#"): + line = fp.readline() + continue + # make 1 based index 0 based + result.append(int(line[:-1])-1) + line = fp.readline() + return result diff --git a/ann_benchmarks/algorithms/rpforest.py b/ann_benchmarks/algorithms/rpforest.py new file mode 100644 index 0000000..d2628bf --- /dev/null +++ b/ann_benchmarks/algorithms/rpforest.py @@ -0,0 +1,19 @@ +from __future__ import absolute_import +import rpforest +import numpy +from ann_benchmarks.algorithms.base import BaseANN + +class RPForest(BaseANN): + def __init__(self, leaf_size, n_trees): + self.name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees) + self._model = rpforest.RPForest(leaf_size=leaf_size, no_trees=n_trees) + + def fit(self, X): + if X.dtype != numpy.double: + X = numpy.array(X).astype(numpy.double) + self._model.fit(X) + + def query(self, v, n): + if v.dtype != numpy.double: + v = numpy.array(v).astype(numpy.double) + return self._model.query(v, n) diff --git a/ann_benchmarks/constants.py b/ann_benchmarks/constants.py new file mode 100644 index 0000000..407200b --- /dev/null +++ b/ann_benchmarks/constants.py @@ -0,0 +1 @@ +INDEX_DIR = 'indices' diff --git a/ann_benchmarks/data.py 
b/ann_benchmarks/data.py new file mode 100644 index 0000000..1b4d1d3 --- /dev/null +++ b/ann_benchmarks/data.py @@ -0,0 +1,36 @@ +from __future__ import absolute_import +import numpy + +def float_parse_entry(line): + return [float(x) for x in line.strip().split()] +def float_unparse_entry(entry): + return " ".join(map(str, entry)) +def int_parse_entry(line): + return frozenset([int(x) for x in line.strip().split()]) +def int_unparse_entry(entry): + return " ".join(map(str, map(int, entry))) + +def bit_parse_entry(line): + return [bool(int(x)) for x in list(line.strip().replace(" ", "").replace("\t", ""))] +def bit_unparse_entry(entry): + return " ".join(map(lambda el: "1" if el else "0", entry)) + +type_info = { + "float": { + "type": numpy.float, + "parse_entry": float_parse_entry, + "unparse_entry": float_unparse_entry, + "finish_entries": numpy.vstack + }, + "bit": { + "type": numpy.bool_, + "parse_entry": bit_parse_entry, + "unparse_entry": bit_unparse_entry + }, + "int" : { + "type": numpy.object, + "parse_entry": int_parse_entry, + "unparse_entry": int_unparse_entry, + }, +} + diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py new file mode 100644 index 0000000..0f7b273 --- /dev/null +++ b/ann_benchmarks/datasets.py @@ -0,0 +1,548 @@ +import h5py +import numpy +import os +import random +import sys +try: + from urllib import urlretrieve +except ImportError: + from urllib.request import urlretrieve # Python 3 + + +def download(src, dst): + if not os.path.exists(dst): + # TODO: should be atomic + print('downloading %s -> %s...' 
% (src, dst)) + urlretrieve(src, dst) + + +def get_dataset_fn(dataset): + if not os.path.exists('data'): + os.mkdir('data') + return os.path.join('data', '%s.hdf5' % dataset) + + +def get_dataset(which): + import h5sparse + + hdf5_fn = get_dataset_fn(which) + try: + url = 'http://ann-benchmarks.com/%s.hdf5' % which + download(url, hdf5_fn) + except: + print("Cannot download %s" % url) + if which in DATASETS: + print("Creating dataset locally") + DATASETS[which](hdf5_fn) + + hdf5_f = h5sparse.File(hdf5_fn) + return hdf5_f + + +# Everything below this line is related to creating datasets +# You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com + +def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None): + from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS + import sklearn.neighbors + import h5sparse + + def replace_last(source_string, replace_what, replace_with): + head, _sep, tail = source_string.rpartition(replace_what) + return head + replace_with + tail + + # store SMILES first + if SMILES: + smile_fn = replace_last(fn, '.hdf5', '-SMILES.hdf5') + print('Write Smiles to File %s' % smile_fn) + f = h5sparse.File(smile_fn, 'w') + asciiList = [n.encode("ascii", "ignore") for n in SMILES] + f.create_dataset('smile', (len(asciiList), 1), 'S10', asciiList) + f.close() + print('Finish.') + + print('Write Dataset %s' % fn) + f = h5sparse.File(fn, 'w') + f.attrs['distance'] = distance + f.attrs['point_type'] = point_type + print('train size: %9d * %4d' % train.shape) + print('test size: %9d * %4d' % test.shape) + f.create_dataset('train',data=train) + f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test + neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i') + distances = f.create_dataset('distances', (test.shape[0], count), dtype='f') + + # use which method to compute the groundtruth + train = train.toarray() + method = 'bruteforth' + if 
method == 'balltree': + tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance) + else: + bf = BruteForceBLAS(metric=distance, precision=train.dtype) + bf.fit(train) + + print(test) + for i, x in enumerate(test): + if i % 1 == 0: + print('%d/%d...' % (i, test.shape[0])) + if method == 'balltree': + dist, ind = tree.query([x], k=count) + neighbors[i] = ind[0] + distances[i] = dist[0] + else: + res = list(bf.query_with_distances(x, count)) + res.sort(key=lambda t: t[-1]) + neighbors[i] = [j for j, _ in res] + distances[i] = [d for _, d in res] + print(neighbors[i]) + print(distances[i]) + f.close() + print('Finish.') + + +def train_test_split(X, test_size=10000): + import sklearn.model_selection + print('Splitting %d*%d into train/test' % X.shape) + return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1) + + +def glove(out_fn, d): + import zipfile + + url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip' + fn = os.path.join('data', 'glove.twitter.27B.zip') + download(url, fn) + with zipfile.ZipFile(fn) as z: + print('preparing %s' % out_fn) + z_fn = 'glove.twitter.27B.%dd.txt' % d + X = [] + for line in z.open(z_fn): + v = [float(x) for x in line.strip().split()[1:]] + X.append(numpy.array(v)) + X_train, X_test = train_test_split(X) + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular') + + +def _load_texmex_vectors(f, n, k): + import struct + + v = numpy.zeros((n, k)) + for i in range(n): + f.read(4) # ignore vec length + v[i] = struct.unpack('f' * k, f.read(k*4)) + + return v + + +def _get_irisa_matrix(t, fn): + import struct + m = t.getmember(fn) + f = t.extractfile(m) + k, = struct.unpack('i', f.read(4)) + n = m.size // (4 + 4*k) + f.seek(0) + return _load_texmex_vectors(f, n, k) + + +def sift(out_fn): + import tarfile + + url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz' + fn = os.path.join('data', 'sift.tar.tz') + download(url, fn) + with tarfile.open(fn, 'r:gz') as t: + 
train = _get_irisa_matrix(t, 'sift/sift_base.fvecs') + test = _get_irisa_matrix(t, 'sift/sift_query.fvecs') + write_output(train, test, out_fn, 'euclidean') + + +def gist(out_fn): + import tarfile + + url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz' + fn = os.path.join('data', 'gist.tar.tz') + download(url, fn) + with tarfile.open(fn, 'r:gz') as t: + train = _get_irisa_matrix(t, 'gist/gist_base.fvecs') + test = _get_irisa_matrix(t, 'gist/gist_query.fvecs') + write_output(train, test, out_fn, 'euclidean') + + +def _load_mnist_vectors(fn): + import gzip + import struct + + print('parsing vectors in %s...' % fn) + f = gzip.open(fn) + type_code_info = { + 0x08: (1, "!B"), + 0x09: (1, "!b"), + 0x0B: (2, "!H"), + 0x0C: (4, "!I"), + 0x0D: (4, "!f"), + 0x0E: (8, "!d") + } + magic, type_code, dim_count = struct.unpack("!hBB", f.read(4)) + assert magic == 0 + assert type_code in type_code_info + + dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)] + + entry_count = dimensions[0] + entry_size = numpy.product(dimensions[1:]) + + b, format_string = type_code_info[type_code] + vectors = [] + for i in range(entry_count): + vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)]) + return numpy.array(vectors) + + +def mnist(out_fn): + download('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz') + download('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz') + train = _load_mnist_vectors('mnist-train.gz') + test = _load_mnist_vectors('mnist-test.gz') + write_output(train, test, out_fn, 'euclidean') + + +def fashion_mnist(out_fn): + download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', 'fashion-mnist-train.gz') + download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', 'fashion-mnist-test.gz') + train = _load_mnist_vectors('fashion-mnist-train.gz') + test = 
_load_mnist_vectors('fashion-mnist-test.gz') + write_output(train, test, out_fn, 'euclidean') + + +def transform_bag_of_words(filename, n_dimensions, out_fn): + import gzip + from scipy.sparse import lil_matrix + from sklearn.feature_extraction.text import TfidfTransformer + from sklearn import random_projection + with gzip.open(filename, 'rb') as f: + file_content = f.readlines() + entries = int(file_content[0]) + words = int(file_content[1]) + file_content = file_content[3:] # strip first three entries + print("building matrix...") + A = lil_matrix((entries, words)) + for e in file_content: + doc, word, cnt = [int(v) for v in e.strip().split()] + A[doc - 1, word - 1] = cnt + print("normalizing matrix entries with tfidf...") + B = TfidfTransformer().fit_transform(A) + print("reducing dimensionality...") + C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B) + X_train, X_test = train_test_split(C) + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular') + + +def nytimes(out_fn, n_dimensions): + fn = 'nytimes_%s.txt.gz' % n_dimensions + download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn) + transform_bag_of_words(fn, n_dimensions, out_fn) + + +def random(out_fn, n_dims, n_samples, centers, distance): + import sklearn.datasets + + X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1) + X_train, X_test = train_test_split(X, test_size=0.1) + write_output(X_train, X_test, out_fn, distance) + + +def word2bits(out_fn, path, fn): + import tarfile + local_fn = fn + '.tar.gz' + url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn) + download(url, local_fn) + print('parsing vectors in %s...' 
% local_fn) + with tarfile.open(local_fn, 'r:gz') as t: + f = t.extractfile(fn) + n_words, k = [int(z) for z in next(f).strip().split()] + X = numpy.zeros((n_words, k), dtype=numpy.bool) + for i in range(n_words): + X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=numpy.bool) + + X_train, X_test = train_test_split(X, test_size=1000) + write_output(X_train, X_test, out_fn, 'hamming', 'bit') + +def sift_hamming(out_fn, fn): + import tarfile + local_fn = fn + '.tar.gz' + url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn + download(url, local_fn) + print('parsing vectors in %s...' % local_fn) + with tarfile.open(local_fn, 'r:gz') as t: + f = t.extractfile(fn) + lines = f.readlines() + X = numpy.zeros((len(lines), 256), dtype=numpy.bool) + for i, line in enumerate(lines): + X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool) + X_train, X_test = train_test_split(X, test_size = 1000) + write_output(X_train, X_test, out_fn, 'hamming', 'bit') + +def lastfm(out_fn, n_dimensions, test_size=50000): + # This tests out ANN methods for retrieval on simple matrix factorization based + # recommendation algorithms. The idea being that the query/test vectors are user factors + # and the train set are item factors from the matrix factorization model. 
+ + # Since the predictor is a dot product, we transform the factors first as described in this + # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf + # This hopefully replicates the experiments done in this post: + # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/ + + # The dataset is from "Last.fm Dataset - 360K users": + # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html + + # this requires the implicit package to generate the factors (on my desktop/gpu this only + # takes 4-5 seconds to train - but could take 1-2 minutes on a laptop) + from implicit.datasets.lastfm import get_lastfm + from implicit.approximate_als import augment_inner_product_matrix + import implicit + + # train an als model on the lastfm data + _, _, play_counts = get_lastfm() + model = implicit.als.AlternatingLeastSquares(factors=n_dimensions) + model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8)) + + # transform item factors so that each one has the same norm, and transform the user + # factors such by appending a 0 column + _, item_factors = augment_inner_product_matrix(model.item_factors) + user_factors = numpy.append(model.user_factors, + numpy.zeros((model.user_factors.shape[0], 1)), + axis=1) + + # only query the first 50k users (speeds things up signficantly without changing results) + user_factors = user_factors[:test_size] + + # after that transformation a cosine lookup will return the same results as the inner product + # on the untransformed data + write_output(item_factors, user_factors, out_fn, 'angular') + +def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool): + from rdkit import Chem + from rdkit.Chem import AllChem + from scipy.sparse import csr_matrix + dimension = 1024 + + SMILES = [] + indptr = [0] + indices = [] + data = [] + num_mols = 0 + if file == None: + file = '../pycharm_project_422/clustering_toydata.txt' + file_object = 
open(file, "r") + for line in file_object.readlines(): + elements = line.split() + if len(elements) != 14: continue + smile = elements[7] + mol = Chem.MolFromSmiles(smile) + if mol is None: continue + SMILES.append(smile) + fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension) + for i in range(dimension): + if fp.GetBit(i) is True: + indices.append(i) + data.append(1) + indptr.append(len(indices)) + num_mols += 1 + + fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype) + print('The dimension of the returned sparse matrix: %d*%d' %fps.shape) + + return fps, SMILES + +def get_sparse_matrix_from_sdf(dir, dtype=numpy.bool): + from rdkit import Chem + from rdkit.Chem import AllChem + import glob + import gzip + from scipy.sparse import csr_matrix + dimension = 1024 + + SMILES = [] + indptr = [0] + indices = [] + data = [] + num_mols = 0 + file_list = glob.glob(dir + '/*.sdf.gz') + print(file_list) + for file in file_list: + inf = gzip.open(file) + suppl = Chem.ForwardSDMolSupplier(inf) + for mol in suppl: + if mol is None: continue + smile = Chem.MolToSmiles(mol) + SMILES.append(smile) + fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension) + for i in range(dimension): + if fp.GetBit(i) is True: + indices.append(i) + data.append(1) + indptr.append(len(indices)) + num_mols += 1 + + fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype) + print('The dimension of the returned sparse matrix: %d*%d' % fps.shape) + + return fps, SMILES + +def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): + from sklearn.utils import shuffle + print('prepare dataset ' + dataset_name) + + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + if dataset_name.startswith('toy'): + # toy + X, SMILES = get_sparse_matrix_from_txt(dtype=dtype) + else: + # others, e.g., Chembl and Molport + if dataset_name == 'Molport': + dir = 
'/home/cjz18001/Molport' + elif dataset_name == 'Chembl': + dir = '/home/cjz18001/Chembl' + else: + print('unknown dataset') + exit(0) + X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dtype=dtype) + + # random shuffle fingerprints and smiles at the same time + seed = 1 # random.randint(0, 2 ** 32 - 1) + X, SMILES = shuffle(X, SMILES, random_state=seed) + + # data split and make test data full matrix + train_size = X.shape[0] - test_size + X_train = X[:train_size] + X_test = X[train_size:] + X_test = X_test.toarray() + print('finish dataset preparation') + + print('Train data dimension: %d*%d' %X_train.shape) + print('Test data dimension: %d*%d' %X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES) + + +def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + from scipy.sparse import vstack + path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + # vertically stack sparse matrices from multiple files + test_size = 1 + if num_files==0.5: + with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + size = 1000000 + print('select %i out of %i' %(size, Y.shape[0])) + Y = Y[:size] + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + else: + first = False + for i in range(num_files): + print('process ' + str(i) + ' trunk') + if first == False: + first = True + with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + if i==num_files-1: #last one + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + else: + X_train = Y + else: + with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + 
'.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + if i==num_files-1: #last one + X_test = Y[Y.shape[0] - test_size:] + X_train = vstack([X_train, Y[:Y.shape[0] - test_size]]) + else: + X_train = vstack([X_train, Y]) + # X_train = X_train.astype(dtype) + # X_test = X_test.astype(dtype) + + # X_train, X_test = train_test_split(X, test_size=1000) + # X_test = X_test.toarray() + # encounter memory error when calling train_test_split, for 100M + X_test = X_test.toarray() + print('finish dataset preparation') + + print(X_train.shape) + print(X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, 1000) + +def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + from scipy.sparse import vstack + path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + # vertically stack sparse matrices from multiple files + test_size = 3 + with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + size = 10000000 + print('select %i out of %i' %(size, Y.shape[0])) + Y = Y[:size] + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + + # make them full matrices here + X_train = X_train.toarray() + X_test = X_test.toarray() + print('finish dataset preparation') + + print(X_train.shape) + print(X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, 1000) + +DATASETS = { + 'fashion-mnist-784-euclidean': fashion_mnist, + 'gist-960-euclidean': gist, + 'glove-25-angular': lambda out_fn: glove(out_fn, 25), + 'glove-50-angular': lambda out_fn: glove(out_fn, 50), + 'glove-100-angular': lambda out_fn: glove(out_fn, 100), + 'glove-200-angular': lambda out_fn: glove(out_fn, 200), + 'mnist-784-euclidean': mnist, + 'random-xs-20-euclidean': lambda out_fn: 
random(out_fn, 20, 10000, 100, 'euclidean'), + 'random-s-100-euclidean': lambda out_fn: random(out_fn, 100, 100000, 1000, 'euclidean'), + 'random-xs-20-angular': lambda out_fn: random(out_fn, 20, 10000, 100, 'angular'), + 'random-s-100-angular': lambda out_fn: random(out_fn, 100, 100000, 1000, 'angular'), + 'sift-128-euclidean': sift, + 'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256), + 'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16), + 'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'), + 'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64), + 'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'), + # below are datasets Chunjiang added + 'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'bit', 100), + 'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'), + 'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 'jaccard', 'bit'), + 'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'), + 'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'), + 'chembl-sparse-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K-sparse', 1024, 'jaccard', 'bit'), + 'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'), + 'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), + 'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'), + 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit') +} diff --git a/ann_benchmarks/datasets_old.py b/ann_benchmarks/datasets_old.py new file mode 100644 index 0000000..64c7716 --- /dev/null +++ b/ann_benchmarks/datasets_old.py @@ -0,0 +1,480 @@ +import h5py +import numpy +import os +import random 
+import sys +try: + from urllib import urlretrieve +except ImportError: + from urllib.request import urlretrieve # Python 3 + + +def download(src, dst): + if not os.path.exists(dst): + # TODO: should be atomic + print('downloading %s -> %s...' % (src, dst)) + urlretrieve(src, dst) + + +def get_dataset_fn(dataset): + if not os.path.exists('data'): + os.mkdir('data') + return os.path.join('data', '%s.hdf5' % dataset) + + +def get_dataset(which): + hdf5_fn = get_dataset_fn(which) + try: + url = 'http://ann-benchmarks.com/%s.hdf5' % which + download(url, hdf5_fn) + except: + print("Cannot download %s" % url) + if which in DATASETS: + print("Creating dataset locally") + DATASETS[which](hdf5_fn) + if "sparse" not in which: + hdf5_f = h5py.File(hdf5_fn) + else: + import h5sparse + hdf5_f = h5sparse.File(hdf5_fn) + return hdf5_f + + +# Everything below this line is related to creating datasets +# You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com + +def write_output(train, test, fn, distance, point_type='float', count=1000, sparse=False): + from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS + import sklearn.neighbors + + n = 0 + if sparse == False: + f = h5py.File(fn, 'w') + else: + import h5sparse + f = h5sparse.File(fn, 'w') + f.attrs['distance'] = distance + f.attrs['point_type'] = point_type + print('train size: %9d * %4d' % train.shape) + print('test size: %9d * %4d' % test.shape) + if sparse == False: + f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train + else: + f.create_dataset('train',data=train) + f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test + # f.create_dataset('test', data=test) + neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i') + distances = f.create_dataset('distances', (test.shape[0], count), dtype='f') + + # use which method to compute the groundtruth + method = 'balltree' + if method == 'balltree': + # only serve for 
jaccard + # todo: generalize to other metrics + tree = sklearn.neighbors.BallTree(train, leaf_size=20, metric='jaccard') + else: + bf = BruteForceBLAS(distance, precision=train.dtype) + bf.fit(train) + + print(test) + for i, x in enumerate(test): + if i % 1 == 0: + print('%d/%d...' % (i, test.shape[0])) + if method == 'balltree': + res = tree.query(x, k=count) + else: + res = list(bf.query_with_distances(x, count)) + res.sort(key=lambda t: t[-1]) + neighbors[i] = [j for j, _ in res] + distances[i] = [d for _, d in res] + print(neighbors[i]) + print(distances[i]) + f.close() + + +def train_test_split(X, test_size=10000): + import sklearn.model_selection + print('Splitting %d*%d into train/test' % X.shape) + return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1) + + +def glove(out_fn, d): + import zipfile + + url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip' + fn = os.path.join('data', 'glove.twitter.27B.zip') + download(url, fn) + with zipfile.ZipFile(fn) as z: + print('preparing %s' % out_fn) + z_fn = 'glove.twitter.27B.%dd.txt' % d + X = [] + for line in z.open(z_fn): + v = [float(x) for x in line.strip().split()[1:]] + X.append(numpy.array(v)) + X_train, X_test = train_test_split(X) + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular') + + +def _load_texmex_vectors(f, n, k): + import struct + + v = numpy.zeros((n, k)) + for i in range(n): + f.read(4) # ignore vec length + v[i] = struct.unpack('f' * k, f.read(k*4)) + + return v + + +def _get_irisa_matrix(t, fn): + import struct + m = t.getmember(fn) + f = t.extractfile(m) + k, = struct.unpack('i', f.read(4)) + n = m.size // (4 + 4*k) + f.seek(0) + return _load_texmex_vectors(f, n, k) + + +def sift(out_fn): + import tarfile + + url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz' + fn = os.path.join('data', 'sift.tar.tz') + download(url, fn) + with tarfile.open(fn, 'r:gz') as t: + train = _get_irisa_matrix(t, 'sift/sift_base.fvecs') + test = 
_get_irisa_matrix(t, 'sift/sift_query.fvecs') + write_output(train, test, out_fn, 'euclidean') + + +def gist(out_fn): + import tarfile + + url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz' + fn = os.path.join('data', 'gist.tar.tz') + download(url, fn) + with tarfile.open(fn, 'r:gz') as t: + train = _get_irisa_matrix(t, 'gist/gist_base.fvecs') + test = _get_irisa_matrix(t, 'gist/gist_query.fvecs') + write_output(train, test, out_fn, 'euclidean') + + +def _load_mnist_vectors(fn): + import gzip + import struct + + print('parsing vectors in %s...' % fn) + f = gzip.open(fn) + type_code_info = { + 0x08: (1, "!B"), + 0x09: (1, "!b"), + 0x0B: (2, "!H"), + 0x0C: (4, "!I"), + 0x0D: (4, "!f"), + 0x0E: (8, "!d") + } + magic, type_code, dim_count = struct.unpack("!hBB", f.read(4)) + assert magic == 0 + assert type_code in type_code_info + + dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)] + + entry_count = dimensions[0] + entry_size = numpy.product(dimensions[1:]) + + b, format_string = type_code_info[type_code] + vectors = [] + for i in range(entry_count): + vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)]) + return numpy.array(vectors) + + +def mnist(out_fn): + download('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz') + download('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz') + train = _load_mnist_vectors('mnist-train.gz') + test = _load_mnist_vectors('mnist-test.gz') + write_output(train, test, out_fn, 'euclidean') + + +def fashion_mnist(out_fn): + download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', 'fashion-mnist-train.gz') + download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', 'fashion-mnist-test.gz') + train = _load_mnist_vectors('fashion-mnist-train.gz') + test = _load_mnist_vectors('fashion-mnist-test.gz') + write_output(train, test, out_fn, 
'euclidean') + + +def transform_bag_of_words(filename, n_dimensions, out_fn): + import gzip + from scipy.sparse import lil_matrix + from sklearn.feature_extraction.text import TfidfTransformer + from sklearn import random_projection + with gzip.open(filename, 'rb') as f: + file_content = f.readlines() + entries = int(file_content[0]) + words = int(file_content[1]) + file_content = file_content[3:] # strip first three entries + print("building matrix...") + A = lil_matrix((entries, words)) + for e in file_content: + doc, word, cnt = [int(v) for v in e.strip().split()] + A[doc - 1, word - 1] = cnt + print("normalizing matrix entries with tfidf...") + B = TfidfTransformer().fit_transform(A) + print("reducing dimensionality...") + C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B) + X_train, X_test = train_test_split(C) + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular') + + +def nytimes(out_fn, n_dimensions): + fn = 'nytimes_%s.txt.gz' % n_dimensions + download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn) + transform_bag_of_words(fn, n_dimensions, out_fn) + + +def random(out_fn, n_dims, n_samples, centers, distance): + import sklearn.datasets + + X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1) + X_train, X_test = train_test_split(X, test_size=0.1) + write_output(X_train, X_test, out_fn, distance) + + +def word2bits(out_fn, path, fn): + import tarfile + local_fn = fn + '.tar.gz' + url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn) + download(url, local_fn) + print('parsing vectors in %s...' 
% local_fn) + with tarfile.open(local_fn, 'r:gz') as t: + f = t.extractfile(fn) + n_words, k = [int(z) for z in next(f).strip().split()] + X = numpy.zeros((n_words, k), dtype=numpy.bool) + for i in range(n_words): + X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=numpy.bool) + + X_train, X_test = train_test_split(X, test_size=1000) + write_output(X_train, X_test, out_fn, 'hamming', 'bit') + +def sift_hamming(out_fn, fn): + import tarfile + local_fn = fn + '.tar.gz' + url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn + download(url, local_fn) + print('parsing vectors in %s...' % local_fn) + with tarfile.open(local_fn, 'r:gz') as t: + f = t.extractfile(fn) + lines = f.readlines() + X = numpy.zeros((len(lines), 256), dtype=numpy.bool) + for i, line in enumerate(lines): + X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool) + X_train, X_test = train_test_split(X, test_size = 1000) + write_output(X_train, X_test, out_fn, 'hamming', 'bit') + +def lastfm(out_fn, n_dimensions, test_size=50000): + # This tests out ANN methods for retrieval on simple matrix factorization based + # recommendation algorithms. The idea being that the query/test vectors are user factors + # and the train set are item factors from the matrix factorization model. 
+ + # Since the predictor is a dot product, we transform the factors first as described in this + # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf + # This hopefully replicates the experiments done in this post: + # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/ + + # The dataset is from "Last.fm Dataset - 360K users": + # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html + + # this requires the implicit package to generate the factors (on my desktop/gpu this only + # takes 4-5 seconds to train - but could take 1-2 minutes on a laptop) + from implicit.datasets.lastfm import get_lastfm + from implicit.approximate_als import augment_inner_product_matrix + import implicit + + # train an als model on the lastfm data + _, _, play_counts = get_lastfm() + model = implicit.als.AlternatingLeastSquares(factors=n_dimensions) + model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8)) + + # transform item factors so that each one has the same norm, and transform the user + # factors such by appending a 0 column + _, item_factors = augment_inner_product_matrix(model.item_factors) + user_factors = numpy.append(model.user_factors, + numpy.zeros((model.user_factors.shape[0], 1)), + axis=1) + + # only query the first 50k users (speeds things up signficantly without changing results) + user_factors = user_factors[:test_size] + + # after that transformation a cosine lookup will return the same results as the inner product + # on the untransformed data + write_output(item_factors, user_factors, out_fn, 'angular') + +def ecfp(out_fn, dataset_name, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + path = '../pycharm_project_426/src/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + if dataset_name.startswith('toy'): + # toy + with open(path + 
dataset_name + '_' + str(dimension) + '_training.pickle', 'rb') as handle: + X_train = pickle.load(handle, encoding='latin1') + with open(path + dataset_name + '_' + str(dimension) + '_test.pickle', 'rb') as handle: + X_test = pickle.load(handle, encoding='latin1') + X_train = numpy.asarray(X_train.toarray(), dtype) + X_test = numpy.asarray(X_test.toarray(), dtype) + else: + # Chembl + with open(path + dataset_name + '_' + str(dimension) + '.pickle', 'rb') as handle: + X = pickle.load(handle, encoding='latin1') + X = numpy.asarray(X.toarray(), dtype) + X_train, X_test = train_test_split(X, test_size=1000) + + print(X_train) + print(X_test) + write_output(X_train, X_test, out_fn, distance, type) + +def ecfp_sparse(out_fn, dataset_name, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + path = '../pycharm_project_426/src/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + with open(path + dataset_name + '_' + str(dimension) + '.pickle', 'rb') as handle: + X = pickle.load(handle, encoding='latin1') + X = X.astype(dtype) + X_train, X_test = train_test_split(X, test_size=100) + X_test = X_test.toarray() + + print(X_train) + print(X_test) + write_output(X_train, X_test, out_fn, distance, type, 1000, True) + +def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + from scipy.sparse import vstack + path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + # vertically stack sparse matrices from multiple files + test_size = 1 + if num_files==0.5: + with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + size = 2000000 + print('select %i out of %i' %(size, Y.shape[0])) + Y = Y[:size] + X_test 
= Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + else: + first = False + for i in range(num_files): + print('process ' + str(i) + ' trunk') + if first == False: + first = True + with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + if i==num_files-1: #last one + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + else: + X_train = Y + else: + with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + if i==num_files-1: #last one + X_test = Y[Y.shape[0] - test_size:] + X_train = vstack([X_train, Y[:Y.shape[0] - test_size]]) + else: + X_train = vstack([X_train, Y]) + # X_train = X_train.astype(dtype) + # X_test = X_test.astype(dtype) + + # X_train, X_test = train_test_split(X, test_size=1000) + # X_test = X_test.toarray() + # encounter memory error when calling train_test_split, for 100M + X_test = X_test.toarray() + print('finish data preparation') + + print(X_train.shape) + print(X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, 1000, True) + +def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + from scipy.sparse import vstack + path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + # vertically stack sparse matrices from multiple files + test_size = 3 + with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + size = 10000000 + print('select %i out of %i' %(size, Y.shape[0])) + Y = Y[:size] + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + + # make them full matrices here + X_train = X_train.toarray() + X_test 
= X_test.toarray() + print('finish data preparation') + + print(X_train.shape) + print(X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, 1000) + +DATASETS = { + 'fashion-mnist-784-euclidean': fashion_mnist, + 'gist-960-euclidean': gist, + 'glove-25-angular': lambda out_fn: glove(out_fn, 25), + 'glove-50-angular': lambda out_fn: glove(out_fn, 50), + 'glove-100-angular': lambda out_fn: glove(out_fn, 100), + 'glove-200-angular': lambda out_fn: glove(out_fn, 200), + 'mnist-784-euclidean': mnist, + 'random-xs-20-euclidean': lambda out_fn: random(out_fn, 20, 10000, 100, 'euclidean'), + 'random-s-100-euclidean': lambda out_fn: random(out_fn, 100, 100000, 1000, 'euclidean'), + 'random-xs-20-angular': lambda out_fn: random(out_fn, 20, 10000, 100, 'angular'), + 'random-s-100-angular': lambda out_fn: random(out_fn, 100, 100000, 1000, 'angular'), + 'sift-128-euclidean': sift, + 'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256), + 'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16), + 'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'), + 'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64), + 'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'), + # below are datasets Chunjiang added + 'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'int'), + 'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'int'), + 'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'), + 'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'), + 'chembl-sparse-1024-jaccard': lambda out_fn: ecfp_sparse(out_fn, 'Chembl10K', 1024, 'jaccard', 'bit'), + 'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'), + 'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), 
def pdist(a, b, metric):
    """Return the scipy distance between two points *a* and *b* under *metric*.

    ``scipy.spatial.distance.pdist`` computes all pairwise distances of its
    input; with exactly two points the single entry at index 0 is the
    a<->b distance.
    """
    return scipy_pdist([a, b], metric=metric)[0]


# Need own implementation of jaccard because numpy's implementation is
# different (it operates on boolean vectors, not sets).
def jaccard(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    By convention an empty input on either side yields 0, which also
    guards the division below against a zero denominator.
    """
    if not a or not b:
        return 0
    intersect = len(a & b)
    # Union size expressed via inclusion-exclusion; true division (the
    # original used a C-style ``(float)(...)`` cast).
    return intersect / float(len(a) + len(b) - intersect)


# Chunjiang Modified 20190216
# Map of metric name -> distance function and a validity predicate used to
# filter out sentinel results.  Each distance is "smaller is better".
metrics = {
    'hamming': {
        'distance': lambda a, b: pdist(a, b, "hamming"),
        'distance_valid': lambda a: True
    },
    # return 1 - jaccard similarity, because smaller distances are better.
    'jaccard': {
        'distance': lambda a, b: pdist(a, b, "jaccard"),
        'distance_valid': lambda a: a < 1 - 1e-5
    },
    'euclidean': {
        'distance': lambda a, b: pdist(a, b, "euclidean"),
        'distance_valid': lambda a: True
    },
    'angular': {
        'distance': lambda a, b: pdist(a, b, "cosine"),
        'distance_valid': lambda a: True
    }
}


def positive_int(s):
    """argparse type callback: parse *s* as a strictly positive integer.

    Raises ``argparse.ArgumentTypeError`` for non-numeric input, zero, or
    negative values so argparse can report a clean usage error.
    """
    try:
        i = int(s)
    except (TypeError, ValueError):
        # Also catch TypeError so non-string inputs fail gracefully
        # instead of crashing the parser.
        i = None
    if i is None or i < 1:
        raise argparse.ArgumentTypeError("%r is not a positive integer" % s)
    return i
'--list-algorithms', + help='print the names of all known algorithms and exit', + action='store_true') + parser.add_argument( + '--force', + help='''re-run algorithms even if their results already exist''', + action='store_true') + parser.add_argument( + '--runs', + metavar='COUNT', + type=positive_int, + help='run each algorithm instance %(metavar)s times and use only the best result', + default=2) + parser.add_argument( + '--timeout', + type=int, + help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set', + default=-1) + parser.add_argument( + '--local', + action='store_true', + help='If set, then will run everything locally (inside the same process) rather than using Docker') + parser.add_argument( + '--batch', + action='store_true', + help='If set, algorithms get all queries at once') + parser.add_argument( + '--rq', + action='store_true', + help='If set, perform range queries') + parser.add_argument( + "--radius", + default=0.3, + type=float, + help="th range of similarity to search for") + parser.add_argument( + '--max-n-algorithms', + type=int, + help='Max number of algorithms to run (just used for testing)', + default=-1) + parser.add_argument( + '--run-disabled', + help='run algorithms that are disabled in algos.yml', + action='store_true') + + args = parser.parse_args() + if args.timeout == -1: + args.timeout = None + + if args.list_algorithms: + list_algorithms(args.definitions) + sys.exit(0) + + # Nmslib specific code + # Remove old indices stored on disk + #if os.path.exists(INDEX_DIR): + # shutil.rmtree(INDEX_DIR) + + dataset = get_dataset(args.dataset) + # adapt to sparse matrix + # dimension = len(dataset['train'][0]) # TODO(erikbern): ugly + dimension = dataset['train'].shape[1] + point_type = dataset.attrs.get('point_type', 'float') + distance = dataset.attrs['distance'] + definitions = get_definitions(args.definitions, dimension, point_type, distance, args.count) + + # Filter out, from the loaded 
definitions, all those query argument groups + # that correspond to experiments that have already been run. (This might + # mean removing a definition altogether, so we can't just use a list + # comprehension.) + filtered_definitions = [] + for definition in definitions: + query_argument_groups = definition.query_argument_groups + if not query_argument_groups: + query_argument_groups = [[]] + not_yet_run = [] + for query_arguments in query_argument_groups: + if args.rq: + fn = get_result_filename(args.dataset, + args.radius, definition, query_arguments, args.batch) + else: + fn = get_result_filename(args.dataset, + args.count, definition, query_arguments, args.batch) + if args.force or not os.path.exists(fn): + not_yet_run.append(query_arguments) + if not_yet_run: + if definition.query_argument_groups: + definition = definition._replace( + query_argument_groups = not_yet_run) + filtered_definitions.append(definition) + definitions = filtered_definitions + + random.shuffle(definitions) + + if args.algorithm: + print('running only', args.algorithm) + definitions = [d for d in definitions if d.algorithm == args.algorithm] + + if args.local: + def _test(df): + status = algorithm_status(df) + # If the module was loaded but doesn't actually have a constructor of + # the right name, then the definition is broken + assert status != InstantiationStatus.NO_CONSTRUCTOR, """\ +%s.%s(%s): error: the module '%s' does not expose the named constructor""" % (df.module, df.constructor, df.arguments, df.module) + if status == InstantiationStatus.NO_MODULE: + # If the module couldn't be loaded (presumably because of a missing + # dependency), print a warning and remove this definition from the + # list of things to be run + print("""\ +%s.%s(%s): warning: the module '%s' could not be loaded; skipping""" % (df.module, df.constructor, df.arguments, df.module)) + return False + else: + return True + definitions = [d for d in definitions if _test(d)] + + if not args.run_disabled: + if 
len([d for d in definitions if d.disabled]): + print('Not running disabled algorithms:', [d for d in definitions if d.disabled]) + definitions = [d for d in definitions if not d.disabled] + + if args.max_n_algorithms >= 0: + definitions = definitions[:args.max_n_algorithms] + + if len(definitions) == 0: + raise Exception('Nothing to run') + else: + print('Order:', definitions) + + for definition in definitions: + print(definition, '...') + + try: + if args.local: + run(definition, args.dataset, args.count, args.runs, args.batch, args.rq, args.radius) + else: + # run_docker(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius) + run_singularity(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius) + except KeyboardInterrupt: + break + except: + traceback.print_exc() diff --git a/ann_benchmarks/plotting/__init__.py b/ann_benchmarks/plotting/__init__.py new file mode 100644 index 0000000..4d4042e --- /dev/null +++ b/ann_benchmarks/plotting/__init__.py @@ -0,0 +1,2 @@ +from __future__ import absolute_import +from ann_benchmarks.plotting import * diff --git a/ann_benchmarks/plotting/metrics.py b/ann_benchmarks/plotting/metrics.py new file mode 100644 index 0000000..cdf2800 --- /dev/null +++ b/ann_benchmarks/plotting/metrics.py @@ -0,0 +1,113 @@ +from __future__ import absolute_import + +def knn(dataset_distances, run_distances, count, epsilon=1e-10): + total = len(run_distances) * count + actual = 0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + within = [d for d in found_distances[:count] if d <= true_distances[count - 1] + epsilon] + actual += len(within) + return float(actual) / float(total) + +def rangequery(dataset_distances, run_distances, radius, epsilon=1e-10): + total = 0.0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + true = [d for d in true_distances if d <= radius + epsilon] + found = [d for d in 
found_distances if d <= radius + epsilon] + print('found: ' + str(len(found)) + '/true: ' + str(len(true))) + if len(true) == 0: + if len(found) == 0: + total += 1.0 + else: + if len(found) > len(true): + print(found) + total += 1.0 + continue + total += float(len(found))/float(len(true)) + return float(total) / float(len(run_distances)) + +def epsilon(dataset_distances, run_distances, count, epsilon=0.01): + total = len(run_distances) * count + actual = 0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + within = [d for d in found_distances[:count] if d <= true_distances[count - 1] * (1 + epsilon)] + actual += len(within) + return float(actual) / float(total) + +def rel(dataset_distances, run_distances): + total_closest_distance = 0.0 + total_candidate_distance = 0.0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + for rdist, cdist in zip(true_distances, found_distances): + total_closest_distance += rdist + total_candidate_distance += cdist + if total_closest_distance < 0.01: + return float("inf") + return total_candidate_distance / total_closest_distance + +def queries_per_second(queries, attrs): + return 1.0 / attrs["best_search_time"] + +def index_size(queries, attrs): + # TODO(erikbern): should replace this with peak memory usage or something + return attrs.get("index_size", 0) + +def build_time(queries, attrs): + return attrs["build_time"] + +def candidates(queries, attrs): + return attrs["candidates"] + +all_metrics = { + "k-nn": { + "description": "Recall", + "function": lambda true_distances, run_distances, run_attrs: knn(true_distances, run_distances, run_attrs["count"]), + "worst": float("-inf"), + "lim": [0.0, 1.03] + }, + "range": { + "description": "Recall", + "function": lambda true_distances, run_distances, run_attrs, radius: rangequery(true_distances, run_distances, radius), + "worst": float("-inf"), + "lim": [0.0, 1.03] + }, + "epsilon": { + "description": "Epsilon 0.01 Recall", + 
"function": lambda true_distances, run_distances, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"]), + "worst": float("-inf") + }, + "largeepsilon": { + "description": "Epsilon 0.1 Recall", + "function": lambda true_distances, run_distances, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"], 0.1), + "worst": float("-inf") + }, + "rel": { + "description": "Relative Error", + "function": lambda true_distances, run_distances, run_attrs: rel(true_distances, run_distances), + "worst": float("inf") + }, + "qps": { + "description": "Queries per second (1/s)", + "function": lambda true_distances, run_distances, run_attrs: queries_per_second(true_distances, run_attrs), + "worst": float("-inf") + }, + "build": { + "description": "Indexing time (s)", + "function": lambda true_distances, run_distances, run_attrs: build_time(true_distances, run_attrs), + "worst": float("inf") + }, + "candidates" : { + "description": "Candidates generated", + "function": lambda true_distances, run_distances, run_attrs: candidates(true_distances, run_attrs), + "worst": float("inf") + }, + "indexsize" : { + "description": "Index size (kB)", + "function": lambda true_distances, run_distances, run_attrs: index_size(true_distances, run_attrs), + "worst": float("inf") + }, + "queriessize" : { + "description": "Index size (kB)/Queries per second (s)", + "function": lambda true_distances, run_distances, run_attrs: index_size(true_distances, run_attrs) / queries_per_second(true_distances, run_attrs), + "worst": float("inf") + } +} diff --git a/ann_benchmarks/plotting/plot_variants.py b/ann_benchmarks/plotting/plot_variants.py new file mode 100644 index 0000000..7eb91d5 --- /dev/null +++ b/ann_benchmarks/plotting/plot_variants.py @@ -0,0 +1,12 @@ +from ann_benchmarks.plotting.metrics import all_metrics as metrics + +all_plot_variants = { + "recall/time" : ("k-nn", "qps"), + "recall/buildtime" : ("k-nn", "build"), + "recall/indexsize" : ("k-nn", "indexsize"), + 
"rel/time" : ("rel", "qps"), + "recall/candidates" : ("k-nn", "candidates"), + "recall/qpssize" : ("k-nn", "queriessize"), + "eps/time" : ("epsilon", "qps"), + "largeeps/time" : ("largeepsilon", "qps") +} diff --git a/ann_benchmarks/plotting/utils.py b/ann_benchmarks/plotting/utils.py new file mode 100644 index 0000000..5fd5915 --- /dev/null +++ b/ann_benchmarks/plotting/utils.py @@ -0,0 +1,115 @@ +from __future__ import absolute_import + +import os, itertools, json, numpy, pickle +from ann_benchmarks.plotting.metrics import all_metrics as metrics +import matplotlib.pyplot as plt + +def create_pointset(data, xn, yn): + xm, ym = (metrics[xn], metrics[yn]) + rev = ym["worst"] < 0 + data.sort(key=lambda t: t[-1], reverse=rev) # sort by y coordinate + + axs, ays, als = [], [], [] + # Generate Pareto frontier + xs, ys, ls = [], [], [] + last_x = xm["worst"] + comparator = \ + (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx) + for algo, algo_name, xv, yv in data: + if not xv or not yv: + continue + axs.append(xv) + ays.append(yv) + als.append(algo_name) + if comparator(xv, last_x): + last_x = xv + xs.append(xv) + ys.append(yv) + ls.append(algo_name) + return xs, ys, ls, axs, ays, als + +def compute_metrics(true_nn_distances, res, metric_1, metric_2, radius=-1): + all_results = {} + for i, (properties, run) in enumerate(res): + algo = properties['algo'] + algo_name = properties['name'] + # cache distances to avoid access to hdf5 file + run_distances = list(run['distances']) + + if metric_1 == 'range': + metric_1_value = metrics[metric_1]['function'](true_nn_distances, run_distances, properties, radius) + else: + metric_1_value = metrics[metric_1]['function'](true_nn_distances, run_distances, properties) + if metric_2 == 'range': + metric_2_value = metrics[metric_2]['function'](true_nn_distances, run_distances, properties, radius) + else: + metric_2_value = metrics[metric_2]['function'](true_nn_distances, run_distances, properties) + + print('%3d: %80s 
%12.3f %12.3f' % (i, algo_name, metric_1_value, metric_2_value)) + + all_results.setdefault(algo, []).append((algo, algo_name, metric_1_value, metric_2_value)) + + return all_results + +def compute_metrics_K(all_results, true_nn_distances, res, count, metric_1, metric_2): + for i, (properties, run) in enumerate(res): + algo = properties['algo'] + algo_name = properties['name'] + # cache distances to avoid access to hdf5 file + run_distances = list(run['distances']) + + metric_1_value = metrics[metric_1]['function'](true_nn_distances, run_distances, properties) + metric_2_value = metrics[metric_2]['function'](true_nn_distances, run_distances, properties) + + print('%3d: %80s %12.3f %12.3f' % (i, algo_name, metric_1_value, metric_2_value)) + + all_results.setdefault(algo + '-K=' + str(count), []).append((algo + str(count), algo_name, metric_1_value, metric_2_value)) + + return all_results + +def compute_all_metrics(true_nn_distances, run, properties): + algo = properties["algo"] + algo_name = properties["name"] + print('--') + print(algo_name) + results = {} + # cache distances to avoid access to hdf5 file + run_distances = list(run["distances"]) + for name, metric in metrics.items(): + v = metric["function"](true_nn_distances, run_distances, properties) + results[name] = v + if v: + print('%s: %g' % (name, v)) + return (algo, algo_name, results) + +def generate_n_colors(n): + vs = numpy.linspace(0.4, 1.0, 7) + colors = [(.9, .4, .4, 1.)] + def euclidean(a, b): + return sum((x-y)**2 for x, y in zip(a, b)) + while len(colors) < n: + new_color = max(itertools.product(vs, vs, vs), key=lambda a: min(euclidean(a, b) for b in colors)) + colors.append(new_color + (1.,)) + return colors + +def create_linestyles(unique_algorithms): + colors = dict(zip(unique_algorithms, generate_n_colors(len(unique_algorithms)))) + linestyles = dict((algo, ['--', '-.', '-', ':'][i%4]) for i, algo in enumerate(unique_algorithms)) + markerstyles = dict((algo, ['+', '<', 'o', '*', 'x'][i%5]) for 
i, algo in enumerate(unique_algorithms)) + faded = dict((algo, (r, g, b, 0.3)) for algo, (r, g, b, a) in colors.items()) + return dict((algo, (colors[algo], faded[algo], linestyles[algo], markerstyles[algo])) for algo in unique_algorithms) + +def get_up_down(metric): + if metric["worst"] == float("inf"): + return "down" + return "up" + +def get_left_right(metric): + if metric["worst"] == float("inf"): + return "left" + return "right" + +def get_plot_label(xm, ym): + return "%(xlabel)s-%(ylabel)s tradeoff - %(updown)s and to the %(leftright)s is better" % { + "xlabel" : xm["description"], "ylabel" : ym["description"], "updown" : get_up_down(ym), "leftright" : get_left_right(xm) } + diff --git a/ann_benchmarks/results.py b/ann_benchmarks/results.py new file mode 100644 index 0000000..3adbcd5 --- /dev/null +++ b/ann_benchmarks/results.py @@ -0,0 +1,77 @@ +from __future__ import absolute_import + +import h5py +import json +import os +import re + +def get_algorithm_name(name, batch_mode): + if batch_mode: + return name + "-batch" + return name + +def is_batch(name): + return "-batch" in name + +def get_result_filename(dataset=None, count=None, definition=None, query_arguments=None, batch_mode=False): + d = ['results'] + if dataset: + d.append(dataset) + if count: + d.append(str(count)) + if definition: + d.append(get_algorithm_name(definition.algorithm, batch_mode)) + d.append(re.sub(r'\W+', '_', json.dumps(definition.arguments + query_arguments, sort_keys=True)).strip('_')) + return os.path.join(*d) + +def store_results(dataset, count, definition, query_arguments, attrs, results, batch, rq): + fn = get_result_filename(dataset, count, definition, query_arguments, batch) + head, tail = os.path.split(fn) + if not os.path.isdir(head): + os.makedirs(head) + f = h5py.File(fn, 'w') + for k, v in attrs.items(): + f.attrs[k] = v + times = f.create_dataset('times', (len(results),), 'f') + if rq: + count=1000 #the maximum number of items returned + neighbors = 
f.create_dataset('neighbors', (len(results), count), 'i') + distances = f.create_dataset('distances', (len(results), count), 'f') + for i, (time, ds) in enumerate(results): + times[i] = time + if rq and count < len(ds): + neighbors[i] = [n for n, d in ds[:count]] + distances[i] = [d for n, d in ds[:count]] + else: + neighbors[i] = [n for n, d in ds] + [-1] * (count - len(ds)) + distances[i] = [d for n, d in ds] + [float('inf')] * (count - len(ds)) + #print(neighbors[i]) + #print(distances[i]) + f.close() + + +def load_all_results(dataset=None, count=None, split_batched=False, batch_mode=False): + for root, _, files in os.walk(get_result_filename(dataset, count)): + for fn in files: + try: + if split_batched and batch_mode != is_batch(root): + continue + f = h5py.File(os.path.join(root, fn)) + properties = dict(f.attrs) + # TODO Fix this properly. Sometimes the hdf5 file returns bytes + # This converts these bytes to strings before we work with them + for k in properties.keys(): + try: + properties[k]= properties[k].decode() + except: + pass + yield properties, f + f.close() + except: + pass + +def get_unique_algorithms(): + algorithms = set() + for properties, _ in load_all_results(): + algorithms.add(properties['algo']) + return algorithms diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py new file mode 100644 index 0000000..9a857ba --- /dev/null +++ b/ann_benchmarks/runner.py @@ -0,0 +1,305 @@ +from __future__ import print_function +__true_print = print + +import argparse +import datetime +import docker +import json +import multiprocessing +import numpy +import os +import psutil +import requests +import sys +import threading +import time +import subprocess + +def print(*args, **kwargs): + __true_print(*args, **kwargs) + sys.stdout.flush() + +from ann_benchmarks.datasets import get_dataset, DATASETS +from ann_benchmarks.algorithms.definitions import Definition, instantiate_algorithm, get_algorithm_name +from ann_benchmarks.distance import metrics 
+from ann_benchmarks.results import store_results + +from scipy.sparse import issparse + + +def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_count, batch, rq): + best_search_time = float('inf') + for i in range(run_count): + print('Run %d/%d...' % (i+1, run_count)) + n_items_processed = [0] # a bit dumb but can't be a scalar since of Python's scoping rules + + def single_query(v): + # special code for Risc + if "Risc" in algoname or 'DivideSkip' in algoname: + algo.pre_query(v, count) + start = time.time() + if rq: + candidates = algo.query(v, count, rq) # now count is the radius + else: + candidates = algo.query(v, count) + total = (time.time() - start) + # special code for Risc + if "Risc" in algoname or 'DivideSkip' in algoname: + candidates = algo.post_query() + if issparse(X_train): + candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0]))) + for idx in candidates] + else: + candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx]))) + for idx in candidates] + n_items_processed[0] += 1 + if n_items_processed[0] % 1000 == 0: + print('Processed %d/%d queries...' 
% (n_items_processed[0], X_test.shape[0])) + if rq==False and len(candidates) > count: + print('warning: algorithm %s returned %d results, but count is only %d)' % (algo, len(candidates), count)) + return (total, candidates) + + def batch_query(X): + start = time.time() + algo.batch_query(X, count) + total = (time.time() - start) + results = algo.get_batch_results() + # needs testing + if issparse(X_train): + candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0]))) + for idx in single_results] + for v, single_results in zip(X, results)] + else: + candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx]))) + for idx in single_results] + for v, single_results in zip(X, results)] + return [(total / float(X.shape[0]), v) for v in candidates] + + if batch: + results = batch_query(X_test) + else: + results = [single_query(x) for x in X_test] + + total_time = sum(time for time, _ in results) + total_candidates = sum(len(candidates) for _, candidates in results) + search_time = total_time / len(X_test) + avg_candidates = total_candidates / len(X_test) + best_search_time = min(best_search_time, search_time) + + verbose = hasattr(algo, "query_verbose") + attrs = { + "batch_mode": batch, + "best_search_time": best_search_time, + "candidates": avg_candidates, + "expect_extra": verbose, + "name": str(algo), + "run_count": run_count, + "distance": distance, + "count": int(count) + } + return (attrs, results) + + +def run(definition, dataset, count, run_count, batch, rq): + algo = instantiate_algorithm(definition) + assert not definition.query_argument_groups \ + or hasattr(algo, "set_query_arguments"), """\ +error: query argument groups have been specified for %s.%s(%s), but the \ +algorithm instantiated from it does not implement the set_query_arguments \ +function""" % (definition.module, definition.constructor, definition.arguments) + + D = get_dataset(dataset) + # Chunjiang modified + print('Is the train set a sparse 
matrix? %d' % issparse(D['train'][()])) + if 'sparse' not in dataset: + X_train = D['train'][()].toarray() + else: + X_train = D['train'][()] + # X_train = numpy.array(D['train']) + #X_train = X_train[:2000] + X_test = numpy.array(D['test']) + distance = D.attrs['distance'] + print('got a train set of size (%d * %d)' % X_train.shape) + print('got %d queries' % len(X_test)) + + try: + # special code for Risc + print(X_train.shape) + if 'Risc' in definition.algorithm or 'DivideSkip' in definition.algorithm: + X_train = numpy.concatenate((X_train, [numpy.ones(X_train.shape[1], dtype=numpy.bool)]), axis=0) + print(X_train.shape) + algo.pre_fit(X_train) + t0 = time.time() + index_size_before = algo.get_index_size("self") + algo.fit(X_train) + build_time = time.time() - t0 + index_size = algo.get_index_size("self") - index_size_before + print('Built index in', build_time) + print('Index size: ', index_size) + + query_argument_groups = definition.query_argument_groups + # Make sure that algorithms with no query argument groups still get run + # once by providing them with a single, empty, harmless group + if not query_argument_groups: + query_argument_groups = [[]] + + for pos, query_arguments in enumerate(query_argument_groups, 1): + print("Running query argument group %d of %d..." 
% + (pos, len(query_argument_groups))) + if query_arguments: + algo.set_query_arguments(*query_arguments) + descriptor, results = run_individual_query(definition.algorithm, algo, X_train, X_test, + distance, count, run_count, batch, rq) + descriptor["build_time"] = build_time + descriptor["index_size"] = index_size + descriptor["algo"] = get_algorithm_name(definition.algorithm, batch) + descriptor["dataset"] = dataset + store_results(dataset, count, definition, + query_arguments, descriptor, results, batch, rq) + + finally: + algo.done() + + +def run_from_cmdline(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--dataset', + choices=DATASETS.keys(), + required=True) + parser.add_argument( + '--algorithm', + required=True) + parser.add_argument( + '--module', + required=True) + parser.add_argument( + '--constructor', + required=True) + parser.add_argument( + '--count', + required=True, + type=int) + parser.add_argument( + '--runs', + required=True, + type=int) + parser.add_argument( + '--batch', + action='store_true') + parser.add_argument( + '--rq', + action='store_true') + parser.add_argument( + '--radius', + type=float) + parser.add_argument( + 'build') + parser.add_argument( + 'queries', + nargs='*', + default=[]) + args = parser.parse_args() + algo_args = json.loads(args.build) + query_args = [json.loads(q) for q in args.queries] + + definition = Definition( + algorithm=args.algorithm, + docker_tag=None, # not needed + module=args.module, + constructor=args.constructor, + arguments=algo_args, + query_argument_groups=query_args, + disabled=False + ) + if args.rq: + run(definition, args.dataset, args.radius, args.runs, args.batch, args.rq) + else: + run(definition, args.dataset, args.count, args.runs, args.batch, args.rq) + + +def run_docker(definition, dataset, count, runs, timeout, batch, rq, radius, mem_limit=None): + import colors # Think it doesn't work in Python 2 + + cmd = ['--dataset', dataset, + '--algorithm', definition.algorithm, + '--module',
definition.module, + '--constructor', definition.constructor, + '--runs', str(runs), + '--count', str(count)] + if batch: + cmd += ['--batch'] + if rq: + cmd += ['--rq', '--radius', str(radius)] + cmd.append(json.dumps(definition.arguments)) + cmd += [json.dumps(qag) for qag in definition.query_argument_groups] + print('Running command', cmd) + client = docker.from_env() + if mem_limit is None: + mem_limit = psutil.virtual_memory().available + print('Memory limit:', mem_limit) + cpu_limit = "0-%d" % (multiprocessing.cpu_count() - 1) + if not batch: + # Limit to first cpu if not in batch mode + cpu_limit = "0" + print('Running on CPUs:', cpu_limit) + + container = client.containers.run( + definition.docker_tag, + cmd, + volumes={ + os.path.abspath('ann_benchmarks'): {'bind': '/home/app/ann_benchmarks', 'mode': 'ro'}, + os.path.abspath('data'): {'bind': '/home/app/data', 'mode': 'ro'}, + os.path.abspath('results'): {'bind': '/home/app/results', 'mode': 'rw'}, + }, + cpuset_cpus=cpu_limit, + mem_limit=mem_limit, + detach=True) + + def stream_logs(): + for line in container.logs(stream=True): + print(colors.color(line.decode().rstrip(), fg='blue')) + + if sys.version_info >= (3, 0): + t = threading.Thread(target=stream_logs, daemon=True) + else: + t = threading.Thread(target=stream_logs) + t.daemon = True + t.start() + try: + exit_code = container.wait(timeout=timeout) + + # Exit if exit code + if exit_code == 0: + return + elif exit_code is not None: + print(colors.color(container.logs().decode(), fg='red')) + raise Exception('Child process raised exception %d' % exit_code) + + finally: + container.remove(force=True) +def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius, mem_limit=None): + cmd = ['--dataset', dataset, + '--algorithm', definition.algorithm, + '--module', definition.module, + '--constructor', definition.constructor, + '--runs', str(runs), + '--count', str(count)] + if batch: + cmd += ['--batch'] + if rq: + cmd += ['--rq', 
'--radius', str(radius)] + cmd.append(json.dumps(definition.arguments)) + cmd += [json.dumps(qag) for qag in definition.query_argument_groups] + print('Running command', cmd) + + strCmd = ' '.join(["'" + k + "'" for k in cmd]) + print('String of command', strCmd) + + subprocess.check_call('singularity exec ../../singularity/ann-bench-nmslib3.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-pynndescent.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-datasketch.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-sklearn.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-risc.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-ngt.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + diff --git a/create_dataset.py b/create_dataset.py new file mode 100644 index 0000000..b9463a8 --- /dev/null +++ b/create_dataset.py @@ -0,0 +1,12 @@ +import argparse +from ann_benchmarks.datasets import DATASETS, get_dataset_fn + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--dataset', + choices=DATASETS.keys(), + required=True) + args = parser.parse_args() + fn = get_dataset_fn(args.dataset) + DATASETS[args.dataset](fn) diff --git a/create_website.py b/create_website.py new file mode 100644 index 0000000..0d9eaa1 --- /dev/null +++ b/create_website.py @@ -0,0 +1,213 @@ +import matplotlib as mpl +mpl.use('Agg') +import argparse +import os, json, pickle, yaml +import numpy +import hashlib +from jinja2 import Environment, FileSystemLoader + +from ann_benchmarks import results +from ann_benchmarks.algorithms.definitions import get_algorithm_name +from ann_benchmarks.datasets import 
get_dataset +from ann_benchmarks.plotting.plot_variants import all_plot_variants as plot_variants +from ann_benchmarks.plotting.metrics import all_metrics as metrics +from ann_benchmarks.plotting.utils import get_plot_label, compute_metrics, compute_all_metrics, create_pointset, create_linestyles +import plot + +colors = [ + "rgba(166,206,227,1)", + "rgba(31,120,180,1)", + "rgba(178,223,138,1)", + "rgba(51,160,44,1)", + "rgba(251,154,153,1)", + "rgba(227,26,28,1)", + "rgba(253,191,111,1)", + "rgba(255,127,0,1)", + "rgba(202,178,214,1)" + ] + +point_styles = { + "o" : "circle", + "<" : "triangle", + "*" : "star", + "x" : "cross", + "+" : "rect", + } + +def convert_color(color): + r, g, b, a = color + return "rgba(%(r)d, %(g)d, %(b)d, %(a)d)" % { + "r" : r * 255, "g" : g * 255, "b" : b * 255 , "a" : a} + +def convert_linestyle(ls): + new_ls = {} + for algo in ls.keys(): + algostyle = ls[algo] + new_ls[algo] = (convert_color(algostyle[0]), convert_color(algostyle[1]), + algostyle[2], point_styles[algostyle[3]]) + return new_ls + +def get_run_desc(properties): + return "%(dataset)s_%(count)d_%(distance)s" % properties + +def get_dataset_from_desc(desc): + return desc.split("_")[0] + +def get_count_from_desc(desc): + return desc.split("_")[1] + +def get_distance_from_desc(desc): + return desc.split("_")[2] + +def get_dataset_label(desc): + return get_dataset_from_desc(desc) + " (k = " + get_count_from_desc(desc) + ")" + +def directory_path(s): + if not os.path.isdir(s): + raise argparse.ArgumentTypeError("'%s' is not a directory" % s) + return s + "/" + +def prepare_data(data, xn, yn): + """Change format from (algo, instance, dict) to (algo, instance, x, y).""" + res = [] + for algo, algo_name, result in data: + res.append((algo, algo_name, result[xn], result[yn])) + return res + +parser = argparse.ArgumentParser() +parser.add_argument( + '--plottype', + help = 'Generate only the plots specified', + nargs = '*', + choices = plot_variants.keys(), + default = 
plot_variants.keys()) +parser.add_argument( + '--outputdir', + help = 'Select output directory', + default = '.', + type=directory_path, + action = 'store') +parser.add_argument( + '--latex', + help='generates latex code for each plot', + action = 'store_true') +parser.add_argument( + '--scatter', + help='create scatterplot for data', + action = 'store_true') +args = parser.parse_args() + +def get_lines(all_data, xn, yn, render_all_points): + """ For each algorithm run on a dataset, obtain its performance curve coords.""" + plot_data = [] + for algo in sorted(all_data.keys(), key=lambda x: x.lower()): + xs, ys, ls, axs, ays, als = \ + create_pointset(prepare_data(all_data[algo], xn, yn), xn, yn) + if render_all_points: + xs, ys, ls = axs, ays, als + plot_data.append({ "name": algo, "coords" : zip(xs, ys), "labels" : ls, + "scatter" : render_all_points}) + return plot_data + +def create_plot(all_data, xn, yn, linestyle, j2_env, additional_label = "", plottype = "line"): + xm, ym = (metrics[xn], metrics[yn]) + render_all_points = plottype == "bubble" + plot_data = get_lines(all_data, xn, yn, render_all_points) + latex_code = j2_env.get_template("latex.template").\ + render(plot_data = plot_data, caption = get_plot_label(xm, ym), + xlabel = xm["description"], ylabel = ym["description"]) + plot_data = get_lines(all_data, xn, yn, render_all_points) + button_label = hashlib.sha224((get_plot_label(xm, ym) + + additional_label).encode("utf-8")).hexdigest() + return j2_env.get_template("chartjs.template").\ + render(args = args, latex_code = latex_code, button_label = button_label, + data_points = plot_data, + xlabel = xm["description"], ylabel = ym["description"], + plottype = plottype, plot_label = get_plot_label(xm, ym), + label = additional_label, linestyle = linestyle, + render_all_points = render_all_points) + +def build_detail_site(data, label_func, j2_env, linestyles, batch=False): + for (name, runs) in data.items(): + print("Building '%s'" % name) + all_runs = 
runs.keys() + label = label_func(name) + data = {"normal" : [], "scatter" : []} + + for plottype in args.plottype: + xn, yn = plot_variants[plottype] + data["normal"].append(create_plot(runs, xn, yn, convert_linestyle(linestyles), j2_env)) + if args.scatter: + data["scatter"].append(create_plot(runs, xn, yn, + convert_linestyle(linestyles), j2_env, "Scatterplot ", "bubble")) + + # create png plot for summary page + data_for_plot = {} + for k in runs.keys(): + data_for_plot[k] = prepare_data(runs[k], 'k-nn', 'qps') + plot.create_plot(data_for_plot, False, + False, True, 'k-nn', 'qps', args.outputdir + get_algorithm_name(name, batch) + ".png", + linestyles, batch) + with open(args.outputdir + get_algorithm_name(name, batch) + ".html", "w") as text_file: + text_file.write(j2_env.get_template("detail_page.html"). + render(title = label, plot_data = data, args = args, batch=batch)) + + +def build_index_site(datasets, algorithms, j2_env, file_name): + dataset_data = {'batch' : [], 'non-batch' : []} + for mode in ['batch', 'non-batch']: + distance_measures = sorted(set([get_distance_from_desc(e) for e in datasets[mode].keys()])) + sorted_datasets = sorted(set([get_dataset_from_desc(e) for e in datasets[mode].keys()])) + + for dm in distance_measures: + d = {"name" : dm.capitalize(), "entries": []} + for ds in sorted_datasets: + matching_datasets = [e for e in datasets[mode].keys() \ + if get_dataset_from_desc(e) == ds and \ + get_distance_from_desc(e) == dm] + sorted_matches = sorted(matching_datasets, \ + key = lambda e: int(get_count_from_desc(e))) + for idd in sorted_matches: + d["entries"].append({"name" : idd, "desc" : get_dataset_label(idd)}) + dataset_data[mode].append(d) + + with open(args.outputdir + "index.html", "w") as text_file: + text_file.write(j2_env.get_template("summary.html"). 
+ render(title = "ANN-Benchmarks", dataset_with_distances = dataset_data, + algorithms = algorithms, label_func=get_algorithm_name)) + +def load_all_results(): + """Read all result files and compute all metrics""" + all_runs_by_dataset = {'batch' : {}, 'non-batch': {}} + all_runs_by_algorithm = {'batch' : {}, 'non-batch' : {}} + cached_true_dist = [] + old_sdn = None + for properties, f in results.load_all_results(): + sdn = get_run_desc(properties) + if sdn != old_sdn: + dataset = get_dataset(properties["dataset"]) + cached_true_dist = list(dataset["distances"]) + old_sdn = sdn + algo = properties["algo"] + ms = compute_all_metrics(cached_true_dist, f, properties) + algo_ds = get_dataset_label(sdn) + idx = "non-batch" + if properties["batch_mode"]: + idx = "batch" + all_runs_by_algorithm[idx].setdefault(algo, {}).setdefault(algo_ds, []).append(ms) + all_runs_by_dataset[idx].setdefault(sdn, {}).setdefault(algo, []).append(ms) + + return (all_runs_by_dataset, all_runs_by_algorithm) + +j2_env = Environment(loader=FileSystemLoader("./templates/"), trim_blocks = True) +j2_env.globals.update(zip=zip, len=len) +runs_by_ds, runs_by_algo = load_all_results() +dataset_names = [get_dataset_label(x) for x in list(runs_by_ds['batch'].keys()) + list(runs_by_ds['non-batch'].keys())] +algorithm_names = list(runs_by_algo['batch'].keys()) + list(runs_by_algo['non-batch'].keys()) +linestyles = {**create_linestyles(dataset_names), **create_linestyles(algorithm_names)} + +build_detail_site(runs_by_ds['non-batch'], lambda label: get_dataset_label(label), j2_env, linestyles, False) +build_detail_site(runs_by_ds['batch'], lambda label: get_dataset_label(label), j2_env, linestyles, True) +build_detail_site(runs_by_algo['non-batch'], lambda x: x, j2_env, linestyles, False) +build_detail_site(runs_by_algo['batch'], lambda x: x, j2_env, linestyles, True) +build_index_site(runs_by_ds, runs_by_algo, j2_env, "index.html") diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 
0000000..5e7d273 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore diff --git a/docker-install/Dockerfile b/docker-install/Dockerfile new file mode 100644 index 0000000..aff9650 --- /dev/null +++ b/docker-install/Dockerfile @@ -0,0 +1,10 @@ +FROM ubuntu:latest + +RUN apt-get update +RUN apt-get install -y python3-numpy python3-scipy python3-pip build-essential git + +WORKDIR /home/app +COPY requirements.txt run_algorithm.py ./ +RUN pip3 install -rrequirements.txt + +ENTRYPOINT ["python3", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.annoy b/docker-install/Dockerfile.annoy new file mode 100644 index 0000000..e426125 --- /dev/null +++ b/docker-install/Dockerfile.annoy @@ -0,0 +1,5 @@ +FROM ann-benchmarks + +RUN git clone https://github.com/spotify/annoy +RUN cd annoy && python3 setup.py install +RUN python3 -c 'import annoy' diff --git a/docker-install/Dockerfile.datasketch b/docker-install/Dockerfile.datasketch new file mode 100644 index 0000000..d70c592 --- /dev/null +++ b/docker-install/Dockerfile.datasketch @@ -0,0 +1,4 @@ +FROM ann-benchmarks + +RUN pip3 install datasketch +RUN python3 -c 'import datasketch' diff --git a/docker-install/Dockerfile.dolphinn b/docker-install/Dockerfile.dolphinn new file mode 100644 index 0000000..4e2f7cc --- /dev/null +++ b/docker-install/Dockerfile.dolphinn @@ -0,0 +1,5 @@ +FROM ann-benchmarks + +RUN git clone https://github.com/ipsarros/DolphinnPy lib-dolphinnpy +ENV PYTHONPATH lib-dolphinnpy +RUN python3 -c 'import dolphinn' diff --git a/docker-install/Dockerfile.faiss b/docker-install/Dockerfile.faiss new file mode 100644 index 0000000..a39531a --- /dev/null +++ b/docker-install/Dockerfile.faiss @@ -0,0 +1,12 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y libopenblas-base libopenblas-dev libpython-dev python-numpy python-pip swig +RUN git clone https://github.com/facebookresearch/faiss lib-faiss +RUN cd lib-faiss && 
git checkout tags/v1.2.1 -b lib-faiss && cp example_makefiles/makefile.inc.Linux makefile.inc && make -j4 py BLASLDFLAGS=/usr/lib/x86_64-linux-gnu/libopenblas.so.0 +ENV PYTHONPATH lib-faiss + +# faiss doesn't work with python3 afaik +RUN python -c 'import faiss' +RUN pip install -r requirements.txt +RUN pip install sklearn enum34 +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.flann b/docker-install/Dockerfile.flann new file mode 100644 index 0000000..4ca2584 --- /dev/null +++ b/docker-install/Dockerfile.flann @@ -0,0 +1,10 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y cmake +RUN git clone https://github.com/mariusmuja/flann +RUN mkdir flann/build +RUN cd flann/build && cmake .. +RUN cd flann/build && make -j4 +RUN cd flann/build && make install +RUN pip3 install sklearn +RUN python3 -c 'import pyflann' diff --git a/docker-install/Dockerfile.hdidx b/docker-install/Dockerfile.hdidx new file mode 100644 index 0000000..5d533d1 --- /dev/null +++ b/docker-install/Dockerfile.hdidx @@ -0,0 +1,18 @@ +FROM ann-benchmarks + +# needed to avoid some dependencies starting interaction on the command line +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + python-opencv \ + python-numpy \ + python-pip \ + git +RUN pip install cython +RUN pip install -r requirements.txt + +RUN git clone https://github.com/hdidx/hdidx.git +RUN cd hdidx && python setup.py install + +RUN python -c 'import hdidx; a = hdidx.indexer.SHIndexer' +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.hnswlib b/docker-install/Dockerfile.hnswlib new file mode 100644 index 0000000..9648903 --- /dev/null +++ b/docker-install/Dockerfile.hnswlib @@ -0,0 +1,10 @@ +FROM ann-benchmarks + +RUN apt-get install -y python-setuptools python-pip +RUN pip3 install pybind11 numpy setuptools +RUN git clone https://github.com/nmslib/hnsw.git;cd hnsw; git checkout denorm + +RUN cd hnsw/python_bindings; python3 
setup.py install + +RUN python3 -c 'import hnswlib' + diff --git a/docker-install/Dockerfile.kgraph b/docker-install/Dockerfile.kgraph new file mode 100644 index 0000000..43c9bf3 --- /dev/null +++ b/docker-install/Dockerfile.kgraph @@ -0,0 +1,11 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y libboost-timer-dev libboost-chrono-dev libboost-program-options-dev libboost-system-dev libboost-python-dev python-numpy python-pip +RUN git clone https://github.com/aaalgo/kgraph +RUN cd kgraph && python setup.py build && python setup.py install + +# kgraph doesn't work with python3 afaik +RUN python -c 'import pykgraph' +RUN pip install -rrequirements.txt +RUN pip install enum34 +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.mrpt b/docker-install/Dockerfile.mrpt new file mode 100644 index 0000000..c14ab76 --- /dev/null +++ b/docker-install/Dockerfile.mrpt @@ -0,0 +1,6 @@ +FROM ann-benchmarks + +RUN pip3 install sklearn +#RUN pip3 install git+https://github.com/teemupitkanen/mrpt/tree/2369a9df0fd7e9774b02237253f022a55bd6f532 +RUN pip3 install git+https://github.com/chunjiangzhu/mrpt +#https://github.com/teemupitkanen/mrpt diff --git a/docker-install/Dockerfile.nearpy b/docker-install/Dockerfile.nearpy new file mode 100644 index 0000000..4411816 --- /dev/null +++ b/docker-install/Dockerfile.nearpy @@ -0,0 +1,5 @@ +FROM ann-benchmarks + +RUN apt-get install -y libhdf5-openmpi-dev cython +RUN pip3 install nearpy bitarray redis sklearn +RUN python3 -c 'import nearpy' \ No newline at end of file diff --git a/docker-install/Dockerfile.ngt b/docker-install/Dockerfile.ngt new file mode 100644 index 0000000..c0ef95c --- /dev/null +++ b/docker-install/Dockerfile.ngt @@ -0,0 +1,13 @@ +FROM ann-benchmarks + +RUN apt-get update +RUN apt-get install -y git cmake g++ python3 python3-setuptools python3-pip +RUN pip3 install wheel pybind11 +RUN git clone https://github.com/chunjiangzhu/ngt.git +RUN mkdir -p ngt/build +RUN cd ngt/build && 
cmake .. +RUN cd ngt/build && make && make install +RUN ldconfig +RUN cd ngt/python && python3 setup.py bdist_wheel +RUN pip3 install ngt/python/dist/ngt-*-linux_x86_64.whl + diff --git a/docker-install/Dockerfile.nmslib b/docker-install/Dockerfile.nmslib new file mode 100644 index 0000000..2f1bba6 --- /dev/null +++ b/docker-install/Dockerfile.nmslib @@ -0,0 +1,16 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y cmake libboost-all-dev libeigen3-dev libgsl0-dev +RUN git clone https://github.com/searchivarius/nmslib.git +RUN cd nmslib/similarity_search && cmake . -DWITH_EXTRAS=1 +RUN cd nmslib/similarity_search && make -j4 +RUN apt-get install -y python-setuptools python-pip python-numpy +RUN pip install pybind11 +RUN cd nmslib/python_bindings && python setup.py build +RUN cd nmslib/python_bindings && python setup.py install + +# nmslib doesn't work with python3 afaik +RUN python -c 'import nmslib' +RUN pip install -rrequirements.txt +RUN pip install enum34 +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.nmslib-sparse b/docker-install/Dockerfile.nmslib-sparse new file mode 100644 index 0000000..39ffc6f --- /dev/null +++ b/docker-install/Dockerfile.nmslib-sparse @@ -0,0 +1,17 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y cmake libboost-all-dev libeigen3-dev libgsl0-dev +RUN git clone https://github.com/searchivarius/nmslib.git +RUN cd nmslib/similarity_search && cmake . 
-DWITH_EXTRAS=1 +RUN cd nmslib/similarity_search && make -j4 +RUN apt-get install -y python-setuptools python-pip python-numpy +RUN pip install pybind11 +RUN cd nmslib/python_bindings && python setup.py build +RUN cd nmslib/python_bindings && python setup.py install + +# nmslib doesn't work with python3 afaik +RUN python -c 'import nmslib' +RUN pip install -rrequirements.txt +RUN pip install enum34 +RUN pip install h5sparse +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.panns b/docker-install/Dockerfile.panns new file mode 100644 index 0000000..db5438f --- /dev/null +++ b/docker-install/Dockerfile.panns @@ -0,0 +1,10 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y python-pip python-numpy python-scipy +RUN pip install panns + +# panns doesn't work with python3 afaik +RUN python -c 'import panns' +RUN pip install -rrequirements.txt +RUN pip install enum34 +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.pynndescent b/docker-install/Dockerfile.pynndescent new file mode 100644 index 0000000..cc2b8fd --- /dev/null +++ b/docker-install/Dockerfile.pynndescent @@ -0,0 +1,6 @@ +FROM ann-benchmarks + +RUN pip3 install numba scikit-learn +RUN git clone https://github.com/lmcinnes/pynndescent +RUN cd pynndescent && python3 setup.py install +RUN python3 -c 'import pynndescent' diff --git a/docker-install/Dockerfile.rpforest b/docker-install/Dockerfile.rpforest new file mode 100644 index 0000000..72c1231 --- /dev/null +++ b/docker-install/Dockerfile.rpforest @@ -0,0 +1,5 @@ +FROM ann-benchmarks + +RUN git clone https://github.com/lyst/rpforest +RUN cd rpforest && python3 setup.py install +RUN python3 -c 'import rpforest' diff --git a/docker-install/Dockerfile.sklearn b/docker-install/Dockerfile.sklearn new file mode 100644 index 0000000..c61a79f --- /dev/null +++ b/docker-install/Dockerfile.sklearn @@ -0,0 +1,4 @@ +FROM ann-benchmarks + +RUN pip3 install scikit-learn +RUN python3 -c 'import 
sklearn' diff --git a/install.py b/install.py new file mode 100644 index 0000000..33b42a0 --- /dev/null +++ b/install.py @@ -0,0 +1,63 @@ +import json +import os +import argparse +import subprocess +from multiprocessing import Pool +from ann_benchmarks.main import positive_int + + +def build(library,args): + print('Building %s...' % library) + if args is not None and len(args) != 0: + q = " ".join(["--build-arg " + x.replace(" ","\\ ") for x in args]) + else: + q = "" + subprocess.check_call( + 'docker build %s\ + --rm -t ann-benchmarks-%s -f docker-install/Dockerfile.%s .' % (q, library, library), shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--proc", + default=1, + type=positive_int, + help="the number of process to build docker images") + parser.add_argument( + '--algorithm', + metavar='NAME', + help='build only the named algorithm image', + default=None) + parser.add_argument( + '--build-arg', + help='pass given args to all docker builds', + nargs="+") + args = parser.parse_args() + + print('Building base image...') + subprocess.check_call( + 'docker build \ + --rm -t ann-benchmarks -f docker-install/Dockerfile .', shell=True) + + if args.algorithm: + print('Building algorithm(%s) image...' % args.algorithm) + build(args.algorithm,args.build_arg) + elif os.getenv('LIBRARY'): + print('Building algorithm(%s) image...' % os.getenv('LIBRARY')) + build(os.getenv('LIBRARY'),args.build_arg) + else: + print('Building algorithm images... 
with (%d) processes' % args.proc) + dockerfiles = [] + for fn in os.listdir('docker-install'): + if fn.startswith('Dockerfile.'): + dockerfiles.append(fn.split('.')[-1]) + + if args.proc == 1: + [build(tag,args.build_arg) for tag in dockerfiles] + else: + pool = Pool(processes=args.proc) + pool.map(lambda x: build(x, args.build_arg), dockerfiles) + pool.close() + pool.join() diff --git a/run.py b/run.py new file mode 100644 index 0000000..adf3247 --- /dev/null +++ b/run.py @@ -0,0 +1,4 @@ +from ann_benchmarks.main import main + +main() + diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..3529ebc --- /dev/null +++ b/run.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --partition=HaswellPriority # Name of partition +#SBATCH --ntasks=1 # Request 48 CPU cores +#SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10] +#SBATCH --exclusive + +module load anaconda/5.1.0 +source activate ann_env +module purge +module load gcc/5.4.0 +module load singularity/3.1 +#python cpBuildingTime.py +#singularity exec ../singularity/ann-bench-nmslib.sif python -c 'import nmslib' +#python run.py --dataset=molport-1024-jaccard --algorithm='Hnsw(Nmslib)' # Replace with your application's commands +#python run.py --dataset=molport-1024-jaccard --algorithm='Hnsw(Nmslib)' --batch +#python run.py --dataset=molport-1024-jaccard --algorithm='SW-graph(Nmslib)' +#python run.py --dataset=molport-1024-jaccard --algorithm='VPtree(Nmslib)' +#python run.py --dataset=molport-1024-jaccard --algorithm='Pynndescent' +#python run.py --dataset=molport-1024-jaccard --algorithm='Datasketch' +#python run.py --dataset=molport-1024-jaccard --algorithm='Bruteforce' +#python run.py --dataset=molport-1024-jaccard --algorithm='Balltree(Sklearn)' +#python run.py --dataset=molport-1024-jaccard --algorithm='Risc' +#python run.py --dataset=molport-1024-jaccard --algorithm='DivideSkip' +python run.py --dataset=molport-1024-jaccard --rq --radius=0.4 --algorithm='Onng(Ngt)' 
+#python run.py --dataset=molport-1024-jaccard --algorithm='Panng(Ngt)' + +#python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' # Replace with your application's commands +#python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --batch +#python run.py --dataset=chembl-1024-jaccard --algorithm='SW-graph(Nmslib)' +#python run.py --dataset=chembl-1024-jaccard --algorithm='VPtree(Nmslib)' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Pynndescent' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Datasketch' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Bruteforce' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Balltree(Sklearn)' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Risc' +#python run.py --dataset=chembl-1024-jaccard --algorithm='DivideSkip' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Onng(Ngt)' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Panng(Ngt)' + diff --git a/run_algorithm.py b/run_algorithm.py new file mode 100644 index 0000000..f1add5b --- /dev/null +++ b/run_algorithm.py @@ -0,0 +1,3 @@ +from ann_benchmarks.runner import run_from_cmdline + +run_from_cmdline() diff --git a/running.txt b/running.txt new file mode 100644 index 0000000..1612246 --- /dev/null +++ b/running.txt @@ -0,0 +1,6 @@ +2358986:c-onng +2363468:m-onng +2363449:m-bruteforce +2363450:m-balltree +2363453:m-datasketch +2363454:c-datasketch diff --git a/singularity-install/ann-bench-datasketch.def b/singularity-install/ann-bench-datasketch.def new file mode 100644 index 0000000..75b7198 --- /dev/null +++ b/singularity-install/ann-bench-datasketch.def @@ -0,0 +1,14 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +pip3 install datasketch h5sparse + +python3 -c 'import datasketch' + + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench-ngt.def b/singularity-install/ann-bench-ngt.def new file mode 100644 index 
0000000..3c5844a --- /dev/null +++ b/singularity-install/ann-bench-ngt.def @@ -0,0 +1,22 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +apt-get update +apt-get install -y git cmake g++ python3 python3-setuptools python3-pip +pip3 install wheel pybind11 +git clone https://github.com/chunjiangzhu/ngt.git +mkdir -p ngt/build +cd ngt/build && cmake .. +make && make install && cd ../../ +ldconfig +cd ngt/python && python3 setup.py bdist_wheel && cd ../../ +pip3 install ngt/python/dist/ngt-*-linux_x86_64.whl +pip3 install h5sparse + +%runscript +python3 run_algorithm.py + diff --git a/singularity-install/ann-bench-nmslib.def b/singularity-install/ann-bench-nmslib.def new file mode 100644 index 0000000..69e836c --- /dev/null +++ b/singularity-install/ann-bench-nmslib.def @@ -0,0 +1,20 @@ +Bootstrap: localimage +From: /home/cjz18001/singularity/ann-bench.sif + +%post +apt-get update && apt-get install -y cmake libboost-all-dev libeigen3-dev libgsl0-dev +git clone https://github.com/searchivarius/nmslib.git +cd nmslib/similarity_search && cmake . -DWITH_EXTRAS=1 +make -j4 && cd ../../ +apt-get install -y python-setuptools python-pip python-numpy +pip install pybind11 +cd nmslib/python_bindings && python setup.py build +python setup.py install && cd ../../ + +python -c 'import nmslib' +pip install -rrequirements.txt +pip install enum34 +pip install h5sparse + +%runscript +python run_algorithm.py diff --git a/singularity-install/ann-bench-nmslib3.def b/singularity-install/ann-bench-nmslib3.def new file mode 100644 index 0000000..2452421 --- /dev/null +++ b/singularity-install/ann-bench-nmslib3.def @@ -0,0 +1,20 @@ +Bootstrap: localimage +From: ann-bench.sif + +%post +apt-get update && apt-get install -y cmake libboost-all-dev libeigen3-dev libgsl0-dev +git clone https://github.uconn.edu/mldrugdiscovery/nmslib.git +cd nmslib/similarity_search && cmake . 
-DWITH_EXTRAS=1 +make -j4 && cd ../../ +apt-get install -y python-setuptools python-pip python-numpy +pip3 install pybind11 +cd nmslib/python_bindings && python3 setup.py build +python3 setup.py install && cd ../../ + +python3 -c 'import nmslib' +pip3 install -rrequirements.txt +pip3 install enum34 +pip3 install h5sparse + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench-pynndescent.def b/singularity-install/ann-bench-pynndescent.def new file mode 100644 index 0000000..c8b7c6f --- /dev/null +++ b/singularity-install/ann-bench-pynndescent.def @@ -0,0 +1,16 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +pip3 install numba scikit-learn h5sparse +git clone https://github.com/lmcinnes/pynndescent +cd pynndescent && python3 setup.py install && cd ../ + +python3 -c 'import pynndescent' + + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench-risc.def b/singularity-install/ann-bench-risc.def new file mode 100644 index 0000000..6245485 --- /dev/null +++ b/singularity-install/ann-bench-risc.def @@ -0,0 +1,16 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +apt-get update && apt-get install -y python3-dev swig +git clone https://github.com/chunjiangzhu/risc.git +cd risc/Code && ./build.sh && cd ../../ +pip3 install h5sparse +cd risc/Code && python3 -c 'import pyrisc' + + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench-sklearn.def b/singularity-install/ann-bench-sklearn.def new file mode 100644 index 0000000..2d354d5 --- /dev/null +++ b/singularity-install/ann-bench-sklearn.def @@ -0,0 +1,13 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +pip3 install scikit-learn h5sparse + +python3 -c 'import sklearn' + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench.def b/singularity-install/ann-bench.def new file mode 100644 index 0000000..956abbe --- /dev/null +++ 
b/singularity-install/ann-bench.def @@ -0,0 +1,16 @@ +Bootstrap: library +From: ubuntu:16.04 + +%files +requirements.txt +run_algorithm.py + +%post +apt-get update +apt-get install -y python3-numpy python3-scipy python3-pip build-essential git + +pip3 install -rrequirements.txt + +%runscript +python3 run_algorithm.py + diff --git a/singularity-install/requirements.txt b/singularity-install/requirements.txt new file mode 100644 index 0000000..a453071 --- /dev/null +++ b/singularity-install/requirements.txt @@ -0,0 +1,10 @@ +ansicolors==1.1.8 +docker==2.6.1 +h5py==2.7.1 +matplotlib==2.1.0 +numpy==1.13.3 +pyyaml==3.12 +psutil==5.4.2 +scipy==1.0.0 +scikit-learn==0.19.1 +jinja2==2.10 diff --git a/singularity-install/run_algorithm.py b/singularity-install/run_algorithm.py new file mode 100644 index 0000000..f1add5b --- /dev/null +++ b/singularity-install/run_algorithm.py @@ -0,0 +1,3 @@ +from ann_benchmarks.runner import run_from_cmdline + +run_from_cmdline() diff --git a/templates/chartjs.template b/templates/chartjs.template new file mode 100644 index 0000000..466e355 --- /dev/null +++ b/templates/chartjs.template @@ -0,0 +1,102 @@ +

{{xlabel}}/{{ylabel}}

+
+ + +
+ {% if args.latex %} +
+
+ +
+
+ + + {% endif %} diff --git a/templates/detail_page.html b/templates/detail_page.html new file mode 100644 index 0000000..2188e15 --- /dev/null +++ b/templates/detail_page.html @@ -0,0 +1,23 @@ +{% extends "general.html" %} +{% block content %} +
+ {% for item in plot_data.keys() %} + {% if item=="normal" %} + {% if batch %} +

Plots for {{title}} in batch mode

+ {% else %} +

Plots for {{title}}

+ {% endif %} + {% elif item=="scatter" and args.scatter %} + {% if batch %} +

Scatterplots for {{title}} in batch mode

+ {% else %} +

Scatterplots for {{title}}

+ {% endif %} + {% endif %} + {% for plot in plot_data[item] %} + {{ plot }} + {% endfor %} +
+ {% endfor %} +{% endblock %} diff --git a/templates/general.html b/templates/general.html new file mode 100644 index 0000000..74ba2a6 --- /dev/null +++ b/templates/general.html @@ -0,0 +1,58 @@ + + + + + + + + {{ title }} + + + + + + + + + + + + + + + + {% block content %} {% endblock %} + +
+

Contact

+

ANN-Benchmarks has been developed by Martin Aumueller (maau@itu.dk), Erik Bernhardsson (mail@erikbern.com), and Alec Faithfull (alef@itu.dk). Please use + Github to submit your implementation or improvements.

+
+
+ + diff --git a/templates/latex.template b/templates/latex.template new file mode 100644 index 0000000..4383534 --- /dev/null +++ b/templates/latex.template @@ -0,0 +1,30 @@ + +\begin{figure} + \centering + \begin{tikzpicture} + \begin{axis}[ + xlabel={ {{xlabel}} }, + ylabel={ {{ylabel}} }, + ymode = log, + yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3}, + legend style = { anchor=west}, + cycle list name = black white + ] + {% for algo in plot_data %} + {% if algo.scatter %} + \addplot [only marks] coordinates { + {% else %} + \addplot coordinates { + {% endif %} + {% for coord in algo.coords %} + ({{ coord[0]}}, {{ coord[1] }}) + {% endfor %} + }; + \addlegendentry{ {{algo.name}} }; + {% endfor %} + \end{axis} + \end{tikzpicture} + \caption{ {{caption}} } + \label{} +\end{figure} diff --git a/templates/summary.html b/templates/summary.html new file mode 100644 index 0000000..b07e670 --- /dev/null +++ b/templates/summary.html @@ -0,0 +1,60 @@ +{% extends "general.html" %} +{% block content %} +
+

Info

+

ANN-Benchmarks is a benchmarking environment for approximate nearest neighbor search algorithms. This website contains the current benchmarking results. Please visit http://github.com/erikbern/ann-benchmarks/ to get an overview of evaluated data sets and algorithms. Make a pull request on Github to add your own code or improvements to the + benchmarking system. +

+
+

Benchmarking Results

+

Results are split by distance measure and dataset. At the bottom, you can find an overview of an algorithm's performance on all datasets. Each dataset is annotated + by (k = ...), the number of nearest neighbors an algorithm was supposed to return. The plot shown depicts Recall (the fraction + of true nearest neighbors found, on average over all queries) against Queries per second. Clicking on a plot reveals detailed interactive plots, including + approximate recall, index size, and build time.

+ {% for type in ['non-batch', 'batch'] %} + {% if len(dataset_with_distances[type]) > 0 %} + {% if type == 'batch' %} +

Benchmarks for Batched Queries

+ {% else %} +

Benchmarks for Single Queries

+ {% endif %} + +

Results by Dataset

+ {% for distance_data in dataset_with_distances[type] %} +

Distance: {{ distance_data.name }}

+ {% for entry in distance_data.entries %} + +
+
+

{{entry.desc}}

+
+
+ +
+
+
+
+ {% endfor %} + {% endfor %} +

Results by Algorithm

+
    Algorithms: + {% for algo in algorithms[type].keys() %} +
  • {{algo}}
  • + {% endfor %} +
+ {% for algo in algorithms[type].keys()%} + +
+
+

{{algo}}

+
+
+ +
+
+
+
+ {% endfor %} + {% endif %} + {% endfor %} +{% endblock %} diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test-metrics.py b/test/test-metrics.py new file mode 100644 index 0000000..f75f0d7 --- /dev/null +++ b/test/test-metrics.py @@ -0,0 +1,63 @@ +import unittest +from ann_benchmarks.plotting.metrics import knn, queries_per_second,\ + index_size, build_time, candidates, epsilon, rel + +class TestMetrics(unittest.TestCase): + + def setUp(self): + pass + + def test_recall(self): + exact_queries = [[0.1, 0.25]] + run1 = [[]] + run2 = [[0.2, 0.3]] + run3 = [[0.2]] + run4 = [[0.2, 0.25]] + + self.assertAlmostEqual(knn(exact_queries, run1, 2), 0.0) + self.assertAlmostEqual(knn(exact_queries, run2, 2), 0.5) + self.assertAlmostEqual(knn(exact_queries, run3, 2), 0.5) + self.assertAlmostEqual(knn(exact_queries, run4, 2), 1.0) + + def test_epsilon_recall(self): + exact_queries = [[0.05, 0.08, 0.24, 0.3]] + run1 = [[]] + run2 = [[0.1, 0.2, 0.55, 0.7]] + + self.assertAlmostEqual(epsilon(exact_queries, run1, 4, 1), 0.0) + + self.assertAlmostEqual(epsilon(exact_queries, run2, 4, 0.0001), 0.5) + # distance can be off by factor (1 + 1) * 0.3 = 0.6 => recall .75 + self.assertAlmostEqual(epsilon(exact_queries, run2, 4, 1), 0.75) + # distance can be off by factor (1 + 2) * 0.3 = 0.9 => recall 1 + self.assertAlmostEqual(epsilon(exact_queries, run2, 4, 2), 1.0) + + def test_relative(self): + exact_queries = [[0.1, 0.2, 0.25, 0.3]] + run1 = [] + run2 = [[0.1, 0.2, 0.25, 0.3]] + run3 = [[0.1, 0.2, 0.55, 0.9]] + + self.assertAlmostEqual(rel(exact_queries, run1), float("inf")) + self.assertAlmostEqual(rel(exact_queries, run2), 1) + # total distance exact: 0.85, total distance run3: 1.75 + self.assertAlmostEqual(rel(exact_queries, run3), 1.75 / + 0.85) + + def test_queries_per_second(self): + self.assertAlmostEqual(queries_per_second([], {"best_search_time" : 0.01}), + 100) + + def test_index_size(self): + 
self.assertEqual(index_size([], {"index_size" : 100}), 100) + + def test_build_time(self): + self.assertEqual(build_time([], {"build_time" : 100}), 100) + + def test_candidates(self): + self.assertEqual(candidates([], {"candidates" : 10}), 10) + + +if __name__ == '__main__': + unittest.main() +