From 0d339968a7c3f9ef56d28af2b6462cfae352f0e4 Mon Sep 17 00:00:00 2001 From: root Date: Sun, 15 Dec 2019 09:23:30 -0500 Subject: [PATCH] The first commit. --- .gitignore | 20 + algos.yaml | 700 ++++++++++++++++++ ann-bench.def | 10 + ann_benchmarks/__init__.py | 2 + ann_benchmarks/algorithms/__init__.py | 0 ann_benchmarks/algorithms/annoy.py | 24 + ann_benchmarks/algorithms/balltree.py | 21 + ann_benchmarks/algorithms/base.py | 27 + ann_benchmarks/algorithms/bruteforce.py | 98 +++ ann_benchmarks/algorithms/datasketch.py | 40 + ann_benchmarks/algorithms/definitions.py | 175 +++++ ann_benchmarks/algorithms/dolphinnpy.py | 30 + ann_benchmarks/algorithms/dummy_algo.py | 24 + ann_benchmarks/algorithms/faiss.py | 71 ++ ann_benchmarks/algorithms/faiss_gpu.py | 56 ++ ann_benchmarks/algorithms/faiss_hnsw.py | 36 + ann_benchmarks/algorithms/flann.py | 24 + ann_benchmarks/algorithms/hdidx.py | 30 + ann_benchmarks/algorithms/hnswlib.py | 33 + ann_benchmarks/algorithms/kdtree.py | 21 + ann_benchmarks/algorithms/kgraph.py | 37 + ann_benchmarks/algorithms/lshf.py | 22 + ann_benchmarks/algorithms/mrpt.py | 31 + ann_benchmarks/algorithms/nearpy.py | 63 ++ ann_benchmarks/algorithms/nmslib.py | 95 +++ ann_benchmarks/algorithms/nmslib_sparse.py | 95 +++ ann_benchmarks/algorithms/onng_ngt.py | 93 +++ ann_benchmarks/algorithms/panng_ngt.py | 66 ++ ann_benchmarks/algorithms/panns.py | 19 + ann_benchmarks/algorithms/pynndescent.py | 35 + ann_benchmarks/algorithms/risc.py | 83 +++ ann_benchmarks/algorithms/rpforest.py | 19 + ann_benchmarks/constants.py | 1 + ann_benchmarks/data.py | 36 + ann_benchmarks/datasets.py | 548 ++++++++++++++ ann_benchmarks/datasets_old.py | 480 ++++++++++++ ann_benchmarks/distance.py | 53 ++ ann_benchmarks/main.py | 198 +++++ ann_benchmarks/plotting/__init__.py | 2 + ann_benchmarks/plotting/metrics.py | 113 +++ ann_benchmarks/plotting/plot_variants.py | 12 + ann_benchmarks/plotting/utils.py | 115 +++ ann_benchmarks/results.py | 77 ++ ann_benchmarks/runner.py | 
305 ++++++++ create_dataset.py | 12 + create_website.py | 213 ++++++ data/.gitignore | 4 + docker-install/Dockerfile | 10 + docker-install/Dockerfile.annoy | 5 + docker-install/Dockerfile.datasketch | 4 + docker-install/Dockerfile.dolphinn | 5 + docker-install/Dockerfile.faiss | 12 + docker-install/Dockerfile.flann | 10 + docker-install/Dockerfile.hdidx | 18 + docker-install/Dockerfile.hnswlib | 10 + docker-install/Dockerfile.kgraph | 11 + docker-install/Dockerfile.mrpt | 6 + docker-install/Dockerfile.nearpy | 5 + docker-install/Dockerfile.ngt | 13 + docker-install/Dockerfile.nmslib | 16 + docker-install/Dockerfile.nmslib-sparse | 17 + docker-install/Dockerfile.panns | 10 + docker-install/Dockerfile.pynndescent | 6 + docker-install/Dockerfile.rpforest | 5 + docker-install/Dockerfile.sklearn | 4 + install.py | 63 ++ run.py | 4 + run.sh | 39 + run_algorithm.py | 3 + running.txt | 6 + singularity-install/ann-bench-datasketch.def | 14 + singularity-install/ann-bench-ngt.def | 22 + singularity-install/ann-bench-nmslib.def | 20 + singularity-install/ann-bench-nmslib3.def | 20 + singularity-install/ann-bench-pynndescent.def | 16 + singularity-install/ann-bench-risc.def | 16 + singularity-install/ann-bench-sklearn.def | 13 + singularity-install/ann-bench.def | 16 + singularity-install/requirements.txt | 10 + singularity-install/run_algorithm.py | 3 + templates/chartjs.template | 102 +++ templates/detail_page.html | 23 + templates/general.html | 58 ++ templates/latex.template | 30 + templates/summary.html | 60 ++ test/__init__.py | 0 test/test-metrics.py | 63 ++ 87 files changed, 5037 insertions(+) create mode 100644 .gitignore create mode 100644 algos.yaml create mode 100644 ann-bench.def create mode 100644 ann_benchmarks/__init__.py create mode 100644 ann_benchmarks/algorithms/__init__.py create mode 100644 ann_benchmarks/algorithms/annoy.py create mode 100644 ann_benchmarks/algorithms/balltree.py create mode 100644 ann_benchmarks/algorithms/base.py create mode 100644 
ann_benchmarks/algorithms/bruteforce.py create mode 100644 ann_benchmarks/algorithms/datasketch.py create mode 100644 ann_benchmarks/algorithms/definitions.py create mode 100644 ann_benchmarks/algorithms/dolphinnpy.py create mode 100644 ann_benchmarks/algorithms/dummy_algo.py create mode 100644 ann_benchmarks/algorithms/faiss.py create mode 100644 ann_benchmarks/algorithms/faiss_gpu.py create mode 100644 ann_benchmarks/algorithms/faiss_hnsw.py create mode 100644 ann_benchmarks/algorithms/flann.py create mode 100644 ann_benchmarks/algorithms/hdidx.py create mode 100644 ann_benchmarks/algorithms/hnswlib.py create mode 100644 ann_benchmarks/algorithms/kdtree.py create mode 100644 ann_benchmarks/algorithms/kgraph.py create mode 100644 ann_benchmarks/algorithms/lshf.py create mode 100644 ann_benchmarks/algorithms/mrpt.py create mode 100644 ann_benchmarks/algorithms/nearpy.py create mode 100644 ann_benchmarks/algorithms/nmslib.py create mode 100644 ann_benchmarks/algorithms/nmslib_sparse.py create mode 100644 ann_benchmarks/algorithms/onng_ngt.py create mode 100644 ann_benchmarks/algorithms/panng_ngt.py create mode 100644 ann_benchmarks/algorithms/panns.py create mode 100644 ann_benchmarks/algorithms/pynndescent.py create mode 100644 ann_benchmarks/algorithms/risc.py create mode 100644 ann_benchmarks/algorithms/rpforest.py create mode 100644 ann_benchmarks/constants.py create mode 100644 ann_benchmarks/data.py create mode 100644 ann_benchmarks/datasets.py create mode 100644 ann_benchmarks/datasets_old.py create mode 100644 ann_benchmarks/distance.py create mode 100644 ann_benchmarks/main.py create mode 100644 ann_benchmarks/plotting/__init__.py create mode 100644 ann_benchmarks/plotting/metrics.py create mode 100644 ann_benchmarks/plotting/plot_variants.py create mode 100644 ann_benchmarks/plotting/utils.py create mode 100644 ann_benchmarks/results.py create mode 100644 ann_benchmarks/runner.py create mode 100644 create_dataset.py create mode 100644 create_website.py 
create mode 100644 data/.gitignore create mode 100644 docker-install/Dockerfile create mode 100644 docker-install/Dockerfile.annoy create mode 100644 docker-install/Dockerfile.datasketch create mode 100644 docker-install/Dockerfile.dolphinn create mode 100644 docker-install/Dockerfile.faiss create mode 100644 docker-install/Dockerfile.flann create mode 100644 docker-install/Dockerfile.hdidx create mode 100644 docker-install/Dockerfile.hnswlib create mode 100644 docker-install/Dockerfile.kgraph create mode 100644 docker-install/Dockerfile.mrpt create mode 100644 docker-install/Dockerfile.nearpy create mode 100644 docker-install/Dockerfile.ngt create mode 100644 docker-install/Dockerfile.nmslib create mode 100644 docker-install/Dockerfile.nmslib-sparse create mode 100644 docker-install/Dockerfile.panns create mode 100644 docker-install/Dockerfile.pynndescent create mode 100644 docker-install/Dockerfile.rpforest create mode 100644 docker-install/Dockerfile.sklearn create mode 100644 install.py create mode 100644 run.py create mode 100644 run.sh create mode 100644 run_algorithm.py create mode 100644 running.txt create mode 100644 singularity-install/ann-bench-datasketch.def create mode 100644 singularity-install/ann-bench-ngt.def create mode 100644 singularity-install/ann-bench-nmslib.def create mode 100644 singularity-install/ann-bench-nmslib3.def create mode 100644 singularity-install/ann-bench-pynndescent.def create mode 100644 singularity-install/ann-bench-risc.def create mode 100644 singularity-install/ann-bench-sklearn.def create mode 100644 singularity-install/ann-bench.def create mode 100644 singularity-install/requirements.txt create mode 100644 singularity-install/run_algorithm.py create mode 100644 templates/chartjs.template create mode 100644 templates/detail_page.html create mode 100644 templates/general.html create mode 100644 templates/latex.template create mode 100644 templates/summary.html create mode 100644 test/__init__.py create mode 100644 
test/test-metrics.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8d62cf6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +*.pyc +*.o +protocol/c/fr-* + +install/*.txt +install/*.yaml +install/lib-*/ + +*.class + +cp-* +*.out +*.log +results +indexes +cpBuildingTime.py +algos_v1.yaml +README.md +note.txt + diff --git a/algos.yaml b/algos.yaml new file mode 100644 index 0000000..048fd6f --- /dev/null +++ b/algos.yaml @@ -0,0 +1,700 @@ +float: + any: + DolphinnPy: + disabled: true + docker-tag: ann-benchmarks-dolphinn # Docker tag + module: ann_benchmarks.algorithms.dolphinnpy # Python class + constructor: DolphinnPy # Python class name + run-groups: + base: + args: [[10, 50, 100, 200, 1000, 2000]] + faiss-lsh: + disabled: true + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss + constructor: FaissLSH + base-args: ["@metric"] + run-groups: + base: + # When @args is a list, the result is the Cartesian product of all of + # the things it contains; entries that aren't a list will be treated + # as lists of length one. + args: [[32, 64, 128, 256, 512, 1024, 2048, 4096]] + # This run group will produce eight algorithm instances: + # FaissLSH(32), FaissLSH(64), and so on up to FaissLSH(4096). 
+ faiss-ivf: + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss + constructor: FaissIVF + base-args: ["@metric"] + run-groups: + base: + args: [[32,64,128,256,512,1024,2048,4096,8192]] + query-args: [[1, 5, 10, 50, 100, 200]] + faiss-gpu: + disabled: true + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss_gpu + constructor: FaissGPU + run-groups: + base: + args: [[400, 1024, 4096, 8192, 16384], + [1, 10, 40, 100, 200]] + hnswlib: + docker-tag: ann-benchmarks-hnswlib + module: ann_benchmarks.algorithms.hnswlib + constructor: HnswLib + base-args: ["@metric"] + run-groups: + M-4: + arg-groups: + - {"M": 4, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-8: + arg-groups: + - {"M": 8, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-12: + arg-groups: + - {"M": 12, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-16: + arg-groups: + - {"M": 16, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-24: + arg-groups: + - {"M": 24, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-36: + arg-groups: + - {"M": 36, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-48: + arg-groups: + - {"M": 48, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-64: + arg-groups: + - {"M": 64, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-96: + arg-groups: + - {"M": 96, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + + hnsw(faiss): + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss_hnsw + constructor: FaissHNSW + base-args: ["@metric"] + run-groups: + M-4: + arg-groups: + - {"M": 4, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-8: + arg-groups: + - {"M": 8, 
"efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-12: + arg-groups: + - {"M": 12, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-16: + arg-groups: + - {"M": 16, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-24: + arg-groups: + - {"M": 24, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-36: + arg-groups: + - {"M": 36, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-48: + arg-groups: + - {"M": 48, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-64: + arg-groups: + - {"M": 64, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-96: + arg-groups: + - {"M": 96, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + + + flann: + docker-tag: ann-benchmarks-flann + module: ann_benchmarks.algorithms.flann + constructor: FLANN + base-args: ["@metric"] + run-groups: + flann: + args: [[0.2, 0.5, 0.7, 0.8, 0.9, 0.95, 0.97]] + panns: + disabled: true + docker-tag: ann-benchmarks-panns + module: ann_benchmarks.algorithms.panns + constructor: PANNS + base-args: ["@metric"] + run-groups: + five-trees: + args: [5, 20] + ten-trees: + args: [10, [10, 50]] + hundred-candidates: + args: [[10, 20, 40], 100] + annoy: + docker-tag: ann-benchmarks-annoy + module: ann_benchmarks.algorithms.annoy + constructor: Annoy + base-args: ["@metric"] + run-groups: + annoy: + args: [[100, 200, 400]] + query-args: [[100, 200, 400, 1000, 2000, 4000, 10000, 20000, 40000, + 100000, 200000, 400000]] + # This run group produces 3 algorithm instances -- Annoy("angular", + # 100), Annoy("angular", 200), and Annoy("angular", 400) -- each of + # which will be used to run 12 different queries. 
+ shidx: + docker-tag: ann-benchmarks-hdidx + module: ann_benchmarks.algorithms.hdidx + constructor: SHIdx + base-args: [] + run-groups: + shidx: + args: [[4, 8, 16, 32, 64, 128, 256]] + nearpy: + disabled: true + docker-tag: ann-benchmarks-nearpy + module: ann_benchmarks.algorithms.nearpy + constructor: NearPy + base-args: ["@metric"] + run-groups: + nearpy: + args: [[10, 12, 14, 16], [5, 10, 20, 40]] + extra: + args: [16, [5, 10, 15, 20, 25, 30, 40]] + bruteforce: + disabled: true + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.bruteforce + constructor: BruteForce + base-args: ["@metric"] + run-groups: + empty: + args: [] + bruteforce-blas: + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.bruteforce + constructor: BruteForceBLAS + base-args: ["@metric"] + run-groups: + empty: + args: [] + dummy-algo-st: + disabled: true + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.dummy_algo + constructor: DummyAlgoSt + base-args: ["@metric"] + run-groups: + empty: + args: [] + dummy-algo-mt: + disabled: true + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.dummy_algo + constructor: DummyAlgoMt + base-args: ["@metric"] + run-groups: + empty: + args: [] + ball: + disabled: true + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.balltree + constructor: BallTree + base-args: ["@metric"] + run-groups: + ball: + args: &treeargs [[10, 20, 40, 100, 200, 400, 1000]] + kd: + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.kdtree + constructor: KDTree + base-args: ["@metric"] + run-groups: + ball: + args: *treeargs + BallTree(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "vptree"] + run-groups: + base: + # When @args is a dictionary, algorithm instances will be generated + # by taking the Cartesian product of all of its values. 
+ arg-groups: + - {"tuneK": 10, "desiredRecall": [0.99, 0.97, 0.95, 0.9, 0.85, 0.8, + 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]} + - False + # This run group produces thirteen algorithm instances: + # NmslibNewIndex("angular", "vptree", {"tuneK": 10, + # "desiredRecall": 0.99}), NmslibNewIndex("angular", "vptree", + # {"tuneK": 10, "desiredRecall": 0.97}), and so on up to + # NmslibNewIndex("angular", "vptree", {"tuneK": 10, "desiredRecall": + # 0.1}). + pynndescent: + docker-tag: ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["@metric"] + run-groups: + pynndescent: + args: [[10, 20, 40, 80], [4, 8], [30]] + query-args: [[1.0, 2.0, 4.0, 8.0]] + NGT-panng: + docker-tag: ann-benchmarks-ngt + module: ann_benchmarks.algorithms.panng_ngt + constructor : PANNG + base-args : ["@metric", "Float"] + run-groups : + panng: + args : [{'edge': 20, 'pathadj': 40, 'searchedge': 60}] + query-args : [[0.6, 0.8, 0.9, 1.0, 1.02, 1.05, 1.1, 1.2]] + NGT-onng: + docker-tag: ann-benchmarks-ngt + module: ann_benchmarks.algorithms.onng_ngt + constructor : ONNG + base-args : ["@metric", "Float", 0.1] + run-groups : + onng: + args : [{'edge': 100, 'outdegree': 10, 'indegree': 120}] + query-args : [[0.6, 0.8, 0.9, 1.0, 1.02, 1.05, 1.1, 1.2]] + mrpt: + docker-tag: ann-benchmarks-mrpt + module: ann_benchmarks.algorithms.mrpt + constructor: MRPT + base-args: ["@metric"] + run-groups: + # See https://github.com/ejaasaari/mrpt-comparison/blob/master/parameters/gist.sh + mrpt: + args: [[5, 25, 100], [1, 2, 4, 8]] + query-args: [[1, 2, 4, 10, 20, 40, 100]] + euclidean: + kgraph: + docker-tag: ann-benchmarks-kgraph + module: ann_benchmarks.algorithms.kgraph + constructor: KGraph + base-args: ["@metric"] + run-groups: + kgraph: + args: [ {'reverse': -1}, True] # XXX: hard-codes save_index as True + query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] + hnsw(nmslib): + docker-tag: ann-benchmarks-nmslib + module: 
ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "hnsw"] + run-groups: + M-32: + # If a run group has an array called @arg-groups instead of one + # called @args, then every element in that array will be separately + # expanded before then taking the Cartesian product of all of those + # expansions. + # + # Yes, this is a bit of a hack, but some constructors are weird. + # (This one used to require that dictionaries be encoded as lists + # of environment variable-style strings -- ["M=32", "post=2", + # "efConstruction=400"] -- which didn't work with this at all...) + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 400} + - False + query-args: [[20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 200, + 300, 400]] + M-20: + arg-groups: + - {"M": 20, "post": 2, "efConstruction": 400} + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120, 200, 400]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 400} + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120]] + M-4: + arg-groups: + - {"M": 4, "post": 0, "efConstruction": 400} + - False + query-args: [[1, 2, 5, 10, 20, 30, 50, 70, 90, 120]] + M-8: + arg-groups: + - {"M": 8, "post": 0, "efConstruction": 400} + - False + query-args: [[1, 2, 5, 10, 20, 30, 50, 70, 90, 120, 160]] + SW-graph(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "sw-graph"] + run-groups: + NN-24: + arg-groups: + - {"NN": 24} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-16: + arg-groups: + - {"NN": 16} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-10: + arg-groups: + - {"NN": 10} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-5: + arg-groups: + - {"NN": 5} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + pynndescent: + docker-tag: 
ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["@metric"] + run-groups: + pynndescent: + args: [[5, 10, 20, 40, 80], [4, 8], [20]] + query-args: [[1.0, 1.5, 2.0, 4.0, 8.0]] + angular: + kgraph: + docker-tag: ann-benchmarks-kgraph + module: ann_benchmarks.algorithms.kgraph + constructor: KGraph + base-args: ["@metric"] + run-groups: + kgraph: + args: [{'reverse': -1, "K": 200, "L": 300, "S": 20}, False] + query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] + hnsw(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "hnsw"] + run-groups: + M-48: + arg-groups: + - {"M": 48, "post": 2, "efConstruction": 800} + - False + query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, + 1400, 1600, 2000]] + M-32: + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 800} + - False + query-args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 200, 300, 400, 600, 700, 800, 1000, 1200, 1400, 1600, 2000]] + M-20: + arg-groups: + - {"M": 20, "post": 0, "efConstruction": 800} + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 800} + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + SW-graph(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "sw-graph"] + run-groups: + NN-30: + arg-groups: + - {"NN": 30} + - False + query-args: [[700, 650, 550, 450, 350, 275, 200, 150, 120, 80, + 50, 30]] + NN-15: + arg-groups: + - {"NN": 15} + - False + query-args: [[80, 50, 30, 20]] + NN-3: + arg-groups: + - {"NN": 3} + - False + query-args: [[120, 80, 60, 40, 20, 10, 8, 4, 2]] + rpforest: + docker-tag: ann-benchmarks-rpforest + module: ann_benchmarks.algorithms.rpforest + constructor: RPForest + run-groups: + 
base: + args: [[3, 10, 40, 100, 400], + [3, 10, 40, 100, 400]] + pynndescent: + docker-tag: ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["@metric"] + run-groups: + pynndescent: + args: [[5, 10, 20, 40, 80, 160], [8], [40]] + query-args: [[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0]] +bit: + hamming: + kgraph: + docker-tag: ann-benchmarks-kgraph + module: ann_benchmarks.algorithms.kgraph + constructor: KGraph + base-args: ["euclidean"] + run-groups: + kgraph: +# args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], +# {'reverse': -1, "K": 200, "L": 300, "S": 20}, False] + args: [{'reverse': -1, "K": 200, "L": 300, "S": 20}, False] + query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] + hnsw(nmslib): + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["euclidean", "hnsw"] + run-groups: + M-48: + arg-groups: + - {"M": 48, "post": 2, "efConstruction": 800} + # Chunjiang added + - False + query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, + 1400, 1600, 2000]] + M-32: + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 800} + # Chunjiang added + - False + query-args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, + 200, 300, 400, 600, 700, 800, 1000, 1200, 1400, 1600, 2000]] + M-20: + arg-groups: + - {"M": 20, "post": 0, "efConstruction": 800} + # Chunjiang added + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 800} + # Chunjiang added + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + pynndescent: + docker-tag: ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["euclidean"] + run-groups: + pynndescent: + args: [[20, 40, 80, 160, 250], [4], [40]] + query-args: [[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0]] + annoy: + 
docker-tag: ann-benchmarks-annoy + module: ann_benchmarks.algorithms.annoy + constructor: Annoy + base-args: ["@metric"] + run-groups: + annoy: + args: [[100, 200, 400]] + query-args: [[100, 200, 400, 1000, 2000, 4000, 10000, 20000, 40000, + 100000, 200000, 400000]] + # This run group produces 3 algorithm instances -- Annoy("angular", + # 100), Annoy("angular", 200), and Annoy("angular", 400) -- each of + # which will be used to run 12 different queries. + faiss-ivf: + docker-tag: ann-benchmarks-faiss + module: ann_benchmarks.algorithms.faiss + constructor: FaissIVF + base-args: ["euclidean"] + run-groups: + base: + args: [[32,64,128,256,512,1024,2048,4096,8192]] + query-args: [[1, 5, 10, 50, 100, 200]] + jaccard: + Bruteforce: + disabled: false + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.bruteforce + constructor: BruteForceBLAS + base-args: ["@metric"] + run-groups: + base: + args: {} + Balltree(Sklearn): + disabled: false + docker-tag: ann-benchmarks-sklearn + module: ann_benchmarks.algorithms.balltree + constructor: BallTree + base-args: ["@metric"] + run-groups: + ball: + args: [[1, 10, 20, 40, 100, 200, 400, 1000]] + VPtree(Nmslib): + disabled: false + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "vptree"] + run-groups: + base: + # When @args is a dictionary, algorithm instances will be generated + # by taking the Cartesian product of all of its values. 
+ arg-groups: + - {"tuneK": 10, "desiredRecall": [0.999, 0.997, 0.995, 0.99, 0.97, 0.95, 0.9, 0.85, 0.8, + 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.02, 0.01]} + - False + Datasketch: + disabled: false + docker-tag: ann-benchmarks-datasketch + module: ann_benchmarks.algorithms.datasketch + constructor: DataSketch + base-args: ["@metric"] + run-groups: + base: + args: [[32, 64, 128, 256, 512, 1024, 2048],[10, 30, 50, 70]] + Hnsw(Nmslib): + disabled: false + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "hnsw"] + run-groups: + M-48: + arg-groups: + - {"M": 48, "post": 2, "efConstruction": 800} + - False + query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, + 1400, 1600, 2000]] + M-32: + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 800} + - False + query-args: [[100, 300, 500, 700, 1000, 1500, 2000]] + M-20: + arg-groups: + - {"M": 20, "post": 0, "efConstruction": 800} + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 800} + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-5: + arg-groups: + - {"M": 5, "post": 0, "efConstruction": 10} + - False + query-args: [[1, 2, 5, 10]] + M-2: + arg-groups: + - {"M": 2, "post": 0, "efConstruction": 1} + - False + query-args: [[1, 2]] + SW-graph(Nmslib): + disabled: false + docker-tag: ann-benchmarks-nmslib + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "sw-graph"] + run-groups: + NN-48: + arg-groups: + - {"NN": 48} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-24: + arg-groups: + - {"NN": 24} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-16: + arg-groups: + - {"NN": 16} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-10: + arg-groups: + - {"NN": 10} + - False + query-args: 
[[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-5: + arg-groups: + - {"NN": 5} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + NN-2: + arg-groups: + - {"NN": 2} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + NN-1: + arg-groups: + - {"NN": 1} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + Pynndescent: + disabled: false + docker-tag: ann-benchmarks-pynndescent + module: ann_benchmarks.algorithms.pynndescent + constructor: PyNNDescent + base-args: ["@metric"] + run-groups: + pynndescent: + args: [[2, 5, 10, 20, 40, 80, 120, 160], [2, 4, 8], [30]] + query-args: [[1.0, 2.0, 4.0, 8.0]] + Onng(Ngt): + disabled: false + docker-tag: ann-benchmarks-ngt + module: ann_benchmarks.algorithms.onng_ngt + constructor: ONNG + base-args: ["@metric", "Byte", 1.0] + run-groups: + onng: + args: [[100, 300, 500, 1000], [10, 30, 50, 100], [10, 30, 50, 120]] + query-args: [[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]] + Panng(Ngt): + disabled: false + docker-tag: ann-benchmarks-ngt + module: ann_benchmarks.algorithms.panng_ngt + constructor: PANNG + base-args: ["@metric", "Byte"] + run-groups: + panng: + args: [[10, 20, 40], [40], [30, 60, 120]] + query-args: [[0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 1.0, 1.02, 1.05, 1.1, 1.2, 1.5, 2.0]] + Risc: + disabled: false + docker-tag: ann-benchmarks-risc + module: ann_benchmarks.algorithms.risc + constructor: Risc + base-args: ["@metric", "Risc"] + run-groups: + empty: + args: [] + + DivideSkip: + disabled: false + docker-tag: ann-benchmarks-risc + module: ann_benchmarks.algorithms.risc + constructor: Risc + base-args: ["@metric", "DivideSkip"] + run-groups: + empty: + args: [] diff --git a/ann-bench.def b/ann-bench.def new file mode 100644 index 0000000..e2d3136 --- /dev/null +++ b/ann-bench.def @@ -0,0 +1,10 @@ +Bootstrap: library +From: ubuntu:16.04 + +%post + apt-get update && apt-get install -y python3-numpy python3-scipy python3-pip build-essential git + pip3 install -r 
requirements.txt + python3 install.py + +%runscript + python3 run.py diff --git a/ann_benchmarks/__init__.py b/ann_benchmarks/__init__.py new file mode 100644 index 0000000..75db8ab --- /dev/null +++ b/ann_benchmarks/__init__.py @@ -0,0 +1,2 @@ +from __future__ import absolute_import +# from ann_benchmarks.main import * diff --git a/ann_benchmarks/algorithms/__init__.py b/ann_benchmarks/algorithms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ann_benchmarks/algorithms/annoy.py b/ann_benchmarks/algorithms/annoy.py new file mode 100644 index 0000000..7f23bbf --- /dev/null +++ b/ann_benchmarks/algorithms/annoy.py @@ -0,0 +1,24 @@ +from __future__ import absolute_import +import annoy +from ann_benchmarks.algorithms.base import BaseANN + +class Annoy(BaseANN): + def __init__(self, metric, n_trees): + self._n_trees = n_trees + self._search_k = None + self._metric = metric + + def fit(self, X): + self._annoy = annoy.AnnoyIndex(X.shape[1], metric=self._metric) + for i, x in enumerate(X): + self._annoy.add_item(i, x.tolist()) + self._annoy.build(self._n_trees) + + def set_query_arguments(self, search_k): + self._search_k = search_k + + def query(self, v, n): + return self._annoy.get_nns_by_vector(v.tolist(), n, self._search_k) + + def __str__(self): + return 'Annoy(n_trees=%d, search_k=%d)' % (self._n_trees, self._search_k) diff --git a/ann_benchmarks/algorithms/balltree.py b/ann_benchmarks/algorithms/balltree.py new file mode 100644 index 0000000..fd60d22 --- /dev/null +++ b/ann_benchmarks/algorithms/balltree.py @@ -0,0 +1,21 @@ +from __future__ import absolute_import +import sklearn.neighbors +import sklearn.preprocessing +from ann_benchmarks.algorithms.base import BaseANN + +class BallTree(BaseANN): + def __init__(self, metric, leaf_size=20): + self._leaf_size = leaf_size + self._metric = metric + self.name = 'BallTree(leaf_size=%d)' % self._leaf_size + + def fit(self, X): + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, 
axis=1, norm='l2') + self._tree = sklearn.neighbors.BallTree(X, leaf_size=self._leaf_size, metric=self._metric) + + def query(self, v, n): + if self._metric == 'angular': + v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + dist, ind = self._tree.query([v], k=n) + return ind[0] diff --git a/ann_benchmarks/algorithms/base.py b/ann_benchmarks/algorithms/base.py new file mode 100644 index 0000000..288564a --- /dev/null +++ b/ann_benchmarks/algorithms/base.py @@ -0,0 +1,27 @@ +from __future__ import absolute_import +import psutil + +class BaseANN(object): + def done(self): + pass + + def get_index_size(self, process): + """Returns the size of the index in kB or -1 if not implemented.""" + return psutil.Process().memory_info().rss / 1024 # return in kB for backwards compatibility + + def fit(self, X): + pass + + def query(self, q, n): + return [] # array of candidate indices + + def batch_query(self, X, n): + self.res = [] + for q in X: + self.res.append(self.query(q, n)) + + def get_batch_results(self): + return self.res + + def __str__(self): + return self.name diff --git a/ann_benchmarks/algorithms/bruteforce.py b/ann_benchmarks/algorithms/bruteforce.py new file mode 100644 index 0000000..afebcb6 --- /dev/null +++ b/ann_benchmarks/algorithms/bruteforce.py @@ -0,0 +1,98 @@ +from __future__ import absolute_import +import numpy +import sklearn.neighbors +from ann_benchmarks.distance import metrics as pd +from ann_benchmarks.algorithms.base import BaseANN +from scipy.sparse import issparse + +class BruteForce(BaseANN): + def __init__(self, metric): + if metric not in ('angular', 'euclidean', 'hamming', 'jaccard'): + raise NotImplementedError("BruteForce doesn't support metric %s" % metric) + self._metric = metric + self.name = 'BruteForce()' + + def fit(self, X): + metric = {'angular': 'cosine', 'euclidean': 'l2', 'hamming': 'hamming', 'jaccard' : 'jaccard'}[self._metric] + self._nbrs = sklearn.neighbors.NearestNeighbors(algorithm='brute', metric=metric) + 
self._nbrs.fit(X) + + def query(self, v, n): + return list(self._nbrs.kneighbors([v], + return_distance = False, n_neighbors = n)[0]) + + def query_with_distances(self, v, n): + (distances, positions) = self._nbrs.kneighbors([v], + return_distance = True, n_neighbors = n) + return zip(list(positions[0]), list(distances[0])) + + +class BruteForceBLAS(BaseANN): + """kNN search that uses a linear scan = brute force.""" + def __init__(self, metric, precision=numpy.float32): + if metric not in ('angular', 'euclidean', 'hamming', 'jaccard'): + raise NotImplementedError("BruteForceBLAS doesn't support metric %s" % metric) + elif metric == 'hamming' and precision != numpy.bool: + raise NotImplementedError("BruteForceBLAS doesn't support precision %s with Hamming distances" % precision) + self._metric = metric + self._precision = precision + self.name = 'BruteForceBLAS()' + + def fit(self, X): + """Initialize the search index.""" + if self._metric == 'angular': + lens = (X ** 2).sum(-1) # precompute (squared) length of each vector + X /= numpy.sqrt(lens)[..., numpy.newaxis] # normalize index vectors to unit length + self.index = numpy.ascontiguousarray(X, dtype=self._precision) + elif self._metric == 'hamming': + # Regarding bitvectors as vectors in l_2 is faster for blas + X = X.astype(numpy.float32) + lens = (X ** 2).sum(-1) # precompute (squared) length of each vector + self.index = numpy.ascontiguousarray(X, dtype=numpy.float32) + self.lengths = numpy.ascontiguousarray(lens, dtype=numpy.float32) + elif self._metric == 'euclidean': + lens = (X ** 2).sum(-1) # precompute (squared) length of each vector + self.index = numpy.ascontiguousarray(X, dtype=self._precision) + self.lengths = numpy.ascontiguousarray(lens, dtype=self._precision) + elif self._metric == 'jaccard': + self.index = X + else: + assert False, "invalid metric" # shouldn't get past the constructor! 
+ + def query(self, v, n): + return [index for index, _ in self.query_with_distances(v, n)] + + def query_with_distances(self, v, n): + """Find indices of `n` most similar vectors from the index to query vector `v`.""" + + if self._metric != 'jaccard': + # use same precision for query as for index + v = numpy.ascontiguousarray(v, dtype = self.index.dtype) + + # HACK we ignore query length as that's a constant not affecting the final ordering + if self._metric == 'angular': + # argmax_a cossim(a, b) = argmax_a dot(a, b) / |a||b| = argmin_a -dot(a, b) + dists = -numpy.dot(self.index, v) + elif self._metric == 'euclidean': + # argmin_a (a - b)^2 = argmin_a a^2 - 2ab + b^2 = argmin_a a^2 - 2ab + dists = self.lengths - 2 * numpy.dot(self.index, v) + elif self._metric == 'hamming': + # Just compute hamming distance using euclidean distance + dists = self.lengths - 2 * numpy.dot(self.index, v) + elif self._metric == 'jaccard': + if issparse(self.index): + dists = [pd[self._metric]['distance'](v, e.toarray()[0]) for e in self.index] + else: + dists = [pd[self._metric]['distance'](v, e) for e in self.index] + else: + assert False, "invalid metric" # shouldn't get past the constructor! 
from __future__ import absolute_import
from datasketch import MinHashLSHForest, MinHash
from ann_benchmarks.algorithms.base import BaseANN

class DataSketch(BaseANN):
    """MinHash LSH Forest over binary (0/1) vectors; Jaccard similarity only."""

    def __init__(self, metric, n_perm, n_rep):
        # BUG FIX: the original test was `metric not in ('jaccard')` —
        # ('jaccard') is just the string 'jaccard', so this performed a
        # substring test (e.g. metric='jac' was accepted). Use a real tuple.
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm  # number of MinHash permutations
        self._n_rep = n_rep    # number of prefix trees in the forest
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        """Index every row of the 2D 0/1 array X, minhashing the set of
        column positions whose value is 1 (modified from the original
        list-of-integer-lists implementation)."""
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i in range(len(X)):
            m = MinHash(num_perm=self._n_perm)
            for j in range(len(X[i])):
                if X[i][j] == 1:
                    m.update(str(j).encode('utf8'))
            self._index.add(str(i), m)
        # the forest must be (re-)indexed after all additions
        self._index.index()

    def query(self, v, n):
        """Return up to n candidate row ids (as ints) for the 0/1 query vector v."""
        m = MinHash(num_perm=self._n_perm)
        for j in range(len(v)):
            if v[j] == 1:
                m.update(str(j).encode('utf8'))
        return map(int, self._index.query(m, n))
100644 index 0000000..7906eae --- /dev/null +++ b/ann_benchmarks/algorithms/definitions.py @@ -0,0 +1,175 @@ +from __future__ import absolute_import +from os import sep as pathsep +import collections +import importlib +import os +import sys +import traceback +import yaml +from enum import Enum +from itertools import product + + +Definition = collections.namedtuple('Definition', ['algorithm', 'constructor', 'module', 'docker_tag', 'arguments', 'query_argument_groups', 'disabled']) + +def get_algorithm_name(name, batch): + if batch: + return name + "-batch" + return name + + +def instantiate_algorithm(definition): + print('Trying to instantiate %s.%s(%s)' % (definition.module, definition.constructor, definition.arguments)) + # special code for Risc + if "Risc" in definition.algorithm: + import sys + sys.path.append('/home/app/risc/Code') + module = importlib.import_module(definition.module) + constructor = getattr(module, definition.constructor) + return constructor(*definition.arguments) + + +class InstantiationStatus(Enum): + AVAILABLE = 0 + NO_CONSTRUCTOR = 1 + NO_MODULE = 2 + + +def algorithm_status(definition): + try: + module = importlib.import_module(definition.module) + if hasattr(module, definition.constructor): + return InstantiationStatus.AVAILABLE + else: + return InstantiationStatus.NO_CONSTRUCTOR + except ImportError: + return InstantiationStatus.NO_MODULE + +def _generate_combinations(args): + if isinstance(args, list): + args = [el if isinstance(el, list) else [el] for el in args] + return [list(x) for x in product(*args)] + elif isinstance(args, dict): + flat = [] + for k, v in args.items(): + if isinstance(v, list): + flat.append([(k, el) for el in v]) + else: + flat.append([(k, v)]) + return [dict(x) for x in product(*flat)] + else: + raise TypeError("No args handling exists for %s" % type(args).__name__) + + +def _substitute_variables(arg, vs): + if isinstance(arg, dict): + return dict([(k, _substitute_variables(v, vs)) for k, v in arg.items()]) + 
elif isinstance(arg, list): + return [_substitute_variables(a, vs) for a in arg] + elif isinstance(arg, str) and arg in vs: + return vs[arg] + else: + return arg + + +def _get_definitions(definition_file): + with open(definition_file, "r") as f: + return yaml.load(f) + + +def list_algorithms(definition_file): + definitions = _get_definitions(definition_file) + + print('The following algorithms are supported...') + for point in definitions: + print('\t... for the point type "%s"...' % point) + for metric in definitions[point]: + print('\t\t... and the distance metric "%s":' % metric) + for algorithm in definitions[point][metric]: + print('\t\t\t%s' % algorithm) + + +def get_unique_algorithms(definition_file): + definitions = _get_definitions(definition_file) + algos = set() + for point in definitions: + for metric in definitions[point]: + for algorithm in definitions[point][metric]: + algos.add(algorithm) + return list(sorted(algos)) + + +def get_definitions(definition_file, dimension, point_type="float", distance_metric="euclidean", count=10): + definitions = _get_definitions(definition_file) + + algorithm_definitions = {} + if "any" in definitions[point_type]: + algorithm_definitions.update(definitions[point_type]["any"]) + algorithm_definitions.update(definitions[point_type][distance_metric]) + + definitions = [] + for (name, algo) in algorithm_definitions.items(): + for k in ['docker-tag', 'module', 'constructor']: + if k not in algo: + raise Exception('algorithm %s does not define a "%s" property' % (name, k)) + + base_args = [] + if "base-args" in algo: + base_args = algo["base-args"] + + for run_group in algo["run-groups"].values(): + if "arg-groups" in run_group: + groups = [] + for arg_group in run_group["arg-groups"]: + if isinstance(arg_group, dict): + # Dictionaries need to be expanded into lists in order + # for the subsequent call to _generate_combinations to + # do the right thing + groups.append(_generate_combinations(arg_group)) + else: + 
groups.append(arg_group) + args = _generate_combinations(groups) + elif "args" in run_group: + args = _generate_combinations(run_group["args"]) + else: + assert False, "? what? %s" % run_group + + if "query-arg-groups" in run_group: + groups = [] + for arg_group in run_group["query-arg-groups"]: + if isinstance(arg_group, dict): + groups.append(_generate_combinations(arg_group)) + else: + groups.append(arg_group) + query_args = _generate_combinations(groups) + elif "query-args" in run_group: + query_args = _generate_combinations(run_group["query-args"]) + else: + query_args = [] + + for arg_group in args: + obj = None + aargs = [] + aargs.extend(base_args) + if isinstance(arg_group, list): + aargs.extend(arg_group) + else: + aargs.append(arg_group) + + vs = { + "@count": count, + "@metric": distance_metric, + "@dimension": dimension + } + aargs = [_substitute_variables(arg, vs) for arg in aargs] + definitions.append(Definition( + algorithm=name, + docker_tag=algo['docker-tag'], + module=algo['module'], + constructor=algo['constructor'], + arguments=aargs, + query_argument_groups=query_args, + disabled=algo.get('disabled', False) + )) + + return definitions diff --git a/ann_benchmarks/algorithms/dolphinnpy.py b/ann_benchmarks/algorithms/dolphinnpy.py new file mode 100644 index 0000000..1090cd0 --- /dev/null +++ b/ann_benchmarks/algorithms/dolphinnpy.py @@ -0,0 +1,30 @@ +from __future__ import absolute_import +import sys +sys.path.append("install/lib-dolphinnpy") +import numpy +import ctypes +from dolphinn import Dolphinn +from utils import findmean, isotropize +from ann_benchmarks.algorithms.base import BaseANN + +class DolphinnPy(BaseANN): + def __init__(self, num_probes): + self.name = 'Dolphinn(num_probes={} )'.format(num_probes) + self.num_probes = num_probes + self.m = 1 + self._index = None + + def fit(self, X): + if X.dtype != numpy.float32: + X = numpy.array(X, dtype=numpy.float32) + d = X.shape[1] + self.m = findmean(X, d, 10) + X = isotropize(X, d, self.m) 
+ hypercube_dim = int(numpy.log2(len(X))) - 2 + self._index = Dolphinn(X, d, hypercube_dim) + + def query(self, v, n): + q = numpy.array([v]) + q = isotropize(q, len(v), self.m) + res = self._index.queries(q, n, self.num_probes) + return res[0] diff --git a/ann_benchmarks/algorithms/dummy_algo.py b/ann_benchmarks/algorithms/dummy_algo.py new file mode 100644 index 0000000..0682b03 --- /dev/null +++ b/ann_benchmarks/algorithms/dummy_algo.py @@ -0,0 +1,24 @@ +from __future__ import absolute_import +import numpy as np +from ann_benchmarks.algorithms.base import BaseANN + +class DummyAlgoMt(BaseANN): + def __init__(self, metric): + self.name = 'DummyAlgoMultiThread' + + def fit(self, X): + self.len=len(X)-1 + + def query(self, v, n): + return np.random.randint(self.len, size=n) + + +class DummyAlgoSt(BaseANN): + def __init__(self, metric): + self.name = 'DummyAlgoSingleThread' + + def fit(self, X): + self.len=len(X)-1 + + def query(self, v, n): + return np.random.randint(self.len, size=n) diff --git a/ann_benchmarks/algorithms/faiss.py b/ann_benchmarks/algorithms/faiss.py new file mode 100644 index 0000000..2f41986 --- /dev/null +++ b/ann_benchmarks/algorithms/faiss.py @@ -0,0 +1,71 @@ +from __future__ import absolute_import +import sys +sys.path.append("install/lib-faiss") +import numpy +import sklearn.preprocessing +import ctypes +import faiss +from ann_benchmarks.algorithms.base import BaseANN + +class Faiss(BaseANN): + def query(self, v, n): + if self._metric == 'angular': + v /= numpy.linalg.norm(v) + D, I = self.index.search(numpy.expand_dims(v,axis=0).astype(numpy.float32), n) + return I[0] + + def batch_query(self, X, n): + if self._metric == 'angular': + X /= numpy.linalg.norm(X) + self.res = self.index.search(X.astype(numpy.float32), n) + + def get_batch_results(self): + D, L = self.res + res = [] + for i in range(len(D)): + r = [] + for l, d in zip(L[i], D[i]): + if l != -1: + r.append(l) + res.append(r) + return res + +class FaissLSH(Faiss): + def 
class FaissIVF(Faiss):
    """Faiss inverted-file (IVF) index with a flat L2 coarse quantizer."""

    def __init__(self, metric, n_list):
        self._metric = metric
        self._n_list = n_list

    def fit(self, X):
        """Train and populate an IVF-flat index over X (L2-normalized first
        when the metric is angular, so L2 search matches cosine ranking)."""
        if self._metric == 'angular':
            X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
        if X.dtype != numpy.float32:
            X = X.astype(numpy.float32)

        dim = X.shape[1]
        self.quantizer = faiss.IndexFlatL2(dim)
        ivf = faiss.IndexIVFFlat(self.quantizer, dim, self._n_list, faiss.METRIC_L2)
        ivf.train(X)
        ivf.add(X)
        self.index = ivf

    def set_query_arguments(self, n_probe):
        """Choose how many inverted lists are scanned per query."""
        self._n_probe = n_probe
        self.index.nprobe = self._n_probe

    def __str__(self):
        return 'FaissIVF(n_list=%d, n_probe=%d)' % (self._n_list, self._n_probe)
# Implementation based on
# https://github.com/facebookresearch/faiss/blob/master/benchs/bench_gpu_sift1m.py
class FaissGPU(BaseANN):
    """GPU IVF-flat index using faiss's StandardGpuResources."""

    def __init__(self, n_bits, n_probes):
        self.name = 'FaissGPU(n_bits={}, n_probes={})'.format(n_bits, n_probes)
        self._n_bits = n_bits
        self._n_probes = n_probes
        self._res = faiss.StandardGpuResources()
        self._index = None

    def fit(self, X):
        """Train and fill a GpuIndexIVFFlat over X (L2 metric)."""
        X = X.astype(numpy.float32)
        self._index = faiss.GpuIndexIVFFlat(self._res, len(X[0]), self._n_bits,
                                            faiss.METRIC_L2)
        self._index.train(X)
        self._index.add(X)
        self._index.setNumProbes(self._n_probes)

    def query(self, v, n):
        return [label for label, _ in self.query_with_distances(v, n)]

    def query_with_distances(self, v, n):
        """Return (label, distance) pairs for the n nearest neighbours of v."""
        q = v.astype(numpy.float32).reshape(1, -1)
        distances, labels = self._index.search(q, n)
        # faiss pads missing neighbours with label -1; drop those entries
        return [(l, d) for l, d in zip(labels[0], distances[0]) if l != -1]

    def batch_query(self, X, n):
        self.res = self._index.search(X.astype(numpy.float32), n)

    def get_batch_results(self):
        """Strip the -1 padding labels from the last batch_query's results."""
        D, L = self.res
        return [[l for l in row if l != -1] for row in L]
class FLANN(BaseANN):
    """Wrapper around pyflann's autotuned index."""

    def __init__(self, metric, target_precision):
        self._target_precision = target_precision
        self._metric = metric
        self.name = 'FLANN(target_precision=%f)' % self._target_precision

    def fit(self, X):
        """Build an autotuned FLANN index (inputs L2-normalized for angular)."""
        self._flann = pyflann.FLANN(target_precision=self._target_precision,
                                    algorithm='autotuned', log_level='info')
        if self._metric == 'angular':
            X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
        self._flann.build_index(X)

    def query(self, v, n):
        """Return the indices of the n nearest neighbours of v."""
        q = v
        if self._metric == 'angular':
            q = sklearn.preprocessing.normalize([q], axis=1, norm='l2')[0]
        if q.dtype != numpy.float32:
            q = q.astype(numpy.float32)
        result = self._flann.nn_index(q, n)
        return result[0][0]
class HnswLib(BaseANN):
    """hnswlib HNSW graph index (cosine or L2 space)."""

    def __init__(self, metric, method_param):
        self.metric = {'angular': 'cosine', 'euclidean': 'l2'}[metric]
        self.method_param = method_param
        self.name = 'hnswlib (%s)' % (self.method_param)

    def fit(self, X):
        """Build the HNSW graph over X with sequential integer labels."""
        dim = len(X[0])
        self.p = hnswlib.Index(space=self.metric, dim=dim)
        self.p.init_index(max_elements=len(X),
                          ef_construction=self.method_param["efConstruction"],
                          M=self.method_param["M"])
        self.p.add_items(np.asarray(X), np.arange(len(X)))
        # single-threaded queries for fair benchmarking
        self.p.set_num_threads(1)

    def set_query_arguments(self, ef):
        """Set the search-time ef parameter (size of the dynamic candidate list)."""
        self.p.set_ef(ef)

    def query(self, v, n):
        """Return the labels of the n nearest neighbours of v."""
        labels, _ = self.p.knn_query(np.expand_dims(v, axis=0), k=n)
        return labels[0]

    def freeIndex(self):
        del self.p
= metric + self.name = 'KDTree(leaf_size=%d)' % self._leaf_size + + def fit(self, X): + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + self._tree = sklearn.neighbors.KDTree(X, leaf_size=self._leaf_size) + + def query(self, v, n): + if self._metric == 'angular': + v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + dist, ind = self._tree.query([v], k=n) + return ind[0] diff --git a/ann_benchmarks/algorithms/kgraph.py b/ann_benchmarks/algorithms/kgraph.py new file mode 100644 index 0000000..ed8db39 --- /dev/null +++ b/ann_benchmarks/algorithms/kgraph.py @@ -0,0 +1,37 @@ +from __future__ import absolute_import +import os +import numpy +import pykgraph +from ann_benchmarks.constants import INDEX_DIR +from ann_benchmarks.algorithms.base import BaseANN + +class KGraph(BaseANN): + def __init__(self, metric, index_params, save_index): + if type(metric) == unicode: + metric = str(metric) + self.name = 'KGraph(%s)' % (metric) + self._metric = metric + self._index_params = index_params + self._save_index = save_index + + def fit(self, X): + if X.dtype != numpy.float32: + X = X.astype(numpy.float32) + self._kgraph = pykgraph.KGraph(X, self._metric) + path = os.path.join(INDEX_DIR, 'kgraph-index-%s' % self._metric) + if os.path.exists(path): + self._kgraph.load(path) + else: + self._kgraph.build(**self._index_params) #iterations=30, L=100, delta=0.002, recall=0.99, K=25) + if not os.path.exists(INDEX_DIR): + os.makedirs(INDEX_DIR) + self._kgraph.save(path) + + def set_query_arguments(self, P): + self._P = P + + def query(self, v, n): + if v.dtype != numpy.float32: + v = v.astype(numpy.float32) + result = self._kgraph.search(numpy.array([v]), K=n, threads=1, P=self._P) + return result[0] diff --git a/ann_benchmarks/algorithms/lshf.py b/ann_benchmarks/algorithms/lshf.py new file mode 100644 index 0000000..1854e7f --- /dev/null +++ b/ann_benchmarks/algorithms/lshf.py @@ -0,0 +1,22 @@ +from __future__ import absolute_import 
+import sklearn.neighbors +import sklearn.preprocessing +from ann_benchmarks.algorithms.base import BaseANN + +class LSHF(BaseANN): + def __init__(self, metric, n_estimators=10, n_candidates=50): + self.name = 'LSHF(n_est=%d, n_cand=%d)' % (n_estimators, n_candidates) + self._metric = metric + self._n_estimators = n_estimators + self._n_candidates = n_candidates + + def fit(self, X): + self._lshf = sklearn.neighbors.LSHForest(n_estimators=self._n_estimators, n_candidates=self._n_candidates) + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + self._lshf.fit(X) + + def query(self, v, n): + if self._metric == 'angular': + v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + return self._lshf.kneighbors([v], return_distance=False, n_neighbors=n)[0] diff --git a/ann_benchmarks/algorithms/mrpt.py b/ann_benchmarks/algorithms/mrpt.py new file mode 100644 index 0000000..e7ffc14 --- /dev/null +++ b/ann_benchmarks/algorithms/mrpt.py @@ -0,0 +1,31 @@ +from __future__ import absolute_import +import numpy +import sklearn.preprocessing +import mrpt +from ann_benchmarks.algorithms.base import BaseANN + +class MRPT(BaseANN): + def __init__(self, metric, n_trees, depth): + self._metric = metric + self._n_trees = n_trees + self._depth = depth + self._votes_required = None + + def fit(self, X): + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + + self._index = mrpt.MRPTIndex(X, depth=self._depth, n_trees=self._n_trees) + self._index.build() + + def set_query_arguments(self, votes_required): + self._votes_required = votes_required + + def query(self, v, n): + if self._metric == 'angular': + v /= numpy.linalg.norm(v) + + return self._index.ann(v, n, votes_required=self._votes_required) + + def __str__(self): + return 'MRPT(n_trees=%d, depth=%d, votes_required=%d)' % (self._n_trees, self._depth, self._votes_required) diff --git a/ann_benchmarks/algorithms/nearpy.py 
b/ann_benchmarks/algorithms/nearpy.py new file mode 100644 index 0000000..318fa2a --- /dev/null +++ b/ann_benchmarks/algorithms/nearpy.py @@ -0,0 +1,63 @@ +from __future__ import absolute_import +import nearpy +from nearpy.filters import NearestFilter +import sklearn.preprocessing +from ann_benchmarks.algorithms.base import BaseANN +import scipy +from scipy.spatial.distance import jaccard + +# Chunjiang modified 0220 +class JaccardDistance(): + """ Jaccard distance """ + + def distance(self, x, y): + """ + Computes distance measure between vectors x and y. Returns float. + """ + return jaccard(x, y) + +class NearPy(BaseANN): + def __init__(self, metric, n_bits, hash_counts): + self._n_bits = n_bits + self._hash_counts = hash_counts + self._metric = metric + self._filter = NearestFilter(10) + self.name = 'NearPy(n_bits=%d, hash_counts=%d)' % (self._n_bits, self._hash_counts) + + def fit(self, X): + hashes = [] + + for k in range(self._hash_counts): + nearpy_rbp = nearpy.hashes.RandomBinaryProjections('rbp_%d' % k, self._n_bits) + hashes.append(nearpy_rbp) + + if self._metric == 'euclidean': + dist = nearpy.distances.EuclideanDistance() + self._nearpy_engine = nearpy.Engine( + X.shape[1], + lshashes=hashes, + distance=dist) + elif self._metric == 'jaccard': # Chunjiang modified 0220 + dist = JaccardDistance() + self._nearpy_engine = nearpy.Engine( + X.shape[1], + lshashes=hashes, + distance=dist) + else: # Default (angular) = Cosine distance + self._nearpy_engine = nearpy.Engine( + X.shape[1], + lshashes=hashes, + vector_filters=[self._filter]) + + if self._metric == 'angular': + X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + for i, x in enumerate(X): + self._nearpy_engine.store_vector(x, i) + + def query(self, v, n): + # XXX: This feels like an unpleasant hack, but it's not clear how to do + # better without making changes to NearPy + self._filter.N = n + if self._metric == 'angular': + v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + 
return [y for x, y, z in self._nearpy_engine.neighbours(v)] diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py new file mode 100644 index 0000000..44732b2 --- /dev/null +++ b/ann_benchmarks/algorithms/nmslib.py @@ -0,0 +1,95 @@ +from __future__ import absolute_import +import os +import nmslib +from ann_benchmarks.constants import INDEX_DIR +from ann_benchmarks.algorithms.base import BaseANN +from scipy.sparse import csr_matrix +import numpy + + +class NmslibReuseIndex(BaseANN): + @staticmethod + def encode(d): + return ["%s=%s" % (a, b) for (a, b) in d.items()] + + # For each entry in the sparse matrix, extract a list of IDs and + # convert them to a string. Return a list of such strings. + @staticmethod + def matrToStrArray(sparseMatr): + res = [] + indptr = sparseMatr.indptr + indices = sparseMatr.indices + for row in range(sparseMatr.shape[0]): + arr = [k for k in indices[indptr[row]: indptr[row + 1]]] + arr.sort() + res.append(' '.join([str(k) for k in arr])) + return res + + def __init__(self, metric, method_name, index_param, query_param): + self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric] + self._method_name = method_name + self._save_index = False + self._index_param = NmslibReuseIndex.encode(index_param) + if query_param!=False: + self._query_param = NmslibReuseIndex.encode(query_param) + self.name = 'Nmslib(method_name=%s, index_param=%s, query_param=%s)' % ( + self._method_name, self._index_param, self._query_param) + else: + self._query_param = None + self.name = 'Nmslib(method_name=%s, index_param=%s)' % ( + self._method_name, self._index_param) + + self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, '_'.join(self._index_param))) + + d = os.path.dirname(self._index_name) + if not os.path.exists(d): + os.makedirs(d) + + def fit(self, X): + if self._method_name == 'vptree': + # To avoid this issue: + # terminate called after 
throwing an instance of 'std::runtime_error' + # what(): The data size is too small or the bucket size is too big. Select the parameters so that is NOT less than * 1000 + # Aborted (core dumped) + self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000)) + + # Chunjiang modified it to "if" for jaccard + if self._nmslib_metric == 'jaccard_sparse': + X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X)) + self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING) + self._index.addDataPointBatch(X_trans) + else: + self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name) + self._index.addDataPointBatch(X) + + if os.path.exists(self._index_name): + print('Loading index from file') + self._index.loadIndex(self._index_name) + else: + self._index.createIndex(self._index_param) + if self._save_index: + self._index.saveIndex(self._index_name) + if self._query_param is not None: + self._index.setQueryTimeParams(self._query_param) + + def set_query_arguments(self, ef): + if self._method_name == 'hnsw' or self._method_name == 'sw-graph': + self._index.setQueryTimeParams(["efSearch=%s"%(ef)]) + + def query(self, v, n): + # Chunjiang modified + if self._nmslib_metric == 'jaccard_sparse': + nz = numpy.nonzero(v)[0] + v = ' '.join([str(k) for k in nz]) + ids, distances = self._index.knnQuery(v, n) + return ids + + def batch_query(self, X, n): + # Chunjiang modified + if self._nmslib_metric == 'jaccard_sparse': + X = NmslibReuseIndex.matrToStrArray(csr_matrix(X)) + self.res = self._index.knnQueryBatch(X, n) + + def get_batch_results(self): + return [x for x, _ in self.res] + diff --git a/ann_benchmarks/algorithms/nmslib_sparse.py b/ann_benchmarks/algorithms/nmslib_sparse.py new file mode 100644 index 0000000..58af969 --- /dev/null +++ b/ann_benchmarks/algorithms/nmslib_sparse.py @@ -0,0 +1,95 @@ +from __future__ import absolute_import +import os +import nmslib +from 
class NmslibSparseReuseIndex(BaseANN):
    """nmslib wrapper that feeds sparse (CSR) data as space-separated ID
    strings for the 'jaccard_sparse' space, and reuses a saved index from
    disk when one exists at the derived index path."""

    @staticmethod
    def encode(d):
        """Render a parameter dict as nmslib 'key=value' strings.

        Fix: the original used dict.iteritems(), which only exists on
        Python 2 and raises AttributeError on Python 3.
        """
        return ["%s=%s" % (a, b) for (a, b) in d.items()]

    # For each entry in the sparse matrix, extract a list of IDs and
    # convert them to a string. Return a list of such strings.
    @staticmethod
    def matrToStrArray(sparseMatr):
        res = []
        indptr = sparseMatr.indptr
        indices = sparseMatr.indices
        for row in range(sparseMatr.shape[0]):
            arr = [k for k in indices[indptr[row]: indptr[row + 1]]]
            arr.sort()
            res.append(' '.join([str(k) for k in arr]))
        return res

    def __init__(self, metric, method_name, index_param, query_param):
        # Map benchmark metric names onto nmslib space names; KeyError on
        # unsupported metrics is intentional (fail fast).
        self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric]
        self._method_name = method_name
        self._save_index = False
        self._index_param = NmslibSparseReuseIndex.encode(index_param)
        # query_param may be the literal False to mean "no query-time params".
        if query_param != False:
            self._query_param = NmslibSparseReuseIndex.encode(query_param)
            self.name = 'Nmslib(method_name=%s, index_param=%s, query_param=%s)' % (
                self._method_name, self._index_param, self._query_param)
        else:
            self._query_param = None
            self.name = 'Nmslib(method_name=%s, index_param=%s)' % (
                self._method_name, self._index_param)

        # Index file path encodes method, metric and build parameters so
        # different configurations never collide on disk.
        self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, '_'.join(self._index_param)))

        d = os.path.dirname(self._index_name)
        if not os.path.exists(d):
            os.makedirs(d)

    def fit(self, X):
        """Build (or load from disk) the nmslib index over X."""
        if self._method_name == 'vptree':
            # To avoid this issue:
            # terminate called after throwing an instance of 'std::runtime_error'
            # what(): The data size is too small or the bucket size is too big. Select the parameters so that is NOT less than * 1000
            # Aborted (core dumped)
            self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

        # Chunjiang modified it to "if" for jaccard
        if self._nmslib_metric == 'jaccard_sparse':
            # Sparse/jaccard data is passed as strings of sorted feature IDs.
            X_trans = NmslibSparseReuseIndex.matrToStrArray(X)
            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING)
            self._index.addDataPointBatch(X_trans)
        else:
            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
            self._index.addDataPointBatch(X)

        if os.path.exists(self._index_name):
            print('Loading index from file')
            self._index.loadIndex(self._index_name)
        else:
            self._index.createIndex(self._index_param)
            if self._save_index:
                self._index.saveIndex(self._index_name)
        if self._query_param is not None:
            self._index.setQueryTimeParams(self._query_param)

    def set_query_arguments(self, ef):
        # Only graph-based methods understand efSearch.
        if self._method_name == 'hnsw' or self._method_name == 'sw-graph':
            self._index.setQueryTimeParams(["efSearch=%s" % (ef)])

    def query(self, v, n):
        # Chunjiang modified: dense query vector is converted to the same
        # string-of-IDs representation used at build time.
        if self._nmslib_metric == 'jaccard_sparse':
            nz = numpy.nonzero(v)[0]
            v = ' '.join([str(k) for k in nz])
        ids, distances = self._index.knnQuery(v, n)
        return ids

    def batch_query(self, X, n):
        # Chunjiang modified
        if self._nmslib_metric == 'jaccard_sparse':
            X = NmslibSparseReuseIndex.matrToStrArray(csr_matrix(X))
        self.res = self._index.knnQueryBatch(X, n)

    def get_batch_results(self):
        # knnQueryBatch returns (ids, distances) pairs; keep only ids.
        return [x for x, _ in self.res]
class ONNG(BaseANN):
    """NGT ONNG wrapper: builds an ANNG graph via the `ngt` CLI + ngtpy,
    then degree-adjusts it into an ONNG index, reusing on-disk indexes."""

    def __init__(self, metric, object_type, epsilon, edge, outdegree, indegree):
        # Chunjiang modified 0222
        # Single-letter codes are the `ngt create -D` distance flags.
        metrics = {'euclidean': '2', 'angular': 'C', 'jaccard': 'j'}
        types = {'Float' : 'f', 'Byte' : 'c'}
        self._edge_size = edge # edge_size_for_construction
        self._outdegree = outdegree
        self._indegree = indegree
        self._metric = metrics[metric]
        # Chunjiang modified 0222
        self._object_type = types[object_type]
        self._edge_size_for_search = 0
        self._build_time_limit = 4
        self._epsilon = epsilon
        print('ONNG: edge_size=' + str(self._edge_size))
        print('ONNG: outdegree=' + str(self._outdegree))
        print('ONNG: indegree=' + str(self._indegree))
        print('ONNG: edge_size_for_search=' + str(self._edge_size_for_search))
        print('ONNG: epsilon=' + str(self._epsilon))
        print('ONNG: metric=' + metric)
        print('ONNG: object_type=' + object_type)

    def fit(self, X):
        """Create (or reuse) the ANNG and ONNG indexes under ./indexes and
        open the final ONNG read-only as self.index."""
        print('ONNG: start indexing...')
        dim = X.shape[1]
        print('ONNG: # of data=' + str(X.shape[0]))
        print('ONNG: dimensionality=' + str(dim))
        index_dir = 'indexes'
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)
        # Index paths encode the build parameters so configurations don't collide.
        index = os.path.join(index_dir, 'ONNG-' + str(self._edge_size) + '-' + str(self._outdegree) + '-' + str(self._indegree))
        anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size))
        print('ONNG: index=' + index)
        print('ANNG: index=' + anngIndex)

        if (not os.path.exists(index)) and (not os.path.exists(anngIndex)):
            # Stage 1: build the intermediate ANNG with the ngt CLI, then
            # bulk-insert the vectors through ngtpy.
            print('ONNG: create ANNG')
            t = time.time()
            #'-b500',
            args = ['ngt', 'create', '-it', '-p8', '-b500', '-ga', '-o' + self._object_type, '-D' + self._metric, '-d' + str(dim), '-E' + str(self._edge_size), '-S' + str(self._edge_size_for_search), '-e' + str(self._epsilon), '-P0', '-B30', '-T' + str(self._build_time_limit), anngIndex]
            print(args)
            subprocess.call(args)
            idx = ngtpy.Index(path=anngIndex)
            idx.batch_insert(X, num_threads=24, debug=False)
            idx.save()
            idx.close()
            print('ONNG: ANNG construction time(sec)=' + str(time.time() - t))
        if not os.path.exists(index):
            # Stage 2: reconstruct-graph converts ANNG -> ONNG with the
            # requested out/in degrees.
            # NOTE(review): '-o ' / '-i ' include a trailing space, so each
            # argv token is e.g. '-o 10' — confirm the ngt CLI accepts this.
            print('ONNG: degree adjustment')
            t = time.time()
            args = ['ngt', 'reconstruct-graph', '-mS', '-o ' + str(self._outdegree), '-i ' + str(self._indegree), anngIndex, index]
            subprocess.call(args)
            print('ONNG: degree adjustment time(sec)=' + str(time.time() -t))
        if os.path.exists(index):
            print('ONNG: index already exists! ' + str(index))
            t = time.time()
            self.index = ngtpy.Index(index, read_only=True)
            self.indexName = index
            print('ONNG: open time(sec)=' + str(time.time() - t))
        else:
            print('ONNG: something wrong.')
        print('ONNG: end of fit')

    def set_query_arguments(self, epsilon):
        # Stored epsilon is the user value minus 1.0 (ngtpy convention);
        # the display name adds it back for readability.
        print("ONNG: epsilon=" + str(epsilon))
        self._epsilon = epsilon - 1.0
        self.name = 'ONNG-NGT(%s, %s, %s, %s, %1.3f)' % (self._edge_size, self._outdegree, self._indegree, self._edge_size_for_search, self._epsilon + 1.0)

    def query(self, v, n, rq=False):
        # rq=True switches to range query via searchRange; both paths return
        # ids only (with_distance=False).
        if rq:
            # direct method
            #self.index.set(sys.maxsize, n)
            #n = 0 # then input size 0 to search
            #results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)

            # indirect method
            results = self.index.searchRange(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
        else:
            results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
        return results

    def freeIndex(self):
        # No explicit resource release; ngtpy handles cleanup.
        print('ONNG: free')
pathadj, searchedge): + metrics = {'euclidean': 'L2', 'angular': 'Cosine', 'jaccard': 'Jaccard'} + self._edge_size = edge + self._pathadj_size = pathadj + self._edge_size_for_search = searchedge + self._metric = metrics[metric] + self._object_type = object_type + print('PANNG: edge_size=' + str(self._edge_size)) + print('PANNG: pathadj_size=' + str(self._pathadj_size)) + print('PANNG: edge_size_for_search=' + str(self._edge_size_for_search)) + print('PANNG: metric=' + metric) + print('PANNG: object_type=' + object_type) + + def fit(self, X): + print('PANNG: start indexing...') + dim = len(X[0]) + print('PANNG: # of data=' + str(len(X))) + print('PANNG: Dimensionality=' + str(dim)) + index_dir = 'indexes' + if not os.path.exists(index_dir): + os.makedirs(index_dir) + index = os.path.join(index_dir, 'PANNG-' + str(self._edge_size) + '-' + str(self._pathadj_size)) + print(index) + if os.path.exists(index): + print('PANNG: index already exists! ' + str(index)) + else: + t0 = time.time() + ngtpy.create(path=index, dimension=dim, edge_size_for_creation=self._edge_size, distance_type=self._metric, + object_type=self._object_type) + idx = ngtpy.Index(path=index) + idx.batch_insert(X, num_threads=24, debug=False) + idx.save() + idx.close() + if self._pathadj_size > 0 : + print('PANNG: path adjustment') + args = ['ngt', 'prune', '-s ' + str(self._pathadj_size), index] + subprocess.call(args) + indexingtime = time.time() - t0 + print('PANNG: indexing, adjustment and saving time(sec)=' + str(indexingtime)) + t0 = time.time() + self.index = ngtpy.Index(path=index, read_only=True) + opentime = time.time() - t0 + print('PANNG: open time(sec)=' + str(opentime)) + + def set_query_arguments(self, epsilon): + print("PANNG: epsilon=" + str(epsilon)) + self._epsilon = epsilon - 1.0 + self.name = 'PANNG-NGT(%d, %d, %d, %1.3f)' % (self._edge_size, self._pathadj_size, self._edge_size_for_search, self._epsilon + 1.0) + + def query(self, v, n): + results = self.index.search(v, n, 
self._epsilon, self._edge_size_for_search, with_distance=False) + return results + + def freeIndex(self): + print('PANNG: free') diff --git a/ann_benchmarks/algorithms/panns.py b/ann_benchmarks/algorithms/panns.py new file mode 100644 index 0000000..d42867d --- /dev/null +++ b/ann_benchmarks/algorithms/panns.py @@ -0,0 +1,19 @@ +from __future__ import absolute_import +import panns +from ann_benchmarks.algorithms.base import BaseANN + +class PANNS(BaseANN): + def __init__(self, metric, n_trees, n_candidates): + self._n_trees = n_trees + self._n_candidates = n_candidates + self._metric = metric + self.name = 'PANNS(n_trees=%d, n_cand=%d)' % (self._n_trees, self._n_candidates) + + def fit(self, X): + self._panns = panns.PannsIndex(X.shape[1], metric=self._metric) + for x in X: + self._panns.add_vector(x) + self._panns.build(self._n_trees) + + def query(self, v, n): + return [x for x, y in self._panns.query(v, n)] diff --git a/ann_benchmarks/algorithms/pynndescent.py b/ann_benchmarks/algorithms/pynndescent.py new file mode 100644 index 0000000..190bf3e --- /dev/null +++ b/ann_benchmarks/algorithms/pynndescent.py @@ -0,0 +1,35 @@ +from __future__ import absolute_import +import pynndescent +from ann_benchmarks.algorithms.base import BaseANN + +class PyNNDescent(BaseANN): + def __init__(self, metric, n_neighbors=10, n_trees=8, leaf_size=20): + self._n_neighbors = int(n_neighbors) + self._n_trees = int(n_trees) + self._leaf_size = int(leaf_size) + self._queue_size=None + self._pynnd_metric = {'angular': 'cosine', + 'euclidean': 'euclidean', + 'hamming': 'hamming', + 'jaccard': 'jaccard'}[metric] + + def fit(self, X): + self._index = pynndescent.NNDescent(X, + n_neighbors=self._n_neighbors, + n_trees=self._n_trees, + leaf_size=self._leaf_size, + metric=self._pynnd_metric) + + def set_query_arguments(self, queue_size): + self._queue_size = float(queue_size) + + + def query(self, v, n): + ind, dist = self._index.query(v.reshape(1, -1).astype('float32'), k=n, 
queue_size=self._queue_size) + return ind[0] + + def __str__(self): + return 'PyNNDescent(n_neighbors=%d, n_trees=%d, leaf_size=%d, queue_size=%.2f)' % (self._n_neighbors, + self._n_trees, + self._leaf_size, + self._queue_size) diff --git a/ann_benchmarks/algorithms/risc.py b/ann_benchmarks/algorithms/risc.py new file mode 100644 index 0000000..fe89304 --- /dev/null +++ b/ann_benchmarks/algorithms/risc.py @@ -0,0 +1,83 @@ +from __future__ import absolute_import +import sys +sys.path.append('/risc/Code') +print(sys.path) +import pyrisc +from ann_benchmarks.algorithms.base import BaseANN +from scipy.sparse import csr_matrix +import numpy +import os + +class Risc(BaseANN): + + def __init__(self, metric, method): + if metric != "jaccard": + raise NotImplementedError("BruteForce doesn't support metric %s, only jaccard metric is supported." % metric) + methods = {'Risc': 1, 'Linearscan': 2, 'AOR': 3, 'DivideSkip': 4} + self._metric = metric + self._method = methods[method] + self.name = method + "()" + + def pre_fit(self, X): + def matrToStrArray(sparseMatr): + res = "" + indptr = sparseMatr.indptr + indices = sparseMatr.indices + for row in range(sparseMatr.shape[0]): + arr = [k for k in indices[indptr[row]: indptr[row + 1]]] + arr.sort() + res1 = "{" + ':1 , '.join([str(k) for k in arr]) + ':1}' + res += res1 + "\n" + return res + + # transform data and store in file + data_trans = matrToStrArray(csr_matrix(X)) + # print(data_trans) + text_file = open("train.txt", "w") + text_file.write(data_trans) + text_file.close() + + # call function with file + self._featureId = pyrisc.getFeatureId("train.txt", "features.txt") + self._data = pyrisc.readDatabase("train.txt", self._featureId) + # self._data = pyrisc.readDatabase("train.txt", "features.txt") + + + def fit(self, X): + self._index = pyrisc.getIndex(self._data, self._method) + + def pre_query(self, v, n): + # transform data and store in file + nz = numpy.nonzero(v)[0] + v = '{' + ':1 , '.join([str(k) for k in nz]) + 
':1}\n' + if os.path.isfile("query.txt"): + os.remove("query.txt") + text_file = open("query.txt", "w") + text_file.write(v) + text_file.close() + + # queries = pyrisc.readQueries("train.txt", "query.txt", "features.txt") + queries = pyrisc.readQueries("query.txt", self._featureId) + self._queryFP = pyrisc.dataBinary_getFingerPrint(queries, 0) + + def query(self, v, n): + self._n = n + self._results = pyrisc._experiments_runTopK_inMemory(self._index, self._data, self._queryFP, self._n, self._method) + + def post_query(self): + if os.path.isfile("result.txt"): + os.remove("result.txt") + pyrisc.writeResults("result.txt", self._data, self._results, self._n) + + # read results from output file + result = [] + with open("result.txt", "r") as fp: + line = fp.readline() + while line: + if line.startswith("#"): + line = fp.readline() + continue + # make 1 based index 0 based + result.append(int(line[:-1])-1) + line = fp.readline() + return result diff --git a/ann_benchmarks/algorithms/rpforest.py b/ann_benchmarks/algorithms/rpforest.py new file mode 100644 index 0000000..d2628bf --- /dev/null +++ b/ann_benchmarks/algorithms/rpforest.py @@ -0,0 +1,19 @@ +from __future__ import absolute_import +import rpforest +import numpy +from ann_benchmarks.algorithms.base import BaseANN + +class RPForest(BaseANN): + def __init__(self, leaf_size, n_trees): + self.name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees) + self._model = rpforest.RPForest(leaf_size=leaf_size, no_trees=n_trees) + + def fit(self, X): + if X.dtype != numpy.double: + X = numpy.array(X).astype(numpy.double) + self._model.fit(X) + + def query(self, v, n): + if v.dtype != numpy.double: + v = numpy.array(v).astype(numpy.double) + return self._model.query(v, n) diff --git a/ann_benchmarks/constants.py b/ann_benchmarks/constants.py new file mode 100644 index 0000000..407200b --- /dev/null +++ b/ann_benchmarks/constants.py @@ -0,0 +1 @@ +INDEX_DIR = 'indices' diff --git a/ann_benchmarks/data.py 
b/ann_benchmarks/data.py new file mode 100644 index 0000000..1b4d1d3 --- /dev/null +++ b/ann_benchmarks/data.py @@ -0,0 +1,36 @@ +from __future__ import absolute_import +import numpy + +def float_parse_entry(line): + return [float(x) for x in line.strip().split()] +def float_unparse_entry(entry): + return " ".join(map(str, entry)) +def int_parse_entry(line): + return frozenset([int(x) for x in line.strip().split()]) +def int_unparse_entry(entry): + return " ".join(map(str, map(int, entry))) + +def bit_parse_entry(line): + return [bool(int(x)) for x in list(line.strip().replace(" ", "").replace("\t", ""))] +def bit_unparse_entry(entry): + return " ".join(map(lambda el: "1" if el else "0", entry)) + +type_info = { + "float": { + "type": numpy.float, + "parse_entry": float_parse_entry, + "unparse_entry": float_unparse_entry, + "finish_entries": numpy.vstack + }, + "bit": { + "type": numpy.bool_, + "parse_entry": bit_parse_entry, + "unparse_entry": bit_unparse_entry + }, + "int" : { + "type": numpy.object, + "parse_entry": int_parse_entry, + "unparse_entry": int_unparse_entry, + }, +} + diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py new file mode 100644 index 0000000..0f7b273 --- /dev/null +++ b/ann_benchmarks/datasets.py @@ -0,0 +1,548 @@ +import h5py +import numpy +import os +import random +import sys +try: + from urllib import urlretrieve +except ImportError: + from urllib.request import urlretrieve # Python 3 + + +def download(src, dst): + if not os.path.exists(dst): + # TODO: should be atomic + print('downloading %s -> %s...' 
% (src, dst)) + urlretrieve(src, dst) + + +def get_dataset_fn(dataset): + if not os.path.exists('data'): + os.mkdir('data') + return os.path.join('data', '%s.hdf5' % dataset) + + +def get_dataset(which): + import h5sparse + + hdf5_fn = get_dataset_fn(which) + try: + url = 'http://ann-benchmarks.com/%s.hdf5' % which + download(url, hdf5_fn) + except: + print("Cannot download %s" % url) + if which in DATASETS: + print("Creating dataset locally") + DATASETS[which](hdf5_fn) + + hdf5_f = h5sparse.File(hdf5_fn) + return hdf5_f + + +# Everything below this line is related to creating datasets +# You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com + +def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None): + from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS + import sklearn.neighbors + import h5sparse + + def replace_last(source_string, replace_what, replace_with): + head, _sep, tail = source_string.rpartition(replace_what) + return head + replace_with + tail + + # store SMILES first + if SMILES: + smile_fn = replace_last(fn, '.hdf5', '-SMILES.hdf5') + print('Write Smiles to File %s' % smile_fn) + f = h5sparse.File(smile_fn, 'w') + asciiList = [n.encode("ascii", "ignore") for n in SMILES] + f.create_dataset('smile', (len(asciiList), 1), 'S10', asciiList) + f.close() + print('Finish.') + + print('Write Dataset %s' % fn) + f = h5sparse.File(fn, 'w') + f.attrs['distance'] = distance + f.attrs['point_type'] = point_type + print('train size: %9d * %4d' % train.shape) + print('test size: %9d * %4d' % test.shape) + f.create_dataset('train',data=train) + f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test + neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i') + distances = f.create_dataset('distances', (test.shape[0], count), dtype='f') + + # use which method to compute the groundtruth + train = train.toarray() + method = 'bruteforth' + if 
method == 'balltree': + tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance) + else: + bf = BruteForceBLAS(metric=distance, precision=train.dtype) + bf.fit(train) + + print(test) + for i, x in enumerate(test): + if i % 1 == 0: + print('%d/%d...' % (i, test.shape[0])) + if method == 'balltree': + dist, ind = tree.query([x], k=count) + neighbors[i] = ind[0] + distances[i] = dist[0] + else: + res = list(bf.query_with_distances(x, count)) + res.sort(key=lambda t: t[-1]) + neighbors[i] = [j for j, _ in res] + distances[i] = [d for _, d in res] + print(neighbors[i]) + print(distances[i]) + f.close() + print('Finish.') + + +def train_test_split(X, test_size=10000): + import sklearn.model_selection + print('Splitting %d*%d into train/test' % X.shape) + return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1) + + +def glove(out_fn, d): + import zipfile + + url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip' + fn = os.path.join('data', 'glove.twitter.27B.zip') + download(url, fn) + with zipfile.ZipFile(fn) as z: + print('preparing %s' % out_fn) + z_fn = 'glove.twitter.27B.%dd.txt' % d + X = [] + for line in z.open(z_fn): + v = [float(x) for x in line.strip().split()[1:]] + X.append(numpy.array(v)) + X_train, X_test = train_test_split(X) + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular') + + +def _load_texmex_vectors(f, n, k): + import struct + + v = numpy.zeros((n, k)) + for i in range(n): + f.read(4) # ignore vec length + v[i] = struct.unpack('f' * k, f.read(k*4)) + + return v + + +def _get_irisa_matrix(t, fn): + import struct + m = t.getmember(fn) + f = t.extractfile(m) + k, = struct.unpack('i', f.read(4)) + n = m.size // (4 + 4*k) + f.seek(0) + return _load_texmex_vectors(f, n, k) + + +def sift(out_fn): + import tarfile + + url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz' + fn = os.path.join('data', 'sift.tar.tz') + download(url, fn) + with tarfile.open(fn, 'r:gz') as t: + 
train = _get_irisa_matrix(t, 'sift/sift_base.fvecs') + test = _get_irisa_matrix(t, 'sift/sift_query.fvecs') + write_output(train, test, out_fn, 'euclidean') + + +def gist(out_fn): + import tarfile + + url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz' + fn = os.path.join('data', 'gist.tar.tz') + download(url, fn) + with tarfile.open(fn, 'r:gz') as t: + train = _get_irisa_matrix(t, 'gist/gist_base.fvecs') + test = _get_irisa_matrix(t, 'gist/gist_query.fvecs') + write_output(train, test, out_fn, 'euclidean') + + +def _load_mnist_vectors(fn): + import gzip + import struct + + print('parsing vectors in %s...' % fn) + f = gzip.open(fn) + type_code_info = { + 0x08: (1, "!B"), + 0x09: (1, "!b"), + 0x0B: (2, "!H"), + 0x0C: (4, "!I"), + 0x0D: (4, "!f"), + 0x0E: (8, "!d") + } + magic, type_code, dim_count = struct.unpack("!hBB", f.read(4)) + assert magic == 0 + assert type_code in type_code_info + + dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)] + + entry_count = dimensions[0] + entry_size = numpy.product(dimensions[1:]) + + b, format_string = type_code_info[type_code] + vectors = [] + for i in range(entry_count): + vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)]) + return numpy.array(vectors) + + +def mnist(out_fn): + download('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz') + download('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz') + train = _load_mnist_vectors('mnist-train.gz') + test = _load_mnist_vectors('mnist-test.gz') + write_output(train, test, out_fn, 'euclidean') + + +def fashion_mnist(out_fn): + download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', 'fashion-mnist-train.gz') + download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', 'fashion-mnist-test.gz') + train = _load_mnist_vectors('fashion-mnist-train.gz') + test = 
_load_mnist_vectors('fashion-mnist-test.gz') + write_output(train, test, out_fn, 'euclidean') + + +def transform_bag_of_words(filename, n_dimensions, out_fn): + import gzip + from scipy.sparse import lil_matrix + from sklearn.feature_extraction.text import TfidfTransformer + from sklearn import random_projection + with gzip.open(filename, 'rb') as f: + file_content = f.readlines() + entries = int(file_content[0]) + words = int(file_content[1]) + file_content = file_content[3:] # strip first three entries + print("building matrix...") + A = lil_matrix((entries, words)) + for e in file_content: + doc, word, cnt = [int(v) for v in e.strip().split()] + A[doc - 1, word - 1] = cnt + print("normalizing matrix entries with tfidf...") + B = TfidfTransformer().fit_transform(A) + print("reducing dimensionality...") + C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B) + X_train, X_test = train_test_split(C) + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular') + + +def nytimes(out_fn, n_dimensions): + fn = 'nytimes_%s.txt.gz' % n_dimensions + download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn) + transform_bag_of_words(fn, n_dimensions, out_fn) + + +def random(out_fn, n_dims, n_samples, centers, distance): + import sklearn.datasets + + X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1) + X_train, X_test = train_test_split(X, test_size=0.1) + write_output(X_train, X_test, out_fn, distance) + + +def word2bits(out_fn, path, fn): + import tarfile + local_fn = fn + '.tar.gz' + url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn) + download(url, local_fn) + print('parsing vectors in %s...' 
% local_fn) + with tarfile.open(local_fn, 'r:gz') as t: + f = t.extractfile(fn) + n_words, k = [int(z) for z in next(f).strip().split()] + X = numpy.zeros((n_words, k), dtype=numpy.bool) + for i in range(n_words): + X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=numpy.bool) + + X_train, X_test = train_test_split(X, test_size=1000) + write_output(X_train, X_test, out_fn, 'hamming', 'bit') + +def sift_hamming(out_fn, fn): + import tarfile + local_fn = fn + '.tar.gz' + url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn + download(url, local_fn) + print('parsing vectors in %s...' % local_fn) + with tarfile.open(local_fn, 'r:gz') as t: + f = t.extractfile(fn) + lines = f.readlines() + X = numpy.zeros((len(lines), 256), dtype=numpy.bool) + for i, line in enumerate(lines): + X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool) + X_train, X_test = train_test_split(X, test_size = 1000) + write_output(X_train, X_test, out_fn, 'hamming', 'bit') + +def lastfm(out_fn, n_dimensions, test_size=50000): + # This tests out ANN methods for retrieval on simple matrix factorization based + # recommendation algorithms. The idea being that the query/test vectors are user factors + # and the train set are item factors from the matrix factorization model. 
+ + # Since the predictor is a dot product, we transform the factors first as described in this + # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf + # This hopefully replicates the experiments done in this post: + # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/ + + # The dataset is from "Last.fm Dataset - 360K users": + # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html + + # this requires the implicit package to generate the factors (on my desktop/gpu this only + # takes 4-5 seconds to train - but could take 1-2 minutes on a laptop) + from implicit.datasets.lastfm import get_lastfm + from implicit.approximate_als import augment_inner_product_matrix + import implicit + + # train an als model on the lastfm data + _, _, play_counts = get_lastfm() + model = implicit.als.AlternatingLeastSquares(factors=n_dimensions) + model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8)) + + # transform item factors so that each one has the same norm, and transform the user + # factors such by appending a 0 column + _, item_factors = augment_inner_product_matrix(model.item_factors) + user_factors = numpy.append(model.user_factors, + numpy.zeros((model.user_factors.shape[0], 1)), + axis=1) + + # only query the first 50k users (speeds things up signficantly without changing results) + user_factors = user_factors[:test_size] + + # after that transformation a cosine lookup will return the same results as the inner product + # on the untransformed data + write_output(item_factors, user_factors, out_fn, 'angular') + +def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool): + from rdkit import Chem + from rdkit.Chem import AllChem + from scipy.sparse import csr_matrix + dimension = 1024 + + SMILES = [] + indptr = [0] + indices = [] + data = [] + num_mols = 0 + if file == None: + file = '../pycharm_project_422/clustering_toydata.txt' + file_object = 
open(file, "r") + for line in file_object.readlines(): + elements = line.split() + if len(elements) != 14: continue + smile = elements[7] + mol = Chem.MolFromSmiles(smile) + if mol is None: continue + SMILES.append(smile) + fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension) + for i in range(dimension): + if fp.GetBit(i) is True: + indices.append(i) + data.append(1) + indptr.append(len(indices)) + num_mols += 1 + + fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype) + print('The dimension of the returned sparse matrix: %d*%d' %fps.shape) + + return fps, SMILES + +def get_sparse_matrix_from_sdf(dir, dtype=numpy.bool): + from rdkit import Chem + from rdkit.Chem import AllChem + import glob + import gzip + from scipy.sparse import csr_matrix + dimension = 1024 + + SMILES = [] + indptr = [0] + indices = [] + data = [] + num_mols = 0 + file_list = glob.glob(dir + '/*.sdf.gz') + print(file_list) + for file in file_list: + inf = gzip.open(file) + suppl = Chem.ForwardSDMolSupplier(inf) + for mol in suppl: + if mol is None: continue + smile = Chem.MolToSmiles(mol) + SMILES.append(smile) + fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension) + for i in range(dimension): + if fp.GetBit(i) is True: + indices.append(i) + data.append(1) + indptr.append(len(indices)) + num_mols += 1 + + fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype) + print('The dimension of the returned sparse matrix: %d*%d' % fps.shape) + + return fps, SMILES + +def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): + from sklearn.utils import shuffle + print('prepare dataset ' + dataset_name) + + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + if dataset_name.startswith('toy'): + # toy + X, SMILES = get_sparse_matrix_from_txt(dtype=dtype) + else: + # others, e.g., Chembl and Molport + if dataset_name == 'Molport': + dir = 
'/home/cjz18001/Molport' + elif dataset_name == 'Chembl': + dir = '/home/cjz18001/Chembl' + else: + print('unknown dataset') + exit(0) + X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dtype=dtype) + + # random shuffle fingerprints and smiles at the same time + seed = 1 # random.randint(0, 2 ** 32 - 1) + X, SMILES = shuffle(X, SMILES, random_state=seed) + + # data split and make test data full matrix + train_size = X.shape[0] - test_size + X_train = X[:train_size] + X_test = X[train_size:] + X_test = X_test.toarray() + print('finish dataset preparation') + + print('Train data dimension: %d*%d' %X_train.shape) + print('Test data dimension: %d*%d' %X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES) + + +def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + from scipy.sparse import vstack + path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + # vertically stack sparse matrices from multiple files + test_size = 1 + if num_files==0.5: + with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + size = 1000000 + print('select %i out of %i' %(size, Y.shape[0])) + Y = Y[:size] + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + else: + first = False + for i in range(num_files): + print('process ' + str(i) + ' trunk') + if first == False: + first = True + with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + if i==num_files-1: #last one + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + else: + X_train = Y + else: + with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + 
'.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + if i==num_files-1: #last one + X_test = Y[Y.shape[0] - test_size:] + X_train = vstack([X_train, Y[:Y.shape[0] - test_size]]) + else: + X_train = vstack([X_train, Y]) + # X_train = X_train.astype(dtype) + # X_test = X_test.astype(dtype) + + # X_train, X_test = train_test_split(X, test_size=1000) + # X_test = X_test.toarray() + # encounter memory error when calling train_test_split, for 100M + X_test = X_test.toarray() + print('finish dataset preparation') + + print(X_train.shape) + print(X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, 1000) + +def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + from scipy.sparse import vstack + path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + # vertically stack sparse matrices from multiple files + test_size = 3 + with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + size = 10000000 + print('select %i out of %i' %(size, Y.shape[0])) + Y = Y[:size] + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + + # make them full matrices here + X_train = X_train.toarray() + X_test = X_test.toarray() + print('finish dataset preparation') + + print(X_train.shape) + print(X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, 1000) + +DATASETS = { + 'fashion-mnist-784-euclidean': fashion_mnist, + 'gist-960-euclidean': gist, + 'glove-25-angular': lambda out_fn: glove(out_fn, 25), + 'glove-50-angular': lambda out_fn: glove(out_fn, 50), + 'glove-100-angular': lambda out_fn: glove(out_fn, 100), + 'glove-200-angular': lambda out_fn: glove(out_fn, 200), + 'mnist-784-euclidean': mnist, + 'random-xs-20-euclidean': lambda out_fn: 
random(out_fn, 20, 10000, 100, 'euclidean'), + 'random-s-100-euclidean': lambda out_fn: random(out_fn, 100, 100000, 1000, 'euclidean'), + 'random-xs-20-angular': lambda out_fn: random(out_fn, 20, 10000, 100, 'angular'), + 'random-s-100-angular': lambda out_fn: random(out_fn, 100, 100000, 1000, 'angular'), + 'sift-128-euclidean': sift, + 'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256), + 'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16), + 'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'), + 'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64), + 'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'), + # below are datasets Chunjiang added + 'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'bit', 100), + 'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'), + 'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 'jaccard', 'bit'), + 'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'), + 'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'), + 'chembl-sparse-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K-sparse', 1024, 'jaccard', 'bit'), + 'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'), + 'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), + 'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'), + 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit') +} diff --git a/ann_benchmarks/datasets_old.py b/ann_benchmarks/datasets_old.py new file mode 100644 index 0000000..64c7716 --- /dev/null +++ b/ann_benchmarks/datasets_old.py @@ -0,0 +1,480 @@ +import h5py +import numpy +import os +import random 
+import sys +try: + from urllib import urlretrieve +except ImportError: + from urllib.request import urlretrieve # Python 3 + + +def download(src, dst): + if not os.path.exists(dst): + # TODO: should be atomic + print('downloading %s -> %s...' % (src, dst)) + urlretrieve(src, dst) + + +def get_dataset_fn(dataset): + if not os.path.exists('data'): + os.mkdir('data') + return os.path.join('data', '%s.hdf5' % dataset) + + +def get_dataset(which): + hdf5_fn = get_dataset_fn(which) + try: + url = 'http://ann-benchmarks.com/%s.hdf5' % which + download(url, hdf5_fn) + except: + print("Cannot download %s" % url) + if which in DATASETS: + print("Creating dataset locally") + DATASETS[which](hdf5_fn) + if "sparse" not in which: + hdf5_f = h5py.File(hdf5_fn) + else: + import h5sparse + hdf5_f = h5sparse.File(hdf5_fn) + return hdf5_f + + +# Everything below this line is related to creating datasets +# You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com + +def write_output(train, test, fn, distance, point_type='float', count=1000, sparse=False): + from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS + import sklearn.neighbors + + n = 0 + if sparse == False: + f = h5py.File(fn, 'w') + else: + import h5sparse + f = h5sparse.File(fn, 'w') + f.attrs['distance'] = distance + f.attrs['point_type'] = point_type + print('train size: %9d * %4d' % train.shape) + print('test size: %9d * %4d' % test.shape) + if sparse == False: + f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train + else: + f.create_dataset('train',data=train) + f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test + # f.create_dataset('test', data=test) + neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i') + distances = f.create_dataset('distances', (test.shape[0], count), dtype='f') + + # use which method to compute the groundtruth + method = 'balltree' + if method == 'balltree': + # only serve for 
jaccard + # todo: generalize to other metrics + tree = sklearn.neighbors.BallTree(train, leaf_size=20, metric='jaccard') + else: + bf = BruteForceBLAS(distance, precision=train.dtype) + bf.fit(train) + + print(test) + for i, x in enumerate(test): + if i % 1 == 0: + print('%d/%d...' % (i, test.shape[0])) + if method == 'balltree': + res = tree.query(x, k=count) + else: + res = list(bf.query_with_distances(x, count)) + res.sort(key=lambda t: t[-1]) + neighbors[i] = [j for j, _ in res] + distances[i] = [d for _, d in res] + print(neighbors[i]) + print(distances[i]) + f.close() + + +def train_test_split(X, test_size=10000): + import sklearn.model_selection + print('Splitting %d*%d into train/test' % X.shape) + return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1) + + +def glove(out_fn, d): + import zipfile + + url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip' + fn = os.path.join('data', 'glove.twitter.27B.zip') + download(url, fn) + with zipfile.ZipFile(fn) as z: + print('preparing %s' % out_fn) + z_fn = 'glove.twitter.27B.%dd.txt' % d + X = [] + for line in z.open(z_fn): + v = [float(x) for x in line.strip().split()[1:]] + X.append(numpy.array(v)) + X_train, X_test = train_test_split(X) + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular') + + +def _load_texmex_vectors(f, n, k): + import struct + + v = numpy.zeros((n, k)) + for i in range(n): + f.read(4) # ignore vec length + v[i] = struct.unpack('f' * k, f.read(k*4)) + + return v + + +def _get_irisa_matrix(t, fn): + import struct + m = t.getmember(fn) + f = t.extractfile(m) + k, = struct.unpack('i', f.read(4)) + n = m.size // (4 + 4*k) + f.seek(0) + return _load_texmex_vectors(f, n, k) + + +def sift(out_fn): + import tarfile + + url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz' + fn = os.path.join('data', 'sift.tar.tz') + download(url, fn) + with tarfile.open(fn, 'r:gz') as t: + train = _get_irisa_matrix(t, 'sift/sift_base.fvecs') + test = 
_get_irisa_matrix(t, 'sift/sift_query.fvecs') + write_output(train, test, out_fn, 'euclidean') + + +def gist(out_fn): + import tarfile + + url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz' + fn = os.path.join('data', 'gist.tar.tz') + download(url, fn) + with tarfile.open(fn, 'r:gz') as t: + train = _get_irisa_matrix(t, 'gist/gist_base.fvecs') + test = _get_irisa_matrix(t, 'gist/gist_query.fvecs') + write_output(train, test, out_fn, 'euclidean') + + +def _load_mnist_vectors(fn): + import gzip + import struct + + print('parsing vectors in %s...' % fn) + f = gzip.open(fn) + type_code_info = { + 0x08: (1, "!B"), + 0x09: (1, "!b"), + 0x0B: (2, "!H"), + 0x0C: (4, "!I"), + 0x0D: (4, "!f"), + 0x0E: (8, "!d") + } + magic, type_code, dim_count = struct.unpack("!hBB", f.read(4)) + assert magic == 0 + assert type_code in type_code_info + + dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)] + + entry_count = dimensions[0] + entry_size = numpy.product(dimensions[1:]) + + b, format_string = type_code_info[type_code] + vectors = [] + for i in range(entry_count): + vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)]) + return numpy.array(vectors) + + +def mnist(out_fn): + download('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz') + download('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz') + train = _load_mnist_vectors('mnist-train.gz') + test = _load_mnist_vectors('mnist-test.gz') + write_output(train, test, out_fn, 'euclidean') + + +def fashion_mnist(out_fn): + download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', 'fashion-mnist-train.gz') + download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', 'fashion-mnist-test.gz') + train = _load_mnist_vectors('fashion-mnist-train.gz') + test = _load_mnist_vectors('fashion-mnist-test.gz') + write_output(train, test, out_fn, 
'euclidean') + + +def transform_bag_of_words(filename, n_dimensions, out_fn): + import gzip + from scipy.sparse import lil_matrix + from sklearn.feature_extraction.text import TfidfTransformer + from sklearn import random_projection + with gzip.open(filename, 'rb') as f: + file_content = f.readlines() + entries = int(file_content[0]) + words = int(file_content[1]) + file_content = file_content[3:] # strip first three entries + print("building matrix...") + A = lil_matrix((entries, words)) + for e in file_content: + doc, word, cnt = [int(v) for v in e.strip().split()] + A[doc - 1, word - 1] = cnt + print("normalizing matrix entries with tfidf...") + B = TfidfTransformer().fit_transform(A) + print("reducing dimensionality...") + C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B) + X_train, X_test = train_test_split(C) + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular') + + +def nytimes(out_fn, n_dimensions): + fn = 'nytimes_%s.txt.gz' % n_dimensions + download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn) + transform_bag_of_words(fn, n_dimensions, out_fn) + + +def random(out_fn, n_dims, n_samples, centers, distance): + import sklearn.datasets + + X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1) + X_train, X_test = train_test_split(X, test_size=0.1) + write_output(X_train, X_test, out_fn, distance) + + +def word2bits(out_fn, path, fn): + import tarfile + local_fn = fn + '.tar.gz' + url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn) + download(url, local_fn) + print('parsing vectors in %s...' 
% local_fn) + with tarfile.open(local_fn, 'r:gz') as t: + f = t.extractfile(fn) + n_words, k = [int(z) for z in next(f).strip().split()] + X = numpy.zeros((n_words, k), dtype=numpy.bool) + for i in range(n_words): + X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=numpy.bool) + + X_train, X_test = train_test_split(X, test_size=1000) + write_output(X_train, X_test, out_fn, 'hamming', 'bit') + +def sift_hamming(out_fn, fn): + import tarfile + local_fn = fn + '.tar.gz' + url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn + download(url, local_fn) + print('parsing vectors in %s...' % local_fn) + with tarfile.open(local_fn, 'r:gz') as t: + f = t.extractfile(fn) + lines = f.readlines() + X = numpy.zeros((len(lines), 256), dtype=numpy.bool) + for i, line in enumerate(lines): + X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool) + X_train, X_test = train_test_split(X, test_size = 1000) + write_output(X_train, X_test, out_fn, 'hamming', 'bit') + +def lastfm(out_fn, n_dimensions, test_size=50000): + # This tests out ANN methods for retrieval on simple matrix factorization based + # recommendation algorithms. The idea being that the query/test vectors are user factors + # and the train set are item factors from the matrix factorization model. 
+ + # Since the predictor is a dot product, we transform the factors first as described in this + # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf + # This hopefully replicates the experiments done in this post: + # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/ + + # The dataset is from "Last.fm Dataset - 360K users": + # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html + + # this requires the implicit package to generate the factors (on my desktop/gpu this only + # takes 4-5 seconds to train - but could take 1-2 minutes on a laptop) + from implicit.datasets.lastfm import get_lastfm + from implicit.approximate_als import augment_inner_product_matrix + import implicit + + # train an als model on the lastfm data + _, _, play_counts = get_lastfm() + model = implicit.als.AlternatingLeastSquares(factors=n_dimensions) + model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8)) + + # transform item factors so that each one has the same norm, and transform the user + # factors such by appending a 0 column + _, item_factors = augment_inner_product_matrix(model.item_factors) + user_factors = numpy.append(model.user_factors, + numpy.zeros((model.user_factors.shape[0], 1)), + axis=1) + + # only query the first 50k users (speeds things up signficantly without changing results) + user_factors = user_factors[:test_size] + + # after that transformation a cosine lookup will return the same results as the inner product + # on the untransformed data + write_output(item_factors, user_factors, out_fn, 'angular') + +def ecfp(out_fn, dataset_name, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + path = '../pycharm_project_426/src/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + if dataset_name.startswith('toy'): + # toy + with open(path + 
dataset_name + '_' + str(dimension) + '_training.pickle', 'rb') as handle: + X_train = pickle.load(handle, encoding='latin1') + with open(path + dataset_name + '_' + str(dimension) + '_test.pickle', 'rb') as handle: + X_test = pickle.load(handle, encoding='latin1') + X_train = numpy.asarray(X_train.toarray(), dtype) + X_test = numpy.asarray(X_test.toarray(), dtype) + else: + # Chembl + with open(path + dataset_name + '_' + str(dimension) + '.pickle', 'rb') as handle: + X = pickle.load(handle, encoding='latin1') + X = numpy.asarray(X.toarray(), dtype) + X_train, X_test = train_test_split(X, test_size=1000) + + print(X_train) + print(X_test) + write_output(X_train, X_test, out_fn, distance, type) + +def ecfp_sparse(out_fn, dataset_name, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + path = '../pycharm_project_426/src/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + with open(path + dataset_name + '_' + str(dimension) + '.pickle', 'rb') as handle: + X = pickle.load(handle, encoding='latin1') + X = X.astype(dtype) + X_train, X_test = train_test_split(X, test_size=100) + X_test = X_test.toarray() + + print(X_train) + print(X_test) + write_output(X_train, X_test, out_fn, distance, type, 1000, True) + +def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + from scipy.sparse import vstack + path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + # vertically stack sparse matrices from multiple files + test_size = 1 + if num_files==0.5: + with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + size = 2000000 + print('select %i out of %i' %(size, Y.shape[0])) + Y = Y[:size] + X_test 
= Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + else: + first = False + for i in range(num_files): + print('process ' + str(i) + ' trunk') + if first == False: + first = True + with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + if i==num_files-1: #last one + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + else: + X_train = Y + else: + with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + if i==num_files-1: #last one + X_test = Y[Y.shape[0] - test_size:] + X_train = vstack([X_train, Y[:Y.shape[0] - test_size]]) + else: + X_train = vstack([X_train, Y]) + # X_train = X_train.astype(dtype) + # X_test = X_test.astype(dtype) + + # X_train, X_test = train_test_split(X, test_size=1000) + # X_test = X_test.toarray() + # encounter memory error when calling train_test_split, for 100M + X_test = X_test.toarray() + print('finish data preparation') + + print(X_train.shape) + print(X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, 1000, True) + +def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type): + print('prepare dataset ' + dataset_name) + import pickle + from scipy.sparse import vstack + path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/' + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + # vertically stack sparse matrices from multiple files + test_size = 3 + with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle: + Y = pickle.load(handle, encoding='latin1') + size = 10000000 + print('select %i out of %i' %(size, Y.shape[0])) + Y = Y[:size] + X_test = Y[Y.shape[0] - test_size:] + X_train = Y[:Y.shape[0] - test_size] + + # make them full matrices here + X_train = X_train.toarray() + X_test 
= X_test.toarray() + print('finish data preparation') + + print(X_train.shape) + print(X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, 1000) + +DATASETS = { + 'fashion-mnist-784-euclidean': fashion_mnist, + 'gist-960-euclidean': gist, + 'glove-25-angular': lambda out_fn: glove(out_fn, 25), + 'glove-50-angular': lambda out_fn: glove(out_fn, 50), + 'glove-100-angular': lambda out_fn: glove(out_fn, 100), + 'glove-200-angular': lambda out_fn: glove(out_fn, 200), + 'mnist-784-euclidean': mnist, + 'random-xs-20-euclidean': lambda out_fn: random(out_fn, 20, 10000, 100, 'euclidean'), + 'random-s-100-euclidean': lambda out_fn: random(out_fn, 100, 100000, 1000, 'euclidean'), + 'random-xs-20-angular': lambda out_fn: random(out_fn, 20, 10000, 100, 'angular'), + 'random-s-100-angular': lambda out_fn: random(out_fn, 100, 100000, 1000, 'angular'), + 'sift-128-euclidean': sift, + 'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256), + 'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16), + 'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'), + 'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64), + 'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'), + # below are datasets Chunjiang added + 'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'int'), + 'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'int'), + 'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'), + 'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'), + 'chembl-sparse-1024-jaccard': lambda out_fn: ecfp_sparse(out_fn, 'Chembl10K', 1024, 'jaccard', 'bit'), + 'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'), + 'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), 
def pdist(a, b, metric):
    """Return the scipy distance between two points *a* and *b* under *metric*.

    ``scipy.spatial.distance.pdist`` computes all pairwise distances of its
    input; with exactly two points the single entry at index 0 is the
    a<->b distance.
    """
    return scipy_pdist([a, b], metric=metric)[0]


# Need own implementation of jaccard because numpy's implementation is
# different (it operates on boolean vectors, not sets).
def jaccard(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    By convention an empty input on either side yields 0, which also
    guards the division below against a zero denominator.
    """
    if not a or not b:
        return 0
    intersect = len(a & b)
    # Union size expressed via inclusion-exclusion; true division (the
    # original used a C-style ``(float)(...)`` cast).
    return intersect / float(len(a) + len(b) - intersect)


# Chunjiang Modified 20190216
# Map of metric name -> distance function and a validity predicate used to
# filter out sentinel results.  Each distance is "smaller is better".
metrics = {
    'hamming': {
        'distance': lambda a, b: pdist(a, b, "hamming"),
        'distance_valid': lambda a: True
    },
    # return 1 - jaccard similarity, because smaller distances are better.
    'jaccard': {
        'distance': lambda a, b: pdist(a, b, "jaccard"),
        'distance_valid': lambda a: a < 1 - 1e-5
    },
    'euclidean': {
        'distance': lambda a, b: pdist(a, b, "euclidean"),
        'distance_valid': lambda a: True
    },
    'angular': {
        'distance': lambda a, b: pdist(a, b, "cosine"),
        'distance_valid': lambda a: True
    }
}


def positive_int(s):
    """argparse type callback: parse *s* as a strictly positive integer.

    Raises ``argparse.ArgumentTypeError`` for non-numeric input, zero, or
    negative values so argparse can report a clean usage error.
    """
    try:
        i = int(s)
    except (TypeError, ValueError):
        # Also catch TypeError so non-string inputs fail gracefully
        # instead of crashing the parser.
        i = None
    if i is None or i < 1:
        raise argparse.ArgumentTypeError("%r is not a positive integer" % s)
    return i
'--list-algorithms', + help='print the names of all known algorithms and exit', + action='store_true') + parser.add_argument( + '--force', + help='''re-run algorithms even if their results already exist''', + action='store_true') + parser.add_argument( + '--runs', + metavar='COUNT', + type=positive_int, + help='run each algorithm instance %(metavar)s times and use only the best result', + default=2) + parser.add_argument( + '--timeout', + type=int, + help='Timeout (in seconds) for each individual algorithm run, or -1 if no timeout should be set', + default=-1) + parser.add_argument( + '--local', + action='store_true', + help='If set, then will run everything locally (inside the same process) rather than using Docker') + parser.add_argument( + '--batch', + action='store_true', + help='If set, algorithms get all queries at once') + parser.add_argument( + '--rq', + action='store_true', + help='If set, perform range queries') + parser.add_argument( + "--radius", + default=0.3, + type=float, + help="th range of similarity to search for") + parser.add_argument( + '--max-n-algorithms', + type=int, + help='Max number of algorithms to run (just used for testing)', + default=-1) + parser.add_argument( + '--run-disabled', + help='run algorithms that are disabled in algos.yml', + action='store_true') + + args = parser.parse_args() + if args.timeout == -1: + args.timeout = None + + if args.list_algorithms: + list_algorithms(args.definitions) + sys.exit(0) + + # Nmslib specific code + # Remove old indices stored on disk + #if os.path.exists(INDEX_DIR): + # shutil.rmtree(INDEX_DIR) + + dataset = get_dataset(args.dataset) + # adapt to sparse matrix + # dimension = len(dataset['train'][0]) # TODO(erikbern): ugly + dimension = dataset['train'].shape[1] + point_type = dataset.attrs.get('point_type', 'float') + distance = dataset.attrs['distance'] + definitions = get_definitions(args.definitions, dimension, point_type, distance, args.count) + + # Filter out, from the loaded 
definitions, all those query argument groups + # that correspond to experiments that have already been run. (This might + # mean removing a definition altogether, so we can't just use a list + # comprehension.) + filtered_definitions = [] + for definition in definitions: + query_argument_groups = definition.query_argument_groups + if not query_argument_groups: + query_argument_groups = [[]] + not_yet_run = [] + for query_arguments in query_argument_groups: + if args.rq: + fn = get_result_filename(args.dataset, + args.radius, definition, query_arguments, args.batch) + else: + fn = get_result_filename(args.dataset, + args.count, definition, query_arguments, args.batch) + if args.force or not os.path.exists(fn): + not_yet_run.append(query_arguments) + if not_yet_run: + if definition.query_argument_groups: + definition = definition._replace( + query_argument_groups = not_yet_run) + filtered_definitions.append(definition) + definitions = filtered_definitions + + random.shuffle(definitions) + + if args.algorithm: + print('running only', args.algorithm) + definitions = [d for d in definitions if d.algorithm == args.algorithm] + + if args.local: + def _test(df): + status = algorithm_status(df) + # If the module was loaded but doesn't actually have a constructor of + # the right name, then the definition is broken + assert status != InstantiationStatus.NO_CONSTRUCTOR, """\ +%s.%s(%s): error: the module '%s' does not expose the named constructor""" % (df.module, df.constructor, df.arguments, df.module) + if status == InstantiationStatus.NO_MODULE: + # If the module couldn't be loaded (presumably because of a missing + # dependency), print a warning and remove this definition from the + # list of things to be run + print("""\ +%s.%s(%s): warning: the module '%s' could not be loaded; skipping""" % (df.module, df.constructor, df.arguments, df.module)) + return False + else: + return True + definitions = [d for d in definitions if _test(d)] + + if not args.run_disabled: + if 
len([d for d in definitions if d.disabled]): + print('Not running disabled algorithms:', [d for d in definitions if d.disabled]) + definitions = [d for d in definitions if not d.disabled] + + if args.max_n_algorithms >= 0: + definitions = definitions[:args.max_n_algorithms] + + if len(definitions) == 0: + raise Exception('Nothing to run') + else: + print('Order:', definitions) + + for definition in definitions: + print(definition, '...') + + try: + if args.local: + run(definition, args.dataset, args.count, args.runs, args.batch, args.rq, args.radius) + else: + # run_docker(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius) + run_singularity(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius) + except KeyboardInterrupt: + break + except: + traceback.print_exc() diff --git a/ann_benchmarks/plotting/__init__.py b/ann_benchmarks/plotting/__init__.py new file mode 100644 index 0000000..4d4042e --- /dev/null +++ b/ann_benchmarks/plotting/__init__.py @@ -0,0 +1,2 @@ +from __future__ import absolute_import +from ann_benchmarks.plotting import * diff --git a/ann_benchmarks/plotting/metrics.py b/ann_benchmarks/plotting/metrics.py new file mode 100644 index 0000000..cdf2800 --- /dev/null +++ b/ann_benchmarks/plotting/metrics.py @@ -0,0 +1,113 @@ +from __future__ import absolute_import + +def knn(dataset_distances, run_distances, count, epsilon=1e-10): + total = len(run_distances) * count + actual = 0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + within = [d for d in found_distances[:count] if d <= true_distances[count - 1] + epsilon] + actual += len(within) + return float(actual) / float(total) + +def rangequery(dataset_distances, run_distances, radius, epsilon=1e-10): + total = 0.0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + true = [d for d in true_distances if d <= radius + epsilon] + found = [d for d in 
found_distances if d <= radius + epsilon] + print('found: ' + str(len(found)) + '/true: ' + str(len(true))) + if len(true) == 0: + if len(found) == 0: + total += 1.0 + else: + if len(found) > len(true): + print(found) + total += 1.0 + continue + total += float(len(found))/float(len(true)) + return float(total) / float(len(run_distances)) + +def epsilon(dataset_distances, run_distances, count, epsilon=0.01): + total = len(run_distances) * count + actual = 0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + within = [d for d in found_distances[:count] if d <= true_distances[count - 1] * (1 + epsilon)] + actual += len(within) + return float(actual) / float(total) + +def rel(dataset_distances, run_distances): + total_closest_distance = 0.0 + total_candidate_distance = 0.0 + for true_distances, found_distances in zip(dataset_distances, run_distances): + for rdist, cdist in zip(true_distances, found_distances): + total_closest_distance += rdist + total_candidate_distance += cdist + if total_closest_distance < 0.01: + return float("inf") + return total_candidate_distance / total_closest_distance + +def queries_per_second(queries, attrs): + return 1.0 / attrs["best_search_time"] + +def index_size(queries, attrs): + # TODO(erikbern): should replace this with peak memory usage or something + return attrs.get("index_size", 0) + +def build_time(queries, attrs): + return attrs["build_time"] + +def candidates(queries, attrs): + return attrs["candidates"] + +all_metrics = { + "k-nn": { + "description": "Recall", + "function": lambda true_distances, run_distances, run_attrs: knn(true_distances, run_distances, run_attrs["count"]), + "worst": float("-inf"), + "lim": [0.0, 1.03] + }, + "range": { + "description": "Recall", + "function": lambda true_distances, run_distances, run_attrs, radius: rangequery(true_distances, run_distances, radius), + "worst": float("-inf"), + "lim": [0.0, 1.03] + }, + "epsilon": { + "description": "Epsilon 0.01 Recall", + 
"function": lambda true_distances, run_distances, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"]), + "worst": float("-inf") + }, + "largeepsilon": { + "description": "Epsilon 0.1 Recall", + "function": lambda true_distances, run_distances, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"], 0.1), + "worst": float("-inf") + }, + "rel": { + "description": "Relative Error", + "function": lambda true_distances, run_distances, run_attrs: rel(true_distances, run_distances), + "worst": float("inf") + }, + "qps": { + "description": "Queries per second (1/s)", + "function": lambda true_distances, run_distances, run_attrs: queries_per_second(true_distances, run_attrs), + "worst": float("-inf") + }, + "build": { + "description": "Indexing time (s)", + "function": lambda true_distances, run_distances, run_attrs: build_time(true_distances, run_attrs), + "worst": float("inf") + }, + "candidates" : { + "description": "Candidates generated", + "function": lambda true_distances, run_distances, run_attrs: candidates(true_distances, run_attrs), + "worst": float("inf") + }, + "indexsize" : { + "description": "Index size (kB)", + "function": lambda true_distances, run_distances, run_attrs: index_size(true_distances, run_attrs), + "worst": float("inf") + }, + "queriessize" : { + "description": "Index size (kB)/Queries per second (s)", + "function": lambda true_distances, run_distances, run_attrs: index_size(true_distances, run_attrs) / queries_per_second(true_distances, run_attrs), + "worst": float("inf") + } +} diff --git a/ann_benchmarks/plotting/plot_variants.py b/ann_benchmarks/plotting/plot_variants.py new file mode 100644 index 0000000..7eb91d5 --- /dev/null +++ b/ann_benchmarks/plotting/plot_variants.py @@ -0,0 +1,12 @@ +from ann_benchmarks.plotting.metrics import all_metrics as metrics + +all_plot_variants = { + "recall/time" : ("k-nn", "qps"), + "recall/buildtime" : ("k-nn", "build"), + "recall/indexsize" : ("k-nn", "indexsize"), + 
"rel/time" : ("rel", "qps"), + "recall/candidates" : ("k-nn", "candidates"), + "recall/qpssize" : ("k-nn", "queriessize"), + "eps/time" : ("epsilon", "qps"), + "largeeps/time" : ("largeepsilon", "qps") +} diff --git a/ann_benchmarks/plotting/utils.py b/ann_benchmarks/plotting/utils.py new file mode 100644 index 0000000..5fd5915 --- /dev/null +++ b/ann_benchmarks/plotting/utils.py @@ -0,0 +1,115 @@ +from __future__ import absolute_import + +import os, itertools, json, numpy, pickle +from ann_benchmarks.plotting.metrics import all_metrics as metrics +import matplotlib.pyplot as plt + +def create_pointset(data, xn, yn): + xm, ym = (metrics[xn], metrics[yn]) + rev = ym["worst"] < 0 + data.sort(key=lambda t: t[-1], reverse=rev) # sort by y coordinate + + axs, ays, als = [], [], [] + # Generate Pareto frontier + xs, ys, ls = [], [], [] + last_x = xm["worst"] + comparator = \ + (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx) + for algo, algo_name, xv, yv in data: + if not xv or not yv: + continue + axs.append(xv) + ays.append(yv) + als.append(algo_name) + if comparator(xv, last_x): + last_x = xv + xs.append(xv) + ys.append(yv) + ls.append(algo_name) + return xs, ys, ls, axs, ays, als + +def compute_metrics(true_nn_distances, res, metric_1, metric_2, radius=-1): + all_results = {} + for i, (properties, run) in enumerate(res): + algo = properties['algo'] + algo_name = properties['name'] + # cache distances to avoid access to hdf5 file + run_distances = list(run['distances']) + + if metric_1 == 'range': + metric_1_value = metrics[metric_1]['function'](true_nn_distances, run_distances, properties, radius) + else: + metric_1_value = metrics[metric_1]['function'](true_nn_distances, run_distances, properties) + if metric_2 == 'range': + metric_2_value = metrics[metric_2]['function'](true_nn_distances, run_distances, properties, radius) + else: + metric_2_value = metrics[metric_2]['function'](true_nn_distances, run_distances, properties) + + print('%3d: %80s 
%12.3f %12.3f' % (i, algo_name, metric_1_value, metric_2_value)) + + all_results.setdefault(algo, []).append((algo, algo_name, metric_1_value, metric_2_value)) + + return all_results + +def compute_metrics_K(all_results, true_nn_distances, res, count, metric_1, metric_2): + for i, (properties, run) in enumerate(res): + algo = properties['algo'] + algo_name = properties['name'] + # cache distances to avoid access to hdf5 file + run_distances = list(run['distances']) + + metric_1_value = metrics[metric_1]['function'](true_nn_distances, run_distances, properties) + metric_2_value = metrics[metric_2]['function'](true_nn_distances, run_distances, properties) + + print('%3d: %80s %12.3f %12.3f' % (i, algo_name, metric_1_value, metric_2_value)) + + all_results.setdefault(algo + '-K=' + str(count), []).append((algo + str(count), algo_name, metric_1_value, metric_2_value)) + + return all_results + +def compute_all_metrics(true_nn_distances, run, properties): + algo = properties["algo"] + algo_name = properties["name"] + print('--') + print(algo_name) + results = {} + # cache distances to avoid access to hdf5 file + run_distances = list(run["distances"]) + for name, metric in metrics.items(): + v = metric["function"](true_nn_distances, run_distances, properties) + results[name] = v + if v: + print('%s: %g' % (name, v)) + return (algo, algo_name, results) + +def generate_n_colors(n): + vs = numpy.linspace(0.4, 1.0, 7) + colors = [(.9, .4, .4, 1.)] + def euclidean(a, b): + return sum((x-y)**2 for x, y in zip(a, b)) + while len(colors) < n: + new_color = max(itertools.product(vs, vs, vs), key=lambda a: min(euclidean(a, b) for b in colors)) + colors.append(new_color + (1.,)) + return colors + +def create_linestyles(unique_algorithms): + colors = dict(zip(unique_algorithms, generate_n_colors(len(unique_algorithms)))) + linestyles = dict((algo, ['--', '-.', '-', ':'][i%4]) for i, algo in enumerate(unique_algorithms)) + markerstyles = dict((algo, ['+', '<', 'o', '*', 'x'][i%5]) for 
i, algo in enumerate(unique_algorithms)) + faded = dict((algo, (r, g, b, 0.3)) for algo, (r, g, b, a) in colors.items()) + return dict((algo, (colors[algo], faded[algo], linestyles[algo], markerstyles[algo])) for algo in unique_algorithms) + +def get_up_down(metric): + if metric["worst"] == float("inf"): + return "down" + return "up" + +def get_left_right(metric): + if metric["worst"] == float("inf"): + return "left" + return "right" + +def get_plot_label(xm, ym): + return "%(xlabel)s-%(ylabel)s tradeoff - %(updown)s and to the %(leftright)s is better" % { + "xlabel" : xm["description"], "ylabel" : ym["description"], "updown" : get_up_down(ym), "leftright" : get_left_right(xm) } + diff --git a/ann_benchmarks/results.py b/ann_benchmarks/results.py new file mode 100644 index 0000000..3adbcd5 --- /dev/null +++ b/ann_benchmarks/results.py @@ -0,0 +1,77 @@ +from __future__ import absolute_import + +import h5py +import json +import os +import re + +def get_algorithm_name(name, batch_mode): + if batch_mode: + return name + "-batch" + return name + +def is_batch(name): + return "-batch" in name + +def get_result_filename(dataset=None, count=None, definition=None, query_arguments=None, batch_mode=False): + d = ['results'] + if dataset: + d.append(dataset) + if count: + d.append(str(count)) + if definition: + d.append(get_algorithm_name(definition.algorithm, batch_mode)) + d.append(re.sub(r'\W+', '_', json.dumps(definition.arguments + query_arguments, sort_keys=True)).strip('_')) + return os.path.join(*d) + +def store_results(dataset, count, definition, query_arguments, attrs, results, batch, rq): + fn = get_result_filename(dataset, count, definition, query_arguments, batch) + head, tail = os.path.split(fn) + if not os.path.isdir(head): + os.makedirs(head) + f = h5py.File(fn, 'w') + for k, v in attrs.items(): + f.attrs[k] = v + times = f.create_dataset('times', (len(results),), 'f') + if rq: + count=1000 #the maximum number of items returned + neighbors = 
f.create_dataset('neighbors', (len(results), count), 'i') + distances = f.create_dataset('distances', (len(results), count), 'f') + for i, (time, ds) in enumerate(results): + times[i] = time + if rq and count < len(ds): + neighbors[i] = [n for n, d in ds[:count]] + distances[i] = [d for n, d in ds[:count]] + else: + neighbors[i] = [n for n, d in ds] + [-1] * (count - len(ds)) + distances[i] = [d for n, d in ds] + [float('inf')] * (count - len(ds)) + #print(neighbors[i]) + #print(distances[i]) + f.close() + + +def load_all_results(dataset=None, count=None, split_batched=False, batch_mode=False): + for root, _, files in os.walk(get_result_filename(dataset, count)): + for fn in files: + try: + if split_batched and batch_mode != is_batch(root): + continue + f = h5py.File(os.path.join(root, fn)) + properties = dict(f.attrs) + # TODO Fix this properly. Sometimes the hdf5 file returns bytes + # This converts these bytes to strings before we work with them + for k in properties.keys(): + try: + properties[k]= properties[k].decode() + except: + pass + yield properties, f + f.close() + except: + pass + +def get_unique_algorithms(): + algorithms = set() + for properties, _ in load_all_results(): + algorithms.add(properties['algo']) + return algorithms diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py new file mode 100644 index 0000000..9a857ba --- /dev/null +++ b/ann_benchmarks/runner.py @@ -0,0 +1,305 @@ +from __future__ import print_function +__true_print = print + +import argparse +import datetime +import docker +import json +import multiprocessing +import numpy +import os +import psutil +import requests +import sys +import threading +import time +import subprocess + +def print(*args, **kwargs): + __true_print(*args, **kwargs) + sys.stdout.flush() + +from ann_benchmarks.datasets import get_dataset, DATASETS +from ann_benchmarks.algorithms.definitions import Definition, instantiate_algorithm, get_algorithm_name +from ann_benchmarks.distance import metrics 
+from ann_benchmarks.results import store_results + +from scipy.sparse import issparse + + +def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_count, batch, rq): + best_search_time = float('inf') + for i in range(run_count): + print('Run %d/%d...' % (i+1, run_count)) + n_items_processed = [0] # a bit dumb but can't be a scalar since of Python's scoping rules + + def single_query(v): + # special code for Risc + if "Risc" in algoname or 'DivideSkip' in algoname: + algo.pre_query(v, count) + start = time.time() + if rq: + candidates = algo.query(v, count, rq) # now count is the radius + else: + candidates = algo.query(v, count) + total = (time.time() - start) + # special code for Risc + if "Risc" in algoname or 'DivideSkip' in algoname: + candidates = algo.post_query() + if issparse(X_train): + candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0]))) + for idx in candidates] + else: + candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx]))) + for idx in candidates] + n_items_processed[0] += 1 + if n_items_processed[0] % 1000 == 0: + print('Processed %d/%d queries...' 
% (n_items_processed[0], X_test.shape[0])) + if rq==False and len(candidates) > count: + print('warning: algorithm %s returned %d results, but count is only %d)' % (algo, len(candidates), count)) + return (total, candidates) + + def batch_query(X): + start = time.time() + algo.batch_query(X, count) + total = (time.time() - start) + results = algo.get_batch_results() + # needs testing + if issparse(X_train): + candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0]))) + for idx in single_results] + for v, single_results in zip(X, results)] + else: + candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx]))) + for idx in single_results] + for v, single_results in zip(X, results)] + return [(total / float(X.shape[0]), v) for v in candidates] + + if batch: + results = batch_query(X_test) + else: + results = [single_query(x) for x in X_test] + + total_time = sum(time for time, _ in results) + total_candidates = sum(len(candidates) for _, candidates in results) + search_time = total_time / len(X_test) + avg_candidates = total_candidates / len(X_test) + best_search_time = min(best_search_time, search_time) + + verbose = hasattr(algo, "query_verbose") + attrs = { + "batch_mode": batch, + "best_search_time": best_search_time, + "candidates": avg_candidates, + "expect_extra": verbose, + "name": str(algo), + "run_count": run_count, + "distance": distance, + "count": int(count) + } + return (attrs, results) + + +def run(definition, dataset, count, run_count, batch, rq): + algo = instantiate_algorithm(definition) + assert not definition.query_argument_groups \ + or hasattr(algo, "set_query_arguments"), """\ +error: query argument groups have been specified for %s.%s(%s), but the \ +algorithm instantiated from it does not implement the set_query_arguments \ +function""" % (definition.module, definition.constructor, definition.arguments) + + D = get_dataset(dataset) + # Chunjiang modified + print('Is the train set a sparse 
matrix? %d' % issparse(D['train'][()])) + if 'sparse' not in dataset: + X_train = D['train'][()].toarray() + else: + X_train = D['train'][()] + # X_train = numpy.array(D['train']) + #X_train = X_train[:2000] + X_test = numpy.array(D['test']) + distance = D.attrs['distance'] + print('got a train set of size (%d * %d)' % X_train.shape) + print('got %d queries' % len(X_test)) + + try: + # special code for Risc + print(X_train.shape) + if 'Risc' in definition.algorithm or 'DivideSkip' in definition.algorithm: + X_train = numpy.concatenate((X_train, [numpy.ones(X_train.shape[1], dtype=numpy.bool)]), axis=0) + print(X_train.shape) + algo.pre_fit(X_train) + t0 = time.time() + index_size_before = algo.get_index_size("self") + algo.fit(X_train) + build_time = time.time() - t0 + index_size = algo.get_index_size("self") - index_size_before + print('Built index in', build_time) + print('Index size: ', index_size) + + query_argument_groups = definition.query_argument_groups + # Make sure that algorithms with no query argument groups still get run + # once by providing them with a single, empty, harmless group + if not query_argument_groups: + query_argument_groups = [[]] + + for pos, query_arguments in enumerate(query_argument_groups, 1): + print("Running query argument group %d of %d..." 
% + (pos, len(query_argument_groups))) + if query_arguments: + algo.set_query_arguments(*query_arguments) + descriptor, results = run_individual_query(definition.algorithm, algo, X_train, X_test, + distance, count, run_count, batch, rq) + descriptor["build_time"] = build_time + descriptor["index_size"] = index_size + descriptor["algo"] = get_algorithm_name(definition.algorithm, batch) + descriptor["dataset"] = dataset + store_results(dataset, count, definition, + query_arguments, descriptor, results, batch, rq) + + finally: + algo.done() + + +def run_from_cmdline(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--dataset', + choices=DATASETS.keys(), + required=True) + parser.add_argument( + '--algorithm', + required=True) + parser.add_argument( + '--module', + required=True) + parser.add_argument( + '--constructor', + required=True) + parser.add_argument( + '--count', + required=True, + type=int) + parser.add_argument( + '--runs', + required=True, + type=int) + parser.add_argument( + '--batch', + action='store_true') + parser.add_argument( + '--rq', + action='store_true') + parser.add_argument( + '--radius', + type=float) + parser.add_argument( + 'build') + parser.add_argument( + 'queries', + nargs='*', + default=[]) + args = parser.parse_args() + algo_args = json.loads(args.build) + query_args = [json.loads(q) for q in args.queries] + + definition = Definition( + algorithm=args.algorithm, + docker_tag=None, # not needed + module=args.module, + constructor=args.constructor, + arguments=algo_args, + query_argument_groups=query_args, + disabled=False + ) + if args.rq: + run(definition, args.dataset, args.radius, args.runs, args.batch, args.rq) + else: + run(definition, args.dataset, args.count, args.runs, args.batch, args.rq) + + +def run_docker(definition, dataset, count, runs, timeout, batch, rq, radius, mem_limit=None): + import colors # Think it doesn't work in Python 2 + + cmd = ['--dataset', dataset, + '--algorithm', definition.algorithm, + '--module',
definition.module, + '--constructor', definition.constructor, + '--runs', str(runs), + '--count', str(count)] + if batch: + cmd += ['--batch'] + if rq: + cmd += ['--rq', '--radius', str(radius)] + cmd.append(json.dumps(definition.arguments)) + cmd += [json.dumps(qag) for qag in definition.query_argument_groups] + print('Running command', cmd) + client = docker.from_env() + if mem_limit is None: + mem_limit = psutil.virtual_memory().available + print('Memory limit:', mem_limit) + cpu_limit = "0-%d" % (multiprocessing.cpu_count() - 1) + if not batch: + # Limit to first cpu if not in batch mode + cpu_limit = "0" + print('Running on CPUs:', cpu_limit) + + container = client.containers.run( + definition.docker_tag, + cmd, + volumes={ + os.path.abspath('ann_benchmarks'): {'bind': '/home/app/ann_benchmarks', 'mode': 'ro'}, + os.path.abspath('data'): {'bind': '/home/app/data', 'mode': 'ro'}, + os.path.abspath('results'): {'bind': '/home/app/results', 'mode': 'rw'}, + }, + cpuset_cpus=cpu_limit, + mem_limit=mem_limit, + detach=True) + + def stream_logs(): + for line in container.logs(stream=True): + print(colors.color(line.decode().rstrip(), fg='blue')) + + if sys.version_info >= (3, 0): + t = threading.Thread(target=stream_logs, daemon=True) + else: + t = threading.Thread(target=stream_logs) + t.daemon = True + t.start() + try: + exit_code = container.wait(timeout=timeout) + + # Exit if exit code + if exit_code == 0: + return + elif exit_code is not None: + print(colors.color(container.logs().decode(), fg='red')) + raise Exception('Child process raised exception %d' % exit_code) + + finally: + container.remove(force=True) +def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius, mem_limit=None): + cmd = ['--dataset', dataset, + '--algorithm', definition.algorithm, + '--module', definition.module, + '--constructor', definition.constructor, + '--runs', str(runs), + '--count', str(count)] + if batch: + cmd += ['--batch'] + if rq: + cmd += ['--rq', 
'--radius', str(radius)] + cmd.append(json.dumps(definition.arguments)) + cmd += [json.dumps(qag) for qag in definition.query_argument_groups] + print('Running command', cmd) + + strCmd = ' '.join(["'" + k + "'" for k in cmd]) + print('String of command', strCmd) + + subprocess.check_call('singularity exec ../../singularity/ann-bench-nmslib3.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-pynndescent.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-datasketch.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-sklearn.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-risc.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + #subprocess.check_call('singularity exec ../singularity/ann-bench-ngt.sif python3 run_algorithm.py %s' %(strCmd), shell=True) + diff --git a/create_dataset.py b/create_dataset.py new file mode 100644 index 0000000..b9463a8 --- /dev/null +++ b/create_dataset.py @@ -0,0 +1,12 @@ +import argparse +from ann_benchmarks.datasets import DATASETS, get_dataset_fn + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--dataset', + choices=DATASETS.keys(), + required=True) + args = parser.parse_args() + fn = get_dataset_fn(args.dataset) + DATASETS[args.dataset](fn) diff --git a/create_website.py b/create_website.py new file mode 100644 index 0000000..0d9eaa1 --- /dev/null +++ b/create_website.py @@ -0,0 +1,213 @@ +import matplotlib as mpl +mpl.use('Agg') +import argparse +import os, json, pickle, yaml +import numpy +import hashlib +from jinja2 import Environment, FileSystemLoader + +from ann_benchmarks import results +from ann_benchmarks.algorithms.definitions import get_algorithm_name +from ann_benchmarks.datasets import 
get_dataset +from ann_benchmarks.plotting.plot_variants import all_plot_variants as plot_variants +from ann_benchmarks.plotting.metrics import all_metrics as metrics +from ann_benchmarks.plotting.utils import get_plot_label, compute_metrics, compute_all_metrics, create_pointset, create_linestyles +import plot + +colors = [ + "rgba(166,206,227,1)", + "rgba(31,120,180,1)", + "rgba(178,223,138,1)", + "rgba(51,160,44,1)", + "rgba(251,154,153,1)", + "rgba(227,26,28,1)", + "rgba(253,191,111,1)", + "rgba(255,127,0,1)", + "rgba(202,178,214,1)" + ] + +point_styles = { + "o" : "circle", + "<" : "triangle", + "*" : "star", + "x" : "cross", + "+" : "rect", + } + +def convert_color(color): + r, g, b, a = color + return "rgba(%(r)d, %(g)d, %(b)d, %(a)d)" % { + "r" : r * 255, "g" : g * 255, "b" : b * 255 , "a" : a} + +def convert_linestyle(ls): + new_ls = {} + for algo in ls.keys(): + algostyle = ls[algo] + new_ls[algo] = (convert_color(algostyle[0]), convert_color(algostyle[1]), + algostyle[2], point_styles[algostyle[3]]) + return new_ls + +def get_run_desc(properties): + return "%(dataset)s_%(count)d_%(distance)s" % properties + +def get_dataset_from_desc(desc): + return desc.split("_")[0] + +def get_count_from_desc(desc): + return desc.split("_")[1] + +def get_distance_from_desc(desc): + return desc.split("_")[2] + +def get_dataset_label(desc): + return get_dataset_from_desc(desc) + " (k = " + get_count_from_desc(desc) + ")" + +def directory_path(s): + if not os.path.isdir(s): + raise argparse.ArgumentTypeError("'%s' is not a directory" % s) + return s + "/" + +def prepare_data(data, xn, yn): + """Change format from (algo, instance, dict) to (algo, instance, x, y).""" + res = [] + for algo, algo_name, result in data: + res.append((algo, algo_name, result[xn], result[yn])) + return res + +parser = argparse.ArgumentParser() +parser.add_argument( + '--plottype', + help = 'Generate only the plots specified', + nargs = '*', + choices = plot_variants.keys(), + default = 
plot_variants.keys()) +parser.add_argument( + '--outputdir', + help = 'Select output directory', + default = '.', + type=directory_path, + action = 'store') +parser.add_argument( + '--latex', + help='generates latex code for each plot', + action = 'store_true') +parser.add_argument( + '--scatter', + help='create scatterplot for data', + action = 'store_true') +args = parser.parse_args() + +def get_lines(all_data, xn, yn, render_all_points): + """ For each algorithm run on a dataset, obtain its performance curve coords.""" + plot_data = [] + for algo in sorted(all_data.keys(), key=lambda x: x.lower()): + xs, ys, ls, axs, ays, als = \ + create_pointset(prepare_data(all_data[algo], xn, yn), xn, yn) + if render_all_points: + xs, ys, ls = axs, ays, als + plot_data.append({ "name": algo, "coords" : zip(xs, ys), "labels" : ls, + "scatter" : render_all_points}) + return plot_data + +def create_plot(all_data, xn, yn, linestyle, j2_env, additional_label = "", plottype = "line"): + xm, ym = (metrics[xn], metrics[yn]) + render_all_points = plottype == "bubble" + plot_data = get_lines(all_data, xn, yn, render_all_points) + latex_code = j2_env.get_template("latex.template").\ + render(plot_data = plot_data, caption = get_plot_label(xm, ym), + xlabel = xm["description"], ylabel = ym["description"]) + plot_data = get_lines(all_data, xn, yn, render_all_points) + button_label = hashlib.sha224((get_plot_label(xm, ym) + + additional_label).encode("utf-8")).hexdigest() + return j2_env.get_template("chartjs.template").\ + render(args = args, latex_code = latex_code, button_label = button_label, + data_points = plot_data, + xlabel = xm["description"], ylabel = ym["description"], + plottype = plottype, plot_label = get_plot_label(xm, ym), + label = additional_label, linestyle = linestyle, + render_all_points = render_all_points) + +def build_detail_site(data, label_func, j2_env, linestyles, batch=False): + for (name, runs) in data.items(): + print("Building '%s'" % name) + all_runs = 
runs.keys() + label = label_func(name) + data = {"normal" : [], "scatter" : []} + + for plottype in args.plottype: + xn, yn = plot_variants[plottype] + data["normal"].append(create_plot(runs, xn, yn, convert_linestyle(linestyles), j2_env)) + if args.scatter: + data["scatter"].append(create_plot(runs, xn, yn, + convert_linestyle(linestyles), j2_env, "Scatterplot ", "bubble")) + + # create png plot for summary page + data_for_plot = {} + for k in runs.keys(): + data_for_plot[k] = prepare_data(runs[k], 'k-nn', 'qps') + plot.create_plot(data_for_plot, False, + False, True, 'k-nn', 'qps', args.outputdir + get_algorithm_name(name, batch) + ".png", + linestyles, batch) + with open(args.outputdir + get_algorithm_name(name, batch) + ".html", "w") as text_file: + text_file.write(j2_env.get_template("detail_page.html"). + render(title = label, plot_data = data, args = args, batch=batch)) + + +def build_index_site(datasets, algorithms, j2_env, file_name): + dataset_data = {'batch' : [], 'non-batch' : []} + for mode in ['batch', 'non-batch']: + distance_measures = sorted(set([get_distance_from_desc(e) for e in datasets[mode].keys()])) + sorted_datasets = sorted(set([get_dataset_from_desc(e) for e in datasets[mode].keys()])) + + for dm in distance_measures: + d = {"name" : dm.capitalize(), "entries": []} + for ds in sorted_datasets: + matching_datasets = [e for e in datasets[mode].keys() \ + if get_dataset_from_desc(e) == ds and \ + get_distance_from_desc(e) == dm] + sorted_matches = sorted(matching_datasets, \ + key = lambda e: int(get_count_from_desc(e))) + for idd in sorted_matches: + d["entries"].append({"name" : idd, "desc" : get_dataset_label(idd)}) + dataset_data[mode].append(d) + + with open(args.outputdir + "index.html", "w") as text_file: + text_file.write(j2_env.get_template("summary.html"). 
+ render(title = "ANN-Benchmarks", dataset_with_distances = dataset_data, + algorithms = algorithms, label_func=get_algorithm_name)) + +def load_all_results(): + """Read all result files and compute all metrics""" + all_runs_by_dataset = {'batch' : {}, 'non-batch': {}} + all_runs_by_algorithm = {'batch' : {}, 'non-batch' : {}} + cached_true_dist = [] + old_sdn = None + for properties, f in results.load_all_results(): + sdn = get_run_desc(properties) + if sdn != old_sdn: + dataset = get_dataset(properties["dataset"]) + cached_true_dist = list(dataset["distances"]) + old_sdn = sdn + algo = properties["algo"] + ms = compute_all_metrics(cached_true_dist, f, properties) + algo_ds = get_dataset_label(sdn) + idx = "non-batch" + if properties["batch_mode"]: + idx = "batch" + all_runs_by_algorithm[idx].setdefault(algo, {}).setdefault(algo_ds, []).append(ms) + all_runs_by_dataset[idx].setdefault(sdn, {}).setdefault(algo, []).append(ms) + + return (all_runs_by_dataset, all_runs_by_algorithm) + +j2_env = Environment(loader=FileSystemLoader("./templates/"), trim_blocks = True) +j2_env.globals.update(zip=zip, len=len) +runs_by_ds, runs_by_algo = load_all_results() +dataset_names = [get_dataset_label(x) for x in list(runs_by_ds['batch'].keys()) + list(runs_by_ds['non-batch'].keys())] +algorithm_names = list(runs_by_algo['batch'].keys()) + list(runs_by_algo['non-batch'].keys()) +linestyles = {**create_linestyles(dataset_names), **create_linestyles(algorithm_names)} + +build_detail_site(runs_by_ds['non-batch'], lambda label: get_dataset_label(label), j2_env, linestyles, False) +build_detail_site(runs_by_ds['batch'], lambda label: get_dataset_label(label), j2_env, linestyles, True) +build_detail_site(runs_by_algo['non-batch'], lambda x: x, j2_env, linestyles, False) +build_detail_site(runs_by_algo['batch'], lambda x: x, j2_env, linestyles, True) +build_index_site(runs_by_ds, runs_by_algo, j2_env, "index.html") diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 
0000000..5e7d273 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore diff --git a/docker-install/Dockerfile b/docker-install/Dockerfile new file mode 100644 index 0000000..aff9650 --- /dev/null +++ b/docker-install/Dockerfile @@ -0,0 +1,10 @@ +FROM ubuntu:latest + +RUN apt-get update +RUN apt-get install -y python3-numpy python3-scipy python3-pip build-essential git + +WORKDIR /home/app +COPY requirements.txt run_algorithm.py ./ +RUN pip3 install -rrequirements.txt + +ENTRYPOINT ["python3", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.annoy b/docker-install/Dockerfile.annoy new file mode 100644 index 0000000..e426125 --- /dev/null +++ b/docker-install/Dockerfile.annoy @@ -0,0 +1,5 @@ +FROM ann-benchmarks + +RUN git clone https://github.com/spotify/annoy +RUN cd annoy && python3 setup.py install +RUN python3 -c 'import annoy' diff --git a/docker-install/Dockerfile.datasketch b/docker-install/Dockerfile.datasketch new file mode 100644 index 0000000..d70c592 --- /dev/null +++ b/docker-install/Dockerfile.datasketch @@ -0,0 +1,4 @@ +FROM ann-benchmarks + +RUN pip3 install datasketch +RUN python3 -c 'import datasketch' diff --git a/docker-install/Dockerfile.dolphinn b/docker-install/Dockerfile.dolphinn new file mode 100644 index 0000000..4e2f7cc --- /dev/null +++ b/docker-install/Dockerfile.dolphinn @@ -0,0 +1,5 @@ +FROM ann-benchmarks + +RUN git clone https://github.com/ipsarros/DolphinnPy lib-dolphinnpy +ENV PYTHONPATH lib-dolphinnpy +RUN python3 -c 'import dolphinn' diff --git a/docker-install/Dockerfile.faiss b/docker-install/Dockerfile.faiss new file mode 100644 index 0000000..a39531a --- /dev/null +++ b/docker-install/Dockerfile.faiss @@ -0,0 +1,12 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y libopenblas-base libopenblas-dev libpython-dev python-numpy python-pip swig +RUN git clone https://github.com/facebookresearch/faiss lib-faiss +RUN cd lib-faiss && 
git checkout tags/v1.2.1 -b lib-faiss && cp example_makefiles/makefile.inc.Linux makefile.inc && make -j4 py BLASLDFLAGS=/usr/lib/x86_64-linux-gnu/libopenblas.so.0 +ENV PYTHONPATH lib-faiss + +# faiss doesn't work with python3 afaik +RUN python -c 'import faiss' +RUN pip install -r requirements.txt +RUN pip install sklearn enum34 +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.flann b/docker-install/Dockerfile.flann new file mode 100644 index 0000000..4ca2584 --- /dev/null +++ b/docker-install/Dockerfile.flann @@ -0,0 +1,10 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y cmake +RUN git clone https://github.com/mariusmuja/flann +RUN mkdir flann/build +RUN cd flann/build && cmake .. +RUN cd flann/build && make -j4 +RUN cd flann/build && make install +RUN pip3 install sklearn +RUN python3 -c 'import pyflann' diff --git a/docker-install/Dockerfile.hdidx b/docker-install/Dockerfile.hdidx new file mode 100644 index 0000000..5d533d1 --- /dev/null +++ b/docker-install/Dockerfile.hdidx @@ -0,0 +1,18 @@ +FROM ann-benchmarks + +# needed to avoid some dependencies starting interaction on the command line +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + python-opencv \ + python-numpy \ + python-pip \ + git +RUN pip install cython +RUN pip install -r requirements.txt + +RUN git clone https://github.com/hdidx/hdidx.git +RUN cd hdidx && python setup.py install + +RUN python -c 'import hdidx; a = hdidx.indexer.SHIndexer' +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.hnswlib b/docker-install/Dockerfile.hnswlib new file mode 100644 index 0000000..9648903 --- /dev/null +++ b/docker-install/Dockerfile.hnswlib @@ -0,0 +1,10 @@ +FROM ann-benchmarks + +RUN apt-get install -y python-setuptools python-pip +RUN pip3 install pybind11 numpy setuptools +RUN git clone https://github.com/nmslib/hnsw.git;cd hnsw; git checkout denorm + +RUN cd hnsw/python_bindings; python3 
setup.py install + +RUN python3 -c 'import hnswlib' + diff --git a/docker-install/Dockerfile.kgraph b/docker-install/Dockerfile.kgraph new file mode 100644 index 0000000..43c9bf3 --- /dev/null +++ b/docker-install/Dockerfile.kgraph @@ -0,0 +1,11 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y libboost-timer-dev libboost-chrono-dev libboost-program-options-dev libboost-system-dev libboost-python-dev python-numpy python-pip +RUN git clone https://github.com/aaalgo/kgraph +RUN cd kgraph && python setup.py build && python setup.py install + +# kgraph doesn't work with python3 afaik +RUN python -c 'import pykgraph' +RUN pip install -rrequirements.txt +RUN pip install enum34 +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.mrpt b/docker-install/Dockerfile.mrpt new file mode 100644 index 0000000..c14ab76 --- /dev/null +++ b/docker-install/Dockerfile.mrpt @@ -0,0 +1,6 @@ +FROM ann-benchmarks + +RUN pip3 install sklearn +#RUN pip3 install git+https://github.com/teemupitkanen/mrpt/tree/2369a9df0fd7e9774b02237253f022a55bd6f532 +RUN pip3 install git+https://github.com/chunjiangzhu/mrpt +#https://github.com/teemupitkanen/mrpt diff --git a/docker-install/Dockerfile.nearpy b/docker-install/Dockerfile.nearpy new file mode 100644 index 0000000..4411816 --- /dev/null +++ b/docker-install/Dockerfile.nearpy @@ -0,0 +1,5 @@ +FROM ann-benchmarks + +RUN apt-get install -y libhdf5-openmpi-dev cython +RUN pip3 install nearpy bitarray redis sklearn +RUN python3 -c 'import nearpy' \ No newline at end of file diff --git a/docker-install/Dockerfile.ngt b/docker-install/Dockerfile.ngt new file mode 100644 index 0000000..c0ef95c --- /dev/null +++ b/docker-install/Dockerfile.ngt @@ -0,0 +1,13 @@ +FROM ann-benchmarks + +RUN apt-get update +RUN apt-get install -y git cmake g++ python3 python3-setuptools python3-pip +RUN pip3 install wheel pybind11 +RUN git clone https://github.com/chunjiangzhu/ngt.git +RUN mkdir -p ngt/build +RUN cd ngt/build && 
cmake .. +RUN cd ngt/build && make && make install +RUN ldconfig +RUN cd ngt/python && python3 setup.py bdist_wheel +RUN pip3 install ngt/python/dist/ngt-*-linux_x86_64.whl + diff --git a/docker-install/Dockerfile.nmslib b/docker-install/Dockerfile.nmslib new file mode 100644 index 0000000..2f1bba6 --- /dev/null +++ b/docker-install/Dockerfile.nmslib @@ -0,0 +1,16 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y cmake libboost-all-dev libeigen3-dev libgsl0-dev +RUN git clone https://github.com/searchivarius/nmslib.git +RUN cd nmslib/similarity_search && cmake . -DWITH_EXTRAS=1 +RUN cd nmslib/similarity_search && make -j4 +RUN apt-get install -y python-setuptools python-pip python-numpy +RUN pip install pybind11 +RUN cd nmslib/python_bindings && python setup.py build +RUN cd nmslib/python_bindings && python setup.py install + +# nmslib doesn't work with python3 afaik +RUN python -c 'import nmslib' +RUN pip install -rrequirements.txt +RUN pip install enum34 +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.nmslib-sparse b/docker-install/Dockerfile.nmslib-sparse new file mode 100644 index 0000000..39ffc6f --- /dev/null +++ b/docker-install/Dockerfile.nmslib-sparse @@ -0,0 +1,17 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y cmake libboost-all-dev libeigen3-dev libgsl0-dev +RUN git clone https://github.com/searchivarius/nmslib.git +RUN cd nmslib/similarity_search && cmake . 
-DWITH_EXTRAS=1 +RUN cd nmslib/similarity_search && make -j4 +RUN apt-get install -y python-setuptools python-pip python-numpy +RUN pip install pybind11 +RUN cd nmslib/python_bindings && python setup.py build +RUN cd nmslib/python_bindings && python setup.py install + +# nmslib doesn't work with python3 afaik +RUN python -c 'import nmslib' +RUN pip install -rrequirements.txt +RUN pip install enum34 +RUN pip install h5sparse +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.panns b/docker-install/Dockerfile.panns new file mode 100644 index 0000000..db5438f --- /dev/null +++ b/docker-install/Dockerfile.panns @@ -0,0 +1,10 @@ +FROM ann-benchmarks + +RUN apt-get update && apt-get install -y python-pip python-numpy python-scipy +RUN pip install panns + +# panns doesn't work with python3 afaik +RUN python -c 'import panns' +RUN pip install -rrequirements.txt +RUN pip install enum34 +ENTRYPOINT ["python", "run_algorithm.py"] diff --git a/docker-install/Dockerfile.pynndescent b/docker-install/Dockerfile.pynndescent new file mode 100644 index 0000000..cc2b8fd --- /dev/null +++ b/docker-install/Dockerfile.pynndescent @@ -0,0 +1,6 @@ +FROM ann-benchmarks + +RUN pip3 install numba scikit-learn +RUN git clone https://github.com/lmcinnes/pynndescent +RUN cd pynndescent && python3 setup.py install +RUN python3 -c 'import pynndescent' diff --git a/docker-install/Dockerfile.rpforest b/docker-install/Dockerfile.rpforest new file mode 100644 index 0000000..72c1231 --- /dev/null +++ b/docker-install/Dockerfile.rpforest @@ -0,0 +1,5 @@ +FROM ann-benchmarks + +RUN git clone https://github.com/lyst/rpforest +RUN cd rpforest && python3 setup.py install +RUN python3 -c 'import rpforest' diff --git a/docker-install/Dockerfile.sklearn b/docker-install/Dockerfile.sklearn new file mode 100644 index 0000000..c61a79f --- /dev/null +++ b/docker-install/Dockerfile.sklearn @@ -0,0 +1,4 @@ +FROM ann-benchmarks + +RUN pip3 install scikit-learn +RUN python3 -c 'import 
sklearn' diff --git a/install.py b/install.py new file mode 100644 index 0000000..33b42a0 --- /dev/null +++ b/install.py @@ -0,0 +1,63 @@ +import json +import os +import argparse +import subprocess +from multiprocessing import Pool +from ann_benchmarks.main import positive_int + + +def build(library,args): + print('Building %s...' % library) + if args is not None and len(args) != 0: + q = " ".join(["--build-arg " + x.replace(" ","\\ ") for x in args]) + else: + q = "" + subprocess.check_call( + 'docker build %s\ + --rm -t ann-benchmarks-%s -f docker-install/Dockerfile.%s .' % (q, library, library), shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--proc", + default=1, + type=positive_int, + help="the number of process to build docker images") + parser.add_argument( + '--algorithm', + metavar='NAME', + help='build only the named algorithm image', + default=None) + parser.add_argument( + '--build-arg', + help='pass given args to all docker builds', + nargs="+") + args = parser.parse_args() + + print('Building base image...') + subprocess.check_call( + 'docker build \ + --rm -t ann-benchmarks -f docker-install/Dockerfile .', shell=True) + + if args.algorithm: + print('Building algorithm(%s) image...' % args.algorithm) + build(args.algorithm,args.build_arg) + elif os.getenv('LIBRARY'): + print('Building algorithm(%s) image...' % os.getenv('LIBRARY')) + build(os.getenv('LIBRARY'),args.build_arg) + else: + print('Building algorithm images... 
with (%d) processes' % args.proc) + dockerfiles = [] + for fn in os.listdir('docker-install'): + if fn.startswith('Dockerfile.'): + dockerfiles.append(fn.split('.')[-1]) + + if args.proc == 1: + [build(tag,args.build_arg) for tag in dockerfiles] + else: + pool = Pool(processes=args.proc) + pool.map(lambda x: build(x, args.build_arg), dockerfiles) + pool.close() + pool.join() diff --git a/run.py b/run.py new file mode 100644 index 0000000..adf3247 --- /dev/null +++ b/run.py @@ -0,0 +1,4 @@ +from ann_benchmarks.main import main + +main() + diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..3529ebc --- /dev/null +++ b/run.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --partition=HaswellPriority # Name of partition +#SBATCH --ntasks=1 # Request 48 CPU cores +#SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10] +#SBATCH --exclusive + +module load anaconda/5.1.0 +source activate ann_env +module purge +module load gcc/5.4.0 +module load singularity/3.1 +#python cpBuildingTime.py +#singularity exec ../singularity/ann-bench-nmslib.sif python -c 'import nmslib' +#python run.py --dataset=molport-1024-jaccard --algorithm='Hnsw(Nmslib)' # Replace with your application's commands +#python run.py --dataset=molport-1024-jaccard --algorithm='Hnsw(Nmslib)' --batch +#python run.py --dataset=molport-1024-jaccard --algorithm='SW-graph(Nmslib)' +#python run.py --dataset=molport-1024-jaccard --algorithm='VPtree(Nmslib)' +#python run.py --dataset=molport-1024-jaccard --algorithm='Pynndescent' +#python run.py --dataset=molport-1024-jaccard --algorithm='Datasketch' +#python run.py --dataset=molport-1024-jaccard --algorithm='Bruteforce' +#python run.py --dataset=molport-1024-jaccard --algorithm='Balltree(Sklearn)' +#python run.py --dataset=molport-1024-jaccard --algorithm='Risc' +#python run.py --dataset=molport-1024-jaccard --algorithm='DivideSkip' +python run.py --dataset=molport-1024-jaccard --rq --radius=0.4 --algorithm='Onng(Ngt)' 
+#python run.py --dataset=molport-1024-jaccard --algorithm='Panng(Ngt)' + +#python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' # Replace with your application's commands +#python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --batch +#python run.py --dataset=chembl-1024-jaccard --algorithm='SW-graph(Nmslib)' +#python run.py --dataset=chembl-1024-jaccard --algorithm='VPtree(Nmslib)' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Pynndescent' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Datasketch' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Bruteforce' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Balltree(Sklearn)' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Risc' +#python run.py --dataset=chembl-1024-jaccard --algorithm='DivideSkip' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Onng(Ngt)' +#python run.py --dataset=chembl-1024-jaccard --algorithm='Panng(Ngt)' + diff --git a/run_algorithm.py b/run_algorithm.py new file mode 100644 index 0000000..f1add5b --- /dev/null +++ b/run_algorithm.py @@ -0,0 +1,3 @@ +from ann_benchmarks.runner import run_from_cmdline + +run_from_cmdline() diff --git a/running.txt b/running.txt new file mode 100644 index 0000000..1612246 --- /dev/null +++ b/running.txt @@ -0,0 +1,6 @@ +2358986:c-onng +2363468:m-onng +2363449:m-bruteforce +2363450:m-balltree +2363453:m-datasketch +2363454:c-datasketch diff --git a/singularity-install/ann-bench-datasketch.def b/singularity-install/ann-bench-datasketch.def new file mode 100644 index 0000000..75b7198 --- /dev/null +++ b/singularity-install/ann-bench-datasketch.def @@ -0,0 +1,14 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +pip3 install datasketch h5sparse + +python3 -c 'import datasketch' + + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench-ngt.def b/singularity-install/ann-bench-ngt.def new file mode 100644 index 
0000000..3c5844a --- /dev/null +++ b/singularity-install/ann-bench-ngt.def @@ -0,0 +1,22 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +apt-get update +apt-get install -y git cmake g++ python3 python3-setuptools python3-pip +pip3 install wheel pybind11 +git clone https://github.com/chunjiangzhu/ngt.git +mkdir -p ngt/build +cd ngt/build && cmake .. +make && make install && cd ../../ +ldconfig +cd ngt/python && python3 setup.py bdist_wheel && cd ../../ +pip3 install ngt/python/dist/ngt-*-linux_x86_64.whl +pip3 install h5sparse + +%runscript +python3 run_algorithm.py + diff --git a/singularity-install/ann-bench-nmslib.def b/singularity-install/ann-bench-nmslib.def new file mode 100644 index 0000000..69e836c --- /dev/null +++ b/singularity-install/ann-bench-nmslib.def @@ -0,0 +1,20 @@ +Bootstrap: localimage +From: /home/cjz18001/singularity/ann-bench.sif + +%post +apt-get update && apt-get install -y cmake libboost-all-dev libeigen3-dev libgsl0-dev +git clone https://github.com/searchivarius/nmslib.git +cd nmslib/similarity_search && cmake . -DWITH_EXTRAS=1 +make -j4 && cd ../../ +apt-get install -y python-setuptools python-pip python-numpy +pip install pybind11 +cd nmslib/python_bindings && python setup.py build +python setup.py install && cd ../../ + +python -c 'import nmslib' +pip install -rrequirements.txt +pip install enum34 +pip install h5sparse + +%runscript +python run_algorithm.py diff --git a/singularity-install/ann-bench-nmslib3.def b/singularity-install/ann-bench-nmslib3.def new file mode 100644 index 0000000..2452421 --- /dev/null +++ b/singularity-install/ann-bench-nmslib3.def @@ -0,0 +1,20 @@ +Bootstrap: localimage +From: ann-bench.sif + +%post +apt-get update && apt-get install -y cmake libboost-all-dev libeigen3-dev libgsl0-dev +git clone https://github.uconn.edu/mldrugdiscovery/nmslib.git +cd nmslib/similarity_search && cmake . 
-DWITH_EXTRAS=1 +make -j4 && cd ../../ +apt-get install -y python-setuptools python-pip python-numpy +pip3 install pybind11 +cd nmslib/python_bindings && python3 setup.py build +python3 setup.py install && cd ../../ + +python3 -c 'import nmslib' +pip3 install -rrequirements.txt +pip3 install enum34 +pip3 install h5sparse + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench-pynndescent.def b/singularity-install/ann-bench-pynndescent.def new file mode 100644 index 0000000..c8b7c6f --- /dev/null +++ b/singularity-install/ann-bench-pynndescent.def @@ -0,0 +1,16 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +pip3 install numba scikit-learn h5sparse +git clone https://github.com/lmcinnes/pynndescent +cd pynndescent && python3 setup.py install && cd ../ + +python3 -c 'import pynndescent' + + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench-risc.def b/singularity-install/ann-bench-risc.def new file mode 100644 index 0000000..6245485 --- /dev/null +++ b/singularity-install/ann-bench-risc.def @@ -0,0 +1,16 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +apt-get update && apt-get install -y python3-dev swig +git clone https://github.com/chunjiangzhu/risc.git +cd risc/Code && ./build.sh && cd ../../ +pip3 install h5sparse +cd risc/Code && python3 -c 'import pyrisc' + + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench-sklearn.def b/singularity-install/ann-bench-sklearn.def new file mode 100644 index 0000000..2d354d5 --- /dev/null +++ b/singularity-install/ann-bench-sklearn.def @@ -0,0 +1,13 @@ +Bootstrap: localimage +From: ann-bench.sif + +%files +ann-bench.sif + +%post +pip3 install scikit-learn h5sparse + +python3 -c 'import sklearn' + +%runscript +python3 run_algorithm.py diff --git a/singularity-install/ann-bench.def b/singularity-install/ann-bench.def new file mode 100644 index 0000000..956abbe --- /dev/null +++ 
b/singularity-install/ann-bench.def @@ -0,0 +1,16 @@ +Bootstrap: library +From: ubuntu:16.04 + +%files +requirements.txt +run_algorithm.py + +%post +apt-get update +apt-get install -y python3-numpy python3-scipy python3-pip build-essential git + +pip3 install -rrequirements.txt + +%runscript +python3 run_algorithm.py + diff --git a/singularity-install/requirements.txt b/singularity-install/requirements.txt new file mode 100644 index 0000000..a453071 --- /dev/null +++ b/singularity-install/requirements.txt @@ -0,0 +1,10 @@ +ansicolors==1.1.8 +docker==2.6.1 +h5py==2.7.1 +matplotlib==2.1.0 +numpy==1.13.3 +pyyaml==3.12 +psutil==5.4.2 +scipy==1.0.0 +scikit-learn==0.19.1 +jinja2==2.10 diff --git a/singularity-install/run_algorithm.py b/singularity-install/run_algorithm.py new file mode 100644 index 0000000..f1add5b --- /dev/null +++ b/singularity-install/run_algorithm.py @@ -0,0 +1,3 @@ +from ann_benchmarks.runner import run_from_cmdline + +run_from_cmdline() diff --git a/templates/chartjs.template b/templates/chartjs.template new file mode 100644 index 0000000..466e355 --- /dev/null +++ b/templates/chartjs.template @@ -0,0 +1,102 @@ +

{{xlabel}}/{{ylabel}}

+
+ + +
+ {% if args.latex %} +
+
+ +
+
+ + + {% endif %} diff --git a/templates/detail_page.html b/templates/detail_page.html new file mode 100644 index 0000000..2188e15 --- /dev/null +++ b/templates/detail_page.html @@ -0,0 +1,23 @@ +{% extends "general.html" %} +{% block content %} +
+ {% for item in plot_data.keys() %} + {% if item=="normal" %} + {% if batch %} +

Plots for {{title}} in batch mode

+ {% else %} +

Plots for {{title}}

+ {% endif %} + {% elif item=="scatter" and args.scatter %} + {% if batch %} +

Scatterplots for {{title}} in batch mode

+ {% else %} +

Scatterplots for {{title}}

+ {% endif %} + {% endif %} + {% for plot in plot_data[item] %} + {{ plot }} + {% endfor %} +
+ {% endfor %} +{% endblock %} diff --git a/templates/general.html b/templates/general.html new file mode 100644 index 0000000..74ba2a6 --- /dev/null +++ b/templates/general.html @@ -0,0 +1,58 @@ + + + + + + + + {{ title }} + + + + + + + + + + + + + + + + {% block content %} {% endblock %} + +
+

Contact

+

ANN-Benchmarks has been developed by Martin Aumueller (maau@itu.dk), Erik Bernhardsson (mail@erikbern.com), and Alec Faithfull (alef@itu.dk). Please use + Github to submit your implementation or improvements.

+
+
+ + diff --git a/templates/latex.template b/templates/latex.template new file mode 100644 index 0000000..4383534 --- /dev/null +++ b/templates/latex.template @@ -0,0 +1,30 @@ + +\begin{figure} + \centering + \begin{tikzpicture} + \begin{axis}[ + xlabel={ {{xlabel}} }, + ylabel={ {{ylabel}} }, + ymode = log, + yticklabel style={/pgf/number format/fixed, + /pgf/number format/precision=3}, + legend style = { anchor=west}, + cycle list name = black white + ] + {% for algo in plot_data %} + {% if algo.scatter %} + \addplot [only marks] coordinates { + {% else %} + \addplot coordinates { + {% endif %} + {% for coord in algo.coords %} + ({{ coord[0]}}, {{ coord[1] }}) + {% endfor %} + }; + \addlegendentry{ {{algo.name}} }; + {% endfor %} + \end{axis} + \end{tikzpicture} + \caption{ {{caption}} } + \label{} +\end{figure} diff --git a/templates/summary.html b/templates/summary.html new file mode 100644 index 0000000..b07e670 --- /dev/null +++ b/templates/summary.html @@ -0,0 +1,60 @@ +{% extends "general.html" %} +{% block content %} +
+

Info

+

ANN-Benchmarks is a benchmarking environment for approximate nearest neighbor search algorithms. This website contains the current benchmarking results. Please visit http://github.com/erikbern/ann-benchmarks/ to get an overview of evaluated data sets and algorithms. Make a pull request on Github to add your own code or improvements to the + benchmarking system. +

+
+

Benchmarking Results

+

Results are split by distance measure and dataset. At the bottom, you can find an overview of an algorithm's performance on all datasets. Each dataset is annotated + by (k = ...), the number of nearest neighbors an algorithm was supposed to return. The plot shown depicts Recall (the fraction + of true nearest neighbors found, on average over all queries) against Queries per second. Clicking on a plot reveals detailed interactive plots, including + approximate recall, index size, and build time.

+ {% for type in ['non-batch', 'batch'] %} + {% if len(dataset_with_distances[type]) > 0 %} + {% if type == 'batch' %} +

Benchmarks for Batched Queries

+ {% else %} +

Benchmarks for Single Queries

+ {% endif %} + +

Results by Dataset

+ {% for distance_data in dataset_with_distances[type] %} +

Distance: {{ distance_data.name }}

+ {% for entry in distance_data.entries %} + +
+
+

{{entry.desc}}

+
+
+ +
+
+
+
+ {% endfor %} + {% endfor %} +

Results by Algorithm

+
    Algorithms: + {% for algo in algorithms[type].keys() %} +
  • {{algo}}
  • + {% endfor %} +
+ {% for algo in algorithms[type].keys()%} + +
+
+

{{algo}}

+
+
+ +
+
+
+
+ {% endfor %} + {% endif %} + {% endfor %} +{% endblock %} diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test-metrics.py b/test/test-metrics.py new file mode 100644 index 0000000..f75f0d7 --- /dev/null +++ b/test/test-metrics.py @@ -0,0 +1,63 @@ +import unittest +from ann_benchmarks.plotting.metrics import knn, queries_per_second,\ + index_size, build_time, candidates, epsilon, rel + +class TestMetrics(unittest.TestCase): + + def setUp(self): + pass + + def test_recall(self): + exact_queries = [[0.1, 0.25]] + run1 = [[]] + run2 = [[0.2, 0.3]] + run3 = [[0.2]] + run4 = [[0.2, 0.25]] + + self.assertAlmostEqual(knn(exact_queries, run1, 2), 0.0) + self.assertAlmostEqual(knn(exact_queries, run2, 2), 0.5) + self.assertAlmostEqual(knn(exact_queries, run3, 2), 0.5) + self.assertAlmostEqual(knn(exact_queries, run4, 2), 1.0) + + def test_epsilon_recall(self): + exact_queries = [[0.05, 0.08, 0.24, 0.3]] + run1 = [[]] + run2 = [[0.1, 0.2, 0.55, 0.7]] + + self.assertAlmostEqual(epsilon(exact_queries, run1, 4, 1), 0.0) + + self.assertAlmostEqual(epsilon(exact_queries, run2, 4, 0.0001), 0.5) + # distance can be off by factor (1 + 1) * 0.3 = 0.6 => recall .75 + self.assertAlmostEqual(epsilon(exact_queries, run2, 4, 1), 0.75) + # distance can be off by factor (1 + 2) * 0.3 = 0.9 => recall 1 + self.assertAlmostEqual(epsilon(exact_queries, run2, 4, 2), 1.0) + + def test_relative(self): + exact_queries = [[0.1, 0.2, 0.25, 0.3]] + run1 = [] + run2 = [[0.1, 0.2, 0.25, 0.3]] + run3 = [[0.1, 0.2, 0.55, 0.9]] + + self.assertAlmostEqual(rel(exact_queries, run1), float("inf")) + self.assertAlmostEqual(rel(exact_queries, run2), 1) + # total distance exact: 0.85, total distance run3: 1.75 + self.assertAlmostEqual(rel(exact_queries, run3), 1.75 / + 0.85) + + def test_queries_per_second(self): + self.assertAlmostEqual(queries_per_second([], {"best_search_time" : 0.01}), + 100) + + def test_index_size(self): + 
self.assertEqual(index_size([], {"index_size" : 100}), 100) + + def test_build_time(self): + self.assertEqual(build_time([], {"build_time" : 100}), 100) + + def test_candidates(self): + self.assertEqual(candidates([], {"candidates" : 10}), 10) + + +if __name__ == '__main__': + unittest.main() +