From 9a2db4c5005a12c4b0d9f37772d6eafde93aa7a0 Mon Sep 17 00:00:00 2001
From: Chun Jiang Zhu
Date: Tue, 19 May 2020 10:26:36 -0400
Subject: [PATCH 01/14] add requirements.txt

---
 requirements.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5677802
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+ansicolors==1.1.8
+docker==2.6.1
+h5py==2.7.1
+matplotlib==2.1.0
+numpy==1.13.3
+pyyaml==3.12
+psutil==5.4.2
+scipy==1.0.0
+scikit-learn==0.19.1
+jinja2==2.10
+h5sparse=0.1.0

From 05f8fcc98306469954341c01422df7cca9c4e24f Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 10:28:18 -0400
Subject: [PATCH 02/14] add install requirements.txt

---
 .DS_Store        | Bin 10244 -> 10244 bytes
 README.md        |   6 +++++-
 requirements.txt |   1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.DS_Store b/.DS_Store
index 38f525a7587d3f226046cd3ce72954d08816ae95..afb1bdc376a0c0241d3bfeb1d4203670f90495ff 100644
GIT binary patch
(binary delta for .DS_Store omitted)

diff --git a/README.md b/README.md
index dc59844..f7ec712 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,11 @@ Computational environments supported:
 1. Download and put a dataset, e.g. Chembl-1024-jaccard.hdf5, under "data" folder;
 2. Download and put a Singularity image file, e.g. "ann-bench-nmslib3.sif" under "singularity" folder.
 
-# Executions under a PC with Singularity
+# Executions under a PC through Singularity
+1. pip install -r requirements.txt
+
+2. Run your algorithm
+
 Run.py Parameters:
 
     dataset: dataset name (Required)

diff --git a/requirements.txt b/requirements.txt
index 5677802..4b8d866 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 ansicolors==1.1.8
 docker==2.6.1
+singularity==3.1.1
 h5py==2.7.1
 matplotlib==2.1.0
 numpy==1.13.3
From fc810949e98181e5f8b47e1f4e3da7469f2acdc7 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 17:05:15 -0400
Subject: [PATCH 03/14] Fix requirements.txt bug

---
 README.md                            | 1 +
 requirements.txt                     | 2 +-
 singularity-install/requirements.txt | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f7ec712..9ee284f 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,7 @@ Command Examples (for Singularity only):
 2. Create anaconda environment, and then install dependent libraries
 
     conda create -c rdkit -n ann_env rdkit python=3.5.2
+    pip install -r singularity-install/requirements.txt
 
 3. Run your algorithm scripts by SLURM shell

diff --git a/requirements.txt b/requirements.txt
index 4b8d866..244077a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,4 @@ psutil==5.4.2
 scipy==1.0.0
 scikit-learn==0.19.1
 jinja2==2.10
-h5sparse=0.1.0
+h5sparse==0.1.0

diff --git a/singularity-install/requirements.txt b/singularity-install/requirements.txt
index 5677802..dc5013c 100644
--- a/singularity-install/requirements.txt
+++ b/singularity-install/requirements.txt
@@ -8,4 +8,4 @@ psutil==5.4.2
 scipy==1.0.0
 scikit-learn==0.19.1
 jinja2==2.10
-h5sparse=0.1.0
+h5sparse==0.1.0

From 1dd99763bc63cf97e62925ecc8d713757edc86c2 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 17:06:46 -0400
Subject: [PATCH 04/14] add ann_env activation

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 9ee284f..b982b98 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,11 @@ Command Examples (for Singularity only):
 
     conda create -c rdkit -n ann_env rdkit python=3.5.2
 
+    source activate ann_env
+
     pip install -r singularity-install/requirements.txt
+
+    source deactivate
 
 3. Run your algorithm scripts by SLURM shell

From bb7cc2cb3009535a18ad4c89759fc54b6d4ecb3e Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 17:26:09 -0400
Subject: [PATCH 05/14] add uconn hpc specific code

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b982b98..08d1bfa 100644
--- a/README.md
+++ b/README.md
@@ -125,7 +125,7 @@ An example "run.sh":
 
     #!/bin/bash
     #SBATCH --ntasks=1
-
+    #SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]
 
     module load anaconda/5.1.0

From 321ef63a552bb8bc6ed7c0807e9e58fddabd15b7 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 20:48:52 -0400
Subject: [PATCH 06/14] add plotting in readme

---
 README.md | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 08d1bfa..756cbd4 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ Run.py Parameters:
         - chembl-1024-jaccard
         - molport-1024-jaccard
     algorithm: algorithm name (Required)
-        Options:
+        Choices:
         - Balltree(Sklearn)
         - Bruteforce
         - Datasketch
@@ -100,6 +100,67 @@ Command Examples (for Singularity only):
 
     python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity" --batch
 
+# Visualization of Execution Results under a PC
+1. Run python plot.py
+
+plot.py Parameters:
+
+    dataset: dataset name (Required)
+        Examples:
+        - chembl-1024-jaccard
+        - molport-1024-jaccard
+    count: the value of K for top-K nearest neighbor search
+        Default: 10
+    output/-o: the output file
+    x-axis/-x: which metric to use on the X-axis
+        Choices:
+        - k-nn: Recall for top-K nearest neighbor search (Default)
+        - range: Recall for range query
+        - qps: Queries per second (1/s)
+        - build: Indexing time (s)
+        - indexsize: Index size (kB)
+    y-axis/-y: which metric to use on the Y-axis
+        Choices:
+        - k-nn: Recall for top-K nearest neighbor search
+        - range: Recall for range query
+        - qps: Queries per second (1/s) (Default)
+        - build: Indexing time (s)
+        - indexsize: Index size (kB)
+    x-log/-X: draw the X-axis using a logarithmic scale
+        Default: False
+    y-log/-Y: draw the Y-axis using a logarithmic scale
+        Default: False
+    raw: show raw results (not just Pareto frontier) in faded colours
+        Default: False
+    batch: batch query mode
+        Default: False
+    rq: range query / threshold-based query mode
+        Default: False
+    radius: the cut-off value used in range query mode. Note that this is a distance, so to return all near neighbors with a similarity coefficient larger than 0.8, set it to 0.2.
+        Default: 0.3
+
+
+Command Examples:
+- Plot results on chembl-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/chembl-1024-jaccard-100.png". X-axis: recall. Y-axis: qps, log-scale.
+
+    python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100
+
+- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-indexsize-10.png". X-axis: recall. Y-axis: index size, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y -y=indexsize --count=10 -o=results/molport-1024-jaccard-indexsize-10
+
+- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-buildtime-10.png". X-axis: recall. Y-axis: indexing time, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y -y=build --count=10 -o=results/molport-1024-jaccard-buildtime-10
+
+- Plot batch mode results on molport-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/molport-1024-jaccard-batch-100.png". X-axis: recall. Y-axis: qps, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y --batch --count=100 -o=results/molport-1024-jaccard-batch-100
+
+- Plot batch mode results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale.
+
+    python plot.py --dataset=chembl-1024-jaccard -Y -x=range --rq --radius=0.4 -o=results/chembl-1024-jaccard-0_4
+
 # Executions under an HPC environment
 
 1. Load anaconda module
 
@@ -143,6 +204,34 @@ An example "run.sh":
 
     python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"
 
+# Visualization of Execution Results under an HPC environment
+Run your algorithm scripts by SLURM shell
+
+    sbatch plot.sh
+
+An example "plot.sh":
+
+    #!/bin/bash
+
+    #SBATCH --ntasks=1
+    #SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]
+
+
+    module load anaconda/5.1.0
+
+    source activate ann_env
+
+    module purge
+
+    module load gcc/5.4.0
+
+    module load singularity/3.1
+
+
+
+    python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100
+
+
 # Parameter tuning
 
 All algorithmic parameter settings are included in the "./algos.yaml" file.

From 6f8111b1398c5089dd4c49419542e58e374becb8 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 21:11:54 -0400
Subject: [PATCH 07/14] fix typo in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 756cbd4..2b23aaa 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ Command Examples (for Singularity only):
     python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity" --batch
 
 # Visualization of Execution Results under a PC
-1. Run python plot.py
+Run the plotting script: plot.py
 
 plot.py Parameters:

From e7330bd6d176cf305c157109a40917375031c3c5 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 21:12:44 -0400
Subject: [PATCH 08/14] Fix typo in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2b23aaa..608d204 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ Command Examples (for Singularity only):
 # Visualization of Execution Results under a PC
 Run the plotting script: plot.py
 
-plot.py Parameters:
+Plot.py Parameters:
 
     dataset: dataset name (Required)

From 513b065b90d1c788df3b221a64e08e9adae265dd Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 21:15:16 -0400
Subject: [PATCH 09/14] Fix typo in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 608d204..f45c047 100644
--- a/README.md
+++ b/README.md
@@ -157,7 +157,7 @@ Command Examples:
 
     python plot.py --dataset=molport-1024-jaccard -Y --batch --count=100 -o=results/molport-1024-jaccard-batch-100
 
-- Plot batch mode results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale.
+- Plot results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale.
 
     python plot.py --dataset=chembl-1024-jaccard -Y -x=range --rq --radius=0.4 -o=results/chembl-1024-jaccard-0_4
From b4f71b831627694a5069c0de30f54fdde020f943 Mon Sep 17 00:00:00 2001
From: Chun Jiang Zhu
Date: Fri, 5 Jun 2020 22:17:48 -0400
Subject: [PATCH 10/14] add note for using an existing dataset

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f45c047..3fdf9a0 100644
--- a/README.md
+++ b/README.md
@@ -383,7 +383,7 @@ At the beginning of the file, there is "bit:\n jaccard:\n". It means that we use
 
 Here is the process to add a custom dataset. We will use Chembl dataset and 2048-bits ECFP as example.
 
 1. Put raw sdf file, e.g. chembl_24_1.sdf.gz, under "data" folder. Note only ".sdf.gz" files are accepted. Multiple sdf files are allowed.
-2. Include the key-value pair below to DATASETS, defined at the bottom of "./ann_benchmark/datasets.py".
+2. Include the key-value pair below to the data structure DATASETS, defined at the bottom of "./ann_benchmarks/datasets.py".
 If a new fingerprint rather than ECFP is used, please define a fingerprint calculation function similar to ecfp() in the same Python file.
 
     'chembl-2048-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 2048, 'jaccard', 'bit'),
 
     python run.py --dataset=chembl-2048-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"
 
+Note: to use an existing dataset, e.g. X, one needs to make sure that the data structure DATASETS, defined at the bottom of "./ann_benchmarks/datasets.py", contains a key-value pair with key X. Otherwise, one needs to include a key-value pair with key X and an arbitrary value, e.g. "'X': gist", in DATASETS.
 
 # References
 - Omohundro, S. M. Five Balltree Construction Algorithms. _Tech. report, UC Berkeley_**1989**.
 
 - Datasketch: Big data looks small https://ekzhu.github.io/datasketch (accessed May 31, 2019).
 
-- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery. _Nucleic Acids Res._**2012**,_40_, 1100–1107.
\ No newline at end of file
+- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery. _Nucleic Acids Res._**2012**,_40_, 1100–1107.
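The "fingerprint calculation function similar to ecfp()" that step 2 above refers to is not spelled out anywhere in this series. A minimal sketch of what such a function could look like, assuming RDKit's 167-bit MACCS keys as the new fingerprint — the maccs() name and the dataset key are illustrative, not part of the repository:

```python
import glob
import gzip

import numpy
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from scipy.sparse import csr_matrix


def maccs(out_fn, dataset_name, dimension, distance, type, test_size=1000):
    # Mirror get_sparse_matrix_from_sdf()/ecfp(): read every .sdf.gz under
    # ./data and collect one fingerprint per valid molecule.
    indptr, indices, data, num_mols = [0], [], [], 0
    for path in glob.glob('./data/*.sdf.gz'):
        for mol in Chem.ForwardSDMolSupplier(gzip.open(path)):
            if mol is None:
                continue
            fp = MACCSkeys.GenMACCSKeys(mol)  # 167-bit explicit bit vector
            for i in fp.GetOnBits():
                indices.append(i)
                data.append(1)
            indptr.append(len(indices))
            num_mols += 1
    fps = csr_matrix((data, indices, indptr),
                     shape=(num_mols, dimension), dtype=numpy.bool)
    # ...then shuffle, split and call write_output() exactly as ecfp() does.

# Matching DATASETS entry (key name illustrative):
# 'chembl-maccs-jaccard': lambda out_fn: maccs(out_fn, 'Chembl', 167, 'jaccard', 'bit'),
```

As with ecfp(), the dimension argument (167 for MACCS keys) has to match the fingerprint length.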
From 7216d024e843f4bc47b170275cabdf95b08c9a0b Mon Sep 17 00:00:00 2001
From: Chun Jiang Zhu
Date: Tue, 9 Jun 2020 15:11:54 -0400
Subject: [PATCH 11/14] Comment out an unnecessary log.

---
 ann_benchmarks/algorithms/nmslib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py
index 0fbb4e2..47933f5 100644
--- a/ann_benchmarks/algorithms/nmslib.py
+++ b/ann_benchmarks/algorithms/nmslib.py
@@ -81,7 +81,7 @@ def query(self, v, n, rq=False):
         if self._nmslib_metric == 'jaccard_sparse':
             nz = numpy.nonzero(v)[0]
             v = ' '.join([str(k) for k in nz])
-            print(n)
+            #print(n)
         if rq:
             ids, distances = self._index.rangeQuery(v, n)
         else:

From c0da55da5b13bdede17eebd3a38c3425823ad794 Mon Sep 17 00:00:00 2001
From: Chun Jiang Zhu
Date: Thu, 18 Jun 2020 17:25:22 -0400
Subject: [PATCH 12/14] fix distance_valid for jaccard

---
 ann_benchmarks/distance.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ann_benchmarks/distance.py b/ann_benchmarks/distance.py
index 9228338..5f5aa07 100644
--- a/ann_benchmarks/distance.py
+++ b/ann_benchmarks/distance.py
@@ -4,7 +4,6 @@ def pdist(a, b, metric):
     return scipy_pdist([a, b], metric=metric)[0]
 
-# Need own implementation of jaccard because numpy's implementation is different
 def jaccard(a, b):
     if len(a) == 0 or len(b) == 0:
         return 0
@@ -31,7 +30,7 @@ def jaccard(a, b):
 #         'distance_valid' : lambda a: True
 #     }
 # }
-# Chunjiang Modified 20190216
+
 metrics = {
     'hamming': {
         'distance' : lambda a, b: pdist(a, b, "hamming"),
         'distance_valid' : lambda a: True
     },
     # return 1 - jaccard similarity, because smaller distances are better.
     'jaccard': {
         'distance' : lambda a, b: pdist(a, b, "jaccard"),
-        'distance_valid' : lambda a: a < 1 - 1e-5
+        'distance_valid' : lambda a: a < 1 + 1e-5
     },
     'euclidean': {
         'distance' : lambda a, b: pdist(a, b, "euclidean"),
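Why PATCH 12 moves the bound from 1 - 1e-5 to 1 + 1e-5: two fingerprints that share no bits have a Jaccard distance of exactly 1.0, and the old predicate discarded such ground-truth neighbors as invalid. A small self-contained check, with illustrative toy vectors:

```python
from scipy.spatial.distance import pdist as scipy_pdist

# Disjoint bit vectors: intersection 0, union 4 -> Jaccard distance 1.0
a = [1, 0, 1, 0]
b = [0, 1, 0, 1]
d = scipy_pdist([a, b], metric='jaccard')[0]

print(d)             # 1.0
print(d < 1 - 1e-5)  # False: the old bound rejects an exact distance of 1
print(d < 1 + 1e-5)  # True:  the new bound accepts it
```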
From 0e81a7b14834e1137587e552a0101d455cf1f3e2 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Fri, 19 Jun 2020 19:01:09 -0400
Subject: [PATCH 13/14] add map4 fingerprint

---
 .DS_Store                           | Bin 10244 -> 10244 bytes
 algos.yaml                          | 126 +++++++++++++-
 ann_benchmarks/.DS_Store            | Bin 8196 -> 8196 bytes
 ann_benchmarks/algorithms/nmslib.py |  32 +++-
 ann_benchmarks/datasets.py          | 246 +++++++++++++++++++++++-----
 ann_benchmarks/runner.py            |   2 +-
 requirements.txt                    |   1 -
 7 files changed, 348 insertions(+), 59 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index afb1bdc376a0c0241d3bfeb1d4203670f90495ff..e51d2361ba5c217b11d91dd82a50f93559957cc0 100644
GIT binary patch
(binary deltas for .DS_Store files omitted)

diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py
index 47933f5..72f3ebc 100644
--- a/ann_benchmarks/algorithms/nmslib.py
+++ b/ann_benchmarks/algorithms/nmslib.py
@@ -24,9 +24,17 @@ def matrToStrArray(sparseMatr):
             arr.sort()
             res.append(' '.join([str(k) for k in arr]))
         return res
+
+    @staticmethod
+    def intMatrToStrArray(intMatr):
+        res = []
+        for row in range(intMatr.shape[0]):
+            res.append(' '.join([str(k) for k in intMatr[row]]))
+        return res
 
-    def __init__(self, metric, method_name, index_param, query_param):
+    def __init__(self, metric, object_type, method_name, index_param, query_param):
         self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric]
+        self._object_type = object_type
         self._method_name = method_name
         self._save_index = False
         self._index_param = NmslibReuseIndex.encode(index_param)
@@ -53,11 +61,11 @@ def fit(self, X):
             # Aborted (core dumped)
             self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))
 
-        # Chunjiang modified it to "if" for jaccard
         if self._nmslib_metric == 'jaccard_sparse':
-            X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            if self._object_type == 'Byte':
+                X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X_trans = NmslibReuseIndex.intMatrToStrArray(X)
         else:
             self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
             self._index.addDataPointBatch(X)
@@ -79,9 +87,12 @@ def set_query_arguments(self, ef):
     def query(self, v, n, rq=False):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            nz = numpy.nonzero(v)[0]
-            v = ' '.join([str(k) for k in nz])
-            #print(n)
+            if self._object_type == 'Byte':
+                nz = numpy.nonzero(v)[0]
+                v = ' '.join([str(k) for k in nz])
+            else:
+                v = ' '.join([str(k) for k in v])
+
         if rq:
             ids, distances = self._index.rangeQuery(v, n)
         else:
@@ -91,7 +102,10 @@ def query(self, v, n, rq=False):
     def batch_query(self, X, n):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            if self._object_type == 'Byte':
+                X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X = NmslibReuseIndex.intMatrToStrArray(X)
         self.res = self._index.knnQueryBatch(X, n)
 
     def get_batch_results(self):
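The two branches added above feed nmslib's jaccard_sparse space, which receives each fingerprint as a whitespace-separated string. A standalone sketch of the difference between the two encodings, using a toy matrix rather than repository data:

```python
import numpy
from scipy.sparse import csr_matrix

dense = numpy.array([[0, 1, 0, 1],
                     [1, 0, 0, 0]])

# 'Byte' fingerprints (ECFP-style bit vectors): emit only the indices of
# the set bits, as matrToStrArray() does for a CSR matrix.
sparse = csr_matrix(dense)
byte_strings = []
for r in range(sparse.shape[0]):
    cols = sorted(sparse.indices[sparse.indptr[r]:sparse.indptr[r + 1]])
    byte_strings.append(' '.join(str(k) for k in cols))
# byte_strings == ['1 3', '0']

# Integer fingerprints (e.g. MAP4 minhash values): emit every entry of the
# row, as intMatrToStrArray() does.
int_strings = [' '.join(str(k) for k in row) for row in dense]
# int_strings == ['0 1 0 1', '1 0 0 0']
```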
diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
index 68728bf..2ee8551 100644
--- a/ann_benchmarks/datasets.py
+++ b/ann_benchmarks/datasets.py
@@ -42,14 +42,11 @@ def get_dataset(which):
 # Everything below this line is related to creating datasets
 # You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com
 
-def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None):
+def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None, IDS=None):
     from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
     import sklearn.neighbors
     import h5sparse
-
-    def replace_last(source_string, replace_what, replace_with):
-        head, _sep, tail = source_string.rpartition(replace_what)
-        return head + replace_with + tail
+    from scipy.sparse import issparse
 
     # store SMILES first
     if SMILES:
@@ -62,43 +59,60 @@ def replace_last(source_string, replace_what, replace_with):
         f.close()
         print('Finish.')
 
+    if IDS:
+        smile_fn = replace_last(fn, '.hdf5', '-IDS.hdf5')
+        print('Write Smiles to File %s' % smile_fn)
+        f = h5sparse.File(smile_fn, 'w')
+        dt = h5py.special_dtype(vlen=bytes)
+        asciiList = [n.encode("ascii", "ignore") for n in IDS]
+        f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList)
+        f.close()
 
-    print('Write Dataset %s' % fn)
-    f = h5sparse.File(fn, 'w')
-    f.attrs['distance'] = distance
-    f.attrs['point_type'] = point_type
-    print('train size: %9d * %4d' % train.shape)
-    print('test size:  %9d * %4d' % test.shape)
-    f.create_dataset('train', data=train)
-    f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
-    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
-    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
-
-    # use which method to compute the groundtruth
-    train = train.toarray()
-    method = 'bruteforth'
-    if method == 'balltree':
-        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
-    else:
-        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
-        bf.fit(train)
-
-    print(test)
-    for i, x in enumerate(test):
-        if i % 1 == 0:
-            print('%d/%d...' % (i, test.shape[0]))
-        if method == 'balltree':
-            dist, ind = tree.query([x], k=count)
-            neighbors[i] = ind[0]
-            distances[i] = dist[0]
-        else:
-            res = list(bf.query_with_distances(x, count))
-            res.sort(key=lambda t: t[-1])
-            neighbors[i] = [j for j, _ in res]
-            distances[i] = [d for _, d in res]
-        print(neighbors[i])
-        print(distances[i])
-    f.close()
-    print('Finish.')
+    print('Write Dataset %s' % fn)
+    f = h5sparse.File(fn, 'w')
+    f.attrs['distance'] = distance
+    f.attrs['point_type'] = point_type
+    print('train size: %9d * %4d' % train.shape)
+    print('test size:  %9d * %4d' % test.shape)
+    if issparse(train):
+        f.create_dataset('train', data=train)
+    else:
+        f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train
+    if issparse(test):
+        f.create_dataset('test', data=test)
+    else:
+        f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
+    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
+    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
+
+    # use which method to compute the groundtruth
+    if issparse(train):
+        train = train.toarray()
+    method = 'bruteforce'
+    if method == 'balltree':
+        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
+    else:
+        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
+        bf.fit(train)
+
+    print(test)
+    for i, x in enumerate(test):
+        if i % 1 == 0:
+            print('%d/%d...' % (i, test.shape[0]))
+        if method == 'balltree':
+            dist, ind = tree.query([x], k=count)
+            neighbors[i] = ind[0]
+            distances[i] = dist[0]
+        else:
+            res = list(bf.query_with_distances(x, count))
+            print(len(res))
+            res.sort(key=lambda t: t[-1])
+            neighbors[i] = [j for j, _ in res]
+            distances[i] = [d for _, d in res]
+        print(neighbors[i])
+        print(distances[i])
+    f.close()
+    print('Finish.')
 
 
 def train_test_split(X, test_size=10000):
@@ -355,7 +369,7 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool):
 
     return fps, SMILES
 
-def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool):
+def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool):
     from rdkit import Chem
     from rdkit.Chem import AllChem
     import glob
     import gzip
     from scipy.sparse import csr_matrix
 
     SMILES = []
+    IDS = []
     indptr = [0]
     indices = []
     data = []
     num_mols = 0
     file_list = glob.glob(dir + '/*.sdf.gz')
     print(file_list)
     for file in file_list:
         inf = gzip.open(file)
         suppl = Chem.ForwardSDMolSupplier(inf)
         for mol in suppl:
             if mol is None: continue
             smile = Chem.MolToSmiles(mol)
             SMILES.append(smile)
+            IDS.append(mol.GetProp("_Name"))
             fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
             for i in range(dimension):
                 if fp.GetBit(i) is True:
                     indices.append(i)
                     data.append(1)
             indptr.append(len(indices))
             num_mols += 1
+            if num_mols > 3000: break
 
     fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
     print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
 
-    return fps, SMILES
+    return fps, SMILES, IDS
 
 def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
     from sklearn.utils import shuffle
     print('prepare dataset ' + dataset_name)
 
     if type == 'bit':
         dtype = numpy.bool
     elif type == 'int':
         dtype = numpy.int
     else:
         dtype = numpy.float
 
     dir = './data'
 
-    X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
+    X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
 
     # random shuffle fingerprints and smiles at the same time
     seed = 1  # random.randint(0, 2 ** 32 - 1)
-    X, SMILES = shuffle(X, SMILES, random_state=seed)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
 
     # data split and make test data full matrix
     train_size = X.shape[0] - test_size
     X_train = X[:train_size]
     X_test = X[train_size:]
 
     print('Train data dimension: %d*%d' %X_train.shape)
     print('Test data dimension: %d*%d' %X_test.shape)
 
-    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
 
+# Molecular topological fingerprints
+def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.bool):
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    import glob
+    import gzip
+    from scipy.sparse import csr_matrix
+
+    SMILES = []
+    IDS = []
+    indptr = [0]
+    indices = []
+    data = []
+    num_mols = 0
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    print(file_list)
+    for file in file_list:
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+        for mol in suppl:
+            if mol is None: continue
+            smile = Chem.MolToSmiles(mol)
+            SMILES.append(smile)
+            IDS.append(mol.GetProp("_Name"))
+            fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension)
+            for i in range(dimension):
+                if fp.GetBit(i) is True:
+                    indices.append(i)
+                    data.append(1)
+            indptr.append(len(indices))
+            num_mols += 1
+
+    fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
+    print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
+
+    return fps, SMILES, IDS
+
+def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+    from sklearn.utils import shuffle
+    print('prepare dataset ' + dataset_name)
+
+    if type == 'bit':
+        dtype = numpy.bool
+    elif type == 'int':
+        dtype = numpy.int
+    else:
+        dtype = numpy.float
+
+    dir = './data'
+
+    X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, dtype=dtype)
+
+    # random shuffle fingerprints and smiles at the same time
+    seed = 1  # random.randint(0, 2 ** 32 - 1)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
+
+    # data split and make test data full matrix
+    train_size = X.shape[0] - test_size
+    X_train = X[:train_size]
+    X_test = X[train_size:]
+    X_test = X_test.toarray()
+    print('finish dataset preparation')
+
+    print('Train data dimension: %d*%d' %X_train.shape)
+    print('Test data dimension: %d*%d' %X_test.shape)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
+
+def sdf_2_map4(dir, dimension=1024, dtype=numpy.bool):
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    import glob
+    import gzip
+    from scipy.sparse import csr_matrix
+    from map4 import MAP4Calculator
+
+    MAP4 = MAP4Calculator(dimensions=dimension)
+
+    SMILES = []
+    IDS = []
+    fps = []
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    print(file_list)
+    for file in file_list:
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+        #mols = [x for x in suppl if x is not None]
+        mols = []
+        num_mols = 0
+        for mol in suppl:
+            if mol is None: continue
+            mols.append(mol)
+            SMILES.append(Chem.MolToSmiles(mol))
+            IDS.append(mol.GetProp("_Name"))
+            num_mols += 1
+            if num_mols == 3000:
+                fps.extend(MAP4.calculate_many(mols))
+                mols = []
+                num_mols = 0
+        if num_mols > 0:
+            fps.extend(MAP4.calculate_many(mols))
+            mols = []
+            num_mols = 0
+
+    fps = numpy.array(fps, dtype=dtype)
+    print('The dimension of the returned matrix: %d*%d' % fps.shape)
+
+    return fps, SMILES, IDS
+
+def map4(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+    from sklearn.utils import shuffle
+    from map4 import MAP4Calculator
+    print('prepare dataset ' + dataset_name)
+
+    if type == 'bit':
+        dtype = numpy.bool
+    elif type == 'int':
+        dtype = numpy.int
+    else:
+        dtype = numpy.float
+
+    dir = './data'
+
+    X, SMILES, IDS = sdf_2_map4(dir=dir, dimension=dimension, dtype=dtype)
+
+    # random shuffle fingerprints and smiles at the same time
+    seed = 1  # random.randint(0, 2 ** 32 - 1)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
+
+    # data split and make test data full matrix
+    train_size = X.shape[0] - test_size
+    X_train = X[:train_size]
+    X_test = X[train_size:]
+    print('finish dataset preparation')
+
+    print('Train data dimension: %d*%d' %X_train.shape)
+    print('Test data dimension: %d*%d' %X_test.shape)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
 
 def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type):
     print('prepare dataset ' + dataset_name)
@@ -535,5 +689,7 @@ def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type):
     'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
     'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'), - 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit') + 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), + 'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'), + 'chembl-1024-jaccard-map4': lambda out_fn: map4(out_fn, 'Chembl', 1024, 'jaccard', 'int') } diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index d51cdf5..1d3c3a4 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -113,7 +113,7 @@ def run(definition, dataset, count, run_count, batch, rq): D = get_dataset(dataset) # Chunjiang modified print('Is the train set a sparse matrix? %d' % issparse(D['train'][()])) - if 'sparse' not in dataset: + if issparse(D['train'][()]): X_train = D['train'][()].toarray() else: X_train = D['train'][()] diff --git a/requirements.txt b/requirements.txt index 244077a..dc5013c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ ansicolors==1.1.8 docker==2.6.1 -singularity==3.1.1 h5py==2.7.1 matplotlib==2.1.0 numpy==1.13.3 From e9f4436f4766ac1a989b7bf1f6e920717ac78cdb Mon Sep 17 00:00:00 2001 From: Chun Jiang Zhu Date: Fri, 19 Jun 2020 19:02:23 -0400 Subject: [PATCH 14/14] Delete nmslib_sparse.py --- ann_benchmarks/algorithms/nmslib_sparse.py | 95 ---------------------- 1 file changed, 95 deletions(-) delete mode 100644 ann_benchmarks/algorithms/nmslib_sparse.py diff --git a/ann_benchmarks/algorithms/nmslib_sparse.py b/ann_benchmarks/algorithms/nmslib_sparse.py deleted file mode 100644 index 58af969..0000000 --- a/ann_benchmarks/algorithms/nmslib_sparse.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import absolute_import -import os -import nmslib -from ann_benchmarks.constants import INDEX_DIR -from ann_benchmarks.algorithms.base import BaseANN -from scipy.sparse import csr_matrix -import numpy - - -class NmslibSparseReuseIndex(BaseANN): - @staticmethod - def encode(d): - return ["%s=%s" % (a, b) for (a, b) in d.iteritems()] - - # For each entry in the sparse matrix, extract a list of IDs and - # convert them to a string. Return a list of such strings. 
- @staticmethod - def matrToStrArray(sparseMatr): - res = [] - indptr = sparseMatr.indptr - indices = sparseMatr.indices - for row in range(sparseMatr.shape[0]): - arr = [k for k in indices[indptr[row]: indptr[row + 1]]] - arr.sort() - res.append(' '.join([str(k) for k in arr])) - return res - - def __init__(self, metric, method_name, index_param, query_param): - self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric] - self._method_name = method_name - self._save_index = False - self._index_param = NmslibSparseReuseIndex.encode(index_param) - if query_param!=False: - self._query_param = NmslibSparseReuseIndex.encode(query_param) - self.name = 'Nmslib(method_name=%s, index_param=%s, query_param=%s)' % ( - self._method_name, self._index_param, self._query_param) - else: - self._query_param = None - self.name = 'Nmslib(method_name=%s, index_param=%s)' % ( - self._method_name, self._index_param) - - self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, '_'.join(self._index_param))) - - d = os.path.dirname(self._index_name) - if not os.path.exists(d): - os.makedirs(d) - - def fit(self, X): - if self._method_name == 'vptree': - # To avoid this issue: - # terminate called after throwing an instance of 'std::runtime_error' - # what(): The data size is too small or the bucket size is too big. Select the parameters so that is NOT less than * 1000 - # Aborted (core dumped) - self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000)) - - # Chunjiang modified it to "if" for jaccard - if self._nmslib_metric == 'jaccard_sparse': - X_trans = NmslibSparseReuseIndex.matrToStrArray(X) - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING) - self._index.addDataPointBatch(X_trans) - else: - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name) - self._index.addDataPointBatch(X) - - if os.path.exists(self._index_name): - print('Loading index from file') - self._index.loadIndex(self._index_name) - else: - self._index.createIndex(self._index_param) - if self._save_index: - self._index.saveIndex(self._index_name) - if self._query_param is not None: - self._index.setQueryTimeParams(self._query_param) - - def set_query_arguments(self, ef): - if self._method_name == 'hnsw' or self._method_name == 'sw-graph': - self._index.setQueryTimeParams(["efSearch=%s"%(ef)]) - - def query(self, v, n): - # Chunjiang modified - if self._nmslib_metric == 'jaccard_sparse': - nz = numpy.nonzero(v)[0] - v = ' '.join([str(k) for k in nz]) - ids, distances = self._index.knnQuery(v, n) - return ids - - def batch_query(self, X, n): - # Chunjiang modified - if self._nmslib_metric == 'jaccard_sparse': - X = NmslibSparseReuseIndex.matrToStrArray(csr_matrix(X)) - self.res = self._index.knnQueryBatch(X, n) - - def get_batch_results(self): - return [x for x, _ in self.res] -
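With nmslib_sparse.py removed, NmslibReuseIndex from PATCH 13 is the single entry point for both fingerprint encodings, selected through the new object_type argument (anything other than 'Byte' falls through to the integer encoding). A hypothetical instantiation — a sketch only, with parameter values that are illustrative and not taken from algos.yaml:

```python
from ann_benchmarks.algorithms.nmslib import NmslibReuseIndex

# Bit fingerprints (ECFP, topological): object_type='Byte' routes fit(),
# query() and batch_query() through matrToStrArray().
byte_index = NmslibReuseIndex('jaccard', 'Byte', 'hnsw',
                              {'M': 32, 'efConstruction': 400}, False)

# Integer fingerprints (MAP4 minhashes): any other object_type selects
# intMatrToStrArray() instead.
int_index = NmslibReuseIndex('jaccard', 'Int', 'hnsw',
                             {'M': 32, 'efConstruction': 400}, False)
```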