Merge pull request #1 from mldrugdiscovery/master

update to match
cdb17006 · Jun 20, 2020 · 7fadc34 · 7fadc34
2 parents 6a42628 + e9f4436
commit 7fadc34
Show file tree

Hide file tree

Showing 11 changed files with 466 additions and 162 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -40,15 +40,19 @@ Computational environments supported:
 1. Download and put a dataset, e.g. Chembl-1024-jaccard.hdf5, under "data" folder;
 2. Download and put a Singularity image file, e.g. "ann-bench-nmslib3.sif" under "singularity" folder.
 
-# Executions under a PC with Singularity
+# Executions under a PC through Singularity
+1. pip install -r requirements.txt
+
+2. Run your algorithm
+
 Run.py Parameters:
 
     dataset: dataset name (Required)
 	    Examples:
 	    - chembl-1024-jaccard
 	    - molport-1024-jaccard
     algorithm: algorithm name (Required)
-	    Options:
+	    Choices:
 	    - Balltree(Sklearn)
 	    - Bruteforce
 	    - Datasketch
@@ -96,6 +100,67 @@ Command Examples (for Singularity only):
 
     python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"  --batch
 
+# Visualization of Execution Results under a PC
+Run the plotting script: plot.py
+
+Plot.py Parameters:
+
+    dataset: dataset name (Required)
+	    Examples:
+	    - chembl-1024-jaccard
+	    - molport-1024-jaccard
+    count: the value of K for top-K nearest neighbor search
+	    Default: 10
+	output/-o: the output file
+	x-axis/-x: which metric to use on the X-axis
+	    Choices:
+	    - k-nn: Recall for top-K nearest neighbor search (Default)
+	    - range: Recall for range query
+	    - qps: Queries per second (1/s)
+	    - build: Indexing time (s)
+	    - indexsize: Index size (kB)
+	y-axis/-y: which metric to use on the Y-axis
+	    Choices:
+	    - k-nn: Recall for top-K nearest neighbor search
+	    - range: Recall for range query
+	    - qps: Queries per second (1/s) (Default)
+	    - build: Indexing time (s)
+	    - indexsize: Index size (kB)
+	x-log/-X: Draw the X-axis using a logarithmic scale
+		Default: False
+	y-log/-Y: Draw the Y-axis using a logarithmic scale
+		Default: False
+	raw: show raw results (not just Pareto frontier) in faded colours
+		Default: False
+	batch: batch query mode
+		Default: False
+	rq: range query / threshold-based query mode
+		Default: False
+	radius: the cut-off value used in range query mode. It is expressed as a distance, not a similarity, so to retrieve all near neighbors with a similarity coefficient larger than 0.8, set it to 0.2.
+		Default: 0.3
+
+
+Command Examples:
+- Plot results on chembl-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/chembl-1024-jaccard-100.png". X-axis: recall. Y-axis: qps, log-scale.
+
+    python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100
+
+- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-indexsize-10.png". X-axis: recall. Y-axis: index size, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y -y=indexsize --count=10 -o=results/molport-1024-jaccard-indexsize-10
+
+- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-buildtime-10.png". X-axis: recall. Y-axis: indexing time, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y -y=build --count=10 -o=results/molport-1024-jaccard-buildtime-10
+
+- Plot batch mode results on molport-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/molport-1024-jaccard-batch-100.png". X-axis: recall. Y-axis: qps, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y --batch --count=100 -o=results/molport-1024-jaccard-batch-100
+
+- Plot results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 (i.e. distance radius 0.4) to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale.
+
+    python plot.py --dataset=chembl-1024-jaccard -Y -x=range --rq --radius=0.4 -o=results/chembl-1024-jaccard-0_4
+
 # Executions under an HPC environment
 
 1. Load anaconda module
@@ -105,7 +170,12 @@ Command Examples (for Singularity only):
 2. Create anaconda environment, and then install dependent libraries
 
     conda create -c rdkit -n ann_env rdkit python=3.5.2
+
+	source activate ann_env
+
     pip install -r singularity-install/requirements.txt
+
+	source deactivate
 
 3. Run your algorithm scripts by SLURM shell
 
@@ -116,7 +186,7 @@ An example "run.sh":
     #!/bin/bash
 
     #SBATCH --ntasks=1
-
+    #SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]
       
 
     module load anaconda/5.1.0
@@ -134,6 +204,34 @@ An example "run.sh":
     python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"  
 
 
+# Visualization of Execution Results under an HPC environment
+Run your plotting script through a SLURM shell script
+
+    sbatch plot.sh
+
+An example "plot.sh":
+
+    #!/bin/bash
+
+    #SBATCH --ntasks=1
+    #SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]
+      
+
+    module load anaconda/5.1.0
+
+    source activate ann_env
+
+    module purge
+
+    module load gcc/5.4.0
+
+    module load singularity/3.1
+
+      
+
+    python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100
+
+
 # Parameter tuning
 All algorithmic parameter settings are included in the "./algos.yaml" file.
 
@@ -285,7 +383,7 @@ At the beginning of the file, there is "bit:\n jaccard:\n". It means that we use
 
 Here is the process to add a custom dataset. We will use Chembl dataset and 2048-bits ECFP as example.
 1. Put raw sdf file, e.g. chembl_24_1.sdf.gz, under "data" folder. Note only ".sdf.gz" files are accepted. Multiple sdf files are allowed.
-2. Include the key-value pair below to DATASETS, defined at the bottom of "./ann_benchmark/datasets.py".
+2. Add the key-value pair below to the data structure DATASETS, defined at the bottom of "./ann_benchmarks/datasets.py".
 If a new fingerprint rather than ECFP is used, please define a fingerprint calculation function similar to ecfp() in the same Python file.
 
     'chembl-2048-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 2048, 'jaccard', 'bit'),
@@ -294,6 +392,7 @@ If a new fingerprint rather than ECFP is used, please define a fingerprint calcu
 
     python run.py --dataset=chembl-2048-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"
 
+Note: to use an existing dataset, e.g. X, make sure that the data structure DATASETS, defined at the bottom of "./ann_benchmarks/datasets.py", contains a key-value pair with key X. If it does not, add a key-value pair with key X and an arbitrary value, e.g., "'X': gist", to DATASETS.
 # References
 - Omohundro, S. M. Five Balltree Construction Algorithms.  _Tech. report, UC Berkeley_**1989**.
 
@@ -315,4 +414,4 @@ If a new fingerprint rather than ECFP is used, please define a fingerprint calcu
 
 - Datasketch: Big data looks small https://ekzhu.github.io/datasketch (accessed May 31, 2019).
 
-- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery.  _Nucleic Acids Res._**2012**,_40_, 1100–1107.
+- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery.  _Nucleic Acids Res._**2012**,_40_, 1100–1107.
diff --git a/algos.yaml b/algos.yaml
@@ -26,7 +26,7 @@ bit:
       singularity-tag: ann-bench-nmslib3
       module: ann_benchmarks.algorithms.nmslib
       constructor: NmslibReuseIndex
-      base-args: ["@metric", "vptree"]
+      base-args: ["@metric", "Byte", "vptree"]
       run-groups:
         base:
           # When @args is a dictionary, algorithm instances will be generated
@@ -51,7 +51,7 @@ bit:
       singularity-tag: ann-bench-nmslib3
       module: ann_benchmarks.algorithms.nmslib
       constructor: NmslibReuseIndex
-      base-args: ["@metric", "hnsw"]
+      base-args: ["@metric", "Byte", "hnsw"]
       run-groups:
         M-48:
           arg-groups:
@@ -90,7 +90,7 @@ bit:
       singularity-tag: ann-bench-nmslib3
       module: ann_benchmarks.algorithms.nmslib
       constructor: NmslibReuseIndex
-      base-args: ["@metric", "sw-graph"]
+      base-args: ["@metric", "Byte", "sw-graph"]
       run-groups:
         NN-96:
           arg-groups:
@@ -186,3 +186,123 @@ bit:
       run-groups:
         empty:
           args: []
+int:
+  jaccard:
+    Bruteforce:
+      disabled: false
+      docker-tag: ann-benchmarks-sklearn
+      singularity-tag: ann-bench-sklearn
+      module: ann_benchmarks.algorithms.bruteforce
+      constructor: BruteForceBLAS
+      base-args: ["@metric"]
+      run-groups:
+        base:
+          args: {}
+    Hnsw(Nmslib):
+      disabled: false
+      docker-tag: ann-benchmarks-nmslib
+      singularity-tag: ann-bench-nmslib3
+      module: ann_benchmarks.algorithms.nmslib
+      constructor: NmslibReuseIndex
+      base-args: ["@metric", "Int", "hnsw"]
+      run-groups:
+        M-48:
+          arg-groups:
+          - {"M": 48, "post": 2, "efConstruction": 800}
+          - False
+          query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000,
+                        1400, 1600, 2000]]
+        M-32:
+          arg-groups:
+          - {"M": 32, "post": 2, "efConstruction": 800}
+          - False
+          query-args: [[100, 300, 500, 700, 1000, 1500, 2000]]
+        M-20:
+          arg-groups:
+          - {"M": 20, "post": 0, "efConstruction": 800}
+          - False
+          query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]]
+        M-12:
+          arg-groups:
+          - {"M": 12, "post": 0, "efConstruction": 800}
+          - False
+          query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]]
+        M-5:
+          arg-groups:
+          - {"M": 5, "post": 0, "efConstruction": 10}
+          - False
+          query-args: [[1, 2, 5, 10]]
+        M-2:
+          arg-groups:
+          - {"M": 2, "post": 0, "efConstruction": 1}
+          - False
+          query-args: [[1, 2]]
+    SW-graph(Nmslib):
+      disabled: false
+      docker-tag: ann-benchmarks-nmslib
+      singularity-tag: ann-bench-nmslib3
+      module: ann_benchmarks.algorithms.nmslib
+      constructor: NmslibReuseIndex
+      base-args: ["@metric", "Int", "sw-graph"]
+      run-groups:
+        NN-96:
+          arg-groups:
+          - {"NN": 96}
+          - False
+          query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
+        NN-48:
+          arg-groups:
+          - {"NN": 48}
+          - False
+          query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
+        NN-24:
+          arg-groups:
+          - {"NN": 24}
+          - False
+          query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
+        NN-16:
+          arg-groups:
+          - {"NN": 16}
+          - False
+          query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
+        NN-10:
+          arg-groups:
+          - {"NN": 10}
+          - False
+          query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
+        NN-5:
+          arg-groups:
+          - {"NN": 5}
+          - False
+          query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]]
+        NN-2:
+          arg-groups:
+          - {"NN": 2}
+          - False
+          query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]]
+        NN-1:
+          arg-groups:
+          - {"NN": 1}
+          - False
+          query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]]
+    Onng(Ngt):
+      disabled: false
+      docker-tag: ann-benchmarks-ngt
+      singularity-tag: ann-bench-ngt
+      module: ann_benchmarks.algorithms.onng_ngt
+      constructor: ONNG
+      base-args: ["@metric", "Byte", 1.0]
+      run-groups:
+        onng:
+          args: [[100, 300, 500, 1000], [10, 30, 50, 100], [10, 30, 50, 120]]
+          query-args: [[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]]
+    Risc:
+      disabled: false
+      docker-tag: ann-benchmarks-risc
+      singularity-tag: ann-bench-risc
+      module: ann_benchmarks.algorithms.risc
+      constructor: Risc
+      base-args: ["@metric", "Risc"]
+      run-groups:
+        empty:
+          args: []
diff --git a/ann_benchmarks/.DS_Store b/ann_benchmarks/.DS_Store
diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py
@@ -24,9 +24,17 @@ def matrToStrArray(sparseMatr):
             arr.sort()
             res.append(' '.join([str(k) for k in arr]))
         return res
+
+    @staticmethod
+    def intMatrToStrArray(intMatr):
+        res = []
+        for row in range(intMatr.shape[0]):
+            res.append(' '.join([str(k) for k in intMatr[row]]))
+        return res
 
-    def __init__(self, metric, method_name, index_param, query_param):
+    def __init__(self, metric, object_type, method_name, index_param, query_param):
         self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric]
+        self._object_type = object_type
         self._method_name = method_name
         self._save_index = False
         self._index_param = NmslibReuseIndex.encode(index_param)
@@ -53,11 +61,11 @@ def fit(self, X):
             # Aborted (core dumped)
             self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))
 
-        # Chunjiang modified it to "if" for jaccard
         if self._nmslib_metric == 'jaccard_sparse':
-            X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
-            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING)
-            self._index.addDataPointBatch(X_trans)
+            if self._object_type == 'Byte':
+                X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X_trans = NmslibReuseIndex.intMatrToStrArray(X)
         else:
             self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
             self._index.addDataPointBatch(X)
@@ -79,9 +87,12 @@ def set_query_arguments(self, ef):
     def query(self, v, n, rq=False):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            nz = numpy.nonzero(v)[0]
-            v = ' '.join([str(k) for k in nz])
-        print(n)
+            if self._object_type == 'Byte':
+                nz = numpy.nonzero(v)[0]
+                v = ' '.join([str(k) for k in nz])
+            else:
+                v = ' '.join([str(k) for k in v])
+
         if rq:
             ids, distances = self._index.rangeQuery(v, n)
         else:
@@ -91,7 +102,10 @@ def query(self, v, n, rq=False):
     def batch_query(self, X, n):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            if self._object_type == 'Byte':
+                X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X = NmslibReuseIndex.intMatrToStrArray(X)
         self.res = self._index.knnQueryBatch(X, n)
 
     def get_batch_results(self):