Skip to content

update to match #1

Merged
merged 14 commits into from
Jun 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
109 changes: 104 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,19 @@ Computational environments supported:
1. Download and put a dataset, e.g. Chembl-1024-jaccard.hdf5, under "data" folder;
2. Download and put a Singularity image file, e.g. "ann-bench-nmslib3.sif" under "singularity" folder.

# Executions under a PC with Singularity
# Executions under a PC through Singularity
1. pip install -r requirements.txt

2. Run your algorithm

Run.py Parameters:

dataset: dataset name (Required)
Examples:
- chembl-1024-jaccard
- molport-1024-jaccard
algorithm: algorithm name (Required)
Options:
Choices:
- Balltree(Sklearn)
- Bruteforce
- Datasketch
Expand Down Expand Up @@ -96,6 +100,67 @@ Command Examples (for Singularity only):

python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity" --batch

# Visualization of Execution Results under a PC
Run the plotting script: plot.py

Plot.py Parameters:

dataset: dataset name (Required)
Examples:
- chembl-1024-jaccard
- molport-1024-jaccard
count: the value of K for top-K nearest neighbor search
Default: 10
output/-o: the output file
x-axis/-x: which metric to use on the X-axis
Choices:
- k-nn: Recall for top-K nearest neighbor search (Default)
- range: Recall for range query
- qps: Queries per second (1/s)
- build: Indexing time (s)
- indexsize: Index size (kB)
y-axis/-y: which metric to use on the Y-axis
Choices:
- k-nn: Recall for top-K nearest neighbor search
- range: Recall for range query
- qps: Queries per second (1/s) (Default)
- build: Indexing time (s)
- indexsize: Index size (kB)
x-log/-X: Draw the X-axis using a logarithmic scale
Default: False
y-log/-Y: draw the Y-axis using a logarithmic scale
Default: False
raw: show raw results (not just Pareto frontier) in faded colours
Default: False
batch: batch query mode
Default: False
rq: range query / threshold-based query mode
Default: False
radius: the cut-off value used in range query mode. The value is a distance, not a similarity, so to retrieve all near neighbors with a similarity coefficient larger than 0.8, set it to 0.2.
Default: 0.3


Command Examples:
- Plot results on chembl-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/chembl-1024-jaccard-100.png". X-axis: recall. Y-axis: qps, log-scale.

python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100

- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-indexsize-10.png". X-axis: recall. Y-axis: index size, log-scale.

python plot.py --dataset=molport-1024-jaccard -Y -y=indexsize --count=10 -o=results/molport-1024-jaccard-indexsize-10

- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-buildtime-10.png". X-axis: recall. Y-axis: indexing time, log-scale.

python plot.py --dataset=molport-1024-jaccard -Y -y=build --count=10 -o=results/molport-1024-jaccard-buildtime-10

- Plot batch mode results on molport-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/molport-1024-jaccard-batch-100.png". X-axis: recall. Y-axis: qps, log-scale.

python plot.py --dataset=molport-1024-jaccard -Y --batch --count=100 -o=results/molport-1024-jaccard-batch-100

- Plot results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale.

python plot.py --dataset=chembl-1024-jaccard -Y -x=range --rq --radius=0.4 -o=results/chembl-1024-jaccard-0_4

# Executions under an HPC environment

1. Load anaconda module
Expand All @@ -105,7 +170,12 @@ Command Examples (for Singularity only):
2. Create anaconda environment, and then install dependent libraries

conda create -c rdkit -n ann_env rdkit python=3.5.2

source activate ann_env

pip install -r singularity-install/requirements.txt

source deactivate ann_env

3. Run your algorithm scripts by SLURM shell

Expand All @@ -116,7 +186,7 @@ An example "run.sh":
#!/bin/bash

#SBATCH --ntasks=1

#SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]

module load anaconda/5.1.0
Expand All @@ -134,6 +204,34 @@ An example "run.sh":
python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"


# Visualization of Execution Results under an HPC environment
Run your plotting script through the SLURM shell

sbatch plot.sh

An example "plot.sh":

#!/bin/bash

#SBATCH --ntasks=1
#SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]

module load anaconda/5.1.0

source activate ann_env

module purge

module load gcc/5.4.0

module load singularity/3.1


python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100


# Parameter tuning
All algorithmic parameter settings are included in the "./algos.yaml" file.

Expand Down Expand Up @@ -285,7 +383,7 @@ At the beginning of the file, there is "bit:\n jaccard:\n". It means that we use

Here is the process to add a custom dataset. We will use Chembl dataset and 2048-bits ECFP as example.
1. Put raw sdf file, e.g. chembl_24_1.sdf.gz, under "data" folder. Note only ".sdf.gz" files are accepted. Multiple sdf files are allowed.
2. Include the key-value pair below to DATASETS, defined at the bottom of "./ann_benchmark/datasets.py".
2. Include the key-value pair below in the data structure DATASETS, defined at the bottom of "./ann_benchmarks/datasets.py".
If a new fingerprint rather than ECFP is used, please define a fingerprint calculation function similar to ecfp() in the same Python file.

'chembl-2048-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 2048, 'jaccard', 'bit'),
Expand All @@ -294,6 +392,7 @@ If a new fingerprint rather than ECFP is used, please define a fingerprint calcu

python run.py --dataset=chembl-2048-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"

Note: to use an existing dataset, e.g. X, make sure that the data structure DATASETS, defined at the bottom of "./ann_benchmarks/datasets.py", contains a key-value pair with key X. If it does not, add a key-value pair with key X and an arbitrary value, e.g., "'X': gist", to DATASETS.
# References
- Omohundro, S. M. Five Balltree Construction Algorithms. _Tech. report, UC Berkeley_**1989**.

Expand All @@ -315,4 +414,4 @@ If a new fingerprint rather than ECFP is used, please define a fingerprint calcu

- Datasketch: Big data looks small https://ekzhu.github.io/datasketch (accessed May 31, 2019).

- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery. _Nucleic Acids Res._**2012**,_40_, 1100–1107.
- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery. _Nucleic Acids Res._**2012**,_40_, 1100–1107.
126 changes: 123 additions & 3 deletions algos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ bit:
singularity-tag: ann-bench-nmslib3
module: ann_benchmarks.algorithms.nmslib
constructor: NmslibReuseIndex
base-args: ["@metric", "vptree"]
base-args: ["@metric", "Byte", "vptree"]
run-groups:
base:
# When @args is a dictionary, algorithm instances will be generated
Expand All @@ -51,7 +51,7 @@ bit:
singularity-tag: ann-bench-nmslib3
module: ann_benchmarks.algorithms.nmslib
constructor: NmslibReuseIndex
base-args: ["@metric", "hnsw"]
base-args: ["@metric", "Byte", "hnsw"]
run-groups:
M-48:
arg-groups:
Expand Down Expand Up @@ -90,7 +90,7 @@ bit:
singularity-tag: ann-bench-nmslib3
module: ann_benchmarks.algorithms.nmslib
constructor: NmslibReuseIndex
base-args: ["@metric", "sw-graph"]
base-args: ["@metric", "Byte", "sw-graph"]
run-groups:
NN-96:
arg-groups:
Expand Down Expand Up @@ -186,3 +186,123 @@ bit:
run-groups:
empty:
args: []
int:
jaccard:
Bruteforce:
disabled: false
docker-tag: ann-benchmarks-sklearn
singularity-tag: ann-bench-sklearn
module: ann_benchmarks.algorithms.bruteforce
constructor: BruteForceBLAS
base-args: ["@metric"]
run-groups:
base:
args: {}
Hnsw(Nmslib):
disabled: false
docker-tag: ann-benchmarks-nmslib
singularity-tag: ann-bench-nmslib3
module: ann_benchmarks.algorithms.nmslib
constructor: NmslibReuseIndex
base-args: ["@metric", "Int", "hnsw"]
run-groups:
M-48:
arg-groups:
- {"M": 48, "post": 2, "efConstruction": 800}
- False
query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000,
1400, 1600, 2000]]
M-32:
arg-groups:
- {"M": 32, "post": 2, "efConstruction": 800}
- False
query-args: [[100, 300, 500, 700, 1000, 1500, 2000]]
M-20:
arg-groups:
- {"M": 20, "post": 0, "efConstruction": 800}
- False
query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]]
M-12:
arg-groups:
- {"M": 12, "post": 0, "efConstruction": 800}
- False
query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]]
M-5:
arg-groups:
- {"M": 5, "post": 0, "efConstruction": 10}
- False
query-args: [[1, 2, 5, 10]]
M-2:
arg-groups:
- {"M": 2, "post": 0, "efConstruction": 1}
- False
query-args: [[1, 2]]
SW-graph(Nmslib):
disabled: false
docker-tag: ann-benchmarks-nmslib
singularity-tag: ann-bench-nmslib3
module: ann_benchmarks.algorithms.nmslib
constructor: NmslibReuseIndex
base-args: ["@metric", "Int", "sw-graph"]
run-groups:
NN-96:
arg-groups:
- {"NN": 96}
- False
query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
NN-48:
arg-groups:
- {"NN": 48}
- False
query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
NN-24:
arg-groups:
- {"NN": 24}
- False
query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
NN-16:
arg-groups:
- {"NN": 16}
- False
query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
NN-10:
arg-groups:
- {"NN": 10}
- False
query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]]
NN-5:
arg-groups:
- {"NN": 5}
- False
query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]]
NN-2:
arg-groups:
- {"NN": 2}
- False
query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]]
NN-1:
arg-groups:
- {"NN": 1}
- False
query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]]
Onng(Ngt):
disabled: false
docker-tag: ann-benchmarks-ngt
singularity-tag: ann-bench-ngt
module: ann_benchmarks.algorithms.onng_ngt
constructor: ONNG
base-args: ["@metric", "Byte", 1.0]
run-groups:
onng:
args: [[100, 300, 500, 1000], [10, 30, 50, 100], [10, 30, 50, 120]]
query-args: [[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]]
Risc:
disabled: false
docker-tag: ann-benchmarks-risc
singularity-tag: ann-bench-risc
module: ann_benchmarks.algorithms.risc
constructor: Risc
base-args: ["@metric", "Risc"]
run-groups:
empty:
args: []
Binary file modified ann_benchmarks/.DS_Store
Binary file not shown.
32 changes: 23 additions & 9 deletions ann_benchmarks/algorithms/nmslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,17 @@ def matrToStrArray(sparseMatr):
arr.sort()
res.append(' '.join([str(k) for k in arr]))
return res

@staticmethod
def intMatrToStrArray(intMatr):
    """Convert a 2-D collection of values into a list of strings.

    Each row of ``intMatr`` becomes one string holding the ``str()`` of
    every element in that row, joined by single spaces, e.g. the row
    ``[1, 2, 3]`` becomes ``'1 2 3'``.

    Rows are iterated directly instead of being indexed through
    ``intMatr.shape[0]``, so any iterable of row iterables is accepted
    (a 2-D numpy array or a plain list of lists), not only objects that
    expose ``.shape``.

    :param intMatr: iterable of rows; each row is an iterable of values.
    :return: list with one space-separated string per row, in row order.
    """
    return [' '.join(str(k) for k in row) for row in intMatr]

def __init__(self, metric, method_name, index_param, query_param):
def __init__(self, metric, object_type, method_name, index_param, query_param):
self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric]
self._object_type = object_type
self._method_name = method_name
self._save_index = False
self._index_param = NmslibReuseIndex.encode(index_param)
Expand All @@ -53,11 +61,11 @@ def fit(self, X):
# Aborted (core dumped)
self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

# Chunjiang modified it to "if" for jaccard
if self._nmslib_metric == 'jaccard_sparse':
X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING)
self._index.addDataPointBatch(X_trans)
if self._object_type == 'Byte':
X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
else:
X_trans = NmslibReuseIndex.intMatrToStrArray(X)
else:
self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
self._index.addDataPointBatch(X)
Expand All @@ -79,9 +87,12 @@ def set_query_arguments(self, ef):
def query(self, v, n, rq=False):
# Chunjiang modified
if self._nmslib_metric == 'jaccard_sparse':
nz = numpy.nonzero(v)[0]
v = ' '.join([str(k) for k in nz])
print(n)
if self._object_type == 'Byte':
nz = numpy.nonzero(v)[0]
v = ' '.join([str(k) for k in nz])
else:
v = ' '.join([str(k) for k in v])

if rq:
ids, distances = self._index.rangeQuery(v, n)
else:
Expand All @@ -91,7 +102,10 @@ def query(self, v, n, rq=False):
def batch_query(self, X, n):
# Chunjiang modified
if self._nmslib_metric == 'jaccard_sparse':
X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
if self._object_type == 'Byte':
X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
else:
X = NmslibReuseIndex.intMatrToStrArray(X)
self.res = self._index.knnQueryBatch(X, n)

def get_batch_results(self):
Expand Down
Loading