From 9a2db4c5005a12c4b0d9f37772d6eafde93aa7a0 Mon Sep 17 00:00:00 2001
From: Chun Jiang Zhu
Date: Tue, 19 May 2020 10:26:36 -0400
Subject: [PATCH 01/14] add requirements.txt

---
 requirements.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5677802
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+ansicolors==1.1.8
+docker==2.6.1
+h5py==2.7.1
+matplotlib==2.1.0
+numpy==1.13.3
+pyyaml==3.12
+psutil==5.4.2
+scipy==1.0.0
+scikit-learn==0.19.1
+jinja2==2.10
+h5sparse=0.1.0

From 05f8fcc98306469954341c01422df7cca9c4e24f Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 10:28:18 -0400
Subject: [PATCH 02/14] add install requirements.txt

---
 .DS_Store        | Bin 10244 -> 10244 bytes
 README.md        |   6 +++++-
 requirements.txt |   1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.DS_Store b/.DS_Store
index 38f525a7587d3f226046cd3ce72954d08816ae95..afb1bdc376a0c0241d3bfeb1d4203670f90495ff 100644
GIT binary patch
(binary delta for .DS_Store omitted)

diff --git a/README.md b/README.md
index dc59844..f7ec712 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,11 @@ Computational environments supported:
 1. Download and put a dataset, e.g. Chembl-1024-jaccard.hdf5, under "data" folder;
 2. Download and put a Singularity image file, e.g. "ann-bench-nmslib3.sif" under "singularity" folder.
 
-# Executions under a PC with Singularity
+# Executions under a PC through Singularity
+1. pip install -r requirements.txt
+
+2. Run your algorithm
+
 Run.py Parameters:
 
     dataset: dataset name (Required)

diff --git a/requirements.txt b/requirements.txt
index 5677802..4b8d866 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 ansicolors==1.1.8
 docker==2.6.1
+singularity==3.1.1
 h5py==2.7.1
 matplotlib==2.1.0
 numpy==1.13.3
From fc810949e98181e5f8b47e1f4e3da7469f2acdc7 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 17:05:15 -0400
Subject: [PATCH 03/14] Fix requirements.txt bug

---
 README.md                            | 1 +
 requirements.txt                     | 2 +-
 singularity-install/requirements.txt | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f7ec712..9ee284f 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,7 @@ Command Examples (for Singularity only):
 2. Create anaconda environment, and then install dependent libraries
 
     conda create -c rdkit -n ann_env rdkit python=3.5.2
+    pip install -r singularity-install/requirements.txt
 
 3. Run your algorithm scripts by SLURM shell

diff --git a/requirements.txt b/requirements.txt
index 4b8d866..244077a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,4 @@ psutil==5.4.2
 scipy==1.0.0
 scikit-learn==0.19.1
 jinja2==2.10
-h5sparse=0.1.0
+h5sparse==0.1.0

diff --git a/singularity-install/requirements.txt b/singularity-install/requirements.txt
index 5677802..dc5013c 100644
--- a/singularity-install/requirements.txt
+++ b/singularity-install/requirements.txt
@@ -8,4 +8,4 @@ psutil==5.4.2
 scipy==1.0.0
 scikit-learn==0.19.1
 jinja2==2.10
-h5sparse=0.1.0
+h5sparse==0.1.0

From 1dd99763bc63cf97e62925ecc8d713757edc86c2 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 17:06:46 -0400
Subject: [PATCH 04/14] add ann_env activation

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 9ee284f..b982b98 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,11 @@ Command Examples (for Singularity only):
 
     conda create -c rdkit -n ann_env rdkit python=3.5.2
 
+    source activate ann_env
+
     pip install -r singularity-install/requirements.txt
+
+    source deactivate
 
 3. Run your algorithm scripts by SLURM shell

From bb7cc2cb3009535a18ad4c89759fc54b6d4ecb3e Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 17:26:09 -0400
Subject: [PATCH 05/14] add uconn hpc specific code

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b982b98..08d1bfa 100644
--- a/README.md
+++ b/README.md
@@ -125,7 +125,7 @@ An example "run.sh":
 
     #!/bin/bash
     #SBATCH --ntasks=1
-
+    #SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]
 
     module load anaconda/5.1.0

From 321ef63a552bb8bc6ed7c0807e9e58fddabd15b7 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 20:48:52 -0400
Subject: [PATCH 06/14] add plotting in readme

---
 README.md | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 08d1bfa..756cbd4 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ Run.py Parameters:
         - chembl-1024-jaccard
         - molport-1024-jaccard
     algorithm: algorithm name (Required)
-        Options:
+        Choices:
         - Balltree(Sklearn)
         - Bruteforce
         - Datasketch
@@ -100,6 +100,67 @@ Command Examples (for Singularity only):
 
     python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity" --batch
 
+# Visualization of Execution Results under a PC
+1. Run python plot.py
+
+plot.py Parameters:
+
+    dataset: dataset name (Required)
+        Examples:
+        - chembl-1024-jaccard
+        - molport-1024-jaccard
+    count: the value of K for top-K nearest neighbor search
+        Default: 10
+    output/-o: the output file
+    x-axis/-x: which metric to use on the X-axis
+        Choices:
+        - k-nn: Recall for top-K nearest neighbor search (Default)
+        - range: Recall for range query
+        - qps: Queries per second (1/s)
+        - build: Indexing time (s)
+        - indexsize: Index size (kB)
+    y-axis/-y: which metric to use on the Y-axis
+        Choices:
+        - k-nn: Recall for top-K nearest neighbor search
+        - range: Recall for range query
+        - qps: Queries per second (1/s) (Default)
+        - build: Indexing time (s)
+        - indexsize: Index size (kB)
+    x-log/-X: draw the X-axis using a logarithmic scale
+        Default: False
+    y-log/-Y: draw the Y-axis using a logarithmic scale
+        Default: False
+    raw: show raw results (not just Pareto frontier) in faded colours
+        Default: False
+    batch: batch query mode
+        Default: False
+    rq: range query / threshold-based query mode
+        Default: False
+    radius: the cut-off value used in range query mode. Note that this is a distance, so to return all near neighbors with a similarity coefficient larger than 0.8, set it to 0.2.
+        Default: 0.3
+
+
+Command Examples:
+- Plot results on chembl-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/chembl-1024-jaccard-100.png". X-axis: recall. Y-axis: qps, log-scale.
+
+    python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100
+
+- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-indexsize-10.png". X-axis: recall. Y-axis: index size, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y -y=indexsize --count=10 -o=results/molport-1024-jaccard-indexsize-10
+
+- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-buildtime-10.png". X-axis: recall. Y-axis: indexing time, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y -y=build --count=10 -o=results/molport-1024-jaccard-buildtime-10
+
+- Plot batch mode results on molport-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/molport-1024-jaccard-batch-100.png". X-axis: recall. Y-axis: qps, log-scale.
+
+    python plot.py --dataset=molport-1024-jaccard -Y --batch --count=100 -o=results/molport-1024-jaccard-batch-100
+
+- Plot batch mode results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale.
+
+    python plot.py --dataset=chembl-1024-jaccard -Y -x=range --rq --radius=0.4 -o=results/chembl-1024-jaccard-0_4
+
 # Executions under an HPC environment
 
 1. Load anaconda module
 
@@ -143,6 +204,34 @@ An example "run.sh":
 
     python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"
 
+# Visualization of Execution Results under an HPC environment
+Run your algorithm scripts by SLURM shell
+
+    sbatch plot.sh
+
+An example "plot.sh":
+
+    #!/bin/bash
+
+    #SBATCH --ntasks=1
+    #SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]
+
+
+    module load anaconda/5.1.0
+
+    source activate ann_env
+
+    module purge
+
+    module load gcc/5.4.0
+
+    module load singularity/3.1
+
+
+
+    python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100
+
+
 # Parameter tuning
 
 All algorithmic parameter settings are included in the "./algos.yaml" file.

From 6f8111b1398c5089dd4c49419542e58e374becb8 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 21:11:54 -0400
Subject: [PATCH 07/14] fix typo in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 756cbd4..2b23aaa 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ Command Examples (for Singularity only):
     python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity" --batch
 
 # Visualization of Execution Results under a PC
-1. Run python plot.py
+Run the plotting script: plot.py
 
 plot.py Parameters:

From e7330bd6d176cf305c157109a40917375031c3c5 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 21:12:44 -0400
Subject: [PATCH 08/14] Fix typo in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2b23aaa..608d204 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ Command Examples (for Singularity only):
 # Visualization of Execution Results under a PC
 Run the plotting script: plot.py
 
-plot.py Parameters:
+Plot.py Parameters:
 
     dataset: dataset name (Required)

From 513b065b90d1c788df3b221a64e08e9adae265dd Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Tue, 19 May 2020 21:15:16 -0400
Subject: [PATCH 09/14] Fix typo in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 608d204..f45c047 100644
--- a/README.md
+++ b/README.md
@@ -157,7 +157,7 @@ Command Examples:
 
     python plot.py --dataset=molport-1024-jaccard -Y --batch --count=100 -o=results/molport-1024-jaccard-batch-100
 
-- Plot batch mode results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale.
+- Plot results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale.
 
     python plot.py --dataset=chembl-1024-jaccard -Y -x=range --rq --radius=0.4 -o=results/chembl-1024-jaccard-0_4
From b4f71b831627694a5069c0de30f54fdde020f943 Mon Sep 17 00:00:00 2001
From: Chun Jiang Zhu
Date: Fri, 5 Jun 2020 22:17:48 -0400
Subject: [PATCH 10/14] add note for using an existing dataset

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f45c047..3fdf9a0 100644
--- a/README.md
+++ b/README.md
@@ -383,7 +383,7 @@ At the beginning of the file, there is "bit:\n jaccard:\n". It means that we use
 
 Here is the process to add a custom dataset. We will use Chembl dataset and 2048-bits ECFP as example.
 
 1. Put raw sdf file, e.g. chembl_24_1.sdf.gz, under "data" folder. Note only ".sdf.gz" files are accepted. Multiple sdf files are allowed.
-2. Include the key-value pair below to DATASETS, defined at the bottom of "./ann_benchmark/datasets.py".
+2. Include the key-value pair below to the data structure DATASETS, defined at the bottom of "./ann_benchmarks/datasets.py".
 If a new fingerprint rather than ECFP is used, please define a fingerprint calculation function similar to ecfp() in the same Python file.
 
     'chembl-2048-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 2048, 'jaccard', 'bit'),
 
     python run.py --dataset=chembl-2048-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity"
 
+Note: to use an existing dataset, e.g. X, one needs to make sure that the data structure DATASETS, defined at the bottom of "./ann_benchmarks/datasets.py", contains a key-value pair with key X. Otherwise, one needs to include a key-value pair with key X and an arbitrary value, e.g. "'X': gist", in DATASETS.
 
 # References
 - Omohundro, S. M. Five Balltree Construction Algorithms. _Tech. report, UC Berkeley_**1989**.
 
 - Datasketch: Big data looks small https://ekzhu.github.io/datasketch (accessed May 31, 2019).
 
-- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery. _Nucleic Acids Res._**2012**,_40_, 1100–1107.
\ No newline at end of file
+- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery. _Nucleic Acids Res._**2012**,_40_, 1100–1107.
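The "fingerprint calculation function similar to ecfp()" that step 2 above refers to is not spelled out anywhere in this series. A minimal sketch of what such a function could look like, assuming RDKit's 167-bit MACCS keys as the new fingerprint — the maccs() name and the dataset key are illustrative, not part of the repository:

```python
import glob
import gzip

import numpy
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from scipy.sparse import csr_matrix


def maccs(out_fn, dataset_name, dimension, distance, type, test_size=1000):
    # Mirror get_sparse_matrix_from_sdf()/ecfp(): read every .sdf.gz under
    # ./data and collect one fingerprint per valid molecule.
    indptr, indices, data, num_mols = [0], [], [], 0
    for path in glob.glob('./data/*.sdf.gz'):
        for mol in Chem.ForwardSDMolSupplier(gzip.open(path)):
            if mol is None:
                continue
            fp = MACCSkeys.GenMACCSKeys(mol)  # 167-bit explicit bit vector
            for i in fp.GetOnBits():
                indices.append(i)
                data.append(1)
            indptr.append(len(indices))
            num_mols += 1
    fps = csr_matrix((data, indices, indptr),
                     shape=(num_mols, dimension), dtype=numpy.bool)
    # ...then shuffle, split and call write_output() exactly as ecfp() does.

# Matching DATASETS entry (key name illustrative):
# 'chembl-maccs-jaccard': lambda out_fn: maccs(out_fn, 'Chembl', 167, 'jaccard', 'bit'),
```

As with ecfp(), the dimension argument (167 for MACCS keys) has to match the fingerprint length.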
From 7216d024e843f4bc47b170275cabdf95b08c9a0b Mon Sep 17 00:00:00 2001
From: Chun Jiang Zhu
Date: Tue, 9 Jun 2020 15:11:54 -0400
Subject: [PATCH 11/14] Comment out an unnecessary log.

---
 ann_benchmarks/algorithms/nmslib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py
index 0fbb4e2..47933f5 100644
--- a/ann_benchmarks/algorithms/nmslib.py
+++ b/ann_benchmarks/algorithms/nmslib.py
@@ -81,7 +81,7 @@ def query(self, v, n, rq=False):
         if self._nmslib_metric == 'jaccard_sparse':
             nz = numpy.nonzero(v)[0]
             v = ' '.join([str(k) for k in nz])
-            print(n)
+            #print(n)
         if rq:
             ids, distances = self._index.rangeQuery(v, n)
         else:

From c0da55da5b13bdede17eebd3a38c3425823ad794 Mon Sep 17 00:00:00 2001
From: Chun Jiang Zhu
Date: Thu, 18 Jun 2020 17:25:22 -0400
Subject: [PATCH 12/14] fix distance_valid for jaccard

---
 ann_benchmarks/distance.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ann_benchmarks/distance.py b/ann_benchmarks/distance.py
index 9228338..5f5aa07 100644
--- a/ann_benchmarks/distance.py
+++ b/ann_benchmarks/distance.py
@@ -4,7 +4,6 @@ def pdist(a, b, metric):
     return scipy_pdist([a, b], metric=metric)[0]
 
-# Need own implementation of jaccard because numpy's implementation is different
 def jaccard(a, b):
     if len(a) == 0 or len(b) == 0:
         return 0
@@ -31,7 +30,7 @@ def jaccard(a, b):
 #         'distance_valid' : lambda a: True
 #     }
 # }
-# Chunjiang Modified 20190216
+
 metrics = {
     'hamming': {
         'distance' : lambda a, b: pdist(a, b, "hamming"),
         'distance_valid' : lambda a: True
     },
     # return 1 - jaccard similarity, because smaller distances are better.
     'jaccard': {
         'distance' : lambda a, b: pdist(a, b, "jaccard"),
-        'distance_valid' : lambda a: a < 1 - 1e-5
+        'distance_valid' : lambda a: a < 1 + 1e-5
     },
     'euclidean': {
         'distance' : lambda a, b: pdist(a, b, "euclidean"),
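Why PATCH 12 moves the bound from 1 - 1e-5 to 1 + 1e-5: two fingerprints that share no bits have a Jaccard distance of exactly 1.0, and the old predicate discarded such ground-truth neighbors as invalid. A small self-contained check, with illustrative toy vectors:

```python
from scipy.spatial.distance import pdist as scipy_pdist

# Disjoint bit vectors: intersection 0, union 4 -> Jaccard distance 1.0
a = [1, 0, 1, 0]
b = [0, 1, 0, 1]
d = scipy_pdist([a, b], metric='jaccard')[0]

print(d)             # 1.0
print(d < 1 - 1e-5)  # False: the old bound rejects an exact distance of 1
print(d < 1 + 1e-5)  # True:  the new bound accepts it
```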
From 0e81a7b14834e1137587e552a0101d455cf1f3e2 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Fri, 19 Jun 2020 19:01:09 -0400
Subject: [PATCH 13/14] add map4 fingerprint

---
 .DS_Store                           | Bin 10244 -> 10244 bytes
 algos.yaml                          | 126 +++++++++++++-
 ann_benchmarks/.DS_Store            | Bin 8196 -> 8196 bytes
 ann_benchmarks/algorithms/nmslib.py |  32 +++-
 ann_benchmarks/datasets.py          | 246 +++++++++++++++++++++++-----
 ann_benchmarks/runner.py            |   2 +-
 requirements.txt                    |   1 -
 7 files changed, 348 insertions(+), 59 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index afb1bdc376a0c0241d3bfeb1d4203670f90495ff..e51d2361ba5c217b11d91dd82a50f93559957cc0 100644
GIT binary patch
(binary deltas for .DS_Store files omitted)

diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py
index 47933f5..72f3ebc 100644
--- a/ann_benchmarks/algorithms/nmslib.py
+++ b/ann_benchmarks/algorithms/nmslib.py
@@ -24,9 +24,17 @@ def matrToStrArray(sparseMatr):
             arr.sort()
             res.append(' '.join([str(k) for k in arr]))
         return res
+
+    @staticmethod
+    def intMatrToStrArray(intMatr):
+        res = []
+        for row in range(intMatr.shape[0]):
+            res.append(' '.join([str(k) for k in intMatr[row]]))
+        return res
 
-    def __init__(self, metric, method_name, index_param, query_param):
+    def __init__(self, metric, object_type, method_name, index_param, query_param):
         self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric]
+        self._object_type = object_type
         self._method_name = method_name
         self._save_index = False
         self._index_param = NmslibReuseIndex.encode(index_param)
@@ -53,11 +61,11 @@ def fit(self, X):
             # Aborted (core dumped)
             self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))
 
-        # Chunjiang modified it to "if" for jaccard
         if self._nmslib_metric == 'jaccard_sparse':
-            X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            if self._object_type == 'Byte':
+                X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X_trans = NmslibReuseIndex.intMatrToStrArray(X)
         else:
             self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
             self._index.addDataPointBatch(X)
@@ -79,9 +87,12 @@ def set_query_arguments(self, ef):
     def query(self, v, n, rq=False):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            nz = numpy.nonzero(v)[0]
-            v = ' '.join([str(k) for k in nz])
-            #print(n)
+            if self._object_type == 'Byte':
+                nz = numpy.nonzero(v)[0]
+                v = ' '.join([str(k) for k in nz])
+            else:
+                v = ' '.join([str(k) for k in v])
+
         if rq:
             ids, distances = self._index.rangeQuery(v, n)
         else:
@@ -91,7 +102,10 @@ def query(self, v, n, rq=False):
     def batch_query(self, X, n):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            if self._object_type == 'Byte':
+                X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X = NmslibReuseIndex.intMatrToStrArray(X)
         self.res = self._index.knnQueryBatch(X, n)
 
     def get_batch_results(self):
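The two branches added above feed nmslib's jaccard_sparse space, which receives each fingerprint as a whitespace-separated string. A standalone sketch of the difference between the two encodings, using a toy matrix rather than repository data:

```python
import numpy
from scipy.sparse import csr_matrix

dense = numpy.array([[0, 1, 0, 1],
                     [1, 0, 0, 0]])

# 'Byte' fingerprints (ECFP-style bit vectors): emit only the indices of
# the set bits, as matrToStrArray() does for a CSR matrix.
sparse = csr_matrix(dense)
byte_strings = []
for r in range(sparse.shape[0]):
    cols = sorted(sparse.indices[sparse.indptr[r]:sparse.indptr[r + 1]])
    byte_strings.append(' '.join(str(k) for k in cols))
# byte_strings == ['1 3', '0']

# Integer fingerprints (e.g. MAP4 minhash values): emit every entry of the
# row, as intMatrToStrArray() does.
int_strings = [' '.join(str(k) for k in row) for row in dense]
# int_strings == ['0 1 0 1', '1 0 0 0']
```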
diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
index 68728bf..2ee8551 100644
--- a/ann_benchmarks/datasets.py
+++ b/ann_benchmarks/datasets.py
@@ -42,14 +42,11 @@ def get_dataset(which):
 # Everything below this line is related to creating datasets
 # You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com
 
-def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None):
+def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None, IDS=None):
     from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
     import sklearn.neighbors
     import h5sparse
-
-    def replace_last(source_string, replace_what, replace_with):
-        head, _sep, tail = source_string.rpartition(replace_what)
-        return head + replace_with + tail
+    from scipy.sparse import issparse
 
     # store SMILES first
     if SMILES:
@@ -62,43 +59,60 @@ def replace_last(source_string, replace_what, replace_with):
         f.close()
         print('Finish.')
 
+    if IDS:
+        smile_fn = replace_last(fn, '.hdf5', '-IDS.hdf5')
+        print('Write Smiles to File %s' % smile_fn)
+        f = h5sparse.File(smile_fn, 'w')
+        dt = h5py.special_dtype(vlen=bytes)
+        asciiList = [n.encode("ascii", "ignore") for n in IDS]
+        f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList)
+        f.close()
 
-    print('Write Dataset %s' % fn)
-    f = h5sparse.File(fn, 'w')
-    f.attrs['distance'] = distance
-    f.attrs['point_type'] = point_type
-    print('train size: %9d * %4d' % train.shape)
-    print('test size:  %9d * %4d' % test.shape)
-    f.create_dataset('train', data=train)
-    f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
-    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
-    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
-
-    # use which method to compute the groundtruth
-    train = train.toarray()
-    method = 'bruteforth'
-    if method == 'balltree':
-        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
-    else:
-        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
-        bf.fit(train)
-
-    print(test)
-    for i, x in enumerate(test):
-        if i % 1 == 0:
-            print('%d/%d...' % (i, test.shape[0]))
-        if method == 'balltree':
-            dist, ind = tree.query([x], k=count)
-            neighbors[i] = ind[0]
-            distances[i] = dist[0]
-        else:
-            res = list(bf.query_with_distances(x, count))
-            res.sort(key=lambda t: t[-1])
-            neighbors[i] = [j for j, _ in res]
-            distances[i] = [d for _, d in res]
-        print(neighbors[i])
-        print(distances[i])
-    f.close()
-    print('Finish.')
+    print('Write Dataset %s' % fn)
+    f = h5sparse.File(fn, 'w')
+    f.attrs['distance'] = distance
+    f.attrs['point_type'] = point_type
+    print('train size: %9d * %4d' % train.shape)
+    print('test size:  %9d * %4d' % test.shape)
+    if issparse(train):
+        f.create_dataset('train', data=train)
+    else:
+        f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train
+    if issparse(test):
+        f.create_dataset('test', data=test)
+    else:
+        f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
+    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
+    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
+
+    # use which method to compute the groundtruth
+    if issparse(train):
+        train = train.toarray()
+    method = 'bruteforce'
+    if method == 'balltree':
+        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
+    else:
+        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
+        bf.fit(train)
+
+    print(test)
+    for i, x in enumerate(test):
+        if i % 1 == 0:
+            print('%d/%d...' % (i, test.shape[0]))
+        if method == 'balltree':
+            dist, ind = tree.query([x], k=count)
+            neighbors[i] = ind[0]
+            distances[i] = dist[0]
+        else:
+            res = list(bf.query_with_distances(x, count))
+            print(len(res))
+            res.sort(key=lambda t: t[-1])
+            neighbors[i] = [j for j, _ in res]
+            distances[i] = [d for _, d in res]
+        print(neighbors[i])
+        print(distances[i])
+    f.close()
+    print('Finish.')
 
 
 def train_test_split(X, test_size=10000):
@@ -355,7 +369,7 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool):
 
     return fps, SMILES
 
-def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool):
+def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool):
     from rdkit import Chem
     from rdkit.Chem import AllChem
     import glob
     import gzip
     from scipy.sparse import csr_matrix
 
     SMILES = []
+    IDS = []
     indptr = [0]
     indices = []
     data = []
     num_mols = 0
     file_list = glob.glob(dir + '/*.sdf.gz')
     print(file_list)
     for file in file_list:
         inf = gzip.open(file)
         suppl = Chem.ForwardSDMolSupplier(inf)
         for mol in suppl:
             if mol is None: continue
             smile = Chem.MolToSmiles(mol)
             SMILES.append(smile)
+            IDS.append(mol.GetProp("_Name"))
             fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
             for i in range(dimension):
                 if fp.GetBit(i) is True:
                     indices.append(i)
                     data.append(1)
             indptr.append(len(indices))
             num_mols += 1
+            if num_mols > 3000: break
 
     fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
     print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
 
-    return fps, SMILES
+    return fps, SMILES, IDS
 
 def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
     from sklearn.utils import shuffle
     print('prepare dataset ' + dataset_name)
 
     if type == 'bit':
         dtype = numpy.bool
     elif type == 'int':
         dtype = numpy.int
     else:
         dtype = numpy.float
 
     dir = './data'
 
-    X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
+    X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
 
     # random shuffle fingerprints and smiles at the same time
     seed = 1  # random.randint(0, 2 ** 32 - 1)
-    X, SMILES = shuffle(X, SMILES, random_state=seed)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
 
     # data split and make test data full matrix
     train_size = X.shape[0] - test_size
     X_train = X[:train_size]
     X_test = X[train_size:]
 
     print('Train data dimension: %d*%d' %X_train.shape)
     print('Test data dimension: %d*%d' %X_test.shape)
 
-    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
 
+# Molecular topological fingerprints
+def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.bool):
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    import glob
+    import gzip
+    from scipy.sparse import csr_matrix
+
+    SMILES = []
+    IDS = []
+    indptr = [0]
+    indices = []
+    data = []
+    num_mols = 0
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    print(file_list)
+    for file in file_list:
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+        for mol in suppl:
+            if mol is None: continue
+            smile = Chem.MolToSmiles(mol)
+            SMILES.append(smile)
+            IDS.append(mol.GetProp("_Name"))
+            fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension)
+            for i in range(dimension):
+                if fp.GetBit(i) is True:
+                    indices.append(i)
+                    data.append(1)
+            indptr.append(len(indices))
+            num_mols += 1
+
+    fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
+    print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
+
+    return fps, SMILES, IDS
+
+def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+    from sklearn.utils import shuffle
+    print('prepare dataset ' + dataset_name)
+
+    if type == 'bit':
+        dtype = numpy.bool
+    elif type == 'int':
+        dtype = numpy.int
+    else:
+        dtype = numpy.float
+
+    dir = './data'
+
+    X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, dtype=dtype)
+
+    # random shuffle fingerprints and smiles at the same time
+    seed = 1  # random.randint(0, 2 ** 32 - 1)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
+
+    # data split and make test data full matrix
+    train_size = X.shape[0] - test_size
+    X_train = X[:train_size]
+    X_test = X[train_size:]
+    X_test = X_test.toarray()
+    print('finish dataset preparation')
+
+    print('Train data dimension: %d*%d' %X_train.shape)
+    print('Test data dimension: %d*%d' %X_test.shape)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
+
+def sdf_2_map4(dir, dimension=1024, dtype=numpy.bool):
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    import glob
+    import gzip
+    from scipy.sparse import csr_matrix
+    from map4 import MAP4Calculator
+
+    MAP4 = MAP4Calculator(dimensions=dimension)
+
+    SMILES = []
+    IDS = []
+    fps = []
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    print(file_list)
+    for file in file_list:
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+        #mols = [x for x in suppl if x is not None]
+        mols = []
+        num_mols = 0
+        for mol in suppl:
+            if mol is None: continue
+            mols.append(mol)
+            SMILES.append(Chem.MolToSmiles(mol))
+            IDS.append(mol.GetProp("_Name"))
+            num_mols += 1
+            if num_mols == 3000:
+                fps.extend(MAP4.calculate_many(mols))
+                mols = []
+                num_mols = 0
+        if num_mols > 0:
+            fps.extend(MAP4.calculate_many(mols))
+            mols = []
+            num_mols = 0
+
+    fps = numpy.array(fps, dtype=dtype)
+    print('The dimension of the returned matrix: %d*%d' % fps.shape)
+
+    return fps, SMILES, IDS
+
+def map4(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+    from sklearn.utils import shuffle
+    from map4 import MAP4Calculator
+    print('prepare dataset ' + dataset_name)
+
+    if type == 'bit':
+        dtype = numpy.bool
+    elif type == 'int':
+        dtype = numpy.int
+    else:
+        dtype = numpy.float
+
+    dir = './data'
+
+    X, SMILES, IDS = sdf_2_map4(dir=dir, dimension=dimension, dtype=dtype)
+
+    # random shuffle fingerprints and smiles at the same time
+    seed = 1  # random.randint(0, 2 ** 32 - 1)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
+
+    # data split and make test data full matrix
+    train_size = X.shape[0] - test_size
+    X_train = X[:train_size]
+    X_test = X[train_size:]
+    print('finish dataset preparation')
+
+    print('Train data dimension: %d*%d' %X_train.shape)
+    print('Test data dimension: %d*%d' %X_test.shape)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
 
 def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type):
     print('prepare dataset ' + dataset_name)
@@ -535,5 +689,7 @@ def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type):
     'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
     'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'), - 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit') + 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), + 'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'), + 'chembl-1024-jaccard-map4': lambda out_fn: map4(out_fn, 'Chembl', 1024, 'jaccard', 'int') } diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index d51cdf5..1d3c3a4 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -113,7 +113,7 @@ def run(definition, dataset, count, run_count, batch, rq): D = get_dataset(dataset) # Chunjiang modified print('Is the train set a sparse matrix? %d' % issparse(D['train'][()])) - if 'sparse' not in dataset: + if issparse(D['train'][()]): X_train = D['train'][()].toarray() else: X_train = D['train'][()] diff --git a/requirements.txt b/requirements.txt index 244077a..dc5013c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ ansicolors==1.1.8 docker==2.6.1 -singularity==3.1.1 h5py==2.7.1 matplotlib==2.1.0 numpy==1.13.3 From e9f4436f4766ac1a989b7bf1f6e920717ac78cdb Mon Sep 17 00:00:00 2001 From: Chun Jiang Zhu Date: Fri, 19 Jun 2020 19:02:23 -0400 Subject: [PATCH 14/14] Delete nmslib_sparse.py --- ann_benchmarks/algorithms/nmslib_sparse.py | 95 ---------------------- 1 file changed, 95 deletions(-) delete mode 100644 ann_benchmarks/algorithms/nmslib_sparse.py diff --git a/ann_benchmarks/algorithms/nmslib_sparse.py b/ann_benchmarks/algorithms/nmslib_sparse.py deleted file mode 100644 index 58af969..0000000 --- a/ann_benchmarks/algorithms/nmslib_sparse.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import absolute_import -import os -import nmslib -from ann_benchmarks.constants import INDEX_DIR -from ann_benchmarks.algorithms.base import BaseANN -from scipy.sparse import csr_matrix -import numpy - - -class NmslibSparseReuseIndex(BaseANN): - @staticmethod - def encode(d): - return ["%s=%s" % (a, b) for (a, b) in d.iteritems()] - - # For each entry in the sparse matrix, extract a list of IDs and - # convert them to a string. Return a list of such strings. 
- @staticmethod - def matrToStrArray(sparseMatr): - res = [] - indptr = sparseMatr.indptr - indices = sparseMatr.indices - for row in range(sparseMatr.shape[0]): - arr = [k for k in indices[indptr[row]: indptr[row + 1]]] - arr.sort() - res.append(' '.join([str(k) for k in arr])) - return res - - def __init__(self, metric, method_name, index_param, query_param): - self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric] - self._method_name = method_name - self._save_index = False - self._index_param = NmslibSparseReuseIndex.encode(index_param) - if query_param!=False: - self._query_param = NmslibSparseReuseIndex.encode(query_param) - self.name = 'Nmslib(method_name=%s, index_param=%s, query_param=%s)' % ( - self._method_name, self._index_param, self._query_param) - else: - self._query_param = None - self.name = 'Nmslib(method_name=%s, index_param=%s)' % ( - self._method_name, self._index_param) - - self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, '_'.join(self._index_param))) - - d = os.path.dirname(self._index_name) - if not os.path.exists(d): - os.makedirs(d) - - def fit(self, X): - if self._method_name == 'vptree': - # To avoid this issue: - # terminate called after throwing an instance of 'std::runtime_error' - # what(): The data size is too small or the bucket size is too big. Select the parameters so that is NOT less than * 1000 - # Aborted (core dumped) - self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000)) - - # Chunjiang modified it to "if" for jaccard - if self._nmslib_metric == 'jaccard_sparse': - X_trans = NmslibSparseReuseIndex.matrToStrArray(X) - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING) - self._index.addDataPointBatch(X_trans) - else: - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name) - self._index.addDataPointBatch(X) - - if os.path.exists(self._index_name): - print('Loading index from file') - self._index.loadIndex(self._index_name) - else: - self._index.createIndex(self._index_param) - if self._save_index: - self._index.saveIndex(self._index_name) - if self._query_param is not None: - self._index.setQueryTimeParams(self._query_param) - - def set_query_arguments(self, ef): - if self._method_name == 'hnsw' or self._method_name == 'sw-graph': - self._index.setQueryTimeParams(["efSearch=%s"%(ef)]) - - def query(self, v, n): - # Chunjiang modified - if self._nmslib_metric == 'jaccard_sparse': - nz = numpy.nonzero(v)[0] - v = ' '.join([str(k) for k in nz]) - ids, distances = self._index.knnQuery(v, n) - return ids - - def batch_query(self, X, n): - # Chunjiang modified - if self._nmslib_metric == 'jaccard_sparse': - X = NmslibSparseReuseIndex.matrToStrArray(csr_matrix(X)) - self.res = self._index.knnQueryBatch(X, n) - - def get_batch_results(self): - return [x for x, _ in self.res] -
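With nmslib_sparse.py removed, NmslibReuseIndex from PATCH 13 is the single entry point for both fingerprint encodings, selected through the new object_type argument (anything other than 'Byte' falls through to the integer encoding). A hypothetical instantiation — a sketch only, with parameter values that are illustrative and not taken from algos.yaml:

```python
from ann_benchmarks.algorithms.nmslib import NmslibReuseIndex

# Bit fingerprints (ECFP, topological): object_type='Byte' routes fit(),
# query() and batch_query() through matrToStrArray().
byte_index = NmslibReuseIndex('jaccard', 'Byte', 'hnsw',
                              {'M': 32, 'efConstruction': 400}, False)

# Integer fingerprints (MAP4 minhashes): any other object_type selects
# intMatrToStrArray() instead.
int_index = NmslibReuseIndex('jaccard', 'Int', 'hnsw',
                             {'M': 32, 'efConstruction': 400}, False)
```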