diff --git a/.DS_Store b/.DS_Store index 38f525a..e51d236 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/README.md b/README.md index dc59844..3fdf9a0 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,11 @@ Computational environments supported: 1. Download and put a dataset, e.g. Chembl-1024-jaccard.hdf5, under "data" folder; 2. Download and put a Singularity image file, e.g. "ann-bench-nmslib3.sif" under "singularity" folder. -# Executions under a PC with Singularity +# Executions under a PC through Singularity +1. pip install -r requirements.txt + +2. Run your algorithm + Run.py Parameters: dataset: dataset name (Required) @@ -48,7 +52,7 @@ Run.py Parameters: - chembl-1024-jaccard - molport-1024-jaccard algorithm: algorithm name (Required) - Options: + Choices: - Balltree(Sklearn) - Bruteforce - Datasketch @@ -96,6 +100,67 @@ Command Examples (for Singularity only): python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity" --batch +# Visualization of Execution Results under a PC +Run the plotting script: plot.py + +Plot.py Parameters: + + dataset: dataset name (Required) + Examples: + - chembl-1024-jaccard + - molport-1024-jaccard + count: the value of K for top-K nearest neighbor search + Default: 10 + output/-o: the output file + x-axis/-x: which metric to use on the X-axis + Choices: + - k-nn: Recall for top-K nearest neighbor search (Default) + - range: Recall for range query + - qps: Queries per second (1/s) + - build: Indexing time (s) + - indexsize: Index size (kB) + y-axis/-y: which metric to use on the Y-axis + Choices: + - k-nn: Recall for top-K nearest neighbor search + - range: Recall for range query + - qps: Queries per second (1/s) (Default) + - build: Indexing time (s) + - indexsize: Index size (kB) + x-log/-X: draw the X-axis using a logarithmic scale + Default: False + y-log/-Y: draw the Y-axis using a logarithmic scale + Default: False + raw: show raw results (not just Pareto frontier) in faded colours + Default: False + batch: batch query mode + Default: False + rq: range query / threshold-based query mode + Default: False + radius: the distance cut-off used in range query mode. Since the distance rather than the similarity is used, to return all near neighbors with a similarity coefficient larger than 0.8, set the radius to 0.2. + Default: 0.3 + + +Command Examples: +- Plot results on chembl-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/chembl-1024-jaccard-100.png". X-axis: recall. Y-axis: qps, log-scale. + + python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100 + +- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-indexsize-10.png". X-axis: recall. Y-axis: index size, log-scale. + + python plot.py --dataset=molport-1024-jaccard -Y -y=indexsize --count=10 -o=results/molport-1024-jaccard-indexsize-10 + +- Plot results on molport-1024-jaccard dataset for top-K (K=10) nearest neighbor query to "results/molport-1024-jaccard-buildtime-10.png". X-axis: recall. Y-axis: indexing time, log-scale. + + python plot.py --dataset=molport-1024-jaccard -Y -y=build --count=10 -o=results/molport-1024-jaccard-buildtime-10 + +- Plot batch mode results on molport-1024-jaccard dataset for top-K (K=100) nearest neighbor query to "results/molport-1024-jaccard-batch-100.png". X-axis: recall. Y-axis: qps, log-scale.
+ + python plot.py --dataset=molport-1024-jaccard -Y --batch --count=100 -o=results/molport-1024-jaccard-batch-100 + +- Plot results on chembl-1024-jaccard dataset for range query with similarity cutoff 0.6 to "results/chembl-1024-jaccard-0_4.png". X-axis: recall (range query). Y-axis: qps, log-scale. + + python plot.py --dataset=chembl-1024-jaccard -Y -x=range --rq --radius=0.4 -o=results/chembl-1024-jaccard-0_4 + # Executions under an HPC environment 1. Load anaconda module @@ -105,7 +170,12 @@ Command Examples (for Singularity only): 2. Create anaconda environment, and then install dependent libraries conda create -c rdkit -n ann_env rdkit python=3.5.2 + + source activate ann_env + pip install -r singularity-install/requirements.txt + + source deactivate 3. Run your algorithm scripts by SLURM shell @@ -116,7 +186,7 @@ An example "run.sh": #!/bin/bash #SBATCH --ntasks=1 - + #SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10] module load anaconda/5.1.0 @@ -134,6 +204,34 @@ An example "run.sh": python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity" +# Visualization of Execution Results under an HPC environment +Run the plotting script by SLURM shell + + sbatch plot.sh + +An example "plot.sh": + + #!/bin/bash + + #SBATCH --ntasks=1 + #SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10] + + + module load anaconda/5.1.0 + + source activate ann_env + + module purge + + module load gcc/5.4.0 + + module load singularity/3.1 + + + + python plot.py --dataset=chembl-1024-jaccard -Y --count=100 -o=results/chembl-1024-jaccard-100 + + # Parameter tuning All algorithmic parameter settings are included in the "./algos.yaml" file. @@ -285,7 +383,7 @@ At the beginning of the file, there is "bit:\n jaccard:\n". It means that we use Here is the process to add a custom dataset. We will use Chembl dataset and 2048-bits ECFP as example. 1. Put raw sdf file, e.g. chembl_24_1.sdf.gz, under "data" folder. Note only ".sdf.gz" files are accepted. Multiple sdf files are allowed. -2. Include the key-value pair below to DATASETS, defined at the bottom of "./ann_benchmark/datasets.py". +2. Include the key-value pair below in the data structure DATASETS, defined at the bottom of "./ann_benchmark/datasets.py". If a new fingerprint rather than ECFP is used, please define a fingerprint calculation function similar to ecfp() in the same Python file. 'chembl-2048-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 2048, 'jaccard', 'bit'), @@ -294,6 +392,7 @@ If a new fingerprint rather than ECFP is used, please define a fingerprint calcu python run.py --dataset=chembl-2048-jaccard --algorithm='Hnsw(Nmslib)' --count=100 --sif-dir="./singularity" +Note: to use an existing dataset, e.g. X, make sure that the data structure DATASETS, defined at the bottom of "./ann_benchmark/datasets.py", contains a key-value pair with key X. Otherwise, add a key-value pair with key X and an arbitrary value, e.g. "'X': gist", to DATASETS. # References - Omohundro, S. M. Five Balltree Construction Algorithms. _Tech. report, UC Berkeley_**1989**. @@ -315,4 +414,4 @@ If a new fingerprint rather than ECFP is used, please define a fingerprint calcu - Datasketch: Big data looks small https://ekzhu.github.io/datasketch (accessed May 31, 2019). -- Gaulton, A.; Bellis, L.
J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery. _Nucleic Acids Res._**2012**,_40_, 1100–1107. \ No newline at end of file +- Gaulton, A.; Bellis, L. J.; Bento, P.; Chambers, J.; Davies, M.; Hersey, A.; Light, Y.; McGlinchey, S.; Michalovich, D.; Al-Lazikani, B.; et al. ChEMBL: A Large-Scale Bioactivity Database for Drug Discovery. _Nucleic Acids Res._**2012**,_40_, 1100–1107. diff --git a/algos.yaml b/algos.yaml index 43220d8..e9087ee 100644 --- a/algos.yaml +++ b/algos.yaml @@ -26,7 +26,7 @@ bit: singularity-tag: ann-bench-nmslib3 module: ann_benchmarks.algorithms.nmslib constructor: NmslibReuseIndex - base-args: ["@metric", "vptree"] + base-args: ["@metric", "Byte", "vptree"] run-groups: base: # When @args is a dictionary, algorithm instances will be generated @@ -51,7 +51,7 @@ bit: singularity-tag: ann-bench-nmslib3 module: ann_benchmarks.algorithms.nmslib constructor: NmslibReuseIndex - base-args: ["@metric", "hnsw"] + base-args: ["@metric", "Byte", "hnsw"] run-groups: M-48: arg-groups: @@ -90,7 +90,7 @@ bit: singularity-tag: ann-bench-nmslib3 module: ann_benchmarks.algorithms.nmslib constructor: NmslibReuseIndex - base-args: ["@metric", "sw-graph"] + base-args: ["@metric", "Byte", "sw-graph"] run-groups: NN-96: arg-groups: @@ -186,3 +186,123 @@ bit: run-groups: empty: args: [] +int: + jaccard: + Bruteforce: + disabled: false + docker-tag: ann-benchmarks-sklearn + singularity-tag: ann-bench-sklearn + module: ann_benchmarks.algorithms.bruteforce + constructor: BruteForceBLAS + base-args: ["@metric"] + run-groups: + base: + args: {} + Hnsw(Nmslib): + disabled: false + docker-tag: ann-benchmarks-nmslib + singularity-tag: ann-bench-nmslib3 + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "Int", "hnsw"] + run-groups: + M-48: + arg-groups: + - {"M": 48, "post": 2, "efConstruction": 800} + - False + query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, + 1400, 1600, 2000]] + M-32: + arg-groups: + - {"M": 32, "post": 2, "efConstruction": 800} + - False + query-args: [[100, 300, 500, 700, 1000, 1500, 2000]] + M-20: + arg-groups: + - {"M": 20, "post": 0, "efConstruction": 800} + - False + query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-12: + arg-groups: + - {"M": 12, "post": 0, "efConstruction": 800} + - False + query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] + M-5: + arg-groups: + - {"M": 5, "post": 0, "efConstruction": 10} + - False + query-args: [[1, 2, 5, 10]] + M-2: + arg-groups: + - {"M": 2, "post": 0, "efConstruction": 1} + - False + query-args: [[1, 2]] + SW-graph(Nmslib): + disabled: false + docker-tag: ann-benchmarks-nmslib + singularity-tag: ann-bench-nmslib3 + module: ann_benchmarks.algorithms.nmslib + constructor: NmslibReuseIndex + base-args: ["@metric", "Int", "sw-graph"] + run-groups: + NN-96: + arg-groups: + - {"NN": 96} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-48: + arg-groups: + - {"NN": 48} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-24: + arg-groups: + - {"NN": 24} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-16: + arg-groups: + - {"NN": 16} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-10: + arg-groups: + - {"NN": 10} + - False + query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] + NN-5: + arg-groups: + - {"NN": 5} + 
- False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + NN-2: + arg-groups: + - {"NN": 2} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + NN-1: + arg-groups: + - {"NN": 1} + - False + query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] + Onng(Ngt): + disabled: false + docker-tag: ann-benchmarks-ngt + singularity-tag: ann-bench-ngt + module: ann_benchmarks.algorithms.onng_ngt + constructor: ONNG + base-args: ["@metric", "Byte", 1.0] + run-groups: + onng: + args: [[100, 300, 500, 1000], [10, 30, 50, 100], [10, 30, 50, 120]] + query-args: [[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]] + Risc: + disabled: false + docker-tag: ann-benchmarks-risc + singularity-tag: ann-bench-risc + module: ann_benchmarks.algorithms.risc + constructor: Risc + base-args: ["@metric", "Risc"] + run-groups: + empty: + args: [] diff --git a/ann_benchmarks/.DS_Store b/ann_benchmarks/.DS_Store index 827c239..8c30ee3 100644 Binary files a/ann_benchmarks/.DS_Store and b/ann_benchmarks/.DS_Store differ diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py index 0fbb4e2..72f3ebc 100644 --- a/ann_benchmarks/algorithms/nmslib.py +++ b/ann_benchmarks/algorithms/nmslib.py @@ -24,9 +24,17 @@ def matrToStrArray(sparseMatr): arr.sort() res.append(' '.join([str(k) for k in arr])) return res + + @staticmethod + def intMatrToStrArray(intMatr): + res = [] + for row in range(intMatr.shape[0]): + res.append(' '.join([str(k) for k in intMatr[row]])) + return res - def __init__(self, metric, method_name, index_param, query_param): + def __init__(self, metric, object_type, method_name, index_param, query_param): self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric] + self._object_type = object_type self._method_name = method_name self._save_index = False self._index_param = NmslibReuseIndex.encode(index_param) @@ -53,11 +61,13 @@ def fit(self, X): # Aborted (core dumped) self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000)) - # Chunjiang modified it to "if" for jaccard if self._nmslib_metric == 'jaccard_sparse': - X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X)) - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING) - self._index.addDataPointBatch(X_trans) + if self._object_type == 'Byte': + X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X)) + else: + X_trans = NmslibReuseIndex.intMatrToStrArray(X) + self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING) + self._index.addDataPointBatch(X_trans) else: self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name) self._index.addDataPointBatch(X) @@ -79,9 +87,12 @@ def set_query_arguments(self, ef): def query(self, v, n, rq=False): # Chunjiang modified if self._nmslib_metric == 'jaccard_sparse': - nz = numpy.nonzero(v)[0] - v = ' '.join([str(k) for k in nz]) - print(n) + if self._object_type == 'Byte': + nz = numpy.nonzero(v)[0] + v = ' '.join([str(k) for k in nz]) + else: + v = ' '.join([str(k) for k in v]) + if rq: ids, distances = self._index.rangeQuery(v, n) else: @@ -91,7 +102,10 @@ def query(self, v, n, rq=False): def batch_query(self, X, n): # Chunjiang modified if self._nmslib_metric == 'jaccard_sparse': - X = NmslibReuseIndex.matrToStrArray(csr_matrix(X)) + if self._object_type == 'Byte': + X = NmslibReuseIndex.matrToStrArray(csr_matrix(X)) + else: + X = NmslibReuseIndex.intMatrToStrArray(X) self.res = self._index.knnQueryBatch(X, n) def get_batch_results(self): diff --git a/ann_benchmarks/algorithms/nmslib_sparse.py
b/ann_benchmarks/algorithms/nmslib_sparse.py deleted file mode 100644 index 58af969..0000000 --- a/ann_benchmarks/algorithms/nmslib_sparse.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import absolute_import -import os -import nmslib -from ann_benchmarks.constants import INDEX_DIR -from ann_benchmarks.algorithms.base import BaseANN -from scipy.sparse import csr_matrix -import numpy - - -class NmslibSparseReuseIndex(BaseANN): - @staticmethod - def encode(d): - return ["%s=%s" % (a, b) for (a, b) in d.iteritems()] - - # For each entry in the sparse matrix, extract a list of IDs and - # convert them to a string. Return a list of such strings. - @staticmethod - def matrToStrArray(sparseMatr): - res = [] - indptr = sparseMatr.indptr - indices = sparseMatr.indices - for row in range(sparseMatr.shape[0]): - arr = [k for k in indices[indptr[row]: indptr[row + 1]]] - arr.sort() - res.append(' '.join([str(k) for k in arr])) - return res - - def __init__(self, metric, method_name, index_param, query_param): - self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric] - self._method_name = method_name - self._save_index = False - self._index_param = NmslibSparseReuseIndex.encode(index_param) - if query_param!=False: - self._query_param = NmslibSparseReuseIndex.encode(query_param) - self.name = 'Nmslib(method_name=%s, index_param=%s, query_param=%s)' % ( - self._method_name, self._index_param, self._query_param) - else: - self._query_param = None - self.name = 'Nmslib(method_name=%s, index_param=%s)' % ( - self._method_name, self._index_param) - - self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, '_'.join(self._index_param))) - - d = os.path.dirname(self._index_name) - if not os.path.exists(d): - os.makedirs(d) - - def fit(self, X): - if self._method_name == 'vptree': - # To avoid this issue: - # terminate called after throwing an instance of 'std::runtime_error' - # what(): The data size is too small or the bucket size is too big. 
Select the parameters so that is NOT less than * 1000 - # Aborted (core dumped) - self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000)) - - # Chunjiang modified it to "if" for jaccard - if self._nmslib_metric == 'jaccard_sparse': - X_trans = NmslibSparseReuseIndex.matrToStrArray(X) - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING) - self._index.addDataPointBatch(X_trans) - else: - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name) - self._index.addDataPointBatch(X) - - if os.path.exists(self._index_name): - print('Loading index from file') - self._index.loadIndex(self._index_name) - else: - self._index.createIndex(self._index_param) - if self._save_index: - self._index.saveIndex(self._index_name) - if self._query_param is not None: - self._index.setQueryTimeParams(self._query_param) - - def set_query_arguments(self, ef): - if self._method_name == 'hnsw' or self._method_name == 'sw-graph': - self._index.setQueryTimeParams(["efSearch=%s"%(ef)]) - - def query(self, v, n): - # Chunjiang modified - if self._nmslib_metric == 'jaccard_sparse': - nz = numpy.nonzero(v)[0] - v = ' '.join([str(k) for k in nz]) - ids, distances = self._index.knnQuery(v, n) - return ids - - def batch_query(self, X, n): - # Chunjiang modified - if self._nmslib_metric == 'jaccard_sparse': - X = NmslibSparseReuseIndex.matrToStrArray(csr_matrix(X)) - self.res = self._index.knnQueryBatch(X, n) - - def get_batch_results(self): - return [x for x, _ in self.res] - diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 68728bf..2ee8551 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -42,14 +42,15 @@ def get_dataset(which): # Everything below this line is related to creating datasets # You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com -def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None): +def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None, IDS=None): from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS import sklearn.neighbors import h5sparse + from scipy.sparse import issparse def replace_last(source_string, replace_what, replace_with): head, _sep, tail = source_string.rpartition(replace_what) return head + replace_with + tail # store SMILES first if SMILES: @@ -62,43 +63,59 @@ def replace_last(source_string, replace_what, replace_with): f.close() print('Finish.') - print('Write Dataset %s' % fn) - f = h5sparse.File(fn, 'w') - f.attrs['distance'] = distance - f.attrs['point_type'] = point_type - print('train size: %9d * %4d' % train.shape) - print('test size: %9d * %4d' % test.shape) - f.create_dataset('train',data=train) - f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test - neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i') - distances = f.create_dataset('distances', (test.shape[0], count), dtype='f') - - # use which method to compute the groundtruth - train = train.toarray() - method = 'bruteforth' - if method == 'balltree': - tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance) - else: - bf = BruteForceBLAS(metric=distance, precision=train.dtype) - bf.fit(train) + if IDS: + smile_fn = replace_last(fn, '.hdf5', '-IDS.hdf5') + print('Write IDs to File %s' % smile_fn) + f = h5sparse.File(smile_fn, 'w') + dt = h5py.special_dtype(vlen=bytes)
+ asciiList = [n.encode("ascii", "ignore") for n in IDS] + f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList) + f.close() - print(test) - for i, x in enumerate(test): - if i % 1 == 0: - print('%d/%d...' % (i, test.shape[0])) + print('Write Dataset %s' % fn) + f = h5sparse.File(fn, 'w') + f.attrs['distance'] = distance + f.attrs['point_type'] = point_type + print('train size: %9d * %4d' % train.shape) + print('test size: %9d * %4d' % test.shape) + if issparse(train): + f.create_dataset('train',data=train) + else: + f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train + if issparse(test): + f.create_dataset('test',data=test) + else: + f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test + neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i') + distances = f.create_dataset('distances', (test.shape[0], count), dtype='f') + + # use which method to compute the groundtruth + if issparse(train): + train = train.toarray() + method = 'bruteforce' if method == 'balltree': - dist, ind = tree.query([x], k=count) - neighbors[i] = ind[0] - distances[i] = dist[0] + tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance) else: - res = list(bf.query_with_distances(x, count)) - res.sort(key=lambda t: t[-1]) - neighbors[i] = [j for j, _ in res] - distances[i] = [d for _, d in res] - print(neighbors[i]) - print(distances[i]) - f.close() - print('Finish.') + bf = BruteForceBLAS(metric=distance, precision=train.dtype) + bf.fit(train) + + print(test) + for i, x in enumerate(test): + if i % 1 == 0: + print('%d/%d...' % (i, test.shape[0])) + if method == 'balltree': + dist, ind = tree.query([x], k=count) + neighbors[i] = ind[0] + distances[i] = dist[0] + else: + res = list(bf.query_with_distances(x, count)) + res.sort(key=lambda t: t[-1]) + neighbors[i] = [j for j, _ in res] + distances[i] = [d for _, d in res] + print(neighbors[i]) + print(distances[i]) + f.close() + print('Finish.') def train_test_split(X, test_size=10000): @@ -355,7 +369,7 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool): return fps, SMILES -def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool): +def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool): from rdkit import Chem from rdkit.Chem import AllChem import glob @@ -363,6 +377,7 @@ def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool): from scipy.sparse import csr_matrix SMILES = [] + IDS = [] indptr = [0] indices = [] data = [] @@ -376,6 +391,7 @@ def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool): if mol is None: continue smile = Chem.MolToSmiles(mol) SMILES.append(smile) + IDS.append(mol.GetProp("_Name")) fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension) for i in range(dimension): if fp.GetBit(i) is True: @@ -383,11 +399,11 @@ def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool): data.append(1) indptr.append(len(indices)) num_mols += 1 fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype) print('The dimension of the returned sparse matrix: %d*%d' % fps.shape) - return fps, SMILES + return fps, SMILES, IDS def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): from sklearn.utils import shuffle @@ -402,11 +419,11 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): dir = './data' - X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype) +
X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype) # random shuffle fingerprints and smiles at the same time seed = 1 # random.randint(0, 2 ** 32 - 1) - X, SMILES = shuffle(X, SMILES, random_state=seed) + X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed) # data split and make test data full matrix train_size = X.shape[0] - test_size @@ -417,8 +434,145 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): print('Train data dimension: %d*%d' %X_train.shape) print('Test data dimension: %d*%d' %X_test.shape) - write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES) + write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS) +# Molecular topological fingerprints +def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.bool): + from rdkit import Chem + from rdkit.Chem import AllChem + import glob + import gzip + from scipy.sparse import csr_matrix + + SMILES = [] + IDS = [] + indptr = [0] + indices = [] + data = [] + num_mols = 0 + file_list = glob.glob(dir + '/*.sdf.gz') + print(file_list) + for file in file_list: + inf = gzip.open(file) + suppl = Chem.ForwardSDMolSupplier(inf) + for mol in suppl: + if mol is None: continue + smile = Chem.MolToSmiles(mol) + SMILES.append(smile) + IDS.append(mol.GetProp("_Name")) + fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension) + for i in range(dimension): + if fp.GetBit(i) is True: + indices.append(i) + data.append(1) + indptr.append(len(indices)) + num_mols += 1 + + fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype) + print('The dimension of the returned sparse matrix: %d*%d' % fps.shape) + + return fps, SMILES, IDS + +def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=1000): + from sklearn.utils import shuffle + print('prepare dataset ' + dataset_name) + + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + dir = './data' + + X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, dtype=dtype) + + # random shuffle fingerprints and smiles at the same time + seed = 1 # random.randint(0, 2 ** 32 - 1) + X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed) + + # data split and make test data full matrix + train_size = X.shape[0] - test_size + X_train = X[:train_size] + X_test = X[train_size:] + X_test = X_test.toarray() + print('finish dataset preparation') + + print('Train data dimension: %d*%d' %X_train.shape) + print('Test data dimension: %d*%d' %X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS) + +def sdf_2_map4(dir, dimension=1024, dtype=numpy.bool): + from rdkit import Chem + from rdkit.Chem import AllChem + import glob + import gzip + from scipy.sparse import csr_matrix + from map4 import MAP4Calculator + + MAP4 = MAP4Calculator(dimensions=dimension) + + SMILES = [] + IDS = [] + fps = [] + file_list = glob.glob(dir + '/*.sdf.gz') + print(file_list) + for file in file_list: + inf = gzip.open(file) + suppl = Chem.ForwardSDMolSupplier(inf) + #mols = [x for x in suppl if x is not None] + mols = [] + num_mols = 0 + for mol in suppl: + if mol is None: continue + mols.append(mol) + SMILES.append(Chem.MolToSmiles(mol)) + IDS.append(mol.GetProp("_Name")) + num_mols += 1 + if num_mols == 3000: + fps.extend(MAP4.calculate_many(mols)) + mols = [] + num_mols = 0 + if num_mols > 0: + 
fps.extend(MAP4.calculate_many(mols)) + mols = [] + num_mols = 0 + + fps = numpy.array(fps, dtype=dtype) + print('The dimension of the returned matrix: %d*%d' % fps.shape) + + return fps, SMILES, IDS + +def map4(out_fn, dataset_name, dimension, distance, type, test_size=1000): + from sklearn.utils import shuffle + from map4 import MAP4Calculator + print('prepare dataset ' + dataset_name) + + if type == 'bit': + dtype = numpy.bool + elif type == 'int': + dtype = numpy.int + else: + dtype = numpy.float + + dir = './data' + + X, SMILES, IDS = sdf_2_map4(dir=dir, dimension=dimension, dtype=dtype) + + # random shuffle fingerprints and smiles at the same time + seed = 1 # random.randint(0, 2 ** 32 - 1) + X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed) + + # data split and make test data full matrix + train_size = X.shape[0] - test_size + X_train = X[:train_size] + X_test = X[train_size:] + print('finish dataset preparation') + + print('Train data dimension: %d*%d' %X_train.shape) + print('Test data dimension: %d*%d' %X_test.shape) + write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS) def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type): print('prepare dataset ' + dataset_name) @@ -535,5 +689,7 @@ def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type): 'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'), 'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), 'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'), - 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit') + 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), + 'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'), + 'chembl-1024-jaccard-map4': lambda out_fn: map4(out_fn, 'Chembl', 1024, 'jaccard', 'int') } diff --git a/ann_benchmarks/distance.py b/ann_benchmarks/distance.py index 9228338..5f5aa07 100644 --- a/ann_benchmarks/distance.py +++ b/ann_benchmarks/distance.py @@ -4,7 +4,6 @@ def pdist(a, b, metric): return scipy_pdist([a, b], metric=metric)[0] -# Need own implementation of jaccard because numpy's implementation is different def jaccard(a, b): if len(a) == 0 or len(b) == 0: return 0 @@ -31,7 +30,7 @@ def jaccard(a, b): # 'distance_valid' : lambda a: True # } # } -# Chunjiang Modified 20190216 + metrics = { 'hamming': { 'distance' : lambda a, b: pdist(a, b, "hamming"), @@ -40,7 +39,7 @@ def jaccard(a, b): # return 1 - jaccard similarity, because smaller distances are better. 'jaccard': { 'distance' : lambda a, b: pdist(a, b, "jaccard"), - 'distance_valid' : lambda a: a < 1 - 1e-5 + 'distance_valid' : lambda a: a < 1 + 1e-5 }, 'euclidean': { 'distance' : lambda a, b: pdist(a, b, "euclidean"), diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index d51cdf5..1d3c3a4 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -113,7 +113,7 @@ def run(definition, dataset, count, run_count, batch, rq): D = get_dataset(dataset) # Chunjiang modified print('Is the train set a sparse matrix? 
%d' % issparse(D['train'][()])) - if 'sparse' not in dataset: + if issparse(D['train'][()]): X_train = D['train'][()].toarray() else: X_train = D['train'][()] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..dc5013c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +ansicolors==1.1.8 +docker==2.6.1 +h5py==2.7.1 +matplotlib==2.1.0 +numpy==1.13.3 +pyyaml==3.12 +psutil==5.4.2 +scipy==1.0.0 +scikit-learn==0.19.1 +jinja2==2.10 +h5sparse==0.1.0 diff --git a/singularity-install/requirements.txt b/singularity-install/requirements.txt index 5677802..dc5013c 100644 --- a/singularity-install/requirements.txt +++ b/singularity-install/requirements.txt @@ -8,4 +8,4 @@ psutil==5.4.2 scipy==1.0.0 scikit-learn==0.19.1 jinja2==2.10 -h5sparse=0.1.0 +h5sparse==0.1.0
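Note on the new "Int" object type in nmslib.py: nmslib's jaccard_sparse space consumes each data point as a string, so the 'Byte' path encodes a bit fingerprint as the space-separated indices of its set bits, while the 'Int' path (used for count-based fingerprints such as MAP4) joins the raw integer values of each row. A minimal standalone sketch of the two encodings — assuming a boolean matrix for the 'Byte' path and a dense integer matrix for the 'Int' path, independent of the repository's NmslibReuseIndex class:

    import numpy
    from scipy.sparse import csr_matrix

    def bit_rows_to_strings(m):
        # 'Byte' path: each row becomes the sorted indices of its set bits.
        m = csr_matrix(m)
        return [' '.join(str(k) for k in sorted(m.indices[m.indptr[r]:m.indptr[r + 1]]))
                for r in range(m.shape[0])]

    def int_rows_to_strings(m):
        # 'Int' path: each row's integer entries are joined verbatim.
        return [' '.join(str(k) for k in m[r]) for r in range(m.shape[0])]

    bits = numpy.array([[0, 1, 1], [1, 0, 0]], dtype=bool)
    ints = numpy.array([[3, 0, 7], [1, 2, 0]])
    print(bit_rows_to_strings(bits))  # ['1 2', '0']
    print(int_rows_to_strings(ints))  # ['3 0 7', '1 2 0']

Either list of strings can then be fed to an index created with data_type=nmslib.DataType.OBJECT_AS_STRING, as done in fit() above.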
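Note on the --radius parameter: the range-query code works in distance space, and the Jaccard distance is one minus the Jaccard similarity coefficient. A tiny sanity check of the conversion described in the README (similarity_to_radius is a hypothetical helper, not part of the repository):

    def similarity_to_radius(similarity_cutoff):
        # Jaccard distance = 1 - Jaccard similarity, so a similarity
        # cutoff of 0.8 corresponds to --radius=0.2.
        return 1.0 - similarity_cutoff

    assert abs(similarity_to_radius(0.8) - 0.2) < 1e-9
    assert abs(similarity_to_radius(0.6) - 0.4) < 1e-9  # the chembl-1024-jaccard-0_4 example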