From 01adfa3b25c4ce8fd2c91b9f3fa4d29454d1155a Mon Sep 17 00:00:00 2001 From: ChunjiangZhu Date: Mon, 21 Sep 2020 10:56:31 -0400 Subject: [PATCH] Fix measurement of index size and indexing time in chemfp and Bruteforce, and optimize the code --- .DS_Store | Bin 10244 -> 10244 bytes README.md | 4 ++- algos.yaml | 11 ++++++ ann_benchmarks/.DS_Store | Bin 10244 -> 10244 bytes ann_benchmarks/algorithms/base.py | 12 +++++++ ann_benchmarks/algorithms/bruteforce.py | 44 +++++++++++++++--------- ann_benchmarks/algorithms/chemfp.py | 16 ++++++--- ann_benchmarks/algorithms/risc.py | 2 ++ ann_benchmarks/runner.py | 23 +++---------- 9 files changed, 72 insertions(+), 40 deletions(-) diff --git a/.DS_Store b/.DS_Store index 31991bce35778f2d7060377f9fffcf0c843c14fc..dcae827976c61a3d5da91ab6e38f286275af24b4 100644 GIT binary patch delta 24 fcmZn(XbIS$Bgk%Us-s|JW-$4@gyZHC!3kmjUE>Fa delta 16 XcmZn(XbIS$BRE-C#A|bzV4oNOGOY!Y diff --git a/README.md b/README.md index b70a86f..0deb987 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ Algorithms currently supported: - [Balltree](http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.91.8209) - Bruteforce/Exhausive search - - [Chemfp 1.5](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0398-8) + - [Chemfp 1.6.1](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0398-8) + - [the standard modulo-OR-compression algorithm, or folding](https://pubs.acs.org/doi/10.1021/ci100132g) - [Min-Hash](https://ekzhu.github.io/datasketch) - [DivideSkip](https://pubs.acs.org/doi/10.1021/ci200552r) - [Hnsw](https://arxiv.org/abs/1603.09320) @@ -61,6 +62,7 @@ Run.py Parameters: - Chemfp - Datasketch - DivideSkip + - Folding - Hnsw(Nmslib) - Onng(Ngt) - Panng(Ngt) diff --git a/algos.yaml b/algos.yaml index 864bcbb..34aef16 100644 --- a/algos.yaml +++ b/algos.yaml @@ -198,6 +198,17 @@ bit: empty: args: [] + Folding: + disabled: false + docker-tag: ann-benchmarks-chemfp + singularity-tag: ann-bench-chemfp + module: ann_benchmarks.algorithms.folding + constructor: Folding + base-args: ["@metric"] + run-groups: + panng: + args: [[64, 128, 192, 256, 512]] + int: jaccard: Bruteforce: diff --git a/ann_benchmarks/.DS_Store b/ann_benchmarks/.DS_Store index e7f84d2127c9fda3fe25aea7b11ac7da0996b2e6..17dad90de08542a739bdedf6f967dfe8a04df4e5 100644 GIT binary patch delta 24 fcmZn(XbIRLC&+GYs-s|JW-$4;l;h?!!Fgf;UMmNt delta 24 fcmZn(XbIRLC&+GLqN8ABYBKq^l;h?!!Fgf;UP%Y3 diff --git a/ann_benchmarks/algorithms/base.py b/ann_benchmarks/algorithms/base.py index 288564a..4030808 100644 --- a/ann_benchmarks/algorithms/base.py +++ b/ann_benchmarks/algorithms/base.py @@ -9,12 +9,24 @@ class BaseANN(object): """Returns the size of the index in kB or -1 if not implemented.""" return psutil.Process().memory_info().rss / 1024 # return in kB for backwards compatibility + def pre_fit(self, X): + pass + def fit(self, X): pass + def pre_query(self, q, n): + pass + def query(self, q, n): return [] # array of candidate indices + def post_query(self, rq=False): + pass + + def pre_batch_query(self, X, n): + pass + def batch_query(self, X, n): self.res = [] for q in X: diff --git a/ann_benchmarks/algorithms/bruteforce.py b/ann_benchmarks/algorithms/bruteforce.py index 6c5c2ad..b538f79 100644 --- a/ann_benchmarks/algorithms/bruteforce.py +++ b/ann_benchmarks/algorithms/bruteforce.py @@ -4,6 +4,8 @@ import sklearn.neighbors from ann_benchmarks.distance import metrics as pd from ann_benchmarks.algorithms.base import BaseANN from scipy.sparse import issparse +import chemfp +from bitarray import bitarray class BruteForce(BaseANN): def __init__(self, metric): @@ -104,35 +106,43 @@ class BruteForceFPS(BaseANN): self._metric = metric self.name = 'BruteForceFPS()' - - def fit(self, X): - self.index = X - - def query(self, v, n): - import operator - import chemfp - dist_dict = {} - for j in range(self.index.shape[0]): - dist_dict[j] = chemfp.bitops.byte_tanimoto(self._queries.get_fingerprint(0), self._target.get_fingerprint(j)) - sorted_dict = sorted(dist_dict.items(), key=operator.itemgetter(1), reverse=True)[:n] - return [sorted_dict[j][0] for j in range(n)] - @staticmethod def matrToArena(X): - import chemfp - from bitarray import bitarray # convert X to Chemfp fingerprintArena in memory fps = [] for row in range(X.shape[0]): fp = bitarray(endian='big') fp.extend(X[row]) fps.append((row,fp.tobytes())) - return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]),reorder=False) + return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=False) def pre_fit(self, X): - self._target = BruteForceFPS.matrToArena(X) + self._fps = [] + for row in range(X.shape[0]): + fp = bitarray(endian='big') + fp.extend(X[row]) + self._fps.append((row,fp.tobytes())) + + def fit(self, X): + self._target = chemfp.load_fingerprints(self._fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=False) + # To ensure that BitBound is not used + self._target.popcount_indices = "" def pre_query(self, v, n): queryMatr = numpy.array([v]) self._queries = BruteForceFPS.matrToArena(queryMatr) + + def query(self, v, n, rq=False): + if rq: + self._results = chemfp.threshold_tanimoto_search(self._queries, self._target, threshold=1.0-n) + else: + self._results = chemfp.knearest_tanimoto_search(self._queries, self._target, k=n, threshold=0.0) + + def post_query(self, rq=False): + # parse the results + for (query_id, hits) in self._results: + if hits: + return hits.get_ids() + else: + return [] diff --git a/ann_benchmarks/algorithms/chemfp.py b/ann_benchmarks/algorithms/chemfp.py index 3e8a7c2..736c95b 100644 --- a/ann_benchmarks/algorithms/chemfp.py +++ b/ann_benchmarks/algorithms/chemfp.py @@ -25,7 +25,14 @@ class Chemfp(BaseANN): return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]),reorder=reorder) def pre_fit(self, X): - self._target = Chemfp.matrToArena(X) + self._fps = [] + for row in range(X.shape[0]): + fp = bitarray(endian='big') + fp.extend(X[row]) + self._fps.append((row,fp.tobytes())) + + def fit(self, X): + self._target = chemfp.load_fingerprints(self._fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=True) def pre_query(self, v, n): @@ -45,8 +52,9 @@ class Chemfp(BaseANN): return hits.get_ids() else: return [] + def pre_batch_query(self, X, n): - self._queries = Chemfp.matrToArena(X, False) + self._queries = Chemfp.matrToArena(X) def batch_query(self, X, n): self._results = chemfp.knearest_tanimoto_search(self._queries, self._target, k=n, threshold=0.0) @@ -54,10 +62,10 @@ class Chemfp(BaseANN): def get_batch_results(self): # parse the results res = [] - for (query_id, hits) in self._results: + for (query_id, hits) in sorted(self._results): if hits: res.append(hits.get_ids()) else: res.append([]) - print(res) + #print(res) return res diff --git a/ann_benchmarks/algorithms/risc.py b/ann_benchmarks/algorithms/risc.py index 368233d..ceb3e05 100644 --- a/ann_benchmarks/algorithms/risc.py +++ b/ann_benchmarks/algorithms/risc.py @@ -19,6 +19,8 @@ class Risc(BaseANN): self.name = method + "()" def pre_fit(self, X): + X = numpy.concatenate((X, [numpy.ones(X.shape[1], dtype=numpy.bool)]), axis=0) + print(X.shape) def matrToStrArray(sparseMatr): res = "" indptr = sparseMatr.indptr diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index fce0723..ddc15e6 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -34,9 +34,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c n_items_processed = [0] # a bit dumb but can't be a scalar since of Python's scoping rules def single_query(v): - # special code for the Risc, DivideSkip, and Chemfp - if algoname in ['Risc', 'DivideSkip', 'Chemfp', 'Bruteforce']: - algo.pre_query(v, count) + algo.pre_query(v, count) start = time.time() if rq: @@ -45,9 +43,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c candidates = algo.query(v, count) total = (time.time() - start) - # special code for the Risc, DivideSkip, and Chemfp - if algoname in ['Risc', 'DivideSkip', 'Chemfp']: - candidates = algo.post_query(rq) + candidates = algo.post_query(rq) if issparse(X_train): candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0]))) @@ -63,9 +59,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c return (total, candidates) def batch_query(X): - # special code for Chemfp - if algoname in ['Chemfp']: - algo.pre_batch_query(X, count) + algo.pre_batch_query(X, count) start = time.time() algo.batch_query(X, count) total = (time.time() - start) @@ -127,14 +121,7 @@ function""" % (definition.module, definition.constructor, definition.arguments) try: print(X_train.shape) - # special code for Risc and DivideSkip - if definition.algorithm in ['Risc', 'DivideSkip']: - X_train = numpy.concatenate((X_train, [numpy.ones(X_train.shape[1], dtype=numpy.bool)]), axis=0) - print(X_train.shape) - algo.pre_fit(X_train) - # special code for Chemfp - if definition.algorithm in ['Chemfp', 'Bruteforce']: - algo.pre_fit(X_train) + algo.pre_fit(X_train) t0 = time.time() index_size_before = algo.get_index_size("self") @@ -305,7 +292,7 @@ def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius print('String of command', strCmd) # Chemfp uses Python2 while others use Python3 - if definition.algorithm in ['Chemfp', 'Bruteforce']: + if definition.algorithm in ['Chemfp', 'Bruteforce', 'Folding']: subprocess.check_call('singularity exec %s/%s.sif python run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True) else: subprocess.check_call('singularity exec %s/%s.sif python3 run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True)