Skip to content

Commit

Permalink
Fix measurement of index size and indexing time in chemfp and Brutefo…
Browse files Browse the repository at this point in the history
…rce, and optimize the code
  • Loading branch information
ChunjiangZhu committed Sep 21, 2020
1 parent 528f4b5 commit 01adfa3
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 40 deletions.
Binary file modified .DS_Store
Binary file not shown.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ Algorithms currently supported:

- [Balltree](http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.91.8209)
- Bruteforce/Exhaustive search
- [Chemfp 1.5](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0398-8)
- [Chemfp 1.6.1](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0398-8)
- [Folding (the standard modulo-OR-compression algorithm)](https://pubs.acs.org/doi/10.1021/ci100132g)
- [Min-Hash](https://ekzhu.github.io/datasketch)
- [DivideSkip](https://pubs.acs.org/doi/10.1021/ci200552r)
- [Hnsw](https://arxiv.org/abs/1603.09320)
Expand Down Expand Up @@ -61,6 +62,7 @@ Run.py Parameters:
- Chemfp
- Datasketch
- DivideSkip
- Folding
- Hnsw(Nmslib)
- Onng(Ngt)
- Panng(Ngt)
Expand Down
11 changes: 11 additions & 0 deletions algos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,17 @@ bit:
empty:
args: []

Folding:
disabled: false
docker-tag: ann-benchmarks-chemfp
singularity-tag: ann-bench-chemfp
module: ann_benchmarks.algorithms.folding
constructor: Folding
base-args: ["@metric"]
run-groups:
panng:
args: [[64, 128, 192, 256, 512]]

int:
jaccard:
Bruteforce:
Expand Down
Binary file modified ann_benchmarks/.DS_Store
Binary file not shown.
12 changes: 12 additions & 0 deletions ann_benchmarks/algorithms/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,24 @@ def get_index_size(self, process):
"""Returns the size of the index in kB or -1 if not implemented."""
return psutil.Process().memory_info().rss / 1024 # return in kB for backwards compatibility

# Optional preparation hook. The runner calls pre_fit(X_train) before it
# records the start time (t0) and the baseline get_index_size() reading, so
# work done here happens ahead of those measurements.
# No-op by default; BruteForceFPS and Chemfp override it to pre-encode the
# fingerprints as bytes.
def pre_fit(self, X):
pass

# Build the index from the training data X.
# No-op by default; concrete algorithms override this.
def fit(self, X):
pass

# Optional per-query preparation hook. The runner calls pre_query(v, count)
# before it starts the query timer, so work done here is not part of the
# timed query. No-op by default.
def pre_query(self, q, n):
pass

# Answer a single query q, asking for n results.
# The default implementation finds nothing and returns an empty list.
def query(self, q, n):
return [] # array of candidate indices

# Optional hook run after the query timer has stopped; the runner assigns
# its return value to `candidates`. BruteForceFPS and Chemfp use it to turn
# the stored search results into a list of ids.
# NOTE(review): this default returns None implicitly, so an algorithm that
# returns its hits from query() and does not override post_query() would
# hand None to the runner -- confirm every algorithm overrides it or make
# the default pass the query() result through.
def post_query(self, rq=False):
pass

# Optional batch-mode preparation hook. The runner calls
# pre_batch_query(X, count) before it starts the batch timer, so work done
# here is not part of the timed batch_query(). No-op by default.
def pre_batch_query(self, X, n):
pass

def batch_query(self, X, n):
self.res = []
for q in X:
Expand Down
44 changes: 27 additions & 17 deletions ann_benchmarks/algorithms/bruteforce.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from ann_benchmarks.distance import metrics as pd
from ann_benchmarks.algorithms.base import BaseANN
from scipy.sparse import issparse
import chemfp
from bitarray import bitarray

class BruteForce(BaseANN):
def __init__(self, metric):
Expand Down Expand Up @@ -104,35 +106,43 @@ def __init__(self, metric):
self._metric = metric
self.name = 'BruteForceFPS()'


def fit(self, X):
self.index = X

def query(self, v, n):
import operator
import chemfp
dist_dict = {}
for j in range(self.index.shape[0]):
dist_dict[j] = chemfp.bitops.byte_tanimoto(self._queries.get_fingerprint(0), self._target.get_fingerprint(j))
sorted_dict = sorted(dist_dict.items(), key=operator.itemgetter(1), reverse=True)[:n]
return [sorted_dict[j][0] for j in range(n)]

@staticmethod
def matrToArena(X):
import chemfp
from bitarray import bitarray
# convert X to Chemfp fingerprintArena in memory
fps = []
for row in range(X.shape[0]):
fp = bitarray(endian='big')
fp.extend(X[row])
fps.append((row,fp.tobytes()))
return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]),reorder=False)
return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=False)

def pre_fit(self, X):
self._target = BruteForceFPS.matrToArena(X)
self._fps = []
for row in range(X.shape[0]):
fp = bitarray(endian='big')
fp.extend(X[row])
self._fps.append((row,fp.tobytes()))

def fit(self, X):
self._target = chemfp.load_fingerprints(self._fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=False)
# To ensure that BitBound is not used
self._target.popcount_indices = ""


def pre_query(self, v, n):
queryMatr = numpy.array([v])
self._queries = BruteForceFPS.matrToArena(queryMatr)

def query(self, v, n, rq=False):
if rq:
self._results = chemfp.threshold_tanimoto_search(self._queries, self._target, threshold=1.0-n)
else:
self._results = chemfp.knearest_tanimoto_search(self._queries, self._target, k=n, threshold=0.0)

def post_query(self, rq=False):
# parse the results
for (query_id, hits) in self._results:
if hits:
return hits.get_ids()
else:
return []
16 changes: 12 additions & 4 deletions ann_benchmarks/algorithms/chemfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,14 @@ def matrToArena(X, reorder=True):
return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]),reorder=reorder)

def pre_fit(self, X):
self._target = Chemfp.matrToArena(X)
self._fps = []
for row in range(X.shape[0]):
fp = bitarray(endian='big')
fp.extend(X[row])
self._fps.append((row,fp.tobytes()))

def fit(self, X):
self._target = chemfp.load_fingerprints(self._fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=True)


def pre_query(self, v, n):
Expand All @@ -45,19 +52,20 @@ def post_query(self, rq=False):
return hits.get_ids()
else:
return []

def pre_batch_query(self, X, n):
self._queries = Chemfp.matrToArena(X, False)
self._queries = Chemfp.matrToArena(X)

def batch_query(self, X, n):
self._results = chemfp.knearest_tanimoto_search(self._queries, self._target, k=n, threshold=0.0)

def get_batch_results(self):
# parse the results
res = []
for (query_id, hits) in self._results:
for (query_id, hits) in sorted(self._results):
if hits:
res.append(hits.get_ids())
else:
res.append([])
print(res)
#print(res)
return res
2 changes: 2 additions & 0 deletions ann_benchmarks/algorithms/risc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def __init__(self, metric, method):
self.name = method + "()"

def pre_fit(self, X):
X = numpy.concatenate((X, [numpy.ones(X.shape[1], dtype=numpy.bool)]), axis=0)
print(X.shape)
def matrToStrArray(sparseMatr):
res = ""
indptr = sparseMatr.indptr
Expand Down
23 changes: 5 additions & 18 deletions ann_benchmarks/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
n_items_processed = [0] # a bit dumb but can't be a scalar because of Python's scoping rules

def single_query(v):
# special code for the Risc, DivideSkip, and Chemfp
if algoname in ['Risc', 'DivideSkip', 'Chemfp', 'Bruteforce']:
algo.pre_query(v, count)
algo.pre_query(v, count)

start = time.time()
if rq:
Expand All @@ -45,9 +43,7 @@ def single_query(v):
candidates = algo.query(v, count)
total = (time.time() - start)

# special code for the Risc, DivideSkip, and Chemfp
if algoname in ['Risc', 'DivideSkip', 'Chemfp']:
candidates = algo.post_query(rq)
candidates = algo.post_query(rq)

if issparse(X_train):
candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0])))
Expand All @@ -63,9 +59,7 @@ def single_query(v):
return (total, candidates)

def batch_query(X):
# special code for Chemfp
if algoname in ['Chemfp']:
algo.pre_batch_query(X, count)
algo.pre_batch_query(X, count)
start = time.time()
algo.batch_query(X, count)
total = (time.time() - start)
Expand Down Expand Up @@ -127,14 +121,7 @@ def run(definition, dataset, count, run_count, batch, rq):

try:
print(X_train.shape)
# special code for Risc and DivideSkip
if definition.algorithm in ['Risc', 'DivideSkip']:
X_train = numpy.concatenate((X_train, [numpy.ones(X_train.shape[1], dtype=numpy.bool)]), axis=0)
print(X_train.shape)
algo.pre_fit(X_train)
# special code for Chemfp
if definition.algorithm in ['Chemfp', 'Bruteforce']:
algo.pre_fit(X_train)
algo.pre_fit(X_train)

t0 = time.time()
index_size_before = algo.get_index_size("self")
Expand Down Expand Up @@ -305,7 +292,7 @@ def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius
print('String of command', strCmd)

# The chemfp-based algorithms (Chemfp, Bruteforce, Folding) run under Python 2, while the others use Python 3
if definition.algorithm in ['Chemfp', 'Bruteforce']:
if definition.algorithm in ['Chemfp', 'Bruteforce', 'Folding']:
subprocess.check_call('singularity exec %s/%s.sif python run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True)
else:
subprocess.check_call('singularity exec %s/%s.sif python3 run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True)
Expand Down

0 comments on commit 01adfa3

Please sign in to comment.