Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Fix measurement of index size and indexing time in chemfp and Bruteforce, and optimize the code
  • Loading branch information
ChunjiangZhu committed Sep 21, 2020
1 parent 528f4b5 commit 01adfa3
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 40 deletions.
Binary file modified .DS_Store
Binary file not shown.
4 changes: 3 additions & 1 deletion README.md
Expand Up @@ -6,7 +6,8 @@ Algorithms currently supported:

- [Balltree](http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.91.8209)
- Bruteforce/Exhaustive search
- [Chemfp 1.5](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0398-8)
- [Chemfp 1.6.1](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0398-8)
- [the standard modulo-OR-compression algorithm, or folding](https://pubs.acs.org/doi/10.1021/ci100132g)
- [Min-Hash](https://ekzhu.github.io/datasketch)
- [DivideSkip](https://pubs.acs.org/doi/10.1021/ci200552r)
- [Hnsw](https://arxiv.org/abs/1603.09320)
Expand Down Expand Up @@ -61,6 +62,7 @@ Run.py Parameters:
- Chemfp
- Datasketch
- DivideSkip
- Folding
- Hnsw(Nmslib)
- Onng(Ngt)
- Panng(Ngt)
Expand Down
11 changes: 11 additions & 0 deletions algos.yaml
Expand Up @@ -198,6 +198,17 @@ bit:
empty:
args: []

Folding:
disabled: false
docker-tag: ann-benchmarks-chemfp
singularity-tag: ann-bench-chemfp
module: ann_benchmarks.algorithms.folding
constructor: Folding
base-args: ["@metric"]
run-groups:
panng:
args: [[64, 128, 192, 256, 512]]

int:
jaccard:
Bruteforce:
Expand Down
Binary file modified ann_benchmarks/.DS_Store
Binary file not shown.
12 changes: 12 additions & 0 deletions ann_benchmarks/algorithms/base.py
Expand Up @@ -9,12 +9,24 @@ class BaseANN(object):
"""Returns the size of the index in kB or -1 if not implemented."""
return psutil.Process().memory_info().rss / 1024 # return in kB for backwards compatibility

def pre_fit(self, X):
pass

def fit(self, X):
pass

def pre_query(self, q, n):
pass

def query(self, q, n):
return [] # array of candidate indices

def post_query(self, rq=False):
pass

def pre_batch_query(self, X, n):
pass

def batch_query(self, X, n):
self.res = []
for q in X:
Expand Down
44 changes: 27 additions & 17 deletions ann_benchmarks/algorithms/bruteforce.py
Expand Up @@ -4,6 +4,8 @@ import sklearn.neighbors
from ann_benchmarks.distance import metrics as pd
from ann_benchmarks.algorithms.base import BaseANN
from scipy.sparse import issparse
import chemfp
from bitarray import bitarray

class BruteForce(BaseANN):
def __init__(self, metric):
Expand Down Expand Up @@ -104,35 +106,43 @@ class BruteForceFPS(BaseANN):
self._metric = metric
self.name = 'BruteForceFPS()'


def fit(self, X):
self.index = X

def query(self, v, n):
import operator
import chemfp
dist_dict = {}
for j in range(self.index.shape[0]):
dist_dict[j] = chemfp.bitops.byte_tanimoto(self._queries.get_fingerprint(0), self._target.get_fingerprint(j))
sorted_dict = sorted(dist_dict.items(), key=operator.itemgetter(1), reverse=True)[:n]
return [sorted_dict[j][0] for j in range(n)]

@staticmethod
def matrToArena(X):
import chemfp
from bitarray import bitarray
# convert X to Chemfp fingerprintArena in memory
fps = []
for row in range(X.shape[0]):
fp = bitarray(endian='big')
fp.extend(X[row])
fps.append((row,fp.tobytes()))
return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]),reorder=False)
return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=False)

def pre_fit(self, X):
self._target = BruteForceFPS.matrToArena(X)
self._fps = []
for row in range(X.shape[0]):
fp = bitarray(endian='big')
fp.extend(X[row])
self._fps.append((row,fp.tobytes()))

def fit(self, X):
self._target = chemfp.load_fingerprints(self._fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=False)
# To ensure that BitBound is not used
self._target.popcount_indices = ""


def pre_query(self, v, n):
queryMatr = numpy.array([v])
self._queries = BruteForceFPS.matrToArena(queryMatr)

def query(self, v, n, rq=False):
if rq:
self._results = chemfp.threshold_tanimoto_search(self._queries, self._target, threshold=1.0-n)
else:
self._results = chemfp.knearest_tanimoto_search(self._queries, self._target, k=n, threshold=0.0)

def post_query(self, rq=False):
# parse the results
for (query_id, hits) in self._results:
if hits:
return hits.get_ids()
else:
return []
16 changes: 12 additions & 4 deletions ann_benchmarks/algorithms/chemfp.py
Expand Up @@ -25,7 +25,14 @@ class Chemfp(BaseANN):
return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]),reorder=reorder)

def pre_fit(self, X):
self._target = Chemfp.matrToArena(X)
self._fps = []
for row in range(X.shape[0]):
fp = bitarray(endian='big')
fp.extend(X[row])
self._fps.append((row,fp.tobytes()))

def fit(self, X):
self._target = chemfp.load_fingerprints(self._fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=True)


def pre_query(self, v, n):
Expand All @@ -45,19 +52,20 @@ class Chemfp(BaseANN):
return hits.get_ids()
else:
return []

def pre_batch_query(self, X, n):
self._queries = Chemfp.matrToArena(X, False)
self._queries = Chemfp.matrToArena(X)

def batch_query(self, X, n):
self._results = chemfp.knearest_tanimoto_search(self._queries, self._target, k=n, threshold=0.0)

def get_batch_results(self):
# parse the results
res = []
for (query_id, hits) in self._results:
for (query_id, hits) in sorted(self._results):
if hits:
res.append(hits.get_ids())
else:
res.append([])
print(res)
#print(res)
return res
2 changes: 2 additions & 0 deletions ann_benchmarks/algorithms/risc.py
Expand Up @@ -19,6 +19,8 @@ class Risc(BaseANN):
self.name = method + "()"

def pre_fit(self, X):
X = numpy.concatenate((X, [numpy.ones(X.shape[1], dtype=numpy.bool)]), axis=0)
print(X.shape)
def matrToStrArray(sparseMatr):
res = ""
indptr = sparseMatr.indptr
Expand Down
23 changes: 5 additions & 18 deletions ann_benchmarks/runner.py
Expand Up @@ -34,9 +34,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
n_items_processed = [0] # a bit dumb but can't be a scalar since of Python's scoping rules

def single_query(v):
# special code for the Risc, DivideSkip, and Chemfp
if algoname in ['Risc', 'DivideSkip', 'Chemfp', 'Bruteforce']:
algo.pre_query(v, count)
algo.pre_query(v, count)

start = time.time()
if rq:
Expand All @@ -45,9 +43,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
candidates = algo.query(v, count)
total = (time.time() - start)

# special code for the Risc, DivideSkip, and Chemfp
if algoname in ['Risc', 'DivideSkip', 'Chemfp']:
candidates = algo.post_query(rq)
candidates = algo.post_query(rq)

if issparse(X_train):
candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0])))
Expand All @@ -63,9 +59,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
return (total, candidates)

def batch_query(X):
# special code for Chemfp
if algoname in ['Chemfp']:
algo.pre_batch_query(X, count)
algo.pre_batch_query(X, count)
start = time.time()
algo.batch_query(X, count)
total = (time.time() - start)
Expand Down Expand Up @@ -127,14 +121,7 @@ function""" % (definition.module, definition.constructor, definition.arguments)

try:
print(X_train.shape)
# special code for Risc and DivideSkip
if definition.algorithm in ['Risc', 'DivideSkip']:
X_train = numpy.concatenate((X_train, [numpy.ones(X_train.shape[1], dtype=numpy.bool)]), axis=0)
print(X_train.shape)
algo.pre_fit(X_train)
# special code for Chemfp
if definition.algorithm in ['Chemfp', 'Bruteforce']:
algo.pre_fit(X_train)
algo.pre_fit(X_train)

t0 = time.time()
index_size_before = algo.get_index_size("self")
Expand Down Expand Up @@ -305,7 +292,7 @@ def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius
print('String of command', strCmd)

# Chemfp uses Python2 while others use Python3
if definition.algorithm in ['Chemfp', 'Bruteforce']:
if definition.algorithm in ['Chemfp', 'Bruteforce', 'Folding']:
subprocess.check_call('singularity exec %s/%s.sif python run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True)
else:
subprocess.check_call('singularity exec %s/%s.sif python3 run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True)
Expand Down

0 comments on commit 01adfa3

Please sign in to comment.