From 01adfa3b25c4ce8fd2c91b9f3fa4d29454d1155a Mon Sep 17 00:00:00 2001
From: ChunjiangZhu <chunjiang.zhu@uconn.edu>
Date: Mon, 21 Sep 2020 10:56:31 -0400
Subject: [PATCH] Fix measurement of index size and indexing time in chemfp and
 Bruteforce, and optimize the code

---
 .DS_Store                               | Bin 10244 -> 10244 bytes
 README.md                               |   4 ++-
 algos.yaml                              |  11 ++++++
 ann_benchmarks/.DS_Store                | Bin 10244 -> 10244 bytes
 ann_benchmarks/algorithms/base.py       |  12 +++++++
 ann_benchmarks/algorithms/bruteforce.py |  44 +++++++++++++++---------
 ann_benchmarks/algorithms/chemfp.py     |  16 ++++++---
 ann_benchmarks/algorithms/risc.py       |   2 ++
 ann_benchmarks/runner.py                |  23 +++----------
 9 files changed, 72 insertions(+), 40 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index 31991bce35778f2d7060377f9fffcf0c843c14fc..dcae827976c61a3d5da91ab6e38f286275af24b4 100644
GIT binary patch
delta 24
fcmZn(XbIS$Bgk%Us-s|JW-$4@gyZHC!3kmjUE>Fa

delta 16
XcmZn(XbIS$BRE-C#A|bzV4oNOGOY!Y

diff --git a/README.md b/README.md
index b70a86f..0deb987 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,8 @@ Algorithms currently supported:
 
     - [Balltree](http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.91.8209)
     - Bruteforce/Exhausive search
-	- [Chemfp 1.5](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0398-8)
+	- [Chemfp 1.6.1](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0398-8)
+	- [Folding (the standard modulo-OR compression algorithm)](https://pubs.acs.org/doi/10.1021/ci100132g)
     - [Min-Hash](https://ekzhu.github.io/datasketch)
     - [DivideSkip](https://pubs.acs.org/doi/10.1021/ci200552r)
     - [Hnsw](https://arxiv.org/abs/1603.09320)
@@ -61,6 +62,7 @@ Run.py Parameters:
 	    - Chemfp
 	    - Datasketch
 	    - DivideSkip
+		- Folding
 	    - Hnsw(Nmslib)
 	    - Onng(Ngt)
 	    - Panng(Ngt)
diff --git a/algos.yaml b/algos.yaml
index 864bcbb..34aef16 100644
--- a/algos.yaml
+++ b/algos.yaml
@@ -198,6 +198,17 @@ bit:
         empty:
           args: []
 
+    Folding:
+      disabled: false
+      docker-tag: ann-benchmarks-chemfp
+      singularity-tag: ann-bench-chemfp
+      module: ann_benchmarks.algorithms.folding
+      constructor: Folding
+      base-args: ["@metric"]
+      run-groups:
+        folding:
+          args: [[64, 128, 192, 256, 512]]
+
 int:
   jaccard:
     Bruteforce:
diff --git a/ann_benchmarks/.DS_Store b/ann_benchmarks/.DS_Store
index e7f84d2127c9fda3fe25aea7b11ac7da0996b2e6..17dad90de08542a739bdedf6f967dfe8a04df4e5 100644
GIT binary patch
delta 24
fcmZn(XbIRLC&+GYs-s|JW-$4;l;h?!!Fgf;UMmNt

delta 24
fcmZn(XbIRLC&+GLqN8ABYBKq^l;h?!!Fgf;UP%Y3

diff --git a/ann_benchmarks/algorithms/base.py b/ann_benchmarks/algorithms/base.py
index 288564a..4030808 100644
--- a/ann_benchmarks/algorithms/base.py
+++ b/ann_benchmarks/algorithms/base.py
@@ -9,12 +9,24 @@ class BaseANN(object):
         """Returns the size of the index in kB or -1 if not implemented."""
         return psutil.Process().memory_info().rss / 1024  # return in kB for backwards compatibility
 
+    def pre_fit(self, X):
+        """Hook the runner calls before it starts timing the index build; no-op."""
+
     def fit(self, X):
         pass
 
+    def pre_query(self, q, n):
+        """Hook the runner calls before it starts timing query(); no-op."""
+
     def query(self, q, n):
         return [] # array of candidate indices
 
+    def post_query(self, rq=False):
+        """Hook the runner calls after timing query(); returns None by default."""
+
+    def pre_batch_query(self, X, n):
+        """Hook the runner calls before it starts timing batch_query(); no-op."""
+
     def batch_query(self, X, n):
         self.res = []
         for q in X:
diff --git a/ann_benchmarks/algorithms/bruteforce.py b/ann_benchmarks/algorithms/bruteforce.py
index 6c5c2ad..b538f79 100644
--- a/ann_benchmarks/algorithms/bruteforce.py
+++ b/ann_benchmarks/algorithms/bruteforce.py
@@ -4,6 +4,8 @@ import sklearn.neighbors
 from ann_benchmarks.distance import metrics as pd
 from ann_benchmarks.algorithms.base import BaseANN
 from scipy.sparse import issparse
+import chemfp
+from bitarray import bitarray
 
 class BruteForce(BaseANN):
     def __init__(self, metric):
@@ -104,35 +106,43 @@ class BruteForceFPS(BaseANN):
         self._metric = metric
         self.name = 'BruteForceFPS()'
 
-
-    def fit(self, X):
-        self.index = X
-
-    def query(self, v, n):
-        import operator
-        import chemfp
-        dist_dict = {}
-        for j in range(self.index.shape[0]):
-            dist_dict[j] = chemfp.bitops.byte_tanimoto(self._queries.get_fingerprint(0), self._target.get_fingerprint(j))
-        sorted_dict = sorted(dist_dict.items(), key=operator.itemgetter(1), reverse=True)[:n]
-        return [sorted_dict[j][0] for j in range(n)]
-
     @staticmethod
     def matrToArena(X):
-        import chemfp
-        from bitarray import bitarray
         # convert X to Chemfp fingerprintArena in memory
         fps = []
         for row in range(X.shape[0]):
             fp = bitarray(endian='big')
             fp.extend(X[row])
             fps.append((row,fp.tobytes()))
-        return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]),reorder=False)
+        return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=False)
 
     def pre_fit(self, X):
-        self._target = BruteForceFPS.matrToArena(X)
+        self._fps = []
+        for row in range(X.shape[0]):
+            fp = bitarray(endian='big')
+            fp.extend(X[row])
+            self._fps.append((row,fp.tobytes()))
+
+    def fit(self, X):
+        self._target = chemfp.load_fingerprints(self._fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=False)
+        # To ensure that BitBound is not used
+        self._target.popcount_indices = ""
 
 
     def pre_query(self, v, n):
         queryMatr = numpy.array([v])
         self._queries = BruteForceFPS.matrToArena(queryMatr)
+
+    def query(self, v, n, rq=False):
+        if rq:
+            self._results = chemfp.threshold_tanimoto_search(self._queries, self._target, threshold=1.0-n)
+        else:
+            self._results = chemfp.knearest_tanimoto_search(self._queries, self._target, k=n, threshold=0.0)
+
+    def post_query(self, rq=False):
+        """Return the hit ids of the single searched query ([] if no hits)."""
+        # pre_query() builds a one-row arena, so only the first result matters.
+        for _query_id, hits in self._results:
+            return hits.get_ids() if hits else []
+        # No result at all: still hand the runner a list rather than None.
+        return []
diff --git a/ann_benchmarks/algorithms/chemfp.py b/ann_benchmarks/algorithms/chemfp.py
index 3e8a7c2..736c95b 100644
--- a/ann_benchmarks/algorithms/chemfp.py
+++ b/ann_benchmarks/algorithms/chemfp.py
@@ -25,7 +25,14 @@ class Chemfp(BaseANN):
         return chemfp.load_fingerprints(fps,chemfp.Metadata(num_bits=X.shape[1]),reorder=reorder)
 
     def pre_fit(self, X):
-        self._target = Chemfp.matrToArena(X)
+        self._fps = []
+        for row in range(X.shape[0]):
+            fp = bitarray(endian='big')
+            fp.extend(X[row])
+            self._fps.append((row,fp.tobytes()))
+
+    def fit(self, X):
+        self._target = chemfp.load_fingerprints(self._fps,chemfp.Metadata(num_bits=X.shape[1]), reorder=True)
 
 
     def pre_query(self, v, n):
@@ -45,8 +52,9 @@ class Chemfp(BaseANN):
                 return hits.get_ids()
             else:
                 return []
+
     def pre_batch_query(self, X, n):
-        self._queries = Chemfp.matrToArena(X, False)
+        self._queries = Chemfp.matrToArena(X)
 
     def batch_query(self, X, n):
         self._results = chemfp.knearest_tanimoto_search(self._queries, self._target, k=n, threshold=0.0)
@@ -54,10 +62,10 @@ class Chemfp(BaseANN):
     def get_batch_results(self):
         # parse the results
         res = []
-        for (query_id, hits) in self._results:
+        for (query_id, hits) in sorted(self._results, key=lambda pair: pair[0]):
             if hits:
                 res.append(hits.get_ids())
             else:
                 res.append([])
-        print(res)
+        # presumably query ids are the query row numbers, so res is in row order
         return res
diff --git a/ann_benchmarks/algorithms/risc.py b/ann_benchmarks/algorithms/risc.py
index 368233d..ceb3e05 100644
--- a/ann_benchmarks/algorithms/risc.py
+++ b/ann_benchmarks/algorithms/risc.py
@@ -19,6 +19,8 @@ class Risc(BaseANN):
         self.name = method + "()"
 
     def pre_fit(self, X):
+        X = numpy.concatenate((X, [numpy.ones(X.shape[1], dtype=bool)]), axis=0)
+        print(X.shape)
         def matrToStrArray(sparseMatr):
             res = ""
             indptr = sparseMatr.indptr
diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py
index fce0723..ddc15e6 100644
--- a/ann_benchmarks/runner.py
+++ b/ann_benchmarks/runner.py
@@ -34,9 +34,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
         n_items_processed = [0]  # a bit dumb but can't be a scalar since of Python's scoping rules
 
         def single_query(v):
-            # special code for the Risc, DivideSkip, and Chemfp
-            if algoname in ['Risc', 'DivideSkip', 'Chemfp', 'Bruteforce']:
-                algo.pre_query(v, count)
+            algo.pre_query(v, count)
 
             start = time.time()
             if rq:
@@ -45,9 +43,10 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
                 candidates = algo.query(v, count)
             total = (time.time() - start)
 
-            # special code for the Risc, DivideSkip, and Chemfp
-            if algoname in ['Risc', 'DivideSkip', 'Chemfp']:
-                candidates = algo.post_query(rq)
+            # The default post_query() returns None; keep query()'s candidates then.
+            post_candidates = algo.post_query(rq)
+            if post_candidates is not None:
+                candidates = post_candidates
 
             if issparse(X_train):
                 candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0])))
@@ -63,9 +59,7 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
             return (total, candidates)
 
         def batch_query(X):
-            # special code for Chemfp
-            if algoname in ['Chemfp']:
-                algo.pre_batch_query(X, count)
+            algo.pre_batch_query(X, count)
             start = time.time()
             algo.batch_query(X, count)
             total = (time.time() - start)
@@ -127,14 +121,7 @@ function""" % (definition.module, definition.constructor, definition.arguments)
 
     try:
         print(X_train.shape)
-        # special code for Risc and DivideSkip
-        if definition.algorithm in ['Risc', 'DivideSkip']:
-            X_train = numpy.concatenate((X_train, [numpy.ones(X_train.shape[1], dtype=numpy.bool)]), axis=0)
-            print(X_train.shape)
-            algo.pre_fit(X_train)
-        # special code for Chemfp
-        if definition.algorithm in ['Chemfp', 'Bruteforce']:
-            algo.pre_fit(X_train)
+        algo.pre_fit(X_train)
 
         t0 = time.time()
         index_size_before = algo.get_index_size("self")
@@ -305,7 +292,7 @@ def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius
     print('String of command', strCmd)
 
     # Chemfp uses Python2 while others use Python3
-    if definition.algorithm in ['Chemfp', 'Bruteforce']:
+    if definition.algorithm in ['Chemfp', 'Bruteforce', 'Folding']:
         subprocess.check_call('singularity exec %s/%s.sif python run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True)
     else:
         subprocess.check_call('singularity exec %s/%s.sif python3 run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True)