From 0e81a7b14834e1137587e552a0101d455cf1f3e2 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Fri, 19 Jun 2020 19:01:09 -0400
Subject: [PATCH] Add MAP4 fingerprint support

Add MAP4 and molecular topological fingerprint dataset generators, store
molecule IDs alongside SMILES, and teach the nmslib wrapper to serialize
dense integer fingerprints for the jaccard_sparse space.
---
 algos.yaml                          | 126 +++++++++++++-
 ann_benchmarks/algorithms/nmslib.py |  32 +++-
 ann_benchmarks/datasets.py          | 246 +++++++++++++++++++++++-----
 ann_benchmarks/runner.py            |   2 +-
 requirements.txt                    |   1 -
 5 files changed, 345 insertions(+), 54 deletions(-)

diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py
index 47933f5..72f3ebc 100644
--- a/ann_benchmarks/algorithms/nmslib.py
+++ b/ann_benchmarks/algorithms/nmslib.py
@@ -24,9 +24,17 @@ class NmslibReuseIndex(BaseANN):
         arr.sort()
         res.append(' '.join([str(k) for k in arr]))
         return res
+
+    @staticmethod
+    def intMatrToStrArray(intMatr):
+        res = []
+        for row in range(intMatr.shape[0]):
+            res.append(' '.join([str(k) for k in intMatr[row]]))
+        return res
 
-    def __init__(self, metric, method_name, index_param, query_param):
+    def __init__(self, metric, object_type, method_name, index_param, query_param):
         self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric]
+        self._object_type = object_type
         self._method_name = method_name
         self._save_index = False
         self._index_param = NmslibReuseIndex.encode(index_param)
@@ -53,11 +61,13 @@
             # Aborted (core dumped)
             self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))
 
-        # Chunjiang modified it to "if" for jaccard
         if self._nmslib_metric == 'jaccard_sparse':
-            X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
-            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING)
-            self._index.addDataPointBatch(X_trans)
+            if self._object_type == 'Byte':
+                X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X_trans = NmslibReuseIndex.intMatrToStrArray(X)
+            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING)
+            self._index.addDataPointBatch(X_trans)
         else:
             self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
             self._index.addDataPointBatch(X)
@@ -79,9 +89,12 @@
     def query(self, v, n, rq=False):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            nz = numpy.nonzero(v)[0]
-            v = ' '.join([str(k) for k in nz])
-            #print(n)
+            if self._object_type == 'Byte':
+                nz = numpy.nonzero(v)[0]
+                v = ' '.join([str(k) for k in nz])
+            else:
+                v = ' '.join([str(k) for k in v])
+
         if rq:
             ids, distances = self._index.rangeQuery(v, n)
         else:
@@ -91,7 +104,10 @@
     def batch_query(self, X, n):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            if self._object_type == 'Byte':
+                X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X = NmslibReuseIndex.intMatrToStrArray(X)
         self.res = self._index.knnQueryBatch(X, n)
 
     def get_batch_results(self):
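Note: for the jaccard_sparse space, nmslib is initialized with DataType.OBJECT_AS_STRING, so every fingerprint must be handed over as a space-separated string. Binary ('Byte') fingerprints are serialized as their sorted on-bit indices, while dense integer MAP4 fingerprints keep every MinHash value, duplicates included. A minimal standalone sketch of the two serializations on toy data (all names below are illustrative, not part of the patch):

    import numpy
    from scipy.sparse import csr_matrix

    # Toy stand-ins for an ECFP-style bit matrix and a MAP4-style integer matrix.
    bits = numpy.array([[1, 0, 1, 0], [0, 1, 1, 0]], dtype=numpy.bool_)
    ints = numpy.array([[7, 42, 42, 3], [1, 9, 9, 9]])

    # 'Byte' path: each sparse binary row becomes its sorted column indices.
    sparse = csr_matrix(bits)
    byte_objects = []
    for row in range(sparse.shape[0]):
        cols = sorted(sparse.indices[sparse.indptr[row]:sparse.indptr[row + 1]])
        byte_objects.append(' '.join(str(k) for k in cols))

    # Integer path: every value in the row is kept, duplicates included.
    int_objects = [' '.join(str(k) for k in row) for row in ints]

    print(byte_objects)  # ['0 2', '1 2']
    print(int_objects)   # ['7 42 42 3', '1 9 9 9']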
diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
index 68728bf..2ee8551 100644
--- a/ann_benchmarks/datasets.py
+++ b/ann_benchmarks/datasets.py
@@ -42,14 +42,15 @@ def get_dataset(which):
 # Everything below this line is related to creating datasets
 # You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com
 
-def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None):
+def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None, IDS=None):
     from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
     import sklearn.neighbors
     import h5sparse
+    from scipy.sparse import issparse
 
     def replace_last(source_string, replace_what, replace_with):
         head, _sep, tail = source_string.rpartition(replace_what)
         return head + replace_with + tail
 
     # store SMILES first
     if SMILES:
@@ -62,43 +63,59 @@ def write_output(train, test, fn, distance, point_type='float', count=1000, SMIL
         f.close()
         print('Finish.')
 
-    print('Write Dataset %s' % fn)
-    f = h5sparse.File(fn, 'w')
-    f.attrs['distance'] = distance
-    f.attrs['point_type'] = point_type
-    print('train size: %9d * %4d' % train.shape)
-    print('test size: %9d * %4d' % test.shape)
-    f.create_dataset('train',data=train)
-    f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
-    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
-    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
-
-    # use which method to compute the groundtruth
-    train = train.toarray()
-    method = 'bruteforth'
-    if method == 'balltree':
-        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
-    else:
-        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
-        bf.fit(train)
-
-    print(test)
-    for i, x in enumerate(test):
-        if i % 1 == 0:
-            print('%d/%d...' % (i, test.shape[0]))
-        if method == 'balltree':
-            dist, ind = tree.query([x], k=count)
-            neighbors[i] = ind[0]
-            distances[i] = dist[0]
-        else:
-            res = list(bf.query_with_distances(x, count))
-            res.sort(key=lambda t: t[-1])
-            neighbors[i] = [j for j, _ in res]
-            distances[i] = [d for _, d in res]
-        print(neighbors[i])
-        print(distances[i])
-    f.close()
-    print('Finish.')
+    if IDS:
+        ids_fn = replace_last(fn, '.hdf5', '-IDS.hdf5')
+        print('Write IDs to File %s' % ids_fn)
+        f = h5sparse.File(ids_fn, 'w')
+        dt = h5py.special_dtype(vlen=bytes)
+        asciiList = [n.encode("ascii", "ignore") for n in IDS]
+        f.create_dataset('ids', (len(asciiList), 1), dtype=dt, data=asciiList)
+        f.close()
+
+    print('Write Dataset %s' % fn)
+    f = h5sparse.File(fn, 'w')
+    f.attrs['distance'] = distance
+    f.attrs['point_type'] = point_type
+    print('train size: %9d * %4d' % train.shape)
+    print('test size: %9d * %4d' % test.shape)
+    if issparse(train):
+        f.create_dataset('train', data=train)
+    else:
+        f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train
+    if issparse(test):
+        f.create_dataset('test', data=test)
+    else:
+        f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
+    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
+    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
+
+    # use which method to compute the groundtruth
+    if issparse(train):
+        train = train.toarray()
+    method = 'bruteforce'
+    if method == 'balltree':
+        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
+    else:
+        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
+        bf.fit(train)
+
+    print(test)
+    for i, x in enumerate(test):
+        if i % 1 == 0:
+            print('%d/%d...' % (i, test.shape[0]))
+        if method == 'balltree':
+            dist, ind = tree.query([x], k=count)
+            neighbors[i] = ind[0]
+            distances[i] = dist[0]
+        else:
+            res = list(bf.query_with_distances(x, count))
+            res.sort(key=lambda t: t[-1])
+            neighbors[i] = [j for j, _ in res]
+            distances[i] = [d for _, d in res]
+        print(neighbors[i])
+        print(distances[i])
+    f.close()
+    print('Finish.')
 
 
 def train_test_split(X, test_size=10000):
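Note: the ground truth written by write_output is exact. Every test fingerprint is compared against all train rows, and the count nearest candidates are kept in ascending distance order, the same sort-and-slice rule as above. A toy sketch of that rule for the Jaccard case (the real path goes through BruteForceBLAS; jaccard_distance here is a hand-rolled illustration, not the patch's code):

    import numpy

    def jaccard_distance(a, b):
        # 1 - |intersection| / |union| over binary fingerprints
        inter = numpy.logical_and(a, b).sum()
        union = numpy.logical_or(a, b).sum()
        return 1.0 - inter / union if union else 0.0

    train = numpy.array([[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 1, 1]], dtype=bool)
    query = numpy.array([1, 1, 1, 0], dtype=bool)

    # Sort candidates by distance, keep the top count (here count=2).
    res = sorted(((j, jaccard_distance(query, x)) for j, x in enumerate(train)),
                 key=lambda t: t[-1])[:2]
    print([j for j, _ in res])  # [0, 1]
    print([d for _, d in res])  # [0.333..., 0.333...]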
@@ -363,6 +380,7 @@
     from scipy.sparse import csr_matrix
 
     SMILES = []
+    IDS = []
     indptr = [0]
     indices = []
     data = []
@@ -376,6 +394,7 @@
             if mol is None: continue
             smile = Chem.MolToSmiles(mol)
             SMILES.append(smile)
+            IDS.append(mol.GetProp("_Name"))
             fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
             for i in range(dimension):
                 if fp.GetBit(i) is True:
@@ -383,11 +402,11 @@
                     indices.append(i)
                     data.append(1)
             indptr.append(len(indices))
             num_mols += 1
 
     fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
     print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
 
-    return fps, SMILES
+    return fps, SMILES, IDS
 
 def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
@@ -402,11 +421,11 @@
     dir = './data'
 
-    X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
+    X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
 
     # random shuffle fingerprints and smiles at the same time
    seed = 1 # random.randint(0, 2 ** 32 - 1)
-    X, SMILES = shuffle(X, SMILES, random_state=seed)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
 
     # data split and make test data full matrix
     train_size = X.shape[0] - test_size
     X_train = X[:train_size]
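Note: get_sparse_matrix_from_sdf builds its matrix through the raw CSR triplet: each molecule appends its on-bit positions to indices (with matching 1s in data) and closes its row by appending len(indices) to indptr. The same construction on hand-written bit positions, with the lists standing in for the fp.GetBit(i) scan over an RDKit fingerprint:

    import numpy
    from scipy.sparse import csr_matrix

    dimension = 8
    on_bits_per_mol = [[0, 3, 5], [2, 3], [1, 5, 6, 7]]  # stand-ins for fingerprints

    indptr, indices, data = [0], [], []
    for on_bits in on_bits_per_mol:
        indices.extend(on_bits)          # column positions of the set bits
        data.extend([1] * len(on_bits))  # every stored value is 1
        indptr.append(len(indices))      # close this molecule's row

    fps = csr_matrix((data, indices, indptr),
                     shape=(len(on_bits_per_mol), dimension), dtype=numpy.bool_)
    print(fps.toarray().astype(int))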
@@ -417,8 +436,144 @@
     print('Train data dimension: %d*%d' %X_train.shape)
     print('Test data dimension: %d*%d' %X_test.shape)
 
-    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
+# Molecular topological fingerprints
+def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.bool):
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    import glob
+    import gzip
+    from scipy.sparse import csr_matrix
+
+    SMILES = []
+    IDS = []
+    indptr = [0]
+    indices = []
+    data = []
+    num_mols = 0
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    print(file_list)
+    for file in file_list:
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+        for mol in suppl:
+            if mol is None: continue
+            smile = Chem.MolToSmiles(mol)
+            SMILES.append(smile)
+            IDS.append(mol.GetProp("_Name"))
+            fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension)
+            for i in range(dimension):
+                if fp.GetBit(i) is True:
+                    indices.append(i)
+                    data.append(1)
+            indptr.append(len(indices))
+            num_mols += 1
+
+    fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
+    print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
+
+    return fps, SMILES, IDS
+
+def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+    from sklearn.utils import shuffle
+    print('prepare dataset ' + dataset_name)
+
+    if type == 'bit':
+        dtype = numpy.bool
+    elif type == 'int':
+        dtype = numpy.int
+    else:
+        dtype = numpy.float
+
+    dir = './data'
+
+    X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, dtype=dtype)
+
+    # random shuffle fingerprints and smiles at the same time
+    seed = 1 # random.randint(0, 2 ** 32 - 1)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
+
+    # data split and make test data full matrix
+    train_size = X.shape[0] - test_size
+    X_train = X[:train_size]
+    X_test = X[train_size:]
+    X_test = X_test.toarray()
+    print('finish dataset preparation')
+
+    print('Train data dimension: %d*%d' %X_train.shape)
+    print('Test data dimension: %d*%d' %X_test.shape)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
+
+def sdf_2_map4(dir, dimension=1024, dtype=numpy.bool):
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    import glob
+    import gzip
+    from scipy.sparse import csr_matrix
+    from map4 import MAP4Calculator
+
+    MAP4 = MAP4Calculator(dimensions=dimension)
+
+    SMILES = []
+    IDS = []
+    fps = []
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    print(file_list)
+    for file in file_list:
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+        mols = []
+        num_mols = 0
+        for mol in suppl:
+            if mol is None: continue
+            mols.append(mol)
+            SMILES.append(Chem.MolToSmiles(mol))
+            IDS.append(mol.GetProp("_Name"))
+            num_mols += 1
+            # flush the accumulated batch to MAP4 every 3000 molecules
+            if num_mols == 3000:
+                fps.extend(MAP4.calculate_many(mols))
+                mols = []
+                num_mols = 0
+        if num_mols > 0:
+            fps.extend(MAP4.calculate_many(mols))
+            mols = []
+            num_mols = 0
+
+    fps = numpy.array(fps, dtype=dtype)
+    print('The dimension of the returned matrix: %d*%d' % fps.shape)
+
+    return fps, SMILES, IDS
+
+def map4(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+    from sklearn.utils import shuffle
+    print('prepare dataset ' + dataset_name)
+
+    if type == 'bit':
+        dtype = numpy.bool
+    elif type == 'int':
+        dtype = numpy.int
+    else:
+        dtype = numpy.float
+
+    dir = './data'
+
+    X, SMILES, IDS = sdf_2_map4(dir=dir, dimension=dimension, dtype=dtype)
+
+    # random shuffle fingerprints and smiles at the same time
+    seed = 1 # random.randint(0, 2 ** 32 - 1)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
+
+    # data split and make test data full matrix
+    train_size = X.shape[0] - test_size
+    X_train = X[:train_size]
+    X_test = X[train_size:]
+    print('finish dataset preparation')
+
+    print('Train data dimension: %d*%d' %X_train.shape)
+    print('Test data dimension: %d*%d' %X_test.shape)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
 
 
 def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type):
     print('prepare dataset ' + dataset_name)
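Note: sdf_2_map4 hands molecules to MAP4.calculate_many in batches of 3000, with one final flush for the leftover partial batch, presumably to avoid holding every RDKit mol from a large SDF archive in memory at once. The same pattern factored into a generic helper; batched and calculate_many below are illustrative stand-ins, not part of the patch or of the map4 package:

    def batched(iterable, size=3000):
        # Yield lists of at most `size` items, flushing the final partial batch.
        batch = []
        for item in iterable:
            batch.append(item)
            if len(batch) == size:
                yield batch
                batch = []
        if batch:
            yield batch

    def calculate_many(mols):
        # Placeholder for map4.MAP4Calculator.calculate_many.
        return [[len(m)] for m in mols]

    fps = []
    for chunk in batched(['CCO', 'c1ccccc1', 'CC(=O)O'], size=2):
        fps.extend(calculate_many(chunk))
    print(fps)  # [[3], [8], [7]]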
@@ -535,5 +690,7 @@ DATASETS = {
     'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
     'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
     'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'),
-    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit')
+    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
+    'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'),
+    'chembl-1024-jaccard-map4': lambda out_fn: map4(out_fn, 'Chembl', 1024, 'jaccard', 'int')
 }
diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py
index d51cdf5..1d3c3a4 100644
--- a/ann_benchmarks/runner.py
+++ b/ann_benchmarks/runner.py
@@ -113,7 +113,7 @@ function""" % (definition.module, definition.constructor, definition.arguments)
     D = get_dataset(dataset)
     # Chunjiang modified
     print('Is the train set a sparse matrix? %d' % issparse(D['train'][()]))
-    if 'sparse' not in dataset:
+    if issparse(D['train'][()]):
         X_train = D['train'][()].toarray()
     else:
         X_train = D['train'][()]
diff --git a/requirements.txt b/requirements.txt
index 244077a..dc5013c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,5 @@
 ansicolors==1.1.8
 docker==2.6.1
-singularity==3.1.1
 h5py==2.7.1
 matplotlib==2.1.0
 numpy==1.13.3
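Note: the runner.py hunk replaces a heuristic on the dataset name ('sparse' not in dataset) with an inspection of the stored matrix itself, which is what lets the dense integer MAP4 datasets load without a spurious toarray() call. In miniature:

    import numpy
    from scipy.sparse import csr_matrix, issparse

    train = csr_matrix(numpy.eye(3, dtype=bool))  # stand-in for D['train'][()]
    X_train = train.toarray() if issparse(train) else train
    print(type(X_train))  # <class 'numpy.ndarray'>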