From 0e81a7b14834e1137587e552a0101d455cf1f3e2 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Fri, 19 Jun 2020 19:01:09 -0400
Subject: [PATCH] Add MAP4 fingerprint support

Add MAP4 and molecular topological fingerprint dataset generators, store
molecule IDs alongside SMILES, and teach the nmslib wrapper to serialize
dense integer fingerprints for the jaccard_sparse space.
---
 algos.yaml                          | 126 +++++++++++++-
 ann_benchmarks/algorithms/nmslib.py |  32 +++-
 ann_benchmarks/datasets.py          | 246 +++++++++++++++++++++++-----
 ann_benchmarks/runner.py            |   2 +-
 requirements.txt                    |   1 -
 5 files changed, 345 insertions(+), 54 deletions(-)

diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py
index 47933f5..72f3ebc 100644
--- a/ann_benchmarks/algorithms/nmslib.py
+++ b/ann_benchmarks/algorithms/nmslib.py
@@ -24,9 +24,17 @@ class NmslibReuseIndex(BaseANN):
         arr.sort()
         res.append(' '.join([str(k) for k in arr]))
         return res
+
+    @staticmethod
+    def intMatrToStrArray(intMatr):
+        res = []
+        for row in range(intMatr.shape[0]):
+            res.append(' '.join([str(k) for k in intMatr[row]]))
+        return res
 
-    def __init__(self, metric, method_name, index_param, query_param):
+    def __init__(self, metric, object_type, method_name, index_param, query_param):
         self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric]
+        self._object_type = object_type
         self._method_name = method_name
         self._save_index = False
         self._index_param = NmslibReuseIndex.encode(index_param)
@@ -53,11 +61,13 @@
             # Aborted (core dumped)
             self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))
 
-        # Chunjiang modified it to "if" for jaccard
         if self._nmslib_metric == 'jaccard_sparse':
-            X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
-            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING)
-            self._index.addDataPointBatch(X_trans)
+            if self._object_type == 'Byte':
+                X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X_trans = NmslibReuseIndex.intMatrToStrArray(X)
+            self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING)
+            self._index.addDataPointBatch(X_trans)
         else:
             self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
             self._index.addDataPointBatch(X)
@@ -79,9 +89,12 @@
     def query(self, v, n, rq=False):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            nz = numpy.nonzero(v)[0]
-            v = ' '.join([str(k) for k in nz])
-            #print(n)
+            if self._object_type == 'Byte':
+                nz = numpy.nonzero(v)[0]
+                v = ' '.join([str(k) for k in nz])
+            else:
+                v = ' '.join([str(k) for k in v])
+
         if rq:
             ids, distances = self._index.rangeQuery(v, n)
         else:
@@ -91,7 +104,10 @@
     def batch_query(self, X, n):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
-            X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            if self._object_type == 'Byte':
+                X = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
+            else:
+                X = NmslibReuseIndex.intMatrToStrArray(X)
         self.res = self._index.knnQueryBatch(X, n)
 
     def get_batch_results(self):
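Note: for the jaccard_sparse space, nmslib is initialized with DataType.OBJECT_AS_STRING, so every fingerprint must be handed over as a space-separated string. Binary ('Byte') fingerprints are serialized as their sorted on-bit indices, while dense integer MAP4 fingerprints keep every MinHash value, duplicates included. A minimal standalone sketch of the two serializations on toy data (all names below are illustrative, not part of the patch):

    import numpy
    from scipy.sparse import csr_matrix

    # Toy stand-ins for an ECFP-style bit matrix and a MAP4-style integer matrix.
    bits = numpy.array([[1, 0, 1, 0], [0, 1, 1, 0]], dtype=numpy.bool_)
    ints = numpy.array([[7, 42, 42, 3], [1, 9, 9, 9]])

    # 'Byte' path: each sparse binary row becomes its sorted column indices.
    sparse = csr_matrix(bits)
    byte_objects = []
    for row in range(sparse.shape[0]):
        cols = sorted(sparse.indices[sparse.indptr[row]:sparse.indptr[row + 1]])
        byte_objects.append(' '.join(str(k) for k in cols))

    # Integer path: every value in the row is kept, duplicates included.
    int_objects = [' '.join(str(k) for k in row) for row in ints]

    print(byte_objects)  # ['0 2', '1 2']
    print(int_objects)   # ['7 42 42 3', '1 9 9 9']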
diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
index 68728bf..2ee8551 100644
--- a/ann_benchmarks/datasets.py
+++ b/ann_benchmarks/datasets.py
@@ -42,14 +42,15 @@ def get_dataset(which):
 # Everything below this line is related to creating datasets
 # You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com
 
-def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None):
+def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None, IDS=None):
     from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
     import sklearn.neighbors
     import h5sparse
+    from scipy.sparse import issparse
 
     def replace_last(source_string, replace_what, replace_with):
         head, _sep, tail = source_string.rpartition(replace_what)
         return head + replace_with + tail
 
     # store SMILES first
     if SMILES:
@@ -62,43 +63,59 @@ def write_output(train, test, fn, distance, point_type='float', count=1000, SMIL
         f.close()
         print('Finish.')
 
-    print('Write Dataset %s' % fn)
-    f = h5sparse.File(fn, 'w')
-    f.attrs['distance'] = distance
-    f.attrs['point_type'] = point_type
-    print('train size: %9d * %4d' % train.shape)
-    print('test size: %9d * %4d' % test.shape)
-    f.create_dataset('train',data=train)
-    f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
-    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
-    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
-
-    # use which method to compute the groundtruth
-    train = train.toarray()
-    method = 'bruteforth'
-    if method == 'balltree':
-        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
-    else:
-        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
-        bf.fit(train)
-
-    print(test)
-    for i, x in enumerate(test):
-        if i % 1 == 0:
-            print('%d/%d...' % (i, test.shape[0]))
-        if method == 'balltree':
-            dist, ind = tree.query([x], k=count)
-            neighbors[i] = ind[0]
-            distances[i] = dist[0]
-        else:
-            res = list(bf.query_with_distances(x, count))
-            res.sort(key=lambda t: t[-1])
-            neighbors[i] = [j for j, _ in res]
-            distances[i] = [d for _, d in res]
-        print(neighbors[i])
-        print(distances[i])
-    f.close()
-    print('Finish.')
+    if IDS:
+        ids_fn = replace_last(fn, '.hdf5', '-IDS.hdf5')
+        print('Write IDs to File %s' % ids_fn)
+        f = h5sparse.File(ids_fn, 'w')
+        dt = h5py.special_dtype(vlen=bytes)
+        asciiList = [n.encode("ascii", "ignore") for n in IDS]
+        f.create_dataset('ids', (len(asciiList), 1), dtype=dt, data=asciiList)
+        f.close()
+
+    print('Write Dataset %s' % fn)
+    f = h5sparse.File(fn, 'w')
+    f.attrs['distance'] = distance
+    f.attrs['point_type'] = point_type
+    print('train size: %9d * %4d' % train.shape)
+    print('test size: %9d * %4d' % test.shape)
+    if issparse(train):
+        f.create_dataset('train', data=train)
+    else:
+        f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train
+    if issparse(test):
+        f.create_dataset('test', data=test)
+    else:
+        f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
+    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
+    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
+
+    # use which method to compute the groundtruth
+    if issparse(train):
+        train = train.toarray()
+    method = 'bruteforce'
+    if method == 'balltree':
+        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
+    else:
+        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
+        bf.fit(train)
+
+    print(test)
+    for i, x in enumerate(test):
+        if i % 1 == 0:
+            print('%d/%d...' % (i, test.shape[0]))
+        if method == 'balltree':
+            dist, ind = tree.query([x], k=count)
+            neighbors[i] = ind[0]
+            distances[i] = dist[0]
+        else:
+            res = list(bf.query_with_distances(x, count))
+            res.sort(key=lambda t: t[-1])
+            neighbors[i] = [j for j, _ in res]
+            distances[i] = [d for _, d in res]
+        print(neighbors[i])
+        print(distances[i])
+    f.close()
+    print('Finish.')
 
 
 def train_test_split(X, test_size=10000):
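Note: the ground truth written by write_output is exact. Every test fingerprint is compared against all train rows, and the count nearest candidates are kept in ascending distance order, the same sort-and-slice rule as above. A toy sketch of that rule for the Jaccard case (the real path goes through BruteForceBLAS; jaccard_distance here is a hand-rolled illustration, not the patch's code):

    import numpy

    def jaccard_distance(a, b):
        # 1 - |intersection| / |union| over binary fingerprints
        inter = numpy.logical_and(a, b).sum()
        union = numpy.logical_or(a, b).sum()
        return 1.0 - inter / union if union else 0.0

    train = numpy.array([[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 1, 1]], dtype=bool)
    query = numpy.array([1, 1, 1, 0], dtype=bool)

    # Sort candidates by distance, keep the top count (here count=2).
    res = sorted(((j, jaccard_distance(query, x)) for j, x in enumerate(train)),
                 key=lambda t: t[-1])[:2]
    print([j for j, _ in res])  # [0, 1]
    print([d for _, d in res])  # [0.333..., 0.333...]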
@@ -363,6 +380,7 @@
     from scipy.sparse import csr_matrix
 
     SMILES = []
+    IDS = []
     indptr = [0]
     indices = []
     data = []
@@ -376,6 +394,7 @@
             if mol is None: continue
             smile = Chem.MolToSmiles(mol)
             SMILES.append(smile)
+            IDS.append(mol.GetProp("_Name"))
             fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
             for i in range(dimension):
                 if fp.GetBit(i) is True:
@@ -383,11 +402,11 @@
                     indices.append(i)
                     data.append(1)
             indptr.append(len(indices))
             num_mols += 1
 
     fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
     print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
 
-    return fps, SMILES
+    return fps, SMILES, IDS
 
 def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
@@ -402,11 +421,11 @@
     dir = './data'
 
-    X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
+    X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
 
     # random shuffle fingerprints and smiles at the same time
    seed = 1 # random.randint(0, 2 ** 32 - 1)
-    X, SMILES = shuffle(X, SMILES, random_state=seed)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
 
     # data split and make test data full matrix
     train_size = X.shape[0] - test_size
     X_train = X[:train_size]
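Note: get_sparse_matrix_from_sdf builds its matrix through the raw CSR triplet: each molecule appends its on-bit positions to indices (with matching 1s in data) and closes its row by appending len(indices) to indptr. The same construction on hand-written bit positions, with the lists standing in for the fp.GetBit(i) scan over an RDKit fingerprint:

    import numpy
    from scipy.sparse import csr_matrix

    dimension = 8
    on_bits_per_mol = [[0, 3, 5], [2, 3], [1, 5, 6, 7]]  # stand-ins for fingerprints

    indptr, indices, data = [0], [], []
    for on_bits in on_bits_per_mol:
        indices.extend(on_bits)          # column positions of the set bits
        data.extend([1] * len(on_bits))  # every stored value is 1
        indptr.append(len(indices))      # close this molecule's row

    fps = csr_matrix((data, indices, indptr),
                     shape=(len(on_bits_per_mol), dimension), dtype=numpy.bool_)
    print(fps.toarray().astype(int))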
@@ -417,8 +436,144 @@
     print('Train data dimension: %d*%d' %X_train.shape)
     print('Test data dimension: %d*%d' %X_test.shape)
 
-    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
+# Molecular topological fingerprints
+def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.bool):
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    import glob
+    import gzip
+    from scipy.sparse import csr_matrix
+
+    SMILES = []
+    IDS = []
+    indptr = [0]
+    indices = []
+    data = []
+    num_mols = 0
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    print(file_list)
+    for file in file_list:
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+        for mol in suppl:
+            if mol is None: continue
+            smile = Chem.MolToSmiles(mol)
+            SMILES.append(smile)
+            IDS.append(mol.GetProp("_Name"))
+            fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension)
+            for i in range(dimension):
+                if fp.GetBit(i) is True:
+                    indices.append(i)
+                    data.append(1)
+            indptr.append(len(indices))
+            num_mols += 1
+
+    fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
+    print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
+
+    return fps, SMILES, IDS
+
+def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+    from sklearn.utils import shuffle
+    print('prepare dataset ' + dataset_name)
+
+    if type == 'bit':
+        dtype = numpy.bool
+    elif type == 'int':
+        dtype = numpy.int
+    else:
+        dtype = numpy.float
+
+    dir = './data'
+
+    X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, dtype=dtype)
+
+    # random shuffle fingerprints and smiles at the same time
+    seed = 1 # random.randint(0, 2 ** 32 - 1)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
+
+    # data split and make test data full matrix
+    train_size = X.shape[0] - test_size
+    X_train = X[:train_size]
+    X_test = X[train_size:]
+    X_test = X_test.toarray()
+    print('finish dataset preparation')
+
+    print('Train data dimension: %d*%d' %X_train.shape)
+    print('Test data dimension: %d*%d' %X_test.shape)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
+
+def sdf_2_map4(dir, dimension=1024, dtype=numpy.bool):
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    import glob
+    import gzip
+    from scipy.sparse import csr_matrix
+    from map4 import MAP4Calculator
+
+    MAP4 = MAP4Calculator(dimensions=dimension)
+
+    SMILES = []
+    IDS = []
+    fps = []
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    print(file_list)
+    for file in file_list:
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+        mols = []
+        num_mols = 0
+        for mol in suppl:
+            if mol is None: continue
+            mols.append(mol)
+            SMILES.append(Chem.MolToSmiles(mol))
+            IDS.append(mol.GetProp("_Name"))
+            num_mols += 1
+            # flush the accumulated batch to MAP4 every 3000 molecules
+            if num_mols == 3000:
+                fps.extend(MAP4.calculate_many(mols))
+                mols = []
+                num_mols = 0
+        if num_mols > 0:
+            fps.extend(MAP4.calculate_many(mols))
+            mols = []
+            num_mols = 0
+
+    fps = numpy.array(fps, dtype=dtype)
+    print('The dimension of the returned matrix: %d*%d' % fps.shape)
+
+    return fps, SMILES, IDS
+
+def map4(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+    from sklearn.utils import shuffle
+    print('prepare dataset ' + dataset_name)
+
+    if type == 'bit':
+        dtype = numpy.bool
+    elif type == 'int':
+        dtype = numpy.int
+    else:
+        dtype = numpy.float
+
+    dir = './data'
+
+    X, SMILES, IDS = sdf_2_map4(dir=dir, dimension=dimension, dtype=dtype)
+
+    # random shuffle fingerprints and smiles at the same time
+    seed = 1 # random.randint(0, 2 ** 32 - 1)
+    X, SMILES, IDS = shuffle(X, SMILES, IDS, random_state=seed)
+
+    # data split and make test data full matrix
+    train_size = X.shape[0] - test_size
+    X_train = X[:train_size]
+    X_test = X[train_size:]
+    print('finish dataset preparation')
+
+    print('Train data dimension: %d*%d' %X_train.shape)
+    print('Test data dimension: %d*%d' %X_test.shape)
+    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
 
 
 def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type):
     print('prepare dataset ' + dataset_name)
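Note: sdf_2_map4 hands molecules to MAP4.calculate_many in batches of 3000, with one final flush for the leftover partial batch, presumably to avoid holding every RDKit mol from a large SDF archive in memory at once. The same pattern factored into a generic helper; batched and calculate_many below are illustrative stand-ins, not part of the patch or of the map4 package:

    def batched(iterable, size=3000):
        # Yield lists of at most `size` items, flushing the final partial batch.
        batch = []
        for item in iterable:
            batch.append(item)
            if len(batch) == size:
                yield batch
                batch = []
        if batch:
            yield batch

    def calculate_many(mols):
        # Placeholder for map4.MAP4Calculator.calculate_many.
        return [[len(m)] for m in mols]

    fps = []
    for chunk in batched(['CCO', 'c1ccccc1', 'CC(=O)O'], size=2):
        fps.extend(calculate_many(chunk))
    print(fps)  # [[3], [8], [7]]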
@@ -535,5 +690,7 @@ DATASETS = {
     'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
     'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
     'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'),
-    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit')
+    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
+    'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'),
+    'chembl-1024-jaccard-map4': lambda out_fn: map4(out_fn, 'Chembl', 1024, 'jaccard', 'int')
 }
diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py
index d51cdf5..1d3c3a4 100644
--- a/ann_benchmarks/runner.py
+++ b/ann_benchmarks/runner.py
@@ -113,7 +113,7 @@ function""" % (definition.module, definition.constructor, definition.arguments)
     D = get_dataset(dataset)
     # Chunjiang modified
     print('Is the train set a sparse matrix? %d' % issparse(D['train'][()]))
-    if 'sparse' not in dataset:
+    if issparse(D['train'][()]):
         X_train = D['train'][()].toarray()
     else:
         X_train = D['train'][()]
diff --git a/requirements.txt b/requirements.txt
index 244077a..dc5013c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,5 @@
 ansicolors==1.1.8
 docker==2.6.1
-singularity==3.1.1
 h5py==2.7.1
 matplotlib==2.1.0
 numpy==1.13.3
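Note: the runner.py hunk replaces a heuristic on the dataset name ('sparse' not in dataset) with an inspection of the stored matrix itself, which is what lets the dense integer MAP4 datasets load without a spurious toarray() call. In miniature:

    import numpy
    from scipy.sparse import csr_matrix, issparse

    train = csr_matrix(numpy.eye(3, dtype=bool))  # stand-in for D['train'][()]
    X_train = train.toarray() if issparse(train) else train
    print(type(X_train))  # <class 'numpy.ndarray'>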