From c4c913e1dd52ba884b13f0907901eeb79c3bb9cc Mon Sep 17 00:00:00 2001 From: ChunjiangZhu Date: Tue, 21 Jul 2020 17:29:12 -0400 Subject: [PATCH] add more fingerprint generation functions --- .DS_Store | Bin 10244 -> 10244 bytes ann_benchmarks/datasets.py | 48 +++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/.DS_Store b/.DS_Store index e51d2361ba5c217b11d91dd82a50f93559957cc0..14fe294cba9fa9e06ee5db8cf87b5ee49a3703d9 100644 GIT binary patch delta 25 gcmZn(XbIS$Ey!VLVWOj8WNI?`wUF%Q0KqL{0Aq6ptN;K2 delta 16 XcmZn(XbIS$Ejam(nCIpa!3kmjI35N; diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 33da4a5..02b1c07 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -369,7 +369,7 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool): return fps, SMILES -def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool): +def get_sparse_matrix_from_sdf(dir, dimension = 1024, radius=2, dtype=numpy.bool): from rdkit import Chem from rdkit.Chem import AllChem import glob @@ -392,21 +392,20 @@ def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool): smile = Chem.MolToSmiles(mol) SMILES.append(smile) IDS.append(mol.GetProp("_Name")) - fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension) + fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=dimension) for i in range(dimension): if fp.GetBit(i) is True: indices.append(i) data.append(1) indptr.append(len(indices)) num_mols += 1 - if num_mols > 3000: break fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype) print('The dimension of the returned sparse matrix: %d*%d' % fps.shape) return fps, SMILES, IDS -def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): +def ecfp(out_fn, dataset_name, dimension, radius, distance, type, test_size=1000): from sklearn.utils import shuffle print('prepare dataset ' + dataset_name) @@ -418,8 +417,8 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): dtype = numpy.float dir = './data' - - X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype) + + X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, radius=radius, dtype=dtype) # random shuffle fingerprints and smiles at the same time seed = 1 # random.randint(0, 2 ** 32 - 1) @@ -437,9 +436,8 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS) # Molecular topological fingerprints -def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.bool): +def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, maxPath=7, dtype=numpy.bool): from rdkit import Chem - from rdkit.Chem import AllChem import glob import gzip from scipy.sparse import csr_matrix @@ -460,7 +458,7 @@ def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.b smile = Chem.MolToSmiles(mol) SMILES.append(smile) IDS.append(mol.GetProp("_Name")) - fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension) + fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension, maxPath=maxPath) for i in range(dimension): if fp.GetBit(i) is True: indices.append(i) @@ -473,7 +471,7 @@ def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.b return fps, SMILES, IDS -def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=1000): +def topological_fp(out_fn, dataset_name, dimension, maxPath, distance, type, test_size=1000): from sklearn.utils import shuffle print('prepare dataset ' + dataset_name) @@ -483,10 +481,10 @@ def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=10 dtype = numpy.int else: dtype = numpy.float - + dir = './data' - - X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, dtype=dtype) + + X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, maxPath=maxPath, dtype=dtype) # random shuffle fingerprints and smiles at the same time seed = 1 # random.randint(0, 2 ** 32 - 1) @@ -497,6 +495,7 @@ def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=10 X_train = X[:train_size] X_test = X[train_size:] X_test = X_test.toarray() + print('finish dataset preparation') print('Train data dimension: %d*%d' %X_train.shape) @@ -679,17 +678,14 @@ def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type): 'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'), 'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64), 'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'), - # below are datasets Chunjiang added - 'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'bit', 100), - 'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'), - 'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 'jaccard', 'bit'), - 'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'), - 'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'), - 'chembl-sparse-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K-sparse', 1024, 'jaccard', 'bit'), - 'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'), - 'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), - 'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'), - 'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'), - 'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'), - 'chembl-1024-jaccard-map4': lambda out_fn: map4(out_fn, 'Chembl', 1024, 'jaccard', 'int') + 'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 2, 'jaccard', 'bit'), + 'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 2, 'jaccard', 'bit'), + 'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 7, 'jaccard', 'bit'), + 'chembl-1024-r3-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 3, 'jaccard', 'bit'), + 'chembl-512-r2-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 512, 2, 'jaccard', 'bit'), + 'chembl-2048-r2-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 2048, 2, 'jaccard', 'bit'), + 'chembl-1024-p3-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 3, 'jaccard', 'bit'), + 'chembl-1024-p5-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 5, 'jaccard', 'bit'), + 'chembl-512-p7-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 512, 7, 'jaccard', 'bit'), + 'chembl-2048-p7-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 2048, 7, 'jaccard', 'bit') }