Skip to content

Commit

Permalink
add more fingerprint generation functions
Browse files Browse the repository at this point in the history
  • Loading branch information
ChunjiangZhu committed Jul 21, 2020
1 parent 37e1c03 commit c4c913e
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 26 deletions.
Binary file modified .DS_Store
Binary file not shown.
48 changes: 22 additions & 26 deletions ann_benchmarks/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool):

return fps, SMILES

def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool):
def get_sparse_matrix_from_sdf(dir, dimension = 1024, radius=2, dtype=numpy.bool):
from rdkit import Chem
from rdkit.Chem import AllChem
import glob
Expand All @@ -392,21 +392,20 @@ def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool):
smile = Chem.MolToSmiles(mol)
SMILES.append(smile)
IDS.append(mol.GetProp("_Name"))
fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=dimension)
for i in range(dimension):
if fp.GetBit(i) is True:
indices.append(i)
data.append(1)
indptr.append(len(indices))
num_mols += 1
if num_mols > 3000: break

fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)

return fps, SMILES, IDS

def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
def ecfp(out_fn, dataset_name, dimension, radius, distance, type, test_size=1000):
from sklearn.utils import shuffle
print('prepare dataset ' + dataset_name)

Expand All @@ -418,8 +417,8 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
dtype = numpy.float

dir = './data'
X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)

X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, radius=radius, dtype=dtype)

# randomly shuffle the fingerprints and SMILES together
seed = 1 # random.randint(0, 2 ** 32 - 1)
Expand All @@ -437,9 +436,8 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)

# Molecular topological fingerprints
def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.bool):
def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, maxPath=7, dtype=numpy.bool):
from rdkit import Chem
from rdkit.Chem import AllChem
import glob
import gzip
from scipy.sparse import csr_matrix
Expand All @@ -460,7 +458,7 @@ def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.b
smile = Chem.MolToSmiles(mol)
SMILES.append(smile)
IDS.append(mol.GetProp("_Name"))
fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension)
fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension, maxPath=maxPath)
for i in range(dimension):
if fp.GetBit(i) is True:
indices.append(i)
Expand All @@ -473,7 +471,7 @@ def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.b

return fps, SMILES, IDS

def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
def topological_fp(out_fn, dataset_name, dimension, maxPath, distance, type, test_size=1000):
from sklearn.utils import shuffle
print('prepare dataset ' + dataset_name)

Expand All @@ -483,10 +481,10 @@ def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=10
dtype = numpy.int
else:
dtype = numpy.float

dir = './data'
X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, dtype=dtype)

X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, maxPath=maxPath, dtype=dtype)

# randomly shuffle the fingerprints and SMILES together
seed = 1 # random.randint(0, 2 ** 32 - 1)
Expand All @@ -497,6 +495,7 @@ def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=10
X_train = X[:train_size]
X_test = X[train_size:]
X_test = X_test.toarray()

print('finish dataset preparation')

print('Train data dimension: %d*%d' %X_train.shape)
Expand Down Expand Up @@ -679,17 +678,14 @@ def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type):
'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'),
# the datasets below were added by Chunjiang
'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'bit', 100),
'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'),
'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 'jaccard', 'bit'),
'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'),
'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'),
'chembl-sparse-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K-sparse', 1024, 'jaccard', 'bit'),
'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'),
'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'),
'chembl-1024-jaccard-map4': lambda out_fn: map4(out_fn, 'Chembl', 1024, 'jaccard', 'int')
'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 2, 'jaccard', 'bit'),
'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 2, 'jaccard', 'bit'),
'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 7, 'jaccard', 'bit'),
'chembl-1024-r3-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 3, 'jaccard', 'bit'),
'chembl-512-r2-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 512, 2, 'jaccard', 'bit'),
'chembl-2048-r2-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 2048, 2, 'jaccard', 'bit'),
'chembl-1024-p3-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 3, 'jaccard', 'bit'),
'chembl-1024-p5-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 5, 'jaccard', 'bit'),
'chembl-512-p7-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 512, 7, 'jaccard', 'bit'),
'chembl-2048-p7-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 2048, 7, 'jaccard', 'bit')
}

0 comments on commit c4c913e

Please sign in to comment.