Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
Fix datasets SMILES length limit (store SMILES as variable-length byte strings instead of the fixed 'S10' dtype)
  • Loading branch information
ChunjiangZhu committed Apr 28, 2020
1 parent 5d6569d commit 5e9b4f1
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 15 deletions.
Binary file added .DS_Store
Binary file not shown.
26 changes: 11 additions & 15 deletions ann_benchmarks/datasets.py
Expand Up @@ -56,8 +56,9 @@ def write_output(train, test, fn, distance, point_type='float', count=1000, SMIL
smile_fn = replace_last(fn, '.hdf5', '-SMILES.hdf5')
print('Write Smiles to File %s' % smile_fn)
f = h5sparse.File(smile_fn, 'w')
dt = h5py.special_dtype(vlen=bytes)
asciiList = [n.encode("ascii", "ignore") for n in SMILES]
f.create_dataset('smile', (len(asciiList), 1), 'S10', asciiList)
f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList)
f.close()
print('Finish.')

Expand Down Expand Up @@ -354,13 +355,12 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool):

return fps, SMILES

def get_sparse_matrix_from_sdf(dir, dtype=numpy.bool):
def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool):
from rdkit import Chem
from rdkit.Chem import AllChem
import glob
import gzip
from scipy.sparse import csr_matrix
dimension = 1024

SMILES = []
indptr = [0]
Expand Down Expand Up @@ -399,19 +399,15 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
dtype = numpy.int
else:
dtype = numpy.float
if dataset_name.startswith('toy'):
# toy
X, SMILES = get_sparse_matrix_from_txt(dtype=dtype)

if dataset_name == 'Molport':
dir = '/home/cjz18001/Molport'
elif dataset_name == 'Chembl':
dir = '/home/cjz18001/Chembl'
else:
# others, e.g., Chembl and Molport
if dataset_name == 'Molport':
dir = '/home/cjz18001/Molport'
elif dataset_name == 'Chembl':
dir = '/home/cjz18001/Chembl'
else:
print('unknown dataset')
exit(0)
X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dtype=dtype)
print('unknown dataset')
exit(0)
X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)

# random shuffle fingerprints and smiles at the same time
seed = 1 # random.randint(0, 2 ** 32 - 1)
Expand Down

0 comments on commit 5e9b4f1

Please sign in to comment.