diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..ebf969c Binary files /dev/null and b/.DS_Store differ diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 0f7b273..d386510 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -56,8 +56,9 @@ def write_output(train, test, fn, distance, point_type='float', count=1000, SMIL smile_fn = replace_last(fn, '.hdf5', '-SMILES.hdf5') print('Write Smiles to File %s' % smile_fn) f = h5sparse.File(smile_fn, 'w') + dt = h5py.special_dtype(vlen=bytes) asciiList = [n.encode("ascii", "ignore") for n in SMILES] - f.create_dataset('smile', (len(asciiList), 1), 'S10', asciiList) + f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList) f.close() print('Finish.') @@ -354,13 +355,12 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool): return fps, SMILES -def get_sparse_matrix_from_sdf(dir, dtype=numpy.bool): +def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool): from rdkit import Chem from rdkit.Chem import AllChem import glob import gzip from scipy.sparse import csr_matrix - dimension = 1024 SMILES = [] indptr = [0] @@ -399,19 +399,15 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): dtype = numpy.int else: dtype = numpy.float - if dataset_name.startswith('toy'): - # toy - X, SMILES = get_sparse_matrix_from_txt(dtype=dtype) + + if dataset_name == 'Molport': + dir = '/home/cjz18001/Molport' + elif dataset_name == 'Chembl': + dir = '/home/cjz18001/Chembl' else: - # others, e.g., Chembl and Molport - if dataset_name == 'Molport': - dir = '/home/cjz18001/Molport' - elif dataset_name == 'Chembl': - dir = '/home/cjz18001/Chembl' - else: - print('unknown dataset') - exit(0) - X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dtype=dtype) + print('unknown dataset') + exit(0) + X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype) # random shuffle fingerprints and smiles at the same time seed = 1 # random.randint(0, 2 ** 32 - 1)