From 5e9b4f136e800db05510a781d998772d2fd89773 Mon Sep 17 00:00:00 2001 From: ChunjiangZhu Date: Tue, 28 Apr 2020 14:00:24 -0400 Subject: [PATCH] Fix datasets smile length limit --- .DS_Store | Bin 0 -> 6148 bytes ann_benchmarks/datasets.py | 26 +++++++++++--------------- 2 files changed, 11 insertions(+), 15 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..ebf969c050bf019577fb14d65a3a2354e1309ed6 GIT binary patch literal 6148 zcmeHK&2G~`5T0#K>ljgjKnh1+kT`@YQGp^Mq$rd_Z%9FK0F>HI)L3=w$aWfq5R?n= zP#%Fd;SG2Y9ss`CU6Ad#AT9{OPP9Ag-T7vAzx~S+k?2kbZK666*+{G{2U&%1JEt|- z&^1?~h4hiqF3riKF@X zH;QMy%*&=}*GpdV*HbSW`MUfpj>8{i&wm~aXZ`w($8nT-K{QMxI2a6J^6Etp4dPih zo<@UI`st|#+pgKQe!a0+?6uoX_rbko(_QTEx0>$Vz5B~$&2HRkKRP@b&)+QGF5jI? zMo8ccl){|mQRQgPXk%~>%mqraz!kkB1Oh!7`3k3$NJShLlk+pi46_fO{ffO*Kv>F(veNj_zk(gQyC!RoNu(R!t{f|JmX z51}Iq{X!8kI{drRoJ2=ylN$yM14RZl)n}dW|0fr}|BE2=Wf(9F{8J1F>!5eg#gO#f zx;8kzYh9!xBo?+?DJ>z$^mQx?d==kBl7cpu3(y^{S4yJ>V*Ut78cb#w_@fMb165}f AGXMYp literal 0 HcmV?d00001 diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 0f7b273..d386510 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -56,8 +56,9 @@ def write_output(train, test, fn, distance, point_type='float', count=1000, SMIL smile_fn = replace_last(fn, '.hdf5', '-SMILES.hdf5') print('Write Smiles to File %s' % smile_fn) f = h5sparse.File(smile_fn, 'w') + dt = h5py.special_dtype(vlen=bytes) asciiList = [n.encode("ascii", "ignore") for n in SMILES] - f.create_dataset('smile', (len(asciiList), 1), 'S10', asciiList) + f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList) f.close() print('Finish.') @@ -354,13 +355,12 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool): return fps, SMILES -def get_sparse_matrix_from_sdf(dir, dtype=numpy.bool): +def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool): from rdkit import Chem from rdkit.Chem import AllChem import glob import gzip from scipy.sparse import csr_matrix - dimension = 1024 SMILES = [] indptr = [0] @@ -399,19 +399,15 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): dtype = numpy.int else: dtype = numpy.float - if dataset_name.startswith('toy'): - # toy - X, SMILES = get_sparse_matrix_from_txt(dtype=dtype) + + if dataset_name == 'Molport': + dir = '/home/cjz18001/Molport' + elif dataset_name == 'Chembl': + dir = '/home/cjz18001/Chembl' else: - # others, e.g., Chembl and Molport - if dataset_name == 'Molport': - dir = '/home/cjz18001/Molport' - elif dataset_name == 'Chembl': - dir = '/home/cjz18001/Chembl' - else: - print('unknown dataset') - exit(0) - X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dtype=dtype) + print('unknown dataset') + exit(0) + X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype) # random shuffle fingerprints and smiles at the same time seed = 1 # random.randint(0, 2 ** 32 - 1)