diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..ebf969c
Binary files /dev/null and b/.DS_Store differ
diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
index 0f7b273..d386510 100644
--- a/ann_benchmarks/datasets.py
+++ b/ann_benchmarks/datasets.py
@@ -56,8 +56,9 @@ def write_output(train, test, fn, distance, point_type='float', count=1000, SMIL
         smile_fn = replace_last(fn, '.hdf5', '-SMILES.hdf5')
         print('Write Smiles to File %s' % smile_fn)
         f = h5sparse.File(smile_fn, 'w')
+        dt = h5py.special_dtype(vlen=bytes)
         asciiList = [n.encode("ascii", "ignore") for n in SMILES]
-        f.create_dataset('smile', (len(asciiList), 1), 'S10', asciiList)
+        f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList)
         f.close()
         print('Finish.')
 
@@ -354,13 +355,12 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool):
 
     return fps, SMILES
 
-def get_sparse_matrix_from_sdf(dir, dtype=numpy.bool):
+def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool):
     from rdkit import Chem
     from rdkit.Chem import AllChem
     import glob
     import gzip
     from scipy.sparse import csr_matrix
-    dimension = 1024
 
     SMILES = []
     indptr = [0]
@@ -399,19 +399,15 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
         dtype = numpy.int
     else:
         dtype = numpy.float
-    if dataset_name.startswith('toy'):
-        # toy
-        X, SMILES = get_sparse_matrix_from_txt(dtype=dtype)
+
+    if dataset_name == 'Molport':
+        dir = '/home/cjz18001/Molport'
+    elif dataset_name == 'Chembl':
+        dir = '/home/cjz18001/Chembl'
     else:
-        # others, e.g., Chembl and Molport
-        if dataset_name == 'Molport':
-            dir = '/home/cjz18001/Molport'
-        elif dataset_name == 'Chembl':
-            dir = '/home/cjz18001/Chembl'
-        else:
-            print('unknown dataset')
-            exit(0)
-        X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dtype=dtype)
+        print('unknown dataset')
+        exit(0)
+    X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
 
     # random shuffle fingerprints and smiles at the same time
     seed = 1 # random.randint(0, 2 ** 32 - 1)