def get_hdf5_from_tfp(csv, ofn, dimension=1024, dtype=numpy.bool_):
    """Fingerprint every molecule listed in a CSV file and save them as HDF5.

    Each non-blank CSV row is read as ``(id, smiles)``. The SMILES string is
    parsed with RDKit, an RDKit topological fingerprint of ``dimension`` bits
    is computed, and the on-bits of all molecules are assembled into one CSR
    matrix (one row per CSV row, in file order). The matrix is written to
    ``<ofn>.hdf5`` under the dataset name ``mol_fps``.

    Args:
        csv: Path of the input CSV file. (The name shadows the ``csv`` module
            inside this function; it is kept for backward compatibility.)
        ofn: Output file name without extension; ``.hdf5`` is appended.
        dimension: Number of bits per fingerprint (RDKit ``fpSize``).
        dtype: dtype of the stored sparse matrix. The default is
            ``numpy.bool_``; the former ``numpy.bool`` alias was removed in
            NumPy 1.24 and made this module fail at import time.

    Returns:
        The ``scipy.sparse.csr_matrix`` that was written, of shape
        ``(number of molecules, dimension)``.

    Raises:
        ValueError: If a row has fewer than two columns or its SMILES string
            cannot be parsed by RDKit. Failing loudly keeps matrix rows
            aligned with CSV rows instead of silently dropping a molecule.
    """
    indptr = [0]
    indices = []

    # newline='' is what the csv module expects for correct quoted-newline
    # handling.
    with open(csv, 'r', newline='') as mols:
        rows = reader(mols)
        for row in rows:
            if not row:
                continue  # tolerate blank lines (e.g. a trailing newline)
            if len(row) < 2:
                raise ValueError(
                    'line %d of %s: expected "id,smiles", got %r'
                    % (rows.line_num, csv, row))

            m = Chem.MolFromSmiles(row[1])
            if m is None:
                # MolFromSmiles signals a parse failure by returning None.
                raise ValueError(
                    'line %d of %s: cannot parse SMILES %r (id %r)'
                    % (rows.line_num, csv, row[1], row[0]))

            fp = Chem.rdmolops.RDKFingerprint(m, fpSize=dimension)

            # GetOnBits() yields the indices of the set bits in ascending
            # order, replacing a Python-level scan over every bit position.
            indices.extend(fp.GetOnBits())
            indptr.append(len(indices))

    num_mols = len(indptr) - 1
    data = numpy.ones(len(indices), dtype=dtype)

    mol_fps = csr_matrix((data, indices, indptr),
                         shape=(num_mols, dimension), dtype=dtype)
    print('the dimension of the returned sparse matrix: %d*%d' % mol_fps.shape)

    # The context manager closes the file even if create_dataset raises.
    with h5sparse.File(ofn + '.hdf5', 'w') as f:
        f.create_dataset('mol_fps', data=mol_fps)

    return mol_fps


def main(argv=None):
    """Command-line entry point: parse options and run get_hdf5_from_tfp.

    Args:
        argv: Optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]`` as before.
    """
    parser = argparse.ArgumentParser(
        description='Convert a CSV of (id, smiles) rows into a sparse '
                    'fingerprint matrix stored as HDF5.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--csv',
        type=str,
        required=True,
        help='input CSV file with one "id,smiles" row per molecule')
    parser.add_argument(
        '--ofn',
        type=str,
        required=True,
        help='output file name without extension (".hdf5" is appended)')
    parser.add_argument(
        '--dim',
        type=int,
        default=1024,
        help='number of bits in each fingerprint')
    parser.add_argument(
        '--dtype',
        # numpy.dtype turns e.g. "float32" into a dtype and makes argparse
        # reject unknown names up front instead of failing deep inside scipy.
        type=numpy.dtype,
        default=numpy.bool_,
        help='dtype of the stored sparse matrix')

    args = parser.parse_args(argv)

    get_hdf5_from_tfp(args.csv, args.ofn, args.dim, args.dtype)


if __name__ == "__main__":
    main()
def _iter_id_smiles(sdf_gz_path):
    """Yield ``[ID, SMILES]`` for each parsable molecule in a gzipped SDF."""
    # The with-block closes the gzip handle once the file is exhausted.
    with gzip.open(sdf_gz_path) as inf:
        for mol in Chem.ForwardSDMolSupplier(inf):
            if mol is None:
                continue  # the supplier yields None for unreadable records
            yield [mol.GetProp("_Name"), Chem.MolToSmiles(mol)]


def _write_partition(path, rows):
    """Write *rows* (a list of ``[ID, SMILES]``) to *path* as CSV."""
    with open(path, 'w', newline='') as out:
        csv.writer(out).writerows(rows)


def get_partitioned_csv_files(dir, T, name="set", dest="./"):
    """Split the molecules of one or more .sdf.gz files into CSV partitions.

    Molecules are read in order from every ``*.sdf.gz`` file found and
    written as ``(id, smiles)`` rows into files of at most ``T`` rows each,
    named ``<dest>/<name>-part<k>.csv`` with ``k`` counting from 0. With N
    readable molecules this produces ceil(N/T) files.

    Args:
        dir: Directory containing the ``.sdf.gz`` files, or the path of a
            single ``.sdf.gz`` file. (The name shadows the builtin; it is
            kept for backward compatibility.)
        T: Maximum number of molecules per CSV file; must be >= 1.
        name: Prefix of the partition files, e.g. ``name="Chembl26"`` gives
            ``Chembl26-part0.csv``, ``Chembl26-part1.csv``, ...
        dest: Directory the partition files are written to; created if it
            does not exist.

    Returns:
        The number of CSV files written.

    Raises:
        ValueError: If ``T`` is smaller than 1.
    """
    import os  # local import: the module-level import block is left as is

    if T < 1:
        raise ValueError("T must be a positive integer, got %r" % (T,))

    if os.path.isfile(dir):
        # The original documentation asked for the file path itself; accept
        # that form instead of silently finding nothing.
        file_list = [dir]
    else:
        # Sorted so partition contents are reproducible; glob order is
        # arbitrary.
        file_list = sorted(glob.glob(dir + '/*.sdf.gz'))

    def flush(rows, index):
        # Create the destination lazily so an empty input leaves no trace.
        os.makedirs(dest or ".", exist_ok=True)
        _write_partition(
            os.path.join(dest, name + "-part" + str(index) + ".csv"), rows)

    fnum = 0
    cur_subset = []

    for sdf_path in file_list:
        for record in _iter_id_smiles(sdf_path):
            cur_subset.append(record)
            if len(cur_subset) >= T:
                flush(cur_subset, fnum)
                fnum += 1
                cur_subset = []

    if cur_subset:
        flush(cur_subset, fnum)
        # Count the final partial file too, so the result is the number of
        # files written rather than the index of the last one.
        fnum += 1

    return fnum


def main(argv=None):
    """Command-line entry point: parse options and partition the input.

    Args:
        argv: Optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]`` as before.
    """
    parser = argparse.ArgumentParser(
        description='Split .sdf.gz molecule files into CSV files of '
                    '(id, smiles) rows.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--dir',
        required=True,
        help='directory containing the .sdf.gz files, '
             'or the path of a single .sdf.gz file')
    parser.add_argument(
        '--T',
        type=int,
        required=True,
        help='each subset will hold at most T molecules')
    parser.add_argument(
        '--name',
        help='name for the partitioned files',
        default='set')
    parser.add_argument(
        '--dest',
        help='where the partitioned files will be',
        default='./')

    args = parser.parse_args(argv)
    get_partitioned_csv_files(args.dir, args.T, args.name, args.dest)


if __name__ == "__main__":
    main()