# partitioner_program.py

import csv
import argparse
from rdkit import Chem
from rdkit.Chem import AllChem
import glob
import gzip
from scipy.sparse import csr_matrix

#PART 1

"""
get_partitioned_csv_files takes a directory of .sdf.gz files holding
N readable molecules in total, and splits them into ceil(N/T) .csv files.
Each row in a .csv file is of the form (id, smile).

PARAMETERS: dir - path to the directory containing the .sdf.gz file(s)
            T - the maximum number of molecules in each .csv file
            name - the desired name prefix for each .csv file
                EX: name = Chembl26, the file names will look
                like "Chembl26-part#.csv"
            dest - the directory the .csv files are written to
"""
def get_partitioned_csv_files(dir, T, name = "set", dest = "./"):
    """Split the molecules of every ``*.sdf.gz`` file in ``dir`` into CSV files.

    Molecules are read file by file (files are visited in sorted path order
    so that repeated runs produce the same partitions) and written out in
    chunks of at most ``T`` rows. Each row is ``[id, smiles]`` where ``id``
    is the molecule's ``_Name`` property and ``smiles`` is the output of
    ``Chem.MolToSmiles``. Records the SD supplier yields as ``None`` are
    skipped.

    Output files are named ``<dest>/<name>-part<k>.csv`` with ``k`` counting
    up from 0.

    PARAMETERS: dir  - directory that contains the .sdf.gz file(s)
                T    - maximum number of molecules per .csv file (>= 1)
                name - prefix used for the .csv file names
                dest - directory the .csv files are written to

    RETURNS: the total number of .csv files written, including the final,
             possibly shorter, one.

    RAISES: ValueError if ``T`` is smaller than 1.
    """
    # T <= 0 would otherwise degrade silently into one file per molecule.
    if T < 1:
        raise ValueError("T must be a positive integer, got " + repr(T))

    pending_rows = []
    fnum = 0

    # glob returns paths in arbitrary order; sorting keeps partitions stable.
    for sdf_path in sorted(glob.glob(dir + '/*.sdf.gz')):
        # The context manager closes the gzip handle even if parsing raises.
        with gzip.open(sdf_path) as inf:
            for mol in Chem.ForwardSDMolSupplier(inf):
                if mol is None:
                    continue

                smile = Chem.MolToSmiles(mol)
                mol_id = mol.GetProp("_Name")
                pending_rows.append([mol_id, smile])

                if len(pending_rows) >= T:
                    _write_partition(pending_rows, dest, name, fnum)
                    fnum += 1
                    pending_rows = []

    # Flush the leftover molecules that did not fill a whole partition.
    if pending_rows:
        _write_partition(pending_rows, dest, name, fnum)
        fnum += 1

    return fnum


def _write_partition(rows, dest, name, index):
    """Write ``rows`` to ``<dest>/<name>-part<index>.csv``."""
    fname = dest + "/" + name + "-part" + str(index) + ".csv"
    with open(fname, 'w', newline = '') as out:
        csv.writer(out).writerows(rows)

def main(argv = None):
    """Command-line entry point: parse the options and run the partitioner.

    PARAMETERS: argv - optional list of argument strings; when None,
                       argparse reads the arguments from sys.argv[1:].

    Exits with a usage error (via argparse) when --dir or --T is missing.
    """
    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)

    # --dir and --T have no usable default, so let argparse reject their
    # absence with a usage message instead of failing later on None.
    parser.add_argument(
        '--dir',
        required = True,
        help = 'directory containing the .sdf.gz file(s) to partition')
    parser.add_argument(
        '--T',
        type = int,
        required = True,
        help = 'each subset will hold at most T molecules')
    parser.add_argument(
        '--name',
        help = 'name for the partitioned files',
        default = 'set')
    parser.add_argument(
        '--dest',
        help = 'where the partitioned files will be',
        default = './')

    args = parser.parse_args(argv)
    # args.T is already an int thanks to type = int above.
    get_partitioned_csv_files(args.dir, args.T, args.name, args.dest)