Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import csv
import argparse
from rdkit import Chem
from rdkit.Chem import AllChem
import glob
import gzip
from scipy.sparse import csr_matrix
#PART 1
"""
get_partitioned_csv_files takes a directory of .sdf.gz files holding N
molecules in total and splits them into ceil(N/T) .csv files.
Each row in a .csv file is of the form (id, smile).
PARAMETERS: dir - path to the directory containing the .sdf.gz files
T - the maximum number of molecules in each .csv file
name - the desired base name for each .csv file
EX: name = Chembl26, the file names will look
like "Chembl26-part#.csv"
dest - the directory the .csv files are written to
"""
def get_partitioned_csv_files(dir, T, name = "set", dest = "./"):
    """Split the molecules of every .sdf.gz file in a directory into CSV chunks.

    Every file matching ``dir + '/*.sdf.gz'`` is read with RDKit. Each
    molecule that parses becomes one CSV row ``[id, smiles]``, where ``id`` is
    the molecule's ``_Name`` property and ``smiles`` is the output of
    ``Chem.MolToSmiles``. Rows are accumulated across input files and written
    out in chunks of at most ``T`` rows to
    ``dest + "/" + name + "-part<k>.csv"`` with ``k`` counting from 0.

    Args:
        dir: Directory that contains the ``.sdf.gz`` input files.
        T: Maximum number of molecules per output CSV file; must be >= 1.
        name: Base name of the output files.
        dest: Directory the output files are written to (must already exist).

    Returns:
        The number of CSV files written, including a trailing partial chunk.

    Raises:
        ValueError: If ``T`` is smaller than 1.
    """
    if T < 1:
        # A non-positive chunk size would silently emit one file per molecule.
        raise ValueError("T must be a positive integer, got " + repr(T))

    def _write_chunk(rows, index):
        # Write one chunk of [id, smiles] rows to the index-th part file.
        out_path = dest + "/" + name + "-part" + str(index) + ".csv"
        with open(out_path, 'w', newline='') as out_file:
            csv.writer(out_file).writerows(rows)

    pending_rows = []
    files_written = 0
    # Sorted so that the assignment of molecules to part files is
    # reproducible; glob.glob itself returns paths in arbitrary order.
    for sdf_path in sorted(glob.glob(dir + '/*.sdf.gz')):
        # The context manager closes the gzip handle once the supplier has
        # been fully consumed.
        with gzip.open(sdf_path) as inf:
            for mol in Chem.ForwardSDMolSupplier(inf):
                if mol is None:
                    # The supplier yields None for records it cannot parse.
                    continue
                smile = Chem.MolToSmiles(mol)
                pending_rows.append([mol.GetProp("_Name"), smile])
                if len(pending_rows) >= T:
                    _write_chunk(pending_rows, files_written)
                    files_written += 1
                    pending_rows = []
    if pending_rows:
        # Flush the final, partially filled chunk and count it as well so the
        # return value is the true number of files produced.
        _write_chunk(pending_rows, files_written)
        files_written += 1
    return files_written
def main(argv=None):
    """Parse command-line options and run get_partitioned_csv_files.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the arguments from ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # --dir and --T have no usable default, so they are required: omitting
    # them now yields a usage error instead of a TypeError on None.
    parser.add_argument(
        '--dir',
        required=True,
        help='directory containing the .sdf.gz files to partition')
    parser.add_argument(
        '--T',
        type=int,
        required=True,
        help='each subset will hold at most T molecules')
    parser.add_argument(
        '--name',
        help='name for the partitioned files',
        default='set')
    parser.add_argument(
        '--dest',
        help='where the partitioned files will be',
        default='./')
    args = parser.parse_args(argv)
    # args.T is already an int thanks to type=int above.
    get_partitioned_csv_files(args.dir, args.T, args.name, args.dest)