Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
mol_fps/partitioner_program.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
95 lines (68 sloc)
2.62 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import argparse | |
from rdkit import Chem | |
from rdkit.Chem import AllChem | |
import glob | |
import gzip | |
from scipy.sparse import csr_matrix | |
#PART 1 | |
""" | |
get_partitioned_csv_files reads every .sdf.gz file in a directory
(N molecules in total) and splits them into ceil(N/T) .csv files.
Each row in a .csv file is of the form (id, smile).
PARAMETERS: dir - path to the directory that contains the .sdf.gz files
T - the maximum number of molecules in each .csv file
name - the desired base name for each .csv file
EX: name = Chembl26, the file names will look
like "Chembl26-part#.csv"
dest - the directory the .csv files are written to
""" | |
def _write_partition(rows, dest, name, index):
    """Write one partition of [id, smiles] rows to <dest>/<name>-part<index>.csv."""
    fname = dest + "/" + name + "-part" + str(index) + ".csv"
    # newline='' lets the csv module control line endings itself.
    with open(fname, 'w', newline='') as out:
        csv.writer(out).writerows(rows)


def get_partitioned_csv_files(dir, T, name="set", dest="./"):
    """Split the molecules of all .sdf.gz files in a directory into CSV partitions.

    Every file matching ``<dir>/*.sdf.gz`` is read with RDKit. Each molecule
    becomes one CSV row ``[id, smiles]``, where ``id`` is the molecule's
    ``_Name`` property. Rows are accumulated across input files and written
    out in groups of at most ``T`` rows.

    Parameters:
        dir:  directory that contains the .sdf.gz files.
        T:    maximum number of molecules per output .csv file.
        name: base name of the output files ("<name>-part<k>.csv").
        dest: directory the output files are written to.

    Returns:
        The number of .csv files written, including the final partially
        filled one (0 when no molecule was read).
    """
    cur_subset = []
    fnum = 0
    # Sort the inputs: glob order is unspecified, and a stable order keeps
    # the contents of each partition reproducible between runs.
    for path in sorted(glob.glob(dir + '/*.sdf.gz')):
        # The context manager closes the gzip handle even if parsing raises.
        with gzip.open(path) as inf:
            for mol in Chem.ForwardSDMolSupplier(inf):
                if mol is None:
                    # The supplier produced no molecule for this record; skip it.
                    continue
                smile = Chem.MolToSmiles(mol)
                mol_id = mol.GetProp("_Name")
                cur_subset.append([mol_id, smile])
                if len(cur_subset) >= T:
                    _write_partition(cur_subset, dest, name, fnum)
                    fnum += 1
                    cur_subset = []
    # Flush whatever is left over after the last full partition.
    if cur_subset:
        _write_partition(cur_subset, dest, name, fnum)
        # Count the trailing file too, so the result is always the number
        # of files written.
        fnum += 1
    return fnum
def main(argv=None):
    """Parse the command-line options and run the partitioner.

    Parameters:
        argv: optional list of argument strings. When None (the default),
              argparse reads the arguments from sys.argv[1:].

    Exits with a usage error when --dir or --T is missing.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # --dir and --T have no sensible default, so argparse must insist on
    # them; otherwise the run fails later with an opaque TypeError.
    parser.add_argument(
        '--dir',
        required=True,
        help='directory containing the .sdf.gz files to partition')
    parser.add_argument(
        '--T',
        type=int,
        required=True,
        help='each subset will hold at most T molecules')
    parser.add_argument(
        '--name',
        help='name for the partitioned files',
        default='set')
    parser.add_argument(
        '--dest',
        help='where the partitioned files will be',
        default='./')
    args = parser.parse_args(argv)
    # args.T is already an int thanks to type=int above.
    get_partitioned_csv_files(args.dir, args.T, args.name, args.dest)