Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Add files via upload
  • Loading branch information
cdb17006 committed May 28, 2020
0 parents commit 735897e
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 0 deletions.
71 changes: 71 additions & 0 deletions get_hdf5_program.py
@@ -0,0 +1,71 @@
import csv
from csv import reader
import argparse
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from scipy.sparse import csr_matrix
import h5sparse
import numpy

#PARTS 2 + 3

def get_hdf5_from_tfp(csv, ofn, dimension=1024, dtype=bool):
    """Fingerprint every molecule of a CSV file and store the result as HDF5.

    Each non-blank row of the input is read as ``(id, smiles, ...)``.  The
    SMILES string is parsed with RDKit, an ``RDKFingerprint`` of
    ``dimension`` bits is computed for it, and the on-bits of all molecules
    are collected into one CSR matrix with one row per molecule.  The matrix
    is written to ``ofn + '.hdf5'`` as the dataset ``'mol_fps'``.

    Parameters:
        csv: path of the input CSV file (note: the name shadows the ``csv``
            module inside this function; it is kept for caller
            compatibility).
        ofn: output file name without the ``.hdf5`` extension.
        dimension: number of bits in each fingerprint.
        dtype: dtype of the returned/stored matrix.  The default is the
            builtin ``bool``; the ``numpy.bool`` alias used previously was
            removed in NumPy 1.24 and made the module fail on import.

    Returns:
        The ``scipy.sparse.csr_matrix`` of shape ``(num_mols, dimension)``.

    Raises:
        ValueError: if a row has fewer than two fields or its SMILES string
            cannot be parsed by RDKit.
    """
    indptr = [0]
    indices = []

    with open(csv, 'r', newline='') as mols:
        for row_number, row in enumerate(reader(mols), start=1):
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            if len(row) < 2:
                raise ValueError(
                    'row %d of %s has no SMILES column: %r'
                    % (row_number, csv, row))

            mol = Chem.MolFromSmiles(row[1])
            if mol is None:
                # MolFromSmiles signals a parse failure by returning None.
                raise ValueError(
                    'row %d of %s: cannot parse SMILES %r (id %r)'
                    % (row_number, csv, row[1], row[0]))

            fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension)

            # GetOnBits() yields the indices of the set bits directly, so
            # there is no need to probe every one of the `dimension` bits.
            indices.extend(fp.GetOnBits())
            indptr.append(len(indices))

    num_mols = len(indptr) - 1
    data = [1] * len(indices)

    mol_fps = csr_matrix(
        (data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
    print('the dimension of the returned sparse matrix: %d*%d' % mol_fps.shape)

    f = h5sparse.File(ofn + '.hdf5', 'w')
    try:
        f.create_dataset('mol_fps', data=mol_fps)
    finally:
        # Close the file even when writing the dataset fails.
        f.close()

    return mol_fps

def main():
    """Parse the command line and run ``get_hdf5_from_tfp``.

    Options:
        --csv    input CSV file of (id, smiles) rows (required)
        --ofn    output file name without the .hdf5 extension (required)
        --dim    number of bits per fingerprint
        --dtype  NumPy dtype name of the stored matrix
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--csv',
        type=str,
        required=True,
        help='input CSV file with one (id, smiles) row per molecule')
    parser.add_argument(
        '--ofn',
        type=str,
        required=True,
        help='output file name; the .hdf5 extension is appended')
    parser.add_argument(
        '--dim',
        type=int,
        default=1024,
        help='number of bits in each fingerprint')
    parser.add_argument(
        '--dtype',
        # numpy.dtype turns a name such as "bool" or "uint8" into a dtype and
        # rejects unknown names; the string default goes through it as well.
        # (The former default, numpy.bool, no longer exists in NumPy 1.24+.)
        type=numpy.dtype,
        default='bool',
        help='NumPy dtype name of the stored sparse matrix')

    args = parser.parse_args()

    get_hdf5_from_tfp(args.csv, args.ofn, args.dim, args.dtype)



3 changes: 3 additions & 0 deletions partition.py
@@ -0,0 +1,3 @@
# Command-line entry point for the SDF partitioner: `python partition.py ...`.
from partitioner_program import main

# Only run the CLI when executed as a script, so that importing this module
# (e.g. from a test or another tool) does not parse sys.argv as a side effect.
if __name__ == "__main__":
    main()
95 changes: 95 additions & 0 deletions partitioner_program.py
@@ -0,0 +1,95 @@
import csv
import argparse
from rdkit import Chem
from rdkit.Chem import AllChem
import glob
import gzip
from scipy.sparse import csr_matrix

#PART 1

"""
get_partitioned_csv_files reads every .sdf.gz file in a directory, holding N
readable molecules in total, and splits them into ceil(N/T) .csv files.
Each row in a .csv file is of the form (id, smile)
PARAMETERS: dir - path to the directory that contains the .sdf.gz file(s)
T - the maximum number of molecules in each .csv file
name - the desired name for each .csv file
EX: name = Chembl26, the file names will look
like "Chembl26-part#.csv"
dest - the directory the .csv files are written to
"""
def _write_partition(rows, dest, name, index):
    """Write ``rows`` to the CSV file ``<dest>/<name>-part<index>.csv``."""
    fname = dest + "/" + name + "-part" + str(index) + ".csv"
    with open(fname, 'w', newline='') as out:
        csv.writer(out).writerows(rows)


def get_partitioned_csv_files(dir, T, name="set", dest="./"):
    """Split the molecules of all ``*.sdf.gz`` files in ``dir`` into CSV files.

    Every molecule RDKit can read is written as one ``(id, smiles)`` row,
    where ``id`` is the molecule's ``_Name`` property.  Rows are written in
    chunks of at most ``T`` to ``<dest>/<name>-part<k>.csv`` with
    ``k = 0, 1, 2, ...``.  Records RDKit cannot parse are skipped.

    Parameters:
        dir: directory searched (non-recursively) for ``*.sdf.gz`` files.
        T: maximum number of molecules per CSV file; must be at least 1.
        name: prefix of the generated file names.
        dest: directory the CSV files are written to.

    Returns:
        The number of CSV files written.  (Previously the final, partially
        filled file was not counted, so the value was off by one unless the
        molecule count happened to be a multiple of ``T``.)

    Raises:
        ValueError: if ``T`` is smaller than 1.
    """
    if T < 1:
        raise ValueError("T must be a positive integer, got %r" % (T,))

    cur_subset = []
    fnum = 0

    # glob returns paths in arbitrary order; sorting makes the assignment of
    # molecules to partitions reproducible between runs.
    for path in sorted(glob.glob(dir + '/*.sdf.gz')):
        # The context manager closes the gzip stream once the supplier has
        # been exhausted (or if reading fails part-way).
        with gzip.open(path) as inf:
            for mol in Chem.ForwardSDMolSupplier(inf):
                if mol is None:
                    continue

                cur_subset.append(
                    [mol.GetProp("_Name"), Chem.MolToSmiles(mol)])

                if len(cur_subset) >= T:
                    _write_partition(cur_subset, dest, name, fnum)
                    fnum += 1
                    cur_subset = []

    # Flush the last, partially filled chunk.
    if cur_subset:
        _write_partition(cur_subset, dest, name, fnum)
        fnum += 1

    return fnum

def main():
    """Parse the command line and run ``get_partitioned_csv_files``.

    Options:
        --dir   directory containing the .sdf.gz file(s) (required)
        --T     maximum number of molecules per CSV file (required)
        --name  prefix of the generated CSV file names
        --dest  directory the CSV files are written to
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--dir',
        required=True,
        # The partitioner globs "<dir>/*.sdf.gz", so this is a directory and
        # not the path of a single file.
        help='directory that contains the .sdf.gz file(s)')
    parser.add_argument(
        '--T',
        type=int,
        required=True,
        help='each subset will hold at most T molecules')
    parser.add_argument(
        '--name',
        help='name for the partitioned files',
        default='set')
    parser.add_argument(
        '--dest',
        help='where the partitioned files will be',
        default='./')

    args = parser.parse_args()
    # argparse has already converted --T to int via type=int.
    get_partitioned_csv_files(args.dir, args.T, args.name, args.dest)








12 changes: 12 additions & 0 deletions run_fp.sh
@@ -0,0 +1,12 @@
#!/bin/sh
# Slurm array job: task k (0-7) fingerprints ./partitioned/chembl27-part<k>.csv
# and writes chembl27_hdf5/chembl27<k>.hdf5.
#SBATCH --ntasks=1

#SBATCH --nodes=1

#SBATCH --cpus-per-task=1

#SBATCH --array=0-7

jobid=$SLURM_ARRAY_TASK_ID
outdir=chembl27_hdf5

# The HDF5 writer does not create missing parent directories; mkdir -p is
# safe to run from several array tasks at once.
mkdir -p "$outdir"

python run_get_hdf5.py --csv="./partitioned/chembl27-part${jobid}.csv" --ofn="${outdir}/chembl27${jobid}"
3 changes: 3 additions & 0 deletions run_get_hdf5.py
@@ -0,0 +1,3 @@
# Command-line entry point for the fingerprint/HDF5 step:
# `python run_get_hdf5.py --csv=... --ofn=...`.
from get_hdf5_program import main

# Only run the CLI when executed as a script, so that importing this module
# does not parse sys.argv as a side effect.
if __name__ == "__main__":
    main()

0 comments on commit 735897e

Please sign in to comment.