Add files via upload

mldrugdiscovery · May 28, 2020 · 735897e · 735897e
commit 735897e
Show file tree

Hide file tree

Showing 5 changed files with 184 additions and 0 deletions.
diff --git a/get_hdf5_program.py b/get_hdf5_program.py
@@ -0,0 +1,71 @@
+import csv
+from csv import reader
+import argparse
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit.Chem import Draw
+from scipy.sparse import csr_matrix
+import h5sparse
+import numpy
+
+#PARTS 2 + 3
+
+def get_hdf5_from_tfp(csv, ofn, dimension = 1024, dtype = numpy.bool):
+
+    indptr = [0]
+    indices = []
+    data = []
+
+    num_mols = 0
+
+    with open(csv, 'r') as mols:
+        csv_reader = reader(mols)
+        for row in csv_reader:
+            smile = row[1]
+            m = Chem.MolFromSmiles(smile)
+            m.SetProp("_Name", row[0])
+
+            fp = Chem.rdmolops.RDKFingerprint(m, fpSize = dimension)
+
+            for i in range(dimension):
+                if fp.GetBit(i) is True:
+                    indices.append(i)
+                    data.append(1)
+
+            indptr.append(len(indices))
+            num_mols += 1
+
+    mol_fps = csr_matrix((data, indices, indptr), shape = (num_mols, dimension), dtype = dtype)
+    #print(mol_fps)
+    print('the dimension of the returned sparse matrix: %d*%d' % mol_fps.shape)
+
+    f = h5sparse.File(ofn + '.hdf5', 'w')
+    f.create_dataset('mol_fps', data = mol_fps)
+
+    f.close()
+
+    return mol_fps
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument(
+        '--csv',
+        type = str)
+    parser.add_argument(
+        '--ofn',
+        type = str)
+    parser.add_argument(
+        '--dim',
+        type = int,
+        default = 1024)
+    parser.add_argument(
+        '--dtype',
+        default = numpy.bool)
+
+    args = parser.parse_args()
+
+    get_hdf5_from_tfp(args.csv, args.ofn, args.dim, args.dtype)
+
+
+
diff --git a/partition.py b/partition.py
@@ -0,0 +1,3 @@
+from partitioner_program import main
+
+main()
diff --git a/partitioner_program.py b/partitioner_program.py
@@ -0,0 +1,95 @@
+import csv
+import argparse
+from rdkit import Chem
+from rdkit.Chem import AllChem
+import glob
+import gzip
+from scipy.sparse import csr_matrix
+
+#PART 1
+
+"""
+get_partioned_csv_files takes a .sdf.gz file with N molecules in it,
+and splits it into ceil(N/T) .csv files.
+Each value in the .csv file is of the form (id, smile)
+
+PARAMETERS: dir - path to and including the .sdf.gz file name
+            T - the maximum amount of molecules in each .csv file
+            partion_name - the desired name for each .csv file
+                EX: partion_name = Chembl26, the file names will look
+                like "Chembl26-part#.csv"
+"""
+def get_partitioned_csv_files(dir, T, name = "set", dest = "./"):
+
+    cur_num_mols = 0
+    cur_subset = []
+
+    fnum = 0
+
+    file_list = glob.glob(dir + '/*.sdf.gz')
+    #print(file_list)
+
+    for file in file_list:
+
+        inf = gzip.open(file)
+        suppl = Chem.ForwardSDMolSupplier(inf)
+
+        for mol in suppl:
+            if mol is None: continue
+
+            smile = Chem.MolToSmiles(mol)
+
+            ID = mol.GetProp("_Name")
+
+            #print([ID, smile])
+
+            cur_subset.append([ID, smile])
+            cur_num_mols += 1
+
+            if cur_num_mols >= T:
+                fname = dest + "/" + name + "-part" + str(fnum) + ".csv"
+                with open(fname, 'w', newline = '') as file:
+                    writer = csv.writer(file)
+                    writer.writerows(cur_subset)
+                fnum += 1
+                cur_subset = []
+                cur_num_mols = 0
+
+    if cur_num_mols > 0:
+        fname = dest + "/" + name + "-part" + str(fnum) + ".csv"
+        with open(fname, 'w', newline = '') as file:
+            writer = csv.writer(file)
+            writer.writerows(cur_subset)
+
+    return fnum #returning this so people can see if they got the right number
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument(
+        '--dir',
+        help = 'file path upto and including name of the .sdf.gz')
+    parser.add_argument(
+        '--T',
+        type = int,
+        help = 'each subset will hold at most T molecules')
+    parser.add_argument(
+        '--name',
+        help = 'name for the partitioned files',
+        default = 'set')
+    parser.add_argument(
+        '--dest',
+        help = 'where the partitioned files will be',
+        default = './')
+
+    args = parser.parse_args()
+    #print(type(args.T))
+    get_partitioned_csv_files(args.dir, int(args.T), args.name, args.dest)
+
+
+
+
+
+
+
+
diff --git a/run_fp.sh b/run_fp.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+#SBATCH --ntasks=1
+
+#SBATCH --nodes=1
+
+#SBATCH --cpus-per-task=1
+
+#SBATCH --array=0-7
+
+jobid=$SLURM_ARRAY_TASK_ID
+
+python run_get_hdf5.py --csv=./partitioned/chembl27-part$jobid.csv --ofn=chembl27_hdf5/chembl27$jobid
diff --git a/run_get_hdf5.py b/run_get_hdf5.py
@@ -0,0 +1,3 @@
+from get_hdf5_program import main
+
+main()