From c4c913e1dd52ba884b13f0907901eeb79c3bb9cc Mon Sep 17 00:00:00 2001
From: ChunjiangZhu <chunjiang.zhu@uconn.edu>
Date: Tue, 21 Jul 2020 17:29:12 -0400
Subject: [PATCH] Parameterize fingerprint radius/maxPath and add dataset variants

---
 .DS_Store                  | Bin 10244 -> 10244 bytes
 ann_benchmarks/datasets.py |  48 +++++++++++++++++--------------------
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index e51d2361ba5c217b11d91dd82a50f93559957cc0..14fe294cba9fa9e06ee5db8cf87b5ee49a3703d9 100644
GIT binary patch
delta 25
gcmZn(XbIS$Ey!VLVWOj8WNI?`wUF%Q0KqL{0Aq6ptN;K2

delta 16
XcmZn(XbIS$Ejam(nCIpa!3kmjI35N;

diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
index 33da4a5..02b1c07 100644
--- a/ann_benchmarks/datasets.py
+++ b/ann_benchmarks/datasets.py
@@ -369,7 +369,7 @@ def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool):
 
     return fps, SMILES
 
-def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool):
+def get_sparse_matrix_from_sdf(dir, dimension = 1024, radius=2, dtype=numpy.bool):
     from rdkit import Chem
     from rdkit.Chem import AllChem
     import glob
@@ -392,21 +392,20 @@ def get_sparse_matrix_from_sdf(dir, dimension = 1024, dtype=numpy.bool):
             smile = Chem.MolToSmiles(mol)
             SMILES.append(smile)
             IDS.append(mol.GetProp("_Name"))
-            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
+            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=dimension)
             for i in range(dimension):
                 if fp.GetBit(i) is True:
                     indices.append(i)
                     data.append(1)
             indptr.append(len(indices))
             num_mols += 1
-            if num_mols > 3000: break
 
     fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
     print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
 
     return fps, SMILES, IDS
 
-def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+def ecfp(out_fn, dataset_name, dimension, radius, distance, type, test_size=1000):
     from sklearn.utils import shuffle
     print('prepare dataset ' + dataset_name)
 
@@ -418,8 +417,8 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
         dtype = numpy.float
 
     dir = './data'
-    
-    X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)
+
+    X, SMILES, IDS = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, radius=radius, dtype=dtype)
 
     # random shuffle fingerprints and smiles at the same time
     seed = 1 # random.randint(0, 2 ** 32 - 1)
@@ -437,9 +436,8 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
     write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES, IDS=IDS)
 
 # Molecular topological fingerprints
-def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.bool):
+def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, maxPath=7, dtype=numpy.bool):
     from rdkit import Chem
-    from rdkit.Chem import AllChem
     import glob
     import gzip
     from scipy.sparse import csr_matrix
@@ -460,7 +458,7 @@ def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.b
             smile = Chem.MolToSmiles(mol)
             SMILES.append(smile)
             IDS.append(mol.GetProp("_Name"))
-            fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension)
+            fp = Chem.rdmolops.RDKFingerprint(mol, fpSize=dimension, maxPath=maxPath)
             for i in range(dimension):
                 if fp.GetBit(i) is True:
                     indices.append(i)
@@ -473,7 +471,7 @@ def get_sparse_matrix_from_sdf_topological_fp(dir, dimension=1024, dtype=numpy.b
 
     return fps, SMILES, IDS
     
-def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
+def topological_fp(out_fn, dataset_name, dimension, maxPath, distance, type, test_size=1000):
     from sklearn.utils import shuffle
     print('prepare dataset ' + dataset_name)
 
@@ -483,10 +481,10 @@ def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=10
         dtype = numpy.int
     else:
         dtype = numpy.float
-    
+
     dir = './data'
-    
-    X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, dtype=dtype)
+
+    X, SMILES, IDS = get_sparse_matrix_from_sdf_topological_fp(dir=dir, dimension=dimension, maxPath=maxPath, dtype=dtype)
 
     # random shuffle fingerprints and smiles at the same time
     seed = 1 # random.randint(0, 2 ** 32 - 1)
@@ -497,6 +495,7 @@ def topological_fp(out_fn, dataset_name, dimension, distance, type, test_size=10
     X_train = X[:train_size]
     X_test = X[train_size:]
     X_test = X_test.toarray()
+
     print('finish dataset preparation')
 
     print('Train data dimension: %d*%d' %X_train.shape)
@@ -679,17 +678,14 @@ def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type):
     'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
     'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
     'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'),
-    # below are datasets Chunjiang added
-    'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'bit', 100),
-    'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'),
-    'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 'jaccard', 'bit'),
-    'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'),
-    'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'),
-    'chembl-sparse-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K-sparse', 1024, 'jaccard', 'bit'),
-    'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
-    'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
-    'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'),
-    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
-    'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'),
-    'chembl-1024-jaccard-map4': lambda out_fn: map4(out_fn, 'Chembl', 1024, 'jaccard', 'int')
+    'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 2, 'jaccard', 'bit'),
+    'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 2, 'jaccard', 'bit'),
+    'chembl-1024-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 7, 'jaccard', 'bit'),
+    'chembl-1024-r3-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 3, 'jaccard', 'bit'),
+    'chembl-512-r2-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 512, 2, 'jaccard', 'bit'),
+    'chembl-2048-r2-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 2048, 2, 'jaccard', 'bit'),
+    'chembl-1024-p3-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 3, 'jaccard', 'bit'),
+    'chembl-1024-p5-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 1024, 5, 'jaccard', 'bit'),
+    'chembl-512-p7-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 512, 7, 'jaccard', 'bit'),
+    'chembl-2048-p7-jaccard-tp': lambda out_fn: topological_fp(out_fn, 'Chembl', 2048, 7, 'jaccard', 'bit')
 }