diff --git a/.DS_Store b/.DS_Store index 4e1bd74..38f525a 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/algos.yaml b/algos.yaml index 43beb47..43220d8 100644 --- a/algos.yaml +++ b/algos.yaml @@ -1,532 +1,4 @@ -float: - any: - DolphinnPy: - disabled: true - docker-tag: ann-benchmarks-dolphinn # Docker tag - module: ann_benchmarks.algorithms.dolphinnpy # Python class - constructor: DolphinnPy # Python class name - run-groups: - base: - args: [[10, 50, 100, 200, 1000, 2000]] - faiss-lsh: - disabled: true - docker-tag: ann-benchmarks-faiss - module: ann_benchmarks.algorithms.faiss - constructor: FaissLSH - base-args: ["@metric"] - run-groups: - base: - # When @args is a list, the result is the Cartesian product of all of - # the things it contains; entries that aren't a list will be treated - # as lists of length one. - args: [[32, 64, 128, 256, 512, 1024, 2048, 4096]] - # This run group will produce eight algorithm instances: - # FaissLSH(32), FaissLSH(64), and so on up to FaissLSH(4096). - faiss-ivf: - docker-tag: ann-benchmarks-faiss - module: ann_benchmarks.algorithms.faiss - constructor: FaissIVF - base-args: ["@metric"] - run-groups: - base: - args: [[32,64,128,256,512,1024,2048,4096,8192]] - query-args: [[1, 5, 10, 50, 100, 200]] - faiss-gpu: - disabled: true - docker-tag: ann-benchmarks-faiss - module: ann_benchmarks.algorithms.faiss_gpu - constructor: FaissGPU - run-groups: - base: - args: [[400, 1024, 4096, 8192, 16384], - [1, 10, 40, 100, 200]] - hnswlib: - docker-tag: ann-benchmarks-hnswlib - module: ann_benchmarks.algorithms.hnswlib - constructor: HnswLib - base-args: ["@metric"] - run-groups: - M-4: - arg-groups: - - {"M": 4, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-8: - arg-groups: - - {"M": 8, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-12: - arg-groups: - - {"M": 12, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-16: - arg-groups: - - {"M": 16, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-24: - arg-groups: - - {"M": 24, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-36: - arg-groups: - - {"M": 36, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-48: - arg-groups: - - {"M": 48, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-64: - arg-groups: - - {"M": 64, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-96: - arg-groups: - - {"M": 96, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - - hnsw(faiss): - docker-tag: ann-benchmarks-faiss - module: ann_benchmarks.algorithms.faiss_hnsw - constructor: FaissHNSW - base-args: ["@metric"] - run-groups: - M-4: - arg-groups: - - {"M": 4, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-8: - arg-groups: - - {"M": 8, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-12: - arg-groups: - - {"M": 12, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-16: - arg-groups: - - {"M": 16, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-24: - arg-groups: - - {"M": 24, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-36: - arg-groups: - - {"M": 36, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-48: - arg-groups: - - {"M": 48, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-64: - arg-groups: - - {"M": 64, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - M-96: - arg-groups: - - {"M": 96, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - - - flann: - docker-tag: ann-benchmarks-flann - module: ann_benchmarks.algorithms.flann - constructor: FLANN - base-args: ["@metric"] - run-groups: - flann: - args: [[0.2, 0.5, 0.7, 0.8, 0.9, 0.95, 0.97]] - panns: - disabled: true - docker-tag: ann-benchmarks-panns - module: ann_benchmarks.algorithms.panns - constructor: PANNS - base-args: ["@metric"] - run-groups: - five-trees: - args: [5, 20] - ten-trees: - args: [10, [10, 50]] - hundred-candidates: - args: [[10, 20, 40], 100] - annoy: - docker-tag: ann-benchmarks-annoy - module: ann_benchmarks.algorithms.annoy - constructor: Annoy - base-args: ["@metric"] - run-groups: - annoy: - args: [[100, 200, 400]] - query-args: [[100, 200, 400, 1000, 2000, 4000, 10000, 20000, 40000, - 100000, 200000, 400000]] - # This run group produces 3 algorithm instances -- Annoy("angular", - # 100), Annoy("angular", 200), and Annoy("angular", 400) -- each of - # which will be used to run 12 different queries. - shidx: - docker-tag: ann-benchmarks-hdidx - module: ann_benchmarks.algorithms.hdidx - constructor: SHIdx - base-args: [] - run-groups: - shidx: - args: [[4, 8, 16, 32, 64, 128, 256]] - nearpy: - disabled: true - docker-tag: ann-benchmarks-nearpy - module: ann_benchmarks.algorithms.nearpy - constructor: NearPy - base-args: ["@metric"] - run-groups: - nearpy: - args: [[10, 12, 14, 16], [5, 10, 20, 40]] - extra: - args: [16, [5, 10, 15, 20, 25, 30, 40]] - bruteforce: - disabled: true - docker-tag: ann-benchmarks-sklearn - module: ann_benchmarks.algorithms.bruteforce - constructor: BruteForce - base-args: ["@metric"] - run-groups: - empty: - args: [] - bruteforce-blas: - docker-tag: ann-benchmarks-sklearn - module: ann_benchmarks.algorithms.bruteforce - constructor: BruteForceBLAS - base-args: ["@metric"] - run-groups: - empty: - args: [] - dummy-algo-st: - disabled: true - docker-tag: ann-benchmarks-sklearn - module: ann_benchmarks.algorithms.dummy_algo - constructor: DummyAlgoSt - base-args: ["@metric"] - run-groups: - empty: - args: [] - dummy-algo-mt: - disabled: true - docker-tag: ann-benchmarks-sklearn - module: ann_benchmarks.algorithms.dummy_algo - constructor: DummyAlgoMt - base-args: ["@metric"] - run-groups: - empty: - args: [] - ball: - disabled: true - docker-tag: ann-benchmarks-sklearn - module: ann_benchmarks.algorithms.balltree - constructor: BallTree - base-args: ["@metric"] - run-groups: - ball: - args: &treeargs [[10, 20, 40, 100, 200, 400, 1000]] - kd: - docker-tag: ann-benchmarks-sklearn - module: ann_benchmarks.algorithms.kdtree - constructor: KDTree - base-args: ["@metric"] - run-groups: - ball: - args: *treeargs - BallTree(nmslib): - docker-tag: ann-benchmarks-nmslib - module: ann_benchmarks.algorithms.nmslib - constructor: NmslibReuseIndex - base-args: ["@metric", "vptree"] - run-groups: - base: - # When @args is a dictionary, algorithm instances will be generated - # by taking the Cartesian product of all of its values. - arg-groups: - - {"tuneK": 10, "desiredRecall": [0.99, 0.97, 0.95, 0.9, 0.85, 0.8, - 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]} - - False - # This run group produces thirteen algorithm instances: - # NmslibNewIndex("angular", "vptree", {"tuneK": 10, - # "desiredRecall": 0.99}), NmslibNewIndex("angular", "vptree", - # {"tuneK": 10, "desiredRecall": 0.97}), and so on up to - # NmslibNewIndex("angular", "vptree", {"tuneK": 10, "desiredRecall": - # 0.1}). - pynndescent: - docker-tag: ann-benchmarks-pynndescent - module: ann_benchmarks.algorithms.pynndescent - constructor: PyNNDescent - base-args: ["@metric"] - run-groups: - pynndescent: - args: [[10, 20, 40, 80], [4, 8], [30]] - query-args: [[1.0, 2.0, 4.0, 8.0]] - NGT-panng: - docker-tag: ann-benchmarks-ngt - module: ann_benchmarks.algorithms.panng_ngt - constructor : PANNG - base-args : ["@metric", "Float"] - run-groups : - panng: - args : [{'edge': 20, 'pathadj': 40, 'searchedge': 60}] - query-args : [[0.6, 0.8, 0.9, 1.0, 1.02, 1.05, 1.1, 1.2]] - NGT-onng: - docker-tag: ann-benchmarks-ngt - module: ann_benchmarks.algorithms.onng_ngt - constructor : ONNG - base-args : ["@metric", "Float", 0.1] - run-groups : - onng: - args : [{'edge': 100, 'outdegree': 10, 'indegree': 120}] - query-args : [[0.6, 0.8, 0.9, 1.0, 1.02, 1.05, 1.1, 1.2]] - mrpt: - docker-tag: ann-benchmarks-mrpt - module: ann_benchmarks.algorithms.mrpt - constructor: MRPT - base-args: ["@metric"] - run-groups: - # See https://github.com/ejaasaari/mrpt-comparison/blob/master/parameters/gist.sh - mrpt: - args: [[5, 25, 100], [1, 2, 4, 8]] - query-args: [[1, 2, 4, 10, 20, 40, 100]] - euclidean: - kgraph: - docker-tag: ann-benchmarks-kgraph - module: ann_benchmarks.algorithms.kgraph - constructor: KGraph - base-args: ["@metric"] - run-groups: - kgraph: - args: [ {'reverse': -1}, True] # XXX: hard-codes save_index as True - query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] - hnsw(nmslib): - docker-tag: ann-benchmarks-nmslib - module: ann_benchmarks.algorithms.nmslib - constructor: NmslibReuseIndex - base-args: ["@metric", "hnsw"] - run-groups: - M-32: - # If a run group has an array called @arg-groups instead of one - # called @args, then every element in that array will be separately - # expanded before then taking the Cartesian product of all of those - # expansions. - # - # Yes, this is a bit of a hack, but some constructors are weird. - # (This one used to require that dictionaries be encoded as lists - # of environment variable-style strings -- ["M=32", "post=2", - # "efConstruction=400"] -- which didn't work with this at all...) - arg-groups: - - {"M": 32, "post": 2, "efConstruction": 400} - - False - query-args: [[20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 200, - 300, 400]] - M-20: - arg-groups: - - {"M": 20, "post": 2, "efConstruction": 400} - - False - query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120, 200, 400]] - M-12: - arg-groups: - - {"M": 12, "post": 0, "efConstruction": 400} - - False - query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80, 120]] - M-4: - arg-groups: - - {"M": 4, "post": 0, "efConstruction": 400} - - False - query-args: [[1, 2, 5, 10, 20, 30, 50, 70, 90, 120]] - M-8: - arg-groups: - - {"M": 8, "post": 0, "efConstruction": 400} - - False - query-args: [[1, 2, 5, 10, 20, 30, 50, 70, 90, 120, 160]] - SW-graph(nmslib): - docker-tag: ann-benchmarks-nmslib - module: ann_benchmarks.algorithms.nmslib - constructor: NmslibReuseIndex - base-args: ["@metric", "sw-graph"] - run-groups: - NN-24: - arg-groups: - - {"NN": 24} - - False - query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] - NN-16: - arg-groups: - - {"NN": 16} - - False - query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] - NN-10: - arg-groups: - - {"NN": 10} - - False - query-args: [[800, 400, 200, 100, 50, 30, 20, 15, 10, 5, 1]] - NN-5: - arg-groups: - - {"NN": 5} - - False - query-args: [[30, 25, 20, 15, 10, 5, 4, 3, 2, 1]] - pynndescent: - docker-tag: ann-benchmarks-pynndescent - module: ann_benchmarks.algorithms.pynndescent - constructor: PyNNDescent - base-args: ["@metric"] - run-groups: - pynndescent: - args: [[5, 10, 20, 40, 80], [4, 8], [20]] - query-args: [[1.0, 1.5, 2.0, 4.0, 8.0]] - angular: - kgraph: - docker-tag: ann-benchmarks-kgraph - module: ann_benchmarks.algorithms.kgraph - constructor: KGraph - base-args: ["@metric"] - run-groups: - kgraph: - args: [{'reverse': -1, "K": 200, "L": 300, "S": 20}, False] - query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] - hnsw(nmslib): - docker-tag: ann-benchmarks-nmslib - module: ann_benchmarks.algorithms.nmslib - constructor: NmslibReuseIndex - base-args: ["@metric", "hnsw"] - run-groups: - M-48: - arg-groups: - - {"M": 48, "post": 2, "efConstruction": 800} - - False - query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, - 1400, 1600, 2000]] - M-32: - arg-groups: - - {"M": 32, "post": 2, "efConstruction": 800} - - False - query-args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, - 200, 300, 400, 600, 700, 800, 1000, 1200, 1400, 1600, 2000]] - M-20: - arg-groups: - - {"M": 20, "post": 0, "efConstruction": 800} - - False - query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] - M-12: - arg-groups: - - {"M": 12, "post": 0, "efConstruction": 800} - - False - query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] - SW-graph(nmslib): - docker-tag: ann-benchmarks-nmslib - module: ann_benchmarks.algorithms.nmslib - constructor: NmslibReuseIndex - base-args: ["@metric", "sw-graph"] - run-groups: - NN-30: - arg-groups: - - {"NN": 30} - - False - query-args: [[700, 650, 550, 450, 350, 275, 200, 150, 120, 80, - 50, 30]] - NN-15: - arg-groups: - - {"NN": 15} - - False - query-args: [[80, 50, 30, 20]] - NN-3: - arg-groups: - - {"NN": 3} - - False - query-args: [[120, 80, 60, 40, 20, 10, 8, 4, 2]] - rpforest: - docker-tag: ann-benchmarks-rpforest - module: ann_benchmarks.algorithms.rpforest - constructor: RPForest - run-groups: - base: - args: [[3, 10, 40, 100, 400], - [3, 10, 40, 100, 400]] - pynndescent: - docker-tag: ann-benchmarks-pynndescent - module: ann_benchmarks.algorithms.pynndescent - constructor: PyNNDescent - base-args: ["@metric"] - run-groups: - pynndescent: - args: [[5, 10, 20, 40, 80, 160], [8], [40]] - query-args: [[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0]] bit: - hamming: - kgraph: - docker-tag: ann-benchmarks-kgraph - module: ann_benchmarks.algorithms.kgraph - constructor: KGraph - base-args: ["euclidean"] - run-groups: - kgraph: -# args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], -# {'reverse': -1, "K": 200, "L": 300, "S": 20}, False] - args: [{'reverse': -1, "K": 200, "L": 300, "S": 20}, False] - query-args: [[1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]] - hnsw(nmslib): - docker-tag: ann-benchmarks-nmslib - module: ann_benchmarks.algorithms.nmslib - constructor: NmslibReuseIndex - base-args: ["euclidean", "hnsw"] - run-groups: - M-48: - arg-groups: - - {"M": 48, "post": 2, "efConstruction": 800} - # Chunjiang added - - False - query-args: [[50, 70, 90, 120, 160, 200, 400, 600, 700, 800, 1000, - 1400, 1600, 2000]] - M-32: - arg-groups: - - {"M": 32, "post": 2, "efConstruction": 800} - # Chunjiang added - - False - query-args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, - 200, 300, 400, 600, 700, 800, 1000, 1200, 1400, 1600, 2000]] - M-20: - arg-groups: - - {"M": 20, "post": 0, "efConstruction": 800} - # Chunjiang added - - False - query-args: [[2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] - M-12: - arg-groups: - - {"M": 12, "post": 0, "efConstruction": 800} - # Chunjiang added - - False - query-args: [[1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 80]] - pynndescent: - docker-tag: ann-benchmarks-pynndescent - module: ann_benchmarks.algorithms.pynndescent - constructor: PyNNDescent - base-args: ["euclidean"] - run-groups: - pynndescent: - args: [[20, 40, 80, 160, 250], [4], [40]] - query-args: [[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0]] - annoy: - docker-tag: ann-benchmarks-annoy - module: ann_benchmarks.algorithms.annoy - constructor: Annoy - base-args: ["@metric"] - run-groups: - annoy: - args: [[100, 200, 400]] - query-args: [[100, 200, 400, 1000, 2000, 4000, 10000, 20000, 40000, - 100000, 200000, 400000]] - # This run group produces 3 algorithm instances -- Annoy("angular", - # 100), Annoy("angular", 200), and Annoy("angular", 400) -- each of - # which will be used to run 12 different queries. - faiss-ivf: - docker-tag: ann-benchmarks-faiss - module: ann_benchmarks.algorithms.faiss - constructor: FaissIVF - base-args: ["euclidean"] - run-groups: - base: - args: [[32,64,128,256,512,1024,2048,4096,8192]] - query-args: [[1, 5, 10, 50, 100, 200]] jaccard: Bruteforce: disabled: false diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index d386510..68728bf 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -400,13 +400,8 @@ def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000): else: dtype = numpy.float - if dataset_name == 'Molport': - dir = '/home/cjz18001/Molport' - elif dataset_name == 'Chembl': - dir = '/home/cjz18001/Chembl' - else: - print('unknown dataset') - exit(0) + dir = './data' + X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype) # random shuffle fingerprints and smiles at the same time