Add Singularity support and range-query dispatch to the benchmark runner.

What this patch does:
  * algos.yaml: adds a `singularity-tag` to the listed algorithm entries and
    narrows several parameter sweeps to single values.
  * definitions.py / runner.py: carries `singularity_tag` on `Definition` and
    uses it, together with the new `--sif-dir` option, to pick the .sif image.
  * main.py: adds `--docker` and `--sif-dir`; Singularity is the default when
    neither `--local` nor `--docker` is given.
  * nmslib.py / panng_ngt.py / risc.py: `query` (and Risc `post_query`) accept
    `rq` to switch from top-k search to range search.
  * results.py: adds `load_algorithms_results`.

Review notes:
  * NOTE(review): indentation and blank-line placement were re-flowed from a
    whitespace-collapsed copy of this patch; verify them against the target
    tree (or apply with whitespace tolerance) before merging.
  * NOTE(review): the `index` lines below predate the review fixes and no
    longer describe the resulting blobs.
  * NOTE(review): the reduced sweeps in algos.yaml (e.g. desiredRecall [0.01],
    query-args [[2.2]]) look like debugging settings - confirm they are meant
    to be committed.

diff --git a/algos.yaml b/algos.yaml
index 048fd6f..23a708f 100644
--- a/algos.yaml
+++ b/algos.yaml
@@ -531,6 +531,7 @@ bit:
     Bruteforce:
       disabled: false
       docker-tag: ann-benchmarks-sklearn
+      singularity-tag: ann-bench-sklearn
       module: ann_benchmarks.algorithms.bruteforce
       constructor: BruteForceBLAS
       base-args: ["@metric"]
@@ -540,6 +541,7 @@ bit:
     Balltree(Sklearn):
       disabled: false
       docker-tag: ann-benchmarks-sklearn
+      singularity-tag: ann-bench-sklearn
       module: ann_benchmarks.algorithms.balltree
       constructor: BallTree
       base-args: ["@metric"]
@@ -549,6 +551,7 @@ bit:
     VPtree(Nmslib):
       disabled: false
       docker-tag: ann-benchmarks-nmslib
+      singularity-tag: ann-bench-nmslib3
       module: ann_benchmarks.algorithms.nmslib
       constructor: NmslibReuseIndex
       base-args: ["@metric", "vptree"]
@@ -557,12 +560,12 @@ bit:
           # When @args is a dictionary, algorithm instances will be generated
           # by taking the Cartesian product of all of its values.
           arg-groups:
-            - {"tuneK": 10, "desiredRecall": [0.999, 0.997, 0.995, 0.99, 0.97, 0.95, 0.9, 0.85, 0.8,
-               0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.02, 0.01]}
+            - {"tuneK": 10, "desiredRecall": [0.01]}
             - False
     Datasketch:
       disabled: false
       docker-tag: ann-benchmarks-datasketch
+      singularity-tag: ann-bench-datasketch
       module: ann_benchmarks.algorithms.datasketch
       constructor: DataSketch
       base-args: ["@metric"]
@@ -572,6 +575,7 @@ bit:
     Hnsw(Nmslib):
       disabled: false
       docker-tag: ann-benchmarks-nmslib
+      singularity-tag: ann-bench-nmslib3
       module: ann_benchmarks.algorithms.nmslib
       constructor: NmslibReuseIndex
       base-args: ["@metric", "hnsw"]
@@ -610,6 +614,7 @@ bit:
     SW-graph(Nmslib):
       disabled: false
       docker-tag: ann-benchmarks-nmslib
+      singularity-tag: ann-bench-nmslib3
       module: ann_benchmarks.algorithms.nmslib
       constructor: NmslibReuseIndex
       base-args: ["@metric", "sw-graph"]
@@ -652,6 +657,7 @@ bit:
     Pynndescent:
       disabled: false
       docker-tag: ann-benchmarks-pynndescent
+      singularity-tag: ann-bench-pynndescent
       module: ann_benchmarks.algorithms.pynndescent
       constructor: PyNNDescent
       base-args: ["@metric"]
@@ -662,26 +668,29 @@ bit:
     Onng(Ngt):
       disabled: false
       docker-tag: ann-benchmarks-ngt
+      singularity-tag: ann-bench-ngt
       module: ann_benchmarks.algorithms.onng_ngt
       constructor: ONNG
       base-args: ["@metric", "Byte", 1.0]
       run-groups:
         onng:
-          args: [[100, 300, 500, 1000], [10, 30, 50, 100], [10, 30, 50, 120]]
-          query-args: [[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]]
+          args: [[1000], [100], [120]]
+          query-args: [[2.0]]
     Panng(Ngt):
       disabled: false
       docker-tag: ann-benchmarks-ngt
+      singularity-tag: ann-bench-ngt
       module: ann_benchmarks.algorithms.panng_ngt
       constructor: PANNG
       base-args: ["@metric", "Byte"]
       run-groups:
         panng:
-          args: [[10, 20, 40], [40], [30, 60, 120]]
-          query-args: [[0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 1.0, 1.02, 1.05, 1.1, 1.2, 1.5, 2.0]]
+          args: [[40], [40], [120]]
+          query-args: [[2.2]]
     Risc:
       disabled: false
       docker-tag: ann-benchmarks-risc
+      singularity-tag: ann-bench-risc
       module: ann_benchmarks.algorithms.risc
       constructor: Risc
       base-args: ["@metric", "Risc"]
@@ -692,6 +701,7 @@ bit:
     DivideSkip:
       disabled: false
       docker-tag: ann-benchmarks-risc
+      singularity-tag: ann-bench-risc
       module: ann_benchmarks.algorithms.risc
       constructor: Risc
       base-args: ["@metric", "DivideSkip"]
diff --git a/ann_benchmarks/algorithms/definitions.py b/ann_benchmarks/algorithms/definitions.py
index 7906eae..af67ea2 100644
--- a/ann_benchmarks/algorithms/definitions.py
+++ b/ann_benchmarks/algorithms/definitions.py
@@ -10,7 +10,7 @@ from itertools import product
 
 
 
-Definition = collections.namedtuple('Definition', ['algorithm', 'constructor', 'module', 'docker_tag', 'arguments', 'query_argument_groups', 'disabled'])
+Definition = collections.namedtuple('Definition', ['algorithm', 'constructor', 'module', 'docker_tag', 'singularity_tag', 'arguments', 'query_argument_groups', 'disabled'])
 
 def get_algorithm_name(name, batch):
     if batch:
@@ -165,6 +165,7 @@ def get_definitions(definition_file, dimension, point_type="float", distance_met
                 definitions.append(Definition(
                     algorithm=name,
                     docker_tag=algo['docker-tag'],
+                    singularity_tag=algo.get('singularity-tag'),
                     module=algo['module'],
                     constructor=algo['constructor'],
                     arguments=aargs,
diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py
index 44732b2..0fbb4e2 100644
--- a/ann_benchmarks/algorithms/nmslib.py
+++ b/ann_benchmarks/algorithms/nmslib.py
@@ -76,12 +76,15 @@ def set_query_arguments(self, ef):
         if self._method_name == 'hnsw' or self._method_name == 'sw-graph':
             self._index.setQueryTimeParams(["efSearch=%s"%(ef)])
 
-    def query(self, v, n):
+    def query(self, v, n, rq=False):
         # Chunjiang modified
         if self._nmslib_metric == 'jaccard_sparse':
             nz = numpy.nonzero(v)[0]
             v = ' '.join([str(k) for k in nz])
-        ids, distances = self._index.knnQuery(v, n)
+        if rq:
+            ids, distances = self._index.rangeQuery(v, n)
+        else:
+            ids, distances = self._index.knnQuery(v, n)
         return ids
 
     def batch_query(self, X, n):
diff --git a/ann_benchmarks/algorithms/panng_ngt.py b/ann_benchmarks/algorithms/panng_ngt.py
index a06f6e6..6158f30 100644
--- a/ann_benchmarks/algorithms/panng_ngt.py
+++ b/ann_benchmarks/algorithms/panng_ngt.py
@@ -58,8 +58,12 @@ def set_query_arguments(self, epsilon):
         self._epsilon = epsilon - 1.0
         self.name = 'PANNG-NGT(%d, %d, %d, %1.3f)' % (self._edge_size, self._pathadj_size, self._edge_size_for_search, self._epsilon + 1.0)
 
-    def query(self, v, n):
-        results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
+    def query(self, v, n, rq=False):
+        if rq:
+            # range query: forwarded to searchRange instead of search
+            results = self.index.searchRange(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
+        else:
+            results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
         return results
 
     def freeIndex(self):
diff --git a/ann_benchmarks/algorithms/risc.py b/ann_benchmarks/algorithms/risc.py
index fe89304..368233d 100644
--- a/ann_benchmarks/algorithms/risc.py
+++ b/ann_benchmarks/algorithms/risc.py
@@ -60,14 +60,20 @@ def pre_query(self, v, n):
         queries = pyrisc.readQueries("query.txt", self._featureId)
         self._queryFP = pyrisc.dataBinary_getFingerPrint(queries, 0)
 
-    def query(self, v, n):
-        self._n = n
-        self._results = pyrisc._experiments_runTopK_inMemory(self._index, self._data, self._queryFP, self._n, self._method)
+    def query(self, v, n, rq=False):
+        if rq:
+            self._results = pyrisc._experiments_runRange_InMemory(self._index, self._data, self._queryFP, 1.0-n, self._method)
+        else:
+            self._n = n
+            self._results = pyrisc._experiments_runTopK_inMemory(self._index, self._data, self._queryFP, n, self._method)
 
-    def post_query(self):
+    def post_query(self, rq=False):
         if os.path.isfile("result.txt"):
             os.remove("result.txt")
-        pyrisc.writeResults("result.txt", self._data, self._results, self._n)
+        if rq:
+            pyrisc.writeResults_Range("result.txt", self._data, self._results)
+        else:
+            pyrisc.writeResults("result.txt", self._data, self._results, self._n)
 
         # read results from output file
         result = []
diff --git a/ann_benchmarks/main.py b/ann_benchmarks/main.py
index 55c1534..dc34bf8 100644
--- a/ann_benchmarks/main.py
+++ b/ann_benchmarks/main.py
@@ -75,7 +75,15 @@ def main():
     parser.add_argument(
         '--local',
         action='store_true',
-        help='If set, then will run everything locally (inside the same process) rather than using Docker')
+        help='If set, then will run everything locally (inside the same process) rather than using Docker or Singularity')
+    parser.add_argument(
+        '--docker',
+        action='store_true',
+        help='If set, then will run Docker')
+    parser.add_argument(
+        '--sif-dir',
+        help='Singularity image files directory',
+        default='./singularity')
     parser.add_argument(
         '--batch',
         action='store_true',
@@ -152,7 +160,25 @@ def main():
         print('running only', args.algorithm)
         definitions = [d for d in definitions if d.algorithm == args.algorithm]
 
-    if args.local:
+    if not args.local:
+        if args.docker:
+            # See which Docker images we have available
+            docker_client = docker.from_env()
+            docker_tags = set()
+            for image in docker_client.images.list():
+                for tag in image.tags:
+                    tag = tag.split(':')[0]
+                    docker_tags.add(tag)
+
+            if args.docker_tag:
+                print('running only', args.docker_tag)
+                definitions = [d for d in definitions if d.docker_tag == args.docker_tag]
+
+            if set(d.docker_tag for d in definitions).difference(docker_tags):
+                print('not all docker images available, only:', set(docker_tags))
+                print('missing docker images:', set(d.docker_tag for d in definitions).difference(docker_tags))
+                definitions = [d for d in definitions if d.docker_tag in docker_tags]
+    else:
         def _test(df):
             status = algorithm_status(df)
             # If the module was loaded but doesn't actually have a constructor of
@@ -189,9 +215,11 @@ def _test(df):
         try:
             if args.local:
-                run(definition, args.dataset, args.count, args.runs, args.batch, args.rq, args.radius)
-            else:
-                # run_docker(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius)
-                run_singularity(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius)
+                # run() takes six arguments; as in run_from_cmdline, the radius replaces count for range queries
+                run(definition, args.dataset, args.radius if args.rq else args.count, args.runs, args.batch, args.rq)
+            elif args.docker:
+                run_docker(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius)
+            else:  # by default run Singularity
+                run_singularity(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius, args.sif_dir)
         except KeyboardInterrupt:
             break
         except:
diff --git a/ann_benchmarks/results.py b/ann_benchmarks/results.py
index 3adbcd5..94f7c6b 100644
--- a/ann_benchmarks/results.py
+++ b/ann_benchmarks/results.py
@@ -70,8 +70,39 @@ def load_all_results(dataset=None, count=None, split_batched=False, batch_mode=
             except:
                 pass
 
+def load_algorithms_results(algorithm=None, dataset=None, count=None, split_batched=False, batch_mode=False):
+    """Yield (properties, hdf5_file) pairs for the selected algorithms only.
+
+    A result directory is kept when its last path component is contained in
+    `algorithm`; passing None keeps every directory. Each file is closed
+    when the consumer resumes the generator after the yield.
+    """
+    for root, _, files in os.walk(get_result_filename(dataset, count)):
+        if algorithm is not None and os.path.basename(root) not in algorithm:
+            continue
+        for fn in files:
+            try:
+                f = h5py.File(os.path.join(root, fn))
+            except Exception:
+                # file could not be opened by h5py: skip it
+                continue
+            try:
+                properties = dict(f.attrs)
+                # TODO Fix this properly. Sometimes the hdf5 file returns bytes
+                # This converts these bytes to strings before we work with them
+                for k, value in properties.items():
+                    if isinstance(value, bytes):
+                        properties[k] = value.decode()
+                yield properties, f
+            except Exception:
+                # best effort, like load_all_results; GeneratorExit still propagates
+                pass
+            finally:
+                f.close()
+
 def get_unique_algorithms():
     algorithms = set()
     for properties, _ in load_all_results():
-        algorithms.add(properties['algo'])
+        if properties and 'algo' in properties:
+            algorithms.add(properties['algo'])
     return algorithms
diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py
index aa40d87..d51cdf5 100644
--- a/ann_benchmarks/runner.py
+++ b/ann_benchmarks/runner.py
@@ -34,18 +34,21 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
     n_items_processed = [0]  # a bit dumb but can't be a scalar since of Python's scoping rules
 
     def single_query(v):
-        # special code for Risc
+        # special code for the Risc and DivideSkip
         if "Risc" in algoname or 'DivideSkip' in algoname:
             algo.pre_query(v, count)
+
         start = time.time()
         if rq:
             candidates = algo.query(v, count, rq) # now count is the radius
         else:
             candidates = algo.query(v, count)
         total = (time.time() - start)
-        # special code for Risc
+
+        # special code for the Risc and DivideSkip
         if "Risc" in algoname or 'DivideSkip' in algoname:
-            candidates = algo.post_query()
+            candidates = algo.post_query(rq)
+
         if issparse(X_train):
             candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0])))
                           for idx in candidates]
@@ -64,7 +67,6 @@ def batch_query(X):
         algo.batch_query(X, count)
         total = (time.time() - start)
         results = algo.get_batch_results()
-        # needs testing
         if issparse(X_train):
             candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0])))
                            for idx in single_results]
@@ -115,20 +117,19 @@ def run(definition, dataset, count, run_count, batch, rq):
         X_train = D['train'][()].toarray()
     else:
         X_train = D['train'][()]
-    # X_train = numpy.array(D['train'])
-    #X_train = X_train[:2000]
     X_test = numpy.array(D['test'])
     distance = D.attrs['distance']
     print('got a train set of size (%d * %d)' % X_train.shape)
     print('got %d queries' % len(X_test))
 
     try:
-        # special code for Risc
+        # special code for the Risc and DivideSkip
         print(X_train.shape)
         if 'Risc' in definition.algorithm or 'DivideSkip' in definition.algorithm:
             X_train = numpy.concatenate((X_train, [numpy.ones(X_train.shape[1], dtype=numpy.bool)]), axis=0)
             print(X_train.shape)
             algo.pre_fit(X_train)
+
         t0 = time.time()
         index_size_before = algo.get_index_size("self")
         algo.fit(X_train)
@@ -206,6 +207,7 @@ def run_from_cmdline():
     definition = Definition(
         algorithm=args.algorithm,
         docker_tag=None,  # not needed
+        singularity_tag=None,  # not needed
         module=args.module,
         constructor=args.constructor,
         arguments=algo_args,
@@ -215,7 +217,7 @@ def run_from_cmdline():
     if args.rq:
         run(definition, args.dataset, args.radius, args.runs, args.batch, args.rq)
     else:
-        run(definition, args.dataset, args.count, args.runs, args.batch)
+        run(definition, args.dataset, args.count, args.runs, args.batch, args.rq)
 
 
 def run_docker(definition, dataset, count, runs, timeout, batch, rq, radius, mem_limit=None):
@@ -278,7 +280,7 @@ def stream_logs():
     finally:
         container.remove(force=True)
 
-def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius, mem_limit=None):
+def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius, sif_dir, mem_limit=None):
     cmd = ['--dataset', dataset,
            '--algorithm', definition.algorithm,
            '--module', definition.module,
@@ -296,7 +298,10 @@ def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius
 
     strCmd = ' '.join(["'" + k + "'" for k in cmd])
     print('String of command', strCmd)
-    subprocess.check_call('singularity exec ./singularity/ann-bench-nmslib3.sif python3 run_algorithm.py %s' %(strCmd), shell=True)
+    # Pass the arguments as a list so no shell is involved and values
+    # containing quotes or spaces reach run_algorithm.py unchanged.
+    sif_path = '%s/%s.sif' % (sif_dir, definition.singularity_tag)
+    subprocess.check_call(['singularity', 'exec', sif_path, 'python3', 'run_algorithm.py'] + cmd)
     #subprocess.check_call('singularity exec ../singularity/ann-bench-pynndescent.sif python3 run_algorithm.py %s' %(strCmd), shell=True)
     #subprocess.check_call('singularity exec ../singularity/ann-bench-datasketch.sif python3 run_algorithm.py %s' %(strCmd), shell=True)
     #subprocess.check_call('singularity exec ../singularity/ann-bench-sklearn.sif python3 run_algorithm.py %s' %(strCmd), shell=True)