Skip to content

Commit

Permalink
add the singularity tag
Browse files Browse the repository at this point in the history
  • Loading branch information
root committed Jan 5, 2020
1 parent 0536c86 commit 3c8296d
Show file tree
Hide file tree
Showing 8 changed files with 116 additions and 33 deletions.
22 changes: 16 additions & 6 deletions algos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ bit:
Bruteforce:
disabled: false
docker-tag: ann-benchmarks-sklearn
singularity-tag: ann-bench-sklearn
module: ann_benchmarks.algorithms.bruteforce
constructor: BruteForceBLAS
base-args: ["@metric"]
Expand All @@ -540,6 +541,7 @@ bit:
Balltree(Sklearn):
disabled: false
docker-tag: ann-benchmarks-sklearn
singularity-tag: ann-bench-sklearn
module: ann_benchmarks.algorithms.balltree
constructor: BallTree
base-args: ["@metric"]
Expand All @@ -549,6 +551,7 @@ bit:
VPtree(Nmslib):
disabled: false
docker-tag: ann-benchmarks-nmslib
singularity-tag: ann-bench-nmslib3
module: ann_benchmarks.algorithms.nmslib
constructor: NmslibReuseIndex
base-args: ["@metric", "vptree"]
Expand All @@ -557,12 +560,12 @@ bit:
# When @args is a dictionary, algorithm instances will be generated
# by taking the Cartesian product of all of its values.
arg-groups:
- {"tuneK": 10, "desiredRecall": [0.999, 0.997, 0.995, 0.99, 0.97, 0.95, 0.9, 0.85, 0.8,
0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.02, 0.01]}
- {"tuneK": 10, "desiredRecall": [0.01]}
- False
Datasketch:
disabled: false
docker-tag: ann-benchmarks-datasketch
singularity-tag: ann-bench-datasketch
module: ann_benchmarks.algorithms.datasketch
constructor: DataSketch
base-args: ["@metric"]
Expand All @@ -572,6 +575,7 @@ bit:
Hnsw(Nmslib):
disabled: false
docker-tag: ann-benchmarks-nmslib
singularity-tag: ann-bench-nmslib3
module: ann_benchmarks.algorithms.nmslib
constructor: NmslibReuseIndex
base-args: ["@metric", "hnsw"]
Expand Down Expand Up @@ -610,6 +614,7 @@ bit:
SW-graph(Nmslib):
disabled: false
docker-tag: ann-benchmarks-nmslib
singularity-tag: ann-bench-nmslib3
module: ann_benchmarks.algorithms.nmslib
constructor: NmslibReuseIndex
base-args: ["@metric", "sw-graph"]
Expand Down Expand Up @@ -652,6 +657,7 @@ bit:
Pynndescent:
disabled: false
docker-tag: ann-benchmarks-pynndescent
singularity-tag: ann-bench-pynndescent
module: ann_benchmarks.algorithms.pynndescent
constructor: PyNNDescent
base-args: ["@metric"]
Expand All @@ -662,26 +668,29 @@ bit:
Onng(Ngt):
disabled: false
docker-tag: ann-benchmarks-ngt
singularity-tag: ann-bench-ngt
module: ann_benchmarks.algorithms.onng_ngt
constructor: ONNG
base-args: ["@metric", "Byte", 1.0]
run-groups:
onng:
args: [[100, 300, 500, 1000], [10, 30, 50, 100], [10, 30, 50, 120]]
query-args: [[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]]
args: [[1000], [100], [120]]
query-args: [[2.0]]
Panng(Ngt):
disabled: false
docker-tag: ann-benchmarks-ngt
singularity-tag: ann-bench-ngt
module: ann_benchmarks.algorithms.panng_ngt
constructor: PANNG
base-args: ["@metric", "Byte"]
run-groups:
panng:
args: [[10, 20, 40], [40], [30, 60, 120]]
query-args: [[0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 1.0, 1.02, 1.05, 1.1, 1.2, 1.5, 2.0]]
args: [[40], [40], [120]]
query-args: [[2.2]]
Risc:
disabled: false
docker-tag: ann-benchmarks-risc
singularity-tag: ann-bench-risc
module: ann_benchmarks.algorithms.risc
constructor: Risc
base-args: ["@metric", "Risc"]
Expand All @@ -692,6 +701,7 @@ bit:
DivideSkip:
disabled: false
docker-tag: ann-benchmarks-risc
singularity-tag: ann-bench-risc
module: ann_benchmarks.algorithms.risc
constructor: Risc
base-args: ["@metric", "DivideSkip"]
Expand Down
3 changes: 2 additions & 1 deletion ann_benchmarks/algorithms/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from itertools import product


Definition = collections.namedtuple('Definition', ['algorithm', 'constructor', 'module', 'docker_tag', 'arguments', 'query_argument_groups', 'disabled'])
Definition = collections.namedtuple('Definition', ['algorithm', 'constructor', 'module', 'docker_tag', 'singularity_tag', 'arguments', 'query_argument_groups', 'disabled'])

def get_algorithm_name(name, batch):
if batch:
Expand Down Expand Up @@ -165,6 +165,7 @@ def get_definitions(definition_file, dimension, point_type="float", distance_met
definitions.append(Definition(
algorithm=name,
docker_tag=algo['docker-tag'],
singularity_tag=algo['singularity-tag'],
module=algo['module'],
constructor=algo['constructor'],
arguments=aargs,
Expand Down
10 changes: 7 additions & 3 deletions ann_benchmarks/algorithms/nmslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def fit(self, X):
# what(): The data size is too small or the bucket size is too big. Select the parameters so that <total # of records> is NOT less than <bucket size> * 1000
# Aborted (core dumped)
self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

# Chunjiang modified it to "if" for jaccard
if self._nmslib_metric == 'jaccard_sparse':
X_trans = NmslibReuseIndex.matrToStrArray(csr_matrix(X))
Expand All @@ -76,12 +76,16 @@ def set_query_arguments(self, ef):
if self._method_name == 'hnsw' or self._method_name == 'sw-graph':
self._index.setQueryTimeParams(["efSearch=%s"%(ef)])

def query(self, v, n):
def query(self, v, n, rq=False):
# Chunjiang modified
if self._nmslib_metric == 'jaccard_sparse':
nz = numpy.nonzero(v)[0]
v = ' '.join([str(k) for k in nz])
ids, distances = self._index.knnQuery(v, n)
print(n)
if rq:
ids, distances = self._index.rangeQuery(v, n)
else:
ids, distances = self._index.knnQuery(v, n)
return ids

def batch_query(self, X, n):
Expand Down
13 changes: 11 additions & 2 deletions ann_benchmarks/algorithms/panng_ngt.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,17 @@ def set_query_arguments(self, epsilon):
self._epsilon = epsilon - 1.0
self.name = 'PANNG-NGT(%d, %d, %d, %1.3f)' % (self._edge_size, self._pathadj_size, self._edge_size_for_search, self._epsilon + 1.0)

def query(self, v, n):
results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
def query(self, v, n, rq=False):
if rq:
# direct method
#self.index.set(sys.maxsize, n)
#n = 0 # then input size 0 to search
#results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)

# indirect method
results = self.index.searchRange(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
else:
results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False)
return results

def freeIndex(self):
Expand Down
16 changes: 11 additions & 5 deletions ann_benchmarks/algorithms/risc.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,20 @@ def pre_query(self, v, n):
queries = pyrisc.readQueries("query.txt", self._featureId)
self._queryFP = pyrisc.dataBinary_getFingerPrint(queries, 0)

def query(self, v, n):
self._n = n
self._results = pyrisc._experiments_runTopK_inMemory(self._index, self._data, self._queryFP, self._n, self._method)
def query(self, v, n, rq=False):
if rq:
self._results = pyrisc._experiments_runRange_InMemory(self._index, self._data, self._queryFP, 1.0-n, self._method)
else:
self._n = n
self._results = pyrisc._experiments_runTopK_inMemory(self._index, self._data, self._queryFP, n, self._method)

def post_query(self):
def post_query(self, rq=False):
if os.path.isfile("result.txt"):
os.remove("result.txt")
pyrisc.writeResults("result.txt", self._data, self._results, self._n)
if rq:
pyrisc.writeResults_Range("result.txt", self._data, self._results)
else:
pyrisc.writeResults("result.txt", self._data, self._results, self._n)

# read results from output file
result = []
Expand Down
37 changes: 32 additions & 5 deletions ann_benchmarks/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,15 @@ def main():
parser.add_argument(
'--local',
action='store_true',
help='If set, then will run everything locally (inside the same process) rather than using Docker')
help='If set, then will run everything locally (inside the same process) rather than using Docker or Singularity')
parser.add_argument(
'--docker',
action='store_true',
help='If set, then will run Docker')
parser.add_argument(
'--sif-dir',
help='Singularity image files directory',
default='./singularity')
parser.add_argument(
'--batch',
action='store_true',
Expand Down Expand Up @@ -152,7 +160,25 @@ def main():
print('running only', args.algorithm)
definitions = [d for d in definitions if d.algorithm == args.algorithm]

if args.local:
if not args.local:
if args.docker:
# See which Docker images we have available
docker_client = docker.from_env()
docker_tags = set()
for image in docker_client.images.list():
for tag in image.tags:
tag = tag.split(':')[0]
docker_tags.add(tag)

if args.docker_tag:
print('running only', args.docker_tag)
definitions = [d for d in definitions if d.docker_tag == args.docker_tag]

if set(d.docker_tag for d in definitions).difference(docker_tags):
print('not all docker images available, only:', set(docker_tags))
print('missing docker images:', set(d.docker_tag for d in definitions).difference(docker_tags))
definitions = [d for d in definitions if d.docker_tag in docker_tags]
else:
def _test(df):
status = algorithm_status(df)
# If the module was loaded but doesn't actually have a constructor of
Expand Down Expand Up @@ -189,9 +215,10 @@ def _test(df):
try:
if args.local:
run(definition, args.dataset, args.count, args.runs, args.batch, args.rq, args.radius)
else:
# run_docker(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius)
run_singularity(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius)
elif args.docker:
run_docker(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius)
else:# by default run Singularity
run_singularity(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, args.rq, args.radius, args.sif_dir)
except KeyboardInterrupt:
break
except:
Expand Down
25 changes: 24 additions & 1 deletion ann_benchmarks/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,31 @@ def load_all_results(dataset=None, count=None, split_batched=False, batch_mode=
except:
pass

# only load results in algorithm
def load_algorithms_results(algorithm=None, dataset=None, count=None, split_batched=False, batch_mode=False):
for root, _, files in os.walk(get_result_filename(dataset, count)):
# only load results in algorithm
if root[root.rfind('/')+1:] not in algorithm: continue
for fn in files:
try:
f = h5py.File(os.path.join(root, fn))
properties = dict(f.attrs)
# TODO Fix this properly. Sometimes the hdf5 file returns bytes
# This converts these bytes to strings before we work with them
for k in properties.keys():
try:
properties[k]= properties[k].decode()
except:
pass
yield properties, f
f.close()
except:
pass

def get_unique_algorithms():
algorithms = set()
for properties, _ in load_all_results():
algorithms.add(properties['algo'])
if properties:
if 'algo' in properties.keys():
algorithms.add(properties['algo'])
return algorithms
23 changes: 13 additions & 10 deletions ann_benchmarks/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,21 @@ def run_individual_query(algoname, algo, X_train, X_test, distance, count, run_c
n_items_processed = [0] # a bit dumb but can't be a scalar since of Python's scoping rules

def single_query(v):
# special code for Risc
# special code for the Risc and DivideSkip
if "Risc" in algoname or 'DivideSkip' in algoname:
algo.pre_query(v, count)

start = time.time()
if rq:
candidates = algo.query(v, count, rq) # now count is the radius
candidates = algo.query(v, count, rq)
else:
candidates = algo.query(v, count)
total = (time.time() - start)
# special code for Risc

# special code for the Risc and DivideSkip
if "Risc" in algoname or 'DivideSkip' in algoname:
candidates = algo.post_query()

if issparse(X_train):
candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0])))
for idx in candidates]
Expand All @@ -64,7 +67,6 @@ def batch_query(X):
algo.batch_query(X, count)
total = (time.time() - start)
results = algo.get_batch_results()
# needs testing
if issparse(X_train):
candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx].toarray()[0])))
for idx in single_results]
Expand Down Expand Up @@ -115,20 +117,19 @@ def run(definition, dataset, count, run_count, batch, rq):
X_train = D['train'][()].toarray()
else:
X_train = D['train'][()]
# X_train = numpy.array(D['train'])
#X_train = X_train[:2000]
X_test = numpy.array(D['test'])
distance = D.attrs['distance']
print('got a train set of size (%d * %d)' % X_train.shape)
print('got %d queries' % len(X_test))

try:
# special code for Risc
# special code for the Risc and DivideSkip
print(X_train.shape)
if 'Risc' in definition.algorithm or 'DivideSkip' in definition.algorithm:
X_train = numpy.concatenate((X_train, [numpy.ones(X_train.shape[1], dtype=numpy.bool)]), axis=0)
print(X_train.shape)
algo.pre_fit(X_train)

t0 = time.time()
index_size_before = algo.get_index_size("self")
algo.fit(X_train)
Expand Down Expand Up @@ -206,6 +207,7 @@ def run_from_cmdline():
definition = Definition(
algorithm=args.algorithm,
docker_tag=None, # not needed
singularity_tag=None, # not needed
module=args.module,
constructor=args.constructor,
arguments=algo_args,
Expand All @@ -215,7 +217,7 @@ def run_from_cmdline():
if args.rq:
run(definition, args.dataset, args.radius, args.runs, args.batch, args.rq)
else:
run(definition, args.dataset, args.count, args.runs, args.batch)
run(definition, args.dataset, args.count, args.runs, args.batch, args.rq)


def run_docker(definition, dataset, count, runs, timeout, batch, rq, radius, mem_limit=None):
Expand Down Expand Up @@ -278,7 +280,7 @@ def stream_logs():

finally:
container.remove(force=True)
def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius, mem_limit=None):
def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius, sif_dir, mem_limit=None):
cmd = ['--dataset', dataset,
'--algorithm', definition.algorithm,
'--module', definition.module,
Expand All @@ -296,7 +298,8 @@ def run_singularity(definition, dataset, count, runs, timeout, batch, rq, radius
strCmd = ' '.join(["'" + k + "'" for k in cmd])
print('String of command', strCmd)

subprocess.check_call('singularity exec ./singularity/ann-bench-nmslib3.sif python3 run_algorithm.py %s' %(strCmd), shell=True)
subprocess.check_call('singularity exec %s/%s.sif python3 run_algorithm.py %s' %(sif_dir, definition.singularity_tag, strCmd), shell=True)
#subprocess.check_call('singularity exec ../../singularity/ann-bench-nmslib3.sif python3 run_algorithm.py %s' %(strCmd), shell=True)
#subprocess.check_call('singularity exec ../singularity/ann-bench-pynndescent.sif python3 run_algorithm.py %s' %(strCmd), shell=True)
#subprocess.check_call('singularity exec ../singularity/ann-bench-datasketch.sif python3 run_algorithm.py %s' %(strCmd), shell=True)
#subprocess.check_call('singularity exec ../singularity/ann-bench-sklearn.sif python3 run_algorithm.py %s' %(strCmd), shell=True)
Expand Down

0 comments on commit 3c8296d

Please sign in to comment.