From 2cebede4b128d109d8a7ebd2aa9602f641c75b03 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Wed, 29 Apr 2020 15:28:45 -0400
Subject: [PATCH] add plot.py

---
 .DS_Store                      | Bin 6148 -> 10244 bytes
 ann_benchmarks/.DS_Store       | Bin 0 -> 8196 bytes
 ann_benchmarks/datasets_old.py | 480 ---------------------------------
 plot.py                        | 123 +++++++++
 run.sh                         |  39 ---
 5 files changed, 123 insertions(+), 519 deletions(-)
 create mode 100644 ann_benchmarks/.DS_Store
 delete mode 100644 ann_benchmarks/datasets_old.py
 create mode 100644 plot.py
 delete mode 100644 run.sh

diff --git a/.DS_Store b/.DS_Store
index ebf969c050bf019577fb14d65a3a2354e1309ed6..4e1bd748b650552aa40f334f5e1003a448294a31 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/ann_benchmarks/.DS_Store b/ann_benchmarks/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..827c239a459a68fb374e916c34eb01900326db89
Binary files /dev/null and b/ann_benchmarks/.DS_Store differ
diff --git a/ann_benchmarks/datasets_old.py b/ann_benchmarks/datasets_old.py
deleted file mode 100644
index 64c7716..0000000
--- a/ann_benchmarks/datasets_old.py
+++ /dev/null
@@ -1,480 +0,0 @@
-import h5py
-import numpy
-import os
-import random
-import sys
-try:
-    from urllib import urlretrieve
-except ImportError:
-    from urllib.request import urlretrieve # Python 3
-
-
-def download(src, dst):
-    if not os.path.exists(dst):
-        # TODO: should be atomic
-        print('downloading %s -> %s...' % (src, dst))
-        urlretrieve(src, dst)
-
-
-def get_dataset_fn(dataset):
-    if not os.path.exists('data'):
-        os.mkdir('data')
-    return os.path.join('data', '%s.hdf5' % dataset)
-
-
-def get_dataset(which):
-    hdf5_fn = get_dataset_fn(which)
-    try:
-        url = 'http://ann-benchmarks.com/%s.hdf5' % which
-        download(url, hdf5_fn)
-    except:
-        print("Cannot download %s" % url)
-        if which in DATASETS:
-            print("Creating dataset locally")
-            DATASETS[which](hdf5_fn)
-    if "sparse" not in which:
-        hdf5_f = h5py.File(hdf5_fn)
-    else:
-        import h5sparse
-        hdf5_f = h5sparse.File(hdf5_fn)
-    return hdf5_f
-
-
-# Everything below this line is related to creating datasets
-# You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com
-
-def write_output(train, test, fn, distance, point_type='float', count=1000, sparse=False):
-    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
-    import sklearn.neighbors
-
-    n = 0
-    if sparse == False:
-        f = h5py.File(fn, 'w')
-    else:
-        import h5sparse
-        f = h5sparse.File(fn, 'w')
-    f.attrs['distance'] = distance
-    f.attrs['point_type'] = point_type
-    print('train size: %9d * %4d' % train.shape)
-    print('test size:  %9d * %4d' % test.shape)
-    if sparse == False:
-        f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train
-    else:
-        f.create_dataset('train',data=train)
-    f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
-    # f.create_dataset('test', data=test)
-    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
-    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
-
-    # use which method to compute the groundtruth
-    method = 'balltree'
-    if method == 'balltree':
-        # only serve for jaccard
-        # todo: generalize to other metrics
-        tree = sklearn.neighbors.BallTree(train, leaf_size=20, metric='jaccard')
-    else:
-        bf = BruteForceBLAS(distance, precision=train.dtype)
-        bf.fit(train)
-
-    print(test)
-    for i, x in enumerate(test):
-        if i % 1 == 0:
-            print('%d/%d...' % (i, test.shape[0]))
-        if method == 'balltree':
-            res = tree.query(x, k=count)
-        else:
-            res = list(bf.query_with_distances(x, count))
-            res.sort(key=lambda t: t[-1])
-        neighbors[i] = [j for j, _ in res]
-        distances[i] = [d for _, d in res]
-        print(neighbors[i])
-        print(distances[i])
-    f.close()
-
-
-def train_test_split(X, test_size=10000):
-    import sklearn.model_selection
-    print('Splitting %d*%d into train/test' % X.shape)
-    return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1)
-
-
-def glove(out_fn, d):
-    import zipfile
-
-    url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
-    fn = os.path.join('data', 'glove.twitter.27B.zip')
-    download(url, fn)
-    with zipfile.ZipFile(fn) as z:
-        print('preparing %s' % out_fn)
-        z_fn = 'glove.twitter.27B.%dd.txt' % d
-        X = []
-        for line in z.open(z_fn):
-            v = [float(x) for x in line.strip().split()[1:]]
-            X.append(numpy.array(v))
-        X_train, X_test = train_test_split(X)
-        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
-
-
-def _load_texmex_vectors(f, n, k):
-    import struct
-
-    v = numpy.zeros((n, k))
-    for i in range(n):
-        f.read(4) # ignore vec length
-        v[i] = struct.unpack('f' * k, f.read(k*4))
-
-    return v
-
-
-def _get_irisa_matrix(t, fn):
-    import struct
-    m = t.getmember(fn)
-    f = t.extractfile(m)
-    k, = struct.unpack('i', f.read(4))
-    n = m.size // (4 + 4*k)
-    f.seek(0)
-    return _load_texmex_vectors(f, n, k)
-
-
-def sift(out_fn):
-    import tarfile
-
-    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'
-    fn = os.path.join('data', 'sift.tar.tz')
-    download(url, fn)
-    with tarfile.open(fn, 'r:gz') as t:
-        train = _get_irisa_matrix(t, 'sift/sift_base.fvecs')
-        test = _get_irisa_matrix(t, 'sift/sift_query.fvecs')
-        write_output(train, test, out_fn, 'euclidean')
-
-
-def gist(out_fn):
-    import tarfile
-
-    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz'
-    fn = os.path.join('data', 'gist.tar.tz')
-    download(url, fn)
-    with tarfile.open(fn, 'r:gz') as t:
-        train = _get_irisa_matrix(t, 'gist/gist_base.fvecs')
-        test = _get_irisa_matrix(t, 'gist/gist_query.fvecs')
-        write_output(train, test, out_fn, 'euclidean')
-
-
-def _load_mnist_vectors(fn):
-    import gzip
-    import struct
-
-    print('parsing vectors in %s...' % fn)
-    f = gzip.open(fn)
-    type_code_info = {
-        0x08: (1, "!B"),
-        0x09: (1, "!b"),
-        0x0B: (2, "!H"),
-        0x0C: (4, "!I"),
-        0x0D: (4, "!f"),
-        0x0E: (8, "!d")
-    }
-    magic, type_code, dim_count = struct.unpack("!hBB", f.read(4))
-    assert magic == 0
-    assert type_code in type_code_info
-
-    dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)]
-
-    entry_count = dimensions[0]
-    entry_size = numpy.product(dimensions[1:])
-
-    b, format_string = type_code_info[type_code]
-    vectors = []
-    for i in range(entry_count):
-        vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)])
-    return numpy.array(vectors)
-
-
-def mnist(out_fn):
-    download('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz')
-    download('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz')
-    train = _load_mnist_vectors('mnist-train.gz')
-    test = _load_mnist_vectors('mnist-test.gz')
-    write_output(train, test, out_fn, 'euclidean')
-
-
-def fashion_mnist(out_fn):
-    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', 'fashion-mnist-train.gz')
-    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', 'fashion-mnist-test.gz')
-    train = _load_mnist_vectors('fashion-mnist-train.gz')
-    test = _load_mnist_vectors('fashion-mnist-test.gz')
-    write_output(train, test, out_fn, 'euclidean')
-
-
-def transform_bag_of_words(filename, n_dimensions, out_fn):
-    import gzip
-    from scipy.sparse import lil_matrix
-    from sklearn.feature_extraction.text import TfidfTransformer
-    from sklearn import random_projection
-    with gzip.open(filename, 'rb') as f:
-        file_content = f.readlines()
-        entries = int(file_content[0])
-        words = int(file_content[1])
-        file_content = file_content[3:] # strip first three entries
-        print("building matrix...")
-        A = lil_matrix((entries, words))
-        for e in file_content:
-            doc, word, cnt = [int(v) for v in e.strip().split()]
-            A[doc - 1, word - 1] = cnt
-        print("normalizing matrix entries with tfidf...")
-        B = TfidfTransformer().fit_transform(A)
-        print("reducing dimensionality...")
-        C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B)
-        X_train, X_test = train_test_split(C)
-        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
-
-
-def nytimes(out_fn, n_dimensions):
-    fn = 'nytimes_%s.txt.gz' % n_dimensions
-    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn)
-    transform_bag_of_words(fn, n_dimensions, out_fn)
-
-
-def random(out_fn, n_dims, n_samples, centers, distance):
-    import sklearn.datasets
-
-    X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1)
-    X_train, X_test = train_test_split(X, test_size=0.1)
-    write_output(X_train, X_test, out_fn, distance)
-
-
-def word2bits(out_fn, path, fn):
-    import tarfile
-    local_fn = fn + '.tar.gz'
-    url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn)
-    download(url, local_fn)
-    print('parsing vectors in %s...' % local_fn)
-    with tarfile.open(local_fn, 'r:gz') as t:
-        f = t.extractfile(fn)
-        n_words, k = [int(z) for z in next(f).strip().split()]
-        X = numpy.zeros((n_words, k), dtype=numpy.bool)
-        for i in range(n_words):
-            X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=numpy.bool)
-
-        X_train, X_test = train_test_split(X, test_size=1000)
-        write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-def sift_hamming(out_fn, fn):
-    import tarfile
-    local_fn = fn + '.tar.gz'
-    url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn
-    download(url, local_fn)
-    print('parsing vectors in %s...' % local_fn)
-    with tarfile.open(local_fn, 'r:gz') as t:
-        f = t.extractfile(fn)
-        lines = f.readlines()
-        X = numpy.zeros((len(lines), 256), dtype=numpy.bool)
-        for i, line in enumerate(lines):
-            X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool)
-        X_train, X_test = train_test_split(X, test_size = 1000)
-        write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-def lastfm(out_fn, n_dimensions, test_size=50000):
-    # This tests out ANN methods for retrieval on simple matrix factorization based
-    # recommendation algorithms. The idea being that the query/test vectors are user factors
-    # and the train set are item factors from the matrix factorization model.
-
-    # Since the predictor is a dot product, we transform the factors first as described in this
-    # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf
-    # This hopefully replicates the experiments done in this post:
-    # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/
-
-    # The dataset is from "Last.fm Dataset - 360K users":
-    # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html
-
-    # this requires the implicit package to generate the factors (on my desktop/gpu this only
-    # takes 4-5 seconds to train - but could take 1-2 minutes on a laptop)
-    from implicit.datasets.lastfm import get_lastfm
-    from implicit.approximate_als import augment_inner_product_matrix
-    import implicit
-
-    # train an als model on the lastfm data
-    _, _, play_counts = get_lastfm()
-    model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
-    model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8))
-
-    # transform item factors so that each one has the same norm, and transform the user
-    # factors such by appending a 0 column
-    _, item_factors = augment_inner_product_matrix(model.item_factors)
-    user_factors = numpy.append(model.user_factors,
-                                numpy.zeros((model.user_factors.shape[0], 1)),
-                                axis=1)
-
-    # only query the first 50k users (speeds things up signficantly without changing results)
-    user_factors = user_factors[:test_size]
-
-    # after that transformation a cosine lookup will return the same results as the inner product
-    # on the untransformed data
-    write_output(item_factors, user_factors, out_fn, 'angular')
-
-def ecfp(out_fn, dataset_name, dimension, distance, type):
-    print('prepare dataset ' + dataset_name)
-    import pickle
-    path = '../pycharm_project_426/src/'
-    if type == 'bit':
-        dtype = numpy.bool
-    elif type == 'int':
-        dtype = numpy.int
-    else:
-        dtype = numpy.float
-    if dataset_name.startswith('toy'):
-        # toy
-        with open(path + dataset_name + '_' + str(dimension) + '_training.pickle', 'rb') as handle:
-            X_train = pickle.load(handle, encoding='latin1')
-        with open(path + dataset_name + '_' + str(dimension) + '_test.pickle', 'rb') as handle:
-            X_test = pickle.load(handle, encoding='latin1')
-        X_train = numpy.asarray(X_train.toarray(), dtype)
-        X_test = numpy.asarray(X_test.toarray(), dtype)
-    else:
-        # Chembl
-        with open(path + dataset_name + '_' + str(dimension) + '.pickle', 'rb') as handle:
-            X = pickle.load(handle, encoding='latin1')
-        X = numpy.asarray(X.toarray(), dtype)
-        X_train, X_test = train_test_split(X, test_size=1000)
-
-    print(X_train)
-    print(X_test)
-    write_output(X_train, X_test, out_fn, distance, type)
-
-def ecfp_sparse(out_fn, dataset_name, dimension, distance, type):
-    print('prepare dataset ' + dataset_name)
-    import pickle
-    path = '../pycharm_project_426/src/'
-    if type == 'bit':
-        dtype = numpy.bool
-    elif type == 'int':
-        dtype = numpy.int
-    else:
-        dtype = numpy.float
-
-    with open(path + dataset_name + '_' + str(dimension) + '.pickle', 'rb') as handle:
-        X = pickle.load(handle, encoding='latin1')
-    X = X.astype(dtype)
-    X_train, X_test = train_test_split(X, test_size=100)
-    X_test = X_test.toarray()
-
-    print(X_train)
-    print(X_test)
-    write_output(X_train, X_test, out_fn, distance, type, 1000, True)
-
-def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type):
-    print('prepare dataset ' + dataset_name)
-    import pickle
-    from scipy.sparse import vstack
-    path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/'
-    if type == 'bit':
-        dtype = numpy.bool
-    elif type == 'int':
-        dtype = numpy.int
-    else:
-        dtype = numpy.float
-
-    # vertically stack sparse matrices from multiple files
-    test_size = 1
-    if num_files==0.5:
-        with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle:
-            Y = pickle.load(handle, encoding='latin1')
-        size = 2000000
-        print('select %i out of %i' %(size, Y.shape[0]))
-        Y = Y[:size]
-        X_test = Y[Y.shape[0] - test_size:]
-        X_train = Y[:Y.shape[0] - test_size]
-    else:
-        first = False
-        for i in range(num_files):
-            print('process ' + str(i) + ' trunk')
-            if first == False:
-                first = True
-                with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle:
-                    Y = pickle.load(handle, encoding='latin1')
-                if i==num_files-1: #last one
-                    X_test = Y[Y.shape[0] - test_size:]
-                    X_train = Y[:Y.shape[0] - test_size]
-                else:
-                    X_train = Y
-            else:
-                with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle:
-                    Y = pickle.load(handle, encoding='latin1')
-                if i==num_files-1: #last one
-                    X_test = Y[Y.shape[0] - test_size:]
-                    X_train = vstack([X_train, Y[:Y.shape[0] - test_size]])
-                else:
-                    X_train = vstack([X_train, Y])
-    # X_train = X_train.astype(dtype)
-    # X_test = X_test.astype(dtype)
-
-    # X_train, X_test = train_test_split(X, test_size=1000)
-    # X_test = X_test.toarray()
-    # encounter memory error when calling train_test_split, for 100M
-    X_test = X_test.toarray()
-    print('finish data preparation')
-
-    print(X_train.shape)
-    print(X_test.shape)
-    write_output(X_train, X_test, out_fn, distance, type, 1000, True)
-
-def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type):
-    print('prepare dataset ' + dataset_name)
-    import pickle
-    from scipy.sparse import vstack
-    path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/'
-    if type == 'bit':
-        dtype = numpy.bool
-    elif type == 'int':
-        dtype = numpy.int
-    else:
-        dtype = numpy.float
-
-    # vertically stack sparse matrices from multiple files
-    test_size = 3
-    with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle:
-        Y = pickle.load(handle, encoding='latin1')
-    size = 10000000
-    print('select %i out of %i' %(size, Y.shape[0]))
-    Y = Y[:size]
-    X_test = Y[Y.shape[0] - test_size:]
-    X_train = Y[:Y.shape[0] - test_size]
-
-    # make them full matrices here
-    X_train = X_train.toarray()
-    X_test = X_test.toarray()
-    print('finish data preparation')
-
-    print(X_train.shape)
-    print(X_test.shape)
-    write_output(X_train, X_test, out_fn, distance, type, 1000)
-
-DATASETS = {
-    'fashion-mnist-784-euclidean': fashion_mnist,
-    'gist-960-euclidean': gist,
-    'glove-25-angular': lambda out_fn: glove(out_fn, 25),
-    'glove-50-angular': lambda out_fn: glove(out_fn, 50),
-    'glove-100-angular': lambda out_fn: glove(out_fn, 100),
-    'glove-200-angular': lambda out_fn: glove(out_fn, 200),
-    'mnist-784-euclidean': mnist,
-    'random-xs-20-euclidean': lambda out_fn: random(out_fn, 20, 10000, 100, 'euclidean'),
-    'random-s-100-euclidean': lambda out_fn: random(out_fn, 100, 100000, 1000, 'euclidean'),
-    'random-xs-20-angular': lambda out_fn: random(out_fn, 20, 10000, 100, 'angular'),
-    'random-s-100-angular': lambda out_fn: random(out_fn, 100, 100000, 1000, 'angular'),
-    'sift-128-euclidean': sift,
-    'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
-    'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
-    'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
-    'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
-    'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'),
-    # below are datasets Chunjiang added
-    'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'int'),
-    'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'int'),
-    'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'),
-    'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'),
-    'chembl-sparse-1024-jaccard': lambda out_fn: ecfp_sparse(out_fn, 'Chembl10K', 1024, 'jaccard', 'bit'),
-    'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
-    'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
-    'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'),
-    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit')
-}
diff --git a/plot.py b/plot.py
new file mode 100644
index 0000000..191116e
--- /dev/null
+++ b/plot.py
@@ -0,0 +1,123 @@
+import os
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+import argparse
+
+from ann_benchmarks.datasets import get_dataset
+from ann_benchmarks.algorithms.definitions import get_definitions
+from ann_benchmarks.plotting.metrics import all_metrics as metrics
+from ann_benchmarks.plotting.utils import get_plot_label, compute_metrics, create_linestyles, create_pointset
+from ann_benchmarks.results import store_results, load_all_results, get_unique_algorithms, get_algorithm_name
+
+
+def create_plot(all_data, raw, x_log, y_log, xn, yn, fn_out, linestyles, batch):
+    xm, ym = (metrics[xn], metrics[yn])
+    # Now generate each plot
+    handles = []
+    labels = []
+    plt.figure(figsize=(12, 9))
+    for algo in sorted(all_data.keys(), key=lambda x: x.lower()):
+        xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn)
+        color, faded, linestyle, marker = linestyles[algo]
+        handle, = plt.plot(xs, ys, '-', label=algo, color=color, ms=7, mew=3, lw=3, linestyle=linestyle, marker=marker)
+        handles.append(handle)
+        if raw:
+            handle2, = plt.plot(axs, ays, '-', label=algo, color=faded, ms=5, mew=2, lw=2, linestyle=linestyle, marker=marker)
+        labels.append(get_algorithm_name(algo, batch))
+
+    if x_log:
+        plt.gca().set_xscale('log')
+    if y_log:
+        plt.gca().set_yscale('log')
+    #plt.gca().set_title(get_plot_label(xm, ym), fontsize=15)
+    plt.gca().set_ylabel(ym['description'], fontsize=15)
+    plt.gca().set_xlabel(xm['description'], fontsize=15)
+    box = plt.gca().get_position()
+    # plt.gca().set_position([box.x0, box.y0, box.width * 0.8, box.height])
+    #plt.gca().legend(handles, labels, loc='lower left', prop={'size': 12})
+    plt.xticks(size=15)
+    plt.yticks(size=15)
+    plt.grid(b=True, which='major', color='0.65',linestyle='-')
+    if 'lim' in xm:
+        plt.xlim(xm['lim'])
+    if 'lim' in ym:
+        plt.ylim(ym['lim'])
+    plt.savefig(fn_out, bbox_inches='tight')
+    plt.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--dataset',
+        metavar="DATASET",
+        default='glove-100-angular')
+    parser.add_argument(
+        '--count',
+        default=10)
+    parser.add_argument(
+        '--definitions',
+        metavar='FILE',
+        help='load algorithm definitions from FILE',
+        default='algos.yaml')
+    parser.add_argument(
+        '--limit',
+        default=-1)
+    parser.add_argument(
+        '-o', '--output')
+    parser.add_argument(
+        '-x', '--x-axis',
+        help = 'Which metric to use on the X-axis',
+        choices = metrics.keys(),
+        default = "k-nn")
+    parser.add_argument(
+        '-y', '--y-axis',
+        help = 'Which metric to use on the Y-axis',
+        choices = metrics.keys(),
+        default = "qps")
+    parser.add_argument(
+        '-X', '--x-log',
+        help='Draw the X-axis using a logarithmic scale',
+        action='store_true')
+    parser.add_argument(
+        '-Y', '--y-log',
+        help='Draw the Y-axis using a logarithmic scale',
+        action='store_true')
+    parser.add_argument(
+        '--raw',
+        help='Show raw results (not just Pareto frontier) in faded colours',
+        action='store_true')
+    parser.add_argument(
+        '--batch',
+        help='Plot runs in batch mode',
+        action='store_true')
+    parser.add_argument(
+        '--rq',
+        action='store_true',
+        help='If set, plot range queries')
+    parser.add_argument(
+        "--radius",
+        default=0.3,
+        type=float,
+        help="the range of similarity to search for")
+    args = parser.parse_args()
+
+    if not args.output:
+        args.output = 'results/%s.png' % get_algorithm_name(args.dataset, args.batch)
+        print('writing output to %s' % args.output)
+
+    dataset = get_dataset(args.dataset)
+    if args.rq:
+        count = args.radius
+    else:
+        count = int(args.count)
+    unique_algorithms = get_unique_algorithms()
+    results = load_all_results(args.dataset, count, True, args.batch)
+    linestyles = create_linestyles(sorted(unique_algorithms))
+    runs = compute_metrics(list(dataset["distances"]), results, args.x_axis, args.y_axis)
+    if not runs:
+        raise Exception('Nothing to plot')
+
+    create_plot(runs, args.raw, args.x_log,
+                args.y_log, args.x_axis, args.y_axis, args.output, linestyles, args.batch)
diff --git a/run.sh b/run.sh
deleted file mode 100644
index 3529ebc..0000000
--- a/run.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-#SBATCH --partition=HaswellPriority   # Name of partition
-#SBATCH --ntasks=1                    # Request 48 CPU cores
-#SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]
-#SBATCH --exclusive
-
-module load anaconda/5.1.0
-source activate ann_env
-module purge
-module load gcc/5.4.0
-module load singularity/3.1
-#python cpBuildingTime.py
-#singularity exec ../singularity/ann-bench-nmslib.sif python -c 'import nmslib'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Hnsw(Nmslib)' # Replace with your application's commands
-#python run.py --dataset=molport-1024-jaccard --algorithm='Hnsw(Nmslib)' --batch
-#python run.py --dataset=molport-1024-jaccard --algorithm='SW-graph(Nmslib)'
-#python run.py --dataset=molport-1024-jaccard --algorithm='VPtree(Nmslib)'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Pynndescent'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Datasketch'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Bruteforce'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Balltree(Sklearn)'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Risc'
-#python run.py --dataset=molport-1024-jaccard --algorithm='DivideSkip'
-python run.py --dataset=molport-1024-jaccard --rq --radius=0.4 --algorithm='Onng(Ngt)'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Panng(Ngt)'
-
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' # Replace with your application's commands
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --batch
-#python run.py --dataset=chembl-1024-jaccard --algorithm='SW-graph(Nmslib)'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='VPtree(Nmslib)'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Pynndescent'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Datasketch'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Bruteforce'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Balltree(Sklearn)'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Risc'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='DivideSkip'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Onng(Ngt)'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Panng(Ngt)'
-
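
Usage note (not part of the patch): a minimal sketch of how the new plot.py is intended to be driven, assuming results have already been produced by run.py. The dataset and algorithm names are taken from the deleted run.sh and the DATASETS table above; the metric names and output path simply follow plot.py's argparse defaults, so adjust them for your own runs.

# assumed k-NN workflow: run an algorithm, then plot recall (k-nn) vs. queries per second (qps)
python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)'
python plot.py --dataset chembl-1024-jaccard --count 10 -x k-nn -y qps --raw

# assumed range-query workflow: pass plot.py the same radius that run.py was given
python run.py --dataset=chembl-1024-jaccard --rq --radius=0.4 --algorithm='Onng(Ngt)'
python plot.py --dataset chembl-1024-jaccard --rq --radius 0.4

By default the figure is written to results/<dataset>.png (or the batch-mode variant of that name), matching the get_algorithm_name() call in plot.py.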