From 2cebede4b128d109d8a7ebd2aa9602f641c75b03 Mon Sep 17 00:00:00 2001
From: ChunjiangZhu
Date: Wed, 29 Apr 2020 15:28:45 -0400
Subject: [PATCH] add plot.py

---
 .DS_Store                      | Bin 6148 -> 10244 bytes
 ann_benchmarks/.DS_Store       | Bin 0 -> 8196 bytes
 ann_benchmarks/datasets_old.py | 480 ---------------------------------
 plot.py                        | 123 +++++++++
 run.sh                         |  39 ---
 5 files changed, 123 insertions(+), 519 deletions(-)
 create mode 100644 ann_benchmarks/.DS_Store
 delete mode 100644 ann_benchmarks/datasets_old.py
 create mode 100644 plot.py
 delete mode 100644 run.sh

diff --git a/.DS_Store b/.DS_Store
index ebf969c050bf019577fb14d65a3a2354e1309ed6..4e1bd748b650552aa40f334f5e1003a448294a31 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/ann_benchmarks/.DS_Store b/ann_benchmarks/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..827c239a459a68fb374e916c34eb01900326db89
Binary files /dev/null and b/ann_benchmarks/.DS_Store differ
diff --git a/ann_benchmarks/datasets_old.py b/ann_benchmarks/datasets_old.py
deleted file mode 100644
index 64c7716..0000000
--- a/ann_benchmarks/datasets_old.py
+++ /dev/null
@@ -1,480 +0,0 @@
-import h5py
-import numpy
-import os
-import random
-import sys
-try:
-    from urllib import urlretrieve
-except ImportError:
-    from urllib.request import urlretrieve # Python 3
-
-
-def download(src, dst):
-    if not os.path.exists(dst):
-        # TODO: should be atomic
-        print('downloading %s -> %s...' % (src, dst))
-        urlretrieve(src, dst)
-
-
-def get_dataset_fn(dataset):
-    if not os.path.exists('data'):
-        os.mkdir('data')
-    return os.path.join('data', '%s.hdf5' % dataset)
-
-
-def get_dataset(which):
-    hdf5_fn = get_dataset_fn(which)
-    try:
-        url = 'http://ann-benchmarks.com/%s.hdf5' % which
-        download(url, hdf5_fn)
-    except:
-        print("Cannot download %s" % url)
-        if which in DATASETS:
-            print("Creating dataset locally")
-            DATASETS[which](hdf5_fn)
-    if "sparse" not in which:
-        hdf5_f = h5py.File(hdf5_fn)
-    else:
-        import h5sparse
-        hdf5_f = h5sparse.File(hdf5_fn)
-    return hdf5_f
-
-
-# Everything below this line is related to creating datasets
-# You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com
-
-def write_output(train, test, fn, distance, point_type='float', count=1000, sparse=False):
-    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
-    import sklearn.neighbors
-
-    n = 0
-    if sparse == False:
-        f = h5py.File(fn, 'w')
-    else:
-        import h5sparse
-        f = h5sparse.File(fn, 'w')
-    f.attrs['distance'] = distance
-    f.attrs['point_type'] = point_type
-    print('train size: %9d * %4d' % train.shape)
-    print('test size:  %9d * %4d' % test.shape)
-    if sparse == False:
-        f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train
-    else:
-        f.create_dataset('train',data=train)
-    f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
-    # f.create_dataset('test', data=test)
-    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
-    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')
-
-    # use which method to compute the groundtruth
-    method = 'balltree'
-    if method == 'balltree':
-        # only serve for jaccard
-        # todo: generalize to other metrics
-        tree = sklearn.neighbors.BallTree(train, leaf_size=20, metric='jaccard')
-    else:
-        bf = BruteForceBLAS(distance, precision=train.dtype)
-        bf.fit(train)
-
-    print(test)
-    for i, x in enumerate(test):
-        if i % 1 == 0:
-            print('%d/%d...' % (i, test.shape[0]))
-        if method == 'balltree':
-            res = tree.query(x, k=count)
-        else:
-            res = list(bf.query_with_distances(x, count))
-            res.sort(key=lambda t: t[-1])
-        neighbors[i] = [j for j, _ in res]
-        distances[i] = [d for _, d in res]
-        print(neighbors[i])
-        print(distances[i])
-    f.close()
-
-
-def train_test_split(X, test_size=10000):
-    import sklearn.model_selection
-    print('Splitting %d*%d into train/test' % X.shape)
-    return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1)
-
-
-def glove(out_fn, d):
-    import zipfile
-
-    url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
-    fn = os.path.join('data', 'glove.twitter.27B.zip')
-    download(url, fn)
-    with zipfile.ZipFile(fn) as z:
-        print('preparing %s' % out_fn)
-        z_fn = 'glove.twitter.27B.%dd.txt' % d
-        X = []
-        for line in z.open(z_fn):
-            v = [float(x) for x in line.strip().split()[1:]]
-            X.append(numpy.array(v))
-        X_train, X_test = train_test_split(X)
-        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
-
-
-def _load_texmex_vectors(f, n, k):
-    import struct
-
-    v = numpy.zeros((n, k))
-    for i in range(n):
-        f.read(4) # ignore vec length
-        v[i] = struct.unpack('f' * k, f.read(k*4))
-
-    return v
-
-
-def _get_irisa_matrix(t, fn):
-    import struct
-    m = t.getmember(fn)
-    f = t.extractfile(m)
-    k, = struct.unpack('i', f.read(4))
-    n = m.size // (4 + 4*k)
-    f.seek(0)
-    return _load_texmex_vectors(f, n, k)
-
-
-def sift(out_fn):
-    import tarfile
-
-    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'
-    fn = os.path.join('data', 'sift.tar.tz')
-    download(url, fn)
-    with tarfile.open(fn, 'r:gz') as t:
-        train = _get_irisa_matrix(t, 'sift/sift_base.fvecs')
-        test = _get_irisa_matrix(t, 'sift/sift_query.fvecs')
-        write_output(train, test, out_fn, 'euclidean')
-
-
-def gist(out_fn):
-    import tarfile
-
-    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz'
-    fn = os.path.join('data', 'gist.tar.tz')
-    download(url, fn)
-    with tarfile.open(fn, 'r:gz') as t:
-        train = _get_irisa_matrix(t, 'gist/gist_base.fvecs')
-        test = _get_irisa_matrix(t, 'gist/gist_query.fvecs')
-        write_output(train, test, out_fn, 'euclidean')
-
-
-def _load_mnist_vectors(fn):
-    import gzip
-    import struct
-
-    print('parsing vectors in %s...' % fn)
-    f = gzip.open(fn)
-    type_code_info = {
-        0x08: (1, "!B"),
-        0x09: (1, "!b"),
-        0x0B: (2, "!H"),
-        0x0C: (4, "!I"),
-        0x0D: (4, "!f"),
-        0x0E: (8, "!d")
-    }
-    magic, type_code, dim_count = struct.unpack("!hBB", f.read(4))
-    assert magic == 0
-    assert type_code in type_code_info
-
-    dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)]
-
-    entry_count = dimensions[0]
-    entry_size = numpy.product(dimensions[1:])
-
-    b, format_string = type_code_info[type_code]
-    vectors = []
-    for i in range(entry_count):
-        vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)])
-    return numpy.array(vectors)
-
-
-def mnist(out_fn):
-    download('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz')
-    download('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz')
-    train = _load_mnist_vectors('mnist-train.gz')
-    test = _load_mnist_vectors('mnist-test.gz')
-    write_output(train, test, out_fn, 'euclidean')
-
-
-def fashion_mnist(out_fn):
-    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', 'fashion-mnist-train.gz')
-    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', 'fashion-mnist-test.gz')
-    train = _load_mnist_vectors('fashion-mnist-train.gz')
-    test = _load_mnist_vectors('fashion-mnist-test.gz')
-    write_output(train, test, out_fn, 'euclidean')
-
-
-def transform_bag_of_words(filename, n_dimensions, out_fn):
-    import gzip
-    from scipy.sparse import lil_matrix
-    from sklearn.feature_extraction.text import TfidfTransformer
-    from sklearn import random_projection
-    with gzip.open(filename, 'rb') as f:
-        file_content = f.readlines()
-        entries = int(file_content[0])
-        words = int(file_content[1])
-        file_content = file_content[3:] # strip first three entries
-        print("building matrix...")
-        A = lil_matrix((entries, words))
-        for e in file_content:
-            doc, word, cnt = [int(v) for v in e.strip().split()]
-            A[doc - 1, word - 1] = cnt
-        print("normalizing matrix entries with tfidf...")
-        B = TfidfTransformer().fit_transform(A)
-        print("reducing dimensionality...")
-        C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B)
-        X_train, X_test = train_test_split(C)
-        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
-
-
-def nytimes(out_fn, n_dimensions):
-    fn = 'nytimes_%s.txt.gz' % n_dimensions
-    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn)
-    transform_bag_of_words(fn, n_dimensions, out_fn)
-
-
-def random(out_fn, n_dims, n_samples, centers, distance):
-    import sklearn.datasets
-
-    X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1)
-    X_train, X_test = train_test_split(X, test_size=0.1)
-    write_output(X_train, X_test, out_fn, distance)
-
-
-def word2bits(out_fn, path, fn):
-    import tarfile
-    local_fn = fn + '.tar.gz'
-    url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn)
-    download(url, local_fn)
-    print('parsing vectors in %s...' % local_fn)
-    with tarfile.open(local_fn, 'r:gz') as t:
-        f = t.extractfile(fn)
-        n_words, k = [int(z) for z in next(f).strip().split()]
-        X = numpy.zeros((n_words, k), dtype=numpy.bool)
-        for i in range(n_words):
-            X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=numpy.bool)
-
-        X_train, X_test = train_test_split(X, test_size=1000)
-        write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-def sift_hamming(out_fn, fn):
-    import tarfile
-    local_fn = fn + '.tar.gz'
-    url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn
-    download(url, local_fn)
-    print('parsing vectors in %s...' % local_fn)
-    with tarfile.open(local_fn, 'r:gz') as t:
-        f = t.extractfile(fn)
-        lines = f.readlines()
-        X = numpy.zeros((len(lines), 256), dtype=numpy.bool)
-        for i, line in enumerate(lines):
-            X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool)
-        X_train, X_test = train_test_split(X, test_size = 1000)
-        write_output(X_train, X_test, out_fn, 'hamming', 'bit')
-
-def lastfm(out_fn, n_dimensions, test_size=50000):
-    # This tests out ANN methods for retrieval on simple matrix factorization based
-    # recommendation algorithms. The idea being that the query/test vectors are user factors
-    # and the train set are item factors from the matrix factorization model.
-
-    # Since the predictor is a dot product, we transform the factors first as described in this
-    # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf
-    # This hopefully replicates the experiments done in this post:
-    # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/
-
-    # The dataset is from "Last.fm Dataset - 360K users":
-    # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html
-
-    # this requires the implicit package to generate the factors (on my desktop/gpu this only
-    # takes 4-5 seconds to train - but could take 1-2 minutes on a laptop)
-    from implicit.datasets.lastfm import get_lastfm
-    from implicit.approximate_als import augment_inner_product_matrix
-    import implicit
-
-    # train an als model on the lastfm data
-    _, _, play_counts = get_lastfm()
-    model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
-    model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8))
-
-    # transform item factors so that each one has the same norm, and transform the user
-    # factors such by appending a 0 column
-    _, item_factors = augment_inner_product_matrix(model.item_factors)
-    user_factors = numpy.append(model.user_factors,
-                                numpy.zeros((model.user_factors.shape[0], 1)),
-                                axis=1)
-
-    # only query the first 50k users (speeds things up signficantly without changing results)
-    user_factors = user_factors[:test_size]
-
-    # after that transformation a cosine lookup will return the same results as the inner product
-    # on the untransformed data
-    write_output(item_factors, user_factors, out_fn, 'angular')
-
-def ecfp(out_fn, dataset_name, dimension, distance, type):
-    print('prepare dataset ' + dataset_name)
-    import pickle
-    path = '../pycharm_project_426/src/'
-    if type == 'bit':
-        dtype = numpy.bool
-    elif type == 'int':
-        dtype = numpy.int
-    else:
-        dtype = numpy.float
-    if dataset_name.startswith('toy'):
-        # toy
-        with open(path + dataset_name + '_' + str(dimension) + '_training.pickle', 'rb') as handle:
-            X_train = pickle.load(handle, encoding='latin1')
-        with open(path + dataset_name + '_' + str(dimension) + '_test.pickle', 'rb') as handle:
-            X_test = pickle.load(handle, encoding='latin1')
-        X_train = numpy.asarray(X_train.toarray(), dtype)
-        X_test = numpy.asarray(X_test.toarray(), dtype)
-    else:
-        # Chembl
-        with open(path + dataset_name + '_' + str(dimension) + '.pickle', 'rb') as handle:
-            X = pickle.load(handle, encoding='latin1')
-        X = numpy.asarray(X.toarray(), dtype)
-        X_train, X_test = train_test_split(X, test_size=1000)
-
-    print(X_train)
-    print(X_test)
-    write_output(X_train, X_test, out_fn, distance, type)
-
-def ecfp_sparse(out_fn, dataset_name, dimension, distance, type):
-    print('prepare dataset ' + dataset_name)
-    import pickle
-    path = '../pycharm_project_426/src/'
-    if type == 'bit':
-        dtype = numpy.bool
-    elif type == 'int':
-        dtype = numpy.int
-    else:
-        dtype = numpy.float
-
-    with open(path + dataset_name + '_' + str(dimension) + '.pickle', 'rb') as handle:
-        X = pickle.load(handle, encoding='latin1')
-    X = X.astype(dtype)
-    X_train, X_test = train_test_split(X, test_size=100)
-    X_test = X_test.toarray()
-
-    print(X_train)
-    print(X_test)
-    write_output(X_train, X_test, out_fn, distance, type, 1000, True)
-
-def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type):
-    print('prepare dataset ' + dataset_name)
-    import pickle
-    from scipy.sparse import vstack
-    path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/'
-    if type == 'bit':
-        dtype = numpy.bool
-    elif type == 'int':
-        dtype = numpy.int
-    else:
-        dtype = numpy.float
-
-    # vertically stack sparse matrices from multiple files
-    test_size = 1
-    if num_files==0.5:
-        with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle:
-            Y = pickle.load(handle, encoding='latin1')
-        size = 2000000
-        print('select %i out of %i' %(size, Y.shape[0]))
-        Y = Y[:size]
-        X_test = Y[Y.shape[0] - test_size:]
-        X_train = Y[:Y.shape[0] - test_size]
-    else:
-        first = False
-        for i in range(num_files):
-            print('process ' + str(i) + ' trunk')
-            if first == False:
-                first = True
-                with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle:
-                    Y = pickle.load(handle, encoding='latin1')
-                if i==num_files-1: #last one
-                    X_test = Y[Y.shape[0] - test_size:]
-                    X_train = Y[:Y.shape[0] - test_size]
-                else:
-                    X_train = Y
-            else:
-                with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle:
-                    Y = pickle.load(handle, encoding='latin1')
-                if i==num_files-1: #last one
-                    X_test = Y[Y.shape[0] - test_size:]
-                    X_train = vstack([X_train, Y[:Y.shape[0] - test_size]])
-                else:
-                    X_train = vstack([X_train, Y])
-    # X_train = X_train.astype(dtype)
-    # X_test = X_test.astype(dtype)
-
-    # X_train, X_test = train_test_split(X, test_size=1000)
-    # X_test = X_test.toarray()
-    # encounter memory error when calling train_test_split, for 100M
-    X_test = X_test.toarray()
-    print('finish data preparation')
-
-    print(X_train.shape)
-    print(X_test.shape)
-    write_output(X_train, X_test, out_fn, distance, type, 1000, True)
-
-def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type):
-    print('prepare dataset ' + dataset_name)
-    import pickle
-    from scipy.sparse import vstack
-    path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/'
-    if type == 'bit':
-        dtype = numpy.bool
-    elif type == 'int':
-        dtype = numpy.int
-    else:
-        dtype = numpy.float
-
-    # vertically stack sparse matrices from multiple files
-    test_size = 3
-    with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle:
-        Y = pickle.load(handle, encoding='latin1')
-    size = 10000000
-    print('select %i out of %i' %(size, Y.shape[0]))
-    Y = Y[:size]
-    X_test = Y[Y.shape[0] - test_size:]
-    X_train = Y[:Y.shape[0] - test_size]
-
-    # make them full matrices here
-    X_train = X_train.toarray()
-    X_test = X_test.toarray()
-    print('finish data preparation')
-
-    print(X_train.shape)
-    print(X_test.shape)
-    write_output(X_train, X_test, out_fn, distance, type, 1000)
-
-DATASETS = {
-    'fashion-mnist-784-euclidean': fashion_mnist,
-    'gist-960-euclidean': gist,
-    'glove-25-angular': lambda out_fn: glove(out_fn, 25),
-    'glove-50-angular': lambda out_fn: glove(out_fn, 50),
-    'glove-100-angular': lambda out_fn: glove(out_fn, 100),
-    'glove-200-angular': lambda out_fn: glove(out_fn, 200),
-    'mnist-784-euclidean': mnist,
-    'random-xs-20-euclidean': lambda out_fn: random(out_fn, 20, 10000, 100, 'euclidean'),
-    'random-s-100-euclidean': lambda out_fn: random(out_fn, 100, 100000, 1000, 'euclidean'),
-    'random-xs-20-angular': lambda out_fn: random(out_fn, 20, 10000, 100, 'angular'),
-    'random-s-100-angular': lambda out_fn: random(out_fn, 100, 100000, 1000, 'angular'),
-    'sift-128-euclidean': sift,
-    'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
-    'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
-    'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
-    'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
-    'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'),
-    # below are datasets Chunjiang added
-    'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'int'),
-    'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'int'),
-    'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'),
-    'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'),
-    'chembl-sparse-1024-jaccard': lambda out_fn: ecfp_sparse(out_fn, 'Chembl10K', 1024, 'jaccard', 'bit'),
-    'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
-    'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
-    'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'),
-    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit')
-}
diff --git a/plot.py b/plot.py
new file mode 100644
index 0000000..191116e
--- /dev/null
+++ b/plot.py
@@ -0,0 +1,123 @@
+import os
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+import argparse
+
+from ann_benchmarks.datasets import get_dataset
+from ann_benchmarks.algorithms.definitions import get_definitions
+from ann_benchmarks.plotting.metrics import all_metrics as metrics
+from ann_benchmarks.plotting.utils import get_plot_label, compute_metrics, create_linestyles, create_pointset
+from ann_benchmarks.results import store_results, load_all_results, get_unique_algorithms, get_algorithm_name
+
+
+def create_plot(all_data, raw, x_log, y_log, xn, yn, fn_out, linestyles, batch):
+    xm, ym = (metrics[xn], metrics[yn])
+    # Now generate each plot
+    handles = []
+    labels = []
+    plt.figure(figsize=(12, 9))
+    for algo in sorted(all_data.keys(), key=lambda x: x.lower()):
+        xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn)
+        color, faded, linestyle, marker = linestyles[algo]
+        handle, = plt.plot(xs, ys, '-', label=algo, color=color, ms=7, mew=3, lw=3, linestyle=linestyle, marker=marker)
+        handles.append(handle)
+        if raw:
+            handle2, = plt.plot(axs, ays, '-', label=algo, color=faded, ms=5, mew=2, lw=2, linestyle=linestyle, marker=marker)
+        labels.append(get_algorithm_name(algo, batch))
+
+    if x_log:
+        plt.gca().set_xscale('log')
+    if y_log:
+        plt.gca().set_yscale('log')
+    #plt.gca().set_title(get_plot_label(xm, ym), fontsize=15)
+    plt.gca().set_ylabel(ym['description'], fontsize=15)
+    plt.gca().set_xlabel(xm['description'], fontsize=15)
+    box = plt.gca().get_position()
+    # plt.gca().set_position([box.x0, box.y0, box.width * 0.8, box.height])
+    #plt.gca().legend(handles, labels, loc='lower left', prop={'size': 12})
+    plt.xticks(size=15)
+    plt.yticks(size=15)
+    plt.grid(b=True, which='major', color='0.65',linestyle='-')
+    if 'lim' in xm:
+        plt.xlim(xm['lim'])
+    if 'lim' in ym:
+        plt.ylim(ym['lim'])
+    plt.savefig(fn_out, bbox_inches='tight')
+    plt.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--dataset',
+        metavar="DATASET",
+        default='glove-100-angular')
+    parser.add_argument(
+        '--count',
+        default=10)
+    parser.add_argument(
+        '--definitions',
+        metavar='FILE',
+        help='load algorithm definitions from FILE',
+        default='algos.yaml')
+    parser.add_argument(
+        '--limit',
+        default=-1)
+    parser.add_argument(
+        '-o', '--output')
+    parser.add_argument(
+        '-x', '--x-axis',
+        help = 'Which metric to use on the X-axis',
+        choices = metrics.keys(),
+        default = "k-nn")
+    parser.add_argument(
+        '-y', '--y-axis',
+        help = 'Which metric to use on the Y-axis',
+        choices = metrics.keys(),
+        default = "qps")
+    parser.add_argument(
+        '-X', '--x-log',
+        help='Draw the X-axis using a logarithmic scale',
+        action='store_true')
+    parser.add_argument(
+        '-Y', '--y-log',
+        help='Draw the Y-axis using a logarithmic scale',
+        action='store_true')
+    parser.add_argument(
+        '--raw',
+        help='Show raw results (not just Pareto frontier) in faded colours',
+        action='store_true')
+    parser.add_argument(
+        '--batch',
+        help='Plot runs in batch mode',
+        action='store_true')
+    parser.add_argument(
+        '--rq',
+        action='store_true',
+        help='If set, plot range queries')
+    parser.add_argument(
+        "--radius",
+        default=0.3,
+        type=float,
+        help="the range of similarity to search for")
+    args = parser.parse_args()
+
+    if not args.output:
+        args.output = 'results/%s.png' % get_algorithm_name(args.dataset, args.batch)
+        print('writing output to %s' % args.output)
+
+    dataset = get_dataset(args.dataset)
+    if args.rq:
+        count = args.radius
+    else:
+        count = int(args.count)
+    unique_algorithms = get_unique_algorithms()
+    results = load_all_results(args.dataset, count, True, args.batch)
+    linestyles = create_linestyles(sorted(unique_algorithms))
+    runs = compute_metrics(list(dataset["distances"]), results, args.x_axis, args.y_axis)
+    if not runs:
+        raise Exception('Nothing to plot')
+
+    create_plot(runs, args.raw, args.x_log,
+                args.y_log, args.x_axis, args.y_axis, args.output, linestyles, args.batch)
diff --git a/run.sh b/run.sh
deleted file mode 100644
index 3529ebc..0000000
--- a/run.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-#SBATCH --partition=HaswellPriority   # Name of partition
-#SBATCH --ntasks=1                    # Request 48 CPU cores
-#SBATCH --exclude=cn[65-69,71-136,325-343,345-353,355-358,360-364,369-398,400-401],gpu[07-10]
-#SBATCH --exclusive
-
-module load anaconda/5.1.0
-source activate ann_env
-module purge
-module load gcc/5.4.0
-module load singularity/3.1
-#python cpBuildingTime.py
-#singularity exec ../singularity/ann-bench-nmslib.sif python -c 'import nmslib'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Hnsw(Nmslib)' # Replace with your application's commands
-#python run.py --dataset=molport-1024-jaccard --algorithm='Hnsw(Nmslib)' --batch
-#python run.py --dataset=molport-1024-jaccard --algorithm='SW-graph(Nmslib)'
-#python run.py --dataset=molport-1024-jaccard --algorithm='VPtree(Nmslib)'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Pynndescent'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Datasketch'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Bruteforce'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Balltree(Sklearn)'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Risc'
-#python run.py --dataset=molport-1024-jaccard --algorithm='DivideSkip'
-python run.py --dataset=molport-1024-jaccard --rq --radius=0.4 --algorithm='Onng(Ngt)'
-#python run.py --dataset=molport-1024-jaccard --algorithm='Panng(Ngt)'
-
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' # Replace with your application's commands
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)' --batch
-#python run.py --dataset=chembl-1024-jaccard --algorithm='SW-graph(Nmslib)'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='VPtree(Nmslib)'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Pynndescent'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Datasketch'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Bruteforce'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Balltree(Sklearn)'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Risc'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='DivideSkip'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Onng(Ngt)'
-#python run.py --dataset=chembl-1024-jaccard --algorithm='Panng(Ngt)'
-
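
Usage note (not part of the patch): a minimal sketch of how the new plot.py is intended to be driven, assuming results have already been produced by run.py. The dataset and algorithm names are taken from the deleted run.sh and the DATASETS table above; the metric names and output path simply follow plot.py's argparse defaults, so adjust them for your own runs.

# assumed k-NN workflow: run an algorithm, then plot recall (k-nn) vs. queries per second (qps)
python run.py --dataset=chembl-1024-jaccard --algorithm='Hnsw(Nmslib)'
python plot.py --dataset chembl-1024-jaccard --count 10 -x k-nn -y qps --raw

# assumed range-query workflow: pass plot.py the same radius that run.py was given
python run.py --dataset=chembl-1024-jaccard --rq --radius=0.4 --algorithm='Onng(Ngt)'
python plot.py --dataset chembl-1024-jaccard --rq --radius 0.4

By default the figure is written to results/<dataset>.png (or the batch-mode variant of that name), matching the get_algorithm_name() call in plot.py.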