From d515b327080e7c8898e776171d7f7fc7cfef429c Mon Sep 17 00:00:00 2001 From: searchivairus Date: Tue, 6 Feb 2018 22:00:58 -0500 Subject: [PATCH] Improving Python notebooks (no ticket) --- .../notebooks/search_dense_nonoptim.ipynb | 376 ++++++++++++++++++ ...h_dense.ipynb => search_dense_optim.ipynb} | 92 ++--- python_bindings/notebooks/search_sparse.ipynb | 86 ++-- 3 files changed, 463 insertions(+), 91 deletions(-) create mode 100644 python_bindings/notebooks/search_dense_nonoptim.ipynb rename python_bindings/notebooks/{search_dense.ipynb => search_dense_optim.ipynb} (82%) diff --git a/python_bindings/notebooks/search_dense_nonoptim.ipynb b/python_bindings/notebooks/search_dense_nonoptim.ipynb new file mode 100644 index 0000000..77ff7e4 --- /dev/null +++ b/python_bindings/notebooks/search_dense_nonoptim.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy \n", + "import sys \n", + "import nmslib \n", + "import time \n", + "import math \n", + "from sklearn.neighbors import NearestNeighbors\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just read the data\n", + "all_data_matrix = numpy.loadtxt('../../sample_data/final128_10K.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Create a held-out query data set\n", + "(data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size = 0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# of queries 1000, # of data points 9000\n" + ] + } + ], + "source": [ + "print(\"# of queries %d, # of data points %d\" % (query_matrix.shape[0], data_matrix.shape[0]) )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Set index parameters\n", + "# These are the most important onese\n", + "M = 15\n", + "efC = 100\n", + "\n", + "num_threads = 4\n", + "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0,\n", + " 'skip_optimized_index' : 1 # using non-optimized index!\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Number of neighbors \n", + "K=100" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Space name should correspond to the space name \n", + "# used for brute-force search\n", + "space_name='l2'" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9000" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Intitialize the library, specify the space, the type of the vector and add data points \n", + "index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) \n", + "index.addDataPointBatch(data_matrix) " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index-time parameters {'M': 15, 'indexThreadQty': 4, 'efConstruction': 100, 'skip_optimized_index': 1, 'post': 0}\n", + "Indexing time = 0.291947\n" + ] + } + ], + "source": [ + "# Create an index\n", + "start = time.time()\n", + "index.createIndex(index_time_params) \n", + "end = time.time() \n", + "print('Index-time parameters', index_time_params)\n", + "print('Indexing time = %f' % (end-start))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting query-time parameters {'efSearch': 100}\n" + ] + } + ], + "source": [ + "# Setting query-time parameters\n", + "efS = 100\n", + "query_time_params = {'efSearch': efS}\n", + "print('Setting query-time parameters', query_time_params)\n", + "index.setQueryTimeParams(query_time_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN time total=0.037669 (sec), per query=0.000038 (sec), per query adjusted for thread number=0.000151 (sec)\n" + ] + } + ], + "source": [ + "# Querying\n", + "query_qty = query_matrix.shape[0]\n", + "start = time.time() \n", + "nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)\n", + "end = time.time() \n", + "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing gold-standard data\n", + "Brute-force preparation time 0.001139\n", + "brute-force kNN time total=0.319399 (sec), per query=0.000319 (sec)\n" + ] + } + ], + "source": [ + "# Computing gold-standard data \n", + "print('Computing gold-standard data')\n", + "\n", + "start = time.time()\n", + "sindx = NearestNeighbors(n_neighbors=K, metric='l2', algorithm='brute').fit(data_matrix)\n", + "end = time.time()\n", + "\n", + "print('Brute-force preparation time %f' % (end - start))\n", + "\n", + "start = time.time() \n", + "gs = sindx.kneighbors(query_matrix)\n", + "end = time.time()\n", + "\n", + "print('brute-force kNN time total=%f (sec), per query=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty) )" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN recall 0.993040\n" + ] + } + ], + "source": [ + "# Finally computing recall\n", + "recall=0.0\n", + "for i in range(0, query_qty):\n", + " correct_set = set(gs[1][i])\n", + " ret_set = set(nbrs[i][0])\n", + " recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n", + "recall = recall / query_qty\n", + "print('kNN recall %f' % recall)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Save a meta index\n", + "index.saveIndex('dense_index_nonoptim.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Re-intitialize the library, specify the space, the type of the vector.\n", + "newIndex = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9000" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For non-optimized indices or methods different from HNSW we DO need to re-add data points\n", + "newIndex.addDataPointBatch(data_matrix) " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Re-load the index and re-run queries\n", + "newIndex.loadIndex('dense_index_nonoptim.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting query-time parameters {'efSearch': 100}\n", + "kNN time total=0.031991 (sec), per query=0.000032 (sec), per query adjusted for thread number=0.000128 (sec)\n" + ] + } + ], + "source": [ + "# Setting query-time parameters and querying\n", + "print('Setting query-time parameters', query_time_params)\n", + "newIndex.setQueryTimeParams(query_time_params)\n", + "\n", + "query_qty = query_matrix.shape[0]\n", + "start = time.time() \n", + "new_nbrs = newIndex.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)\n", + "end = time.time() \n", + "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN recall 0.993040\n" + ] + } + ], + "source": [ + "# Finally computing recall for the new result set\n", + "recall=0.0\n", + "for i in range(0, query_qty):\n", + " correct_set = set(gs[1][i])\n", + " ret_set = set(new_nbrs[i][0])\n", + " recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n", + "recall = recall / query_qty\n", + "print('kNN recall %f' % recall)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python_bindings/notebooks/search_dense.ipynb b/python_bindings/notebooks/search_dense_optim.ipynb similarity index 82% rename from python_bindings/notebooks/search_dense.ipynb rename to python_bindings/notebooks/search_dense_optim.ipynb index 8d07ff0..21b6875 100644 --- a/python_bindings/notebooks/search_dense.ipynb +++ b/python_bindings/notebooks/search_dense_optim.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 183, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -19,8 +19,10 @@ }, { "cell_type": "code", - "execution_count": 184, - "metadata": {}, + "execution_count": 2, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Just read the data\n", @@ -29,8 +31,10 @@ }, { "cell_type": "code", - "execution_count": 185, - "metadata": {}, + "execution_count": 3, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Create a held-out query data set\n", @@ -39,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 186, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -56,14 +60,14 @@ }, { "cell_type": "code", - "execution_count": 187, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Index-time parameters {'indexThreadQty': 4, 'M': 15, 'efConstruction': 100, 'post': 0}\n" + "Index-time parameters {'efConstruction': 100, 'indexThreadQty': 4, 'M': 15, 'post': 0}\n" ] } ], @@ -80,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 188, + "execution_count": 6, "metadata": { "collapsed": true }, @@ -92,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 189, + "execution_count": 7, "metadata": { "collapsed": true }, @@ -105,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 190, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -114,7 +118,7 @@ "9000" ] }, - "execution_count": 190, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -127,15 +131,15 @@ }, { "cell_type": "code", - "execution_count": 191, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Index-time parameters {'indexThreadQty': 4, 'M': 15, 'efConstruction': 100}\n", - "Indexing time = 0.233778\n" + "Index-time parameters {'efConstruction': 100, 'indexThreadQty': 4, 'M': 15}\n", + "Indexing time = 0.298415\n" ] } ], @@ -151,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 192, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -172,14 +176,14 @@ }, { "cell_type": "code", - "execution_count": 193, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "kNN time total=0.025079 (sec), per query=0.000025 (sec), per query adjusted for thread number=0.000100 (sec)\n" + "kNN time total=0.025963 (sec), per query=0.000026 (sec), per query adjusted for thread number=0.000104 (sec)\n" ] } ], @@ -195,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 194, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -203,8 +207,8 @@ "output_type": "stream", "text": [ "Computing gold-standard data\n", - "Brute-force preparation time 0.001082\n", - "brute-force kNN time total=0.222535 (sec), per query=0.000223 (sec)\n" + "Brute-force preparation time 0.001208\n", + "brute-force kNN time total=0.438362 (sec), per query=0.000438 (sec)\n" ] } ], @@ -228,14 +232,14 @@ }, { "cell_type": "code", - "execution_count": 195, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "kNN recall 0.992800\n" + "kNN recall 0.991740\n" ] } ], @@ -252,54 +256,46 @@ }, { "cell_type": "code", - "execution_count": 196, + "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Save a meta index\n", - "index.saveIndex('dense_index.bin')" + "index.saveIndex('dense_index_optim.bin')" ] }, { "cell_type": "code", - "execution_count": 197, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "9000" - ] - }, - "execution_count": 197, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], "source": [ - "# Re-intitialize the library, specify the space, the type of the vector and add data points \n", + "# Re-intitialize the library, specify the space, the type of the vector.\n", "newIndex = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) \n", - "\n", - "newIndex.addDataPointBatch(data_matrix) " + "# For an optimized L2 index, there's no need to re-load data points, but this would be required for\n", + "# non-optimized index or any other methods different from HNSW (other methods can save only meta indices)\n", + "#newIndex.addDataPointBatch(data_matrix) " ] }, { "cell_type": "code", - "execution_count": 198, + "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Re-load the index and re-run queries\n", - "newIndex.loadIndex('dense_index.bin')" + "newIndex.loadIndex('dense_index_optim.bin')" ] }, { "cell_type": "code", - "execution_count": 199, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -307,7 +303,7 @@ "output_type": "stream", "text": [ "Setting query-time parameters {'efSearch': 100}\n", - "kNN time total=0.027727 (sec), per query=0.000028 (sec), per query adjusted for thread number=0.000111 (sec)\n" + "kNN time total=0.022647 (sec), per query=0.000023 (sec), per query adjusted for thread number=0.000091 (sec)\n" ] } ], @@ -326,14 +322,14 @@ }, { "cell_type": "code", - "execution_count": 200, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "kNN recall 0.992800\n" + "kNN recall 0.991740\n" ] } ], diff --git a/python_bindings/notebooks/search_sparse.ipynb b/python_bindings/notebooks/search_sparse.ipynb index 0b0e2d9..94ea321 100644 --- a/python_bindings/notebooks/search_sparse.ipynb +++ b/python_bindings/notebooks/search_sparse.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 70, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 4, "metadata": { "collapsed": true }, @@ -87,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -104,17 +104,9 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index-time parameters {'M': 30, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}\n" - ] - } - ], + "outputs": [], "source": [ "# Set index parameters\n", "# These are the most important onese\n", @@ -122,13 +114,12 @@ "efC = 100\n", "\n", "num_threads = 4\n", - "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}\n", - "print('Index-time parameters', index_time_params)" + "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 7, "metadata": { "collapsed": true }, @@ -140,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -149,7 +140,7 @@ "4500" ] }, - "execution_count": 77, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -162,22 +153,21 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Index-time parameters {'M': 30, 'indexThreadQty': 4, 'efConstruction': 100}\n", - "Indexing time = 12.817909\n" + "Index-time parameters {'indexThreadQty': 4, 'post': 0, 'M': 30, 'efConstruction': 100}\n", + "Indexing time = 16.409487\n" ] } ], "source": [ "# Create an index\n", "start = time.time()\n", - "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}\n", "index.createIndex(index_time_params) \n", "end = time.time() \n", "print('Index-time parameters', index_time_params)\n", @@ -186,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -207,14 +197,14 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "kNN time total=1.068709 (sec), per query=0.002137 (sec), per query adjusted for thread number=0.008550 (sec)\n" + "kNN time total=2.378762 (sec), per query=0.004758 (sec), per query adjusted for thread number=0.019030 (sec)\n" ] } ], @@ -230,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -238,8 +228,8 @@ "output_type": "stream", "text": [ "Computing gold-standard data\n", - "Brute-force preparation time 0.085145\n", - "brute-force kNN time total=4.342679 (sec), per query=0.008685 (sec)\n" + "Brute-force preparation time 0.058578\n", + "brute-force kNN time total=2.372437 (sec), per query=0.004745 (sec)\n" ] } ], @@ -263,14 +253,14 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "kNN recall 0.971320\n" + "kNN recall 0.970280\n" ] } ], @@ -287,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 14, "metadata": { "collapsed": true }, @@ -299,7 +289,19 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Re-intitialize the library, specify the space, the type of the vector\n", + "newIndex = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR) " + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -308,21 +310,19 @@ "4500" ] }, - "execution_count": 84, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Re-intitialize the library, specify the space, the type of the vector and add data points \n", - "newIndex = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR) \n", - "\n", + "# For non-optimized indices we need to re-add data points\n", "newIndex.addDataPointBatch(data_matrix) " ] }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 17, "metadata": { "collapsed": true }, @@ -334,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -342,7 +342,7 @@ "output_type": "stream", "text": [ "Setting query-time parameters {'efSearch': 100}\n", - "kNN time total=1.036072 (sec), per query=0.002072 (sec), per query adjusted for thread number=0.008289 (sec)\n" + "kNN time total=1.224607 (sec), per query=0.002449 (sec), per query adjusted for thread number=0.009797 (sec)\n" ] } ], @@ -361,14 +361,14 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "kNN recall 0.971320\n" + "kNN recall 0.970280\n" ] } ],