diff --git a/python_bindings/notebooks/search_dense.ipynb b/python_bindings/notebooks/search_dense.ipynb new file mode 100644 index 0000000..8d07ff0 --- /dev/null +++ b/python_bindings/notebooks/search_dense.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 183, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy \n", + "import sys \n", + "import nmslib \n", + "import time \n", + "import math \n", + "from sklearn.neighbors import NearestNeighbors\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [], + "source": [ + "# Just read the data\n", + "all_data_matrix = numpy.loadtxt('../../sample_data/final128_10K.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a held-out query data set\n", + "(data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size = 0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# of queries 1000, # of data points 9000\n" + ] + } + ], + "source": [ + "print(\"# of queries %d, # of data points %d\" % (query_matrix.shape[0], data_matrix.shape[0]) )" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index-time parameters {'indexThreadQty': 4, 'M': 15, 'efConstruction': 100, 'post': 0}\n" + ] + } + ], + "source": [ + "# Set index parameters\n", + "# These are the most important ones\n", + "M = 15\n", + "efC = 100\n", + "\n", + "num_threads = 4\n", + "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}\n", + "print('Index-time parameters', index_time_params)" + ] + }, + { + "cell_type": "code", + "execution_count":
188, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Number of neighbors \n", + "K=100" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Space name should correspond to the space name \n", + "# used for brute-force search\n", + "space_name='l2'" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9000" + ] + }, + "execution_count": 190, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Initialize the library, specify the space, the type of the vector and add data points \n", + "index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) \n", + "index.addDataPointBatch(data_matrix) " + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index-time parameters {'indexThreadQty': 4, 'M': 15, 'efConstruction': 100}\n", + "Indexing time = 0.233778\n" + ] + } + ], + "source": [ + "# Create an index\n", + "start = time.time()\n", + "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}\n", + "index.createIndex(index_time_params) \n", + "end = time.time() \n", + "print('Index-time parameters', index_time_params)\n", + "print('Indexing time = %f' % (end-start))" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting query-time parameters {'efSearch': 100}\n" + ] + } + ], + "source": [ + "# Setting query-time parameters\n", + "efS = 100\n", + "query_time_params = {'efSearch': efS}\n", + "print('Setting query-time parameters', query_time_params)\n", + "index.setQueryTimeParams(query_time_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata":
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN time total=0.025079 (sec), per query=0.000025 (sec), per query adjusted for thread number=0.000100 (sec)\n" + ] + } + ], + "source": [ + "# Querying\n", + "query_qty = query_matrix.shape[0]\n", + "start = time.time() \n", + "nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)\n", + "end = time.time() \n", + "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) " + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing gold-standard data\n", + "Brute-force preparation time 0.001082\n", + "brute-force kNN time total=0.222535 (sec), per query=0.000223 (sec)\n" + ] + } + ], + "source": [ + "# Computing gold-standard data \n", + "print('Computing gold-standard data')\n", + "\n", + "start = time.time()\n", + "sindx = NearestNeighbors(n_neighbors=K, metric='l2', algorithm='brute').fit(data_matrix)\n", + "end = time.time()\n", + "\n", + "print('Brute-force preparation time %f' % (end - start))\n", + "\n", + "start = time.time() \n", + "gs = sindx.kneighbors(query_matrix)\n", + "end = time.time()\n", + "\n", + "print('brute-force kNN time total=%f (sec), per query=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty) )" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN recall 0.992800\n" + ] + } + ], + "source": [ + "# Finally computing recall\n", + "recall=0.0\n", + "for i in range(0, query_qty):\n", + " correct_set = set(gs[1][i])\n", + " ret_set = set(nbrs[i][0])\n", + " recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n", + "recall = recall / query_qty\n", +
"print('kNN recall %f' % recall)" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Save a meta index\n", + "index.saveIndex('dense_index.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9000" + ] + }, + "execution_count": 197, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Re-initialize the library, specify the space, the type of the vector and add data points \n", + "newIndex = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) \n", + "\n", + "newIndex.addDataPointBatch(data_matrix) " + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Re-load the index and re-run queries\n", + "newIndex.loadIndex('dense_index.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting query-time parameters {'efSearch': 100}\n", + "kNN time total=0.027727 (sec), per query=0.000028 (sec), per query adjusted for thread number=0.000111 (sec)\n" + ] + } + ], + "source": [ + "# Setting query-time parameters and querying\n", + "print('Setting query-time parameters', query_time_params)\n", + "newIndex.setQueryTimeParams(query_time_params)\n", + "\n", + "query_qty = query_matrix.shape[0]\n", + "start = time.time() \n", + "new_nbrs = newIndex.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)\n", + "end = time.time() \n", + "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) " + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [ + { + "name": "stdout", +
"output_type": "stream", + "text": [ + "kNN recall 0.992800\n" + ] + } + ], + "source": [ + "# Finally computing recall for the new result set\n", + "recall=0.0\n", + "for i in range(0, query_qty):\n", + " correct_set = set(gs[1][i])\n", + " ret_set = set(new_nbrs[i][0])\n", + " recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n", + "recall = recall / query_qty\n", + "print('kNN recall %f' % recall)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python_bindings/notebooks/search_sparse.ipynb b/python_bindings/notebooks/search_sparse.ipynb new file mode 100644 index 0000000..0b0e2d9 --- /dev/null +++ b/python_bindings/notebooks/search_sparse.ipynb @@ -0,0 +1,408 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy \n", + "import sys \n", + "import nmslib \n", + "import time \n", + "import math \n", + "from scipy.sparse import csr_matrix \n", + "from sklearn.neighbors import NearestNeighbors\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just a function to read sparse data\n", + "def read_data(filename, max_qty = None): \n", + " row = [] \n", + " col = [] \n", + " data = [] \n", + " read_qty = 0 \n", + " with open(filename,'r') as f: \n", + " read_num_ft = 0\n", + " for line in f: \n", + " x = line.replace(':', ' ').strip().split() \n", + " if (len(x) % 2 != 0):\n", + " raise(Exception('Poorly 
formated line %d in file %s' % (read_qty + 1, filename)))\n", + " if (len(x) == 0): continue\n", + " for i in range(0, len(x), 2): \n", + " row.append(read_qty) \n", + " feat_id = int(x[i])\n", + " read_num_ft = max(read_num_ft, feat_id + 1)\n", + " col.append(feat_id) \n", + " data.append(float(x[i+1])) \n", + "\n", + " read_qty = read_qty+1 \n", + " if max_qty != None and read_qty >= max_qty: break\n", + " \n", + " print('Read %d rows, # of features %d' % (read_qty, read_num_ft))\n", + " ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))), \n", + " shape=(read_qty, read_num_ft)) \n", + " return (read_qty, ft_mat)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Read 5000 rows, # of features 100000\n" + ] + } + ], + "source": [ + "# Read data points\n", + "(all_qty, all_data_matrix) = read_data('../../sample_data/sparse_wiki_5K.txt') " + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Create a held-out query data set\n", + "(data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size = 0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# of queries 500, # of data points 4500\n" + ] + } + ], + "source": [ + "print(\"# of queries %d, # of data points %d\" % (query_matrix.shape[0], data_matrix.shape[0]) )" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index-time parameters {'M': 30, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}\n" + ] + } + ], + "source": [ + "# Set index parameters\n", + "# These are the most important ones\n", + "M = 30\n", + "efC = 100\n", + "\n", + "num_threads = 4\n", +
"index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}\n", + "print('Index-time parameters', index_time_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Number of neighbors \n", + "K=100" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4500" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Initialize the library, specify the space, the type of the vector and add data points \n", + "index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR) \n", + "index.addDataPointBatch(data_matrix) " + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index-time parameters {'M': 30, 'indexThreadQty': 4, 'efConstruction': 100}\n", + "Indexing time = 12.817909\n" + ] + } + ], + "source": [ + "# Create an index\n", + "start = time.time()\n", + "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}\n", + "index.createIndex(index_time_params) \n", + "end = time.time() \n", + "print('Index-time parameters', index_time_params)\n", + "print('Indexing time = %f' % (end-start))" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting query-time parameters {'efSearch': 100}\n" + ] + } + ], + "source": [ + "# Setting query-time parameters\n", + "efS = 100\n", + "query_time_params = {'efSearch': efS}\n", + "print('Setting query-time parameters', query_time_params)\n", + "index.setQueryTimeParams(query_time_params) " + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name":
"stdout", + "output_type": "stream", + "text": [ + "kNN time total=1.068709 (sec), per query=0.002137 (sec), per query adjusted for thread number=0.008550 (sec)\n" + ] + } + ], + "source": [ + "# Querying\n", + "query_qty = query_matrix.shape[0]\n", + "start = time.time() \n", + "nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)\n", + "end = time.time() \n", + "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) " + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing gold-standard data\n", + "Brute-force preparation time 0.085145\n", + "brute-force kNN time total=4.342679 (sec), per query=0.008685 (sec)\n" + ] + } + ], + "source": [ + "# Computing gold-standard data \n", + "print('Computing gold-standard data')\n", + "\n", + "start = time.time()\n", + "sindx = NearestNeighbors(n_neighbors=K, metric='cosine', algorithm='brute').fit(data_matrix)\n", + "end = time.time()\n", + "\n", + "print('Brute-force preparation time %f' % (end - start))\n", + "\n", + "start = time.time() \n", + "gs = sindx.kneighbors(query_matrix)\n", + "end = time.time()\n", + "\n", + "print('brute-force kNN time total=%f (sec), per query=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty) )" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN recall 0.971320\n" + ] + } + ], + "source": [ + "# Finally computing recall\n", + "recall=0.0\n", + "for i in range(0, query_qty):\n", + " correct_set = set(gs[1][i])\n", + " ret_set = set(nbrs[i][0])\n", + " recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n", + "recall = recall / query_qty\n", + "print('kNN recall %f' % 
recall)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Save a meta index\n", + "index.saveIndex('sparse_index.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4500" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Re-initialize the library, specify the space, the type of the vector and add data points \n", + "newIndex = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR) \n", + "\n", + "newIndex.addDataPointBatch(data_matrix) " + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Re-load the index and re-run queries\n", + "newIndex.loadIndex('sparse_index.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting query-time parameters {'efSearch': 100}\n", + "kNN time total=1.036072 (sec), per query=0.002072 (sec), per query adjusted for thread number=0.008289 (sec)\n" + ] + } + ], + "source": [ + "# Setting query-time parameters and querying\n", + "print('Setting query-time parameters', query_time_params)\n", + "newIndex.setQueryTimeParams(query_time_params)\n", + "\n", + "query_qty = query_matrix.shape[0]\n", + "start = time.time() \n", + "new_nbrs = newIndex.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)\n", + "end = time.time() \n", + "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) " + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type":
"stream", + "text": [ + "kNN recall 0.971320\n" + ] + } + ], + "source": [ + "# Finally computing recall for the new result set\n", + "recall=0.0\n", + "for i in range(0, query_qty):\n", + " correct_set = set(gs[1][i])\n", + " ret_set = set(new_nbrs[i][0])\n", + " recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n", + "recall = recall / query_qty\n", + "print('kNN recall %f' % recall)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}