diff --git a/python_bindings/notebooks/README.md b/python_bindings/notebooks/README.md index 593a789..18cde39 100644 --- a/python_bindings/notebooks/README.md +++ b/python_bindings/notebooks/README.md @@ -1,5 +1,7 @@ -We have three Notebooks: three are for dense spaces and one is for the sparse space. For the dense space, we have examples of the so-called optimized and non-optimized indices. Except HNSW, all the methods save meta-indices rather than real onese. Meta indices contain only index structure, but not the data. Hence, before a meta-index can be loaded, we need to re-load data. One example is a memory efficient space to search for SIFT vectors. - -HNSW, can save real indices, but only for the dense spaces: Euclidean and the cosine. When you use these optimized indices, the search does not require reloading all the data. However, reloading the data is **required** if you want to use the function **getDistance**. Furthermore, creation of the optimized index can always be disabled specifying the index-time parameter **skip_optimized_index** (value 1). +We have four Notebooks: three are for dense vector spaces, one is for the sparse vector space, and one is for the sparse Jaccard space. For the dense space, we have examples of the so-called optimized and non-optimized indices. Except HNSW, all the methods save meta-indices rather than real ones. Meta indices contain only index structure, but not the data. Hence, before a meta-index can be loaded, we need to re-load data. One example is a memory efficient space to search for SIFT vectors. +HNSW, can save real indices, but only for the dense vector spaces: Euclidean and the cosine. When you use these optimized indices, the search does not require reloading all the data. However, reloading the data is **required** if you want to use the function **getDistance**. Furthermore, creation of the optimized index can always be disabled specifying the index-time parameter **skip_optimized_index** (value 1). This separation into optimized and non-optimized indices is not very convenient. In the future, we will fix this issue. + +The sparse Jaccard space example is particularly interesting, because data set objects are represented by strings. In the case of the Jaccard space, a string is simply a list of sorted space-separated IDs. Other non-vector spaces can use different formats. + diff --git a/python_bindings/notebooks/search_generic_sparse_jaccard.ipynb b/python_bindings/notebooks/search_generic_sparse_jaccard.ipynb new file mode 100644 index 0000000..70baf55 --- /dev/null +++ b/python_bindings/notebooks/search_generic_sparse_jaccard.ipynb @@ -0,0 +1,460 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy \n", + "import sys \n", + "import nmslib \n", + "import time \n", + "import math \n", + "from scipy.sparse import csr_matrix \n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Just a function to read sparse data\n", + "def read_data(filename, max_qty = None): \n", + " row = [] \n", + " col = [] \n", + " data = [] \n", + " read_qty = 0 \n", + " with open(filename,'r') as f: \n", + " read_num_ft = 0\n", + " for line in f: \n", + " x = line.replace(':', ' ').strip().split() \n", + " if (len(x) % 2 != 0):\n", + " raise(Exception('Poorly formated line %d in file %s' % (read_qty + 1, filename)))\n", + " if (len(x) == 0): continue\n", + " for i in range(0, len(x), 2): \n", + " row.append(read_qty) \n", + " feat_id = int(x[i])\n", + " read_num_ft = max(read_num_ft, feat_id + 1)\n", + " col.append(feat_id) \n", + " data.append(1) \n", + "\n", + " read_qty = read_qty+1 \n", + " if max_qty != None and read_qty >= max_qty: break\n", + " \n", + " print('Read %d rows, # of features %d' % (read_qty, read_num_ft))\n", + " ft_mat = csr_matrix((numpy.array(data), (numpy.array(row), numpy.array(col))), \n", + " shape=(read_qty, read_num_ft)) \n", + " return (read_qty, ft_mat)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Read 5000 rows, # of features 100000\n" + ] + } + ], + "source": [ + "# Read data points\n", + "(all_qty, all_data_matrix) = read_data('../../sample_data/sparse_wiki_5K.txt') " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Create a held-out query data set\n", + "(data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size = 0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# of queries 500, # of data points 4500\n" + ] + } + ], + "source": [ + "print(\"# of queries %d, # of data points %d\" % (query_matrix.shape[0], data_matrix.shape[0]) )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# For each entry in the sparse matrix, extract a list of IDs and\n", + "# convert them to a string. Return a list of such strings.\n", + "def matrToStrArray(sparseMatr):\n", + " res = []\n", + " indptr = sparseMatr.indptr\n", + " indices = sparseMatr.indices\n", + " for row in range(sparseMatr.shape[0]):\n", + " arr = [k for k in indices[indptr[row] : indptr[row + 1]]]\n", + " arr.sort()\n", + " res.append(' '.join([str(k) for k in arr]))\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data_matrix_str = matrToStrArray(data_matrix)\n", + "query_matrix_str = matrToStrArray(query_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Set index parameters\n", + "# These are the most important onese\n", + "M = 30\n", + "efC = 100\n", + "\n", + "num_threads = 4\n", + "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Number of neighbors \n", + "K=100" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4500" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Intitialize the library, specify the space, the type of the data and add data points \n", + "# Note that in the GENERIC case, data points are passed as strings!\n", + "index = nmslib.init(method='hnsw', space='jaccard_sparse', data_type=nmslib.DataType.OBJECT_AS_STRING) \n", + "index.addDataPointBatch(data_matrix_str) " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index-time parameters {'post': 0, 'efConstruction': 100, 'M': 30, 'indexThreadQty': 4}\n", + "Indexing time = 15.290068\n" + ] + } + ], + "source": [ + "# Create an index\n", + "start = time.time()\n", + "index.createIndex(index_time_params) \n", + "end = time.time() \n", + "print('Index-time parameters', index_time_params)\n", + "print('Indexing time = %f' % (end-start))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting query-time parameters {'efSearch': 100}\n" + ] + } + ], + "source": [ + "# Setting query-time parameters\n", + "efS = 100\n", + "query_time_params = {'efSearch': efS}\n", + "print('Setting query-time parameters', query_time_params)\n", + "index.setQueryTimeParams(query_time_params) " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN time total=2.595220 (sec), per query=0.005190 (sec), per query adjusted for thread number=0.020762 (sec)\n" + ] + } + ], + "source": [ + "# Querying\n", + "query_qty = len(query_matrix_str)\n", + "start = time.time() \n", + "nbrs = index.knnQueryBatch(query_matrix_str, k = K, num_threads = num_threads)\n", + "end = time.time() \n", + "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing gold-standard data\n", + "brute-force kNN time total=45.190543 (sec), per query=0.090381 (sec)\n" + ] + } + ], + "source": [ + "# Computing gold-standard data \n", + "print('Computing gold-standard data')\n", + "\n", + "gs = []\n", + "\n", + "start = time.time()\n", + "\n", + "query_qty = query_matrix.shape[0]\n", + "data_qty = data_matrix.shape[0]\n", + "\n", + "data_row_card = numpy.array([data_matrix.getrow(i).getnnz() for i in range(data_qty)])\n", + "\n", + "for i in range(query_qty):\n", + " #print('Processing query %d/%d' % (i+1, query_qty))\n", + " q = query_matrix.getrow(i)\n", + " query_card = float(q.getnnz())\n", + " inter_card = data_matrix.dot(q.transpose()).toarray().reshape(data_qty)\n", + " # Jaccard distance 1 - intersection_size / union_size\n", + " # union_size = cardinality1 + cardinality2 - intersection_size\n", + " dist_vals = numpy.full(data_qty, 1) - inter_card / (data_row_card + query_card - inter_card)\n", + " tmp = [ (dist_vals[i], i) for i in range(data_qty)]\n", + " tmp.sort()\n", + " gs.append( [tmp[i][1] for i in range(K)] )\n", + "\n", + "end = time.time()\n", + "\n", + "print('brute-force kNN time total=%f (sec), per query=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty) )" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN recall 0.957420\n" + ] + } + ], + "source": [ + "# Finally computing recall\n", + "recall=0.0\n", + "for i in range(0, query_qty):\n", + " correct_set = set(gs[i])\n", + " ret_set = set(nbrs[i][0])\n", + " recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n", + "recall = recall / query_qty\n", + "print('kNN recall %f' % recall)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Save a meta index\n", + "index.saveIndex('sparse_jaccard_index.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Re-intitialize the library, specify the space, the type of the data and add data points \n", + "# Note again that in the GENERIC case, data points are passed as strings!\n", + "newIndex = nmslib.init(method='hnsw', space='jaccard_sparse', data_type=nmslib.DataType.OBJECT_AS_STRING) " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4500" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For non-optimized indices we need to re-add data points\n", + "newIndex.addDataPointBatch(data_matrix_str) " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Re-load the index and re-run queries\n", + "newIndex.loadIndex('sparse_jaccard_index.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting query-time parameters {'efSearch': 100}\n", + "kNN time total=1.291015 (sec), per query=0.002582 (sec), per query adjusted for thread number=0.010328 (sec)\n" + ] + } + ], + "source": [ + "# Setting query-time parameters and querying\n", + "print('Setting query-time parameters', query_time_params)\n", + "newIndex.setQueryTimeParams(query_time_params)\n", + "\n", + "query_qty = len(query_matrix_str)\n", + "start = time.time() \n", + "new_nbrs = newIndex.knnQueryBatch(query_matrix_str, k = K, num_threads = num_threads)\n", + "end = time.time() \n", + "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n", + " (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kNN recall 0.957420\n" + ] + } + ], + "source": [ + "# Finally computing recall for the new result set\n", + "recall=0.0\n", + "for i in range(0, query_qty):\n", + " correct_set = set(gs[i])\n", + " ret_set = set(new_nbrs[i][0])\n", + " recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n", + "recall = recall / query_qty\n", + "print('kNN recall %f' % recall)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/similarity_search/src/method/hnsw.cc b/similarity_search/src/method/hnsw.cc index 30a7c8a..8e0775b 100644 --- a/similarity_search/src/method/hnsw.cc +++ b/similarity_search/src/method/hnsw.cc @@ -684,6 +684,9 @@ namespace similarity { void Hnsw::Search(KNNQuery *query, IdType) const { + if (this->data_.empty()) { + return; + } bool useOld = searchAlgoType_ == kOld || (searchAlgoType_ == kHybrid && ef_ >= 1000); // cout << "Ef = " << ef_ << " use old = " << useOld << endl; switch (searchMethod_) { @@ -927,6 +930,10 @@ namespace similarity { dist_func_type_ = 0; searchMethod_ = 0; + CHECK_MSG(totalElementsStored_ == this->data_.size(), + "The number of stored elements " + ConvertToString(totalElementsStored_) + + " doesn't match the number of data points " + ConvertToString(this->data_.size() + + "! Did you forget to re-load data?")) ElList_.resize(totalElementsStored_); for (unsigned id = 0; id < totalElementsStored_; ++id) {