diff --git a/python_bindings/integration_tests/sparse_bench.py b/python_bindings/integration_tests/sparse_bench.py index dfa369a..0f6778e 100755 --- a/python_bindings/integration_tests/sparse_bench.py +++ b/python_bindings/integration_tests/sparse_bench.py @@ -76,8 +76,8 @@ def bench_sparse_vector(batch=True): print('Let\'s invoke the index-build process') - index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4'] - query_time_param = ['initSearchAttempts=3'] + index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4'] + query_time_param = ['efSearch=50'] with TimeIt('building index'): nmslib.createIndex(index, index_param) diff --git a/python_bindings/integration_tests/test_nmslib.py b/python_bindings/integration_tests/test_nmslib.py index 6acfc52..f53a418 100755 --- a/python_bindings/integration_tests/test_nmslib.py +++ b/python_bindings/integration_tests/test_nmslib.py @@ -134,8 +134,8 @@ def test_vector_fresh(fast=True): print('Let\'s invoke the index-build process') - index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4'] - query_time_param = ['initSearchAttempts=3'] + index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4'] + query_time_param = ['efSearch=50'] nmslib.createIndex(index, index_param) @@ -195,7 +195,7 @@ def test_vector_loaded(): print('Let\'s invoke the index-build process') - query_time_param = ['initSearchAttempts=3'] + query_time_param = ['efSearch=50'] nmslib.loadIndex(index, index_name) @@ -250,8 +250,8 @@ def test_sparse_vector_fresh(): print('Let\'s invoke the index-build process') - index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4'] - query_time_param = ['initSearchAttempts=3'] + index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4'] + query_time_param = ['efSearch=50'] nmslib.createIndex(index, index_param) @@ -314,8 +314,8 @@ def test_string_fresh(batch=True): print('Let\'s invoke the index-build process') - index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4'] - query_time_param = ['initSearchAttempts=3'] + index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4'] + query_time_param = ['efSearch=50'] nmslib.createIndex(index, index_param) nmslib.setQueryTimeParams(index, query_time_param) @@ -369,8 +369,8 @@ def test_string_loaded(): print('Let\'s invoke the index-build process') - index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4'] - query_time_param = ['initSearchAttempts=3'] + index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4'] + query_time_param = ['efSearch=50'] nmslib.loadIndex(index, index_name) @@ -419,8 +419,8 @@ def test_object_as_string_fresh(batch=True): print('Let\'s invoke the index-build process') - index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4'] - query_time_param = ['initSearchAttempts=3'] + index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4'] + query_time_param = ['efSearch=50'] nmslib.createIndex(index, index_param) @@ -444,6 +444,98 @@ def test_object_as_string_fresh(batch=True): nmslib.freeIndex(index) +def bench_sparse_vector(batch=True): + # delay importing these so CI can import module + from scipy.sparse import csr_matrix + from scipy.spatial import distance + + dim = 20000 + dataset = np.random.binomial(1, 0.01, size=(40000, dim)) + queryset = np.random.binomial(1, 0.009, size=(1000, dim)) + + print('dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0]) + + k = 3 + + q0 = queryset[0] + res = [] + for i in range(dataset.shape[0]): + res.append([i, distance.cosine(q0, dataset[i,:])]) + res.sort(key=lambda x: x[1]) + print('q0 res', res[:k]) + + data_matrix = csr_matrix(dataset, dtype=np.float32) + query_matrix = csr_matrix(queryset, dtype=np.float32) + + data_to_return = range(dataset.shape[0]) + + #space_type = 'cosinesimil_sparse' + space_type = 'cosinesimil_sparse_fast' + space_param = [] + method_name = 'small_world_rand' + index_name = method_name + '_sparse.index' + if os.path.isfile(index_name): + os.remove(index_name) + index = nmslib.init(space_type, + space_param, + method_name, + nmslib.DataType.SPARSE_VECTOR, + nmslib.DistType.FLOAT) + + if batch: + with TimeIt('batch add'): + positions = nmslib.addDataPointBatch(index, np.arange(len(dataset), dtype=np.int32), data_matrix) + print('positions', positions) + else: + d = [] + q = [] + with TimeIt('preparing'): + for data in dataset: + d.append([[i, v] for i, v in enumerate(data) if v > 0]) + for data in queryset: + q.append([[i, v] for i, v in enumerate(data) if v > 0]) + with TimeIt('adding points'): + for id, data in enumerate(d): + nmslib.addDataPoint(index, id, data) + + print('Let\'s invoke the index-build process') + + index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4'] + query_time_param = ['efSearch=50'] + + with TimeIt('building index'): + nmslib.createIndex(index, index_param) + + print('The index is created') + + nmslib.setQueryTimeParams(index,query_time_param) + + print('Query time parameters are set') + + print("Results for the freshly created index:") + + with TimeIt('knn query'): + if batch: + num_threads = 10 + res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix) + for idx, v in enumerate(res): + if idx < 5: + print(idx, v) + if idx == 0: + for i in v: + print('q0', i, distance.cosine(q0, dataset[i,:])) + else: + for idx, data in enumerate(q): + res = nmslib.knnQuery(index, k, data) + if idx < 5: + print(idx, res) + + nmslib.saveIndex(index, index_name) + + print("The index %s is saved" % index_name) + + nmslib.freeIndex(index) + if __name__ == '__main__': print('DENSE_VECTOR', nmslib.DataType.DENSE_VECTOR) @@ -470,3 +562,5 @@ def test_object_as_string_fresh(batch=True): test_object_as_string_fresh() test_object_as_string_fresh(False) + bench_sparse_vector() + diff --git a/similarity_search/src/method/small_world_rand.cc b/similarity_search/src/method/small_world_rand.cc index 1e73474..dfb0ece 100644 --- a/similarity_search/src/method/small_world_rand.cc +++ b/similarity_search/src/method/small_world_rand.cc @@ -701,7 +701,7 @@ void SmallWorldRand::SearchOld(KNNQuery* query) const { closestDistQueue.emplace(d); IdType nodeId = provider->getId(); - CHECK_MSG(nodeId < NextNodeId_, "Bug: nodeId (" + ConvertToString(nodeId) + ") > NextNodeId_ (" +ConvertToString(NextNodeId_)); + CHECK_MSG(nodeId < NextNodeId_, "Bug: nodeId (" + ConvertToString(nodeId) + ") > NextNodeId_ (" +ConvertToString(NextNodeId_) + ")"); visitedBitset[nodeId] = true; while(!candidateQueue.empty()){ @@ -862,6 +862,12 @@ void SmallWorldRand::LoadIndex(const string &location) { " read so far doesn't match the number of read lines: " + ConvertToString(lineNum) + ")"); inFile.close(); } + + pEntryPoint_ = ElList_.empty() ? nullptr : ElList_.begin()->second; + CHECK(pEntryPoint_ != nullptr || ElList_.empty()); + NextNodeId_ = ElList_.size(); + + LOG(LIB_INFO) << "Next node id: " << NextNodeId_ << " ElList_.size(): " << ElList_.size(); } template class SmallWorldRand;