Skip to content

Commit

Permalink
Fixing/improving two things:
Browse files Browse the repository at this point in the history
1) python integration tests #221
2) LoadIndex of the SW-graph was broken after I added batch addition and deletion.
  • Loading branch information
searchivairus committed Aug 4, 2017
1 parent 0c07822 commit 50f7e31
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 14 deletions.
4 changes: 2 additions & 2 deletions python_bindings/integration_tests/sparse_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ def bench_sparse_vector(batch=True):

print('Let\'s invoke the index-build process')

index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
query_time_param = ['initSearchAttempts=3']
index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
query_time_param = ['efSearch=50']

with TimeIt('building index'):
nmslib.createIndex(index, index_param)
Expand Down
116 changes: 105 additions & 11 deletions python_bindings/integration_tests/test_nmslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ def test_vector_fresh(fast=True):

print('Let\'s invoke the index-build process')

index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
query_time_param = ['initSearchAttempts=3']
index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
query_time_param = ['efSearch=50']

nmslib.createIndex(index, index_param)

Expand Down Expand Up @@ -195,7 +195,7 @@ def test_vector_loaded():
print('Let\'s invoke the index-build process')


query_time_param = ['initSearchAttempts=3']
query_time_param = ['efSearch=50']

nmslib.loadIndex(index, index_name)

Expand Down Expand Up @@ -250,8 +250,8 @@ def test_sparse_vector_fresh():

print('Let\'s invoke the index-build process')

index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
query_time_param = ['initSearchAttempts=3']
index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
query_time_param = ['efSearch=50']

nmslib.createIndex(index, index_param)

Expand Down Expand Up @@ -314,8 +314,8 @@ def test_string_fresh(batch=True):

print('Let\'s invoke the index-build process')

index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
query_time_param = ['initSearchAttempts=3']
index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
query_time_param = ['efSearch=50']

nmslib.createIndex(index, index_param)
nmslib.setQueryTimeParams(index, query_time_param)
Expand Down Expand Up @@ -369,8 +369,8 @@ def test_string_loaded():

print('Let\'s invoke the index-build process')

index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
query_time_param = ['initSearchAttempts=3']
index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
query_time_param = ['efSearch=50']


nmslib.loadIndex(index, index_name)
Expand Down Expand Up @@ -419,8 +419,8 @@ def test_object_as_string_fresh(batch=True):

print('Let\'s invoke the index-build process')

index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
query_time_param = ['initSearchAttempts=3']
index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
query_time_param = ['efSearch=50']

nmslib.createIndex(index, index_param)

Expand All @@ -444,6 +444,98 @@ def test_object_as_string_fresh(batch=True):
nmslib.freeIndex(index)


def bench_sparse_vector(batch=True):
    """Benchmark a ``small_world_rand`` index over random sparse binary vectors.

    Generates a random 0/1 data set and query set, prints a brute-force
    cosine-distance baseline for the first query, then builds the index in
    the ``cosinesimil_sparse_fast`` space, runs k-NN queries against it and
    saves the index to ``small_world_rand_sparse.index`` (an existing file
    with that name is removed first).

    Args:
        batch: When true, points are added with ``addDataPointBatch`` and
            queried with ``knnQueryBatch`` from CSR matrices. Otherwise
            every point is converted to a list of ``[index, value]`` pairs
            and added / queried one at a time.
    """
    # delay importing these so CI can import module
    from scipy.sparse import csr_matrix
    from scipy.spatial import distance

    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print('dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0])

    k = 3

    # Brute-force baseline: distance from the first query to every data row.
    q0 = queryset[0]
    res = [[i, distance.cosine(q0, row)] for i, row in enumerate(dataset)]
    res.sort(key=lambda x: x[1])
    print('q0 res', res[:k])

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    # Release the index even when building, querying or saving fails.
    try:
        if batch:
            with TimeIt('batch add'):
                positions = nmslib.addDataPointBatch(index, np.arange(len(dataset), dtype=np.int32), data_matrix)
            print('positions', positions)
        else:
            with TimeIt('preparing'):
                d = [[[i, v] for i, v in enumerate(data) if v > 0]
                     for data in dataset]
                q = [[[i, v] for i, v in enumerate(data) if v > 0]
                     for data in queryset]
            with TimeIt('adding points'):
                for point_id, data in enumerate(d):
                    nmslib.addDataPoint(index, point_id, data)

        print('Let\'s invoke the index-build process')

        index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
        query_time_param = ['efSearch=50']

        with TimeIt('building index'):
            nmslib.createIndex(index, index_param)

        print('The index is created')

        nmslib.setQueryTimeParams(index,query_time_param)

        print('Query time parameters are set')

        print("Results for the freshly created index:")

        with TimeIt('knn query'):
            if batch:
                num_threads = 10
                res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
                for idx, v in enumerate(res):
                    if idx < 5:
                        print(idx, v)
                    if idx == 0:
                        # Show the distances of the first query's results
                        # next to the brute-force baseline printed earlier.
                        for i in v:
                            print('q0', i, distance.cosine(q0, dataset[i,:]))
            else:
                for idx, data in enumerate(q):
                    res = nmslib.knnQuery(index, k, data)
                    if idx < 5:
                        print(idx, res)

        nmslib.saveIndex(index, index_name)

        print("The index %s is saved" % index_name)
    finally:
        nmslib.freeIndex(index)

if __name__ == '__main__':

print('DENSE_VECTOR', nmslib.DataType.DENSE_VECTOR)
Expand All @@ -470,3 +562,5 @@ def test_object_as_string_fresh(batch=True):
test_object_as_string_fresh()
test_object_as_string_fresh(False)

bench_sparse_vector()

8 changes: 7 additions & 1 deletion similarity_search/src/method/small_world_rand.cc
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ void SmallWorldRand<dist_t>::SearchOld(KNNQuery<dist_t>* query) const {
closestDistQueue.emplace(d);

IdType nodeId = provider->getId();
CHECK_MSG(nodeId < NextNodeId_, "Bug: nodeId (" + ConvertToString(nodeId) + ") > NextNodeId_ (" +ConvertToString(NextNodeId_));
CHECK_MSG(nodeId < NextNodeId_, "Bug: nodeId (" + ConvertToString(nodeId) + ") > NextNodeId_ (" +ConvertToString(NextNodeId_) + ")");
visitedBitset[nodeId] = true;

while(!candidateQueue.empty()){
Expand Down Expand Up @@ -862,6 +862,12 @@ void SmallWorldRand<dist_t>::LoadIndex(const string &location) {
" read so far doesn't match the number of read lines: " + ConvertToString(lineNum) + ")");
inFile.close();
}

pEntryPoint_ = ElList_.empty() ? nullptr : ElList_.begin()->second;
CHECK(pEntryPoint_ != nullptr || ElList_.empty());
NextNodeId_ = ElList_.size();

LOG(LIB_INFO) << "Next node id: " << NextNodeId_ << " ElList_.size(): " << ElList_.size();
}

template class SmallWorldRand<float>;
Expand Down

0 comments on commit 50f7e31

Please sign in to comment.