Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
Leonid Boytsov authored and Leonid Boytsov committed Aug 6, 2018
2 parents 63d363d + 766e480 commit 4dd995e
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ install:
script:
- $PY --version
- cd python_bindings && $PY setup.py test && flake8 && cd ..
- cd python_bindings && $PY setup.py test && cd ..
- |
set -e
if [ "$TRAVIS_OS_NAME" = "linux" ] ; then
Expand Down
8 changes: 4 additions & 4 deletions data/genunif.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@
dim = args['dim']

if args['binary'] and args['gauss']:
print "You cannot specify both 'binary' and 'normal'"
print("You cannot specify both 'binary' and 'normal'")
sys.exit(1)

f=open(outf, 'w')
if args['gauss']:
f.write("\n".join(["\t".join([str(random.normalvariate(0,1)) for _ in xrange(dim)]) for _ in xrange(nd+1)]))
f.write("\n".join(["\t".join([str(random.normalvariate(0,1)) for _ in range(dim)]) for _ in range(nd+1)]))
elif args['binary'] :
f.write("\n".join(["\t".join([str(random.randint(0,1)) for _ in xrange(dim)]) for _ in xrange(nd+1)]))
f.write("\n".join(["\t".join([str(random.randint(0,1)) for _ in range(dim)]) for _ in range(nd+1)]))
else:
f.write("\n".join(["\t".join([str(random.random()) for _ in xrange(dim)]) for _ in xrange(nd+1)]))
f.write("\n".join(["\t".join([str(random.random()) for _ in range(dim)]) for _ in range(nd+1)]))
f.close()

92 changes: 92 additions & 0 deletions python_bindings/notebooks/test_hnsw_recall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
import logging
# Uncomment to print logs to the screen
#logging.basicConfig(level=logging.INFO)

import numpy as np
import sys
import nmslib
import time
import math

from sklearn.neighbors import NearestNeighbors
from sklearn.datasets.samples_generator import make_blobs

def testHnswRecallL2(dataMatrix, queryMatrix, k, M=30, efC=200, efS=1000, numThreads=4):
    """Index dataMatrix with HNSW (L2 space) and print recall against brute-force search.

    The function builds an nmslib HNSW index, runs a batch k-NN query, computes
    the exact neighbors with scikit-learn's brute-force NearestNeighbors, and
    prints timings plus, for every rank n in 1..k, the fraction of queries whose
    n-th returned neighbor belongs to the exact top-k set.

    Parameters:
        dataMatrix:  2-D array of data points to index (one point per row).
        queryMatrix: 2-D array of query points; its first dimension is the query count.
        k:           number of neighbors to retrieve per query.
        M:           HNSW index-time parameter 'M'.
        efC:         HNSW index-time parameter 'efConstruction'.
        efS:         HNSW query-time parameter 'efSearch'.
        numThreads:  thread count used for both indexing and batch querying.

    Returns:
        None; all results are printed.
    """
    queryQty = queryMatrix.shape[0]
    indexTimeParams = {'M': M, 'indexThreadQty': numThreads, 'efConstruction': efC, 'post' : 0}

    # Indexing
    print('Index-time parameters', indexTimeParams)
    spaceName = 'l2'
    index = nmslib.init(method='hnsw', space=spaceName, data_type=nmslib.DataType.DENSE_VECTOR)
    index.addDataPointBatch(dataMatrix)

    start = time.time()
    index.createIndex(indexTimeParams)
    end = time.time()
    print('Indexing time = %f' % (end-start))

    # Setting query-time parameters. This must happen BEFORE the query below:
    # otherwise efS is never applied to the search whose recall is reported.
    queryTimeParams = {'efSearch': efS}
    print('Setting query-time parameters', queryTimeParams)
    index.setQueryTimeParams(queryTimeParams)

    # Querying
    start = time.time()
    nmslibFound = index.knnQueryBatch(queryMatrix, k=k, num_threads=numThreads)
    end = time.time()
    print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
          (end - start, float(end - start) / queryQty, numThreads * float(end - start) / queryQty))

    # Computing gold-standard data
    print('Computing gold-standard data')

    start = time.time()
    sindx = NearestNeighbors(n_neighbors=k, metric='l2', algorithm='brute').fit(dataMatrix)
    end = time.time()

    print('Brute-force preparation time %f' % (end - start))

    start = time.time()
    bruteForceFound = sindx.kneighbors(queryMatrix)
    end = time.time()

    print('brute-force kNN time total=%f (sec), per query=%f (sec)' %
          (end-start, float(end-start)/queryQty))

    # The exact top-k id set of each query does not depend on the rank being
    # evaluated, so build it once instead of once per (rank, query) pair.
    # Element [1] of the kneighbors() result holds the neighbor indices.
    correctSets = [set(bruteForceFound[1][i]) for i in range(queryQty)]

    # Finally computing recall for every i-th neighbor
    for n in range(k):
        recall = 0.0
        for i in range(queryQty):
            # Element [0] of each nmslib result is used as the array of returned ids.
            retArr = nmslibFound[i][0]
            # If fewer than n+1 neighbors came back, count this rank as a miss.
            retElem = retArr[n] if len(retArr) > n else -1

            recall = recall + int(retElem in correctSets[i])
        recall = recall / queryQty
        print('kNN recall for neighbor %d %f' % (n+1, recall))


def testRandomUnif(dataQty, queryQty, efS, dim, k):
    """Run the HNSW recall check on randomly generated dense vectors.

    Data points are drawn with np.random.randn (i.e., from a standard normal
    distribution, despite the function's name). Queries are a random subset
    of the data points, sampled without replacement.

    Parameters:
        dataQty:  number of data points to generate.
        queryQty: requested number of queries; capped at dataQty.
        efS:      'efSearch' value forwarded to testHnswRecallL2.
        dim:      dimensionality of the generated vectors.
        k:        number of neighbors to evaluate.
    """
    # Sampling without replacement cannot yield more queries than data points.
    sampleQty = min(dataQty, queryQty)

    points = np.random.randn(dataQty, dim).astype(np.float32)

    # Pick distinct row indices and use the corresponding rows as queries.
    pickedRows = np.random.choice(np.arange(dataQty), size=sampleQty, replace=False)
    queries = points[pickedRows, ].astype(np.float32)

    testHnswRecallL2(dataMatrix=points, queryMatrix=queries, k=k, efS=efS)


def testRandomClustered(dataQty, centerQty, queryQty, efS, dim, k):
    """Run the HNSW recall check on clustered data produced by make_blobs.

    The data set is generated with a fixed random_state of 0; queries are a
    random subset of the data points, sampled without replacement.

    Parameters:
        dataQty:   number of data points to generate.
        centerQty: value passed as the 'centers' argument of make_blobs.
        queryQty:  requested number of queries; capped at dataQty.
        efS:       'efSearch' value forwarded to testHnswRecallL2.
        dim:       number of features of each generated point.
        k:         number of neighbors to evaluate.
    """
    # Sampling without replacement cannot yield more queries than data points.
    sampleQty = min(dataQty, queryQty)

    # make_blobs returns a pair; only its first element (the points) is used.
    blobs = make_blobs(n_samples=dataQty, centers=centerQty, n_features=dim, random_state=0)
    points = blobs[0].astype(np.float32)

    # Pick distinct row indices and use the corresponding rows as queries.
    pickedRows = np.random.choice(np.arange(dataQty), size=sampleQty, replace=False)
    queries = points[pickedRows, ].astype(np.float32)

    testHnswRecallL2(dataMatrix=points, queryMatrix=queries, k=k, efS=efS)

# Run both recall checks with the same sizes: 100K points, 1000 queries,
# 100 dimensions, 10 neighbors, efSearch=200.
testRandomClustered(dataQty=100_000, centerQty=20, queryQty=1000, efS=200, dim=100, k=10)
testRandomUnif(dataQty=100_000, queryQty=1000, efS=200, dim=100, k=10)


0 comments on commit 4dd995e

Please sign in to comment.