From e9f4436f4766ac1a989b7bf1f6e920717ac78cdb Mon Sep 17 00:00:00 2001 From: Chun Jiang Zhu Date: Fri, 19 Jun 2020 19:02:23 -0400 Subject: [PATCH] Delete nmslib_sparse.py --- ann_benchmarks/algorithms/nmslib_sparse.py | 95 ---------------------- 1 file changed, 95 deletions(-) delete mode 100644 ann_benchmarks/algorithms/nmslib_sparse.py diff --git a/ann_benchmarks/algorithms/nmslib_sparse.py b/ann_benchmarks/algorithms/nmslib_sparse.py deleted file mode 100644 index 58af969..0000000 --- a/ann_benchmarks/algorithms/nmslib_sparse.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import absolute_import -import os -import nmslib -from ann_benchmarks.constants import INDEX_DIR -from ann_benchmarks.algorithms.base import BaseANN -from scipy.sparse import csr_matrix -import numpy - - -class NmslibSparseReuseIndex(BaseANN): - @staticmethod - def encode(d): - return ["%s=%s" % (a, b) for (a, b) in d.iteritems()] - - # For each entry in the sparse matrix, extract a list of IDs and - # convert them to a string. Return a list of such strings. 
- @staticmethod - def matrToStrArray(sparseMatr): - res = [] - indptr = sparseMatr.indptr - indices = sparseMatr.indices - for row in range(sparseMatr.shape[0]): - arr = [k for k in indices[indptr[row]: indptr[row + 1]]] - arr.sort() - res.append(' '.join([str(k) for k in arr])) - return res - - def __init__(self, metric, method_name, index_param, query_param): - self._nmslib_metric = {'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric] - self._method_name = method_name - self._save_index = False - self._index_param = NmslibSparseReuseIndex.encode(index_param) - if query_param!=False: - self._query_param = NmslibSparseReuseIndex.encode(query_param) - self.name = 'Nmslib(method_name=%s, index_param=%s, query_param=%s)' % ( - self._method_name, self._index_param, self._query_param) - else: - self._query_param = None - self.name = 'Nmslib(method_name=%s, index_param=%s)' % ( - self._method_name, self._index_param) - - self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, '_'.join(self._index_param))) - - d = os.path.dirname(self._index_name) - if not os.path.exists(d): - os.makedirs(d) - - def fit(self, X): - if self._method_name == 'vptree': - # To avoid this issue: - # terminate called after throwing an instance of 'std::runtime_error' - # what(): The data size is too small or the bucket size is too big. 
Select the parameters so that <total # of records> is NOT less than <bucket size> * 1000 - # Aborted (core dumped) - self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000)) - - # Chunjiang modified it to "if" for jaccard - if self._nmslib_metric == 'jaccard_sparse': - X_trans = NmslibSparseReuseIndex.matrToStrArray(X) - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name, data_type=nmslib.DataType.OBJECT_AS_STRING) - self._index.addDataPointBatch(X_trans) - else: - self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name) - self._index.addDataPointBatch(X) - - if os.path.exists(self._index_name): - print('Loading index from file') - self._index.loadIndex(self._index_name) - else: - self._index.createIndex(self._index_param) - if self._save_index: - self._index.saveIndex(self._index_name) - if self._query_param is not None: - self._index.setQueryTimeParams(self._query_param) - - def set_query_arguments(self, ef): - if self._method_name == 'hnsw' or self._method_name == 'sw-graph': - self._index.setQueryTimeParams(["efSearch=%s"%(ef)]) - - def query(self, v, n): - # Chunjiang modified - if self._nmslib_metric == 'jaccard_sparse': - nz = numpy.nonzero(v)[0] - v = ' '.join([str(k) for k in nz]) - ids, distances = self._index.knnQuery(v, n) - return ids - - def batch_query(self, X, n): - # Chunjiang modified - if self._nmslib_metric == 'jaccard_sparse': - X = NmslibSparseReuseIndex.matrToStrArray(csr_matrix(X)) - self.res = self._index.knnQueryBatch(X, n) - - def get_batch_results(self): - return [x for x, _ in self.res] -