diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml
new file mode 100644
index 0000000..664f8f1
--- /dev/null
+++ b/.idea/codeStyles/Project.xml
@@ -0,0 +1,10 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml
new file mode 100644
index 0000000..79ee123
--- /dev/null
+++ b/.idea/codeStyles/codeStyleConfig.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/python_bindings/setup.py b/python_bindings/setup.py
index 1a16797..0808994 100755
--- a/python_bindings/setup.py
+++ b/python_bindings/setup.py
@@ -21,15 +21,15 @@
if os.path.exists(library_file):
# if we have a prebuilt nmslib library file, use that.
extra_objects.append(library_file)
-
else:
- # Otherwise build all the files here directly (excluding extras which need eigen/boost)
- exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc
- dummy_app.cc main.cc""".split())
-
- for root, subdirs, files in os.walk(os.path.join(libdir, "src")):
- source_files.extend(os.path.join(root, f) for f in files
- if f.endswith(".cc") and f not in exclude_files)
+ raise RuntimeError("can't find prebuild lib: " + os.path.abspath(library_file))
+ # # Otherwise build all the files here directly (excluding extras which need eigen/boost)
+ # exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc
+ # dummy_app.cc main.cc""".split())
+ #
+ # for root, subdirs, files in os.walk(os.path.join(libdir, "src")):
+ # source_files.extend(os.path.join(root, f) for f in files
+ # if f.endswith(".cc") and f not in exclude_files)
if sys.platform.startswith('linux'):
diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py
index 56add76..47c0989 100644
--- a/python_bindings/tests/bindings_test.py
+++ b/python_bindings/tests/bindings_test.py
@@ -87,11 +87,99 @@ def testReloadIndex(self):
reloaded_results)
+class BitVectorIndexTestMixin(object):
+ def _get_index(self, space='bit_jaccard'):
+ raise NotImplementedError()
+
+ def testKnnQuery(self):
+ np.random.seed(23)
+ nbits = 128
+
+ index = self._get_index()
+
+ for i in range(100):
+ a = np.random.rand(nbits) > 0.5
+ s = " ".join(["1" if e else "0" for e in a])
+ index.addDataPoint(id=i, data=s)
+ index.createIndex()
+
+ a = np.ones(nbits)
+ s = " ".join(["1" if e else "0" for e in a])
+ ids, distances = index.knnQuery(s, k=10)
+ print(ids)
+ print(distances)
+ # self.assertTrue(get_hitrate(get_exact_cosine(row, data), ids) >= 5)
+
+ # def testKnnQueryBatch(self):
+ # np.random.seed(23)
+ # data = np.random.randn(1000, 10).astype(np.float32)
+ #
+ # index = self._get_index()
+ # index.addDataPointBatch(data)
+ # index.createIndex()
+ #
+ # queries = data[:10]
+ # results = index.knnQueryBatch(queries, k=10)
+ # for query, (ids, distances) in zip(queries, results):
+ # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5)
+ #
+ # # test col-major arrays
+ # queries = np.asfortranarray(queries)
+ # results = index.knnQueryBatch(queries, k=10)
+ # for query, (ids, distances) in zip(queries, results):
+ # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5)
+ #
+ # # test custom ids (set id to square of each row)
+ # index = self._get_index()
+ # index.addDataPointBatch(data, ids=np.arange(data.shape[0]) ** 2)
+ # index.createIndex()
+ #
+ # queries = data[:10]
+ # results = index.knnQueryBatch(queries, k=10)
+ # for query, (ids, distances) in zip(queries, results):
+ # # convert from square back to row id
+ # ids = np.sqrt(ids).astype(int)
+ # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5)
+
+ # def testReloadIndex(self):
+ # np.random.seed(23)
+ # data = np.random.randn(1000, 10).astype(np.float32)
+ #
+ # original = self._get_index()
+ # original.addDataPointBatch(data)
+ # original.createIndex()
+ #
+ # # test out saving/reloading index
+ # with tempfile.NamedTemporaryFile() as tmp:
+ # original.saveIndex(tmp.name + ".index")
+ #
+ # reloaded = self._get_index()
+ # reloaded.addDataPointBatch(data)
+ # reloaded.loadIndex(tmp.name + ".index")
+ #
+ # original_results = original.knnQuery(data[0])
+ # reloaded_results = reloaded.knnQuery(data[0])
+ # npt.assert_allclose(original_results,
+ # reloaded_results)
+
+
class HNSWTestCase(unittest.TestCase, DenseIndexTestMixin):
def _get_index(self, space='cosinesimil'):
return nmslib.init(method='hnsw', space=space)
+class BitJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin):
+ def _get_index(self, space='bit_jaccard'):
+ return nmslib.init(method='hnsw', space='bit_jaccard', data_type=nmslib.DataType.OBJECT_AS_STRING,
+ dtype=nmslib.DistType.INT)
+
+
+# class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin):
+# def _get_index(self, space='bit_hamming'):
+# return nmslib.init(method='hnsw', space='bit_hamming', data_type=nmslib.DataType.OBJECT_AS_STRING,
+# dtype=nmslib.DistType.INT)
+
+
class SWGraphTestCase(unittest.TestCase, DenseIndexTestMixin):
def _get_index(self, space='cosinesimil'):
return nmslib.init(method='sw-graph', space=space)
diff --git a/similarity_search/include/distcomp.h b/similarity_search/include/distcomp.h
index 7863837..729d70d 100644
--- a/similarity_search/include/distcomp.h
+++ b/similarity_search/include/distcomp.h
@@ -223,6 +223,19 @@ int SpearmanRho(const PivotIdType* x, const PivotIdType* y, size_t qty);
int SpearmanFootruleSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty);
int SpearmanRhoSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty);
+//template
+double inline BitJaccard(const uint64_t* a, const uint64_t* b, size_t qty) {
+ uint64_t num = 0, den = 0;
+
+ for (size_t i=0; i < qty; ++i) {
+ // __builtin_popcount quickly computes the number on 1s
+ num += __builtin_popcount(a[i] & b[i]);
+ den += __builtin_popcount(a[i] | b[i]);
+ }
+
+ return double(num) / double(den);
+}
+
//unsigned BitHamming(const uint32_t* a, const uint32_t* b, size_t qty);
unsigned inline BitHamming(const uint32_t* a, const uint32_t* b, size_t qty) {
diff --git a/similarity_search/include/factory/init_spaces.h b/similarity_search/include/factory/init_spaces.h
index 0984e08..dd0aae7 100644
--- a/similarity_search/include/factory/init_spaces.h
+++ b/similarity_search/include/factory/init_spaces.h
@@ -19,6 +19,7 @@
#include "factory/space/space_edist.h"
#include "factory/space/space_bit_hamming.h"
+#include "factory/space/space_bit_jaccard.h"
#include "factory/space/space_bregman.h"
#include "factory/space/space_dummy.h"
#include "factory/space/space_js.h"
@@ -36,15 +37,17 @@
namespace similarity {
-
inline void initSpaces() {
// Registering a dummy space
REGISTER_SPACE_CREATOR(int, SPACE_DUMMY, CreateDummy)
REGISTER_SPACE_CREATOR(float, SPACE_DUMMY, CreateDummy)
REGISTER_SPACE_CREATOR(double, SPACE_DUMMY, CreateDummy)
- // Registering binary/bit Hamming
- REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, CreateBitHamming)
+ // Registering binary/bit Hamming/Jaccard
+ SpaceFactoryRegistry::CreateFuncPtr bit_hamming_func_ptr = CreateBitHamming;
+ REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, bit_hamming_func_ptr )
+ SpaceFactoryRegistry::CreateFuncPtr bit_jaccard_func_ptr = CreateBitJaccard;
+ REGISTER_SPACE_CREATOR(double, SPACE_BIT_JACCARD, bit_jaccard_func_ptr )
// Registering the Levensthein-distance: regular and normalized
REGISTER_SPACE_CREATOR(int, SPACE_LEVENSHTEIN, CreateLevenshtein)
diff --git a/similarity_search/include/factory/space/space_bit_hamming.h b/similarity_search/include/factory/space/space_bit_hamming.h
index 53bcc34..d191594 100644
--- a/similarity_search/include/factory/space/space_bit_hamming.h
+++ b/similarity_search/include/factory/space/space_bit_hamming.h
@@ -23,8 +23,9 @@ namespace similarity {
* Creating functions.
*/
-inline Space* CreateBitHamming(const AnyParams& /* ignoring params */) {
- return new SpaceBitHamming();
+template
+inline Space* CreateBitHamming(const AnyParams& /* ignoring params */) {
+ return new SpaceBitHamming();
}
/*
diff --git a/similarity_search/include/factory/space/space_bit_jaccard.h b/similarity_search/include/factory/space/space_bit_jaccard.h
new file mode 100644
index 0000000..48f81dd
--- /dev/null
+++ b/similarity_search/include/factory/space/space_bit_jaccard.h
@@ -0,0 +1,39 @@
+/**
+ * Non-metric Space Library
+ *
+ * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak
+ *
+ * For the complete list of contributors and further details see:
+ * https://github.com/searchivarius/NonMetricSpaceLib
+ *
+ * Copyright (c) 2013-2018
+ *
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ */
+#ifndef FACTORY_SPACE_BIT_JACCARD_H
+#define FACTORY_SPACE_BIT_JACCARD_H
+
+#include
+
+namespace similarity {
+
+/*
+ * Creating functions.
+ */
+
+template
+inline Space* CreateBitJaccard(const AnyParams& /* ignoring params */) {
+ return new SpaceBitJaccard();
+}
+
+/*
+ * End of creating functions.
+ */
+}
+
+#endif
+
+
+
diff --git a/similarity_search/include/method/perm_bin_vptree.h b/similarity_search/include/method/perm_bin_vptree.h
index a58c492..6202c96 100644
--- a/similarity_search/include/method/perm_bin_vptree.h
+++ b/similarity_search/include/method/perm_bin_vptree.h
@@ -66,7 +66,7 @@ class PermBinVPTree : public Index {
ObjectVector BinPermData_;
unique_ptr>> VPTreeIndex_;
- unique_ptr VPTreeSpace_;
+ unique_ptr> VPTreeSpace_;
// disable copy and assign
DISABLE_COPY_AND_ASSIGN(PermBinVPTree);
diff --git a/similarity_search/include/permutation_utils.h b/similarity_search/include/permutation_utils.h
index 00a141d..dd09e84 100644
--- a/similarity_search/include/permutation_utils.h
+++ b/similarity_search/include/permutation_utils.h
@@ -23,7 +23,6 @@
#include "rangequery.h"
#include "knnquery.h"
#include "permutation_type.h"
-#include "distcomp.h"
#include "utils.h"
namespace similarity {
@@ -163,6 +162,21 @@ inline void Binarize(const vector &perm, const PivotIdType thresh,
}
}
+inline void Binarize(const vector &perm, const PivotIdType thresh, vector&bin_perm) {
+ size_t bin_perm_word_qty = (perm.size() + 63)/64;
+
+ bin_perm.resize(bin_perm_word_qty);
+ fill(bin_perm.begin(), bin_perm.end(), 0);
+
+ for (size_t i = 0; i < perm.size(); ++i) {
+ bool b =perm[i] >= thresh;
+
+ if (b) {
+ bin_perm[i/64] |= (1<<(i%64)) ;
+ }
+ }
+}
+
} // namespace similarity
#endif // _PERMUTATION_UTILS_H_
diff --git a/similarity_search/include/space/space_bit_hamming.h b/similarity_search/include/space/space_bit_hamming.h
index d7524ae..a641aed 100644
--- a/similarity_search/include/space/space_bit_hamming.h
+++ b/similarity_search/include/space/space_bit_hamming.h
@@ -25,55 +25,31 @@
#include "utils.h"
#include "space.h"
#include "distcomp.h"
+#include "space_bit_vector.h"
#define SPACE_BIT_HAMMING "bit_hamming"
namespace similarity {
-class SpaceBitHamming : public Space {
+template
+class SpaceBitHamming : public SpaceBitVector {
public:
explicit SpaceBitHamming() {}
virtual ~SpaceBitHamming() {}
- /** Standard functions to read/write/create objects */
- // Create an object from string representation.
- virtual unique_ptr