From e984532fe4d78ea7d714fd64dc145a6891ddc6b3 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Sun, 17 Feb 2019 00:06:31 -0800 Subject: [PATCH] added bit_jaccard and ABC bit_vector --- .idea/codeStyles/Project.xml | 10 + .idea/codeStyles/codeStyleConfig.xml | 5 + python_bindings/setup.py | 16 +- python_bindings/tests/bindings_test.py | 88 +++++++ similarity_search/include/distcomp.h | 13 + .../include/factory/init_spaces.h | 9 +- .../include/factory/space/space_bit_hamming.h | 5 +- .../include/factory/space/space_bit_jaccard.h | 39 +++ .../include/method/perm_bin_vptree.h | 2 +- similarity_search/include/permutation_utils.h | 16 +- .../include/space/space_bit_hamming.h | 52 ++-- .../include/space/space_bit_jaccard.h | 59 +++++ .../include/space/space_bit_vector.h | 209 ++++++++++++++++ .../src/method/perm_bin_vptree.cc | 2 +- .../src/space/space_bit_hamming.cc | 229 ++++-------------- .../src/space/space_bit_jaccard.cc | 44 ++++ .../src/space/space_bit_vector.cc | 195 +++++++++++++++ similarity_search/test/test_space_serial.cc | 17 ++ 18 files changed, 771 insertions(+), 239 deletions(-) create mode 100644 .idea/codeStyles/Project.xml create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 similarity_search/include/factory/space/space_bit_jaccard.h create mode 100644 similarity_search/include/space/space_bit_jaccard.h create mode 100644 similarity_search/include/space/space_bit_vector.h create mode 100644 similarity_search/src/space/space_bit_jaccard.cc create mode 100644 similarity_search/src/space/space_bit_vector.cc diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml new file mode 100644 index 0000000..664f8f1 --- /dev/null +++ b/.idea/codeStyles/Project.xml @@ -0,0 +1,10 @@ + + + + + \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..79ee123 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ + + + 
+ \ No newline at end of file diff --git a/python_bindings/setup.py b/python_bindings/setup.py index 1a16797..0808994 100755 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -21,15 +21,15 @@ if os.path.exists(library_file): # if we have a prebuilt nmslib library file, use that. extra_objects.append(library_file) - else: - # Otherwise build all the files here directly (excluding extras which need eigen/boost) - exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc - dummy_app.cc main.cc""".split()) - - for root, subdirs, files in os.walk(os.path.join(libdir, "src")): - source_files.extend(os.path.join(root, f) for f in files - if f.endswith(".cc") and f not in exclude_files) + raise RuntimeError("can't find prebuild lib: " + os.path.abspath(library_file)) + # # Otherwise build all the files here directly (excluding extras which need eigen/boost) + # exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc + # dummy_app.cc main.cc""".split()) + # + # for root, subdirs, files in os.walk(os.path.join(libdir, "src")): + # source_files.extend(os.path.join(root, f) for f in files + # if f.endswith(".cc") and f not in exclude_files) if sys.platform.startswith('linux'): diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index 56add76..47c0989 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -87,11 +87,99 @@ def testReloadIndex(self): reloaded_results) +class BitVectorIndexTestMixin(object): + def _get_index(self, space='bit_jaccard'): + raise NotImplementedError() + + def testKnnQuery(self): + np.random.seed(23) + nbits = 128 + + index = self._get_index() + + for i in range(100): + a = np.random.rand(nbits) > 0.5 + s = " ".join(["1" if e else "0" for e in a]) + index.addDataPoint(id=i, data=s) + index.createIndex() + + a = np.ones(nbits) + s = " ".join(["1" if e else 
"0" for e in a]) + ids, distances = index.knnQuery(s, k=10) + print(ids) + print(distances) + # self.assertTrue(get_hitrate(get_exact_cosine(row, data), ids) >= 5) + + # def testKnnQueryBatch(self): + # np.random.seed(23) + # data = np.random.randn(1000, 10).astype(np.float32) + # + # index = self._get_index() + # index.addDataPointBatch(data) + # index.createIndex() + # + # queries = data[:10] + # results = index.knnQueryBatch(queries, k=10) + # for query, (ids, distances) in zip(queries, results): + # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) + # + # # test col-major arrays + # queries = np.asfortranarray(queries) + # results = index.knnQueryBatch(queries, k=10) + # for query, (ids, distances) in zip(queries, results): + # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) + # + # # test custom ids (set id to square of each row) + # index = self._get_index() + # index.addDataPointBatch(data, ids=np.arange(data.shape[0]) ** 2) + # index.createIndex() + # + # queries = data[:10] + # results = index.knnQueryBatch(queries, k=10) + # for query, (ids, distances) in zip(queries, results): + # # convert from square back to row id + # ids = np.sqrt(ids).astype(int) + # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) + + # def testReloadIndex(self): + # np.random.seed(23) + # data = np.random.randn(1000, 10).astype(np.float32) + # + # original = self._get_index() + # original.addDataPointBatch(data) + # original.createIndex() + # + # # test out saving/reloading index + # with tempfile.NamedTemporaryFile() as tmp: + # original.saveIndex(tmp.name + ".index") + # + # reloaded = self._get_index() + # reloaded.addDataPointBatch(data) + # reloaded.loadIndex(tmp.name + ".index") + # + # original_results = original.knnQuery(data[0]) + # reloaded_results = reloaded.knnQuery(data[0]) + # npt.assert_allclose(original_results, + # reloaded_results) + + class HNSWTestCase(unittest.TestCase, DenseIndexTestMixin): def 
_get_index(self, space='cosinesimil'): return nmslib.init(method='hnsw', space=space) +class BitJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin): + def _get_index(self, space='bit_jaccard'): + return nmslib.init(method='hnsw', space='bit_jaccard', data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.INT) + + +# class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin): +# def _get_index(self, space='bit_hamming'): +# return nmslib.init(method='hnsw', space='bit_hamming', data_type=nmslib.DataType.OBJECT_AS_STRING, +# dtype=nmslib.DistType.INT) + + class SWGraphTestCase(unittest.TestCase, DenseIndexTestMixin): def _get_index(self, space='cosinesimil'): return nmslib.init(method='sw-graph', space=space) diff --git a/similarity_search/include/distcomp.h b/similarity_search/include/distcomp.h index 7863837..729d70d 100644 --- a/similarity_search/include/distcomp.h +++ b/similarity_search/include/distcomp.h @@ -223,6 +223,19 @@ int SpearmanRho(const PivotIdType* x, const PivotIdType* y, size_t qty); int SpearmanFootruleSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty); int SpearmanRhoSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty); +//template +double inline BitJaccard(const uint64_t* a, const uint64_t* b, size_t qty) { + uint64_t num = 0, den = 0; + + for (size_t i=0; i < qty; ++i) { + // __builtin_popcountll counts the number of 1s in a 64-bit word (plain __builtin_popcount would truncate to 32 bits) + num += __builtin_popcountll(a[i] & b[i]); + den += __builtin_popcountll(a[i] | b[i]); + } + + return double(num) / double(den); +} + //unsigned BitHamming(const uint32_t* a, const uint32_t* b, size_t qty); unsigned inline BitHamming(const uint32_t* a, const uint32_t* b, size_t qty) { diff --git a/similarity_search/include/factory/init_spaces.h b/similarity_search/include/factory/init_spaces.h index 0984e08..dd0aae7 100644 --- a/similarity_search/include/factory/init_spaces.h +++ b/similarity_search/include/factory/init_spaces.h @@ -19,6 +19,7 @@ #include
"factory/space/space_edist.h" #include "factory/space/space_bit_hamming.h" +#include "factory/space/space_bit_jaccard.h" #include "factory/space/space_bregman.h" #include "factory/space/space_dummy.h" #include "factory/space/space_js.h" @@ -36,15 +37,17 @@ namespace similarity { - inline void initSpaces() { // Registering a dummy space REGISTER_SPACE_CREATOR(int, SPACE_DUMMY, CreateDummy) REGISTER_SPACE_CREATOR(float, SPACE_DUMMY, CreateDummy) REGISTER_SPACE_CREATOR(double, SPACE_DUMMY, CreateDummy) - // Registering binary/bit Hamming - REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, CreateBitHamming) + // Registering binary/bit Hamming/Jaccard + SpaceFactoryRegistry::CreateFuncPtr bit_hamming_func_ptr = CreateBitHamming; + REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, bit_hamming_func_ptr ) + SpaceFactoryRegistry::CreateFuncPtr bit_jaccard_func_ptr = CreateBitJaccard; + REGISTER_SPACE_CREATOR(double, SPACE_BIT_JACCARD, bit_jaccard_func_ptr ) // Registering the Levensthein-distance: regular and normalized REGISTER_SPACE_CREATOR(int, SPACE_LEVENSHTEIN, CreateLevenshtein) diff --git a/similarity_search/include/factory/space/space_bit_hamming.h b/similarity_search/include/factory/space/space_bit_hamming.h index 53bcc34..d191594 100644 --- a/similarity_search/include/factory/space/space_bit_hamming.h +++ b/similarity_search/include/factory/space/space_bit_hamming.h @@ -23,8 +23,9 @@ namespace similarity { * Creating functions. 
*/ -inline Space* CreateBitHamming(const AnyParams& /* ignoring params */) { - return new SpaceBitHamming(); +template +inline Space* CreateBitHamming(const AnyParams& /* ignoring params */) { + return new SpaceBitHamming(); } /* diff --git a/similarity_search/include/factory/space/space_bit_jaccard.h b/similarity_search/include/factory/space/space_bit_jaccard.h new file mode 100644 index 0000000..48f81dd --- /dev/null +++ b/similarity_search/include/factory/space/space_bit_jaccard.h @@ -0,0 +1,39 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +#ifndef FACTORY_SPACE_BIT_JACCARD_H +#define FACTORY_SPACE_BIT_JACCARD_H + +#include + +namespace similarity { + +/* + * Creating functions. + */ + +template +inline Space* CreateBitJaccard(const AnyParams& /* ignoring params */) { + return new SpaceBitJaccard(); +} + +/* + * End of creating functions. 
+ */ +} + +#endif + + + diff --git a/similarity_search/include/method/perm_bin_vptree.h b/similarity_search/include/method/perm_bin_vptree.h index a58c492..6202c96 100644 --- a/similarity_search/include/method/perm_bin_vptree.h +++ b/similarity_search/include/method/perm_bin_vptree.h @@ -66,7 +66,7 @@ class PermBinVPTree : public Index { ObjectVector BinPermData_; unique_ptr>> VPTreeIndex_; - unique_ptr VPTreeSpace_; + unique_ptr> VPTreeSpace_; // disable copy and assign DISABLE_COPY_AND_ASSIGN(PermBinVPTree); diff --git a/similarity_search/include/permutation_utils.h b/similarity_search/include/permutation_utils.h index 00a141d..dd09e84 100644 --- a/similarity_search/include/permutation_utils.h +++ b/similarity_search/include/permutation_utils.h @@ -23,7 +23,6 @@ #include "rangequery.h" #include "knnquery.h" #include "permutation_type.h" -#include "distcomp.h" #include "utils.h" namespace similarity { @@ -163,6 +162,21 @@ inline void Binarize(const vector &perm, const PivotIdType thresh, } } +inline void Binarize(const vector &perm, const PivotIdType thresh, vector&bin_perm) { + size_t bin_perm_word_qty = (perm.size() + 63)/64; + + bin_perm.resize(bin_perm_word_qty); + fill(bin_perm.begin(), bin_perm.end(), 0); + + for (size_t i = 0; i < perm.size(); ++i) { + bool b =perm[i] >= thresh; + + if (b) { + bin_perm[i/64] |= (1ULL<<(i%64)) ; + } + } +} + } // namespace similarity #endif // _PERMUTATION_UTILS_H_ diff --git a/similarity_search/include/space/space_bit_hamming.h b/similarity_search/include/space/space_bit_hamming.h index d7524ae..a641aed 100644 --- a/similarity_search/include/space/space_bit_hamming.h +++ b/similarity_search/include/space/space_bit_hamming.h @@ -25,55 +25,31 @@ #include "utils.h" #include "space.h" #include "distcomp.h" +#include "space_bit_vector.h" #define SPACE_BIT_HAMMING "bit_hamming" namespace similarity { -class SpaceBitHamming : public Space { +template +class SpaceBitHamming : public SpaceBitVector { public: explicit SpaceBitHamming()
{} virtual ~SpaceBitHamming() {} - /** Standard functions to read/write/create objects */ - // Create an object from string representation. - virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s, - DataFileInputState* pInpState) const; - // Create a string representation of an object. - virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const; - // Open a file for reading, fetch a header (if there is any) and memorize an input state - virtual unique_ptr OpenReadFileHeader(const string& inputFile) const; - // Open a file for writing, write a header (if there is any) and memorize an output state - virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset, - const string& outputFile) const; - /* - * Read a string representation of the next object in a file as well - * as its label. Return false, on EOF. - */ - virtual bool ReadNextObjStr(DataFileInputState &, string& strObj, LabelType& label, string& externId) const; - /** End of standard functions to read/write/create objects */ - - /* - * Used only for testing/debugging: compares objects approximately. Floating point numbers - * should be nearly equal. Integers and strings should coincide exactly. 
- */ - virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const; - virtual std::string StrDesc() const { return "Hamming (bit-storage) space"; } - virtual void CreateDenseVectFromObj(const Object* obj, int* pVect, - size_t nElem) const { - throw runtime_error("Cannot create a dense vector for the space: " + StrDesc()); - } - virtual size_t GetElemQty(const Object* object) const {return 0;} - virtual Object* CreateObjFromVect(IdType id, LabelType label, std::vector& InpVect) const { - InpVect.push_back(InpVect.size()); - return CreateObjFromVectInternal(id, label, InpVect); - } + protected: - virtual Object* CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const; - Object* CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const; - virtual int HiddenDistance(const Object* obj1, const Object* obj2) const; - void ReadBitMaskVect(std::string line, LabelType& label, std::vector& v) const; + virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const { + CHECK(obj1->datalength() > 0); + CHECK(obj1->datalength() == obj2->datalength()); + const dist_uint_t* x = reinterpret_cast(obj1->data()); + const dist_uint_t* y = reinterpret_cast(obj2->data()); + const size_t length = obj1->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + + return BitHamming(x, y, length); + } DISABLE_COPY_AND_ASSIGN(SpaceBitHamming); }; diff --git a/similarity_search/include/space/space_bit_jaccard.h b/similarity_search/include/space/space_bit_jaccard.h new file mode 100644 index 0000000..8d7803d --- /dev/null +++ b/similarity_search/include/space/space_bit_jaccard.h @@ -0,0 +1,59 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * 
Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +#ifndef _SPACE_BIT_JACCARD_H_ +#define _SPACE_BIT_JACCARD_H_ + +#include +#include +#include + +#include +#include "global.h" +#include "object.h" +#include "utils.h" +#include "space.h" +#include "distcomp.h" +#include "space_bit_vector.h" + +#define SPACE_BIT_JACCARD "bit_jaccard" + +namespace similarity { + +template +class SpaceBitJaccard : public SpaceBitVector { + public: + explicit SpaceBitJaccard() {} + virtual ~SpaceBitJaccard() {} + + virtual std::string StrDesc() const { return "Jaccard (bit-storage) space"; } + + protected: + virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const { + CHECK(obj1->datalength() > 0); + CHECK(obj1->datalength() == obj2->datalength()); + const dist_uint_t* x = reinterpret_cast(obj1->data()); + const dist_uint_t* y = reinterpret_cast(obj2->data()); + const size_t length = obj1->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + + return BitJaccard(x, y, length); + } + + DISABLE_COPY_AND_ASSIGN(SpaceBitJaccard); +}; + +} // namespace similarity + +#endif diff --git a/similarity_search/include/space/space_bit_vector.h b/similarity_search/include/space/space_bit_vector.h new file mode 100644 index 0000000..a2dd172 --- /dev/null +++ b/similarity_search/include/space/space_bit_vector.h @@ -0,0 +1,209 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + */ +#ifndef _SPACE_BIT_VECTOR_H_ +#define _SPACE_BIT_VECTOR_H_ + +#include +#include +#include + +#include +#include "global.h" +#include "object.h" +#include "utils.h" +#include "space.h" +#include "distcomp.h" +#include "read_data.h" +#include "permutation_utils.h" +#include "logging.h" +#include "experimentconf.h" + + +// Defines an abstract base class for BitVector spaces + +namespace similarity { + +template +class SpaceBitVector : public Space { + public: + explicit SpaceBitVector() {} + virtual ~SpaceBitVector() {} + + /** Standard functions to read/write/create objects */ + // Create an object from string representation. + virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s, + DataFileInputState* pInpStateBase) const { + DataFileInputStateVec* pInpState = NULL; + if (pInpStateBase != NULL) { + pInpState = dynamic_cast(pInpStateBase); + if (NULL == pInpState) { + PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; + THROW_RUNTIME_ERR(err); + } + } + vector vec; + ReadBitMaskVect(s, label, vec); + if (pInpState != NULL) { + size_t elemQty = vec[vec.size() - 1]; + if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; + else if (elemQty != pInpState->dim_) { + PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << + " doesn't match the # of elements in previous lines. (" << pInpState->dim_ << " )"; + THROW_RUNTIME_ERR(err); + } + } + return unique_ptr(CreateObjFromVectInternal(id, label, vec)); + } + + // Create a string representation of an object. 
+ virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { + stringstream out; + const dist_uint_t* p = reinterpret_cast(pObj->data()); + const size_t length = pObj->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + const size_t elemQty = p[length]; // last elem + + for (size_t i = 0; i < elemQty; ++i) { + if (i) out << " "; + out << ((p[i/(8*sizeof(dist_uint_t))] >> (i % (8*sizeof(dist_uint_t)))) & 1); + } + + return out.str(); + } + + // Open a file for reading, fetch a header (if there is any) and memorize an input state + virtual unique_ptr OpenReadFileHeader(const string& inpFileName) const { + return unique_ptr(new DataFileInputStateVec(inpFileName)); + } + + // Open a file for writing, write a header (if there is any) and memorize an output state + virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset, + const string& outFileName) const { + return unique_ptr(new DataFileOutputState(outFileName)); + } + + /* + * Read a string representation of the next object in a file as well + * as its label. Return false, on EOF. + */ + virtual bool ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { + externId.clear(); + DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); + CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); + if (!pInpState->inp_file_) return false; + if (!getline(pInpState->inp_file_, strObj)) return false; + pInpState->line_num_++; + return true; + } + /** End of standard functions to read/write/create objects */ + + /* + * Used only for testing/debugging: compares objects approximately. Floating point numbers + * should be nearly equal. Integers and strings should coincide exactly.
+ */ + virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const { + const dist_uint_t* p1 = reinterpret_cast(obj1.data()); + const dist_uint_t* p2 = reinterpret_cast(obj2.data()); + const size_t len1 = obj1.datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + const size_t len2 = obj2.datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + if (len1 != len2) { + PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; + THROW_RUNTIME_ERR(err); + } + for (size_t i = 0; i < len1; ++i) { + dist_uint_t v1 = ((p1[i/32] >> (i & 31)) & 1); + dist_uint_t v2 = ((p2[i/32] >> (i & 31)) & 1); + if (v1 != v2) return false; + } + + return true; + } + + virtual std::string StrDesc() const { return "Vector (bit-storage) space"; } + virtual void CreateDenseVectFromObj(const Object* obj, dist_t* pVect, + size_t nElem) const { + throw runtime_error("Cannot create a dense vector for the space: " + StrDesc()); + } + virtual size_t GetElemQty(const Object* object) const {return 0;} + virtual Object* CreateObjFromVect(IdType id, LabelType label, std::vector& InpVect) const { + InpVect.push_back(InpVect.size()); + return CreateObjFromVectInternal(id, label, InpVect); + } + protected: + virtual Object* CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { + return new Object(id, label, InpVect.size() * sizeof(dist_uint_t), &InpVect[0]); + } + + Object* CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { + return new Object(id, label, bitMaskVect.size() * sizeof(dist_uint_t), &bitMaskVect[0]); + } + +// virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const; + void ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const { + binVect.clear(); + + label = Object::extractLabel(line); + + std::stringstream str(line); + + 
str.exceptions(std::ios::badbit); + + + ReplaceSomePunct(line); + + vector v; + + #if 0 + try { + unsigned val; + + while (str >> val) { + if (val != 0 && val != 1) { + throw runtime_error("Only zeros and ones are allowed"); + } + v.push_back(val); + } + } catch (const std::exception &e) { + LOG(LIB_ERROR) << "Exception: " << e.what(); + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + #else + if (!ReadVecDataEfficiently(line, v)) { + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + for (auto val : v) { + if (val != 0 && val != 1) { + PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are allowed, offending line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + } + #endif + Binarize(v, 1, binVect); // Create the binary vector + binVect.push_back(v.size()); // Put the number of elements in the end + } + + DISABLE_COPY_AND_ASSIGN(SpaceBitVector); +}; + +} // namespace similarity + +#endif diff --git a/similarity_search/src/method/perm_bin_vptree.cc b/similarity_search/src/method/perm_bin_vptree.cc index 9eca6a0..40fff85 100644 --- a/similarity_search/src/method/perm_bin_vptree.cc +++ b/similarity_search/src/method/perm_bin_vptree.cc @@ -38,7 +38,7 @@ PermBinVPTree::PermBinVPTree( const ObjectVector& data) : Index(data), space_(space), PrintProgress_(PrintProgress), - VPTreeSpace_(new SpaceBitHamming()) + VPTreeSpace_(new SpaceBitHamming()) {} template diff --git a/similarity_search/src/space/space_bit_hamming.cc b/similarity_search/src/space/space_bit_hamming.cc index f4dbc57..72c2d18 100644 --- a/similarity_search/src/space/space_bit_hamming.cc +++ b/similarity_search/src/space/space_bit_hamming.cc @@ -1,185 +1,44 @@ -/** - * Non-metric Space Library - * - * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben 
Frederickson, David Novak - * - * For the complete list of contributors and further details see: - * https://github.com/searchivarius/NonMetricSpaceLib - * - * Copyright (c) 2013-2018 - * - * This code is released under the - * Apache License Version 2.0 http://www.apache.org/licenses/. - * - */ -#include -#include -#include -#include -#include - -#include "space/space_bit_hamming.h" -#include "permutation_utils.h" -#include "logging.h" -#include "distcomp.h" -#include "read_data.h" -#include "experimentconf.h" - -namespace similarity { - -using namespace std; - -int SpaceBitHamming::HiddenDistance(const Object* obj1, const Object* obj2) const { - CHECK(obj1->datalength() > 0); - CHECK(obj1->datalength() == obj2->datalength()); - const uint32_t* x = reinterpret_cast(obj1->data()); - const uint32_t* y = reinterpret_cast(obj2->data()); - const size_t length = obj1->datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - - return BitHamming(x, y, length); -} - -void SpaceBitHamming::ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const -{ - binVect.clear(); - - label = Object::extractLabel(line); - - std::stringstream str(line); - - str.exceptions(std::ios::badbit); - - - ReplaceSomePunct(line); - - vector v; - -#if 0 - try { - unsigned val; - - while (str >> val) { - if (val != 0 && val != 1) { - throw runtime_error("Only zeros and ones are allowed"); - } - v.push_back(val); - } - } catch (const std::exception &e) { - LOG(LIB_ERROR) << "Exception: " << e.what(); - PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } -#else - if (!ReadVecDataEfficiently(line, v)) { - PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } - for (auto val : v) { - if (val != 0 && val != 1) { - PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are 
allowed, offending line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } - } -#endif - Binarize(v, 1, binVect); // Create the binary vector - binVect.push_back(v.size()); // Put the number of elements in the end -} - -Object* SpaceBitHamming::CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { - return new Object(id, label, bitMaskVect.size() * sizeof(uint32_t), &bitMaskVect[0]); -}; - -/** Standard functions to read/write/create objects */ - -unique_ptr SpaceBitHamming::OpenReadFileHeader(const string& inpFileName) const { - return unique_ptr(new DataFileInputStateVec(inpFileName)); -} - -unique_ptr SpaceBitHamming::OpenWriteFileHeader(const ObjectVector& dataset, - const string& outFileName) const { - return unique_ptr(new DataFileOutputState(outFileName)); -} - -unique_ptr -SpaceBitHamming::CreateObjFromStr(IdType id, LabelType label, const string& s, - DataFileInputState* pInpStateBase) const { - DataFileInputStateVec* pInpState = NULL; - if (pInpStateBase != NULL) { - pInpState = dynamic_cast(pInpStateBase); - if (NULL == pInpState) { - PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; - THROW_RUNTIME_ERR(err); - } - } - vector vec; - ReadBitMaskVect(s, label, vec); - if (pInpState != NULL) { - size_t elemQty = vec[vec.size() - 1]; - if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; - else if (elemQty != pInpState->dim_) { - PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << - " doesn't match the # of elements in previous lines. 
(" << pInpState->dim_ << " )"; - THROW_RUNTIME_ERR(err); - } - } - return unique_ptr(CreateObjFromVectInternal(id, label, vec)); -} - -Object* SpaceBitHamming::CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { - return new Object(id, label, InpVect.size() * sizeof(uint32_t), &InpVect[0]); -}; - -bool SpaceBitHamming::ApproxEqual(const Object& obj1, const Object& obj2) const { - const uint32_t* p1 = reinterpret_cast(obj1.data()); - const uint32_t* p2 = reinterpret_cast(obj2.data()); - const size_t len1 = obj1.datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - const size_t len2 = obj2.datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - if (len1 != len2) { - PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; - THROW_RUNTIME_ERR(err); - } - for (size_t i = 0; i < len1; ++i) { - uint32_t v1 = ((p1[i/32] >> (i & 31)) & 1); - uint32_t v2 = ((p2[i/32] >> (i & 31)) & 1); - if (v1 != v2) return false; - } - - return true; -} - - -string SpaceBitHamming::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { - stringstream out; - const uint32_t* p = reinterpret_cast(pObj->data()); - const size_t length = pObj->datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - const size_t elemQty = p[length]; // last elem - - for (size_t i = 0; i < elemQty; ++i) { - if (i) out << " "; - out << ((p[i/32] >> (i & 31)) & 1); - } - - return out.str(); -} - -bool SpaceBitHamming::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { - externId.clear(); - DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); - CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); - if (!pInpState->inp_file_) return false; - if (!getline(pInpState->inp_file_, strObj)) return false; - 
pInpState->line_num_++; - return true; -} - - -/** End of standard functions to read/write/create objects */ - -} // namespace similarity +///** +// * Non-metric Space Library +// * +// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak +// * +// * For the complete list of contributors and further details see: +// * https://github.com/searchivarius/NonMetricSpaceLib +// * +// * Copyright (c) 2013-2018 +// * +// * This code is released under the +// * Apache License Version 2.0 http://www.apache.org/licenses/. +// * +// */ +//#include +//#include +//#include +//#include +//#include +// +//#include "space/space_bit_hamming.h" +//#include "permutation_utils.h" +//#include "logging.h" +//#include "distcomp.h" +//#include "read_data.h" +//#include "experimentconf.h" +// +//namespace similarity { +// +//using namespace std; +// +//template +//dist_t SpaceBitHamming::HiddenDistance(const Object* obj1, const Object* obj2) const { +// CHECK(obj1->datalength() > 0); +// CHECK(obj1->datalength() == obj2->datalength()); +// const dist_uint_t* x = reinterpret_cast(obj1->data()); +// const dist_uint_t* y = reinterpret_cast(obj2->data()); +// const size_t length = obj1->datalength() / sizeof(dist_uint_t) +// - 1; // the last integer is an original number of elements +// +// return BitHamming(x, y, length); +//} +// +//} diff --git a/similarity_search/src/space/space_bit_jaccard.cc b/similarity_search/src/space/space_bit_jaccard.cc new file mode 100644 index 0000000..aa0ecad --- /dev/null +++ b/similarity_search/src/space/space_bit_jaccard.cc @@ -0,0 +1,44 @@ +///** +// * Non-metric Space Library +// * +// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak +// * +// * For the complete list of contributors and further details see: +// * https://github.com/searchivarius/NonMetricSpaceLib +// * +// * Copyright (c) 2013-2018 +// * +// * This code is released under the +// * Apache License 
Version 2.0 http://www.apache.org/licenses/. +// * +// */ +//#include +//#include +//#include +//#include +//#include +// +//#include "space/space_bit_jaccard.h" +//#include "permutation_utils.h" +//#include "logging.h" +//#include "distcomp.h" +//#include "read_data.h" +//#include "experimentconf.h" +// +////namespace similarity { +//// +//////using namespace std; +//// +//////template +//////dist_t SpaceBitJaccard::HiddenDistance(const Object* obj1, const Object* obj2) const { +////// CHECK(obj1->datalength() > 0); +////// CHECK(obj1->datalength() == obj2->datalength()); +////// const dist_uint_t* x = reinterpret_cast(obj1->data()); +////// const dist_uint_t* y = reinterpret_cast(obj2->data()); +////// const size_t length = obj1->datalength() / sizeof(dist_uint_t) +////// - 1; // the last integer is an original number of elements +////// +////// return BitJaccard(x, y, length); +//////} +//// +////} diff --git a/similarity_search/src/space/space_bit_vector.cc b/similarity_search/src/space/space_bit_vector.cc new file mode 100644 index 0000000..043319d --- /dev/null +++ b/similarity_search/src/space/space_bit_vector.cc @@ -0,0 +1,195 @@ +///** +// * Non-metric Space Library +// * +// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak +// * +// * For the complete list of contributors and further details see: +// * https://github.com/searchivarius/NonMetricSpaceLib +// * +// * Copyright (c) 2013-2018 +// * +// * This code is released under the +// * Apache License Version 2.0 http://www.apache.org/licenses/. 
+// * +// */ +//#include +//#include +//#include +//#include +//#include +// +//#include "space/space_bit_vector.h" +//#include "permutation_utils.h" +//#include "logging.h" +//#include "distcomp.h" +//#include "read_data.h" +//#include "experimentconf.h" +// +//namespace similarity { +// +//using namespace std; +// +////template +////dist_t SpaceBitVector::HiddenDistance(const Object* obj1, const Object* obj2) const { +//// CHECK(obj1->datalength() > 0); +//// CHECK(obj1->datalength() == obj2->datalength()); +//// const dist_uint_t* x = reinterpret_cast(obj1->data()); +//// const dist_uint_t* y = reinterpret_cast(obj2->data()); +//// const size_t length = obj1->datalength() / sizeof(dist_uint_t) +//// - 1; // the last integer is an original number of elements +//// +//// return BitVector(x, y, length); +////} +// +////template +////void SpaceBitVector::ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const +////{ +//// binVect.clear(); +//// +//// label = Object::extractLabel(line); +//// +//// std::stringstream str(line); +//// +//// str.exceptions(std::ios::badbit); +//// +//// +//// ReplaceSomePunct(line); +//// +//// vector v; +//// +////#if 0 +//// try { +//// unsigned val; +//// +//// while (str >> val) { +//// if (val != 0 && val != 1) { +//// throw runtime_error("Only zeros and ones are allowed"); +//// } +//// v.push_back(val); +//// } +//// } catch (const std::exception &e) { +//// LOG(LIB_ERROR) << "Exception: " << e.what(); +//// PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; +//// LOG(LIB_ERROR) << err.stream().str(); +//// THROW_RUNTIME_ERR(err); +//// } +////#else +//// if (!ReadVecDataEfficiently(line, v)) { +//// PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; +//// LOG(LIB_ERROR) << err.stream().str(); +//// THROW_RUNTIME_ERR(err); +//// } +//// for (auto val : v) { +//// if (val != 0 && val != 1) { +//// PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are allowed, 
offending line: '" << line << "'"; +//// LOG(LIB_ERROR) << err.stream().str(); +//// THROW_RUNTIME_ERR(err); +//// } +//// } +////#endif +//// Binarize(v, 1, binVect); // Create the binary vector +//// binVect.push_back(v.size()); // Put the number of elements in the end +////} +// +////template +////Object* SpaceBitVector::CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { +//// return new Object(id, label, bitMaskVect.size() * sizeof(dist_uint_t), &bitMaskVect[0]); +////}; +// +///** Standard functions to read/write/create objects */ +// +////template +////unique_ptr SpaceBitVector::OpenReadFileHeader(const string& inpFileName) const { +//// return unique_ptr(new DataFileInputStateVec(inpFileName)); +////} +// +////template +////unique_ptr SpaceBitVector::OpenWriteFileHeader(const ObjectVector& dataset, +//// const string& outFileName) const { +//// return unique_ptr(new DataFileOutputState(outFileName)); +////} +// +////template +////unique_ptr +////SpaceBitVector::CreateObjFromStr(IdType id, LabelType label, const string& s, +//// DataFileInputState* pInpStateBase) const { +//// DataFileInputStateVec* pInpState = NULL; +//// if (pInpStateBase != NULL) { +//// pInpState = dynamic_cast(pInpStateBase); +//// if (NULL == pInpState) { +//// PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; +//// THROW_RUNTIME_ERR(err); +//// } +//// } +//// vector vec; +//// ReadBitMaskVect(s, label, vec); +//// if (pInpState != NULL) { +//// size_t elemQty = vec[vec.size() - 1]; +//// if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; +//// else if (elemQty != pInpState->dim_) { +//// PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << +//// " doesn't match the # of elements in previous lines. 
(" << pInpState->dim_ << " )"; +//// THROW_RUNTIME_ERR(err); +//// } +//// } +//// return unique_ptr(CreateObjFromVectInternal(id, label, vec)); +////} +// +////template +////Object* SpaceBitVector::CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { +//// return new Object(id, label, InpVect.size() * sizeof(dist_uint_t), &InpVect[0]); +////}; +// +////template +////bool SpaceBitVector::ApproxEqual(const Object& obj1, const Object& obj2) const { +//// const dist_uint_t* p1 = reinterpret_cast(obj1.data()); +//// const dist_uint_t* p2 = reinterpret_cast(obj2.data()); +//// const size_t len1 = obj1.datalength() / sizeof(dist_uint_t) +//// - 1; // the last integer is an original number of elements +//// const size_t len2 = obj2.datalength() / sizeof(dist_uint_t) +//// - 1; // the last integer is an original number of elements +//// if (len1 != len2) { +//// PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; +//// THROW_RUNTIME_ERR(err); +//// } +//// for (size_t i = 0; i < len1; ++i) { +//// dist_uint_t v1 = ((p1[i/32] >> (i & 31)) & 1); +//// dist_uint_t v2 = ((p2[i/32] >> (i & 31)) & 1); +//// if (v1 != v2) return false; +//// } +//// +//// return true; +////} +// +// +////template +////string SpaceBitVector::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { +//// stringstream out; +//// const dist_uint_t* p = reinterpret_cast(pObj->data()); +//// const size_t length = pObj->datalength() / sizeof(dist_uint_t) +//// - 1; // the last integer is an original number of elements +//// const size_t elemQty = p[length]; // last elem +//// +//// for (size_t i = 0; i < elemQty; ++i) { +//// if (i) out << " "; +//// out << ((p[i/32] >> (i & 31)) & 1); +//// } +//// +//// return out.str(); +////} +// +////template +////bool SpaceBitVector::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { +//// 
externId.clear(); +//// DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); +//// CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); +//// if (!pInpState->inp_file_) return false; +//// if (!getline(pInpState->inp_file_, strObj)) return false; +//// pInpState->line_num_++; +//// return true; +////} +// +// +///** End of standard functions to read/write/create objects */ +// +//} // namespace similarity diff --git a/similarity_search/test/test_space_serial.cc b/similarity_search/test/test_space_serial.cc index dddc978..53e1017 100644 --- a/similarity_search/test/test_space_serial.cc +++ b/similarity_search/test/test_space_serial.cc @@ -215,6 +215,23 @@ TEST(Test_BitHamming) { } } +TEST(Test_BitJaccard) { + vector testVect; + + for (size_t i = 0; i < MAX_NUM_REC; ++i) { + stringstream ss; + + for (size_t k = 0; k < 128; ++k) { + if (k) ss << " "; + ss << (RandomInt() % 2); + } + testVect.push_back(ss.str()); + } + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest(testVect, maxNumRec, "tmp_out_file.txt", "bit_jaccard", emptyParams, false)); + } +} + #if defined(WITH_EXTRAS) TEST(Test_SQFD) { const char* sqfdParams[] = {"alpha=1", NULL} ;