diff --git a/.gitignore b/.gitignore index d4c34ec..d48b88b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ similarity_search/test/Makefile *.so *.pyc *.egg-info/ +.idea diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index 667e994..61daf70 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -18,6 +18,14 @@ def get_hitrate(ground_truth, ids): return len(set(i for i, _ in ground_truth).intersection(ids)) +def bit_vector_to_str(bit_vect): + return " ".join(["1" if e else "0" for e in bit_vect]) + + +def bit_vector_sparse_str(bit_vect): + return " ".join([str(k) for k, b in enumerate(bit_vect) if b]) + + class DenseIndexTestMixin(object): def _get_index(self, space='cosinesimil'): raise NotImplementedError() @@ -95,11 +103,86 @@ def testReloadIndex(self): reloaded_results) +class BitVectorIndexTestMixin(object): + def _get_index(self, space='bit_jaccard'): + raise NotImplementedError() + + def _get_batches(self, index, nbits, num_elems, chunk_size): + if "bit_" in str(index): + self.bit_vector_str_func = bit_vector_to_str + else: + self.bit_vector_str_func = bit_vector_sparse_str + + batches = [] + for i in range(0, num_elems, chunk_size): + strs = [] + for j in range(chunk_size): + a = np.random.rand(nbits) > 0.5 + strs.append(self.bit_vector_str_func(a)) + batches.append([np.arange(i, i + chunk_size), strs]) + return batches + + def testKnnQuery(self): + np.random.seed(23) + + index = self._get_index() + batches = self._get_batches(index, 512, 2000, 1000) + for ids, data in batches: + index.addDataPointBatch(ids=ids, data=data) + + index.createIndex() + + s = self.bit_vector_str_func(np.ones(512)) + index.knnQuery(s, k=10) + + def testReloadIndex(self): + np.random.seed(23) + + original = self._get_index() + batches = self._get_batches(original, 512, 2000, 1000) + for ids, data in batches: + original.addDataPointBatch(ids=ids, data=data) + original.createIndex() + 
+ # test out saving/reloading index + with tempfile.NamedTemporaryFile() as tmp: + original.saveIndex(tmp.name + ".index") + + reloaded = self._get_index() + for ids, data in batches: + reloaded.addDataPointBatch(ids=ids, data=data) + reloaded.loadIndex(tmp.name + ".index") + + s = self.bit_vector_str_func(np.ones(512)) + original_results = original.knnQuery(s) + reloaded_results = reloaded.knnQuery(s) + npt.assert_allclose(original_results, + reloaded_results) + + class HNSWTestCase(unittest.TestCase, DenseIndexTestMixin): def _get_index(self, space='cosinesimil'): return nmslib.init(method='hnsw', space=space) +class BitJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin): + def _get_index(self, space='bit_jaccard'): + return nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.FLOAT) + + +class SparseJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin): + def _get_index(self, space='jaccard_sparse'): + return nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.FLOAT) + + +class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin): + def _get_index(self, space='bit_hamming'): + return nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.INT) + + class SWGraphTestCase(unittest.TestCase, DenseIndexTestMixin): def _get_index(self, space='cosinesimil'): return nmslib.init(method='sw-graph', space=space) diff --git a/similarity_search/include/distcomp.h b/similarity_search/include/distcomp.h index 7863837..84e960b 100644 --- a/similarity_search/include/distcomp.h +++ b/similarity_search/include/distcomp.h @@ -223,6 +223,19 @@ int SpearmanRho(const PivotIdType* x, const PivotIdType* y, size_t qty); int SpearmanFootruleSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty); int SpearmanRhoSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty); +template +dist_t inline 
BitJaccard(const dist_uint_t* a, const dist_uint_t* b, size_t qty) { + dist_uint_t num = 0, den = 0; + + for (size_t i=0; i < qty; ++i) { + // __builtin_popcount quickly computes the number of 1s + num += __builtin_popcount(a[i] & b[i]); + den += __builtin_popcount(a[i] | b[i]); + } + + return den == 0 ? dist_t(0) : 1 - (dist_t(num) / dist_t(den)); /* both inputs all-zero: identical sets, distance 0 instead of 0/0 */ +} + //unsigned BitHamming(const uint32_t* a, const uint32_t* b, size_t qty); unsigned inline BitHamming(const uint32_t* a, const uint32_t* b, size_t qty) { diff --git a/similarity_search/include/factory/init_spaces.h b/similarity_search/include/factory/init_spaces.h index 0984e08..8abee56 100644 --- a/similarity_search/include/factory/init_spaces.h +++ b/similarity_search/include/factory/init_spaces.h @@ -19,6 +19,7 @@ #include "factory/space/space_edist.h" #include "factory/space/space_bit_hamming.h" +#include "factory/space/space_bit_jaccard.h" #include "factory/space/space_bregman.h" #include "factory/space/space_dummy.h" #include "factory/space/space_js.h" @@ -36,15 +37,17 @@ namespace similarity { - inline void initSpaces() { // Registering a dummy space REGISTER_SPACE_CREATOR(int, SPACE_DUMMY, CreateDummy) REGISTER_SPACE_CREATOR(float, SPACE_DUMMY, CreateDummy) REGISTER_SPACE_CREATOR(double, SPACE_DUMMY, CreateDummy) - // Registering binary/bit Hamming - REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, CreateBitHamming) + // Registering binary/bit Hamming/Jaccard + SpaceFactoryRegistry::CreateFuncPtr bit_hamming_func_ptr = CreateBitHamming; + REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, bit_hamming_func_ptr ) + SpaceFactoryRegistry::CreateFuncPtr bit_jaccard_func_ptr = CreateBitJaccard; + REGISTER_SPACE_CREATOR(float, SPACE_BIT_JACCARD, bit_jaccard_func_ptr ) // Registering the Levensthein-distance: regular and normalized REGISTER_SPACE_CREATOR(int, SPACE_LEVENSHTEIN, CreateLevenshtein) diff --git a/similarity_search/include/factory/space/space_bit_hamming.h b/similarity_search/include/factory/space/space_bit_hamming.h index 
53bcc34..d191594 100644 --- a/similarity_search/include/factory/space/space_bit_hamming.h +++ b/similarity_search/include/factory/space/space_bit_hamming.h @@ -23,8 +23,9 @@ namespace similarity { * Creating functions. */ -inline Space* CreateBitHamming(const AnyParams& /* ignoring params */) { - return new SpaceBitHamming(); +template +inline Space* CreateBitHamming(const AnyParams& /* ignoring params */) { + return new SpaceBitHamming(); } /* diff --git a/similarity_search/include/factory/space/space_bit_jaccard.h b/similarity_search/include/factory/space/space_bit_jaccard.h new file mode 100644 index 0000000..48f81dd --- /dev/null +++ b/similarity_search/include/factory/space/space_bit_jaccard.h @@ -0,0 +1,39 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +#ifndef FACTORY_SPACE_BIT_JACCARD_H +#define FACTORY_SPACE_BIT_JACCARD_H + +#include + +namespace similarity { + +/* + * Creating functions. + */ + +template +inline Space* CreateBitJaccard(const AnyParams& /* ignoring params */) { + return new SpaceBitJaccard(); +} + +/* + * End of creating functions. 
+ */ +} + +#endif + + + diff --git a/similarity_search/include/method/perm_bin_vptree.h b/similarity_search/include/method/perm_bin_vptree.h index a58c492..6202c96 100644 --- a/similarity_search/include/method/perm_bin_vptree.h +++ b/similarity_search/include/method/perm_bin_vptree.h @@ -66,7 +66,7 @@ class PermBinVPTree : public Index { ObjectVector BinPermData_; unique_ptr>> VPTreeIndex_; - unique_ptr VPTreeSpace_; + unique_ptr> VPTreeSpace_; // disable copy and assign DISABLE_COPY_AND_ASSIGN(PermBinVPTree); diff --git a/similarity_search/include/permutation_utils.h b/similarity_search/include/permutation_utils.h index 00a141d..dd09e84 100644 --- a/similarity_search/include/permutation_utils.h +++ b/similarity_search/include/permutation_utils.h @@ -23,7 +23,6 @@ #include "rangequery.h" #include "knnquery.h" #include "permutation_type.h" -#include "distcomp.h" #include "utils.h" namespace similarity { @@ -163,6 +162,21 @@ inline void Binarize(const vector &perm, const PivotIdType thresh, } } +inline void Binarize(const vector &perm, const PivotIdType thresh, vector&bin_perm) { + size_t bin_perm_word_qty = (perm.size() + 63)/64; + + bin_perm.resize(bin_perm_word_qty); + fill(bin_perm.begin(), bin_perm.end(), 0); + + for (size_t i = 0; i < perm.size(); ++i) { + bool b =perm[i] >= thresh; + + if (b) { + bin_perm[i/64] |= (1ULL<<(i%64)) ; /* 64-bit literal: shifting a plain int by 32 or more bits is undefined */ + } + } +} + } // namespace similarity #endif // _PERMUTATION_UTILS_H_ diff --git a/similarity_search/include/space/space_bit_hamming.h b/similarity_search/include/space/space_bit_hamming.h index d7524ae..a641aed 100644 --- a/similarity_search/include/space/space_bit_hamming.h +++ b/similarity_search/include/space/space_bit_hamming.h @@ -25,55 +25,31 @@ #include "utils.h" #include "space.h" #include "distcomp.h" +#include "space_bit_vector.h" #define SPACE_BIT_HAMMING "bit_hamming" namespace similarity { -class SpaceBitHamming : public Space { +template +class SpaceBitHamming : public SpaceBitVector { public: explicit SpaceBitHamming() 
{} virtual ~SpaceBitHamming() {} - /** Standard functions to read/write/create objects */ - // Create an object from string representation. - virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s, - DataFileInputState* pInpState) const; - // Create a string representation of an object. - virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const; - // Open a file for reading, fetch a header (if there is any) and memorize an input state - virtual unique_ptr OpenReadFileHeader(const string& inputFile) const; - // Open a file for writing, write a header (if there is any) and memorize an output state - virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset, - const string& outputFile) const; - /* - * Read a string representation of the next object in a file as well - * as its label. Return false, on EOF. - */ - virtual bool ReadNextObjStr(DataFileInputState &, string& strObj, LabelType& label, string& externId) const; - /** End of standard functions to read/write/create objects */ - - /* - * Used only for testing/debugging: compares objects approximately. Floating point numbers - * should be nearly equal. Integers and strings should coincide exactly. 
- */ - virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const; - virtual std::string StrDesc() const { return "Hamming (bit-storage) space"; } - virtual void CreateDenseVectFromObj(const Object* obj, int* pVect, - size_t nElem) const { - throw runtime_error("Cannot create a dense vector for the space: " + StrDesc()); - } - virtual size_t GetElemQty(const Object* object) const {return 0;} - virtual Object* CreateObjFromVect(IdType id, LabelType label, std::vector& InpVect) const { - InpVect.push_back(InpVect.size()); - return CreateObjFromVectInternal(id, label, InpVect); - } + protected: - virtual Object* CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const; - Object* CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const; - virtual int HiddenDistance(const Object* obj1, const Object* obj2) const; - void ReadBitMaskVect(std::string line, LabelType& label, std::vector& v) const; + virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const { + CHECK(obj1->datalength() > 0); + CHECK(obj1->datalength() == obj2->datalength()); + const dist_uint_t* x = reinterpret_cast(obj1->data()); + const dist_uint_t* y = reinterpret_cast(obj2->data()); + const size_t length = obj1->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + + return BitHamming(x, y, length); + } DISABLE_COPY_AND_ASSIGN(SpaceBitHamming); }; diff --git a/similarity_search/include/space/space_bit_jaccard.h b/similarity_search/include/space/space_bit_jaccard.h new file mode 100644 index 0000000..b53a3d7 --- /dev/null +++ b/similarity_search/include/space/space_bit_jaccard.h @@ -0,0 +1,59 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * 
Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +#ifndef _SPACE_BIT_JACCARD_H_ +#define _SPACE_BIT_JACCARD_H_ + +#include +#include +#include + +#include +#include "global.h" +#include "object.h" +#include "utils.h" +#include "space.h" +#include "distcomp.h" +#include "space_bit_vector.h" + +#define SPACE_BIT_JACCARD "bit_jaccard" + +namespace similarity { + +template +class SpaceBitJaccard : public SpaceBitVector { + public: + explicit SpaceBitJaccard() {} + virtual ~SpaceBitJaccard() {} + + virtual std::string StrDesc() const { return "Jaccard (bit-storage) space"; } + + protected: + virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const { + CHECK(obj1->datalength() > 0); + CHECK(obj1->datalength() == obj2->datalength()); + const dist_uint_t* x = reinterpret_cast(obj1->data()); + const dist_uint_t* y = reinterpret_cast(obj2->data()); + const size_t length = obj1->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + + return BitJaccard(x, y, length); + } + + DISABLE_COPY_AND_ASSIGN(SpaceBitJaccard); +}; + +} // namespace similarity + +#endif diff --git a/similarity_search/include/space/space_bit_vector.h b/similarity_search/include/space/space_bit_vector.h new file mode 100644 index 0000000..a2dd172 --- /dev/null +++ b/similarity_search/include/space/space_bit_vector.h @@ -0,0 +1,209 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + */ +#ifndef _SPACE_BIT_VECTOR_H_ +#define _SPACE_BIT_VECTOR_H_ + +#include +#include +#include + +#include +#include "global.h" +#include "object.h" +#include "utils.h" +#include "space.h" +#include "distcomp.h" +#include "read_data.h" +#include "permutation_utils.h" +#include "logging.h" +#include "experimentconf.h" + + +// Defines an abstract base class for BitVector spaces + +namespace similarity { + +template +class SpaceBitVector : public Space { + public: + explicit SpaceBitVector() {} + virtual ~SpaceBitVector() {} + + /** Standard functions to read/write/create objects */ + // Create an object from string representation. + virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s, + DataFileInputState* pInpStateBase) const { + DataFileInputStateVec* pInpState = NULL; + if (pInpStateBase != NULL) { + pInpState = dynamic_cast(pInpStateBase); + if (NULL == pInpState) { + PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; + THROW_RUNTIME_ERR(err); + } + } + vector vec; + ReadBitMaskVect(s, label, vec); + if (pInpState != NULL) { + size_t elemQty = vec[vec.size() - 1]; + if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; + else if (elemQty != pInpState->dim_) { + PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << + " doesn't match the # of elements in previous lines. (" << pInpState->dim_ << " )"; + THROW_RUNTIME_ERR(err); + } + } + return unique_ptr(CreateObjFromVectInternal(id, label, vec)); + } + + // Create a string representation of an object. 
+ virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { + stringstream out; + const dist_uint_t* p = reinterpret_cast(pObj->data()); + const size_t length = pObj->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + const size_t elemQty = p[length]; // last elem + + for (size_t i = 0; i < elemQty; ++i) { + if (i) out << " "; + out << ((p[i/(8*sizeof(dist_uint_t))] >> (i % (8*sizeof(dist_uint_t)))) & 1); /* bits per word follow the storage type instead of a hard-coded 32 */ + } + + return out.str(); + } + + // Open a file for reading, fetch a header (if there is any) and memorize an input state + virtual unique_ptr OpenReadFileHeader(const string& inpFileName) const { + return unique_ptr(new DataFileInputStateVec(inpFileName)); + } + + // Open a file for writing, write a header (if there is any) and memorize an output state + virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset, + const string& outFileName) const { + return unique_ptr(new DataFileOutputState(outFileName)); + } + + /* + * Read a string representation of the next object in a file as well + * as its label. Return false, on EOF. + */ + virtual bool ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { + externId.clear(); + DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); + CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); + if (!pInpState->inp_file_) return false; + if (!getline(pInpState->inp_file_, strObj)) return false; + pInpState->line_num_++; + return true; + } + /** End of standard functions to read/write/create objects */ + + /* + * Used only for testing/debugging: compares objects approximately. Floating point numbers + * should be nearly equal. Integers and strings should coincide exactly. 
+ */ + virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const { + const dist_uint_t* p1 = reinterpret_cast(obj1.data()); + const dist_uint_t* p2 = reinterpret_cast(obj2.data()); + const size_t len1 = obj1.datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + const size_t len2 = obj2.datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + if (len1 != len2) { + PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; + THROW_RUNTIME_ERR(err); + } + for (size_t i = 0; i < len1; ++i) { /* len1 counts packed words, so compare whole words rather than the first len1 bits */ + dist_uint_t v1 = p1[i]; + dist_uint_t v2 = p2[i]; + if (v1 != v2) return false; + } + + return true; + } + + virtual std::string StrDesc() const { return "Vector (bit-storage) space"; } + virtual void CreateDenseVectFromObj(const Object* obj, dist_t* pVect, + size_t nElem) const { + throw runtime_error("Cannot create a dense vector for the space: " + StrDesc()); + } + virtual size_t GetElemQty(const Object* object) const {return 0;} + virtual Object* CreateObjFromVect(IdType id, LabelType label, std::vector& InpVect) const { + InpVect.push_back(InpVect.size()); + return CreateObjFromVectInternal(id, label, InpVect); + } + protected: + virtual Object* CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { + return new Object(id, label, InpVect.size() * sizeof(dist_uint_t), &InpVect[0]); + } + + Object* CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { + return new Object(id, label, bitMaskVect.size() * sizeof(dist_uint_t), &bitMaskVect[0]); + } + +// virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const; + void ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const { + binVect.clear(); + + label = Object::extractLabel(line); + + std::stringstream str(line); + + 
str.exceptions(std::ios::badbit); + + + ReplaceSomePunct(line); + + vector v; + + #if 0 + try { + unsigned val; + + while (str >> val) { + if (val != 0 && val != 1) { + throw runtime_error("Only zeros and ones are allowed"); + } + v.push_back(val); + } + } catch (const std::exception &e) { + LOG(LIB_ERROR) << "Exception: " << e.what(); + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + #else + if (!ReadVecDataEfficiently(line, v)) { + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + for (auto val : v) { + if (val != 0 && val != 1) { + PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are allowed, offending line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + } + #endif + Binarize(v, 1, binVect); // Create the binary vector + binVect.push_back(v.size()); // Put the number of elements in the end + } + + DISABLE_COPY_AND_ASSIGN(SpaceBitVector); +}; + +} // namespace similarity + +#endif diff --git a/similarity_search/src/method/perm_bin_vptree.cc b/similarity_search/src/method/perm_bin_vptree.cc index 0018f6d..d017085 100644 --- a/similarity_search/src/method/perm_bin_vptree.cc +++ b/similarity_search/src/method/perm_bin_vptree.cc @@ -38,7 +38,7 @@ PermBinVPTree::PermBinVPTree( const ObjectVector& data) : Index(data), space_(space), PrintProgress_(PrintProgress), - VPTreeSpace_(new SpaceBitHamming()) + VPTreeSpace_(new SpaceBitHamming()) {} template diff --git a/similarity_search/src/method/perm_index_incr_bin.cc b/similarity_search/src/method/perm_index_incr_bin.cc index f96c727..b565364 100644 --- a/similarity_search/src/method/perm_index_incr_bin.cc +++ b/similarity_search/src/method/perm_index_incr_bin.cc @@ -23,6 +23,7 @@ #include "incremental_quick_select.h" #include "method/perm_index_incr_bin.h" #include "utils.h" +#include 
"distcomp.h" namespace similarity { diff --git a/similarity_search/src/space/space_bit_hamming.cc b/similarity_search/src/space/space_bit_hamming.cc deleted file mode 100644 index f4dbc57..0000000 --- a/similarity_search/src/space/space_bit_hamming.cc +++ /dev/null @@ -1,185 +0,0 @@ -/** - * Non-metric Space Library - * - * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak - * - * For the complete list of contributors and further details see: - * https://github.com/searchivarius/NonMetricSpaceLib - * - * Copyright (c) 2013-2018 - * - * This code is released under the - * Apache License Version 2.0 http://www.apache.org/licenses/. - * - */ -#include -#include -#include -#include -#include - -#include "space/space_bit_hamming.h" -#include "permutation_utils.h" -#include "logging.h" -#include "distcomp.h" -#include "read_data.h" -#include "experimentconf.h" - -namespace similarity { - -using namespace std; - -int SpaceBitHamming::HiddenDistance(const Object* obj1, const Object* obj2) const { - CHECK(obj1->datalength() > 0); - CHECK(obj1->datalength() == obj2->datalength()); - const uint32_t* x = reinterpret_cast(obj1->data()); - const uint32_t* y = reinterpret_cast(obj2->data()); - const size_t length = obj1->datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - - return BitHamming(x, y, length); -} - -void SpaceBitHamming::ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const -{ - binVect.clear(); - - label = Object::extractLabel(line); - - std::stringstream str(line); - - str.exceptions(std::ios::badbit); - - - ReplaceSomePunct(line); - - vector v; - -#if 0 - try { - unsigned val; - - while (str >> val) { - if (val != 0 && val != 1) { - throw runtime_error("Only zeros and ones are allowed"); - } - v.push_back(val); - } - } catch (const std::exception &e) { - LOG(LIB_ERROR) << "Exception: " << e.what(); - PREPARE_RUNTIME_ERR(err) << "Failed to parse the 
line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } -#else - if (!ReadVecDataEfficiently(line, v)) { - PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } - for (auto val : v) { - if (val != 0 && val != 1) { - PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are allowed, offending line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } - } -#endif - Binarize(v, 1, binVect); // Create the binary vector - binVect.push_back(v.size()); // Put the number of elements in the end -} - -Object* SpaceBitHamming::CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { - return new Object(id, label, bitMaskVect.size() * sizeof(uint32_t), &bitMaskVect[0]); -}; - -/** Standard functions to read/write/create objects */ - -unique_ptr SpaceBitHamming::OpenReadFileHeader(const string& inpFileName) const { - return unique_ptr(new DataFileInputStateVec(inpFileName)); -} - -unique_ptr SpaceBitHamming::OpenWriteFileHeader(const ObjectVector& dataset, - const string& outFileName) const { - return unique_ptr(new DataFileOutputState(outFileName)); -} - -unique_ptr -SpaceBitHamming::CreateObjFromStr(IdType id, LabelType label, const string& s, - DataFileInputState* pInpStateBase) const { - DataFileInputStateVec* pInpState = NULL; - if (pInpStateBase != NULL) { - pInpState = dynamic_cast(pInpStateBase); - if (NULL == pInpState) { - PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; - THROW_RUNTIME_ERR(err); - } - } - vector vec; - ReadBitMaskVect(s, label, vec); - if (pInpState != NULL) { - size_t elemQty = vec[vec.size() - 1]; - if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; - else if (elemQty != pInpState->dim_) { - PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << - " doesn't match the # of elements in previous lines. 
(" << pInpState->dim_ << " )"; - THROW_RUNTIME_ERR(err); - } - } - return unique_ptr(CreateObjFromVectInternal(id, label, vec)); -} - -Object* SpaceBitHamming::CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { - return new Object(id, label, InpVect.size() * sizeof(uint32_t), &InpVect[0]); -}; - -bool SpaceBitHamming::ApproxEqual(const Object& obj1, const Object& obj2) const { - const uint32_t* p1 = reinterpret_cast(obj1.data()); - const uint32_t* p2 = reinterpret_cast(obj2.data()); - const size_t len1 = obj1.datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - const size_t len2 = obj2.datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - if (len1 != len2) { - PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; - THROW_RUNTIME_ERR(err); - } - for (size_t i = 0; i < len1; ++i) { - uint32_t v1 = ((p1[i/32] >> (i & 31)) & 1); - uint32_t v2 = ((p2[i/32] >> (i & 31)) & 1); - if (v1 != v2) return false; - } - - return true; -} - - -string SpaceBitHamming::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { - stringstream out; - const uint32_t* p = reinterpret_cast(pObj->data()); - const size_t length = pObj->datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - const size_t elemQty = p[length]; // last elem - - for (size_t i = 0; i < elemQty; ++i) { - if (i) out << " "; - out << ((p[i/32] >> (i & 31)) & 1); - } - - return out.str(); -} - -bool SpaceBitHamming::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { - externId.clear(); - DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); - CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); - if (!pInpState->inp_file_) return false; - if (!getline(pInpState->inp_file_, strObj)) return false; - 
pInpState->line_num_++; - return true; -} - - -/** End of standard functions to read/write/create objects */ - -} // namespace similarity diff --git a/similarity_search/test/test_space_serial.cc b/similarity_search/test/test_space_serial.cc index dddc978..00ec2f9 100644 --- a/similarity_search/test/test_space_serial.cc +++ b/similarity_search/test/test_space_serial.cc @@ -134,7 +134,6 @@ bool fullTest(const vector& dataSetStr, size_t maxNumRec, const string& dataSet1.push_back(space->CreateObjFromStr(id++, -1, s, NULL).release()); vExternIds1.push_back(ss.str()); - if (id >= maxNumRec) break; } @@ -215,9 +214,26 @@ TEST(Test_BitHamming) { } } +TEST(Test_BitJaccard) { + vector testVect; + + for (size_t i = 0; i < MAX_NUM_REC; ++i) { + stringstream ss; + + for (size_t k = 0; k < 128; ++k) { + if (k) ss << " "; + ss << (RandomInt() % 2); + } + testVect.push_back(ss.str()); + } + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest(testVect, maxNumRec, "tmp_out_file.txt", "bit_jaccard", emptyParams, false)); + } +} + #if defined(WITH_EXTRAS) TEST(Test_SQFD) { - const char* sqfdParams[] = {"alpha=1", NULL} ; + const char* sqfdParams[] = {"alpha=1", NULL} ; for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false));