From e984532fe4d78ea7d714fd64dc145a6891ddc6b3 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Sun, 17 Feb 2019 00:06:31 -0800 Subject: [PATCH 01/17] added bit_jaccard and ABC bit_vector --- .idea/codeStyles/Project.xml | 10 + .idea/codeStyles/codeStyleConfig.xml | 5 + python_bindings/setup.py | 16 +- python_bindings/tests/bindings_test.py | 88 +++++++ similarity_search/include/distcomp.h | 13 + .../include/factory/init_spaces.h | 9 +- .../include/factory/space/space_bit_hamming.h | 5 +- .../include/factory/space/space_bit_jaccard.h | 39 +++ .../include/method/perm_bin_vptree.h | 2 +- similarity_search/include/permutation_utils.h | 16 +- .../include/space/space_bit_hamming.h | 52 ++-- .../include/space/space_bit_jaccard.h | 59 +++++ .../include/space/space_bit_vector.h | 209 ++++++++++++++++ .../src/method/perm_bin_vptree.cc | 2 +- .../src/space/space_bit_hamming.cc | 229 ++++-------------- .../src/space/space_bit_jaccard.cc | 44 ++++ .../src/space/space_bit_vector.cc | 195 +++++++++++++++ similarity_search/test/test_space_serial.cc | 17 ++ 18 files changed, 771 insertions(+), 239 deletions(-) create mode 100644 .idea/codeStyles/Project.xml create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 similarity_search/include/factory/space/space_bit_jaccard.h create mode 100644 similarity_search/include/space/space_bit_jaccard.h create mode 100644 similarity_search/include/space/space_bit_vector.h create mode 100644 similarity_search/src/space/space_bit_jaccard.cc create mode 100644 similarity_search/src/space/space_bit_vector.cc diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml new file mode 100644 index 0000000..664f8f1 --- /dev/null +++ b/.idea/codeStyles/Project.xml @@ -0,0 +1,10 @@ + + + + + \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..79ee123 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ 
+ + + + \ No newline at end of file diff --git a/python_bindings/setup.py b/python_bindings/setup.py index 1a16797..0808994 100755 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -21,15 +21,15 @@ if os.path.exists(library_file): # if we have a prebuilt nmslib library file, use that. extra_objects.append(library_file) - else: - # Otherwise build all the files here directly (excluding extras which need eigen/boost) - exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc - dummy_app.cc main.cc""".split()) - - for root, subdirs, files in os.walk(os.path.join(libdir, "src")): - source_files.extend(os.path.join(root, f) for f in files - if f.endswith(".cc") and f not in exclude_files) + raise RuntimeError("can't find prebuild lib: " + os.path.abspath(library_file)) + # # Otherwise build all the files here directly (excluding extras which need eigen/boost) + # exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc + # dummy_app.cc main.cc""".split()) + # + # for root, subdirs, files in os.walk(os.path.join(libdir, "src")): + # source_files.extend(os.path.join(root, f) for f in files + # if f.endswith(".cc") and f not in exclude_files) if sys.platform.startswith('linux'): diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index 56add76..47c0989 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -87,11 +87,99 @@ def testReloadIndex(self): reloaded_results) +class BitVectorIndexTestMixin(object): + def _get_index(self, space='bit_jaccard'): + raise NotImplementedError() + + def testKnnQuery(self): + np.random.seed(23) + nbits = 128 + + index = self._get_index() + + for i in range(100): + a = np.random.rand(nbits) > 0.5 + s = " ".join(["1" if e else "0" for e in a]) + index.addDataPoint(id=i, data=s) + index.createIndex() + + a = np.ones(nbits) + s = " ".join(["1" if e 
else "0" for e in a]) + ids, distances = index.knnQuery(s, k=10) + print(ids) + print(distances) + # self.assertTrue(get_hitrate(get_exact_cosine(row, data), ids) >= 5) + + # def testKnnQueryBatch(self): + # np.random.seed(23) + # data = np.random.randn(1000, 10).astype(np.float32) + # + # index = self._get_index() + # index.addDataPointBatch(data) + # index.createIndex() + # + # queries = data[:10] + # results = index.knnQueryBatch(queries, k=10) + # for query, (ids, distances) in zip(queries, results): + # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) + # + # # test col-major arrays + # queries = np.asfortranarray(queries) + # results = index.knnQueryBatch(queries, k=10) + # for query, (ids, distances) in zip(queries, results): + # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) + # + # # test custom ids (set id to square of each row) + # index = self._get_index() + # index.addDataPointBatch(data, ids=np.arange(data.shape[0]) ** 2) + # index.createIndex() + # + # queries = data[:10] + # results = index.knnQueryBatch(queries, k=10) + # for query, (ids, distances) in zip(queries, results): + # # convert from square back to row id + # ids = np.sqrt(ids).astype(int) + # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) + + # def testReloadIndex(self): + # np.random.seed(23) + # data = np.random.randn(1000, 10).astype(np.float32) + # + # original = self._get_index() + # original.addDataPointBatch(data) + # original.createIndex() + # + # # test out saving/reloading index + # with tempfile.NamedTemporaryFile() as tmp: + # original.saveIndex(tmp.name + ".index") + # + # reloaded = self._get_index() + # reloaded.addDataPointBatch(data) + # reloaded.loadIndex(tmp.name + ".index") + # + # original_results = original.knnQuery(data[0]) + # reloaded_results = reloaded.knnQuery(data[0]) + # npt.assert_allclose(original_results, + # reloaded_results) + + class HNSWTestCase(unittest.TestCase, DenseIndexTestMixin): 
def _get_index(self, space='cosinesimil'): return nmslib.init(method='hnsw', space=space) +class BitJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin): + def _get_index(self, space='bit_jaccard'): + return nmslib.init(method='hnsw', space='bit_jaccard', data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.INT) + + +# class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin): +# def _get_index(self, space='bit_hamming'): +# return nmslib.init(method='hnsw', space='bit_hamming', data_type=nmslib.DataType.OBJECT_AS_STRING, +# dtype=nmslib.DistType.INT) + + class SWGraphTestCase(unittest.TestCase, DenseIndexTestMixin): def _get_index(self, space='cosinesimil'): return nmslib.init(method='sw-graph', space=space) diff --git a/similarity_search/include/distcomp.h b/similarity_search/include/distcomp.h index 7863837..729d70d 100644 --- a/similarity_search/include/distcomp.h +++ b/similarity_search/include/distcomp.h @@ -223,6 +223,19 @@ int SpearmanRho(const PivotIdType* x, const PivotIdType* y, size_t qty); int SpearmanFootruleSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty); int SpearmanRhoSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty); +//template +double inline BitJaccard(const uint64_t* a, const uint64_t* b, size_t qty) { + uint64_t num = 0, den = 0; + + for (size_t i=0; i < qty; ++i) { + // __builtin_popcountll counts the 1-bits of a full 64-bit word (plain __builtin_popcount takes unsigned int and would drop the upper 32 bits) + num += __builtin_popcountll(a[i] & b[i]); + den += __builtin_popcountll(a[i] | b[i]); + } + + return double(num) / double(den); +} + //unsigned BitHamming(const uint32_t* a, const uint32_t* b, size_t qty); unsigned inline BitHamming(const uint32_t* a, const uint32_t* b, size_t qty) { diff --git a/similarity_search/include/factory/init_spaces.h b/similarity_search/include/factory/init_spaces.h index 0984e08..dd0aae7 100644 --- a/similarity_search/include/factory/init_spaces.h +++ b/similarity_search/include/factory/init_spaces.h @@ -19,6 +19,7 @@ #include 
"factory/space/space_edist.h" #include "factory/space/space_bit_hamming.h" +#include "factory/space/space_bit_jaccard.h" #include "factory/space/space_bregman.h" #include "factory/space/space_dummy.h" #include "factory/space/space_js.h" @@ -36,15 +37,17 @@ namespace similarity { - inline void initSpaces() { // Registering a dummy space REGISTER_SPACE_CREATOR(int, SPACE_DUMMY, CreateDummy) REGISTER_SPACE_CREATOR(float, SPACE_DUMMY, CreateDummy) REGISTER_SPACE_CREATOR(double, SPACE_DUMMY, CreateDummy) - // Registering binary/bit Hamming - REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, CreateBitHamming) + // Registering binary/bit Hamming/Jaccard + SpaceFactoryRegistry::CreateFuncPtr bit_hamming_func_ptr = CreateBitHamming; + REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, bit_hamming_func_ptr ) + SpaceFactoryRegistry::CreateFuncPtr bit_jaccard_func_ptr = CreateBitJaccard; + REGISTER_SPACE_CREATOR(double, SPACE_BIT_JACCARD, bit_jaccard_func_ptr ) // Registering the Levensthein-distance: regular and normalized REGISTER_SPACE_CREATOR(int, SPACE_LEVENSHTEIN, CreateLevenshtein) diff --git a/similarity_search/include/factory/space/space_bit_hamming.h b/similarity_search/include/factory/space/space_bit_hamming.h index 53bcc34..d191594 100644 --- a/similarity_search/include/factory/space/space_bit_hamming.h +++ b/similarity_search/include/factory/space/space_bit_hamming.h @@ -23,8 +23,9 @@ namespace similarity { * Creating functions. 
*/ -inline Space* CreateBitHamming(const AnyParams& /* ignoring params */) { - return new SpaceBitHamming(); +template +inline Space* CreateBitHamming(const AnyParams& /* ignoring params */) { + return new SpaceBitHamming(); } /* diff --git a/similarity_search/include/factory/space/space_bit_jaccard.h b/similarity_search/include/factory/space/space_bit_jaccard.h new file mode 100644 index 0000000..48f81dd --- /dev/null +++ b/similarity_search/include/factory/space/space_bit_jaccard.h @@ -0,0 +1,39 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +#ifndef FACTORY_SPACE_BIT_JACCARD_H +#define FACTORY_SPACE_BIT_JACCARD_H + +#include + +namespace similarity { + +/* + * Creating functions. + */ + +template +inline Space* CreateBitJaccard(const AnyParams& /* ignoring params */) { + return new SpaceBitJaccard(); +} + +/* + * End of creating functions. 
+ */ +} + +#endif + + + diff --git a/similarity_search/include/method/perm_bin_vptree.h b/similarity_search/include/method/perm_bin_vptree.h index a58c492..6202c96 100644 --- a/similarity_search/include/method/perm_bin_vptree.h +++ b/similarity_search/include/method/perm_bin_vptree.h @@ -66,7 +66,7 @@ class PermBinVPTree : public Index { ObjectVector BinPermData_; unique_ptr>> VPTreeIndex_; - unique_ptr VPTreeSpace_; + unique_ptr> VPTreeSpace_; // disable copy and assign DISABLE_COPY_AND_ASSIGN(PermBinVPTree); diff --git a/similarity_search/include/permutation_utils.h b/similarity_search/include/permutation_utils.h index 00a141d..dd09e84 100644 --- a/similarity_search/include/permutation_utils.h +++ b/similarity_search/include/permutation_utils.h @@ -23,7 +23,6 @@ #include "rangequery.h" #include "knnquery.h" #include "permutation_type.h" -#include "distcomp.h" #include "utils.h" namespace similarity { @@ -163,6 +162,21 @@ inline void Binarize(const vector &perm, const PivotIdType thresh, } } +inline void Binarize(const vector &perm, const PivotIdType thresh, vector&bin_perm) { + size_t bin_perm_word_qty = (perm.size() + 63)/64; + + bin_perm.resize(bin_perm_word_qty); + fill(bin_perm.begin(), bin_perm.end(), 0); + + for (size_t i = 0; i < perm.size(); ++i) { + bool b =perm[i] >= thresh; + + if (b) { + bin_perm[i/64] |= (1ULL<<(i%64)) ; + } + } +} + } // namespace similarity #endif // _PERMUTATION_UTILS_H_ diff --git a/similarity_search/include/space/space_bit_hamming.h b/similarity_search/include/space/space_bit_hamming.h index d7524ae..a641aed 100644 --- a/similarity_search/include/space/space_bit_hamming.h +++ b/similarity_search/include/space/space_bit_hamming.h @@ -25,55 +25,31 @@ #include "utils.h" #include "space.h" #include "distcomp.h" +#include "space_bit_vector.h" #define SPACE_BIT_HAMMING "bit_hamming" namespace similarity { -class SpaceBitHamming : public Space { +template +class SpaceBitHamming : public SpaceBitVector { public: explicit SpaceBitHamming() 
{} virtual ~SpaceBitHamming() {} - /** Standard functions to read/write/create objects */ - // Create an object from string representation. - virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s, - DataFileInputState* pInpState) const; - // Create a string representation of an object. - virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const; - // Open a file for reading, fetch a header (if there is any) and memorize an input state - virtual unique_ptr OpenReadFileHeader(const string& inputFile) const; - // Open a file for writing, write a header (if there is any) and memorize an output state - virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset, - const string& outputFile) const; - /* - * Read a string representation of the next object in a file as well - * as its label. Return false, on EOF. - */ - virtual bool ReadNextObjStr(DataFileInputState &, string& strObj, LabelType& label, string& externId) const; - /** End of standard functions to read/write/create objects */ - - /* - * Used only for testing/debugging: compares objects approximately. Floating point numbers - * should be nearly equal. Integers and strings should coincide exactly. 
- */ - virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const; - virtual std::string StrDesc() const { return "Hamming (bit-storage) space"; } - virtual void CreateDenseVectFromObj(const Object* obj, int* pVect, - size_t nElem) const { - throw runtime_error("Cannot create a dense vector for the space: " + StrDesc()); - } - virtual size_t GetElemQty(const Object* object) const {return 0;} - virtual Object* CreateObjFromVect(IdType id, LabelType label, std::vector& InpVect) const { - InpVect.push_back(InpVect.size()); - return CreateObjFromVectInternal(id, label, InpVect); - } + protected: - virtual Object* CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const; - Object* CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const; - virtual int HiddenDistance(const Object* obj1, const Object* obj2) const; - void ReadBitMaskVect(std::string line, LabelType& label, std::vector& v) const; + virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const { + CHECK(obj1->datalength() > 0); + CHECK(obj1->datalength() == obj2->datalength()); + const dist_uint_t* x = reinterpret_cast(obj1->data()); + const dist_uint_t* y = reinterpret_cast(obj2->data()); + const size_t length = obj1->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + + return BitHamming(x, y, length); + } DISABLE_COPY_AND_ASSIGN(SpaceBitHamming); }; diff --git a/similarity_search/include/space/space_bit_jaccard.h b/similarity_search/include/space/space_bit_jaccard.h new file mode 100644 index 0000000..8d7803d --- /dev/null +++ b/similarity_search/include/space/space_bit_jaccard.h @@ -0,0 +1,59 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * 
Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +#ifndef _SPACE_BIT_JACCARD_H_ +#define _SPACE_BIT_JACCARD_H_ + +#include +#include +#include + +#include +#include "global.h" +#include "object.h" +#include "utils.h" +#include "space.h" +#include "distcomp.h" +#include "space_bit_vector.h" + +#define SPACE_BIT_JACCARD "bit_jaccard" + +namespace similarity { + +template +class SpaceBitJaccard : public SpaceBitVector { + public: + explicit SpaceBitJaccard() {} + virtual ~SpaceBitJaccard() {} + + virtual std::string StrDesc() const { return "Jaccard (bit-storage) space"; } + + protected: + virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const { + CHECK(obj1->datalength() > 0); + CHECK(obj1->datalength() == obj2->datalength()); + const dist_uint_t* x = reinterpret_cast(obj1->data()); + const dist_uint_t* y = reinterpret_cast(obj2->data()); + const size_t length = obj1->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + + return BitJaccard(x, y, length); + } + + DISABLE_COPY_AND_ASSIGN(SpaceBitJaccard); +}; + +} // namespace similarity + +#endif diff --git a/similarity_search/include/space/space_bit_vector.h b/similarity_search/include/space/space_bit_vector.h new file mode 100644 index 0000000..a2dd172 --- /dev/null +++ b/similarity_search/include/space/space_bit_vector.h @@ -0,0 +1,209 @@ +/** + * Non-metric Space Library + * + * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2013-2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + */ +#ifndef _SPACE_BIT_VECTOR_H_ +#define _SPACE_BIT_VECTOR_H_ + +#include +#include +#include + +#include +#include "global.h" +#include "object.h" +#include "utils.h" +#include "space.h" +#include "distcomp.h" +#include "read_data.h" +#include "permutation_utils.h" +#include "logging.h" +#include "experimentconf.h" + + +// Defines an abstract base class for BitVector spaces + +namespace similarity { + +template +class SpaceBitVector : public Space { + public: + explicit SpaceBitVector() {} + virtual ~SpaceBitVector() {} + + /** Standard functions to read/write/create objects */ + // Create an object from string representation. + virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s, + DataFileInputState* pInpStateBase) const { + DataFileInputStateVec* pInpState = NULL; + if (pInpStateBase != NULL) { + pInpState = dynamic_cast(pInpStateBase); + if (NULL == pInpState) { + PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; + THROW_RUNTIME_ERR(err); + } + } + vector vec; + ReadBitMaskVect(s, label, vec); + if (pInpState != NULL) { + size_t elemQty = vec[vec.size() - 1]; + if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; + else if (elemQty != pInpState->dim_) { + PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << + " doesn't match the # of elements in previous lines. (" << pInpState->dim_ << " )"; + THROW_RUNTIME_ERR(err); + } + } + return unique_ptr(CreateObjFromVectInternal(id, label, vec)); + } + + // Create a string representation of an object. 
+ virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { + stringstream out; + const dist_uint_t* p = reinterpret_cast(pObj->data()); + const size_t length = pObj->datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + const size_t elemQty = p[length]; // last elem + + for (size_t i = 0; i < elemQty; ++i) { + if (i) out << " "; + out << ((p[i/(8*sizeof(dist_uint_t))] >> (i % (8*sizeof(dist_uint_t)))) & 1); + } + + return out.str(); + } + + // Open a file for reading, fetch a header (if there is any) and memorize an input state + virtual unique_ptr OpenReadFileHeader(const string& inpFileName) const { + return unique_ptr(new DataFileInputStateVec(inpFileName)); + } + + // Open a file for writing, write a header (if there is any) and memorize an output state + virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset, + const string& outFileName) const { + return unique_ptr(new DataFileOutputState(outFileName)); + } + + /* + * Read a string representation of the next object in a file as well + * as its label. Return false, on EOF. + */ + virtual bool ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { + externId.clear(); + DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); + CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); + if (!pInpState->inp_file_) return false; + if (!getline(pInpState->inp_file_, strObj)) return false; + pInpState->line_num_++; + return true; + } + /** End of standard functions to read/write/create objects */ + + /* + * Used only for testing/debugging: compares objects approximately. Floating point numbers + * should be nearly equal. Integers and strings should coincide exactly. 
+ */ + virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const { + const dist_uint_t* p1 = reinterpret_cast(obj1.data()); + const dist_uint_t* p2 = reinterpret_cast(obj2.data()); + const size_t len1 = obj1.datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + const size_t len2 = obj2.datalength() / sizeof(dist_uint_t) + - 1; // the last integer is an original number of elements + if (len1 != len2) { + PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; + THROW_RUNTIME_ERR(err); + } + for (size_t i = 0; i < len1; ++i) { + dist_uint_t v1 = p1[i]; // len1 counts packed words, so compare whole words + dist_uint_t v2 = p2[i]; // (independent of the word width) + if (v1 != v2) return false; + } + + return true; + } + + virtual std::string StrDesc() const { return "Vector (bit-storage) space"; } + virtual void CreateDenseVectFromObj(const Object* obj, dist_t* pVect, + size_t nElem) const { + throw runtime_error("Cannot create a dense vector for the space: " + StrDesc()); + } + virtual size_t GetElemQty(const Object* object) const {return 0;} + virtual Object* CreateObjFromVect(IdType id, LabelType label, std::vector& InpVect) const { + InpVect.push_back(InpVect.size()); + return CreateObjFromVectInternal(id, label, InpVect); + } + protected: + virtual Object* CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { + return new Object(id, label, InpVect.size() * sizeof(dist_uint_t), &InpVect[0]); + } + + Object* CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { + return new Object(id, label, bitMaskVect.size() * sizeof(dist_uint_t), &bitMaskVect[0]); + } + +// virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const; + void ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const { + binVect.clear(); + + label = Object::extractLabel(line); + + std::stringstream str(line); + + 
str.exceptions(std::ios::badbit); + + + ReplaceSomePunct(line); + + vector v; + + #if 0 + try { + unsigned val; + + while (str >> val) { + if (val != 0 && val != 1) { + throw runtime_error("Only zeros and ones are allowed"); + } + v.push_back(val); + } + } catch (const std::exception &e) { + LOG(LIB_ERROR) << "Exception: " << e.what(); + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + #else + if (!ReadVecDataEfficiently(line, v)) { + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + for (auto val : v) { + if (val != 0 && val != 1) { + PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are allowed, offending line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } + } + #endif + Binarize(v, 1, binVect); // Create the binary vector + binVect.push_back(v.size()); // Put the number of elements in the end + } + + DISABLE_COPY_AND_ASSIGN(SpaceBitVector); +}; + +} // namespace similarity + +#endif diff --git a/similarity_search/src/method/perm_bin_vptree.cc b/similarity_search/src/method/perm_bin_vptree.cc index 9eca6a0..40fff85 100644 --- a/similarity_search/src/method/perm_bin_vptree.cc +++ b/similarity_search/src/method/perm_bin_vptree.cc @@ -38,7 +38,7 @@ PermBinVPTree::PermBinVPTree( const ObjectVector& data) : Index(data), space_(space), PrintProgress_(PrintProgress), - VPTreeSpace_(new SpaceBitHamming()) + VPTreeSpace_(new SpaceBitHamming()) {} template diff --git a/similarity_search/src/space/space_bit_hamming.cc b/similarity_search/src/space/space_bit_hamming.cc index f4dbc57..72c2d18 100644 --- a/similarity_search/src/space/space_bit_hamming.cc +++ b/similarity_search/src/space/space_bit_hamming.cc @@ -1,185 +1,44 @@ -/** - * Non-metric Space Library - * - * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben 
Frederickson, David Novak - * - * For the complete list of contributors and further details see: - * https://github.com/searchivarius/NonMetricSpaceLib - * - * Copyright (c) 2013-2018 - * - * This code is released under the - * Apache License Version 2.0 http://www.apache.org/licenses/. - * - */ -#include -#include -#include -#include -#include - -#include "space/space_bit_hamming.h" -#include "permutation_utils.h" -#include "logging.h" -#include "distcomp.h" -#include "read_data.h" -#include "experimentconf.h" - -namespace similarity { - -using namespace std; - -int SpaceBitHamming::HiddenDistance(const Object* obj1, const Object* obj2) const { - CHECK(obj1->datalength() > 0); - CHECK(obj1->datalength() == obj2->datalength()); - const uint32_t* x = reinterpret_cast(obj1->data()); - const uint32_t* y = reinterpret_cast(obj2->data()); - const size_t length = obj1->datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - - return BitHamming(x, y, length); -} - -void SpaceBitHamming::ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const -{ - binVect.clear(); - - label = Object::extractLabel(line); - - std::stringstream str(line); - - str.exceptions(std::ios::badbit); - - - ReplaceSomePunct(line); - - vector v; - -#if 0 - try { - unsigned val; - - while (str >> val) { - if (val != 0 && val != 1) { - throw runtime_error("Only zeros and ones are allowed"); - } - v.push_back(val); - } - } catch (const std::exception &e) { - LOG(LIB_ERROR) << "Exception: " << e.what(); - PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } -#else - if (!ReadVecDataEfficiently(line, v)) { - PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } - for (auto val : v) { - if (val != 0 && val != 1) { - PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are 
allowed, offending line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } - } -#endif - Binarize(v, 1, binVect); // Create the binary vector - binVect.push_back(v.size()); // Put the number of elements in the end -} - -Object* SpaceBitHamming::CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { - return new Object(id, label, bitMaskVect.size() * sizeof(uint32_t), &bitMaskVect[0]); -}; - -/** Standard functions to read/write/create objects */ - -unique_ptr SpaceBitHamming::OpenReadFileHeader(const string& inpFileName) const { - return unique_ptr(new DataFileInputStateVec(inpFileName)); -} - -unique_ptr SpaceBitHamming::OpenWriteFileHeader(const ObjectVector& dataset, - const string& outFileName) const { - return unique_ptr(new DataFileOutputState(outFileName)); -} - -unique_ptr -SpaceBitHamming::CreateObjFromStr(IdType id, LabelType label, const string& s, - DataFileInputState* pInpStateBase) const { - DataFileInputStateVec* pInpState = NULL; - if (pInpStateBase != NULL) { - pInpState = dynamic_cast(pInpStateBase); - if (NULL == pInpState) { - PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; - THROW_RUNTIME_ERR(err); - } - } - vector vec; - ReadBitMaskVect(s, label, vec); - if (pInpState != NULL) { - size_t elemQty = vec[vec.size() - 1]; - if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; - else if (elemQty != pInpState->dim_) { - PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << - " doesn't match the # of elements in previous lines. 
(" << pInpState->dim_ << " )"; - THROW_RUNTIME_ERR(err); - } - } - return unique_ptr(CreateObjFromVectInternal(id, label, vec)); -} - -Object* SpaceBitHamming::CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { - return new Object(id, label, InpVect.size() * sizeof(uint32_t), &InpVect[0]); -}; - -bool SpaceBitHamming::ApproxEqual(const Object& obj1, const Object& obj2) const { - const uint32_t* p1 = reinterpret_cast(obj1.data()); - const uint32_t* p2 = reinterpret_cast(obj2.data()); - const size_t len1 = obj1.datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - const size_t len2 = obj2.datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - if (len1 != len2) { - PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; - THROW_RUNTIME_ERR(err); - } - for (size_t i = 0; i < len1; ++i) { - uint32_t v1 = ((p1[i/32] >> (i & 31)) & 1); - uint32_t v2 = ((p2[i/32] >> (i & 31)) & 1); - if (v1 != v2) return false; - } - - return true; -} - - -string SpaceBitHamming::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { - stringstream out; - const uint32_t* p = reinterpret_cast(pObj->data()); - const size_t length = pObj->datalength() / sizeof(uint32_t) - - 1; // the last integer is an original number of elements - const size_t elemQty = p[length]; // last elem - - for (size_t i = 0; i < elemQty; ++i) { - if (i) out << " "; - out << ((p[i/32] >> (i & 31)) & 1); - } - - return out.str(); -} - -bool SpaceBitHamming::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { - externId.clear(); - DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); - CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); - if (!pInpState->inp_file_) return false; - if (!getline(pInpState->inp_file_, strObj)) return false; - 
pInpState->line_num_++; - return true; -} - - -/** End of standard functions to read/write/create objects */ - -} // namespace similarity +///** +// * Non-metric Space Library +// * +// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak +// * +// * For the complete list of contributors and further details see: +// * https://github.com/searchivarius/NonMetricSpaceLib +// * +// * Copyright (c) 2013-2018 +// * +// * This code is released under the +// * Apache License Version 2.0 http://www.apache.org/licenses/. +// * +// */ +//#include +//#include +//#include +//#include +//#include +// +//#include "space/space_bit_hamming.h" +//#include "permutation_utils.h" +//#include "logging.h" +//#include "distcomp.h" +//#include "read_data.h" +//#include "experimentconf.h" +// +//namespace similarity { +// +//using namespace std; +// +//template +//dist_t SpaceBitHamming::HiddenDistance(const Object* obj1, const Object* obj2) const { +// CHECK(obj1->datalength() > 0); +// CHECK(obj1->datalength() == obj2->datalength()); +// const dist_uint_t* x = reinterpret_cast(obj1->data()); +// const dist_uint_t* y = reinterpret_cast(obj2->data()); +// const size_t length = obj1->datalength() / sizeof(dist_uint_t) +// - 1; // the last integer is an original number of elements +// +// return BitHamming(x, y, length); +//} +// +//} diff --git a/similarity_search/src/space/space_bit_jaccard.cc b/similarity_search/src/space/space_bit_jaccard.cc new file mode 100644 index 0000000..aa0ecad --- /dev/null +++ b/similarity_search/src/space/space_bit_jaccard.cc @@ -0,0 +1,44 @@ +///** +// * Non-metric Space Library +// * +// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak +// * +// * For the complete list of contributors and further details see: +// * https://github.com/searchivarius/NonMetricSpaceLib +// * +// * Copyright (c) 2013-2018 +// * +// * This code is released under the +// * Apache License 
Version 2.0 http://www.apache.org/licenses/. +// * +// */ +//#include +//#include +//#include +//#include +//#include +// +//#include "space/space_bit_jaccard.h" +//#include "permutation_utils.h" +//#include "logging.h" +//#include "distcomp.h" +//#include "read_data.h" +//#include "experimentconf.h" +// +////namespace similarity { +//// +//////using namespace std; +//// +//////template +//////dist_t SpaceBitJaccard::HiddenDistance(const Object* obj1, const Object* obj2) const { +////// CHECK(obj1->datalength() > 0); +////// CHECK(obj1->datalength() == obj2->datalength()); +////// const dist_uint_t* x = reinterpret_cast(obj1->data()); +////// const dist_uint_t* y = reinterpret_cast(obj2->data()); +////// const size_t length = obj1->datalength() / sizeof(dist_uint_t) +////// - 1; // the last integer is an original number of elements +////// +////// return BitJaccard(x, y, length); +//////} +//// +////} diff --git a/similarity_search/src/space/space_bit_vector.cc b/similarity_search/src/space/space_bit_vector.cc new file mode 100644 index 0000000..043319d --- /dev/null +++ b/similarity_search/src/space/space_bit_vector.cc @@ -0,0 +1,195 @@ +///** +// * Non-metric Space Library +// * +// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak +// * +// * For the complete list of contributors and further details see: +// * https://github.com/searchivarius/NonMetricSpaceLib +// * +// * Copyright (c) 2013-2018 +// * +// * This code is released under the +// * Apache License Version 2.0 http://www.apache.org/licenses/. 
+// * +// */ +//#include +//#include +//#include +//#include +//#include +// +//#include "space/space_bit_vector.h" +//#include "permutation_utils.h" +//#include "logging.h" +//#include "distcomp.h" +//#include "read_data.h" +//#include "experimentconf.h" +// +//namespace similarity { +// +//using namespace std; +// +////template +////dist_t SpaceBitVector::HiddenDistance(const Object* obj1, const Object* obj2) const { +//// CHECK(obj1->datalength() > 0); +//// CHECK(obj1->datalength() == obj2->datalength()); +//// const dist_uint_t* x = reinterpret_cast(obj1->data()); +//// const dist_uint_t* y = reinterpret_cast(obj2->data()); +//// const size_t length = obj1->datalength() / sizeof(dist_uint_t) +//// - 1; // the last integer is an original number of elements +//// +//// return BitVector(x, y, length); +////} +// +////template +////void SpaceBitVector::ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const +////{ +//// binVect.clear(); +//// +//// label = Object::extractLabel(line); +//// +//// std::stringstream str(line); +//// +//// str.exceptions(std::ios::badbit); +//// +//// +//// ReplaceSomePunct(line); +//// +//// vector v; +//// +////#if 0 +//// try { +//// unsigned val; +//// +//// while (str >> val) { +//// if (val != 0 && val != 1) { +//// throw runtime_error("Only zeros and ones are allowed"); +//// } +//// v.push_back(val); +//// } +//// } catch (const std::exception &e) { +//// LOG(LIB_ERROR) << "Exception: " << e.what(); +//// PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; +//// LOG(LIB_ERROR) << err.stream().str(); +//// THROW_RUNTIME_ERR(err); +//// } +////#else +//// if (!ReadVecDataEfficiently(line, v)) { +//// PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; +//// LOG(LIB_ERROR) << err.stream().str(); +//// THROW_RUNTIME_ERR(err); +//// } +//// for (auto val : v) { +//// if (val != 0 && val != 1) { +//// PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are allowed, 
offending line: '" << line << "'"; +//// LOG(LIB_ERROR) << err.stream().str(); +//// THROW_RUNTIME_ERR(err); +//// } +//// } +////#endif +//// Binarize(v, 1, binVect); // Create the binary vector +//// binVect.push_back(v.size()); // Put the number of elements in the end +////} +// +////template +////Object* SpaceBitVector::CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { +//// return new Object(id, label, bitMaskVect.size() * sizeof(dist_uint_t), &bitMaskVect[0]); +////}; +// +///** Standard functions to read/write/create objects */ +// +////template +////unique_ptr SpaceBitVector::OpenReadFileHeader(const string& inpFileName) const { +//// return unique_ptr(new DataFileInputStateVec(inpFileName)); +////} +// +////template +////unique_ptr SpaceBitVector::OpenWriteFileHeader(const ObjectVector& dataset, +//// const string& outFileName) const { +//// return unique_ptr(new DataFileOutputState(outFileName)); +////} +// +////template +////unique_ptr +////SpaceBitVector::CreateObjFromStr(IdType id, LabelType label, const string& s, +//// DataFileInputState* pInpStateBase) const { +//// DataFileInputStateVec* pInpState = NULL; +//// if (pInpStateBase != NULL) { +//// pInpState = dynamic_cast(pInpStateBase); +//// if (NULL == pInpState) { +//// PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; +//// THROW_RUNTIME_ERR(err); +//// } +//// } +//// vector vec; +//// ReadBitMaskVect(s, label, vec); +//// if (pInpState != NULL) { +//// size_t elemQty = vec[vec.size() - 1]; +//// if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; +//// else if (elemQty != pInpState->dim_) { +//// PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << +//// " doesn't match the # of elements in previous lines. 
(" << pInpState->dim_ << " )"; +//// THROW_RUNTIME_ERR(err); +//// } +//// } +//// return unique_ptr(CreateObjFromVectInternal(id, label, vec)); +////} +// +////template +////Object* SpaceBitVector::CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { +//// return new Object(id, label, InpVect.size() * sizeof(dist_uint_t), &InpVect[0]); +////}; +// +////template +////bool SpaceBitVector::ApproxEqual(const Object& obj1, const Object& obj2) const { +//// const dist_uint_t* p1 = reinterpret_cast(obj1.data()); +//// const dist_uint_t* p2 = reinterpret_cast(obj2.data()); +//// const size_t len1 = obj1.datalength() / sizeof(dist_uint_t) +//// - 1; // the last integer is an original number of elements +//// const size_t len2 = obj2.datalength() / sizeof(dist_uint_t) +//// - 1; // the last integer is an original number of elements +//// if (len1 != len2) { +//// PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; +//// THROW_RUNTIME_ERR(err); +//// } +//// for (size_t i = 0; i < len1; ++i) { +//// dist_uint_t v1 = ((p1[i/32] >> (i & 31)) & 1); +//// dist_uint_t v2 = ((p2[i/32] >> (i & 31)) & 1); +//// if (v1 != v2) return false; +//// } +//// +//// return true; +////} +// +// +////template +////string SpaceBitVector::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { +//// stringstream out; +//// const dist_uint_t* p = reinterpret_cast(pObj->data()); +//// const size_t length = pObj->datalength() / sizeof(dist_uint_t) +//// - 1; // the last integer is an original number of elements +//// const size_t elemQty = p[length]; // last elem +//// +//// for (size_t i = 0; i < elemQty; ++i) { +//// if (i) out << " "; +//// out << ((p[i/32] >> (i & 31)) & 1); +//// } +//// +//// return out.str(); +////} +// +////template +////bool SpaceBitVector::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { +//// 
externId.clear(); +//// DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); +//// CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); +//// if (!pInpState->inp_file_) return false; +//// if (!getline(pInpState->inp_file_, strObj)) return false; +//// pInpState->line_num_++; +//// return true; +////} +// +// +///** End of standard functions to read/write/create objects */ +// +//} // namespace similarity diff --git a/similarity_search/test/test_space_serial.cc b/similarity_search/test/test_space_serial.cc index dddc978..53e1017 100644 --- a/similarity_search/test/test_space_serial.cc +++ b/similarity_search/test/test_space_serial.cc @@ -215,6 +215,23 @@ TEST(Test_BitHamming) { } } +TEST(Test_BitJaccard) { + vector testVect; + + for (size_t i = 0; i < MAX_NUM_REC; ++i) { + stringstream ss; + + for (size_t k = 0; k < 128; ++k) { + if (k) ss << " "; + ss << (RandomInt() % 2); + } + testVect.push_back(ss.str()); + } + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest(testVect, maxNumRec, "tmp_out_file.txt", "bit_jaccard", emptyParams, false)); + } +} + #if defined(WITH_EXTRAS) TEST(Test_SQFD) { const char* sqfdParams[] = {"alpha=1", NULL} ; From 1f3b16d6570cf575cd53de17e5acb8ce93f78d18 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Sun, 17 Feb 2019 00:36:02 -0800 Subject: [PATCH 02/17] fix compiler errors --- python_bindings/tests/bindings_test.py | 2 +- similarity_search/include/distcomp.h | 8 +- .../include/space/space_bit_jaccard.h | 2 +- .../src/method/perm_index_incr_bin.cc | 1 + similarity_search/test/test_distfunc.cc | 2370 ++++++++--------- similarity_search/test/test_space_serial.cc | 119 +- 6 files changed, 1253 insertions(+), 1249 deletions(-) diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index 47c0989..f134738 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -171,7 +171,7 @@ def 
_get_index(self, space='cosinesimil'): class BitJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin): def _get_index(self, space='bit_jaccard'): return nmslib.init(method='hnsw', space='bit_jaccard', data_type=nmslib.DataType.OBJECT_AS_STRING, - dtype=nmslib.DistType.INT) + dtype=nmslib.DistType.DOUBLE) # class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin): diff --git a/similarity_search/include/distcomp.h b/similarity_search/include/distcomp.h index 729d70d..d16e80b 100644 --- a/similarity_search/include/distcomp.h +++ b/similarity_search/include/distcomp.h @@ -223,9 +223,9 @@ int SpearmanRho(const PivotIdType* x, const PivotIdType* y, size_t qty); int SpearmanFootruleSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty); int SpearmanRhoSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty); -//template -double inline BitJaccard(const uint64_t* a, const uint64_t* b, size_t qty) { - uint64_t num = 0, den = 0; +template +dist_t inline BitJaccard(const dist_uint_t* a, const dist_uint_t* b, size_t qty) { + dist_uint_t num = 0, den = 0; for (size_t i=0; i < qty; ++i) { // __builtin_popcount quickly computes the number on 1s @@ -233,7 +233,7 @@ double inline BitJaccard(const uint64_t* a, const uint64_t* b, size_t qty) { den += __builtin_popcount(a[i] | b[i]); } - return double(num) / double(den); + return dist_t(num) / dist_t(den); } //unsigned BitHamming(const uint32_t* a, const uint32_t* b, size_t qty); diff --git a/similarity_search/include/space/space_bit_jaccard.h b/similarity_search/include/space/space_bit_jaccard.h index 8d7803d..b53a3d7 100644 --- a/similarity_search/include/space/space_bit_jaccard.h +++ b/similarity_search/include/space/space_bit_jaccard.h @@ -48,7 +48,7 @@ class SpaceBitJaccard : public SpaceBitVector { const size_t length = obj1->datalength() / sizeof(dist_uint_t) - 1; // the last integer is an original number of elements - return BitJaccard(x, y, length); + return BitJaccard(x, y, length); } 
DISABLE_COPY_AND_ASSIGN(SpaceBitJaccard); diff --git a/similarity_search/src/method/perm_index_incr_bin.cc b/similarity_search/src/method/perm_index_incr_bin.cc index f96c727..b565364 100644 --- a/similarity_search/src/method/perm_index_incr_bin.cc +++ b/similarity_search/src/method/perm_index_incr_bin.cc @@ -23,6 +23,7 @@ #include "incremental_quick_select.h" #include "method/perm_index_incr_bin.h" #include "utils.h" +#include "distcomp.h" namespace similarity { diff --git a/similarity_search/test/test_distfunc.cc b/similarity_search/test/test_distfunc.cc index 252520d..668f3d4 100644 --- a/similarity_search/test/test_distfunc.cc +++ b/similarity_search/test/test_distfunc.cc @@ -63,1189 +63,1189 @@ TEST(set_intel) { */ -TEST(Platform64) { - EXPECT_EQ(8 == sizeof(size_t), true); -} - -template -bool checkElemVectEq(const vector>& source, - const vector>& target) { - if (source.size() != target.size()) return false; - - for (size_t i = 0; i < source.size(); ++i) - if (source[i] != target[i]) return false; - - return true; -} - -template -void TestSparsePackUnpack() { - for (size_t maxSize = 1024 ; maxSize < 1024*1024; maxSize += 8192) { - vector> source; - GenSparseVectZipf(maxSize, source); - - LOG(LIB_INFO) << "testing maxSize: " << maxSize << "\nqty: " << source.size() - << " maxId: " << source.back().id_; - - char* pBuff = NULL; - size_t dataLen = 0; - - PackSparseElements(source, pBuff, dataLen); - - vector> target; - UnpackSparseElements(pBuff, dataLen, target); - - bool eqFlag = checkElemVectEq(source, target); - - if (!eqFlag) { - LOG(LIB_INFO) << "Different source and target, source.size(): " << source.size() - << " target.size(): " << target.size(); - // Let's print the first different in the case of equal # of elements - size_t i = 0; - for (; i < min(source.size(), target.size()); ++i) { - if (!(source[i] == target[i])) { - LOG(LIB_INFO) << "First diff, i = " << i << " " << source[i] << " vs " << target[i]; - break; - } - } - } - - EXPECT_EQ(eqFlag, 
true); - } -} - -TEST(BlockZeros) { - for (size_t id = 0 ; id <= 3*65536; id++) { - size_t id1 = removeBlockZeros(id); - - size_t id2 = addBlockZeros(id1); - EXPECT_EQ(id, id2); - } -} - -#ifdef DISABLE_LONG_TESTS -TEST(DISABLE_SparsePackUnpack) { -#else -TEST(SparsePackUnpack) { -#endif - TestSparsePackUnpack(); - TestSparsePackUnpack(); -} - -TEST(TestEfficientPower) { - double f = 2.0; - - for (unsigned i = 1; i <= 64; i++) { - double p1 = std::pow(f, i); - double p2 = EfficientPow(f, i); - - EXPECT_EQ(p1, p2); - } -} - -TEST(TestEfficientFract) { - unsigned MaxNumDig = 16; - - for (float a = 1.1f ; a <= 2.0f; a+= 0.1f) { - for (unsigned NumDig = 1; NumDig < MaxNumDig; ++NumDig) { - uint64_t MaxFract = uint64_t(1) << NumDig; - - for (uint64_t intFract = 0; intFract < MaxFract; ++intFract) { - float fract = float(intFract) / float(MaxFract); - float v1 = pow(a, fract); - float v2 = EfficientFractPow(a, fract, NumDig); - - EXPECT_EQ_EPS(v1, v2, 1e-5f); - } - } - } -} - -template -bool TestScalarProductAgree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - - float maxRelDiff = 1e-6f; - float maxAbsDiff = 1e-6f; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, T(1), T(2), true /* do normalize */); - GenRandVect(pVect2, dim, T(1), T(2), true /* do normalize */); - - T val1 = ScalarProduct(pVect1, pVect2, dim); - T val2 = ScalarProductSIMD(pVect1, pVect2, dim); - - bool bug = false; - T diff = fabs(val1 - val2); - T diffRel = diff/max(max(fabs(val1),fabs(val2)),T(1e-18)); - if (diffRel > maxRelDiff && diff > maxAbsDiff) { - bug = true; - cerr << "Bug ScalarProduct !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " diff=" << diff << " diffRel=" << diffRel << endl; - } - - if (bug) return false; - } - } - - return true; -} - -template -bool TestNormScalarProductAgree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - - float maxRelDiff = 1e-6f; - float maxAbsDiff = 1e-6f; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, T(1), T(2), true /* do normalize */); - GenRandVect(pVect2, dim, T(1), T(2), true /* do normalize */); - - T val1 = NormScalarProduct(pVect1, pVect2, dim); - T val2 = NormScalarProductSIMD(pVect1, pVect2, dim); - - bool bug = false; - T diff = fabs(val1 - val2); - T diffRel = diff/max(max(fabs(val1),fabs(val2)),T(1e-18)); - if (diffRel > maxRelDiff && diff > maxAbsDiff) { - bug = true; - cerr << "Bug NormScalarProduct !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " diff=" << diff << " diffRel=" << diffRel << endl; - } - - if (bug) return false; - } - } - - return true; -} - -// Agreement test functions -template -bool TestLInfAgree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); - GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); - - T val1 = LInfNormStandard(pVect1, pVect2, dim); - T val2 = LInfNorm(pVect1, pVect2, dim); - T val3 = LInfNormSIMD(pVect1, pVect2, dim); - - bool bug = false; - - if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { - cerr << "Bug LInf !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; - bug = true; - } - if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { - cerr << "Bug LInf !!! 
Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; - bug = true; - } - if (bug) return false; - } - } - - - return true; -} - -template -bool TestL1Agree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); - GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); - - T val1 = L1NormStandard(pVect1, pVect2, dim); - T val2 = L1Norm(pVect1, pVect2, dim); - T val3 = L1NormSIMD(pVect1, pVect2, dim); - - bool bug = false; - - if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { - cerr << "Bug L1 !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; - bug = true; - } - if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { - cerr << "Bug L1 !!! Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; - bug = true; - } - if (bug) return false; - } - } - - return true; -} - -template -bool TestL2Agree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); - GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); - - T val1 = L2NormStandard(pVect1, pVect2, dim); - T val2 = L2Norm(pVect1, pVect2, dim); - T val3 = L2NormSIMD(pVect1, pVect2, dim); - - bool bug = false; - - if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { - cerr << "Bug L2 !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; - bug = true; - } - if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { - cerr << "Bug L2 !!! 
Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; - bug = true; - } - if (bug) return false; - } - } - - - return true; -} - -template -bool TestItakuraSaitoAgree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - vector precompVect1(dim *2), precompVect2(dim * 2); - T* pPrecompVect1 = &precompVect1[0]; - T* pPrecompVect2 = &precompVect2[0]; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); - GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); - - copy(pVect1, pVect1 + dim, pPrecompVect1); - copy(pVect2, pVect2 + dim, pPrecompVect2); - - PrecompLogarithms(pPrecompVect1, dim); - PrecompLogarithms(pPrecompVect2, dim); - - T val0 = ItakuraSaito(pVect1, pVect2, dim); - T val1 = ItakuraSaitoPrecomp(pPrecompVect1, pPrecompVect2, dim); - T val2 = ItakuraSaitoPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); - - bool bug = false; - - T AbsDiff1 = fabs(val1 - val0); - T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); - - if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { - cerr << "Bug ItakuraSaito !!! Dim = " << dim << " val1 = " << val1 << " val0 = " << val0 << " Diff: " << (val1 - val0) << " RelDiff1: " << RelDiff1 << " << AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - T AbsDiff2 = fabs(val1 - val2); - T RelDiff2 = AbsDiff2/max(max(fabs(val1),fabs(val2)),T(1e-18)); - if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { - cerr << "Bug ItakuraSaito !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " Diff: " << (val1 - val2) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; - bug = true; - } - - if (bug) return false; - } - } - - - return true; -} - -template -bool TestKLAgree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - vector precompVect1(dim *2), precompVect2(dim * 2); - T* pPrecompVect1 = &precompVect1[0]; - T* pPrecompVect2 = &precompVect2[0]; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); - GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); - - copy(pVect1, pVect1 + dim, pPrecompVect1); - copy(pVect2, pVect2 + dim, pPrecompVect2); - - PrecompLogarithms(pPrecompVect1, dim); - PrecompLogarithms(pPrecompVect2, dim); - - T val0 = KLStandard(pVect1, pVect2, dim); - T val1 = KLStandardLogDiff(pVect1, pVect2, dim); - T val2 = KLPrecomp(pPrecompVect1, pPrecompVect2, dim); - T val3 = KLPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); - - bool bug = false; - - /* - * KLStandardLog has a worse accuracy due to computing the log of ratios - * as opposed to difference of logs, but it is more efficient (log can be - * expensive to compute) - */ - - T AbsDiff1 = fabs(val1 - val0); - T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); - if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { - cerr << "Bug KL !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - T AbsDiff2 = fabs(val1 - val2); - T RelDiff2 = AbsDiff2/max(max(fabs(val1),fabs(val2)),T(1e-18)); - if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { - cerr << "Bug KL !!! 
Dim = " << dim << " val2 = " << val2 << " val1 = " << val1 << " Diff: " << (val2 - val1) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; - bug = true; - } - - T AbsDiff3 = fabs(val1 - val3); - T RelDiff3 = AbsDiff3/max(max(fabs(val1),fabs(val3)),T(1e-18)); - if (RelDiff3 > 1e-5 && AbsDiff3 > 1e-5) { - cerr << "Bug KL !!! Dim = " << dim << " val3 = " << val3 << " val1 = " << val1 << " Diff: " << (val3 - val1) << " RelDiff3: " << RelDiff3 << " AbsDiff3: " << AbsDiff3 << endl; - bug = true; - } - - if (bug) return false; - } - } - - - return true; -} - -template -bool TestKLGeneralAgree(size_t N, size_t dim, size_t Rep) { - T* pVect1 = new T[dim]; - T* pVect2 = new T[dim]; - T* pPrecompVect1 = new T[dim * 2]; - T* pPrecompVect2 = new T[dim * 2]; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), false); - GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), false); - - copy(pVect1, pVect1 + dim, pPrecompVect1); - copy(pVect2, pVect2 + dim, pPrecompVect2); - - PrecompLogarithms(pPrecompVect1, dim); - PrecompLogarithms(pPrecompVect2, dim); - - T val0 = KLGeneralStandard(pVect1, pVect2, dim); - T val2 = KLGeneralPrecomp(pPrecompVect1, pPrecompVect2, dim); - T val3 = KLGeneralPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); - - bool bug = false; - - T AbsDiff1 = fabs(val2 - val0); - T RelDiff1 = AbsDiff1/max(max(fabs(val2),fabs(val0)),T(1e-18)); - if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { - cerr << "Bug KL !!! Dim = " << dim << " val0 = " << val0 << " val2 = " << val2 << " Diff: " << (val0 - val2) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - T AbsDiff2 = fabs(val3 - val2); - T RelDiff2 = AbsDiff2/max(max(fabs(val3),fabs(val2)),T(1e-18)); - if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { - cerr << "Bug KL !!! 
Dim = " << dim << " val2 = " << val2 << " val3 = " << val3 << " Diff: " << (val2 - val3) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; - bug = true; - } - - if (bug) return false; - } - } - - - return true; -} - -template -bool TestJSAgree(size_t N, size_t dim, size_t Rep, double pZero) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - vector precompVect1(dim *2), precompVect2(dim * 2); - T* pPrecompVect1 = &precompVect1[0]; - T* pPrecompVect2 = &precompVect2[0]; - - T Dist = 0; - T Error = 0; - T TotalQty = 0; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); - SetRandZeros(pVect1, dim, pZero); - Normalize(pVect1, dim); - GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); - SetRandZeros(pVect2, dim, pZero); - Normalize(pVect2, dim); - - copy(pVect1, pVect1 + dim, pPrecompVect1); - copy(pVect2, pVect2 + dim, pPrecompVect2); - - PrecompLogarithms(pPrecompVect1, dim); - PrecompLogarithms(pPrecompVect2, dim); - - T val0 = JSStandard(pVect1, pVect2, dim); - T val1 = JSPrecomp(pPrecompVect1, pPrecompVect2, dim); - - bool bug = false; - - T AbsDiff1 = fabs(val1 - val0); - T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); - - if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { - cerr << "Bug JS (1) " << typeid(T).name() << " !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - T val2 = JSPrecompApproxLog(pPrecompVect1, pPrecompVect2, dim); - T val3 = JSPrecompSIMDApproxLog(pPrecompVect1, pPrecompVect2, dim); - - T AbsDiff2 = fabs(val2 - val3); - T RelDiff2 = AbsDiff2/max(max(fabs(val2),fabs(val3)),T(1e-18)); - - if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { - cerr << "Bug JS (2) " << typeid(T).name() << " !!! 
Dim = " << dim << " val2 = " << val2 << " val3 = " << val3 << " Diff: " << (val2 - val3) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; - bug = true; - } - - T AbsDiff3 = fabs(val1 - val2); - T RelDiff3 = AbsDiff3/max(max(fabs(val1),fabs(val2)),T(1e-18)); - - Dist += val1; - Error += AbsDiff3; - ++TotalQty; - - if (RelDiff3 > 1e-4 && AbsDiff3 > 1e-4) { - cerr << "Bug JS (3) " << typeid(T).name() << " !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " Diff: " << (val1 - val2) << " RelDiff3: " << RelDiff3 << " AbsDiff2: " << AbsDiff3 << endl; - bug = true; - } - - if (bug) return false; - } - } - - LOG(LIB_INFO) << typeid(T).name() << " JS approximation error: average absolute: " << Error / TotalQty << - " avg. dist: " << Dist / TotalQty << " average relative: " << Error/Dist; - - - return true; -} - -template -bool TestRenyiDivAgree(size_t N, size_t dim, size_t Rep, T alpha) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - - T Dist = 0; - T Error = 0; - T TotalQty = 0; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); - GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); - - Normalize(pVect1, dim); - Normalize(pVect2, dim); - - T val0 = renyiDivergenceSlow(pVect1, pVect2, dim, alpha); - T val1 = renyiDivergenceFast(pVect1, pVect2, dim, alpha); - - bool bug = false; - - T AbsDiff1 = fabs(val1 - val0); - T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); - - Error += AbsDiff1; - ++TotalQty; - - if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { - cerr << "Bug Reniy Div. (1) " << typeid(T).name() << " !!! Dim = " << dim - << "alpha=" << alpha << " val0 = " << val0 << " val1 = " << val1 - << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 - << " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - if (bug) return false; - } - } - - LOG(LIB_INFO) << typeid(T).name() << " Renyi Div. 
approximation error: average absolute: " << Error / TotalQty << - " avg. dist: " << Dist / TotalQty << " average relative: " << Error/Dist; - - - return true; -} - -template -bool TestAlphaBetaDivAgree(size_t N, size_t dim, size_t Rep, T alpha, T beta) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - - T Dist = 0; - T Error = 0; - T TotalQty = 0; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); - GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); - - Normalize(pVect1, dim); - Normalize(pVect2, dim); - - T val0 = alphaBetaDivergenceSlow(pVect1, pVect2, dim, alpha, beta); - T val1 = alphaBetaDivergenceFast(pVect1, pVect2, dim, alpha, beta); - - bool bug = false; - - T AbsDiff1 = fabs(val1 - val0); - T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); - - Error += AbsDiff1; - ++TotalQty; - - if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { - cerr << "Bug alpha-beta Div. (1) " << typeid(T).name() << " !!! Dim = " << dim - << "alpha=" << alpha << " val0 = " << val0 << " val1 = " << val1 - << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 - << " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - if (bug) return false; - } - } - - LOG(LIB_INFO) << typeid(T).name() << " alpha-beta div. approximation error: average absolute: " << Error / TotalQty << " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; - - - return true; -} - -bool TestSpearmanFootruleAgree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - PivotIdType* pVect1 = &vect1[0]; - PivotIdType* pVect2 = &vect2[0]; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandIntVect(pVect1, dim); - GenRandIntVect(pVect2, dim); - - int val0 = SpearmanFootrule(pVect1, pVect2, dim); - int val1 = SpearmanFootruleSIMD(pVect1, pVect2, dim); - - bool bug = false; - - - if (val0 != val1) { - cerr << "Bug SpearmanFootrule !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << endl; - bug = true; - } - - if (bug) return false; - } - } - - - return true; -} - -bool TestSpearmanRhoAgree(size_t N, size_t dim, size_t Rep) { - vector vect1(dim), vect2(dim); - PivotIdType* pVect1 = &vect1[0]; - PivotIdType* pVect2 = &vect2[0]; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandIntVect(pVect1, dim); - GenRandIntVect(pVect2, dim); - - int val0 = SpearmanRho(pVect1, pVect2, dim); - int val1 = SpearmanRhoSIMD(pVect1, pVect2, dim); - - bool bug = false; - - - if (val0 != val1) { - cerr << "Bug SpearmanRho !!! 
Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << endl; - bug = true; - } - - if (bug) return false; - } - } - - - return true; -} - -template -bool TestLPGenericAgree(size_t N, size_t dim, size_t Rep, T power) { - vector vect1(dim), vect2(dim); - T* pVect1 = &vect1[0]; - T* pVect2 = &vect2[0]; - - T TotalQty = 0, Error = 0, Dist = 0; - - for (size_t i = 0; i < Rep; ++i) { - for (size_t j = 1; j < N; ++j) { - GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); - GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); - - T val0 = LPGenericDistance(pVect1, pVect2, dim, power); - T val1 = LPGenericDistanceOptim(pVect1, pVect2, dim, power); - - bool bug = false; - - T AbsDiff1 = fabs(val1 - val0); - T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); - - T maxRelDiff = 1e-5f; - T maxAbsDiff = 1e-5f; - /* - * For large powers, the difference can be larger, - * because our approximations are efficient, but not very - * precise - */ - if (power > 8) { maxAbsDiff = maxRelDiff = 1e-3f;} - if (power > 12) { maxAbsDiff = maxRelDiff = 0.01f;} - if (power > 22) { maxAbsDiff = maxRelDiff = 0.1f;} - - ++TotalQty; - Error += RelDiff1; - Dist += val0; - - if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { - cerr << "Bug LP" << power << " !!! Dim = " << dim << - " val1 = " << val1 << " val0 = " << val0 << - " Diff: " << (val1 - val0) << - " RelDiff1: " << RelDiff1 << - " (max for this power: " << maxRelDiff << ") " << - " AbsDiff1: " << AbsDiff1 << " (max for this power: " << maxAbsDiff << ")" << endl; - } - - if (bug) return false; - } - } - - if (power < 4) { - LOG(LIB_INFO) << typeid(T).name() << " LP approximation error: average absolute " << Error / TotalQty << " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; - - } - - return true; -} - -bool TestBitHammingAgree(size_t N, size_t dim, size_t Rep) { - size_t WordQty = (dim + 31)/32; - vector arr(N * WordQty); - uint32_t* pArr = &arr[0]; - - uint32_t *p = pArr; - for (size_t i = 0; i < N; ++i, p+= WordQty) { - vector perm(dim); - GenRandIntVect(&perm[0], dim); - for (unsigned j = 0; j < dim; ++j) - perm[j] = perm[j] % 2; - vector h; - Binarize(perm, 1, h); - CHECK(h.size() == WordQty); - memcpy(p, &h[0], WordQty * sizeof(h[0])); - } - - WallClockTimer t; - - t.reset(); - - bool res = true; - - for (size_t j = 1; j < N; ++j) { - uint32_t* pVect1 = pArr + j*WordQty; - uint32_t* pVect2 = pArr + (j-1)*WordQty; - int d1 = BitHamming(pVect1, pVect2, WordQty); - int d2 = 0; - - for (unsigned t = 0; t < WordQty; ++t) { - for (unsigned k = 0; k < 32; ++k) { - d2 += ((pVect1[t]>>k)&1) != ((pVect2[t]>>k)&1); - } - } - if (d1 != d2) { - cerr << "Bug bit hamming, WordQty = " << WordQty << " d1 = " << d1 << " d2 = " << d2 << endl; - res = false; - break; - } - } - - return res; -} - - -bool TestSparseAngularDistanceAgree(const string& dataFile, size_t N, size_t Rep) { - typedef float T; - - unique_ptr spaceFast(new SpaceSparseAngularDistanceFast()); - unique_ptr> spaceReg(new SpaceSparseAngularDistance()); - - ObjectVector elemsFast; - ObjectVector elemsReg; - vector tmp; - - unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); - spaceFast->UpdateParamsFromFile(*inpStateFast); - unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); - spaceReg->UpdateParamsFromFile(*inpStateReg); - - CHECK(elemsFast.size() == elemsReg.size()); - - N = min(N, elemsReg.size()); - - bool bug = false; - - float maxRelDiff = 2e-5f; - float maxAbsDiff = 1e-6f; - - for (size_t j = Rep; j < N; ++j) - for (size_t k = j - Rep; k < j; ++k) { - float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); - float val2 = 
spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); - - float AbsDiff1 = fabs(val1 - val2); - float RelDiff1 = AbsDiff1 / max(max(fabs(val1), fabs(val2)), T(1e-18)); - - if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { - cerr << "Bug fast vs non-fast angular dist " << - " val1 = " << val1 << " val2 = " << val2 << - " Diff: " << (val1 - val2) << - " RelDiff1: " << RelDiff1 << - " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - if (bug) return false; - } - - return true; -} - - - -bool TestSparseCosineSimilarityAgree(const string& dataFile, size_t N, size_t Rep) { - typedef float T; - - unique_ptr spaceFast(new SpaceSparseCosineSimilarityFast()); - unique_ptr> spaceReg (new SpaceSparseCosineSimilarity()); - - ObjectVector elemsFast; - ObjectVector elemsReg; - vector tmp; - - unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); - spaceFast->UpdateParamsFromFile(*inpStateFast); - unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); - spaceReg->UpdateParamsFromFile(*inpStateReg); - - CHECK(elemsFast.size() == elemsReg.size()); - - N = min(N, elemsReg.size()); - - bool bug = false; - - float maxRelDiff = 1e-5f; - float maxAbsDiff = 1e-5f; - - for (size_t j = Rep; j < N; ++j) - for (size_t k = j - Rep; k < j; ++k) { - float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); - float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); - - float AbsDiff1 = fabs(val1 - val2); - float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); - - if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { - cerr << "Bug fast vs non-fast cosine " << - " val1 = " << val1 << " val2 = " << val2 << - " Diff: " << (val1 - val2) << - " RelDiff1: " << RelDiff1 << - " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - if (bug) return false; - } - - return true; -} - -bool TestSparseNegativeScalarProductAgree(const string& dataFile, size_t N, size_t Rep) { - typedef float T; - - unique_ptr 
spaceFast(new SpaceSparseNegativeScalarProductFast()); - unique_ptr> spaceReg (new SpaceSparseNegativeScalarProduct()); - - ObjectVector elemsFast; - ObjectVector elemsReg; - vector tmp; - - unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); - spaceFast->UpdateParamsFromFile(*inpStateFast); - unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); - spaceReg->UpdateParamsFromFile(*inpStateReg); - - CHECK(elemsFast.size() == elemsReg.size()); - - N = min(N, elemsReg.size()); - - bool bug = false; - - float maxRelDiff = 1e-6f; - float maxAbsDiff = 1e-6f; - - for (size_t j = Rep; j < N; ++j) - for (size_t k = j - Rep; k < j; ++k) { - float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); - float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); - - float AbsDiff1 = fabs(val1 - val2); - float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); - - if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { - cerr << "Bug fast vs non-fast negative scalar/dot product " << - " val1 = " << val1 << " val2 = " << val2 << - " Diff: " << (val1 - val2) << - " RelDiff1: " << RelDiff1 << - " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - if (bug) return false; - } - - return true; -} - -bool TestSparseQueryNormNegativeScalarProductAgree(const string& dataFile, size_t N, size_t Rep) { - typedef float T; - - unique_ptr spaceFast(new SpaceSparseQueryNormNegativeScalarProductFast()); - unique_ptr> spaceReg (new SpaceSparseQueryNormNegativeScalarProduct()); - - ObjectVector elemsFast; - ObjectVector elemsReg; - vector tmp; - - unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); - spaceFast->UpdateParamsFromFile(*inpStateFast); - unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); - spaceReg->UpdateParamsFromFile(*inpStateReg); - - CHECK(elemsFast.size() == elemsReg.size()); - - N = min(N, elemsReg.size()); - - bool bug = false; - - float maxRelDiff = 
1e-6f; - float maxAbsDiff = 1e-6f; - - for (size_t j = Rep; j < N; ++j) - for (size_t k = j - Rep; k < j; ++k) { - float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); - float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); - - float AbsDiff1 = fabs(val1 - val2); - float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); - - if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { - cerr << "Bug fast vs non-fast QUERY-NORMALIZED negative scalar/dot product " << - " val1 = " << val1 << " val2 = " << val2 << - " Diff: " << (val1 - val2) << - " RelDiff1: " << RelDiff1 << - " AbsDiff1: " << AbsDiff1 << endl; - bug = true; - } - - if (bug) return false; - } - - return true; -} - -// Limitation: this is only for spaces without params -bool TestPivotIndex(const string& spaceName, - bool useDummyIndex, - const string& dataFile, size_t dataQty, - const string& pivotFile, size_t pivotQty) { - - LOG(LIB_INFO) << "space: " << spaceName << " real pivot index?: " << !useDummyIndex << " " << - " dataFile: " << dataFile << " " << - " pivotFile: " << pivotFile; - try { - typedef float T; - - AnyParams emptyParams; - - unique_ptr> space(SpaceFactoryRegistry::Instance().CreateSpace(spaceName, emptyParams)); - - ObjectVector data; - ObjectVector pivots; - vector tmp; - - float maxRelDiff = 1e-6f; - float maxAbsDiff = 1e-6f; - - unique_ptr inpStateFast(space->ReadDataset(data, tmp, dataFile, dataQty)); - space->UpdateParamsFromFile(*inpStateFast); - space->ReadDataset(pivots, tmp, pivotFile, pivotQty); - - unique_ptr> pivIndx(useDummyIndex ? 
- new DummyPivotIndex(*space, pivots) - : - space->CreatePivotIndex(pivots, - 0 /* Let's not test using the hashing trick here, b/c distances would be somewhat different */)); - - for (size_t did = 0; did < dataQty; ++did) { - vector vDst; - pivIndx->ComputePivotDistancesIndexTime(data[did], vDst); - CHECK_MSG(vDst.size() == pivotQty, "ComputePivotDistancesIndexTime returns incorrect # of elements different from the # of pivots"); - - for (size_t pid = 0; pid < pivotQty; ++pid) { - T val2 = space->IndexTimeDistance(pivots[pid], data[did]); - T val1 = vDst[pid]; - - float AbsDiff1 = fabs(val1 - val2); - float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); - - if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { - cerr << "Bug in fast computation of all-pivot distance, " << - " space: " << spaceName << " real pivot index?: " << !useDummyIndex << endl << - " dataFile: " << dataFile << endl << - " pivotFile: " << pivotFile << endl << - " data index: " << did << " pivot index: " << pid << endl << - " val1 = " << val1 << " val2 = " << val2 << - " Diff: " << (val1 - val2) << - " RelDiff1: " << RelDiff1 << - " AbsDiff1: " << AbsDiff1 << endl; - return false; - } - } - } - } catch (const exception& e) { - LOG(LIB_INFO) << "Got exception while testing: " << e.what(); - return false; - } - return true; -} - - - - -#ifdef DISABLE_LONG_TESTS -TEST(DISABLE_TestAgree) { -#else -TEST(TestAgree) { -#endif - int nTest = 0; - int nFail = 0; - - nTest++; - nFail += !TestSparseAngularDistanceAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); - - nTest++; - nFail += !TestSparseAngularDistanceAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); - - nTest++; - nFail += !TestSparseCosineSimilarityAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); - - nTest++; - nFail += !TestSparseCosineSimilarityAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); - - - nTest++; - nFail += !TestSparseNegativeScalarProductAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 
200); - - nTest++; - nFail += !TestSparseNegativeScalarProductAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); - - nTest++; - nFail += !TestSparseQueryNormNegativeScalarProductAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); - - nTest++; - nFail += !TestSparseQueryNormNegativeScalarProductAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); - - - /* - * 32 should be more than enough for almost all methods, - * where loop-unrolling includes at most 16 distance computations. - * - * Bit-Hamming is an exception. - * - */ - for (unsigned dim = 1; dim <= 1024; dim+=2) { - LOG(LIB_INFO) << "Dim = " << dim; - - nFail += !TestBitHammingAgree(1000, dim, 1000); - } - - for (unsigned dim = 1; dim <= 32; ++dim) { - LOG(LIB_INFO) << "Dim = " << dim; - - /* - * This is a costly check, we don't need to do it for large # dimensions. - * Anyways, the function is not using any loop unrolling, so 8 should be sufficient. - */ - if (dim <= 8) { - - for (float power = 0.125; power <= 32; power += 0.125) { - TestLPGenericAgree(1024, dim, 10, power); - } - for (double power = 0.125; power <= 32; power += 0.125) { - TestLPGenericAgree(1024, dim, 10, power); - } - - // In the case of Renyi divergence 0 < alpha < 1, 1 < alpha < infinity - // https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R%C3%A9nyi_divergence - for (float alpha = 0.125; alpha <= 2; alpha += 0.125) { - if (fabs(alpha - 1) < 1e-6) continue; - TestRenyiDivAgree(1024, dim, 10, alpha); - } - for (double alpha = 0.125; alpha <= 2; alpha += 0.125) { - if (fabs(alpha - 1) < 1e-6) continue; - TestRenyiDivAgree(1024, dim, 10, alpha); - } - - for (float alpha = -2; alpha <= 2; alpha += 0.5) - for (float beta = -2; beta <= 2; beta += 0.5) - { - TestAlphaBetaDivAgree(1024, dim, 10, alpha, beta); - } - - for (double alpha = -2; alpha <= 2; alpha += 0.5) - for (double beta = -2; beta <= 2; beta += 0.5) - { - TestAlphaBetaDivAgree(1024, dim, 10, alpha, beta); - } - } - - nTest++; - nFail += 
!TestNormScalarProductAgree(1024, dim, 10); - nTest++; - nFail += !TestNormScalarProductAgree(1024, dim, 10); - - nTest++; - nFail += !TestScalarProductAgree(1024, dim, 10); - nTest++; - nFail += !TestScalarProductAgree(1024, dim, 10); - - nTest++; - nFail += !TestSpearmanFootruleAgree(1024, dim, 10); - - nTest++; - nFail += !TestSpearmanRhoAgree(1024, dim, 10); - - nTest++; - nFail += !TestJSAgree(1024, dim, 10, 0.5); - nTest++; - nFail += !TestJSAgree(1024, dim, 10, 0.5); - - nTest++; - nFail += !TestKLGeneralAgree(1024, dim, 10); - nTest++; - nFail += !TestKLGeneralAgree(1024, dim, 10); - - nTest++; - nFail += !TestLInfAgree(1024, dim, 10); - nTest++; - nFail += !TestLInfAgree(1024, dim, 10); - - nTest++; - nFail += !TestL1Agree(1024, dim, 10); - nTest++; - nFail += !TestL1Agree(1024, dim, 10); - - nTest++; - nFail += !TestL2Agree(1024, dim, 10); - nTest++; - nFail += !TestL2Agree(1024, dim, 10); - - nTest++; - nFail += !TestKLAgree(1024, dim, 10); - nTest++; - nFail += !TestKLAgree(1024, dim, 10); - - nTest++; - nFail += !TestItakuraSaitoAgree(1024, dim, 10); - nTest++; - nFail += !TestItakuraSaitoAgree(1024, dim, 10); - } - - LOG(LIB_INFO) << nTest << " (sub) tests performed " << nFail << " failed"; - - EXPECT_EQ(0, nFail); -} - -#ifdef DISABLE_LONG_TESTS -TEST(DISABLE_TestAgreePivotIndex) { -#else -TEST(TestAgreePivotIndex) { -#endif - int nTest = 0; - int nFail = 0; - - const size_t dataQty = 1000; - const size_t pivotQty = 100; - - vector vDataFiles = {"sparse_5K.txt", "sparse_wiki_5K.txt"}; - vector vSpaces = {SPACE_SPARSE_COSINE_SIMILARITY_FAST, SPACE_SPARSE_ANGULAR_DISTANCE_FAST, - SPACE_SPARSE_NEGATIVE_SCALAR_FAST, SPACE_SPARSE_QUERY_NORM_NEGATIVE_SCALAR_FAST}; - const string pivotFile = "sparse_pivots1K_termQty5K_maxId_100K.txt"; - - for (string spaceName : vSpaces) - for (string dataFile : vDataFiles) { - // 1. 
test with a dummy pivot index - nTest++; - nFail += !TestPivotIndex(spaceName, true, sampleDataPrefix + dataFile, dataQty, sampleDataPrefix + pivotFile, pivotQty); - - // 2. test with a real pivot index - nTest++; - nFail += !TestPivotIndex(spaceName, false, sampleDataPrefix + dataFile, dataQty, sampleDataPrefix + pivotFile, pivotQty); - } - - LOG(LIB_INFO) << nTest << " (sub) tests performed " << nFail << " failed"; - - EXPECT_EQ(0, nFail); -} - - +//TEST(Platform64) { +// EXPECT_EQ(8 == sizeof(size_t), true); +//} +// +//template +//bool checkElemVectEq(const vector>& source, +// const vector>& target) { +// if (source.size() != target.size()) return false; +// +// for (size_t i = 0; i < source.size(); ++i) +// if (source[i] != target[i]) return false; +// +// return true; +//} +// +//template +//void TestSparsePackUnpack() { +// for (size_t maxSize = 1024 ; maxSize < 1024*1024; maxSize += 8192) { +// vector> source; +// GenSparseVectZipf(maxSize, source); +// +// LOG(LIB_INFO) << "testing maxSize: " << maxSize << "\nqty: " << source.size() +// << " maxId: " << source.back().id_; +// +// char* pBuff = NULL; +// size_t dataLen = 0; +// +// PackSparseElements(source, pBuff, dataLen); +// +// vector> target; +// UnpackSparseElements(pBuff, dataLen, target); +// +// bool eqFlag = checkElemVectEq(source, target); +// +// if (!eqFlag) { +// LOG(LIB_INFO) << "Different source and target, source.size(): " << source.size() +// << " target.size(): " << target.size(); +// // Let's print the first different in the case of equal # of elements +// size_t i = 0; +// for (; i < min(source.size(), target.size()); ++i) { +// if (!(source[i] == target[i])) { +// LOG(LIB_INFO) << "First diff, i = " << i << " " << source[i] << " vs " << target[i]; +// break; +// } +// } +// } +// +// EXPECT_EQ(eqFlag, true); +// } +//} +// +//TEST(BlockZeros) { +// for (size_t id = 0 ; id <= 3*65536; id++) { +// size_t id1 = removeBlockZeros(id); +// +// size_t id2 = addBlockZeros(id1); +// 
EXPECT_EQ(id, id2); +// } +//} +// +//#ifdef DISABLE_LONG_TESTS +//TEST(DISABLE_SparsePackUnpack) { +//#else +//TEST(SparsePackUnpack) { +//#endif +// TestSparsePackUnpack(); +// TestSparsePackUnpack(); +//} +// +//TEST(TestEfficientPower) { +// double f = 2.0; +// +// for (unsigned i = 1; i <= 64; i++) { +// double p1 = std::pow(f, i); +// double p2 = EfficientPow(f, i); +// +// EXPECT_EQ(p1, p2); +// } +//} +// +//TEST(TestEfficientFract) { +// unsigned MaxNumDig = 16; +// +// for (float a = 1.1f ; a <= 2.0f; a+= 0.1f) { +// for (unsigned NumDig = 1; NumDig < MaxNumDig; ++NumDig) { +// uint64_t MaxFract = uint64_t(1) << NumDig; +// +// for (uint64_t intFract = 0; intFract < MaxFract; ++intFract) { +// float fract = float(intFract) / float(MaxFract); +// float v1 = pow(a, fract); +// float v2 = EfficientFractPow(a, fract, NumDig); +// +// EXPECT_EQ_EPS(v1, v2, 1e-5f); +// } +// } +// } +//} +// +//template +//bool TestScalarProductAgree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// +// float maxRelDiff = 1e-6f; +// float maxAbsDiff = 1e-6f; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, T(1), T(2), true /* do normalize */); +// GenRandVect(pVect2, dim, T(1), T(2), true /* do normalize */); +// +// T val1 = ScalarProduct(pVect1, pVect2, dim); +// T val2 = ScalarProductSIMD(pVect1, pVect2, dim); +// +// bool bug = false; +// T diff = fabs(val1 - val2); +// T diffRel = diff/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// if (diffRel > maxRelDiff && diff > maxAbsDiff) { +// bug = true; +// cerr << "Bug ScalarProduct !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " diff=" << diff << " diffRel=" << diffRel << endl; +// } +// +// if (bug) return false; +// } +// } +// +// return true; +//} +// +//template +//bool TestNormScalarProductAgree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// +// float maxRelDiff = 1e-6f; +// float maxAbsDiff = 1e-6f; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, T(1), T(2), true /* do normalize */); +// GenRandVect(pVect2, dim, T(1), T(2), true /* do normalize */); +// +// T val1 = NormScalarProduct(pVect1, pVect2, dim); +// T val2 = NormScalarProductSIMD(pVect1, pVect2, dim); +// +// bool bug = false; +// T diff = fabs(val1 - val2); +// T diffRel = diff/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// if (diffRel > maxRelDiff && diff > maxAbsDiff) { +// bug = true; +// cerr << "Bug NormScalarProduct !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " diff=" << diff << " diffRel=" << diffRel << endl; +// } +// +// if (bug) return false; +// } +// } +// +// return true; +//} +// +//// Agreement test functions +//template +//bool TestLInfAgree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); +// GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); +// +// T val1 = LInfNormStandard(pVect1, pVect2, dim); +// T val2 = LInfNorm(pVect1, pVect2, dim); +// T val3 = LInfNormSIMD(pVect1, pVect2, dim); +// +// bool bug = false; +// +// if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { +// cerr << "Bug LInf !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; +// bug = true; +// } +// if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { +// cerr << "Bug LInf !!! Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; +// bug = true; +// } +// if (bug) return false; +// } +// } +// +// +// return true; +//} +// +//template +//bool TestL1Agree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); +// GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); +// +// T val1 = L1NormStandard(pVect1, pVect2, dim); +// T val2 = L1Norm(pVect1, pVect2, dim); +// T val3 = L1NormSIMD(pVect1, pVect2, dim); +// +// bool bug = false; +// +// if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { +// cerr << "Bug L1 !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; +// bug = true; +// } +// if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { +// cerr << "Bug L1 !!! Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; +// bug = true; +// } +// if (bug) return false; +// } +// } +// +// return true; +//} +// +//template +//bool TestL2Agree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); +// GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); +// +// T val1 = L2NormStandard(pVect1, pVect2, dim); +// T val2 = L2Norm(pVect1, pVect2, dim); +// T val3 = L2NormSIMD(pVect1, pVect2, dim); +// +// bool bug = false; +// +// if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { +// cerr << "Bug L2 !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; +// bug = true; +// } +// if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { +// cerr << "Bug L2 !!! Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; +// bug = true; +// } +// if (bug) return false; +// } +// } +// +// +// return true; +//} +// +//template +//bool TestItakuraSaitoAgree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// vector precompVect1(dim *2), precompVect2(dim * 2); +// T* pPrecompVect1 = &precompVect1[0]; +// T* pPrecompVect2 = &precompVect2[0]; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); +// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); +// +// copy(pVect1, pVect1 + dim, pPrecompVect1); +// copy(pVect2, pVect2 + dim, pPrecompVect2); +// +// PrecompLogarithms(pPrecompVect1, dim); +// PrecompLogarithms(pPrecompVect2, dim); +// +// T val0 = ItakuraSaito(pVect1, pVect2, dim); +// T val1 = ItakuraSaitoPrecomp(pPrecompVect1, pPrecompVect2, dim); +// T val2 = ItakuraSaitoPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); +// +// bool bug = false; +// +// T AbsDiff1 = fabs(val1 - val0); +// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); +// +// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { +// cerr << "Bug ItakuraSaito !!! Dim = " << dim << " val1 = " << val1 << " val0 = " << val0 << " Diff: " << (val1 - val0) << " RelDiff1: " << RelDiff1 << " << AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// T AbsDiff2 = fabs(val1 - val2); +// T RelDiff2 = AbsDiff2/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { +// cerr << "Bug ItakuraSaito !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " Diff: " << (val1 - val2) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// } +// +// +// return true; +//} +// +//template +//bool TestKLAgree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// vector precompVect1(dim *2), precompVect2(dim * 2); +// T* pPrecompVect1 = &precompVect1[0]; +// T* pPrecompVect2 = &precompVect2[0]; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); +// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); +// +// copy(pVect1, pVect1 + dim, pPrecompVect1); +// copy(pVect2, pVect2 + dim, pPrecompVect2); +// +// PrecompLogarithms(pPrecompVect1, dim); +// PrecompLogarithms(pPrecompVect2, dim); +// +// T val0 = KLStandard(pVect1, pVect2, dim); +// T val1 = KLStandardLogDiff(pVect1, pVect2, dim); +// T val2 = KLPrecomp(pPrecompVect1, pPrecompVect2, dim); +// T val3 = KLPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); +// +// bool bug = false; +// +// /* +// * KLStandardLog has a worse accuracy due to computing the log of ratios +// * as opposed to difference of logs, but it is more efficient (log can be +// * expensive to compute) +// */ +// +// T AbsDiff1 = fabs(val1 - val0); +// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); +// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { +// cerr << "Bug KL !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// T AbsDiff2 = fabs(val1 - val2); +// T RelDiff2 = AbsDiff2/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { +// cerr << "Bug KL !!! 
Dim = " << dim << " val2 = " << val2 << " val1 = " << val1 << " Diff: " << (val2 - val1) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; +// bug = true; +// } +// +// T AbsDiff3 = fabs(val1 - val3); +// T RelDiff3 = AbsDiff3/max(max(fabs(val1),fabs(val3)),T(1e-18)); +// if (RelDiff3 > 1e-5 && AbsDiff3 > 1e-5) { +// cerr << "Bug KL !!! Dim = " << dim << " val3 = " << val3 << " val1 = " << val1 << " Diff: " << (val3 - val1) << " RelDiff3: " << RelDiff3 << " AbsDiff3: " << AbsDiff3 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// } +// +// +// return true; +//} +// +//template +//bool TestKLGeneralAgree(size_t N, size_t dim, size_t Rep) { +// T* pVect1 = new T[dim]; +// T* pVect2 = new T[dim]; +// T* pPrecompVect1 = new T[dim * 2]; +// T* pPrecompVect2 = new T[dim * 2]; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), false); +// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), false); +// +// copy(pVect1, pVect1 + dim, pPrecompVect1); +// copy(pVect2, pVect2 + dim, pPrecompVect2); +// +// PrecompLogarithms(pPrecompVect1, dim); +// PrecompLogarithms(pPrecompVect2, dim); +// +// T val0 = KLGeneralStandard(pVect1, pVect2, dim); +// T val2 = KLGeneralPrecomp(pPrecompVect1, pPrecompVect2, dim); +// T val3 = KLGeneralPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); +// +// bool bug = false; +// +// T AbsDiff1 = fabs(val2 - val0); +// T RelDiff1 = AbsDiff1/max(max(fabs(val2),fabs(val0)),T(1e-18)); +// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { +// cerr << "Bug KL !!! Dim = " << dim << " val0 = " << val0 << " val2 = " << val2 << " Diff: " << (val0 - val2) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// T AbsDiff2 = fabs(val3 - val2); +// T RelDiff2 = AbsDiff2/max(max(fabs(val3),fabs(val2)),T(1e-18)); +// if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { +// cerr << "Bug KL !!! 
Dim = " << dim << " val2 = " << val2 << " val3 = " << val3 << " Diff: " << (val2 - val3) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// } +// +// +// return true; +//} +// +//template +//bool TestJSAgree(size_t N, size_t dim, size_t Rep, double pZero) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// vector precompVect1(dim *2), precompVect2(dim * 2); +// T* pPrecompVect1 = &precompVect1[0]; +// T* pPrecompVect2 = &precompVect2[0]; +// +// T Dist = 0; +// T Error = 0; +// T TotalQty = 0; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); +// SetRandZeros(pVect1, dim, pZero); +// Normalize(pVect1, dim); +// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); +// SetRandZeros(pVect2, dim, pZero); +// Normalize(pVect2, dim); +// +// copy(pVect1, pVect1 + dim, pPrecompVect1); +// copy(pVect2, pVect2 + dim, pPrecompVect2); +// +// PrecompLogarithms(pPrecompVect1, dim); +// PrecompLogarithms(pPrecompVect2, dim); +// +// T val0 = JSStandard(pVect1, pVect2, dim); +// T val1 = JSPrecomp(pPrecompVect1, pPrecompVect2, dim); +// +// bool bug = false; +// +// T AbsDiff1 = fabs(val1 - val0); +// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); +// +// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { +// cerr << "Bug JS (1) " << typeid(T).name() << " !!! 
Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// T val2 = JSPrecompApproxLog(pPrecompVect1, pPrecompVect2, dim); +// T val3 = JSPrecompSIMDApproxLog(pPrecompVect1, pPrecompVect2, dim); +// +// T AbsDiff2 = fabs(val2 - val3); +// T RelDiff2 = AbsDiff2/max(max(fabs(val2),fabs(val3)),T(1e-18)); +// +// if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { +// cerr << "Bug JS (2) " << typeid(T).name() << " !!! Dim = " << dim << " val2 = " << val2 << " val3 = " << val3 << " Diff: " << (val2 - val3) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; +// bug = true; +// } +// +// T AbsDiff3 = fabs(val1 - val2); +// T RelDiff3 = AbsDiff3/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// +// Dist += val1; +// Error += AbsDiff3; +// ++TotalQty; +// +// if (RelDiff3 > 1e-4 && AbsDiff3 > 1e-4) { +// cerr << "Bug JS (3) " << typeid(T).name() << " !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " Diff: " << (val1 - val2) << " RelDiff3: " << RelDiff3 << " AbsDiff2: " << AbsDiff3 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// } +// +// LOG(LIB_INFO) << typeid(T).name() << " JS approximation error: average absolute: " << Error / TotalQty << +// " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; +// +// +// return true; +//} +// +//template +//bool TestRenyiDivAgree(size_t N, size_t dim, size_t Rep, T alpha) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// +// T Dist = 0; +// T Error = 0; +// T TotalQty = 0; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); +// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); +// +// Normalize(pVect1, dim); +// Normalize(pVect2, dim); +// +// T val0 = renyiDivergenceSlow(pVect1, pVect2, dim, alpha); +// T val1 = renyiDivergenceFast(pVect1, pVect2, dim, alpha); +// +// bool bug = false; +// +// T AbsDiff1 = fabs(val1 - val0); +// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); +// +// Error += AbsDiff1; +// ++TotalQty; +// +// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { +// cerr << "Bug Reniy Div. (1) " << typeid(T).name() << " !!! Dim = " << dim +// << "alpha=" << alpha << " val0 = " << val0 << " val1 = " << val1 +// << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 +// << " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// } +// +// LOG(LIB_INFO) << typeid(T).name() << " Renyi Div. approximation error: average absolute: " << Error / TotalQty << +// " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; +// +// +// return true; +//} +// +//template +//bool TestAlphaBetaDivAgree(size_t N, size_t dim, size_t Rep, T alpha, T beta) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// +// T Dist = 0; +// T Error = 0; +// T TotalQty = 0; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); +// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); +// +// Normalize(pVect1, dim); +// Normalize(pVect2, dim); +// +// T val0 = alphaBetaDivergenceSlow(pVect1, pVect2, dim, alpha, beta); +// T val1 = alphaBetaDivergenceFast(pVect1, pVect2, dim, alpha, beta); +// +// bool bug = false; +// +// T AbsDiff1 = fabs(val1 - val0); +// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); +// +// Error += AbsDiff1; +// ++TotalQty; +// +// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { +// cerr << "Bug alpha-beta Div. (1) " << typeid(T).name() << " !!! Dim = " << dim +// << "alpha=" << alpha << " val0 = " << val0 << " val1 = " << val1 +// << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 +// << " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// } +// +// LOG(LIB_INFO) << typeid(T).name() << " alpha-beta div. approximation error: average absolute: " << Error / TotalQty << " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; +// +// +// return true; +//} +// +//bool TestSpearmanFootruleAgree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// PivotIdType* pVect1 = &vect1[0]; +// PivotIdType* pVect2 = &vect2[0]; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandIntVect(pVect1, dim); +// GenRandIntVect(pVect2, dim); +// +// int val0 = SpearmanFootrule(pVect1, pVect2, dim); +// int val1 = SpearmanFootruleSIMD(pVect1, pVect2, dim); +// +// bool bug = false; +// +// +// if (val0 != val1) { +// cerr << "Bug SpearmanFootrule !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// } +// +// +// return true; +//} +// +//bool TestSpearmanRhoAgree(size_t N, size_t dim, size_t Rep) { +// vector vect1(dim), vect2(dim); +// PivotIdType* pVect1 = &vect1[0]; +// PivotIdType* pVect2 = &vect2[0]; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandIntVect(pVect1, dim); +// GenRandIntVect(pVect2, dim); +// +// int val0 = SpearmanRho(pVect1, pVect2, dim); +// int val1 = SpearmanRhoSIMD(pVect1, pVect2, dim); +// +// bool bug = false; +// +// +// if (val0 != val1) { +// cerr << "Bug SpearmanRho !!! 
Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// } +// +// +// return true; +//} +// +//template +//bool TestLPGenericAgree(size_t N, size_t dim, size_t Rep, T power) { +// vector vect1(dim), vect2(dim); +// T* pVect1 = &vect1[0]; +// T* pVect2 = &vect2[0]; +// +// T TotalQty = 0, Error = 0, Dist = 0; +// +// for (size_t i = 0; i < Rep; ++i) { +// for (size_t j = 1; j < N; ++j) { +// GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); +// GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); +// +// T val0 = LPGenericDistance(pVect1, pVect2, dim, power); +// T val1 = LPGenericDistanceOptim(pVect1, pVect2, dim, power); +// +// bool bug = false; +// +// T AbsDiff1 = fabs(val1 - val0); +// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); +// +// T maxRelDiff = 1e-5f; +// T maxAbsDiff = 1e-5f; +// /* +// * For large powers, the difference can be larger, +// * because our approximations are efficient, but not very +// * precise +// */ +// if (power > 8) { maxAbsDiff = maxRelDiff = 1e-3f;} +// if (power > 12) { maxAbsDiff = maxRelDiff = 0.01f;} +// if (power > 22) { maxAbsDiff = maxRelDiff = 0.1f;} +// +// ++TotalQty; +// Error += RelDiff1; +// Dist += val0; +// +// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { +// cerr << "Bug LP" << power << " !!! Dim = " << dim << +// " val1 = " << val1 << " val0 = " << val0 << +// " Diff: " << (val1 - val0) << +// " RelDiff1: " << RelDiff1 << +// " (max for this power: " << maxRelDiff << ") " << +// " AbsDiff1: " << AbsDiff1 << " (max for this power: " << maxAbsDiff << ")" << endl; +// } +// +// if (bug) return false; +// } +// } +// +// if (power < 4) { +// LOG(LIB_INFO) << typeid(T).name() << " LP approximation error: average absolute " << Error / TotalQty << " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; +// +// } +// +// return true; +//} +// +//bool TestBitHammingAgree(size_t N, size_t dim, size_t Rep) { +// size_t WordQty = (dim + 31)/32; +// vector arr(N * WordQty); +// uint32_t* pArr = &arr[0]; +// +// uint32_t *p = pArr; +// for (size_t i = 0; i < N; ++i, p+= WordQty) { +// vector perm(dim); +// GenRandIntVect(&perm[0], dim); +// for (unsigned j = 0; j < dim; ++j) +// perm[j] = perm[j] % 2; +// vector h; +// Binarize(perm, 1, h); +// CHECK(h.size() == WordQty); +// memcpy(p, &h[0], WordQty * sizeof(h[0])); +// } +// +// WallClockTimer t; +// +// t.reset(); +// +// bool res = true; +// +// for (size_t j = 1; j < N; ++j) { +// uint32_t* pVect1 = pArr + j*WordQty; +// uint32_t* pVect2 = pArr + (j-1)*WordQty; +// int d1 = BitHamming(pVect1, pVect2, WordQty); +// int d2 = 0; +// +// for (unsigned t = 0; t < WordQty; ++t) { +// for (unsigned k = 0; k < 32; ++k) { +// d2 += ((pVect1[t]>>k)&1) != ((pVect2[t]>>k)&1); +// } +// } +// if (d1 != d2) { +// cerr << "Bug bit hamming, WordQty = " << WordQty << " d1 = " << d1 << " d2 = " << d2 << endl; +// res = false; +// break; +// } +// } +// +// return res; +//} +// +// +//bool TestSparseAngularDistanceAgree(const string& dataFile, size_t N, size_t Rep) { +// typedef float T; +// +// unique_ptr spaceFast(new SpaceSparseAngularDistanceFast()); +// unique_ptr> spaceReg(new SpaceSparseAngularDistance()); +// +// ObjectVector elemsFast; +// ObjectVector elemsReg; +// vector tmp; +// +// unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); +// spaceFast->UpdateParamsFromFile(*inpStateFast); +// unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); +// spaceReg->UpdateParamsFromFile(*inpStateReg); +// +// CHECK(elemsFast.size() == elemsReg.size()); +// +// N = min(N, elemsReg.size()); +// +// bool bug = false; +// +// float maxRelDiff = 2e-5f; +// float maxAbsDiff = 1e-6f; +// +// for (size_t j = Rep; j < N; ++j) +// 
for (size_t k = j - Rep; k < j; ++k) { +// float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); +// float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); +// +// float AbsDiff1 = fabs(val1 - val2); +// float RelDiff1 = AbsDiff1 / max(max(fabs(val1), fabs(val2)), T(1e-18)); +// +// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { +// cerr << "Bug fast vs non-fast angular dist " << +// " val1 = " << val1 << " val2 = " << val2 << +// " Diff: " << (val1 - val2) << +// " RelDiff1: " << RelDiff1 << +// " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// +// return true; +//} +// +// +// +//bool TestSparseCosineSimilarityAgree(const string& dataFile, size_t N, size_t Rep) { +// typedef float T; +// +// unique_ptr spaceFast(new SpaceSparseCosineSimilarityFast()); +// unique_ptr> spaceReg (new SpaceSparseCosineSimilarity()); +// +// ObjectVector elemsFast; +// ObjectVector elemsReg; +// vector tmp; +// +// unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); +// spaceFast->UpdateParamsFromFile(*inpStateFast); +// unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); +// spaceReg->UpdateParamsFromFile(*inpStateReg); +// +// CHECK(elemsFast.size() == elemsReg.size()); +// +// N = min(N, elemsReg.size()); +// +// bool bug = false; +// +// float maxRelDiff = 1e-5f; +// float maxAbsDiff = 1e-5f; +// +// for (size_t j = Rep; j < N; ++j) +// for (size_t k = j - Rep; k < j; ++k) { +// float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); +// float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); +// +// float AbsDiff1 = fabs(val1 - val2); +// float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// +// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { +// cerr << "Bug fast vs non-fast cosine " << +// " val1 = " << val1 << " val2 = " << val2 << +// " Diff: " << (val1 - val2) << +// " RelDiff1: " << RelDiff1 
<< +// " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// +// return true; +//} +// +//bool TestSparseNegativeScalarProductAgree(const string& dataFile, size_t N, size_t Rep) { +// typedef float T; +// +// unique_ptr spaceFast(new SpaceSparseNegativeScalarProductFast()); +// unique_ptr> spaceReg (new SpaceSparseNegativeScalarProduct()); +// +// ObjectVector elemsFast; +// ObjectVector elemsReg; +// vector tmp; +// +// unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); +// spaceFast->UpdateParamsFromFile(*inpStateFast); +// unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); +// spaceReg->UpdateParamsFromFile(*inpStateReg); +// +// CHECK(elemsFast.size() == elemsReg.size()); +// +// N = min(N, elemsReg.size()); +// +// bool bug = false; +// +// float maxRelDiff = 1e-6f; +// float maxAbsDiff = 1e-6f; +// +// for (size_t j = Rep; j < N; ++j) +// for (size_t k = j - Rep; k < j; ++k) { +// float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); +// float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); +// +// float AbsDiff1 = fabs(val1 - val2); +// float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// +// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { +// cerr << "Bug fast vs non-fast negative scalar/dot product " << +// " val1 = " << val1 << " val2 = " << val2 << +// " Diff: " << (val1 - val2) << +// " RelDiff1: " << RelDiff1 << +// " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// +// return true; +//} +// +//bool TestSparseQueryNormNegativeScalarProductAgree(const string& dataFile, size_t N, size_t Rep) { +// typedef float T; +// +// unique_ptr spaceFast(new SpaceSparseQueryNormNegativeScalarProductFast()); +// unique_ptr> spaceReg (new SpaceSparseQueryNormNegativeScalarProduct()); +// +// ObjectVector elemsFast; +// ObjectVector elemsReg; +// vector tmp; +// +// 
unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); +// spaceFast->UpdateParamsFromFile(*inpStateFast); +// unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); +// spaceReg->UpdateParamsFromFile(*inpStateReg); +// +// CHECK(elemsFast.size() == elemsReg.size()); +// +// N = min(N, elemsReg.size()); +// +// bool bug = false; +// +// float maxRelDiff = 1e-6f; +// float maxAbsDiff = 1e-6f; +// +// for (size_t j = Rep; j < N; ++j) +// for (size_t k = j - Rep; k < j; ++k) { +// float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); +// float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); +// +// float AbsDiff1 = fabs(val1 - val2); +// float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// +// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { +// cerr << "Bug fast vs non-fast QUERY-NORMALIZED negative scalar/dot product " << +// " val1 = " << val1 << " val2 = " << val2 << +// " Diff: " << (val1 - val2) << +// " RelDiff1: " << RelDiff1 << +// " AbsDiff1: " << AbsDiff1 << endl; +// bug = true; +// } +// +// if (bug) return false; +// } +// +// return true; +//} +// +//// Limitation: this is only for spaces without params +//bool TestPivotIndex(const string& spaceName, +// bool useDummyIndex, +// const string& dataFile, size_t dataQty, +// const string& pivotFile, size_t pivotQty) { +// +// LOG(LIB_INFO) << "space: " << spaceName << " real pivot index?: " << !useDummyIndex << " " << +// " dataFile: " << dataFile << " " << +// " pivotFile: " << pivotFile; +// try { +// typedef float T; +// +// AnyParams emptyParams; +// +// unique_ptr> space(SpaceFactoryRegistry::Instance().CreateSpace(spaceName, emptyParams)); +// +// ObjectVector data; +// ObjectVector pivots; +// vector tmp; +// +// float maxRelDiff = 1e-6f; +// float maxAbsDiff = 1e-6f; +// +// unique_ptr inpStateFast(space->ReadDataset(data, tmp, dataFile, dataQty)); +// space->UpdateParamsFromFile(*inpStateFast); +// 
space->ReadDataset(pivots, tmp, pivotFile, pivotQty); +// +// unique_ptr> pivIndx(useDummyIndex ? +// new DummyPivotIndex(*space, pivots) +// : +// space->CreatePivotIndex(pivots, +// 0 /* Let's not test using the hashing trick here, b/c distances would be somewhat different */)); +// +// for (size_t did = 0; did < dataQty; ++did) { +// vector vDst; +// pivIndx->ComputePivotDistancesIndexTime(data[did], vDst); +// CHECK_MSG(vDst.size() == pivotQty, "ComputePivotDistancesIndexTime returns incorrect # of elements different from the # of pivots"); +// +// for (size_t pid = 0; pid < pivotQty; ++pid) { +// T val2 = space->IndexTimeDistance(pivots[pid], data[did]); +// T val1 = vDst[pid]; +// +// float AbsDiff1 = fabs(val1 - val2); +// float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); +// +// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { +// cerr << "Bug in fast computation of all-pivot distance, " << +// " space: " << spaceName << " real pivot index?: " << !useDummyIndex << endl << +// " dataFile: " << dataFile << endl << +// " pivotFile: " << pivotFile << endl << +// " data index: " << did << " pivot index: " << pid << endl << +// " val1 = " << val1 << " val2 = " << val2 << +// " Diff: " << (val1 - val2) << +// " RelDiff1: " << RelDiff1 << +// " AbsDiff1: " << AbsDiff1 << endl; +// return false; +// } +// } +// } +// } catch (const exception& e) { +// LOG(LIB_INFO) << "Got exception while testing: " << e.what(); +// return false; +// } +// return true; +//} +// +// +// +// +//#ifdef DISABLE_LONG_TESTS +//TEST(DISABLE_TestAgree) { +//#else +//TEST(TestAgree) { +//#endif +// int nTest = 0; +// int nFail = 0; +// +// nTest++; +// nFail += !TestSparseAngularDistanceAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); +// +// nTest++; +// nFail += !TestSparseAngularDistanceAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); +// +// nTest++; +// nFail += !TestSparseCosineSimilarityAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); +// +// 
nTest++; +// nFail += !TestSparseCosineSimilarityAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); +// +// +// nTest++; +// nFail += !TestSparseNegativeScalarProductAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); +// +// nTest++; +// nFail += !TestSparseNegativeScalarProductAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); +// +// nTest++; +// nFail += !TestSparseQueryNormNegativeScalarProductAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); +// +// nTest++; +// nFail += !TestSparseQueryNormNegativeScalarProductAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); +// +// +// /* +// * 32 should be more than enough for almost all methods, +// * where loop-unrolling includes at most 16 distance computations. +// * +// * Bit-Hamming is an exception. +// * +// */ +// for (unsigned dim = 1; dim <= 1024; dim+=2) { +// LOG(LIB_INFO) << "Dim = " << dim; +// +// nFail += !TestBitHammingAgree(1000, dim, 1000); +// } +// +// for (unsigned dim = 1; dim <= 32; ++dim) { +// LOG(LIB_INFO) << "Dim = " << dim; +// +// /* +// * This is a costly check, we don't need to do it for large # dimensions. +// * Anyways, the function is not using any loop unrolling, so 8 should be sufficient. 
+// */ +// if (dim <= 8) { +// +// for (float power = 0.125; power <= 32; power += 0.125) { +// TestLPGenericAgree(1024, dim, 10, power); +// } +// for (double power = 0.125; power <= 32; power += 0.125) { +// TestLPGenericAgree(1024, dim, 10, power); +// } +// +// // In the case of Renyi divergence 0 < alpha < 1, 1 < alpha < infinity +// // https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R%C3%A9nyi_divergence +// for (float alpha = 0.125; alpha <= 2; alpha += 0.125) { +// if (fabs(alpha - 1) < 1e-6) continue; +// TestRenyiDivAgree(1024, dim, 10, alpha); +// } +// for (double alpha = 0.125; alpha <= 2; alpha += 0.125) { +// if (fabs(alpha - 1) < 1e-6) continue; +// TestRenyiDivAgree(1024, dim, 10, alpha); +// } +// +// for (float alpha = -2; alpha <= 2; alpha += 0.5) +// for (float beta = -2; beta <= 2; beta += 0.5) +// { +// TestAlphaBetaDivAgree(1024, dim, 10, alpha, beta); +// } +// +// for (double alpha = -2; alpha <= 2; alpha += 0.5) +// for (double beta = -2; beta <= 2; beta += 0.5) +// { +// TestAlphaBetaDivAgree(1024, dim, 10, alpha, beta); +// } +// } +// +// nTest++; +// nFail += !TestNormScalarProductAgree(1024, dim, 10); +// nTest++; +// nFail += !TestNormScalarProductAgree(1024, dim, 10); +// +// nTest++; +// nFail += !TestScalarProductAgree(1024, dim, 10); +// nTest++; +// nFail += !TestScalarProductAgree(1024, dim, 10); +// +// nTest++; +// nFail += !TestSpearmanFootruleAgree(1024, dim, 10); +// +// nTest++; +// nFail += !TestSpearmanRhoAgree(1024, dim, 10); +// +// nTest++; +// nFail += !TestJSAgree(1024, dim, 10, 0.5); +// nTest++; +// nFail += !TestJSAgree(1024, dim, 10, 0.5); +// +// nTest++; +// nFail += !TestKLGeneralAgree(1024, dim, 10); +// nTest++; +// nFail += !TestKLGeneralAgree(1024, dim, 10); +// +// nTest++; +// nFail += !TestLInfAgree(1024, dim, 10); +// nTest++; +// nFail += !TestLInfAgree(1024, dim, 10); +// +// nTest++; +// nFail += !TestL1Agree(1024, dim, 10); +// nTest++; +// nFail += !TestL1Agree(1024, dim, 10); +// +// nTest++; 
+// nFail += !TestL2Agree(1024, dim, 10); +// nTest++; +// nFail += !TestL2Agree(1024, dim, 10); +// +// nTest++; +// nFail += !TestKLAgree(1024, dim, 10); +// nTest++; +// nFail += !TestKLAgree(1024, dim, 10); +// +// nTest++; +// nFail += !TestItakuraSaitoAgree(1024, dim, 10); +// nTest++; +// nFail += !TestItakuraSaitoAgree(1024, dim, 10); +// } +// +// LOG(LIB_INFO) << nTest << " (sub) tests performed " << nFail << " failed"; +// +// EXPECT_EQ(0, nFail); +//} +// +//#ifdef DISABLE_LONG_TESTS +//TEST(DISABLE_TestAgreePivotIndex) { +//#else +//TEST(TestAgreePivotIndex) { +//#endif +// int nTest = 0; +// int nFail = 0; +// +// const size_t dataQty = 1000; +// const size_t pivotQty = 100; +// +// vector vDataFiles = {"sparse_5K.txt", "sparse_wiki_5K.txt"}; +// vector vSpaces = {SPACE_SPARSE_COSINE_SIMILARITY_FAST, SPACE_SPARSE_ANGULAR_DISTANCE_FAST, +// SPACE_SPARSE_NEGATIVE_SCALAR_FAST, SPACE_SPARSE_QUERY_NORM_NEGATIVE_SCALAR_FAST}; +// const string pivotFile = "sparse_pivots1K_termQty5K_maxId_100K.txt"; +// +// for (string spaceName : vSpaces) +// for (string dataFile : vDataFiles) { +// // 1. test with a dummy pivot index +// nTest++; +// nFail += !TestPivotIndex(spaceName, true, sampleDataPrefix + dataFile, dataQty, sampleDataPrefix + pivotFile, pivotQty); +// +// // 2. 
test with a real pivot index +// nTest++; +// nFail += !TestPivotIndex(spaceName, false, sampleDataPrefix + dataFile, dataQty, sampleDataPrefix + pivotFile, pivotQty); +// } +// +// LOG(LIB_INFO) << nTest << " (sub) tests performed " << nFail << " failed"; +// +// EXPECT_EQ(0, nFail); +//} +// +// } // namespace similarity - +// diff --git a/similarity_search/test/test_space_serial.cc b/similarity_search/test/test_space_serial.cc index 53e1017..e8db105 100644 --- a/similarity_search/test/test_space_serial.cc +++ b/similarity_search/test/test_space_serial.cc @@ -134,7 +134,10 @@ bool fullTest(const vector& dataSetStr, size_t maxNumRec, const string& dataSet1.push_back(space->CreateObjFromStr(id++, -1, s, NULL).release()); vExternIds1.push_back(ss.str()); - + +// std::cout << space->CreateStrFromObj(dataSet1[dataSet1.size() - 1], NULL) << std::endl; + std::cout << s << std::endl; + if (id >= maxNumRec) break; } @@ -150,53 +153,53 @@ const char *emptyParams[] = {NULL}; const char *paramsDistL2[] = {"dist=" SPACE_WORD_EMBED_DIST_L2, NULL}; const char *paramsDistCosine[] = {"dist=" SPACE_WORD_EMBED_DIST_COSINE, NULL}; -TEST(Test_WordEmbedSpace) { - for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt", SPACE_WORD_EMBED, paramsDistL2, true)); - EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt",SPACE_WORD_EMBED, paramsDistCosine, true)); - EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt", SPACE_WORD_EMBED, paramsDistL2, true)); - EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt",SPACE_WORD_EMBED, paramsDistCosine, true)); - } -} - -TEST(Test_DenseVectorSpace) { - for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "l2", emptyParams, false)); - EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, 
"tmp_out_file.txt", "l2", emptyParams, false)); - } -} - -TEST(Test_DenseVectorKLDiv) { - // Test KL-diverg. with and without precomputation of logarithms - for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenfast", emptyParams, false)); - EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenfast", emptyParams, false)); - EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenslow", emptyParams, false)); - EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenslow", emptyParams, false)); - } -} - -TEST(Test_SparseVectorSpace) { - for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse", emptyParams, false)); - EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse", emptyParams, false)); - EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse", emptyParams, false)); - EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse", emptyParams, false)); - } -} - -TEST(Test_SparseVectorSpaceFast) { - for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse_fast", emptyParams, false)); - EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse_fast", emptyParams, false)); - } -} - -TEST(Test_StringSpace) { - for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest("dna32_4_5K.txt", maxNumRec, "tmp_out_file.txt", "leven", emptyParams, false)); - } -} +//TEST(Test_WordEmbedSpace) { +// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { +// EXPECT_EQ(true, 
fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt", SPACE_WORD_EMBED, paramsDistL2, true)); +// EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt",SPACE_WORD_EMBED, paramsDistCosine, true)); +// EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt", SPACE_WORD_EMBED, paramsDistL2, true)); +// EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt",SPACE_WORD_EMBED, paramsDistCosine, true)); +// } +//} +// +//TEST(Test_DenseVectorSpace) { +// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { +// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "l2", emptyParams, false)); +// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "l2", emptyParams, false)); +// } +//} +// +//TEST(Test_DenseVectorKLDiv) { +// // Test KL-diverg. with and without precomputation of logarithms +// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { +// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenfast", emptyParams, false)); +// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenfast", emptyParams, false)); +// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenslow", emptyParams, false)); +// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenslow", emptyParams, false)); +// } +//} +// +//TEST(Test_SparseVectorSpace) { +// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { +// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse", emptyParams, false)); +// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse", emptyParams, false)); +// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse", emptyParams, false)); +// EXPECT_EQ(true, 
fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse", emptyParams, false)); +// } +//} +// +//TEST(Test_SparseVectorSpaceFast) { +// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { +// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse_fast", emptyParams, false)); +// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse_fast", emptyParams, false)); +// } +//} +// +//TEST(Test_StringSpace) { +// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { +// EXPECT_EQ(true, fullTest("dna32_4_5K.txt", maxNumRec, "tmp_out_file.txt", "leven", emptyParams, false)); +// } +//} TEST(Test_BitHamming) { vector testVect; @@ -228,19 +231,19 @@ TEST(Test_BitJaccard) { testVect.push_back(ss.str()); } for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest(testVect, maxNumRec, "tmp_out_file.txt", "bit_jaccard", emptyParams, false)); + EXPECT_EQ(true, fullTest(testVect, maxNumRec, "tmp_out_file.txt", "bit_jaccard", emptyParams, false)); } } -#if defined(WITH_EXTRAS) -TEST(Test_SQFD) { - const char* sqfdParams[] = {"alpha=1", NULL} ; - for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); - EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); - } -} -#endif +//#if defined(WITH_EXTRAS) +//TEST(Test_SQFD) { +// const char* sqfdParams[] = {"alpha=1", NULL} ; +// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { +// EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); +// EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); +// } +//} +//#endif } From 
ea72c886c32389cf7f1383d6fd7b629dba195627 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Sun, 17 Feb 2019 01:07:01 -0800 Subject: [PATCH 03/17] use 32 bit for bit_jaccard for n ow b/c 64 bit causes mysterious problems --- python_bindings/setup.py | 1 + python_bindings/tests/bindings_test.py | 34 ++++++++++++++----- .../include/factory/init_spaces.h | 4 +-- similarity_search/test/test_space_serial.cc | 2 +- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/python_bindings/setup.py b/python_bindings/setup.py index 0808994..c01800e 100755 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -21,6 +21,7 @@ if os.path.exists(library_file): # if we have a prebuilt nmslib library file, use that. extra_objects.append(library_file) + print("Found: " + os.path.abspath(library_file)) else: raise RuntimeError("can't find prebuild lib: " + os.path.abspath(library_file)) # # Otherwise build all the files here directly (excluding extras which need eigen/boost) diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index f134738..a42939e 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -6,6 +6,7 @@ import numpy.testing as npt import nmslib +import psutil def get_exact_cosine(row, data, N=10): @@ -92,21 +93,30 @@ def _get_index(self, space='bit_jaccard'): raise NotImplementedError() def testKnnQuery(self): - np.random.seed(23) - nbits = 128 + nbits = 2048 + chunk_size = 1000 + ps_proc = psutil.Process() + print(f"\n{ps_proc.memory_info()}") index = self._get_index() - for i in range(100): - a = np.random.rand(nbits) > 0.5 - s = " ".join(["1" if e else "0" for e in a]) - index.addDataPoint(id=i, data=s) + np.random.seed(23) + for i in range(0, 10000, chunk_size): + strs = [] + for j in range(chunk_size): + a = np.random.rand(nbits) > 0.5 + s = " ".join(["1" if e else "0" for e in a]) + strs.append(s) + index.addDataPointBatch(ids=np.arange(i, i + chunk_size), 
data=strs) + + print(f"\n{ps_proc.memory_info()}") index.createIndex() + print(f"\n{ps_proc.memory_info()}") a = np.ones(nbits) s = " ".join(["1" if e else "0" for e in a]) ids, distances = index.knnQuery(s, k=10) - print(ids) + # print(ids) print(distances) # self.assertTrue(get_hitrate(get_exact_cosine(row, data), ids) >= 5) @@ -170,8 +180,14 @@ def _get_index(self, space='cosinesimil'): class BitJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin): def _get_index(self, space='bit_jaccard'): - return nmslib.init(method='hnsw', space='bit_jaccard', data_type=nmslib.DataType.OBJECT_AS_STRING, - dtype=nmslib.DistType.DOUBLE) + return nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.FLOAT) + + +class SparseJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin): + def _get_index(self, space='jaccard_sparse'): + return nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.FLOAT) # class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin): diff --git a/similarity_search/include/factory/init_spaces.h b/similarity_search/include/factory/init_spaces.h index dd0aae7..8abee56 100644 --- a/similarity_search/include/factory/init_spaces.h +++ b/similarity_search/include/factory/init_spaces.h @@ -46,8 +46,8 @@ inline void initSpaces() { // Registering binary/bit Hamming/Jaccard SpaceFactoryRegistry::CreateFuncPtr bit_hamming_func_ptr = CreateBitHamming; REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, bit_hamming_func_ptr ) - SpaceFactoryRegistry::CreateFuncPtr bit_jaccard_func_ptr = CreateBitJaccard; - REGISTER_SPACE_CREATOR(double, SPACE_BIT_JACCARD, bit_jaccard_func_ptr ) + SpaceFactoryRegistry::CreateFuncPtr bit_jaccard_func_ptr = CreateBitJaccard; + REGISTER_SPACE_CREATOR(float, SPACE_BIT_JACCARD, bit_jaccard_func_ptr ) // Registering the Levensthein-distance: regular and normalized REGISTER_SPACE_CREATOR(int, SPACE_LEVENSHTEIN, 
CreateLevenshtein) diff --git a/similarity_search/test/test_space_serial.cc b/similarity_search/test/test_space_serial.cc index e8db105..5dd36b7 100644 --- a/similarity_search/test/test_space_serial.cc +++ b/similarity_search/test/test_space_serial.cc @@ -231,7 +231,7 @@ TEST(Test_BitJaccard) { testVect.push_back(ss.str()); } for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { - EXPECT_EQ(true, fullTest(testVect, maxNumRec, "tmp_out_file.txt", "bit_jaccard", emptyParams, false)); + EXPECT_EQ(true, fullTest(testVect, maxNumRec, "tmp_out_file.txt", "bit_jaccard", emptyParams, false)); } } From c37e33db31deb3b86e01bd6b971e1541e0861ada Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Sun, 17 Feb 2019 02:34:58 -0800 Subject: [PATCH 04/17] fix tanimoto --- python_bindings/tests/bindings_test.py | 170 ++++++++++++++++++++++--- similarity_search/include/distcomp.h | 2 +- 2 files changed, 152 insertions(+), 20 deletions(-) diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index a42939e..b5b1062 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -7,6 +7,124 @@ import nmslib import psutil +import logging +import multiprocessing +import time +import os +import threading + +MB = 1024 * 1024 + + +class StoppableThread(threading.Thread): + """Thread class with a stop() method. 
The thread itself has to check + regularly for the stopped() condition.""" + + def __init__(self, *args, **kwargs): + super().__init__() + self._stop_event = threading.Event() + + def stop(self): + self._stop_event.set() + + def stopped(self): + return self._stop_event.is_set() + + +class Timer: + """ Context manager for timing named blocks of code """ + def __init__(self, name, logger=None): + self.name = name + self.logger = logger if logger else logging.getLogger() + + def __enter__(self): + self.start = time.time() + self.logger.debug("Starting {}".format(self.name)) + + def __exit__(self, type, value, trace): + self.logger.info("{}: {:0.2f}s".format(self.name, time.time() - self.start)) + + +class PeakMemoryUsage: + class Worker(StoppableThread): + def __init__(self, interval, *args, **kwargs): + super().__init__(*args, **kwargs) + self.interval = interval + self.max_rss = self.max_vms = 0 + + def run(self): + process = psutil.Process() + while not self.stopped(): + mem = process.memory_info() + self.max_rss = max(self.max_rss, mem.rss) + self.max_vms = max(self.max_vms, mem.vms) + time.sleep(self.interval) + + """ Context manager to calculate peak memory usage in a statement block """ + def __init__(self, name, logger=None, interval=1): + self.name = name + self.logger = logger if logger else logging.getLogger() + self.interval = interval + self.start = time.time() + self.worker = None + + def __enter__(self): + if self.interval > 0: + pid = os.getpid() + mem = psutil.Process(pid).memory_info() + self.start_rss, self.start_vms = mem.rss, mem.vms + + self.worker = PeakMemoryUsage.Worker(self.interval) + self.worker.start() + return self + + def __exit__(self, _, value, trace): + if self.worker: + self.worker.stop() + self.worker.join() + self.logger.warning("Peak memory usage for '{}' in MBs: orig=(rss={:0.1f} vms={:0.1f}) " + "peak=(rss={:0.1f} vms={:0.1f}) in {:0.2f}s" + .format(self.name, self.start_rss / MB, self.start_vms / MB, + self.worker.max_rss / MB, 
+ self.worker.max_vms / MB, time.time() - self.start)) + + +class PsUtil(object): + def __init__(self, attr=('virtual_memory',), proc_attr=None, + logger=None, interval=60): + """ attr can be multiple methods of psutil (e.g. attr=['virtual_memory', 'cpu_times_percent']) """ + self.ps_mon = None + self.attr = attr + self.proc_attr = proc_attr + self.logger = logger if logger else logging.getLogger() + self.interval = interval + + def psutil_worker(self, pid): + root_proc = psutil.Process(pid) + while True: + for attr in self.attr: + self.logger.warning("PSUTIL {}".format(getattr(psutil, attr)())) + if self.proc_attr: + procs = set(root_proc.children(recursive=True)) + procs.add(root_proc) + procs = sorted(procs, key=lambda p: p.pid) + + for proc in procs: + self.logger.warning("PSUTIL process={}: {}" + .format(proc.pid, proc.as_dict(self.proc_attr))) + + time.sleep(self.interval) + + def __enter__(self): + if self.interval > 0: + self.ps_mon = multiprocessing.Process(target=self.psutil_worker, args=(os.getpid(),)) + self.ps_mon.start() + time.sleep(1) # sleep so the first iteration doesn't include statements in the PsUtil context + return self + + def __exit__(self, type, value, trace): + if self.ps_mon is not None: + self.ps_mon.terminate() def get_exact_cosine(row, data, N=10): @@ -19,6 +137,14 @@ def get_hitrate(ground_truth, ids): return len(set(i for i, _ in ground_truth).intersection(ids)) +def bit_vector_to_str(bit_vect): + return " ".join(["1" if e else "0" for e in bit_vect]) + + +def bit_vector_sparse_str(bit_vect): + return " ".join([str(k) for k, b in enumerate(bit_vect) if b]) + + class DenseIndexTestMixin(object): def _get_index(self, space='cosinesimil'): raise NotImplementedError() @@ -93,33 +219,39 @@ def _get_index(self, space='bit_jaccard'): raise NotImplementedError() def testKnnQuery(self): - nbits = 2048 - chunk_size = 1000 + nbits = 512 + chunk_size = 10000 + num_elems = 100000 ps_proc = psutil.Process() - print(f"\n{ps_proc.memory_info()}") + 
# print(f"\n{ps_proc.memory_info()}") index = self._get_index() - - np.random.seed(23) - for i in range(0, 10000, chunk_size): - strs = [] - for j in range(chunk_size): - a = np.random.rand(nbits) > 0.5 - s = " ".join(["1" if e else "0" for e in a]) - strs.append(s) - index.addDataPointBatch(ids=np.arange(i, i + chunk_size), data=strs) - - print(f"\n{ps_proc.memory_info()}") - index.createIndex() - print(f"\n{ps_proc.memory_info()}") + if "bit_jaccard" in str(index): + bit_vector_str_func = bit_vector_to_str + else: + bit_vector_str_func = bit_vector_sparse_str + + # logging.basicConfig(level=logging.INFO) + # with PsUtil(interval=2, proc_attr=["memory_info"]): + with PeakMemoryUsage(f"AddData: vector={nbits}-bit elems={num_elems}"): + np.random.seed(23) + for i in range(0, num_elems, chunk_size): + strs = [] + for j in range(chunk_size): + a = np.random.rand(nbits) > 0.5 + strs.append(bit_vector_str_func(a)) + index.addDataPointBatch(ids=np.arange(i, i + chunk_size), data=strs) + + # print(f"\n{ps_proc.memory_info()}") + with PeakMemoryUsage(f"CreateIndex: vector={nbits}-bit of elems={num_elems}"): + index.createIndex() + # print(f"\n{ps_proc.memory_info()}") a = np.ones(nbits) - s = " ".join(["1" if e else "0" for e in a]) - ids, distances = index.knnQuery(s, k=10) + ids, distances = index.knnQuery(bit_vector_str_func(a), k=10) # print(ids) print(distances) # self.assertTrue(get_hitrate(get_exact_cosine(row, data), ids) >= 5) - # def testKnnQueryBatch(self): # np.random.seed(23) # data = np.random.randn(1000, 10).astype(np.float32) diff --git a/similarity_search/include/distcomp.h b/similarity_search/include/distcomp.h index d16e80b..84e960b 100644 --- a/similarity_search/include/distcomp.h +++ b/similarity_search/include/distcomp.h @@ -233,7 +233,7 @@ dist_t inline BitJaccard(const dist_uint_t* a, const dist_uint_t* b, size_t qty) den += __builtin_popcount(a[i] | b[i]); } - return dist_t(num) / dist_t(den); + return 1 - (dist_t(num) / dist_t(den)); } //unsigned 
BitHamming(const uint32_t* a, const uint32_t* b, size_t qty); From 1cbf780c001748be522ae21f401798d28cb25879 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Sun, 17 Feb 2019 10:02:17 -0800 Subject: [PATCH 05/17] profile multiple conditions --- python_bindings/tests/bindings_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index b5b1062..f812160 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -219,9 +219,12 @@ def _get_index(self, space='bit_jaccard'): raise NotImplementedError() def testKnnQuery(self): - nbits = 512 + for num_elems in [30000, 100000, 300000, 1000000]: + for nbits in [512, 2048]: + self._testKnnQuery(nbits, num_elems) + + def _testKnnQuery(self, nbits, num_elems): chunk_size = 10000 - num_elems = 100000 ps_proc = psutil.Process() # print(f"\n{ps_proc.memory_info()}") From 94023177f099cd6560d8947e5eb3e161c046d895 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Mon, 18 Feb 2019 13:01:04 -0800 Subject: [PATCH 06/17] add jaccard timing --- python_bindings/tests/jaccard_comparison.py | 165 ++++++++++++++++++++ python_bindings/tests/jaccard_comparison.sh | 9 ++ 2 files changed, 174 insertions(+) create mode 100644 python_bindings/tests/jaccard_comparison.py create mode 100644 python_bindings/tests/jaccard_comparison.sh diff --git a/python_bindings/tests/jaccard_comparison.py b/python_bindings/tests/jaccard_comparison.py new file mode 100644 index 0000000..7dedf6a --- /dev/null +++ b/python_bindings/tests/jaccard_comparison.py @@ -0,0 +1,165 @@ +import sys +import numpy as np +import nmslib +import psutil +import logging +import multiprocessing +import time +import os +import threading + +MB = 1024 * 1024 +CHUNK_SIZE = 10000 + + +class StoppableThread(threading.Thread): + """Thread class with a stop() method. 
The thread itself has to check + regularly for the stopped() condition.""" + + def __init__(self, *args, **kwargs): + super().__init__() + self._stop_event = threading.Event() + + def stop(self): + self._stop_event.set() + + def stopped(self): + return self._stop_event.is_set() + + +class Timer: + """ Context manager for timing named blocks of code """ + def __init__(self, name, logger=None): + self.name = name + self.logger = logger if logger else logging.getLogger() + + def __enter__(self): + self.start = time.time() + self.logger.debug("Starting {}".format(self.name)) + + def __exit__(self, type, value, trace): + self.logger.info("{}: {:0.2f}s".format(self.name, time.time() - self.start)) + + +class PeakMemoryUsage: + class Worker(StoppableThread): + def __init__(self, interval, *args, **kwargs): + super().__init__(*args, **kwargs) + self.interval = interval + self.max_rss = self.max_vms = 0 + + def run(self): + process = psutil.Process() + while not self.stopped(): + mem = process.memory_info() + self.max_rss = max(self.max_rss, mem.rss) + self.max_vms = max(self.max_vms, mem.vms) + time.sleep(self.interval) + + """ Context manager to calculate peak memory usage in a statement block """ + def __init__(self, name, logger=None, interval=1): + self.name = name + self.logger = logger if logger else logging.getLogger() + self.interval = interval + self.start = time.time() + self.worker = None + + def __enter__(self): + if self.interval > 0: + pid = os.getpid() + mem = psutil.Process(pid).memory_info() + self.start_rss, self.start_vms = mem.rss, mem.vms + + self.worker = PeakMemoryUsage.Worker(self.interval) + self.worker.start() + return self + + def __exit__(self, _, value, trace): + if self.worker: + self.worker.stop() + self.worker.join() + self.logger.warning("Peak memory usage for '{}' in MBs: orig=(rss={:0.1f} vms={:0.1f}) " + "peak=(rss={:0.1f} vms={:0.1f}) in {:0.2f}s" + .format(self.name, self.start_rss / MB, self.start_vms / MB, + self.worker.max_rss / MB, 
+ self.worker.max_vms / MB, time.time() - self.start)) + + +class PsUtil(object): + def __init__(self, attr=('virtual_memory',), proc_attr=None, + logger=None, interval=60): + """ attr can be multiple methods of psutil (e.g. attr=['virtual_memory', 'cpu_times_percent']) """ + self.ps_mon = None + self.attr = attr + self.proc_attr = proc_attr + self.logger = logger if logger else logging.getLogger() + self.interval = interval + + def psutil_worker(self, pid): + root_proc = psutil.Process(pid) + while True: + for attr in self.attr: + self.logger.warning("PSUTIL {}".format(getattr(psutil, attr)())) + if self.proc_attr: + procs = set(root_proc.children(recursive=True)) + procs.add(root_proc) + procs = sorted(procs, key=lambda p: p.pid) + + for proc in procs: + self.logger.warning("PSUTIL process={}: {}" + .format(proc.pid, proc.as_dict(self.proc_attr))) + + time.sleep(self.interval) + + def __enter__(self): + if self.interval > 0: + self.ps_mon = multiprocessing.Process(target=self.psutil_worker, args=(os.getpid(),)) + self.ps_mon.start() + time.sleep(1) # sleep so the first iteration doesn't include statements in the PsUtil context + return self + + def __exit__(self, type, value, trace): + if self.ps_mon is not None: + self.ps_mon.terminate() + + +def bit_vector_to_str(bit_vect): + return " ".join(["1" if e else "0" for e in bit_vect]) + + +def bit_vector_sparse_str(bit_vect): + return " ".join([str(k) for k, b in enumerate(bit_vect) if b]) + + +def run(space, num_elems, nbits): + if space == "bit_jaccard": + bit_vector_str_func = bit_vector_to_str + index = nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.FLOAT) + else: + bit_vector_str_func = bit_vector_sparse_str + index = nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.FLOAT) + + with PeakMemoryUsage(f"All: space={space} nbits={nbits} elems={num_elems}"): + np.random.seed(23) + for i in range(0, 
num_elems, CHUNK_SIZE): + strs = [] + for j in range(CHUNK_SIZE): + a = np.random.rand(nbits) > 0.5 + strs.append(bit_vector_str_func(a)) + index.addDataPointBatch(ids=np.arange(i, i + CHUNK_SIZE), data=strs) + + index.createIndex() + + a = np.ones(nbits) + ids, distances = index.knnQuery(bit_vector_str_func(a), k=10) + print(distances) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.WARNING) + space = sys.argv[1] + num_elems = int(sys.argv[2]) + nbits = int(sys.argv[3]) + run(space, num_elems, nbits) diff --git a/python_bindings/tests/jaccard_comparison.sh b/python_bindings/tests/jaccard_comparison.sh new file mode 100644 index 0000000..9b1ed23 --- /dev/null +++ b/python_bindings/tests/jaccard_comparison.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +for space in bit_jaccard jaccard_sparse; do + for num_elems in 30000 100000 300000 1000000 3000000 10000000 30000000; do + for nbits in 512 20148; do + python jaccard_comparison.py $space $num_elems $nbits + done + done +done From a51d37881bc7e4eea2a5ae4cc99a7c2cbc724ac5 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Mon, 18 Feb 2019 19:22:54 -0800 Subject: [PATCH 07/17] fix typos --- python_bindings/tests/jaccard_comparison.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python_bindings/tests/jaccard_comparison.sh b/python_bindings/tests/jaccard_comparison.sh index 9b1ed23..3bdf8b7 100644 --- a/python_bindings/tests/jaccard_comparison.sh +++ b/python_bindings/tests/jaccard_comparison.sh @@ -1,8 +1,10 @@ #!/bin/bash -for space in bit_jaccard jaccard_sparse; do +set -e + +for space in jaccard_sparse; do for num_elems in 30000 100000 300000 1000000 3000000 10000000 30000000; do - for nbits in 512 20148; do + for nbits in 512 2048; do python jaccard_comparison.py $space $num_elems $nbits done done From 0f26eb3f2005f9bf2ffa237c023491c7427f4b31 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Mon, 18 Feb 2019 19:24:04 -0800 Subject: [PATCH 08/17] update jaccard comp plot --- 
.../tests/jaccard_comparison_plot.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 python_bindings/tests/jaccard_comparison_plot.py diff --git a/python_bindings/tests/jaccard_comparison_plot.py b/python_bindings/tests/jaccard_comparison_plot.py new file mode 100644 index 0000000..02a8117 --- /dev/null +++ b/python_bindings/tests/jaccard_comparison_plot.py @@ -0,0 +1,39 @@ +import pandas as pd +import sys +import statsmodels.api as sm +from plotnine import ggplot, geom_point, aes, stat_smooth, geom_line, scale_x_log10, \ + scale_y_log10, theme, element_text, ylim + + +if __name__ == "__main__": + df = pd.read_csv(sys.argv[1]) + df["space_nbits"] = df.space.astype(str) + "_" + df.nbits.astype(str) + df.memory = df.memory.astype(float) + df.time = df.time.astype(float) + print(df.info()) + + for col in ["time", "memory"]: + funcs = [] + for space_nbits in df.space_nbits.unique(): + sub_df = df.loc[df.space_nbits == space_nbits] + model = sm.OLS(sub_df.num_elems, sm.add_constant(sub_df[col])) + params = model.fit().params + func = lambda x: params.const + x * getattr(params, col) + funcs.append(func) + + p = (ggplot(df, aes("num_elems", col, color="space_nbits")) + + geom_point() + geom_line() + + scale_x_log10(limits=[10000,10000000]) + + scale_y_log10(limits=[3,10000]) + + theme(axis_text_x=element_text(rotation=90, hjust=1))) + p.save(filename=col + ".png", height=5, width=5, units='in', dpi=300) + + # p = ggplot(aes(x="num_elems", y=col, color="space_nbits"), data=df) + geom_line() + geom_point() + stat_function(fun=funcs[0]) + # p.make() + + # fig = plt.gcf() + # ax = plt.gca() + # plt.gca().set_xscale('log') + # plt.gca().set_yscale('log') + # + # ggsave(plot=p, filename=col + ".png") From 03071c2b384f56227c90e5e04e175294dfe53b86 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Thu, 21 Feb 2019 09:13:54 -0800 Subject: [PATCH 09/17] move set seeed --- python_bindings/tests/jaccard_comparison.py | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/python_bindings/tests/jaccard_comparison.py b/python_bindings/tests/jaccard_comparison.py index 7dedf6a..26d402e 100644 --- a/python_bindings/tests/jaccard_comparison.py +++ b/python_bindings/tests/jaccard_comparison.py @@ -132,6 +132,7 @@ def bit_vector_sparse_str(bit_vect): def run(space, num_elems, nbits): + np.random.seed(23) if space == "bit_jaccard": bit_vector_str_func = bit_vector_to_str index = nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, @@ -142,7 +143,6 @@ def run(space, num_elems, nbits): dtype=nmslib.DistType.FLOAT) with PeakMemoryUsage(f"All: space={space} nbits={nbits} elems={num_elems}"): - np.random.seed(23) for i in range(0, num_elems, CHUNK_SIZE): strs = [] for j in range(CHUNK_SIZE): @@ -158,6 +158,8 @@ def run(space, num_elems, nbits): if __name__ == "__main__": + np.set_printoptions(linewidth=500) + logging.basicConfig(level=logging.WARNING) space = sys.argv[1] num_elems = int(sys.argv[2]) From 56aa99c63e5de1a151be831e6ba8b2534ea2a55e Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Thu, 21 Feb 2019 13:45:27 -0800 Subject: [PATCH 10/17] update unit test --- python_bindings/tests/bindings_test.py | 144 ++++++--------- python_bindings/tests/jaccard_comparison.py | 167 ------------------ python_bindings/tests/jaccard_comparison.sh | 11 -- .../tests/jaccard_comparison_plot.py | 39 ---- 4 files changed, 55 insertions(+), 306 deletions(-) delete mode 100644 python_bindings/tests/jaccard_comparison.py delete mode 100644 python_bindings/tests/jaccard_comparison.sh delete mode 100644 python_bindings/tests/jaccard_comparison_plot.py diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index f812160..4c78875 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -218,94 +218,60 @@ class BitVectorIndexTestMixin(object): def _get_index(self, space='bit_jaccard'): raise NotImplementedError() 
- def testKnnQuery(self): - for num_elems in [30000, 100000, 300000, 1000000]: - for nbits in [512, 2048]: - self._testKnnQuery(nbits, num_elems) + def _get_batches(self, index, nbits, num_elems, chunk_size): + if "bit_" in str(index): + self.bit_vector_str_func = bit_vector_to_str + else: + self.bit_vector_str_func = bit_vector_sparse_str + + batches = [] + for i in range(0, num_elems, chunk_size): + strs = [] + for j in range(chunk_size): + a = np.random.rand(nbits) > 0.5 + strs.append(self.bit_vector_str_func(a)) + batches.append([np.arange(i, i + chunk_size), strs]) + return batches - def _testKnnQuery(self, nbits, num_elems): - chunk_size = 10000 + def testKnnQuery(self): + np.random.seed(23) - ps_proc = psutil.Process() - # print(f"\n{ps_proc.memory_info()}") index = self._get_index() - if "bit_jaccard" in str(index): - bit_vector_str_func = bit_vector_to_str - else: - bit_vector_str_func = bit_vector_sparse_str - - # logging.basicConfig(level=logging.INFO) - # with PsUtil(interval=2, proc_attr=["memory_info"]): - with PeakMemoryUsage(f"AddData: vector={nbits}-bit elems={num_elems}"): - np.random.seed(23) - for i in range(0, num_elems, chunk_size): - strs = [] - for j in range(chunk_size): - a = np.random.rand(nbits) > 0.5 - strs.append(bit_vector_str_func(a)) - index.addDataPointBatch(ids=np.arange(i, i + chunk_size), data=strs) - - # print(f"\n{ps_proc.memory_info()}") - with PeakMemoryUsage(f"CreateIndex: vector={nbits}-bit of elems={num_elems}"): - index.createIndex() - # print(f"\n{ps_proc.memory_info()}") - - a = np.ones(nbits) - ids, distances = index.knnQuery(bit_vector_str_func(a), k=10) - # print(ids) - print(distances) - # self.assertTrue(get_hitrate(get_exact_cosine(row, data), ids) >= 5) - # def testKnnQueryBatch(self): - # np.random.seed(23) - # data = np.random.randn(1000, 10).astype(np.float32) - # - # index = self._get_index() - # index.addDataPointBatch(data) - # index.createIndex() - # - # queries = data[:10] - # results = 
index.knnQueryBatch(queries, k=10) - # for query, (ids, distances) in zip(queries, results): - # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) - # - # # test col-major arrays - # queries = np.asfortranarray(queries) - # results = index.knnQueryBatch(queries, k=10) - # for query, (ids, distances) in zip(queries, results): - # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) - # - # # test custom ids (set id to square of each row) - # index = self._get_index() - # index.addDataPointBatch(data, ids=np.arange(data.shape[0]) ** 2) - # index.createIndex() - # - # queries = data[:10] - # results = index.knnQueryBatch(queries, k=10) - # for query, (ids, distances) in zip(queries, results): - # # convert from square back to row id - # ids = np.sqrt(ids).astype(int) - # self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5) - - # def testReloadIndex(self): - # np.random.seed(23) - # data = np.random.randn(1000, 10).astype(np.float32) - # - # original = self._get_index() - # original.addDataPointBatch(data) - # original.createIndex() - # - # # test out saving/reloading index - # with tempfile.NamedTemporaryFile() as tmp: - # original.saveIndex(tmp.name + ".index") - # - # reloaded = self._get_index() - # reloaded.addDataPointBatch(data) - # reloaded.loadIndex(tmp.name + ".index") - # - # original_results = original.knnQuery(data[0]) - # reloaded_results = reloaded.knnQuery(data[0]) - # npt.assert_allclose(original_results, - # reloaded_results) + batches = self._get_batches(index, 512, 2000, 1000) + for ids, data in batches: + index.addDataPointBatch(ids=ids, data=data) + + index.createIndex() + + s = self.bit_vector_str_func(np.ones(512)) + index.knnQuery(s, k=10) + + def testReloadIndex(self): + np.random.seed(23) + + original = self._get_index() + batches = self._get_batches(original, 512, 2000, 1000) + for ids, data in batches: + original.addDataPointBatch(ids=ids, data=data) + original.createIndex() + + # test 
out saving/reloading index + with tempfile.NamedTemporaryFile() as tmp: + original.saveIndex(tmp.name + ".index") + + reloaded = self._get_index() + for ids, data in batches: + reloaded.addDataPointBatch(ids=ids, data=data) + reloaded.loadIndex(tmp.name + ".index") + + s = self.bit_vector_str_func(np.ones(512)) + original_results = original.knnQuery(s) + reloaded_results = reloaded.knnQuery(s) + original_results = list(zip(list(original_results[0]), list(original_results[1]))) + original_results = sorted(original_results, key=lambda x: x[1]) + reloaded_results = list(zip(list(reloaded_results[0]), list(reloaded_results[1]))) + reloaded_results = sorted(reloaded_results, key=lambda x: x[1]) + npt.assert_allclose(original_results, reloaded_results) class HNSWTestCase(unittest.TestCase, DenseIndexTestMixin): @@ -325,10 +291,10 @@ def _get_index(self, space='jaccard_sparse'): dtype=nmslib.DistType.FLOAT) -# class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin): -# def _get_index(self, space='bit_hamming'): -# return nmslib.init(method='hnsw', space='bit_hamming', data_type=nmslib.DataType.OBJECT_AS_STRING, -# dtype=nmslib.DistType.INT) +class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin): + def _get_index(self, space='bit_hamming'): + return nmslib.init(method='hnsw', space='bit_hamming', data_type=nmslib.DataType.OBJECT_AS_STRING, + dtype=nmslib.DistType.INT) class SWGraphTestCase(unittest.TestCase, DenseIndexTestMixin): diff --git a/python_bindings/tests/jaccard_comparison.py b/python_bindings/tests/jaccard_comparison.py deleted file mode 100644 index 26d402e..0000000 --- a/python_bindings/tests/jaccard_comparison.py +++ /dev/null @@ -1,167 +0,0 @@ -import sys -import numpy as np -import nmslib -import psutil -import logging -import multiprocessing -import time -import os -import threading - -MB = 1024 * 1024 -CHUNK_SIZE = 10000 - - -class StoppableThread(threading.Thread): - """Thread class with a stop() method. 
The thread itself has to check - regularly for the stopped() condition.""" - - def __init__(self, *args, **kwargs): - super().__init__() - self._stop_event = threading.Event() - - def stop(self): - self._stop_event.set() - - def stopped(self): - return self._stop_event.is_set() - - -class Timer: - """ Context manager for timing named blocks of code """ - def __init__(self, name, logger=None): - self.name = name - self.logger = logger if logger else logging.getLogger() - - def __enter__(self): - self.start = time.time() - self.logger.debug("Starting {}".format(self.name)) - - def __exit__(self, type, value, trace): - self.logger.info("{}: {:0.2f}s".format(self.name, time.time() - self.start)) - - -class PeakMemoryUsage: - class Worker(StoppableThread): - def __init__(self, interval, *args, **kwargs): - super().__init__(*args, **kwargs) - self.interval = interval - self.max_rss = self.max_vms = 0 - - def run(self): - process = psutil.Process() - while not self.stopped(): - mem = process.memory_info() - self.max_rss = max(self.max_rss, mem.rss) - self.max_vms = max(self.max_vms, mem.vms) - time.sleep(self.interval) - - """ Context manager to calculate peak memory usage in a statement block """ - def __init__(self, name, logger=None, interval=1): - self.name = name - self.logger = logger if logger else logging.getLogger() - self.interval = interval - self.start = time.time() - self.worker = None - - def __enter__(self): - if self.interval > 0: - pid = os.getpid() - mem = psutil.Process(pid).memory_info() - self.start_rss, self.start_vms = mem.rss, mem.vms - - self.worker = PeakMemoryUsage.Worker(self.interval) - self.worker.start() - return self - - def __exit__(self, _, value, trace): - if self.worker: - self.worker.stop() - self.worker.join() - self.logger.warning("Peak memory usage for '{}' in MBs: orig=(rss={:0.1f} vms={:0.1f}) " - "peak=(rss={:0.1f} vms={:0.1f}) in {:0.2f}s" - .format(self.name, self.start_rss / MB, self.start_vms / MB, - self.worker.max_rss / MB, 
- self.worker.max_vms / MB, time.time() - self.start)) - - -class PsUtil(object): - def __init__(self, attr=('virtual_memory',), proc_attr=None, - logger=None, interval=60): - """ attr can be multiple methods of psutil (e.g. attr=['virtual_memory', 'cpu_times_percent']) """ - self.ps_mon = None - self.attr = attr - self.proc_attr = proc_attr - self.logger = logger if logger else logging.getLogger() - self.interval = interval - - def psutil_worker(self, pid): - root_proc = psutil.Process(pid) - while True: - for attr in self.attr: - self.logger.warning("PSUTIL {}".format(getattr(psutil, attr)())) - if self.proc_attr: - procs = set(root_proc.children(recursive=True)) - procs.add(root_proc) - procs = sorted(procs, key=lambda p: p.pid) - - for proc in procs: - self.logger.warning("PSUTIL process={}: {}" - .format(proc.pid, proc.as_dict(self.proc_attr))) - - time.sleep(self.interval) - - def __enter__(self): - if self.interval > 0: - self.ps_mon = multiprocessing.Process(target=self.psutil_worker, args=(os.getpid(),)) - self.ps_mon.start() - time.sleep(1) # sleep so the first iteration doesn't include statements in the PsUtil context - return self - - def __exit__(self, type, value, trace): - if self.ps_mon is not None: - self.ps_mon.terminate() - - -def bit_vector_to_str(bit_vect): - return " ".join(["1" if e else "0" for e in bit_vect]) - - -def bit_vector_sparse_str(bit_vect): - return " ".join([str(k) for k, b in enumerate(bit_vect) if b]) - - -def run(space, num_elems, nbits): - np.random.seed(23) - if space == "bit_jaccard": - bit_vector_str_func = bit_vector_to_str - index = nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, - dtype=nmslib.DistType.FLOAT) - else: - bit_vector_str_func = bit_vector_sparse_str - index = nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, - dtype=nmslib.DistType.FLOAT) - - with PeakMemoryUsage(f"All: space={space} nbits={nbits} elems={num_elems}"): - for i in range(0, 
num_elems, CHUNK_SIZE): - strs = [] - for j in range(CHUNK_SIZE): - a = np.random.rand(nbits) > 0.5 - strs.append(bit_vector_str_func(a)) - index.addDataPointBatch(ids=np.arange(i, i + CHUNK_SIZE), data=strs) - - index.createIndex() - - a = np.ones(nbits) - ids, distances = index.knnQuery(bit_vector_str_func(a), k=10) - print(distances) - - -if __name__ == "__main__": - np.set_printoptions(linewidth=500) - - logging.basicConfig(level=logging.WARNING) - space = sys.argv[1] - num_elems = int(sys.argv[2]) - nbits = int(sys.argv[3]) - run(space, num_elems, nbits) diff --git a/python_bindings/tests/jaccard_comparison.sh b/python_bindings/tests/jaccard_comparison.sh deleted file mode 100644 index 3bdf8b7..0000000 --- a/python_bindings/tests/jaccard_comparison.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -e - -for space in jaccard_sparse; do - for num_elems in 30000 100000 300000 1000000 3000000 10000000 30000000; do - for nbits in 512 2048; do - python jaccard_comparison.py $space $num_elems $nbits - done - done -done diff --git a/python_bindings/tests/jaccard_comparison_plot.py b/python_bindings/tests/jaccard_comparison_plot.py deleted file mode 100644 index 02a8117..0000000 --- a/python_bindings/tests/jaccard_comparison_plot.py +++ /dev/null @@ -1,39 +0,0 @@ -import pandas as pd -import sys -import statsmodels.api as sm -from plotnine import ggplot, geom_point, aes, stat_smooth, geom_line, scale_x_log10, \ - scale_y_log10, theme, element_text, ylim - - -if __name__ == "__main__": - df = pd.read_csv(sys.argv[1]) - df["space_nbits"] = df.space.astype(str) + "_" + df.nbits.astype(str) - df.memory = df.memory.astype(float) - df.time = df.time.astype(float) - print(df.info()) - - for col in ["time", "memory"]: - funcs = [] - for space_nbits in df.space_nbits.unique(): - sub_df = df.loc[df.space_nbits == space_nbits] - model = sm.OLS(sub_df.num_elems, sm.add_constant(sub_df[col])) - params = model.fit().params - func = lambda x: params.const + x * getattr(params, 
col) - funcs.append(func) - - p = (ggplot(df, aes("num_elems", col, color="space_nbits")) - + geom_point() + geom_line() - + scale_x_log10(limits=[10000,10000000]) - + scale_y_log10(limits=[3,10000]) - + theme(axis_text_x=element_text(rotation=90, hjust=1))) - p.save(filename=col + ".png", height=5, width=5, units='in', dpi=300) - - # p = ggplot(aes(x="num_elems", y=col, color="space_nbits"), data=df) + geom_line() + geom_point() + stat_function(fun=funcs[0]) - # p.make() - - # fig = plt.gcf() - # ax = plt.gca() - # plt.gca().set_xscale('log') - # plt.gca().set_yscale('log') - # - # ggsave(plot=p, filename=col + ".png") From 7a55b91c5e405a1c3d81a0955b29b05e76578f65 Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Thu, 21 Feb 2019 13:48:15 -0800 Subject: [PATCH 11/17] remove pycharm files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d4c34ec..d48b88b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ similarity_search/test/Makefile *.so *.pyc *.egg-info/ +.idea From c712dcdf3f6fc894e984fc97c1b8f02fce9ebc5f Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Thu, 21 Feb 2019 13:50:52 -0800 Subject: [PATCH 12/17] remove pycharm files --- .idea/codeStyles/Project.xml | 10 ---------- .idea/codeStyles/codeStyleConfig.xml | 5 ----- 2 files changed, 15 deletions(-) delete mode 100644 .idea/codeStyles/Project.xml delete mode 100644 .idea/codeStyles/codeStyleConfig.xml diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml deleted file mode 100644 index 664f8f1..0000000 --- a/.idea/codeStyles/Project.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml deleted file mode 100644 index 79ee123..0000000 --- a/.idea/codeStyles/codeStyleConfig.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - \ No newline at end of file From 3d5686f9a4ad6479d7f06825604a33378038fc13 Mon Sep 17 00:00:00 2001 From: Greg 
Friedland Date: Thu, 21 Feb 2019 13:58:45 -0800 Subject: [PATCH 13/17] cleanup --- python_bindings/setup.py | 16 +- python_bindings/tests/bindings_test.py | 119 - similarity_search/test/test_distfunc.cc | 2370 +++++++++---------- similarity_search/test/test_space_serial.cc | 115 +- 4 files changed, 1248 insertions(+), 1372 deletions(-) diff --git a/python_bindings/setup.py b/python_bindings/setup.py index c01800e..88930a9 100755 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -21,16 +21,14 @@ if os.path.exists(library_file): # if we have a prebuilt nmslib library file, use that. extra_objects.append(library_file) - print("Found: " + os.path.abspath(library_file)) else: - raise RuntimeError("can't find prebuild lib: " + os.path.abspath(library_file)) - # # Otherwise build all the files here directly (excluding extras which need eigen/boost) - # exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc - # dummy_app.cc main.cc""".split()) - # - # for root, subdirs, files in os.walk(os.path.join(libdir, "src")): - # source_files.extend(os.path.join(root, f) for f in files - # if f.endswith(".cc") and f not in exclude_files) + # Otherwise build all the files here directly (excluding extras which need eigen/boost) + exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc + dummy_app.cc main.cc""".split()) + + for root, subdirs, files in os.walk(os.path.join(libdir, "src")): + source_files.extend(os.path.join(root, f) for f in files + if f.endswith(".cc") and f not in exclude_files) if sys.platform.startswith('linux'): diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index 4c78875..1e3ec87 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -6,125 +6,6 @@ import numpy.testing as npt import nmslib -import psutil -import logging -import multiprocessing -import time 
-import os -import threading - -MB = 1024 * 1024 - - -class StoppableThread(threading.Thread): - """Thread class with a stop() method. The thread itself has to check - regularly for the stopped() condition.""" - - def __init__(self, *args, **kwargs): - super().__init__() - self._stop_event = threading.Event() - - def stop(self): - self._stop_event.set() - - def stopped(self): - return self._stop_event.is_set() - - -class Timer: - """ Context manager for timing named blocks of code """ - def __init__(self, name, logger=None): - self.name = name - self.logger = logger if logger else logging.getLogger() - - def __enter__(self): - self.start = time.time() - self.logger.debug("Starting {}".format(self.name)) - - def __exit__(self, type, value, trace): - self.logger.info("{}: {:0.2f}s".format(self.name, time.time() - self.start)) - - -class PeakMemoryUsage: - class Worker(StoppableThread): - def __init__(self, interval, *args, **kwargs): - super().__init__(*args, **kwargs) - self.interval = interval - self.max_rss = self.max_vms = 0 - - def run(self): - process = psutil.Process() - while not self.stopped(): - mem = process.memory_info() - self.max_rss = max(self.max_rss, mem.rss) - self.max_vms = max(self.max_vms, mem.vms) - time.sleep(self.interval) - - """ Context manager to calculate peak memory usage in a statement block """ - def __init__(self, name, logger=None, interval=1): - self.name = name - self.logger = logger if logger else logging.getLogger() - self.interval = interval - self.start = time.time() - self.worker = None - - def __enter__(self): - if self.interval > 0: - pid = os.getpid() - mem = psutil.Process(pid).memory_info() - self.start_rss, self.start_vms = mem.rss, mem.vms - - self.worker = PeakMemoryUsage.Worker(self.interval) - self.worker.start() - return self - - def __exit__(self, _, value, trace): - if self.worker: - self.worker.stop() - self.worker.join() - self.logger.warning("Peak memory usage for '{}' in MBs: orig=(rss={:0.1f} vms={:0.1f}) " - 
"peak=(rss={:0.1f} vms={:0.1f}) in {:0.2f}s" - .format(self.name, self.start_rss / MB, self.start_vms / MB, - self.worker.max_rss / MB, - self.worker.max_vms / MB, time.time() - self.start)) - - -class PsUtil(object): - def __init__(self, attr=('virtual_memory',), proc_attr=None, - logger=None, interval=60): - """ attr can be multiple methods of psutil (e.g. attr=['virtual_memory', 'cpu_times_percent']) """ - self.ps_mon = None - self.attr = attr - self.proc_attr = proc_attr - self.logger = logger if logger else logging.getLogger() - self.interval = interval - - def psutil_worker(self, pid): - root_proc = psutil.Process(pid) - while True: - for attr in self.attr: - self.logger.warning("PSUTIL {}".format(getattr(psutil, attr)())) - if self.proc_attr: - procs = set(root_proc.children(recursive=True)) - procs.add(root_proc) - procs = sorted(procs, key=lambda p: p.pid) - - for proc in procs: - self.logger.warning("PSUTIL process={}: {}" - .format(proc.pid, proc.as_dict(self.proc_attr))) - - time.sleep(self.interval) - - def __enter__(self): - if self.interval > 0: - self.ps_mon = multiprocessing.Process(target=self.psutil_worker, args=(os.getpid(),)) - self.ps_mon.start() - time.sleep(1) # sleep so the first iteration doesn't include statements in the PsUtil context - return self - - def __exit__(self, type, value, trace): - if self.ps_mon is not None: - self.ps_mon.terminate() def get_exact_cosine(row, data, N=10): diff --git a/similarity_search/test/test_distfunc.cc b/similarity_search/test/test_distfunc.cc index 668f3d4..252520d 100644 --- a/similarity_search/test/test_distfunc.cc +++ b/similarity_search/test/test_distfunc.cc @@ -63,1189 +63,1189 @@ TEST(set_intel) { */ -//TEST(Platform64) { -// EXPECT_EQ(8 == sizeof(size_t), true); -//} -// -//template -//bool checkElemVectEq(const vector>& source, -// const vector>& target) { -// if (source.size() != target.size()) return false; -// -// for (size_t i = 0; i < source.size(); ++i) -// if (source[i] != target[i]) 
return false; -// -// return true; -//} -// -//template -//void TestSparsePackUnpack() { -// for (size_t maxSize = 1024 ; maxSize < 1024*1024; maxSize += 8192) { -// vector> source; -// GenSparseVectZipf(maxSize, source); -// -// LOG(LIB_INFO) << "testing maxSize: " << maxSize << "\nqty: " << source.size() -// << " maxId: " << source.back().id_; -// -// char* pBuff = NULL; -// size_t dataLen = 0; -// -// PackSparseElements(source, pBuff, dataLen); -// -// vector> target; -// UnpackSparseElements(pBuff, dataLen, target); -// -// bool eqFlag = checkElemVectEq(source, target); -// -// if (!eqFlag) { -// LOG(LIB_INFO) << "Different source and target, source.size(): " << source.size() -// << " target.size(): " << target.size(); -// // Let's print the first different in the case of equal # of elements -// size_t i = 0; -// for (; i < min(source.size(), target.size()); ++i) { -// if (!(source[i] == target[i])) { -// LOG(LIB_INFO) << "First diff, i = " << i << " " << source[i] << " vs " << target[i]; -// break; -// } -// } -// } -// -// EXPECT_EQ(eqFlag, true); -// } -//} -// -//TEST(BlockZeros) { -// for (size_t id = 0 ; id <= 3*65536; id++) { -// size_t id1 = removeBlockZeros(id); -// -// size_t id2 = addBlockZeros(id1); -// EXPECT_EQ(id, id2); -// } -//} -// -//#ifdef DISABLE_LONG_TESTS -//TEST(DISABLE_SparsePackUnpack) { -//#else -//TEST(SparsePackUnpack) { -//#endif -// TestSparsePackUnpack(); -// TestSparsePackUnpack(); -//} -// -//TEST(TestEfficientPower) { -// double f = 2.0; -// -// for (unsigned i = 1; i <= 64; i++) { -// double p1 = std::pow(f, i); -// double p2 = EfficientPow(f, i); -// -// EXPECT_EQ(p1, p2); -// } -//} -// -//TEST(TestEfficientFract) { -// unsigned MaxNumDig = 16; -// -// for (float a = 1.1f ; a <= 2.0f; a+= 0.1f) { -// for (unsigned NumDig = 1; NumDig < MaxNumDig; ++NumDig) { -// uint64_t MaxFract = uint64_t(1) << NumDig; -// -// for (uint64_t intFract = 0; intFract < MaxFract; ++intFract) { -// float fract = float(intFract) / 
float(MaxFract); -// float v1 = pow(a, fract); -// float v2 = EfficientFractPow(a, fract, NumDig); -// -// EXPECT_EQ_EPS(v1, v2, 1e-5f); -// } -// } -// } -//} -// -//template -//bool TestScalarProductAgree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// -// float maxRelDiff = 1e-6f; -// float maxAbsDiff = 1e-6f; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, T(1), T(2), true /* do normalize */); -// GenRandVect(pVect2, dim, T(1), T(2), true /* do normalize */); -// -// T val1 = ScalarProduct(pVect1, pVect2, dim); -// T val2 = ScalarProductSIMD(pVect1, pVect2, dim); -// -// bool bug = false; -// T diff = fabs(val1 - val2); -// T diffRel = diff/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// if (diffRel > maxRelDiff && diff > maxAbsDiff) { -// bug = true; -// cerr << "Bug ScalarProduct !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " diff=" << diff << " diffRel=" << diffRel << endl; -// } -// -// if (bug) return false; -// } -// } -// -// return true; -//} -// -//template -//bool TestNormScalarProductAgree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// -// float maxRelDiff = 1e-6f; -// float maxAbsDiff = 1e-6f; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, T(1), T(2), true /* do normalize */); -// GenRandVect(pVect2, dim, T(1), T(2), true /* do normalize */); -// -// T val1 = NormScalarProduct(pVect1, pVect2, dim); -// T val2 = NormScalarProductSIMD(pVect1, pVect2, dim); -// -// bool bug = false; -// T diff = fabs(val1 - val2); -// T diffRel = diff/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// if (diffRel > maxRelDiff && diff > maxAbsDiff) { -// bug = true; -// cerr << "Bug NormScalarProduct !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " diff=" << diff << " diffRel=" << diffRel << endl; -// } -// -// if (bug) return false; -// } -// } -// -// return true; -//} -// -//// Agreement test functions -//template -//bool TestLInfAgree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); -// GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); -// -// T val1 = LInfNormStandard(pVect1, pVect2, dim); -// T val2 = LInfNorm(pVect1, pVect2, dim); -// T val3 = LInfNormSIMD(pVect1, pVect2, dim); -// -// bool bug = false; -// -// if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { -// cerr << "Bug LInf !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; -// bug = true; -// } -// if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { -// cerr << "Bug LInf !!! Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; -// bug = true; -// } -// if (bug) return false; -// } -// } -// -// -// return true; -//} -// -//template -//bool TestL1Agree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); -// GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); -// -// T val1 = L1NormStandard(pVect1, pVect2, dim); -// T val2 = L1Norm(pVect1, pVect2, dim); -// T val3 = L1NormSIMD(pVect1, pVect2, dim); -// -// bool bug = false; -// -// if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { -// cerr << "Bug L1 !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; -// bug = true; -// } -// if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { -// cerr << "Bug L1 !!! 
Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; -// bug = true; -// } -// if (bug) return false; -// } -// } -// -// return true; -//} -// -//template -//bool TestL2Agree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); -// GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); -// -// T val1 = L2NormStandard(pVect1, pVect2, dim); -// T val2 = L2Norm(pVect1, pVect2, dim); -// T val3 = L2NormSIMD(pVect1, pVect2, dim); -// -// bool bug = false; -// -// if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { -// cerr << "Bug L2 !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; -// bug = true; -// } -// if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { -// cerr << "Bug L2 !!! Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; -// bug = true; -// } -// if (bug) return false; -// } -// } -// -// -// return true; -//} -// -//template -//bool TestItakuraSaitoAgree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// vector precompVect1(dim *2), precompVect2(dim * 2); -// T* pPrecompVect1 = &precompVect1[0]; -// T* pPrecompVect2 = &precompVect2[0]; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); -// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); -// -// copy(pVect1, pVect1 + dim, pPrecompVect1); -// copy(pVect2, pVect2 + dim, pPrecompVect2); -// -// PrecompLogarithms(pPrecompVect1, dim); -// PrecompLogarithms(pPrecompVect2, dim); -// -// T val0 = ItakuraSaito(pVect1, pVect2, dim); -// T val1 = ItakuraSaitoPrecomp(pPrecompVect1, pPrecompVect2, dim); -// T val2 = ItakuraSaitoPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); -// -// 
bool bug = false; -// -// T AbsDiff1 = fabs(val1 - val0); -// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); -// -// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { -// cerr << "Bug ItakuraSaito !!! Dim = " << dim << " val1 = " << val1 << " val0 = " << val0 << " Diff: " << (val1 - val0) << " RelDiff1: " << RelDiff1 << " << AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// T AbsDiff2 = fabs(val1 - val2); -// T RelDiff2 = AbsDiff2/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { -// cerr << "Bug ItakuraSaito !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " Diff: " << (val1 - val2) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// } -// -// -// return true; -//} -// -//template -//bool TestKLAgree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// vector precompVect1(dim *2), precompVect2(dim * 2); -// T* pPrecompVect1 = &precompVect1[0]; -// T* pPrecompVect2 = &precompVect2[0]; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); -// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); -// -// copy(pVect1, pVect1 + dim, pPrecompVect1); -// copy(pVect2, pVect2 + dim, pPrecompVect2); -// -// PrecompLogarithms(pPrecompVect1, dim); -// PrecompLogarithms(pPrecompVect2, dim); -// -// T val0 = KLStandard(pVect1, pVect2, dim); -// T val1 = KLStandardLogDiff(pVect1, pVect2, dim); -// T val2 = KLPrecomp(pPrecompVect1, pPrecompVect2, dim); -// T val3 = KLPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); -// -// bool bug = false; -// -// /* -// * KLStandardLog has a worse accuracy due to computing the log of ratios -// * as opposed to difference of logs, but it is more efficient (log can be -// * expensive to compute) -// */ -// -// T AbsDiff1 = fabs(val1 - 
val0); -// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); -// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { -// cerr << "Bug KL !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// T AbsDiff2 = fabs(val1 - val2); -// T RelDiff2 = AbsDiff2/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { -// cerr << "Bug KL !!! Dim = " << dim << " val2 = " << val2 << " val1 = " << val1 << " Diff: " << (val2 - val1) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; -// bug = true; -// } -// -// T AbsDiff3 = fabs(val1 - val3); -// T RelDiff3 = AbsDiff3/max(max(fabs(val1),fabs(val3)),T(1e-18)); -// if (RelDiff3 > 1e-5 && AbsDiff3 > 1e-5) { -// cerr << "Bug KL !!! Dim = " << dim << " val3 = " << val3 << " val1 = " << val1 << " Diff: " << (val3 - val1) << " RelDiff3: " << RelDiff3 << " AbsDiff3: " << AbsDiff3 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// } -// -// -// return true; -//} -// -//template -//bool TestKLGeneralAgree(size_t N, size_t dim, size_t Rep) { -// T* pVect1 = new T[dim]; -// T* pVect2 = new T[dim]; -// T* pPrecompVect1 = new T[dim * 2]; -// T* pPrecompVect2 = new T[dim * 2]; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), false); -// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), false); -// -// copy(pVect1, pVect1 + dim, pPrecompVect1); -// copy(pVect2, pVect2 + dim, pPrecompVect2); -// -// PrecompLogarithms(pPrecompVect1, dim); -// PrecompLogarithms(pPrecompVect2, dim); -// -// T val0 = KLGeneralStandard(pVect1, pVect2, dim); -// T val2 = KLGeneralPrecomp(pPrecompVect1, pPrecompVect2, dim); -// T val3 = KLGeneralPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); -// -// bool bug = false; -// -// T AbsDiff1 = fabs(val2 - val0); -// T RelDiff1 = 
AbsDiff1/max(max(fabs(val2),fabs(val0)),T(1e-18)); -// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { -// cerr << "Bug KL !!! Dim = " << dim << " val0 = " << val0 << " val2 = " << val2 << " Diff: " << (val0 - val2) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// T AbsDiff2 = fabs(val3 - val2); -// T RelDiff2 = AbsDiff2/max(max(fabs(val3),fabs(val2)),T(1e-18)); -// if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { -// cerr << "Bug KL !!! Dim = " << dim << " val2 = " << val2 << " val3 = " << val3 << " Diff: " << (val2 - val3) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// } -// -// -// return true; -//} -// -//template -//bool TestJSAgree(size_t N, size_t dim, size_t Rep, double pZero) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// vector precompVect1(dim *2), precompVect2(dim * 2); -// T* pPrecompVect1 = &precompVect1[0]; -// T* pPrecompVect2 = &precompVect2[0]; -// -// T Dist = 0; -// T Error = 0; -// T TotalQty = 0; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); -// SetRandZeros(pVect1, dim, pZero); -// Normalize(pVect1, dim); -// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); -// SetRandZeros(pVect2, dim, pZero); -// Normalize(pVect2, dim); -// -// copy(pVect1, pVect1 + dim, pPrecompVect1); -// copy(pVect2, pVect2 + dim, pPrecompVect2); -// -// PrecompLogarithms(pPrecompVect1, dim); -// PrecompLogarithms(pPrecompVect2, dim); -// -// T val0 = JSStandard(pVect1, pVect2, dim); -// T val1 = JSPrecomp(pPrecompVect1, pPrecompVect2, dim); -// -// bool bug = false; -// -// T AbsDiff1 = fabs(val1 - val0); -// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); -// -// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { -// cerr << "Bug JS (1) " << typeid(T).name() << " !!! 
Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// T val2 = JSPrecompApproxLog(pPrecompVect1, pPrecompVect2, dim); -// T val3 = JSPrecompSIMDApproxLog(pPrecompVect1, pPrecompVect2, dim); -// -// T AbsDiff2 = fabs(val2 - val3); -// T RelDiff2 = AbsDiff2/max(max(fabs(val2),fabs(val3)),T(1e-18)); -// -// if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { -// cerr << "Bug JS (2) " << typeid(T).name() << " !!! Dim = " << dim << " val2 = " << val2 << " val3 = " << val3 << " Diff: " << (val2 - val3) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; -// bug = true; -// } -// -// T AbsDiff3 = fabs(val1 - val2); -// T RelDiff3 = AbsDiff3/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// -// Dist += val1; -// Error += AbsDiff3; -// ++TotalQty; -// -// if (RelDiff3 > 1e-4 && AbsDiff3 > 1e-4) { -// cerr << "Bug JS (3) " << typeid(T).name() << " !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " Diff: " << (val1 - val2) << " RelDiff3: " << RelDiff3 << " AbsDiff2: " << AbsDiff3 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// } -// -// LOG(LIB_INFO) << typeid(T).name() << " JS approximation error: average absolute: " << Error / TotalQty << -// " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; -// -// -// return true; -//} -// -//template -//bool TestRenyiDivAgree(size_t N, size_t dim, size_t Rep, T alpha) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// -// T Dist = 0; -// T Error = 0; -// T TotalQty = 0; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); -// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); -// -// Normalize(pVect1, dim); -// Normalize(pVect2, dim); -// -// T val0 = renyiDivergenceSlow(pVect1, pVect2, dim, alpha); -// T val1 = renyiDivergenceFast(pVect1, pVect2, dim, alpha); -// -// bool bug = false; -// -// T AbsDiff1 = fabs(val1 - val0); -// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); -// -// Error += AbsDiff1; -// ++TotalQty; -// -// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { -// cerr << "Bug Reniy Div. (1) " << typeid(T).name() << " !!! Dim = " << dim -// << "alpha=" << alpha << " val0 = " << val0 << " val1 = " << val1 -// << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 -// << " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// } -// -// LOG(LIB_INFO) << typeid(T).name() << " Renyi Div. approximation error: average absolute: " << Error / TotalQty << -// " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; -// -// -// return true; -//} -// -//template -//bool TestAlphaBetaDivAgree(size_t N, size_t dim, size_t Rep, T alpha, T beta) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// -// T Dist = 0; -// T Error = 0; -// T TotalQty = 0; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); -// GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); -// -// Normalize(pVect1, dim); -// Normalize(pVect2, dim); -// -// T val0 = alphaBetaDivergenceSlow(pVect1, pVect2, dim, alpha, beta); -// T val1 = alphaBetaDivergenceFast(pVect1, pVect2, dim, alpha, beta); -// -// bool bug = false; -// -// T AbsDiff1 = fabs(val1 - val0); -// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); -// -// Error += AbsDiff1; -// ++TotalQty; -// -// if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { -// cerr << "Bug alpha-beta Div. (1) " << typeid(T).name() << " !!! Dim = " << dim -// << "alpha=" << alpha << " val0 = " << val0 << " val1 = " << val1 -// << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 -// << " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// } -// -// LOG(LIB_INFO) << typeid(T).name() << " alpha-beta div. approximation error: average absolute: " << Error / TotalQty << " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; -// -// -// return true; -//} -// -//bool TestSpearmanFootruleAgree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// PivotIdType* pVect1 = &vect1[0]; -// PivotIdType* pVect2 = &vect2[0]; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandIntVect(pVect1, dim); -// GenRandIntVect(pVect2, dim); -// -// int val0 = SpearmanFootrule(pVect1, pVect2, dim); -// int val1 = SpearmanFootruleSIMD(pVect1, pVect2, dim); -// -// bool bug = false; -// -// -// if (val0 != val1) { -// cerr << "Bug SpearmanFootrule !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// } -// -// -// return true; -//} -// -//bool TestSpearmanRhoAgree(size_t N, size_t dim, size_t Rep) { -// vector vect1(dim), vect2(dim); -// PivotIdType* pVect1 = &vect1[0]; -// PivotIdType* pVect2 = &vect2[0]; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandIntVect(pVect1, dim); -// GenRandIntVect(pVect2, dim); -// -// int val0 = SpearmanRho(pVect1, pVect2, dim); -// int val1 = SpearmanRhoSIMD(pVect1, pVect2, dim); -// -// bool bug = false; -// -// -// if (val0 != val1) { -// cerr << "Bug SpearmanRho !!! 
Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// } -// -// -// return true; -//} -// -//template -//bool TestLPGenericAgree(size_t N, size_t dim, size_t Rep, T power) { -// vector vect1(dim), vect2(dim); -// T* pVect1 = &vect1[0]; -// T* pVect2 = &vect2[0]; -// -// T TotalQty = 0, Error = 0, Dist = 0; -// -// for (size_t i = 0; i < Rep; ++i) { -// for (size_t j = 1; j < N; ++j) { -// GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); -// GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); -// -// T val0 = LPGenericDistance(pVect1, pVect2, dim, power); -// T val1 = LPGenericDistanceOptim(pVect1, pVect2, dim, power); -// -// bool bug = false; -// -// T AbsDiff1 = fabs(val1 - val0); -// T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); -// -// T maxRelDiff = 1e-5f; -// T maxAbsDiff = 1e-5f; -// /* -// * For large powers, the difference can be larger, -// * because our approximations are efficient, but not very -// * precise -// */ -// if (power > 8) { maxAbsDiff = maxRelDiff = 1e-3f;} -// if (power > 12) { maxAbsDiff = maxRelDiff = 0.01f;} -// if (power > 22) { maxAbsDiff = maxRelDiff = 0.1f;} -// -// ++TotalQty; -// Error += RelDiff1; -// Dist += val0; -// -// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { -// cerr << "Bug LP" << power << " !!! Dim = " << dim << -// " val1 = " << val1 << " val0 = " << val0 << -// " Diff: " << (val1 - val0) << -// " RelDiff1: " << RelDiff1 << -// " (max for this power: " << maxRelDiff << ") " << -// " AbsDiff1: " << AbsDiff1 << " (max for this power: " << maxAbsDiff << ")" << endl; -// } -// -// if (bug) return false; -// } -// } -// -// if (power < 4) { -// LOG(LIB_INFO) << typeid(T).name() << " LP approximation error: average absolute " << Error / TotalQty << " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; -// -// } -// -// return true; -//} -// -//bool TestBitHammingAgree(size_t N, size_t dim, size_t Rep) { -// size_t WordQty = (dim + 31)/32; -// vector arr(N * WordQty); -// uint32_t* pArr = &arr[0]; -// -// uint32_t *p = pArr; -// for (size_t i = 0; i < N; ++i, p+= WordQty) { -// vector perm(dim); -// GenRandIntVect(&perm[0], dim); -// for (unsigned j = 0; j < dim; ++j) -// perm[j] = perm[j] % 2; -// vector h; -// Binarize(perm, 1, h); -// CHECK(h.size() == WordQty); -// memcpy(p, &h[0], WordQty * sizeof(h[0])); -// } -// -// WallClockTimer t; -// -// t.reset(); -// -// bool res = true; -// -// for (size_t j = 1; j < N; ++j) { -// uint32_t* pVect1 = pArr + j*WordQty; -// uint32_t* pVect2 = pArr + (j-1)*WordQty; -// int d1 = BitHamming(pVect1, pVect2, WordQty); -// int d2 = 0; -// -// for (unsigned t = 0; t < WordQty; ++t) { -// for (unsigned k = 0; k < 32; ++k) { -// d2 += ((pVect1[t]>>k)&1) != ((pVect2[t]>>k)&1); -// } -// } -// if (d1 != d2) { -// cerr << "Bug bit hamming, WordQty = " << WordQty << " d1 = " << d1 << " d2 = " << d2 << endl; -// res = false; -// break; -// } -// } -// -// return res; -//} -// -// -//bool TestSparseAngularDistanceAgree(const string& dataFile, size_t N, size_t Rep) { -// typedef float T; -// -// unique_ptr spaceFast(new SpaceSparseAngularDistanceFast()); -// unique_ptr> spaceReg(new SpaceSparseAngularDistance()); -// -// ObjectVector elemsFast; -// ObjectVector elemsReg; -// vector tmp; -// -// unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); -// spaceFast->UpdateParamsFromFile(*inpStateFast); -// unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); -// spaceReg->UpdateParamsFromFile(*inpStateReg); -// -// CHECK(elemsFast.size() == elemsReg.size()); -// -// N = min(N, elemsReg.size()); -// -// bool bug = false; -// -// float maxRelDiff = 2e-5f; -// float maxAbsDiff = 1e-6f; -// -// for (size_t j = Rep; j < N; ++j) -// 
for (size_t k = j - Rep; k < j; ++k) { -// float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); -// float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); -// -// float AbsDiff1 = fabs(val1 - val2); -// float RelDiff1 = AbsDiff1 / max(max(fabs(val1), fabs(val2)), T(1e-18)); -// -// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { -// cerr << "Bug fast vs non-fast angular dist " << -// " val1 = " << val1 << " val2 = " << val2 << -// " Diff: " << (val1 - val2) << -// " RelDiff1: " << RelDiff1 << -// " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// -// return true; -//} -// -// -// -//bool TestSparseCosineSimilarityAgree(const string& dataFile, size_t N, size_t Rep) { -// typedef float T; -// -// unique_ptr spaceFast(new SpaceSparseCosineSimilarityFast()); -// unique_ptr> spaceReg (new SpaceSparseCosineSimilarity()); -// -// ObjectVector elemsFast; -// ObjectVector elemsReg; -// vector tmp; -// -// unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); -// spaceFast->UpdateParamsFromFile(*inpStateFast); -// unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); -// spaceReg->UpdateParamsFromFile(*inpStateReg); -// -// CHECK(elemsFast.size() == elemsReg.size()); -// -// N = min(N, elemsReg.size()); -// -// bool bug = false; -// -// float maxRelDiff = 1e-5f; -// float maxAbsDiff = 1e-5f; -// -// for (size_t j = Rep; j < N; ++j) -// for (size_t k = j - Rep; k < j; ++k) { -// float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); -// float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); -// -// float AbsDiff1 = fabs(val1 - val2); -// float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// -// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { -// cerr << "Bug fast vs non-fast cosine " << -// " val1 = " << val1 << " val2 = " << val2 << -// " Diff: " << (val1 - val2) << -// " RelDiff1: " << RelDiff1 
<< -// " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// -// return true; -//} -// -//bool TestSparseNegativeScalarProductAgree(const string& dataFile, size_t N, size_t Rep) { -// typedef float T; -// -// unique_ptr spaceFast(new SpaceSparseNegativeScalarProductFast()); -// unique_ptr> spaceReg (new SpaceSparseNegativeScalarProduct()); -// -// ObjectVector elemsFast; -// ObjectVector elemsReg; -// vector tmp; -// -// unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); -// spaceFast->UpdateParamsFromFile(*inpStateFast); -// unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); -// spaceReg->UpdateParamsFromFile(*inpStateReg); -// -// CHECK(elemsFast.size() == elemsReg.size()); -// -// N = min(N, elemsReg.size()); -// -// bool bug = false; -// -// float maxRelDiff = 1e-6f; -// float maxAbsDiff = 1e-6f; -// -// for (size_t j = Rep; j < N; ++j) -// for (size_t k = j - Rep; k < j; ++k) { -// float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); -// float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); -// -// float AbsDiff1 = fabs(val1 - val2); -// float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// -// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { -// cerr << "Bug fast vs non-fast negative scalar/dot product " << -// " val1 = " << val1 << " val2 = " << val2 << -// " Diff: " << (val1 - val2) << -// " RelDiff1: " << RelDiff1 << -// " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// -// return true; -//} -// -//bool TestSparseQueryNormNegativeScalarProductAgree(const string& dataFile, size_t N, size_t Rep) { -// typedef float T; -// -// unique_ptr spaceFast(new SpaceSparseQueryNormNegativeScalarProductFast()); -// unique_ptr> spaceReg (new SpaceSparseQueryNormNegativeScalarProduct()); -// -// ObjectVector elemsFast; -// ObjectVector elemsReg; -// vector tmp; -// -// 
unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); -// spaceFast->UpdateParamsFromFile(*inpStateFast); -// unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); -// spaceReg->UpdateParamsFromFile(*inpStateReg); -// -// CHECK(elemsFast.size() == elemsReg.size()); -// -// N = min(N, elemsReg.size()); -// -// bool bug = false; -// -// float maxRelDiff = 1e-6f; -// float maxAbsDiff = 1e-6f; -// -// for (size_t j = Rep; j < N; ++j) -// for (size_t k = j - Rep; k < j; ++k) { -// float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); -// float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); -// -// float AbsDiff1 = fabs(val1 - val2); -// float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// -// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { -// cerr << "Bug fast vs non-fast QUERY-NORMALIZED negative scalar/dot product " << -// " val1 = " << val1 << " val2 = " << val2 << -// " Diff: " << (val1 - val2) << -// " RelDiff1: " << RelDiff1 << -// " AbsDiff1: " << AbsDiff1 << endl; -// bug = true; -// } -// -// if (bug) return false; -// } -// -// return true; -//} -// -//// Limitation: this is only for spaces without params -//bool TestPivotIndex(const string& spaceName, -// bool useDummyIndex, -// const string& dataFile, size_t dataQty, -// const string& pivotFile, size_t pivotQty) { -// -// LOG(LIB_INFO) << "space: " << spaceName << " real pivot index?: " << !useDummyIndex << " " << -// " dataFile: " << dataFile << " " << -// " pivotFile: " << pivotFile; -// try { -// typedef float T; -// -// AnyParams emptyParams; -// -// unique_ptr> space(SpaceFactoryRegistry::Instance().CreateSpace(spaceName, emptyParams)); -// -// ObjectVector data; -// ObjectVector pivots; -// vector tmp; -// -// float maxRelDiff = 1e-6f; -// float maxAbsDiff = 1e-6f; -// -// unique_ptr inpStateFast(space->ReadDataset(data, tmp, dataFile, dataQty)); -// space->UpdateParamsFromFile(*inpStateFast); -// 
space->ReadDataset(pivots, tmp, pivotFile, pivotQty); -// -// unique_ptr> pivIndx(useDummyIndex ? -// new DummyPivotIndex(*space, pivots) -// : -// space->CreatePivotIndex(pivots, -// 0 /* Let's not test using the hashing trick here, b/c distances would be somewhat different */)); -// -// for (size_t did = 0; did < dataQty; ++did) { -// vector vDst; -// pivIndx->ComputePivotDistancesIndexTime(data[did], vDst); -// CHECK_MSG(vDst.size() == pivotQty, "ComputePivotDistancesIndexTime returns incorrect # of elements different from the # of pivots"); -// -// for (size_t pid = 0; pid < pivotQty; ++pid) { -// T val2 = space->IndexTimeDistance(pivots[pid], data[did]); -// T val1 = vDst[pid]; -// -// float AbsDiff1 = fabs(val1 - val2); -// float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); -// -// if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { -// cerr << "Bug in fast computation of all-pivot distance, " << -// " space: " << spaceName << " real pivot index?: " << !useDummyIndex << endl << -// " dataFile: " << dataFile << endl << -// " pivotFile: " << pivotFile << endl << -// " data index: " << did << " pivot index: " << pid << endl << -// " val1 = " << val1 << " val2 = " << val2 << -// " Diff: " << (val1 - val2) << -// " RelDiff1: " << RelDiff1 << -// " AbsDiff1: " << AbsDiff1 << endl; -// return false; -// } -// } -// } -// } catch (const exception& e) { -// LOG(LIB_INFO) << "Got exception while testing: " << e.what(); -// return false; -// } -// return true; -//} -// -// -// -// -//#ifdef DISABLE_LONG_TESTS -//TEST(DISABLE_TestAgree) { -//#else -//TEST(TestAgree) { -//#endif -// int nTest = 0; -// int nFail = 0; -// -// nTest++; -// nFail += !TestSparseAngularDistanceAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); -// -// nTest++; -// nFail += !TestSparseAngularDistanceAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); -// -// nTest++; -// nFail += !TestSparseCosineSimilarityAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); -// -// 
nTest++; -// nFail += !TestSparseCosineSimilarityAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); -// -// -// nTest++; -// nFail += !TestSparseNegativeScalarProductAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); -// -// nTest++; -// nFail += !TestSparseNegativeScalarProductAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); -// -// nTest++; -// nFail += !TestSparseQueryNormNegativeScalarProductAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); -// -// nTest++; -// nFail += !TestSparseQueryNormNegativeScalarProductAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); -// -// -// /* -// * 32 should be more than enough for almost all methods, -// * where loop-unrolling includes at most 16 distance computations. -// * -// * Bit-Hamming is an exception. -// * -// */ -// for (unsigned dim = 1; dim <= 1024; dim+=2) { -// LOG(LIB_INFO) << "Dim = " << dim; -// -// nFail += !TestBitHammingAgree(1000, dim, 1000); -// } -// -// for (unsigned dim = 1; dim <= 32; ++dim) { -// LOG(LIB_INFO) << "Dim = " << dim; -// -// /* -// * This is a costly check, we don't need to do it for large # dimensions. -// * Anyways, the function is not using any loop unrolling, so 8 should be sufficient. 
-// */ -// if (dim <= 8) { -// -// for (float power = 0.125; power <= 32; power += 0.125) { -// TestLPGenericAgree(1024, dim, 10, power); -// } -// for (double power = 0.125; power <= 32; power += 0.125) { -// TestLPGenericAgree(1024, dim, 10, power); -// } -// -// // In the case of Renyi divergence 0 < alpha < 1, 1 < alpha < infinity -// // https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R%C3%A9nyi_divergence -// for (float alpha = 0.125; alpha <= 2; alpha += 0.125) { -// if (fabs(alpha - 1) < 1e-6) continue; -// TestRenyiDivAgree(1024, dim, 10, alpha); -// } -// for (double alpha = 0.125; alpha <= 2; alpha += 0.125) { -// if (fabs(alpha - 1) < 1e-6) continue; -// TestRenyiDivAgree(1024, dim, 10, alpha); -// } -// -// for (float alpha = -2; alpha <= 2; alpha += 0.5) -// for (float beta = -2; beta <= 2; beta += 0.5) -// { -// TestAlphaBetaDivAgree(1024, dim, 10, alpha, beta); -// } -// -// for (double alpha = -2; alpha <= 2; alpha += 0.5) -// for (double beta = -2; beta <= 2; beta += 0.5) -// { -// TestAlphaBetaDivAgree(1024, dim, 10, alpha, beta); -// } -// } -// -// nTest++; -// nFail += !TestNormScalarProductAgree(1024, dim, 10); -// nTest++; -// nFail += !TestNormScalarProductAgree(1024, dim, 10); -// -// nTest++; -// nFail += !TestScalarProductAgree(1024, dim, 10); -// nTest++; -// nFail += !TestScalarProductAgree(1024, dim, 10); -// -// nTest++; -// nFail += !TestSpearmanFootruleAgree(1024, dim, 10); -// -// nTest++; -// nFail += !TestSpearmanRhoAgree(1024, dim, 10); -// -// nTest++; -// nFail += !TestJSAgree(1024, dim, 10, 0.5); -// nTest++; -// nFail += !TestJSAgree(1024, dim, 10, 0.5); -// -// nTest++; -// nFail += !TestKLGeneralAgree(1024, dim, 10); -// nTest++; -// nFail += !TestKLGeneralAgree(1024, dim, 10); -// -// nTest++; -// nFail += !TestLInfAgree(1024, dim, 10); -// nTest++; -// nFail += !TestLInfAgree(1024, dim, 10); -// -// nTest++; -// nFail += !TestL1Agree(1024, dim, 10); -// nTest++; -// nFail += !TestL1Agree(1024, dim, 10); -// -// nTest++; 
-// nFail += !TestL2Agree(1024, dim, 10); -// nTest++; -// nFail += !TestL2Agree(1024, dim, 10); -// -// nTest++; -// nFail += !TestKLAgree(1024, dim, 10); -// nTest++; -// nFail += !TestKLAgree(1024, dim, 10); -// -// nTest++; -// nFail += !TestItakuraSaitoAgree(1024, dim, 10); -// nTest++; -// nFail += !TestItakuraSaitoAgree(1024, dim, 10); -// } -// -// LOG(LIB_INFO) << nTest << " (sub) tests performed " << nFail << " failed"; -// -// EXPECT_EQ(0, nFail); -//} -// -//#ifdef DISABLE_LONG_TESTS -//TEST(DISABLE_TestAgreePivotIndex) { -//#else -//TEST(TestAgreePivotIndex) { -//#endif -// int nTest = 0; -// int nFail = 0; -// -// const size_t dataQty = 1000; -// const size_t pivotQty = 100; -// -// vector vDataFiles = {"sparse_5K.txt", "sparse_wiki_5K.txt"}; -// vector vSpaces = {SPACE_SPARSE_COSINE_SIMILARITY_FAST, SPACE_SPARSE_ANGULAR_DISTANCE_FAST, -// SPACE_SPARSE_NEGATIVE_SCALAR_FAST, SPACE_SPARSE_QUERY_NORM_NEGATIVE_SCALAR_FAST}; -// const string pivotFile = "sparse_pivots1K_termQty5K_maxId_100K.txt"; -// -// for (string spaceName : vSpaces) -// for (string dataFile : vDataFiles) { -// // 1. test with a dummy pivot index -// nTest++; -// nFail += !TestPivotIndex(spaceName, true, sampleDataPrefix + dataFile, dataQty, sampleDataPrefix + pivotFile, pivotQty); -// -// // 2. 
test with a real pivot index -// nTest++; -// nFail += !TestPivotIndex(spaceName, false, sampleDataPrefix + dataFile, dataQty, sampleDataPrefix + pivotFile, pivotQty); -// } -// -// LOG(LIB_INFO) << nTest << " (sub) tests performed " << nFail << " failed"; -// -// EXPECT_EQ(0, nFail); -//} -// -// +TEST(Platform64) { + EXPECT_EQ(8 == sizeof(size_t), true); +} + +template +bool checkElemVectEq(const vector>& source, + const vector>& target) { + if (source.size() != target.size()) return false; + + for (size_t i = 0; i < source.size(); ++i) + if (source[i] != target[i]) return false; + + return true; +} + +template +void TestSparsePackUnpack() { + for (size_t maxSize = 1024 ; maxSize < 1024*1024; maxSize += 8192) { + vector> source; + GenSparseVectZipf(maxSize, source); + + LOG(LIB_INFO) << "testing maxSize: " << maxSize << "\nqty: " << source.size() + << " maxId: " << source.back().id_; + + char* pBuff = NULL; + size_t dataLen = 0; + + PackSparseElements(source, pBuff, dataLen); + + vector> target; + UnpackSparseElements(pBuff, dataLen, target); + + bool eqFlag = checkElemVectEq(source, target); + + if (!eqFlag) { + LOG(LIB_INFO) << "Different source and target, source.size(): " << source.size() + << " target.size(): " << target.size(); + // Let's print the first different in the case of equal # of elements + size_t i = 0; + for (; i < min(source.size(), target.size()); ++i) { + if (!(source[i] == target[i])) { + LOG(LIB_INFO) << "First diff, i = " << i << " " << source[i] << " vs " << target[i]; + break; + } + } + } + + EXPECT_EQ(eqFlag, true); + } +} + +TEST(BlockZeros) { + for (size_t id = 0 ; id <= 3*65536; id++) { + size_t id1 = removeBlockZeros(id); + + size_t id2 = addBlockZeros(id1); + EXPECT_EQ(id, id2); + } +} + +#ifdef DISABLE_LONG_TESTS +TEST(DISABLE_SparsePackUnpack) { +#else +TEST(SparsePackUnpack) { +#endif + TestSparsePackUnpack(); + TestSparsePackUnpack(); +} + +TEST(TestEfficientPower) { + double f = 2.0; + + for (unsigned i = 1; i <= 64; i++) { + 
double p1 = std::pow(f, i); + double p2 = EfficientPow(f, i); + + EXPECT_EQ(p1, p2); + } +} + +TEST(TestEfficientFract) { + unsigned MaxNumDig = 16; + + for (float a = 1.1f ; a <= 2.0f; a+= 0.1f) { + for (unsigned NumDig = 1; NumDig < MaxNumDig; ++NumDig) { + uint64_t MaxFract = uint64_t(1) << NumDig; + + for (uint64_t intFract = 0; intFract < MaxFract; ++intFract) { + float fract = float(intFract) / float(MaxFract); + float v1 = pow(a, fract); + float v2 = EfficientFractPow(a, fract, NumDig); + + EXPECT_EQ_EPS(v1, v2, 1e-5f); + } + } + } +} + +template +bool TestScalarProductAgree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + + float maxRelDiff = 1e-6f; + float maxAbsDiff = 1e-6f; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, T(1), T(2), true /* do normalize */); + GenRandVect(pVect2, dim, T(1), T(2), true /* do normalize */); + + T val1 = ScalarProduct(pVect1, pVect2, dim); + T val2 = ScalarProductSIMD(pVect1, pVect2, dim); + + bool bug = false; + T diff = fabs(val1 - val2); + T diffRel = diff/max(max(fabs(val1),fabs(val2)),T(1e-18)); + if (diffRel > maxRelDiff && diff > maxAbsDiff) { + bug = true; + cerr << "Bug ScalarProduct !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " diff=" << diff << " diffRel=" << diffRel << endl; + } + + if (bug) return false; + } + } + + return true; +} + +template +bool TestNormScalarProductAgree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + + float maxRelDiff = 1e-6f; + float maxAbsDiff = 1e-6f; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, T(1), T(2), true /* do normalize */); + GenRandVect(pVect2, dim, T(1), T(2), true /* do normalize */); + + T val1 = NormScalarProduct(pVect1, pVect2, dim); + T val2 = NormScalarProductSIMD(pVect1, pVect2, dim); + + bool bug = false; + T diff = fabs(val1 - val2); + T diffRel = diff/max(max(fabs(val1),fabs(val2)),T(1e-18)); + if (diffRel > maxRelDiff && diff > maxAbsDiff) { + bug = true; + cerr << "Bug NormScalarProduct !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " diff=" << diff << " diffRel=" << diffRel << endl; + } + + if (bug) return false; + } + } + + return true; +} + +// Agreement test functions +template +bool TestLInfAgree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); + GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); + + T val1 = LInfNormStandard(pVect1, pVect2, dim); + T val2 = LInfNorm(pVect1, pVect2, dim); + T val3 = LInfNormSIMD(pVect1, pVect2, dim); + + bool bug = false; + + if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { + cerr << "Bug LInf !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; + bug = true; + } + if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { + cerr << "Bug LInf !!! 
Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; + bug = true; + } + if (bug) return false; + } + } + + + return true; +} + +template +bool TestL1Agree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); + GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); + + T val1 = L1NormStandard(pVect1, pVect2, dim); + T val2 = L1Norm(pVect1, pVect2, dim); + T val3 = L1NormSIMD(pVect1, pVect2, dim); + + bool bug = false; + + if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { + cerr << "Bug L1 !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; + bug = true; + } + if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { + cerr << "Bug L1 !!! Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; + bug = true; + } + if (bug) return false; + } + } + + return true; +} + +template +bool TestL2Agree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); + GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); + + T val1 = L2NormStandard(pVect1, pVect2, dim); + T val2 = L2Norm(pVect1, pVect2, dim); + T val3 = L2NormSIMD(pVect1, pVect2, dim); + + bool bug = false; + + if (fabs(val1 - val2)/max(max(val1,val2),T(1e-18)) > 1e-6) { + cerr << "Bug L2 !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << endl; + bug = true; + } + if (fabs(val1 - val3)/max(max(val1,val2),T(1e-18)) > 1e-6) { + cerr << "Bug L2 !!! 
Dim = " << dim << " val1 = " << val1 << " val3 = " << val3 << endl; + bug = true; + } + if (bug) return false; + } + } + + + return true; +} + +template +bool TestItakuraSaitoAgree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + vector precompVect1(dim *2), precompVect2(dim * 2); + T* pPrecompVect1 = &precompVect1[0]; + T* pPrecompVect2 = &precompVect2[0]; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); + GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); + + copy(pVect1, pVect1 + dim, pPrecompVect1); + copy(pVect2, pVect2 + dim, pPrecompVect2); + + PrecompLogarithms(pPrecompVect1, dim); + PrecompLogarithms(pPrecompVect2, dim); + + T val0 = ItakuraSaito(pVect1, pVect2, dim); + T val1 = ItakuraSaitoPrecomp(pPrecompVect1, pPrecompVect2, dim); + T val2 = ItakuraSaitoPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); + + bool bug = false; + + T AbsDiff1 = fabs(val1 - val0); + T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); + + if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { + cerr << "Bug ItakuraSaito !!! Dim = " << dim << " val1 = " << val1 << " val0 = " << val0 << " Diff: " << (val1 - val0) << " RelDiff1: " << RelDiff1 << " << AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + T AbsDiff2 = fabs(val1 - val2); + T RelDiff2 = AbsDiff2/max(max(fabs(val1),fabs(val2)),T(1e-18)); + if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { + cerr << "Bug ItakuraSaito !!! 
Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " Diff: " << (val1 - val2) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; + bug = true; + } + + if (bug) return false; + } + } + + + return true; +} + +template +bool TestKLAgree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + vector precompVect1(dim *2), precompVect2(dim * 2); + T* pPrecompVect1 = &precompVect1[0]; + T* pPrecompVect2 = &precompVect2[0]; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); + GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); + + copy(pVect1, pVect1 + dim, pPrecompVect1); + copy(pVect2, pVect2 + dim, pPrecompVect2); + + PrecompLogarithms(pPrecompVect1, dim); + PrecompLogarithms(pPrecompVect2, dim); + + T val0 = KLStandard(pVect1, pVect2, dim); + T val1 = KLStandardLogDiff(pVect1, pVect2, dim); + T val2 = KLPrecomp(pPrecompVect1, pPrecompVect2, dim); + T val3 = KLPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); + + bool bug = false; + + /* + * KLStandardLog has a worse accuracy due to computing the log of ratios + * as opposed to difference of logs, but it is more efficient (log can be + * expensive to compute) + */ + + T AbsDiff1 = fabs(val1 - val0); + T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); + if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { + cerr << "Bug KL !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + T AbsDiff2 = fabs(val1 - val2); + T RelDiff2 = AbsDiff2/max(max(fabs(val1),fabs(val2)),T(1e-18)); + if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { + cerr << "Bug KL !!! 
Dim = " << dim << " val2 = " << val2 << " val1 = " << val1 << " Diff: " << (val2 - val1) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; + bug = true; + } + + T AbsDiff3 = fabs(val1 - val3); + T RelDiff3 = AbsDiff3/max(max(fabs(val1),fabs(val3)),T(1e-18)); + if (RelDiff3 > 1e-5 && AbsDiff3 > 1e-5) { + cerr << "Bug KL !!! Dim = " << dim << " val3 = " << val3 << " val1 = " << val1 << " Diff: " << (val3 - val1) << " RelDiff3: " << RelDiff3 << " AbsDiff3: " << AbsDiff3 << endl; + bug = true; + } + + if (bug) return false; + } + } + + + return true; +} + +template +bool TestKLGeneralAgree(size_t N, size_t dim, size_t Rep) { + T* pVect1 = new T[dim]; + T* pVect2 = new T[dim]; + T* pPrecompVect1 = new T[dim * 2]; + T* pPrecompVect2 = new T[dim * 2]; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), false); + GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), false); + + copy(pVect1, pVect1 + dim, pPrecompVect1); + copy(pVect2, pVect2 + dim, pPrecompVect2); + + PrecompLogarithms(pPrecompVect1, dim); + PrecompLogarithms(pPrecompVect2, dim); + + T val0 = KLGeneralStandard(pVect1, pVect2, dim); + T val2 = KLGeneralPrecomp(pPrecompVect1, pPrecompVect2, dim); + T val3 = KLGeneralPrecompSIMD(pPrecompVect1, pPrecompVect2, dim); + + bool bug = false; + + T AbsDiff1 = fabs(val2 - val0); + T RelDiff1 = AbsDiff1/max(max(fabs(val2),fabs(val0)),T(1e-18)); + if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { + cerr << "Bug KL !!! Dim = " << dim << " val0 = " << val0 << " val2 = " << val2 << " Diff: " << (val0 - val2) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + T AbsDiff2 = fabs(val3 - val2); + T RelDiff2 = AbsDiff2/max(max(fabs(val3),fabs(val2)),T(1e-18)); + if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { + cerr << "Bug KL !!! 
Dim = " << dim << " val2 = " << val2 << " val3 = " << val3 << " Diff: " << (val2 - val3) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; + bug = true; + } + + if (bug) return false; + } + } + + + return true; +} + +template +bool TestJSAgree(size_t N, size_t dim, size_t Rep, double pZero) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + vector precompVect1(dim *2), precompVect2(dim * 2); + T* pPrecompVect1 = &precompVect1[0]; + T* pPrecompVect2 = &precompVect2[0]; + + T Dist = 0; + T Error = 0; + T TotalQty = 0; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); + SetRandZeros(pVect1, dim, pZero); + Normalize(pVect1, dim); + GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); + SetRandZeros(pVect2, dim, pZero); + Normalize(pVect2, dim); + + copy(pVect1, pVect1 + dim, pPrecompVect1); + copy(pVect2, pVect2 + dim, pPrecompVect2); + + PrecompLogarithms(pPrecompVect1, dim); + PrecompLogarithms(pPrecompVect2, dim); + + T val0 = JSStandard(pVect1, pVect2, dim); + T val1 = JSPrecomp(pPrecompVect1, pPrecompVect2, dim); + + bool bug = false; + + T AbsDiff1 = fabs(val1 - val0); + T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); + + if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { + cerr << "Bug JS (1) " << typeid(T).name() << " !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 << " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + T val2 = JSPrecompApproxLog(pPrecompVect1, pPrecompVect2, dim); + T val3 = JSPrecompSIMDApproxLog(pPrecompVect1, pPrecompVect2, dim); + + T AbsDiff2 = fabs(val2 - val3); + T RelDiff2 = AbsDiff2/max(max(fabs(val2),fabs(val3)),T(1e-18)); + + if (RelDiff2 > 1e-5 && AbsDiff2 > 1e-5) { + cerr << "Bug JS (2) " << typeid(T).name() << " !!! 
Dim = " << dim << " val2 = " << val2 << " val3 = " << val3 << " Diff: " << (val2 - val3) << " RelDiff2: " << RelDiff2 << " AbsDiff2: " << AbsDiff2 << endl; + bug = true; + } + + T AbsDiff3 = fabs(val1 - val2); + T RelDiff3 = AbsDiff3/max(max(fabs(val1),fabs(val2)),T(1e-18)); + + Dist += val1; + Error += AbsDiff3; + ++TotalQty; + + if (RelDiff3 > 1e-4 && AbsDiff3 > 1e-4) { + cerr << "Bug JS (3) " << typeid(T).name() << " !!! Dim = " << dim << " val1 = " << val1 << " val2 = " << val2 << " Diff: " << (val1 - val2) << " RelDiff3: " << RelDiff3 << " AbsDiff2: " << AbsDiff3 << endl; + bug = true; + } + + if (bug) return false; + } + } + + LOG(LIB_INFO) << typeid(T).name() << " JS approximation error: average absolute: " << Error / TotalQty << + " avg. dist: " << Dist / TotalQty << " average relative: " << Error/Dist; + + + return true; +} + +template +bool TestRenyiDivAgree(size_t N, size_t dim, size_t Rep, T alpha) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + + T Dist = 0; + T Error = 0; + T TotalQty = 0; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); + GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); + + Normalize(pVect1, dim); + Normalize(pVect2, dim); + + T val0 = renyiDivergenceSlow(pVect1, pVect2, dim, alpha); + T val1 = renyiDivergenceFast(pVect1, pVect2, dim, alpha); + + bool bug = false; + + T AbsDiff1 = fabs(val1 - val0); + T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); + + Error += AbsDiff1; + ++TotalQty; + + if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { + cerr << "Bug Reniy Div. (1) " << typeid(T).name() << " !!! Dim = " << dim + << "alpha=" << alpha << " val0 = " << val0 << " val1 = " << val1 + << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 + << " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + if (bug) return false; + } + } + + LOG(LIB_INFO) << typeid(T).name() << " Renyi Div. 
approximation error: average absolute: " << Error / TotalQty << + " avg. dist: " << Dist / TotalQty << " average relative: " << Error/Dist; + + + return true; +} + +template +bool TestAlphaBetaDivAgree(size_t N, size_t dim, size_t Rep, T alpha, T beta) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + + T Dist = 0; + T Error = 0; + T TotalQty = 0; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, T(RANGE_SMALL), T(1.0), true); + GenRandVect(pVect2, dim, T(RANGE_SMALL), T(1.0), true); + + Normalize(pVect1, dim); + Normalize(pVect2, dim); + + T val0 = alphaBetaDivergenceSlow(pVect1, pVect2, dim, alpha, beta); + T val1 = alphaBetaDivergenceFast(pVect1, pVect2, dim, alpha, beta); + + bool bug = false; + + T AbsDiff1 = fabs(val1 - val0); + T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); + + Error += AbsDiff1; + ++TotalQty; + + if (RelDiff1 > 1e-5 && AbsDiff1 > 1e-5) { + cerr << "Bug alpha-beta Div. (1) " << typeid(T).name() << " !!! Dim = " << dim + << "alpha=" << alpha << " val0 = " << val0 << " val1 = " << val1 + << " Diff: " << (val0 - val1) << " RelDiff1: " << RelDiff1 + << " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + if (bug) return false; + } + } + + LOG(LIB_INFO) << typeid(T).name() << " alpha-beta div. approximation error: average absolute: " << Error / TotalQty << " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; + + + return true; +} + +bool TestSpearmanFootruleAgree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + PivotIdType* pVect1 = &vect1[0]; + PivotIdType* pVect2 = &vect2[0]; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandIntVect(pVect1, dim); + GenRandIntVect(pVect2, dim); + + int val0 = SpearmanFootrule(pVect1, pVect2, dim); + int val1 = SpearmanFootruleSIMD(pVect1, pVect2, dim); + + bool bug = false; + + + if (val0 != val1) { + cerr << "Bug SpearmanFootrule !!! Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << endl; + bug = true; + } + + if (bug) return false; + } + } + + + return true; +} + +bool TestSpearmanRhoAgree(size_t N, size_t dim, size_t Rep) { + vector vect1(dim), vect2(dim); + PivotIdType* pVect1 = &vect1[0]; + PivotIdType* pVect2 = &vect2[0]; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandIntVect(pVect1, dim); + GenRandIntVect(pVect2, dim); + + int val0 = SpearmanRho(pVect1, pVect2, dim); + int val1 = SpearmanRhoSIMD(pVect1, pVect2, dim); + + bool bug = false; + + + if (val0 != val1) { + cerr << "Bug SpearmanRho !!! 
Dim = " << dim << " val0 = " << val0 << " val1 = " << val1 << " Diff: " << (val0 - val1) << endl; + bug = true; + } + + if (bug) return false; + } + } + + + return true; +} + +template +bool TestLPGenericAgree(size_t N, size_t dim, size_t Rep, T power) { + vector vect1(dim), vect2(dim); + T* pVect1 = &vect1[0]; + T* pVect2 = &vect2[0]; + + T TotalQty = 0, Error = 0, Dist = 0; + + for (size_t i = 0; i < Rep; ++i) { + for (size_t j = 1; j < N; ++j) { + GenRandVect(pVect1, dim, -T(RANGE), T(RANGE)); + GenRandVect(pVect2, dim, -T(RANGE), T(RANGE)); + + T val0 = LPGenericDistance(pVect1, pVect2, dim, power); + T val1 = LPGenericDistanceOptim(pVect1, pVect2, dim, power); + + bool bug = false; + + T AbsDiff1 = fabs(val1 - val0); + T RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val0)),T(1e-18)); + + T maxRelDiff = 1e-5f; + T maxAbsDiff = 1e-5f; + /* + * For large powers, the difference can be larger, + * because our approximations are efficient, but not very + * precise + */ + if (power > 8) { maxAbsDiff = maxRelDiff = 1e-3f;} + if (power > 12) { maxAbsDiff = maxRelDiff = 0.01f;} + if (power > 22) { maxAbsDiff = maxRelDiff = 0.1f;} + + ++TotalQty; + Error += RelDiff1; + Dist += val0; + + if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { + cerr << "Bug LP" << power << " !!! Dim = " << dim << + " val1 = " << val1 << " val0 = " << val0 << + " Diff: " << (val1 - val0) << + " RelDiff1: " << RelDiff1 << + " (max for this power: " << maxRelDiff << ") " << + " AbsDiff1: " << AbsDiff1 << " (max for this power: " << maxAbsDiff << ")" << endl; + } + + if (bug) return false; + } + } + + if (power < 4) { + LOG(LIB_INFO) << typeid(T).name() << " LP approximation error: average absolute " << Error / TotalQty << " avg. 
dist: " << Dist / TotalQty << " average relative: " << Error/Dist; + + } + + return true; +} + +bool TestBitHammingAgree(size_t N, size_t dim, size_t Rep) { + size_t WordQty = (dim + 31)/32; + vector arr(N * WordQty); + uint32_t* pArr = &arr[0]; + + uint32_t *p = pArr; + for (size_t i = 0; i < N; ++i, p+= WordQty) { + vector perm(dim); + GenRandIntVect(&perm[0], dim); + for (unsigned j = 0; j < dim; ++j) + perm[j] = perm[j] % 2; + vector h; + Binarize(perm, 1, h); + CHECK(h.size() == WordQty); + memcpy(p, &h[0], WordQty * sizeof(h[0])); + } + + WallClockTimer t; + + t.reset(); + + bool res = true; + + for (size_t j = 1; j < N; ++j) { + uint32_t* pVect1 = pArr + j*WordQty; + uint32_t* pVect2 = pArr + (j-1)*WordQty; + int d1 = BitHamming(pVect1, pVect2, WordQty); + int d2 = 0; + + for (unsigned t = 0; t < WordQty; ++t) { + for (unsigned k = 0; k < 32; ++k) { + d2 += ((pVect1[t]>>k)&1) != ((pVect2[t]>>k)&1); + } + } + if (d1 != d2) { + cerr << "Bug bit hamming, WordQty = " << WordQty << " d1 = " << d1 << " d2 = " << d2 << endl; + res = false; + break; + } + } + + return res; +} + + +bool TestSparseAngularDistanceAgree(const string& dataFile, size_t N, size_t Rep) { + typedef float T; + + unique_ptr spaceFast(new SpaceSparseAngularDistanceFast()); + unique_ptr> spaceReg(new SpaceSparseAngularDistance()); + + ObjectVector elemsFast; + ObjectVector elemsReg; + vector tmp; + + unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); + spaceFast->UpdateParamsFromFile(*inpStateFast); + unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); + spaceReg->UpdateParamsFromFile(*inpStateReg); + + CHECK(elemsFast.size() == elemsReg.size()); + + N = min(N, elemsReg.size()); + + bool bug = false; + + float maxRelDiff = 2e-5f; + float maxAbsDiff = 1e-6f; + + for (size_t j = Rep; j < N; ++j) + for (size_t k = j - Rep; k < j; ++k) { + float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); + float val2 = 
spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); + + float AbsDiff1 = fabs(val1 - val2); + float RelDiff1 = AbsDiff1 / max(max(fabs(val1), fabs(val2)), T(1e-18)); + + if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { + cerr << "Bug fast vs non-fast angular dist " << + " val1 = " << val1 << " val2 = " << val2 << + " Diff: " << (val1 - val2) << + " RelDiff1: " << RelDiff1 << + " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + if (bug) return false; + } + + return true; +} + + + +bool TestSparseCosineSimilarityAgree(const string& dataFile, size_t N, size_t Rep) { + typedef float T; + + unique_ptr spaceFast(new SpaceSparseCosineSimilarityFast()); + unique_ptr> spaceReg (new SpaceSparseCosineSimilarity()); + + ObjectVector elemsFast; + ObjectVector elemsReg; + vector tmp; + + unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); + spaceFast->UpdateParamsFromFile(*inpStateFast); + unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); + spaceReg->UpdateParamsFromFile(*inpStateReg); + + CHECK(elemsFast.size() == elemsReg.size()); + + N = min(N, elemsReg.size()); + + bool bug = false; + + float maxRelDiff = 1e-5f; + float maxAbsDiff = 1e-5f; + + for (size_t j = Rep; j < N; ++j) + for (size_t k = j - Rep; k < j; ++k) { + float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); + float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); + + float AbsDiff1 = fabs(val1 - val2); + float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); + + if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { + cerr << "Bug fast vs non-fast cosine " << + " val1 = " << val1 << " val2 = " << val2 << + " Diff: " << (val1 - val2) << + " RelDiff1: " << RelDiff1 << + " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + if (bug) return false; + } + + return true; +} + +bool TestSparseNegativeScalarProductAgree(const string& dataFile, size_t N, size_t Rep) { + typedef float T; + + unique_ptr 
spaceFast(new SpaceSparseNegativeScalarProductFast()); + unique_ptr> spaceReg (new SpaceSparseNegativeScalarProduct()); + + ObjectVector elemsFast; + ObjectVector elemsReg; + vector tmp; + + unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); + spaceFast->UpdateParamsFromFile(*inpStateFast); + unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); + spaceReg->UpdateParamsFromFile(*inpStateReg); + + CHECK(elemsFast.size() == elemsReg.size()); + + N = min(N, elemsReg.size()); + + bool bug = false; + + float maxRelDiff = 1e-6f; + float maxAbsDiff = 1e-6f; + + for (size_t j = Rep; j < N; ++j) + for (size_t k = j - Rep; k < j; ++k) { + float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); + float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); + + float AbsDiff1 = fabs(val1 - val2); + float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); + + if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { + cerr << "Bug fast vs non-fast negative scalar/dot product " << + " val1 = " << val1 << " val2 = " << val2 << + " Diff: " << (val1 - val2) << + " RelDiff1: " << RelDiff1 << + " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + if (bug) return false; + } + + return true; +} + +bool TestSparseQueryNormNegativeScalarProductAgree(const string& dataFile, size_t N, size_t Rep) { + typedef float T; + + unique_ptr spaceFast(new SpaceSparseQueryNormNegativeScalarProductFast()); + unique_ptr> spaceReg (new SpaceSparseQueryNormNegativeScalarProduct()); + + ObjectVector elemsFast; + ObjectVector elemsReg; + vector tmp; + + unique_ptr inpStateFast(spaceFast->ReadDataset(elemsFast, tmp, dataFile, N)); + spaceFast->UpdateParamsFromFile(*inpStateFast); + unique_ptr inpStateReg(spaceReg->ReadDataset(elemsReg, tmp, dataFile, N)); + spaceReg->UpdateParamsFromFile(*inpStateReg); + + CHECK(elemsFast.size() == elemsReg.size()); + + N = min(N, elemsReg.size()); + + bool bug = false; + + float maxRelDiff = 
1e-6f; + float maxAbsDiff = 1e-6f; + + for (size_t j = Rep; j < N; ++j) + for (size_t k = j - Rep; k < j; ++k) { + float val1 = spaceFast->IndexTimeDistance(elemsFast[k], elemsFast[j]); + float val2 = spaceReg->IndexTimeDistance(elemsReg[k], elemsReg[j]); + + float AbsDiff1 = fabs(val1 - val2); + float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); + + if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { + cerr << "Bug fast vs non-fast QUERY-NORMALIZED negative scalar/dot product " << + " val1 = " << val1 << " val2 = " << val2 << + " Diff: " << (val1 - val2) << + " RelDiff1: " << RelDiff1 << + " AbsDiff1: " << AbsDiff1 << endl; + bug = true; + } + + if (bug) return false; + } + + return true; +} + +// Limitation: this is only for spaces without params +bool TestPivotIndex(const string& spaceName, + bool useDummyIndex, + const string& dataFile, size_t dataQty, + const string& pivotFile, size_t pivotQty) { + + LOG(LIB_INFO) << "space: " << spaceName << " real pivot index?: " << !useDummyIndex << " " << + " dataFile: " << dataFile << " " << + " pivotFile: " << pivotFile; + try { + typedef float T; + + AnyParams emptyParams; + + unique_ptr> space(SpaceFactoryRegistry::Instance().CreateSpace(spaceName, emptyParams)); + + ObjectVector data; + ObjectVector pivots; + vector tmp; + + float maxRelDiff = 1e-6f; + float maxAbsDiff = 1e-6f; + + unique_ptr inpStateFast(space->ReadDataset(data, tmp, dataFile, dataQty)); + space->UpdateParamsFromFile(*inpStateFast); + space->ReadDataset(pivots, tmp, pivotFile, pivotQty); + + unique_ptr> pivIndx(useDummyIndex ? 
+ new DummyPivotIndex(*space, pivots) + : + space->CreatePivotIndex(pivots, + 0 /* Let's not test using the hashing trick here, b/c distances would be somewhat different */)); + + for (size_t did = 0; did < dataQty; ++did) { + vector vDst; + pivIndx->ComputePivotDistancesIndexTime(data[did], vDst); + CHECK_MSG(vDst.size() == pivotQty, "ComputePivotDistancesIndexTime returns incorrect # of elements different from the # of pivots"); + + for (size_t pid = 0; pid < pivotQty; ++pid) { + T val2 = space->IndexTimeDistance(pivots[pid], data[did]); + T val1 = vDst[pid]; + + float AbsDiff1 = fabs(val1 - val2); + float RelDiff1 = AbsDiff1/max(max(fabs(val1),fabs(val2)),T(1e-18)); + + if (RelDiff1 > maxRelDiff && AbsDiff1 > maxAbsDiff) { + cerr << "Bug in fast computation of all-pivot distance, " << + " space: " << spaceName << " real pivot index?: " << !useDummyIndex << endl << + " dataFile: " << dataFile << endl << + " pivotFile: " << pivotFile << endl << + " data index: " << did << " pivot index: " << pid << endl << + " val1 = " << val1 << " val2 = " << val2 << + " Diff: " << (val1 - val2) << + " RelDiff1: " << RelDiff1 << + " AbsDiff1: " << AbsDiff1 << endl; + return false; + } + } + } + } catch (const exception& e) { + LOG(LIB_INFO) << "Got exception while testing: " << e.what(); + return false; + } + return true; +} + + + + +#ifdef DISABLE_LONG_TESTS +TEST(DISABLE_TestAgree) { +#else +TEST(TestAgree) { +#endif + int nTest = 0; + int nFail = 0; + + nTest++; + nFail += !TestSparseAngularDistanceAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); + + nTest++; + nFail += !TestSparseAngularDistanceAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); + + nTest++; + nFail += !TestSparseCosineSimilarityAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); + + nTest++; + nFail += !TestSparseCosineSimilarityAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); + + + nTest++; + nFail += !TestSparseNegativeScalarProductAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 
200); + + nTest++; + nFail += !TestSparseNegativeScalarProductAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); + + nTest++; + nFail += !TestSparseQueryNormNegativeScalarProductAgree(sampleDataPrefix + "sparse_5K.txt", 1000, 200); + + nTest++; + nFail += !TestSparseQueryNormNegativeScalarProductAgree(sampleDataPrefix + "sparse_wiki_5K.txt", 1000, 200); + + + /* + * 32 should be more than enough for almost all methods, + * where loop-unrolling includes at most 16 distance computations. + * + * Bit-Hamming is an exception. + * + */ + for (unsigned dim = 1; dim <= 1024; dim+=2) { + LOG(LIB_INFO) << "Dim = " << dim; + + nFail += !TestBitHammingAgree(1000, dim, 1000); + } + + for (unsigned dim = 1; dim <= 32; ++dim) { + LOG(LIB_INFO) << "Dim = " << dim; + + /* + * This is a costly check, we don't need to do it for large # dimensions. + * Anyways, the function is not using any loop unrolling, so 8 should be sufficient. + */ + if (dim <= 8) { + + for (float power = 0.125; power <= 32; power += 0.125) { + TestLPGenericAgree(1024, dim, 10, power); + } + for (double power = 0.125; power <= 32; power += 0.125) { + TestLPGenericAgree(1024, dim, 10, power); + } + + // In the case of Renyi divergence 0 < alpha < 1, 1 < alpha < infinity + // https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R%C3%A9nyi_divergence + for (float alpha = 0.125; alpha <= 2; alpha += 0.125) { + if (fabs(alpha - 1) < 1e-6) continue; + TestRenyiDivAgree(1024, dim, 10, alpha); + } + for (double alpha = 0.125; alpha <= 2; alpha += 0.125) { + if (fabs(alpha - 1) < 1e-6) continue; + TestRenyiDivAgree(1024, dim, 10, alpha); + } + + for (float alpha = -2; alpha <= 2; alpha += 0.5) + for (float beta = -2; beta <= 2; beta += 0.5) + { + TestAlphaBetaDivAgree(1024, dim, 10, alpha, beta); + } + + for (double alpha = -2; alpha <= 2; alpha += 0.5) + for (double beta = -2; beta <= 2; beta += 0.5) + { + TestAlphaBetaDivAgree(1024, dim, 10, alpha, beta); + } + } + + nTest++; + nFail += 
!TestNormScalarProductAgree(1024, dim, 10); + nTest++; + nFail += !TestNormScalarProductAgree(1024, dim, 10); + + nTest++; + nFail += !TestScalarProductAgree(1024, dim, 10); + nTest++; + nFail += !TestScalarProductAgree(1024, dim, 10); + + nTest++; + nFail += !TestSpearmanFootruleAgree(1024, dim, 10); + + nTest++; + nFail += !TestSpearmanRhoAgree(1024, dim, 10); + + nTest++; + nFail += !TestJSAgree(1024, dim, 10, 0.5); + nTest++; + nFail += !TestJSAgree(1024, dim, 10, 0.5); + + nTest++; + nFail += !TestKLGeneralAgree(1024, dim, 10); + nTest++; + nFail += !TestKLGeneralAgree(1024, dim, 10); + + nTest++; + nFail += !TestLInfAgree(1024, dim, 10); + nTest++; + nFail += !TestLInfAgree(1024, dim, 10); + + nTest++; + nFail += !TestL1Agree(1024, dim, 10); + nTest++; + nFail += !TestL1Agree(1024, dim, 10); + + nTest++; + nFail += !TestL2Agree(1024, dim, 10); + nTest++; + nFail += !TestL2Agree(1024, dim, 10); + + nTest++; + nFail += !TestKLAgree(1024, dim, 10); + nTest++; + nFail += !TestKLAgree(1024, dim, 10); + + nTest++; + nFail += !TestItakuraSaitoAgree(1024, dim, 10); + nTest++; + nFail += !TestItakuraSaitoAgree(1024, dim, 10); + } + + LOG(LIB_INFO) << nTest << " (sub) tests performed " << nFail << " failed"; + + EXPECT_EQ(0, nFail); +} + +#ifdef DISABLE_LONG_TESTS +TEST(DISABLE_TestAgreePivotIndex) { +#else +TEST(TestAgreePivotIndex) { +#endif + int nTest = 0; + int nFail = 0; + + const size_t dataQty = 1000; + const size_t pivotQty = 100; + + vector vDataFiles = {"sparse_5K.txt", "sparse_wiki_5K.txt"}; + vector vSpaces = {SPACE_SPARSE_COSINE_SIMILARITY_FAST, SPACE_SPARSE_ANGULAR_DISTANCE_FAST, + SPACE_SPARSE_NEGATIVE_SCALAR_FAST, SPACE_SPARSE_QUERY_NORM_NEGATIVE_SCALAR_FAST}; + const string pivotFile = "sparse_pivots1K_termQty5K_maxId_100K.txt"; + + for (string spaceName : vSpaces) + for (string dataFile : vDataFiles) { + // 1. 
test with a dummy pivot index + nTest++; + nFail += !TestPivotIndex(spaceName, true, sampleDataPrefix + dataFile, dataQty, sampleDataPrefix + pivotFile, pivotQty); + + // 2. test with a real pivot index + nTest++; + nFail += !TestPivotIndex(spaceName, false, sampleDataPrefix + dataFile, dataQty, sampleDataPrefix + pivotFile, pivotQty); + } + + LOG(LIB_INFO) << nTest << " (sub) tests performed " << nFail << " failed"; + + EXPECT_EQ(0, nFail); +} + + } // namespace similarity -// + diff --git a/similarity_search/test/test_space_serial.cc b/similarity_search/test/test_space_serial.cc index 5dd36b7..17a5c6b 100644 --- a/similarity_search/test/test_space_serial.cc +++ b/similarity_search/test/test_space_serial.cc @@ -135,9 +135,6 @@ bool fullTest(const vector& dataSetStr, size_t maxNumRec, const string& dataSet1.push_back(space->CreateObjFromStr(id++, -1, s, NULL).release()); vExternIds1.push_back(ss.str()); -// std::cout << space->CreateStrFromObj(dataSet1[dataSet1.size() - 1], NULL) << std::endl; - std::cout << s << std::endl; - if (id >= maxNumRec) break; } @@ -153,53 +150,53 @@ const char *emptyParams[] = {NULL}; const char *paramsDistL2[] = {"dist=" SPACE_WORD_EMBED_DIST_L2, NULL}; const char *paramsDistCosine[] = {"dist=" SPACE_WORD_EMBED_DIST_COSINE, NULL}; -//TEST(Test_WordEmbedSpace) { -// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { -// EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt", SPACE_WORD_EMBED, paramsDistL2, true)); -// EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt",SPACE_WORD_EMBED, paramsDistCosine, true)); -// EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt", SPACE_WORD_EMBED, paramsDistL2, true)); -// EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt",SPACE_WORD_EMBED, paramsDistCosine, true)); -// } -//} -// -//TEST(Test_DenseVectorSpace) { -// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; 
++maxNumRec) { -// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "l2", emptyParams, false)); -// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "l2", emptyParams, false)); -// } -//} -// -//TEST(Test_DenseVectorKLDiv) { -// // Test KL-diverg. with and without precomputation of logarithms -// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { -// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenfast", emptyParams, false)); -// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenfast", emptyParams, false)); -// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenslow", emptyParams, false)); -// EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenslow", emptyParams, false)); -// } -//} -// -//TEST(Test_SparseVectorSpace) { -// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { -// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse", emptyParams, false)); -// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse", emptyParams, false)); -// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse", emptyParams, false)); -// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse", emptyParams, false)); -// } -//} -// -//TEST(Test_SparseVectorSpaceFast) { -// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { -// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse_fast", emptyParams, false)); -// EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse_fast", emptyParams, false)); -// } -//} -// -//TEST(Test_StringSpace) { -// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { -// 
EXPECT_EQ(true, fullTest("dna32_4_5K.txt", maxNumRec, "tmp_out_file.txt", "leven", emptyParams, false)); -// } -//} +TEST(Test_WordEmbedSpace) { + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt", SPACE_WORD_EMBED, paramsDistL2, true)); + EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt",SPACE_WORD_EMBED, paramsDistCosine, true)); + EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt", SPACE_WORD_EMBED, paramsDistL2, true)); + EXPECT_EQ(true, fullTest("glove.6B.100d_100.txt", maxNumRec, "tmp_out_file.txt",SPACE_WORD_EMBED, paramsDistCosine, true)); + } +} + +TEST(Test_DenseVectorSpace) { + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "l2", emptyParams, false)); + EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "l2", emptyParams, false)); + } +} + +TEST(Test_DenseVectorKLDiv) { + // Test KL-diverg. 
with and without precomputation of logarithms + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenfast", emptyParams, false)); + EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenfast", emptyParams, false)); + EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenslow", emptyParams, false)); + EXPECT_EQ(true, fullTest("final128_10K.txt", maxNumRec, "tmp_out_file.txt", "kldivgenslow", emptyParams, false)); + } +} + +TEST(Test_SparseVectorSpace) { + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse", emptyParams, false)); + EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse", emptyParams, false)); + EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse", emptyParams, false)); + EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse", emptyParams, false)); + } +} + +TEST(Test_SparseVectorSpaceFast) { + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "cosinesimil_sparse_fast", emptyParams, false)); + EXPECT_EQ(true, fullTest("sparse_5K.txt", maxNumRec, "tmp_out_file.txt", "angulardist_sparse_fast", emptyParams, false)); + } +} + +TEST(Test_StringSpace) { + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest("dna32_4_5K.txt", maxNumRec, "tmp_out_file.txt", "leven", emptyParams, false)); + } +} TEST(Test_BitHamming) { vector testVect; @@ -235,15 +232,15 @@ TEST(Test_BitJaccard) { } } -//#if defined(WITH_EXTRAS) -//TEST(Test_SQFD) { -// const char* sqfdParams[] = {"alpha=1", NULL} ; -// for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; 
++maxNumRec) { -// EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); -// EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); -// } -//} -//#endif +#if defined(WITH_EXTRAS) +TEST(Test_SQFD) { + const char* sqfdParams[] = {"alpha=1", NULL} ; + for (size_t maxNumRec = 1; maxNumRec < MAX_NUM_REC; ++maxNumRec) { + EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); + EXPECT_EQ(true, fullTest("sqfd20_10k_10k.txt", maxNumRec, "tmp_out_file.txt", "sqfd_heuristic_func", sqfdParams, false)); + } +} +#endif } From 18895c5154638379fd5fbfac08c66d6b9826e45f Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Thu, 21 Feb 2019 16:15:01 -0800 Subject: [PATCH 14/17] remove empty files --- .../src/space/space_bit_hamming.cc | 44 ---- .../src/space/space_bit_jaccard.cc | 44 ---- .../src/space/space_bit_vector.cc | 195 ------------------ 3 files changed, 283 deletions(-) delete mode 100644 similarity_search/src/space/space_bit_hamming.cc delete mode 100644 similarity_search/src/space/space_bit_jaccard.cc delete mode 100644 similarity_search/src/space/space_bit_vector.cc diff --git a/similarity_search/src/space/space_bit_hamming.cc b/similarity_search/src/space/space_bit_hamming.cc deleted file mode 100644 index 72c2d18..0000000 --- a/similarity_search/src/space/space_bit_hamming.cc +++ /dev/null @@ -1,44 +0,0 @@ -///** -// * Non-metric Space Library -// * -// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak -// * -// * For the complete list of contributors and further details see: -// * https://github.com/searchivarius/NonMetricSpaceLib -// * -// * Copyright (c) 2013-2018 -// * -// * This code is released under the -// * Apache License Version 2.0 http://www.apache.org/licenses/. 
-// * -// */ -//#include -//#include -//#include -//#include -//#include -// -//#include "space/space_bit_hamming.h" -//#include "permutation_utils.h" -//#include "logging.h" -//#include "distcomp.h" -//#include "read_data.h" -//#include "experimentconf.h" -// -//namespace similarity { -// -//using namespace std; -// -//template -//dist_t SpaceBitHamming::HiddenDistance(const Object* obj1, const Object* obj2) const { -// CHECK(obj1->datalength() > 0); -// CHECK(obj1->datalength() == obj2->datalength()); -// const dist_uint_t* x = reinterpret_cast(obj1->data()); -// const dist_uint_t* y = reinterpret_cast(obj2->data()); -// const size_t length = obj1->datalength() / sizeof(dist_uint_t) -// - 1; // the last integer is an original number of elements -// -// return BitHamming(x, y, length); -//} -// -//} diff --git a/similarity_search/src/space/space_bit_jaccard.cc b/similarity_search/src/space/space_bit_jaccard.cc deleted file mode 100644 index aa0ecad..0000000 --- a/similarity_search/src/space/space_bit_jaccard.cc +++ /dev/null @@ -1,44 +0,0 @@ -///** -// * Non-metric Space Library -// * -// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak -// * -// * For the complete list of contributors and further details see: -// * https://github.com/searchivarius/NonMetricSpaceLib -// * -// * Copyright (c) 2013-2018 -// * -// * This code is released under the -// * Apache License Version 2.0 http://www.apache.org/licenses/. 
-// * -// */ -//#include -//#include -//#include -//#include -//#include -// -//#include "space/space_bit_jaccard.h" -//#include "permutation_utils.h" -//#include "logging.h" -//#include "distcomp.h" -//#include "read_data.h" -//#include "experimentconf.h" -// -////namespace similarity { -//// -//////using namespace std; -//// -//////template -//////dist_t SpaceBitJaccard::HiddenDistance(const Object* obj1, const Object* obj2) const { -////// CHECK(obj1->datalength() > 0); -////// CHECK(obj1->datalength() == obj2->datalength()); -////// const dist_uint_t* x = reinterpret_cast(obj1->data()); -////// const dist_uint_t* y = reinterpret_cast(obj2->data()); -////// const size_t length = obj1->datalength() / sizeof(dist_uint_t) -////// - 1; // the last integer is an original number of elements -////// -////// return BitJaccard(x, y, length); -//////} -//// -////} diff --git a/similarity_search/src/space/space_bit_vector.cc b/similarity_search/src/space/space_bit_vector.cc deleted file mode 100644 index 043319d..0000000 --- a/similarity_search/src/space/space_bit_vector.cc +++ /dev/null @@ -1,195 +0,0 @@ -///** -// * Non-metric Space Library -// * -// * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak -// * -// * For the complete list of contributors and further details see: -// * https://github.com/searchivarius/NonMetricSpaceLib -// * -// * Copyright (c) 2013-2018 -// * -// * This code is released under the -// * Apache License Version 2.0 http://www.apache.org/licenses/. 
-// * -// */ -//#include -//#include -//#include -//#include -//#include -// -//#include "space/space_bit_vector.h" -//#include "permutation_utils.h" -//#include "logging.h" -//#include "distcomp.h" -//#include "read_data.h" -//#include "experimentconf.h" -// -//namespace similarity { -// -//using namespace std; -// -////template -////dist_t SpaceBitVector::HiddenDistance(const Object* obj1, const Object* obj2) const { -//// CHECK(obj1->datalength() > 0); -//// CHECK(obj1->datalength() == obj2->datalength()); -//// const dist_uint_t* x = reinterpret_cast(obj1->data()); -//// const dist_uint_t* y = reinterpret_cast(obj2->data()); -//// const size_t length = obj1->datalength() / sizeof(dist_uint_t) -//// - 1; // the last integer is an original number of elements -//// -//// return BitVector(x, y, length); -////} -// -////template -////void SpaceBitVector::ReadBitMaskVect(std::string line, LabelType& label, std::vector& binVect) const -////{ -//// binVect.clear(); -//// -//// label = Object::extractLabel(line); -//// -//// std::stringstream str(line); -//// -//// str.exceptions(std::ios::badbit); -//// -//// -//// ReplaceSomePunct(line); -//// -//// vector v; -//// -////#if 0 -//// try { -//// unsigned val; -//// -//// while (str >> val) { -//// if (val != 0 && val != 1) { -//// throw runtime_error("Only zeros and ones are allowed"); -//// } -//// v.push_back(val); -//// } -//// } catch (const std::exception &e) { -//// LOG(LIB_ERROR) << "Exception: " << e.what(); -//// PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; -//// LOG(LIB_ERROR) << err.stream().str(); -//// THROW_RUNTIME_ERR(err); -//// } -////#else -//// if (!ReadVecDataEfficiently(line, v)) { -//// PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; -//// LOG(LIB_ERROR) << err.stream().str(); -//// THROW_RUNTIME_ERR(err); -//// } -//// for (auto val : v) { -//// if (val != 0 && val != 1) { -//// PREPARE_RUNTIME_ERR(err) << "Only zeros and ones are allowed, 
offending line: '" << line << "'"; -//// LOG(LIB_ERROR) << err.stream().str(); -//// THROW_RUNTIME_ERR(err); -//// } -//// } -////#endif -//// Binarize(v, 1, binVect); // Create the binary vector -//// binVect.push_back(v.size()); // Put the number of elements in the end -////} -// -////template -////Object* SpaceBitVector::CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector& bitMaskVect) const { -//// return new Object(id, label, bitMaskVect.size() * sizeof(dist_uint_t), &bitMaskVect[0]); -////}; -// -///** Standard functions to read/write/create objects */ -// -////template -////unique_ptr SpaceBitVector::OpenReadFileHeader(const string& inpFileName) const { -//// return unique_ptr(new DataFileInputStateVec(inpFileName)); -////} -// -////template -////unique_ptr SpaceBitVector::OpenWriteFileHeader(const ObjectVector& dataset, -//// const string& outFileName) const { -//// return unique_ptr(new DataFileOutputState(outFileName)); -////} -// -////template -////unique_ptr -////SpaceBitVector::CreateObjFromStr(IdType id, LabelType label, const string& s, -//// DataFileInputState* pInpStateBase) const { -//// DataFileInputStateVec* pInpState = NULL; -//// if (pInpStateBase != NULL) { -//// pInpState = dynamic_cast(pInpStateBase); -//// if (NULL == pInpState) { -//// PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; -//// THROW_RUNTIME_ERR(err); -//// } -//// } -//// vector vec; -//// ReadBitMaskVect(s, label, vec); -//// if (pInpState != NULL) { -//// size_t elemQty = vec[vec.size() - 1]; -//// if (pInpState->dim_ == 0) pInpState->dim_ = elemQty; -//// else if (elemQty != pInpState->dim_) { -//// PREPARE_RUNTIME_ERR(err) << "The # of bit-vector elements (" << elemQty << ")" << -//// " doesn't match the # of elements in previous lines. 
(" << pInpState->dim_ << " )"; -//// THROW_RUNTIME_ERR(err); -//// } -//// } -//// return unique_ptr(CreateObjFromVectInternal(id, label, vec)); -////} -// -////template -////Object* SpaceBitVector::CreateObjFromVectInternal(IdType id, LabelType label, const std::vector& InpVect) const { -//// return new Object(id, label, InpVect.size() * sizeof(dist_uint_t), &InpVect[0]); -////}; -// -////template -////bool SpaceBitVector::ApproxEqual(const Object& obj1, const Object& obj2) const { -//// const dist_uint_t* p1 = reinterpret_cast(obj1.data()); -//// const dist_uint_t* p2 = reinterpret_cast(obj2.data()); -//// const size_t len1 = obj1.datalength() / sizeof(dist_uint_t) -//// - 1; // the last integer is an original number of elements -//// const size_t len2 = obj2.datalength() / sizeof(dist_uint_t) -//// - 1; // the last integer is an original number of elements -//// if (len1 != len2) { -//// PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; -//// THROW_RUNTIME_ERR(err); -//// } -//// for (size_t i = 0; i < len1; ++i) { -//// dist_uint_t v1 = ((p1[i/32] >> (i & 31)) & 1); -//// dist_uint_t v2 = ((p2[i/32] >> (i & 31)) & 1); -//// if (v1 != v2) return false; -//// } -//// -//// return true; -////} -// -// -////template -////string SpaceBitVector::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { -//// stringstream out; -//// const dist_uint_t* p = reinterpret_cast(pObj->data()); -//// const size_t length = pObj->datalength() / sizeof(dist_uint_t) -//// - 1; // the last integer is an original number of elements -//// const size_t elemQty = p[length]; // last elem -//// -//// for (size_t i = 0; i < elemQty; ++i) { -//// if (i) out << " "; -//// out << ((p[i/32] >> (i & 31)) & 1); -//// } -//// -//// return out.str(); -////} -// -////template -////bool SpaceBitVector::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { -//// 
externId.clear(); -//// DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); -//// CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); -//// if (!pInpState->inp_file_) return false; -//// if (!getline(pInpState->inp_file_, strObj)) return false; -//// pInpState->line_num_++; -//// return true; -////} -// -// -///** End of standard functions to read/write/create objects */ -// -//} // namespace similarity From 9ec3069e2b905e96c729cf06722bce3ec825292b Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Fri, 22 Feb 2019 09:45:07 -0800 Subject: [PATCH 15/17] cleanup --- python_bindings/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python_bindings/setup.py b/python_bindings/setup.py index 88930a9..1a16797 100755 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -21,6 +21,7 @@ if os.path.exists(library_file): # if we have a prebuilt nmslib library file, use that. extra_objects.append(library_file) + else: # Otherwise build all the files here directly (excluding extras which need eigen/boost) exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc From e4c0fb64653f6cc8001b13cec1c56f45b03b5aea Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Fri, 22 Feb 2019 09:47:32 -0800 Subject: [PATCH 16/17] more cleanup --- python_bindings/tests/bindings_test.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index 1e3ec87..8406033 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -148,11 +148,8 @@ def testReloadIndex(self): s = self.bit_vector_str_func(np.ones(512)) original_results = original.knnQuery(s) reloaded_results = reloaded.knnQuery(s) - original_results = list(zip(list(original_results[0]), list(original_results[1]))) - original_results = sorted(original_results, key=lambda x: x[1]) - reloaded_results = 
list(zip(list(reloaded_results[0]), list(reloaded_results[1]))) - reloaded_results = sorted(reloaded_results, key=lambda x: x[1]) - npt.assert_allclose(original_results, reloaded_results) + npt.assert_allclose(original_results, + reloaded_results) class HNSWTestCase(unittest.TestCase, DenseIndexTestMixin): @@ -174,7 +171,7 @@ def _get_index(self, space='jaccard_sparse'): class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin): def _get_index(self, space='bit_hamming'): - return nmslib.init(method='hnsw', space='bit_hamming', data_type=nmslib.DataType.OBJECT_AS_STRING, + return nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING, dtype=nmslib.DistType.INT) From 4fbae0666d1c6e4ee9c796104290e3ccceb8ae3d Mon Sep 17 00:00:00 2001 From: Greg Friedland Date: Fri, 22 Feb 2019 09:48:15 -0800 Subject: [PATCH 17/17] more cleanup --- similarity_search/test/test_space_serial.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/similarity_search/test/test_space_serial.cc b/similarity_search/test/test_space_serial.cc index 17a5c6b..00ec2f9 100644 --- a/similarity_search/test/test_space_serial.cc +++ b/similarity_search/test/test_space_serial.cc @@ -134,7 +134,6 @@ bool fullTest(const vector& dataSetStr, size_t maxNumRec, const string& dataSet1.push_back(space->CreateObjFromStr(id++, -1, s, NULL).release()); vExternIds1.push_back(ss.str()); - if (id >= maxNumRec) break; }