From 70465dadfc8429bf338de140d734c3780d4b5163 Mon Sep 17 00:00:00 2001 From: searchivairus Date: Wed, 7 Feb 2018 12:44:47 -0500 Subject: [PATCH] Implementing a space efficient SIFT/L2 space #280 --- similarity_search/include/distcomp.h | 15 +- .../include/factory/init_spaces.h | 1 + .../include/factory/space/space_lp.h | 5 + .../include/method/simple_inverted_index.h | 4 +- similarity_search/include/portable_align.h | 11 +- .../include/portable_intrinsics.h | 2 +- .../include/space/space_l2sqr_sift.h | 84 +++++++++ .../include/space/space_sift_vector.h | 76 -------- ...p_sift_l2sqr.cc => distcomp_l2sqr_sift.cc} | 0 .../src/space/space_l2sqr_sift.cc | 165 ++++++++++++++++++ .../src/space/space_sift_vector.cc | 145 --------------- 11 files changed, 273 insertions(+), 235 deletions(-) create mode 100644 similarity_search/include/space/space_l2sqr_sift.h delete mode 100644 similarity_search/include/space/space_sift_vector.h rename similarity_search/src/{distcomp_sift_l2sqr.cc => distcomp_l2sqr_sift.cc} (100%) create mode 100644 similarity_search/src/space/space_l2sqr_sift.cc delete mode 100644 similarity_search/src/space/space_sift_vector.cc diff --git a/similarity_search/include/distcomp.h b/similarity_search/include/distcomp.h index a32e957..ebbf4de 100644 --- a/similarity_search/include/distcomp.h +++ b/similarity_search/include/distcomp.h @@ -248,13 +248,6 @@ inline float JaccardSparse(const IdType *pArr1, size_t qty1, const IdType *pArr2 return 1 - qtyInter/(qtyS - qtyInter); } -} - -/* - * Edit distances - */ -#include "distcomp_edist.h" - // For SIFT vectors (whose dim=128) int is enough to store the smallest and the largest difference typedef int DistTypeSIFT; @@ -265,5 +258,13 @@ DistTypeSIFT l2SqrSIFTNaive(const uint8_t* pVect1, const uint8_t* pVect2); DistTypeSIFT l2SqrSIFTPrecomp(const uint8_t* pVect1, const uint8_t* pVect2); DistTypeSIFT l2SqrSIFTPrecompAVX(const uint8_t* pVect1, const uint8_t* pVect2); +} + + +/* + * Edit distances not included into the 
similarity space, + * because the namespace is specified in the distcomp_edist.h + */ +#include "distcomp_edist.h" #endif diff --git a/similarity_search/include/factory/init_spaces.h b/similarity_search/include/factory/init_spaces.h index 0799ced..0984e08 100644 --- a/similarity_search/include/factory/init_spaces.h +++ b/similarity_search/include/factory/init_spaces.h @@ -147,6 +147,7 @@ inline void initSpaces() { REGISTER_SPACE_CREATOR(float, SPACE_RENYI_DIVERG_FAST, CreateRenyiDivergFast) REGISTER_SPACE_CREATOR(double, SPACE_RENYI_DIVERG_FAST, CreateRenyiDivergFast) + REGISTER_SPACE_CREATOR(int, SPACE_L2SQR_SIFT, CreateL2SqrSIFT) } } diff --git a/similarity_search/include/factory/space/space_lp.h b/similarity_search/include/factory/space/space_lp.h index 252e63f..b39d068 100644 --- a/similarity_search/include/factory/space/space_lp.h +++ b/similarity_search/include/factory/space/space_lp.h @@ -16,6 +16,7 @@ #define FACTORY_SPACE_LP_H #include +#include namespace similarity { @@ -49,6 +50,10 @@ Space* CreateL(const AnyParams& AllParams) { return new SpaceLp(p); } +Space* CreateL2SqrSIFT(const AnyParams& ){ + return new SpaceL2SqrSift(); +} + /* * End of creating functions. 
*/ diff --git a/similarity_search/include/method/simple_inverted_index.h b/similarity_search/include/method/simple_inverted_index.h index 2f4ae25..3666495 100644 --- a/similarity_search/include/method/simple_inverted_index.h +++ b/similarity_search/include/method/simple_inverted_index.h @@ -39,8 +39,8 @@ class SimplInvIndex : public Index { */ SimplInvIndex(bool printProgress, Space& space, - const ObjectVector& data) : printProgress_(printProgress), - Index(data), + const ObjectVector& data) : Index(data), + printProgress_(printProgress), pSpace_(dynamic_cast(&space)) { if (pSpace_ == nullptr) { PREPARE_RUNTIME_ERR(err) << diff --git a/similarity_search/include/portable_align.h b/similarity_search/include/portable_align.h index bc0e29f..2ec86bc 100644 --- a/similarity_search/include/portable_align.h +++ b/similarity_search/include/portable_align.h @@ -20,9 +20,12 @@ */ #if defined(__GNUC__) - #define PORTABLE_ALIGN16 __attribute__((aligned(16))) - #define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#define PORTABLE_ALIGN16 __attribute__((aligned(16))) #else - #define PORTABLE_ALIGN16 __declspec(align(16)) - #define PORTABLE_ALIGN32 __declspec(align(32)) +#define PORTABLE_ALIGN16 __declspec(align(16)) +#endif +#if defined(__GNUC__) +#define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#else +#define PORTABLE_ALIGN32 __declspec(align(32)) #endif diff --git a/similarity_search/include/portable_intrinsics.h b/similarity_search/include/portable_intrinsics.h index 6f4a217..917fce8 100644 --- a/similarity_search/include/portable_intrinsics.h +++ b/similarity_search/include/portable_intrinsics.h @@ -14,8 +14,8 @@ */ #pragma once -#include #include +#include // On 64-bit platforms SSE2 is always present, but Windows doesn't set SSE2 flag // http://stackoverflow.com/questions/1067630/sse2-option-in-visual-c-x64 diff --git a/similarity_search/include/space/space_l2sqr_sift.h b/similarity_search/include/space/space_l2sqr_sift.h new file mode 100644 index 0000000..fbbec8f 
--- /dev/null
+++ b/similarity_search/include/space/space_l2sqr_sift.h
@@ -0,0 +1,84 @@
+/**
+ * Non-metric Space Library
+ *
+ * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak
+ *
+ * For the complete list of contributors and further details see:
+ * https://github.com/searchivarius/NonMetricSpaceLib
+ *
+ * Copyright (c) 2013-2018
+ *
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ */
+#ifndef _SPACE_L2_SQR_SIFT_H_
+#define _SPACE_L2_SQR_SIFT_H_
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "global.h"
+#include "object.h"
+#include "utils.h"
+#include "space.h"
+#include "distcomp.h"
+
+#define SPACE_L2SQR_SIFT "l2sqr_sift"
+
+namespace similarity {
+
+using std::string;
+using std::unique_ptr;
+// Space of SIFT_DIM-component uint8_t vectors; distances (DistTypeSIFT) are computed by l2SqrSIFTPrecompAVX.
+class SpaceL2SqrSift : public Space {
+ public:
+  explicit SpaceL2SqrSift() {}
+  virtual ~SpaceL2SqrSift() {}
+
+  /** Standard functions to read/write/create objects */
+  virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s,
+                                      DataFileInputState* pInpState) const override;
+  // Create a string representation of an object.
+  virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const override;
+  // Open a file for reading, fetch a header (if there is any) and memorize an input state
+  virtual unique_ptr OpenReadFileHeader(const string& inputFile) const override;
+  // Open a file for writing, write a header (if there is any) and memorize an output state
+  virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset,
+                                         const string& outputFile) const override;
+  /*
+   * Read the string representation of the next object in a file as well
+   * as its label. Returns false on EOF.
+   */
+  virtual bool ReadNextObjStr(DataFileInputState &, string& strObj, LabelType& label, string& externId) const override;
+  /** End of standard functions to read/write/create objects */
+  // Creates an object from InpVect (which must hold SIFT_DIM elements) with the sum of squares stored after the components.
+  virtual Object* CreateObjFromUint8Vect(IdType id, LabelType label, const std::vector& InpVect) const;
+  virtual size_t GetElemQty(const Object* object) const override { return SIFT_DIM; }  // always SIFT_DIM; the argument is not inspected
+  // Textual identifier of this space: the SPACE_L2SQR_SIFT string.
+  virtual string StrDesc() const override { return SPACE_L2SQR_SIFT; }
+  // True when HiddenDistance() between the two objects is exactly 0.
+  virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const override;
+  // Copies the first min(nElem, SIFT_DIM) components of obj into pVect, one DistTypeSIFT per component.
+  virtual void CreateDenseVectFromObj(const Object* obj, DistTypeSIFT* pVect, size_t nElem) const override;
+  // Parses a text line into label and v; unparsable lines, a wrong element count, out-of-range or non-integer values raise an error.
+  static void ReadUint8Vec(std::string line, LabelType& label, std::vector& v);
+
+protected:
+  DISABLE_COPY_AND_ASSIGN(SpaceL2SqrSift);
+  // Views the data of both objects as uint8_t arrays and delegates to l2SqrSIFTPrecompAVX.
+  virtual DistTypeSIFT HiddenDistance(const Object* obj1, const Object* obj2) const override {
+    const uint8_t* pVect1 = reinterpret_cast(obj1->data());
+    const uint8_t* pVect2 = reinterpret_cast(obj2->data());
+
+    return l2SqrSIFTPrecompAVX(pVect1, pVect2);
+  }
+};
+
+} // namespace similarity
+
+#endif
diff --git a/similarity_search/include/space/space_sift_vector.h b/similarity_search/include/space/space_sift_vector.h
deleted file mode 100644
index 2d77d8a..0000000
--- a/similarity_search/include/space/space_sift_vector.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Non-metric Space Library
- *
- * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak
- *
- * For the complete list of contributors and further details see:
- * https://github.com/searchivarius/NonMetricSpaceLib
- *
- * Copyright (c) 2013-2018
- *
- * This code is released under the
- * Apache License Version 2.0 http://www.apache.org/licenses/.
- *
- */
-#ifndef _SPACE_SIFT_VECTOR_H_
-#define _SPACE_SIFT_VECTOR_H_
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include "global.h"
-#include "object.h"
-#include "utils.h"
-#include "space.h"
-#include "distcomp.h"
-
-namespace similarity {
-
-using std::string;
-using std::unique_ptr;
-
-class SiftVectorSpace : public Space {
- public:
-  explicit SiftVectorSpace() {}
-  virtual ~SiftVectorSpace() {}
-
-  /** Standard functions to read/write/create objects */
-  virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s,
-                                      DataFileInputState* pInpState) const;
-  // Create a string representation of an object.
-  virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const;
-  // Open a file for reading, fetch a header (if there is any) and memorize an input state
-  virtual unique_ptr OpenReadFileHeader(const string& inputFile) const;
-  // Open a file for writing, write a header (if there is any) and memorize an output state
-  virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset,
-                                         const string& outputFile) const;
-  /*
-   * Read a string representation of the next object in a file as well
-   * as its label. Return false, on EOF.
-   */
-  virtual bool ReadNextObjStr(DataFileInputState &, string& strObj, LabelType& label, string& externId) const;
-  /** End of standard functions to read/write/create objects */
-
-  virtual Object* CreateObjFromUint8Vect(IdType id, LabelType label, const std::vector& InpVect) const;
-  virtual size_t GetElemQty(const Object* object) const { return SIFT_DIM; }
-
-  static void ReadUint8Vec(std::string line, LabelType& label, std::vector& v);
-
-protected:
-  DISABLE_COPY_AND_ASSIGN(VectorSpace);
-
-  virtual DistTypeSift HiddenDistance(const Object* obj1, const Object* obj2) const override {
-    const uint8_t* pVect1 = reinterpret_cast(obj1->data());
-    const uint8_t* pVect2 = reinterpret_cast(obj2->data());
-
-    return l2SqrSIFTPrecompAVX(pVect1, pVect2);
-  }
-};
-
-} // namespace similarity
-
-#endif
diff --git a/similarity_search/src/distcomp_sift_l2sqr.cc b/similarity_search/src/distcomp_l2sqr_sift.cc
similarity index 100%
rename from similarity_search/src/distcomp_sift_l2sqr.cc
rename to similarity_search/src/distcomp_l2sqr_sift.cc
diff --git a/similarity_search/src/space/space_l2sqr_sift.cc b/similarity_search/src/space/space_l2sqr_sift.cc
new file mode 100644
index 0000000..5c8264d
--- /dev/null
+++ b/similarity_search/src/space/space_l2sqr_sift.cc
@@ -0,0 +1,165 @@
+/**
+ * Non-metric Space Library
+ *
+ * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak
+ *
+ * For the complete list of contributors and further details see:
+ * https://github.com/searchivarius/NonMetricSpaceLib
+ *
+ * Copyright (c) 2013-2018
+ *
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "object.h"
+#include "utils.h"
+#include "logging.h"
+#include "distcomp.h"
+#include "experimentconf.h"
+#include "space/space_l2sqr_sift.h"
+#include "read_data.h"
+
+namespace similarity {
+
+using namespace std;
+
+/** Standard functions to read/write/create objects */
+// Opens inpFileName for reading; the returned input state is a DataFileInputStateVec.
+unique_ptr SpaceL2SqrSift::OpenReadFileHeader(const string& inpFileName) const {
+  return unique_ptr(new DataFileInputStateVec(inpFileName));
+}
+// Opens outFileName for writing; the dataset argument is not used.
+unique_ptr SpaceL2SqrSift::OpenWriteFileHeader(const ObjectVector& dataset,
+                                               const string& outFileName) const {
+  return unique_ptr(new DataFileOutputState(outFileName));
+}
+// Parses s into a SIFT object; with an input state, also checks the element count against earlier lines.
+unique_ptr
+SpaceL2SqrSift::CreateObjFromStr(IdType id, LabelType label, const string& s,
+                                 DataFileInputState* pInpStateBase) const {
+  DataFileInputStateVec* pInpState = NULL;
+  if (pInpStateBase != NULL) {
+    pInpState = dynamic_cast(pInpStateBase);
+    if (NULL == pInpState) {
+      PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type";
+      THROW_RUNTIME_ERR(err);
+    }
+  }
+  vector vec;
+  ReadUint8Vec(s, label, vec);
+  if (pInpState != NULL) {
+    if (pInpState->dim_ == 0) pInpState->dim_ = vec.size();
+    else if (vec.size() != pInpState->dim_) {
+      stringstream lineStr;
+      if (pInpStateBase != NULL) lineStr << " line:" << pInpState->line_num_ << " ";
+      PREPARE_RUNTIME_ERR(err) << "The # of vector elements (" << vec.size() << ")" << lineStr.str() <<
+                      " doesn't match the # of elements in previous lines. (" << pInpState->dim_ << " )";
+      THROW_RUNTIME_ERR(err);
+    }
+  }
+  return unique_ptr(CreateObjFromUint8Vect(id, label, vec));
+}
+// Serializes the SIFT_DIM components of pObj as space-separated decimal numbers; externId is ignored.
+string SpaceL2SqrSift::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const {
+  stringstream out;
+  const uint8_t* p = reinterpret_cast(pObj->data());
+  for (size_t i = 0; i < SIFT_DIM; ++i) {
+    if (i) out << " ";
+    // A uint8_t is streamed as a raw character, so widen it to get a number ReadUint8Vec can parse back
+    out << static_cast<unsigned>(p[i]);
+  }
+
+  return out.str();
+}
+// Reads the next line of the input file into strObj and clears externId; label is not assigned here. Returns false when no line can be read.
+bool SpaceL2SqrSift::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const {
+  externId.clear();
+  DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase);
+  CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type");
+  if (!pInpState->inp_file_) return false;
+  if (!getline(pInpState->inp_file_, strObj)) return false;
+  pInpState->line_num_++;
+  return true;
+}
+
+/** End of standard functions to read/write/create objects */
+// Extracts the label and exactly SIFT_DIM components from line; negative, too large or non-integer values are logged and rejected.
+void SpaceL2SqrSift::ReadUint8Vec(string line, LabelType& label, vector& v)
+{
+  v.clear();
+
+  label = Object::extractLabel(line);
+
+  vector vtmp;
+
+  if (!ReadVecDataEfficiently(line, vtmp)) {
+    PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'";
+    LOG(LIB_ERROR) << err.stream().str();
+    THROW_RUNTIME_ERR(err);
+  }
+  if (vtmp.size() != SIFT_DIM) {
+    PREPARE_RUNTIME_ERR(err) << "Wrong number of vector elements "
+                             << "(expected " << SIFT_DIM << " but got " << vtmp.size() << ")"
+                             << " in line: '" << line << "'";
+    LOG(LIB_ERROR) << err.stream().str();
+    THROW_RUNTIME_ERR(err);
+  }
+  v.resize(SIFT_DIM);
+  for (unsigned i = 0; i < SIFT_DIM; ++i) {
+    float fval = vtmp[i];
+    if (fval < 0 || fval > numeric_limits::max()) {
+      PREPARE_RUNTIME_ERR(err) << "Out-of range integer values (for SIFT) in the line: '" << line << "'";
+      LOG(LIB_ERROR) << err.stream().str();
+      THROW_RUNTIME_ERR(err);
+    }
+    v[i] = static_cast(fval);
+    if (fabs(v[i] - fval) > numeric_limits::min()) {
+      PREPARE_RUNTIME_ERR(err) << "Non-integer values (for SIFT) in the line: '" << line << "'";
+      LOG(LIB_ERROR) << err.stream().str();
+      THROW_RUNTIME_ERR(err);
+    }
+  }
+}
+// Creates an Object holding the SIFT_DIM components of InpVect followed by the precomputed sum of their squares.
+Object*
+SpaceL2SqrSift::CreateObjFromUint8Vect(IdType id, LabelType label, const vector& InpVect) const {
+  CHECK_MSG(InpVect.size() == SIFT_DIM,
+            "Bug or internal error, SIFT vectors dim " + ConvertToString(InpVect.size()) +
+            " isn't == " + ConvertToString(SIFT_DIM));
+  DistTypeSIFT sum = 0;
+  // We precompute and memorize the sum
+  for (DistTypeSIFT e : InpVect) sum += e * e;
+  // Pad the source to the full object size: Object presumably copies that many bytes (TODO confirm in object.h)
+  auto padded = InpVect;
+  padded.resize(SIFT_DIM + sizeof(DistTypeSIFT));
+  unique_ptr res(new Object(id, label, padded.size(), &padded[0]));
+  *reinterpret_cast(res->data() + SIFT_DIM) = sum;
+  return res.release();
+}
+// Copies the first min(nElem, SIFT_DIM) components of obj into pVect, one DistTypeSIFT per component.
+void
+SpaceL2SqrSift::CreateDenseVectFromObj(const Object* obj, DistTypeSIFT* pVect, size_t nElem) const {
+  const uint8_t* p = reinterpret_cast(obj->data());
+  for (unsigned i = 0; i < min(nElem, SIFT_DIM); ++i) {
+    pVect[i] = p[i];
+  }
+}
+
+// This approximate comparison is actually an exact one
+bool SpaceL2SqrSift::ApproxEqual(const Object& obj1, const Object& obj2) const {
+  return HiddenDistance(&obj1, &obj2) == 0;
+}
+
+} // namespace similarity
diff --git a/similarity_search/src/space/space_sift_vector.cc b/similarity_search/src/space/space_sift_vector.cc
deleted file mode 100644
index 95f5f38..0000000
--- a/similarity_search/src/space/space_sift_vector.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-/**
- * Non-metric Space Library
- *
- * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak
- *
- * For the complete list of contributors and further details see:
- * https://github.com/searchivarius/NonMetricSpaceLib
- *
- * Copyright (c) 2013-2018
- *
- * This code is released under the
- * Apache License Version 2.0 http://www.apache.org/licenses/.
- * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "object.h" -#include "utils.h" -#include "logging.h" -#include "distcomp.h" -#include "experimentconf.h" -#include "space/space_vector.h" -#include "read_data.h" - -namespace similarity { - -using namespace std; - -/** Standard functions to read/write/create objects */ - -unique_ptr SiftVectorSpace::OpenReadFileHeader(const string& inpFileName) const { - return unique_ptr(new DataFileInputStateVec(inpFileName)); -} - -template -unique_ptr SiftVectorSpace::OpenWriteFileHeader(const ObjectVector& dataset, - const string& outFileName) const { - return unique_ptr(new DataFileOutputState(outFileName)); -} - -template -unique_ptr -SiftVectorSpace::CreateObjFromStr(IdType id, LabelType label, const string& s, - DataFileInputState* pInpStateBase) const { - DataFileInputStateVec* pInpState = NULL; - if (pInpStateBase != NULL) { - pInpState = dynamic_cast(pInpStateBase); - if (NULL == pInpState) { - PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; - THROW_RUNTIME_ERR(err); - } - } - vector vec; - ReadVec(s, label, vec); - if (pInpState != NULL) { - if (pInpState->dim_ == 0) pInpState->dim_ = vec.size(); - else if (vec.size() != pInpState->dim_) { - stringstream lineStr; - if (pInpStateBase != NULL) lineStr << " line:" << pInpState->line_num_ << " "; - PREPARE_RUNTIME_ERR(err) << "The # of vector elements (" << vec.size() << ")" << lineStr.str() << - " doesn't match the # of elements in previous lines. 
(" << pInpState->dim_ << " )"; - THROW_RUNTIME_ERR(err); - } - } - return unique_ptr(CreateObjFromVect(id, label, vec)); -} - -template -bool SiftVectorSpace::ApproxEqual(const Object& obj1, const Object& obj2) const { - const dist_t* p1 = reinterpret_cast(obj1.data()); - const dist_t* p2 = reinterpret_cast(obj2.data()); - const size_t len1 = GetElemQty(&obj1); - const size_t len2 = GetElemQty(&obj2); - if (len1 != len2) { - PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; - THROW_RUNTIME_ERR(err); - } - for (size_t i = 0; i < len1; ++i) - // We have to specify the namespace here, otherwise a compiler - // thinks that it should use the equally named member function - if (!similarity::ApproxEqual(p1[i], p2[i])) return false; - return true; -} - -template -string SiftVectorSpace::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { - stringstream out; - const dist_t* p = reinterpret_cast(pObj->data()); - const size_t length = GetElemQty(pObj); - for (size_t i = 0; i < length; ++i) { - if (i) out << " "; - // Clear all previous flags & set to the maximum precision available - out.unsetf(ios_base::floatfield); - out << setprecision(numeric_limits::max_digits10) << noshowpoint << p[i]; - } - - return out.str(); -} - -template -bool SiftVectorSpace::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { - externId.clear(); - DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); - CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); - if (!pInpState->inp_file_) return false; - if (!getline(pInpState->inp_file_, strObj)) return false; - pInpState->line_num_++; - return true; -} - -/** End of standard functions to read/write/create objects */ - -template -void SiftVectorSpace::ReadVec(string line, LabelType& label, vector& v) -{ - v.clear(); - - label = Object::extractLabel(line); - -#if 0 - if 
(!ReadVecDataViaStream(line, v)) { -#else - if (!ReadVecDataEfficiently(line, v)) { -#endif - PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; - LOG(LIB_ERROR) << err.stream().str(); - THROW_RUNTIME_ERR(err); - } -} - -template -Object* SiftVectorSpace::CreateObjFromVect(IdType id, LabelType label, const vector& InpVect) const { - return new Object(id, label, InpVect.size() * sizeof(dist_t), &InpVect[0]); -}; - -} // namespace similarity