From 7281031e4feafe3ad25368b807a9314416376567 Mon Sep 17 00:00:00 2001 From: searchivairus Date: Tue, 6 Feb 2018 23:51:13 -0500 Subject: [PATCH] Intermediate commit for SIFT space. --- scripts/update_all_files_header.sh | 10 ++ scripts/update_file_header.py | 60 +++++++ similarity_search/include/distcomp.h | 10 ++ similarity_search/include/portable_align.h | 11 +- .../include/portable_intrinsics.h | 4 + .../include/space/space_sift_vector.h | 78 ++++++++++ similarity_search/include/utils.h | 10 +- similarity_search/src/distcomp_sift_l2sqr.cc | 101 ++++++++++++ .../src/space/space_sift_vector.cc | 147 ++++++++++++++++++ 9 files changed, 418 insertions(+), 13 deletions(-) create mode 100755 scripts/update_all_files_header.sh create mode 100755 scripts/update_file_header.py create mode 100644 similarity_search/include/space/space_sift_vector.h create mode 100644 similarity_search/src/distcomp_sift_l2sqr.cc create mode 100644 similarity_search/src/space/space_sift_vector.cc diff --git a/scripts/update_all_files_header.sh b/scripts/update_all_files_header.sh new file mode 100755 index 0000000..5c4e9ee --- /dev/null +++ b/scripts/update_all_files_header.sh @@ -0,0 +1,10 @@ +#/bin/bash +tmpn=`mktemp` + +for suff in h cc cpp c ; do + for f in `find . 
#!/usr/bin/env python3
"""Replace the template license header of a single source file.

Usage: update_file_header.py <input file> <output file>

The input file is expected to start with the library's template comment
header ('/**' on the first line, '* Non-metric Space Library' on the second).
The header, up to and including the first line that is exactly '*/', is
replaced by a freshly generated one and the result is written to the output
file. Files without the template header are reported and skipped.

Exit codes: 1 for wrong usage or an unterminated header, 0 otherwise.
"""
import sys
import os
import datetime

now = datetime.datetime.now()

mainDevelList = 'Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak'

# Placeholders: %s is the list of main developers, %d is the current year.
newHeader = """/**
 * Non-metric Space Library
 *
 * Main developers: %s
 *
 * For the complete list of contributors and further details see:
 * https://github.com/searchivarius/NonMetricSpaceLib
 *
 * Copyright (c) 2013-%d
 *
 * This code is released under the
 * Apache License Version 2.0 http://www.apache.org/licenses/.
 *
 */"""


def printErr(msg):
    """Write msg, followed by a newline, to standard error."""
    sys.stderr.write(msg + '\n')


def replaceHeader(lines, develList, year):
    """Build the text of a file whose template header has been regenerated.

    lines     -- the lines of the input file (as returned by readlines()).
    develList -- text substituted for the list of main developers.
    year      -- last year of the copyright range.

    Returns the complete new file text, or None when the file does not start
    with the template header (this includes files shorter than two lines).
    Raises ValueError when the header start is present but no line consisting
    solely of '*/' terminates it.
    """
    if len(lines) < 2:
        # Too short to contain the two marker lines; indexing would fail.
        return None
    if lines[0].strip() != '/**' or lines[1].strip() != '* Non-metric Space Library':
        return None

    headEnd = None
    for ln, line in enumerate(lines):
        if line.strip() == '*/':
            headEnd = ln
            break
    if headEnd is None:
        raise ValueError('unterminated template header')

    # The replaced '*/' line carried a line break, but the template does not:
    # re-add it so that the text following the header stays on its own line.
    return (newHeader % (develList, year)) + '\n' + ''.join(lines[headEnd + 1:])


def main(argv):
    """Command-line entry point; returns the process exit code."""
    if len(argv) != 3:
        printErr("PARAMETERS: <input file> <output file>")
        return 1

    inFileName = argv[1]
    outFileName = argv[2]

    # The output file is opened (and therefore truncated) before the input is
    # inspected, as before, so a reused output path never keeps stale text.
    with open(outFileName, 'w') as outFile, open(inFileName, 'r') as inFile:
        lines = inFile.readlines()
        try:
            newText = replaceHeader(lines, mainDevelList, now.year)
        except ValueError:
            printErr('Cannot find the end of the template header in the file %s' % inFileName)
            return 1
        if newText is None:
            print('WARNING Cannot find a template header in the file %s, IGNORING' % inFileName)
            return 0
        outFile.write(newText)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
-#else -#define PORTABLE_ALIGN32 __declspec(align(32)) + #define PORTABLE_ALIGN16 __declspec(align(16)) + #define PORTABLE_ALIGN32 __declspec(align(32)) #endif diff --git a/similarity_search/include/portable_intrinsics.h b/similarity_search/include/portable_intrinsics.h index 7fd74b9..4ccad8b 100644 --- a/similarity_search/include/portable_intrinsics.h +++ b/similarity_search/include/portable_intrinsics.h @@ -25,6 +25,10 @@ #define PORTABLE_SSE4 #endif +#if defined(__AVX__) +#define PORTABLE_AVX +#endif + #if defined(PORTABLE_SSE4) #include diff --git a/similarity_search/include/space/space_sift_vector.h b/similarity_search/include/space/space_sift_vector.h new file mode 100644 index 0000000..d1c1792 --- /dev/null +++ b/similarity_search/include/space/space_sift_vector.h @@ -0,0 +1,78 @@ +/** + * Non-metric Space Library + * + * Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov (http://boytsov.info). + * With contributions from Lawrence Cayton (http://lcayton.com/) and others. + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ + +#ifndef _SPACE_SIFT_VECTOR_H_ +#define _SPACE_SIFT_VECTOR_H_ + +#include +#include +#include +#include +#include + +#include +#include "global.h" +#include "object.h" +#include "utils.h" +#include "space.h" +#include "distcomp.h" + +namespace similarity { + +using std::string; +using std::unique_ptr; + +class SiftVectorSpace : public Space { + public: + explicit SiftVectorSpace() {} + virtual ~SiftVectorSpace() {} + + /** Standard functions to read/write/create objects */ + virtual unique_ptr CreateObjFromStr(IdType id, LabelType label, const string& s, + DataFileInputState* pInpState) const; + // Create a string representation of an object. 
+ virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const; + // Open a file for reading, fetch a header (if there is any) and memorize an input state + virtual unique_ptr OpenReadFileHeader(const string& inputFile) const; + // Open a file for writing, write a header (if there is any) and memorize an output state + virtual unique_ptr OpenWriteFileHeader(const ObjectVector& dataset, + const string& outputFile) const; + /* + * Read a string representation of the next object in a file as well + * as its label. Return false, on EOF. + */ + virtual bool ReadNextObjStr(DataFileInputState &, string& strObj, LabelType& label, string& externId) const; + /** End of standard functions to read/write/create objects */ + + virtual Object* CreateObjFromUint8Vect(IdType id, LabelType label, const std::vector& InpVect) const; + virtual size_t GetElemQty(const Object* object) const { return SIFT_DIM; } + + static void ReadUint8Vec(std::string line, LabelType& label, std::vector& v); + +protected: + DISABLE_COPY_AND_ASSIGN(VectorSpace); + + virtual DistTypeSift HiddenDistance(const Object* obj1, const Object* obj2) const override { + const uint8_t* pVect1 = reinterpret_cast(obj1->data()); + const uint8_t* pVect2 = reinterpret_cast(obj2->data()); + + return l2SqrSIFTPrecompAVX(pVect1, pVect2); + } +}; + +} // namespace similarity + +#endif diff --git a/similarity_search/include/utils.h b/similarity_search/include/utils.h index 6b1b182..0c08be5 100644 --- a/similarity_search/include/utils.h +++ b/similarity_search/include/utils.h @@ -1,19 +1,17 @@ /** * Non-metric Space Library * - * Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov (http://boytsov.info). - * With contributions from Lawrence Cayton (http://lcayton.com/) and others. 
+ * Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak * * For the complete list of contributors and further details see: - * https://github.com/searchivarius/NonMetricSpaceLib - * - * Copyright (c) 2014 + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2013-2018 * * This code is released under the * Apache License Version 2.0 http://www.apache.org/licenses/. * */ - #ifndef _UTILS_H_ #define _UTILS_H_ diff --git a/similarity_search/src/distcomp_sift_l2sqr.cc b/similarity_search/src/distcomp_sift_l2sqr.cc new file mode 100644 index 0000000..c5707af --- /dev/null +++ b/similarity_search/src/distcomp_sift_l2sqr.cc @@ -0,0 +1,101 @@ +/** + * Non-metric Space Library + * + * Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov (http://boytsov.info). + * With contributions from Lawrence Cayton (http://lcayton.com/) and others. + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2014 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + */ +#include "distcomp.h" +#include "logging.h" +#include "utils.h" +#include "pow.h" +#include "portable_intrinsics.h" + +#include +#include +#include +#include + +namespace similarity { + +using namespace std; + +DistTypeSIFT l2SqrSIFTNaive(const uint8_t* pVect1, + const uint8_t* pVect2) { + DistTypeSIFT res = 0; + for (uint_fast32_t i = 0; i < SIFT_DIM; ++i) { + DistTypeSIFT d = DistTypeSIFT(pVect1[i]) - DistTypeSIFT(pVect1[i]); + res += d*d; + } + + return res; +} + +DistTypeSIFT l2SqrSIFTPrecomp(const uint8_t* pVect1, + const uint8_t* pVect2) { + DistTypeSIFT sumProd = 0; + for (uint_fast32_t i = 0; i < SIFT_DIM; ++i) { + sumProd += DistTypeSIFT(pVect1[i]) * DistTypeSIFT(pVect2[i]); + } + + return *reinterpret_cast(pVect1 + SIFT_DIM) + + *reinterpret_cast(pVect2 + SIFT_DIM) - 2 * sumProd; +} + +DistTypeSIFT l2SqrSIFTPrecompAVX(const uint8_t* pVect1, + const uint8_t* pVect2) { +#ifndef PORTABLE_AVX +#pragma message WARN("l2_sqrt_sift_precomp_avx: AVX is not available, defaulting to pure C++ implementation!") + return l2SqrSIFTPrecomp(pVect1, pVect2); +#else + const unsigned dim = SIFT_DIM; + + DistTypeSIFT sumProd = 0; + + size_t sse_offset = (dim / 32) * 32; + + const __m256i* pStart1 = reinterpret_cast(pVect1); + const __m256i* pStart2 = reinterpret_cast(pVect2); + const __m256i* pEnd1 = reinterpret_cast(pVect1 + sse_offset); + + __m256i zero, x1, y1; + zero = _mm256_xor_si256(zero,zero); + __m256i sum = zero; + + int32_t PORTABLE_ALIGN32 unpack[8]; + + while (pStart1 < pEnd1) { + const __m256i x = _mm256_loadu_si256(pStart1++); + const __m256i y = _mm256_loadu_si256(pStart2++); + x1 = _mm256_unpackhi_epi8(x,zero); + y1 = _mm256_unpackhi_epi8(y,zero); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(x1, y1)); + x1 = _mm256_unpacklo_epi8(x,zero); + y1 = _mm256_unpacklo_epi8(y,zero); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(x1, y1)); + } + _mm256_store_si256((__m256i *)unpack, sum); + sumProd += unpack[0] + unpack[1] + unpack[2] + unpack[3] + + 
unpack[4] + unpack[5] + unpack[6] + unpack[7]; + + if (dim & 32) { + for (uint_fast32_t i = sse_offset; i < dim; ++i) { + sumProd += DistTypeSIFT(pVect1[i]) * DistTypeSIFT(pVect2[i]); + } + } + + return + *reinterpret_cast(pVect1+dim) + + *reinterpret_cast(pVect2+dim) - 2*sumProd; +#endif +} + +} diff --git a/similarity_search/src/space/space_sift_vector.cc b/similarity_search/src/space/space_sift_vector.cc new file mode 100644 index 0000000..4e2ff8a --- /dev/null +++ b/similarity_search/src/space/space_sift_vector.cc @@ -0,0 +1,147 @@ +/** + * Non-metric Space Library + * + * Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov (http://boytsov.info). + * With contributions from Lawrence Cayton (http://lcayton.com/) and others. + * + * For the complete list of contributors and further details see: + * https://github.com/searchivarius/NonMetricSpaceLib + * + * Copyright (c) 2018 + * + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "object.h" +#include "utils.h" +#include "logging.h" +#include "distcomp.h" +#include "experimentconf.h" +#include "space/space_vector.h" +#include "read_data.h" + +namespace similarity { + +using namespace std; + +/** Standard functions to read/write/create objects */ + +unique_ptr SiftVectorSpace::OpenReadFileHeader(const string& inpFileName) const { + return unique_ptr(new DataFileInputStateVec(inpFileName)); +} + +template +unique_ptr SiftVectorSpace::OpenWriteFileHeader(const ObjectVector& dataset, + const string& outFileName) const { + return unique_ptr(new DataFileOutputState(outFileName)); +} + +template +unique_ptr +SiftVectorSpace::CreateObjFromStr(IdType id, LabelType label, const string& s, + DataFileInputState* pInpStateBase) const { + DataFileInputStateVec* pInpState = NULL; + if (pInpStateBase != NULL) { + pInpState = dynamic_cast(pInpStateBase); + if (NULL == pInpState) { + PREPARE_RUNTIME_ERR(err) << "Bug: unexpected pointer type"; + THROW_RUNTIME_ERR(err); + } + } + vector vec; + ReadVec(s, label, vec); + if (pInpState != NULL) { + if (pInpState->dim_ == 0) pInpState->dim_ = vec.size(); + else if (vec.size() != pInpState->dim_) { + stringstream lineStr; + if (pInpStateBase != NULL) lineStr << " line:" << pInpState->line_num_ << " "; + PREPARE_RUNTIME_ERR(err) << "The # of vector elements (" << vec.size() << ")" << lineStr.str() << + " doesn't match the # of elements in previous lines. 
(" << pInpState->dim_ << " )"; + THROW_RUNTIME_ERR(err); + } + } + return unique_ptr(CreateObjFromVect(id, label, vec)); +} + +template +bool SiftVectorSpace::ApproxEqual(const Object& obj1, const Object& obj2) const { + const dist_t* p1 = reinterpret_cast(obj1.data()); + const dist_t* p2 = reinterpret_cast(obj2.data()); + const size_t len1 = GetElemQty(&obj1); + const size_t len2 = GetElemQty(&obj2); + if (len1 != len2) { + PREPARE_RUNTIME_ERR(err) << "Bug: comparing vectors of different lengths: " << len1 << " and " << len2; + THROW_RUNTIME_ERR(err); + } + for (size_t i = 0; i < len1; ++i) + // We have to specify the namespace here, otherwise a compiler + // thinks that it should use the equally named member function + if (!similarity::ApproxEqual(p1[i], p2[i])) return false; + return true; +} + +template +string SiftVectorSpace::CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const { + stringstream out; + const dist_t* p = reinterpret_cast(pObj->data()); + const size_t length = GetElemQty(pObj); + for (size_t i = 0; i < length; ++i) { + if (i) out << " "; + // Clear all previous flags & set to the maximum precision available + out.unsetf(ios_base::floatfield); + out << setprecision(numeric_limits::max_digits10) << noshowpoint << p[i]; + } + + return out.str(); +} + +template +bool SiftVectorSpace::ReadNextObjStr(DataFileInputState &inpStateBase, string& strObj, LabelType& label, string& externId) const { + externId.clear(); + DataFileInputStateOneFile* pInpState = dynamic_cast(&inpStateBase); + CHECK_MSG(pInpState != NULL, "Bug: unexpected pointer type"); + if (!pInpState->inp_file_) return false; + if (!getline(pInpState->inp_file_, strObj)) return false; + pInpState->line_num_++; + return true; +} + +/** End of standard functions to read/write/create objects */ + +template +void SiftVectorSpace::ReadVec(string line, LabelType& label, vector& v) +{ + v.clear(); + + label = Object::extractLabel(line); + +#if 0 + if 
(!ReadVecDataViaStream(line, v)) { +#else + if (!ReadVecDataEfficiently(line, v)) { +#endif + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } +} + +template +Object* SiftVectorSpace::CreateObjFromVect(IdType id, LabelType label, const vector& InpVect) const { + return new Object(id, label, InpVect.size() * sizeof(dist_t), &InpVect[0]); +}; + +} // namespace similarity