From e53183c82330f21bfad0c3cbe7aa81d2e5571df3 Mon Sep 17 00:00:00 2001 From: searchivairus Date: Wed, 7 Feb 2018 14:33:55 -0500 Subject: [PATCH] Issue #280. Another attempt to fix Travis compilation error. --- similarity_search/include/distcomp.h | 1 + .../include/portable_intrinsics.h | 4 ++ similarity_search/src/distcomp_l2sqr_sift.cc | 61 +++++++++++++++++-- 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/similarity_search/include/distcomp.h b/similarity_search/include/distcomp.h index ebbf4de..7863837 100644 --- a/similarity_search/include/distcomp.h +++ b/similarity_search/include/distcomp.h @@ -256,6 +256,7 @@ const uint_fast32_t SIFT_DIM = 128; // All SIFT vectors are expected to have the same dimensionality (SIFT_DIM) DistTypeSIFT l2SqrSIFTNaive(const uint8_t* pVect1, const uint8_t* pVect2); DistTypeSIFT l2SqrSIFTPrecomp(const uint8_t* pVect1, const uint8_t* pVect2); +DistTypeSIFT l2SqrSIFTPrecompSSE2(const uint8_t* pVect1, const uint8_t* pVect2); DistTypeSIFT l2SqrSIFTPrecompAVX(const uint8_t* pVect1, const uint8_t* pVect2); } diff --git a/similarity_search/include/portable_intrinsics.h b/similarity_search/include/portable_intrinsics.h index 37afc3e..f34d59b 100644 --- a/similarity_search/include/portable_intrinsics.h +++ b/similarity_search/include/portable_intrinsics.h @@ -32,6 +32,10 @@ #define PORTABLE_AVX #endif +#if defined(__AVX2__) +#define PORTABLE_AVX2 +#endif + #if defined(PORTABLE_SSE2) #include diff --git a/similarity_search/src/distcomp_l2sqr_sift.cc b/similarity_search/src/distcomp_l2sqr_sift.cc index dd5089e..a5490ba 100644 --- a/similarity_search/src/distcomp_l2sqr_sift.cc +++ b/similarity_search/src/distcomp_l2sqr_sift.cc @@ -31,7 +31,7 @@ DistTypeSIFT l2SqrSIFTNaive(const uint8_t* pVect1, const uint8_t* pVect2) { DistTypeSIFT res = 0; for (uint_fast32_t i = 0; i < SIFT_DIM; ++i) { - DistTypeSIFT d = DistTypeSIFT(pVect1[i]) - DistTypeSIFT(pVect1[i]); + DistTypeSIFT d = DistTypeSIFT(pVect1[i]) - DistTypeSIFT(pVect2[i]); res 
+= d*d; } @@ -49,11 +49,64 @@ DistTypeSIFT l2SqrSIFTPrecomp(const uint8_t* pVect1, *reinterpret_cast(pVect2 + SIFT_DIM) - 2 * sumProd; }
+// Squared L2 distance for SIFT vectors: v1.tail + v2.tail - 2 * dot(v1, v2),
+// where "tail" is the value stored right after each vector's SIFT_DIM bytes
+// (presumably a precomputed squared norm of type DistTypeSIFT -- confirm).
+// Uses l2SqrSIFTPrecomp instead when PORTABLE_SSE2 is not defined.
+DistTypeSIFT l2SqrSIFTPrecompSSE2(const uint8_t* pVect1,
+                                  const uint8_t* pVect2) {
+#ifndef PORTABLE_SSE2
+  #pragma message WARN("l2SqrSIFTPrecompSSE2: SSE2 is not available")
+  return l2SqrSIFTPrecomp(pVect1, pVect2);
+#else
+  const unsigned dim = SIFT_DIM;
+  DistTypeSIFT sumProd = 0;
+  size_t sse_offset = (dim / 16) * 16;
+
+  const __m128i* pStart1 = reinterpret_cast<const __m128i*>(pVect1);
+  const __m128i* pStart2 = reinterpret_cast<const __m128i*>(pVect2);
+  const __m128i* pEnd1 = reinterpret_cast<const __m128i*>(pVect1 + sse_offset);
+
+  // Well-defined zero register (no read of an uninitialized value).
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x1, y1;
+  __m128i sum = zero;
+  PORTABLE_ALIGN32 int32_t unpack[4];
+
+  // Widen each 16-byte chunk to 16-bit lanes, then multiply-accumulate.
+  while (pStart1 < pEnd1) {
+    const __m128i x = _mm_loadu_si128(pStart1++);
+    const __m128i y = _mm_loadu_si128(pStart2++);
+    x1 = _mm_unpackhi_epi8(x,zero);
+    y1 = _mm_unpackhi_epi8(y,zero);
+    sum = _mm_add_epi32(sum, _mm_madd_epi16(x1, y1));
+    x1 = _mm_unpacklo_epi8(x,zero);
+    y1 = _mm_unpacklo_epi8(y,zero);
+    sum = _mm_add_epi32(sum, _mm_madd_epi16(x1, y1));
+  }
+  _mm_store_si128((__m128i *)unpack, sum);
+  sumProd += unpack[0] + unpack[1] + unpack[2] + unpack[3];
+
+  // Scalar tail for the dim % 16 trailing elements (none when SIFT_DIM is 128).
+  for (uint_fast32_t i = sse_offset; i < dim; ++i) {
+    sumProd += DistTypeSIFT(pVect1[i])*DistTypeSIFT(pVect2[i]);
+  }
+
+  return *reinterpret_cast<const DistTypeSIFT*>(pVect1 + dim) +
+         *reinterpret_cast<const DistTypeSIFT*>(pVect2 + dim) - 2*sumProd;
+#endif
+}
+
 DistTypeSIFT l2SqrSIFTPrecompAVX(const uint8_t* pVect1, const uint8_t* pVect2) { -#ifndef PORTABLE_AVX -#pragma message WARN("l2_sqrt_sift_precomp_avx: AVX is not available, defaulting to pure C++ implementation!") - return l2SqrSIFTPrecomp(pVect1, pVect2); +#ifndef PORTABLE_AVX2 +#pragma message WARN("l2SqrSIFTPrecompAVX: AVX2 is not available") + #ifndef PORTABLE_SSE4 + #pragma message WARN("l2SqrSIFTPrecompAVX: SSE4 is not available") + return l2SqrSIFTPrecomp(pVect1, pVect2); + #else + return
l2SqrSIFTPrecompSSE2(pVect1, pVect2); + #endif #else const unsigned dim = SIFT_DIM;