From 1dd5cd17a04a4785026e48291c8088d1431850cb Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sun, 26 Nov 2023 23:53:05 +0800 Subject: [PATCH] fix(tsdb): add simd for decompress timestamp. --- cmake/cmake.define | 4 +- include/util/tcompression.h | 1 + source/util/src/tdecompress.c | 148 ++++++++++++++++++++++------------ 3 files changed, 100 insertions(+), 53 deletions(-) diff --git a/cmake/cmake.define b/cmake/cmake.define index 7710c071eb..3eb872cfee 100644 --- a/cmake/cmake.define +++ b/cmake/cmake.define @@ -181,8 +181,8 @@ ELSE () MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED") IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi -mavx512vl") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi -mavx512vl") MESSAGE(STATUS "avx512 supported by gcc") ENDIF() ENDIF() diff --git a/include/util/tcompression.h b/include/util/tcompression.h index ab0c22fc9b..7da3587f1c 100644 --- a/include/util/tcompression.h +++ b/include/util/tcompression.h @@ -139,6 +139,7 @@ int32_t getWordLength(char type); int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, char *const output, const char type); int32_t tsDecompressFloatImplAvx512(const char *const input, const int32_t nelements, char *const output); int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelements, char *const output); +int32_t tsDecompressTimestampAvx512(const char* const input, const int32_t nelements, char *const output, bool bigEndian); /************************************************************************* * STREAM COMPRESSION diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c index ac9d70b2e7..b1c6df95c4 100644 --- a/source/util/src/tdecompress.c +++ b/source/util/src/tdecompress.c @@ -247,75 +247,121 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen return 0; } -int32_t tsDecompressTimestampAvx2(const char* const input, const int32_t nelements, char *const output, bool bigEndian) { +int32_t tsDecompressTimestampAvx512(const char* const input, const int32_t nelements, char *const output, bool bigEndian) { int64_t *ostream = (int64_t *)output; int32_t ipos = 1, opos = 0; - int8_t nbytes = 0; - - int64_t prevValue = 0; - int64_t prevDelta = 0; - - int64_t deltaOfDelta = 0; - int32_t longBytes = LONG_BYTES; + __m128i prevVal = _mm_setzero_si128(); + __m128i prevDelta = _mm_setzero_si128(); // _mm_maskz_loadu_epi8 #if __AVX512F__ - // _mm_blendv_epi8 - int32_t batch = nelements >> 4; - int32_t remainder = nelements & 0x03; + int32_t batch = nelements >> 1; + int32_t remainder = nelements & 0x01; - for(int32_t i = 0; i < batch; ++i) { + int32_t i = 0; + if (batch > 1) { + // first loop uint8_t flags = input[ipos++]; - // Decode dd1 - uint64_t dd1 = 0; - nbytes = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 - // __m128i mask = {};//[0], [] + int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 + int8_t nbytes2 = (flags >> 4) & INT8MASK(4); - if (nbytes == 0) { - deltaOfDelta = 0; - } else { - if (bigEndian) { - memcpy(((char *)(&dd1)) + longBytes - nbytes, input + ipos, nbytes); - } else { - memcpy(&dd1, input + ipos, nbytes); - } - deltaOfDelta = ZIGZAG_DECODE(int64_t, dd1); - } + __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; + __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); + __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); + data2 = _mm_broadcastq_epi64(data2); - ipos += nbytes; - prevDelta += deltaOfDelta; - prevValue += prevDelta; - ostream[opos++] = prevValue; + __m128i zzVal = _mm_blend_epi32(data2, data1, 0x03); - // Decode dd2 - uint64_t dd2 = 0; - nbytes = (flags >> 4) & INT8MASK(4); - if (nbytes == 0) { - deltaOfDelta = 0; - } else { - if (bigEndian) { - memcpy(((char *)(&dd2)) + longBytes - nbytes, input + ipos, nbytes); - } else { - memcpy(&dd2, input + ipos, nbytes); - } - // zigzag_decoding - deltaOfDelta = ZIGZAG_DECODE(int64_t, dd2); - } + // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) + __m128i signmask = _mm_and_si128(_mm_set1_epi64x(1), zzVal); + signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask); - ipos += nbytes; - prevDelta += deltaOfDelta; - prevValue += prevDelta; - ostream[opos++] = prevValue; + // get two zigzag values here + __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); - if (opos == nelements) { - return nelements * longBytes; - } + __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); + deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); + + __m128i val = _mm_add_epi64(deltaCurrent, prevVal); + _mm_storeu_si128((__m128i *)&ostream[opos], val); + + // keep the previous value + prevVal = _mm_set1_epi64x(val[1]); + + // keep the previous delta of delta, for the first item + prevDelta = _mm_set1_epi64x(deltaOfDelta[1]); + + opos += 2; + ipos += nbytes1 + nbytes2; + i += 1; + } + + // the remain + for(; i < batch; ++i) { + uint8_t flags = input[ipos++]; + + int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 + int8_t nbytes2 = (flags >> 4) & INT8MASK(4); + + __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; + __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); + __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); + data2 = _mm_broadcastq_epi64(data2); + + __m128i zzVal = _mm_blend_epi32(data2, data1, 0x03); + + // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) + __m128i signmask = _mm_and_si128(_mm_set1_epi64x(1), zzVal); + signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask); + + // get two zigzag values here + __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); + + __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); + deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); + + __m128i val = _mm_add_epi64(deltaCurrent, prevVal); + _mm_storeu_si128((__m128i *)&ostream[opos], val); + + // keep the previous value + prevVal = _mm_set1_epi64x(val[1]); + + // keep the previous delta of delta + __m128i delta = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaOfDelta); + prevDelta = _mm_set1_epi64x(_mm_add_epi64(delta, prevDelta)[1]); + + opos += 2; + ipos += nbytes1 + nbytes2; } if (remainder > 0) { + uint64_t dd = 0; + uint8_t flags = input[ipos++]; + int32_t nbytes = flags & INT8MASK(4); + int64_t deltaOfDelta = 0; + if (nbytes == 0) { + deltaOfDelta = 0; + } else { + // if (is_bigendian()) { + // memcpy(((char *)(&dd1)) + longBytes - nbytes, input + ipos, nbytes); + // } else { + memcpy(&dd, input + ipos, nbytes); + // } + deltaOfDelta = ZIGZAG_DECODE(int64_t, dd); + } + + ipos += nbytes; + if (opos == 0) { + ostream[opos++] = deltaOfDelta; + } else { + int64_t prevV = prevVal[1]; + + int64_t prevDeltaX = deltaOfDelta + prevDelta[1]; + ostream[opos++] = prevV + prevDeltaX; + } } #endif