From f9d717d0f2fdc7b0a9917a29831d20e6f021eabd Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 27 Nov 2023 14:42:13 +0800 Subject: [PATCH] refactor: do some internal refactor. --- cmake/cmake.define | 4 +- include/util/tcompression.h | 1 + source/util/src/tcompression.c | 4 + source/util/src/tdecompress.c | 165 ++++++++++++++++++++++++++-- source/util/test/decompressTest.cpp | 2 +- 5 files changed, 164 insertions(+), 12 deletions(-) diff --git a/cmake/cmake.define b/cmake/cmake.define index c685ba6161..7db6baafab 100644 --- a/cmake/cmake.define +++ b/cmake/cmake.define @@ -182,8 +182,8 @@ ELSE () MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED") IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi -mavx512vl") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi -mavx512vl") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") ENDIF() diff --git a/include/util/tcompression.h b/include/util/tcompression.h index 7da3587f1c..75ddbb12e7 100644 --- a/include/util/tcompression.h +++ b/include/util/tcompression.h @@ -140,6 +140,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, int32_t tsDecompressFloatImplAvx512(const char *const input, const int32_t nelements, char *const output); int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelements, char *const output); int32_t tsDecompressTimestampAvx512(const char* const input, const int32_t nelements, char *const output, bool bigEndian); +int32_t tsDecompressTimestampAvx2(const char* const input, const int32_t nelements, char *const output, bool bigEndian); /************************************************************************* * STREAM COMPRESSION diff --git a/source/util/src/tcompression.c b/source/util/src/tcompression.c index 8a79d62cda..3cc00ddc7f 100644 --- a/source/util/src/tcompression.c +++ b/source/util/src/tcompression.c @@ -540,6 +540,8 @@ int32_t tsDecompressTimestampImp(const char *const input, const int32_t nelement } else if (input[0] == 1) { // Decompress if (tsSIMDEnable && tsAVX512Enable) { tsDecompressTimestampAvx512(input, nelements, output, false); + } else if (tsSIMDEnable && tsAVX2Enable) { + tsDecompressTimestampAvx2(input, nelements, output, false); } else { int64_t *ostream = (int64_t *)output; @@ -599,6 +601,8 @@ int32_t tsDecompressTimestampImp(const char *const input, const int32_t nelement } } } + + return nelements * longBytes; } /* --------------------------------------------Double Compression ---------------------------------------------- */ diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c index b39a340ac6..5a5e60093c 100644 --- a/source/util/src/tdecompress.c +++ b/source/util/src/tdecompress.c @@ -247,15 +247,14 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen return 0; } -int32_t tsDecompressTimestampAvx512(const char* const input, const int32_t nelements, char *const output, bool bigEndian) { +int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output, + bool bigEndian) { int64_t *ostream = (int64_t *)output; int32_t ipos = 1, opos = 0; __m128i prevVal = _mm_setzero_si128(); __m128i prevDelta = _mm_setzero_si128(); - // _mm_maskz_loadu_epi8 -#if __AVX512VL__ - +#if __AVX2__ int32_t batch = nelements >> 1; int32_t remainder = nelements & 0x01; __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; @@ -268,10 +267,22 @@ int32_t tsDecompressTimestampAvx512(const char* const input, const int32_t nelem int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 int8_t nbytes2 = (flags >> 4) & INT8MASK(4); - __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); - __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); - data2 = _mm_broadcastq_epi64(data2); + __m128i data1; + if (nbytes1 == 0) { + data1 = _mm_setzero_si128(); + } else { +// _mm_shuffle_epi8() + memcpy(&data1, (const void*) (input + ipos), nbytes1); + } + __m128i data2; + if (nbytes2 == 0) { + data2 = _mm_setzero_si128(); + } else { + memcpy(&data2, (const void*) (input + ipos + nbytes1), nbytes2); + } + + data2 = _mm_broadcastq_epi64(data2); __m128i zzVal = _mm_blend_epi32(data2, data1, 0x03); // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) @@ -305,8 +316,26 @@ int32_t tsDecompressTimestampAvx512(const char* const input, const int32_t nelem int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 int8_t nbytes2 = (flags >> 4) & INT8MASK(4); - __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); - __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); +// __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); +// __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); + __m128i data1; + if (nbytes1 == 0) { + data1 = _mm_setzero_si128(); + } else { + int64_t dd = 0; + memcpy(&dd, (const void*) (input + ipos), nbytes1); + data1 = _mm_loadu_si64(&dd); + } + + __m128i data2; + if (nbytes2 == 0) { + data2 = _mm_setzero_si128(); + } else { + int64_t dd = 0; + memcpy(&dd, (const void*) (input + ipos + nbytes1), nbytes2); + data2 = _mm_loadu_si64(&dd); + } + data2 = _mm_broadcastq_epi64(data2); __m128i zzVal = _mm_blend_epi32(data2, data1, 0x03); @@ -335,6 +364,124 @@ int32_t tsDecompressTimestampAvx512(const char* const input, const int32_t nelem ipos += nbytes1 + nbytes2; } + if (remainder > 0) { + uint64_t dd = 0; + uint8_t flags = input[ipos++]; + + int32_t nbytes = flags & INT8MASK(4); + int64_t deltaOfDelta = 0; + if (nbytes == 0) { + deltaOfDelta = 0; + } else { + // if (is_bigendian()) { + // memcpy(((char *)(&dd1)) + longBytes - nbytes, input + ipos, nbytes); + // } else { + memcpy(&dd, input + ipos, nbytes); + // } + deltaOfDelta = ZIGZAG_DECODE(int64_t, dd); + } + + ipos += nbytes; + if (opos == 0) { + ostream[opos++] = deltaOfDelta; + } else { + int64_t prevDeltaX = deltaOfDelta + prevDelta[1]; + ostream[opos++] = prevVal[1] + prevDeltaX; + } + } +#endif + + return 0; +} + +int32_t tsDecompressTimestampAvx512(const char* const input, const int32_t nelements, char *const output, bool bigEndian) { + int64_t *ostream = (int64_t *)output; + int32_t ipos = 1, opos = 0; + __m128i prevVal = _mm_setzero_si128(); + __m128i prevDelta = _mm_setzero_si128(); + + // _mm_maskz_loadu_epi8 +#if __AVX512VL__ + + int32_t batch = nelements >> 1; + int32_t remainder = nelements & 0x01; + __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; + + int32_t i = 0; + if (batch > 1) { + // first loop + uint8_t flags = input[ipos++]; + + int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 + int8_t nbytes2 = (flags >> 4) & INT8MASK(4); + + __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); + __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); + data2 = _mm_broadcastq_epi64(data2); + + __m128i zzVal = _mm_blend_epi32(data2, data1, 0x03); + + // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) + __m128i signmask = _mm_and_si128(_mm_set1_epi64x(1), zzVal); + signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask); + + // get two zigzag values here + __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); + + __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); + deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); + + __m128i val = _mm_add_epi64(deltaCurrent, prevVal); + _mm_storeu_si128((__m128i *)&ostream[opos], val); + + // keep the previous value + prevVal = _mm_shuffle_epi32 (val, 0xEE); + + // keep the previous delta of delta, for the first item + prevDelta = _mm_shuffle_epi32(deltaOfDelta, 0xEE); + + opos += 2; + ipos += nbytes1 + nbytes2; + i += 1; + } + + // the remain + for(; i < batch; ++i) { + uint8_t flags = input[ipos++]; + + int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 + int8_t nbytes2 = (flags >> 4) & INT8MASK(4); + + __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); + __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); + data2 = _mm_broadcastq_epi64(data2); + + __m128i zzVal = _mm_blend_epi32(data2, data1, 0x03); + + // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) + __m128i signmask = _mm_and_si128(_mm_set1_epi64x(1), zzVal); + signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask); + + // get two zigzag values here + __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); + + __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); + deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); + + __m128i val = _mm_add_epi64(deltaCurrent, prevVal); + _mm_storeu_si128((__m128i *)&ostream[opos], val); + + // keep the previous value + prevVal = _mm_shuffle_epi32 (val, 0xEE); + + // keep the previous delta of delta + __m128i delta = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaOfDelta); + prevDelta = _mm_shuffle_epi32(_mm_add_epi64(delta, prevDelta), 0xEE); + + opos += 2; + ipos += nbytes1 + nbytes2; + } + if (remainder > 0) { uint64_t dd = 0; uint8_t flags = input[ipos++]; diff --git a/source/util/test/decompressTest.cpp b/source/util/test/decompressTest.cpp index 378b67edbb..caf8df3ba8 100644 --- a/source/util/test/decompressTest.cpp +++ b/source/util/test/decompressTest.cpp @@ -76,7 +76,7 @@ TEST(utilTest, decompress_perf_test) { } int64_t el1 = taosGetTimestampUs() - st; - std::cout << "decompress elapsed time:" << el1 << " us" << std::endl; + std::cout << "soft decompress elapsed time:" << el1 << " us" << std::endl; memset(pOutput, 0, num * sizeof(int64_t)); st = taosGetTimestampUs();