diff --git a/cmake/cmake.define b/cmake/cmake.define index 12e1b50539..735735f0cc 100644 --- a/cmake/cmake.define +++ b/cmake/cmake.define @@ -181,12 +181,12 @@ ELSE () ENDIF() MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED") -# IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) -# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") -# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") -# MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") -# ENDIF() -# + IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") + MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") + ENDIF() + # IF (COMPILER_SUPPORT_AVX512VL) # SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c index f212bf5231..d3cb3118d2 100644 --- a/source/util/src/tdecompress.c +++ b/source/util/src/tdecompress.c @@ -52,7 +52,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, int32_t _pos = 0; int64_t prevValue = 0; -#if __AVX2__ +#if __AVX2__ || __AVX512F__ while (_pos < nelements) { uint64_t w = *(uint64_t*) ip; @@ -72,10 +72,33 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, int32_t gRemainder = (nelements - _pos); int32_t num = (gRemainder > elems)? elems:gRemainder; - int32_t batch = num >> 2; - int32_t remain = num & 0x03; + int32_t batch = 0; + int32_t remain = 0; + if (tsSIMDEnable && tsAVX512Enable) { +#if __AVX512F__ + batch = num >> 3; + remain = num & 0x07; +#endif + } else if (tsSIMDEnable && tsAVX2Enable) { +#if __AVX2__ + batch = num >> 2; + remain = num & 0x03; +#endif + } + if (selector == 0 || selector == 1) { - if (tsSIMDEnable && tsAVX2Enable) { + if (tsSIMDEnable && tsAVX512Enable) { +#if __AVX512F__ + for (int32_t i = 0; i < batch; ++i) { + __m512i prev = _mm512_set1_epi64(prevValue); + _mm512_storeu_si512((__m512i *)&p[_pos], prev); + _pos += 8; //handle 64bit x 8 = 512bit + } + for (int32_t i = 0; i < remain; ++i) { + p[_pos++] = prevValue; + } +#endif + } else if (tsSIMDEnable && tsAVX2Enable) { for (int32_t i = 0; i < batch; ++i) { __m256i prev = _mm256_set1_epi64x(prevValue); _mm256_storeu_si256((__m256i *)&p[_pos], prev); @@ -85,10 +108,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, for (int32_t i = 0; i < remain; ++i) { p[_pos++] = prevValue; } - } else if (tsSIMDEnable && tsAVX512Enable) { -#if __AVX512F__ - // todo add avx512 impl -#endif + } else { // alternative implementation without SIMD instructions. for (int32_t i = 0; i < elems && count < nelements; i++, count++) { p[_pos++] = prevValue; @@ -96,7 +116,73 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, } } } else { - if (tsSIMDEnable && tsAVX2Enable) { + if (tsSIMDEnable && tsAVX512Enable) { + #if __AVX512F__ + __m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0); + __m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1); + __m512i sum_mask3 = _mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3); + __m512i base = _mm512_set1_epi64(w); + __m512i maskVal = _mm512_set1_epi64(mask); + __m512i shiftBits = _mm512_set_epi64(bit * 7 + 4, bit * 6 + 4, bit * 5 + 4, bit * 4 + 4, bit * 3 + 4, bit * 2 + 4, bit + 4, 4); + __m512i inc = _mm512_set1_epi64(bit << 3); + + for (int32_t i = 0; i < batch; ++i) { + + __m512i after = _mm512_srlv_epi64(base, shiftBits); + __m512i zigzagVal = _mm512_and_si512(after, maskVal); + + // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) + __m512i signmask = _mm512_and_si512(_mm512_set1_epi64(1), zigzagVal); + signmask = _mm512_sub_epi64(_mm512_setzero_si512(), signmask); + __m512i delta = _mm512_xor_si512(_mm512_srli_epi64(zigzagVal, 1), signmask); + + // calculate the cumulative sum (prefix sum) for each number + // decode[0] = prevValue + final[0] + // decode[1] = decode[0] + final[1] -----> prevValue + final[0] + final[1] + // decode[2] = decode[1] + final[2] -----> prevValue + final[0] + final[1] + final[2] + // decode[3] = decode[2] + final[3] -----> prevValue + final[0] + final[1] + final[2] + final[3] + + + //7 6 5 4 3 2 1 0 + //D7 D6 D5 D4 D3 D2 D1 D0 + //D6 0 D4 0 D2 0 D0 0 + //D7+D6 D6 D5+D4 D4 D3+D2 D2 D1+D0 D0 + //13 6 9 4 5 2 1 0 + __m512i prev = _mm512_set1_epi64(prevValue); + __m512i cum_sum = _mm512_add_epi64(delta, _mm512_maskz_permutexvar_epi64(0xaa, sum_mask1, delta)); + cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xcc, sum_mask2, cum_sum)); + cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xf0, sum_mask3, cum_sum)); + + + + //13 6 9 4 5 2 1 0 + //D7,D6 D6 D5,D4 D4 D3,D2 D2 D1,D0 D0 + //+D5,D4 D5,D4, 0 0 D1,D0 D1,D0 0 0 + //D7~D4 D6~D4 D5~D4 D4 D3~D0 D2~D0 D1~D0 D0 + //22 15 9 4 6 3 1 0 + // + //D3~D0 D3~D0 D3~D0 D3~D0 0 0 0 0 + //28 21 15 10 6 3 1 0 + + + cum_sum = _mm512_add_epi64(cum_sum, prev); + _mm512_storeu_si512((__m512i *)&p[_pos], cum_sum); + + shiftBits = _mm512_add_epi64(shiftBits, inc); + prevValue = p[_pos + 7]; + _pos += 8; + + } + // handle the remain value + for (int32_t i = 0; i < remain; i++) { + zigzag_value = ((w >> (v + (batch * bit * 8))) & mask); + prevValue += ZIGZAG_DECODE(int64_t, zigzag_value); + + p[_pos++] = prevValue; + v += bit; + } +#endif + } else if (tsSIMDEnable && tsAVX2Enable) { __m256i base = _mm256_set1_epi64x(w); __m256i maskVal = _mm256_set1_epi64x(mask); @@ -157,10 +243,6 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, p[_pos++] = prevValue; v += bit; } - } else if (tsSIMDEnable && tsAVX512Enable) { -#if __AVX512F__ - // todo add avx512 impl -#endif } else { // alternative implementation without SIMD instructions. for (int32_t i = 0; i < elems && count < nelements; i++, count++) { zigzag_value = ((w >> v) & mask); @@ -507,4 +589,4 @@ int32_t tsDecompressTimestampAvx512(const char *const input, const int32_t nelem #endif return 0; -} \ No newline at end of file +}