Improve tsDecompressIntImpl_Hw performance with AVX512.

Signed-off-by: Kai Jiang <kai.jiang@intel.com> Huanrui Zhang <huanruix.zhang@intel.com>
2024-01-23 15:14:04 +08:00 · 2024-01-23 15:14:04 +08:00 · 6e2a5b9292
parent 2534ce5071
commit 6e2a5b9292
2 changed files with 102 additions and 20 deletions
--- a/cmake/cmake.define
+++ b/cmake/cmake.define
@ -181,12 +181,12 @@ ELSE ()
        ENDIF()
        MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED")

-#        IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
-#            SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
-#            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
-#            MESSAGE(STATUS "avx512f/avx512bmi supported by compiler")
-#        ENDIF()
-#
+        IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
+            SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
+            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
+            MESSAGE(STATUS "avx512f/avx512bmi supported by compiler")
+        ENDIF()
+
 #        IF (COMPILER_SUPPORT_AVX512VL)
 #            SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl")
 #            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl")
--- a/source/util/src/tdecompress.c
+++ b/source/util/src/tdecompress.c
@ -52,7 +52,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
  int32_t     _pos = 0;
  int64_t     prevValue = 0;

-#if __AVX2__
+#if __AVX2__ || __AVX512F__
  while (_pos < nelements) {
    uint64_t w = *(uint64_t*) ip;

@ -72,10 +72,33 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
        int32_t gRemainder = (nelements - _pos);
        int32_t num = (gRemainder > elems)? elems:gRemainder;

-        int32_t batch = num >> 2;
-        int32_t remain = num & 0x03;
+        int32_t batch = 0;
+        int32_t remain = 0;
+        if (tsSIMDEnable && tsAVX512Enable) {
+#if __AVX512F__
+        batch = num >> 3;
+        remain = num & 0x07;
+#endif
+        } else if (tsSIMDEnable && tsAVX2Enable) {
+#if __AVX2__
+        batch = num >> 2;
+        remain = num & 0x03;
+#endif
+        }
+
        if (selector == 0 || selector == 1) {
-          if (tsSIMDEnable && tsAVX2Enable) {
+          if (tsSIMDEnable && tsAVX512Enable) {
+#if __AVX512F__
+            for (int32_t i = 0; i < batch; ++i) {
+            __m512i prev = _mm512_set1_epi64(prevValue);
+            _mm512_storeu_si512((__m512i *)&p[_pos], prev);
+            _pos += 8;	//handle 64bit x 8 = 512bit
+            }
+            for (int32_t i = 0; i < remain; ++i) {
+            p[_pos++] = prevValue;
+            }
+#endif
+          } else if (tsSIMDEnable && tsAVX2Enable) {
            for (int32_t i = 0; i < batch; ++i) {
              __m256i prev = _mm256_set1_epi64x(prevValue);
              _mm256_storeu_si256((__m256i *)&p[_pos], prev);
@ -85,10 +108,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
            for (int32_t i = 0; i < remain; ++i) {
              p[_pos++] = prevValue;
            }
-          } else if (tsSIMDEnable && tsAVX512Enable) {
-#if __AVX512F__
-            // todo add avx512 impl
-#endif
+
          } else { // alternative implementation without SIMD instructions.
            for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
              p[_pos++] = prevValue;
@ -96,7 +116,73 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
            }
          }
        } else {
-          if (tsSIMDEnable && tsAVX2Enable) {
+          if (tsSIMDEnable && tsAVX512Enable) {
+ #if __AVX512F__
+            __m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0);
+            __m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1);
+            __m512i sum_mask3 = _mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3);
+            __m512i base = _mm512_set1_epi64(w);
+            __m512i maskVal = _mm512_set1_epi64(mask);
+            __m512i shiftBits = _mm512_set_epi64(bit * 7 + 4, bit * 6 + 4, bit * 5 + 4, bit * 4 + 4, bit * 3 + 4, bit * 2 + 4, bit + 4, 4);
+            __m512i inc = _mm512_set1_epi64(bit << 3);
+
+            for (int32_t i = 0; i < batch; ++i) {
+
+            __m512i after = _mm512_srlv_epi64(base, shiftBits);
+            __m512i zigzagVal = _mm512_and_si512(after, maskVal);
+
+            // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
+            __m512i signmask = _mm512_and_si512(_mm512_set1_epi64(1), zigzagVal);
+            signmask = _mm512_sub_epi64(_mm512_setzero_si512(), signmask);
+            __m512i delta = _mm512_xor_si512(_mm512_srli_epi64(zigzagVal, 1), signmask);
+
+            // calculate the cumulative sum (prefix sum) for each number
+            // decode[0] =  prevValue + final[0]
+            // decode[1] = decode[0] + final[1]   -----> prevValue + final[0] + final[1]
+            // decode[2] = decode[1] + final[2]   -----> prevValue + final[0] + final[1] + final[2]
+            // decode[3] = decode[2] + final[3]   -----> prevValue + final[0] + final[1] + final[2] + final[3]
+
+
+            //7		6		5		4		3		2		1		0
+            //D7		D6		D5		D4		D3		D2		D1		D0
+            //D6		0		D4		0		D2		0		D0		0
+            //D7+D6		D6		D5+D4		D4		D3+D2		D2		D1+D0		D0
+            //13		6		9		4		5		2		1		0
+            __m512i prev = _mm512_set1_epi64(prevValue);
+            __m512i cum_sum = _mm512_add_epi64(delta, _mm512_maskz_permutexvar_epi64(0xaa, sum_mask1, delta));
+            cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xcc, sum_mask2, cum_sum));
+            cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xf0, sum_mask3, cum_sum));
+
+
+
+            //13		6		9		4		5		2		1		0
+            //D7,D6		D6		D5,D4		D4		D3,D2		D2		D1,D0		D0
+            //+D5,D4	D5,D4,		0		0		D1,D0		D1,D0		0		0
+            //D7~D4		D6~D4		D5~D4		D4		D3~D0		D2~D0		D1~D0		D0
+            //22		15		9		4		6		3		1		0
+            //
+            //D3~D0		D3~D0		D3~D0		D3~D0		0		0		0		0
+            //28		21		15		10		6		3		1		0
+
+
+            cum_sum = _mm512_add_epi64(cum_sum, prev);
+            _mm512_storeu_si512((__m512i *)&p[_pos], cum_sum);
+
+            shiftBits = _mm512_add_epi64(shiftBits, inc);
+            prevValue = p[_pos + 7];
+            _pos += 8;
+
+            }
+            // handle the remain value
+            for (int32_t i = 0; i < remain; i++) {
+            zigzag_value = ((w >> (v + (batch * bit * 8))) & mask);
+            prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
+
+            p[_pos++] = prevValue;
+            v += bit;
+            }
+#endif
+          } else if (tsSIMDEnable && tsAVX2Enable) {
            __m256i base = _mm256_set1_epi64x(w);
            __m256i maskVal = _mm256_set1_epi64x(mask);

@ -157,10 +243,6 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
              p[_pos++] = prevValue;
              v += bit;
            }
-          } else if (tsSIMDEnable && tsAVX512Enable) {
-#if __AVX512F__
-            // todo add avx512 impl
-#endif
          } else {  // alternative implementation without SIMD instructions.
            for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
              zigzag_value = ((w >> v) & mask);
@ -507,4 +589,4 @@ int32_t tsDecompressTimestampAvx512(const char *const input, const int32_t nelem

 #endif
  return 0;
-}
+}