Improve tsDecompressIntImpl_Hw performance with AVX512.

Signed-off-by: Kai Jiang <kai.jiang@intel.com>
               Huanrui Zhang <huanruix.zhang@intel.com>
This commit is contained in:
Kai Jiang 2024-01-23 15:14:04 +08:00 committed by Haojun Liao
parent 2534ce5071
commit 6e2a5b9292
2 changed files with 102 additions and 20 deletions

View File

@ -181,12 +181,12 @@ ELSE ()
ENDIF()
MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED")
# IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
# MESSAGE(STATUS "avx512f/avx512bmi supported by compiler")
# ENDIF()
#
IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
MESSAGE(STATUS "avx512f/avx512bmi supported by compiler")
ENDIF()
# IF (COMPILER_SUPPORT_AVX512VL)
# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl")
# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl")

View File

@ -52,7 +52,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
int32_t _pos = 0;
int64_t prevValue = 0;
#if __AVX2__
#if __AVX2__ || __AVX512F__
while (_pos < nelements) {
uint64_t w = *(uint64_t*) ip;
@ -72,10 +72,33 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
int32_t gRemainder = (nelements - _pos);
int32_t num = (gRemainder > elems)? elems:gRemainder;
int32_t batch = num >> 2;
int32_t remain = num & 0x03;
int32_t batch = 0;
int32_t remain = 0;
if (tsSIMDEnable && tsAVX512Enable) {
#if __AVX512F__
batch = num >> 3;
remain = num & 0x07;
#endif
} else if (tsSIMDEnable && tsAVX2Enable) {
#if __AVX2__
batch = num >> 2;
remain = num & 0x03;
#endif
}
if (selector == 0 || selector == 1) {
if (tsSIMDEnable && tsAVX2Enable) {
if (tsSIMDEnable && tsAVX512Enable) {
#if __AVX512F__
for (int32_t i = 0; i < batch; ++i) {
__m512i prev = _mm512_set1_epi64(prevValue);
_mm512_storeu_si512((__m512i *)&p[_pos], prev);
_pos += 8; //handle 64bit x 8 = 512bit
}
for (int32_t i = 0; i < remain; ++i) {
p[_pos++] = prevValue;
}
#endif
} else if (tsSIMDEnable && tsAVX2Enable) {
for (int32_t i = 0; i < batch; ++i) {
__m256i prev = _mm256_set1_epi64x(prevValue);
_mm256_storeu_si256((__m256i *)&p[_pos], prev);
@ -85,10 +108,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
for (int32_t i = 0; i < remain; ++i) {
p[_pos++] = prevValue;
}
} else if (tsSIMDEnable && tsAVX512Enable) {
#if __AVX512F__
// todo add avx512 impl
#endif
} else { // alternative implementation without SIMD instructions.
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
p[_pos++] = prevValue;
@ -96,7 +116,73 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
}
}
} else {
if (tsSIMDEnable && tsAVX2Enable) {
if (tsSIMDEnable && tsAVX512Enable) {
#if __AVX512F__
__m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0);
__m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1);
__m512i sum_mask3 = _mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3);
__m512i base = _mm512_set1_epi64(w);
__m512i maskVal = _mm512_set1_epi64(mask);
__m512i shiftBits = _mm512_set_epi64(bit * 7 + 4, bit * 6 + 4, bit * 5 + 4, bit * 4 + 4, bit * 3 + 4, bit * 2 + 4, bit + 4, 4);
__m512i inc = _mm512_set1_epi64(bit << 3);
for (int32_t i = 0; i < batch; ++i) {
__m512i after = _mm512_srlv_epi64(base, shiftBits);
__m512i zigzagVal = _mm512_and_si512(after, maskVal);
// ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
__m512i signmask = _mm512_and_si512(_mm512_set1_epi64(1), zigzagVal);
signmask = _mm512_sub_epi64(_mm512_setzero_si512(), signmask);
__m512i delta = _mm512_xor_si512(_mm512_srli_epi64(zigzagVal, 1), signmask);
// calculate the cumulative sum (prefix sum) for each number
// decode[0] = prevValue + final[0]
// decode[1] = decode[0] + final[1] -----> prevValue + final[0] + final[1]
// decode[2] = decode[1] + final[2] -----> prevValue + final[0] + final[1] + final[2]
// decode[3] = decode[2] + final[3] -----> prevValue + final[0] + final[1] + final[2] + final[3]
//7 6 5 4 3 2 1 0
//D7 D6 D5 D4 D3 D2 D1 D0
//D6 0 D4 0 D2 0 D0 0
//D7+D6 D6 D5+D4 D4 D3+D2 D2 D1+D0 D0
//13 6 9 4 5 2 1 0
__m512i prev = _mm512_set1_epi64(prevValue);
__m512i cum_sum = _mm512_add_epi64(delta, _mm512_maskz_permutexvar_epi64(0xaa, sum_mask1, delta));
cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xcc, sum_mask2, cum_sum));
cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xf0, sum_mask3, cum_sum));
//13 6 9 4 5 2 1 0
//D7,D6 D6 D5,D4 D4 D3,D2 D2 D1,D0 D0
//+D5,D4 D5,D4, 0 0 D1,D0 D1,D0 0 0
//D7~D4 D6~D4 D5~D4 D4 D3~D0 D2~D0 D1~D0 D0
//22 15 9 4 6 3 1 0
//
//D3~D0 D3~D0 D3~D0 D3~D0 0 0 0 0
//28 21 15 10 6 3 1 0
cum_sum = _mm512_add_epi64(cum_sum, prev);
_mm512_storeu_si512((__m512i *)&p[_pos], cum_sum);
shiftBits = _mm512_add_epi64(shiftBits, inc);
prevValue = p[_pos + 7];
_pos += 8;
}
// handle the remain value
for (int32_t i = 0; i < remain; i++) {
zigzag_value = ((w >> (v + (batch * bit * 8))) & mask);
prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
p[_pos++] = prevValue;
v += bit;
}
#endif
} else if (tsSIMDEnable && tsAVX2Enable) {
__m256i base = _mm256_set1_epi64x(w);
__m256i maskVal = _mm256_set1_epi64x(mask);
@ -157,10 +243,6 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
p[_pos++] = prevValue;
v += bit;
}
} else if (tsSIMDEnable && tsAVX512Enable) {
#if __AVX512F__
// todo add avx512 impl
#endif
} else { // alternative implementation without SIMD instructions.
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
zigzag_value = ((w >> v) & mask);
@ -507,4 +589,4 @@ int32_t tsDecompressTimestampAvx512(const char *const input, const int32_t nelem
#endif
return 0;
}
}