From 6e2a5b9292c30eab74f9414fa510cb58502abbbd Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Tue, 23 Jan 2024 15:14:04 +0800 Subject: [PATCH 1/5] Improve tsDecompressIntImpl_Hw performance with AVX512. Signed-off-by: Kai Jiang Huanrui Zhang --- cmake/cmake.define | 12 ++-- source/util/src/tdecompress.c | 110 +++++++++++++++++++++++++++++----- 2 files changed, 102 insertions(+), 20 deletions(-) diff --git a/cmake/cmake.define b/cmake/cmake.define index 12e1b50539..735735f0cc 100644 --- a/cmake/cmake.define +++ b/cmake/cmake.define @@ -181,12 +181,12 @@ ELSE () ENDIF() MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED") -# IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) -# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") -# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") -# MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") -# ENDIF() -# + IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") + MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") + ENDIF() + # IF (COMPILER_SUPPORT_AVX512VL) # SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c index f212bf5231..d3cb3118d2 100644 --- a/source/util/src/tdecompress.c +++ b/source/util/src/tdecompress.c @@ -52,7 +52,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, int32_t _pos = 0; int64_t prevValue = 0; -#if __AVX2__ +#if __AVX2__ || __AVX512F__ while (_pos < nelements) { uint64_t w = *(uint64_t*) ip; @@ -72,10 +72,33 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, int32_t gRemainder = (nelements - _pos); int32_t num = (gRemainder > elems)? elems:gRemainder; - int32_t batch = num >> 2; - int32_t remain = num & 0x03; + int32_t batch = 0; + int32_t remain = 0; + if (tsSIMDEnable && tsAVX512Enable) { +#if __AVX512F__ + batch = num >> 3; + remain = num & 0x07; +#endif + } else if (tsSIMDEnable && tsAVX2Enable) { +#if __AVX2__ + batch = num >> 2; + remain = num & 0x03; +#endif + } + if (selector == 0 || selector == 1) { - if (tsSIMDEnable && tsAVX2Enable) { + if (tsSIMDEnable && tsAVX512Enable) { +#if __AVX512F__ + for (int32_t i = 0; i < batch; ++i) { + __m512i prev = _mm512_set1_epi64(prevValue); + _mm512_storeu_si512((__m512i *)&p[_pos], prev); + _pos += 8; //handle 64bit x 8 = 512bit + } + for (int32_t i = 0; i < remain; ++i) { + p[_pos++] = prevValue; + } +#endif + } else if (tsSIMDEnable && tsAVX2Enable) { for (int32_t i = 0; i < batch; ++i) { __m256i prev = _mm256_set1_epi64x(prevValue); _mm256_storeu_si256((__m256i *)&p[_pos], prev); @@ -85,10 +108,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, for (int32_t i = 0; i < remain; ++i) { p[_pos++] = prevValue; } - } else if (tsSIMDEnable && tsAVX512Enable) { -#if __AVX512F__ - // todo add avx512 impl -#endif + } else { // alternative implementation without SIMD instructions. for (int32_t i = 0; i < elems && count < nelements; i++, count++) { p[_pos++] = prevValue; @@ -96,7 +116,73 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, } } } else { - if (tsSIMDEnable && tsAVX2Enable) { + if (tsSIMDEnable && tsAVX512Enable) { + #if __AVX512F__ + __m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0); + __m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1); + __m512i sum_mask3 = _mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3); + __m512i base = _mm512_set1_epi64(w); + __m512i maskVal = _mm512_set1_epi64(mask); + __m512i shiftBits = _mm512_set_epi64(bit * 7 + 4, bit * 6 + 4, bit * 5 + 4, bit * 4 + 4, bit * 3 + 4, bit * 2 + 4, bit + 4, 4); + __m512i inc = _mm512_set1_epi64(bit << 3); + + for (int32_t i = 0; i < batch; ++i) { + + __m512i after = _mm512_srlv_epi64(base, shiftBits); + __m512i zigzagVal = _mm512_and_si512(after, maskVal); + + // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) + __m512i signmask = _mm512_and_si512(_mm512_set1_epi64(1), zigzagVal); + signmask = _mm512_sub_epi64(_mm512_setzero_si512(), signmask); + __m512i delta = _mm512_xor_si512(_mm512_srli_epi64(zigzagVal, 1), signmask); + + // calculate the cumulative sum (prefix sum) for each number + // decode[0] = prevValue + final[0] + // decode[1] = decode[0] + final[1] -----> prevValue + final[0] + final[1] + // decode[2] = decode[1] + final[2] -----> prevValue + final[0] + final[1] + final[2] + // decode[3] = decode[2] + final[3] -----> prevValue + final[0] + final[1] + final[2] + final[3] + + + //7 6 5 4 3 2 1 0 + //D7 D6 D5 D4 D3 D2 D1 D0 + //D6 0 D4 0 D2 0 D0 0 + //D7+D6 D6 D5+D4 D4 D3+D2 D2 D1+D0 D0 + //13 6 9 4 5 2 1 0 + __m512i prev = _mm512_set1_epi64(prevValue); + __m512i cum_sum = _mm512_add_epi64(delta, _mm512_maskz_permutexvar_epi64(0xaa, sum_mask1, delta)); + cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xcc, sum_mask2, cum_sum)); + cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xf0, sum_mask3, cum_sum)); + + + + //13 6 9 4 5 2 1 0 + //D7,D6 D6 D5,D4 D4 D3,D2 D2 D1,D0 D0 + //+D5,D4 D5,D4, 0 0 D1,D0 D1,D0 0 0 + //D7~D4 D6~D4 D5~D4 D4 D3~D0 D2~D0 D1~D0 D0 + //22 15 9 4 6 3 1 0 + // + //D3~D0 D3~D0 D3~D0 D3~D0 0 0 0 0 + //28 21 15 10 6 3 1 0 + + + cum_sum = _mm512_add_epi64(cum_sum, prev); + _mm512_storeu_si512((__m512i *)&p[_pos], cum_sum); + + shiftBits = _mm512_add_epi64(shiftBits, inc); + prevValue = p[_pos + 7]; + _pos += 8; + + } + // handle the remain value + for (int32_t i = 0; i < remain; i++) { + zigzag_value = ((w >> (v + (batch * bit * 8))) & mask); + prevValue += ZIGZAG_DECODE(int64_t, zigzag_value); + + p[_pos++] = prevValue; + v += bit; + } +#endif + } else if (tsSIMDEnable && tsAVX2Enable) { __m256i base = _mm256_set1_epi64x(w); __m256i maskVal = _mm256_set1_epi64x(mask); @@ -157,10 +243,6 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, p[_pos++] = prevValue; v += bit; } - } else if (tsSIMDEnable && tsAVX512Enable) { -#if __AVX512F__ - // todo add avx512 impl -#endif } else { // alternative implementation without SIMD instructions. for (int32_t i = 0; i < elems && count < nelements; i++, count++) { zigzag_value = ((w >> v) & mask); @@ -507,4 +589,4 @@ int32_t tsDecompressTimestampAvx512(const char *const input, const int32_t nelem #endif return 0; -} \ No newline at end of file +} From 04281bcd07388b21255e433002b9178f2e8a5005 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 19 Jun 2024 15:06:09 +0800 Subject: [PATCH 2/5] fix(util): uncomment the timestamp decode function with AVX2 instructions, and do some internal refactor. --- cmake/cmake.define | 14 +++--- include/os/osEnv.h | 7 +-- source/client/test/clientTests.cpp | 9 +--- source/common/src/tglobal.c | 7 +-- source/libs/function/src/detail/tminmax.c | 6 +-- source/os/src/osEnv.c | 7 +-- source/os/src/osSysinfo.c | 2 +- source/util/src/tcompression.c | 10 ++--- source/util/src/tdecompress.c | 21 ++++----- source/util/test/decompressTest.cpp | 54 ++++++++++++++++++++++- 10 files changed, 92 insertions(+), 45 deletions(-) diff --git a/cmake/cmake.define b/cmake/cmake.define index 735735f0cc..f1a5cef67e 100644 --- a/cmake/cmake.define +++ b/cmake/cmake.define @@ -180,18 +180,20 @@ ELSE () SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") ENDIF() MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED") + ENDIF() + IF ("${SIMD_AVX512_SUPPORT}" MATCHES "true") IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") - MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") + MESSAGE(STATUS "avx512f/avx512bmi enabled by compiler") ENDIF() -# IF (COMPILER_SUPPORT_AVX512VL) -# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") -# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") -# MESSAGE(STATUS "avx512vl supported by compiler") -# ENDIF() + IF (COMPILER_SUPPORT_AVX512VL) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") + MESSAGE(STATUS "avx512vl enabled by compiler") + ENDIF() ENDIF() # build mode diff --git a/include/os/osEnv.h b/include/os/osEnv.h index ac4ecd4212..e3e5da59f5 100644 --- a/include/os/osEnv.h +++ b/include/os/osEnv.h @@ -39,9 +39,10 @@ extern char *tsProcPath; extern char tsSIMDEnable; extern char tsSSE42Enable; extern char tsAVXEnable; -extern char tsAVX2Enable; -extern char tsFMAEnable; -extern char tsAVX512Enable; +extern char tsAVX2Supported; +extern char tsFMASupported; +extern char tsAVX512Supported; +extern char tsAVX512Enable; extern char tsTagFilterCache; extern char configDir[]; diff --git a/source/client/test/clientTests.cpp b/source/client/test/clientTests.cpp index b5bad92dc4..bbd759b3d1 100644 --- a/source/client/test/clientTests.cpp +++ b/source/client/test/clientTests.cpp @@ -828,12 +828,8 @@ TEST(clientCase, projection_query_tables) { // printf("error in create db, reason:%s\n", taos_errstr(pRes)); // } // taos_free_result(pRes); -/* - TAOS_RES* pRes = taos_query(pConn, "select last(ts), ts from cache_1.t1"); -// pRes = taos_query(pConn, "select last(ts), ts from cache_1.no_pk_t1"); - if (taos_errno(pRes) != 0) { - printf("failed to create table tu, reason:%s\n", taos_errstr(pRes)); - } + + pRes= taos_query(pConn, "use abc1"); taos_free_result(pRes); pRes = taos_query(pConn, "create table tu using st2 tags(2)"); @@ -868,7 +864,6 @@ TEST(clientCase, projection_query_tables) { createNewTable(pConn, i, 100000, 0, pstr); } } -*/ pRes = taos_query(pConn, "select * from abc1.st2"); if (taos_errno(pRes) != 0) { diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c index c68dc85c29..5b0361f8ca 100644 --- a/source/common/src/tglobal.c +++ b/source/common/src/tglobal.c @@ -596,10 +596,11 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) { if (cfgAddBool(pCfg, "ssd42", tsSSE42Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx", tsAVXEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "avx2", tsAVX2Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "fma", tsFMAEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "avx512", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "avx2", tsAVX2Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "fma", tsFMASupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "avx512", tsAVX512Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "simdEnable", tsSIMDEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "AVX512Enable", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "tagFilterCache", tsTagFilterCache, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c index a6c91a57ce..e36157b565 100644 --- a/source/libs/function/src/detail/tminmax.c +++ b/source/libs/function/src/detail/tminmax.c @@ -370,7 +370,7 @@ static int32_t findFirstValPosition(const SColumnInfoData* pCol, int32_t start, static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, bool signVal) { // AVX2 version to speedup the loop - if (tsAVX2Enable && tsSIMDEnable) { + if (tsAVX2Supported && tsSIMDEnable) { pBuf->v = i8VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); } else { if (!pBuf->assign) { @@ -404,7 +404,7 @@ static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SM static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, bool signVal) { // AVX2 version to speedup the loop - if (tsAVX2Enable && tsSIMDEnable) { + if (tsAVX2Supported && tsSIMDEnable) { pBuf->v = i16VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); } else { if (!pBuf->assign) { @@ -438,7 +438,7 @@ static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, S static void handleInt32Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, bool signVal) { // AVX2 version to speedup the loop - if (tsAVX2Enable && tsSIMDEnable) { + if (tsAVX2Supported && tsSIMDEnable) { pBuf->v = i32VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); } else { if (!pBuf->assign) { diff --git a/source/os/src/osEnv.c b/source/os/src/osEnv.c index 54107db325..ea88d5307b 100644 --- a/source/os/src/osEnv.c +++ b/source/os/src/osEnv.c @@ -38,11 +38,12 @@ int64_t tsTotalMemoryKB = 0; char *tsProcPath = NULL; char tsSIMDEnable = 0; +char tsAVX512Enable = 0; char tsSSE42Enable = 0; char tsAVXEnable = 0; -char tsAVX2Enable = 0; -char tsFMAEnable = 0; -char tsAVX512Enable = 0; +char tsAVX2Supported = 0; +char tsFMASupported = 0; +char tsAVX512Supported = 0; void osDefaultInit() { taosSeedRand(taosSafeRand()); diff --git a/source/os/src/osSysinfo.c b/source/os/src/osSysinfo.c index 187461826a..71cbf5541f 100644 --- a/source/os/src/osSysinfo.c +++ b/source/os/src/osSysinfo.c @@ -250,7 +250,7 @@ void taosGetSystemInfo() { taosGetCpuCores(&tsNumOfCores, false); taosGetTotalMemory(&tsTotalMemoryKB); taosGetCpuUsage(NULL, NULL); - taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Enable, &tsFMAEnable, &tsAVX512Enable); + taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported); #endif } diff --git a/source/util/src/tcompression.c b/source/util/src/tcompression.c index 4635ec340d..053a716721 100644 --- a/source/util/src/tcompression.c +++ b/source/util/src/tcompression.c @@ -822,9 +822,9 @@ int32_t tsDecompressTimestampImp(const char *const input, const int32_t nelement memcpy(output, input + 1, nelements * longBytes); return nelements * longBytes; } else if (input[0] == 1) { // Decompress - if (tsSIMDEnable && tsAVX512Enable) { + if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { tsDecompressTimestampAvx512(input, nelements, output, false); - } else if (tsSIMDEnable && tsAVX2Enable) { + } else if (tsSIMDEnable && tsAVX2Supported) { tsDecompressTimestampAvx2(input, nelements, output, false); } else { int64_t *ostream = (int64_t *)output; @@ -1198,9 +1198,9 @@ int32_t tsDecompressFloatImp(const char *const input, const int32_t nelements, c return nelements * FLOAT_BYTES; } - if (tsSIMDEnable && tsAVX2Enable) { + if (tsSIMDEnable && tsAVX2Supported) { tsDecompressFloatImplAvx2(input, nelements, output); - } else if (tsSIMDEnable && tsAVX512Enable) { + } else if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { tsDecompressFloatImplAvx512(input, nelements, output); } else { // alternative implementation without SIMD instructions. tsDecompressFloatHelper(input, nelements, (float *)output); @@ -2713,7 +2713,7 @@ int32_t tsDecompressBigint(void *pIn, int32_t nIn, int32_t nEle, void *pOut, int int8_t alvl = tsGetCompressL2Level(l2, lvl); \ return compressL2Dict[l2].comprFn(pIn, nIn, pOut, nOut, type, alvl); \ } else { \ - uTrace("dencode:%s, dcompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl, \ + uTrace("dencode:%s, decompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl, \ tDataTypes[type].name); \ return compressL2Dict[l2].decomprFn(pIn, nIn, pOut, nOut, type); \ } \ diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c index d3cb3118d2..38f277fb48 100644 --- a/source/util/src/tdecompress.c +++ b/source/util/src/tdecompress.c @@ -74,12 +74,12 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, int32_t batch = 0; int32_t remain = 0; - if (tsSIMDEnable && tsAVX512Enable) { + if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { #if __AVX512F__ batch = num >> 3; remain = num & 0x07; #endif - } else if (tsSIMDEnable && tsAVX2Enable) { + } else if (tsSIMDEnable && tsAVX2Supported) { #if __AVX2__ batch = num >> 2; remain = num & 0x03; @@ -87,7 +87,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, } if (selector == 0 || selector == 1) { - if (tsSIMDEnable && tsAVX512Enable) { + if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { #if __AVX512F__ for (int32_t i = 0; i < batch; ++i) { __m512i prev = _mm512_set1_epi64(prevValue); @@ -98,7 +98,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, p[_pos++] = prevValue; } #endif - } else if (tsSIMDEnable && tsAVX2Enable) { + } else if (tsSIMDEnable && tsAVX2Supported) { for (int32_t i = 0; i < batch; ++i) { __m256i prev = _mm256_set1_epi64x(prevValue); _mm256_storeu_si256((__m256i *)&p[_pos], prev); @@ -116,7 +116,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, } } } else { - if (tsSIMDEnable && tsAVX512Enable) { + if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { #if __AVX512F__ __m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0); __m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1); @@ -182,7 +182,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, v += bit; } #endif - } else if (tsSIMDEnable && tsAVX2Enable) { + } else if (tsSIMDEnable && tsAVX2Supported) { __m256i base = _mm256_set1_epi64x(w); __m256i maskVal = _mm256_set1_epi64x(mask); @@ -331,16 +331,16 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output, bool bigEndian) { -#if 0 int64_t *ostream = (int64_t *)output; int32_t ipos = 1, opos = 0; + +#if __AVX2__ __m128i prevVal = _mm_setzero_si128(); __m128i prevDelta = _mm_setzero_si128(); -#if __AVX2__ int32_t batch = nelements >> 1; int32_t remainder = nelements & 0x01; - __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; +// __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; int32_t i = 0; if (batch > 1) { @@ -398,8 +398,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 int8_t nbytes2 = (flags >> 4) & INT8MASK(4); -// __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); -// __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); __m128i data1; if (nbytes1 == 0) { data1 = _mm_setzero_si128(); @@ -471,7 +469,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen ostream[opos++] = prevVal[1] + prevDeltaX; } } -#endif #endif return 0; } diff --git a/source/util/test/decompressTest.cpp b/source/util/test/decompressTest.cpp index 2ddef3f595..c982f450ea 100644 --- a/source/util/test/decompressTest.cpp +++ b/source/util/test/decompressTest.cpp @@ -4,9 +4,16 @@ #include #include "ttypes.h" -namespace {} // namespace +namespace { + +} // namespace + +TEST(utilTest, decompress_ts_test) { + { + tsSIMDEnable = 1; + tsAVX2Supported = 1; + } -TEST(utilTest, decompress_test) { int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400, 1700000500, 1700000600, 1700000700, 1700000800, 1700000900}; @@ -57,6 +64,49 @@ TEST(utilTest, decompress_test) { } } +TEST(utilTest, decompress_bigint_avx2_test) { + { + tsSIMDEnable = 1; + tsAVX2Supported = 1; + } + + int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400, + 1700000500, 1700000600, 1700000700, 1700000800, 1700000900}; + + char* pOutput[10 * sizeof(int64_t)] = {0}; + int32_t len = tsCompressBigint(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10, + ONE_STAGE_COMP, NULL, 0); + + char* decompOutput[10 * 8] = {0}; + + tsDecompressBigint(pOutput, len, 10, decompOutput, sizeof(int64_t) * 10, ONE_STAGE_COMP, NULL, 0); + + for (int32_t i = 0; i < 10; ++i) { + std::cout << ((int64_t*)decompOutput)[i] << std::endl; + } +} + +TEST(utilTest, decompress_int_avx2_test) { + { + tsSIMDEnable = 1; + tsAVX2Supported = 1; + } + + int32_t tsList[10] = {17000000, 17000001, 17000002, 17000003, 17000004, + 17000005, 17000006, 17000007, 17000008, 17000009}; + + char* pOutput[10 * sizeof(int32_t)] = {0}; + int32_t len = + tsCompressInt(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10, ONE_STAGE_COMP, NULL, 0); + + char* decompOutput[10 * 8] = {0}; + tsDecompressInt(pOutput, len, 10, decompOutput, sizeof(int32_t) * 10, ONE_STAGE_COMP, NULL, 0); + + for (int32_t i = 0; i < 10; ++i) { + std::cout << ((int32_t*)decompOutput)[i] << std::endl; + } +} + TEST(utilTest, decompress_perf_test) { int32_t num = 10000; From f3536361367499c7222f13b278904d8d9c74a2bd Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 19 Jun 2024 15:08:25 +0800 Subject: [PATCH 3/5] refactor: do some internal refactor. --- include/os/osEnv.h | 4 ++-- source/common/src/tglobal.c | 4 ++-- source/libs/function/src/detail/tavgfunction.c | 2 +- source/libs/function/src/detail/tminmax.c | 4 ++-- source/os/src/osEnv.c | 4 ++-- source/os/src/osSysinfo.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/os/osEnv.h b/include/os/osEnv.h index e3e5da59f5..4f4a58d4e8 100644 --- a/include/os/osEnv.h +++ b/include/os/osEnv.h @@ -37,8 +37,8 @@ extern float tsNumOfCores; extern int64_t tsTotalMemoryKB; extern char *tsProcPath; extern char tsSIMDEnable; -extern char tsSSE42Enable; -extern char tsAVXEnable; +extern char tsSSE42Supported; +extern char tsAVXSupported; extern char tsAVX2Supported; extern char tsFMASupported; extern char tsAVX512Supported; diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c index 5b0361f8ca..97cb4b4e1d 100644 --- a/source/common/src/tglobal.c +++ b/source/common/src/tglobal.c @@ -594,8 +594,8 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) { if (cfgAddBool(pCfg, "enableCoreFile", 1, CFG_SCOPE_BOTH, CFG_DYN_CLIENT) != 0) return -1; if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 1, 100000, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "ssd42", tsSSE42Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "avx", tsAVXEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "ssd42", tsSSE42Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "avx", tsAVXSupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx2", tsAVX2Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "fma", tsFMASupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx512", tsAVX512Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; diff --git a/source/libs/function/src/detail/tavgfunction.c b/source/libs/function/src/detail/tavgfunction.c index 66ed092f76..3d51f0cd16 100644 --- a/source/libs/function/src/detail/tavgfunction.c +++ b/source/libs/function/src/detail/tavgfunction.c @@ -565,7 +565,7 @@ int32_t avgFunction(SqlFunctionCtx* pCtx) { numOfElem = pInput->numOfRows; pAvgRes->count += pInput->numOfRows; - bool simdAvailable = tsAVXEnable && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE); + bool simdAvailable = tsAVXSupported && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE); switch(type) { case TSDB_DATA_TYPE_UTINYINT: diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c index e36157b565..331f222a71 100644 --- a/source/libs/function/src/detail/tminmax.c +++ b/source/libs/function/src/detail/tminmax.c @@ -502,7 +502,7 @@ static void handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRo float* val = (float*)&pBuf->v; // AVX version to speedup the loop - if (tsAVXEnable && tsSIMDEnable) { + if (tsAVXSupported && tsSIMDEnable) { *val = floatVectorCmpAVX(pData, numOfRows, isMinFunc); } else { if (!pBuf->assign) { @@ -533,7 +533,7 @@ static void handleDoubleCol(SColumnInfoData* pCol, int32_t start, int32_t numOfR double* val = (double*)&pBuf->v; // AVX version to speedup the loop - if (tsAVXEnable && tsSIMDEnable) { + if (tsAVXSupported && tsSIMDEnable) { *val = (double)doubleVectorCmpAVX(pData, numOfRows, isMinFunc); } else { if (!pBuf->assign) { diff --git a/source/os/src/osEnv.c b/source/os/src/osEnv.c index ea88d5307b..28f4178790 100644 --- a/source/os/src/osEnv.c +++ b/source/os/src/osEnv.c @@ -39,8 +39,8 @@ char *tsProcPath = NULL; char tsSIMDEnable = 0; char tsAVX512Enable = 0; -char tsSSE42Enable = 0; -char tsAVXEnable = 0; +char tsSSE42Supported = 0; +char tsAVXSupported = 0; char tsAVX2Supported = 0; char tsFMASupported = 0; char tsAVX512Supported = 0; diff --git a/source/os/src/osSysinfo.c b/source/os/src/osSysinfo.c index 71cbf5541f..50eb8413c0 100644 --- a/source/os/src/osSysinfo.c +++ b/source/os/src/osSysinfo.c @@ -250,7 +250,7 @@ void taosGetSystemInfo() { taosGetCpuCores(&tsNumOfCores, false); taosGetTotalMemory(&tsTotalMemoryKB); taosGetCpuUsage(NULL, NULL); - taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported); + taosGetCpuInstructions(&tsSSE42Supported, &tsAVXSupported, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported); #endif } From 8d18e7cd9d0df9c135a51a304e5d5fc47d0015f3 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 19 Jun 2024 19:11:52 +0800 Subject: [PATCH 4/5] fix(util): fix decode timestamp error by using avx2. --- source/util/src/tdecompress.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c index 38f277fb48..046cba8686 100644 --- a/source/util/src/tdecompress.c +++ b/source/util/src/tdecompress.c @@ -329,6 +329,7 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen return 0; } +// decode two timestamps in one loop. int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output, bool bigEndian) { int64_t *ostream = (int64_t *)output; @@ -375,13 +376,13 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); - deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); + deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaCurrent); - __m128i val = _mm_add_epi64(deltaCurrent, prevVal); - _mm_storeu_si128((__m128i *)&ostream[opos], val); + __m128i finalVal = _mm_add_epi64(deltaCurrent, prevVal); + _mm_storeu_si128((__m128i *)&ostream[opos], finalVal); // keep the previous value - prevVal = _mm_shuffle_epi32 (val, 0xEE); + prevVal = _mm_shuffle_epi32 (finalVal, 0xEE); // keep the previous delta of delta, for the first item prevDelta = _mm_shuffle_epi32(deltaOfDelta, 0xEE); @@ -428,17 +429,18 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); - deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); + deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaCurrent); - __m128i val = _mm_add_epi64(deltaCurrent, prevVal); - _mm_storeu_si128((__m128i *)&ostream[opos], val); + __m128i finalVal = _mm_add_epi64(deltaCurrent, prevVal); + finalVal = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), finalVal); + + _mm_storeu_si128((__m128i *)&ostream[opos], finalVal); // keep the previous value - prevVal = _mm_shuffle_epi32 (val, 0xEE); + prevVal = _mm_shuffle_epi32 (finalVal, 0xEE); // keep the previous delta of delta - __m128i delta = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaOfDelta); - prevDelta = _mm_shuffle_epi32(_mm_add_epi64(delta, prevDelta), 0xEE); + prevDelta = _mm_shuffle_epi32 (deltaCurrent, 0xEE); opos += 2; ipos += nbytes1 + nbytes2; From e9b07459473f8f391b9ab9d5d39153e156befa86 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 19 Jun 2024 19:15:45 +0800 Subject: [PATCH 5/5] fix(util): update test cases. --- source/util/test/decompressTest.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/source/util/test/decompressTest.cpp b/source/util/test/decompressTest.cpp index c982f450ea..b3cf46fea6 100644 --- a/source/util/test/decompressTest.cpp +++ b/source/util/test/decompressTest.cpp @@ -37,6 +37,20 @@ TEST(utilTest, decompress_ts_test) { std::cout << ((int64_t*)decompOutput)[i] << std::endl; } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + int64_t tsList[4] = {1286, 1124, 2681, 2823}; + + char* pOutput[4 * sizeof(int64_t)] = {0}; + int32_t len = tsCompressTimestamp(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 4, + ONE_STAGE_COMP, NULL, 0); + + char* decompOutput[4 * 8] = {0}; + tsDecompressTimestamp(pOutput, len, 4, decompOutput, sizeof(int64_t) * 4, ONE_STAGE_COMP, NULL, 0); + + for (int32_t i = 0; i < 4; ++i) { + std::cout << ((int64_t*)decompOutput)[i] << std::endl; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// int64_t tsList1[7] = {1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000900}; int32_t len1 = tsCompressTimestamp(tsList1, sizeof(tsList1), sizeof(tsList1) / sizeof(tsList1[0]), pOutput, 7,