diff --git a/cmake/cmake.define b/cmake/cmake.define index 12e1b50539..f1a5cef67e 100644 --- a/cmake/cmake.define +++ b/cmake/cmake.define @@ -180,18 +180,20 @@ ELSE () SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") ENDIF() MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED") + ENDIF() -# IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) -# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") -# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") -# MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") -# ENDIF() -# -# IF (COMPILER_SUPPORT_AVX512VL) -# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") -# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") -# MESSAGE(STATUS "avx512vl supported by compiler") -# ENDIF() + IF ("${SIMD_AVX512_SUPPORT}" MATCHES "true") + IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") + MESSAGE(STATUS "avx512f/avx512bmi enabled by compiler") + ENDIF() + + IF (COMPILER_SUPPORT_AVX512VL) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") + MESSAGE(STATUS "avx512vl enabled by compiler") + ENDIF() ENDIF() # build mode diff --git a/include/os/osEnv.h b/include/os/osEnv.h index ac4ecd4212..4f4a58d4e8 100644 --- a/include/os/osEnv.h +++ b/include/os/osEnv.h @@ -37,11 +37,12 @@ extern float tsNumOfCores; extern int64_t tsTotalMemoryKB; extern char *tsProcPath; extern char tsSIMDEnable; -extern char tsSSE42Enable; -extern char tsAVXEnable; -extern char tsAVX2Enable; -extern char tsFMAEnable; -extern char tsAVX512Enable; +extern char tsSSE42Supported; +extern char tsAVXSupported; +extern char tsAVX2Supported; +extern char tsFMASupported; +extern char tsAVX512Supported; +extern char tsAVX512Enable; extern char tsTagFilterCache; extern char configDir[]; diff --git a/source/client/test/clientTests.cpp b/source/client/test/clientTests.cpp index b5bad92dc4..bbd759b3d1 100644 --- a/source/client/test/clientTests.cpp +++ b/source/client/test/clientTests.cpp @@ -828,12 +828,8 @@ TEST(clientCase, projection_query_tables) { // printf("error in create db, reason:%s\n", taos_errstr(pRes)); // } // taos_free_result(pRes); -/* - TAOS_RES* pRes = taos_query(pConn, "select last(ts), ts from cache_1.t1"); -// pRes = taos_query(pConn, "select last(ts), ts from cache_1.no_pk_t1"); - if (taos_errno(pRes) != 0) { - printf("failed to create table tu, reason:%s\n", taos_errstr(pRes)); - } + + pRes= taos_query(pConn, "use abc1"); taos_free_result(pRes); pRes = taos_query(pConn, "create table tu using st2 tags(2)"); @@ -868,7 +864,6 @@ TEST(clientCase, projection_query_tables) { createNewTable(pConn, i, 100000, 0, pstr); } } -*/ pRes = taos_query(pConn, "select * from abc1.st2"); if (taos_errno(pRes) != 0) { diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c index da692e78fd..ee6ccb5c69 100644 --- a/source/common/src/tglobal.c +++ b/source/common/src/tglobal.c @@ -595,12 +595,13 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) { if (cfgAddBool(pCfg, "enableCoreFile", 1, CFG_SCOPE_BOTH, CFG_DYN_CLIENT) != 0) return -1; if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 1, 100000, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "ssd42", tsSSE42Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "avx", tsAVXEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "avx2", tsAVX2Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "fma", tsFMAEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; - if (cfgAddBool(pCfg, "avx512", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "ssd42", tsSSE42Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "avx", tsAVXSupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "avx2", tsAVX2Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "fma", tsFMASupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "avx512", tsAVX512Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "simdEnable", tsSIMDEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; + if (cfgAddBool(pCfg, "AVX512Enable", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "tagFilterCache", tsTagFilterCache, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; diff --git a/source/libs/function/src/detail/tavgfunction.c b/source/libs/function/src/detail/tavgfunction.c index 66ed092f76..3d51f0cd16 100644 --- a/source/libs/function/src/detail/tavgfunction.c +++ b/source/libs/function/src/detail/tavgfunction.c @@ -565,7 +565,7 @@ int32_t avgFunction(SqlFunctionCtx* pCtx) { numOfElem = pInput->numOfRows; pAvgRes->count += pInput->numOfRows; - bool simdAvailable = tsAVXEnable && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE); + bool simdAvailable = tsAVXSupported && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE); switch(type) { case TSDB_DATA_TYPE_UTINYINT: diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c index 653b8adad7..590a15c753 100644 --- a/source/libs/function/src/detail/tminmax.c +++ b/source/libs/function/src/detail/tminmax.c @@ -370,7 +370,7 @@ static int32_t findFirstValPosition(const SColumnInfoData* pCol, int32_t start, static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, bool signVal) { // AVX2 version to speedup the loop - if (tsAVX2Enable && tsSIMDEnable) { + if (tsAVX2Supported && tsSIMDEnable) { pBuf->v = i8VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); } else { if (!pBuf->assign) { @@ -404,7 +404,7 @@ static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SM static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, bool signVal) { // AVX2 version to speedup the loop - if (tsAVX2Enable && tsSIMDEnable) { + if (tsAVX2Supported && tsSIMDEnable) { pBuf->v = i16VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); } else { if (!pBuf->assign) { @@ -438,7 +438,7 @@ static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, S static void handleInt32Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, bool signVal) { // AVX2 version to speedup the loop - if (tsAVX2Enable && tsSIMDEnable) { + if (tsAVX2Supported && tsSIMDEnable) { pBuf->v = i32VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); } else { if (!pBuf->assign) { @@ -502,7 +502,7 @@ static void handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRo float* val = (float*)&pBuf->v; // AVX version to speedup the loop - if (tsAVXEnable && tsSIMDEnable) { + if (tsAVXSupported && tsSIMDEnable) { *val = floatVectorCmpAVX(pData, numOfRows, isMinFunc); } else { if (!pBuf->assign) { @@ -533,7 +533,7 @@ static void handleDoubleCol(SColumnInfoData* pCol, int32_t start, int32_t numOfR double* val = (double*)&pBuf->v; // AVX version to speedup the loop - if (tsAVXEnable && tsSIMDEnable) { + if (tsAVXSupported && tsSIMDEnable) { *val = (double)doubleVectorCmpAVX(pData, numOfRows, isMinFunc); } else { if (!pBuf->assign) { diff --git a/source/os/src/osEnv.c b/source/os/src/osEnv.c index 54107db325..28f4178790 100644 --- a/source/os/src/osEnv.c +++ b/source/os/src/osEnv.c @@ -38,11 +38,12 @@ int64_t tsTotalMemoryKB = 0; char *tsProcPath = NULL; char tsSIMDEnable = 0; -char tsSSE42Enable = 0; -char tsAVXEnable = 0; -char tsAVX2Enable = 0; -char tsFMAEnable = 0; char tsAVX512Enable = 0; +char tsSSE42Supported = 0; +char tsAVXSupported = 0; +char tsAVX2Supported = 0; +char tsFMASupported = 0; +char tsAVX512Supported = 0; void osDefaultInit() { taosSeedRand(taosSafeRand()); diff --git a/source/os/src/osSysinfo.c b/source/os/src/osSysinfo.c index 187461826a..50eb8413c0 100644 --- a/source/os/src/osSysinfo.c +++ b/source/os/src/osSysinfo.c @@ -250,7 +250,7 @@ void taosGetSystemInfo() { taosGetCpuCores(&tsNumOfCores, false); taosGetTotalMemory(&tsTotalMemoryKB); taosGetCpuUsage(NULL, NULL); - taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Enable, &tsFMAEnable, &tsAVX512Enable); + taosGetCpuInstructions(&tsSSE42Supported, &tsAVXSupported, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported); #endif } diff --git a/source/util/src/tcompression.c b/source/util/src/tcompression.c index 73875d836a..884d7ea1b6 100644 --- a/source/util/src/tcompression.c +++ b/source/util/src/tcompression.c @@ -822,9 +822,9 @@ int32_t tsDecompressTimestampImp(const char *const input, const int32_t nelement memcpy(output, input + 1, nelements * longBytes); return nelements * longBytes; } else if (input[0] == 1) { // Decompress - if (tsSIMDEnable && tsAVX512Enable) { + if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { tsDecompressTimestampAvx512(input, nelements, output, false); - } else if (tsSIMDEnable && tsAVX2Enable) { + } else if (tsSIMDEnable && tsAVX2Supported) { tsDecompressTimestampAvx2(input, nelements, output, false); } else { int64_t *ostream = (int64_t *)output; @@ -1198,9 +1198,9 @@ int32_t tsDecompressFloatImp(const char *const input, const int32_t nelements, c return nelements * FLOAT_BYTES; } - if (tsSIMDEnable && tsAVX2Enable) { + if (tsSIMDEnable && tsAVX2Supported) { tsDecompressFloatImplAvx2(input, nelements, output); - } else if (tsSIMDEnable && tsAVX512Enable) { + } else if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { tsDecompressFloatImplAvx512(input, nelements, output); } else { // alternative implementation without SIMD instructions. tsDecompressFloatHelper(input, nelements, (float *)output); @@ -2713,7 +2713,7 @@ int32_t tsDecompressBigint(void *pIn, int32_t nIn, int32_t nEle, void *pOut, int int8_t alvl = tsGetCompressL2Level(l2, lvl); \ return compressL2Dict[l2].comprFn(pIn, nIn, pOut, nOut, type, alvl); \ } else { \ - uTrace("dencode:%s, dcompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl, \ + uTrace("dencode:%s, decompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl, \ tDataTypes[type].name); \ return compressL2Dict[l2].decomprFn(pIn, nIn, pOut, nOut, type); \ } \ diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c index f212bf5231..046cba8686 100644 --- a/source/util/src/tdecompress.c +++ b/source/util/src/tdecompress.c @@ -52,7 +52,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, int32_t _pos = 0; int64_t prevValue = 0; -#if __AVX2__ +#if __AVX2__ || __AVX512F__ while (_pos < nelements) { uint64_t w = *(uint64_t*) ip; @@ -72,10 +72,33 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, int32_t gRemainder = (nelements - _pos); int32_t num = (gRemainder > elems)? elems:gRemainder; - int32_t batch = num >> 2; - int32_t remain = num & 0x03; + int32_t batch = 0; + int32_t remain = 0; + if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { +#if __AVX512F__ + batch = num >> 3; + remain = num & 0x07; +#endif + } else if (tsSIMDEnable && tsAVX2Supported) { +#if __AVX2__ + batch = num >> 2; + remain = num & 0x03; +#endif + } + if (selector == 0 || selector == 1) { - if (tsSIMDEnable && tsAVX2Enable) { + if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { +#if __AVX512F__ + for (int32_t i = 0; i < batch; ++i) { + __m512i prev = _mm512_set1_epi64(prevValue); + _mm512_storeu_si512((__m512i *)&p[_pos], prev); + _pos += 8; //handle 64bit x 8 = 512bit + } + for (int32_t i = 0; i < remain; ++i) { + p[_pos++] = prevValue; + } +#endif + } else if (tsSIMDEnable && tsAVX2Supported) { for (int32_t i = 0; i < batch; ++i) { __m256i prev = _mm256_set1_epi64x(prevValue); _mm256_storeu_si256((__m256i *)&p[_pos], prev); @@ -85,10 +108,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, for (int32_t i = 0; i < remain; ++i) { p[_pos++] = prevValue; } - } else if (tsSIMDEnable && tsAVX512Enable) { -#if __AVX512F__ - // todo add avx512 impl -#endif + } else { // alternative implementation without SIMD instructions. for (int32_t i = 0; i < elems && count < nelements; i++, count++) { p[_pos++] = prevValue; @@ -96,7 +116,73 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, } } } else { - if (tsSIMDEnable && tsAVX2Enable) { + if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) { + #if __AVX512F__ + __m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0); + __m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1); + __m512i sum_mask3 = _mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3); + __m512i base = _mm512_set1_epi64(w); + __m512i maskVal = _mm512_set1_epi64(mask); + __m512i shiftBits = _mm512_set_epi64(bit * 7 + 4, bit * 6 + 4, bit * 5 + 4, bit * 4 + 4, bit * 3 + 4, bit * 2 + 4, bit + 4, 4); + __m512i inc = _mm512_set1_epi64(bit << 3); + + for (int32_t i = 0; i < batch; ++i) { + + __m512i after = _mm512_srlv_epi64(base, shiftBits); + __m512i zigzagVal = _mm512_and_si512(after, maskVal); + + // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) + __m512i signmask = _mm512_and_si512(_mm512_set1_epi64(1), zigzagVal); + signmask = _mm512_sub_epi64(_mm512_setzero_si512(), signmask); + __m512i delta = _mm512_xor_si512(_mm512_srli_epi64(zigzagVal, 1), signmask); + + // calculate the cumulative sum (prefix sum) for each number + // decode[0] = prevValue + final[0] + // decode[1] = decode[0] + final[1] -----> prevValue + final[0] + final[1] + // decode[2] = decode[1] + final[2] -----> prevValue + final[0] + final[1] + final[2] + // decode[3] = decode[2] + final[3] -----> prevValue + final[0] + final[1] + final[2] + final[3] + + + //7 6 5 4 3 2 1 0 + //D7 D6 D5 D4 D3 D2 D1 D0 + //D6 0 D4 0 D2 0 D0 0 + //D7+D6 D6 D5+D4 D4 D3+D2 D2 D1+D0 D0 + //13 6 9 4 5 2 1 0 + __m512i prev = _mm512_set1_epi64(prevValue); + __m512i cum_sum = _mm512_add_epi64(delta, _mm512_maskz_permutexvar_epi64(0xaa, sum_mask1, delta)); + cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xcc, sum_mask2, cum_sum)); + cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xf0, sum_mask3, cum_sum)); + + + + //13 6 9 4 5 2 1 0 + //D7,D6 D6 D5,D4 D4 D3,D2 D2 D1,D0 D0 + //+D5,D4 D5,D4, 0 0 D1,D0 D1,D0 0 0 + //D7~D4 D6~D4 D5~D4 D4 D3~D0 D2~D0 D1~D0 D0 + //22 15 9 4 6 3 1 0 + // + //D3~D0 D3~D0 D3~D0 D3~D0 0 0 0 0 + //28 21 15 10 6 3 1 0 + + + cum_sum = _mm512_add_epi64(cum_sum, prev); + _mm512_storeu_si512((__m512i *)&p[_pos], cum_sum); + + shiftBits = _mm512_add_epi64(shiftBits, inc); + prevValue = p[_pos + 7]; + _pos += 8; + + } + // handle the remain value + for (int32_t i = 0; i < remain; i++) { + zigzag_value = ((w >> (v + (batch * bit * 8))) & mask); + prevValue += ZIGZAG_DECODE(int64_t, zigzag_value); + + p[_pos++] = prevValue; + v += bit; + } +#endif + } else if (tsSIMDEnable && tsAVX2Supported) { __m256i base = _mm256_set1_epi64x(w); __m256i maskVal = _mm256_set1_epi64x(mask); @@ -157,10 +243,6 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, p[_pos++] = prevValue; v += bit; } - } else if (tsSIMDEnable && tsAVX512Enable) { -#if __AVX512F__ - // todo add avx512 impl -#endif } else { // alternative implementation without SIMD instructions. for (int32_t i = 0; i < elems && count < nelements; i++, count++) { zigzag_value = ((w >> v) & mask); @@ -247,18 +329,19 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen return 0; } +// decode two timestamps in one loop. int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output, bool bigEndian) { -#if 0 int64_t *ostream = (int64_t *)output; int32_t ipos = 1, opos = 0; + +#if __AVX2__ __m128i prevVal = _mm_setzero_si128(); __m128i prevDelta = _mm_setzero_si128(); -#if __AVX2__ int32_t batch = nelements >> 1; int32_t remainder = nelements & 0x01; - __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; +// __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; int32_t i = 0; if (batch > 1) { @@ -293,13 +376,13 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); - deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); + deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaCurrent); - __m128i val = _mm_add_epi64(deltaCurrent, prevVal); - _mm_storeu_si128((__m128i *)&ostream[opos], val); + __m128i finalVal = _mm_add_epi64(deltaCurrent, prevVal); + _mm_storeu_si128((__m128i *)&ostream[opos], finalVal); // keep the previous value - prevVal = _mm_shuffle_epi32 (val, 0xEE); + prevVal = _mm_shuffle_epi32 (finalVal, 0xEE); // keep the previous delta of delta, for the first item prevDelta = _mm_shuffle_epi32(deltaOfDelta, 0xEE); @@ -316,8 +399,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 int8_t nbytes2 = (flags >> 4) & INT8MASK(4); -// __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos)); -// __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1)); __m128i data1; if (nbytes1 == 0) { data1 = _mm_setzero_si128(); @@ -348,17 +429,18 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); - deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); + deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaCurrent); - __m128i val = _mm_add_epi64(deltaCurrent, prevVal); - _mm_storeu_si128((__m128i *)&ostream[opos], val); + __m128i finalVal = _mm_add_epi64(deltaCurrent, prevVal); + finalVal = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), finalVal); + + _mm_storeu_si128((__m128i *)&ostream[opos], finalVal); // keep the previous value - prevVal = _mm_shuffle_epi32 (val, 0xEE); + prevVal = _mm_shuffle_epi32 (finalVal, 0xEE); // keep the previous delta of delta - __m128i delta = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaOfDelta); - prevDelta = _mm_shuffle_epi32(_mm_add_epi64(delta, prevDelta), 0xEE); + prevDelta = _mm_shuffle_epi32 (deltaCurrent, 0xEE); opos += 2; ipos += nbytes1 + nbytes2; @@ -389,7 +471,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen ostream[opos++] = prevVal[1] + prevDeltaX; } } -#endif #endif return 0; } @@ -507,4 +588,4 @@ int32_t tsDecompressTimestampAvx512(const char *const input, const int32_t nelem #endif return 0; -} \ No newline at end of file +} diff --git a/source/util/test/decompressTest.cpp b/source/util/test/decompressTest.cpp index 2ddef3f595..b3cf46fea6 100644 --- a/source/util/test/decompressTest.cpp +++ b/source/util/test/decompressTest.cpp @@ -4,9 +4,16 @@ #include #include "ttypes.h" -namespace {} // namespace +namespace { + +} // namespace + +TEST(utilTest, decompress_ts_test) { + { + tsSIMDEnable = 1; + tsAVX2Supported = 1; + } -TEST(utilTest, decompress_test) { int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400, 1700000500, 1700000600, 1700000700, 1700000800, 1700000900}; @@ -30,6 +37,20 @@ TEST(utilTest, decompress_test) { std::cout << ((int64_t*)decompOutput)[i] << std::endl; } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + int64_t tsList[4] = {1286, 1124, 2681, 2823}; + + char* pOutput[4 * sizeof(int64_t)] = {0}; + int32_t len = tsCompressTimestamp(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 4, + ONE_STAGE_COMP, NULL, 0); + + char* decompOutput[4 * 8] = {0}; + tsDecompressTimestamp(pOutput, len, 4, decompOutput, sizeof(int64_t) * 4, ONE_STAGE_COMP, NULL, 0); + + for (int32_t i = 0; i < 4; ++i) { + std::cout << ((int64_t*)decompOutput)[i] << std::endl; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// int64_t tsList1[7] = {1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000900}; int32_t len1 = tsCompressTimestamp(tsList1, sizeof(tsList1), sizeof(tsList1) / sizeof(tsList1[0]), pOutput, 7, @@ -57,6 +78,49 @@ TEST(utilTest, decompress_test) { } } +TEST(utilTest, decompress_bigint_avx2_test) { + { + tsSIMDEnable = 1; + tsAVX2Supported = 1; + } + + int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400, + 1700000500, 1700000600, 1700000700, 1700000800, 1700000900}; + + char* pOutput[10 * sizeof(int64_t)] = {0}; + int32_t len = tsCompressBigint(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10, + ONE_STAGE_COMP, NULL, 0); + + char* decompOutput[10 * 8] = {0}; + + tsDecompressBigint(pOutput, len, 10, decompOutput, sizeof(int64_t) * 10, ONE_STAGE_COMP, NULL, 0); + + for (int32_t i = 0; i < 10; ++i) { + std::cout << ((int64_t*)decompOutput)[i] << std::endl; + } +} + +TEST(utilTest, decompress_int_avx2_test) { + { + tsSIMDEnable = 1; + tsAVX2Supported = 1; + } + + int32_t tsList[10] = {17000000, 17000001, 17000002, 17000003, 17000004, + 17000005, 17000006, 17000007, 17000008, 17000009}; + + char* pOutput[10 * sizeof(int32_t)] = {0}; + int32_t len = + tsCompressInt(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10, ONE_STAGE_COMP, NULL, 0); + + char* decompOutput[10 * 8] = {0}; + tsDecompressInt(pOutput, len, 10, decompOutput, sizeof(int32_t) * 10, ONE_STAGE_COMP, NULL, 0); + + for (int32_t i = 0; i < 10; ++i) { + std::cout << ((int32_t*)decompOutput)[i] << std::endl; + } +} + TEST(utilTest, decompress_perf_test) { int32_t num = 10000;