fix(util): uncomment the timestamp decode function with AVX2 instructions, and do some internal refactor.

This commit is contained in:
Haojun Liao 2024-06-19 15:06:09 +08:00
parent 6e2a5b9292
commit 04281bcd07
10 changed files with 92 additions and 45 deletions

View File

@ -180,18 +180,20 @@ ELSE ()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
ENDIF() ENDIF()
MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED") MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED")
ENDIF()
IF ("${SIMD_AVX512_SUPPORT}" MATCHES "true")
IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") MESSAGE(STATUS "avx512f/avx512bmi enabled by compiler")
ENDIF() ENDIF()
# IF (COMPILER_SUPPORT_AVX512VL) IF (COMPILER_SUPPORT_AVX512VL)
# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl")
# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl")
# MESSAGE(STATUS "avx512vl supported by compiler") MESSAGE(STATUS "avx512vl enabled by compiler")
# ENDIF() ENDIF()
ENDIF() ENDIF()
# build mode # build mode

View File

@ -39,9 +39,10 @@ extern char *tsProcPath;
extern char tsSIMDEnable; extern char tsSIMDEnable;
extern char tsSSE42Enable; extern char tsSSE42Enable;
extern char tsAVXEnable; extern char tsAVXEnable;
extern char tsAVX2Enable; extern char tsAVX2Supported;
extern char tsFMAEnable; extern char tsFMASupported;
extern char tsAVX512Enable; extern char tsAVX512Supported;
extern char tsAVX512Enable;
extern char tsTagFilterCache; extern char tsTagFilterCache;
extern char configDir[]; extern char configDir[];

View File

@ -828,12 +828,8 @@ TEST(clientCase, projection_query_tables) {
// printf("error in create db, reason:%s\n", taos_errstr(pRes)); // printf("error in create db, reason:%s\n", taos_errstr(pRes));
// } // }
// taos_free_result(pRes); // taos_free_result(pRes);
/*
TAOS_RES* pRes = taos_query(pConn, "select last(ts), ts from cache_1.t1"); pRes= taos_query(pConn, "use abc1");
// pRes = taos_query(pConn, "select last(ts), ts from cache_1.no_pk_t1");
if (taos_errno(pRes) != 0) {
printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
}
taos_free_result(pRes); taos_free_result(pRes);
pRes = taos_query(pConn, "create table tu using st2 tags(2)"); pRes = taos_query(pConn, "create table tu using st2 tags(2)");
@ -868,7 +864,6 @@ TEST(clientCase, projection_query_tables) {
createNewTable(pConn, i, 100000, 0, pstr); createNewTable(pConn, i, 100000, 0, pstr);
} }
} }
*/
pRes = taos_query(pConn, "select * from abc1.st2"); pRes = taos_query(pConn, "select * from abc1.st2");
if (taos_errno(pRes) != 0) { if (taos_errno(pRes) != 0) {

View File

@ -596,10 +596,11 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) {
if (cfgAddBool(pCfg, "ssd42", tsSSE42Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "ssd42", tsSSE42Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "avx", tsAVXEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx", tsAVXEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "avx2", tsAVX2Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx2", tsAVX2Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "fma", tsFMAEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "fma", tsFMASupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "avx512", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx512", tsAVX512Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "simdEnable", tsSIMDEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "simdEnable", tsSIMDEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "AVX512Enable", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "tagFilterCache", tsTagFilterCache, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "tagFilterCache", tsTagFilterCache, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;

View File

@ -370,7 +370,7 @@ static int32_t findFirstValPosition(const SColumnInfoData* pCol, int32_t start,
static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
bool signVal) { bool signVal) {
// AVX2 version to speedup the loop // AVX2 version to speedup the loop
if (tsAVX2Enable && tsSIMDEnable) { if (tsAVX2Supported && tsSIMDEnable) {
pBuf->v = i8VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); pBuf->v = i8VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
} else { } else {
if (!pBuf->assign) { if (!pBuf->assign) {
@ -404,7 +404,7 @@ static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SM
static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
bool signVal) { bool signVal) {
// AVX2 version to speedup the loop // AVX2 version to speedup the loop
if (tsAVX2Enable && tsSIMDEnable) { if (tsAVX2Supported && tsSIMDEnable) {
pBuf->v = i16VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); pBuf->v = i16VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
} else { } else {
if (!pBuf->assign) { if (!pBuf->assign) {
@ -438,7 +438,7 @@ static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, S
static void handleInt32Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, static void handleInt32Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
bool signVal) { bool signVal) {
// AVX2 version to speedup the loop // AVX2 version to speedup the loop
if (tsAVX2Enable && tsSIMDEnable) { if (tsAVX2Supported && tsSIMDEnable) {
pBuf->v = i32VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); pBuf->v = i32VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
} else { } else {
if (!pBuf->assign) { if (!pBuf->assign) {

View File

@ -38,11 +38,12 @@ int64_t tsTotalMemoryKB = 0;
char *tsProcPath = NULL; char *tsProcPath = NULL;
char tsSIMDEnable = 0; char tsSIMDEnable = 0;
char tsAVX512Enable = 0;
char tsSSE42Enable = 0; char tsSSE42Enable = 0;
char tsAVXEnable = 0; char tsAVXEnable = 0;
char tsAVX2Enable = 0; char tsAVX2Supported = 0;
char tsFMAEnable = 0; char tsFMASupported = 0;
char tsAVX512Enable = 0; char tsAVX512Supported = 0;
void osDefaultInit() { void osDefaultInit() {
taosSeedRand(taosSafeRand()); taosSeedRand(taosSafeRand());

View File

@ -250,7 +250,7 @@ void taosGetSystemInfo() {
taosGetCpuCores(&tsNumOfCores, false); taosGetCpuCores(&tsNumOfCores, false);
taosGetTotalMemory(&tsTotalMemoryKB); taosGetTotalMemory(&tsTotalMemoryKB);
taosGetCpuUsage(NULL, NULL); taosGetCpuUsage(NULL, NULL);
taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Enable, &tsFMAEnable, &tsAVX512Enable); taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported);
#endif #endif
} }

View File

@ -822,9 +822,9 @@ int32_t tsDecompressTimestampImp(const char *const input, const int32_t nelement
memcpy(output, input + 1, nelements * longBytes); memcpy(output, input + 1, nelements * longBytes);
return nelements * longBytes; return nelements * longBytes;
} else if (input[0] == 1) { // Decompress } else if (input[0] == 1) { // Decompress
if (tsSIMDEnable && tsAVX512Enable) { if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
tsDecompressTimestampAvx512(input, nelements, output, false); tsDecompressTimestampAvx512(input, nelements, output, false);
} else if (tsSIMDEnable && tsAVX2Enable) { } else if (tsSIMDEnable && tsAVX2Supported) {
tsDecompressTimestampAvx2(input, nelements, output, false); tsDecompressTimestampAvx2(input, nelements, output, false);
} else { } else {
int64_t *ostream = (int64_t *)output; int64_t *ostream = (int64_t *)output;
@ -1198,9 +1198,9 @@ int32_t tsDecompressFloatImp(const char *const input, const int32_t nelements, c
return nelements * FLOAT_BYTES; return nelements * FLOAT_BYTES;
} }
if (tsSIMDEnable && tsAVX2Enable) { if (tsSIMDEnable && tsAVX2Supported) {
tsDecompressFloatImplAvx2(input, nelements, output); tsDecompressFloatImplAvx2(input, nelements, output);
} else if (tsSIMDEnable && tsAVX512Enable) { } else if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
tsDecompressFloatImplAvx512(input, nelements, output); tsDecompressFloatImplAvx512(input, nelements, output);
} else { // alternative implementation without SIMD instructions. } else { // alternative implementation without SIMD instructions.
tsDecompressFloatHelper(input, nelements, (float *)output); tsDecompressFloatHelper(input, nelements, (float *)output);
@ -2713,7 +2713,7 @@ int32_t tsDecompressBigint(void *pIn, int32_t nIn, int32_t nEle, void *pOut, int
int8_t alvl = tsGetCompressL2Level(l2, lvl); \ int8_t alvl = tsGetCompressL2Level(l2, lvl); \
return compressL2Dict[l2].comprFn(pIn, nIn, pOut, nOut, type, alvl); \ return compressL2Dict[l2].comprFn(pIn, nIn, pOut, nOut, type, alvl); \
} else { \ } else { \
uTrace("dencode:%s, dcompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl, \ uTrace("dencode:%s, decompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl, \
tDataTypes[type].name); \ tDataTypes[type].name); \
return compressL2Dict[l2].decomprFn(pIn, nIn, pOut, nOut, type); \ return compressL2Dict[l2].decomprFn(pIn, nIn, pOut, nOut, type); \
} \ } \

View File

@ -74,12 +74,12 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
int32_t batch = 0; int32_t batch = 0;
int32_t remain = 0; int32_t remain = 0;
if (tsSIMDEnable && tsAVX512Enable) { if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
#if __AVX512F__ #if __AVX512F__
batch = num >> 3; batch = num >> 3;
remain = num & 0x07; remain = num & 0x07;
#endif #endif
} else if (tsSIMDEnable && tsAVX2Enable) { } else if (tsSIMDEnable && tsAVX2Supported) {
#if __AVX2__ #if __AVX2__
batch = num >> 2; batch = num >> 2;
remain = num & 0x03; remain = num & 0x03;
@ -87,7 +87,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
} }
if (selector == 0 || selector == 1) { if (selector == 0 || selector == 1) {
if (tsSIMDEnable && tsAVX512Enable) { if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
#if __AVX512F__ #if __AVX512F__
for (int32_t i = 0; i < batch; ++i) { for (int32_t i = 0; i < batch; ++i) {
__m512i prev = _mm512_set1_epi64(prevValue); __m512i prev = _mm512_set1_epi64(prevValue);
@ -98,7 +98,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
p[_pos++] = prevValue; p[_pos++] = prevValue;
} }
#endif #endif
} else if (tsSIMDEnable && tsAVX2Enable) { } else if (tsSIMDEnable && tsAVX2Supported) {
for (int32_t i = 0; i < batch; ++i) { for (int32_t i = 0; i < batch; ++i) {
__m256i prev = _mm256_set1_epi64x(prevValue); __m256i prev = _mm256_set1_epi64x(prevValue);
_mm256_storeu_si256((__m256i *)&p[_pos], prev); _mm256_storeu_si256((__m256i *)&p[_pos], prev);
@ -116,7 +116,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
} }
} }
} else { } else {
if (tsSIMDEnable && tsAVX512Enable) { if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
#if __AVX512F__ #if __AVX512F__
__m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0); __m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0);
__m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1); __m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1);
@ -182,7 +182,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
v += bit; v += bit;
} }
#endif #endif
} else if (tsSIMDEnable && tsAVX2Enable) { } else if (tsSIMDEnable && tsAVX2Supported) {
__m256i base = _mm256_set1_epi64x(w); __m256i base = _mm256_set1_epi64x(w);
__m256i maskVal = _mm256_set1_epi64x(mask); __m256i maskVal = _mm256_set1_epi64x(mask);
@ -331,16 +331,16 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen
int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output, int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output,
bool bigEndian) { bool bigEndian) {
#if 0
int64_t *ostream = (int64_t *)output; int64_t *ostream = (int64_t *)output;
int32_t ipos = 1, opos = 0; int32_t ipos = 1, opos = 0;
#if __AVX2__
__m128i prevVal = _mm_setzero_si128(); __m128i prevVal = _mm_setzero_si128();
__m128i prevDelta = _mm_setzero_si128(); __m128i prevDelta = _mm_setzero_si128();
#if __AVX2__
int32_t batch = nelements >> 1; int32_t batch = nelements >> 1;
int32_t remainder = nelements & 0x01; int32_t remainder = nelements & 0x01;
__mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; // __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff};
int32_t i = 0; int32_t i = 0;
if (batch > 1) { if (batch > 1) {
@ -398,8 +398,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7
int8_t nbytes2 = (flags >> 4) & INT8MASK(4); int8_t nbytes2 = (flags >> 4) & INT8MASK(4);
// __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos));
// __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1));
__m128i data1; __m128i data1;
if (nbytes1 == 0) { if (nbytes1 == 0) {
data1 = _mm_setzero_si128(); data1 = _mm_setzero_si128();
@ -471,7 +469,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
ostream[opos++] = prevVal[1] + prevDeltaX; ostream[opos++] = prevVal[1] + prevDeltaX;
} }
} }
#endif
#endif #endif
return 0; return 0;
} }

View File

@ -4,9 +4,16 @@
#include <random> #include <random>
#include "ttypes.h" #include "ttypes.h"
namespace {} // namespace namespace {
} // namespace
TEST(utilTest, decompress_ts_test) {
{
tsSIMDEnable = 1;
tsAVX2Supported = 1;
}
TEST(utilTest, decompress_test) {
int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400, int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400,
1700000500, 1700000600, 1700000700, 1700000800, 1700000900}; 1700000500, 1700000600, 1700000700, 1700000800, 1700000900};
@ -57,6 +64,49 @@ TEST(utilTest, decompress_test) {
} }
} }
TEST(utilTest, decompress_bigint_avx2_test) {
{
tsSIMDEnable = 1;
tsAVX2Supported = 1;
}
int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400,
1700000500, 1700000600, 1700000700, 1700000800, 1700000900};
char* pOutput[10 * sizeof(int64_t)] = {0};
int32_t len = tsCompressBigint(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10,
ONE_STAGE_COMP, NULL, 0);
char* decompOutput[10 * 8] = {0};
tsDecompressBigint(pOutput, len, 10, decompOutput, sizeof(int64_t) * 10, ONE_STAGE_COMP, NULL, 0);
for (int32_t i = 0; i < 10; ++i) {
std::cout << ((int64_t*)decompOutput)[i] << std::endl;
}
}
TEST(utilTest, decompress_int_avx2_test) {
{
tsSIMDEnable = 1;
tsAVX2Supported = 1;
}
int32_t tsList[10] = {17000000, 17000001, 17000002, 17000003, 17000004,
17000005, 17000006, 17000007, 17000008, 17000009};
char* pOutput[10 * sizeof(int32_t)] = {0};
int32_t len =
tsCompressInt(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10, ONE_STAGE_COMP, NULL, 0);
char* decompOutput[10 * 8] = {0};
tsDecompressInt(pOutput, len, 10, decompOutput, sizeof(int32_t) * 10, ONE_STAGE_COMP, NULL, 0);
for (int32_t i = 0; i < 10; ++i) {
std::cout << ((int32_t*)decompOutput)[i] << std::endl;
}
}
TEST(utilTest, decompress_perf_test) { TEST(utilTest, decompress_perf_test) {
int32_t num = 10000; int32_t num = 10000;