Merge pull request #26197 from taosdata/fix/3_liaohj

fix(util): uncomment the timestamp decode function with AVX2 instructions
This commit is contained in:
Haojun Liao 2024-06-20 16:16:52 +08:00 committed by GitHub
commit df003cca3c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 221 additions and 76 deletions

View File

@ -180,18 +180,20 @@ ELSE ()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
ENDIF() ENDIF()
MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED") MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED")
ENDIF()
# IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI) IF ("${SIMD_AVX512_SUPPORT}" MATCHES "true")
# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi") IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
# MESSAGE(STATUS "avx512f/avx512bmi supported by compiler") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
# ENDIF() MESSAGE(STATUS "avx512f/avx512bmi enabled by compiler")
# ENDIF()
# IF (COMPILER_SUPPORT_AVX512VL)
# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl") IF (COMPILER_SUPPORT_AVX512VL)
# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl")
# MESSAGE(STATUS "avx512vl supported by compiler") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl")
# ENDIF() MESSAGE(STATUS "avx512vl enabled by compiler")
ENDIF()
ENDIF() ENDIF()
# build mode # build mode

View File

@ -37,10 +37,11 @@ extern float tsNumOfCores;
extern int64_t tsTotalMemoryKB; extern int64_t tsTotalMemoryKB;
extern char *tsProcPath; extern char *tsProcPath;
extern char tsSIMDEnable; extern char tsSIMDEnable;
extern char tsSSE42Enable; extern char tsSSE42Supported;
extern char tsAVXEnable; extern char tsAVXSupported;
extern char tsAVX2Enable; extern char tsAVX2Supported;
extern char tsFMAEnable; extern char tsFMASupported;
extern char tsAVX512Supported;
extern char tsAVX512Enable; extern char tsAVX512Enable;
extern char tsTagFilterCache; extern char tsTagFilterCache;

View File

@ -828,12 +828,8 @@ TEST(clientCase, projection_query_tables) {
// printf("error in create db, reason:%s\n", taos_errstr(pRes)); // printf("error in create db, reason:%s\n", taos_errstr(pRes));
// } // }
// taos_free_result(pRes); // taos_free_result(pRes);
/*
TAOS_RES* pRes = taos_query(pConn, "select last(ts), ts from cache_1.t1"); pRes= taos_query(pConn, "use abc1");
// pRes = taos_query(pConn, "select last(ts), ts from cache_1.no_pk_t1");
if (taos_errno(pRes) != 0) {
printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
}
taos_free_result(pRes); taos_free_result(pRes);
pRes = taos_query(pConn, "create table tu using st2 tags(2)"); pRes = taos_query(pConn, "create table tu using st2 tags(2)");
@ -868,7 +864,6 @@ TEST(clientCase, projection_query_tables) {
createNewTable(pConn, i, 100000, 0, pstr); createNewTable(pConn, i, 100000, 0, pstr);
} }
} }
*/
pRes = taos_query(pConn, "select * from abc1.st2"); pRes = taos_query(pConn, "select * from abc1.st2");
if (taos_errno(pRes) != 0) { if (taos_errno(pRes) != 0) {

View File

@ -595,12 +595,13 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) {
if (cfgAddBool(pCfg, "enableCoreFile", 1, CFG_SCOPE_BOTH, CFG_DYN_CLIENT) != 0) return -1; if (cfgAddBool(pCfg, "enableCoreFile", 1, CFG_SCOPE_BOTH, CFG_DYN_CLIENT) != 0) return -1;
if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 1, 100000, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 1, 100000, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "ssd42", tsSSE42Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "ssd42", tsSSE42Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "avx", tsAVXEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx", tsAVXSupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "avx2", tsAVX2Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx2", tsAVX2Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "fma", tsFMAEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "fma", tsFMASupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "avx512", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "avx512", tsAVX512Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "simdEnable", tsSIMDEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "simdEnable", tsSIMDEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "AVX512Enable", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddBool(pCfg, "tagFilterCache", tsTagFilterCache, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "tagFilterCache", tsTagFilterCache, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1; if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;

View File

@ -565,7 +565,7 @@ int32_t avgFunction(SqlFunctionCtx* pCtx) {
numOfElem = pInput->numOfRows; numOfElem = pInput->numOfRows;
pAvgRes->count += pInput->numOfRows; pAvgRes->count += pInput->numOfRows;
bool simdAvailable = tsAVXEnable && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE); bool simdAvailable = tsAVXSupported && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE);
switch(type) { switch(type) {
case TSDB_DATA_TYPE_UTINYINT: case TSDB_DATA_TYPE_UTINYINT:

View File

@ -370,7 +370,7 @@ static int32_t findFirstValPosition(const SColumnInfoData* pCol, int32_t start,
static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
bool signVal) { bool signVal) {
// AVX2 version to speedup the loop // AVX2 version to speedup the loop
if (tsAVX2Enable && tsSIMDEnable) { if (tsAVX2Supported && tsSIMDEnable) {
pBuf->v = i8VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); pBuf->v = i8VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
} else { } else {
if (!pBuf->assign) { if (!pBuf->assign) {
@ -404,7 +404,7 @@ static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SM
static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
bool signVal) { bool signVal) {
// AVX2 version to speedup the loop // AVX2 version to speedup the loop
if (tsAVX2Enable && tsSIMDEnable) { if (tsAVX2Supported && tsSIMDEnable) {
pBuf->v = i16VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); pBuf->v = i16VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
} else { } else {
if (!pBuf->assign) { if (!pBuf->assign) {
@ -438,7 +438,7 @@ static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, S
static void handleInt32Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc, static void handleInt32Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
bool signVal) { bool signVal) {
// AVX2 version to speedup the loop // AVX2 version to speedup the loop
if (tsAVX2Enable && tsSIMDEnable) { if (tsAVX2Supported && tsSIMDEnable) {
pBuf->v = i32VectorCmpAVX2(data, numOfRows, isMinFunc, signVal); pBuf->v = i32VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
} else { } else {
if (!pBuf->assign) { if (!pBuf->assign) {
@ -502,7 +502,7 @@ static void handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRo
float* val = (float*)&pBuf->v; float* val = (float*)&pBuf->v;
// AVX version to speedup the loop // AVX version to speedup the loop
if (tsAVXEnable && tsSIMDEnable) { if (tsAVXSupported && tsSIMDEnable) {
*val = floatVectorCmpAVX(pData, numOfRows, isMinFunc); *val = floatVectorCmpAVX(pData, numOfRows, isMinFunc);
} else { } else {
if (!pBuf->assign) { if (!pBuf->assign) {
@ -533,7 +533,7 @@ static void handleDoubleCol(SColumnInfoData* pCol, int32_t start, int32_t numOfR
double* val = (double*)&pBuf->v; double* val = (double*)&pBuf->v;
// AVX version to speedup the loop // AVX version to speedup the loop
if (tsAVXEnable && tsSIMDEnable) { if (tsAVXSupported && tsSIMDEnable) {
*val = (double)doubleVectorCmpAVX(pData, numOfRows, isMinFunc); *val = (double)doubleVectorCmpAVX(pData, numOfRows, isMinFunc);
} else { } else {
if (!pBuf->assign) { if (!pBuf->assign) {

View File

@ -38,11 +38,12 @@ int64_t tsTotalMemoryKB = 0;
char *tsProcPath = NULL; char *tsProcPath = NULL;
char tsSIMDEnable = 0; char tsSIMDEnable = 0;
char tsSSE42Enable = 0;
char tsAVXEnable = 0;
char tsAVX2Enable = 0;
char tsFMAEnable = 0;
char tsAVX512Enable = 0; char tsAVX512Enable = 0;
char tsSSE42Supported = 0;
char tsAVXSupported = 0;
char tsAVX2Supported = 0;
char tsFMASupported = 0;
char tsAVX512Supported = 0;
void osDefaultInit() { void osDefaultInit() {
taosSeedRand(taosSafeRand()); taosSeedRand(taosSafeRand());

View File

@ -250,7 +250,7 @@ void taosGetSystemInfo() {
taosGetCpuCores(&tsNumOfCores, false); taosGetCpuCores(&tsNumOfCores, false);
taosGetTotalMemory(&tsTotalMemoryKB); taosGetTotalMemory(&tsTotalMemoryKB);
taosGetCpuUsage(NULL, NULL); taosGetCpuUsage(NULL, NULL);
taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Enable, &tsFMAEnable, &tsAVX512Enable); taosGetCpuInstructions(&tsSSE42Supported, &tsAVXSupported, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported);
#endif #endif
} }

View File

@ -822,9 +822,9 @@ int32_t tsDecompressTimestampImp(const char *const input, const int32_t nelement
memcpy(output, input + 1, nelements * longBytes); memcpy(output, input + 1, nelements * longBytes);
return nelements * longBytes; return nelements * longBytes;
} else if (input[0] == 1) { // Decompress } else if (input[0] == 1) { // Decompress
if (tsSIMDEnable && tsAVX512Enable) { if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
tsDecompressTimestampAvx512(input, nelements, output, false); tsDecompressTimestampAvx512(input, nelements, output, false);
} else if (tsSIMDEnable && tsAVX2Enable) { } else if (tsSIMDEnable && tsAVX2Supported) {
tsDecompressTimestampAvx2(input, nelements, output, false); tsDecompressTimestampAvx2(input, nelements, output, false);
} else { } else {
int64_t *ostream = (int64_t *)output; int64_t *ostream = (int64_t *)output;
@ -1198,9 +1198,9 @@ int32_t tsDecompressFloatImp(const char *const input, const int32_t nelements, c
return nelements * FLOAT_BYTES; return nelements * FLOAT_BYTES;
} }
if (tsSIMDEnable && tsAVX2Enable) { if (tsSIMDEnable && tsAVX2Supported) {
tsDecompressFloatImplAvx2(input, nelements, output); tsDecompressFloatImplAvx2(input, nelements, output);
} else if (tsSIMDEnable && tsAVX512Enable) { } else if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
tsDecompressFloatImplAvx512(input, nelements, output); tsDecompressFloatImplAvx512(input, nelements, output);
} else { // alternative implementation without SIMD instructions. } else { // alternative implementation without SIMD instructions.
tsDecompressFloatHelper(input, nelements, (float *)output); tsDecompressFloatHelper(input, nelements, (float *)output);
@ -2713,7 +2713,7 @@ int32_t tsDecompressBigint(void *pIn, int32_t nIn, int32_t nEle, void *pOut, int
int8_t alvl = tsGetCompressL2Level(l2, lvl); \ int8_t alvl = tsGetCompressL2Level(l2, lvl); \
return compressL2Dict[l2].comprFn(pIn, nIn, pOut, nOut, type, alvl); \ return compressL2Dict[l2].comprFn(pIn, nIn, pOut, nOut, type, alvl); \
} else { \ } else { \
uTrace("dencode:%s, dcompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl, \ uTrace("dencode:%s, decompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl, \
tDataTypes[type].name); \ tDataTypes[type].name); \
return compressL2Dict[l2].decomprFn(pIn, nIn, pOut, nOut, type); \ return compressL2Dict[l2].decomprFn(pIn, nIn, pOut, nOut, type); \
} \ } \

View File

@ -52,7 +52,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
int32_t _pos = 0; int32_t _pos = 0;
int64_t prevValue = 0; int64_t prevValue = 0;
#if __AVX2__ #if __AVX2__ || __AVX512F__
while (_pos < nelements) { while (_pos < nelements) {
uint64_t w = *(uint64_t*) ip; uint64_t w = *(uint64_t*) ip;
@ -72,10 +72,33 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
int32_t gRemainder = (nelements - _pos); int32_t gRemainder = (nelements - _pos);
int32_t num = (gRemainder > elems)? elems:gRemainder; int32_t num = (gRemainder > elems)? elems:gRemainder;
int32_t batch = num >> 2; int32_t batch = 0;
int32_t remain = num & 0x03; int32_t remain = 0;
if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
#if __AVX512F__
batch = num >> 3;
remain = num & 0x07;
#endif
} else if (tsSIMDEnable && tsAVX2Supported) {
#if __AVX2__
batch = num >> 2;
remain = num & 0x03;
#endif
}
if (selector == 0 || selector == 1) { if (selector == 0 || selector == 1) {
if (tsSIMDEnable && tsAVX2Enable) { if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
#if __AVX512F__
for (int32_t i = 0; i < batch; ++i) {
__m512i prev = _mm512_set1_epi64(prevValue);
_mm512_storeu_si512((__m512i *)&p[_pos], prev);
_pos += 8; //handle 64bit x 8 = 512bit
}
for (int32_t i = 0; i < remain; ++i) {
p[_pos++] = prevValue;
}
#endif
} else if (tsSIMDEnable && tsAVX2Supported) {
for (int32_t i = 0; i < batch; ++i) { for (int32_t i = 0; i < batch; ++i) {
__m256i prev = _mm256_set1_epi64x(prevValue); __m256i prev = _mm256_set1_epi64x(prevValue);
_mm256_storeu_si256((__m256i *)&p[_pos], prev); _mm256_storeu_si256((__m256i *)&p[_pos], prev);
@ -85,10 +108,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
for (int32_t i = 0; i < remain; ++i) { for (int32_t i = 0; i < remain; ++i) {
p[_pos++] = prevValue; p[_pos++] = prevValue;
} }
} else if (tsSIMDEnable && tsAVX512Enable) {
#if __AVX512F__
// todo add avx512 impl
#endif
} else { // alternative implementation without SIMD instructions. } else { // alternative implementation without SIMD instructions.
for (int32_t i = 0; i < elems && count < nelements; i++, count++) { for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
p[_pos++] = prevValue; p[_pos++] = prevValue;
@ -96,7 +116,73 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
} }
} }
} else { } else {
if (tsSIMDEnable && tsAVX2Enable) { if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
#if __AVX512F__
__m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0);
__m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1);
__m512i sum_mask3 = _mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3);
__m512i base = _mm512_set1_epi64(w);
__m512i maskVal = _mm512_set1_epi64(mask);
__m512i shiftBits = _mm512_set_epi64(bit * 7 + 4, bit * 6 + 4, bit * 5 + 4, bit * 4 + 4, bit * 3 + 4, bit * 2 + 4, bit + 4, 4);
__m512i inc = _mm512_set1_epi64(bit << 3);
for (int32_t i = 0; i < batch; ++i) {
__m512i after = _mm512_srlv_epi64(base, shiftBits);
__m512i zigzagVal = _mm512_and_si512(after, maskVal);
// ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
__m512i signmask = _mm512_and_si512(_mm512_set1_epi64(1), zigzagVal);
signmask = _mm512_sub_epi64(_mm512_setzero_si512(), signmask);
__m512i delta = _mm512_xor_si512(_mm512_srli_epi64(zigzagVal, 1), signmask);
// calculate the cumulative sum (prefix sum) for each number
// decode[0] = prevValue + final[0]
// decode[1] = decode[0] + final[1] -----> prevValue + final[0] + final[1]
// decode[2] = decode[1] + final[2] -----> prevValue + final[0] + final[1] + final[2]
// decode[3] = decode[2] + final[3] -----> prevValue + final[0] + final[1] + final[2] + final[3]
//7 6 5 4 3 2 1 0
//D7 D6 D5 D4 D3 D2 D1 D0
//D6 0 D4 0 D2 0 D0 0
//D7+D6 D6 D5+D4 D4 D3+D2 D2 D1+D0 D0
//13 6 9 4 5 2 1 0
__m512i prev = _mm512_set1_epi64(prevValue);
__m512i cum_sum = _mm512_add_epi64(delta, _mm512_maskz_permutexvar_epi64(0xaa, sum_mask1, delta));
cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xcc, sum_mask2, cum_sum));
cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xf0, sum_mask3, cum_sum));
//13 6 9 4 5 2 1 0
//D7,D6 D6 D5,D4 D4 D3,D2 D2 D1,D0 D0
//+D5,D4 D5,D4, 0 0 D1,D0 D1,D0 0 0
//D7~D4 D6~D4 D5~D4 D4 D3~D0 D2~D0 D1~D0 D0
//22 15 9 4 6 3 1 0
//
//D3~D0 D3~D0 D3~D0 D3~D0 0 0 0 0
//28 21 15 10 6 3 1 0
cum_sum = _mm512_add_epi64(cum_sum, prev);
_mm512_storeu_si512((__m512i *)&p[_pos], cum_sum);
shiftBits = _mm512_add_epi64(shiftBits, inc);
prevValue = p[_pos + 7];
_pos += 8;
}
// handle the remain value
for (int32_t i = 0; i < remain; i++) {
zigzag_value = ((w >> (v + (batch * bit * 8))) & mask);
prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
p[_pos++] = prevValue;
v += bit;
}
#endif
} else if (tsSIMDEnable && tsAVX2Supported) {
__m256i base = _mm256_set1_epi64x(w); __m256i base = _mm256_set1_epi64x(w);
__m256i maskVal = _mm256_set1_epi64x(mask); __m256i maskVal = _mm256_set1_epi64x(mask);
@ -157,10 +243,6 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
p[_pos++] = prevValue; p[_pos++] = prevValue;
v += bit; v += bit;
} }
} else if (tsSIMDEnable && tsAVX512Enable) {
#if __AVX512F__
// todo add avx512 impl
#endif
} else { // alternative implementation without SIMD instructions. } else { // alternative implementation without SIMD instructions.
for (int32_t i = 0; i < elems && count < nelements; i++, count++) { for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
zigzag_value = ((w >> v) & mask); zigzag_value = ((w >> v) & mask);
@ -247,18 +329,19 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen
return 0; return 0;
} }
// decode two timestamps in one loop.
int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output, int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output,
bool bigEndian) { bool bigEndian) {
#if 0
int64_t *ostream = (int64_t *)output; int64_t *ostream = (int64_t *)output;
int32_t ipos = 1, opos = 0; int32_t ipos = 1, opos = 0;
#if __AVX2__
__m128i prevVal = _mm_setzero_si128(); __m128i prevVal = _mm_setzero_si128();
__m128i prevDelta = _mm_setzero_si128(); __m128i prevDelta = _mm_setzero_si128();
#if __AVX2__
int32_t batch = nelements >> 1; int32_t batch = nelements >> 1;
int32_t remainder = nelements & 0x01; int32_t remainder = nelements & 0x01;
__mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff}; // __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff};
int32_t i = 0; int32_t i = 0;
if (batch > 1) { if (batch > 1) {
@ -293,13 +376,13 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
__m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask);
__m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta);
deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaCurrent);
__m128i val = _mm_add_epi64(deltaCurrent, prevVal); __m128i finalVal = _mm_add_epi64(deltaCurrent, prevVal);
_mm_storeu_si128((__m128i *)&ostream[opos], val); _mm_storeu_si128((__m128i *)&ostream[opos], finalVal);
// keep the previous value // keep the previous value
prevVal = _mm_shuffle_epi32 (val, 0xEE); prevVal = _mm_shuffle_epi32 (finalVal, 0xEE);
// keep the previous delta of delta, for the first item // keep the previous delta of delta, for the first item
prevDelta = _mm_shuffle_epi32(deltaOfDelta, 0xEE); prevDelta = _mm_shuffle_epi32(deltaOfDelta, 0xEE);
@ -316,8 +399,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7 int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7
int8_t nbytes2 = (flags >> 4) & INT8MASK(4); int8_t nbytes2 = (flags >> 4) & INT8MASK(4);
// __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos));
// __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1));
__m128i data1; __m128i data1;
if (nbytes1 == 0) { if (nbytes1 == 0) {
data1 = _mm_setzero_si128(); data1 = _mm_setzero_si128();
@ -348,17 +429,18 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
__m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask); __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask);
__m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta); __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta);
deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent); deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaCurrent);
__m128i val = _mm_add_epi64(deltaCurrent, prevVal); __m128i finalVal = _mm_add_epi64(deltaCurrent, prevVal);
_mm_storeu_si128((__m128i *)&ostream[opos], val); finalVal = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), finalVal);
_mm_storeu_si128((__m128i *)&ostream[opos], finalVal);
// keep the previous value // keep the previous value
prevVal = _mm_shuffle_epi32 (val, 0xEE); prevVal = _mm_shuffle_epi32 (finalVal, 0xEE);
// keep the previous delta of delta // keep the previous delta of delta
__m128i delta = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaOfDelta); prevDelta = _mm_shuffle_epi32 (deltaCurrent, 0xEE);
prevDelta = _mm_shuffle_epi32(_mm_add_epi64(delta, prevDelta), 0xEE);
opos += 2; opos += 2;
ipos += nbytes1 + nbytes2; ipos += nbytes1 + nbytes2;
@ -389,7 +471,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
ostream[opos++] = prevVal[1] + prevDeltaX; ostream[opos++] = prevVal[1] + prevDeltaX;
} }
} }
#endif
#endif #endif
return 0; return 0;
} }

View File

@ -4,9 +4,16 @@
#include <random> #include <random>
#include "ttypes.h" #include "ttypes.h"
namespace {} // namespace namespace {
} // namespace
TEST(utilTest, decompress_ts_test) {
{
tsSIMDEnable = 1;
tsAVX2Supported = 1;
}
TEST(utilTest, decompress_test) {
int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400, int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400,
1700000500, 1700000600, 1700000700, 1700000800, 1700000900}; 1700000500, 1700000600, 1700000700, 1700000800, 1700000900};
@ -30,6 +37,20 @@ TEST(utilTest, decompress_test) {
std::cout << ((int64_t*)decompOutput)[i] << std::endl; std::cout << ((int64_t*)decompOutput)[i] << std::endl;
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int64_t tsList[4] = {1286, 1124, 2681, 2823};
char* pOutput[4 * sizeof(int64_t)] = {0};
int32_t len = tsCompressTimestamp(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 4,
ONE_STAGE_COMP, NULL, 0);
char* decompOutput[4 * 8] = {0};
tsDecompressTimestamp(pOutput, len, 4, decompOutput, sizeof(int64_t) * 4, ONE_STAGE_COMP, NULL, 0);
for (int32_t i = 0; i < 4; ++i) {
std::cout << ((int64_t*)decompOutput)[i] << std::endl;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int64_t tsList1[7] = {1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000900}; int64_t tsList1[7] = {1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000900};
int32_t len1 = tsCompressTimestamp(tsList1, sizeof(tsList1), sizeof(tsList1) / sizeof(tsList1[0]), pOutput, 7, int32_t len1 = tsCompressTimestamp(tsList1, sizeof(tsList1), sizeof(tsList1) / sizeof(tsList1[0]), pOutput, 7,
@ -57,6 +78,49 @@ TEST(utilTest, decompress_test) {
} }
} }
TEST(utilTest, decompress_bigint_avx2_test) {
{
tsSIMDEnable = 1;
tsAVX2Supported = 1;
}
int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400,
1700000500, 1700000600, 1700000700, 1700000800, 1700000900};
char* pOutput[10 * sizeof(int64_t)] = {0};
int32_t len = tsCompressBigint(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10,
ONE_STAGE_COMP, NULL, 0);
char* decompOutput[10 * 8] = {0};
tsDecompressBigint(pOutput, len, 10, decompOutput, sizeof(int64_t) * 10, ONE_STAGE_COMP, NULL, 0);
for (int32_t i = 0; i < 10; ++i) {
std::cout << ((int64_t*)decompOutput)[i] << std::endl;
}
}
TEST(utilTest, decompress_int_avx2_test) {
{
tsSIMDEnable = 1;
tsAVX2Supported = 1;
}
int32_t tsList[10] = {17000000, 17000001, 17000002, 17000003, 17000004,
17000005, 17000006, 17000007, 17000008, 17000009};
char* pOutput[10 * sizeof(int32_t)] = {0};
int32_t len =
tsCompressInt(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10, ONE_STAGE_COMP, NULL, 0);
char* decompOutput[10 * 8] = {0};
tsDecompressInt(pOutput, len, 10, decompOutput, sizeof(int32_t) * 10, ONE_STAGE_COMP, NULL, 0);
for (int32_t i = 0; i < 10; ++i) {
std::cout << ((int32_t*)decompOutput)[i] << std::endl;
}
}
TEST(utilTest, decompress_perf_test) { TEST(utilTest, decompress_perf_test) {
int32_t num = 10000; int32_t num = 10000;