From d71439260fc7353ef0aea8689a5e3dc995ce83aa Mon Sep 17 00:00:00 2001
From: Haojun Liao
Date: Thu, 24 Nov 2022 15:01:25 +0800
Subject: [PATCH] refactor(query): add simd support for minmax query.

---
Review notes (kept below the "---" marker so they stay out of the commit
message).

What the patch does
  * tavgfunction.c: removes the static helper handleFloatCols().
  * tminmax.c: uses _mm256_lddqu_si256 for the initial load in the i8/i16/i32
    AVX2 compare kernels; adds handleInt8Col(), handleInt16Col() and
    handleInt64Col(); turns the former handleInt8Col() body into
    handleDoubleCol(); calls handleInt16Col() from doMinMaxHelper().
  * Each handleXxxCol(pCol, start, numOfRows, pCtx, pBuf, isMinFunc) scans rows
    [start, start + numOfRows) of pCol, keeps the running min/max in pBuf->v,
    and returns the number of rows it counted.

Corrections made to added ("+") lines only; the size of every hunk is
unchanged, so all "@@" headers and the diffstat remain valid:
  * handleInt64Col: pData/val were int32_t*, so 64-bit values were read and
    stored at the wrong width; they are int64_t* now.
  * handleDoubleCol: pData was float*; it is double* now. The call to
    floatVectorCmpAVX() is disabled because that kernel works on float lanes
    and no double-precision kernel is visible in this patch.
  * handleInt8Col/handleInt16Col/handleInt64Col: the no-NULL scalar path seeded
    the result from pData[0]; it now seeds from pData[start], the first row of
    the scanned window. The AVX2 kernels are handed pData + start for the same
    reason.
  * handleInt64Col: the "AVX2 version" comment sat on scalar-only code.

Still open (needs context lines or extra lines to change):
  * handleDoubleCol still seeds from pData[0] in its no-NULL scalar path.
  * The SIMD branches assign *val unconditionally and never set pBuf->assign,
    so a value accumulated by an earlier call looks like it is discarded.
  * In the NULL-aware path the row that seeds *val is visited again by the
    following loop and therefore counted twice in numOfElems.
  * doMinMaxHelper keeps the inline int16 code after the new handleInt16Col()
    call, so SMALLINT columns appear to be processed twice.
  * The blob ids on the "index" lines describe the uncorrected content.
  * Line breaks and indentation of this file were rebuilt; whitespace in
    context lines is best-effort.

 .../libs/function/src/detail/tavgfunction.c   |  33 --
 source/libs/function/src/detail/tminmax.c     | 282 +++++++++++++++++-
 2 files changed, 268 insertions(+), 47 deletions(-)

diff --git a/source/libs/function/src/detail/tavgfunction.c b/source/libs/function/src/detail/tavgfunction.c
index 744927d6c8..1553a446a7 100644
--- a/source/libs/function/src/detail/tavgfunction.c
+++ b/source/libs/function/src/detail/tavgfunction.c
@@ -272,39 +272,6 @@ static void i64VectorSumAVX2(const int64_t* plist, int32_t numOfRows, SAvgRes* p
 #endif
 }
 
-static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnInfoData* pInput, SAvgRes* pRes) {
-  int32_t numOfElems = 0;
-  float* plist = (float*)pCol->pData;
-
-  const int32_t THRESHOLD_SIZE = 8;
-
-  if (pCol->hasNull || pInput->numOfRows <= THRESHOLD_SIZE) {
-    for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
-      if (colDataIsNull_f(pCol->nullbitmap, i)) {
-        continue;
-      }
-
-      numOfElems += 1;
-      pRes->count += 1;
-      pRes->sum.dsum += plist[i];
-    }
-  } else {  // no null values exist
-    numOfElems = pInput->numOfRows;
-    pRes->count += pInput->numOfRows;
-
-    // 3. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
-    if (tsAVXEnable && tsSIMDEnable) {
-      floatVectorSumAVX(plist, pInput->numOfRows, pRes);
-    } else {
-      for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
-        pRes->sum.dsum += plist[i];
-      }
-    }
-  }
-
-  return numOfElems;
-}
-
 int32_t getAvgInfoSize() { return (int32_t)sizeof(SAvgRes); }
 
 bool getAvgFuncEnv(SFunctionNode* UNUSED_PARAM(pFunc), SFuncExecEnv* pEnv) {
diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c
index a98b172873..bda1fea90a 100644
--- a/source/libs/function/src/detail/tminmax.c
+++ b/source/libs/function/src/detail/tminmax.c
@@ -30,7 +30,7 @@ static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool is
 
 #if __AVX2__
   __m256i next;
-  __m256i initialVal = _mm256_loadu_si256((__m256i*)p);
+  __m256i initialVal = _mm256_lddqu_si256((__m256i*)p);
   p += width;
 
   if (!isMinFunc) {  // max function
@@ -40,7 +40,7 @@ static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool is
       p += width;
     }
 
-    // let sum up the final results
+    // let's compare the final results
     const int32_t* q = (const int32_t*)&initialVal;
     v = TMAX(q[0], q[1]);
     for (int32_t k = 1; k < width; ++k) {
@@ -155,7 +155,7 @@ static int8_t i8VectorCmpAVX2(const int8_t* pData, int32_t numOfRows, bool isMin
 
 #if __AVX2__
   __m256i next;
-  __m256i initialVal = _mm256_loadu_si256((__m256i*)p);
+  __m256i initialVal = _mm256_lddqu_si256((__m256i*)p);
   p += width;
 
   if (!isMinFunc) {  // max function
@@ -218,7 +218,7 @@ static int16_t i16VectorCmpAVX2(const int16_t* pData, int32_t numOfRows, bool is
 
 #if __AVX2__
   __m256i next;
-  __m256i initialVal = _mm256_loadu_si256((__m256i*)p);
+  __m256i initialVal = _mm256_lddqu_si256((__m256i*)p);
   p += width;
 
   if (!isMinFunc) {  // max function
@@ -271,6 +271,179 @@ static int16_t i16VectorCmpAVX2(const int16_t* pData, int32_t numOfRows, bool is
 
   return v;
 }
+static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
+                             SMinmaxResInfo* pBuf, bool isMinFunc) {
+  int8_t* pData = (int8_t*)pCol->pData;
+  int8_t* val = (int8_t*)&pBuf->v;
+
+  int32_t numOfElems = 0;
+  if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) {
+    int32_t i = start;
+    while (i < (start + numOfRows)) {
+      if (!colDataIsNull_f(pCol->nullbitmap, i)) {
+        break;
+      }
+      i += 1;
+    }
+
+    if ((i < (start + numOfRows)) && (!pBuf->assign)) {
+      *val = pData[i];
+      if (pCtx->subsidiaries.num > 0) {
+        pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL);
+      }
+      pBuf->assign = true;
+      numOfElems += 1;
+    }
+
+    if (isMinFunc) {  // min
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+
+        if (*val > pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+
+    } else {  // max function
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        // ignore the equivalent data value
+        // NOTE: An faster version to avoid one additional comparison with FPU.
+        if (*val < pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+
+    }
+  } else {  // not has null value
+    // AVX2 version to speedup the loop
+    if (tsAVX2Enable && tsSIMDEnable) {
+      *val = i8VectorCmpAVX2(pData + start, numOfRows, isMinFunc);  // scan the same window as the scalar loop
+    } else {
+      if (!pBuf->assign) {
+        *val = pData[start];  // seed from the first row of the scanned window
+        pBuf->assign = true;
+      }
+
+      if (isMinFunc) {  // min
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val > pData[i]) {
+            *val = pData[i];
+          }
+        }
+      } else {  // max
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val < pData[i]) {
+            *val = pData[i];
+          }
+        }
+      }
+    }
+
+    numOfElems = numOfRows;
+  }
+
+  return numOfElems;
+}
+
+static int32_t handleInt16Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
+                              SMinmaxResInfo* pBuf, bool isMinFunc) {
+  int16_t* pData = (int16_t*)pCol->pData;
+  int16_t* val = (int16_t*)&pBuf->v;
+
+  int32_t numOfElems = 0;
+  if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) {
+    int32_t i = start;
+    while (i < (start + numOfRows)) {
+      if (!colDataIsNull_f(pCol->nullbitmap, i)) {
+        break;
+      }
+      i += 1;
+    }
+
+    if ((i < (start + numOfRows)) && (!pBuf->assign)) {
+      *val = pData[i];
+      if (pCtx->subsidiaries.num > 0) {
+        pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL);
+      }
+      pBuf->assign = true;
+      numOfElems += 1;
+    }
+
+    if (isMinFunc) {  // min
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+
+        if (*val > pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+
+    } else {  // max function
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        // ignore the equivalent data value
+        // NOTE: An faster version to avoid one additional comparison with FPU.
+        if (*val < pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+
+    }
+  } else {  // not has null value
+    // AVX2 version to speedup the loop
+    if (tsAVX2Enable && tsSIMDEnable) {
+      *val = i16VectorCmpAVX2(pData + start, numOfRows, isMinFunc);  // scan the same window as the scalar loop
+    } else {
+      if (!pBuf->assign) {
+        *val = pData[start];  // seed from the first row of the scanned window
+        pBuf->assign = true;
+      }
+
+      if (isMinFunc) {  // min
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val > pData[i]) {
+            *val = pData[i];
+          }
+        }
+      } else {  // max
+        for (int32_t i = start; i < start + numOfRows; ++i) {
+          if (*val < pData[i]) {
+            *val = pData[i];
+          }
+        }
+      }
+    }
+
+    numOfElems = numOfRows;
+  }
+
+  return numOfElems;
+}
 
 static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
                               SMinmaxResInfo* pBuf, bool isMinFunc) {
@@ -359,6 +532,87 @@ static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numO
   return numOfElems;
 }
 
+static int32_t handleInt64Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
+                              SMinmaxResInfo* pBuf, bool isMinFunc) {
+  int64_t* pData = (int64_t*)pCol->pData;  // 64-bit column: must not be read through int32_t*
+  int64_t* val = (int64_t*)&pBuf->v;
+
+  int32_t numOfElems = 0;
+  if (pCol->hasNull || pCtx->subsidiaries.num > 0) {
+    int32_t i = start;
+    while (i < (start + numOfRows)) {
+      if (!colDataIsNull_f(pCol->nullbitmap, i)) {
+        break;
+      }
+      i += 1;
+    }
+
+    if ((i < (start + numOfRows)) && (!pBuf->assign)) {
+      *val = pData[i];
+      if (pCtx->subsidiaries.num > 0) {
+        pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL);
+      }
+      pBuf->assign = true;
+      numOfElems += 1;
+    }
+
+    if (isMinFunc) {  // min
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+
+        if (*val > pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+
+    } else {  // max function
+      for (; i < start + numOfRows; ++i) {
+        if (colDataIsNull_f(pCol->nullbitmap, i)) {
+          continue;
+        }
+        // ignore the equivalent data value
+        // NOTE: An faster version to avoid one additional comparison with FPU.
+        if (*val < pData[i]) {
+          *val = pData[i];
+          if (pCtx->subsidiaries.num > 0) {
+            updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos);
+          }
+        }
+        numOfElems += 1;
+      }
+    }
+  } else {  // not has null value
+    // no SIMD kernel is used for 64-bit integers here: plain scalar scan
+    if (!pBuf->assign) {
+      *val = pData[start];  // seed from the first row of the scanned window
+      pBuf->assign = true;
+    }
+
+    if (isMinFunc) {  // min
+      for (int32_t i = start; i < start + numOfRows; ++i) {
+        if (*val > pData[i]) {
+          *val = pData[i];
+        }
+      }
+    } else {  // max
+      for (int32_t i = start; i < start + numOfRows; ++i) {
+        if (*val < pData[i]) {
+          *val = pData[i];
+        }
+      }
+    }
+
+    numOfElems = numOfRows;
+  }
+  return numOfElems;
+}
+
 static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
                               SMinmaxResInfo* pBuf, bool isMinFunc) {
   float* pData = (float*)pCol->pData;
@@ -445,13 +699,13 @@ static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numO
   return numOfElems;
 }
 
-static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
-                             SMinmaxResInfo* pBuf, bool isMinFunc) {
-  int8_t* pData = (int8_t*)pCol->pData;
-  int8_t* val = (int8_t*)&pBuf->v;
+static int32_t handleDoubleCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
+                               SMinmaxResInfo* pBuf, bool isMinFunc) {
+  double* pData = (double*)pCol->pData;  // double column: must not be read through float*
+  double* val = (double*)&pBuf->v;
 
   int32_t numOfElems = 0;
-  if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) {
+  if (pCol->hasNull || numOfRows < 8 || pCtx->subsidiaries.num > 0) {
     int32_t i = start;
     while (i < (start + numOfRows)) {
       if (!colDataIsNull_f(pCol->nullbitmap, i)) {
@@ -483,12 +737,12 @@ static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOf
         }
         numOfElems += 1;
       }
-
     } else {  // max function
       for (; i < start + numOfRows; ++i) {
         if (colDataIsNull_f(pCol->nullbitmap, i)) {
           continue;
         }
+
         // ignore the equivalent data value
         // NOTE: An faster version to avoid one additional comparison with FPU.
         if (*val < pData[i]) {
@@ -499,12 +753,11 @@ static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOf
         }
         numOfElems += 1;
       }
-
     }
   } else {  // not has null value
-    // AVX2 version to speedup the loop
-    if (tsAVX2Enable && tsSIMDEnable) {
-      *val = i8VectorCmpAVX2(pData, numOfRows, isMinFunc);
+    // floatVectorCmpAVX() works on float lanes and must not be fed double data: SIMD disabled
+    if (false) {
+      // TODO: call a double-precision AVX kernel here once one is available
     } else {
       if (!pBuf->assign) {
         *val = pData[0];
@@ -660,6 +913,7 @@ int32_t doMinMaxHelper(SqlFunctionCtx* pCtx, int32_t isMinFunc) {
     if (type == TSDB_DATA_TYPE_TINYINT || type == TSDB_DATA_TYPE_BOOL) {
       numOfElems = handleInt8Col(pCol, start, numOfRows, pCtx, pBuf, isMinFunc);
     } else if (type == TSDB_DATA_TYPE_SMALLINT) {
+      numOfElems = handleInt16Col(pCol, start, numOfRows, pCtx, pBuf, isMinFunc);
       int16_t* pData = (int16_t*)pCol->pData;
       int16_t* val = (int16_t*)&pBuf->v;
 