diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c index bda1fea90a..21f8631156 100644 --- a/source/libs/function/src/detail/tminmax.c +++ b/source/libs/function/src/detail/tminmax.c @@ -19,139 +19,20 @@ #include "tfunctionInt.h" #include "tglobal.h" -static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool isMinFunc) { - int32_t v = 0; - const int32_t bitWidth = 256; - const int32_t* p = pData; - - int32_t width = (bitWidth>>3u) / sizeof(int32_t); - int32_t remain = numOfRows % width; - int32_t rounds = numOfRows / width; - -#if __AVX2__ - __m256i next; - __m256i initialVal = _mm256_lddqu_si256((__m256i*)p); - p += width; - - if (!isMinFunc) { // max function - for (int32_t i = 0; i < rounds; ++i) { - next = _mm256_lddqu_si256((__m256i*)p); - initialVal = _mm256_max_epi32(initialVal, next); - p += width; - } - - // let compare the final results - const int32_t* q = (const int32_t*)&initialVal; - v = TMAX(q[0], q[1]); - for (int32_t k = 1; k < width; ++k) { - v = TMAX(v, q[k]); - } - - // calculate the front and the reminder items in array list - int32_t start = rounds * width; - for (int32_t j = 0; j < remain; ++j) { - if (v < p[j + start]) { - v = p[j + start]; - } - } - } else { // min function - for (int32_t i = 0; i < rounds; ++i) { - next = _mm256_lddqu_si256((__m256i*)p); - initialVal = _mm256_min_epi32(initialVal, next); - p += width; - } - - // let sum up the final results - const int32_t* q = (const int32_t*)&initialVal; - v = TMIN(q[0], q[1]); - for (int32_t k = 1; k < width; ++k) { - v = TMIN(v, q[k]); - } - - // calculate the front and the remainder items in array list - int32_t start = rounds * width; - for (int32_t j = 0; j < remain; ++j) { - if (v > p[j + start]) { - v = p[j + start]; - } - } - } -#endif - - return v; -} - -static float floatVectorCmpAVX(const float* pData, int32_t numOfRows, bool isMinFunc) { - float v = 0; +static void calculateRounds(int32_t numOfRows, int32_t bytes, int32_t* remainder, int32_t* rounds, int32_t* width) { const int32_t bitWidth = 256; - const float* p = pData; - int32_t width = (bitWidth>>3u) / sizeof(float); - int32_t remain = numOfRows % width; - int32_t rounds = numOfRows / width; - -#if __AVX__ - - __m256 next; - __m256 initialVal = _mm256_loadu_ps(p); - p += width; - - if (!isMinFunc) { // max function - for (int32_t i = 1; i < rounds; ++i) { - next = _mm256_loadu_ps(p); - initialVal = _mm256_max_ps(initialVal, next); - p += width; - } - - // let sum up the final results - const float* q = (const float*)&initialVal; - v = TMAX(q[0], q[1]); - for (int32_t k = 1; k < width; ++k) { - v = TMAX(v, q[k]); - } - - // calculate the front and the reminder items in array list - int32_t start = rounds * width; - for (int32_t j = 0; j < remain; ++j) { - if (v < p[j + width]) { - v = p[j + width]; - } - } - } else { // min function - for (int32_t i = 1; i < rounds; ++i) { - next = _mm256_loadu_ps(p); - initialVal = _mm256_min_ps(initialVal, next); - p += width; - } - - // let sum up the final results - const float* q = (const float*)&initialVal; - v = TMIN(q[0], q[1]); - for (int32_t k = 1; k < width; ++k) { - v = TMIN(v, q[k]); - } - - // calculate the front and the reminder items in array list - int32_t start = rounds * bitWidth; - for (int32_t j = 0; j < remain; ++j) { - if (v > p[j + start]) { - v = p[j + start]; - } - } - } -#endif - - return v; + *width = (bitWidth>>3u) / bytes; + *remainder = numOfRows % (*width); + *rounds = numOfRows / (*width); } static int8_t i8VectorCmpAVX2(const int8_t* pData, int32_t numOfRows, bool isMinFunc) { int8_t v = 0; - const int32_t bitWidth = 256; const int8_t* p = pData; - int32_t width = (bitWidth>>3u) / sizeof(int8_t); - int32_t remain = numOfRows % width; - int32_t rounds = numOfRows / width; + int32_t width, remain, rounds; + calculateRounds(numOfRows, sizeof(int8_t), &remain, &rounds, &width); #if __AVX2__ __m256i next; @@ -209,12 +90,10 @@ static int8_t i8VectorCmpAVX2(const int8_t* pData, int32_t numOfRows, bool isMin static int16_t i16VectorCmpAVX2(const int16_t* pData, int32_t numOfRows, bool isMinFunc) { int16_t v = 0; - const int32_t bitWidth = 256; const int16_t* p = pData; - int32_t width = (bitWidth>>3u) / sizeof(int16_t); - int32_t remain = numOfRows % width; - int32_t rounds = numOfRows / width; + int32_t width, remain, rounds; + calculateRounds(numOfRows, sizeof(int16_t), &remain, &rounds, &width); #if __AVX2__ __m256i next; @@ -236,6 +115,7 @@ static int16_t i16VectorCmpAVX2(const int16_t* pData, int32_t numOfRows, bool is v = TMAX(v, q[k]); } + // calculate the front and the reminder items in array list int32_t start = rounds * width; for (int32_t j = 0; j < remain; ++j) { @@ -271,20 +151,208 @@ static int16_t i16VectorCmpAVX2(const int16_t* pData, int32_t numOfRows, bool is return v; } +static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool isMinFunc) { + int32_t v = 0; + const int32_t* p = pData; + + int32_t width, remain, rounds; + calculateRounds(numOfRows, sizeof(int32_t), &remain, &rounds, &width); + +#if __AVX2__ + __m256i next; + __m256i initialVal = _mm256_lddqu_si256((__m256i*)p); + p += width; + + if (!isMinFunc) { // max function + for (int32_t i = 0; i < rounds; ++i) { + next = _mm256_lddqu_si256((__m256i*)p); + initialVal = _mm256_max_epi32(initialVal, next); + p += width; + } + + // let compare the final results + const int32_t* q = (const int32_t*)&initialVal; + v = TMAX(q[0], q[1]); + for (int32_t k = 1; k < width; ++k) { + v = TMAX(v, q[k]); + } + + // calculate the front and the reminder items in array list + int32_t start = rounds * width; + for (int32_t j = 0; j < remain; ++j) { + if (v < p[j + start]) { + v = p[j + start]; + } + } + } else { // min function + for (int32_t i = 0; i < rounds; ++i) { + next = _mm256_lddqu_si256((__m256i*)p); + initialVal = _mm256_min_epi32(initialVal, next); + p += width; + } + + // let sum up the final results + const int32_t* q = (const int32_t*)&initialVal; + v = TMIN(q[0], q[1]); + for (int32_t k = 1; k < width; ++k) { + v = TMIN(v, q[k]); + } + + // calculate the front and the remainder items in array list + int32_t start = rounds * width; + for (int32_t j = 0; j < remain; ++j) { + if (v > p[j + start]) { + v = p[j + start]; + } + } + } +#endif + + return v; +} + +static float floatVectorCmpAVX(const float* pData, int32_t numOfRows, bool isMinFunc) { + float v = 0; + const float* p = pData; + + int32_t width, remain, rounds; + calculateRounds(numOfRows, sizeof(float), &remain, &rounds, &width); + +#if __AVX__ + + __m256 next; + __m256 initialVal = _mm256_loadu_ps(p); + p += width; + + if (!isMinFunc) { // max function + for (int32_t i = 1; i < rounds; ++i) { + next = _mm256_loadu_ps(p); + initialVal = _mm256_max_ps(initialVal, next); + p += width; + } + + // let sum up the final results + const float* q = (const float*)&initialVal; + v = TMAX(q[0], q[1]); + for (int32_t k = 1; k < width; ++k) { + v = TMAX(v, q[k]); + } + + // calculate the front and the reminder items in array list + int32_t start = rounds * width; + for (int32_t j = 0; j < remain; ++j) { + if (v < p[j + start]) { + v = p[j + start]; + } + } + } else { // min function + for (int32_t i = 1; i < rounds; ++i) { + next = _mm256_loadu_ps(p); + initialVal = _mm256_min_ps(initialVal, next); + p += width; + } + + // let sum up the final results + const float* q = (const float*)&initialVal; + v = TMIN(q[0], q[1]); + for (int32_t k = 1; k < width; ++k) { + v = TMIN(v, q[k]); + } + + // calculate the front and the reminder items in array list + int32_t start = rounds * width; + for (int32_t j = 0; j < remain; ++j) { + if (v > p[j + start]) { + v = p[j + start]; + } + } + } +#endif + + return v; +} + +static double doubleVectorCmpAVX(const double* pData, int32_t numOfRows, bool isMinFunc) { + double v = 0; + const double* p = pData; + + int32_t width, remain, rounds; + calculateRounds(numOfRows, sizeof(double), &remain, &rounds, &width); + +#if __AVX__ + + __m256d next; + __m256d initialVal = _mm256_loadu_pd(p); + p += width; + + if (!isMinFunc) { // max function + for (int32_t i = 1; i < rounds; ++i) { + next = _mm256_loadu_pd(p); + initialVal = _mm256_max_pd(initialVal, next); + p += width; + } + + // let sum up the final results + const double* q = (const double*)&initialVal; + v = TMAX(q[0], q[1]); + for (int32_t k = 1; k < width; ++k) { + v = TMAX(v, q[k]); + } + + // calculate the front and the reminder items in array list + int32_t start = rounds * width; + for (int32_t j = 0; j < remain; ++j) { + if (v < p[j + start]) { + v = p[j + start]; + } + } + } else { // min function + for (int32_t i = 1; i < rounds; ++i) { + next = _mm256_loadu_pd(p); + initialVal = _mm256_min_pd(initialVal, next); + p += width; + } + + // let sum up the final results + const double* q = (const double*)&initialVal; + v = TMIN(q[0], q[1]); + for (int32_t k = 1; k < width; ++k) { + v = TMIN(v, q[k]); + } + + // calculate the front and the reminder items in array list + int32_t start = rounds * width; + for (int32_t j = 0; j < remain; ++j) { + if (v > p[j + start]) { + v = p[j + start]; + } + } + } +#endif + + return v; +} + +static int32_t findFirstVal(const SColumnInfoData* pCol, int32_t start, int32_t numOfRows) { + int32_t i = start; + while (i < (start + numOfRows)) { + if (!colDataIsNull_f(pCol->nullbitmap, i)) { + break; + } + i += 1; + } + + return i; +} + static int32_t handleInt8Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx, SMinmaxResInfo* pBuf, bool isMinFunc) { int8_t* pData = (int8_t*)pCol->pData; int8_t* val = (int8_t*)&pBuf->v; int32_t numOfElems = 0; - if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) { - int32_t i = start; - while (i < (start + numOfRows)) { - if (!colDataIsNull_f(pCol->nullbitmap, i)) { - break; - } - i += 1; - } + if (pCol->hasNull || numOfRows <= 32 || pCtx->subsidiaries.num > 0) { + int32_t i = findFirstVal(pCol, start, numOfRows); if ((i < (start + numOfRows)) && (!pBuf->assign)) { *val = pData[i]; @@ -365,13 +433,7 @@ static int32_t handleInt16Col(SColumnInfoData* pCol, int32_t start, int32_t numO int32_t numOfElems = 0; if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) { - int32_t i = start; - while (i < (start + numOfRows)) { - if (!colDataIsNull_f(pCol->nullbitmap, i)) { - break; - } - i += 1; - } + int32_t i = findFirstVal(pCol, start, numOfRows); if ((i < (start + numOfRows)) && (!pBuf->assign)) { *val = pData[i]; @@ -452,13 +514,7 @@ static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numO int32_t numOfElems = 0; if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) { - int32_t i = start; - while (i < (start + numOfRows)) { - if (!colDataIsNull_f(pCol->nullbitmap, i)) { - break; - } - i += 1; - } + int32_t i = findFirstVal(pCol, start, numOfRows); if ((i < (start + numOfRows)) && (!pBuf->assign)) { *val = pData[i]; @@ -539,13 +595,7 @@ static int32_t handleInt64Col(SColumnInfoData* pCol, int32_t start, int32_t numO int32_t numOfElems = 0; if (pCol->hasNull || pCtx->subsidiaries.num > 0) { - int32_t i = start; - while (i < (start + numOfRows)) { - if (!colDataIsNull_f(pCol->nullbitmap, i)) { - break; - } - i += 1; - } + int32_t i = findFirstVal(pCol, start, numOfRows); if ((i < (start + numOfRows)) && (!pBuf->assign)) { *val = pData[i]; @@ -616,18 +666,11 @@ static int32_t handleInt64Col(SColumnInfoData* pCol, int32_t start, int32_t numO static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx, SMinmaxResInfo* pBuf, bool isMinFunc) { float* pData = (float*)pCol->pData; - double* val = (double*)&pBuf->v; + float* val = (float*)&pBuf->v; int32_t numOfElems = 0; if (pCol->hasNull || numOfRows < 8 || pCtx->subsidiaries.num > 0) { - int32_t i = start; - while (i < (start + numOfRows)) { - if (!colDataIsNull_f(pCol->nullbitmap, i)) { - break; - } - i += 1; - } - + int32_t i = findFirstVal(pCol, start, numOfRows); if ((i < (start + numOfRows)) && (!pBuf->assign)) { *val = pData[i]; if (pCtx->subsidiaries.num > 0) { @@ -701,18 +744,12 @@ static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numO static int32_t handleDoubleCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx, SMinmaxResInfo* pBuf, bool isMinFunc) { - float* pData = (float*)pCol->pData; + double* pData = (double*)pCol->pData; double* val = (double*)&pBuf->v; int32_t numOfElems = 0; - if (pCol->hasNull || numOfRows < 8 || pCtx->subsidiaries.num > 0) { - int32_t i = start; - while (i < (start + numOfRows)) { - if (!colDataIsNull_f(pCol->nullbitmap, i)) { - break; - } - i += 1; - } + if (pCol->hasNull || numOfRows < 4 || pCtx->subsidiaries.num > 0) { + int32_t i = findFirstVal(pCol, start, numOfRows); if ((i < (start + numOfRows)) && (!pBuf->assign)) { *val = pData[i]; @@ -757,7 +794,7 @@ static int32_t handleDoubleCol(SColumnInfoData* pCol, int32_t start, int32_t num } else { // not has null value // AVX version to speedup the loop if (tsAVXEnable && tsSIMDEnable) { - *val = (double) floatVectorCmpAVX(pData, numOfRows, isMinFunc); + *val = (double) doubleVectorCmpAVX(pData, numOfRows, isMinFunc); } else { if (!pBuf->assign) { *val = pData[0]; @@ -813,7 +850,7 @@ int32_t doMinMaxHelper(SqlFunctionCtx* pCtx, int32_t isMinFunc) { if (IS_NULL_TYPE(type)) { numOfElems = 0; - goto _min_max_over; + goto _over; } // data in current data block are qualified to the query @@ -914,117 +951,10 @@ int32_t doMinMaxHelper(SqlFunctionCtx* pCtx, int32_t isMinFunc) { numOfElems = handleInt8Col(pCol, start, numOfRows, pCtx, pBuf, isMinFunc); } else if (type == TSDB_DATA_TYPE_SMALLINT) { numOfElems = handleInt16Col(pCol, start, numOfRows, pCtx, pBuf, isMinFunc); - int16_t* pData = (int16_t*)pCol->pData; - int16_t* val = (int16_t*)&pBuf->v; - - for (int32_t i = start; i < start + numOfRows; ++i) { - if ((pCol->hasNull) && colDataIsNull_f(pCol->nullbitmap, i)) { - continue; - } - - if (!pBuf->assign) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL); - } - pBuf->assign = true; - } else { - // ignore the equivalent data value - // NOTE: An faster version to avoid one additional comparison with FPU. - if (isMinFunc) { // min - if (*val > pData[i]) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } - } else { // max - if (*val < pData[i]) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } - } - } - - numOfElems += 1; - } } else if (type == TSDB_DATA_TYPE_INT) { numOfElems = handleInt32Col(pCol, start, numOfRows, pCtx, pBuf, isMinFunc); -#if 0 - for (int32_t i = start; i < start + numOfRows; ++i) { - if ((pCol->hasNull) && colDataIsNull_f(pCol->nullbitmap, i)) { - continue; - } - - if (!pBuf->assign) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL); - } - pBuf->assign = true; - } else { - // ignore the equivalent data value - // NOTE: An faster version to avoid one additional comparison with FPU. - if (isMinFunc) { // min - if (*val > pData[i]) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } - } else { // max - if (*val < pData[i]) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } - } - } - - numOfElems += 1; - } -#endif - } else if (type == TSDB_DATA_TYPE_BIGINT) { - int64_t* pData = (int64_t*)pCol->pData; - int64_t* val = (int64_t*)&pBuf->v; - - for (int32_t i = start; i < start + numOfRows; ++i) { - if ((pCol->hasNull) && colDataIsNull_f(pCol->nullbitmap, i)) { - continue; - } - - if (!pBuf->assign) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL); - } - pBuf->assign = true; - } else { - // ignore the equivalent data value - // NOTE: An faster version to avoid one additional comparison with FPU. - if (isMinFunc) { // min - if (*val > pData[i]) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } - } else { // max - if (*val < pData[i]) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } - } - } - - numOfElems += 1; - } + numOfElems = handleInt64Col(pCol, start, numOfRows, pCtx, pBuf, isMinFunc); } } else if (IS_UNSIGNED_NUMERIC_TYPE(type)) { if (type == TSDB_DATA_TYPE_UTINYINT) { @@ -1215,56 +1145,9 @@ int32_t doMinMaxHelper(SqlFunctionCtx* pCtx, int32_t isMinFunc) { } } else if (type == TSDB_DATA_TYPE_FLOAT) { numOfElems = handleFloatCol(pCol, start, numOfRows, pCtx, pBuf, isMinFunc); -#if 0 - for (int32_t i = start; i < start + numOfRows; ++i) { - if ((pCol->hasNull) && colDataIsNull_f(pCol->nullbitmap, i)) { - continue; - } - - if (!pBuf->assign) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - pBuf->tuplePos = saveTupleData(pCtx, i, pCtx->pSrcBlock, NULL); - } - pBuf->assign = true; - } else { -#if 0 - if ((*val) == pData[i]) { - continue; - } - - if ((*val < pData[i]) ^ isMinFunc) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } -#endif - // NOTE: An faster version to avoid one additional comparison with FPU. - if (isMinFunc) { // min - if (*val > pData[i]) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } - } else { // max - if (*val < pData[i]) { - *val = pData[i]; - if (pCtx->subsidiaries.num > 0) { - updateTupleData(pCtx, i, pCtx->pSrcBlock, &pBuf->tuplePos); - } - } - } - } - - numOfElems += 1; - } -#endif - } -_min_max_over: +_over: if (numOfElems == 0 && pCtx->subsidiaries.num > 0 && !pBuf->nullTupleSaved) { pBuf->nullTuplePos = saveTupleData(pCtx, pInput->startRowIndex, pCtx->pSrcBlock, NULL); pBuf->nullTupleSaved = true; diff --git a/source/libs/function/src/udfd.c b/source/libs/function/src/udfd.c index 088aa62248..2ad36469ff 100644 --- a/source/libs/function/src/udfd.c +++ b/source/libs/function/src/udfd.c @@ -27,6 +27,7 @@ #include "tglobal.h" #include "tmsg.h" #include "trpc.h" +#include "tmisce.h" // clang-foramt on typedef struct SUdfdContext {