refactor: rename colDataAggIsSet to colDataSMAIsSet and add AVX/AVX2 sum helpers for avg.

2022-11-14 14:14:24 +08:00 · 2022-11-14 14:14:24 +08:00 · c888cbf068
parent 7b53b8142e
commit c888cbf068
5 changed files with 304 additions and 50 deletions
--- a/include/libs/function/function.h
+++ b/include/libs/function/function.h
@ -115,7 +115,7 @@ typedef struct SInputColumnInfoData {
  int32_t           startRowIndex;    // handle started row index
  int32_t           numOfRows;        // the number of rows needs to be handled
  int32_t           numOfInputCols;   // PTS is not included
-  bool              colDataAggIsSet;  // if agg is set or not
+  bool              colDataSMAIsSet;  // if agg is set or not
  SColumnInfoData  *pPTS;             // primary timestamp column
  SColumnInfoData **pData;
  SColumnDataAgg  **pColumnDataAgg;
--- a/source/libs/executor/src/executorimpl.c
+++ b/source/libs/executor/src/executorimpl.c
@ -349,13 +349,13 @@ typedef struct {
 } SFunctionCtxStatus;

 static void functionCtxSave(SqlFunctionCtx* pCtx, SFunctionCtxStatus* pStatus) {
-  pStatus->hasAgg = pCtx->input.colDataAggIsSet;
+  pStatus->hasAgg = pCtx->input.colDataSMAIsSet;
  pStatus->numOfRows = pCtx->input.numOfRows;
  pStatus->startOffset = pCtx->input.startRowIndex;
 }

 static void functionCtxRestore(SqlFunctionCtx* pCtx, SFunctionCtxStatus* pStatus) {
-  pCtx->input.colDataAggIsSet = pStatus->hasAgg;
+  pCtx->input.colDataSMAIsSet = pStatus->hasAgg;
  pCtx->input.numOfRows = pStatus->numOfRows;
  pCtx->input.startRowIndex = pStatus->startOffset;
 }
@ -372,8 +372,8 @@ void doApplyFunctions(SExecTaskInfo* taskInfo, SqlFunctionCtx* pCtx, SColumnInfo

    // not a whole block involved in query processing, statistics data can not be used
    // NOTE: the original value of isSet have been changed here
-    if (pCtx[k].input.colDataAggIsSet && forwardStep < numOfTotal) {
-      pCtx[k].input.colDataAggIsSet = false;
+    if (pCtx[k].input.colDataSMAIsSet && forwardStep < numOfTotal) {
+      pCtx[k].input.colDataSMAIsSet = false;
    }

    if (fmIsWindowPseudoColumnFunc(pCtx[k].functionId)) {
@ -486,7 +486,7 @@ static int32_t doSetInputDataBlock(SExprSupp* pExprSup, SSDataBlock* pBlock, int

    SInputColumnInfoData* pInput = &pCtx[i].input;
    pInput->uid = pBlock->info.uid;
-    pInput->colDataAggIsSet = false;
+    pInput->colDataSMAIsSet = false;

    SExprInfo* pOneExpr = &pExprSup->pExprInfo[i];
    for (int32_t j = 0; j < pOneExpr->base.numOfParams; ++j) {
@ -798,7 +798,7 @@ void setBlockSMAInfo(SqlFunctionCtx* pCtx, SExprInfo* pExprInfo, SSDataBlock* pB
  pInput->totalRows = numOfRows;

  if (pBlock->pBlockAgg != NULL) {
-    pInput->colDataAggIsSet = true;
+    pInput->colDataSMAIsSet = true;

    for (int32_t j = 0; j < pExprInfo->base.numOfParams; ++j) {
      SFunctParam* pFuncParam = &pExprInfo->base.pParam[j];
@ -807,7 +807,7 @@ void setBlockSMAInfo(SqlFunctionCtx* pCtx, SExprInfo* pExprInfo, SSDataBlock* pB
        int32_t slotId = pFuncParam->pCol->slotId;
        pInput->pColumnDataAgg[j] = pBlock->pBlockAgg[slotId];
        if (pInput->pColumnDataAgg[j] == NULL) {
-          pInput->colDataAggIsSet = false;
+          pInput->colDataSMAIsSet = false;
        }

        // Here we set the column info data since the data type for each column data is required, but
@ -818,7 +818,7 @@ void setBlockSMAInfo(SqlFunctionCtx* pCtx, SExprInfo* pExprInfo, SSDataBlock* pB
      }
    }
  } else {
-    pInput->colDataAggIsSet = false;
+    pInput->colDataSMAIsSet = false;
  }
 }

--- a/source/libs/function/src/builtinsimpl.c
+++ b/source/libs/function/src/builtinsimpl.c
@ -498,13 +498,13 @@ static int32_t getNumOfElems(SqlFunctionCtx* pCtx) {
  int32_t numOfElem = 0;

  /*
-   * 1. column data missing (schema modified) causes pInputCol->hasNull == true. pInput->colDataAggIsSet == true;
-   * 2. for general non-primary key columns, pInputCol->hasNull may be true or false, pInput->colDataAggIsSet == true;
-   * 3. for primary key column, pInputCol->hasNull always be false, pInput->colDataAggIsSet == false;
+   * 1. column data missing (schema modified) causes pInputCol->hasNull == true. pInput->colDataSMAIsSet == true;
+   * 2. for general non-primary key columns, pInputCol->hasNull may be true or false, pInput->colDataSMAIsSet == true;
+   * 3. for primary key column, pInputCol->hasNull always be false, pInput->colDataSMAIsSet == false;
   */
  SInputColumnInfoData* pInput = &pCtx->input;
  SColumnInfoData*      pInputCol = pInput->pData[0];
-  if (pInput->colDataAggIsSet && pInput->totalRows == pInput->numOfRows) {
+  if (pInput->colDataSMAIsSet && pInput->totalRows == pInput->numOfRows) {
    numOfElem = pInput->numOfRows - pInput->pColumnDataAgg[0]->numOfNull;
    ASSERT(numOfElem >= 0);
  } else {
@ -593,7 +593,7 @@ int32_t sumFunction(SqlFunctionCtx* pCtx) {
    goto _sum_over;
  }

-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
    numOfElem = pInput->numOfRows - pAgg->numOfNull;
    ASSERT(numOfElem >= 0);

@ -658,7 +658,7 @@ int32_t sumInvertFunction(SqlFunctionCtx* pCtx) {

  SSumRes* pSumRes = GET_ROWCELL_INTERBUF(GET_RES_INFO(pCtx));

-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
    numOfElem = pInput->numOfRows - pAgg->numOfNull;
    ASSERT(numOfElem >= 0);

@ -770,7 +770,7 @@ bool getSumFuncEnv(SFunctionNode* UNUSED_PARAM(pFunc), SFuncExecEnv* pEnv) {
 //    goto _avg_over;
 //  }
 //
-//  if (pInput->colDataAggIsSet) {
+//  if (pInput->colDataSMAIsSet) {
 //    numOfElem = numOfRows - pAgg->numOfNull;
 //    ASSERT(numOfElem >= 0);
 //
@ -1161,7 +1161,7 @@ bool getMinmaxFuncEnv(SFunctionNode* UNUSED_PARAM(pFunc), SFuncExecEnv* pEnv) {
 //  }
 //
 //  // data in current data block are qualified to the query
-//  if (pInput->colDataAggIsSet) {
+//  if (pInput->colDataSMAIsSet) {
 //    numOfElems = pInput->numOfRows - pAgg->numOfNull;
 //    ASSERT(pInput->numOfRows == pInput->totalRows && numOfElems >= 0);
 //    if (numOfElems == 0) {
@ -2471,7 +2471,7 @@ int32_t percentileFunction(SqlFunctionCtx* pCtx) {

  // the first stage, only acquire the min/max value
  if (pInfo->stage == 0) {
-    if (pCtx->input.colDataAggIsSet) {
+    if (pCtx->input.colDataSMAIsSet) {
      double tmin = 0.0, tmax = 0.0;
      if (IS_SIGNED_NUMERIC_TYPE(type)) {
        tmin = (double)GET_INT64_VAL(&pAgg->min);
@ -2933,14 +2933,14 @@ int32_t firstFunction(SqlFunctionCtx* pCtx) {
  pInfo->bytes = pInputCol->info.bytes;

  // All null data column, return directly.
-  if (pInput->colDataAggIsSet && (pInput->pColumnDataAgg[0]->numOfNull == pInput->totalRows)) {
+  if (pInput->colDataSMAIsSet && (pInput->pColumnDataAgg[0]->numOfNull == pInput->totalRows)) {
    ASSERT(pInputCol->hasNull == true);
    // save selectivity value for column consisted of all null values
    firstlastSaveTupleData(pCtx->pSrcBlock, pInput->startRowIndex, pCtx, pInfo);
    return 0;
  }

-  SColumnDataAgg* pColAgg = (pInput->colDataAggIsSet) ? pInput->pColumnDataAgg[0] : NULL;
+  SColumnDataAgg* pColAgg = (pInput->colDataSMAIsSet) ? pInput->pColumnDataAgg[0] : NULL;

  TSKEY startKey = getRowPTs(pInput->pPTS, 0);
  TSKEY endKey = getRowPTs(pInput->pPTS, pInput->totalRows - 1);
@ -3037,14 +3037,14 @@ int32_t lastFunction(SqlFunctionCtx* pCtx) {
  pInfo->bytes = bytes;

  // All null data column, return directly.
-  if (pInput->colDataAggIsSet && (pInput->pColumnDataAgg[0]->numOfNull == pInput->totalRows)) {
+  if (pInput->colDataSMAIsSet && (pInput->pColumnDataAgg[0]->numOfNull == pInput->totalRows)) {
    ASSERT(pInputCol->hasNull == true);
    // save selectivity value for column consisted of all null values
    firstlastSaveTupleData(pCtx->pSrcBlock, pInput->startRowIndex, pCtx, pInfo);
    return 0;
  }

-  SColumnDataAgg* pColAgg = (pInput->colDataAggIsSet) ? pInput->pColumnDataAgg[0] : NULL;
+  SColumnDataAgg* pColAgg = (pInput->colDataSMAIsSet) ? pInput->pColumnDataAgg[0] : NULL;

  TSKEY startKey = getRowPTs(pInput->pPTS, 0);
  TSKEY endKey = getRowPTs(pInput->pPTS, pInput->totalRows - 1);
@ -3988,7 +3988,7 @@ int32_t spreadFunction(SqlFunctionCtx* pCtx) {

  SSpreadInfo* pInfo = GET_ROWCELL_INTERBUF(GET_RES_INFO(pCtx));

-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
    numOfElems = pInput->numOfRows - pAgg->numOfNull;
    if (numOfElems == 0) {
      goto _spread_over;
@ -4163,7 +4163,7 @@ int32_t elapsedFunction(SqlFunctionCtx* pCtx) {
    goto _elapsed_over;
  }

-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
    if (pInfo->min == TSKEY_MAX) {
      pInfo->min = GET_INT64_VAL(&pAgg->min);
      pInfo->max = GET_INT64_VAL(&pAgg->max);
--- a/source/libs/function/src/detail/tavgfunction.c
+++ b/source/libs/function/src/detail/tavgfunction.c
@ -48,15 +48,14 @@ typedef struct SAvgRes {
  int16_t type;  // store the original input type, used in merge function
 } SAvgRes;

-static void floatVectorSumAVX(const SInputColumnInfoData* pInput, const float* plist, SAvgRes* pRes) {
+static void floatVectorSumAVX(const float* plist, int32_t numOfRows, SAvgRes* pRes) {
 #if __AVX__
  // find the start position that are aligned to 32bytes address in memory
-  int32_t startIndex = 0;  //((uint64_t)plist) & ((1<<8u)-1);
  int32_t bitWidth = 8;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;

-  int32_t      remain = (pInput->numOfRows - startIndex) % bitWidth;
-  int32_t      rounds = (pInput->numOfRows - startIndex) / bitWidth;
-  const float* p = &plist[startIndex];
+  const float* p = plist;

  __m256 val;
  __m256 sum = _mm256_setzero_ps();
@ -71,18 +70,126 @@ static void floatVectorSumAVX(const SInputColumnInfoData* pInput, const float* p
  const float* q = (const float*)&sum;
  pRes->sum.dsum += q[0] + q[1] + q[2] + q[3] + q[4] + q[5] + q[6] + q[7];

-  // calculate the front and the reminder items in array list
-  for (int32_t j = 0; j < startIndex; ++j) {
-    pRes->sum.dsum += plist[j];
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
+    pRes->sum.dsum += plist[j + startIndex];
+  }
+#endif
+}
+
+static void doubleVectorSumAVX(const double* plist, int32_t numOfRows, SAvgRes* pRes) {
+#if __AVX__
+  // find the start position that are aligned to 32bytes address in memory
+  int32_t bitWidth = 4;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
+
+  const double* p = plist;
+
+  __m256d val;
+  __m256d sum = _mm256_setzero_pd();
+
+  for (int32_t i = 0; i < rounds; ++i) {
+    val = _mm256_loadu_pd(p);
+    sum = _mm256_add_pd(sum, val);
+    p += bitWidth;
  }

-  startIndex += rounds * bitWidth;
-  for (int32_t j = 0; j < remain; ++j) {
+  // let sum up the final results
+  const double* q = (const double*)&sum;
+  pRes->sum.dsum += q[0] + q[1] + q[2] + q[3];
+
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
    pRes->sum.dsum += plist[j + startIndex];
  }
 #endif
 }

+// Adds the sum of plist[0 .. numOfRows-1] to pRes->sum.isum.
+//   plist     - first int8 value to be summed (unaligned addresses are fine)
+//   numOfRows - number of values to sum
+//   pRes      - accumulator; only sum.isum is modified
+static void i8VectorSumAVX2(const int8_t* plist, int32_t numOfRows, SAvgRes* pRes) {
+#if __AVX2__
+  // 16 int8 values are consumed per round through one unaligned 128-bit load
+  int32_t bitWidth = 16;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
+
+  const int8_t* p = plist;
+
+  const __m256i ones = _mm256_set1_epi16(1);
+  __m256i       sum = _mm256_setzero_si256();  // 4 x int64 partial sums
+
+  for (int32_t i = 0; i < rounds; ++i) {
+    // sign-extend 16 x int8 -> 16 x int16, then add adjacent pairs into 8 x int32 (cannot overflow)
+    __m256i w16 = _mm256_cvtepi8_epi16(_mm_lddqu_si128((const __m128i*)p));
+    __m256i w32 = _mm256_madd_epi16(w16, ones);
+
+    // widen both 128-bit halves to int64 before accumulating so the running sum never wraps
+    sum = _mm256_add_epi64(sum, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(w32)));
+    sum = _mm256_add_epi64(sum, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(w32, 1)));
+    p += bitWidth;
+  }
+
+  // horizontal add of the four 64-bit lanes
+  int64_t q[4];
+  _mm256_storeu_si256((__m256i*)q, sum);
+  pRes->sum.isum += q[0] + q[1] + q[2] + q[3];
+
+  // the tail that does not fill a whole round is summed one value at a time
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
+    pRes->sum.isum += plist[j + startIndex];
+  }
+#else
+  // built without AVX2: plain scalar loop so the values are not silently dropped
+  for (int32_t j = 0; j < numOfRows; ++j) {
+    pRes->sum.isum += plist[j];
+  }
+#endif
+}
+
+// Adds the sum of plist[0 .. numOfRows-1] to pRes->sum.isum.
+//   plist     - first int32 value to be summed (unaligned addresses are fine)
+//   numOfRows - number of values to sum
+//   pRes      - accumulator; only sum.isum is modified
+static void i32VectorSumAVX2(const int32_t* plist, int32_t numOfRows, SAvgRes* pRes) {
+#if __AVX2__
+  // 8 int32 values are consumed per round through one unaligned 256-bit load
+  int32_t bitWidth = 8;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
+
+  const int32_t* p = plist;
+
+  __m256i sum = _mm256_setzero_si256();  // 4 x int64 partial sums
+  for (int32_t i = 0; i < rounds; ++i) {
+    __m256i val = _mm256_lddqu_si256((const __m256i*)p);
+
+    // sign-extend each 128-bit half (4 x int32) to 4 x int64 before adding, so that
+    // neither the per-lane accumulation nor the final reduction mixes or wraps 32-bit lanes
+    sum = _mm256_add_epi64(sum, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(val)));
+    sum = _mm256_add_epi64(sum, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(val, 1)));
+    p += bitWidth;
+  }
+
+  // horizontal add of the four 64-bit lanes
+  int64_t q[4];
+  _mm256_storeu_si256((__m256i*)q, sum);
+  pRes->sum.isum += q[0] + q[1] + q[2] + q[3];
+
+  // the tail that does not fill a whole round is summed one value at a time
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
+    pRes->sum.isum += plist[j + startIndex];
+  }
+#else
+  // built without AVX2: plain scalar loop so the values are not silently dropped
+  for (int32_t j = 0; j < numOfRows; ++j) {
+    pRes->sum.isum += plist[j];
+  }
+#endif
+}
+
+// Adds the sum of plist[0 .. numOfRows-1] to pRes->sum.isum.
+//   plist     - first int64 value to be summed (unaligned addresses are fine)
+//   numOfRows - number of values to sum
+//   pRes      - accumulator; only sum.isum is modified
+static void i64VectorSumAVX2(const int64_t* plist, int32_t numOfRows, SAvgRes* pRes) {
+#if __AVX2__
+  // 4 int64 values are consumed per round through one unaligned 256-bit load
+  int32_t bitWidth = 4;
+  int32_t remainder = numOfRows % bitWidth;
+  int32_t rounds = numOfRows / bitWidth;
+
+  const int64_t* p = plist;
+
+  __m256i sum = _mm256_setzero_si256();  // 4 x int64 partial sums
+
+  for (int32_t i = 0; i < rounds; ++i) {
+    __m256i val = _mm256_lddqu_si256((const __m256i*)p);
+    sum = _mm256_add_epi64(sum, val);
+    p += bitWidth;
+  }
+
+  // horizontal add of the four 64-bit lanes; stored to a plain array rather than read through a pointer cast
+  int64_t q[4];
+  _mm256_storeu_si256((__m256i*)q, sum);
+  pRes->sum.isum += q[0] + q[1] + q[2] + q[3];
+
+  // the tail that does not fill a whole round is summed one value at a time
+  int32_t startIndex = rounds * bitWidth;
+  for (int32_t j = 0; j < remainder; ++j) {
+    pRes->sum.isum += plist[j + startIndex];
+  }
+#else
+  // built without AVX2: plain scalar loop so the values are not silently dropped
+  for (int32_t j = 0; j < numOfRows; ++j) {
+    pRes->sum.isum += plist[j];
+  }
+#endif
+}
+
 static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnInfoData* pInput, SAvgRes* pRes) {
  int32_t numOfElems = 0;
  float*  plist = (float*)pCol->pData;
@ -105,7 +212,7 @@ static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnIn

    // 3. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
    if (tsAVXEnable && tsSIMDEnable) {
-      floatVectorSumAVX(pInput, plist, pRes);
+      floatVectorSumAVX(plist, pInput->numOfRows, pRes);
    } else {
      for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
        pRes->sum.dsum += plist[i];
@ -133,8 +240,25 @@ bool avgFunctionSetup(SqlFunctionCtx* pCtx, SResultRowEntryInfo* pResultInfo) {
  return true;
 }

+// Folds pre-computed block statistics into the running average state.
+//   pRes      - accumulator; count and the sum member matching `type` are updated
+//   numOfRows - row count from which pAgg->numOfNull is subtracted
+//   type      - column data type; selects isum, usum or dsum
+//   pAgg      - block statistics; numOfNull and sum are read
+// Returns the number of non-null rows that were added to pRes->count.
+static int32_t calculateAvgBySMAInfo(SAvgRes* pRes, int32_t numOfRows, int32_t type, const SColumnDataAgg* pAgg) {
+  const int32_t nonNullRows = numOfRows - pAgg->numOfNull;
+  ASSERT(nonNullRows >= 0);
+
+  // pick the accumulator that matches the column type; any other type leaves the sum untouched
+  if (IS_SIGNED_NUMERIC_TYPE(type)) {
+    pRes->sum.isum += pAgg->sum;
+  } else if (IS_UNSIGNED_NUMERIC_TYPE(type)) {
+    pRes->sum.usum += pAgg->sum;
+  } else if (IS_FLOAT_TYPE(type)) {
+    // the sum field is re-read as a double for floating point columns
+    pRes->sum.dsum += GET_DOUBLE_VAL((const char*)&(pAgg->sum));
+  }
+
+  pRes->count += nonNullRows;
+  return nonNullRows;
+}
+
 int32_t avgFunction(SqlFunctionCtx* pCtx) {
  int32_t       numOfElem = 0;
+  const int32_t THRESHOLD_SIZE = 8;

  SInputColumnInfoData* pInput = &pCtx->input;
  SColumnDataAgg*       pAgg = pInput->pColumnDataAgg[0];
@ -154,19 +278,149 @@ int32_t avgFunction(SqlFunctionCtx* pCtx) {
    goto _avg_over;
  }

-  if (pInput->colDataAggIsSet) {
-    numOfElem = numOfRows - pAgg->numOfNull;
-    ASSERT(numOfElem >= 0);
+  if (pInput->colDataSMAIsSet) {  // try to use SMA if available
+    numOfElem = calculateAvgBySMAInfo(pAvgRes, numOfRows, type, pAgg);
+  } else if (!pCol->hasNull) {  // try to employ the simd instructions to speed up the loop
+    numOfElem = pInput->numOfRows;
+    pAvgRes->count += pInput->numOfRows;

-    pAvgRes->count += numOfElem;
-    if (IS_SIGNED_NUMERIC_TYPE(type)) {
-      pAvgRes->sum.isum += pAgg->sum;
-    } else if (IS_UNSIGNED_NUMERIC_TYPE(type)) {
-      pAvgRes->sum.usum += pAgg->sum;
-    } else if (IS_FLOAT_TYPE(type)) {
-      pAvgRes->sum.dsum += GET_DOUBLE_VAL((const char*)&(pAgg->sum));
+    bool simdAvaiable = tsAVXEnable && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE);
+
+    switch(type) {
+      case TSDB_DATA_TYPE_TINYINT: {
+        const int8_t* plist = (const int8_t*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          i8VectorSumAVX2(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.isum += plist[i];
          }
-  } else {  // computing based on the true data block
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_SMALLINT: {
+        const double* plist = (const double*)pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.isum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_INT: {
+        const int32_t* plist = (const int32_t*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          i32VectorSumAVX2(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.isum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_BIGINT: {
+        const int64_t* plist = (const int64_t*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          i64VectorSumAVX2(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.isum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_FLOAT: {
+        const float* plist = (const float*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          floatVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.dsum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_DOUBLE: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.dsum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_UTINYINT: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.usum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_USMALLINT: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.usum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_UINT: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.usum += plist[i];
+          }
+        }
+        break;
+      }
+      case TSDB_DATA_TYPE_UBIGINT: {
+        const double* plist = (const double*) pCol->pData;
+
+        // 1. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
+        if (simdAvaiable) {
+          doubleVectorSumAVX(plist, numOfRows, pAvgRes);
+        } else {
+          for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
+            pAvgRes->sum.usum += plist[i];
+          }
+        }
+        break;
+      }
+      default:
+        ASSERT(0);
+    }
+  } else {
    switch (type) {
      case TSDB_DATA_TYPE_TINYINT: {
        int8_t* plist = (int8_t*)pCol->pData;
--- a/source/libs/function/src/detail/tminmax.c
+++ b/source/libs/function/src/detail/tminmax.c
@ -36,7 +36,7 @@ static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool is

  if (!isMinFunc) {  // max function
    for (int32_t i = 0; i < rounds; ++i) {
-      next = _mm256_loadu_si256((__m256i*)p);
+      next = _mm256_lddqu_si256((__m256i*)p);
      initialVal = _mm256_max_epi32(initialVal, next);
      p += bitWidth;
    }
@ -61,7 +61,7 @@ static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool is
    }
  } else {  // min function
    for (int32_t i = 0; i < rounds; ++i) {
-      next = _mm256_loadu_si256((__m256i*)p);
+      next = _mm256_lddqu_si256((__m256i*)p);
      initialVal = _mm256_min_epi32(initialVal, next);
      p += bitWidth;
    }
@ -369,7 +369,7 @@ int32_t doMinMaxHelper(SqlFunctionCtx* pCtx, int32_t isMinFunc) {
  }

  // data in current data block are qualified to the query
-  if (pInput->colDataAggIsSet) {
+  if (pInput->colDataSMAIsSet) {
    numOfElems = pInput->numOfRows - pAgg->numOfNull;
    ASSERT(pInput->numOfRows == pInput->totalRows && numOfElems >= 0);
    if (numOfElems == 0) {