refactor: do some internal refactor.

2022-11-11 14:16:13 +08:00 · 2022-11-11 14:16:13 +08:00 · b83f895726
parent ea83ae239e
commit b83f895726
17 changed files with 359 additions and 236 deletions
--- a/cmake/cmake.define
+++ b/cmake/cmake.define
@ -123,14 +123,20 @@ ELSE ()
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-literal-suffix -Werror=return-type -fPIC -gdwarf-2 -g3 -Wformat=2 -Wno-format-nonliteral -Wno-format-truncation -Wno-format-y2k")
    ENDIF ()
    MESSAGE("System processor ID: ${CMAKE_SYSTEM_PROCESSOR}")
    IF (TD_INTEL_64 OR TD_INTEL_32)
-        ADD_DEFINITIONS("-msse4.2 -mavx -mavx2")
+        ADD_DEFINITIONS("-msse4.2")
        IF("${FMA_SUPPORT}" MATCHES "true")
-            MESSAGE(STATUS "turn fma function support on")
+            MESSAGE(STATUS "fma function supported")
            ADD_DEFINITIONS("-mfma")
        ELSE ()
-            MESSAGE(STATUS "turn fma function support off")
+            MESSAGE(STATUS "fma function NOT supported")
        ENDIF()
        IF("${SIMD_SUPPORT}" MATCHES "true")
            ADD_DEFINITIONS("-mavx -mavx2")
            MESSAGE(STATUS "cpu simd instruction AVX/AVX2 supported")
        ELSE()
            MESSAGE(STATUS "cpu simd instruction AVX/AVX2 NOT supported")
        ENDIF()
    ENDIF ()
--- a/cmake/cmake.platform
+++ b/cmake/cmake.platform
@ -1,20 +1,17 @@
 cmake_minimum_required(VERSION 3.0)
 MESSAGE("Current system is ${CMAKE_SYSTEM_NAME}")
 # init
 SET(TD_LINUX FALSE)
 SET(TD_WINDOWS FALSE)
 SET(TD_DARWIN FALSE)
 MESSAGE("Compiler ID: ${CMAKE_CXX_COMPILER_ID}")
 if(CMAKE_COMPILER_IS_GNUCXX MATCHES 1)
    set(CXX_COMPILER_IS_GNU TRUE)
 else()
    set(CXX_COMPILER_IS_GNU FALSE)
 endif()
-MESSAGE("Current system name is ${CMAKE_SYSTEM_NAME}.")
+MESSAGE("Current system: ${CMAKE_SYSTEM_NAME}")
 IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@ -26,6 +23,8 @@ IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin
        set(CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS} -undefined dynamic_lookup")
    ENDIF ()
    MESSAGE("Current system processor: ${CMAKE_SYSTEM_PROCESSOR}")
    IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
        SET(TD_LINUX TRUE)
@ -44,7 +43,6 @@ IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin
        SET(OSTYPE "macOS")
        ADD_DEFINITIONS("-DDARWIN -Wno-tautological-pointer-compare")
        MESSAGE("Current system processor is ${CMAKE_SYSTEM_PROCESSOR}.")
        IF (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64")
            MESSAGE("Current system arch is arm64")
            SET(TD_DARWIN_64 TRUE)
@ -80,24 +78,22 @@ ELSEIF (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
 ENDIF()
 IF ("${CPUTYPE}" STREQUAL "")
  MESSAGE(STATUS "The current platform " ${CMAKE_SYSTEM_PROCESSOR} " is detected")
  IF (CMAKE_SYSTEM_PROCESSOR MATCHES "(amd64)|(AMD64)")
-    MESSAGE(STATUS "The current platform is amd64")
+    MESSAGE(STATUS "Current platform is amd64")
    SET(PLATFORM_ARCH_STR "amd64")
    SET(TD_INTEL_64 TRUE)
  ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)")
-    MESSAGE(STATUS "The current platform is x86")
+    MESSAGE(STATUS "Current platform is x86")
    SET(PLATFORM_ARCH_STR "i386")
    SET(TD_INTEL_32 TRUE)
  ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "armv7l")
-    MESSAGE(STATUS "The current platform is aarch32")
+    MESSAGE(STATUS "Current platform is aarch32")
    SET(PLATFORM_ARCH_STR "arm")
    SET(TD_ARM_32 TRUE)
    ADD_DEFINITIONS("-D_TD_ARM_")
    ADD_DEFINITIONS("-D_TD_ARM_32")
  ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)")
-    MESSAGE(STATUS "The current platform is aarch64")
+    MESSAGE(STATUS "Current platform is aarch64")
    SET(PLATFORM_ARCH_STR "arm64")
    SET(TD_ARM_64 TRUE)
    ADD_DEFINITIONS("-D_TD_ARM_")
--- a/cmake/cmake.version
+++ b/cmake/cmake.version
@ -26,7 +26,7 @@ ELSEIF (HAVE_GIT)
    SET(TD_VER_GIT "no git commit id")
  ENDIF ()
 ELSE ()
-  message(STATUS "no git cmd")
+  message(STATUS "no git found")
  SET(TD_VER_GIT "no git commit id")
 ENDIF ()
--- a/include/os/os.h
+++ b/include/os/os.h
@ -81,6 +81,13 @@ extern "C" {
 #include <string.h>
 #include <wchar.h>
 #include <wctype.h>
 #include <cpuid.h>
 #if __AVX__
 #include <immintrin.h>
 #elif __SSE4_2__
 #include <nmmintrin.h>
 #endif
 #include "osThread.h"
--- a/include/os/osDef.h
+++ b/include/os/osDef.h
@ -179,7 +179,7 @@ void syslog(int unused, const char *format, ...);
    if (isnan(y)) {                       \
      return 1;                           \
    }                                     \
-    if ((x) == (y)) {             \
+    if (fabs((x) - (y)) <= DBL_EPSILON) { \
      return 0;                           \
    } else {                              \
      return (x) < (y) ? -1 : 1;          \
--- a/include/os/osEnv.h
+++ b/include/os/osEnv.h
@ -36,6 +36,11 @@ extern int64_t         tsStreamMax;
 extern float           tsNumOfCores;
 extern int64_t         tsTotalMemoryKB;
 extern char           *tsProcPath;
 extern char            tsSIMDEnable;
 extern char            tsSSE42Enable;
 extern char            tsAVXEnable;
 extern char            tsAVX2Enable;
 extern char            tsFMAEnable;
 extern char configDir[];
 extern char tsDataDir[];
--- a/include/os/osSysinfo.h
+++ b/include/os/osSysinfo.h
@ -40,6 +40,7 @@ int32_t taosGetOsReleaseName(char *releaseName, int32_t maxLen);
 int32_t taosGetCpuInfo(char *cpuModel, int32_t maxLen, float *numOfCores);
 int32_t taosGetCpuCores(float *numOfCores);
 void    taosGetCpuUsage(double *cpu_system, double *cpu_engine);
 int32_t taosGetCpuInstructions(char* sse42, char* avx, char* avx2, char* fma);
 int32_t taosGetTotalMemory(int64_t *totalKB);
 int32_t taosGetProcMemory(int64_t *usedKB);
 int32_t taosGetSysMemory(int64_t *usedKB);
--- a/source/common/src/tglobal.c
+++ b/source/common/src/tglobal.c
@ -15,7 +15,6 @@
 #define _DEFAULT_SOURCE
 #include "tglobal.h"
 #include "tcompare.h"
 #include "tconfig.h"
 #include "tdatablock.h"
 #include "tgrant.h"
@ -312,7 +311,14 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) {
  if (cfgAddLocale(pCfg, "locale", tsLocale) != 0) return -1;
  if (cfgAddCharset(pCfg, "charset", tsCharset) != 0) return -1;
  if (cfgAddBool(pCfg, "enableCoreFile", 1, 1) != 0) return -1;
-  if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 0, 100000, 1) != 0) return -1;
+  if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 1, 100000, 1) != 0) return -1;
  if (cfgAddBool(pCfg, "SSE42", tsSSE42Enable, 0) != 0) return -1;
  if (cfgAddBool(pCfg, "AVX", tsAVXEnable, 0) != 0) return -1;
  if (cfgAddBool(pCfg, "AVX2", tsAVX2Enable, 0) != 0) return -1;
  if (cfgAddBool(pCfg, "FMA", tsFMAEnable, 0) != 0) return -1;
  if (cfgAddBool(pCfg, "SIMD-Supported", tsSIMDEnable, 0) != 0) return -1;
  if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, 1) != 0) return -1;
  if (cfgAddInt64(pCfg, "streamMax", tsStreamMax, 0, INT64_MAX, 1) != 0) return -1;
  if (cfgAddInt32(pCfg, "pageSizeKB", tsPageSizeKB, 0, INT64_MAX, 1) != 0) return -1;
--- a/source/libs/function/src/detail/tavgfunction.c
+++ b/source/libs/function/src/detail/tavgfunction.c
@ -13,7 +13,6 @@
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 #include <immintrin.h>
 #include "builtinsimpl.h"
 #include "function.h"
 #include "tdatablock.h"
@ -49,11 +48,48 @@ typedef struct SAvgRes {
  int16_t type;  // store the original input type, used in merge function
 } SAvgRes;
 /**
  * Add the float values of the current input window to pRes->sum.dsum.
  *
  * The window is plist[startRowIndex .. startRowIndex + numOfRows), i.e. the same range the scalar
  * loops in handleFloatCols() iterate over. When the file is compiled with AVX the bulk of the window
  * is summed eight lanes at a time; whatever is left (or the whole window when AVX is not compiled in)
  * is added with a plain scalar loop, so the result is never silently dropped.
  *
  * @param pInput  input descriptor; only numOfRows and startRowIndex are read
  * @param plist   base address of the column data (not yet offset by startRowIndex)
  * @param pRes    accumulator; only sum.dsum is updated, the caller maintains the row count
  */
 static void floatVectorSumAVX(const SInputColumnInfoData* pInput, const float* plist, SAvgRes* pRes) {
   const int32_t numOfRows = pInput->numOfRows;
   // honour the window offset so the vector path agrees with the scalar path
   const float*  p = plist + pInput->startRowIndex;
   int32_t       done = 0;  // number of leading elements already added to the sum

 #if __AVX__
   const int32_t width = 8;  // float lanes in one 256-bit register
   const int32_t rounds = numOfRows / width;

   __m256 sum = _mm256_setzero_ps();
   for (int32_t i = 0; i < rounds; ++i) {
     // unaligned load: the column buffer is not guaranteed to be 32-byte aligned
     sum = _mm256_add_ps(sum, _mm256_loadu_ps(p + i * width));
   }

   // spill the register to memory instead of aliasing it through a pointer cast
   float lanes[8];
   _mm256_storeu_ps(lanes, sum);
   pRes->sum.dsum += lanes[0] + lanes[1] + lanes[2] + lanes[3] + lanes[4] + lanes[5] + lanes[6] + lanes[7];

   if (rounds > 0) {
     done = rounds * width;
   }
 #endif

   // remaining tail elements (everything, when the AVX path is compiled out)
   for (int32_t j = done; j < numOfRows; ++j) {
     pRes->sum.dsum += p[j];
   }
 }
 static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnInfoData* pInput, SAvgRes* pRes) {
  int32_t numOfElems = 0;
  float*  plist = (float*)pCol->pData;
-  if (pCol->hasNull || pInput->numOfRows < 8) {
+  const int32_t THRESHOLD_SIZE = 8;
  if (pCol->hasNull || pInput->numOfRows <= THRESHOLD_SIZE) {
    for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
      if (colDataIsNull_f(pCol->nullbitmap, i)) {
        continue;
@ -67,46 +103,13 @@ static int32_t handleFloatCols(const SColumnInfoData* pCol, const SInputColumnIn
    numOfElems = pInput->numOfRows;
    pRes->count += pInput->numOfRows;
-    // 1. an software version to speedup the process by using loop unwinding.
+    // 3. If the CPU supports AVX, let's employ AVX instructions to speedup this loop
-
+    if (tsAVXEnable && tsSIMDEnable) {
-
+      floatVectorSumAVX(pInput, plist, pRes);
-
+    } else {
-    // 2. if both the CPU and OS support SSE4.2, let's try the faster version by using SSE4.2 SIMD
+      for (int32_t i = pInput->startRowIndex; i < pInput->numOfRows + pInput->startRowIndex; ++i) {
-
+        pRes->sum.dsum += plist[i];
    // 3. If both the CPU and OS support AVX, let's employ AVX instruction to speedup this loop
    // 3.1 find the start position that are aligned to 32bytes address in memory
    int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1);
    int32_t i = 0;
    int32_t bitWidth = 8;
    int32_t remain = (pInput->numOfRows - startElem) % bitWidth;
    int32_t rounds = (pInput->numOfRows - startElem) / bitWidth;
    const float* p = &plist[startElem];
    __m256 loadVal;
    __m256 sum = _mm256_setzero_ps();
    for(; i < rounds; ++i) {
      loadVal = _mm256_loadu_ps(p);
      sum = _mm256_add_ps(sum, loadVal);
      p += bitWidth;
      }
    // let sum up the final results
    const float* q = (const float*)&sum;
    pRes->sum.dsum += q[0] + q[1] + q[2] + q[3] + q[4] + q[5] + q[6] + q[7];
    // calculate the front and the reminder items in array list
    for(int32_t j = 0; j < startElem; ++j) {
      pRes->sum.dsum += plist[j];
    }
    startElem += rounds * bitWidth;
    for(int32_t j = 0; j < remain; ++j) {
      pRes->sum.dsum += plist[j + startElem];
    }
  }
--- a/source/libs/function/src/detail/tminmax.c
+++ b/source/libs/function/src/detail/tminmax.c
@ -13,20 +13,163 @@
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 #include <immintrin.h>
 #include "builtinsimpl.h"
 #include "function.h"
 #include "tdatablock.h"
 #include "tfunctionInt.h"
 #include "tglobal.h"
 /**
  * Return the minimum (isMinFunc == true) or maximum (isMinFunc == false) of pData[0 .. numOfRows).
  *
  * When the file is compiled with AVX2, full groups of eight values are reduced with 256-bit
  * min/max instructions; the remaining tail (or the whole array when AVX2 is not compiled in, or
  * when fewer than eight values are supplied) is handled by a scalar loop. No element outside
  * pData[0 .. numOfRows) is ever read.
  *
  * @param pData      dense int32 values, no NULL entries
  * @param numOfRows  number of values in pData
  * @param isMinFunc  true to compute the minimum, false to compute the maximum
  * @return the min/max value, or 0 when there is nothing to compare
  */
 static int32_t i32VectorCmpAVX2(const int32_t* pData, int32_t numOfRows, bool isMinFunc) {
   if (pData == NULL || numOfRows <= 0) {
     return 0;  // nothing to compare; 0 is the value this function has always defaulted to
   }

   int32_t v = pData[0];
   int32_t next = 1;  // index of the first element that has not been folded into v yet

 #if __AVX2__
   const int32_t width = 8;  // int32 lanes in one 256-bit register
   const int32_t rounds = numOfRows / width;

   if (rounds > 0) {
     // The first vector seeds the accumulator, so only (rounds - 1) further loads are required.
     __m256i acc = _mm256_loadu_si256((const __m256i*)pData);
     if (isMinFunc) {
       for (int32_t i = 1; i < rounds; ++i) {
         acc = _mm256_min_epi32(acc, _mm256_loadu_si256((const __m256i*)(pData + i * width)));
       }
     } else {
       for (int32_t i = 1; i < rounds; ++i) {
         acc = _mm256_max_epi32(acc, _mm256_loadu_si256((const __m256i*)(pData + i * width)));
       }
     }

     // reduce the eight lanes to one value; spill to memory rather than aliasing the register
     int32_t lanes[8];
     _mm256_storeu_si256((__m256i*)lanes, acc);
     v = lanes[0];
     for (int32_t k = 1; k < width; ++k) {
       if (isMinFunc ? (lanes[k] < v) : (lanes[k] > v)) {
         v = lanes[k];
       }
     }

     next = rounds * width;
   }
 #endif

   // scalar pass over the tail, indexed from the start of the array
   for (int32_t j = next; j < numOfRows; ++j) {
     if (isMinFunc ? (pData[j] < v) : (pData[j] > v)) {
       v = pData[j];
     }
   }

   return v;
 }
 /**
  * Return the minimum (isMinFunc == true) or maximum (isMinFunc == false) of pData[0 .. numOfRows).
  *
  * When the file is compiled with AVX, full groups of eight values are reduced with 256-bit
  * min/max instructions; the remaining tail (or the whole array when AVX is not compiled in, or
  * when fewer than eight values are supplied) is handled by a scalar loop. No element outside
  * pData[0 .. numOfRows) is ever read.
  *
  * NOTE(review): NaN inputs are not normalized here; the result follows whatever the underlying
  * vector/scalar comparisons produce — confirm callers never pass NaN if that matters.
  *
  * @param pData      dense float values, no NULL entries
  * @param numOfRows  number of values in pData
  * @param isMinFunc  true to compute the minimum, false to compute the maximum
  * @return the min/max value, or 0 when there is nothing to compare
  */
 static float floatVectorCmpAVX(const float* pData, int32_t numOfRows, bool isMinFunc) {
   if (pData == NULL || numOfRows <= 0) {
     return 0;  // nothing to compare; 0 is the value this function has always defaulted to
   }

   float   v = pData[0];
   int32_t next = 1;  // index of the first element that has not been folded into v yet

 #if __AVX__
   const int32_t width = 8;  // float lanes in one 256-bit register
   const int32_t rounds = numOfRows / width;

   if (rounds > 0) {
     // The first vector seeds the accumulator, so only (rounds - 1) further loads are required.
     __m256 acc = _mm256_loadu_ps(pData);
     if (isMinFunc) {
       for (int32_t i = 1; i < rounds; ++i) {
         acc = _mm256_min_ps(acc, _mm256_loadu_ps(pData + i * width));
       }
     } else {
       for (int32_t i = 1; i < rounds; ++i) {
         acc = _mm256_max_ps(acc, _mm256_loadu_ps(pData + i * width));
       }
     }

     // reduce the eight lanes to one value; spill to memory rather than aliasing the register
     float lanes[8];
     _mm256_storeu_ps(lanes, acc);
     v = lanes[0];
     for (int32_t k = 1; k < width; ++k) {
       if (isMinFunc ? (lanes[k] < v) : (lanes[k] > v)) {
         v = lanes[k];
       }
     }

     next = rounds * width;
   }
 #endif

   // scalar pass over the tail, indexed from the start of the array
   for (int32_t j = next; j < numOfRows; ++j) {
     if (isMinFunc ? (pData[j] < v) : (pData[j] > v)) {
       v = pData[j];
     }
   }

   return v;
 }
 static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numOfRows, SqlFunctionCtx* pCtx,
                              SMinmaxResInfo* pBuf, bool isMinFunc) {
  int32_t* pData = (int32_t*)pCol->pData;
  int32_t* val = (int32_t*)&pBuf->v;
  int32_t numOfElems = 0;
-  if (pCol->hasNull || numOfRows < 8 || pCtx->subsidiaries.num > 0) {
+  if (pCol->hasNull || numOfRows <= 8 || pCtx->subsidiaries.num > 0) {
    if (isMinFunc) {  // min
      for (int32_t i = start; i < start + numOfRows; ++i) {
        if (colDataIsNull_f(pCol->nullbitmap, i)) {
@ -77,79 +220,30 @@ static int32_t handleInt32Col(SColumnInfoData* pCol, int32_t start, int32_t numO
      }
    }
  } else { // not has null value
-    // 1. software version
+    // AVX2 version to speedup the loop
-
+    if (tsAVX2Enable && tsSIMDEnable) {
-
+      *val = i32VectorCmpAVX2(pData, numOfRows, isMinFunc);
-
+    } else {
-
+      if (!pBuf->assign) {
-    // 3. AVX2 version to speedup the loop
+        *val = pData[0];
-    int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1);
+        pBuf->assign = true;
    int32_t i = 0;
    int32_t bitWidth = 8;
    int32_t v = 0;
    int32_t remain = (numOfRows - startElem) % bitWidth;
    int32_t rounds = (numOfRows - startElem) / bitWidth;
    const int32_t* p = &pData[startElem];
    __m256i next;
    __m256i initialVal = _mm256_loadu_si256((__m256i*)p);
    p += bitWidth;
    if (!isMinFunc) {  // max function
      for (; i < rounds; ++i) {
        next = _mm256_loadu_si256((__m256i*)p);
        initialVal = _mm256_max_epi32(initialVal, next);
        p += bitWidth;
      }
-      // let sum up the final results
+      if (isMinFunc) {  // min
-      const int32_t* q = (const int32_t*)&initialVal;
+        for (int32_t i = start; i < start + numOfRows; ++i) {
-
+          if (*val > pData[i]) {
-      v = TMAX(q[0], q[1]);
+            *val = pData[i];
      v = TMAX(v, q[2]);
      v = TMAX(v, q[3]);
      v = TMAX(v, q[4]);
      v = TMAX(v, q[5]);
      v = TMAX(v, q[6]);
      v = TMAX(v, q[7]);
      // calculate the front and the reminder items in array list
      startElem += rounds * bitWidth;
      for (int32_t j = 0; j < remain; ++j) {
        if (v < p[j + startElem]) {
          v = p[j + startElem];
          }
        }
-    } else {  // min function
+      } else {  // max
-      for (; i < rounds; ++i) {
+        for (int32_t i = start; i < start + numOfRows; ++i) {
-        next = _mm256_loadu_si256((__m256i*)p);
+          if (*val < pData[i]) {
-        initialVal = _mm256_min_epi32(initialVal, next);
+            *val = pData[i];
        p += bitWidth;
          }
      // let sum up the final results
      const int32_t* q = (const int32_t*)&initialVal;
      v = TMIN(q[0], q[1]);
      v = TMIN(v, q[2]);
      v = TMIN(v, q[3]);
      v = TMIN(v, q[4]);
      v = TMIN(v, q[5]);
      v = TMIN(v, q[6]);
      v = TMIN(v, q[7]);
      // calculate the front and the reminder items in array list
      startElem += rounds * bitWidth;
      for (int32_t j = 0; j < remain; ++j) {
        if (v > p[j + startElem]) {
          v = p[j + startElem];
        }
      }
    }
    *val = v;
    numOfElems = numOfRows;
  }
@ -213,79 +307,30 @@ static int32_t handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numO
      }
    }
  } else { // not has null value
-    // 1. software version
+    // AVX version to speedup the loop
-
+    if (tsAVXEnable && tsSIMDEnable) {
-
+      *val = (double) floatVectorCmpAVX(pData, numOfRows, isMinFunc);
-
+    } else {
-
+      if (!pBuf->assign) {
-    // 3. AVX2 version to speedup the loop
+        *val = pData[0];
-    int32_t startElem = 0;//((uint64_t)plist) & ((1<<8u)-1);
+        pBuf->assign = true;
    int32_t i = 0;
    int32_t bitWidth = 8;
    float v = 0;
    int32_t remain = (numOfRows - startElem) % bitWidth;
    int32_t rounds = (numOfRows - startElem) / bitWidth;
    const float* p = &pData[startElem];
    __m256 next;
    __m256 initialVal = _mm256_loadu_ps(p);
    p += bitWidth;
    if (!isMinFunc) {  // max function
      for (; i < rounds; ++i) {
        next = _mm256_loadu_ps(p);
        initialVal = _mm256_max_ps(initialVal, next);
        p += bitWidth;
      }
-      // let sum up the final results
+      if (isMinFunc) {  // min
-      const float* q = (const float*)&initialVal;
+        for (int32_t i = start; i < start + numOfRows; ++i) {
-
+          if (*val > pData[i]) {
-      v = TMAX(q[0], q[1]);
+            *val = pData[i];
      v = TMAX(v, q[2]);
      v = TMAX(v, q[3]);
      v = TMAX(v, q[4]);
      v = TMAX(v, q[5]);
      v = TMAX(v, q[6]);
      v = TMAX(v, q[7]);
      // calculate the front and the reminder items in array list
      startElem += rounds * bitWidth;
      for (int32_t j = 0; j < remain; ++j) {
        if (v < p[j + startElem]) {
          v = p[j + startElem];
          }
        }
-    } else {  // min function
+      } else {  // max
-      for (; i < rounds; ++i) {
+        for (int32_t i = start; i < start + numOfRows; ++i) {
-        next = _mm256_loadu_ps(p);
+          if (*val < pData[i]) {
-        initialVal = _mm256_min_ps(initialVal, next);
+            *val = pData[i];
        p += bitWidth;
          }
      // let sum up the final results
      const float* q = (const float*)&initialVal;
      v = TMIN(q[0], q[1]);
      v = TMIN(v, q[2]);
      v = TMIN(v, q[3]);
      v = TMIN(v, q[4]);
      v = TMIN(v, q[5]);
      v = TMIN(v, q[6]);
      v = TMIN(v, q[7]);
      // calculate the front and the reminder items in array list
      startElem += rounds * bitWidth;
      for (int32_t j = 0; j < remain; ++j) {
        if (v > p[j + startElem]) {
          v = p[j + startElem];
        }
      }
    }
    *val = v;
    numOfElems = numOfRows;
  }
--- a/source/os/src/osEnv.c
+++ b/source/os/src/osEnv.c
@ -37,6 +37,12 @@ float           tsNumOfCores = 0;
 int64_t         tsTotalMemoryKB = 0;
 char           *tsProcPath = NULL;
 char            tsSIMDEnable = 0;
 char            tsSSE42Enable = 0;
 char            tsAVXEnable = 0;
 char            tsAVX2Enable = 0;
 char            tsFMAEnable = 0;
 void osDefaultInit() {
  taosSeedRand(taosSafeRand());
  taosGetSystemLocale(tsLocale, tsCharset);
@ -99,7 +105,7 @@ bool osDataSpaceSufficient() { return tsDataSpace.size.avail > tsDataSpace.reser
 // Tells whether the temp directory still has headroom: true only while the available size
 // recorded in tsTempSpace is strictly larger than its reserved amount.
 bool osTempSpaceSufficient() {
   return tsTempSpace.reserved < tsTempSpace.size.avail;
 }
-void osSetTimezone(const char *timezone) { taosSetSystemTimezone(timezone, tsTimezoneStr, &tsDaylight, &tsTimezone); }
+void osSetTimezone(const char *tz) { taosSetSystemTimezone(tz, tsTimezoneStr, &tsDaylight, &tsTimezone); }
 void osSetSystemLocale(const char *inLocale, const char *inCharSet) {
  memcpy(tsLocale, inLocale, strlen(inLocale) + 1);
--- a/source/os/src/osFile.c
+++ b/source/os/src/osFile.c
@ -775,6 +775,7 @@ int64_t taosGetLineFile(TdFilePtr pFile, char **__restrict ptrBuf) {
  return getline(ptrBuf, &len, pFile->fp);
 #endif
 }
 int64_t taosGetsFile(TdFilePtr pFile, int32_t maxSize, char *__restrict buf) {
  if (pFile == NULL || buf == NULL) {
    return -1;
@ -785,6 +786,7 @@ int64_t taosGetsFile(TdFilePtr pFile, int32_t maxSize, char *__restrict buf) {
  }
  return strlen(buf);
 }
 int32_t taosEOFFile(TdFilePtr pFile) {
  if (pFile == NULL) {
    return 0;
--- a/source/os/src/osLocale.c
+++ b/source/os/src/osLocale.c
@ -67,6 +67,9 @@ char *taosCharsetReplace(char *charsetstr) {
 }
 /**
 * TODO: here we may employ the systemctl API to set/get the correct locale on Linux. In some cases, setlocale
 *  does not seem to respond as expected.
 *
 * In some Linux systems, setLocale(LC_CTYPE, "") may return NULL, in which case the launch of
 * both the TDengine Server and the Client may be interrupted.
 *
@ -148,7 +151,7 @@ void taosGetSystemLocale(char *outLocale, char *outCharset) {
   *
   * example: en_US.UTF-8, zh_CN.GB18030, zh_CN.UTF-8,
   *
-   * if user does not specify the locale in taos.cfg the program use default LC_CTYPE as system locale.
+   * If user does not specify the locale in taos.cfg, the program then uses default LC_CTYPE as system locale.
   *
   * In case of some CentOS systems, their default locale is "en_US.utf8", which is not valid code_page
   * for libiconv that is employed to convert string in this system. This program will automatically use
--- a/source/os/src/osSysinfo.c
+++ b/source/os/src/osSysinfo.c
@ -155,8 +155,8 @@ static int32_t taosGetSysCpuInfo(SysCpuInfo *cpuInfo) {
  }
  char    line[1024];
-  ssize_t _bytes = taosGetsFile(pFile, sizeof(line), line);
+  ssize_t bytes = taosGetsFile(pFile, sizeof(line), line);
-  if ((_bytes < 0) || (line == NULL)) {
+  if (bytes < 0) {
    taosCloseFile(&pFile);
    return -1;
  }
@ -193,9 +193,9 @@ static int32_t taosGetProcCpuInfo(ProcCpuInfo *cpuInfo) {
    return -1;
  }
-  char    line[1024];
+  char    line[1024] = {0};
-  ssize_t _bytes = taosGetsFile(pFile, sizeof(line), line);
+  ssize_t bytes = taosGetsFile(pFile, sizeof(line), line);
-  if ((_bytes < 0) || (line == NULL)) {
+  if (bytes < 0) {
    taosCloseFile(&pFile);
    return -1;
  }
@ -239,6 +239,7 @@ void taosGetSystemInfo() {
  taosGetCpuCores(&tsNumOfCores);
  taosGetTotalMemory(&tsTotalMemoryKB);
  taosGetCpuUsage(NULL, NULL);
  taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Enable, &tsFMAEnable);
 #endif
 }
@ -366,7 +367,7 @@ int32_t taosGetCpuInfo(char *cpuModel, int32_t maxLen, float *numOfCores) {
  return code;
 #else
-  char    line[1024];
+  char    line[1024] = {0};
  size_t  size = 0;
  int32_t done = 0;
  int32_t code = -1;
@ -468,6 +469,46 @@ void taosGetCpuUsage(double *cpu_system, double *cpu_engine) {
  }
 }
 // Run the CPUID instruction for leaf `level` with ECX explicitly zeroed beforehand (the leading
 // `xor`), and store the resulting EAX/EBX/ECX/EDX values into a, b, c and d respectively.
 // `level` is passed in through the same register as the first output (EAX).
 #define __cpuid_fix(level, a, b, c, d) \
              __asm__("xor %%ecx, %%ecx\n" \
                      "cpuid\n" \
                      : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \
                      : "0"(level))
 // TODO: implement CPU feature detection for Windows and macOS.
 /**
  * Detect which SIMD-related instruction sets the running CPU reports via CPUID.
  *
  * Every output flag is first cleared, then set to 1 for each feature that CPUID reports. As a side
  * effect the global tsSIMDEnable is set according to whether this binary was compiled with
  * AVX/AVX2 enabled, because the vectorized code paths only exist in such builds.
  *
  * @param sse42  out: 1 if SSE4.2 is reported, else 0
  * @param avx    out: 1 if AVX is reported, else 0
  * @param avx2   out: 1 if AVX2 is reported, else 0
  * @param fma    out: 1 if FMA is reported, else 0
  * @return 0 on success (including platforms where detection is not implemented, in which case
  *         all flags stay 0), -1 if CPUID leaf 1 could not be queried
  */
 int32_t taosGetCpuInstructions(char* sse42, char* avx, char* avx2, char* fma) {
   // Start from "not supported" so callers never observe an unset flag on any return path.
   *sse42 = 0;
   *avx = 0;
   *avx2 = 0;
   *fma = 0;

 #ifdef WINDOWS
   return 0;  // detection not implemented yet; report no features
 #elif defined(_TD_DARWIN_64)
   return 0;  // detection not implemented yet; report no features
 #elif !(defined(__x86_64__) || defined(__i386__))
   return 0;  // CPUID only exists on x86; report no features elsewhere
 #else
   // The SIMD code paths are compiled only when the compiler targets AVX/AVX2, so the global
   // switch has to mirror the compile-time flags regardless of what the CPU supports.
 #if __AVX__ || __AVX2__
   tsSIMDEnable = true;
 #else
   tsSIMDEnable = false;
 #endif

   uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

   int32_t ret = __get_cpuid(1, &eax, &ebx, &ecx, &edx);
   if (ret == 0) {
     return -1;  // failed to get the cpuid info
   }

   *sse42 = (char) ((ecx & bit_SSE4_2) == bit_SSE4_2);
   *avx   = (char) ((ecx & bit_AVX) == bit_AVX);
   *fma   = (char) ((ecx & bit_FMA) == bit_FMA);

   // AVX2 lives in leaf 7; querying a leaf above the supported maximum returns unrelated data,
   // so only ask when the CPU advertises that leaf.
   if (__get_cpuid_max(0, NULL) >= 7) {
     // work around a bug in GCC.
     // Ref to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77756
     __cpuid_fix(7u, eax, ebx, ecx, edx);
     *avx2 = (char) ((ebx & bit_AVX2) == bit_AVX2);
   }

   return 0;
 #endif
 }
 int32_t taosGetTotalMemory(int64_t *totalKB) {
 #ifdef WINDOWS
  MEMORYSTATUSEX memsStat;
@ -511,11 +552,11 @@ int32_t taosGetProcMemory(int64_t *usedKB) {
    return -1;
  }
-  ssize_t _bytes = 0;
+  ssize_t bytes = 0;
-  char    line[1024];
+  char    line[1024] = {0};
  while (!taosEOFFile(pFile)) {
-    _bytes = taosGetsFile(pFile, sizeof(line), line);
+    bytes = taosGetsFile(pFile, sizeof(line), line);
-    if ((_bytes < 0) || (line == NULL)) {
+    if (bytes < 0) {
      break;
    }
    if (strstr(line, "VmRSS:") != NULL) {
@ -523,7 +564,7 @@ int32_t taosGetProcMemory(int64_t *usedKB) {
    }
  }
-  if (line == NULL) {
+  if (strlen(line) < 0) {
    // printf("read file:%s failed", tsProcMemFile);
    taosCloseFile(&pFile);
    return -1;
@ -624,14 +665,14 @@ int32_t taosGetProcIO(int64_t *rchars, int64_t *wchars, int64_t *read_bytes, int
  TdFilePtr pFile = taosOpenFile(tsProcIOFile, TD_FILE_READ | TD_FILE_STREAM);
  if (pFile == NULL) return -1;
-  ssize_t _bytes = 0;
+  ssize_t bytes = 0;
-  char    line[1024];
+  char    line[1024] = {0};
  char    tmp[24];
  int     readIndex = 0;
  while (!taosEOFFile(pFile)) {
-    _bytes = taosGetsFile(pFile, sizeof(line), line);
+    bytes = taosGetsFile(pFile, sizeof(line), line);
-    if (_bytes < 10 || line == NULL) {
+    if (bytes < 10) {
      break;
    }
    if (strstr(line, "rchar:") != NULL) {
--- a/source/os/src/osTime.c
+++ b/source/os/src/osTime.c
@ -339,7 +339,7 @@ char *taosStrpTime(const char *buf, const char *fmt, struct tm *tm) {
 #endif
 }
-FORCE_INLINE int32_t taosGetTimeOfDay(struct timeval *tv) {
+int32_t taosGetTimeOfDay(struct timeval *tv) {
 #ifdef WINDOWS
  time_t t;
  t = taosGetTimestampSec();
@ -455,6 +455,7 @@ static int isLeapYear(time_t year) {
  else
    return 1;
 }
 struct tm *taosLocalTimeNolock(struct tm *result, const time_t *timep, int dst) {
  if (result == NULL) {
    return localtime(timep);
@ -542,7 +543,9 @@ struct tm *taosLocalTimeNolock(struct tm *result, const time_t *timep, int dst)
 #endif
  return result;
 }
 // Current calendar time in whole seconds, as reported by time(NULL), narrowed to 32 bits.
 int32_t taosGetTimestampSec() {
   const time_t now = time(NULL);
   return (int32_t)now;
 }
 int32_t taosClockGetTime(int clock_id, struct timespec *pTS) {
 #ifdef WINDOWS
  LARGE_INTEGER        t;
--- a/source/util/src/tconfig.c
+++ b/source/util/src/tconfig.c
@ -561,13 +561,13 @@ void cfgDumpCfg(SConfig *pCfg, bool tsc, bool dump) {
    if (dump && strcmp(pItem->name, "scriptDir") == 0) continue;
    if (dump && strcmp(pItem->name, "simDebugFlag") == 0) continue;
    tstrncpy(src, cfgStypeStr(pItem->stype), CFG_SRC_PRINT_LEN);
-    for (int32_t i = 0; i < CFG_SRC_PRINT_LEN; ++i) {
+    for (int32_t j = 0; j < CFG_SRC_PRINT_LEN; ++j) {
-      if (src[i] == 0) src[i] = ' ';
+      if (src[j] == 0) src[j] = ' ';
    }
    tstrncpy(name, pItem->name, CFG_NAME_PRINT_LEN);
-    for (int32_t i = 0; i < CFG_NAME_PRINT_LEN; ++i) {
+    for (int32_t j = 0; j < CFG_NAME_PRINT_LEN; ++j) {
-      if (name[i] == 0) name[i] = ' ';
+      if (name[j] == 0) name[j] = ' ';
    }
    switch (pItem->dtype) {
--- a/source/util/src/tcrc32c.c
+++ b/source/util/src/tcrc32c.c
@ -24,7 +24,6 @@
 #endif
 #include "tcrc32c.h"
 #include "tdef.h"
 #define POLY        0x82f63b78
 #define LONG_SHIFT  8192