From 6e2a5b9292c30eab74f9414fa510cb58502abbbd Mon Sep 17 00:00:00 2001
From: Kai Jiang <kai.jiang@intel.com>
Date: Tue, 23 Jan 2024 15:14:04 +0800
Subject: [PATCH 1/5] Improve tsDecompressIntImpl_Hw performance with AVX512.

Signed-off-by: Kai Jiang <kai.jiang@intel.com>
               Huanrui Zhang <huanruix.zhang@intel.com>
---
 cmake/cmake.define            |  12 ++--
 source/util/src/tdecompress.c | 110 +++++++++++++++++++++++++++++-----
 2 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/cmake/cmake.define b/cmake/cmake.define
index 12e1b50539..735735f0cc 100644
--- a/cmake/cmake.define
+++ b/cmake/cmake.define
@@ -181,12 +181,12 @@ ELSE ()
         ENDIF()
         MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED")
 
-#        IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
-#            SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
-#            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
-#            MESSAGE(STATUS "avx512f/avx512bmi supported by compiler")
-#        ENDIF()
-#
+        IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
+            SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
+            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
+            MESSAGE(STATUS "avx512f/avx512bmi supported by compiler")
+        ENDIF()
+
 #        IF (COMPILER_SUPPORT_AVX512VL)
 #            SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl")
 #            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl")
diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c
index f212bf5231..d3cb3118d2 100644
--- a/source/util/src/tdecompress.c
+++ b/source/util/src/tdecompress.c
@@ -52,7 +52,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
   int32_t     _pos = 0;
   int64_t     prevValue = 0;
 
-#if __AVX2__
+#if __AVX2__ || __AVX512F__
   while (_pos < nelements) {
     uint64_t w = *(uint64_t*) ip;
 
@@ -72,10 +72,33 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
         int32_t gRemainder = (nelements - _pos);
         int32_t num = (gRemainder > elems)? elems:gRemainder;
 
-        int32_t batch = num >> 2;
-        int32_t remain = num & 0x03;
+        int32_t batch = 0;
+        int32_t remain = 0;
+        if (tsSIMDEnable && tsAVX512Enable) {
+#if __AVX512F__
+        batch = num >> 3;
+        remain = num & 0x07;
+#endif
+        } else if (tsSIMDEnable && tsAVX2Enable) {
+#if __AVX2__
+        batch = num >> 2;
+        remain = num & 0x03;
+#endif
+        }
+
         if (selector == 0 || selector == 1) {
-          if (tsSIMDEnable && tsAVX2Enable) {
+          if (tsSIMDEnable && tsAVX512Enable) {
+#if __AVX512F__
+            for (int32_t i = 0; i < batch; ++i) {
+            __m512i prev = _mm512_set1_epi64(prevValue);
+            _mm512_storeu_si512((__m512i *)&p[_pos], prev);
+            _pos += 8;	//handle 64bit x 8 = 512bit
+            }
+            for (int32_t i = 0; i < remain; ++i) {
+            p[_pos++] = prevValue;
+            }
+#endif
+          } else if (tsSIMDEnable && tsAVX2Enable) {
             for (int32_t i = 0; i < batch; ++i) {
               __m256i prev = _mm256_set1_epi64x(prevValue);
               _mm256_storeu_si256((__m256i *)&p[_pos], prev);
@@ -85,10 +108,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
             for (int32_t i = 0; i < remain; ++i) {
               p[_pos++] = prevValue;
             }
-          } else if (tsSIMDEnable && tsAVX512Enable) {
-#if __AVX512F__
-            // todo add avx512 impl
-#endif
+
           } else { // alternative implementation without SIMD instructions.
             for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
               p[_pos++] = prevValue;
@@ -96,7 +116,73 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
             }
           }
         } else {
-          if (tsSIMDEnable && tsAVX2Enable) {
+          if (tsSIMDEnable && tsAVX512Enable) {
+ #if __AVX512F__
+            __m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0);
+            __m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1);
+            __m512i sum_mask3 = _mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3);
+            __m512i base = _mm512_set1_epi64(w);
+            __m512i maskVal = _mm512_set1_epi64(mask);
+            __m512i shiftBits = _mm512_set_epi64(bit * 7 + 4, bit * 6 + 4, bit * 5 + 4, bit * 4 + 4, bit * 3 + 4, bit * 2 + 4, bit + 4, 4);
+            __m512i inc = _mm512_set1_epi64(bit << 3);
+
+            for (int32_t i = 0; i < batch; ++i) {
+
+            __m512i after = _mm512_srlv_epi64(base, shiftBits);
+            __m512i zigzagVal = _mm512_and_si512(after, maskVal);
+
+            // ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
+            __m512i signmask = _mm512_and_si512(_mm512_set1_epi64(1), zigzagVal);
+            signmask = _mm512_sub_epi64(_mm512_setzero_si512(), signmask);
+            __m512i delta = _mm512_xor_si512(_mm512_srli_epi64(zigzagVal, 1), signmask);
+
+            // calculate the cumulative sum (prefix sum) for each number
+            // decode[0] =  prevValue + final[0]
+            // decode[1] = decode[0] + final[1]   -----> prevValue + final[0] + final[1]
+            // decode[2] = decode[1] + final[2]   -----> prevValue + final[0] + final[1] + final[2]
+            // decode[3] = decode[2] + final[3]   -----> prevValue + final[0] + final[1] + final[2] + final[3]
+
+
+            //7		6		5		4		3		2		1		0
+            //D7		D6		D5		D4		D3		D2		D1		D0
+            //D6		0		D4		0		D2		0		D0		0
+            //D7+D6		D6		D5+D4		D4		D3+D2		D2		D1+D0		D0
+            //13		6		9		4		5		2		1		0
+            __m512i prev = _mm512_set1_epi64(prevValue);
+            __m512i cum_sum = _mm512_add_epi64(delta, _mm512_maskz_permutexvar_epi64(0xaa, sum_mask1, delta));
+            cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xcc, sum_mask2, cum_sum));
+            cum_sum = _mm512_add_epi64(cum_sum, _mm512_maskz_permutexvar_epi64(0xf0, sum_mask3, cum_sum));
+
+
+
+            //13		6		9		4		5		2		1		0
+            //D7,D6		D6		D5,D4		D4		D3,D2		D2		D1,D0		D0
+            //+D5,D4	D5,D4,		0		0		D1,D0		D1,D0		0		0
+            //D7~D4		D6~D4		D5~D4		D4		D3~D0		D2~D0		D1~D0		D0
+            //22		15		9		4		6		3		1		0
+            //
+            //D3~D0		D3~D0		D3~D0		D3~D0		0		0		0		0
+            //28		21		15		10		6		3		1		0
+
+
+            cum_sum = _mm512_add_epi64(cum_sum, prev);
+            _mm512_storeu_si512((__m512i *)&p[_pos], cum_sum);
+
+            shiftBits = _mm512_add_epi64(shiftBits, inc);
+            prevValue = p[_pos + 7];
+            _pos += 8;
+
+            }
+            // handle the remain value
+            for (int32_t i = 0; i < remain; i++) {
+            zigzag_value = ((w >> (v + (batch * bit * 8))) & mask);
+            prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
+
+            p[_pos++] = prevValue;
+            v += bit;
+            }
+#endif
+          } else if (tsSIMDEnable && tsAVX2Enable) {
             __m256i base = _mm256_set1_epi64x(w);
             __m256i maskVal = _mm256_set1_epi64x(mask);
 
@@ -157,10 +243,6 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
               p[_pos++] = prevValue;
               v += bit;
             }
-          } else if (tsSIMDEnable && tsAVX512Enable) {
-#if __AVX512F__
-            // todo add avx512 impl
-#endif
           } else {  // alternative implementation without SIMD instructions.
             for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
               zigzag_value = ((w >> v) & mask);
@@ -507,4 +589,4 @@ int32_t tsDecompressTimestampAvx512(const char *const input, const int32_t nelem
 
 #endif
   return 0;
-}
\ No newline at end of file
+}

From 04281bcd07388b21255e433002b9178f2e8a5005 Mon Sep 17 00:00:00 2001
From: Haojun Liao <hjliao@taosdata.com>
Date: Wed, 19 Jun 2024 15:06:09 +0800
Subject: [PATCH 2/5] fix(util): uncomment the timestamp decode function with
 AVX2 instructions, and do some internal refactor.

---
 cmake/cmake.define                        | 14 +++---
 include/os/osEnv.h                        |  7 +--
 source/client/test/clientTests.cpp        |  9 +---
 source/common/src/tglobal.c               |  7 +--
 source/libs/function/src/detail/tminmax.c |  6 +--
 source/os/src/osEnv.c                     |  7 +--
 source/os/src/osSysinfo.c                 |  2 +-
 source/util/src/tcompression.c            | 10 ++---
 source/util/src/tdecompress.c             | 21 ++++-----
 source/util/test/decompressTest.cpp       | 54 ++++++++++++++++++++++-
 10 files changed, 92 insertions(+), 45 deletions(-)

diff --git a/cmake/cmake.define b/cmake/cmake.define
index 735735f0cc..f1a5cef67e 100644
--- a/cmake/cmake.define
+++ b/cmake/cmake.define
@@ -180,18 +180,20 @@ ELSE ()
             SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
         ENDIF()
         MESSAGE(STATUS "SIMD instructions (FMA/AVX/AVX2) is ACTIVATED")
+    ENDIF()
 
+    IF ("${SIMD_AVX512_SUPPORT}" MATCHES "true")
         IF (COMPILER_SUPPORT_AVX512F AND COMPILER_SUPPORT_AVX512BMI)
             SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512vbmi")
             SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vbmi")
-            MESSAGE(STATUS "avx512f/avx512bmi supported by compiler")
+            MESSAGE(STATUS "avx512f/avx512bmi enabled by compiler")
         ENDIF()
 
-#        IF (COMPILER_SUPPORT_AVX512VL)
-#            SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl")
-#            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl")
-#        MESSAGE(STATUS "avx512vl supported by compiler")
-#        ENDIF()
+        IF (COMPILER_SUPPORT_AVX512VL)
+            SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vl")
+            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vl")
+        MESSAGE(STATUS "avx512vl enabled by compiler")
+        ENDIF()
     ENDIF()
 
     # build mode
diff --git a/include/os/osEnv.h b/include/os/osEnv.h
index ac4ecd4212..e3e5da59f5 100644
--- a/include/os/osEnv.h
+++ b/include/os/osEnv.h
@@ -39,9 +39,10 @@ extern char           *tsProcPath;
 extern char            tsSIMDEnable;
 extern char            tsSSE42Enable;
 extern char            tsAVXEnable;
-extern char            tsAVX2Enable;
-extern char            tsFMAEnable;
-extern char 	       tsAVX512Enable;
+extern char            tsAVX2Supported;
+extern char            tsFMASupported;
+extern char 	       tsAVX512Supported;
+extern char            tsAVX512Enable;
 extern char            tsTagFilterCache;
 
 extern char configDir[];
diff --git a/source/client/test/clientTests.cpp b/source/client/test/clientTests.cpp
index b5bad92dc4..bbd759b3d1 100644
--- a/source/client/test/clientTests.cpp
+++ b/source/client/test/clientTests.cpp
@@ -828,12 +828,8 @@ TEST(clientCase, projection_query_tables) {
   //    printf("error in create db, reason:%s\n", taos_errstr(pRes));
   //  }
   //  taos_free_result(pRes);
-/*
-  TAOS_RES* pRes = taos_query(pConn, "select last(ts), ts from cache_1.t1");
-//  pRes = taos_query(pConn, "select last(ts), ts from cache_1.no_pk_t1");
-  if (taos_errno(pRes) != 0) {
-    printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
-  }
+
+  pRes= taos_query(pConn, "use abc1");
   taos_free_result(pRes);
 
   pRes = taos_query(pConn, "create table tu using st2 tags(2)");
@@ -868,7 +864,6 @@ TEST(clientCase, projection_query_tables) {
       createNewTable(pConn, i, 100000, 0, pstr);
     }
   }
-*/
 
     pRes = taos_query(pConn, "select * from abc1.st2");
     if (taos_errno(pRes) != 0) {
diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c
index c68dc85c29..5b0361f8ca 100644
--- a/source/common/src/tglobal.c
+++ b/source/common/src/tglobal.c
@@ -596,10 +596,11 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) {
 
   if (cfgAddBool(pCfg, "ssd42", tsSSE42Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
   if (cfgAddBool(pCfg, "avx", tsAVXEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
-  if (cfgAddBool(pCfg, "avx2", tsAVX2Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
-  if (cfgAddBool(pCfg, "fma", tsFMAEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
-  if (cfgAddBool(pCfg, "avx512", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
+  if (cfgAddBool(pCfg, "avx2", tsAVX2Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
+  if (cfgAddBool(pCfg, "fma", tsFMASupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
+  if (cfgAddBool(pCfg, "avx512", tsAVX512Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
   if (cfgAddBool(pCfg, "simdEnable", tsSIMDEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
+  if (cfgAddBool(pCfg, "AVX512Enable", tsAVX512Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
   if (cfgAddBool(pCfg, "tagFilterCache", tsTagFilterCache, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
 
   if (cfgAddInt64(pCfg, "openMax", tsOpenMax, 0, INT64_MAX, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c
index a6c91a57ce..e36157b565 100644
--- a/source/libs/function/src/detail/tminmax.c
+++ b/source/libs/function/src/detail/tminmax.c
@@ -370,7 +370,7 @@ static int32_t findFirstValPosition(const SColumnInfoData* pCol, int32_t start,
 static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
                           bool signVal) {
   // AVX2 version to speedup the loop
-  if (tsAVX2Enable && tsSIMDEnable) {
+  if (tsAVX2Supported && tsSIMDEnable) {
     pBuf->v = i8VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
   } else {
     if (!pBuf->assign) {
@@ -404,7 +404,7 @@ static void handleInt8Col(const void* data, int32_t start, int32_t numOfRows, SM
 static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
                            bool signVal) {
   // AVX2 version to speedup the loop
-  if (tsAVX2Enable && tsSIMDEnable) {
+  if (tsAVX2Supported && tsSIMDEnable) {
     pBuf->v = i16VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
   } else {
     if (!pBuf->assign) {
@@ -438,7 +438,7 @@ static void handleInt16Col(const void* data, int32_t start, int32_t numOfRows, S
 static void handleInt32Col(const void* data, int32_t start, int32_t numOfRows, SMinmaxResInfo* pBuf, bool isMinFunc,
                            bool signVal) {
   // AVX2 version to speedup the loop
-  if (tsAVX2Enable && tsSIMDEnable) {
+  if (tsAVX2Supported && tsSIMDEnable) {
     pBuf->v = i32VectorCmpAVX2(data, numOfRows, isMinFunc, signVal);
   } else {
     if (!pBuf->assign) {
diff --git a/source/os/src/osEnv.c b/source/os/src/osEnv.c
index 54107db325..ea88d5307b 100644
--- a/source/os/src/osEnv.c
+++ b/source/os/src/osEnv.c
@@ -38,11 +38,12 @@ int64_t         tsTotalMemoryKB = 0;
 char           *tsProcPath = NULL;
 
 char tsSIMDEnable = 0;
+char tsAVX512Enable = 0;
 char tsSSE42Enable = 0;
 char tsAVXEnable = 0;
-char tsAVX2Enable = 0;
-char tsFMAEnable = 0;
-char tsAVX512Enable = 0;
+char tsAVX2Supported = 0;
+char tsFMASupported = 0;
+char tsAVX512Supported = 0;
 
 void osDefaultInit() {
   taosSeedRand(taosSafeRand());
diff --git a/source/os/src/osSysinfo.c b/source/os/src/osSysinfo.c
index 187461826a..71cbf5541f 100644
--- a/source/os/src/osSysinfo.c
+++ b/source/os/src/osSysinfo.c
@@ -250,7 +250,7 @@ void taosGetSystemInfo() {
   taosGetCpuCores(&tsNumOfCores, false);
   taosGetTotalMemory(&tsTotalMemoryKB);
   taosGetCpuUsage(NULL, NULL);
-  taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Enable, &tsFMAEnable, &tsAVX512Enable);
+  taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported);
 #endif
 }
 
diff --git a/source/util/src/tcompression.c b/source/util/src/tcompression.c
index 4635ec340d..053a716721 100644
--- a/source/util/src/tcompression.c
+++ b/source/util/src/tcompression.c
@@ -822,9 +822,9 @@ int32_t tsDecompressTimestampImp(const char *const input, const int32_t nelement
     memcpy(output, input + 1, nelements * longBytes);
     return nelements * longBytes;
   } else if (input[0] == 1) {  // Decompress
-    if (tsSIMDEnable && tsAVX512Enable) {
+    if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
       tsDecompressTimestampAvx512(input, nelements, output, false);
-    } else if (tsSIMDEnable && tsAVX2Enable) {
+    } else if (tsSIMDEnable && tsAVX2Supported) {
       tsDecompressTimestampAvx2(input, nelements, output, false);
     } else {
       int64_t *ostream = (int64_t *)output;
@@ -1198,9 +1198,9 @@ int32_t tsDecompressFloatImp(const char *const input, const int32_t nelements, c
     return nelements * FLOAT_BYTES;
   }
 
-  if (tsSIMDEnable && tsAVX2Enable) {
+  if (tsSIMDEnable && tsAVX2Supported) {
     tsDecompressFloatImplAvx2(input, nelements, output);
-  } else if (tsSIMDEnable && tsAVX512Enable) {
+  } else if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
     tsDecompressFloatImplAvx512(input, nelements, output);
   } else {  // alternative implementation without SIMD instructions.
     tsDecompressFloatHelper(input, nelements, (float *)output);
@@ -2713,7 +2713,7 @@ int32_t tsDecompressBigint(void *pIn, int32_t nIn, int32_t nEle, void *pOut, int
         int8_t alvl = tsGetCompressL2Level(l2, lvl);                                                                  \
         return compressL2Dict[l2].comprFn(pIn, nIn, pOut, nOut, type, alvl);                                          \
       } else {                                                                                                        \
-        uTrace("dencode:%s, dcompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl,               \
+        uTrace("dencode:%s, decompress:%s, level:%d, type:%s", "disabled", compressL2Dict[l1].name, lvl,               \
                tDataTypes[type].name);                                                                                \
         return compressL2Dict[l2].decomprFn(pIn, nIn, pOut, nOut, type);                                              \
       }                                                                                                               \
diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c
index d3cb3118d2..38f277fb48 100644
--- a/source/util/src/tdecompress.c
+++ b/source/util/src/tdecompress.c
@@ -74,12 +74,12 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
 
         int32_t batch = 0;
         int32_t remain = 0;
-        if (tsSIMDEnable && tsAVX512Enable) {
+        if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
 #if __AVX512F__
         batch = num >> 3;
         remain = num & 0x07;
 #endif
-        } else if (tsSIMDEnable && tsAVX2Enable) {
+        } else if (tsSIMDEnable && tsAVX2Supported) {
 #if __AVX2__
         batch = num >> 2;
         remain = num & 0x03;
@@ -87,7 +87,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
         }
 
         if (selector == 0 || selector == 1) {
-          if (tsSIMDEnable && tsAVX512Enable) {
+          if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
 #if __AVX512F__
             for (int32_t i = 0; i < batch; ++i) {
             __m512i prev = _mm512_set1_epi64(prevValue);
@@ -98,7 +98,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
             p[_pos++] = prevValue;
             }
 #endif
-          } else if (tsSIMDEnable && tsAVX2Enable) {
+          } else if (tsSIMDEnable && tsAVX2Supported) {
             for (int32_t i = 0; i < batch; ++i) {
               __m256i prev = _mm256_set1_epi64x(prevValue);
               _mm256_storeu_si256((__m256i *)&p[_pos], prev);
@@ -116,7 +116,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
             }
           }
         } else {
-          if (tsSIMDEnable && tsAVX512Enable) {
+          if (tsSIMDEnable && tsAVX512Supported && tsAVX512Enable) {
  #if __AVX512F__
             __m512i sum_mask1 = _mm512_set_epi64(6, 6, 4, 4, 2, 2, 0, 0);
             __m512i sum_mask2 = _mm512_set_epi64(5, 5, 5, 5, 1, 1, 1, 1);
@@ -182,7 +182,7 @@ int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements,
             v += bit;
             }
 #endif
-          } else if (tsSIMDEnable && tsAVX2Enable) {
+          } else if (tsSIMDEnable && tsAVX2Supported) {
             __m256i base = _mm256_set1_epi64x(w);
             __m256i maskVal = _mm256_set1_epi64x(mask);
 
@@ -331,16 +331,16 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen
 
 int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output,
                                   bool bigEndian) {
-#if 0
   int64_t *ostream = (int64_t *)output;
   int32_t  ipos = 1, opos = 0;
+
+#if __AVX2__
   __m128i  prevVal = _mm_setzero_si128();
   __m128i  prevDelta = _mm_setzero_si128();
 
-#if __AVX2__
   int32_t batch = nelements >> 1;
   int32_t remainder = nelements & 0x01;
-  __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff};
+//  __mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff};
 
   int32_t i = 0;
   if (batch > 1) {
@@ -398,8 +398,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
     int8_t nbytes1 = flags & INT8MASK(4);  // range of nbytes starts from 0 to 7
     int8_t nbytes2 = (flags >> 4) & INT8MASK(4);
 
-//    __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos));
-//    __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1));
     __m128i data1;
     if (nbytes1 == 0) {
       data1 = _mm_setzero_si128();
@@ -471,7 +469,6 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
       ostream[opos++] = prevVal[1] + prevDeltaX;
     }
   }
-#endif
 #endif
   return 0;
 }
diff --git a/source/util/test/decompressTest.cpp b/source/util/test/decompressTest.cpp
index 2ddef3f595..c982f450ea 100644
--- a/source/util/test/decompressTest.cpp
+++ b/source/util/test/decompressTest.cpp
@@ -4,9 +4,16 @@
 #include <random>
 #include "ttypes.h"
 
-namespace {}  // namespace
+namespace {
+
+}  // namespace
+
+TEST(utilTest, decompress_ts_test) {
+  {
+    tsSIMDEnable = 1;
+    tsAVX2Supported = 1;
+  }
 
-TEST(utilTest, decompress_test) {
   int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400,
                         1700000500, 1700000600, 1700000700, 1700000800, 1700000900};
 
@@ -57,6 +64,49 @@ TEST(utilTest, decompress_test) {
   }
 }
 
+TEST(utilTest, decompress_bigint_avx2_test) {
+  {
+    tsSIMDEnable = 1;
+    tsAVX2Supported = 1;
+  }
+
+  int64_t tsList[10] = {1700000000, 1700000100, 1700000200, 1700000300, 1700000400,
+                        1700000500, 1700000600, 1700000700, 1700000800, 1700000900};
+
+  char*   pOutput[10 * sizeof(int64_t)] = {0};
+  int32_t len = tsCompressBigint(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10,
+                                    ONE_STAGE_COMP, NULL, 0);
+
+  char* decompOutput[10 * 8] = {0};
+
+  tsDecompressBigint(pOutput, len, 10, decompOutput, sizeof(int64_t) * 10, ONE_STAGE_COMP, NULL, 0);
+
+  for (int32_t i = 0; i < 10; ++i) {
+    std::cout << ((int64_t*)decompOutput)[i] << std::endl;
+  }
+}
+
+TEST(utilTest, decompress_int_avx2_test) {
+  {
+    tsSIMDEnable = 1;
+    tsAVX2Supported = 1;
+  }
+
+  int32_t tsList[10] = {17000000, 17000001, 17000002, 17000003, 17000004,
+                        17000005, 17000006, 17000007, 17000008, 17000009};
+
+  char*   pOutput[10 * sizeof(int32_t)] = {0};
+  int32_t len =
+      tsCompressInt(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 10, ONE_STAGE_COMP, NULL, 0);
+
+  char* decompOutput[10 * 8] = {0};
+  tsDecompressInt(pOutput, len, 10, decompOutput, sizeof(int32_t) * 10, ONE_STAGE_COMP, NULL, 0);
+
+  for (int32_t i = 0; i < 10; ++i) {
+    std::cout << ((int32_t*)decompOutput)[i] << std::endl;
+  }
+}
+
 TEST(utilTest, decompress_perf_test) {
   int32_t num = 10000;
 

From f3536361367499c7222f13b278904d8d9c74a2bd Mon Sep 17 00:00:00 2001
From: Haojun Liao <hjliao@taosdata.com>
Date: Wed, 19 Jun 2024 15:08:25 +0800
Subject: [PATCH 3/5] refactor: do some internal refactor.

---
 include/os/osEnv.h                             | 4 ++--
 source/common/src/tglobal.c                    | 4 ++--
 source/libs/function/src/detail/tavgfunction.c | 2 +-
 source/libs/function/src/detail/tminmax.c      | 4 ++--
 source/os/src/osEnv.c                          | 4 ++--
 source/os/src/osSysinfo.c                      | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/os/osEnv.h b/include/os/osEnv.h
index e3e5da59f5..4f4a58d4e8 100644
--- a/include/os/osEnv.h
+++ b/include/os/osEnv.h
@@ -37,8 +37,8 @@ extern float           tsNumOfCores;
 extern int64_t         tsTotalMemoryKB;
 extern char           *tsProcPath;
 extern char            tsSIMDEnable;
-extern char            tsSSE42Enable;
-extern char            tsAVXEnable;
+extern char            tsSSE42Supported;
+extern char            tsAVXSupported;
 extern char            tsAVX2Supported;
 extern char            tsFMASupported;
 extern char 	       tsAVX512Supported;
diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c
index 5b0361f8ca..97cb4b4e1d 100644
--- a/source/common/src/tglobal.c
+++ b/source/common/src/tglobal.c
@@ -594,8 +594,8 @@ static int32_t taosAddSystemCfg(SConfig *pCfg) {
   if (cfgAddBool(pCfg, "enableCoreFile", 1, CFG_SCOPE_BOTH, CFG_DYN_CLIENT) != 0) return -1;
   if (cfgAddFloat(pCfg, "numOfCores", tsNumOfCores, 1, 100000, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
 
-  if (cfgAddBool(pCfg, "ssd42", tsSSE42Enable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
-  if (cfgAddBool(pCfg, "avx", tsAVXEnable, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
+  if (cfgAddBool(pCfg, "ssd42", tsSSE42Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
+  if (cfgAddBool(pCfg, "avx", tsAVXSupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
   if (cfgAddBool(pCfg, "avx2", tsAVX2Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
   if (cfgAddBool(pCfg, "fma", tsFMASupported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
   if (cfgAddBool(pCfg, "avx512", tsAVX512Supported, CFG_SCOPE_BOTH, CFG_DYN_NONE) != 0) return -1;
diff --git a/source/libs/function/src/detail/tavgfunction.c b/source/libs/function/src/detail/tavgfunction.c
index 66ed092f76..3d51f0cd16 100644
--- a/source/libs/function/src/detail/tavgfunction.c
+++ b/source/libs/function/src/detail/tavgfunction.c
@@ -565,7 +565,7 @@ int32_t avgFunction(SqlFunctionCtx* pCtx) {
     numOfElem = pInput->numOfRows;
     pAvgRes->count += pInput->numOfRows;
 
-    bool simdAvailable = tsAVXEnable && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE);
+    bool simdAvailable = tsAVXSupported && tsSIMDEnable && (numOfRows > THRESHOLD_SIZE);
 
     switch(type) {
       case TSDB_DATA_TYPE_UTINYINT:
diff --git a/source/libs/function/src/detail/tminmax.c b/source/libs/function/src/detail/tminmax.c
index e36157b565..331f222a71 100644
--- a/source/libs/function/src/detail/tminmax.c
+++ b/source/libs/function/src/detail/tminmax.c
@@ -502,7 +502,7 @@ static void handleFloatCol(SColumnInfoData* pCol, int32_t start, int32_t numOfRo
   float* val = (float*)&pBuf->v;
 
   // AVX version to speedup the loop
-  if (tsAVXEnable && tsSIMDEnable) {
+  if (tsAVXSupported && tsSIMDEnable) {
     *val = floatVectorCmpAVX(pData, numOfRows, isMinFunc);
   } else {
     if (!pBuf->assign) {
@@ -533,7 +533,7 @@ static void handleDoubleCol(SColumnInfoData* pCol, int32_t start, int32_t numOfR
   double* val = (double*)&pBuf->v;
 
   // AVX version to speedup the loop
-  if (tsAVXEnable && tsSIMDEnable) {
+  if (tsAVXSupported && tsSIMDEnable) {
     *val = (double)doubleVectorCmpAVX(pData, numOfRows, isMinFunc);
   } else {
     if (!pBuf->assign) {
diff --git a/source/os/src/osEnv.c b/source/os/src/osEnv.c
index ea88d5307b..28f4178790 100644
--- a/source/os/src/osEnv.c
+++ b/source/os/src/osEnv.c
@@ -39,8 +39,8 @@ char           *tsProcPath = NULL;
 
 char tsSIMDEnable = 0;
 char tsAVX512Enable = 0;
-char tsSSE42Enable = 0;
-char tsAVXEnable = 0;
+char tsSSE42Supported = 0;
+char tsAVXSupported = 0;
 char tsAVX2Supported = 0;
 char tsFMASupported = 0;
 char tsAVX512Supported = 0;
diff --git a/source/os/src/osSysinfo.c b/source/os/src/osSysinfo.c
index 71cbf5541f..50eb8413c0 100644
--- a/source/os/src/osSysinfo.c
+++ b/source/os/src/osSysinfo.c
@@ -250,7 +250,7 @@ void taosGetSystemInfo() {
   taosGetCpuCores(&tsNumOfCores, false);
   taosGetTotalMemory(&tsTotalMemoryKB);
   taosGetCpuUsage(NULL, NULL);
-  taosGetCpuInstructions(&tsSSE42Enable, &tsAVXEnable, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported);
+  taosGetCpuInstructions(&tsSSE42Supported, &tsAVXSupported, &tsAVX2Supported, &tsFMASupported, &tsAVX512Supported);
 #endif
 }
 

From 8d18e7cd9d0df9c135a51a304e5d5fc47d0015f3 Mon Sep 17 00:00:00 2001
From: Haojun Liao <hjliao@taosdata.com>
Date: Wed, 19 Jun 2024 19:11:52 +0800
Subject: [PATCH 4/5] fix(util): fix decode timestamp error by using avx2.

---
 source/util/src/tdecompress.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/source/util/src/tdecompress.c b/source/util/src/tdecompress.c
index 38f277fb48..046cba8686 100644
--- a/source/util/src/tdecompress.c
+++ b/source/util/src/tdecompress.c
@@ -329,6 +329,7 @@ int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelemen
   return 0;
 }
 
+// decode two timestamps in one loop.
 int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output,
                                   bool bigEndian) {
   int64_t *ostream = (int64_t *)output;
@@ -375,13 +376,13 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
     __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask);
 
     __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta);
-    deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent);
+    deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaCurrent);
 
-    __m128i val = _mm_add_epi64(deltaCurrent, prevVal);
-    _mm_storeu_si128((__m128i *)&ostream[opos], val);
+    __m128i finalVal = _mm_add_epi64(deltaCurrent, prevVal);
+    _mm_storeu_si128((__m128i *)&ostream[opos], finalVal);
 
     // keep the previous value
-    prevVal = _mm_shuffle_epi32 (val, 0xEE);
+    prevVal = _mm_shuffle_epi32 (finalVal, 0xEE);
 
     // keep the previous delta of delta, for the first item
     prevDelta = _mm_shuffle_epi32(deltaOfDelta, 0xEE);
@@ -428,17 +429,18 @@ int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelemen
     __m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask);
 
     __m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta);
-    deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent);
+    deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaCurrent);
 
-    __m128i val = _mm_add_epi64(deltaCurrent, prevVal);
-    _mm_storeu_si128((__m128i *)&ostream[opos], val);
+    __m128i finalVal = _mm_add_epi64(deltaCurrent, prevVal);
+    finalVal = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), finalVal);
+
+    _mm_storeu_si128((__m128i *)&ostream[opos], finalVal);
 
     // keep the previous value
-    prevVal = _mm_shuffle_epi32 (val, 0xEE);
+    prevVal = _mm_shuffle_epi32 (finalVal, 0xEE);
 
     // keep the previous delta of delta
-    __m128i delta = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaOfDelta);
-    prevDelta = _mm_shuffle_epi32(_mm_add_epi64(delta, prevDelta), 0xEE);
+    prevDelta = _mm_shuffle_epi32 (deltaCurrent, 0xEE);
 
     opos += 2;
     ipos += nbytes1 + nbytes2;

From e9b07459473f8f391b9ab9d5d39153e156befa86 Mon Sep 17 00:00:00 2001
From: Haojun Liao <hjliao@taosdata.com>
Date: Wed, 19 Jun 2024 19:15:45 +0800
Subject: [PATCH 5/5] fix(util): update test cases.

---
 source/util/test/decompressTest.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/source/util/test/decompressTest.cpp b/source/util/test/decompressTest.cpp
index c982f450ea..b3cf46fea6 100644
--- a/source/util/test/decompressTest.cpp
+++ b/source/util/test/decompressTest.cpp
@@ -37,6 +37,20 @@ TEST(utilTest, decompress_ts_test) {
     std::cout << ((int64_t*)decompOutput)[i] << std::endl;
   }
 
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  int64_t tsList[4] = {1286, 1124, 2681, 2823};
+
+  char*   pOutput[4 * sizeof(int64_t)] = {0};
+  int32_t len = tsCompressTimestamp(tsList, sizeof(tsList), sizeof(tsList) / sizeof(tsList[0]), pOutput, 4,
+                                    ONE_STAGE_COMP, NULL, 0);
+
+  char* decompOutput[4 * 8] = {0};
+  tsDecompressTimestamp(pOutput, len, 4, decompOutput, sizeof(int64_t) * 4, ONE_STAGE_COMP, NULL, 0);
+
+  for (int32_t i = 0; i < 4; ++i) {
+    std::cout << ((int64_t*)decompOutput)[i] << std::endl;
+  }
+
   ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
   int64_t tsList1[7] = {1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000000, 1700000900};
   int32_t len1 = tsCompressTimestamp(tsList1, sizeof(tsList1), sizeof(tsList1) / sizeof(tsList1[0]), pOutput, 7,