From 2b622ceb4cc16605f226a90691acc849753c1dc0 Mon Sep 17 00:00:00 2001
From: "wenzhouwww@live.cn" <wenzhouwww@live.cn>
Date: Tue, 26 Jul 2022 10:10:34 +0800
Subject: [PATCH 01/26] update test case about csum

---
 tests/system-test/2-query/csum.py | 145 +++++++++++++++---------------
 1 file changed, 73 insertions(+), 72 deletions(-)

diff --git a/tests/system-test/2-query/csum.py b/tests/system-test/2-query/csum.py
index bdb8c095e6..260528be04 100644
--- a/tests/system-test/2-query/csum.py
+++ b/tests/system-test/2-query/csum.py
@@ -30,7 +30,7 @@ class TDTestCase:
         tdLog.debug("start to execute %s" % __file__)
         tdSql.init(conn.cursor(), logSql)
 
-    def csum_query_form(self, col="c1",  alias="", table_expr="t1", condition=""):
+    def csum_query_form(self, col="c1",  alias="", table_expr="db.t1", condition=""):
 
         '''
         csum function:
@@ -44,7 +44,7 @@ class TDTestCase:
 
         return f"select csum({col}) {alias} from {table_expr} {condition}"
 
-    def checkcsum(self,col="c1", alias="", table_expr="t1", condition="" ):
+    def checkcsum(self,col="c1", alias="", table_expr="db.t1", condition="" ):
         line = sys._getframe().f_back.f_lineno
         pre_sql = self.csum_query_form(
             col=col, table_expr=table_expr, condition=condition
@@ -59,11 +59,11 @@ class TDTestCase:
             tdSql.checkRows(0)
             return
 
-        if "order by tbname" in condition:
-            tdSql.error(self.csum_query_form(
-                col=col, alias=alias, table_expr=table_expr, condition=condition
-            ))
-            return
+        # if "order by tbname" in condition:
+        #     tdSql.error(self.csum_query_form(
+        #         col=col, alias=alias, table_expr=table_expr, condition=condition
+        #     ))
+        #     return
 
         if "group" in condition:
 
@@ -123,7 +123,8 @@ class TDTestCase:
             return
 
         else:
-            tdSql.query(f"select {col} from {table_expr} {re.sub('limit [0-9]*|offset [0-9]*','',condition)}")
+
+            tdSql.query(f"select {col} from {table_expr} {re.sub('limit [0-9]*|offset [0-9]*','',condition)} " )
             offset_val = condition.split("offset")[1].split(" ")[1] if "offset" in condition else 0
             pre_result = np.array(tdSql.queryResult)[np.array(tdSql.queryResult) != None]
             if (platform.system().lower() == 'windows' and pre_result.dtype == 'int32'):
@@ -161,12 +162,12 @@ class TDTestCase:
         self.checkcsum(**case6)
 
         # case7~8: nested query
-        # case7 = {"table_expr": "(select c1 from stb1)"}
-        # self.checkcsum(**case7)
-        # case8 = {"table_expr": "(select csum(c1) c1 from stb1 group by tbname)"}
-        # self.checkcsum(**case8)
+        case7 = {"table_expr": "(select c1 from db.stb1 order by tbname ,ts )"}
+        self.checkcsum(**case7)
+        case8 = {"table_expr": "(select csum(c1) c1 from db.t1 partition by tbname)"}
+        self.checkcsum(**case8)
 
-        # case9~10: mix with tbname/ts/tag/col
+        # case9~10: mixing with tbname/ts/tag/col is not supported; must partition by the alias, such as select tbname, csum(c1) partition by tbname
         # case9 = {"alias": ", tbname"}
         # self.checkcsum(**case9)
         # case10 = {"alias": ", _c0"}
@@ -196,37 +197,37 @@ class TDTestCase:
         }
         self.checkcsum(**case17)
         # case18~19: with group by
-        # case18 = {
-        #     "table_expr": "t1",
-        #     "condition": "group by c6"
-        # }
-        # self.checkcsum(**case18)
-        # case19 = {
-        #     "table_expr": "stb1",
-        #     "condition": "partition by tbname"  # partition by tbname
-        # }
-        # self.checkcsum(**case19)
+        case18 = {
+            "table_expr": "db.t1",
+            "condition": "where c6 <0 partition by c6 order by c6"
+        }
+        self.checkcsum(**case18)
+        case19 = {
+            "table_expr": "db.t1",
+            "condition": " "  # partition by tbname
+        }
+        self.checkcsum(**case19)
 
-        # # case20~21: with order by
-        # case20 = {"condition": "order by ts"}
-        # self.checkcsum(**case20)
+        # case20~21: with order by
+        case20 = {"condition": "partition by tbname order by tbname  "}
+        # self.checkcsum(**case20)  # check disabled for order by tbname, because the result check fails randomly
 
-        # # case22: with union
-        # case22 = {
-        #     "condition": "union all select csum(c1) from t2"
-        # }
-        # self.checkcsum(**case22)
+        # case22: with union
+        case22 = {
+            "condition": "union all select csum(c1) from db.t2"
+        }
+        # self.checkcsum(**case22)  # union all: result check disabled because the table records are returned in random order
 
         # case23: with limit/slimit
         case23 = {
             "condition": "limit 1"
         }
         self.checkcsum(**case23)
-        # case24 = {
-        #     "table_expr": "stb1",
-        #     "condition": "group by tbname slimit 1 soffset 1"
-        # }
-        # self.checkcsum(**case24)
+        case24 = {
+            "table_expr": "db.t1",
+            "condition": "partition by tbname "
+        }
+        self.checkcsum(**case24)
 
         pass
 
@@ -291,17 +292,17 @@ class TDTestCase:
         }
         tdSql.error(self.csum_query_form(**interval_sql))       # interval
         group_normal_col = {
-            "table_expr": "t1",
+            "table_expr": "db.t1",
             "condition": "group by c6"
         }
         tdSql.error(self.csum_query_form(**group_normal_col))       # group by normal col
         slimit_soffset_sql = {
-            "table_expr": "stb1",
+            "table_expr": "db.stb1",
             "condition": "group by tbname slimit 1 soffset 1"
         }
         # tdSql.error(self.csum_query_form(**slimit_soffset_sql))
         order_by_tbname_sql = {
-            "table_expr": "stb1",
+            "table_expr": "db.stb1",
             "condition": "group by tbname order by tbname"
         }
         tdSql.error(self.csum_query_form(**order_by_tbname_sql))
@@ -346,8 +347,8 @@ class TDTestCase:
             "create stable db.stb2 (ts timestamp, c1 int) tags(st2 int)"
         )
         for i in range(tbnum):
-            tdSql.execute(f"create table t{i} using stb1 tags({i})")
-            tdSql.execute(f"create table tt{i} using stb2 tags({i})")
+            tdSql.execute(f"create table db.t{i} using db.stb1 tags({i})")
+            tdSql.execute(f"create table db.tt{i} using db.stb2 tags({i})")
 
         pass
 
@@ -364,25 +365,25 @@ class TDTestCase:
 
         tdLog.printNoPrefix("######## insert only NULL test:")
         for i in range(tbnum):
-            tdSql.execute(f"insert into t{i}(ts) values ({nowtime - 5})")
-            tdSql.execute(f"insert into t{i}(ts) values ({nowtime + 5})")
+            tdSql.execute(f"insert into db.t{i}(ts) values ({nowtime - 5})")
+            tdSql.execute(f"insert into db.t{i}(ts) values ({nowtime + 5})")
         self.csum_current_query()
         self.csum_error_query()
 
         tdLog.printNoPrefix("######## insert data in the range near the max(bigint/double):")
         self.csum_test_table(tbnum)
-        tdSql.execute(f"insert into t1(ts, c1,c2,c5,c7) values "
+        tdSql.execute(f"insert into db.t1(ts, c1,c2,c5,c7) values "
                       f"({nowtime - (per_table_rows + 1) * 10}, {2**31-1}, {3.4*10**38}, {1.7*10**308}, {2**63-1})")
-        tdSql.execute(f"insert into t1(ts, c1,c2,c5,c7) values "
+        tdSql.execute(f"insert into db.t1(ts, c1,c2,c5,c7) values "
                       f"({nowtime - (per_table_rows + 2) * 10}, {2**31-1}, {3.4*10**38}, {1.7*10**308}, {2**63-1})")
         self.csum_current_query()
         self.csum_error_query()
 
         tdLog.printNoPrefix("######## insert data in the range near the min(bigint/double):")
         self.csum_test_table(tbnum)
-        tdSql.execute(f"insert into t1(ts, c1,c2,c5,c7) values "
+        tdSql.execute(f"insert into db.t1(ts, c1,c2,c5,c7) values "
                       f"({nowtime - (per_table_rows + 1) * 10}, {1-2**31}, {-3.4*10**38}, {-1.7*10**308}, {1-2**63})")
-        tdSql.execute(f"insert into t1(ts, c1,c2,c5,c7) values "
+        tdSql.execute(f"insert into db.t1(ts, c1,c2,c5,c7) values "
                       f"({nowtime - (per_table_rows + 2) * 10}, {1-2**31}, {-3.4*10**38}, {-1.7*10**308}, {512-2**63})")
         self.csum_current_query()
         self.csum_error_query()
@@ -396,9 +397,9 @@ class TDTestCase:
 
         tdLog.printNoPrefix("######## insert data mix with NULL test:")
         for i in range(tbnum):
-            tdSql.execute(f"insert into t{i}(ts) values ({nowtime})")
-            tdSql.execute(f"insert into t{i}(ts) values ({nowtime-(per_table_rows+3)*10})")
-            tdSql.execute(f"insert into t{i}(ts) values ({nowtime+(per_table_rows+3)*10})")
+            tdSql.execute(f"insert into db.t{i}(ts) values ({nowtime})")
+            tdSql.execute(f"insert into db.t{i}(ts) values ({nowtime-(per_table_rows+3)*10})")
+            tdSql.execute(f"insert into db.t{i}(ts) values ({nowtime+(per_table_rows+3)*10})")
         self.csum_current_query()
         self.csum_error_query()
 
@@ -411,65 +412,65 @@ class TDTestCase:
         tdDnodes.start(index)
         self.csum_current_query()
         self.csum_error_query()
-        tdSql.query("select csum(1) from t1 ")
+        tdSql.query("select csum(1) from db.t1 ")
         tdSql.checkRows(7)
         tdSql.checkData(0,0,1)
         tdSql.checkData(1,0,2)
         tdSql.checkData(2,0,3)
         tdSql.checkData(3,0,4)
-        tdSql.query("select csum(abs(c1))+2 from t1 ")
+        tdSql.query("select csum(abs(c1))+2 from db.t1 ")
         tdSql.checkRows(4)
 
     def csum_support_stable(self):
-        tdSql.query(" select csum(1) from stb1 ")
+        tdSql.query(" select csum(1) from db.stb1 ")
         tdSql.checkRows(70)
-        tdSql.query("select csum(c1) from stb1 partition by tbname ")
+        tdSql.query("select csum(c1) from db.stb1 partition by tbname ")
         tdSql.checkRows(40)
-        tdSql.query("select csum(st1) from stb1 partition by tbname")
+        tdSql.query("select csum(st1) from db.stb1 partition by tbname")
         tdSql.checkRows(70)
-        tdSql.query("select csum(st1+c1) from stb1 partition by tbname")
+        tdSql.query("select csum(st1+c1) from db.stb1 partition by tbname")
         tdSql.checkRows(40)
-        tdSql.query("select csum(st1+c1) from stb1 partition by tbname")
+        tdSql.query("select csum(st1+c1) from db.stb1 partition by tbname")
         tdSql.checkRows(40)
-        tdSql.query("select csum(st1+c1) from stb1 partition by tbname")
+        tdSql.query("select csum(st1+c1) from db.stb1 partition by tbname")
         tdSql.checkRows(40)
 
         # # bug need fix
-        tdSql.query("select csum(st1+c1) from stb1 partition by tbname slimit 1 ")
+        tdSql.query("select csum(st1+c1) from db.stb1 partition by tbname slimit 1 ")
         tdSql.checkRows(4)
-        # tdSql.error("select csum(st1+c1) from stb1 partition by tbname limit 1 ")
+        # tdSql.error("select csum(st1+c1) from db.stb1 partition by tbname limit 1 ")
 
 
         # bug need fix
-        tdSql.query("select csum(st1+c1) from stb1 partition by tbname")
+        tdSql.query("select csum(st1+c1) from db.stb1 partition by tbname")
         tdSql.checkRows(40)
 
         # bug need fix
-        tdSql.query("select tbname , csum(c1) from stb1 partition by tbname")
+        tdSql.query("select tbname , csum(c1) from db.stb1 partition by tbname")
         tdSql.checkRows(40)
-        tdSql.query("select tbname , csum(st1) from stb1 partition by tbname")
+        tdSql.query("select tbname , csum(st1) from db.stb1 partition by tbname")
         tdSql.checkRows(70)
-        tdSql.query("select tbname , csum(st1) from stb1 partition by tbname slimit 1")
+        tdSql.query("select tbname , csum(st1) from db.stb1 partition by tbname slimit 1")
         tdSql.checkRows(7)
 
         # partition by tags
-        tdSql.query("select st1 , csum(c1) from stb1 partition by st1")
+        tdSql.query("select st1 , csum(c1) from db.stb1 partition by st1")
         tdSql.checkRows(40)
-        tdSql.query("select csum(c1) from stb1 partition by st1")
+        tdSql.query("select csum(c1) from db.stb1 partition by st1")
         tdSql.checkRows(40)
-        tdSql.query("select st1 , csum(c1) from stb1 partition by st1 slimit 1")
+        tdSql.query("select st1 , csum(c1) from db.stb1 partition by st1 slimit 1")
         tdSql.checkRows(4)
-        tdSql.query("select csum(c1) from stb1 partition by st1 slimit 1")
+        tdSql.query("select csum(c1) from db.stb1 partition by st1 slimit 1")
         tdSql.checkRows(4)
 
         # partition by col
-        # tdSql.query("select c1 , csum(c1) from stb1 partition by c1")
+        # tdSql.query("select c1 , csum(c1) from db.stb1 partition by c1")
         # tdSql.checkRows(41)
-        # tdSql.query("select csum(c1) from stb1 partition by c1")
+        # tdSql.query("select csum(c1) from db.stb1 partition by c1")
         # tdSql.checkRows(41)
-        # tdSql.query("select c1 , csum(c1) from stb1 partition by st1 slimit 1")
+        # tdSql.query("select c1 , csum(c1) from db.stb1 partition by st1 slimit 1")
         # tdSql.checkRows(4)
-        # tdSql.query("select csum(c1) from stb1 partition by st1 slimit 1")
+        # tdSql.query("select csum(c1) from db.stb1 partition by st1 slimit 1")
         # tdSql.checkRows(4)
 
 

From 231f4399791c3bed4379bfffe9c7db2e48302c37 Mon Sep 17 00:00:00 2001
From: Haojun Liao <hjliao@taosdata.com>
Date: Tue, 26 Jul 2022 10:19:12 +0800
Subject: [PATCH 02/26] refactor: opt memory consumption for tsdbread.

---
 source/dnode/vnode/src/tsdb/tsdbRead.c | 177 +++++++++++++++----------
 1 file changed, 110 insertions(+), 67 deletions(-)

diff --git a/source/dnode/vnode/src/tsdb/tsdbRead.c b/source/dnode/vnode/src/tsdb/tsdbRead.c
index ea8ac09429..386326fca3 100644
--- a/source/dnode/vnode/src/tsdb/tsdbRead.c
+++ b/source/dnode/vnode/src/tsdb/tsdbRead.c
@@ -32,6 +32,7 @@ typedef struct STableBlockScanInfo {
   uint64_t  uid;
   TSKEY     lastKey;
   SBlockIdx blockIdx;
+  SMapData  mapData;     // block info (compressed)
   SArray*   pBlockList;  // block data index list
   SIterInfo iter;        // mem buffer skip list iterator
   SIterInfo iiter;       // imem buffer skip list iterator
@@ -42,7 +43,7 @@ typedef struct STableBlockScanInfo {
 
 typedef struct SBlockOrderWrapper {
   int64_t uid;
-  SBlock* pBlock;
+  int64_t offset;
 } SBlockOrderWrapper;
 
 typedef struct SBlockOrderSupporter {
@@ -53,11 +54,13 @@ typedef struct SBlockOrderSupporter {
 } SBlockOrderSupporter;
 
 typedef struct SIOCostSummary {
-  int64_t blockLoadTime;
-  int64_t smaLoadTime;
-  int64_t checkForNextTime;
+  int64_t numOfBlocks;
+  double  blockLoadTime;
+  double  buildmemBlock;
   int64_t headFileLoad;
-  int64_t headFileLoadTime;
+  double  headFileLoadTime;
+  int64_t smaData;
+  double  smaLoadTime;
 } SIOCostSummary;
 
 typedef struct SBlockLoadSuppInfo {
@@ -86,6 +89,8 @@ typedef struct SDataBlockIter {
   int32_t index;
   SArray* blockList;  // SArray<SFileDataBlockInfo>
   int32_t order;
+  SBlock  block;      // current SBlock data
+  SHashObj* pTableMap;
 } SDataBlockIter;
 
 typedef struct SFileBlockDumpInfo {
@@ -320,6 +325,8 @@ static bool filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader) {
       goto _err;
     }
 
+    pReader->cost.headFileLoad += 1;
+
     int32_t fid = pReader->status.pCurrentFileset->fid;
     tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);
 
@@ -347,7 +354,7 @@ _err:
   return false;
 }
 
-static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order) {
+static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order, SHashObj* pTableMap) {
   pIter->order = order;
   pIter->index = -1;
   pIter->numOfBlocks = -1;
@@ -356,6 +363,7 @@ static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order) {
   } else {
     taosArrayClear(pIter->blockList);
   }
+  pIter->pTableMap = pTableMap;
 }
 
 static void cleanupDataBlockIterator(SDataBlockIter* pIter) { taosArrayDestroy(pIter->blockList); }
@@ -554,7 +562,7 @@ static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader,
 
     STableBlockScanInfo* pScanInfo = p;
     if (pScanInfo->pBlockList == NULL) {
-      pScanInfo->pBlockList = taosArrayInit(16, sizeof(SBlock));
+      pScanInfo->pBlockList = taosArrayInit(4, sizeof(int32_t));
     }
 
     pScanInfo->blockIdx = *pBlockIdx;
@@ -562,8 +570,11 @@ static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader,
   }
 
   int64_t et2 = taosGetTimestampUs();
-  tsdbDebug("load block index for %d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%d bytes %s",
-            (int32_t)num, (et1 - st)/1000.0, (et2-et1)/1000.0, num * sizeof(SBlockIdx), pReader->idStr);
+  tsdbDebug("load block index for %d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
+            (int32_t)num, (et1 - st)/1000.0, (et2-et1)/1000.0, num * sizeof(SBlockIdx)/1024.0, pReader->idStr);
+
+  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;
+
 _end:
   taosArrayDestroy(aBlockIdx);
   return code;
@@ -584,23 +595,22 @@ static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, uint32_
       break;
     }
 
+    tMapDataClear(&px->mapData);
     taosArrayClear(px->pBlockList);
   }
 
   for (int32_t i = 0; i < numOfTables; ++i) {
     SBlockIdx* pBlockIdx = taosArrayGet(pIndexList, i);
 
-    SMapData mapData = {0};
-    tMapDataReset(&mapData);
-    tsdbReadBlock(pReader->pFileReader, pBlockIdx, &mapData, NULL);
-
-    size += mapData.nData;
-
     STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(int64_t));
-    for (int32_t j = 0; j < mapData.nItem; ++j) {
-      SBlock block = {0};
 
-      tMapDataGetItemByIdx(&mapData, j, &block, tGetBlock);
+    tMapDataReset(&pScanInfo->mapData);
+    tsdbReadBlock(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData, NULL);
+
+    size += pScanInfo->mapData.nData;
+    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
+      SBlock block = {0};
+      tMapDataGetItemByIdx(&pScanInfo->mapData, j, &block, tGetBlock);
 
       // 1. time range check
       if (block.minKey.ts > pReader->window.ekey || block.maxKey.ts < pReader->window.skey) {
@@ -612,24 +622,26 @@ static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, uint32_
         continue;
       }
 
-      void* p = taosArrayPush(pScanInfo->pBlockList, &block);
+      void* p = taosArrayPush(pScanInfo->pBlockList, &j);
       if (p == NULL) {
-        tMapDataClear(&mapData);
+        tMapDataClear(&pScanInfo->mapData);
         return TSDB_CODE_OUT_OF_MEMORY;
       }
 
       (*numOfBlocks) += 1;
     }
 
-    tMapDataClear(&mapData);
     if (pScanInfo->pBlockList != NULL && taosArrayGetSize(pScanInfo->pBlockList) > 0) {
       (*numOfValidTables) += 1;
     }
   }
 
-  int64_t et = taosGetTimestampUs();
+  double el = (taosGetTimestampUs() - st)/1000.0;
   tsdbDebug("load block of %d tables completed, blocks:%d in %d tables, size:%.2f Kb, elapsed time:%.2f ms %s",
-      numOfTables, *numOfBlocks, *numOfValidTables, size/1000.0, (et-st)/1000.0, pReader->idStr);
+      numOfTables, *numOfBlocks, *numOfValidTables, size/1000.0, el, pReader->idStr);
+
+  pReader->cost.numOfBlocks += (*numOfBlocks);
+  pReader->cost.headFileLoadTime += el;
 
   return TSDB_CODE_SUCCESS;
 }
@@ -657,13 +669,22 @@ static void doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_
   }
 }
 
+static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
+  SFileDataBlockInfo* pFBlockInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
+  return pFBlockInfo;
+}
+
+static SBlock* getCurrentBlock(SDataBlockIter* pBlockIter) {
+  return &pBlockIter->block;
+}
+
 static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
   SReaderStatus*  pStatus = &pReader->status;
   SDataBlockIter* pBlockIter = &pStatus->blockIter;
 
   SBlockData*         pBlockData = &pStatus->fileBlockData;
   SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(pBlockIter);
-  SBlock*             pBlock = taosArrayGet(pBlockScanInfo->pBlockList, pFBlock->tbBlockIdx);
+  SBlock*             pBlock  = getCurrentBlock(pBlockIter);
   SSDataBlock*        pResBlock = pReader->pResBlock;
   int32_t             numOfCols = blockDataGetNumOfCols(pResBlock);
 
@@ -729,12 +750,12 @@ static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader, STableBlockScanIn
 
   setBlockAllDumped(pDumpInfo, pBlock, pReader->order);
 
-  int64_t elapsedTime = (taosGetTimestampUs() - st);
+  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
   pReader->cost.blockLoadTime += elapsedTime;
 
   int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
   tsdbDebug("%p load file block into buffer, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
-            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%" PRId64 " us, %s",
+            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
             pReader, pBlockIter->index, pFBlock->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, remain, unDumpedRows,
             pBlock->minVersion, pBlock->maxVersion, elapsedTime, pReader->idStr);
 
@@ -746,7 +767,8 @@ static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockI
   int64_t st = taosGetTimestampUs();
 
   SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(pBlockIter);
-  SBlock*             pBlock = taosArrayGet(pBlockScanInfo->pBlockList, pFBlock->tbBlockIdx);
+  SBlock* pBlock = getCurrentBlock(pBlockIter);
+
   SSDataBlock*        pResBlock = pReader->pResBlock;
   int32_t             numOfCols = blockDataGetNumOfCols(pResBlock);
 
@@ -759,14 +781,15 @@ static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockI
     goto _error;
   }
 
-  int64_t elapsedTime = (taosGetTimestampUs() - st);
+  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
   pReader->cost.blockLoadTime += elapsedTime;
 
   pDumpInfo->allDumped = false;
   tsdbDebug("%p load file block into buffer, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
-            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%" PRId64 " us, %s",
+            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
             pReader, pBlockIter->index, pFBlock->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
             pBlock->minVersion, pBlock->maxVersion, elapsedTime, pReader->idStr);
+
   return TSDB_CODE_SUCCESS;
 
 _error:
@@ -824,7 +847,21 @@ static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, v
   SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
   SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
 
-  return pLeftBlock->pBlock->aSubBlock[0].offset > pRightBlock->pBlock->aSubBlock[0].offset ? 1 : -1;
+  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
+}
+
+static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter) {
+  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(pBlockIter);
+  STableBlockScanInfo* pScanInfo = taosHashGet(pBlockIter->pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));
+
+  int32_t* mapDataIndex = taosArrayGet(pScanInfo->pBlockList, pFBlock->tbBlockIdx);
+  tMapDataGetItemByIdx(&pScanInfo->mapData, *mapDataIndex, &pBlockIter->block, tGetBlock);
+
+#if 0
+  qDebug("check file block, table uid:%"PRIu64" index:%d offset:%"PRId64", ", pScanInfo->uid, *mapDataIndex, pBlockIter->block.aSubBlock[0].offset);
+#endif
+
+  return TSDB_CODE_SUCCESS;
 }
 
 static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks) {
@@ -869,8 +906,13 @@ static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIte
     sup.pDataBlockInfo[sup.numOfTables] = (SBlockOrderWrapper*)buf;
     for (int32_t k = 0; k < num; ++k) {
       SBlockOrderWrapper wrapper = {0};
-      wrapper.pBlock = (SBlock*)taosArrayGet(pTableScanInfo->pBlockList, k);
+
+      SBlock block = {0};
+      int32_t* mapDataIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
+      tMapDataGetItemByIdx(&pTableScanInfo->mapData, *mapDataIndex, &block, tGetBlock);
+
       wrapper.uid = pTableScanInfo->uid;
+      wrapper.offset = block.aSubBlock[0].offset;
 
       sup.pDataBlockInfo[sup.numOfTables][k] = wrapper;
       cnt++;
@@ -894,6 +936,7 @@ static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIte
 
     pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
     cleanupBlockOrderSupporter(&sup);
+    doSetCurrentBlock(pBlockIter);
     return TSDB_CODE_SUCCESS;
   }
 
@@ -932,6 +975,8 @@ static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIte
   taosMemoryFree(pTree);
 
   pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
+  doSetCurrentBlock(pBlockIter);
+
   return TSDB_CODE_SUCCESS;
 }
 
@@ -944,6 +989,8 @@ static bool blockIteratorNext(SDataBlockIter* pBlockIter) {
   }
 
   pBlockIter->index += step;
+  doSetCurrentBlock(pBlockIter);
+
   return true;
 }
 
@@ -957,11 +1004,6 @@ static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* p
          (pVerRange->maxVer < pBlock->maxVersion && pVerRange->maxVer >= pBlock->minVersion);
 }
 
-static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
-  SFileDataBlockInfo* pFBlockInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
-  return pFBlockInfo;
-}
-
 static SBlock* getNeighborBlockOfSameTable(SFileDataBlockInfo* pFBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
                                            int32_t* nextIndex, int32_t order) {
   bool asc = ASCENDING_TRAVERSE(order);
@@ -974,10 +1016,13 @@ static SBlock* getNeighborBlockOfSameTable(SFileDataBlockInfo* pFBlockInfo, STab
   }
 
   int32_t step = asc ? 1 : -1;
-
   *nextIndex = pFBlockInfo->tbBlockIdx + step;
-  SBlock* pNext = taosArrayGet(pTableBlockScanInfo->pBlockList, *nextIndex);
-  return pNext;
+
+  SBlock *pBlock = taosMemoryCalloc(1, sizeof(SBlock));
+  int32_t* indexInMapdata = taosArrayGet(pTableBlockScanInfo->pBlockList, *nextIndex);
+
+  tMapDataGetItemByIdx(&pTableBlockScanInfo->mapData, *indexInMapdata, pBlock, tGetBlock);
+  return pBlock;
 }
 
 static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
@@ -1117,6 +1162,7 @@ static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pFBloc
   bool overlapWithNeighbor = false;
   if (pNeighbor) {
     overlapWithNeighbor = overlapWithNeighborBlock(pBlock, pNeighbor, pReader->order);
+    taosMemoryFree(pNeighbor);
   }
 
   // has duplicated ts of different version in this block
@@ -1142,11 +1188,13 @@ static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo*
 
   setComposedBlockFlag(pReader, true);
 
-  int64_t elapsedTime = taosGetTimestampUs() - st;
-  tsdbDebug("%p build data block from cache completed, elapsed time:%" PRId64
-            " us, numOfRows:%d, numOfCols:%d, brange: %" PRId64 " - %" PRId64 " %s",
-            pReader, elapsedTime, pBlock->info.rows, (int32_t)blockDataGetNumOfCols(pBlock), pBlock->info.window.skey,
-            pBlock->info.window.ekey, pReader->idStr);
+  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
+  tsdbDebug(
+      "%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%d, brange: %" PRId64
+      " - %" PRId64 " %s",
+      pReader, elapsedTime, pBlock->info.rows, pBlock->info.window.skey, pBlock->info.window.ekey, pReader->idStr);
+
+  pReader->cost.buildmemBlock += elapsedTime;
   return code;
 }
 
@@ -1408,9 +1456,7 @@ static int32_t buildComposedDataBlock(STsdbReader* pReader, STableBlockScanInfo*
       if (!isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
         pDumpInfo->rowIndex += step;
 
-        SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
-        SBlock*             pBlock = taosArrayGet(pBlockScanInfo->pBlockList, pFBlock->tbBlockIdx);
-
+        SBlock* pBlock = getCurrentBlock(&pReader->status.blockIter);
         if (pDumpInfo->rowIndex >= pBlock->nRow || pDumpInfo->rowIndex < 0) {
           setBlockAllDumped(pDumpInfo, pBlock, pReader->order);
           break;
@@ -1421,9 +1467,7 @@ static int32_t buildComposedDataBlock(STsdbReader* pReader, STableBlockScanInfo*
     }
 
     buildComposedDataBlockImpl(pReader, pBlockScanInfo);
-
-    SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
-    SBlock*             pBlock = taosArrayGet(pBlockScanInfo->pBlockList, pFBlock->tbBlockIdx);
+    SBlock* pBlock = getCurrentBlock(&pReader->status.blockIter);
 
     // currently loaded file data block is consumed
     if (pDumpInfo->rowIndex >= pBlock->nRow || pDumpInfo->rowIndex < 0) {
@@ -1666,7 +1710,7 @@ static int32_t doBuildDataBlock(STsdbReader* pReader) {
   SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(pBlockIter);
   STableBlockScanInfo* pScanInfo = taosHashGet(pStatus->pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));
 
-  SBlock* pBlock = taosArrayGet(pScanInfo->pBlockList, pFBlock->tbBlockIdx);
+  SBlock* pBlock = getCurrentBlock(pBlockIter);
 
   TSDBKEY key = getCurrentKeyInBuf(pBlockIter, pReader);
   if (fileBlockShouldLoad(pReader, pFBlock, pBlock, pScanInfo, key)) {
@@ -1729,9 +1773,7 @@ static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
 
 // set the correct start position in case of the first/last file block, according to the query time window
 static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
-  SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(pBlockIter);
-  STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));
-  SBlock*              pBlock = taosArrayGet(pScanInfo->pBlockList, pFBlock->tbBlockIdx);
+  SBlock* pBlock = getCurrentBlock(pBlockIter);
 
   SReaderStatus* pStatus = &pReader->status;
 
@@ -2102,6 +2144,8 @@ static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanIn
   }
 
   bool overlap = overlapWithNeighborBlock(pBlock, pNeighborBlock, pReader->order);
+  taosMemoryFree(pNeighborBlock);
+
   if (overlap) {  // load next block
     SReaderStatus*  pStatus = &pReader->status;
     SDataBlockIter* pBlockIter = &pStatus->blockIter;
@@ -2152,7 +2196,7 @@ int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pSc
       CHECK_FILEBLOCK_STATE st;
 
       SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
-      SBlock*             pCurrentBlock = taosArrayGet(pScanInfo->pBlockList, pFileBlockInfo->tbBlockIdx);
+      SBlock* pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);
       checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
       if (st == CHECK_FILEBLOCK_QUIT) {
         break;
@@ -2461,7 +2505,7 @@ int32_t tsdbReaderOpen(SVnode* pVnode, SQueryTableDataCond* pCond, SArray* pTabl
     SDataBlockIter* pBlockIter = &pReader->status.blockIter;
 
     initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader->order, pReader->idStr);
-    resetDataBlockIterator(&pReader->status.blockIter, pReader->order);
+    resetDataBlockIterator(&pReader->status.blockIter, pReader->order, pReader->status.pTableMap);
 
     // no data in files, let's try buffer in memory
     if (pReader->status.fileIter.numOfFiles == 0) {
@@ -2477,7 +2521,7 @@ int32_t tsdbReaderOpen(SVnode* pVnode, SQueryTableDataCond* pCond, SArray* pTabl
     SDataBlockIter* pBlockIter = &pPrevReader->status.blockIter;
 
     initFilesetIterator(&pPrevReader->status.fileIter, pPrevReader->pReadSnap->fs.aDFileSet, pPrevReader->order, pPrevReader->idStr);
-    resetDataBlockIterator(&pPrevReader->status.blockIter, pPrevReader->order);
+    resetDataBlockIterator(&pPrevReader->status.blockIter, pPrevReader->order, pReader->status.pTableMap);
 
     // no data in files, let's try buffer in memory
     if (pPrevReader->status.fileIter.numOfFiles == 0) {
@@ -2528,10 +2572,10 @@ void tsdbReaderClose(STsdbReader* pReader) {
 
   SIOCostSummary* pCost = &pReader->cost;
 
-  tsdbDebug("%p :io-cost summary: head-file read cnt:%" PRIu64 ", head-file time:%" PRIu64 " us, statis-info:%" PRId64
-            " us, datablock:%" PRId64 " us, check data:%" PRId64 " us, %s",
-            pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaLoadTime, pCost->blockLoadTime,
-            pCost->checkForNextTime, pReader->idStr);
+  tsdbDebug("%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%"PRId64" SMA-time:%.2f ms, "
+            "fileBlocks:%"PRId64", fileBlocks-time:%.2f ms, build in-memory-block-time:%.2f ms, %s",
+            pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaData, pCost->smaLoadTime,
+            pCost->numOfBlocks, pCost->blockLoadTime, pCost->buildmemBlock, pReader->idStr);
 
   taosMemoryFree(pReader->idStr);
   taosMemoryFree(pReader->pSchema);
@@ -2543,7 +2587,6 @@ static bool doTsdbNextDataBlock(STsdbReader* pReader) {
   SSDataBlock* pBlock = pReader->pResBlock;
   blockDataCleanup(pBlock);
 
-  int64_t        stime = taosGetTimestampUs();
   SReaderStatus* pStatus = &pReader->status;
 
   if (pStatus->loadFromFile) {
@@ -2639,9 +2682,8 @@ int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SColumnDataAgg*** pBlockS
   }
 
   SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
-  STableBlockScanInfo* pBlockScanInfo = taosHashGet(pReader->status.pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));
-  SBlock*              pBlock = taosArrayGet(pBlockScanInfo->pBlockList, pFBlock->tbBlockIdx);
 
+  SBlock* pBlock = getCurrentBlock(&pReader->status.blockIter);
   int64_t stime = taosGetTimestampUs();
 
   SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
@@ -2690,12 +2732,13 @@ int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SColumnDataAgg*** pBlockS
     }
   }
 
-  int64_t elapsed = taosGetTimestampUs() - stime;
+  double elapsed = (taosGetTimestampUs() - stime) / 1000.0;
   pReader->cost.smaLoadTime += elapsed;
+  pReader->cost.smaData += 1;
 
   *pBlockStatis = pSup->plist;
 
-  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", elapsed time:%" PRId64 "us, %s", 0, pFBlock->uid,
+  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", elapsed time:%.2f ms, %s", 0, pFBlock->uid,
             elapsed, pReader->idStr);
 
   return code;
@@ -2764,7 +2807,7 @@ int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
   tsdbDataFReaderClose(&pReader->pFileReader);
 
   initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader->order, pReader->idStr);
-  resetDataBlockIterator(&pReader->status.blockIter, pReader->order);
+  resetDataBlockIterator(&pReader->status.blockIter, pReader->order, pReader->status.pTableMap);
   resetDataBlockScanInfo(pReader->status.pTableMap);
 
   int32_t code = 0;

From bd975ddaee29a44b22536da77d2d11f1fc949ce8 Mon Sep 17 00:00:00 2001
From: "wenzhouwww@live.cn" <wenzhouwww@live.cn>
Date: Tue, 26 Jul 2022 10:24:38 +0800
Subject: [PATCH 03/26] update case bug fix

---
 tests/system-test/2-query/max_partition.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/tests/system-test/2-query/max_partition.py b/tests/system-test/2-query/max_partition.py
index 109c9075f5..4b9996d9c3 100644
--- a/tests/system-test/2-query/max_partition.py
+++ b/tests/system-test/2-query/max_partition.py
@@ -193,20 +193,11 @@ class TDTestCase:
         tdSql.query("select c1 , DERIVATIVE(c1,2,1) from stb partition by c1 order by c1")
         tdSql.checkRows(90)
         # bug need fix
-        # tdSql.checkData(0,1,None)
+        tdSql.checkData(0,1,None)
 
 
-
-
-
-
-
-        # bug need fix
-        # tdSql.query(" select tbname , max(c1) from stb partition by tbname order by tbname slimit 5 soffset 0 ")
-        # tdSql.checkRows(5)
-
-        # tdSql.query(" select tbname , max(c1) from stb partition by tbname order by tbname slimit 5 soffset 1 ")
-        # tdSql.checkRows(5)
+        tdSql.query(" select tbname , max(c1) from stb partition by tbname order by tbname slimit 5 soffset 0 ")
+        tdSql.checkRows(10)
 
         tdSql.query(" select tbname , max(c1) from sub_stb_1 partition by tbname interval(10s) sliding(5s) ")
 

From 662a9997c1aa20c94f296d5968fdceaa79d79ce5 Mon Sep 17 00:00:00 2001
From: shenglian zhou <slzhou@taosdata.com>
Date: Tue, 26 Jul 2022 10:39:58 +0800
Subject: [PATCH 04/26] fix: set null when all inputs are null for leastsquare
 function

---
 source/libs/function/src/builtinsimpl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/libs/function/src/builtinsimpl.c b/source/libs/function/src/builtinsimpl.c
index b6fe5b9998..92f9bdd1b2 100644
--- a/source/libs/function/src/builtinsimpl.c
+++ b/source/libs/function/src/builtinsimpl.c
@@ -2218,6 +2218,7 @@ int32_t leastSQRFinalize(SqlFunctionCtx* pCtx, SSDataBlock* pBlock) {
   int32_t currentRow = pBlock->info.rows;
 
   if (0 == pInfo->num) {
+    colDataAppendNULL(pCol, currentRow);
     return 0;
   }
 

From 43e2f8bc2bea1cf6a95741d764bcae9786ac1840 Mon Sep 17 00:00:00 2001
From: Minghao Li <castermode@gmail.com>
Date: Tue, 26 Jul 2022 10:59:56 +0800
Subject: [PATCH 05/26] refactor(sync): add pre-commit interface

---
 source/libs/sync/inc/syncInt.h           |  1 +
 source/libs/sync/src/syncAppendEntries.c | 46 +++++-------------------
 source/libs/sync/src/syncMain.c          | 33 ++---------------
 3 files changed, 12 insertions(+), 68 deletions(-)

diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h
index 64f66e390a..b802d94bea 100644
--- a/source/libs/sync/inc/syncInt.h
+++ b/source/libs/sync/inc/syncInt.h
@@ -238,6 +238,7 @@ int32_t   syncNodeGetPreIndexTerm(SSyncNode* pSyncNode, SyncIndex index, SyncInd
 
 bool    syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg);
 int32_t syncNodeCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag);
+int32_t syncNodePreCommit(SSyncNode* ths, SSyncRaftEntry* pEntry, int32_t code);
 
 int32_t syncNodeUpdateNewConfigIndex(SSyncNode* ths, SSyncCfg* pNewCfg);
 
diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c
index 4295abeaa1..7d0f53640c 100644
--- a/source/libs/sync/src/syncAppendEntries.c
+++ b/source/libs/sync/src/syncAppendEntries.c
@@ -244,22 +244,7 @@ int32_t syncNodeOnAppendEntriesCb(SSyncNode* ths, SyncAppendEntries* pMsg) {
         ths->pLogStore->appendEntry(ths->pLogStore, pAppendEntry);
 
         // pre commit
-        SRpcMsg rpcMsg;
-        syncEntry2OriginalRpc(pAppendEntry, &rpcMsg);
-        if (ths->pFsm != NULL) {
-          // if (ths->pFsm->FpPreCommitCb != NULL && pAppendEntry->originalRpcType != TDMT_SYNC_NOOP) {
-          if (ths->pFsm->FpPreCommitCb != NULL && syncUtilUserPreCommit(pAppendEntry->originalRpcType)) {
-            SFsmCbMeta cbMeta = {0};
-            cbMeta.index = pAppendEntry->index;
-            cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, cbMeta.index);
-            cbMeta.isWeak = pAppendEntry->isWeak;
-            cbMeta.code = 2;
-            cbMeta.state = ths->state;
-            cbMeta.seqNum = pAppendEntry->seqNum;
-            ths->pFsm->FpPreCommitCb(ths->pFsm, &rpcMsg, cbMeta);
-          }
-        }
-        rpcFreeCont(rpcMsg.pCont);
+        syncNodePreCommit(ths, pAppendEntry, 0);
       }
 
       // free memory
@@ -280,22 +265,7 @@ int32_t syncNodeOnAppendEntriesCb(SSyncNode* ths, SyncAppendEntries* pMsg) {
       ths->pLogStore->appendEntry(ths->pLogStore, pAppendEntry);
 
       // pre commit
-      SRpcMsg rpcMsg;
-      syncEntry2OriginalRpc(pAppendEntry, &rpcMsg);
-      if (ths->pFsm != NULL) {
-        // if (ths->pFsm->FpPreCommitCb != NULL && pAppendEntry->originalRpcType != TDMT_SYNC_NOOP) {
-        if (ths->pFsm->FpPreCommitCb != NULL && syncUtilUserPreCommit(pAppendEntry->originalRpcType)) {
-          SFsmCbMeta cbMeta = {0};
-          cbMeta.index = pAppendEntry->index;
-          cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, cbMeta.index);
-          cbMeta.isWeak = pAppendEntry->isWeak;
-          cbMeta.code = 3;
-          cbMeta.state = ths->state;
-          cbMeta.seqNum = pAppendEntry->seqNum;
-          ths->pFsm->FpPreCommitCb(ths->pFsm, &rpcMsg, cbMeta);
-        }
-      }
-      rpcFreeCont(rpcMsg.pCont);
+      syncNodePreCommit(ths, pAppendEntry, 0);
 
       // free memory
       syncEntryDestory(pAppendEntry);
@@ -440,7 +410,7 @@ static int32_t syncNodeDoMakeLogSame(SSyncNode* ths, SyncIndex FromIndex) {
   return code;
 }
 
-static int32_t syncNodePreCommit(SSyncNode* ths, SSyncRaftEntry* pEntry) {
+int32_t syncNodePreCommit(SSyncNode* ths, SSyncRaftEntry* pEntry, int32_t code) {
   SRpcMsg rpcMsg;
   syncEntry2OriginalRpc(pEntry, &rpcMsg);
 
@@ -456,7 +426,7 @@ static int32_t syncNodePreCommit(SSyncNode* ths, SSyncRaftEntry* pEntry) {
       cbMeta.index = pEntry->index;
       cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, cbMeta.index);
       cbMeta.isWeak = pEntry->isWeak;
-      cbMeta.code = 2;
+      cbMeta.code = code;
       cbMeta.state = ths->state;
       cbMeta.seqNum = pEntry->seqNum;
       ths->pFsm->FpPreCommitCb(ths->pFsm, &rpcMsg, cbMeta);
@@ -594,7 +564,7 @@ int32_t syncNodeOnAppendEntriesSnapshot2Cb(SSyncNode* ths, SyncAppendEntriesBatc
             return -1;
           }
 
-          code = syncNodePreCommit(ths, pAppendEntry);
+          code = syncNodePreCommit(ths, pAppendEntry, 0);
           ASSERT(code == 0);
 
           // syncEntryDestory(pAppendEntry);
@@ -715,7 +685,7 @@ int32_t syncNodeOnAppendEntriesSnapshot2Cb(SSyncNode* ths, SyncAppendEntriesBatc
             return -1;
           }
 
-          code = syncNodePreCommit(ths, pAppendEntry);
+          code = syncNodePreCommit(ths, pAppendEntry, 0);
           ASSERT(code == 0);
 
           // syncEntryDestory(pAppendEntry);
@@ -919,7 +889,7 @@ int32_t syncNodeOnAppendEntriesSnapshotCb(SSyncNode* ths, SyncAppendEntries* pMs
         }
 
         // pre commit
-        code = syncNodePreCommit(ths, pAppendEntry);
+        code = syncNodePreCommit(ths, pAppendEntry, 0);
         ASSERT(code == 0);
 
         // update match index
@@ -1032,7 +1002,7 @@ int32_t syncNodeOnAppendEntriesSnapshotCb(SSyncNode* ths, SyncAppendEntries* pMs
         }
 
         // pre commit
-        code = syncNodePreCommit(ths, pAppendEntry);
+        code = syncNodePreCommit(ths, pAppendEntry, 0);
         ASSERT(code == 0);
 
         syncEntryDestory(pAppendEntry);
diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c
index a453b2572c..63a404d5f6 100644
--- a/source/libs/sync/src/syncMain.c
+++ b/source/libs/sync/src/syncMain.c
@@ -2504,23 +2504,11 @@ int32_t syncNodeOnClientRequestCb(SSyncNode* ths, SyncClientRequest* pMsg, SyncI
     }
 
     // pre commit
+    syncNodePreCommit(ths, pEntry, 0);
+
     SRpcMsg rpcMsg;
     syncEntry2OriginalRpc(pEntry, &rpcMsg);
 
-    if (ths->pFsm != NULL) {
-      if (ths->pFsm->FpPreCommitCb != NULL && syncUtilUserPreCommit(pEntry->originalRpcType)) {
-        SFsmCbMeta cbMeta = {0};
-        cbMeta.index = pEntry->index;
-        cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, cbMeta.index);
-        cbMeta.isWeak = pEntry->isWeak;
-        cbMeta.code = 0;
-        cbMeta.state = ths->state;
-        cbMeta.seqNum = pEntry->seqNum;
-        ths->pFsm->FpPreCommitCb(ths->pFsm, &rpcMsg, cbMeta);
-      }
-    }
-    rpcFreeCont(rpcMsg.pCont);
-
     // if only myself, maybe commit right now
     if (ths->replicaNum == 1) {
       syncMaybeAdvanceCommitIndex(ths);
@@ -2528,22 +2516,7 @@ int32_t syncNodeOnClientRequestCb(SSyncNode* ths, SyncClientRequest* pMsg, SyncI
 
   } else {
     // pre commit
-    SRpcMsg rpcMsg;
-    syncEntry2OriginalRpc(pEntry, &rpcMsg);
-
-    if (ths->pFsm != NULL) {
-      if (ths->pFsm->FpPreCommitCb != NULL && syncUtilUserPreCommit(pEntry->originalRpcType)) {
-        SFsmCbMeta cbMeta = {0};
-        cbMeta.index = pEntry->index;
-        cbMeta.lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, cbMeta.index);
-        cbMeta.isWeak = pEntry->isWeak;
-        cbMeta.code = 1;
-        cbMeta.state = ths->state;
-        cbMeta.seqNum = pEntry->seqNum;
-        ths->pFsm->FpPreCommitCb(ths->pFsm, &rpcMsg, cbMeta);
-      }
-    }
-    rpcFreeCont(rpcMsg.pCont);
+    syncNodePreCommit(ths, pEntry, 0);
   }
 
   if (pRetIndex != NULL) {

From d47905c3acff8fd355d2eb6c65dc668c817600a3 Mon Sep 17 00:00:00 2001
From: plum-lihui <huili@taosdata.com>
Date: Tue, 26 Jul 2022 11:00:16 +0800
Subject: [PATCH 06/26] test: add test case

---
 .../7-tmq/dropDbR3ConflictTransaction.py      | 193 ++++++++++++++++++
 tests/system-test/fulltest.sh                 |   2 +-
 2 files changed, 194 insertions(+), 1 deletion(-)
 create mode 100644 tests/system-test/7-tmq/dropDbR3ConflictTransaction.py

diff --git a/tests/system-test/7-tmq/dropDbR3ConflictTransaction.py b/tests/system-test/7-tmq/dropDbR3ConflictTransaction.py
new file mode 100644
index 0000000000..4dac872fde
--- /dev/null
+++ b/tests/system-test/7-tmq/dropDbR3ConflictTransaction.py
@@ -0,0 +1,193 @@
+from distutils.log import error
+import taos
+import sys
+import time
+import socket
+import os
+import threading
+import subprocess
+import platform
+
+from util.log import *
+from util.sql import *
+from util.cases import *
+from util.dnodes import *
+from util.common import *
+sys.path.append("./7-tmq")
+from tmqCommon import *
+
+
+
+class TDTestCase:
+    def __init__(self):
+        self.snapshot   = 0
+        self.replica    = 3
+        self.vgroups    = 3
+        self.ctbNum     = 2
+        self.rowsPerTbl = 2
+
+    def init(self, conn, logSql):
+        tdLog.debug(f"start to excute {__file__}")
+        tdSql.init(conn.cursor())
+        #tdSql.init(conn.cursor(), logSql)  # output sql.txt file
+
+    def checkFileContent(self, consumerId, queryString):
+        """Compare rows consumed via tmq with rows from a direct query.
+
+        Dumps the ``queryString`` result through the taos CLI into a log file and
+        calls ``tdLog.exit`` if a dumped row differs from the consumer's row file.
+        """
+        buildPath = tdCom.getBuildPath()
+        cfgPath = tdCom.getClientCfgPath()
+        dstFile = '%s/../log/dstrows_%d.txt'%(cfgPath, consumerId)
+        cmdStr = '%s/build/bin/taos -c %s -s "%s >> %s"'%(buildPath, cfgPath, queryString, dstFile)
+        tdLog.info(cmdStr)
+        os.system(cmdStr)
+
+        consumeRowsFile = '%s/../log/consumerid_%d.txt'%(cfgPath, consumerId)
+        tdLog.info("rows file: %s, %s"%(consumeRowsFile, dstFile))
+
+        # the with-block closes both files on every exit path (they used to leak)
+        with open(consumeRowsFile, mode='r') as consumeFile, open(dstFile, mode='r') as queryFile:
+            # skip first line for it is schema
+            queryFile.readline()
+
+            # walk the query dump to its end, pairing each row with the next consumed row
+            for dst in queryFile:
+                src = consumeFile.readline()
+                if dst != src:
+                    tdLog.exit("consumerId %d consume rows is not match the rows by direct query"%consumerId)
+        return
+
+    def prepareTestEnv(self):
+        tdLog.printNoPrefix("======== prepare test env include database, stable, ctables, and insert data: ")
+        paraDict = {'dbName':     'dbt',
+                    'dropFlag':   1,
+                    'event':      '',
+                    'vgroups':    4,
+                    'stbName':    'stb',
+                    'colPrefix':  'c',
+                    'tagPrefix':  't',
+                    'colSchema':   [{'type': 'INT', 'count':1},{'type': 'BIGINT', 'count':1}],
+                    'tagSchema':   [{'type': 'INT', 'count':1},{'type': 'BIGINT', 'count':1}],
+                    'ctbPrefix':  'ctb',
+                    'ctbStartIdx': 0,
+                    'ctbNum':     2,
+                    'rowsPerTbl': 1000,
+                    'batchNum':   10,
+                    'startTs':    1640966400000,  # 2022-01-01 00:00:00.000
+                    'pollDelay':  3,
+                    'showMsg':    1,
+                    'showRow':    1,
+                    'snapshot':   0}
+
+        paraDict['vgroups'] = self.vgroups
+        paraDict['ctbNum'] = self.ctbNum
+        paraDict['rowsPerTbl'] = self.rowsPerTbl
+        
+        tmqCom.initConsumerTable()
+        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=self.replica)
+        tdLog.info("create stb")
+        tmqCom.create_stable(tdSql, dbName=paraDict["dbName"],stbName=paraDict["stbName"])
+        tdLog.info("create ctb")
+        tmqCom.create_ctable(tdSql, dbName=paraDict["dbName"],stbName=paraDict["stbName"],ctbPrefix=paraDict['ctbPrefix'],
+                             ctbNum=paraDict["ctbNum"],ctbStartIdx=paraDict['ctbStartIdx'])
+        tdLog.info("insert data")
+        tmqCom.insert_data_interlaceByMultiTbl(tsql=tdSql,dbName=paraDict["dbName"],ctbPrefix=paraDict["ctbPrefix"],
+                                               ctbNum=paraDict["ctbNum"],rowsPerTbl=paraDict["rowsPerTbl"],batchNum=paraDict["batchNum"],
+                                               startTs=paraDict["startTs"],ctbStartIdx=paraDict['ctbStartIdx'])
+        # tmqCom.insert_data_with_autoCreateTbl(tsql=tdSql,dbName=paraDict["dbName"],stbName=paraDict["stbName"],ctbPrefix="ctbx",
+        #                                       ctbNum=paraDict["ctbNum"],rowsPerTbl=paraDict["rowsPerTbl"],batchNum=paraDict["batchNum"],
+        #                                       startTs=paraDict["startTs"],ctbStartIdx=paraDict['ctbStartIdx'])
+        # tmqCom.asyncInsertDataByInterlace(paraDict)
+        tmqCom.create_ntable(tdSql, dbname=paraDict["dbName"], tbname_prefix="ntb", tbname_index_start_num = 1, column_elm_list=paraDict["colSchema"], colPrefix='c', tblNum=1)
+        tmqCom.insert_rows_into_ntbl(tdSql, dbname=paraDict["dbName"], tbname_prefix="ntb", tbname_index_start_num = 1, column_ele_list=paraDict["colSchema"], startTs=paraDict["startTs"], tblNum=1, rows=2)        # tdLog.info("restart taosd to ensure that the data falls into the disk")        
+        tdSql.query("drop database %s"%paraDict["dbName"])
+        return
+
+    def tmqCase1(self):        
+        tdLog.printNoPrefix("======== test case 1: ")       
+        
+        # create and start thread
+        paraDict = {'dbName':     'dbt',
+                    'dropFlag':   1,
+                    'event':      '',
+                    'vgroups':    4,
+                    'stbName':    'stb',
+                    'colPrefix':  'c',
+                    'tagPrefix':  't',
+                    'colSchema':   [{'type': 'INT', 'count':1},{'type': 'BIGINT', 'count':1},{'type': 'DOUBLE', 'count':1},{'type': 'BINARY', 'len':32, 'count':1},{'type': 'NCHAR', 'len':32, 'count':1},{'type': 'TIMESTAMP', 'count':1}],
+                    'tagSchema':   [{'type': 'INT', 'count':1},{'type': 'BIGINT', 'count':1},{'type': 'DOUBLE', 'count':1},{'type': 'BINARY', 'len':32, 'count':1},{'type': 'NCHAR', 'len':32, 'count':1}],
+                    'ctbPrefix':  'ctb',
+                    'ctbStartIdx': 0,
+                    'ctbNum':     100,
+                    'rowsPerTbl': 1000,
+                    'batchNum':   100,
+                    'startTs':    1640966400000,  # 2022-01-01 00:00:00.000
+                    'pollDelay':  3,
+                    'showMsg':    1,
+                    'showRow':    1,
+                    'snapshot':   1}
+
+        paraDict['vgroups'] = self.vgroups
+        paraDict['ctbNum'] = self.ctbNum
+        paraDict['rowsPerTbl'] = self.rowsPerTbl
+        
+        tdLog.info("create topics from stb1")
+        topicFromStb1 = 'topic_stb1'        
+        queryString = "select ts, c1, c2 from %s.%s  where t4 == 'beijing' or t4 == 'changsha' "%(paraDict['dbName'], paraDict['stbName'])
+        sqlString = "create topic %s as %s" %(topicFromStb1, queryString)
+        tdLog.info("create topic sql: %s"%sqlString)
+        tdSql.execute(sqlString)
+        
+        consumerId     = 0
+        expectrowcnt   = paraDict["rowsPerTbl"] * paraDict["ctbNum"]
+        topicList      = topicFromStb1
+        ifcheckdata    = 0
+        ifManualCommit = 0
+        keyList        = 'group.id:cgrp1,\
+                        enable.auto.commit:false,\
+                        auto.commit.interval.ms:6000,\
+                        auto.offset.reset:earliest'
+        tmqCom.insertConsumerInfo(consumerId, expectrowcnt,topicList,keyList,ifcheckdata,ifManualCommit)
+
+        tdLog.info("start consume processor")
+        pollDelay = 100
+        showMsg   = 1
+        showRow   = 1
+        tmqCom.startTmqSimProcess(pollDelay=paraDict['pollDelay'],dbName=paraDict["dbName"],showMsg=paraDict['showMsg'], showRow=paraDict['showRow'],snapshot=paraDict['snapshot'])
+
+        tdLog.info("start to check consume result")
+        expectRows = 1
+        resultList = tmqCom.selectConsumeResult(expectRows)
+        totalConsumeRows = 0
+        for i in range(expectRows):
+            totalConsumeRows += resultList[i]
+
+        tdSql.query(queryString)
+        totalRowsInserted = tdSql.getRows()
+        
+        tdLog.info("act consume rows: %d, act insert rows: %d, expect consume rows: %d, "%(totalConsumeRows, totalRowsInserted, expectrowcnt))
+        
+        if totalConsumeRows != expectrowcnt:
+            tdLog.exit("tmq consume rows error!")
+            
+        # tmqCom.checkFileContent(consumerId, queryString)   
+
+        tmqCom.waitSubscriptionExit(tdSql, topicFromStb1)
+        tdSql.query("drop topic %s"%topicFromStb1)
+
+        tdLog.printNoPrefix("======== test case 1 end ...... ")
+
+    def run(self):
+        self.prepareTestEnv()
+        # self.tmqCase1()
+
+    def stop(self):
+        tdSql.close()
+        tdLog.success(f"{__file__} successfully executed")
+
+event = threading.Event()
+
+tdCases.addLinux(__file__, TDTestCase())
+tdCases.addWindows(__file__, TDTestCase())
diff --git a/tests/system-test/fulltest.sh b/tests/system-test/fulltest.sh
index f074bd8850..27cb06bf18 100755
--- a/tests/system-test/fulltest.sh
+++ b/tests/system-test/fulltest.sh
@@ -173,7 +173,7 @@ python3 ./test.py -f 6-cluster/5dnode3mnodeAdd1Ddnoe.py -N 6 -M 3 -C 5
 # python3 ./test.py -f 6-cluster/5dnode3mnodeDrop.py -N 5
 # python3 test.py -f 6-cluster/5dnode3mnodeStopConnect.py -N 5 -M 3
 
- 
+python3 ./test.py -f 7-tmq/dropDbR3ConflictTransaction.py 
 python3 ./test.py -f 7-tmq/basic5.py
 python3 ./test.py -f 7-tmq/subscribeDb.py
 python3 ./test.py -f 7-tmq/subscribeDb0.py

From b870b6381f90667174125160669bbc92d68c7619 Mon Sep 17 00:00:00 2001
From: shenglian zhou <slzhou@taosdata.com>
Date: Tue, 26 Jul 2022 11:14:17 +0800
Subject: [PATCH 07/26] fix: save sort exec info into table merge scan struct
 and use it when explain

---
 source/libs/executor/inc/executorimpl.h | 2 ++
 source/libs/executor/src/scanoperator.c | 9 ++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/source/libs/executor/inc/executorimpl.h b/source/libs/executor/inc/executorimpl.h
index 0beb6f1784..4a57819eba 100644
--- a/source/libs/executor/inc/executorimpl.h
+++ b/source/libs/executor/inc/executorimpl.h
@@ -359,6 +359,8 @@ typedef struct STableMergeScanInfo {
   // window to check if current data block needs to be loaded.
   SInterval       interval;
   SSampleExecInfo sample;  // sample execution info
+
+  SSortExecInfo sortExecInfo;
 } STableMergeScanInfo;
 
 typedef struct STagScanInfo {
diff --git a/source/libs/executor/src/scanoperator.c b/source/libs/executor/src/scanoperator.c
index 2dcb555834..463ac0e69c 100644
--- a/source/libs/executor/src/scanoperator.c
+++ b/source/libs/executor/src/scanoperator.c
@@ -2831,6 +2831,13 @@ int32_t stopGroupTableMergeScan(SOperatorInfo* pOperator) {
 
   size_t numReaders = taosArrayGetSize(pInfo->dataReaders);
 
+  SSortExecInfo sortExecInfo = tsortGetSortExecInfo(pInfo->pSortHandle);
+  pInfo->sortExecInfo.sortMethod = sortExecInfo.sortMethod;
+  pInfo->sortExecInfo.sortBuffer = sortExecInfo.sortBuffer;
+  pInfo->sortExecInfo.loops += sortExecInfo.loops;
+  pInfo->sortExecInfo.readBytes += sortExecInfo.readBytes;
+  pInfo->sortExecInfo.writeBytes += sortExecInfo.writeBytes;
+
   for (int32_t i = 0; i < numReaders; ++i) {
     STableMergeScanSortSourceParam* param = taosArrayGet(pInfo->sortSourceParams, i);
     blockDataDestroy(param->inputBlock);
@@ -2955,7 +2962,7 @@ int32_t getTableMergeScanExplainExecInfo(SOperatorInfo* pOptr, void** pOptrExpla
   STableMergeScanExecInfo* execInfo = taosMemoryCalloc(1, sizeof(STableMergeScanExecInfo));
   STableMergeScanInfo*     pInfo = pOptr->info;
   execInfo->blockRecorder = pInfo->readRecorder;
-  execInfo->sortExecInfo = tsortGetSortExecInfo(pInfo->pSortHandle);
+  execInfo->sortExecInfo = pInfo->sortExecInfo;
 
   *pOptrExplain = execInfo;
   *len = sizeof(STableMergeScanExecInfo);

From 0addf0999693fcc1fcf3c5fc54e5c94049b0b942 Mon Sep 17 00:00:00 2001
From: Minghao Li <castermode@gmail.com>
Date: Tue, 26 Jul 2022 11:24:39 +0800
Subject: [PATCH 08/26] refactor(sync): add pre-commit interface

---
 source/dnode/vnode/src/vnd/vnodeSync.c | 33 +++++++++++++++++++-------
 source/libs/sync/src/syncCommit.c      | 26 +++++---------------
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c
index a0e2354f51..2b760efba0 100644
--- a/source/dnode/vnode/src/vnd/vnodeSync.c
+++ b/source/dnode/vnode/src/vnd/vnodeSync.c
@@ -206,13 +206,13 @@ static void inline vnodeProposeBatchMsg(SVnode *pVnode, SRpcMsg **pMsgArr, bool
 }
 
 void vnodeProposeWriteMsg(SQueueInfo *pInfo, STaosQall *qall, int32_t numOfMsgs) {
-  SVnode  *pVnode = pInfo->ahandle;
-  int32_t  vgId = pVnode->config.vgId;
-  int32_t  code = 0;
-  SRpcMsg *pMsg = NULL;
-  int32_t  arrayPos = 0;
-  SRpcMsg **pMsgArr = taosMemoryCalloc(numOfMsgs, sizeof(SRpcMsg*));
-  bool    *pIsWeakArr = taosMemoryCalloc(numOfMsgs, sizeof(bool));
+  SVnode   *pVnode = pInfo->ahandle;
+  int32_t   vgId = pVnode->config.vgId;
+  int32_t   code = 0;
+  SRpcMsg  *pMsg = NULL;
+  int32_t   arrayPos = 0;
+  SRpcMsg **pMsgArr = taosMemoryCalloc(numOfMsgs, sizeof(SRpcMsg *));
+  bool     *pIsWeakArr = taosMemoryCalloc(numOfMsgs, sizeof(bool));
   vTrace("vgId:%d, get %d msgs from vnode-write queue", vgId, numOfMsgs);
 
   for (int32_t msg = 0; msg < numOfMsgs; msg++) {
@@ -506,7 +506,7 @@ static void vnodeSyncCommitMsg(SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta c
          syncGetVgId(pVnode->sync), pFsm, cbMeta.index, cbMeta.isWeak, cbMeta.code, cbMeta.state,
          syncUtilState2String(cbMeta.state), pMsg->msgType, TMSG_INFO(pMsg->msgType));
 
-  if (cbMeta.code == 0) {
+  if (cbMeta.code == 0 && cbMeta.isWeak == 0) {
     SRpcMsg rpcMsg = {.msgType = pMsg->msgType, .contLen = pMsg->contLen};
     rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen);
     memcpy(rpcMsg.pCont, pMsg->pCont, pMsg->contLen);
@@ -529,6 +529,23 @@ static void vnodeSyncPreCommitMsg(SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMet
   vTrace("vgId:%d, pre-commit-cb is excuted, fsm:%p, index:%" PRId64 ", isWeak:%d, code:%d, state:%d %s, msgtype:%d %s",
          syncGetVgId(pVnode->sync), pFsm, cbMeta.index, cbMeta.isWeak, cbMeta.code, cbMeta.state,
          syncUtilState2String(cbMeta.state), pMsg->msgType, TMSG_INFO(pMsg->msgType));
+
+  if (cbMeta.code == 0 && cbMeta.isWeak == 1) {
+    SRpcMsg rpcMsg = {.msgType = pMsg->msgType, .contLen = pMsg->contLen};
+    rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen);
+    memcpy(rpcMsg.pCont, pMsg->pCont, pMsg->contLen);
+    syncGetAndDelRespRpc(pVnode->sync, cbMeta.seqNum, &rpcMsg.info);
+    rpcMsg.info.conn.applyIndex = cbMeta.index;
+    rpcMsg.info.conn.applyTerm = cbMeta.term;
+    tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg);
+  } else if (cbMeta.code != 0) {  // a strong entry (isWeak == 0) with code 0 is not an error
+    SRpcMsg rsp = {.code = cbMeta.code, .info = pMsg->info};
+    vError("vgId:%d, sync pre-commit error, msgtype:%d,%s, error:0x%X, errmsg:%s", syncGetVgId(pVnode->sync),
+           pMsg->msgType, TMSG_INFO(pMsg->msgType), cbMeta.code, tstrerror(cbMeta.code));
+    if (rsp.info.handle != NULL) {
+      tmsgSendRsp(&rsp);
+    }
+  }
 }
 
 static void vnodeSyncRollBackMsg(SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta cbMeta) {
diff --git a/source/libs/sync/src/syncCommit.c b/source/libs/sync/src/syncCommit.c
index b3cdd079a4..fd6577477f 100644
--- a/source/libs/sync/src/syncCommit.c
+++ b/source/libs/sync/src/syncCommit.c
@@ -67,11 +67,6 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) {
   for (SyncIndex index = syncNodeGetLastIndex(pSyncNode); index > pSyncNode->commitIndex; --index) {
     bool agree = syncAgree(pSyncNode, index);
 
-    if (gRaftDetailLog) {
-      sTrace("syncMaybeAdvanceCommitIndex syncAgree:%d, index:%" PRId64 ", pSyncNode->commitIndex:%" PRId64, agree,
-             index, pSyncNode->commitIndex);
-    }
-
     if (agree) {
       // term
       SSyncRaftEntry* pEntry = pSyncNode->pLogStore->getEntry(pSyncNode->pLogStore, index);
@@ -82,20 +77,15 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) {
         // update commit index
         newCommitIndex = index;
 
-        if (gRaftDetailLog) {
-          sTrace("syncMaybeAdvanceCommitIndex maybe to update, newCommitIndex:%" PRId64
-                 " commit, pSyncNode->commitIndex:%" PRId64,
-                 newCommitIndex, pSyncNode->commitIndex);
-        }
-
         syncEntryDestory(pEntry);
         break;
       } else {
-        if (gRaftDetailLog) {
-          sTrace("syncMaybeAdvanceCommitIndex can not commit due to term not equal, pEntry->term:%" PRIu64
-                 ", pSyncNode->pRaftStore->currentTerm:%" PRIu64,
-                 pEntry->term, pSyncNode->pRaftStore->currentTerm);
-        }
+        do {
+          char logBuf[128];  // index/term are 64-bit: use the PRI*64 macros, not %ld/%lu
+          snprintf(logBuf, sizeof(logBuf), "can not commit due to term not equal, index:%" PRId64 ", term:%" PRIu64,
+                   pEntry->index, pEntry->term);
+          syncNodeEventLog(pSyncNode, logBuf);
+        } while (0);
       }
 
       syncEntryDestory(pEntry);
@@ -107,10 +97,6 @@ void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) {
     SyncIndex beginIndex = pSyncNode->commitIndex + 1;
     SyncIndex endIndex = newCommitIndex;
 
-    if (gRaftDetailLog) {
-      sTrace("syncMaybeAdvanceCommitIndex sync commit %" PRId64, newCommitIndex);
-    }
-
     // update commit index
     pSyncNode->commitIndex = newCommitIndex;
 

From 6600540fd9a034238f5f7cd726c39617b4db6736 Mon Sep 17 00:00:00 2001
From: Haojun Liao <hjliao@taosdata.com>
Date: Tue, 26 Jul 2022 11:43:45 +0800
Subject: [PATCH 09/26] refactor: optimize the memory consumption during
 tsdbread

---
 source/dnode/vnode/src/inc/tsdb.h      |  2 +-
 source/dnode/vnode/src/tsdb/tsdbRead.c | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/source/dnode/vnode/src/inc/tsdb.h b/source/dnode/vnode/src/inc/tsdb.h
index 04bf6bcc2b..d8c84e952b 100644
--- a/source/dnode/vnode/src/inc/tsdb.h
+++ b/source/dnode/vnode/src/inc/tsdb.h
@@ -371,8 +371,8 @@ struct SBlockIdx {
 
 struct SMapData {
   int32_t  nItem;
-  int32_t *aOffset;
   int32_t  nData;
+  int32_t *aOffset;
   uint8_t *pData;
 };
 
diff --git a/source/dnode/vnode/src/tsdb/tsdbRead.c b/source/dnode/vnode/src/tsdb/tsdbRead.c
index 386326fca3..a17504ea04 100644
--- a/source/dnode/vnode/src/tsdb/tsdbRead.c
+++ b/source/dnode/vnode/src/tsdb/tsdbRead.c
@@ -31,7 +31,6 @@ typedef struct {
 typedef struct STableBlockScanInfo {
   uint64_t  uid;
   TSKEY     lastKey;
-  SBlockIdx blockIdx;
   SMapData  mapData;     // block info (compressed)
   SArray*   pBlockList;  // block data index list
   SIterInfo iter;        // mem buffer skip list iterator
@@ -188,7 +187,7 @@ static int32_t setColumnIdSlotList(STsdbReader* pReader, SSDataBlock* pBlock) {
 
 static SHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, const STableKeyInfo* idList, int32_t numOfTables) {
   // allocate buffer in order to load data blocks from file
-  // todo use simple hash instead
+  // todo use simple hash instead, optimize the memory consumption
   SHashObj* pTableMap =
       taosHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_NO_LOCK);
   if (pTableMap == NULL) {
@@ -249,6 +248,7 @@ static void destroyBlockScanInfo(SHashObj* pTableMap) {
 
     p->delSkyline = taosArrayDestroy(p->delSkyline);
     p->pBlockList = taosArrayDestroy(p->pBlockList);
+    tMapDataClear(&p->mapData);
   }
 
   taosHashCleanup(pTableMap);
@@ -565,7 +565,6 @@ static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader,
       pScanInfo->pBlockList = taosArrayInit(4, sizeof(int32_t));
     }
 
-    pScanInfo->blockIdx = *pBlockIdx;
     taosArrayPush(pIndexList, pBlockIdx);
   }
 
@@ -775,7 +774,8 @@ static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockI
   SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
   SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
 
-  int32_t code = tsdbReadColData(pReader->pFileReader, &pBlockScanInfo->blockIdx, pBlock, pSupInfo->colIds, numOfCols,
+  SBlockIdx blockIdx = {.suid = pReader->suid, .uid =  pBlockScanInfo->uid};
+  int32_t code = tsdbReadColData(pReader->pFileReader, &blockIdx, pBlock, pSupInfo->colIds, numOfCols,
                                  pBlockData, NULL, NULL);
   if (code != TSDB_CODE_SUCCESS) {
     goto _error;
@@ -904,10 +904,10 @@ static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIte
     }
 
     sup.pDataBlockInfo[sup.numOfTables] = (SBlockOrderWrapper*)buf;
+    SBlock block = {0};
     for (int32_t k = 0; k < num; ++k) {
       SBlockOrderWrapper wrapper = {0};
 
-      SBlock block = {0};
       int32_t* mapDataIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
       tMapDataGetItemByIdx(&pTableScanInfo->mapData, *mapDataIndex, &block, tGetBlock);
 
@@ -2563,6 +2563,8 @@ void tsdbReaderClose(STsdbReader* pReader) {
   taosMemoryFree(pSupInfo->buildBuf);
 
   cleanupDataBlockIterator(&pReader->status.blockIter);
+
+  size_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
   destroyBlockScanInfo(pReader->status.pTableMap);
   blockDataDestroy(pReader->pResBlock);
 
@@ -2573,9 +2575,11 @@ void tsdbReaderClose(STsdbReader* pReader) {
   SIOCostSummary* pCost = &pReader->cost;
 
   tsdbDebug("%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%"PRId64" SMA-time:%.2f ms, "
-            "fileBlocks:%"PRId64", fileBlocks-time:%.2f ms, build in-memory-block-time:%.2f ms, %s",
+            "fileBlocks:%"PRId64", fileBlocks-time:%.2f ms, build in-memory-block-time:%.2f ms, STableBlockScanInfo "
+                                "size:%.2f Kb %s",
             pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaData, pCost->smaLoadTime,
-            pCost->numOfBlocks, pCost->blockLoadTime, pCost->buildmemBlock, pReader->idStr);
+            pCost->numOfBlocks, pCost->blockLoadTime, pCost->buildmemBlock,
+            numOfTables * sizeof(STableBlockScanInfo) /1000.0, pReader->idStr);
 
   taosMemoryFree(pReader->idStr);
   taosMemoryFree(pReader->pSchema);

From beabd02905a49887314769f5e9e32d63c677d045 Mon Sep 17 00:00:00 2001
From: plum-lihui <huili@taosdata.com>
Date: Tue, 26 Jul 2022 11:46:57 +0800
Subject: [PATCH 10/26] test: add replica=3 case

---
 tests/system-test/fulltest.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system-test/fulltest.sh b/tests/system-test/fulltest.sh
index 27cb06bf18..58d1df0f31 100755
--- a/tests/system-test/fulltest.sh
+++ b/tests/system-test/fulltest.sh
@@ -173,7 +173,7 @@ python3 ./test.py -f 6-cluster/5dnode3mnodeAdd1Ddnoe.py -N 6 -M 3 -C 5
 # python3 ./test.py -f 6-cluster/5dnode3mnodeDrop.py -N 5
 # python3 test.py -f 6-cluster/5dnode3mnodeStopConnect.py -N 5 -M 3
 
-python3 ./test.py -f 7-tmq/dropDbR3ConflictTransaction.py 
+python3 ./test.py -f 7-tmq/dropDbR3ConflictTransaction.py -N 3 
 python3 ./test.py -f 7-tmq/basic5.py
 python3 ./test.py -f 7-tmq/subscribeDb.py
 python3 ./test.py -f 7-tmq/subscribeDb0.py

From 49a4e83d484c32d839ee97561a7034920c7d9b65 Mon Sep 17 00:00:00 2001
From: Minghao Li <castermode@gmail.com>
Date: Tue, 26 Jul 2022 13:23:56 +0800
Subject: [PATCH 11/26] refactor(sync): add pre-commit interface

---
 source/libs/sync/src/syncMain.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c
index 63a404d5f6..935d89b99b 100644
--- a/source/libs/sync/src/syncMain.c
+++ b/source/libs/sync/src/syncMain.c
@@ -2506,9 +2506,6 @@ int32_t syncNodeOnClientRequestCb(SSyncNode* ths, SyncClientRequest* pMsg, SyncI
     // pre commit
     syncNodePreCommit(ths, pEntry, 0);
 
-    SRpcMsg rpcMsg;
-    syncEntry2OriginalRpc(pEntry, &rpcMsg);
-
     // if only myself, maybe commit right now
     if (ths->replicaNum == 1) {
       syncMaybeAdvanceCommitIndex(ths);

From 4532bcdf4e89febddf75e74c363d7767e25b62b6 Mon Sep 17 00:00:00 2001
From: Haojun Liao <hjliao@taosdata.com>
Date: Tue, 26 Jul 2022 13:26:29 +0800
Subject: [PATCH 12/26] fix(query):set ptr to be null after releasing
 resources.

---
 source/dnode/vnode/src/tsdb/tsdbUtil.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/source/dnode/vnode/src/tsdb/tsdbUtil.c b/source/dnode/vnode/src/tsdb/tsdbUtil.c
index 872357fc93..805e49a705 100644
--- a/source/dnode/vnode/src/tsdb/tsdbUtil.c
+++ b/source/dnode/vnode/src/tsdb/tsdbUtil.c
@@ -24,6 +24,8 @@ void tMapDataReset(SMapData *pMapData) {
 void tMapDataClear(SMapData *pMapData) {
   tFree((uint8_t *)pMapData->aOffset);
   tFree(pMapData->pData);
+  pMapData->pData = NULL;
+  pMapData->aOffset = NULL;
 }
 
 int32_t tMapDataPutItem(SMapData *pMapData, void *pItem, int32_t (*tPutItemFn)(uint8_t *, void *)) {

From 9d5acf18123ebd582c3df09189d2f84ede0bdb11 Mon Sep 17 00:00:00 2001
From: Shengliang Guan <slguan@taosdata.com>
Date: Tue, 26 Jul 2022 13:26:52 +0800
Subject: [PATCH 13/26] enh: add wal options to db

---
 include/common/tmsg.h                       |  4 ++++
 source/common/src/tmsg.c                    |  9 +++++++++
 source/dnode/mgmt/mgmt_vnode/src/vmHandle.c |  7 +++++++
 source/dnode/mnode/impl/inc/mndDef.h        |  6 +++++-
 source/dnode/mnode/impl/src/mndDb.c         | 20 ++++++++++++++++++++
 source/dnode/mnode/impl/src/mndVgroup.c     |  4 ++++
 6 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/include/common/tmsg.h b/include/common/tmsg.h
index 0d0eb841bc..9d4f77d1ef 100644
--- a/include/common/tmsg.h
+++ b/include/common/tmsg.h
@@ -1154,6 +1154,10 @@ typedef struct {
   int32_t  numOfRetensions;
   SArray*  pRetensions;  // SRetention
   void*    pTsma;
+  int32_t  walRetentionPeriod;
+  int32_t  walRetentionSize;
+  int32_t  walRollPeriod;
+  int32_t  walSegmentSize;
 } SCreateVnodeReq;
 
 int32_t tSerializeSCreateVnodeReq(void* buf, int32_t bufLen, SCreateVnodeReq* pReq);
diff --git a/source/common/src/tmsg.c b/source/common/src/tmsg.c
index f87d336ec2..d4e56ec742 100644
--- a/source/common/src/tmsg.c
+++ b/source/common/src/tmsg.c
@@ -3750,6 +3750,10 @@ int32_t tSerializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *pR
     uint32_t tsmaLen = (uint32_t)(htonl(((SMsgHead *)pReq->pTsma)->contLen));
     if (tEncodeBinary(&encoder, (const uint8_t *)pReq->pTsma, tsmaLen) < 0) return -1;
   }
+  if (tEncodeI32(&encoder, pReq->walRetentionPeriod) < 0) return -1;
+  if (tEncodeI32(&encoder, pReq->walRetentionSize) < 0) return -1;
+  if (tEncodeI32(&encoder, pReq->walRollPeriod) < 0) return -1;
+  if (tEncodeI32(&encoder, pReq->walSegmentSize) < 0) return -1;
 
   tEndEncode(&encoder);
 
@@ -3818,6 +3822,11 @@ int32_t tDeserializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *
     if (tDecodeBinary(&decoder, (uint8_t **)&pReq->pTsma, NULL) < 0) return -1;
   }
 
+  if (tDecodeI32(&decoder, &pReq->walRetentionPeriod) < 0) return -1;
+  if (tDecodeI32(&decoder, &pReq->walRetentionSize) < 0) return -1;
+  if (tDecodeI32(&decoder, &pReq->walRollPeriod) < 0) return -1;
+  if (tDecodeI32(&decoder, &pReq->walSegmentSize) < 0) return -1;
+
   tEndDecode(&decoder);
   tDecoderClear(&decoder);
   return 0;
diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
index 0471e2b850..159b9ca568 100644
--- a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
+++ b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
@@ -160,6 +160,13 @@ static void vmGenerateVnodeCfg(SCreateVnodeReq *pCreate, SVnodeCfg *pCfg) {
   }
 
   pCfg->walCfg.vgId = pCreate->vgId;
+  pCfg->walCfg.fsyncPeriod = pCreate->fsyncPeriod;
+  pCfg->walCfg.retentionPeriod = pCreate->walRetentionPeriod;
+  pCfg->walCfg.retentionSize = pCreate->walRetentionSize;
+  pCfg->walCfg.rollPeriod = pCreate->walRollPeriod;
+  pCfg->walCfg.segSize = pCreate->walSegmentSize;
+  pCfg->walCfg.level = pCreate->walLevel;
+
   pCfg->hashBegin = pCreate->hashBegin;
   pCfg->hashEnd = pCreate->hashEnd;
   pCfg->hashMethod = pCreate->hashMethod;
diff --git a/source/dnode/mnode/impl/inc/mndDef.h b/source/dnode/mnode/impl/inc/mndDef.h
index c9997fa3d5..e7f78b34c5 100644
--- a/source/dnode/mnode/impl/inc/mndDef.h
+++ b/source/dnode/mnode/impl/inc/mndDef.h
@@ -302,9 +302,13 @@ typedef struct {
   int8_t  strict;
   int8_t  hashMethod;  // default is 1
   int8_t  cacheLast;
+  int8_t  schemaless;
   int32_t numOfRetensions;
   SArray* pRetensions;
-  int8_t  schemaless;
+  int32_t walRetentionPeriod;
+  int32_t walRetentionSize;
+  int32_t walRollPeriod;
+  int32_t walSegmentSize;
 } SDbCfg;
 
 typedef struct {
diff --git a/source/dnode/mnode/impl/src/mndDb.c b/source/dnode/mnode/impl/src/mndDb.c
index 064ef9b40a..d183b519e8 100644
--- a/source/dnode/mnode/impl/src/mndDb.c
+++ b/source/dnode/mnode/impl/src/mndDb.c
@@ -120,6 +120,10 @@ static SSdbRaw *mndDbActionEncode(SDbObj *pDb) {
     SDB_SET_INT8(pRaw, dataPos, pRetension->keepUnit, _OVER)
   }
   SDB_SET_INT8(pRaw, dataPos, pDb->cfg.schemaless, _OVER)
+  SDB_SET_INT32(pRaw, dataPos, pDb->cfg.walRetentionPeriod, _OVER)
+  SDB_SET_INT32(pRaw, dataPos, pDb->cfg.walRetentionSize, _OVER)
+  SDB_SET_INT32(pRaw, dataPos, pDb->cfg.walRollPeriod, _OVER)
+  SDB_SET_INT32(pRaw, dataPos, pDb->cfg.walSegmentSize, _OVER)
 
   SDB_SET_RESERVE(pRaw, dataPos, DB_RESERVE_SIZE, _OVER)
   SDB_SET_DATALEN(pRaw, dataPos, _OVER)
@@ -199,6 +203,10 @@ static SSdbRow *mndDbActionDecode(SSdbRaw *pRaw) {
     }
   }
   SDB_GET_INT8(pRaw, dataPos, &pDb->cfg.schemaless, _OVER)
+  SDB_GET_INT32(pRaw, dataPos, &pDb->cfg.walRetentionPeriod, _OVER)
+  SDB_GET_INT32(pRaw, dataPos, &pDb->cfg.walRetentionSize, _OVER)
+  SDB_GET_INT32(pRaw, dataPos, &pDb->cfg.walRollPeriod, _OVER)
+  SDB_GET_INT32(pRaw, dataPos, &pDb->cfg.walSegmentSize, _OVER)
 
   SDB_GET_RESERVE(pRaw, dataPos, DB_RESERVE_SIZE, _OVER)
   taosInitRWLatch(&pDb->lock);
@@ -318,6 +326,10 @@ static int32_t mndCheckDbCfg(SMnode *pMnode, SDbCfg *pCfg) {
     terrno = TSDB_CODE_MND_NO_ENOUGH_DNODES;
     return -1;
   }
+  if (pCfg->walRetentionPeriod < TSDB_DB_MIN_WAL_RETENTION_PERIOD) return -1;
+  if (pCfg->walRetentionSize < TSDB_DB_MIN_WAL_RETENTION_SIZE) return -1;
+  if (pCfg->walRollPeriod < TSDB_DB_MIN_WAL_ROLL_PERIOD) return -1;
+  if (pCfg->walSegmentSize < TSDB_DB_MIN_WAL_SEGMENT_SIZE) return -1;
 
   terrno = 0;
   return terrno;
@@ -345,6 +357,10 @@ static void mndSetDefaultDbCfg(SDbCfg *pCfg) {
   if (pCfg->cacheLastSize <= 0) pCfg->cacheLastSize = TSDB_DEFAULT_CACHE_SIZE;
   if (pCfg->numOfRetensions < 0) pCfg->numOfRetensions = 0;
   if (pCfg->schemaless < 0) pCfg->schemaless = TSDB_DB_SCHEMALESS_OFF;
+  if (pCfg->walRetentionPeriod < 0) pCfg->walRetentionPeriod = TSDB_DEFAULT_DB_WAL_RETENTION_PERIOD;
+  if (pCfg->walRetentionSize < 0) pCfg->walRetentionSize = TSDB_DEFAULT_DB_WAL_RETENTION_SIZE;
+  if (pCfg->walRollPeriod < 0) pCfg->walRollPeriod = TSDB_DEFAULT_DB_WAL_ROLL_PERIOD;
+  if (pCfg->walSegmentSize < 0) pCfg->walSegmentSize = TSDB_DEFAULT_DB_WAL_SEGMENT_SIZE;
 }
 
 static int32_t mndSetCreateDbRedoLogs(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SVgObj *pVgroups) {
@@ -457,6 +473,10 @@ static int32_t mndCreateDb(SMnode *pMnode, SRpcMsg *pReq, SCreateDbReq *pCreate,
       .cacheLast = pCreate->cacheLast,
       .hashMethod = 1,
       .schemaless = pCreate->schemaless,
+      .walRetentionPeriod = pCreate->walRetentionPeriod,
+      .walRetentionSize = pCreate->walRetentionSize,
+      .walRollPeriod = pCreate->walRollPeriod,
+      .walSegmentSize = pCreate->walSegmentSize,
   };
 
   dbObj.cfg.numOfRetensions = pCreate->numOfRetensions;
diff --git a/source/dnode/mnode/impl/src/mndVgroup.c b/source/dnode/mnode/impl/src/mndVgroup.c
index 3eb3a6cd1f..4625b2ab01 100644
--- a/source/dnode/mnode/impl/src/mndVgroup.c
+++ b/source/dnode/mnode/impl/src/mndVgroup.c
@@ -230,6 +230,10 @@ void *mndBuildCreateVnodeReq(SMnode *pMnode, SDnodeObj *pDnode, SDbObj *pDb, SVg
   createReq.standby = standby;
   createReq.isTsma = pVgroup->isTsma;
   createReq.pTsma = pVgroup->pTsma;
+  createReq.walRetentionPeriod = pDb->cfg.walRetentionPeriod;
+  createReq.walRetentionSize = pDb->cfg.walRetentionSize;
+  createReq.walRollPeriod = pDb->cfg.walRollPeriod;
+  createReq.walSegmentSize = pDb->cfg.walSegmentSize;
 
   for (int32_t v = 0; v < pVgroup->replica; ++v) {
     SReplica  *pReplica = &createReq.replicas[v];

From 0b8e9af8ec9eb0328e29b47c4c70e548f1ecacf7 Mon Sep 17 00:00:00 2001
From: Liu Jicong <liujicong@qq.com>
Date: Tue, 26 Jul 2022 13:33:26 +0800
Subject: [PATCH 14/26] refactor(wal)

---
 include/libs/wal/wal.h         |  4 +--
 source/libs/wal/src/walMgmt.c  | 14 +++++-----
 source/libs/wal/src/walSeek.c  | 24 ++++++++--------
 source/libs/wal/src/walWrite.c | 50 +++++++++++++++++-----------------
 4 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/include/libs/wal/wal.h b/include/libs/wal/wal.h
index 220c4f73e0..5b8d70fb7c 100644
--- a/include/libs/wal/wal.h
+++ b/include/libs/wal/wal.h
@@ -103,8 +103,8 @@ typedef struct SWal {
   int32_t fsyncSeq;
   // meta
   SWalVer   vers;
-  TdFilePtr pWriteLogTFile;
-  TdFilePtr pWriteIdxTFile;
+  TdFilePtr pLogFile;
+  TdFilePtr pIdxFile;
   int32_t   writeCur;
   SArray   *fileInfoSet;  // SArray<SWalFileInfo>
   // status
diff --git a/source/libs/wal/src/walMgmt.c b/source/libs/wal/src/walMgmt.c
index 85238e87b9..047354c4aa 100644
--- a/source/libs/wal/src/walMgmt.c
+++ b/source/libs/wal/src/walMgmt.c
@@ -101,8 +101,8 @@ SWal *walOpen(const char *path, SWalCfg *pCfg) {
 
   // open meta
   walResetVer(&pWal->vers);
-  pWal->pWriteLogTFile = NULL;
-  pWal->pWriteIdxTFile = NULL;
+  pWal->pLogFile = NULL;
+  pWal->pIdxFile = NULL;
   pWal->writeCur = -1;
   pWal->fileInfoSet = taosArrayInit(8, sizeof(SWalFileInfo));
   if (pWal->fileInfoSet == NULL) {
@@ -179,10 +179,10 @@ int32_t walAlter(SWal *pWal, SWalCfg *pCfg) {
 
 void walClose(SWal *pWal) {
   taosThreadMutexLock(&pWal->mutex);
-  taosCloseFile(&pWal->pWriteLogTFile);
-  pWal->pWriteLogTFile = NULL;
-  taosCloseFile(&pWal->pWriteIdxTFile);
-  pWal->pWriteIdxTFile = NULL;
+  taosCloseFile(&pWal->pLogFile);
+  pWal->pLogFile = NULL;
+  taosCloseFile(&pWal->pIdxFile);
+  pWal->pIdxFile = NULL;
   walSaveMeta(pWal);
   taosArrayDestroy(pWal->fileInfoSet);
   pWal->fileInfoSet = NULL;
@@ -223,7 +223,7 @@ static void walFsyncAll() {
     if (walNeedFsync(pWal)) {
       wTrace("vgId:%d, do fsync, level:%d seq:%d rseq:%d", pWal->cfg.vgId, pWal->cfg.level, pWal->fsyncSeq,
              atomic_load_32(&tsWal.seq));
-      int32_t code = taosFsyncFile(pWal->pWriteLogTFile);
+      int32_t code = taosFsyncFile(pWal->pLogFile);
       if (code != 0) {
         wError("vgId:%d, file:%" PRId64 ".log, failed to fsync since %s", pWal->cfg.vgId, walGetLastFileFirstVer(pWal),
                strerror(code));
diff --git a/source/libs/wal/src/walSeek.c b/source/libs/wal/src/walSeek.c
index 78d45c84e2..87ab155065 100644
--- a/source/libs/wal/src/walSeek.c
+++ b/source/libs/wal/src/walSeek.c
@@ -22,8 +22,8 @@
 static int64_t walSeekWritePos(SWal* pWal, int64_t ver) {
   int64_t code = 0;
 
-  TdFilePtr pIdxTFile = pWal->pWriteIdxTFile;
-  TdFilePtr pLogTFile = pWal->pWriteLogTFile;
+  TdFilePtr pIdxTFile = pWal->pIdxFile;
+  TdFilePtr pLogTFile = pWal->pLogFile;
 
   // seek position
   int64_t idxOff = walGetVerIdxOffset(pWal, ver);
@@ -68,8 +68,8 @@ int walInitWriteFile(SWal* pWal) {
     return -1;
   }
   // switch file
-  pWal->pWriteIdxTFile = pIdxTFile;
-  pWal->pWriteLogTFile = pLogTFile;
+  pWal->pIdxFile = pIdxTFile;
+  pWal->pLogFile = pLogTFile;
   pWal->writeCur = taosArrayGetSize(pWal->fileInfoSet) - 1;
   return 0;
 }
@@ -78,15 +78,15 @@ int walChangeWrite(SWal* pWal, int64_t ver) {
   int       code;
   TdFilePtr pIdxTFile, pLogTFile;
   char      fnameStr[WAL_FILE_LEN];
-  if (pWal->pWriteLogTFile != NULL) {
-    code = taosCloseFile(&pWal->pWriteLogTFile);
+  if (pWal->pLogFile != NULL) {
+    code = taosCloseFile(&pWal->pLogFile);
     if (code != 0) {
       terrno = TAOS_SYSTEM_ERROR(errno);
       return -1;
     }
   }
-  if (pWal->pWriteIdxTFile != NULL) {
-    code = taosCloseFile(&pWal->pWriteIdxTFile);
+  if (pWal->pIdxFile != NULL) {
+    code = taosCloseFile(&pWal->pIdxFile);
     if (code != 0) {
       terrno = TAOS_SYSTEM_ERROR(errno);
       return -1;
@@ -106,7 +106,7 @@ int walChangeWrite(SWal* pWal, int64_t ver) {
   pIdxTFile = taosOpenFile(fnameStr, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND);
   if (pIdxTFile == NULL) {
     terrno = TAOS_SYSTEM_ERROR(errno);
-    pWal->pWriteIdxTFile = NULL;
+    pWal->pIdxFile = NULL;
     return -1;
   }
   walBuildLogName(pWal, fileFirstVer, fnameStr);
@@ -114,12 +114,12 @@ int walChangeWrite(SWal* pWal, int64_t ver) {
   if (pLogTFile == NULL) {
     taosCloseFile(&pIdxTFile);
     terrno = TAOS_SYSTEM_ERROR(errno);
-    pWal->pWriteLogTFile = NULL;
+    pWal->pLogFile = NULL;
     return -1;
   }
 
-  pWal->pWriteLogTFile = pLogTFile;
-  pWal->pWriteIdxTFile = pIdxTFile;
+  pWal->pLogFile = pLogTFile;
+  pWal->pIdxFile = pIdxTFile;
   pWal->writeCur = idx;
   return fileFirstVer;
 }
diff --git a/source/libs/wal/src/walWrite.c b/source/libs/wal/src/walWrite.c
index 81500d8088..bbd916ba4e 100644
--- a/source/libs/wal/src/walWrite.c
+++ b/source/libs/wal/src/walWrite.c
@@ -32,8 +32,8 @@ int32_t walRestoreFromSnapshot(SWal *pWal, int64_t ver) {
     }
   }
 
-  taosCloseFile(&pWal->pWriteLogTFile);
-  taosCloseFile(&pWal->pWriteIdxTFile);
+  taosCloseFile(&pWal->pLogFile);
+  taosCloseFile(&pWal->pIdxFile);
 
   if (pWal->vers.firstVer != -1) {
     int32_t fileSetSize = taosArrayGetSize(pWal->fileInfoSet);
@@ -324,34 +324,34 @@ END:
 
 int32_t walRollImpl(SWal *pWal) {
   int32_t code = 0;
-  if (pWal->pWriteIdxTFile != NULL) {
-    code = taosCloseFile(&pWal->pWriteIdxTFile);
+  if (pWal->pIdxFile != NULL) {
+    code = taosCloseFile(&pWal->pIdxFile);
     if (code != 0) {
       terrno = TAOS_SYSTEM_ERROR(errno);
       goto END;
     }
   }
-  if (pWal->pWriteLogTFile != NULL) {
-    code = taosCloseFile(&pWal->pWriteLogTFile);
+  if (pWal->pLogFile != NULL) {
+    code = taosCloseFile(&pWal->pLogFile);
     if (code != 0) {
       terrno = TAOS_SYSTEM_ERROR(errno);
       goto END;
     }
   }
-  TdFilePtr pIdxTFile, pLogTFile;
+  TdFilePtr pIdxFile, pLogFile;
   // create new file
-  int64_t newFileFirstVersion = pWal->vers.lastVer + 1;
+  int64_t newFileFirstVer = pWal->vers.lastVer + 1;
   char    fnameStr[WAL_FILE_LEN];
-  walBuildIdxName(pWal, newFileFirstVersion, fnameStr);
-  pIdxTFile = taosOpenFile(fnameStr, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND);
-  if (pIdxTFile == NULL) {
+  walBuildIdxName(pWal, newFileFirstVer, fnameStr);
+  pIdxFile = taosOpenFile(fnameStr, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND);
+  if (pIdxFile == NULL) {
     terrno = TAOS_SYSTEM_ERROR(errno);
     code = -1;
     goto END;
   }
-  walBuildLogName(pWal, newFileFirstVersion, fnameStr);
-  pLogTFile = taosOpenFile(fnameStr, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND);
-  if (pLogTFile == NULL) {
+  walBuildLogName(pWal, newFileFirstVer, fnameStr);
+  pLogFile = taosOpenFile(fnameStr, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND);
+  if (pLogFile == NULL) {
     terrno = TAOS_SYSTEM_ERROR(errno);
     code = -1;
     goto END;
@@ -363,8 +363,8 @@ int32_t walRollImpl(SWal *pWal) {
   }
 
   // switch file
-  pWal->pWriteIdxTFile = pIdxTFile;
-  pWal->pWriteLogTFile = pLogTFile;
+  pWal->pIdxFile = pIdxFile;
+  pWal->pLogFile = pLogFile;
   pWal->writeCur = taosArrayGetSize(pWal->fileInfoSet) - 1;
   ASSERT(pWal->writeCur >= 0);
 
@@ -378,10 +378,10 @@ END:
 
 static int32_t walWriteIndex(SWal *pWal, int64_t ver, int64_t offset) {
   SWalIdxEntry entry = {.ver = ver, .offset = offset};
-  int64_t      idxOffset = taosLSeekFile(pWal->pWriteIdxTFile, 0, SEEK_END);
+  int64_t      idxOffset = taosLSeekFile(pWal->pIdxFile, 0, SEEK_END);
   wDebug("vgId:%d, write index, index:%" PRId64 ", offset:%" PRId64 ", at %" PRId64, pWal->cfg.vgId, ver, offset,
          idxOffset);
-  int64_t size = taosWriteFile(pWal->pWriteIdxTFile, &entry, sizeof(SWalIdxEntry));
+  int64_t size = taosWriteFile(pWal->pIdxFile, &entry, sizeof(SWalIdxEntry));
   if (size != sizeof(SWalIdxEntry)) {
     terrno = TAOS_SYSTEM_ERROR(errno);
     // TODO truncate
@@ -407,7 +407,7 @@ static FORCE_INLINE int32_t walWriteImpl(SWal *pWal, int64_t index, tmsg_t msgTy
   pWal->writeHead.cksumHead = walCalcHeadCksum(&pWal->writeHead);
   pWal->writeHead.cksumBody = walCalcBodyCksum(body, bodyLen);
 
-  if (taosWriteFile(pWal->pWriteLogTFile, &pWal->writeHead, sizeof(SWalCkHead)) != sizeof(SWalCkHead)) {
+  if (taosWriteFile(pWal->pLogFile, &pWal->writeHead, sizeof(SWalCkHead)) != sizeof(SWalCkHead)) {
     // TODO ftruncate
     terrno = TAOS_SYSTEM_ERROR(errno);
     wError("vgId:%d, file:%" PRId64 ".log, failed to write since %s", pWal->cfg.vgId, walGetLastFileFirstVer(pWal),
@@ -416,7 +416,7 @@ static FORCE_INLINE int32_t walWriteImpl(SWal *pWal, int64_t index, tmsg_t msgTy
     goto END;
   }
 
-  if (taosWriteFile(pWal->pWriteLogTFile, (char *)body, bodyLen) != bodyLen) {
+  if (taosWriteFile(pWal->pLogFile, (char *)body, bodyLen) != bodyLen) {
     // TODO ftruncate
     terrno = TAOS_SYSTEM_ERROR(errno);
     wError("vgId:%d, file:%" PRId64 ".log, failed to write since %s", pWal->cfg.vgId, walGetLastFileFirstVer(pWal),
@@ -456,14 +456,14 @@ int64_t walAppendLog(SWal *pWal, tmsg_t msgType, SWalSyncInfo syncMeta, const vo
     return -1;
   }
 
-  if (pWal->pWriteIdxTFile == NULL || pWal->pWriteIdxTFile == NULL || pWal->writeCur < 0) {
+  if (pWal->pIdxFile == NULL || pWal->pLogFile == NULL || pWal->writeCur < 0) {
     if (walInitWriteFile(pWal) < 0) {
       taosThreadMutexUnlock(&pWal->mutex);
       return -1;
     }
   }
 
-  ASSERT(pWal->pWriteIdxTFile != NULL && pWal->pWriteLogTFile != NULL && pWal->writeCur >= 0);
+  ASSERT(pWal->pIdxFile != NULL && pWal->pLogFile != NULL && pWal->writeCur >= 0);
 
   if (walWriteImpl(pWal, index, msgType, syncMeta, body, bodyLen) < 0) {
     taosThreadMutexUnlock(&pWal->mutex);
@@ -494,14 +494,14 @@ int32_t walWriteWithSyncInfo(SWal *pWal, int64_t index, tmsg_t msgType, SWalSync
     return -1;
   }
 
-  if (pWal->pWriteIdxTFile == NULL || pWal->pWriteIdxTFile == NULL || pWal->writeCur < 0) {
+  if (pWal->pIdxFile == NULL || pWal->pLogFile == NULL || pWal->writeCur < 0) {
     if (walInitWriteFile(pWal) < 0) {
       taosThreadMutexUnlock(&pWal->mutex);
       return -1;
     }
   }
 
-  ASSERT(pWal->pWriteIdxTFile != NULL && pWal->pWriteLogTFile != NULL && pWal->writeCur >= 0);
+  ASSERT(pWal->pIdxFile != NULL && pWal->pLogFile != NULL && pWal->writeCur >= 0);
 
   if (walWriteImpl(pWal, index, msgType, syncMeta, body, bodyLen) < 0) {
     taosThreadMutexUnlock(&pWal->mutex);
@@ -524,7 +524,7 @@ int32_t walWrite(SWal *pWal, int64_t index, tmsg_t msgType, const void *body, in
 void walFsync(SWal *pWal, bool forceFsync) {
   if (forceFsync || (pWal->cfg.level == TAOS_WAL_FSYNC && pWal->cfg.fsyncPeriod == 0)) {
     wTrace("vgId:%d, fileId:%" PRId64 ".log, do fsync", pWal->cfg.vgId, walGetCurFileFirstVer(pWal));
-    if (taosFsyncFile(pWal->pWriteLogTFile) < 0) {
+    if (taosFsyncFile(pWal->pLogFile) < 0) {
       wError("vgId:%d, file:%" PRId64 ".log, fsync failed since %s", pWal->cfg.vgId, walGetCurFileFirstVer(pWal),
              strerror(errno));
     }

From c2164dd8aaf731ea4b5ccf1d974ce53c8ae7afb3 Mon Sep 17 00:00:00 2001
From: Liu Jicong <liujicong@qq.com>
Date: Tue, 26 Jul 2022 14:13:03 +0800
Subject: [PATCH 15/26] fix(wal): snapshot end with no deleting

---
 source/dnode/vnode/src/tq/tq.c        |  2 +-
 source/dnode/vnode/src/vnd/vnodeCfg.c |  4 +-
 source/libs/wal/src/walWrite.c        | 56 ++++++++++++++-------------
 3 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c
index 118e3a5d43..2adfc92ab1 100644
--- a/source/dnode/vnode/src/tq/tq.c
+++ b/source/dnode/vnode/src/tq/tq.c
@@ -583,7 +583,7 @@ int32_t tqProcessVgChangeReq(STQ* pTq, char* msg, int32_t msgLen) {
       pHandle->execHandle.execTb.suid = req.suid;
       SArray* tbUidList = taosArrayInit(0, sizeof(int64_t));
       vnodeGetCtbIdList(pTq->pVnode, req.suid, tbUidList);
-      tqDebug("vgId:%d, tq try get suid:%" PRId64, pTq->pVnode->config.vgId, req.suid);
+      tqDebug("vgId:%d, tq try to get all ctb, suid:%" PRId64, pTq->pVnode->config.vgId, req.suid);
       for (int32_t i = 0; i < taosArrayGetSize(tbUidList); i++) {
         int64_t tbUid = *(int64_t*)taosArrayGet(tbUidList, i);
         tqDebug("vgId:%d, idx %d, uid:%" PRId64, TD_VID(pTq->pVnode), i, tbUid);
diff --git a/source/dnode/vnode/src/vnd/vnodeCfg.c b/source/dnode/vnode/src/vnd/vnodeCfg.c
index eac1fd1a74..e38fe9876b 100644
--- a/source/dnode/vnode/src/vnd/vnodeCfg.c
+++ b/source/dnode/vnode/src/vnd/vnodeCfg.c
@@ -40,8 +40,8 @@ const SVnodeCfg vnodeCfgDefault = {.vgId = -1,
                                            .vgId = -1,
                                            .fsyncPeriod = 0,
                                            .retentionPeriod = -1,
-                                           .rollPeriod = -1,
-                                           .segSize = -1,
+                                           .rollPeriod = 0,
+                                           .segSize = 0,
                                            .retentionSize = -1,
                                            .level = TAOS_WAL_WRITE,
                                        },
diff --git a/source/libs/wal/src/walWrite.c b/source/libs/wal/src/walWrite.c
index bbd916ba4e..1bd769d5f0 100644
--- a/source/libs/wal/src/walWrite.c
+++ b/source/libs/wal/src/walWrite.c
@@ -277,35 +277,37 @@ int32_t walEndSnapshot(SWal *pWal) {
   tmp.firstVer = ver;
   // find files safe to delete
   SWalFileInfo *pInfo = taosArraySearch(pWal->fileInfoSet, &tmp, compareWalFileInfo, TD_LE);
-  if (ver >= pInfo->lastVer) {
-    pInfo++;
-  }
-  // iterate files, until the searched result
-  for (SWalFileInfo *iter = pWal->fileInfoSet->pData; iter < pInfo; iter++) {
-    if ((pWal->cfg.retentionSize != -1 && newTotSize > pWal->cfg.retentionSize) ||
-        (pWal->cfg.retentionPeriod != -1 && iter->closeTs + pWal->cfg.retentionPeriod > ts)) {
-      // delete according to file size or close time
-      deleteCnt++;
-      newTotSize -= iter->fileSize;
+  if (pInfo) {
+    if (ver >= pInfo->lastVer) {
+      pInfo++;
+    }
+    // iterate files, until the searched result
+    for (SWalFileInfo *iter = pWal->fileInfoSet->pData; iter < pInfo; iter++) {
+      if ((pWal->cfg.retentionSize != -1 && newTotSize > pWal->cfg.retentionSize) ||
+          (pWal->cfg.retentionPeriod != -1 && iter->closeTs + pWal->cfg.retentionPeriod > ts)) {
+        // delete according to file size or close time
+        deleteCnt++;
+        newTotSize -= iter->fileSize;
+      }
+    }
+    char fnameStr[WAL_FILE_LEN];
+    // remove file
+    for (int i = 0; i < deleteCnt; i++) {
+      pInfo = taosArrayGet(pWal->fileInfoSet, i);
+      walBuildLogName(pWal, pInfo->firstVer, fnameStr);
+      taosRemoveFile(fnameStr);
+      walBuildIdxName(pWal, pInfo->firstVer, fnameStr);
+      taosRemoveFile(fnameStr);
     }
-  }
-  char fnameStr[WAL_FILE_LEN];
-  // remove file
-  for (int i = 0; i < deleteCnt; i++) {
-    pInfo = taosArrayGet(pWal->fileInfoSet, i);
-    walBuildLogName(pWal, pInfo->firstVer, fnameStr);
-    taosRemoveFile(fnameStr);
-    walBuildIdxName(pWal, pInfo->firstVer, fnameStr);
-    taosRemoveFile(fnameStr);
-  }
 
-  // make new array, remove files
-  taosArrayPopFrontBatch(pWal->fileInfoSet, deleteCnt);
-  if (taosArrayGetSize(pWal->fileInfoSet) == 0) {
-    pWal->writeCur = -1;
-    pWal->vers.firstVer = -1;
-  } else {
-    pWal->vers.firstVer = ((SWalFileInfo *)taosArrayGet(pWal->fileInfoSet, 0))->firstVer;
+    // make new array, remove files
+    taosArrayPopFrontBatch(pWal->fileInfoSet, deleteCnt);
+    if (taosArrayGetSize(pWal->fileInfoSet) == 0) {
+      pWal->writeCur = -1;
+      pWal->vers.firstVer = -1;
+    } else {
+      pWal->vers.firstVer = ((SWalFileInfo *)taosArrayGet(pWal->fileInfoSet, 0))->firstVer;
+    }
   }
   pWal->writeCur = taosArrayGetSize(pWal->fileInfoSet) - 1;
   pWal->totSize = newTotSize;

From 285eb4ff259f7ec8f55f263c91872006df83bbd9 Mon Sep 17 00:00:00 2001
From: Shengliang Guan <slguan@taosdata.com>
Date: Tue, 26 Jul 2022 14:41:39 +0800
Subject: [PATCH 16/26] test: valgrind case

---
 tests/script/tsim/valgrind/basic2.sim      | 140 ++++++++++-----------
 tests/script/tsim/valgrind/basic4.sim      |  74 +++++++++++
 tests/script/tsim/valgrind/checkError3.sim |   1 +
 tests/script/tsim/valgrind/checkError5.sim |  33 +++++
 tests/script/tsim/valgrind/checkError7.sim |  75 +++++++++++
 5 files changed, 246 insertions(+), 77 deletions(-)
 create mode 100644 tests/script/tsim/valgrind/basic4.sim
 create mode 100644 tests/script/tsim/valgrind/checkError7.sim

diff --git a/tests/script/tsim/valgrind/basic2.sim b/tests/script/tsim/valgrind/basic2.sim
index 45ac78daf0..7c905209ee 100644
--- a/tests/script/tsim/valgrind/basic2.sim
+++ b/tests/script/tsim/valgrind/basic2.sim
@@ -1,6 +1,7 @@
 system sh/stop_dnodes.sh
 system sh/deploy.sh -n dnode1 -i 1
-system sh/exec.sh -n dnode1 -s start -v
+system sh/cfg.sh -n dnode1 -c debugflag -v 131
+system sh/exec.sh -n dnode1 -s start
 sql connect
 
 print =============== step1: create drop show dnodes
@@ -21,88 +22,73 @@ if $data(1)[4] != ready then
   goto step1
 endi
 
-print =============== step2: create db
-sql create database db
+$tbPrefix = tb
+$tbNum = 5
+$rowNum = 10
+
+print =============== step2: prepare data
+sql create database db vgroups 2
 sql use db
-sql create table db.stb (ts timestamp, c1 int, c2 binary(4)) tags(t1 int, t2 float, t3 binary(16)) comment "abd"
-sql create table db.c1 using db.stb tags(101, 102, "103")
+sql create table if not exists stb (ts timestamp, tbcol int, tbcol2 float, tbcol3 double) tags (tgcol int unsigned)
 
-print =============== step3: alter stb
-sql_error alter table db.stb add column ts int
-sql alter table db.stb add column c3 int
-sql alter table db.stb add column c4 bigint
-sql alter table db.stb add column c5 binary(12)
-sql alter table db.stb drop column c1
-sql alter table db.stb drop column c4
-sql alter table db.stb MODIFY column c2 binary(32)
-sql alter table db.stb add tag t4 bigint
-sql alter table db.stb add tag c1 int  
-sql alter table db.stb add tag t5 binary(12)
-sql alter table db.stb drop tag c1
-sql alter table db.stb drop tag t5
-sql alter table db.stb MODIFY tag t3 binary(32)
-sql alter table db.stb rename tag t1 tx
-sql alter table db.stb comment 'abcde' ;
-sql drop table db.stb
+$i = 0
+while $i < $tbNum
+  $tb = $tbPrefix . $i
+  sql create table $tb using stb tags( $i )
+  $x = 0
+  while $x < $rowNum
+    $cc = $x * 60000
+    $ms = 1601481600000 + $cc
+    sql insert into $tb values ($ms , $x , $x , $x ) 
+    $x = $x + 1
+  endw 
 
-print =============== step4: alter tb
-sql create table tb (ts timestamp, a int)
-sql insert into tb values(now-28d, -28)
-sql select count(a) from tb
-sql alter table tb add column b smallint
-sql insert into tb values(now-25d, -25, 0)
-sql select count(b) from tb
-sql alter table tb add column c tinyint
-sql insert into tb values(now-22d, -22, 3, 0)
-sql select count(c) from tb
-sql alter table tb add column d int
-sql insert into tb values(now-19d, -19, 6, 0, 0)
-sql select count(d) from tb
-sql alter table tb add column e bigint
-sql alter table tb add column f float
-sql alter table tb add column g double
-sql alter table tb add column h binary(10)
-sql select count(a), count(b), count(c), count(d), count(e), count(f), count(g), count(h) from tb
-sql select * from tb order by ts desc
+  $cc = $x * 60000
+  $ms = 1601481600000 + $cc
+  sql insert into $tb values ($ms , NULL , NULL , NULL ) 
+  $i = $i + 1
+endw 
 
-print =============== step5: alter stb and insert data
-sql create table stb (ts timestamp, c1 int, c2 binary(4)) tags(t1 int, t2 float, t3 binary(16)) comment "abd"
-sql show db.stables
-sql describe stb
-sql_error alter table stb add column ts int
+system sh/exec.sh -n dnode1 -s stop -x SIGINT
+system sh/exec.sh -n dnode1 -s start -v
 
-sql create table db.ctb using db.stb tags(101, 102, "103")
-sql insert into db.ctb values(now, 1, "2")
-sql show db.tables
-sql select * from db.stb
-sql select * from tb
+print =============== step3: tb
+sql select avg(tbcol) from tb1
+sql select avg(tbcol) from tb1 where ts <= 1601481840000
+sql select avg(tbcol) as b from tb1
+sql select avg(tbcol) as b from tb1 interval(1d)
+sql select avg(tbcol) as b from tb1 where ts <= 1601481840000 interval(1m)
+sql select bottom(tbcol, 2) from tb1 where ts <= 1601481840000 
+sql select top(tbcol, 2) from tb1 where ts <= 1601481840000 
+sql select percentile(tbcol, 2) from tb1 where ts <= 1601481840000 
+sql select leastsquares(tbcol, 1, 1) as b from tb1 where ts <= 1601481840000
+sql show table distributed tb1
+sql select count(tbcol) as b from tb1 where ts <= 1601481840000 interval(1m)
+sql select diff(tbcol) from tb1 where ts <= 1601481840000 
+sql select diff(tbcol) from tb1 where tbcol > 5 and tbcol < 20
+sql select first(tbcol), last(tbcol) as b from tb1 where ts <= 1601481840000 interval(1m)
+sql select count(tbcol), avg(tbcol), max(tbcol), min(tbcol), sum(tbcol), stddev(tbcol) from tb1 where ts <= 1601481840000 partition by tgcol interval(1m)
+#sql select count(tbcol), avg(tbcol), max(tbcol), min(tbcol), count(tbcol) from tb1 where ts <= 1601481840000 and ts >= 1601481800000 partition by tgcol interval(1m) fill(value, 0)
+sql select last_row(*) from tb1 where tbcol > 5 and tbcol < 20
 
-sql alter table stb add column c3 int
-sql describe stb
-sql select * from db.stb
-sql select * from tb
-sql insert into db.ctb values(now+1s, 1, 2, 3)
-sql select * from db.stb
-
-sql alter table db.stb add column c4 bigint
-sql select * from db.stb
-sql insert into db.ctb values(now+2s, 1, 2, 3, 4)
-
-sql alter table db.stb drop column c1
-sql reset query cache
-sql select * from tb
-sql insert into db.ctb values(now+3s, 2, 3, 4)
-sql select * from db.stb
-
-sql alter table db.stb add tag t4 bigint
-sql select * from db.stb
-sql select * from db.stb
-sql_error create table db.ctb2 using db.stb tags(101, "102")
-sql create table db.ctb2 using db.stb tags(101, 102, "103", 104)
-sql insert into db.ctb2 values(now, 1, 2, 3)
-
-print =============== step6: query data
-sql select * from db.stb where tbname = 'ctb2';
+print =============== step4: stb
+sql select avg(tbcol) as c from stb
+sql select avg(tbcol) as c from stb where ts <= 1601481840000
+sql select avg(tbcol) as c from stb where tgcol < 5 and ts <= 1601481840000
+sql select avg(tbcol) as c from stb interval(1m)
+sql select avg(tbcol) as c from stb interval(1d)
+sql select avg(tbcol) as b from stb where ts <= 1601481840000 interval(1m)
+sql select avg(tbcol) as c from stb group by tgcol
+sql select avg(tbcol) as b from stb where ts <= 1601481840000 partition by tgcol interval(1m)
+sql show table distributed stb
+sql select count(tbcol) as b from stb where ts <= 1601481840000 partition by tgcol interval(1m)
+sql select diff(tbcol) from stb where ts <= 1601481840000 
+sql select first(tbcol), last(tbcol) as c from stb group by tgcol
+sql select first(tbcol), last(tbcol) as b from stb where ts <= 1601481840000 and tbcol2 is null partition by tgcol interval(1m)
+sql select first(tbcol), last(tbcol) as b from stb where ts <= 1601481840000 partition by tgcol interval(1m)
+sql select count(tbcol), avg(tbcol), max(tbcol), min(tbcol), sum(tbcol), stddev(tbcol) from stb where ts <= 1601481840000 partition by tgcol interval(1m)
+#sql select count(tbcol), avg(tbcol), max(tbcol), min(tbcol), count(tbcol) from stb where ts <= 1601481840000 and ts >= 1601481800000 partition by tgcol interval(1m) fill(value, 0)
+sql select last_row(tbcol), stddev(tbcol) from stb where tbcol > 5 and tbcol < 20 group by tgcol
 
 _OVER:
 system sh/exec.sh -n dnode1 -s stop -x SIGINT
diff --git a/tests/script/tsim/valgrind/basic4.sim b/tests/script/tsim/valgrind/basic4.sim
new file mode 100644
index 0000000000..8be96f769b
--- /dev/null
+++ b/tests/script/tsim/valgrind/basic4.sim
@@ -0,0 +1,74 @@
+system sh/stop_dnodes.sh
+system sh/deploy.sh -n dnode1 -i 1
+system sh/cfg.sh -n dnode1 -c debugflag -v 131
+system sh/exec.sh -n dnode1 -s start -v
+sql connect
+
+print =============== step1: create drop show dnodes
+$x = 0
+step1:
+	$x = $x + 1
+	sleep 1000
+	if $x == 10 then
+	  print ---> dnode not ready!
+		return -1
+	endi
+sql show dnodes
+print ---> $data00 $data01 $data02 $data03 $data04 $data05
+if $rows != 1 then
+  return -1
+endi
+if $data(1)[4] != ready then
+  goto step1
+endi
+
+print =============== step2: create db
+sql create database d1 vgroups 2 buffer 3
+sql show databases
+sql use d1
+sql show vgroups
+
+print =============== step3: create show stable
+sql create table if not exists stb (ts timestamp, c1 int, c2 float, c3 double) tags (t1 int unsigned)
+sql show stables
+if $rows != 1 then 
+  return -1
+endi
+
+print =============== step4: create show table
+sql create table ct1 using stb tags(1000)
+sql create table ct2 using stb tags(2000)
+sql create table ct3 using stb tags(3000)
+sql show tables
+if $rows != 3 then 
+  return -1
+endi
+
+print =============== step5: insert data (null / update)
+sql insert into ct1 values(now+0s, 10, 2.0, 3.0)
+sql insert into ct1 values(now+1s, 11, 2.1, NULL)(now+2s, -12, -2.2, -3.2)(now+3s, -13, -2.3, -3.3)
+sql insert into ct2 values(now+0s, 10, 2.0, 3.0)
+sql insert into ct2 values(now+1s, 11, 2.1, 3.1)(now+2s, -12, -2.2, -3.2)(now+3s, -13, -2.3, -3.3)
+sql insert into ct3 values('2021-01-01 00:00:00.000', NULL, NULL, 3.0)
+sql insert into ct3 values('2022-03-02 16:59:00.010', 3  , 4, 5), ('2022-03-02 16:59:00.010', 33 , 4, 5), ('2022-04-01 16:59:00.011', 4,  4, 5), ('2022-04-01 16:59:00.011', 6,  4, 5), ('2022-03-06 16:59:00.013', 8,  4, 5);
+sql insert into ct3 values('2022-03-02 16:59:00.010', 103, 1, 2), ('2022-03-02 16:59:00.010', 303, 3, 4), ('2022-04-01 16:59:00.011', 40, 5, 6), ('2022-04-01 16:59:00.011', 60, 4, 5), ('2022-03-06 16:59:00.013', 80, 4, 5);
+
+print =============== step6: query data=
+
+sql select * from stb where t1 between 1000 and 2500
+
+
+_OVER:
+system sh/exec.sh -n dnode1 -s stop -x SIGINT
+print =============== check
+$null=
+
+system_content sh/checkValgrind.sh -n dnode1 
+print cmd return result ----> [ $system_content ]
+if $system_content > 0 then
+  return -1
+endi 
+
+if $system_content == $null then
+  return -1
+endi 
diff --git a/tests/script/tsim/valgrind/checkError3.sim b/tests/script/tsim/valgrind/checkError3.sim
index e8b25098d6..41623896b3 100644
--- a/tests/script/tsim/valgrind/checkError3.sim
+++ b/tests/script/tsim/valgrind/checkError3.sim
@@ -37,6 +37,7 @@ sql show stables
 if $rows != 4 then 
   return -1
 endi
+sql show stables like 'stb'
 
 print =============== step4: ccreate child table
 sql create table c1 using stb tags(true, -1, -2, -3, -4, -6.0, -7.0, 'child tbl 1', 'child tbl 1', '2022-02-25 18:00:00.000', 10, 20, 30, 40)
diff --git a/tests/script/tsim/valgrind/checkError5.sim b/tests/script/tsim/valgrind/checkError5.sim
index 6eef185fd3..f0786587d9 100644
--- a/tests/script/tsim/valgrind/checkError5.sim
+++ b/tests/script/tsim/valgrind/checkError5.sim
@@ -105,6 +105,39 @@ sql insert into db.ctb2 values(now, 1, 2, 3)
 print =============== step6: query data
 sql select * from db.stb where tbname = 'ctb2';
 
+
+print =============== step7: normal table
+sql create database d1 replica 1 duration 7 keep 50 
+sql use d1
+sql create table tb (ts timestamp, a int)
+sql insert into tb values(now-28d, -28)
+sql alter table tb add column b smallint
+sql insert into tb values(now-25d, -25, 0)
+sql alter table tb add column c tinyint
+sql insert into tb values(now-22d, -22, 3, 0)
+sql alter table tb add column d int
+sql insert into tb values(now-19d, -19, 6, 0, 0)
+sql alter table tb add column e bigint
+sql insert into tb values(now-16d, -16, 9, 0, 0, 0)
+sql alter table tb add column f float
+sql insert into tb values(now-13d, -13, 12, 0, 0, 0, 0)
+sql alter table tb add column g double
+sql insert into tb values(now-10d, -10, 15, 0, 0, 0, 0, 0)
+sql alter table tb add column h binary(10)
+sql insert into tb values(now-7d, -7, 18, 0, 0, 0, 0, 0, '0')
+sql select count(a), count(b), count(c), count(d), count(e), count(f), count(g), count(h) from d1.tb;
+sql alter table tb drop column a
+sql insert into tb values(now-4d, 1, 1, 1, 1, 1, 1, '1')
+sql alter table tb drop column b
+sql insert into tb values(now-3d, 1, 1, 1, 1, 1, '1')
+sql alter table tb drop column c
+sql insert into tb values(now-2d, 1, 1, 1, 1, '1')
+sql alter table tb drop column d
+sql insert into tb values(now-1d, 1, 1, 1, '1')
+sql alter table tb drop column e
+sql insert into tb values(now, 1, 1, '1')
+sql select count(h) from tb
+
 _OVER:
 system sh/exec.sh -n dnode1 -s stop -x SIGINT
 print =============== check
diff --git a/tests/script/tsim/valgrind/checkError7.sim b/tests/script/tsim/valgrind/checkError7.sim
new file mode 100644
index 0000000000..a66ddb30df
--- /dev/null
+++ b/tests/script/tsim/valgrind/checkError7.sim
@@ -0,0 +1,75 @@
+system sh/stop_dnodes.sh
+system sh/deploy.sh -n dnode1 -i 1
+system sh/exec.sh -n dnode1 -s start -v
+sql connect
+
+print ======================== create stable
+sql create database d1
+sql use d1
+
+$x = 0
+while $x < 128
+  $tb = d1.s . $x
+  sql create table $tb (ts timestamp, i int) tags (j int)
+  $x = $x + 1
+endw
+
+print ======================== describe stables
+# TODO : create stable error
+$m = 0
+while $m < 128
+  $tb = s . $m
+  $filter = ' . $tb
+  $filter = $filter . '
+  sql show stables like $filter
+  print sql : show stables like $filter
+  if $rows != 1 then
+    print expect 1, actual: $rows
+    return -1
+  endi
+  $m = $m + 1
+endw
+
+
+print ======================== show stables
+
+sql show d1.stables
+
+print num of stables is $rows
+if $rows != 128 then
+  return -1
+endi
+
+print ======================== create table
+
+$x = 0
+while $x < 424
+  $tb = d1.t . $x
+  sql create table $tb using d1.s0 tags( $x )
+  $x = $x + 1
+endw
+
+print ======================== show tables
+
+sql show d1.tables
+
+print num of tables is $rows
+if $rows != 424 then
+  return -1
+endi
+
+
+_OVER:
+system sh/exec.sh -n dnode1 -s stop -x SIGINT
+print =============== check
+$null=
+
+system_content sh/checkValgrind.sh -n dnode1 
+print cmd return result ----> [ $system_content ]
+if $system_content > 2 then
+  return -1
+endi 
+
+if $system_content == $null then
+  return -1
+endi 

From ecb7a9c1336678bc742df70112958ac09ea2ad3b Mon Sep 17 00:00:00 2001
From: Liu Jicong <liujicong@qq.com>
Date: Tue, 26 Jul 2022 14:53:39 +0800
Subject: [PATCH 17/26] fix(wal): snapshot end with no deleting

---
 source/libs/wal/src/walWrite.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/source/libs/wal/src/walWrite.c b/source/libs/wal/src/walWrite.c
index 1bd769d5f0..d869e6e2ce 100644
--- a/source/libs/wal/src/walWrite.c
+++ b/source/libs/wal/src/walWrite.c
@@ -261,14 +261,13 @@ int32_t walEndSnapshot(SWal *pWal) {
   pWal->vers.snapshotVer = ver;
   int ts = taosGetTimestampSec();
 
-  int64_t minVerToDelete = ver;
-  void   *pIter = NULL;
+  void *pIter = NULL;
   while (1) {
     pIter = taosHashIterate(pWal->pRefHash, pIter);
     if (pIter == NULL) break;
     SWalRef *pRef = *(SWalRef **)pIter;
     if (pRef->refVer == -1) continue;
-    minVerToDelete = TMIN(minVerToDelete, pRef->refVer);
+    ver = TMIN(ver, pRef->refVer);
   }
 
   int          deleteCnt = 0;

From bb80e5c64fea695c395675c7ac4b2604de804fa4 Mon Sep 17 00:00:00 2001
From: Minglei Jin <mljin@taosdata.com>
Date: Tue, 26 Jul 2022 15:38:18 +0800
Subject: [PATCH 18/26] fix: handle both null tag values

---
 source/dnode/vnode/src/meta/metaOpen.c | 53 ++++++++++++++++++++------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/source/dnode/vnode/src/meta/metaOpen.c b/source/dnode/vnode/src/meta/metaOpen.c
index 59df35d554..7c7d14e337 100644
--- a/source/dnode/vnode/src/meta/metaOpen.c
+++ b/source/dnode/vnode/src/meta/metaOpen.c
@@ -180,11 +180,41 @@ int metaClose(SMeta *pMeta) {
   return 0;
 }
 
-int32_t metaRLock(SMeta *pMeta) { return taosThreadRwlockRdlock(&pMeta->lock); }
+int32_t metaRLock(SMeta *pMeta) {
+  int32_t ret = 0;
 
-int32_t metaWLock(SMeta *pMeta) { return taosThreadRwlockWrlock(&pMeta->lock); }
+  metaDebug("meta rlock %p B", &pMeta->lock);
 
-int32_t metaULock(SMeta *pMeta) { return taosThreadRwlockUnlock(&pMeta->lock); }
+  ret = taosThreadRwlockRdlock(&pMeta->lock);
+
+  metaDebug("meta rlock %p E", &pMeta->lock);
+
+  return ret;
+}
+
+int32_t metaWLock(SMeta *pMeta) {
+  int32_t ret = 0;
+
+  metaDebug("meta wlock %p B", &pMeta->lock);
+
+  ret = taosThreadRwlockWrlock(&pMeta->lock);
+
+  metaDebug("meta wlock %p E", &pMeta->lock);
+
+  return ret;
+}
+
+int32_t metaULock(SMeta *pMeta) {
+  int32_t ret = 0;
+
+  metaDebug("meta ulock %p B", &pMeta->lock);
+
+  ret = taosThreadRwlockUnlock(&pMeta->lock);
+
+  metaDebug("meta ulock %p E", &pMeta->lock);
+
+  return ret;
+}
 
 static int tbDbKeyCmpr(const void *pKey1, int kLen1, const void *pKey2, int kLen2) {
   STbDbKey *pTbDbKey1 = (STbDbKey *)pKey1;
@@ -259,7 +289,7 @@ static int ctbIdxKeyCmpr(const void *pKey1, int kLen1, const void *pKey2, int kL
 static int tagIdxKeyCmpr(const void *pKey1, int kLen1, const void *pKey2, int kLen2) {
   STagIdxKey *pTagIdxKey1 = (STagIdxKey *)pKey1;
   STagIdxKey *pTagIdxKey2 = (STagIdxKey *)pKey2;
-  tb_uid_t    uid1, uid2;
+  tb_uid_t    uid1 = 0, uid2 = 0;
   int         c;
 
   // compare suid
@@ -287,14 +317,15 @@ static int tagIdxKeyCmpr(const void *pKey1, int kLen1, const void *pKey2, int kL
     // all not NULL, compr tag vals
     c = doCompare(pTagIdxKey1->data, pTagIdxKey2->data, pTagIdxKey1->type, 0);
     if (c) return c;
+  }
 
-    if (IS_VAR_DATA_TYPE(pTagIdxKey1->type)) {
-      uid1 = *(tb_uid_t *)(pTagIdxKey1->data + varDataTLen(pTagIdxKey1->data));
-      uid2 = *(tb_uid_t *)(pTagIdxKey2->data + varDataTLen(pTagIdxKey2->data));
-    } else {
-      uid1 = *(tb_uid_t *)(pTagIdxKey1->data + tDataTypes[pTagIdxKey1->type].bytes);
-      uid2 = *(tb_uid_t *)(pTagIdxKey2->data + tDataTypes[pTagIdxKey2->type].bytes);
-    }
+  // both null or tag values are equal, then continue to compare uids
+  if (IS_VAR_DATA_TYPE(pTagIdxKey1->type)) {
+    uid1 = *(tb_uid_t *)(pTagIdxKey1->data + varDataTLen(pTagIdxKey1->data));
+    uid2 = *(tb_uid_t *)(pTagIdxKey2->data + varDataTLen(pTagIdxKey2->data));
+  } else {
+    uid1 = *(tb_uid_t *)(pTagIdxKey1->data + tDataTypes[pTagIdxKey1->type].bytes);
+    uid2 = *(tb_uid_t *)(pTagIdxKey2->data + tDataTypes[pTagIdxKey2->type].bytes);
   }
 
   // compare uid

From b6e222195b5c46099c9a3db58906be2365fa8a68 Mon Sep 17 00:00:00 2001
From: Shengliang Guan <slguan@taosdata.com>
Date: Tue, 26 Jul 2022 15:51:32 +0800
Subject: [PATCH 19/26] test: reproduce crash in client

---
 tests/script/tsim/parser/select_with_tags.sim | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/script/tsim/parser/select_with_tags.sim b/tests/script/tsim/parser/select_with_tags.sim
index 5130b39f48..49412e74ba 100644
--- a/tests/script/tsim/parser/select_with_tags.sim
+++ b/tests/script/tsim/parser/select_with_tags.sim
@@ -183,7 +183,7 @@ if $rows != 12800 then
   return -1
 endi
 
-sql select _rowts, top(c1, 80), tbname, t1, t2 from select_tags_mt0;
+sql select ts, top(c1, 80), tbname, t1, t2 from select_tags_mt0 order by ts;
 if $rows != 80 then
    return -1
 endi
@@ -212,7 +212,7 @@ if $data04 != @abc12@ then
    return -1
 endi
 
-sql select top(c1, 80), tbname, t1, t2 from select_tags_mt0;
+sql select ts, top(c1, 80), tbname, t1, t2 from select_tags_mt0 order by ts;
 if $rows != 80 then
    return -1
 endi
@@ -241,7 +241,7 @@ if $data04 != @abc12@ then
    return -1
 endi
 
-sql select bottom(c1, 72), tbname, t1, t2 from select_tags_mt0;
+sql select ts, bottom(c1, 72), tbname, t1, t2 from select_tags_mt0 order by ts;
 if $rows != 72 then
    return -1
 endi
@@ -293,7 +293,7 @@ if $data03 != 15 then
 endi
 
 print ====== selectivity+tags+group by tags=======================
-sql select first(c1), tbname, t1, t2 from select_tags_mt0 group by tbname;
+sql select first(c1), tbname, t1, t2, tbname from select_tags_mt0 group by tbname order by t1;
 if $rows != 16 then
    return -1
 endi
@@ -327,7 +327,7 @@ if $data04 != @select_tags_tb0@ then
    return -1
 endi
 
-sql select last_row(ts,c1), tbname, t1, t2 from select_tags_mt0 group by tbname;
+sql select last_row(ts,c1), tbname, t1, t2, tbname from select_tags_mt0 group by tbname order by t1;
 if $rows != 16 then
    return -1
 endi
@@ -361,7 +361,7 @@ if $data04 != @abc0@ then
    return -1
 endi
 
-sql select tbname,t1,t2 from select_tags_mt0;
+sql select distinct tbname,t1,t2 from select_tags_mt0;
 if $row != 16 then
    return -1
 endi

From bd8fa53371f3a8f4580db16243e7c5f0d97672d0 Mon Sep 17 00:00:00 2001
From: Haojun Liao <hjliao@taosdata.com>
Date: Tue, 26 Jul 2022 15:52:47 +0800
Subject: [PATCH 20/26] fix(query): set current SBlock before move to next
 block.

---
 source/dnode/vnode/src/tsdb/tsdbRead.c        | 6 ++++--
 source/libs/executor/src/timewindowoperator.c | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/source/dnode/vnode/src/tsdb/tsdbRead.c b/source/dnode/vnode/src/tsdb/tsdbRead.c
index a17504ea04..1b162bf488 100644
--- a/source/dnode/vnode/src/tsdb/tsdbRead.c
+++ b/source/dnode/vnode/src/tsdb/tsdbRead.c
@@ -529,7 +529,7 @@ _end:
 // }
 
 static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
-  SArray* aBlockIdx = taosArrayInit(0, sizeof(SBlockIdx));
+  SArray* aBlockIdx = taosArrayInit(8, sizeof(SBlockIdx));
 
   int64_t st = taosGetTimestampUs();
   int32_t code = tsdbReadBlockIdx(pFileReader, aBlockIdx, NULL);
@@ -1060,6 +1060,7 @@ static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t
     ASSERT(pBlockInfo->uid == fblock.uid && pBlockInfo->tbBlockIdx == fblock.tbBlockIdx);
   }
 
+  doSetCurrentBlock(pBlockIter);
   return TSDB_CODE_SUCCESS;
 }
 
@@ -1410,7 +1411,6 @@ static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanI
   SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
   SBlockData*         pBlockData = &pReader->status.fileBlockData;
 
-  SRowMerger merge = {0};
   STSRow*    pTSRow = NULL;
 
   int64_t  key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
@@ -1432,6 +1432,8 @@ static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanI
 
     // imem & mem are all empty, only file exist
     TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
+
+    SRowMerger merge = {0};
     tRowMergerInit(&merge, &fRow, pReader->pSchema);
     doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
     tRowMergerGetRow(&merge, &pTSRow);
diff --git a/source/libs/executor/src/timewindowoperator.c b/source/libs/executor/src/timewindowoperator.c
index 1e001a29a0..cb613f2298 100644
--- a/source/libs/executor/src/timewindowoperator.c
+++ b/source/libs/executor/src/timewindowoperator.c
@@ -1616,6 +1616,7 @@ static SSDataBlock* doStreamIntervalAgg(SOperatorInfo* pOperator) {
     pInfo->twAggSup.maxTs = TMAX(pInfo->twAggSup.maxTs, pBlock->info.window.ekey);
     hashIntervalAgg(pOperator, &pInfo->binfo.resultRowInfo, pBlock, MAIN_SCAN, pUpdated);
   }
+
   pOperator->status = OP_RES_TO_RETURN;
   closeIntervalWindow(pInfo->aggSup.pResultRowHashTable, &pInfo->twAggSup, &pInfo->interval, NULL, pUpdated,
                       pInfo->pRecycledPages, pInfo->aggSup.pResultBuf);
@@ -1628,6 +1629,7 @@ static SSDataBlock* doStreamIntervalAgg(SOperatorInfo* pOperator) {
   if (pInfo->pDelRes->info.rows > 0) {
     return pInfo->pDelRes;
   }
+
   doBuildResultDatablock(pOperator, &pInfo->binfo, &pInfo->groupResInfo, pInfo->aggSup.pResultBuf);
   printDataBlock(pInfo->binfo.pRes, "single interval");
   return pInfo->binfo.pRes->info.rows == 0 ? NULL : pInfo->binfo.pRes;

From e6e95dffce7e24d8cc74e4b18d78c68922cfd84d Mon Sep 17 00:00:00 2001
From: Liu Jicong <liujicong@qq.com>
Date: Tue, 26 Jul 2022 16:03:00 +0800
Subject: [PATCH 21/26] fix(wal): config error

---
 include/common/tmsg.h                           |  4 ++--
 source/common/src/tmsg.c                        |  8 ++++----
 source/dnode/mgmt/mgmt_vnode/src/vmHandle.c     |  4 ++--
 source/dnode/mnode/impl/inc/mndDef.h            |  4 ++--
 source/dnode/mnode/impl/src/mndDb.c             | 14 ++++++++------
 tests/system-test/7-tmq/db.py                   |  2 +-
 tests/system-test/7-tmq/tmqDelete-1ctb.py       |  2 +-
 tests/system-test/7-tmq/tmqDelete-multiCtb.py   |  2 +-
 tests/system-test/7-tmq/tmqDnodeRestart.py      |  2 +-
 tests/system-test/7-tmq/tmqUpdateWithConsume.py |  2 +-
 10 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/include/common/tmsg.h b/include/common/tmsg.h
index 9d4f77d1ef..eb12bb4073 100644
--- a/include/common/tmsg.h
+++ b/include/common/tmsg.h
@@ -1155,9 +1155,9 @@ typedef struct {
   SArray*  pRetensions;  // SRetention
   void*    pTsma;
   int32_t  walRetentionPeriod;
-  int32_t  walRetentionSize;
+  int64_t  walRetentionSize;
   int32_t  walRollPeriod;
-  int32_t  walSegmentSize;
+  int64_t  walSegmentSize;
 } SCreateVnodeReq;
 
 int32_t tSerializeSCreateVnodeReq(void* buf, int32_t bufLen, SCreateVnodeReq* pReq);
diff --git a/source/common/src/tmsg.c b/source/common/src/tmsg.c
index d4e56ec742..6bdb6a4e65 100644
--- a/source/common/src/tmsg.c
+++ b/source/common/src/tmsg.c
@@ -3751,9 +3751,9 @@ int32_t tSerializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *pR
     if (tEncodeBinary(&encoder, (const uint8_t *)pReq->pTsma, tsmaLen) < 0) return -1;
   }
   if (tEncodeI32(&encoder, pReq->walRetentionPeriod) < 0) return -1;
-  if (tEncodeI32(&encoder, pReq->walRetentionSize) < 0) return -1;
+  if (tEncodeI64(&encoder, pReq->walRetentionSize) < 0) return -1;
   if (tEncodeI32(&encoder, pReq->walRollPeriod) < 0) return -1;
-  if (tEncodeI32(&encoder, pReq->walSegmentSize) < 0) return -1;
+  if (tEncodeI64(&encoder, pReq->walSegmentSize) < 0) return -1;
 
   tEndEncode(&encoder);
 
@@ -3823,9 +3823,9 @@ int32_t tDeserializeSCreateVnodeReq(void *buf, int32_t bufLen, SCreateVnodeReq *
   }
 
   if (tDecodeI32(&decoder, &pReq->walRetentionPeriod) < 0) return -1;
-  if (tDecodeI32(&decoder, &pReq->walRetentionSize) < 0) return -1;
+  if (tDecodeI64(&decoder, &pReq->walRetentionSize) < 0) return -1;
   if (tDecodeI32(&decoder, &pReq->walRollPeriod) < 0) return -1;
-  if (tDecodeI32(&decoder, &pReq->walSegmentSize) < 0) return -1;
+  if (tDecodeI64(&decoder, &pReq->walSegmentSize) < 0) return -1;
 
   tEndDecode(&decoder);
   tDecoderClear(&decoder);
diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
index 159b9ca568..cb061e6d1c 100644
--- a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
+++ b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
@@ -162,8 +162,8 @@ static void vmGenerateVnodeCfg(SCreateVnodeReq *pCreate, SVnodeCfg *pCfg) {
   pCfg->walCfg.vgId = pCreate->vgId;
   pCfg->walCfg.fsyncPeriod = pCreate->fsyncPeriod;
   pCfg->walCfg.retentionPeriod = pCreate->walRetentionPeriod;
-  pCfg->walCfg.rollPeriod = pCreate->walRetentionSize;
-  pCfg->walCfg.retentionSize = pCreate->walRollPeriod;
+  pCfg->walCfg.rollPeriod = pCreate->walRollPeriod;
+  pCfg->walCfg.retentionSize = pCreate->walRetentionSize;
   pCfg->walCfg.segSize = pCreate->walSegmentSize;
   pCfg->walCfg.level = pCreate->walLevel;
 
diff --git a/source/dnode/mnode/impl/inc/mndDef.h b/source/dnode/mnode/impl/inc/mndDef.h
index e7f78b34c5..0ff9b4102d 100644
--- a/source/dnode/mnode/impl/inc/mndDef.h
+++ b/source/dnode/mnode/impl/inc/mndDef.h
@@ -306,9 +306,9 @@ typedef struct {
   int32_t numOfRetensions;
   SArray* pRetensions;
   int32_t walRetentionPeriod;
-  int32_t walRetentionSize;
+  int64_t walRetentionSize;
   int32_t walRollPeriod;
-  int32_t walSegmentSize;
+  int64_t walSegmentSize;
 } SDbCfg;
 
 typedef struct {
diff --git a/source/dnode/mnode/impl/src/mndDb.c b/source/dnode/mnode/impl/src/mndDb.c
index d183b519e8..86787fcd01 100644
--- a/source/dnode/mnode/impl/src/mndDb.c
+++ b/source/dnode/mnode/impl/src/mndDb.c
@@ -121,9 +121,9 @@ static SSdbRaw *mndDbActionEncode(SDbObj *pDb) {
   }
   SDB_SET_INT8(pRaw, dataPos, pDb->cfg.schemaless, _OVER)
   SDB_SET_INT32(pRaw, dataPos, pDb->cfg.walRetentionPeriod, _OVER)
-  SDB_SET_INT32(pRaw, dataPos, pDb->cfg.walRetentionSize, _OVER)
+  SDB_SET_INT64(pRaw, dataPos, pDb->cfg.walRetentionSize, _OVER)
   SDB_SET_INT32(pRaw, dataPos, pDb->cfg.walRollPeriod, _OVER)
-  SDB_SET_INT32(pRaw, dataPos, pDb->cfg.walSegmentSize, _OVER)
+  SDB_SET_INT64(pRaw, dataPos, pDb->cfg.walSegmentSize, _OVER)
 
   SDB_SET_RESERVE(pRaw, dataPos, DB_RESERVE_SIZE, _OVER)
   SDB_SET_DATALEN(pRaw, dataPos, _OVER)
@@ -204,9 +204,9 @@ static SSdbRow *mndDbActionDecode(SSdbRaw *pRaw) {
   }
   SDB_GET_INT8(pRaw, dataPos, &pDb->cfg.schemaless, _OVER)
   SDB_GET_INT32(pRaw, dataPos, &pDb->cfg.walRetentionPeriod, _OVER)
-  SDB_GET_INT32(pRaw, dataPos, &pDb->cfg.walRetentionSize, _OVER)
+  SDB_GET_INT64(pRaw, dataPos, &pDb->cfg.walRetentionSize, _OVER)
   SDB_GET_INT32(pRaw, dataPos, &pDb->cfg.walRollPeriod, _OVER)
-  SDB_GET_INT32(pRaw, dataPos, &pDb->cfg.walSegmentSize, _OVER)
+  SDB_GET_INT64(pRaw, dataPos, &pDb->cfg.walSegmentSize, _OVER)
 
   SDB_GET_RESERVE(pRaw, dataPos, DB_RESERVE_SIZE, _OVER)
   taosInitRWLatch(&pDb->lock);
@@ -357,8 +357,10 @@ static void mndSetDefaultDbCfg(SDbCfg *pCfg) {
   if (pCfg->cacheLastSize <= 0) pCfg->cacheLastSize = TSDB_DEFAULT_CACHE_SIZE;
   if (pCfg->numOfRetensions < 0) pCfg->numOfRetensions = 0;
   if (pCfg->schemaless < 0) pCfg->schemaless = TSDB_DB_SCHEMALESS_OFF;
-  if (pCfg->walRetentionPeriod < 0) pCfg->walRetentionPeriod = TSDB_DEFAULT_DB_WAL_RETENTION_PERIOD;
-  if (pCfg->walRetentionSize < 0) pCfg->walRetentionSize = TSDB_DEFAULT_DB_WAL_RETENTION_SIZE;
+  if (pCfg->walRetentionPeriod < 0 && pCfg->walRetentionPeriod != -1)
+    pCfg->walRetentionPeriod = TSDB_DEFAULT_DB_WAL_RETENTION_PERIOD;
+  if (pCfg->walRetentionSize < 0 && pCfg->walRetentionSize != -1)
+    pCfg->walRetentionSize = TSDB_DEFAULT_DB_WAL_RETENTION_SIZE;
   if (pCfg->walRollPeriod < 0) pCfg->walRollPeriod = TSDB_DEFAULT_DB_WAL_ROLL_PERIOD;
   if (pCfg->walSegmentSize < 0) pCfg->walSegmentSize = TSDB_DEFAULT_DB_WAL_SEGMENT_SIZE;
 }
diff --git a/tests/system-test/7-tmq/db.py b/tests/system-test/7-tmq/db.py
index fd793fd841..1fd0638d17 100644
--- a/tests/system-test/7-tmq/db.py
+++ b/tests/system-test/7-tmq/db.py
@@ -118,7 +118,7 @@ class TDTestCase:
         if dropFlag == 1:
             tsql.execute("drop database if exists %s"%(dbName))
 
-        tsql.execute("create database if not exists %s vgroups %d replica %d"%(dbName, vgroups, replica))
+        tsql.execute("create database if not exists %s vgroups %d replica %d wal_retention_period -1 wal_retention_size -1"%(dbName, vgroups, replica))
         tdLog.debug("complete to create database %s"%(dbName))
         return
 
diff --git a/tests/system-test/7-tmq/tmqDelete-1ctb.py b/tests/system-test/7-tmq/tmqDelete-1ctb.py
index a2a429771c..bedb36e505 100644
--- a/tests/system-test/7-tmq/tmqDelete-1ctb.py
+++ b/tests/system-test/7-tmq/tmqDelete-1ctb.py
@@ -52,7 +52,7 @@ class TDTestCase:
         paraDict['rowsPerTbl'] = self.rowsPerTbl
         
         tmqCom.initConsumerTable()
-        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=1)
+        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=1,wal_retention_size=-1, wal_retention_period=-1)
         tdLog.info("create stb")
         tmqCom.create_stable(tdSql, dbName=paraDict["dbName"],stbName=paraDict["stbName"])
         tdLog.info("create ctb")
diff --git a/tests/system-test/7-tmq/tmqDelete-multiCtb.py b/tests/system-test/7-tmq/tmqDelete-multiCtb.py
index fa32efbd0b..94ca16bc6f 100644
--- a/tests/system-test/7-tmq/tmqDelete-multiCtb.py
+++ b/tests/system-test/7-tmq/tmqDelete-multiCtb.py
@@ -52,7 +52,7 @@ class TDTestCase:
         paraDict['rowsPerTbl'] = self.rowsPerTbl
         
         tmqCom.initConsumerTable()
-        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=1)
+        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=1,wal_retention_size=-1, wal_retention_period=-1)
         tdLog.info("create stb")
         tmqCom.create_stable(tdSql, dbName=paraDict["dbName"],stbName=paraDict["stbName"])
         tdLog.info("create ctb")
diff --git a/tests/system-test/7-tmq/tmqDnodeRestart.py b/tests/system-test/7-tmq/tmqDnodeRestart.py
index 5117ee3d24..9a11106e3e 100644
--- a/tests/system-test/7-tmq/tmqDnodeRestart.py
+++ b/tests/system-test/7-tmq/tmqDnodeRestart.py
@@ -53,7 +53,7 @@ class TDTestCase:
         paraDict['rowsPerTbl'] = self.rowsPerTbl
         
         tmqCom.initConsumerTable()
-        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=1)
+        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=1,wal_retention_size=-1, wal_retention_period=-1)
         tdLog.info("create stb")
         tmqCom.create_stable(tdSql, dbName=paraDict["dbName"],stbName=paraDict["stbName"])
         tdLog.info("create ctb")
diff --git a/tests/system-test/7-tmq/tmqUpdateWithConsume.py b/tests/system-test/7-tmq/tmqUpdateWithConsume.py
index 4f21beffc4..2dd3a061c6 100644
--- a/tests/system-test/7-tmq/tmqUpdateWithConsume.py
+++ b/tests/system-test/7-tmq/tmqUpdateWithConsume.py
@@ -52,7 +52,7 @@ class TDTestCase:
         paraDict['rowsPerTbl'] = self.rowsPerTbl
         
         tmqCom.initConsumerTable()
-        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=1)
+        tdCom.create_database(tdSql, paraDict["dbName"],paraDict["dropFlag"], vgroups=paraDict["vgroups"],replica=1, wal_retention_size=-1, wal_retention_period=-1)
         tdLog.info("create stb")
         tmqCom.create_stable(tdSql, dbName=paraDict["dbName"],stbName=paraDict["stbName"])
         tdLog.info("create ctb")

From 10d378b4a6a041675f30957813bac833fc8705aa Mon Sep 17 00:00:00 2001
From: Shengliang Guan <slguan@taosdata.com>
Date: Tue, 26 Jul 2022 16:08:38 +0800
Subject: [PATCH 22/26] test: reproduce crash in client

---
 tests/script/tsim/parser/select_with_tags.sim | 48 ++++++-------------
 1 file changed, 14 insertions(+), 34 deletions(-)

diff --git a/tests/script/tsim/parser/select_with_tags.sim b/tests/script/tsim/parser/select_with_tags.sim
index 49412e74ba..7a2c1217e9 100644
--- a/tests/script/tsim/parser/select_with_tags.sim
+++ b/tests/script/tsim/parser/select_with_tags.sim
@@ -411,7 +411,7 @@ if $data11 != @70-01-01 08:01:40.001@ then
   return -1
 endi
 
-sql select top(c1, 100), tbname, t1, t2 from select_tags_mt0 where tbname in ('select_tags_tb0', 'select_tags_tb1') group by tbname;
+sql select ts, top(c1, 100), tbname, t1, t2 from select_tags_mt0 where tbname in ('select_tags_tb0', 'select_tags_tb1') group by tbname order by ts;
 if $row != 200 then
   return -1
 endi
@@ -448,7 +448,7 @@ if $data04 != @abc0@ then
   return -1
 endi
 
-sql select top(c1, 2), t2 from select_tags_mt0 where tbname in ('select_tags_tb0', 'select_tags_tb1') group by tbname,t2;
+sql select ts, top(c1, 2), t2, tbname, t2 from select_tags_mt0 where tbname in ('select_tags_tb0', 'select_tags_tb1') group by tbname,t2 order by ts;
 if $row != 4 then
   return -1
 endi
@@ -535,33 +535,13 @@ endi
 
 
 # slimit /limit
-sql select top(c1, 2), t2 from select_tags_mt0 where tbname in ('select_tags_tb0', 'select_tags_tb1') group by tbname,t2 limit 2 offset 1;
+sql select ts, top(c1, 2), t2 from select_tags_mt0 where tbname in ('select_tags_tb0', 'select_tags_tb1') group by tbname,t2 limit 2 offset 1;
 if $row != 2 then
   return -1
 endi
 
-if $data00 != @70-01-01 08:01:40.199@ then
-  return -1
-endi
-
-if $data01 != 99 then
-   return -1
-endi
-
-if $data02 != @abc0@ then
-   return -1
-endi
-
-if $data03 != @select_tags_tb0@ then
-   return -1
-endi
-
-if $data04 != @abc0@ then
-   return -1
-endi
-
 print ======= selectivity + tags + group by + tags + filter ===========================
-sql select first(c1), t1 from select_tags_mt0 where c1<=2 group by tbname;
+sql select first(c1), t1, tbname from select_tags_mt0 where c1<=2 group by tbname order by t1;
 if $row != 3 then
   return -1
 endi
@@ -602,7 +582,7 @@ if $data22 != @select_tags_tb2@ then
    return -1
 endi
 
-sql select first(c1), tbname from select_tags_mt0 where c1<=2 interval(1s);
+sql select _wstart, first(c1), tbname from select_tags_mt0 where c1<=2 interval(1s);
 if $row != 3 then
    return -1
 endi
@@ -671,7 +651,7 @@ if $data01 != @70-01-01 08:01:50.001@ then
 endi
 
 print ======= selectivity + tags + group by + tags + filter + interval ================
-sql select first(c1), t2, t1, tbname from select_tags_mt0 where c1<=2 interval(1d) group by tbname;
+sql select _wstart,first(c1), t2, t1, tbname, tbname from select_tags_mt0 where c1<=2 partition by tbname interval(1d) order by t1;
 if $row != 3 then
    return -1
 endi
@@ -708,7 +688,7 @@ if $data25 != @select_tags_tb2@ then
   return -1
 endi
 
-sql select top(c1, 5), t2 from select_tags_mt0 where c1<=2 interval(1d) group by tbname;
+sql select ts, top(c1, 5), t2, tbname from select_tags_mt0 where c1<=2 partition by tbname interval(1d) order by ts, t2;
 if $row != 15 then
    return -1
 endi
@@ -746,7 +726,7 @@ if $data93 != @select_tags_tb1@ then
 endi
 
 #if data
-sql select top(c1, 50), t2, t1, tbname from select_tags_mt0 where c1<=2 interval(1d) group by tbname;
+sql select ts, top(c1, 50), t2, t1, tbname, tbname from select_tags_mt0 where c1<=2  partition by tbname interval(1d) order by ts, t2;
 if $row != 48 then
   return -1
 endi
@@ -831,7 +811,7 @@ endi
 print TODO ======= selectivity + tags+ group by + tags + filter + interval + join===========
 
 print ==========================mix tag columns and group by columns======================
-sql select top(c1, 100), tbname from select_tags_mt0 where tbname in ('select_tags_tb0', 'select_tags_tb1') group by t3
+sql select ts, top(c1, 100), tbname, t3 from select_tags_mt0 where tbname in ('select_tags_tb0', 'select_tags_tb1') group by t3 order by ts, tbname;
 if $rows != 100 then
   return -1
 endi
@@ -887,9 +867,9 @@ sql_error select twa(c2), tbname from select_tags_mt0;
 sql_error select interp(c2), tbname from select_tags_mt0 where ts=100001;
 
 sql_error select t1,t2,tbname from select_tags_mt0 group by tbname;
-sql_error select count(tbname) from select_tags_mt0 interval(1d);
-sql_error select count(tbname) from select_tags_mt0 group by t1;
-sql_error select count(tbname),SUM(T1) from select_tags_mt0 interval(1d);
+sql select count(tbname) from select_tags_mt0 interval(1d);
+sql select count(tbname) from select_tags_mt0 group by t1;
+sql select count(tbname),SUM(T1) from select_tags_mt0 interval(1d);
 sql_error select first(c1), count(*), t2, t1, tbname from select_tags_mt0 where c1<=2 interval(1d) group by tbname;
 sql_error select ts from select_tags_mt0 interval(1y);
 sql_error select count(*), tbname from select_tags_mt0 interval(1y);
@@ -902,8 +882,8 @@ sql_error select tbname, t1 from select_tags_mt0 interval(1y);
 #valid sql: select first(c1), tbname, t1 from select_tags_mt0 group by t2;
 
 print ==================================>TD-4231
-sql_error select t1,tbname from select_tags_mt0 where c1<0
-sql_error select t1,tbname from select_tags_mt0 where c1<0 and tbname in ('select_tags_tb12')
+sql select t1,tbname from select_tags_mt0 where c1<0
+sql select t1,tbname from select_tags_mt0 where c1<0 and tbname in ('select_tags_tb12')
 
 sql select tbname from select_tags_mt0 where tbname in ('select_tags_tb12');
 

From 9a540919b90a8ffe352e84498c3d2b6ca834998e Mon Sep 17 00:00:00 2001
From: Minghao Li <castermode@gmail.com>
Date: Tue, 26 Jul 2022 17:23:26 +0800
Subject: [PATCH 23/26] refactor(sync): pre-commit integration

---
 source/dnode/vnode/src/vnd/vnodeSync.c | 77 ++++++++++++++------------
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c
index 2b760efba0..4323fa0aff 100644
--- a/source/dnode/vnode/src/vnd/vnodeSync.c
+++ b/source/dnode/vnode/src/vnd/vnodeSync.c
@@ -501,49 +501,54 @@ static void vnodeSyncReconfig(struct SSyncFSM *pFsm, const SRpcMsg *pMsg, SReCon
 }
 
 static void vnodeSyncCommitMsg(SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta cbMeta) {
-  SVnode *pVnode = pFsm->data;
-  vTrace("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", isWeak:%d, code:%d, state:%d %s, msgtype:%d %s",
-         syncGetVgId(pVnode->sync), pFsm, cbMeta.index, cbMeta.isWeak, cbMeta.code, cbMeta.state,
-         syncUtilState2String(cbMeta.state), pMsg->msgType, TMSG_INFO(pMsg->msgType));
+  if (cbMeta.isWeak == 0) {
+    SVnode *pVnode = pFsm->data;
+    vTrace("vgId:%d, commit-cb is excuted, fsm:%p, index:%" PRId64 ", isWeak:%d, code:%d, state:%d %s, msgtype:%d %s",
+           syncGetVgId(pVnode->sync), pFsm, cbMeta.index, cbMeta.isWeak, cbMeta.code, cbMeta.state,
+           syncUtilState2String(cbMeta.state), pMsg->msgType, TMSG_INFO(pMsg->msgType));
 
-  if (cbMeta.code == 0 && cbMeta.isWeak == 0) {
-    SRpcMsg rpcMsg = {.msgType = pMsg->msgType, .contLen = pMsg->contLen};
-    rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen);
-    memcpy(rpcMsg.pCont, pMsg->pCont, pMsg->contLen);
-    syncGetAndDelRespRpc(pVnode->sync, cbMeta.seqNum, &rpcMsg.info);
-    rpcMsg.info.conn.applyIndex = cbMeta.index;
-    rpcMsg.info.conn.applyTerm = cbMeta.term;
-    tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg);
-  } else {
-    SRpcMsg rsp = {.code = cbMeta.code, .info = pMsg->info};
-    vError("vgId:%d, sync commit error, msgtype:%d,%s, error:0x%X, errmsg:%s", syncGetVgId(pVnode->sync), pMsg->msgType,
-           TMSG_INFO(pMsg->msgType), cbMeta.code, tstrerror(cbMeta.code));
-    if (rsp.info.handle != NULL) {
-      tmsgSendRsp(&rsp);
+    if (cbMeta.code == 0) {
+      SRpcMsg rpcMsg = {.msgType = pMsg->msgType, .contLen = pMsg->contLen};
+      rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen);
+      memcpy(rpcMsg.pCont, pMsg->pCont, pMsg->contLen);
+      syncGetAndDelRespRpc(pVnode->sync, cbMeta.seqNum, &rpcMsg.info);
+      rpcMsg.info.conn.applyIndex = cbMeta.index;
+      rpcMsg.info.conn.applyTerm = cbMeta.term;
+      tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg);
+    } else {
+      SRpcMsg rsp = {.code = cbMeta.code, .info = pMsg->info};
+      vError("vgId:%d, sync commit error, msgtype:%d,%s, error:0x%X, errmsg:%s", syncGetVgId(pVnode->sync),
+             pMsg->msgType, TMSG_INFO(pMsg->msgType), cbMeta.code, tstrerror(cbMeta.code));
+      if (rsp.info.handle != NULL) {
+        tmsgSendRsp(&rsp);
+      }
     }
   }
 }
 
 static void vnodeSyncPreCommitMsg(SSyncFSM *pFsm, const SRpcMsg *pMsg, SFsmCbMeta cbMeta) {
-  SVnode *pVnode = pFsm->data;
-  vTrace("vgId:%d, pre-commit-cb is excuted, fsm:%p, index:%" PRId64 ", isWeak:%d, code:%d, state:%d %s, msgtype:%d %s",
-         syncGetVgId(pVnode->sync), pFsm, cbMeta.index, cbMeta.isWeak, cbMeta.code, cbMeta.state,
-         syncUtilState2String(cbMeta.state), pMsg->msgType, TMSG_INFO(pMsg->msgType));
+  if (cbMeta.isWeak == 1) {
+    SVnode *pVnode = pFsm->data;
+    vTrace("vgId:%d, pre-commit-cb is excuted, fsm:%p, index:%" PRId64
+           ", isWeak:%d, code:%d, state:%d %s, msgtype:%d %s",
+           syncGetVgId(pVnode->sync), pFsm, cbMeta.index, cbMeta.isWeak, cbMeta.code, cbMeta.state,
+           syncUtilState2String(cbMeta.state), pMsg->msgType, TMSG_INFO(pMsg->msgType));
 
-  if (cbMeta.code == 0 && cbMeta.isWeak == 1) {
-    SRpcMsg rpcMsg = {.msgType = pMsg->msgType, .contLen = pMsg->contLen};
-    rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen);
-    memcpy(rpcMsg.pCont, pMsg->pCont, pMsg->contLen);
-    syncGetAndDelRespRpc(pVnode->sync, cbMeta.seqNum, &rpcMsg.info);
-    rpcMsg.info.conn.applyIndex = cbMeta.index;
-    rpcMsg.info.conn.applyTerm = cbMeta.term;
-    tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg);
-  } else {
-    SRpcMsg rsp = {.code = cbMeta.code, .info = pMsg->info};
-    vError("vgId:%d, sync pre-commit error, msgtype:%d,%s, error:0x%X, errmsg:%s", syncGetVgId(pVnode->sync),
-           pMsg->msgType, TMSG_INFO(pMsg->msgType), cbMeta.code, tstrerror(cbMeta.code));
-    if (rsp.info.handle != NULL) {
-      tmsgSendRsp(&rsp);
+    if (cbMeta.code == 0) {
+      SRpcMsg rpcMsg = {.msgType = pMsg->msgType, .contLen = pMsg->contLen};
+      rpcMsg.pCont = rpcMallocCont(rpcMsg.contLen);
+      memcpy(rpcMsg.pCont, pMsg->pCont, pMsg->contLen);
+      syncGetAndDelRespRpc(pVnode->sync, cbMeta.seqNum, &rpcMsg.info);
+      rpcMsg.info.conn.applyIndex = cbMeta.index;
+      rpcMsg.info.conn.applyTerm = cbMeta.term;
+      tmsgPutToQueue(&pVnode->msgCb, APPLY_QUEUE, &rpcMsg);
+    } else {
+      SRpcMsg rsp = {.code = cbMeta.code, .info = pMsg->info};
+      vError("vgId:%d, sync pre-commit error, msgtype:%d,%s, error:0x%X, errmsg:%s", syncGetVgId(pVnode->sync),
+             pMsg->msgType, TMSG_INFO(pMsg->msgType), cbMeta.code, tstrerror(cbMeta.code));
+      if (rsp.info.handle != NULL) {
+        tmsgSendRsp(&rsp);
+      }
     }
   }
 }

From 7974d672b47282fae2e30e2a12476e5ee1a184d2 Mon Sep 17 00:00:00 2001
From: Minglei Jin <mljin@taosdata.com>
Date: Tue, 26 Jul 2022 17:46:47 +0800
Subject: [PATCH 24/26] fix: make tdbDebugFlag work

---
 include/util/tlog.h                    |  1 +
 source/common/src/tglobal.c            | 17 +++++++++++------
 source/dnode/mnode/impl/src/mndDnode.c |  9 ++++-----
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/include/util/tlog.h b/include/util/tlog.h
index 76d04a5997..68b004cda7 100644
--- a/include/util/tlog.h
+++ b/include/util/tlog.h
@@ -63,6 +63,7 @@ extern int32_t metaDebugFlag;
 extern int32_t udfDebugFlag;
 extern int32_t smaDebugFlag;
 extern int32_t idxDebugFlag;
+extern int32_t tdbDebugFlag;
 
 int32_t taosInitLog(const char *logName, int32_t maxFiles);
 void    taosCloseLog();
diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c
index 7a20969a63..cb1f3ca91c 100644
--- a/source/common/src/tglobal.c
+++ b/source/common/src/tglobal.c
@@ -316,6 +316,7 @@ static int32_t taosAddServerLogCfg(SConfig *pCfg) {
   if (cfgAddInt32(pCfg, "udfDebugFlag", udfDebugFlag, 0, 255, 0) != 0) return -1;
   if (cfgAddInt32(pCfg, "smaDebugFlag", smaDebugFlag, 0, 255, 0) != 0) return -1;
   if (cfgAddInt32(pCfg, "idxDebugFlag", idxDebugFlag, 0, 255, 0) != 0) return -1;
+  if (cfgAddInt32(pCfg, "tdbDebugFlag", tdbDebugFlag, 0, 255, 0) != 0) return -1;
   return 0;
 }
 
@@ -506,6 +507,7 @@ static void taosSetServerLogCfg(SConfig *pCfg) {
   udfDebugFlag = cfgGetItem(pCfg, "udfDebugFlag")->i32;
   smaDebugFlag = cfgGetItem(pCfg, "smaDebugFlag")->i32;
   idxDebugFlag = cfgGetItem(pCfg, "idxDebugFlag")->i32;
+  tdbDebugFlag = cfgGetItem(pCfg, "tdbDebugFlag")->i32;
 }
 
 static int32_t taosSetClientCfg(SConfig *pCfg) {
@@ -950,6 +952,8 @@ int32_t taosSetCfg(SConfig *pCfg, char *name) {
           uError("failed to create tempDir:%s since %s", tsTempDir, terrstr());
           return -1;
         }
+      } else if (strcasecmp("tdbDebugFlag", name) == 0) {
+        tdbDebugFlag = cfgGetItem(pCfg, "tdbDebugFlag")->i32;
       } else if (strcasecmp("telemetryReporting", name) == 0) {
         tsEnableTelem = cfgGetItem(pCfg, "telemetryReporting")->bval;
       } else if (strcasecmp("telemetryInterval", name) == 0) {
@@ -1151,14 +1155,14 @@ void taosCfgDynamicOptions(const char *option, const char *value) {
   }
 
   const char *options[] = {
-      "dDebugFlag",  "vDebugFlag",   "mDebugFlag",   "wDebugFlag",   "sDebugFlag",   "tsdbDebugFlag",
-      "tqDebugFlag", "fsDebugFlag",  "udfDebugFlag", "smaDebugFlag", "idxDebugFlag", "tmrDebugFlag",
-      "uDebugFlag",  "smaDebugFlag", "rpcDebugFlag", "qDebugFlag",
+      "dDebugFlag",   "vDebugFlag",  "mDebugFlag",   "wDebugFlag",   "sDebugFlag",   "tsdbDebugFlag",
+      "tqDebugFlag",  "fsDebugFlag", "udfDebugFlag", "smaDebugFlag", "idxDebugFlag", "tdbDebugFlag",
+      "tmrDebugFlag", "uDebugFlag",  "smaDebugFlag", "rpcDebugFlag", "qDebugFlag",
   };
   int32_t *optionVars[] = {
-      &dDebugFlag,  &vDebugFlag,   &mDebugFlag,   &wDebugFlag,   &sDebugFlag,   &tsdbDebugFlag,
-      &tqDebugFlag, &fsDebugFlag,  &udfDebugFlag, &smaDebugFlag, &idxDebugFlag, &tmrDebugFlag,
-      &uDebugFlag,  &smaDebugFlag, &rpcDebugFlag, &qDebugFlag,
+      &dDebugFlag,   &vDebugFlag,  &mDebugFlag,   &wDebugFlag,   &sDebugFlag,   &tsdbDebugFlag,
+      &tqDebugFlag,  &fsDebugFlag, &udfDebugFlag, &smaDebugFlag, &idxDebugFlag, &tdbDebugFlag,
+      &tmrDebugFlag, &uDebugFlag,  &smaDebugFlag, &rpcDebugFlag, &qDebugFlag,
   };
 
   int32_t optionSize = tListLen(options);
@@ -1204,5 +1208,6 @@ void taosSetAllDebugFlag(int32_t flag) {
   taosSetDebugFlag(&udfDebugFlag, "udfDebugFlag", flag);
   taosSetDebugFlag(&smaDebugFlag, "smaDebugFlag", flag);
   taosSetDebugFlag(&idxDebugFlag, "idxDebugFlag", flag);
+  taosSetDebugFlag(&tdbDebugFlag, "tdbDebugFlag", flag);
   uInfo("all debug flag are set to %d", flag);
 }
diff --git a/source/dnode/mnode/impl/src/mndDnode.c b/source/dnode/mnode/impl/src/mndDnode.c
index 7141a62be5..d84455ac94 100644
--- a/source/dnode/mnode/impl/src/mndDnode.c
+++ b/source/dnode/mnode/impl/src/mndDnode.c
@@ -788,9 +788,9 @@ _OVER:
 static int32_t mndProcessConfigDnodeReq(SRpcMsg *pReq) {
   SMnode     *pMnode = pReq->info.node;
   const char *options[] = {
-      "debugFlag",     "dDebugFlag",  "vDebugFlag",   "mDebugFlag",   "wDebugFlag",   "sDebugFlag",
-      "tsdbDebugFlag", "tqDebugFlag", "fsDebugFlag",  "udfDebugFlag", "smaDebugFlag", "idxDebugFlag",
-      "tmrDebugFlag",  "uDebugFlag",  "smaDebugFlag", "rpcDebugFlag", "qDebugFlag",
+      "debugFlag",     "dDebugFlag",   "vDebugFlag",  "mDebugFlag",   "wDebugFlag",   "sDebugFlag",
+      "tsdbDebugFlag", "tqDebugFlag",  "fsDebugFlag", "udfDebugFlag", "smaDebugFlag", "idxDebugFlag",
+      "tdbDebugFlag",  "tmrDebugFlag", "uDebugFlag",  "smaDebugFlag", "rpcDebugFlag", "qDebugFlag",
   };
   int32_t optionSize = tListLen(options);
 
@@ -813,7 +813,6 @@ static int32_t mndProcessConfigDnodeReq(SRpcMsg *pReq) {
   SEpSet epSet = mndGetDnodeEpset(pDnode);
   mndReleaseDnode(pMnode, pDnode);
 
-
   SDCfgDnodeReq dcfgReq = {0};
   if (strcasecmp(cfgReq.config, "resetlog") == 0) {
     strcpy(dcfgReq.config, "resetlog");
@@ -839,7 +838,7 @@ static int32_t mndProcessConfigDnodeReq(SRpcMsg *pReq) {
       if (strncasecmp(cfgReq.config, optName, optLen) != 0) continue;
 
       const char *value = cfgReq.value;
-      int32_t flag = atoi(value);
+      int32_t     flag = atoi(value);
       if (flag <= 0) {
         flag = atoi(cfgReq.config + optLen + 1);
       }

From 4176b6b632f8c25cbca72898546293df52bf0ecb Mon Sep 17 00:00:00 2001
From: gccgdb1234 <wxzhang@taosdata.com>
Date: Tue, 26 Jul 2022 18:49:59 +0800
Subject: [PATCH 25/26] doc: SQL reference guide

---
 docs/zh/12-taos-sql/05-insert.md              | 106 ++---
 docs/zh/12-taos-sql/06-select.md              | 441 ++++++++----------
 docs/zh/12-taos-sql/08-delete-data.mdx        |  20 +-
 .../{12-interval.md => 12-distinguished.md}   |  12 +-
 docs/zh/12-taos-sql/14-stream.md              | 122 +++++
 docs/zh/12-taos-sql/20-keywords.md            | 285 +++++------
 docs/zh/12-taos-sql/22-information.md         |   5 -
 docs/zh/12-taos-sql/22-meta.md                | 186 ++++++++
 docs/zh/12-taos-sql/23-show.md                | 270 +++++++++++
 9 files changed, 961 insertions(+), 486 deletions(-)
 rename docs/zh/12-taos-sql/{12-interval.md => 12-distinguished.md} (91%)
 delete mode 100644 docs/zh/12-taos-sql/22-information.md
 create mode 100644 docs/zh/12-taos-sql/22-meta.md
 create mode 100644 docs/zh/12-taos-sql/23-show.md

diff --git a/docs/zh/12-taos-sql/05-insert.md b/docs/zh/12-taos-sql/05-insert.md
index 04118303f3..c91e70c481 100644
--- a/docs/zh/12-taos-sql/05-insert.md
+++ b/docs/zh/12-taos-sql/05-insert.md
@@ -5,7 +5,7 @@ title: 数据写入
 
 ## 写入语法
 
-```
+```sql
 INSERT INTO
     tb_name
         [USING stb_name [(tag1_name, ...)] TAGS (tag1_value, ...)]
@@ -18,46 +18,64 @@ INSERT INTO
     ...];
 ```
 
-## 插入一条或多条记录
+**关于时间戳**
+
+1. TDengine 要求插入的数据必须要有时间戳，插入数据的时间戳要注意以下几点：
+
+2. 时间戳不同的格式语法会有不同的精度影响。字符串格式的时间戳写法不受所在 DATABASE 的时间精度设置影响；而长整型格式的时间戳写法会受到所在 DATABASE 的时间精度设置影响。例如，时间戳"2021-07-13 16:16:48"的 UNIX 秒数为 1626164208，则其在毫秒精度下需要写作 1626164208000，在微秒精度设置下就需要写为 1626164208000000，纳秒精度设置下需要写为 1626164208000000000。
+
+3. 一次插入多行数据时，不要把首列的时间戳的值都写 NOW。否则会导致语句中的多条记录使用相同的时间戳，于是就可能出现相互覆盖以致这些数据行无法全部被正确保存。其原因在于，NOW 函数在执行中会被解析为所在 SQL 语句的客户端执行时间，出现在同一语句中的多个 NOW 标记也就会被替换为完全相同的时间戳取值。
+   允许插入的最老记录的时间戳，是相对于当前服务器时间，减去配置的 KEEP 值（数据保留的天数）。允许插入的最新记录的时间戳，是相对于当前服务器时间，加上配置的 DURATION 值（数据文件存储数据的时间跨度，单位为天）。KEEP 和 DURATION 都是可以在创建数据库时指定的，缺省值分别是 3650 天和 10 天。
+
+**语法说明**
+
+1. USING 子句是自动建表语法。如果用户在写数据时并不确定某个表是否存在，此时可以在写入数据时使用自动建表语法来创建不存在的表，若该表已存在则不会建立新表。自动建表时，要求必须以超级表为模板，并写明数据表的 TAGS 取值。可以只是指定部分 TAGS 列的取值，未被指定的 TAGS 列将置为 NULL。
+
+2. 可以指定要插入值的列，对于未指定的列数据库将自动填充为 NULL。
+
+3. VALUES 语法表示了要插入的一行或多行数据。
+
+4. FILE 语法表示数据来自于 CSV 文件（英文逗号分隔、英文单引号括住每个值），CSV 文件无需表头。
+
+5. 无论使用哪种语法，均可以在一条 INSERT 语句中同时向多个表插入数据。
+
+6. INSERT 语句是完整解析后再执行的，对如下语句，不会再出现数据错误但建表成功的情况：
+
+   ```sql
+   INSERT INTO d1001 USING meters TAGS('Beijing.Chaoyang', 2) VALUES('a');
+   ```
+
+7. 对于向多个子表插入数据的情况，依然会有部分数据写入失败，部分数据写入成功的情况。这是因为多个子表可能分布在不同的 VNODE 上，客户端将 INSERT 语句完整解析后，将数据发往各个涉及的 VNODE 上，每个 VNODE 独立进行写入操作。如果某个 VNODE 因为某些原因（比如网络问题或磁盘故障）导致写入失败，并不会影响其他 VNODE 节点的写入。
+
+## 插入一条记录
 
 指定已经创建好的数据子表的表名，并通过 VALUES 关键字提供一行或多行数据，即可向数据库写入这些数据。例如，执行如下语句可以写入一行记录：
 
-```
+```sql
 INSERT INTO d1001 VALUES (NOW, 10.2, 219, 0.32);
 ```
 
+## 插入多条记录
+
 或者，可以通过如下语句写入两行记录：
 
-```
+```sql
 INSERT INTO d1001 VALUES ('2021-07-13 14:06:32.272', 10.2, 219, 0.32) (1626164208000, 10.15, 217, 0.33);
 ```
 
-:::note
-
-1. 在第二个例子中，两行记录的首列时间戳使用了不同格式的写法。其中字符串格式的时间戳写法不受所在 DATABASE 的时间精度设置影响；而长整形格式的时间戳写法会受到所在 DATABASE 的时间精度设置影响——例子中的时间戳在毫秒精度下可以写作 1626164208000，而如果是在微秒精度设置下就需要写为 1626164208000000，纳秒精度设置下需要写为 1626164208000000000。
-2. 在使用“插入多条记录”方式写入数据时，不能把第一列的时间戳取值都设为 NOW，否则会导致语句中的多条记录使用相同的时间戳，于是就可能出现相互覆盖以致这些数据行无法全部被正确保存。其原因在于，NOW 函数在执行中会被解析为所在 SQL 语句的实际执行时间，出现在同一语句中的多个 NOW 标记也就会被替换为完全相同的时间戳取值。
-3. 允许插入的最老记录的时间戳，是相对于当前服务器时间，减去配置的 keep 值（数据保留的天数）；允许插入的最新记录的时间戳，是相对于当前服务器时间，加上配置的 days 值（数据文件存储数据的时间跨度，单位为天）。keep 和 days 都是可以在创建数据库时指定的，缺省值分别是 3650 天和 10 天。
-
-:::
-
-## 插入记录，数据对应到指定的列
+## 指定列插入
 
 向数据子表中插入记录时，无论插入一行还是多行，都可以让数据对应到指定的列。对于 SQL 语句中没有出现的列，数据库将自动填充为 NULL。主键（时间戳）不能为 NULL。例如：
 
-```
+```sql
 INSERT INTO d1001 (ts, current, phase) VALUES ('2021-07-13 14:06:33.196', 10.27, 0.31);
 ```
 
-:::info
-如果不指定列，也即使用全列模式——那么在 VALUES 部分提供的数据，必须为数据表的每个列都显式地提供数据。全列模式写入速度会远快于指定列，因此建议尽可能采用全列写入方式，此时空列可以填入 NULL。
-
-:::
-
 ## 向多个表插入记录
 
 可以在一条语句中，分别向多个表插入一条或多条记录，并且也可以在插入过程中指定列。例如：
 
-```
+```sql
 INSERT INTO d1001 VALUES ('2021-07-13 14:06:34.630', 10.2, 219, 0.32) ('2021-07-13 14:06:35.779', 10.15, 217, 0.33)
             d1002 (ts, current, phase) VALUES ('2021-07-13 14:06:34.255', 10.27, 0.31）;
 ```
@@ -66,28 +84,24 @@ INSERT INTO d1001 VALUES ('2021-07-13 14:06:34.630', 10.2, 219, 0.32) ('2021-07-
 
 如果用户在写数据时并不确定某个表是否存在，此时可以在写入数据时使用自动建表语法来创建不存在的表，若该表已存在则不会建立新表。自动建表时，要求必须以超级表为模板，并写明数据表的 TAGS 取值。例如：
 
-```
+```sql
 INSERT INTO d21001 USING meters TAGS ('California.SanFrancisco', 2) VALUES ('2021-07-13 14:06:32.272', 10.2, 219, 0.32);
 ```
 
 也可以在自动建表时，只是指定部分 TAGS 列的取值，未被指定的 TAGS 列将置为 NULL。例如：
 
-```
+```sql
 INSERT INTO d21001 USING meters (groupId) TAGS (2) VALUES ('2021-07-13 14:06:33.196', 10.15, 217, 0.33);
 ```
 
 自动建表语法也支持在一条语句中向多个表插入记录。例如：
 
-```
+```sql
 INSERT INTO d21001 USING meters TAGS ('California.SanFrancisco', 2) VALUES ('2021-07-13 14:06:34.630', 10.2, 219, 0.32) ('2021-07-13 14:06:35.779', 10.15, 217, 0.33)
             d21002 USING meters (groupId) TAGS (2) VALUES ('2021-07-13 14:06:34.255', 10.15, 217, 0.33)
             d21003 USING meters (groupId) TAGS (2) (ts, current, phase) VALUES ('2021-07-13 14:06:34.255', 10.27, 0.31);
 ```
 
-:::info
-在 2.0.20.5 版本之前，在使用自动建表语法并指定列时，子表的列名必须紧跟在子表名称后面，而不能如例子里那样放在 TAGS 和 VALUES 之间。从 2.0.20.5 版本开始，两种写法都可以，但不能在一条 SQL 语句中混用，否则会报语法错误。
-:::
-
 ## 插入来自文件的数据记录
 
 除了使用 VALUES 关键字插入一行或多行数据外，也可以把要写入的数据放在 CSV 文件中（英文逗号分隔、英文单引号括住每个值）供 SQL 指令读取。其中 CSV 文件无需表头。例如，如果 /tmp/csvfile.csv 文件的内容为：
@@ -99,51 +113,19 @@ INSERT INTO d21001 USING meters TAGS ('California.SanFrancisco', 2) VALUES ('202
 
 那么通过如下指令可以把这个文件中的数据写入子表中：
 
-```
+```sql
 INSERT INTO d1001 FILE '/tmp/csvfile.csv';
 ```
 
 ## 插入来自文件的数据记录，并自动建表
 
-从 2.1.5.0 版本开始，支持在插入来自 CSV 文件的数据时，以超级表为模板来自动创建不存在的数据表。例如：
-
-```
+```sql
 INSERT INTO d21001 USING meters TAGS ('California.SanFrancisco', 2) FILE '/tmp/csvfile.csv';
 ```
 
 也可以在一条语句中向多个表以自动建表的方式插入记录。例如：
 
-```
+```sql
 INSERT INTO d21001 USING meters TAGS ('California.SanFrancisco', 2) FILE '/tmp/csvfile_21001.csv'
             d21002 USING meters (groupId) TAGS (2) FILE '/tmp/csvfile_21002.csv';
 ```
-
-## 历史记录写入
-
-可使用 IMPORT 或者 INSERT 命令，IMPORT 的语法，功能与 INSERT 完全一样。
-
-针对 insert 类型的 SQL 语句，我们采用的流式解析策略，在发现后面的错误之前，前面正确的部分 SQL 仍会执行。下面的 SQL 中，INSERT 语句是无效的，但是 d1001 仍会被创建。
-
-```
-taos> CREATE TABLE meters(ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) TAGS(location BINARY(30), groupId INT);
-Query OK, 0 row(s) affected (0.008245s)
-
-taos> SHOW STABLES;
-              name              |      created_time       | columns |  tags  |   tables    |
-============================================================================================
- meters                         | 2020-08-06 17:50:27.831 |       4 |      2 |           0 |
-Query OK, 1 row(s) in set (0.001029s)
-
-taos> SHOW TABLES;
-Query OK, 0 row(s) in set (0.000946s)
-
-taos> INSERT INTO d1001 USING meters TAGS('California.SanFrancisco', 2) VALUES('a');
-
-DB error: invalid SQL: 'a' (invalid timestamp) (0.039494s)
-
-taos> SHOW TABLES;
-           table_name           |      created_time       | columns |          stable_name           |
-======================================================================================================
- d1001                          | 2020-08-06 17:52:02.097 |       4 | meters                         |
-Query OK, 1 row(s) in set (0.001091s)
-```
diff --git a/docs/zh/12-taos-sql/06-select.md b/docs/zh/12-taos-sql/06-select.md
index 92abc4344b..51150b8b13 100644
--- a/docs/zh/12-taos-sql/06-select.md
+++ b/docs/zh/12-taos-sql/06-select.md
@@ -5,121 +5,118 @@ title: 数据查询
 
 ## 查询语法
 
-```
-SELECT select_expr [, select_expr ...]
-    FROM {tb_name_list}
-    [WHERE where_condition]
-    [SESSION(ts_col, tol_val)]
-    [STATE_WINDOW(col)]
-    [INTERVAL(interval_val [, interval_offset]) [SLIDING sliding_val]]
-    [FILL(fill_mod_and_val)]
-    [GROUP BY col_list]
-    [ORDER BY col_list { DESC | ASC }]
+```sql
+SELECT {DATABASE() | CLIENT_VERSION() | SERVER_VERSION() | SERVER_STATUS() | NOW() | TODAY() | TIMEZONE()}
+
+SELECT [DISTINCT] select_list
+    from_clause
+    [WHERE condition]
+    [PARTITION BY tag_list]
+    [window_clause]
+    [group_by_clause]
+    [order_by_clause]
     [SLIMIT limit_val [SOFFSET offset_val]]
     [LIMIT limit_val [OFFSET offset_val]]
-    [>> export_file];
+    [>> export_file]
+
+select_list:
+    select_expr [, select_expr] ...
+
+select_expr: {
+    *
+  | query_name.*
+  | [schema_name.] {table_name | view_name} .*
+  | t_alias.*
+  | expr [[AS] c_alias]
+}
+
+from_clause: {
+    table_reference [, table_reference] ...
+  | join_clause [, join_clause] ...
+}
+
+table_reference:
+    table_expr t_alias
+
+table_expr: {
+    table_name
+  | view_name
+  | ( subquery )
+}
+
+join_clause:
+    table_reference [INNER] JOIN table_reference ON condition
+
+window_clause: {
+    SESSION(ts_col, tol_val)
+  | STATE_WINDOW(col)
+  | INTERVAL(interval_val [, interval_offset]) [SLIDING (sliding_val)] [WATERMARK(watermark_val)] [FILL(fill_mod_and_val)]
+
+changes_option: {
+    DURATION duration_val
+  | ROWS rows_val
+}
+
+group_by_clause:
+    GROUP BY expr [, expr] ... HAVING condition
+
+order_by_clause:
+    ORDER BY order_expr [, order_expr] ...
+
+order_expr:
+    {expr | position | c_alias} [DESC | ASC] [NULLS FIRST | NULLS LAST]
 ```
 
-## 通配符
+## 列表
 
-通配符 \* 可以用于代指全部列。对于普通表，结果中只有普通列。
+查询语句可以指定部分或全部列作为返回结果。数据列和标签列都可以出现在列表中。
 
-```
-taos> SELECT * FROM d1001;
-           ts            |       current        |   voltage   |        phase         |
-======================================================================================
- 2018-10-03 14:38:05.000 |             10.30000 |         219 |              0.31000 |
- 2018-10-03 14:38:15.000 |             12.60000 |         218 |              0.33000 |
- 2018-10-03 14:38:16.800 |             12.30000 |         221 |              0.31000 |
-Query OK, 3 row(s) in set (0.001165s)
-```
+### 通配符
 
-在针对超级表，通配符包含 _标签列_ 。
+通配符 \* 可以用于代指全部列。对于普通表，结果中只有普通列。对于超级表和子表，还包含了 TAG 列。
 
-```
-taos> SELECT * FROM meters;
-           ts            |       current        |   voltage   |        phase         |            location            |   groupid   |
-=====================================================================================================================================
- 2018-10-03 14:38:05.500 |             11.80000 |         221 |              0.28000 | California.LosAngeles                |           2 |
- 2018-10-03 14:38:16.600 |             13.40000 |         223 |              0.29000 | California.LosAngeles                |           2 |
- 2018-10-03 14:38:05.000 |             10.80000 |         223 |              0.29000 | California.LosAngeles                |           3 |
- 2018-10-03 14:38:06.500 |             11.50000 |         221 |              0.35000 | California.LosAngeles                |           3 |
- 2018-10-03 14:38:04.000 |             10.20000 |         220 |              0.23000 | California.SanFrancisco               |           3 |
- 2018-10-03 14:38:16.650 |             10.30000 |         218 |              0.25000 | California.SanFrancisco               |           3 |
- 2018-10-03 14:38:05.000 |             10.30000 |         219 |              0.31000 | California.SanFrancisco               |           2 |
- 2018-10-03 14:38:15.000 |             12.60000 |         218 |              0.33000 | California.SanFrancisco               |           2 |
- 2018-10-03 14:38:16.800 |             12.30000 |         221 |              0.31000 | California.SanFrancisco               |           2 |
-Query OK, 9 row(s) in set (0.002022s)
+```sql
+SELECT * FROM d1001;
 ```
 
 通配符支持表名前缀，以下两个 SQL 语句均为返回全部的列：
 
-```
+```sql
 SELECT * FROM d1001;
 SELECT d1001.* FROM d1001;
 ```
 
-在 JOIN 查询中，带前缀的\*和不带前缀\*返回的结果有差别， \*返回全部表的所有列数据（不包含标签），带前缀的通配符，则只返回该表的列数据。
+在 JOIN 查询中，带表名前缀的\*和不带前缀\*返回的结果有差别， \*返回全部表的所有列数据（不包含标签），而带表名前缀的通配符，则只返回该表的列数据。
 
-```
-taos> SELECT * FROM d1001, d1003 WHERE d1001.ts=d1003.ts;
-           ts            | current |   voltage   |    phase     |           ts            | current |   voltage   |    phase     |
-==================================================================================================================================
- 2018-10-03 14:38:05.000 | 10.30000|         219 |      0.31000 | 2018-10-03 14:38:05.000 | 10.80000|         223 |      0.29000 |
-Query OK, 1 row(s) in set (0.017385s)
+```sql
+SELECT * FROM d1001, d1003 WHERE d1001.ts=d1003.ts;
+SELECT d1001.* FROM d1001,d1003 WHERE d1001.ts = d1003.ts;
 ```
 
-```
-taos> SELECT d1001.* FROM d1001,d1003 WHERE d1001.ts = d1003.ts;
-           ts            |       current        |   voltage   |        phase         |
-======================================================================================
- 2018-10-03 14:38:05.000 |             10.30000 |         219 |              0.31000 |
-Query OK, 1 row(s) in set (0.020443s)
-```
+上面的查询语句中，前者返回 d1001 和 d1003 的全部列，而后者仅返回 d1001 的全部列。
 
 在使用 SQL 函数来进行查询的过程中，部分 SQL 函数支持通配符操作。其中的区别在于：
 `count(*)`函数只返回一列。`first`、`last`、`last_row`函数则是返回全部列。
 
-```
-taos> SELECT COUNT(*) FROM d1001;
-       count(*)        |
-========================
-                     3 |
-Query OK, 1 row(s) in set (0.001035s)
+### 标签列
+
+在超级表和子表的查询中可以指定 _标签列_，且标签列的值会与普通列的数据一起返回。
+
+```sql
+SELECT location, groupid, current FROM d1001 LIMIT 2;
 ```
 
-```
-taos> SELECT FIRST(*) FROM d1001;
-        first(ts)        |    first(current)    | first(voltage) |     first(phase)     |
-=========================================================================================
- 2018-10-03 14:38:05.000 |             10.30000 |            219 |              0.31000 |
-Query OK, 1 row(s) in set (0.000849s)
-```
+### 结果去重
 
-## 标签列
+`DISTINCT` 关键字可以对结果集中的一列或多列进行去重，去重的列既可以是标签列也可以是数据列。
 
-从 2.0.14 版本开始，支持在普通表的查询中指定 _标签列_，且标签列的值会与普通列的数据一起返回。
-
-```
-taos> SELECT location, groupid, current FROM d1001 LIMIT 2;
-            location            |   groupid   |       current        |
-======================================================================
- California.SanFrancisco               |           2 |             10.30000 |
- California.SanFrancisco               |           2 |             12.60000 |
-Query OK, 2 row(s) in set (0.003112s)
-```
-
-注意：普通表的通配符 \* 中并不包含 _标签列_。
-
-## 获取标签列或普通列的去重取值
-
-从 2.0.15.0 版本开始，支持在超级表查询标签列时，指定 DISTINCT 关键字，这样将返回指定标签列的所有不重复取值。注意，在 2.1.6.0 版本之前，DISTINCT 只支持处理单个标签列，而从 2.1.6.0 版本开始，DISTINCT 可以对多个标签列进行处理，输出这些标签列取值不重复的组合。
+对标签列去重：
 
 ```sql
 SELECT DISTINCT tag_name [, tag_name ...] FROM stb_name;
 ```
 
-从 2.1.7.0 版本开始，DISTINCT 也支持对数据子表或普通表进行处理，也即支持获取单个普通列的不重复取值，或多个普通列取值的不重复组合。
+对数据列去重：
 
 ```sql
 SELECT DISTINCT col_name [, col_name ...] FROM tb_name;
@@ -133,210 +130,178 @@ SELECT DISTINCT col_name [, col_name ...] FROM tb_name;
 
 :::
 
-## 结果集列名
+### 结果集列名
 
 `SELECT`子句中，如果不指定返回结果集合的列名，结果集列名称默认使用`SELECT`子句中的表达式名称作为列名称。此外，用户可使用`AS`来重命名返回结果集合中列的名称。例如：
 
-```
+```sql
 taos> SELECT ts, ts AS primary_key_ts FROM d1001;
-           ts            |     primary_key_ts      |
-====================================================
- 2018-10-03 14:38:05.000 | 2018-10-03 14:38:05.000 |
- 2018-10-03 14:38:15.000 | 2018-10-03 14:38:15.000 |
- 2018-10-03 14:38:16.800 | 2018-10-03 14:38:16.800 |
-Query OK, 3 row(s) in set (0.001191s)
 ```
 
 但是针对`first(*)`、`last(*)`、`last_row(*)`不支持针对单列的重命名。
 
-## 隐式结果列
+### 隐式结果列
 
 `Select_exprs`可以是表所属列的列名，也可以是基于列的函数表达式或计算式，数量的上限 256 个。当用户使用了`interval`或`group by tags`的子句以后，在最后返回结果中会强制返回时间戳列（第一列）和 group by 子句中的标签列。后续的版本中可以支持关闭 group by 子句中隐式列的输出，列输出完全由 select 子句控制。
 
-## 表（超级表）列表
+### 伪列
+
+**TBNAME**
+`TBNAME` 可以视为超级表中一个特殊的标签，代表子表的表名。
+
+获取一个超级表所有的子表名及相关的标签信息：
+
+```mysql
+SELECT TBNAME, location FROM meters;
+```
+
+统计超级表下辖子表数量：
+
+```mysql
+SELECT COUNT(*) FROM (SELECT DISTINCT TBNAME FROM meters);
+```
+
+以上两个查询均只支持在 WHERE 条件子句中添加针对标签（TAGS）的过滤条件。
+
+**\_QSTART/\_QEND**
+
+\_qstart 和\_qend 表示用户输入的查询时间范围，即 WHERE 子句中主键时间戳条件所限定的时间范围。如果 WHERE 子句中没有有效的主键时间戳条件，则时间范围为[-2^63, 2^63-1]。
+
+\_qstart 和\_qend 不能用于 WHERE 子句中。
+
+**\_WSTART/\_WEND/\_WDURATION**
+\_wstart 伪列、\_wend 伪列和\_wduration 伪列
+\_wstart 表示窗口起始时间戳，\_wend 表示窗口结束时间戳，\_wduration 表示窗口持续时长。
+
+这三个伪列只能用于时间窗口的窗口切分查询之中，且要在窗口切分子句之后出现。
+
+### \_c0/\_ROWTS
+
+TDengine 中，所有表的第一列都必须是时间戳类型，且为其主键，\_rowts 伪列和\_c0 伪列均代表了此列的值。相比实际的主键时间戳列，使用伪列更加灵活，语义也更加标准。例如，可以和 max\min 等函数一起使用。
+
+```sql
+select _rowts, max(current) from meters;
+```
+
+## GROUP BY
+
+如果在语句中同时指定了 GROUP BY 子句，那么 SELECT 列表只能包含如下表达式：
+
+1. 常量
+2. 聚集函数
+3. 与 GROUP BY 后表达式相同的表达式。
+4. 包含前面表达式的表达式
+
+GROUP BY 子句对每行数据按 GROUP BY 后的表达式的值进行分组，并为每个组返回一行汇总信息。
+
+GROUP BY 子句中的表达式可以包含表或视图中的任何列，这些列不需要出现在 SELECT 列表中。
+
+该子句对行进行分组，但不保证结果集的顺序。若要对分组进行排序，请使用 ORDER BY 子句
+
+## 查询对象
 
 FROM 关键字后面可以是若干个表（超级表）列表，也可以是子查询的结果。
 如果没有指定用户的当前数据库，可以在表名称之前使用数据库的名称来指定表所属的数据库。例如：`power.d1001` 方式来跨库使用表。
 
-```
-SELECT * FROM power.d1001;
-------------------------------
-USE power;
-SELECT * FROM d1001;
-```
+TDengine 支持基于时间戳主键的 INNER JOIN，规则如下：
+
+1. 支持 FROM 表列表和显式的 JOIN 子句两种语法。
+2. 对于普通表和子表，ON 条件必须有且只有时间戳主键的等值条件。
+3. 对于超级表，ON 条件在时间戳主键的等值条件之外，还要求有可以一一对应的标签列等值条件，不支持 OR 条件。
+4. 参与 JOIN 计算的表只能是同一种类型，即只能都是超级表，或都是子表，或都是普通表。
+5. JOIN 两侧均支持子查询。
+6. 参与 JOIN 的表个数上限为 10 个。
+7. 不支持与 FILL 子句混合使用。
+
+## PARTITON BY
+
+PARTITION BY 子句是 TDengine 特色语法，按 part_list 对数据进行切分，在每个切分的分片中进行计算。
+
+详见 [TDengine 特色查询](taos-sql/distinguished)
+
+## ORDER BY
+
+ORDER BY 子句对结果集排序。如果没有指定 ORDER BY，无法保证同一语句多次查询的结果集返回顺序一致。
+
+ORDER BY 后可以使用位置语法，位置标识为正整数，从 1 开始，表示使用 SELECT 列表的第几个表达式进行排序。
+
+ASC 表示升序，DESC 表示降序。
+
+NULLS 语法用来指定 NULL 值在排序中输出的位置。NULLS LAST 是升序的默认值，NULLS FIRST 是降序的默认值。
+
+## LIMIT
+
+LIMIT 控制输出条数，OFFSET 指定从第几条之后开始输出。LIMIT/OFFSET 对结果集的执行顺序在 ORDER BY 之后。LIMIT 5 OFFSET 2 可以简写为 LIMIT 2, 5，都输出第 3 行到第 7 行数据。
+
+在有 PARTITION BY 子句时，LIMIT 控制的是每个切分的分片中的输出，而不是总的结果集输出。
+
+## SLIMIT
+
+SLIMIT 和 PARTITION BY 子句一起使用，用来控制输出的分片的数量。SLIMIT 5 SOFFSET 2 可以简写为 SLIMIT 2, 5，都表示输出第 3 个到第 7 个分片。
+
+需要注意，如果有 ORDER BY 子句，则输出只有一个分片。
 
 ## 特殊功能
 
-部分特殊的查询功能可以不使用 FROM 子句执行。获取当前所在的数据库 database()：
+部分特殊的查询功能可以不使用 FROM 子句执行。
 
-```
-taos> SELECT DATABASE();
-           database()           |
-=================================
- power                          |
-Query OK, 1 row(s) in set (0.000079s)
+### 获取当前数据库
+
+下面的命令可以获取当前所在的数据库 database()，如果登录的时候没有指定默认数据库，且没有使用`USE`命令切换数据库，则返回 NULL。
+
+```sql
+SELECT DATABASE();
 ```
 
-如果登录的时候没有指定默认数据库，且没有使用`USE`命令切换数据，则返回 NULL。
+### 获取服务器和客户端版本号
 
-```
-taos> SELECT DATABASE();
-           database()           |
-=================================
- NULL                           |
-Query OK, 1 row(s) in set (0.000184s)
+```sql
+SELECT CLIENT_VERSION();
+SELECT SERVER_VERSION();
 ```
 
-获取服务器和客户端版本号：
-
-```
-taos> SELECT CLIENT_VERSION();
- client_version() |
-===================
- 2.0.0.0          |
-Query OK, 1 row(s) in set (0.000070s)
-
-taos> SELECT SERVER_VERSION();
- server_version() |
-===================
- 2.0.0.0          |
-Query OK, 1 row(s) in set (0.000077s)
-```
+### 获取服务器状态
 
 服务器状态检测语句。如果服务器正常，返回一个数字（例如 1）。如果服务器异常，返回 error code。该 SQL 语法能兼容连接池对于 TDengine 状态的检查及第三方工具对于数据库服务器状态的检查。并可以避免出现使用了错误的心跳检测 SQL 语句导致的连接池连接丢失的问题。
 
-```
-taos> SELECT SERVER_STATUS();
- server_status() |
-==================
-               1 |
-Query OK, 1 row(s) in set (0.000074s)
-
-taos> SELECT SERVER_STATUS() AS status;
-   status    |
-==============
-           1 |
-Query OK, 1 row(s) in set (0.000081s)
+```sql
+SELECT SERVER_STATUS();
 ```
 
-## \_block_dist 函数
+### 获取当前时间
 
-**功能说明**: 用于获得指定的（超级）表的数据块分布信息
-
-```txt title="语法"
-SELECT _block_dist() FROM { tb_name | stb_name }
+```sql
+SELECT NOW();
 ```
 
-**返回结果类型**：字符串。
+### 获取当前日期
 
-**适用数据类型**：不能输入任何参数。
-
-**嵌套子查询支持**：不支持子查询或嵌套查询。
-
-**返回结果**:
-
-- 返回 FROM 子句中输入的表或超级表的数据块分布情况。不支持查询条件。
-- 返回的结果是该表或超级表的数据块所包含的行数的数据分布直方图。
-
-```txt title="返回结果"
-summary:
-5th=[392], 10th=[392], 20th=[392], 30th=[392], 40th=[792], 50th=[792] 60th=[792], 70th=[792], 80th=[792], 90th=[792], 95th=[792], 99th=[792] Min=[392(Rows)] Max=[800(Rows)] Avg=[666(Rows)] Stddev=[2.17] Rows=[2000], Blocks=[3], Size=[5.440(Kb)] Comp=[0.23] RowsInMem=[0] SeekHeaderTime=[1(us)]
+```sql
+SELECT TODAY();
 ```
 
-**上述信息的说明如下**:
+### 获取当前时区
 
-- 查询的（超级）表所包含的存储在文件中的数据块（data block）中所包含的数据行的数量分布直方图信息：5%， 10%， 20%， 30%， 40%， 50%， 60%， 70%， 80%， 90%， 95%， 99% 的数值；
-- 所有数据块中，包含行数最少的数据块所包含的行数量， 其中的 Min 指标 392 行。
-- 所有数据块中，包含行数最多的数据块所包含的行数量， 其中的 Max 指标 800 行。
-- 所有数据块行数的算数平均值 666 行（其中的 Avg 项）。
-- 所有数据块中行数分布的均方差为 2.17 ( stddev ）。
-- 数据块包含的行的总数为 2000 行（Rows）。
-- 数据块总数是 3 个数据块 （Blocks）。
-- 数据块占用磁盘空间大小 5.44 Kb （size）。
-- 压缩后的数据块的大小除以原始数据的所获得的压缩比例： 23%（Comp），及压缩后的数据规模是原始数据规模的 23%。
-- 内存中存在的数据行数是 0，表示内存中没有数据缓存。
-- 获取数据块信息的过程中读取头文件的时间开销 1 微秒（SeekHeaderTime）。
-
-**支持版本**：指定计算算法的功能从 2.1.0.x 版本开始，2.1.0.0 之前的版本不支持指定使用算法的功能。
+```sql
+SELECT TIMEZONE();
+```
 
 ## TAOS SQL 中特殊关键词
 
 - `TBNAME`： 在超级表查询中可视为一个特殊的标签，代表查询涉及的子表名
 - `_c0`: 表示表（超级表）的第一列
 
-## 小技巧
-
 获取一个超级表所有的子表名及相关的标签信息：
 
-```
+```sql
 SELECT TBNAME, location FROM meters;
 ```
 
 统计超级表下辖子表数量：
 
+```sql
+SELECT COUNT(*) FROM (SELECT DISTINCT TBNAMEFROM meters);
 ```
-SELECT COUNT(TBNAME) FROM meters;
-```
-
-以上两个查询均只支持在 WHERE 条件子句中添加针对标签（TAGS）的过滤条件。例如：
-
-```
-taos> SELECT TBNAME, location FROM meters;
-             tbname             |            location            |
-==================================================================
- d1004                          | California.LosAngeles                |
- d1003                          | California.LosAngeles                |
- d1002                          | California.SanFrancisco               |
- d1001                          | California.SanFrancisco               |
-Query OK, 4 row(s) in set (0.000881s)
-
-taos> SELECT COUNT(tbname) FROM meters WHERE groupId > 2;
-     count(tbname)     |
-========================
-                     2 |
-Query OK, 1 row(s) in set (0.001091s)
-```
-
-- 可以使用 \* 返回所有列，或指定列名。可以对数字列进行四则运算，可以给输出的列取列名。
-  - 暂不支持含列名的四则运算表达式用于条件过滤算子（例如，不支持 `where a*2>6;`，但可以写 `where a>6/2;`）。
-  - 暂不支持含列名的四则运算表达式作为 SQL 函数的应用对象（例如，不支持 `select min(2*a) from t;`，但可以写 `select 2*min(a) from t;`）。
-- WHERE 语句可以使用各种逻辑判断来过滤数字值，或使用通配符来过滤字符串。
-- 输出结果缺省按首列时间戳升序排序，但可以指定按降序排序( \_c0 指首列时间戳)。使用 ORDER BY 对其他字段进行排序,排序结果顺序不确定。
-- 参数 LIMIT 控制输出条数，OFFSET 指定从第几条开始输出。LIMIT/OFFSET 对结果集的执行顺序在 ORDER BY 之后。且 `LIMIT 5 OFFSET 2` 可以简写为 `LIMIT 2, 5`。
-  - 在有 GROUP BY 子句的情况下，LIMIT 参数控制的是每个分组中至多允许输出的条数。
-- 参数 SLIMIT 控制由 GROUP BY 指令划分的分组中，至多允许输出几个分组的数据。且 `SLIMIT 5 SOFFSET 2` 可以简写为 `SLIMIT 2, 5`。
-- 通过 “>>” 输出结果可以导出到指定文件。
-
-## 条件过滤操作
-
-| **Operation** | **Note**                 | **Applicable Data Types**                 |
-| ------------- | ------------------------ | ----------------------------------------- |
-| >             | larger than              | all types except bool                     |
-| <             | smaller than             | all types except bool                     |
-| >=            | larger than or equal to  | all types except bool                     |
-| <=            | smaller than or equal to | all types except bool                     |
-| =             | equal to                 | all types                                 |
-| <\>           | not equal to             | all types                                 |
-| is [not] null | is null or is not null   | all types                                 |
-| between and   | within a certain range   | all types except bool                     |
-| in            | match any value in a set | all types except first column `timestamp` |
-| like          | match a wildcard string  | **`binary`** **`nchar`**                  |
-| match/nmatch  | filter regex             | **`binary`** **`nchar`**                  |
-
-**使用说明**:
-
-- <\> 算子也可以写为 != ，请注意，这个算子不能用于数据表第一列的 timestamp 字段。
-- like 算子使用通配符字符串进行匹配检查。
-  - 在通配符字符串中：'%'（百分号）匹配 0 到任意个字符；'\_'（下划线）匹配单个任意 ASCII 字符。
-  - 如果希望匹配字符串中原本就带有的 \_（下划线）字符，那么可以在通配符字符串中写作 `\_`，也即加一个反斜线来进行转义。（从 2.2.0.0 版本开始支持）
-  - 通配符字符串最长不能超过 20 字节。（从 2.1.6.1 版本开始，通配符字符串的长度放宽到了 100 字节，并可以通过 taos.cfg 中的 maxWildCardsLength 参数来配置这一长度限制。但不建议使用太长的通配符字符串，将有可能严重影响 LIKE 操作的执行性能。）
-- 同时进行多个字段的范围过滤，需要使用关键词 AND 来连接不同的查询条件，暂不支持 OR 连接的不同列之间的查询过滤条件。
-  - 从 2.3.0.0 版本开始，已支持完整的同一列和/或不同列间的 AND/OR 运算。
-- 针对单一字段的过滤，如果是时间过滤条件，则一条语句中只支持设定一个；但针对其他的（普通）列或标签列，则可以使用 `OR` 关键字进行组合条件的查询过滤。例如： `((value > 20 AND value < 30) OR (value < 12))`。
-  - 从 2.3.0.0 版本开始，允许使用多个时间过滤条件，但首列时间戳的过滤运算结果只能包含一个区间。
-- 从 2.0.17.0 版本开始，条件过滤开始支持 BETWEEN AND 语法，例如 `WHERE col2 BETWEEN 1.5 AND 3.25` 表示查询条件为“1.5 ≤ col2 ≤ 3.25”。
-- 从 2.1.4.0 版本开始，条件过滤开始支持 IN 算子，例如 `WHERE city IN ('California.SanFrancisco', 'California.SanDieo')`。说明：BOOL 类型写作 `{true, false}` 或 `{0, 1}` 均可，但不能写作 0、1 之外的整数；FLOAT 和 DOUBLE 类型会受到浮点数精度影响，集合内的值在精度范围内认为和数据行的值完全相等才能匹配成功；TIMESTAMP 类型支持非主键的列。
-- 从 2.3.0.0 版本开始，条件过滤开始支持正则表达式，关键字 match/nmatch，不区分大小写。
 
 ## 正则表达式过滤
 
@@ -358,7 +323,7 @@ WHERE (column|tbname) **match/MATCH/nmatch/NMATCH** _regex_
 
 ## JOIN 子句
 
-从 2.2.0.0 版本开始，TDengine 对内连接（INNER JOIN）中的自然连接（Natural join）操作实现了完整的支持。也即支持“普通表与普通表之间”、“超级表与超级表之间”、“子查询与子查询之间”进行自然连接。自然连接与内连接的主要区别是，自然连接要求参与连接的字段在不同的表/超级表中必须是同名字段。也即，TDengine 在连接关系的表达中，要求必须使用同名数据列/标签列的相等关系。
+TDengine 支持“普通表与普通表之间”、“超级表与超级表之间”、“子查询与子查询之间” 进行自然连接。自然连接与内连接的主要区别是，自然连接要求参与连接的字段在不同的表/超级表中必须是同名字段。也即，TDengine 在连接关系的表达中，要求必须使用同名数据列/标签列的相等关系。
 
 在普通表与普通表之间的 JOIN 操作中，只能使用主键时间戳之间的相等关系。例如：
 
diff --git a/docs/zh/12-taos-sql/08-delete-data.mdx b/docs/zh/12-taos-sql/08-delete-data.mdx
index eafe8cff26..15becd2593 100644
--- a/docs/zh/12-taos-sql/08-delete-data.mdx
+++ b/docs/zh/12-taos-sql/08-delete-data.mdx
@@ -5,8 +5,6 @@ title: "删除数据"
 ---
 
 删除数据是 TDengine 提供的根据指定时间段删除指定表或超级表中数据记录的功能，方便用户清理由于设备故障等原因产生的异常数据。
-注意：本功能只在企业版 2.6.0.0 及以后的版本中提供，如需此功能请点击下面的链接访问[企业版产品](https://www.taosdata.com/products#enterprise-edition-link)
-
 
 **语法：**
 
@@ -17,21 +15,21 @@ DELETE FROM [ db_name. ] tb_name [WHERE condition];
 **功能：** 删除指定表或超级表中的数据记录
 
 **参数：**
-   
-- `db_name` ：  可选参数，指定要删除表所在的数据库名，不填写则在当前数据库中
-- `tb_name` ：  必填参数，指定要删除数据的表名，可以是普通表、子表，也可以是超级表。
-- `condition`： 可选参数，指定删除数据的过滤条件，不指定过滤条件则为表中所有数据，请慎重使用。特别说明，这里的where 条件中只支持对第一列时间列的过滤，如果是超级表，支持对 tag 列过滤。
+
+- `db_name` ： 可选参数，指定要删除表所在的数据库名，不填写则在当前数据库中
+- `tb_name` ： 必填参数，指定要删除数据的表名，可以是普通表、子表，也可以是超级表。
+- `condition`： 可选参数，指定删除数据的过滤条件，不指定过滤条件则为表中所有数据，请慎重使用。特别说明，这里的 where 条件中只支持对第一列时间列的过滤。
 
 **特别说明：**
-      
-数据删除后不可恢复，请慎重使用。为了确保删除的数据确实是自己要删除的，建议可以先使用 `select` 语句加 `where` 后的删除条件查看要删除的数据内容，确认无误后再执行 `delete` 命令。      
+
+数据删除后不可恢复，请慎重使用。为了确保删除的数据确实是自己要删除的，建议可以先使用 `select` 语句加 `where` 后的删除条件查看要删除的数据内容，确认无误后再执行 `delete` 命令。
 
 **示例：**
-    
-`meters` 是一个超级表，`groupid` 是 int 类型的 tag 列，现在要删除 `meters` 表中时间小于 2021-10-01 10:40:00.100 且 tag 列 `groupid` 值为 1 的所有数据，sql 如下：
+
+`meters` 是一个超级表，`groupid` 是 int 类型的 tag 列，现在要删除 `meters` 表中时间小于 2021-10-01 10:40:00.100 的所有数据，sql 如下：
 
 ```sql
-delete from meters where ts < '2021-10-01 10:40:00.100' and groupid=1 ;
+delete from meters where ts < '2021-10-01 10:40:00.100' ;
 ```
 
 执行后显示结果为：
diff --git a/docs/zh/12-taos-sql/12-interval.md b/docs/zh/12-taos-sql/12-distinguished.md
similarity index 91%
rename from docs/zh/12-taos-sql/12-interval.md
rename to docs/zh/12-taos-sql/12-distinguished.md
index 8bf0c578e7..2dad49ece9 100644
--- a/docs/zh/12-taos-sql/12-interval.md
+++ b/docs/zh/12-taos-sql/12-distinguished.md
@@ -12,16 +12,16 @@ TDengine 提供的特色查询包括标签切分查询和窗口切分查询。
 超级表查询中，当需要针对标签进行数据切分然后在切分出的数据空间内再进行一系列的计算时使用标签切分子句，标签切分的语句如下：
 
 ```sql
-PARTITION BY tag_list
+PARTITION BY part_list
 ```
 
-其中 `tag_list` 是标签列的列表，还可以包括 tbname 伪列。
+part_list 可以是任意的标量表达式，包括列、常量、标量函数和它们的组合。
 
-TDengine 按如下方式处理标签切分子句：
+当 PARTITION BY 和标签一起使用时，TDengine 按如下方式处理标签切分子句：
 
-标签切分子句位于 `WHERE` 子句之后，且不能和 `JOIN` 子句一起使用。
-标签切分子句将超级表数据按指定的标签组合进行切分，然后对每个切分的分片进行指定的计算。计算由之后的子句定义（窗口子句、`GROUP BY` 子句或`SELECT` 子句）。
-标签切分子句可以和窗口切分子句（或 `GROUP BY` 子句）一起使用，此时后面的子句作用在每个切分的分片上。例如，下面的示例将数据按标签 `location` 进行分组，并对每个组按 10 分钟进行降采样，取其最大值。
+- 标签切分子句位于 WHERE 子句之后，且不能和 JOIN 子句一起使用。
+- 标签切分子句将超级表数据按指定的标签组合进行切分，每个切分的分片进行指定的计算。计算由之后的子句定义（窗口子句、GROUP BY 子句或 SELECT 子句）。
+- 标签切分子句可以和窗口切分子句（或 GROUP BY 子句）一起使用，此时后面的子句作用在每个切分的分片上。例如，将数据按标签 location 进行分组，并对每个组按 10 分钟进行降采样，取其最大值。
 
 ```sql
 select max(current) from meters partition by location interval(10m)
diff --git a/docs/zh/12-taos-sql/14-stream.md b/docs/zh/12-taos-sql/14-stream.md
index e69de29bb2..7ff7da2bfb 100644
--- a/docs/zh/12-taos-sql/14-stream.md
+++ b/docs/zh/12-taos-sql/14-stream.md
@@ -0,0 +1,122 @@
+---
+sidebar_label: 流式计算
+title: 流式计算
+---
+
+在时序数据的处理中，经常要对原始数据进行清洗、预处理，再使用时序数据库进行长久的储存。用户通常需要在时序数据库之外再搭建 Kafka、Flink、Spark 等流计算处理引擎，增加了用户的开发成本和维护成本。
+
+使用 TDengine 3.0 的流式计算引擎能够最大限度的减少对这些额外中间件的依赖，真正将数据的写入、预处理、长期存储、复杂分析、实时计算、实时报警触发等功能融为一体，并且，所有这些任务只需要使用 SQL 完成，极大降低了用户的学习成本、使用成本。
+
+## 创建流式计算
+
+```sql
+CREATE STREAM [IF NOT EXISTS] stream_name [stream_options] INTO stb_name AS subquery
+stream_options: {
+ TRIGGER    [AT_ONCE | WINDOW_CLOSE | MAX_DELAY time]
+ WATERMARK   time
+}
+
+```
+
+其中 subquery 是 select 普通查询语法的子集:
+
+```sql
+subquery: SELECT [DISTINCT] select_list
+    from_clause
+    [WHERE condition]
+    [PARTITION BY tag_list]
+    [window_clause]
+    [group_by_clause]
+```
+
+不支持 order_by，limit，slimit，fill 语句
+
+例如，如下语句创建流式计算，同时自动创建名为 avg_vol 的超级表，此流计算以一分钟为时间窗口、30 秒为前向增量统计这些电表的平均电压，并将来自 meters 表的数据的计算结果写入 avg_vol 表，不同 partition 的数据会分别创建子表并写入不同子表。
+
+```sql
+CREATE STREAM avg_vol_s INTO avg_vol AS
+SELECT _wstart, count(*), avg(voltage) FROM meters PARTITION BY tbname INTERVAL(1m) SLIDING(30s);
+```
+
+## 删除流式计算
+
+```sql
+DROP STREAM [IF EXISTS] stream_name
+```
+
+仅删除流式计算任务，由流式计算写入的数据不会被删除。
+
+## 展示流式计算
+
+```sql
+SHOW STREAMS;
+```
+
+## 流式计算的触发模式
+
+在创建流时，可以通过 TRIGGER 指令指定流式计算的触发模式。
+
+对于非窗口计算，流式计算的触发是实时的；对于窗口计算，目前提供 3 种触发模式：
+
+1. AT_ONCE：写入立即触发
+
+2. WINDOW_CLOSE：窗口关闭时触发（窗口关闭由事件时间决定，可配合 watermark 使用，详见《流式计算的乱序数据容忍策略》）
+
+3. MAX_DELAY time：若窗口关闭，则触发计算。若窗口未关闭，且未关闭时长超过 max delay 指定的时间，则触发计算。
+
+由于窗口关闭是由事件时间决定的，如事件流中断、或持续延迟，则事件时间无法更新，可能导致无法得到最新的计算结果。
+
+因此，流式计算提供了以事件时间结合处理时间计算的 MAX_DELAY 触发模式。
+
+MAX_DELAY 模式在窗口关闭时会立即触发计算。此外，当数据写入后，计算触发的时间超过 max delay 指定的时间，则立即触发计算
+
+## 流式计算的乱序数据容忍策略
+
+在创建流时，可以在 stream_option 中指定 watermark。
+
+流式计算通过 watermark 来度量对乱序数据的容忍程度，watermark 默认为 0。
+
+T = 最新事件时间 - watermark
+
+每批到来的数据都会以上述公式更新窗口关闭时间，并将窗口结束时间 < T 的所有打开的窗口关闭，若触发模式为 WINDOW_CLOSE 或 MAX_DELAY，则推送窗口聚合结果。
+
+## 流式计算的过期数据处理策略
+对于已关闭的窗口，再次落入该窗口中的数据被标记为过期数据，对于过期数据，流式计算提供两种处理方式：
+
+1. 直接丢弃：这是常见流式计算引擎提供的默认（甚至是唯一）计算模式
+
+2. 重新计算：从 TSDB 中重新查找对应窗口的所有数据并重新计算得到最新结果
+
+无论在哪种模式下，watermark 都应该被妥善设置，来得到正确结果（直接丢弃模式）或避免频繁触发重算带来的性能开销（重新计算模式）。
+
+## 流式计算的数据填充策略
+
+TODO
+
+## 流式计算与会话窗口（session window）
+
+```sql
+window_clause: {
+    SESSION(ts_col, tol_val)
+  | STATE_WINDOW(col)
+  | INTERVAL(interval_val [, interval_offset]) [SLIDING (sliding_val)] [FILL(fill_mod_and_val)]
+}
+```
+
+其中，SESSION 是会话窗口，tol_val 是时间间隔的最大范围。在 tol_val 时间间隔范围内的数据都属于同一个窗口，如果连续的两条数据的时间超过 tol_val，则自动开启下一个窗口。
+
+## 流式计算的监控与流任务分布查询
+
+TODO
+
+## 流式计算的内存控制与存算分离
+
+TODO
+
+## 流式计算的暂停与恢复
+
+```sql
+STOP STREAM stream_name;
+
+RESUME STREAM stream_name;
+```
diff --git a/docs/zh/12-taos-sql/20-keywords.md b/docs/zh/12-taos-sql/20-keywords.md
index d91c9be2ac..cac29d7863 100644
--- a/docs/zh/12-taos-sql/20-keywords.md
+++ b/docs/zh/12-taos-sql/20-keywords.md
@@ -1,5 +1,5 @@
 ---
-sidebar_label: 保留关键字 
+sidebar_label: 保留关键字
 title: TDengine 保留关键字
 ---
 
@@ -58,70 +58,70 @@ title: TDengine 保留关键字
 
 ### D
 
-- DATABASE  
-- DATABASES 
-- DAYS      
-- DBS       
-- DEFERRED  
+- DATABASE
+- DATABASES
+- DAYS
+- DBS
+- DEFERRED
 - DELETE
 - DELIMITERS
-- DESC      
-- DESCRIBE  
-- DETACH    
-- DISTINCT  
-- DIVIDE    
-- DNODE     
-- DNODES    
-- DOT       
-- DOUBLE    
-- DROP  
+- DESC
+- DESCRIBE
+- DETACH
+- DISTINCT
+- DIVIDE
+- DNODE
+- DNODES
+- DOT
+- DOUBLE
+- DROP
 
 ### E
 
-- END     
-- EQ      
-- EXISTS  
-- EXPLAIN 
+- END
+- EQ
+- EXISTS
+- EXPLAIN
 
 ### F
 
-- FAIL   
-- FILE   
-- FILL   
-- FLOAT  
-- FOR    
-- FROM   
-- FSYNC  
+- FAIL
+- FILE
+- FILL
+- FLOAT
+- FOR
+- FROM
+- FSYNC
 
 ### G
 
-- GE    
-- GLOB  
+- GE
+- GLOB
 - GRANTS
-- GROUP 
-- GT  
+- GROUP
+- GT
 
 ### H
 
-- HAVING 
+- HAVING
 
 ### I
 
 - ID
 - IF
-- IGNORE 
+- IGNORE
 - IMMEDIA
-- IMPORT 
-- IN     
+- IMPORT
+- IN
 - INITIAL
-- INSERT 
+- INSERT
 - INSTEAD
-- INT    
+- INT
 - INTEGER
 - INTERVA
-- INTO   
-- IS     
-- ISNULL 
+- INTO
+- IS
+- ISNULL
 
 ### J
 
@@ -130,190 +130,147 @@ title: TDengine 保留关键字
 ### K
 
 - KEEP
-- KEY 
+- KEY
 - KILL
 
 ### L
 
-- LE    
-- LIKE  
-- LIMIT 
+- LE
+- LIKE
+- LIMIT
 - LINEAR
-- LOCAL 
-- LP    
+- LOCAL
+- LP
 - LSHIFT
-- LT 
+- LT
 
 ### M
 
-- MATCH    
-- MAXROWS  
-- MINROWS  
-- MINUS    
-- MNODES   
-- MODIFY   
-- MODULES  
+- MATCH
+- MAXROWS
+- MINROWS
+- MINUS
+- MNODES
+- MODIFY
+- MODULES
 
 ### N
 
-- NE     
-- NONE   
-- NOT    
+- NE
+- NONE
+- NOT
 - NOTNULL
-- NOW    
+- NOW
 - NULL
 
 ### O
 
-- OF    
+- OF
 - OFFSET
-- OR    
-- ORDER 
+- OR
+- ORDER
 
 ### P
 
 - PARTITION
-- PASS     
-- PLUS     
-- PPS      
+- PASS
+- PLUS
+- PPS
 - PRECISION
-- PREV     
+- PREV
 - PRIVILEGE
 
 ### Q
 
-- QTIME 
+- QTIME
 - QUERIE
-- QUERY 
+- QUERY
 - QUORUM
 
 ### R
 
-- RAISE  
-- REM    
+- RAISE
+- REM
 - REPLACE
 - REPLICA
-- RESET  
+- RESET
 - RESTRIC
-- ROW    
-- RP     
+- ROW
+- RP
 - RSHIFT
 
 ### S
 
-- SCORES 
-- SELECT 
-- SEMI   
+- SCORES
+- SELECT
+- SEMI
 - SESSION
-- SET    
-- SHOW   
-- SLASH  
+- SET
+- SHOW
+- SLASH
 - SLIDING
-- SLIMIT 
+- SLIMIT
 - SMALLIN
 - SOFFSET
-- STable 
+- STable
 - STableS
-- STAR    
-- STATE   
+- STAR
+- STATE
 - STATEMEN
 - STATE_WI
-- STORAGE 
-- STREAM  
-- STREAMS 
-- STRING  
-- SYNCDB  
+- STORAGE
+- STREAM
+- STREAMS
+- STRING
+- SYNCDB
 
 ### T
 
-- TABLE     
-- TABLES    
-- TAG       
-- TAGS      
-- TBNAME    
-- TIMES     
-- TIMESTAMP 
-- TINYINT   
-- TOPIC     
-- TOPICS    
-- TRIGGER   
-- TSERIES   
+- TABLE
+- TABLES
+- TAG
+- TAGS
+- TBNAME
+- TIMES
+- TIMESTAMP
+- TINYINT
+- TOPIC
+- TOPICS
+- TRIGGER
+- TSERIES
 
 ### U
 
-- UMINUS   
-- UNION    
-- UNSIGNED 
-- UPDATE   
-- UPLUS    
-- USE      
-- USER     
-- USERS    
-- USING  
+- UMINUS
+- UNION
+- UNSIGNED
+- UPDATE
+- UPLUS
+- USE
+- USER
+- USERS
+- USING
 
 ### V
 
-- VALUES   
-- VARIABLE 
+- VALUES
+- VARIABLE
 - VARIABLES
-- VGROUPS  
-- VIEW     
-- VNODES   
+- VGROUPS
+- VIEW
+- VNODES
 
 ### W
 
 - WAL
 - WHERE
 
-### _
+### \_
 
-- _C0
-- _QSTART
-- _QSTOP
-- _QDURATION
-- _WSTART
-- _WSTOP
-- _WDURATION
-
-
-## 特殊说明
-### TBNAME
-`TBNAME` 可以视为超级表中一个特殊的标签，代表子表的表名。
-
-获取一个超级表所有的子表名及相关的标签信息：
-
-```mysql
-SELECT TBNAME, location FROM meters;
-```
-
-统计超级表下辖子表数量：
-
-```mysql
-SELECT COUNT(TBNAME) FROM meters;
-```
-
-以上两个查询均只支持在WHERE条件子句中添加针对标签（TAGS）的过滤条件。例如：
-```mysql
-taos> SELECT TBNAME, location FROM meters;
-             tbname             |            location            |
-==================================================================
- d1004                          | California.SanFrancisco        |
- d1003                          | California.SanFrancisco        |
- d1002                          | California.LosAngeles          |
- d1001                          | California.LosAngeles          |
-Query OK, 4 row(s) in set (0.000881s)
-
-taos> SELECT COUNT(tbname) FROM meters WHERE groupId > 2;
-     count(tbname)     |
-========================
-                     2 |
-Query OK, 1 row(s) in set (0.001091s)
-```
-### _QSTART/_QSTOP/_QDURATION
-表示查询过滤窗口的起始，结束以及持续时间。
-
-### _WSTART/_WSTOP/_WDURATION
-窗口切分聚合查询（例如 interval/session window/state window）中表示每个切分窗口的起始，结束以及持续时间。
-
-### _c0/_ROWTS
-_c0 _ROWTS 等价，表示表或超级表的第一列
+- \_C0
+- \_QSTART
+- \_QSTOP
+- \_QDURATION
+- \_WSTART
+- \_WSTOP
+- \_WDURATION
diff --git a/docs/zh/12-taos-sql/22-information.md b/docs/zh/12-taos-sql/22-information.md
deleted file mode 100644
index 0695aa5172..0000000000
--- a/docs/zh/12-taos-sql/22-information.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-sidebar_label: Information内置数据库
-title: Information内置数据库
----
-
diff --git a/docs/zh/12-taos-sql/22-meta.md b/docs/zh/12-taos-sql/22-meta.md
new file mode 100644
index 0000000000..885ffbc635
--- /dev/null
+++ b/docs/zh/12-taos-sql/22-meta.md
@@ -0,0 +1,186 @@
+---
+sidebar_label: 元数据库
+title: 元数据库
+---
+
+TDengine 内置了一个名为 `INFORMATION_SCHEMA` 的数据库，提供对数据库元数据、数据库系统信息和状态的访问，例如数据库或表的名称，当前执行的 SQL 语句等。
+
+`INFORMATION_SCHEMA` 是 TDengine 启动时自动创建的数据库，该数据库存储有关 TDengine 维护的所有其他数据库的信息。它包含多个只读表。实际上，这些表都是视图，而不是基表，因此没有与它们关联的文件。所以对这些表只能查询，不能进行 INSERT 等写入操作。
+
+可以使用 USE 语句将 INFORMATION_SCHEMA 设为默认数据库。
+
+INFORMATION_SCHEMA 旨在以一种更一致的方式来提供对 TDengine 支持的各种 SHOW 语句（如 SHOW TABLES、SHOW DATABASES）提供的信息的访问。与 SHOW 语句相比，使用 SELECT ... FROM INFORMATION_SCHEMA.tablename 具有以下优点：
+
+您可以使用 SELECT 语句熟悉的语法，只需要学习一些表名和列名。
+
+您可以对查询结果进行筛选、排序等操作，事实上，您可以使用任意 TDengine 支持的 SELECT 语句对 INFORMATION_SCHEMA 中的表进行查询。
+TDengine 在后续演进中可以灵活的添加已有 INFORMATION_SCHEMA 中表的列，而不用担心对既有业务系统造成影响。
+
+此技术与其他数据库系统更具互操作性。例如，Oracle 数据库用户熟悉查询 Oracle 数据字典中的表。
+
+由于 SHOW 语句已经被开发者熟悉和广泛使用，所以它们仍然是可用的。
+
+本章将详细介绍 `INFORMATION_SCHEMA` 这个内置元数据库中的表和表结构。
+
+## DNODES
+
+提供 dnode 的相关信息。也可以使用 SHOW DNODES 来查询这些信息。
+
+| #   |    **列名**    | **数据类型** | **说明**              |
+| --- | :------------: | ------------ | --------------------- |
+| 1   |     vnodes     | SMALLINT     | dnode 中的 vnode 个数 |
+| 2   | support_vnodes | SMALLINT     | 支持的 vnode 个数     |
+| 3   |     status     | BINARY(10)   | 当前状态              |
+| 4   |      note      | BINARY(256)  | 离线原因等信息        |
+| 5   |       id       | SMALLINT     | dnode id              |
+| 6   |    endpoint    | BINARY(134)  | dnode 的地址          |
+| 7   |  create_time   | TIMESTAMP    | 创建时间              |
+
+## MNODES
+
+提供 mnode 的相关信息。也可以使用 SHOW MNODES 来查询这些信息。
+
+| #   |  **列名**   | **数据类型** | **说明**           |
+| --- | :---------: | ------------ | ------------------ |
+| 1   |     id      | SMALLINT     | mnode id           |
+| 2   |  endpoint   | BINARY(134)  | mnode 的地址       |
+| 3   |    role     | BINARY(10)   | 当前角色           |
+| 4   |  role_time  | TIMESTAMP    | 成为当前角色的时间 |
+| 5   | create_time | TIMESTAMP    | 创建时间           |
+
+## MODULES
+
+提供组件的相关信息。也可以使用 SHOW MODULES 来查询这些信息
+
+| #   | **列名** | **数据类型** | **说明**   |
+| --- | :------: | ------------ | ---------- |
+| 1   |    id    | SMALLINT     | module id  |
+| 2   | endpoint | BINARY(134)  | 组件的地址 |
+| 3   |  module  | BINARY(10)   | 组件状态   |
+
+## QNODES
+
+当前系统中 QNODE 的信息。也可以使用 SHOW QNODES 来查询这些信息。
+
+| #   |  **列名**   | **数据类型** | **说明**     |
+| --- | :---------: | ------------ | ------------ |
+| 1   |     id      | SMALLINT     | qnode id     |
+| 2   |  endpoint   | BINARY(134)  | qnode 的地址 |
+| 3   | create_time | TIMESTAMP    | 创建时间     |
+
+## USER_DATABASES
+
+提供用户创建的数据库对象的相关信息。也可以使用 SHOW DATABASES 来查询这些信息。
+
+TODO
+
+| #   |  **列名**   | **数据类型** | **说明**                                         |
+| --- | :---------: | ------------ | ------------------------------------------------ |
+| 1   |    name     | BINARY(32)   | 数据库名                                         |
+| 2   | create_time | TIMESTAMP    | 创建时间                                         |
+| 3   |   ntables   | INT          | 数据库中表的数量，包含子表和普通表但不包含超级表 |
+| 4   |   vgroups   | INT          | 数据库中有多少个 vgroup                          |
+| 5   |   replica   | INT          | 副本数                                           |
+| 6   |   quorum    | INT          | 写成功的确认数                                   |
+| 7   |    days     | INT          | 单文件存储数据的时间跨度                         |
+| 8   |    keep     | INT          | 数据保留时长                                     |
+| 9   |   buffer    | INT          | 每个 vnode 写缓存的内存块大小，单位 MB           |
+| 10  |   minrows   | INT          | 文件块中记录的最小条数                           |
+| 11  |   maxrows   | INT          | 文件块中记录的最大条数                           |
+| 12  |  wallevel   | INT          | WAL 级别                                         |
+| 13  |    fsync    | INT          | 数据落盘周期                                     |
+| 14  |    comp     | INT          | 数据压缩方式                                     |
+| 15  |  precision  | BINARY(2)    | 时间分辨率                                       |
+| 16  |   status    | BINARY(10)   | 数据库状态                                       |
+
+## USER_FUNCTIONS
+
+TODO
+
+## USER_INDEXES
+
+提供用户创建的索引的相关信息。也可以使用 SHOW INDEXES 来查询这些信息。
+
+| #   |     **列名**     | **数据类型** | **说明**                                                                           |
+| --- | :--------------: | ------------ | ---------------------------------------------------------------------------------- |
+| 1   |     db_name      | BINARY(32)   | 包含此索引的表所在的数据库名                                                       |
+| 2   |    table_name    | BINARY(192)  | 包含此索引的表的名称                                                               |
+| 3   |    index_name    | BINARY(192)  | 索引名                                                                             |
+| 4   |   column_name    | BINARY(64)   | 建索引的列的列名                                                                   |
+| 5   |    index_type    | BINARY(10)   | 目前有 SMA 和 FULLTEXT                                                             |
+| 6   | index_extensions | BINARY(256)  | 索引的额外信息。对 SMA 类型的索引，是函数名的列表。对 FULLTEXT 类型的索引为 NULL。 |
+
+## USER_STABLES
+
+提供用户创建的超级表的相关信息。
+
+| #   |   **列名**    | **数据类型** | **说明**                 |
+| --- | :-----------: | ------------ | ------------------------ |
+| 1   |  stable_name  | BINARY(192)  | 超级表表名               |
+| 2   |    db_name    | BINARY(64)   | 超级表所在的数据库的名称 |
+| 3   |  create_time  | TIMESTAMP    | 创建时间                 |
+| 4   |    columns    | INT          | 列数目                   |
+| 5   |     tags      | INT          | 标签数目                 |
+| 6   |  last_update  | TIMESTAMP    | 最后更新时间             |
+| 7   | table_comment | BINARY(1024) | 表注释                   |
+| 8   |   watermark   | BINARY(64)   | 窗口的关闭时间           |
+| 9   |   max_delay   | BINARY(64)   | 推送计算结果的最大延迟   |
+| 10  |    rollup     | BINARY(128)  | rollup 聚合函数          |
+
+## USER_STREAMS
+
+提供用户创建的流计算的相关信息。
+
+| #   |  **列名**   | **数据类型** | **说明**                    |
+| --- | :---------: | ------------ | --------------------------- |
+| 1   | stream_name | BINARY(192)  | 流计算名称                  |
+| 2   |  user_name  | BINARY(23)   | 创建流计算的用户            |
+| 3   | dest_table  | BINARY(192)  | 流计算写入的目标表          |
+| 4   | create_time | TIMESTAMP    | 创建时间                    |
+| 5   |     sql     | BLOB         | 创建流计算时提供的 SQL 语句 |
+
+## USER_TABLES
+
+提供用户创建的普通表和子表的相关信息
+
+| #   |   **列名**    | **数据类型** | **说明**         |
+| --- | :-----------: | ------------ | ---------------- |
+| 1   |  table_name   | BINARY(192)  | 表名             |
+| 2   |    db_name    | BINARY(64)   | 数据库名         |
+| 3   |  create_time  | TIMESTAMP    | 创建时间         |
+| 4   |    columns    | INT          | 列数目           |
+| 5   |  stable_name  | BINARY(192)  | 所属的超级表表名 |
+| 6   |      uid      | BIGINT       | 表 id            |
+| 7   |   vgroup_id   | INT          | vgroup id        |
+| 8   |      ttl      | INT          | 表的生命周期     |
+| 9   | table_comment | BINARY(1024) | 表注释           |
+| 10  |     type      | BINARY(20)   | 表类型           |
+
+## USER_USERS
+
+提供系统中创建的用户的相关信息。
+
+| #   |  **列名**   | **数据类型** | **说明** |
+| --- | :---------: | ------------ | -------- |
+| 1   |  user_name  | BINARY(23)   | 用户名   |
+| 2   |  privilege  | BINARY(256)  | 权限     |
+| 3   | create_time | TIMESTAMP    | 创建时间 |
+
+## VGROUPS
+
+系统中所有 vgroups 的信息。
+
+| #   |  **列名**  | **数据类型** | **说明**                     |
+| --- | :--------: | ------------ | ---------------------------- |
+| 1   |   vg_id    | INT          | vgroup id                    |
+| 2   |  db_name   | BINARY(32)   | 数据库名                     |
+| 3   |   tables   | INT          | 此 vgroup 内有多少表         |
+| 4   |   status   | BINARY(10)   | 此 vgroup 的状态             |
+| 5   |  onlines   | INT          | 在线的成员数目               |
+| 6   |  v1_dnode  | INT          | 第一个成员所在的 dnode 的 id |
+| 7   | v1_status  | BINARY(10)   | 第一个成员的状态             |
+| 8   |  v2_dnode  | INT          | 第二个成员所在的 dnode 的 id |
+| 9   | v2_status  | BINARY(10)   | 第二个成员的状态             |
+| 10  |  v3_dnode  | INT          | 第三个成员所在的 dnode 的 id |
+| 11  | v3_status  | BINARY(10)   | 第三个成员的状态             |
+| 12  | compacting | INT          | compact 状态                 |
diff --git a/docs/zh/12-taos-sql/23-show.md b/docs/zh/12-taos-sql/23-show.md
new file mode 100644
index 0000000000..781f94324c
--- /dev/null
+++ b/docs/zh/12-taos-sql/23-show.md
@@ -0,0 +1,270 @@
+---
+sidebar_label: SHOW 命令
+title: 使用 SHOW 命令查看系统元数据
+---
+
+除了使用 `select` 语句查询 `INFORMATION_SCHEMA` 数据库中的表获得系统中的各种元数据、系统信息和状态之外，也可以用 `SHOW` 命令来实现同样的目的。
+
+## SHOW ACCOUNTS
+
+```sql
+SHOW ACCOUNTS;
+```
+
+显示当前系统中所有租户的信息。
+
+注：企业版独有
+
+## SHOW APPS
+
+```sql
+SHOW APPS;
+```
+
+显示接入集群的应用（客户端）信息。
+
+## SHOW BNODES
+
+```sql
+SHOW BNODES;
+```
+
+显示当前系统中存在的 BNODE（backup node，即备份节点）的信息。
+
+## SHOW CLUSTER
+
+```sql
+SHOW CLUSTER;
+```
+
+显示当前集群的信息
+
+## SHOW CONNECTIONS
+
+```sql
+SHOW CONNECTIONS;
+```
+
+显示当前系统中存在的连接的信息。
+
+## SHOW CONSUMERS
+
+```sql
+SHOW CONSUMERS;
+```
+
+显示当前数据库下所有活跃的消费者的信息。
+
+## SHOW CREATE DATABASE
+
+```sql
+SHOW CREATE DATABASE db_name;
+```
+
+显示 db_name 指定的数据库的创建语句。
+
+## SHOW CREATE STABLE
+
+```sql
+SHOW CREATE STABLE [db_name.]stb_name;
+```
+
+显示 stb_name 指定的超级表的创建语句。
+
+## SHOW CREATE TABLE
+
+```sql
+SHOW CREATE TABLE [db_name.]tb_name;
+```
+
+显示 tb_name 指定的表的创建语句。支持普通表、超级表和子表。
+
+## SHOW DATABASES
+
+```sql
+SHOW DATABASES;
+```
+
+显示用户定义的所有数据库。
+
+## SHOW DNODES
+
+```sql
+SHOW DNODES;
+```
+
+显示当前系统中 DNODE 的信息。
+
+## SHOW FUNCTIONS
+
+```sql
+SHOW FUNCTIONS;
+```
+
+显示用户定义的自定义函数。
+
+## SHOW LICENSE
+
+```sql
+SHOW LICENSE;
+SHOW GRANTS;
+```
+
+显示企业版许可授权的信息。
+
+注：企业版独有
+
+## SHOW INDEXES
+
+```sql
+SHOW INDEXES FROM tbl_name [FROM db_name];
+```
+
+显示已创建的索引。
+
+## SHOW LOCAL VARIABLES
+
+```sql
+SHOW LOCAL VARIABLES;
+```
+
+显示当前客户端配置参数的运行值。
+
+## SHOW MNODES
+
+```sql
+SHOW MNODES;
+```
+
+显示当前系统中 MNODE 的信息。
+
+## SHOW MODULES
+
+```sql
+SHOW MODULES;
+```
+
+显示当前系统中所安装的组件的信息。
+
+## SHOW QNODES
+
+```sql
+SHOW QNODES;
+```
+
+显示当前系统中 QNODE （查询节点）的信息。
+
+## SHOW SCORES
+
+```sql
+SHOW SCORES;
+```
+
+显示系统被许可授权的容量的信息。
+
+注：企业版独有
+
+## SHOW SNODES
+
+```sql
+SHOW SNODES;
+```
+
+显示当前系统中 SNODE （流计算节点）的信息。
+
+## SHOW STABLES
+
+```sql
+SHOW [db_name.]STABLES [LIKE 'pattern'];
+```
+
+显示当前数据库下的所有超级表的信息。可以使用 LIKE 对表名进行模糊匹配。
+
+## SHOW STREAMS
+
+```sql
+SHOW STREAMS;
+```
+
+显示当前系统内所有流计算的信息。
+
+## SHOW SUBSCRIPTIONS
+
+```sql
+SHOW SUBSCRIPTIONS;
+```
+
+显示当前数据库下的所有的订阅关系
+
+## SHOW TABLES
+
+```sql
+SHOW [db_name.]TABLES [LIKE 'pattern'];
+```
+
+显示当前数据库下的所有普通表和子表的信息。可以使用 LIKE 对表名进行模糊匹配。
+
+## SHOW TABLE DISTRIBUTED
+
+```sql
+SHOW TABLE DISTRIBUTED table_name;
+```
+
+显示表的数据分布信息。
+
+## SHOW TAGS
+
+```sql
+SHOW TAGS FROM child_table_name [FROM db_name];
+```
+
+显示子表的标签信息。
+
+## SHOW TOPICS
+
+```sql
+SHOW TOPICS;
+```
+
+显示当前数据库下的所有主题的信息。
+
+## SHOW TRANSACTIONS
+
+```sql
+SHOW TRANSACTIONS;
+```
+
+显示当前系统中正在执行的事务的信息
+
+## SHOW USERS
+
+```sql
+SHOW USERS;
+```
+
+显示当前系统中所有用户的信息。包括用户自定义的用户和系统默认用户。
+
+## SHOW VARIABLES
+
+```sql
+SHOW VARIABLES;
+SHOW DNODE dnode_id VARIABLES;
+```
+
+显示当前系统中各节点需要相同的配置参数的运行值，也可以指定 DNODE 来查看其配置参数。
+
+## SHOW VGROUPS
+
+```sql
+SHOW [db_name.]VGROUPS;
+```
+
+显示当前系统中所有 VGROUP 或某个 db 的 VGROUPS 的信息。
+
+## SHOW VNODES
+
+```sql
+SHOW VNODES [dnode_name];
+```
+
+显示当前系统中所有 VNODE 或某个 DNODE 的 VNODE 的信息。

From b0ad07d4e7b2206508bb69b75e283affa2bdfbb3 Mon Sep 17 00:00:00 2001
From: gccgdb1234 <wxzhang@taosdata.com>
Date: Tue, 26 Jul 2022 19:06:32 +0800
Subject: [PATCH 26/26] doc: SQL reference guide, grant and select

---
 docs/zh/12-taos-sql/06-select.md | 50 +++++++--------------
 docs/zh/12-taos-sql/25-grant.md  | 77 ++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 33 deletions(-)
 create mode 100644 docs/zh/12-taos-sql/25-grant.md

diff --git a/docs/zh/12-taos-sql/06-select.md b/docs/zh/12-taos-sql/06-select.md
index 51150b8b13..16f1997d5a 100644
--- a/docs/zh/12-taos-sql/06-select.md
+++ b/docs/zh/12-taos-sql/06-select.md
@@ -175,7 +175,7 @@ SELECT COUNT(*) FROM (SELECT DISTINCT TBNAME FROM meters);
 
 这三个伪列只能用于时间窗口的窗口切分查询之中，且要在窗口切分子句之后出现。
 
-### \_c0/\_ROWTS
+**\_c0/\_ROWTS**
 
 TDengine 中，所有表的第一列都必须是时间戳类型，且为其主键，\_rowts 伪列和\_c0 伪列均代表了此列的值。相比实际的主键时间戳列，使用伪列更加灵活，语义也更加标准。例如，可以和 max\min 等函数一起使用。
 
@@ -183,6 +183,21 @@ TDengine 中，所有表的第一列都必须是时间戳类型，且为其主
 select _rowts, max(current) from meters;
 ```
 
+## 查询对象
+
+FROM 关键字后面可以是若干个表（超级表）列表，也可以是子查询的结果。
+如果没有指定用户的当前数据库，可以在表名称之前使用数据库的名称来指定表所属的数据库。例如：可以使用 `power.d1001` 的方式来跨库使用表。
+
+TDengine 支持基于时间戳主键的 INNER JOIN，规则如下：
+
+1. 支持 FROM 表列表和显式的 JOIN 子句两种语法。
+2. 对于普通表和子表，ON 条件必须有且只有时间戳主键的等值条件。
+3. 对于超级表，ON 条件在时间戳主键的等值条件之外，还要求有可以一一对应的标签列等值条件，不支持 OR 条件。
+4. 参与 JOIN 计算的表只能是同一种类型，即只能都是超级表，或都是子表，或都是普通表。
+5. JOIN 两侧均支持子查询。
+6. 参与 JOIN 的表个数上限为 10 个。
+7. 不支持与 FILL 子句混合使用。
+
 ## GROUP BY
 
 如果在语句中同时指定了 GROUP BY 子句，那么 SELECT 列表只能包含如下表达式：
@@ -198,20 +213,6 @@ GROUP BY 子句中的表达式可以包含表或视图中的任何列，这些
 
 该子句对行进行分组，但不保证结果集的顺序。若要对分组进行排序，请使用 ORDER BY 子句
 
-## 查询对象
-
-FROM 关键字后面可以是若干个表（超级表）列表，也可以是子查询的结果。
-如果没有指定用户的当前数据库，可以在表名称之前使用数据库的名称来指定表所属的数据库。例如：`power.d1001` 方式来跨库使用表。
-
-TDengine 支持基于时间戳主键的 INNER JOIN，规则如下：
-
-1. 支持 FROM 表列表和显式的 JOIN 子句两种语法。
-2. 对于普通表和子表，ON 条件必须有且只有时间戳主键的等值条件。
-3. 对于超级表，ON 条件在时间戳主键的等值条件之外，还要求有可以一一对应的标签列等值条件，不支持 OR 条件。
-4. 参与 JOIN 计算的表只能是同一种类型，即只能都是超级表，或都是子表，或都是普通表。
-5. JOIN 两侧均支持子查询。
-6. 参与 JOIN 的表个数上限为 10 个。
-7. 不支持与 FILL 子句混合使用。
 
 ## PARTITON BY
 
@@ -286,23 +287,6 @@ SELECT TODAY();
 SELECT TIMEZONE();
 ```
 
-## TAOS SQL 中特殊关键词
-
-- `TBNAME`： 在超级表查询中可视为一个特殊的标签，代表查询涉及的子表名
-- `_c0`: 表示表（超级表）的第一列
-
-获取一个超级表所有的子表名及相关的标签信息：
-
-```sql
-SELECT TBNAME, location FROM meters;
-```
-
-统计超级表下辖子表数量：
-
-```sql
-SELECT COUNT(*) FROM (SELECT DISTINCT TBNAMEFROM meters);
-```
-
 ## 正则表达式过滤
 
 ### 语法
@@ -394,7 +378,7 @@ UNION ALL SELECT ...
 
 TDengine 支持 UNION ALL 操作符。也就是说，如果多个 SELECT 子句返回结果集的结构完全相同（列名、列类型、列数、顺序），那么可以通过 UNION ALL 把这些结果集合并到一起。目前只支持 UNION ALL 模式，也即在结果集的合并过程中是不去重的。在同一个 sql 语句中，UNION ALL 最多支持 100 个。
 
-### SQL 示例
+## SQL 示例
 
 对于下面的例子，表 tb1 用以下语句创建：
 
diff --git a/docs/zh/12-taos-sql/25-grant.md b/docs/zh/12-taos-sql/25-grant.md
new file mode 100644
index 0000000000..05836bb575
--- /dev/null
+++ b/docs/zh/12-taos-sql/25-grant.md
@@ -0,0 +1,77 @@
+---
+sidebar_label: 权限管理
+title: 权限管理
+---
+
+本节讲述如何在 TDengine 中进行权限管理的相关操作。
+
+## 创建用户
+
+```sql
+CREATE USER user_name PASS password;
+```
+
+创建用户。
+
+user_name最长为23字节。
+
+password最长为128字节，合法字符包括"a-zA-Z0-9!?$%^&*()_-+={[}]:;@~#|<,>.?/"，不可以出现单双引号、撇号、反斜杠和空格，且不可以为空。
+
+## 删除用户
+
+```sql
+DROP USER user_name;
+```
+
+## 授权
+
+```sql
+GRANT privileges ON priv_level TO user_name
+ 
+privileges : {
+    ALL
+  | priv_type [, priv_type] ...
+}
+ 
+priv_type : {
+    READ
+  | WRITE
+}
+ 
+priv_level : {
+    dbname.*
+  | *.*
+}
+```
+
+对用户授权。
+
+授权级别支持到DATABASE，权限有READ和WRITE两种。
+
+TDengine 有超级用户和普通用户两类用户。超级用户缺省创建为root，拥有所有权限。使用超级用户创建出来的用户为普通用户。在未授权的情况下，普通用户可以创建DATABASE，并拥有自己创建的DATABASE的所有权限，包括删除数据库、修改数据库、查询时序数据和写入时序数据。超级用户可以给普通用户授予其他DATABASE的读写权限，使其可以在此DATABASE上读写数据，但不能对其进行删除和修改数据库的操作。
+
+对于非DATABASE的对象，如USER、DNODE、UDF、QNODE等，普通用户只有读权限（一般为SHOW命令），不能创建和修改。
+
+## 撤销授权
+
+```sql
+REVOKE privileges ON priv_level FROM user_name
+ 
+privileges : {
+    ALL
+  | priv_type [, priv_type] ...
+}
+ 
+priv_type : {
+    READ
+  | WRITE
+}
+ 
+priv_level : {
+    dbname.*
+  | *.*
+}
+
+```
+
+收回对用户的授权。
\ No newline at end of file