other: merge main.

2023-02-02 14:00:36 +08:00 · 2023-02-02 14:00:36 +08:00 · ab8c977417
parent 3897a91a95 461dbe752d
commit ab8c977417
130 changed files with 3418 additions and 2765 deletions
--- a/2
+++ b/2
@ -430,7 +430,7 @@ pipeline {
                                        date
                                        rm -rf ${WKC}/debug
                                        cd ${WKC}/tests/parallel_test
-                                        time ./container_build.sh -w ${WKDIR} -t 10 -e
+                                        time ./container_build.sh -w ${WKDIR} -e
                                    '''
                                    def extra_param = ""
                                    def log_server_file = "/home/log_server.json"
--- a/cmake/cmake.version
+++ b/cmake/cmake.version
@ -2,7 +2,7 @@
 IF (DEFINED VERNUMBER)
  SET(TD_VER_NUMBER ${VERNUMBER})
 ELSE ()
-  SET(TD_VER_NUMBER "3.0.2.2")
+  SET(TD_VER_NUMBER "3.0.2.4")
 ENDIF ()

 IF (DEFINED VERCOMPATIBLE)
--- a/cmake/taosadapter_CMakeLists.txt.in
+++ b/cmake/taosadapter_CMakeLists.txt.in
@ -2,7 +2,7 @@
 # taosadapter
 ExternalProject_Add(taosadapter
        GIT_REPOSITORY https://github.com/taosdata/taosadapter.git
-        GIT_TAG 69eee2e
+        GIT_TAG 3e08996
        SOURCE_DIR "${TD_SOURCE_DIR}/tools/taosadapter"
        BINARY_DIR ""
        #BUILD_IN_SOURCE TRUE
--- a/cmake/taostools_CMakeLists.txt.in
+++ b/cmake/taostools_CMakeLists.txt.in
@ -2,7 +2,7 @@
 # taos-tools
 ExternalProject_Add(taos-tools
        GIT_REPOSITORY https://github.com/taosdata/taos-tools.git
-        GIT_TAG 5aa25e9
+        GIT_TAG a0234fe
        SOURCE_DIR "${TD_SOURCE_DIR}/tools/taos-tools"
        BINARY_DIR ""
        #BUILD_IN_SOURCE TRUE
--- a/docs/examples/go/go.mod
+++ b/docs/examples/go/go.mod
@ -1,6 +0,0 @@
-module goexample
-
-go 1.17
-
-require github.com/taosdata/driver-go/v3 3.0
-
--- a/docs/examples/python/conn_native_pandas.py
+++ b/docs/examples/python/conn_native_pandas.py
@ -1,8 +1,11 @@
 import pandas
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text

 engine = create_engine("taos://root:taosdata@localhost:6030/power")
-df = pandas.read_sql("SELECT * FROM meters", engine)
+conn = engine.connect()
+df = pandas.read_sql(text("SELECT * FROM power.meters"), conn)
+conn.close()
+

 # print index
 print(df.index)
--- a/docs/examples/python/conn_rest_pandas.py
+++ b/docs/examples/python/conn_rest_pandas.py
@ -1,8 +1,10 @@
 import pandas
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text

 engine = create_engine("taosrest://root:taosdata@localhost:6041")
-df: pandas.DataFrame = pandas.read_sql("SELECT * FROM power.meters", engine)
+conn = engine.connect()
+df: pandas.DataFrame = pandas.read_sql(text("SELECT * FROM power.meters"), conn)
+conn.close()

 # print index
 print(df.index)
--- a/docs/examples/python/connect_rest_examples.py
+++ b/docs/examples/python/connect_rest_examples.py
@ -1,7 +1,7 @@
 # ANCHOR: connect
 from taosrest import connect, TaosRestConnection, TaosRestCursor

-conn: TaosRestConnection = connect(url="http://localhost:6041",
+conn = connect(url="http://localhost:6041",
               user="root",
               password="taosdata",
               timeout=30)
@ -9,16 +9,17 @@ conn: TaosRestConnection = connect(url="http://localhost:6041",
 # ANCHOR_END: connect
 # ANCHOR: basic
 # create STable
-cursor: TaosRestCursor = conn.cursor()
+cursor = conn.cursor()
 cursor.execute("DROP DATABASE IF EXISTS power")
 cursor.execute("CREATE DATABASE power")
-cursor.execute("CREATE STABLE power.meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) TAGS (location BINARY(64), groupId INT)")
+cursor.execute(
+    "CREATE STABLE power.meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) TAGS (location BINARY(64), groupId INT)")

 # insert data
-cursor.execute("""INSERT INTO power.d1001 USING power.meters TAGS(California.SanFrancisco, 2) VALUES ('2018-10-03 14:38:05.000', 10.30000, 219, 0.31000) ('2018-10-03 14:38:15.000', 12.60000, 218, 0.33000) ('2018-10-03 14:38:16.800', 12.30000, 221, 0.31000)
-    power.d1002 USING power.meters TAGS(California.SanFrancisco, 3) VALUES ('2018-10-03 14:38:16.650', 10.30000, 218, 0.25000)
-    power.d1003 USING power.meters TAGS(California.LosAngeles, 2) VALUES ('2018-10-03 14:38:05.500', 11.80000, 221, 0.28000) ('2018-10-03 14:38:16.600', 13.40000, 223, 0.29000)
-    power.d1004 USING power.meters TAGS(California.LosAngeles, 3) VALUES ('2018-10-03 14:38:05.000', 10.80000, 223, 0.29000) ('2018-10-03 14:38:06.500', 11.50000, 221, 0.35000)""")
+cursor.execute("""INSERT INTO power.d1001 USING power.meters TAGS('California.SanFrancisco', 2) VALUES ('2018-10-03 14:38:05.000', 10.30000, 219, 0.31000) ('2018-10-03 14:38:15.000', 12.60000, 218, 0.33000) ('2018-10-03 14:38:16.800', 12.30000, 221, 0.31000)
+    power.d1002 USING power.meters TAGS('California.SanFrancisco', 3) VALUES ('2018-10-03 14:38:16.650', 10.30000, 218, 0.25000)
+    power.d1003 USING power.meters TAGS('California.LosAngeles', 2) VALUES ('2018-10-03 14:38:05.500', 11.80000, 221, 0.28000) ('2018-10-03 14:38:16.600', 13.40000, 223, 0.29000)
+    power.d1004 USING power.meters TAGS('California.LosAngeles', 3) VALUES ('2018-10-03 14:38:05.000', 10.80000, 223, 0.29000) ('2018-10-03 14:38:06.500', 11.50000, 221, 0.35000)""")
 print("inserted row count:", cursor.rowcount)

 # query data
@ -28,7 +29,7 @@ print("queried row count:", cursor.rowcount)
 # get column names from cursor
 column_names = [meta[0] for meta in cursor.description]
 # get rows
-data: list[tuple] = cursor.fetchall()
+data = cursor.fetchall()
 print(column_names)
 for row in data:
    print(row)
--- a/docs/examples/python/connection_usage_native_reference.py
+++ b/docs/examples/python/connection_usage_native_reference.py
@ -8,7 +8,7 @@ conn.execute("CREATE DATABASE test")
 # change database. same as execute "USE db"
 conn.select_db("test")
 conn.execute("CREATE STABLE weather(ts TIMESTAMP, temperature FLOAT) TAGS (location INT)")
-affected_row: int = conn.execute("INSERT INTO t1 USING weather TAGS(1) VALUES (now, 23.5) (now+1m, 23.5) (now+2m 24.4)")
+affected_row = conn.execute("INSERT INTO t1 USING weather TAGS(1) VALUES (now, 23.5) (now+1m, 23.5) (now+2m, 24.4)")
 print("affected_row", affected_row)
 # output:
 # affected_row 3
@ -16,10 +16,10 @@ print("affected_row", affected_row)

 # ANCHOR: query
 # Execute a sql and get its result set. It's useful for SELECT statement
-result: taos.TaosResult = conn.query("SELECT * from weather")
+result = conn.query("SELECT * from weather")

 # Get fields from result
-fields: taos.field.TaosFields = result.fields
+fields = result.fields
 for field in fields:
    print(field)  # {name: ts, type: 9, bytes: 8}

--- a/docs/examples/python/fast_write_example.py
+++ b/docs/examples/python/fast_write_example.py
@ -1,15 +1,14 @@
 # install dependencies:
 # recommend python >= 3.8
-# pip3 install faster-fifo
 #

 import logging
 import math
+import multiprocessing
 import sys
 import time
 import os
-from multiprocessing import Process
-from faster_fifo import Queue
+from multiprocessing import Process, Queue
 from mockdatasource import MockDataSource
 from queue import Empty
 from typing import List
@ -22,8 +21,7 @@ TABLE_COUNT = 1000
 QUEUE_SIZE = 1000000
 MAX_BATCH_SIZE = 3000

-read_processes = []
-write_processes = []
+_DONE_MESSAGE = '__DONE__'


 def get_connection():
@ -44,41 +42,64 @@ def get_connection():

 # ANCHOR: read

-def run_read_task(task_id: int, task_queues: List[Queue]):
+def run_read_task(task_id: int, task_queues: List[Queue], infinity):
    table_count_per_task = TABLE_COUNT // READ_TASK_COUNT
-    data_source = MockDataSource(f"tb{task_id}", table_count_per_task)
+    data_source = MockDataSource(f"tb{task_id}", table_count_per_task, infinity)
    try:
        for batch in data_source:
+            if isinstance(batch, tuple):
+                batch = [batch]
            for table_id, rows in batch:
                # hash data to different queue
                i = table_id % len(task_queues)
                # block putting forever when the queue is full
-                task_queues[i].put_many(rows, block=True, timeout=-1)
+                for row in rows:
+                    task_queues[i].put(row)
+        if not infinity:
+            for queue in task_queues:
+                queue.put(_DONE_MESSAGE)
    except KeyboardInterrupt:
        pass
+    finally:
+        logging.info('read task over')


 # ANCHOR_END: read

+
 # ANCHOR: write
-def run_write_task(task_id: int, queue: Queue):
+def run_write_task(task_id: int, queue: Queue, done_queue: Queue):
    from sql_writer import SQLWriter
    log = logging.getLogger(f"WriteTask-{task_id}")
    writer = SQLWriter(get_connection)
    lines = None
    try:
        while True:
+            over = False
+            lines = []
+            for _ in range(MAX_BATCH_SIZE):
                try:
-                # get as many as possible
-                lines = queue.get_many(block=False, max_messages_to_get=MAX_BATCH_SIZE)
-                writer.process_lines(lines)
+                    line = queue.get_nowait()
+                    if line == _DONE_MESSAGE:
+                        over = True
+                        break
+                    if line:
+                        lines.append(line)
                except Empty:
-                time.sleep(0.01)
+                    time.sleep(0.1)
+            if len(lines) > 0:
+                writer.process_lines(lines)
+            if over:
+                done_queue.put(_DONE_MESSAGE)
+                break
    except KeyboardInterrupt:
        pass
    except BaseException as e:
        log.debug(f"lines={lines}")
        raise e
+    finally:
+        writer.close()
+        log.debug('write task over')


 # ANCHOR_END: write
@ -103,13 +124,11 @@ def set_global_config():


 # ANCHOR: monitor
-def run_monitor_process():
+def run_monitor_process(done_queue: Queue):
    log = logging.getLogger("DataBaseMonitor")
+    conn = None
+    try:
        conn = get_connection()
-    conn.execute("DROP DATABASE IF EXISTS test")
-    conn.execute("CREATE DATABASE test")
-    conn.execute("CREATE STABLE test.meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) "
-                 "TAGS (location BINARY(64), groupId INT)")

        def get_count():
            res = conn.query("SELECT count(*) FROM test.meters")
@ -118,32 +137,51 @@ def run_monitor_process():

        last_count = 0
        while True:
+            try:
+                done = done_queue.get_nowait()
+                if done == _DONE_MESSAGE:
+                    break
+            except Empty:
+                pass
            time.sleep(10)
            count = get_count()
            log.info(f"count={count} speed={(count - last_count) / 10}")
            last_count = count
+    finally:
+        conn.close()


 # ANCHOR_END: monitor
 # ANCHOR: main
-def main():
+def main(infinity):
    set_global_config()
    logging.info(f"READ_TASK_COUNT={READ_TASK_COUNT}, WRITE_TASK_COUNT={WRITE_TASK_COUNT}, "
                 f"TABLE_COUNT={TABLE_COUNT}, QUEUE_SIZE={QUEUE_SIZE}, MAX_BATCH_SIZE={MAX_BATCH_SIZE}")

-    monitor_process = Process(target=run_monitor_process)
+    conn = get_connection()
+    conn.execute("DROP DATABASE IF EXISTS test")
+    conn.execute("CREATE DATABASE IF NOT EXISTS test")
+    conn.execute("CREATE STABLE IF NOT EXISTS test.meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) "
+                 "TAGS (location BINARY(64), groupId INT)")
+    conn.close()
+
+    done_queue = Queue()
+    monitor_process = Process(target=run_monitor_process, args=(done_queue,))
    monitor_process.start()
-    time.sleep(3)  # waiting for database ready.
+    logging.debug(f"monitor task started with pid {monitor_process.pid}")

    task_queues: List[Queue] = []
+    write_processes = []
+    read_processes = []
+
    # create task queues
    for i in range(WRITE_TASK_COUNT):
-        queue = Queue(max_size_bytes=QUEUE_SIZE)
+        queue = Queue()
        task_queues.append(queue)

    # create write processes
    for i in range(WRITE_TASK_COUNT):
-        p = Process(target=run_write_task, args=(i, task_queues[i]))
+        p = Process(target=run_write_task, args=(i, task_queues[i], done_queue))
        p.start()
        logging.debug(f"WriteTask-{i} started with pid {p.pid}")
        write_processes.append(p)
@ -151,13 +189,19 @@ def main():
    # create read processes
    for i in range(READ_TASK_COUNT):
        queues = assign_queues(i, task_queues)
-        p = Process(target=run_read_task, args=(i, queues))
+        p = Process(target=run_read_task, args=(i, queues, infinity))
        p.start()
        logging.debug(f"ReadTask-{i} started with pid {p.pid}")
        read_processes.append(p)

    try:
        monitor_process.join()
+        for p in read_processes:
+            p.join()
+        for p in write_processes:
+            p.join()
+        time.sleep(1)
+        return
    except KeyboardInterrupt:
        monitor_process.terminate()
        [p.terminate() for p in read_processes]
@ -176,5 +220,6 @@ def assign_queues(read_task_id, task_queues):


 if __name__ == '__main__':
-    main()
+    multiprocessing.set_start_method('spawn')
+    main(False)
 # ANCHOR_END: main
--- a/docs/examples/python/kafka_example.py
+++ b/docs/examples/python/kafka_example.py
@ -26,7 +26,8 @@ class Consumer(object):
        'bath_consume': True,
        'batch_size': 1000,
        'async_model': True,
-        'workers': 10
+        'workers': 10,
+        'testing': False
    }

    LOCATIONS = ['California.SanFrancisco', 'California.LosAngles', 'California.SanDiego', 'California.SanJose',
@ -46,6 +47,7 @@ class Consumer(object):
    def __init__(self, **configs):
        self.config: dict = self.DEFAULT_CONFIGS
        self.config.update(configs)
+        if not self.config.get('testing'):
            self.consumer = KafkaConsumer(
                self.config.get('kafka_topic'),  # topic
                bootstrap_servers=self.config.get('kafka_brokers'),
@ -60,7 +62,7 @@ class Consumer(object):
        )
        if self.config.get('async_model'):
            self.pool = ThreadPoolExecutor(max_workers=self.config.get('workers'))
-            self.tasks: list[Future] = []
+            self.tasks = []
        # tags and table mapping # key: {location}_{groupId} value:
        self.tag_table_mapping = {}
        i = 0
@ -115,14 +117,14 @@ class Consumer(object):
        if self.taos is not None:
            self.taos.close()

-    def _run(self, f: Callable[[ConsumerRecord], bool]):
+    def _run(self, f):
        for message in self.consumer:
            if self.config.get('async_model'):
                self.pool.submit(f(message))
            else:
                f(message)

-    def _run_batch(self, f: Callable[[list[list[ConsumerRecord]]], None]):
+    def _run_batch(self, f):
        while True:
            messages = self.consumer.poll(timeout_ms=500, max_records=self.config.get('batch_size'))
            if messages:
@ -140,7 +142,7 @@ class Consumer(object):
        logging.info('## insert sql %s', sql)
        return self.taos.execute(sql=sql) == 1

-    def _to_taos_batch(self, messages: list[list[ConsumerRecord]]):
+    def _to_taos_batch(self, messages):
        sql = self._build_sql_batch(messages=messages)
        if len(sql) == 0:  # decode error, skip
            return
@ -162,7 +164,7 @@ class Consumer(object):
        table_name = self._get_table_name(location=location, group_id=group_id)
        return self.INSERT_PART_SQL.format(table_name, ts, current, voltage, phase)

-    def _build_sql_batch(self, messages: list[list[ConsumerRecord]]) -> str:
+    def _build_sql_batch(self, messages) -> str:
        sql_list = []
        for partition_messages in messages:
            for message in partition_messages:
@ -186,7 +188,55 @@ def _get_location_and_group(key: str) -> (str, int):
    return fields[0], fields[1]


+def test_to_taos(consumer: Consumer):
+    msg = {
+        'location': 'California.SanFrancisco',
+        'groupId': 1,
+        'ts': '2022-12-06 15:13:38.643',
+        'current': 3.41,
+        'voltage': 105,
+        'phase': 0.02027,
+    }
+    record = ConsumerRecord(checksum=None, headers=None, offset=1, key=None, value=json.dumps(msg), partition=1,
+                            topic='test', serialized_key_size=None, serialized_header_size=None,
+                            serialized_value_size=None, timestamp=time.time(), timestamp_type=None)
+    assert consumer._to_taos(message=record)
+
+
+def test_to_taos_batch(consumer: Consumer):
+    records = [
+        [
+            ConsumerRecord(checksum=None, headers=None, offset=1, key=None,
+                           value=json.dumps({'location': 'California.SanFrancisco',
+                                             'groupId': 1,
+                                             'ts': '2022-12-06 15:13:38.643',
+                                             'current': 3.41,
+                                             'voltage': 105,
+                                             'phase': 0.02027, }),
+                           partition=1, topic='test', serialized_key_size=None, serialized_header_size=None,
+                           serialized_value_size=None, timestamp=time.time(), timestamp_type=None),
+            ConsumerRecord(checksum=None, headers=None, offset=1, key=None,
+                           value=json.dumps({'location': 'California.LosAngles',
+                                             'groupId': 2,
+                                             'ts': '2022-12-06 15:13:39.643',
+                                             'current': 3.41,
+                                             'voltage': 102,
+                                             'phase': 0.02027, }),
+                           partition=1, topic='test', serialized_key_size=None, serialized_header_size=None,
+                           serialized_value_size=None, timestamp=time.time(), timestamp_type=None),
+        ]
+    ]
+
+    consumer._to_taos_batch(messages=records)
+
+
 if __name__ == '__main__':
-    consumer = Consumer(async_model=True)
+    consumer = Consumer(async_model=True, testing=True)
+    # init env
    consumer.init_env()
-    consumer.consume()
+    # consumer.consume()
+    # test build sql
+    # test build sql batch
+    test_to_taos(consumer)
+    test_to_taos_batch(consumer)
+    
--- a/docs/examples/python/mockdatasource.py
+++ b/docs/examples/python/mockdatasource.py
@ -10,13 +10,14 @@ class MockDataSource:
        "9.4,118,0.141,California.SanFrancisco,4"
    ]

-    def __init__(self, tb_name_prefix, table_count):
+    def __init__(self, tb_name_prefix, table_count, infinity=True):
        self.table_name_prefix = tb_name_prefix + "_"
        self.table_count = table_count
        self.max_rows = 10000000
        self.current_ts = round(time.time() * 1000) - self.max_rows * 100
        # [(tableId, tableName, values),]
        self.data = self._init_data()
+        self.infinity = infinity

    def _init_data(self):
        lines = self.samples * (self.table_count // 5 + 1)
@ -28,6 +29,9 @@ class MockDataSource:

    def __iter__(self):
        self.row = 0
+        if not self.infinity:
+            return iter(self._iter_data())
+        else:
            return self

    def __next__(self):
@ -35,7 +39,9 @@ class MockDataSource:
        next 1000 rows for each table.
        return: {tableId:[row,...]}
        """
-        # generate 1000 timestamps
+        return self._iter_data()
+
+    def _iter_data(self):
        ts = []
        for _ in range(1000):
            self.current_ts += 100
@ -47,3 +53,10 @@ class MockDataSource:
            rows = [table_name + ',' + t + ',' + values for t in ts]
            result.append((table_id, rows))
        return result
+
+
+if __name__ == '__main__':
+    datasource = MockDataSource('t', 10, False)
+    for data in datasource:
+        print(data)
+        
--- a/docs/examples/python/sql_writer.py
+++ b/docs/examples/python/sql_writer.py
@ -10,6 +10,7 @@ class SQLWriter:
        self._tb_tags = {}
        self._conn = get_connection_func()
        self._max_sql_length = self.get_max_sql_length()
+        self._conn.execute("create database if not exists test")
        self._conn.execute("USE test")

    def get_max_sql_length(self):
@ -20,7 +21,7 @@ class SQLWriter:
                return int(r[1])
        return 1024 * 1024

-    def process_lines(self, lines: str):
+    def process_lines(self, lines: [str]):
        """
        :param lines: [[tbName,ts,current,voltage,phase,location,groupId]]
        """
@ -60,6 +61,7 @@ class SQLWriter:
            buf.append(q)
            sql_len += len(q)
        sql += " ".join(buf)
+        self.create_tables()
        self.execute_sql(sql)
        self._tb_values.clear()

@ -88,3 +90,23 @@ class SQLWriter:
        except BaseException as e:
            self.log.error("Execute SQL: %s", sql)
            raise e
+
+    def close(self):
+        if self._conn:
+            self._conn.close()
+
+
+if __name__ == '__main__':
+    def get_connection_func():
+        conn = taos.connect()
+        return conn
+
+
+    writer = SQLWriter(get_connection_func=get_connection_func)
+    writer.execute_sql(
+        "create stable if not exists meters (ts timestamp, current float, voltage int, phase float) "
+        "tags (location binary(64), groupId int)")
+    writer.execute_sql(
+        "INSERT INTO d21001 USING meters TAGS ('California.SanFrancisco', 2) "
+        "VALUES ('2021-07-13 14:06:32.272', 10.2, 219, 0.32)")
+    
--- a/docs/examples/python/tmq_example.py
+++ b/docs/examples/python/tmq_example.py
@ -1,58 +1,55 @@
+from taos.tmq import Consumer
 import taos
-from taos.tmq import *

+
+def init_tmq_env(db, topic):
    conn = taos.connect()
-
-print("init")
-conn.execute("drop topic if exists topic_ctb_column")
-conn.execute("drop database if exists py_tmq")
-conn.execute("create database if not exists py_tmq vgroups 2")
-conn.select_db("py_tmq")
+    conn.execute("drop topic if exists {}".format(topic))
+    conn.execute("drop database if exists {}".format(db))
+    conn.execute("create database if not exists {}".format(db))
+    conn.select_db(db)
    conn.execute(
-    "create stable if not exists stb1 (ts timestamp, c1 int, c2 float, c3 binary(10)) tags(t1 int)"
+        "create stable if not exists stb1 (ts timestamp, c1 int, c2 float, c3 varchar(16)) tags(t1 int, t3 varchar(16))")
+    conn.execute("create table if not exists tb1 using stb1 tags(1, 't1')")
+    conn.execute("create table if not exists tb2 using stb1 tags(2, 't2')")
+    conn.execute("create table if not exists tb3 using stb1 tags(3, 't3')")
+    conn.execute("create topic if not exists {} as select ts, c1, c2, c3 from stb1".format(topic))
+    conn.execute("insert into tb1 values (now, 1, 1.0, 'tmq test')")
+    conn.execute("insert into tb2 values (now, 2, 2.0, 'tmq test')")
+    conn.execute("insert into tb3 values (now, 3, 3.0, 'tmq test')")
+
+
+def cleanup(db, topic):
+    conn = taos.connect()
+    conn.execute("drop topic if exists {}".format(topic))
+    conn.execute("drop database if exists {}".format(db))
+
+
+if __name__ == '__main__':
+    init_tmq_env("tmq_test", "tmq_test_topic")  # init env
+    consumer = Consumer(
+        {
+            "group.id": "tg2",
+            "td.connect.user": "root",
+            "td.connect.pass": "taosdata",
+            "enable.auto.commit": "true",
+        }
    )
-conn.execute("create table if not exists tb1 using stb1 tags(1)")
-conn.execute("create table if not exists tb2 using stb1 tags(2)")
-conn.execute("create table if not exists tb3 using stb1 tags(3)")
+    consumer.subscribe(["tmq_test_topic"])

-print("create topic")
-conn.execute(
-    "create topic if not exists topic_ctb_column as select ts, c1, c2, c3 from stb1"
-)
+    try:
+        while True:
+            res = consumer.poll(1)
+            if not res:
+                break
+            err = res.error()
+            if err is not None:
+                raise err
+            val = res.value()

-print("build consumer")
-conf = TaosTmqConf()
-conf.set("group.id", "tg2")
-conf.set("td.connect.user", "root")
-conf.set("td.connect.pass", "taosdata")
-conf.set("enable.auto.commit", "true")
-
-
-def tmq_commit_cb_print(tmq, resp, offset, param=None):
-    print(f"commit: {resp}, tmq: {tmq}, offset: {offset}, param: {param}")
-
-
-conf.set_auto_commit_cb(tmq_commit_cb_print, None)
-tmq = conf.new_consumer()
-
-print("build topic list")
-
-topic_list = TaosTmqList()
-topic_list.append("topic_ctb_column")
-
-print("basic consume loop")
-tmq.subscribe(topic_list)
-
-sub_list = tmq.subscription()
-
-print("subscribed topics: ", sub_list)
-
-while 1:
-    res = tmq.poll(1000)
-    if res:
-        topic = res.get_topic_name()
-        vg = res.get_vgroup_id()
-        db = res.get_db_name()
-        print(f"topic: {topic}\nvgroup id: {vg}\ndb: {db}")
-        for row in res:
-            print(row)
+            for block in val:
+                print(block.fetchall())
+    finally:
+        consumer.unsubscribe()
+        consumer.close()
+        cleanup("tmq_test", "tmq_test_topic")
--- a/include/common/tmsg.h
+++ b/include/common/tmsg.h
@ -907,6 +907,7 @@ typedef struct {
  int32_t numOfRetensions;
  SArray* pRetensions;
  int8_t  schemaless;
+  int16_t sstTrigger;
 } SDbCfgRsp;

 int32_t tSerializeSDbCfgRsp(void* buf, int32_t bufLen, const SDbCfgRsp* pRsp);
--- a/include/common/tmsgcb.h
+++ b/include/common/tmsgcb.h
@ -39,7 +39,7 @@ typedef enum {
  QUEUE_MAX,
 } EQueueType;

-typedef int32_t (*UpdateDnodeInfoFp)(void* pData, int32_t* dnodeId, int64_t* clusterId, char* fqdn, uint16_t* port);
+typedef void (*UpdateDnodeInfoFp)(void* pData, int32_t* dnodeId, int64_t* clusterId, char* fqdn, uint16_t* port);
 typedef int32_t (*PutToQueueFp)(void* pMgmt, EQueueType qtype, SRpcMsg* pMsg);
 typedef int32_t (*GetQueueSizeFp)(void* pMgmt, int32_t vgId, EQueueType qtype);
 typedef int32_t (*SendReqFp)(const SEpSet* pEpSet, SRpcMsg* pMsg);
@ -70,7 +70,8 @@ void    tmsgSendRsp(SRpcMsg* pMsg);
 void    tmsgRegisterBrokenLinkArg(SRpcMsg* pMsg);
 void    tmsgReleaseHandle(SRpcHandleInfo* pHandle, int8_t type);
 void    tmsgReportStartup(const char* name, const char* desc);
-int32_t tmsgUpdateDnodeInfo(int32_t* dnodeId, int64_t* clusterId, char* fqdn, uint16_t* port);
+void    tmsgUpdateDnodeInfo(int32_t* dnodeId, int64_t* clusterId, char* fqdn, uint16_t* port);
+void    tmsgUpdateDnodeEpSet(SEpSet* epset);

 #ifdef __cplusplus
 }
--- a/include/libs/stream/tstream.h
+++ b/include/libs/stream/tstream.h
@ -354,7 +354,8 @@ int32_t      tDecodeSStreamTask(SDecoder* pDecoder, SStreamTask* pTask);
 void         tFreeSStreamTask(SStreamTask* pTask);

 static FORCE_INLINE int32_t streamTaskInput(SStreamTask* pTask, SStreamQueueItem* pItem) {
-  if (pItem->type == STREAM_INPUT__DATA_SUBMIT) {
+  int8_t type = pItem->type;
+  if (type == STREAM_INPUT__DATA_SUBMIT) {
    SStreamDataSubmit* pSubmitClone = streamSubmitRefClone((SStreamDataSubmit*)pItem);
    if (pSubmitClone == NULL) {
      qDebug("task %d %p submit enqueue failed since out of memory", pTask->taskId, pTask);
@ -365,19 +366,19 @@ static FORCE_INLINE int32_t streamTaskInput(SStreamTask* pTask, SStreamQueueItem
    qDebug("task %d %p submit enqueue %p %p %p", pTask->taskId, pTask, pItem, pSubmitClone, pSubmitClone->data);
    taosWriteQitem(pTask->inputQueue->queue, pSubmitClone);
    // qStreamInput(pTask->exec.executor, pSubmitClone);
-  } else if (pItem->type == STREAM_INPUT__DATA_BLOCK || pItem->type == STREAM_INPUT__DATA_RETRIEVE ||
-             pItem->type == STREAM_INPUT__REF_DATA_BLOCK) {
+  } else if (type == STREAM_INPUT__DATA_BLOCK || type == STREAM_INPUT__DATA_RETRIEVE ||
+             type == STREAM_INPUT__REF_DATA_BLOCK) {
    taosWriteQitem(pTask->inputQueue->queue, pItem);
    // qStreamInput(pTask->exec.executor, pItem);
-  } else if (pItem->type == STREAM_INPUT__CHECKPOINT) {
+  } else if (type == STREAM_INPUT__CHECKPOINT) {
    taosWriteQitem(pTask->inputQueue->queue, pItem);
    // qStreamInput(pTask->exec.executor, pItem);
-  } else if (pItem->type == STREAM_INPUT__GET_RES) {
+  } else if (type == STREAM_INPUT__GET_RES) {
    taosWriteQitem(pTask->inputQueue->queue, pItem);
    // qStreamInput(pTask->exec.executor, pItem);
  }

-  if (pItem->type != STREAM_INPUT__GET_RES && pItem->type != STREAM_INPUT__CHECKPOINT && pTask->triggerParam != 0) {
+  if (type != STREAM_INPUT__GET_RES && type != STREAM_INPUT__CHECKPOINT && pTask->triggerParam != 0) {
    atomic_val_compare_exchange_8(&pTask->triggerStatus, TASK_TRIGGER_STATUS__INACTIVE, TASK_TRIGGER_STATUS__ACTIVE);
  }

--- a/include/libs/sync/sync.h
+++ b/include/libs/sync/sync.h
@ -193,7 +193,7 @@ typedef struct SSyncLogStore {
  SyncIndex (*syncLogLastIndex)(struct SSyncLogStore* pLogStore);
  SyncTerm (*syncLogLastTerm)(struct SSyncLogStore* pLogStore);

-  int32_t (*syncLogAppendEntry)(struct SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry);
+  int32_t (*syncLogAppendEntry)(struct SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, bool forcSync);
  int32_t (*syncLogGetEntry)(struct SSyncLogStore* pLogStore, SyncIndex index, SSyncRaftEntry** ppEntry);
  int32_t (*syncLogTruncate)(struct SSyncLogStore* pLogStore, SyncIndex fromIndex);

--- a/include/libs/wal/wal.h
+++ b/include/libs/wal/wal.h
@ -201,6 +201,7 @@ int32_t walFetchHead(SWalReader *pRead, int64_t ver, SWalCkHead *pHead);
 int32_t walFetchBody(SWalReader *pRead, SWalCkHead **ppHead);
 int32_t walSkipFetchBody(SWalReader *pRead, const SWalCkHead *pHead);

+SWalRef *walRefFirstVer(SWal *, SWalRef *);
 SWalRef *walRefCommittedVer(SWal *);

 SWalRef *walOpenRef(SWal *);
--- a/include/util/tdef.h
+++ b/include/util/tdef.h
@ -499,7 +499,7 @@ enum {
 #define DEFAULT_PAGESIZE 4096

 #define VNODE_TIMEOUT_SEC 60
-#define MNODE_TIMEOUT_SEC 10
+#define MNODE_TIMEOUT_SEC 60

 #ifdef __cplusplus
 }
--- a/packaging/tools/install.sh
+++ b/packaging/tools/install.sh
@ -210,8 +210,8 @@ function install_bin() {
  [ -x ${install_main_dir}/bin/${serverName} ] && ${csudo}ln -s ${install_main_dir}/bin/${serverName} ${bin_link_dir}/${serverName} || :
  [ -x ${install_main_dir}/bin/${udfdName} ] && ${csudo}ln -s ${install_main_dir}/bin/${udfdName} ${bin_link_dir}/${udfdName} || :
  [ -x ${install_main_dir}/bin/${adapterName} ] && ${csudo}ln -s ${install_main_dir}/bin/${adapterName} ${bin_link_dir}/${adapterName} || :
-  [ -x ${install_main_dir}/bin/${benchmarkName} ] && ${csudo}ln -s ${install_main_dir}/bin/${benchmarkName} ${bin_link_dir}/${demoName} || :
-  [ -x ${install_main_dir}/bin/${benchmarkName} ] && ${csudo}ln -s ${install_main_dir}/bin/${benchmarkName} ${bin_link_dir}/${benchmarkName} || :
+  [ -x ${install_main_dir}/bin/${benchmarkName} ] && ${csudo}ln -sf ${install_main_dir}/bin/${benchmarkName} ${bin_link_dir}/${demoName} || :
+  [ -x ${install_main_dir}/bin/${benchmarkName} ] && ${csudo}ln -sf ${install_main_dir}/bin/${benchmarkName} ${bin_link_dir}/${benchmarkName} || :
  [ -x ${install_main_dir}/bin/${dumpName} ] && ${csudo}ln -s ${install_main_dir}/bin/${dumpName} ${bin_link_dir}/${dumpName} || :
  [ -x ${install_main_dir}/bin/${xname} ] && ${csudo}ln -s ${install_main_dir}/bin/${xname} ${bin_link_dir}/${xname} || :
  [ -x ${install_main_dir}/bin/TDinsight.sh ] && ${csudo}ln -s ${install_main_dir}/bin/TDinsight.sh ${bin_link_dir}/TDinsight.sh || :
@ -743,6 +743,34 @@ function is_version_compatible() {
  esac
 }

+deb_erase() {  # Prompt for yes/no and, on yes, run dpkg --remove tdengine (through ${csudo}).
+  confirm=""
+  while [ "" == "${confirm}" ]; do  # re-prompt only while the answer read so far is empty
+    echo -e -n "${RED}Existing TDengine deb is detected, do you want to remove it? [yes|no] ${NC}:"
+    read confirm
+    if [ "yes" == "$confirm" ]; then
+      ${csudo}dpkg --remove tdengine ||:  # trailing ||: discards a non-zero dpkg exit status
+      break
+    elif [ "no" == "$confirm" ]; then
+      break
+    fi  # any other non-empty answer fails the loop test next and exits without removing anything
+  done
+}
+
+rpm_erase() {  # Prompt for yes/no and, on yes, run rpm -e tdengine (through ${csudo}).
+  confirm=""
+  while [ "" == "${confirm}" ]; do  # re-prompt only while the answer read so far is empty
+    echo -e -n "${RED}Existing TDengine rpm is detected, do you want to remove it? [yes|no] ${NC}:"
+    read confirm
+    if [ "yes" == "$confirm" ]; then
+      ${csudo}rpm -e tdengine ||:  # trailing ||: discards a non-zero rpm exit status
+      break
+    elif [ "no" == "$confirm" ]; then
+      break
+    fi  # any other non-empty answer fails the loop test next and exits without removing anything
+  done
+}
+
 function updateProduct() {
  # Check if version compatible
  if ! is_version_compatible; then
@ -755,6 +783,13 @@ function updateProduct() {
    echo "File ${tarName} does not exist"
    exit 1
  fi
+
+  if echo $osinfo | grep -qwi "centos"; then
+    rpm -q tdengine 2>&1 > /dev/null && rpm_erase tdengine ||:
+  elif echo $osinfo | grep -qwi "ubuntu"; then
+    dpkg -l tdengine 2>&1 | grep ii > /dev/null && deb_erase tdengine ||:
+  fi
+
  tar -zxf ${tarName}
  install_jemalloc

--- a/source/client/src/clientEnv.c
+++ b/source/client/src/clientEnv.c
@ -357,6 +357,7 @@ void doDestroyRequest(void *p) {
  taosMemoryFreeClear(pRequest->pDb);

  doFreeReqResultInfo(&pRequest->body.resInfo);
+  tsem_destroy(&pRequest->body.rspSem);

  taosArrayDestroy(pRequest->tableList);
  taosArrayDestroy(pRequest->dbList);
@ -371,6 +372,9 @@ void doDestroyRequest(void *p) {
  }

  if (pRequest->syncQuery) {
+      if (pRequest->body.param){
+        tsem_destroy(&((SSyncQueryParam*)pRequest->body.param)->sem);
+      }
    taosMemoryFree(pRequest->body.param);
  }

@ -388,45 +392,6 @@ void destroyRequest(SRequestObj *pRequest) {
  removeRequest(pRequest->self);
 }

-void taosClientCrash(int signum, void *sigInfo, void *context) {
-  taosIgnSignal(SIGTERM);
-  taosIgnSignal(SIGHUP);
-  taosIgnSignal(SIGINT);
-  taosIgnSignal(SIGBREAK);
-
-#if !defined(WINDOWS)
-  taosIgnSignal(SIGBUS);
-#endif  
-  taosIgnSignal(SIGABRT);
-  taosIgnSignal(SIGFPE);
-  taosIgnSignal(SIGSEGV);
-
-  char *pMsg = NULL;
-  const char *flags = "UTL FATAL ";
-  ELogLevel   level = DEBUG_FATAL;
-  int32_t     dflag = 255;
-  int64_t     msgLen= -1;
-
-  if (tsEnableCrashReport) {
-    if (taosGenCrashJsonMsg(signum, &pMsg, lastClusterId, appInfo.startTime)) {
-      taosPrintLog(flags, level, dflag, "failed to generate crash json msg");
-      goto _return;
-    } else {
-      msgLen = strlen(pMsg);  
-    }
-  }
-  
-_return:
-
-  taosLogCrashInfo("taos", pMsg, msgLen, signum, sigInfo);
-
-#ifdef _TD_DARWIN_64
-  exit(signum);
-#elif defined(WINDOWS)
-  exit(signum);
-#endif
-}
-
 void crashReportThreadFuncUnexpectedStopped(void) { atomic_store_32(&clientStop, -1); }

 static void *tscCrashReportThreadFp(void *param) {
@ -523,14 +488,25 @@ void tscStopCrashReport() {
  }
 }

-static void tscSetSignalHandle() {
-#if !defined(WINDOWS)
-  taosSetSignal(SIGBUS, taosClientCrash);
-#endif
-  taosSetSignal(SIGABRT, taosClientCrash);
-  taosSetSignal(SIGFPE, taosClientCrash);
-  taosSetSignal(SIGSEGV, taosClientCrash);
+
+void tscWriteCrashInfo(int signum, void *sigInfo, void *context) {  // Optionally builds a crash JSON message, then calls taosLogCrashInfo; context is not used here.
+  char *pMsg = NULL;
+  const char *flags = "UTL FATAL ";
+  ELogLevel   level = DEBUG_FATAL;
+  int32_t     dflag = 255;
+  int64_t     msgLen= -1;  // stays -1 unless a message is generated below
+
+  if (tsEnableCrashReport) {
+    if (taosGenCrashJsonMsg(signum, &pMsg, lastClusterId, appInfo.startTime)) {  // non-zero result is treated as failure
+      taosPrintLog(flags, level, dflag, "failed to generate crash json msg");
+    } else {
+      msgLen = strlen(pMsg);  // length of the generated message
    }
+  }
+
+  taosLogCrashInfo("taos", pMsg, msgLen, signum, sigInfo);  // reached on every path, including pMsg == NULL with msgLen == -1
+}
+

 void taos_init_imp(void) {
  // In the APIs of other program language, taos_cleanup is not available yet.
@ -555,8 +531,6 @@ void taos_init_imp(void) {
    return;
  }

-  tscSetSignalHandle();
-
  initQueryModuleMsgHandle();

  if (taosConvInit() != 0) {
--- a/source/client/src/clientImpl.c
+++ b/source/client/src/clientImpl.c
@ -159,6 +159,12 @@ STscObj* taos_connect_internal(const char* ip, const char* user, const char* pas
  return taosConnectImpl(user, &secretEncrypt[0], localDb, NULL, NULL, *pInst, connType);
 }

+void freeQueryParam(SSyncQueryParam* param) {  // Destroys param->sem and then frees param itself.
+  if (param == NULL) return;  // a NULL param is ignored
+  tsem_destroy(&param->sem);
+  taosMemoryFree(param);
+}
+
 int32_t buildRequest(uint64_t connId, const char* sql, int sqlLen, void* param, bool validateSql,
                     SRequestObj** pRequest, int64_t reqid) {
  *pRequest = createRequest(connId, TSDB_SQL_SELECT, reqid);
@ -180,17 +186,18 @@ int32_t buildRequest(uint64_t connId, const char* sql, int sqlLen, void* param,
  (*pRequest)->sqlLen = sqlLen;
  (*pRequest)->validateOnly = validateSql;

+  SSyncQueryParam* newpParam;
  if (param == NULL) {
-    SSyncQueryParam* pParam = taosMemoryCalloc(1, sizeof(SSyncQueryParam));
-    if (pParam == NULL) {
+    newpParam = taosMemoryCalloc(1, sizeof(SSyncQueryParam));
+    if (newpParam == NULL) {
      destroyRequest(*pRequest);
      *pRequest = NULL;
      return TSDB_CODE_OUT_OF_MEMORY;
    }

-    tsem_init(&pParam->sem, 0, 0);
-    pParam->pRequest = (*pRequest);
-    param = pParam;
+    tsem_init(&newpParam->sem, 0, 0);
+    newpParam->pRequest = (*pRequest);
+    param = newpParam;
  }

  (*pRequest)->body.param = param;
@ -201,8 +208,7 @@ int32_t buildRequest(uint64_t connId, const char* sql, int sqlLen, void* param,
  if (err) {
    tscError("%" PRId64 " failed to add to request container, reqId:0x%" PRIx64 ", conn:%" PRId64 ", %s",
             (*pRequest)->self, (*pRequest)->requestId, pTscObj->id, sql);
-
-    taosMemoryFree(param);
+    freeQueryParam(newpParam);
    destroyRequest(*pRequest);
    *pRequest = NULL;
    return TSDB_CODE_OUT_OF_MEMORY;
@ -214,6 +220,7 @@ int32_t buildRequest(uint64_t connId, const char* sql, int sqlLen, void* param,
        nodesCreateAllocator((*pRequest)->requestId, tsQueryNodeChunkSize, &((*pRequest)->allocatorRefId))) {
      tscError("%" PRId64 " failed to create node allocator, reqId:0x%" PRIx64 ", conn:%" PRId64 ", %s",
               (*pRequest)->self, (*pRequest)->requestId, pTscObj->id, sql);
+      freeQueryParam(newpParam);
      destroyRequest(*pRequest);
      *pRequest = NULL;
      return TSDB_CODE_OUT_OF_MEMORY;
--- a/source/client/src/clientMain.c
+++ b/source/client/src/clientMain.c
@ -509,9 +509,8 @@ void taos_stop_query(TAOS_RES *res) {
  SRequestObj *pRequest = (SRequestObj *)res;
  pRequest->killed = true;

-  int32_t numOfFields = taos_num_fields(pRequest);
  // It is not a query, no need to stop.
-  if (numOfFields == 0) {
+  if (NULL == pRequest->pQuery || QUERY_EXEC_MODE_SCHEDULE != pRequest->pQuery->execMode) {
    tscDebug("request 0x%" PRIx64 " no need to be killed since not query", pRequest->requestId);
    return;
  }
--- a/source/client/src/clientRawBlockWrite.c
+++ b/source/client/src/clientRawBlockWrite.c
@ -1448,6 +1448,7 @@ int taos_write_raw_block_with_fields(TAOS* taos, int rows, char* pData, const ch
  end:
  taosMemoryFreeClear(pTableMeta);
  qDestroyQuery(pQuery);
+  destroyRequest(pRequest);
  taosMemoryFree(subReq);
  return code;
 }
@ -1639,6 +1640,7 @@ int taos_write_raw_block(TAOS* taos, int rows, char* pData, const char* tbname)
  end:
  taosMemoryFreeClear(pTableMeta);
  qDestroyQuery(pQuery);
+  destroyRequest(pRequest);
  taosMemoryFree(subReq);
  return code;
 }
--- a/source/client/src/clientStmt.c
+++ b/source/client/src/clientStmt.c
@ -300,11 +300,7 @@ int32_t stmtCleanExecInfo(STscStmt* pStmt, bool keepTable, bool deepClean) {
      continue;
    }

-    if (STMT_TYPE_MULTI_INSERT == pStmt->sql.type) {
-      qFreeStmtDataBlock(pBlocks);
-    } else {
    qDestroyStmtDataBlock(pBlocks);
-    }
    taosHashRemove(pStmt->exec.pBlockHash, key, keyLen);

    pIter = taosHashIterate(pStmt->exec.pBlockHash, pIter);
--- a/source/common/src/tmsg.c
+++ b/source/common/src/tmsg.c
@ -2821,8 +2821,8 @@ int32_t tSerializeSDbCfgRsp(void *buf, int32_t bufLen, const SDbCfgRsp *pRsp) {
    if (tEncodeI8(&encoder, pRetension->keepUnit) < 0) return -1;
  }
  if (tEncodeI8(&encoder, pRsp->schemaless) < 0) return -1;
+  if (tEncodeI16(&encoder, pRsp->sstTrigger) < 0) return -1;
  tEndEncode(&encoder);
-
  int32_t tlen = encoder.pos;
  tEncoderClear(&encoder);
  return tlen;
@ -2873,6 +2873,7 @@ int32_t tDeserializeSDbCfgRsp(void *buf, int32_t bufLen, SDbCfgRsp *pRsp) {
    }
  }
  if (tDecodeI8(&decoder, &pRsp->schemaless) < 0) return -1;
+  if (tDecodeI16(&decoder, &pRsp->sstTrigger) < 0) return -1;
  tEndDecode(&decoder);

  tDecoderClear(&decoder);
--- a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
+++ b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
@ -137,7 +137,7 @@ static void vmGenerateVnodeCfg(SCreateVnodeReq *pCreate, SVnodeCfg *pCfg) {
    pNode->nodeId = pCreate->replicas[i].id;
    pNode->nodePort = pCreate->replicas[i].port;
    tstrncpy(pNode->nodeFqdn, pCreate->replicas[i].fqdn, TSDB_FQDN_LEN);
-    (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
+    tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
  }
 }

--- a/source/dnode/mgmt/mgmt_vnode/src/vmInt.c
+++ b/source/dnode/mgmt/mgmt_vnode/src/vmInt.c
@ -79,8 +79,6 @@ int32_t vmOpenVnode(SVnodeMgmt *pMgmt, SWrapperCfg *pCfg, SVnode *pImpl) {
 void vmCloseVnode(SVnodeMgmt *pMgmt, SVnodeObj *pVnode) {
  char path[TSDB_FILENAME_LEN] = {0};

-  vnodeProposeCommitOnNeed(pVnode->pImpl);
-
  taosThreadRwlockWrlock(&pMgmt->lock);
  taosHashRemove(pMgmt->hash, &pVnode->vgId, sizeof(int32_t));
  taosThreadRwlockUnlock(&pMgmt->lock);
@ -343,13 +341,12 @@ static void vmCheckSyncTimeout(SVnodeMgmt *pMgmt) {
  int32_t     numOfVnodes = 0;
  SVnodeObj **ppVnodes = vmGetVnodeListFromHash(pMgmt, &numOfVnodes);

+  if (ppVnodes != NULL) {
    for (int32_t i = 0; i < numOfVnodes; ++i) {
      SVnodeObj *pVnode = ppVnodes[i];
      vnodeSyncCheckTimeout(pVnode->pImpl);
      vmReleaseVnode(pMgmt, pVnode);
    }
-
-  if (ppVnodes != NULL) {
    taosMemoryFree(ppVnodes);
  }
 }
--- a/source/dnode/mgmt/node_util/inc/dmUtil.h
+++ b/source/dnode/mgmt/node_util/inc/dmUtil.h
@ -108,6 +108,7 @@ typedef struct {
  bool           stopped;
  SEpSet         mnodeEps;
  SArray        *dnodeEps;
+  SArray        *oldDnodeEps;
  SHashObj      *dnodeHash;
  TdThreadRwlock lock;
  SMsgCb         msgCb;
@ -175,7 +176,7 @@ void    dmUpdateEps(SDnodeData *pData, SArray *pDnodeEps);
 void    dmGetMnodeEpSet(SDnodeData *pData, SEpSet *pEpSet);
 void    dmGetMnodeEpSetForRedirect(SDnodeData *pData, SRpcMsg *pMsg, SEpSet *pEpSet);
 void    dmSetMnodeEpSet(SDnodeData *pData, SEpSet *pEpSet);
-int32_t dmUpdateDnodeInfo(void *pData, int32_t *dnodeId, int64_t *clusterId, char *fqdn, uint16_t *port);
+void    dmUpdateDnodeInfo(void *pData, int32_t *dnodeId, int64_t *clusterId, char *fqdn, uint16_t *port);

 #ifdef __cplusplus
 }
--- a/source/dnode/mgmt/node_util/src/dmEps.c
+++ b/source/dnode/mgmt/node_util/src/dmEps.c
@ -332,40 +332,48 @@ void dmSetMnodeEpSet(SDnodeData *pData, SEpSet *pEpSet) {
  }
 }

-int32_t dmUpdateDnodeInfo(void *data, int32_t *dnodeId, int64_t *clusterId, char *fqdn, uint16_t *port) {
+void dmUpdateDnodeInfo(void *data, int32_t *did, int64_t *clusterId, char *fqdn, uint16_t *port) {  // Returns nothing; did and clusterId are written only when non-NULL, fqdn and port are dereferenced unconditionally.
  SDnodeData *pData = data;
-  int32_t     ret = -1;
+  int32_t     dnodeId = -1;
+  if (did != NULL) dnodeId = *did;  // dnodeId stays -1 when did is NULL
+
  taosThreadRwlockRdlock(&pData->lock);
-  if (*dnodeId <= 0) {
-    for (int32_t i = 0; i < (int32_t)taosArrayGetSize(pData->dnodeEps); ++i) {
-      SDnodeEp *pDnodeEp = taosArrayGet(pData->dnodeEps, i);
+
+  if (pData->oldDnodeEps != NULL) {  // pass 1: scan oldDnodeEps for an entry with the same fqdn and port
+    int32_t size = (int32_t)taosArrayGetSize(pData->oldDnodeEps);
+    for (int32_t i = 0; i < size; ++i) {
+      SDnodeEp *pDnodeEp = taosArrayGet(pData->oldDnodeEps, i);
      if (strcmp(pDnodeEp->ep.fqdn, fqdn) == 0 && pDnodeEp->ep.port == *port) {
-        dInfo("dnode:%s:%u, update dnodeId from %d to %d", fqdn, *port, *dnodeId, pDnodeEp->id);
-        *dnodeId = pDnodeEp->id;
-        *clusterId = pData->clusterId;
-        ret = 0;
-      }
-    }
-    if (ret != 0) {
-      dInfo("dnode:%s:%u, failed to update dnodeId:%d", fqdn, *port, *dnodeId);
-    }
-  } else {
-    SDnodeEp *pDnodeEp = taosHashGet(pData->dnodeHash, dnodeId, sizeof(int32_t));
-    if (pDnodeEp) {
-      if (strcmp(pDnodeEp->ep.fqdn, fqdn) != 0) {
-        dInfo("dnode:%d, update port from %s to %s", *dnodeId, fqdn, pDnodeEp->ep.fqdn);
+        dInfo("dnode:%d, update ep:%s:%u to %s:%u", dnodeId, fqdn, *port, pDnodeEp->ep.fqdn, pDnodeEp->ep.port);  // NOTE(review): the entry just compared equal to fqdn/port, so the copy below writes back the same values - confirm intent
        tstrncpy(fqdn, pDnodeEp->ep.fqdn, TSDB_FQDN_LEN);
-      }
-      if (pDnodeEp->ep.port != *port) {
-        dInfo("dnode:%d, update port from %u to %u", *dnodeId, *port, pDnodeEp->ep.port);
        *port = pDnodeEp->ep.port;
      }
-      *clusterId = pData->clusterId;
-      ret = 0;
-    } else {
-      dInfo("dnode:%d, failed to update dnode info", *dnodeId);
    }
  }
+
+  if (did != NULL && dnodeId <= 0) {  // pass 2: an id pointer was given but the id on entry is not positive
+    int32_t size = (int32_t)taosArrayGetSize(pData->dnodeEps);
+    for (int32_t i = 0; i < size; ++i) {
+      SDnodeEp *pDnodeEp = taosArrayGet(pData->dnodeEps, i);
+      if (strcmp(pDnodeEp->ep.fqdn, fqdn) == 0 && pDnodeEp->ep.port == *port) {
+        dInfo("dnode:%s:%u, update dnodeId to dnode:%d", fqdn, *port, pDnodeEp->id);
+        *did = pDnodeEp->id;  // updates the caller's id; the local dnodeId snapshot is left unchanged
+        if (clusterId != NULL) *clusterId = pData->clusterId;
+      }
+    }
+  }
+
+  if (dnodeId > 0) {  // pass 3: positive id on entry, look the dnode up in dnodeHash by id
+    SDnodeEp *pDnodeEp = taosHashGet(pData->dnodeHash, &dnodeId, sizeof(int32_t));
+    if (pDnodeEp) {
+      if (strcmp(pDnodeEp->ep.fqdn, fqdn) != 0 || pDnodeEp->ep.port != *port) {  // overwrite fqdn/port only when they differ from the stored endpoint
+        dInfo("dnode:%d, update ep:%s:%u to %s:%u", dnodeId, fqdn, *port, pDnodeEp->ep.fqdn, pDnodeEp->ep.port);
+        tstrncpy(fqdn, pDnodeEp->ep.fqdn, TSDB_FQDN_LEN);
+        *port = pDnodeEp->ep.port;
+      }
+      if (clusterId != NULL) *clusterId = pData->clusterId;
+    }
+  }
+
  taosThreadRwlockUnlock(&pData->lock);
-  return ret;
 }
--- a/source/dnode/mnode/impl/src/mndConsumer.c
+++ b/source/dnode/mnode/impl/src/mndConsumer.c
@ -742,6 +742,7 @@ SSdbRow *mndConsumerActionDecode(SSdbRaw *pRaw) {
  if (tDecodeSMqConsumerObj(buf, pConsumer) == NULL) {
    goto CM_DECODE_OVER;
  }
+  tmsgUpdateDnodeEpSet(&pConsumer->ep);

  terrno = TSDB_CODE_SUCCESS;

--- a/source/dnode/mnode/impl/src/mndDb.c
+++ b/source/dnode/mnode/impl/src/mndDb.c
@ -889,7 +889,7 @@ static int32_t mndProcessGetDbCfgReq(SRpcMsg *pReq) {
  cfgRsp.numOfRetensions = pDb->cfg.numOfRetensions;
  cfgRsp.pRetensions = pDb->cfg.pRetensions;
  cfgRsp.schemaless = pDb->cfg.schemaless;
-
+  cfgRsp.sstTrigger = pDb->cfg.sstTrigger;
  int32_t contLen = tSerializeSDbCfgRsp(NULL, 0, &cfgRsp);
  void   *pRsp = rpcMallocCont(contLen);
  if (pRsp == NULL) {
--- a/source/dnode/mnode/impl/src/mndDnode.c
+++ b/source/dnode/mnode/impl/src/mndDnode.c
@ -180,6 +180,7 @@ static SSdbRow *mndDnodeActionDecode(SSdbRaw *pRaw) {
  SDB_GET_RESERVE(pRaw, dataPos, TSDB_DNODE_RESERVE_SIZE, _OVER)

  terrno = 0;
+  tmsgUpdateDnodeInfo(&pDnode->id, NULL, pDnode->fqdn, &pDnode->port);

 _OVER:
  if (terrno != 0) {
--- a/source/dnode/mnode/impl/src/mndFunc.c
+++ b/source/dnode/mnode/impl/src/mndFunc.c
@ -293,7 +293,7 @@ static int32_t mndProcessCreateFuncReq(SRpcMsg *pReq) {
    goto _OVER;
  }

-  mInfo("func:%s, start to create", createReq.name);
+  mInfo("func:%s, start to create, size:%d", createReq.name, createReq.codeLen);
  if (mndCheckOperPrivilege(pMnode, pReq->info.conn.user, MND_OPER_CREATE_FUNC) != 0) {
    goto _OVER;
  }
--- a/source/dnode/mnode/impl/src/mndMnode.c
+++ b/source/dnode/mnode/impl/src/mndMnode.c
@ -15,13 +15,13 @@

 #define _DEFAULT_SOURCE
 #include "mndMnode.h"
+#include "mndCluster.h"
 #include "mndDnode.h"
 #include "mndPrivilege.h"
 #include "mndShow.h"
 #include "mndSync.h"
 #include "mndTrans.h"
 #include "tmisce.h"
-#include "mndCluster.h"

 #define MNODE_VER_NUMBER   1
 #define MNODE_RESERVE_SIZE 64
@ -181,9 +181,8 @@ _OVER:

 static int32_t mndMnodeActionInsert(SSdb *pSdb, SMnodeObj *pObj) {
  mTrace("mnode:%d, perform insert action, row:%p", pObj->id, pObj);
-  pObj->pDnode = sdbAcquire(pSdb, SDB_DNODE, &pObj->id);
+  pObj->pDnode = sdbAcquireNotReadyObj(pSdb, SDB_DNODE, &pObj->id);
  if (pObj->pDnode == NULL) {
-    terrno = TSDB_CODE_MND_DNODE_NOT_EXIST;
    mError("mnode:%d, failed to perform insert action since %s", pObj->id, terrstr());
    return -1;
  }
@ -748,7 +747,7 @@ static void mndReloadSyncConfig(SMnode *pMnode) {
      pNode->clusterId = mndGetClusterId(pMnode);
      pNode->nodePort = pObj->pDnode->port;
      tstrncpy(pNode->nodeFqdn, pObj->pDnode->fqdn, TSDB_FQDN_LEN);
-      (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
+      tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
      mInfo("vgId:1, ep:%s:%u dnode:%d", pNode->nodeFqdn, pNode->nodePort, pNode->nodeId);
      if (pObj->pDnode->id == pMnode->selfDnodeId) {
        cfg.myIndex = cfg.replicaNum;
--- a/source/dnode/mnode/impl/src/mndSma.c
+++ b/source/dnode/mnode/impl/src/mndSma.c
@ -202,11 +202,13 @@ static SSdbRow *mndSmaActionDecode(SSdbRaw *pRaw) {

 _OVER:
  if (terrno != 0) {
-    mError("sma:%s, failed to decode from raw:%p since %s", pSma == NULL ? "null" : pSma->name, pRaw, terrstr());
+    if (pSma != NULL) {
+      mError("sma:%s, failed to decode from raw:%p since %s", pSma->name, pRaw, terrstr());
      taosMemoryFreeClear(pSma->expr);
      taosMemoryFreeClear(pSma->tagsFilter);
      taosMemoryFreeClear(pSma->sql);
      taosMemoryFreeClear(pSma->ast);
+    }
    taosMemoryFreeClear(pRow);
    return NULL;
  }
--- a/source/dnode/mnode/impl/src/mndSubscribe.c
+++ b/source/dnode/mnode/impl/src/mndSubscribe.c
@ -760,6 +760,27 @@ static SSdbRow *mndSubActionDecode(SSdbRaw *pRaw) {
    goto SUB_DECODE_OVER;
  }

+  // update epset saved in mnode
+  if (pSub->unassignedVgs != NULL) {
+    int32_t size = (int32_t)taosArrayGetSize(pSub->unassignedVgs);
+    for (int32_t i = 0; i < size; ++i) {
+      SMqVgEp *pMqVgEp = taosArrayGet(pSub->unassignedVgs, i);
+      tmsgUpdateDnodeEpSet(&pMqVgEp->epSet);
+    }
+  }
+  if (pSub->consumerHash != NULL) {
+    void *pIter = taosHashIterate(pSub->consumerHash, NULL);
+    while (pIter) {
+      SMqConsumerEp *pConsumerEp = pIter;
+      int32_t        size = (int32_t)taosArrayGetSize(pConsumerEp->vgs);
+      for (int32_t i = 0; i < size; ++i) {
+        SMqVgEp *pMqVgEp = taosArrayGet(pConsumerEp->vgs, i);
+        tmsgUpdateDnodeEpSet(&pMqVgEp->epSet);
+      }
+      pIter = taosHashIterate(pSub->consumerHash, pIter);
+    }
+  }
+
  terrno = TSDB_CODE_SUCCESS;

 SUB_DECODE_OVER:
--- a/source/dnode/mnode/impl/src/mndSync.c
+++ b/source/dnode/mnode/impl/src/mndSync.c
@ -271,9 +271,11 @@ SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) {
 int32_t mndInitSync(SMnode *pMnode) {
  SSyncMgmt *pMgmt = &pMnode->syncMgmt;
  taosThreadMutexInit(&pMgmt->lock, NULL);
+  taosThreadMutexLock(&pMgmt->lock);
  pMgmt->transId = 0;
  pMgmt->transSec = 0;
  pMgmt->transSeq = 0;
+  taosThreadMutexUnlock(&pMgmt->lock);

  SSyncInfo syncInfo = {
      .snapshotStrategy = SYNC_STRATEGY_STANDARD_SNAPSHOT,
@ -301,7 +303,7 @@ int32_t mndInitSync(SMnode *pMnode) {
    pNode->nodeId = pMgmt->replicas[i].id;
    pNode->nodePort = pMgmt->replicas[i].port;
    tstrncpy(pNode->nodeFqdn, pMgmt->replicas[i].fqdn, sizeof(pNode->nodeFqdn));
-    (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
+    tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
    mInfo("vgId:1, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId,
          pNode->clusterId);
  }
@ -369,6 +371,7 @@ int32_t mndSyncPropose(SMnode *pMnode, SSdbRaw *pRaw, int32_t transId) {
  if (pMgmt->transId != 0) {
    mError("trans:%d, can't be proposed since trans:%d already waiting for confirm", transId, pMgmt->transId);
    taosThreadMutexUnlock(&pMgmt->lock);
+    rpcFreeCont(req.pCont);
    terrno = TSDB_CODE_MND_LAST_TRANS_NOT_FINISHED;
    return terrno;
  }
--- a/source/dnode/mnode/impl/src/mndTrans.c
+++ b/source/dnode/mnode/impl/src/mndTrans.c
@ -329,6 +329,7 @@ static SSdbRow *mndTransActionDecode(SSdbRaw *pRaw) {
      action.pRaw = NULL;
    } else if (action.actionType == TRANS_ACTION_MSG) {
      SDB_GET_BINARY(pRaw, dataPos, (void *)&action.epSet, sizeof(SEpSet), _OVER);
+      tmsgUpdateDnodeEpSet(&action.epSet);
      SDB_GET_INT16(pRaw, dataPos, &action.msgType, _OVER)
      SDB_GET_INT8(pRaw, dataPos, &unused /*&action.msgSent*/, _OVER)
      SDB_GET_INT8(pRaw, dataPos, &unused /*&action.msgReceived*/, _OVER)
--- a/source/dnode/mnode/impl/src/mndVgroup.c
+++ b/source/dnode/mnode/impl/src/mndVgroup.c
@ -1441,10 +1441,10 @@ static int32_t mndRedistributeVgroup(SMnode *pMnode, SRpcMsg *pReq, SDbObj *pDb,

  {
    SSdbRaw *pRaw = mndVgroupActionEncode(&newVg);
-    if (pRaw == NULL) return -1;
+    if (pRaw == NULL) goto _OVER;
    if (mndTransAppendCommitlog(pTrans, pRaw) != 0) {
      sdbFreeRaw(pRaw);
-      return -1;
+      goto _OVER;
    }
    (void)sdbSetRawStatus(pRaw, SDB_STATUS_READY);
  }
--- a/source/dnode/mnode/sdb/inc/sdb.h
+++ b/source/dnode/mnode/sdb/inc/sdb.h
@ -291,6 +291,7 @@ int32_t sdbWriteWithoutFree(SSdb *pSdb, SSdbRaw *pRaw);
 * @return void* The object of the row.
 */
 void *sdbAcquire(SSdb *pSdb, ESdbType type, const void *pKey);
+void *sdbAcquireNotReadyObj(SSdb *pSdb, ESdbType type, const void *pKey);

 /**
 * @brief Release a row from sdb.
--- a/source/dnode/mnode/sdb/src/sdbFile.c
+++ b/source/dnode/mnode/sdb/src/sdbFile.c
@ -228,11 +228,12 @@ static int32_t sdbReadFileImp(SSdb *pSdb) {
  int32_t readLen = 0;
  int64_t ret = 0;
  char    file[PATH_MAX] = {0};
+  int32_t bufLen = TSDB_MAX_MSG_SIZE;

  snprintf(file, sizeof(file), "%s%ssdb.data", pSdb->currDir, TD_DIRSEP);
  mInfo("start to read sdb file:%s", file);

-  SSdbRaw *pRaw = taosMemoryMalloc(TSDB_MAX_MSG_SIZE + 100);
+  SSdbRaw *pRaw = taosMemoryMalloc(bufLen + 100);
  if (pRaw == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    mError("failed read sdb file since %s", terrstr());
@ -275,14 +276,15 @@ static int32_t sdbReadFileImp(SSdb *pSdb) {
    }

    readLen = pRaw->dataLen + sizeof(int32_t);
-    if (readLen >= pRaw->dataLen) {
-      SSdbRaw *pNewRaw = taosMemoryMalloc(pRaw->dataLen + TSDB_MAX_MSG_SIZE);
+    if (readLen >= bufLen) {
+      bufLen = pRaw->dataLen * 2;
+      SSdbRaw *pNewRaw = taosMemoryMalloc(bufLen + 100);
      if (pNewRaw == NULL) {
        terrno = TSDB_CODE_OUT_OF_MEMORY;
-        mError("failed read sdb file since malloc new sdbRaw size:%d failed", pRaw->dataLen + TSDB_MAX_MSG_SIZE);
+        mError("failed read sdb file since malloc new sdbRaw size:%d failed", bufLen);
        goto _OVER;
      }
-      mInfo("malloc new sdbRaw size:%d, type:%d", pRaw->dataLen + TSDB_MAX_MSG_SIZE, pRaw->type);
+      mInfo("malloc new sdb raw size:%d, type:%d", bufLen, pRaw->type);
      memcpy(pNewRaw, pRaw, sizeof(SSdbRaw));
      sdbFreeRaw(pRaw);
      pRaw = pNewRaw;
--- a/source/dnode/mnode/sdb/src/sdbHash.c
+++ b/source/dnode/mnode/sdb/src/sdbHash.c
@ -270,7 +270,7 @@ int32_t sdbWrite(SSdb *pSdb, SSdbRaw *pRaw) {
  return code;
 }

-void *sdbAcquire(SSdb *pSdb, ESdbType type, const void *pKey) {
+void *sdbAcquireAll(SSdb *pSdb, ESdbType type, const void *pKey, bool onlyReady) {
  terrno = 0;

  SHashObj *hash = sdbGetHash(pSdb, type);
@ -306,10 +306,24 @@ void *sdbAcquire(SSdb *pSdb, ESdbType type, const void *pKey) {
      break;
  }

+  if (pRet == NULL) {
+    if (!onlyReady) {
+      terrno = 0;
+      atomic_add_fetch_32(&pRow->refCount, 1);
+      pRet = pRow->pObj;
+      sdbPrintOper(pSdb, pRow, "acquire");
+    }
+  }
+
  sdbUnLock(pSdb, type);
  return pRet;
 }

+void *sdbAcquire(SSdb *pSdb, ESdbType type, const void *pKey) { return sdbAcquireAll(pSdb, type, pKey, true); }  // forwards to sdbAcquireAll with onlyReady = true
+void *sdbAcquireNotReadyObj(SSdb *pSdb, ESdbType type, const void *pKey) {  // forwards to sdbAcquireAll with onlyReady = false
+  return sdbAcquireAll(pSdb, type, pKey, false);
+}
+
 static void sdbCheckRow(SSdb *pSdb, SSdbRow *pRow) {
  int32_t type = pRow->type;
  sdbWriteLock(pSdb, type);
--- a/source/dnode/vnode/inc/vnode.h
+++ b/source/dnode/vnode/inc/vnode.h
@ -153,6 +153,8 @@ typedef struct SMTbCursor SMTbCursor;
 SMTbCursor *metaOpenTbCursor(SMeta *pMeta);
 void        metaCloseTbCursor(SMTbCursor *pTbCur);
 int32_t     metaTbCursorNext(SMTbCursor *pTbCur);
+int32_t     metaTbCursorPrev(SMTbCursor *pTbCur);
+
 #endif

 // tsdb
--- a/source/dnode/vnode/src/inc/tsdb.h
+++ b/source/dnode/vnode/src/inc/tsdb.h
@ -202,6 +202,7 @@ int32_t tsdbCmprColData(SColData *pColData, int8_t cmprAlg, SBlockCol *pBlockCol
                        uint8_t **ppBuf);
 int32_t tsdbDecmprColData(uint8_t *pIn, SBlockCol *pBlockCol, int8_t cmprAlg, int32_t nVal, SColData *pColData,
                          uint8_t **ppBuf);
+int32_t tRowInfoCmprFn(const void *p1, const void *p2);
 // tsdbMemTable ==============================================================================================
 // SMemTable
 int32_t  tsdbMemTableCreate(STsdb *pTsdb, SMemTable **ppMemTable);
--- a/source/dnode/vnode/src/inc/vnodeInt.h
+++ b/source/dnode/vnode/src/inc/vnodeInt.h
@ -247,7 +247,7 @@ int32_t tsdbSnapReaderClose(STsdbSnapReader** ppReader);
 int32_t tsdbSnapRead(STsdbSnapReader* pReader, uint8_t** ppData);
 // STsdbSnapWriter ========================================
 int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, STsdbSnapWriter** ppWriter);
-int32_t tsdbSnapWrite(STsdbSnapWriter* pWriter, uint8_t* pData, uint32_t nData);
+int32_t tsdbSnapWrite(STsdbSnapWriter* pWriter, SSnapDataHdr* pHdr);
 int32_t tsdbSnapWriterPrepareClose(STsdbSnapWriter* pWriter);
 int32_t tsdbSnapWriterClose(STsdbSnapWriter** ppWriter, int8_t rollback);
 // STqSnapshotReader ==
--- a/source/dnode/vnode/src/meta/metaQuery.c
+++ b/source/dnode/vnode/src/meta/metaQuery.c
@ -311,7 +311,7 @@ void metaCloseTbCursor(SMTbCursor *pTbCur) {
  }
 }

-int metaTbCursorNext(SMTbCursor *pTbCur) {
+int32_t metaTbCursorNext(SMTbCursor *pTbCur) {
  int    ret;
  void  *pBuf;
  STbCfg tbCfg;
@ -335,6 +335,31 @@ int metaTbCursorNext(SMTbCursor *pTbCur) {
  return 0;
 }

+int32_t metaTbCursorPrev(SMTbCursor *pTbCur) {  // Steps the cursor backwards until the loaded entry is not a super table; returns 0, or -1 when tdbTbcPrev fails.
+  int    ret;
+  void  *pBuf;  // declared but never used in this function
+  STbCfg tbCfg;  // declared but never used in this function
+
+  for (;;) {
+    ret = tdbTbcPrev(pTbCur->pDbc, &pTbCur->pKey, &pTbCur->kLen, &pTbCur->pVal, &pTbCur->vLen);
+    if (ret < 0) {  // negative result ends the walk with -1
+      return -1;
+    }
+
+    tDecoderClear(&pTbCur->mr.coder);  // clear the reader's decoder before loading the next entry
+
+    metaGetTableEntryByVersion(&pTbCur->mr, ((SUidIdxVal *)pTbCur->pVal)[0].version, *(tb_uid_t *)pTbCur->pKey);  // NOTE(review): return value is not checked
+    if (pTbCur->mr.me.type == TSDB_SUPER_TABLE) {  // skip super tables and keep stepping backwards
+      continue;
+    }
+
+    break;
+  }
+
+  return 0;
+}
+
+
 SSchemaWrapper *metaGetTableSchema(SMeta *pMeta, tb_uid_t uid, int32_t sver, int lock) {
  void           *pData = NULL;
  int             nData = 0;
--- a/source/dnode/vnode/src/sma/smaSnapshot.c
+++ b/source/dnode/vnode/src/sma/smaSnapshot.c
@ -423,10 +423,10 @@ int32_t rsmaSnapWrite(SRSmaSnapWriter* pWriter, uint8_t* pData, uint32_t nData)
  // rsma1/rsma2
  if (pHdr->type == SNAP_DATA_RSMA1) {
    pHdr->type = SNAP_DATA_TSDB;
-    code = tsdbSnapWrite(pWriter->pDataWriter[0], pData, nData);
+    code = tsdbSnapWrite(pWriter->pDataWriter[0], pHdr);
  } else if (pHdr->type == SNAP_DATA_RSMA2) {
    pHdr->type = SNAP_DATA_TSDB;
-    code = tsdbSnapWrite(pWriter->pDataWriter[1], pData, nData);
+    code = tsdbSnapWrite(pWriter->pDataWriter[1], pHdr);
  } else if (pHdr->type == SNAP_DATA_QTASK) {
    code = rsmaSnapWriteQTaskInfo(pWriter, pData, nData);
  } else {
--- a/source/dnode/vnode/src/tq/tq.c
+++ b/source/dnode/vnode/src/tq/tq.c
@ -521,7 +521,12 @@ int32_t tqProcessPollReq(STQ* pTq, SRpcMsg* pMsg) {
            tqOffsetResetToData(&fetchOffsetNew, 0, 0);
          }
        } else {
-          tqOffsetResetToLog(&fetchOffsetNew, walGetFirstVer(pTq->pVnode->pWal));
+          pHandle->pRef = walRefFirstVer(pTq->pVnode->pWal, pHandle->pRef);
+          if (pHandle->pRef == NULL) {
+            terrno = TSDB_CODE_OUT_OF_MEMORY;
+            return -1;
+          }
+          tqOffsetResetToLog(&fetchOffsetNew, pHandle->pRef->refVer - 1);
        }
      } else if (reqOffset.type == TMQ_OFFSET__RESET_LATEST) {
        if (pHandle->execHandle.subType == TOPIC_SUB_TYPE__COLUMN) {
@ -719,6 +724,8 @@ int32_t tqProcessPollReq(STQ* pTq, SRpcMsg* pMsg) {
 int32_t tqProcessDeleteSubReq(STQ* pTq, int64_t version, char* msg, int32_t msgLen) {
  SMqVDeleteReq* pReq = (SMqVDeleteReq*)msg;

+  tqDebug("vgId:%d, delete sub: %s", pTq->pVnode->config.vgId, pReq->subKey);
+
  taosWLockLatch(&pTq->pushLock);
  int32_t code = taosHashRemove(pTq->pPushMgr, pReq->subKey, strlen(pReq->subKey));
  if (code != 0) {
--- a/source/dnode/vnode/src/tsdb/tsdbCacheRead.c
+++ b/source/dnode/vnode/src/tsdb/tsdbCacheRead.c
@ -241,7 +241,11 @@ int32_t tsdbRetrieveCacheRows(void* pReader, SSDataBlock* pResBlock, const int32
    taosArrayPush(pLastCols, &p);
  }

-  tsdbTakeReadSnap(pr->pVnode->pTsdb, &pr->pReadSnap, "cache-l");
+  code = tsdbTakeReadSnap(pr->pVnode->pTsdb, &pr->pReadSnap, "cache-l");
+  if (code != TSDB_CODE_SUCCESS) {
+    goto _end;
+  }
+
  pr->pDataFReader = NULL;
  pr->pDataFReaderLast = NULL;

@ -252,7 +256,7 @@ int32_t tsdbRetrieveCacheRows(void* pReader, SSDataBlock* pResBlock, const int32

      code = doExtractCacheRow(pr, lruCache, pKeyInfo->uid, &pRow, &h);
      if (code != TSDB_CODE_SUCCESS) {
-        return code;
+        goto _end;
      }

      if (h == NULL) {
@ -321,7 +325,7 @@ int32_t tsdbRetrieveCacheRows(void* pReader, SSDataBlock* pResBlock, const int32
      STableKeyInfo* pKeyInfo = &pr->pTableList[i];
      code = doExtractCacheRow(pr, lruCache, pKeyInfo->uid, &pRow, &h);
      if (code != TSDB_CODE_SUCCESS) {
-        return code;
+        goto _end;
      }

      if (h == NULL) {
--- a/source/dnode/vnode/src/tsdb/tsdbFS.c
+++ b/source/dnode/vnode/src/tsdb/tsdbFS.c
@ -458,9 +458,8 @@ static int32_t tsdbMergeFileSet(STsdb *pTsdb, SDFileSet *pSetOld, SDFileSet *pSe
      taosMemoryFree(pHeadF);
    }
  } else {
-    nRef = pHeadF->nRef;
-    *pHeadF = *pSetNew->pHeadF;
-    pHeadF->nRef = nRef;
+    ASSERT(pHeadF->offset == pSetNew->pHeadF->offset);
+    ASSERT(pHeadF->size == pSetNew->pHeadF->size);
  }

  // data
@ -481,9 +480,7 @@ static int32_t tsdbMergeFileSet(STsdb *pTsdb, SDFileSet *pSetOld, SDFileSet *pSe
      taosMemoryFree(pDataF);
    }
  } else {
-    nRef = pDataF->nRef;
-    *pDataF = *pSetNew->pDataF;
-    pDataF->nRef = nRef;
+    pDataF->size = pSetNew->pDataF->size;
  }

  // sma
@ -504,9 +501,7 @@ static int32_t tsdbMergeFileSet(STsdb *pTsdb, SDFileSet *pSetOld, SDFileSet *pSe
      taosMemoryFree(pSmaF);
    }
  } else {
-    nRef = pSmaF->nRef;
-    *pSmaF = *pSetNew->pSmaF;
-    pSmaF->nRef = nRef;
+    pSmaF->size = pSetNew->pSmaF->size;
  }

  // stt
--- a/source/dnode/vnode/src/tsdb/tsdbSnapshot.c
+++ b/source/dnode/vnode/src/tsdb/tsdbSnapshot.c
--- a/source/dnode/vnode/src/tsdb/tsdbUtil.c
+++ b/source/dnode/vnode/src/tsdb/tsdbUtil.c
@ -731,6 +731,7 @@ int32_t tRowMergerAdd(SRowMerger *pMerger, TSDBROW *pRow, STSchema *pTSchema) {
    tsdbRowGetColVal(pRow, pTSchema, jCol++, pColVal);

    if (key.version > pMerger->version) {
+#if 0
      if (!COL_VAL_IS_NONE(pColVal)) {
        if ((!COL_VAL_IS_NULL(pColVal)) && IS_VAR_DATA_TYPE(pColVal->type)) {
          SColVal *tColVal = taosArrayGet(pMerger->pArray, iCol);
@ -746,6 +747,28 @@ int32_t tRowMergerAdd(SRowMerger *pMerger, TSDBROW *pRow, STSchema *pTSchema) {
          taosArraySet(pMerger->pArray, iCol, pColVal);
        }
      }
+#endif
+      if (!COL_VAL_IS_NONE(pColVal)) {
+        if (IS_VAR_DATA_TYPE(pColVal->type)) {
+          SColVal *pTColVal = taosArrayGet(pMerger->pArray, iCol);
+          if (!COL_VAL_IS_NULL(pColVal)) {
+            code = tRealloc(&pTColVal->value.pData, pColVal->value.nData);
+            if (code) return code;
+
+            pTColVal->value.nData = pColVal->value.nData;
+            if (pTColVal->value.nData) {
+              memcpy(pTColVal->value.pData, pColVal->value.pData, pTColVal->value.nData);
+            }
+            pTColVal->flag = 0;
+          } else {
+            tFree(pTColVal->value.pData);
+            pTColVal->value.pData = NULL;
+            taosArraySet(pMerger->pArray, iCol, pColVal);
+          }
+        } else {
+          taosArraySet(pMerger->pArray, iCol, pColVal);
+        }
+      }
    } else if (key.version < pMerger->version) {
      SColVal *tColVal = (SColVal *)taosArrayGet(pMerger->pArray, iCol);
      if (COL_VAL_IS_NONE(tColVal) && !COL_VAL_IS_NONE(pColVal)) {
--- a/source/dnode/vnode/src/vnd/vnodeOpen.c
+++ b/source/dnode/vnode/src/vnd/vnodeOpen.c
@ -86,7 +86,7 @@ int32_t vnodeAlter(const char *path, SAlterVnodeReplicaReq *pReq, STfs *pTfs) {
    pNode->nodeId = pReq->replicas[i].id;
    pNode->nodePort = pReq->replicas[i].port;
    tstrncpy(pNode->nodeFqdn, pReq->replicas[i].fqdn, sizeof(pNode->nodeFqdn));
-    (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
+    tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
    vInfo("vgId:%d, replica:%d ep:%s:%u dnode:%d", pReq->vgId, i, pNode->nodeFqdn, pNode->nodePort, pNode->nodeId);
  }

--- a/source/dnode/vnode/src/vnd/vnodeSnapshot.c
+++ b/source/dnode/vnode/src/vnd/vnodeSnapshot.c
@ -455,7 +455,7 @@ int32_t vnodeSnapWrite(SVSnapWriter *pWriter, uint8_t *pData, uint32_t nData) {
        if (code) goto _err;
      }

-      code = tsdbSnapWrite(pWriter->pTsdbSnapWriter, pData, nData);
+      code = tsdbSnapWrite(pWriter->pTsdbSnapWriter, pHdr);
      if (code) goto _err;
    } break;
    case SNAP_DATA_TQ_HANDLE: {
--- a/source/libs/catalog/inc/catalogInt.h
+++ b/source/libs/catalog/inc/catalogInt.h
@ -805,6 +805,7 @@ int32_t ctgMakeVgArray(SDBVgInfo* dbInfo);
 int32_t ctgAcquireVgMetaFromCache(SCatalog *pCtg, const char *dbFName, const char *tbName, SCtgDBCache **pDb, SCtgTbCache **pTb);
 int32_t ctgCopyTbMeta(SCatalog *pCtg, SCtgTbMetaCtx *ctx, SCtgDBCache **pDb, SCtgTbCache **pTb, STableMeta **pTableMeta, char* dbFName);
 void    ctgReleaseVgMetaToCache(SCatalog *pCtg, SCtgDBCache *dbCache, SCtgTbCache *pCache);
+void    ctgReleaseTbMetaToCache(SCatalog *pCtg, SCtgDBCache *dbCache, SCtgTbCache *pCache);

 extern SCatalogMgmt gCtgMgmt;
 extern SCtgDebug    gCTGDebug;
--- a/source/libs/catalog/src/catalog.c
+++ b/source/libs/catalog/src/catalog.c
@ -598,10 +598,16 @@ int32_t ctgGetCachedTbVgMeta(SCatalog* pCtg, const SName* pTableName, SVgroupInf

  CTG_ERR_JRET(ctgGetVgInfoFromHashValue(pCtg, dbCache->vgCache.vgInfo, pTableName, pVgroup));

+  ctgRUnlockVgInfo(dbCache);
+
  SCtgTbMetaCtx ctx = {0};
  ctx.pName = (SName*)pTableName;
  ctx.flag = CTG_FLAG_UNKNOWN_STB;
-  CTG_ERR_JRET(ctgCopyTbMeta(pCtg, &ctx, &dbCache, &tbCache, pTableMeta, db));
+  code = ctgCopyTbMeta(pCtg, &ctx, &dbCache, &tbCache, pTableMeta, db);
+
+  ctgReleaseTbMetaToCache(pCtg, dbCache, tbCache);
+
+  CTG_RET(code);

 _return:
  
--- a/source/libs/catalog/src/ctgAsync.c
+++ b/source/libs/catalog/src/ctgAsync.c
@ -999,6 +999,7 @@ int32_t ctgHandleGetTbMetaRsp(SCtgTaskReq* tReq, int32_t reqType, const SDataBuf
            CTG_ERR_JRET(ctgGetTbMetaFromVnode(pCtg, pConn, pName, &vgInfo, NULL, tReq));

            ctgReleaseVgInfoToCache(pCtg, dbCache);
+            dbCache = NULL;
          } else {
            SBuildUseDBInput input = {0};

@ -1168,6 +1169,7 @@ int32_t ctgHandleGetTbMetasRsp(SCtgTaskReq* tReq, int32_t reqType, const SDataBu
            CTG_ERR_JRET(ctgGetTbMetaFromVnode(pCtg, pConn, pName, &vgInfo, NULL, tReq));

            ctgReleaseVgInfoToCache(pCtg, dbCache);
+            dbCache = NULL;
          } else {
            SBuildUseDBInput input = {0};

--- a/source/libs/catalog/src/ctgCache.c
+++ b/source/libs/catalog/src/ctgCache.c
@ -2118,7 +2118,7 @@ int32_t ctgOpUpdateEpset(SCtgCacheOperation *operation) {

 _return:

-  if (dbCache) {
+  if (code == TSDB_CODE_SUCCESS && dbCache) {
    ctgWUnlockVgInfo(dbCache);
  }

--- a/source/libs/command/src/command.c
+++ b/source/libs/command/src/command.c
@ -264,10 +264,10 @@ static void setCreateDBResultIntoDataBlock(SSDataBlock* pBlock, char* dbFName, S
  len += sprintf(
      buf2 + VARSTR_HEADER_SIZE,
      "CREATE DATABASE `%s` BUFFER %d CACHESIZE %d CACHEMODEL '%s' COMP %d DURATION %dm "
-      "WAL_FSYNC_PERIOD %d MAXROWS %d MINROWS %d KEEP %dm,%dm,%dm PAGES %d PAGESIZE %d PRECISION '%s' REPLICA %d "
+      "WAL_FSYNC_PERIOD %d MAXROWS %d MINROWS %d STT_TRIGGER %d KEEP %dm,%dm,%dm PAGES %d PAGESIZE %d PRECISION '%s' REPLICA %d "
      "WAL_LEVEL %d VGROUPS %d SINGLE_STABLE %d",
      dbFName, pCfg->buffer, pCfg->cacheSize, cacheModelStr(pCfg->cacheLast), pCfg->compression, pCfg->daysPerFile,
-      pCfg->walFsyncPeriod, pCfg->maxRows, pCfg->minRows, pCfg->daysToKeep0, pCfg->daysToKeep1, pCfg->daysToKeep2,
+      pCfg->walFsyncPeriod, pCfg->maxRows, pCfg->minRows,  pCfg->sstTrigger, pCfg->daysToKeep0, pCfg->daysToKeep1, pCfg->daysToKeep2,
      pCfg->pages, pCfg->pageSize, prec, pCfg->replications, pCfg->walLevel, pCfg->numOfVgroups,
      1 == pCfg->numOfStables);

--- a/source/libs/executor/inc/executorimpl.h
+++ b/source/libs/executor/inc/executorimpl.h
@ -705,7 +705,8 @@ void doBuildResultDatablock(SOperatorInfo* pOperator, SOptrBasicInfo* pbInfo, SG

 bool hasLimitOffsetInfo(SLimitInfo* pLimitInfo);
 void initLimitInfo(const SNode* pLimit, const SNode* pSLimit, SLimitInfo* pLimitInfo);
-void applyLimitOffset(SLimitInfo* pLimitInfo, SSDataBlock* pBlock, SExecTaskInfo* pTaskInfo, SOperatorInfo* pOperator);
+void resetLimitInfoForNextGroup(SLimitInfo* pLimitInfo);
+bool applyLimitOffset(SLimitInfo* pLimitInfo, SSDataBlock* pBlock, SExecTaskInfo* pTaskInfo, SOperatorInfo* pOperator);

 void applyAggFunctionOnPartialTuples(SExecTaskInfo* taskInfo, SqlFunctionCtx* pCtx, SColumnInfoData* pTimeWindowData,
                                     int32_t offset, int32_t forwardStep, int32_t numOfTotal, int32_t numOfOutput);
--- a/source/libs/executor/src/exchangeoperator.c
+++ b/source/libs/executor/src/exchangeoperator.c
@ -584,7 +584,13 @@ int32_t doExtractResultBlocks(SExchangeInfo* pExchangeInfo, SSourceDataInfo* pDa
  int32_t index = 0;
  int32_t code = 0;
  while (index++ < pRetrieveRsp->numOfBlocks) {
-    SSDataBlock* pb = createOneDataBlock(pExchangeInfo->pDummyBlock, false);
+    SSDataBlock* pb = NULL;
+    if (taosArrayGetSize(pExchangeInfo->pRecycledBlocks) > 0) {
+      pb = *(SSDataBlock**)taosArrayPop(pExchangeInfo->pRecycledBlocks);
+      blockDataCleanup(pb);
+    } else {
+      pb = createOneDataBlock(pExchangeInfo->pDummyBlock, false);
+    }

    code = extractDataBlockFromFetchRsp(pb, pStart, NULL, &pStart);
    if (code != 0) {
@ -732,9 +738,7 @@ int32_t handleLimitOffset(SOperatorInfo* pOperator, SLimitInfo* pLimitInfo, SSDa
    }

    // reset the value for a new group data
-    pLimitInfo->numOfOutputRows = 0;
-    pLimitInfo->remainOffset = pLimitInfo->limit.offset;
-
+    resetLimitInfoForNextGroup(pLimitInfo);
    // existing rows that belongs to previous group.
    if (pBlock->info.rows > 0) {
      return PROJECT_RETRIEVE_DONE;
@ -760,7 +764,12 @@ int32_t handleLimitOffset(SOperatorInfo* pOperator, SLimitInfo* pLimitInfo, SSDa
    int32_t keepRows = (int32_t)(pLimitInfo->limit.limit - pLimitInfo->numOfOutputRows);
    blockDataKeepFirstNRows(pBlock, keepRows);
    if (pLimitInfo->slimit.limit > 0 && pLimitInfo->slimit.limit <= pLimitInfo->numOfOutputGroups) {
-      pOperator->status = OP_EXEC_DONE;
+      setOperatorCompleted(pOperator);
+    } else {
+      // the limit of the current group has been reached, so subsequent blocks of this group must be discarded.
+      if (pBlock->info.rows == 0) {
+        return PROJECT_RETRIEVE_CONTINUE;
+      }
    }

    return PROJECT_RETRIEVE_DONE;
--- a/source/libs/executor/src/executil.c
+++ b/source/libs/executor/src/executil.c
@ -1789,6 +1789,11 @@ void initLimitInfo(const SNode* pLimit, const SNode* pSLimit, SLimitInfo* pLimit
  pLimitInfo->remainGroupOffset = slimit.offset;
 }

+void resetLimitInfoForNextGroup(SLimitInfo* pLimitInfo) {
+  pLimitInfo->numOfOutputRows = 0;
+  pLimitInfo->remainOffset = pLimitInfo->limit.offset;
+}
+
 uint64_t tableListGetSize(const STableListInfo* pTableList) {
  ASSERT(taosArrayGetSize(pTableList->pTableList) == taosHashGetSize(pTableList->map));
  return taosArrayGetSize(pTableList->pTableList);
--- a/source/libs/executor/src/executor.c
+++ b/source/libs/executor/src/executor.c
@ -24,12 +24,16 @@
 static TdThreadOnce initPoolOnce = PTHREAD_ONCE_INIT;
 int32_t             exchangeObjRefPool = -1;

-static void initRefPool() { exchangeObjRefPool = taosOpenRef(1024, doDestroyExchangeOperatorInfo); }
 static void cleanupRefPool() {
  int32_t ref = atomic_val_compare_exchange_32(&exchangeObjRefPool, exchangeObjRefPool, 0);
  taosCloseRef(ref);
 }

+static void initRefPool() { 
+  exchangeObjRefPool = taosOpenRef(1024, doDestroyExchangeOperatorInfo);   
+  atexit(cleanupRefPool);
+}
+
 static int32_t doSetSMABlock(SOperatorInfo* pOperator, void* input, size_t numOfBlocks, int32_t type, char* id) {
  ASSERT(pOperator != NULL);
  if (pOperator->operatorType != QUERY_NODE_PHYSICAL_PLAN_STREAM_SCAN) {
@ -442,7 +446,6 @@ int32_t qCreateExecTask(SReadHandle* readHandle, int32_t vgId, uint64_t taskId,
  SExecTaskInfo** pTask = (SExecTaskInfo**)pTaskInfo;

  taosThreadOnce(&initPoolOnce, initRefPool);
-  atexit(cleanupRefPool);

  qDebug("start to create subplan task, TID:0x%" PRIx64 " QID:0x%" PRIx64, taskId, pSubplan->id.queryId);

--- a/source/libs/executor/src/groupoperator.c
+++ b/source/libs/executor/src/groupoperator.c
@ -593,8 +593,11 @@ void* getCurrentDataGroupInfo(const SPartitionOperatorInfo* pInfo, SDataGroupInf

    int32_t pageId = 0;
    pPage = getNewBufPage(pInfo->pBuf, &pageId);
-    taosArrayPush(p->pPageList, &pageId);
+    if (pPage == NULL) {
+      return pPage;
+    }

+    taosArrayPush(p->pPageList, &pageId);
    *(int32_t*)pPage = 0;
  } else {
    int32_t* curId = taosArrayGetLast(p->pPageList);
@ -612,6 +615,11 @@ void* getCurrentDataGroupInfo(const SPartitionOperatorInfo* pInfo, SDataGroupInf
      // add a new page for current group
      int32_t pageId = 0;
      pPage = getNewBufPage(pInfo->pBuf, &pageId);
+      if (pPage == NULL) {
+        qError("failed to get new buffer, code:%s", tstrerror(terrno));
+        return NULL;
+      }
+
      taosArrayPush(p->pPageList, &pageId);
      memset(pPage, 0, getBufPageSize(pInfo->pBuf));
    }
--- a/source/libs/executor/src/projectoperator.c
+++ b/source/libs/executor/src/projectoperator.c
@ -175,8 +175,7 @@ static int32_t setInfoForNewGroup(SSDataBlock* pBlock, SLimitInfo* pLimitInfo, S

    // reset the value for a new group data
    // existing rows that belongs to previous group.
-    pLimitInfo->numOfOutputRows = 0;
-    pLimitInfo->remainOffset = pLimitInfo->limit.offset;
+    resetLimitInfoForNextGroup(pLimitInfo);
  }

  return PROJECT_RETRIEVE_DONE;
@ -200,10 +199,18 @@ static int32_t doIngroupLimitOffset(SLimitInfo* pLimitInfo, uint64_t groupId, SS
  if (pLimitInfo->limit.limit >= 0 && pLimitInfo->numOfOutputRows + pBlock->info.rows >= pLimitInfo->limit.limit) {
    int32_t keepRows = (int32_t)(pLimitInfo->limit.limit - pLimitInfo->numOfOutputRows);
    blockDataKeepFirstNRows(pBlock, keepRows);
+
    // TODO: optimize it later when partition by + limit
+    // when all retrieval requirements have been fulfilled, mark the operator as completed
    if ((pLimitInfo->slimit.limit == -1 && pLimitInfo->currentGroupId == 0) ||
        (pLimitInfo->slimit.limit > 0 && pLimitInfo->slimit.limit <= pLimitInfo->numOfOutputGroups)) {
      setOperatorCompleted(pOperator);
+    } else {
+      // Even though the current group is done, data may remain in other vgroups, so we must keep retrieving data
+      // from the next group. Continue the retrieval process.
+      if (keepRows == 0) {
+        return PROJECT_RETRIEVE_CONTINUE;
+      }
    }
  }

@ -357,7 +364,6 @@ SSDataBlock* doProjectOperation(SOperatorInfo* pOperator) {
    pOperator->cost.openCost = (taosGetTimestampUs() - st) / 1000.0;
  }

-  // printDataBlock1(p, "project");
  return (p->info.rows > 0) ? p : NULL;
 }

--- a/source/libs/executor/src/scanoperator.c
+++ b/source/libs/executor/src/scanoperator.c
@ -257,7 +257,7 @@ static void doSetTagColumnData(STableScanBase* pTableScanInfo, SSDataBlock* pBlo
 }

 // todo handle the slimit info
-void applyLimitOffset(SLimitInfo* pLimitInfo, SSDataBlock* pBlock, SExecTaskInfo* pTaskInfo, SOperatorInfo* pOperator) {
+bool applyLimitOffset(SLimitInfo* pLimitInfo, SSDataBlock* pBlock, SExecTaskInfo* pTaskInfo, SOperatorInfo* pOperator) {
  SLimit*     pLimit = &pLimitInfo->limit;
  const char* id = GET_TASKID(pTaskInfo);

@ -266,6 +266,7 @@ void applyLimitOffset(SLimitInfo* pLimitInfo, SSDataBlock* pBlock, SExecTaskInfo
      pLimitInfo->remainOffset -= pBlock->info.rows;
      blockDataEmpty(pBlock);
      qDebug("current block ignore due to offset, current:%" PRId64 ", %s", pLimitInfo->remainOffset, id);
+      return false;
    } else {
      blockDataTrimFirstNRows(pBlock, pLimitInfo->remainOffset);
      pLimitInfo->remainOffset = 0;
@ -274,13 +275,14 @@ void applyLimitOffset(SLimitInfo* pLimitInfo, SSDataBlock* pBlock, SExecTaskInfo

  if (pLimit->limit != -1 && pLimit->limit <= (pLimitInfo->numOfOutputRows + pBlock->info.rows)) {
    // limit the output rows
-    int32_t overflowRows = pLimitInfo->numOfOutputRows + pBlock->info.rows - pLimit->limit;
-    int32_t keep = pBlock->info.rows - overflowRows;
+    int32_t keep = (int32_t)(pLimit->limit - pLimitInfo->numOfOutputRows);

    blockDataKeepFirstNRows(pBlock, keep);
    qDebug("output limit %" PRId64 " has reached, %s", pLimit->limit, id);
-    pOperator->status = OP_EXEC_DONE;
+    return true;
  }
+
+  return false;
 }

 static int32_t loadDataBlock(SOperatorInfo* pOperator, STableScanBase* pTableScanInfo, SSDataBlock* pBlock,
@ -391,7 +393,10 @@ static int32_t loadDataBlock(SOperatorInfo* pOperator, STableScanBase* pTableSca
    }
  }

-  applyLimitOffset(&pTableScanInfo->limitInfo, pBlock, pTaskInfo, pOperator);
+  bool limitReached = applyLimitOffset(&pTableScanInfo->limitInfo, pBlock, pTaskInfo, pOperator);
+  if (limitReached) { // set operator flag is done
+    setOperatorCompleted(pOperator);
+  }

  pCost->totalRows += pBlock->info.rows;
  pTableScanInfo->limitInfo.numOfOutputRows = pCost->totalRows;
@ -768,8 +773,7 @@ static SSDataBlock* doTableScan(SOperatorInfo* pOperator) {

    // reset value for the next group data output
    pOperator->status = OP_OPENED;
-    pInfo->base.limitInfo.numOfOutputRows = 0;
-    pInfo->base.limitInfo.remainOffset = pInfo->base.limitInfo.limit.offset;
+    resetLimitInfoForNextGroup(&pInfo->base.limitInfo);

    int32_t        num = 0;
    STableKeyInfo* pList = NULL;
@ -2685,9 +2689,12 @@ int32_t stopGroupTableMergeScan(SOperatorInfo* pOperator) {
  taosArrayDestroy(pInfo->queryConds);
  pInfo->queryConds = NULL;

+  resetLimitInfoForNextGroup(&pInfo->limitInfo);
  return TSDB_CODE_SUCCESS;
 }

+// All data produced by this function belongs to a single group.
+// slimit/soffset need not be considered here, since this function only deals with data within one group.
 SSDataBlock* getSortedTableMergeScanBlockData(SSortHandle* pHandle, SSDataBlock* pResBlock, int32_t capacity,
                                              SOperatorInfo* pOperator) {
  STableMergeScanInfo* pInfo = pOperator->info;
@ -2707,10 +2714,12 @@ SSDataBlock* getSortedTableMergeScanBlockData(SSortHandle* pHandle, SSDataBlock*
    }
  }

-  qDebug("%s get sorted row blocks, rows:%d", GET_TASKID(pTaskInfo), pResBlock->info.rows);
  applyLimitOffset(&pInfo->limitInfo, pResBlock, pTaskInfo, pOperator);
  pInfo->limitInfo.numOfOutputRows += pResBlock->info.rows;

+  qDebug("%s get sorted row block, rows:%d, limit:%"PRId64, GET_TASKID(pTaskInfo), pResBlock->info.rows,
+      pInfo->limitInfo.numOfOutputRows);
+
  return (pResBlock->info.rows > 0) ? pResBlock : NULL;
 }

@ -2749,11 +2758,13 @@ SSDataBlock* doTableMergeScan(SOperatorInfo* pOperator) {
      pOperator->resultInfo.totalRows += pBlock->info.rows;
      return pBlock;
    } else {
+      // Data of this group are all dumped, let's try the next group
      stopGroupTableMergeScan(pOperator);
      if (pInfo->tableEndIndex >= tableListSize - 1) {
        setOperatorCompleted(pOperator);
        break;
      }
+
      pInfo->tableStartIndex = pInfo->tableEndIndex + 1;
      pInfo->groupId = tableListGetInfo(pTaskInfo->pTableInfoList, pInfo->tableStartIndex)->groupId;
      startGroupTableMergeScan(pOperator);
@ -3222,8 +3233,10 @@ static void buildVnodeGroupedNtbTableCount(STableCountScanOperatorInfo* pInfo, S
  uint64_t groupId = calcGroupId(fullStbName, strlen(fullStbName));
  pRes->info.id.groupId = groupId;
  int64_t ntbNum = metaGetNtbNum(pInfo->readHandle.meta);
+  if (ntbNum != 0) {
    fillTableCountScanDataBlock(pSupp, dbName, "", ntbNum, pRes);
  }
+}

 static void buildVnodeGroupedStbTableCount(STableCountScanOperatorInfo* pInfo, STableCountScanSupp* pSupp,
                                           SSDataBlock* pRes, char* dbName, tb_uid_t stbUid) {
--- a/source/libs/executor/src/sortoperator.c
+++ b/source/libs/executor/src/sortoperator.c
@ -680,13 +680,15 @@ SSDataBlock* getMultiwaySortedBlockData(SSortHandle* pHandle, SSDataBlock* pData
      break;
    }

-    if (p->info.rows > 0) {
-      applyLimitOffset(&pInfo->limitInfo, p, pTaskInfo, pOperator);
+    bool limitReached = applyLimitOffset(&pInfo->limitInfo, p, pTaskInfo, pOperator);
+    if (limitReached) {
+      resetLimitInfoForNextGroup(&pInfo->limitInfo);
+    }
+
    if (p->info.rows > 0) {
      break;
    }
  }
-  }

  if (p->info.rows > 0) {
    int32_t numOfCols = taosArrayGetSize(pColMatchInfo);
@ -698,7 +700,6 @@ SSDataBlock* getMultiwaySortedBlockData(SSortHandle* pHandle, SSDataBlock* pData
      colDataAssign(pDst, pSrc, p->info.rows, &pDataBlock->info);
    }

-    pInfo->limitInfo.numOfOutputRows += p->info.rows;
    pDataBlock->info.rows = p->info.rows;
    pDataBlock->info.id.groupId = pInfo->groupId;
    pDataBlock->info.dataLoad = 1;
--- a/source/libs/executor/src/sysscanoperator.c
+++ b/source/libs/executor/src/sysscanoperator.c
@ -491,6 +491,7 @@ static SSDataBlock* sysTableScanUserTags(SOperatorInfo* pOperator) {
    pInfo->pCur = metaOpenTbCursor(pInfo->readHandle.meta);
  }

+  bool blockFull = false;
  while ((ret = metaTbCursorNext(pInfo->pCur)) == 0) {
    if (pInfo->pCur->mr.me.type != TSDB_CHILD_TABLE) {
      continue;
@ -512,17 +513,24 @@ static SSDataBlock* sysTableScanUserTags(SOperatorInfo* pOperator) {
      T_LONG_JMP(pTaskInfo->env, terrno);
    }

+    if ((smrSuperTable.me.stbEntry.schemaTag.nCols + numOfRows) > pOperator->resultInfo.capacity) {
+      metaTbCursorPrev(pInfo->pCur);
+      blockFull = true;
+    } else {
      sysTableUserTagsFillOneTableTags(pInfo, &smrSuperTable, &pInfo->pCur->mr, dbname, tableName, &numOfRows, dataBlock);
+    }
    
    metaReaderClear(&smrSuperTable);

-    if (numOfRows >= pOperator->resultInfo.capacity) {
+    if (blockFull || numOfRows >= pOperator->resultInfo.capacity) {
      relocateAndFilterSysTagsScanResult(pInfo, numOfRows, dataBlock, pOperator->exprSupp.pFilterInfo);
      numOfRows = 0;

      if (pInfo->pRes->info.rows > 0) {
        break;
      }
+
+      blockFull = false;
    }
  }

--- a/source/libs/executor/src/timewindowoperator.c
+++ b/source/libs/executor/src/timewindowoperator.c
@ -2477,7 +2477,19 @@ static void doStreamIntervalAggImpl(SOperatorInfo* pOperatorInfo, SSDataBlock* p
      pInfo->delKey = key;
    }
    int32_t prevEndPos = (forwardRows - 1) * step + startPos;
-    ASSERT(pSDataBlock->info.window.skey > 0 && pSDataBlock->info.window.ekey > 0);
+    if (pSDataBlock->info.window.skey <= 0 || pSDataBlock->info.window.ekey <= 0) {
+      qError("table uid %" PRIu64 " data block timestamp range may not be calculated! minKey %" PRId64
+             ",maxKey %" PRId64,
+             pSDataBlock->info.id.uid, pSDataBlock->info.window.skey, pSDataBlock->info.window.ekey);
+      blockDataUpdateTsWindow(pSDataBlock, 0);
+
+      // timestamp of the data is incorrect
+      if (pSDataBlock->info.window.skey <= 0 || pSDataBlock->info.window.ekey <= 0) {
+        qError("table uid %" PRIu64 " data block timestamp is out of range! minKey %" PRId64 ",maxKey %" PRId64,
+               pSDataBlock->info.id.uid, pSDataBlock->info.window.skey, pSDataBlock->info.window.ekey);
+      }
+    }
+
    if (IS_FINAL_OP(pInfo)) {
      startPos = getNextQualifiedFinalWindow(&pInfo->interval, &nextWin, &pSDataBlock->info, tsCols, prevEndPos);
    } else {
--- a/source/libs/function/src/builtinsimpl.c
+++ b/source/libs/function/src/builtinsimpl.c
@ -3087,14 +3087,12 @@ static int32_t doSaveTupleData(SSerializeDataHandle* pHandle, const void* pBuf,
    if (pHandle->currentPage == -1) {
      pPage = getNewBufPage(pHandle->pBuf, &pHandle->currentPage);
      if (pPage == NULL) {
-        terrno = TSDB_CODE_NO_AVAIL_DISK;
        return terrno;
      }
      pPage->num = sizeof(SFilePage);
    } else {
      pPage = getBufPage(pHandle->pBuf, pHandle->currentPage);
      if (pPage == NULL) {
-        terrno = TSDB_CODE_NO_AVAIL_DISK;
        return terrno;
      }
      if (pPage->num + length > getBufPageSize(pHandle->pBuf)) {
@ -3102,7 +3100,6 @@ static int32_t doSaveTupleData(SSerializeDataHandle* pHandle, const void* pBuf,
        releaseBufPage(pHandle->pBuf, pPage);
        pPage = getNewBufPage(pHandle->pBuf, &pHandle->currentPage);
        if (pPage == NULL) {
-          terrno = TSDB_CODE_NO_AVAIL_DISK;
          return terrno;
        }
        pPage->num = sizeof(SFilePage);
@ -3149,7 +3146,6 @@ static int32_t doUpdateTupleData(SSerializeDataHandle* pHandle, const void* pBuf
  if (pHandle->pBuf != NULL) {
    SFilePage* pPage = getBufPage(pHandle->pBuf, pPos->pageId);
    if (pPage == NULL) {
-      terrno = TSDB_CODE_NO_AVAIL_DISK;
      return terrno;
    }
    memcpy(pPage->data + pPos->offset, pBuf, length);
--- a/source/libs/function/src/tpercentile.c
+++ b/source/libs/function/src/tpercentile.c
@ -43,8 +43,8 @@ static SFilePage *loadDataFromFilePage(tMemBucket *pMemBucket, int32_t slotIdx)
    if (pg == NULL) {
      return NULL;
    }
-    memcpy(buffer->data + offset, pg->data, (size_t)(pg->num * pMemBucket->bytes));

+    memcpy(buffer->data + offset, pg->data, (size_t)(pg->num * pMemBucket->bytes));
    offset += (int32_t)(pg->num * pMemBucket->bytes);
  }

@ -109,7 +109,7 @@ int32_t findOnlyResult(tMemBucket *pMemBucket, double *result) {
      int32_t   *pageId = taosArrayGet(list, 0);
      SFilePage *pPage = getBufPage(pMemBucket->pBuffer, *pageId);
      if (pPage == NULL) {
-        return TSDB_CODE_NO_AVAIL_DISK;
+        return terrno;
      }
      ASSERT(pPage->num == 1);

@ -276,7 +276,7 @@ tMemBucket *tMemBucketCreate(int16_t nElemSize, int16_t dataType, double minval,
    return NULL;
  }

-  int32_t ret = createDiskbasedBuf(&pBucket->pBuffer, pBucket->bufPageSize, pBucket->bufPageSize * 512, "1", tsTempDir);
+  int32_t ret = createDiskbasedBuf(&pBucket->pBuffer, pBucket->bufPageSize, pBucket->bufPageSize * 1024, "1", tsTempDir);
  if (ret != 0) {
    tMemBucketDestroy(pBucket);
    return NULL;
@ -386,7 +386,7 @@ int32_t tMemBucketPut(tMemBucket *pBucket, const void *data, size_t size) {

      pSlot->info.data = getNewBufPage(pBucket->pBuffer, &pageId);
      if (pSlot->info.data == NULL) {
-        return TSDB_CODE_NO_AVAIL_DISK;
+        return terrno;
      }
      pSlot->info.pageId = pageId;
      taosArrayPush(pPageIdList, &pageId);
@ -480,8 +480,9 @@ int32_t getPercentileImpl(tMemBucket *pMemBucket, int32_t count, double fraction
        // data in buffer and file are merged together to be processed.
        SFilePage *buffer = loadDataFromFilePage(pMemBucket, i);
        if (buffer == NULL) {
-          return TSDB_CODE_NO_AVAIL_DISK;
+          return terrno;
        }
+
        int32_t    currentIdx = count - num;

        char *thisVal = buffer->data + pMemBucket->bytes * currentIdx;
@ -518,7 +519,7 @@ int32_t getPercentileImpl(tMemBucket *pMemBucket, int32_t count, double fraction
          int32_t *pageId = taosArrayGet(list, f);
          SFilePage *pg = getBufPage(pMemBucket->pBuffer, *pageId);
          if (pg == NULL) {
-            return TSDB_CODE_NO_AVAIL_DISK;
+            return terrno;
          }

          int32_t code = tMemBucketPut(pMemBucket, pg->data, (int32_t)pg->num);
--- a/source/libs/parser/src/parInsertSql.c
+++ b/source/libs/parser/src/parInsertSql.c
@ -1760,6 +1760,9 @@ static int32_t getTableSchemaFromMetaData(SInsertParseContext* pCxt, const SMeta
  if (TSDB_CODE_SUCCESS == code && !isStb && TSDB_SUPER_TABLE == pStmt->pTableMeta->tableType) {
    code = buildInvalidOperationMsg(&pCxt->msg, "insert data into super table is not supported");
  }
+  if (TSDB_CODE_SUCCESS == code && isStb) {
+    code = storeTableMeta(pCxt, pStmt);
+  }
  if (TSDB_CODE_SUCCESS == code) {
    code = getTableVgroupFromMetaData(pMetaData->pTableHash, pStmt, isStb);
  }
--- a/source/libs/parser/src/parInsertStmt.c
+++ b/source/libs/parser/src/parInsertStmt.c
@ -425,6 +425,27 @@ int32_t qCloneStmtDataBlock(void** pDst, void* pSrc) {
    pBlock->pTableMeta = pNewMeta;
  }

+  if (pBlock->boundColumnInfo.boundColumns) {
+    int32_t size = pBlock->boundColumnInfo.numOfCols * sizeof(col_id_t);
+    void* tmp = taosMemoryMalloc(size);
+    memcpy(tmp, pBlock->boundColumnInfo.boundColumns, size);
+    pBlock->boundColumnInfo.boundColumns = tmp;
+  }
+
+  if (pBlock->boundColumnInfo.cols) {
+    int32_t size = pBlock->boundColumnInfo.numOfCols * sizeof(SBoundColumn);
+    void* tmp = taosMemoryMalloc(size);
+    memcpy(tmp, pBlock->boundColumnInfo.cols, size);
+    pBlock->boundColumnInfo.cols = tmp;
+  }
+
+  if (pBlock->boundColumnInfo.colIdxInfo) {
+    int32_t size = pBlock->boundColumnInfo.numOfBound * sizeof(SBoundIdxInfo);
+    void* tmp = taosMemoryMalloc(size);
+    memcpy(tmp, pBlock->boundColumnInfo.colIdxInfo, size);
+    pBlock->boundColumnInfo.colIdxInfo = tmp;
+  }
+
  return qResetStmtDataBlock(*pDst, false);
 }

@ -437,7 +458,7 @@ int32_t qRebuildStmtDataBlock(void** pDst, void* pSrc, uint64_t uid, int32_t vgI
  STableDataBlocks* pBlock = (STableDataBlocks*)*pDst;
  pBlock->pData = taosMemoryMalloc(pBlock->nAllocSize);
  if (NULL == pBlock->pData) {
-    qFreeStmtDataBlock(pBlock);
+    qDestroyStmtDataBlock(pBlock);
    return TSDB_CODE_OUT_OF_MEMORY;
  }

--- a/source/libs/planner/src/planOptimizer.c
+++ b/source/libs/planner/src/planOptimizer.c
@ -1080,29 +1080,29 @@ static bool sortPriKeyOptMayBeOptimized(SLogicNode* pNode) {
    return false;
  }
  SSortLogicNode* pSort = (SSortLogicNode*)pNode;
-  if (pSort->groupSort || !sortPriKeyOptIsPriKeyOrderBy(pSort->pSortKeys) || 1 != LIST_LENGTH(pSort->node.pChildren)) {
+  if (!sortPriKeyOptIsPriKeyOrderBy(pSort->pSortKeys) || 1 != LIST_LENGTH(pSort->node.pChildren)) {
    return false;
  }
  return true;
 }

-static int32_t sortPriKeyOptGetSequencingNodesImpl(SLogicNode* pNode, bool* pNotOptimize,
+static int32_t sortPriKeyOptGetSequencingNodesImpl(SLogicNode* pNode, bool groupSort, bool* pNotOptimize,
                                                   SNodeList** pSequencingNodes) {
  switch (nodeType(pNode)) {
    case QUERY_NODE_LOGIC_PLAN_SCAN: {
      SScanLogicNode* pScan = (SScanLogicNode*)pNode;
-      if (NULL != pScan->pGroupTags || TSDB_SYSTEM_TABLE == pScan->tableType) {
+      if ((!groupSort && NULL != pScan->pGroupTags) || TSDB_SYSTEM_TABLE == pScan->tableType) {
        *pNotOptimize = true;
        return TSDB_CODE_SUCCESS;
      }
      return nodesListMakeAppend(pSequencingNodes, (SNode*)pNode);
    }
    case QUERY_NODE_LOGIC_PLAN_JOIN: {
-      int32_t code = sortPriKeyOptGetSequencingNodesImpl((SLogicNode*)nodesListGetNode(pNode->pChildren, 0),
+      int32_t code = sortPriKeyOptGetSequencingNodesImpl((SLogicNode*)nodesListGetNode(pNode->pChildren, 0), groupSort,
                                                         pNotOptimize, pSequencingNodes);
      if (TSDB_CODE_SUCCESS == code) {
-        code = sortPriKeyOptGetSequencingNodesImpl((SLogicNode*)nodesListGetNode(pNode->pChildren, 1), pNotOptimize,
-                                                   pSequencingNodes);
+        code = sortPriKeyOptGetSequencingNodesImpl((SLogicNode*)nodesListGetNode(pNode->pChildren, 1), groupSort,
+                                                   pNotOptimize, pSequencingNodes);
      }
      return code;
    }
@ -1121,13 +1121,13 @@ static int32_t sortPriKeyOptGetSequencingNodesImpl(SLogicNode* pNode, bool* pNot
    return TSDB_CODE_SUCCESS;
  }

-  return sortPriKeyOptGetSequencingNodesImpl((SLogicNode*)nodesListGetNode(pNode->pChildren, 0), pNotOptimize,
-                                             pSequencingNodes);
+  return sortPriKeyOptGetSequencingNodesImpl((SLogicNode*)nodesListGetNode(pNode->pChildren, 0), groupSort,
+                                             pNotOptimize, pSequencingNodes);
 }

-static int32_t sortPriKeyOptGetSequencingNodes(SLogicNode* pNode, SNodeList** pSequencingNodes) {
+static int32_t sortPriKeyOptGetSequencingNodes(SLogicNode* pNode, bool groupSort, SNodeList** pSequencingNodes) {
  bool    notOptimize = false;
-  int32_t code = sortPriKeyOptGetSequencingNodesImpl(pNode, &notOptimize, pSequencingNodes);
+  int32_t code = sortPriKeyOptGetSequencingNodesImpl(pNode, groupSort, &notOptimize, pSequencingNodes);
  if (TSDB_CODE_SUCCESS != code || notOptimize) {
    NODES_CLEAR_LIST(*pSequencingNodes);
  }
@ -1175,8 +1175,8 @@ static int32_t sortPriKeyOptApply(SOptimizeContext* pCxt, SLogicSubplan* pLogicS

 static int32_t sortPrimaryKeyOptimizeImpl(SOptimizeContext* pCxt, SLogicSubplan* pLogicSubplan, SSortLogicNode* pSort) {
  SNodeList* pSequencingNodes = NULL;
-  int32_t    code =
-      sortPriKeyOptGetSequencingNodes((SLogicNode*)nodesListGetNode(pSort->node.pChildren, 0), &pSequencingNodes);
+  int32_t    code = sortPriKeyOptGetSequencingNodes((SLogicNode*)nodesListGetNode(pSort->node.pChildren, 0),
+                                                    pSort->groupSort, &pSequencingNodes);
  if (TSDB_CODE_SUCCESS == code && NULL != pSequencingNodes) {
    code = sortPriKeyOptApply(pCxt, pLogicSubplan, pSort, pSequencingNodes);
  }
--- a/source/libs/qworker/inc/qwInt.h
+++ b/source/libs/qworker/inc/qwInt.h
@ -194,6 +194,8 @@ typedef struct SQWorker {
  SMsgCb    msgCb;
  SQWStat   stat;
  int32_t  *destroyed;
+
+  int8_t    nodeStopped;
 } SQWorker;

 typedef struct SQWorkerMgmt {
@ -228,9 +230,14 @@ typedef struct SQWorkerMgmt {
      case QW_PHASE_POST_FETCH:                                              \
        ctx->inFetch = 0;                                                    \
        break;                                                               \
-      default:                                                               \
+      case QW_PHASE_PRE_QUERY:                                               \
+      case QW_PHASE_POST_QUERY:                                              \
+      case QW_PHASE_PRE_CQUERY:                                              \
+      case QW_PHASE_POST_CQUERY:                                             \
        atomic_store_8(&(ctx)->phase, _value);                               \
        break;                                                               \
+      default:                                                               \
+        break;                                                               \
    }                                                                        \
  } while (0)

--- a/source/libs/qworker/src/qwUtil.c
+++ b/source/libs/qworker/src/qwUtil.c
@ -213,9 +213,15 @@ int32_t qwAcquireTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx) {
  QW_SET_QTID(id, qId, tId, eId);

  *ctx = taosHashAcquire(mgmt->ctxHash, id, sizeof(id));
+  int8_t nodeStopped = atomic_load_8(&mgmt->nodeStopped);
  if (NULL == (*ctx)) {
+    if (!nodeStopped) {
      QW_TASK_DLOG_E("task ctx not exist, may be dropped");
      QW_ERR_RET(TSDB_CODE_QRY_TASK_CTX_NOT_EXIST);
+    } else {
+      QW_TASK_DLOG_E("node stopped");
+      QW_ERR_RET(TSDB_CODE_VND_STOPPED);
+    }
  }

  return TSDB_CODE_SUCCESS;
@ -226,9 +232,16 @@ int32_t qwGetTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx) {
  QW_SET_QTID(id, qId, tId, eId);

  *ctx = taosHashGet(mgmt->ctxHash, id, sizeof(id));
+  int8_t nodeStopped = atomic_load_8(&mgmt->nodeStopped);
+
  if (NULL == (*ctx)) {
+    if (!nodeStopped) {
      QW_TASK_DLOG_E("task ctx not exist, may be dropped");
      QW_ERR_RET(TSDB_CODE_QRY_TASK_CTX_NOT_EXIST);
+    } else {
+      QW_TASK_DLOG_E("node stopped");
+      QW_ERR_RET(TSDB_CODE_VND_STOPPED);
+    }
  }

  return TSDB_CODE_SUCCESS;
--- a/source/libs/qworker/src/qworker.c
+++ b/source/libs/qworker/src/qworker.c
@ -551,7 +551,9 @@ _return:
  if (ctx) {
    QW_UPDATE_RSP_CODE(ctx, code);

+    if (QW_PHASE_POST_CQUERY != phase) {
      QW_SET_PHASE(ctx, phase);
+    }

    QW_UNLOCK(QW_WRITE, &ctx->lock);
    qwReleaseTaskCtx(mgmt, ctx);
@ -758,7 +760,7 @@ int32_t qwProcessCQuery(QW_FPARAMS_DEF, SQWMsg *qwMsg) {
    QW_LOCK(QW_WRITE, &ctx->lock);
    if (qComplete || (queryStop && (0 == atomic_load_8((int8_t *)&ctx->queryContinue))) || code) {
      // Note: query is not running anymore
-      QW_SET_PHASE(ctx, 0);
+      QW_SET_PHASE(ctx, QW_PHASE_POST_CQUERY);
      QW_UNLOCK(QW_WRITE, &ctx->lock);
      break;
    }
@ -1186,6 +1188,9 @@ void qWorkerStopAllTasks(void *qWorkerMgmt) {
  uint64_t qId, tId, sId;
  int32_t  eId;
  int64_t  rId = 0;
+  
+  atomic_store_8(&mgmt->nodeStopped, 1);
+
  void    *pIter = taosHashIterate(mgmt->ctxHash, NULL);
  while (pIter) {
    SQWTaskCtx *ctx = (SQWTaskCtx *)pIter;
--- a/source/libs/stream/src/streamMeta.c
+++ b/source/libs/stream/src/streamMeta.c
@ -207,6 +207,7 @@ void streamMetaRemoveTask(SStreamMeta* pMeta, int32_t taskId) {
  if (ppTask) {
    SStreamTask* pTask = *ppTask;
    taosHashRemove(pMeta->pTasks, &taskId, sizeof(int32_t));
+    tdbTbDelete(pMeta->pTaskDb, &taskId, sizeof(int32_t), pMeta->txn);
    /*if (pTask->timer) {
     * taosTmrStop(pTask->timer);*/
    /*pTask->timer = NULL;*/
--- a/source/libs/stream/src/streamState.c
+++ b/source/libs/stream/src/streamState.c
@ -192,7 +192,7 @@ void streamStateClose(SStreamState* pState) {
 }

 int32_t streamStateBegin(SStreamState* pState) {
-  if (tdbBegin(pState->pTdbState->db, &pState->pTdbState->txn, tdbDefaultMalloc, tdbDefaultFree, NULL,
+  if (tdbBegin(pState->pTdbState->db, &pState->pTdbState->txn, NULL, NULL, NULL,
               TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED) < 0) {
    tdbAbort(pState->pTdbState->db, pState->pTdbState->txn);
    return -1;
@ -208,7 +208,7 @@ int32_t streamStateCommit(SStreamState* pState) {
    return -1;
  }

-  if (tdbBegin(pState->pTdbState->db, &pState->pTdbState->txn, tdbDefaultMalloc, tdbDefaultFree, NULL,
+  if (tdbBegin(pState->pTdbState->db, &pState->pTdbState->txn, NULL, NULL, NULL,
               TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED) < 0) {
    return -1;
  }
@ -220,7 +220,7 @@ int32_t streamStateAbort(SStreamState* pState) {
    return -1;
  }

-  if (tdbBegin(pState->pTdbState->db, &pState->pTdbState->txn, tdbDefaultMalloc, tdbDefaultFree, NULL,
+  if (tdbBegin(pState->pTdbState->db, &pState->pTdbState->txn, NULL, NULL, NULL,
               TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED) < 0) {
    return -1;
  }
--- a/source/libs/sync/src/syncAppendEntries.c
+++ b/source/libs/sync/src/syncAppendEntries.c
@ -89,45 +89,6 @@
 //       /\ UNCHANGED <<candidateVars, leaderVars>>
 //

-int32_t syncNodeFollowerCommit(SSyncNode* ths, SyncIndex newCommitIndex) {
-  ASSERT(false && "deprecated");
-  if (ths->state != TAOS_SYNC_STATE_FOLLOWER) {
-    sNTrace(ths, "can not do follower commit");
-    return -1;
-  }
-
-  // maybe update commit index, leader notice me
-  if (newCommitIndex > ths->commitIndex) {
-    // has commit entry in local
-    if (newCommitIndex <= ths->pLogStore->syncLogLastIndex(ths->pLogStore)) {
-      // advance commit index to sanpshot first
-      SSnapshot snapshot;
-      ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &snapshot);
-      if (snapshot.lastApplyIndex >= 0 && snapshot.lastApplyIndex > ths->commitIndex) {
-        SyncIndex commitBegin = ths->commitIndex;
-        SyncIndex commitEnd = snapshot.lastApplyIndex;
-        ths->commitIndex = snapshot.lastApplyIndex;
-        sNTrace(ths, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, commitBegin, commitEnd);
-      }
-
-      SyncIndex beginIndex = ths->commitIndex + 1;
-      SyncIndex endIndex = newCommitIndex;
-
-      // update commit index
-      ths->commitIndex = newCommitIndex;
-
-      // call back Wal
-      int32_t code = ths->pLogStore->syncLogUpdateCommitIndex(ths->pLogStore, ths->commitIndex);
-      ASSERT(code == 0);
-
-      code = syncNodeDoCommit(ths, beginIndex, endIndex, ths->state);
-      ASSERT(code == 0);
-    }
-  }
-
-  return 0;
-}
-
 SSyncRaftEntry* syncBuildRaftEntryFromAppendEntries(const SyncAppendEntries* pMsg) {
  SSyncRaftEntry* pEntry = taosMemoryMalloc(pMsg->dataLen);
  if (pEntry == NULL) {
@ -232,256 +193,3 @@ _IGNORE:
  rpcFreeCont(rpcRsp.pCont);
  return 0;
 }
-
-int32_t syncNodeOnAppendEntriesOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
-  SyncAppendEntries* pMsg = pRpcMsg->pCont;
-  SRpcMsg            rpcRsp = {0};
-
-  // if already drop replica, do not process
-  if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) {
-    syncLogRecvAppendEntries(ths, pMsg, "not in my config");
-    goto _IGNORE;
-  }
-
-  // prepare response msg
-  int32_t code = syncBuildAppendEntriesReply(&rpcRsp, ths->vgId);
-  if (code != 0) {
-    syncLogRecvAppendEntries(ths, pMsg, "build rsp error");
-    goto _IGNORE;
-  }
-
-  SyncAppendEntriesReply* pReply = rpcRsp.pCont;
-  pReply->srcId = ths->myRaftId;
-  pReply->destId = pMsg->srcId;
-  pReply->term = ths->raftStore.currentTerm;
-  pReply->success = false;
-  // pReply->matchIndex = ths->pLogStore->syncLogLastIndex(ths->pLogStore);
-  pReply->matchIndex = SYNC_INDEX_INVALID;
-  pReply->lastSendIndex = pMsg->prevLogIndex + 1;
-  pReply->startTime = ths->startTime;
-
-  if (pMsg->term < ths->raftStore.currentTerm) {
-    syncLogRecvAppendEntries(ths, pMsg, "reject, small term");
-    goto _SEND_RESPONSE;
-  }
-
-  if (pMsg->term > ths->raftStore.currentTerm) {
-    pReply->term = pMsg->term;
-  }
-
-  syncNodeStepDown(ths, pMsg->term);
-  syncNodeResetElectTimer(ths);
-
-  SyncIndex startIndex = ths->pLogStore->syncLogBeginIndex(ths->pLogStore);
-  SyncIndex lastIndex = ths->pLogStore->syncLogLastIndex(ths->pLogStore);
-
-  if (pMsg->prevLogIndex > lastIndex) {
-    syncLogRecvAppendEntries(ths, pMsg, "reject, index not match");
-    goto _SEND_RESPONSE;
-  }
-
-  if (pMsg->prevLogIndex >= startIndex) {
-    SyncTerm myPreLogTerm = syncNodeGetPreTerm(ths, pMsg->prevLogIndex + 1);
-    // ASSERT(myPreLogTerm != SYNC_TERM_INVALID);
-    if (myPreLogTerm == SYNC_TERM_INVALID) {
-      syncLogRecvAppendEntries(ths, pMsg, "reject, pre-term invalid");
-      goto _SEND_RESPONSE;
-    }
-
-    if (myPreLogTerm != pMsg->prevLogTerm) {
-      syncLogRecvAppendEntries(ths, pMsg, "reject, pre-term not match");
-      goto _SEND_RESPONSE;
-    }
-  }
-
-  // accept
-  pReply->success = true;
-  bool hasAppendEntries = pMsg->dataLen > 0;
-  if (hasAppendEntries) {
-    SSyncRaftEntry* pAppendEntry = syncEntryBuildFromAppendEntries(pMsg);
-    ASSERT(pAppendEntry != NULL);
-
-    SyncIndex appendIndex = pMsg->prevLogIndex + 1;
-
-    LRUHandle* hLocal = NULL;
-    LRUHandle* hAppend = NULL;
-
-    int32_t         code = 0;
-    SSyncRaftEntry* pLocalEntry = NULL;
-    SLRUCache*      pCache = ths->pLogStore->pCache;
-    hLocal = taosLRUCacheLookup(pCache, &appendIndex, sizeof(appendIndex));
-    if (hLocal) {
-      pLocalEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, hLocal);
-      code = 0;
-
-      ths->pLogStore->cacheHit++;
-      sNTrace(ths, "hit cache index:%" PRId64 ", bytes:%u, %p", appendIndex, pLocalEntry->bytes, pLocalEntry);
-
-    } else {
-      ths->pLogStore->cacheMiss++;
-      sNTrace(ths, "miss cache index:%" PRId64, appendIndex);
-
-      code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, appendIndex, &pLocalEntry);
-    }
-
-    if (code == 0) {
-      // get local entry success
-
-      if (pLocalEntry->term == pAppendEntry->term) {
-        // do nothing
-        sNTrace(ths, "log match, do nothing, index:%" PRId64, appendIndex);
-
-      } else {
-        // truncate
-        code = ths->pLogStore->syncLogTruncate(ths->pLogStore, appendIndex);
-        if (code != 0) {
-          char logBuf[128];
-          snprintf(logBuf, sizeof(logBuf), "ignore, truncate error, append-index:%" PRId64, appendIndex);
-          syncLogRecvAppendEntries(ths, pMsg, logBuf);
-
-          if (hLocal) {
-            taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false);
-          } else {
-            syncEntryDestroy(pLocalEntry);
-          }
-
-          if (hAppend) {
-            taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false);
-          } else {
-            syncEntryDestroy(pAppendEntry);
-          }
-
-          goto _IGNORE;
-        }
-
-        ASSERT(pAppendEntry->index == appendIndex);
-
-        // append
-        code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pAppendEntry);
-        if (code != 0) {
-          char logBuf[128];
-          snprintf(logBuf, sizeof(logBuf), "ignore, append error, append-index:%" PRId64, appendIndex);
-          syncLogRecvAppendEntries(ths, pMsg, logBuf);
-
-          if (hLocal) {
-            taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false);
-          } else {
-            syncEntryDestroy(pLocalEntry);
-          }
-
-          if (hAppend) {
-            taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false);
-          } else {
-            syncEntryDestroy(pAppendEntry);
-          }
-
-          goto _IGNORE;
-        }
-
-        syncCacheEntry(ths->pLogStore, pAppendEntry, &hAppend);
-      }
-
-    } else {
-      if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) {
-        // log not exist
-
-        // truncate
-        code = ths->pLogStore->syncLogTruncate(ths->pLogStore, appendIndex);
-        if (code != 0) {
-          char logBuf[128];
-          snprintf(logBuf, sizeof(logBuf), "ignore, log not exist, truncate error, append-index:%" PRId64, appendIndex);
-          syncLogRecvAppendEntries(ths, pMsg, logBuf);
-
-          syncEntryDestroy(pLocalEntry);
-          syncEntryDestroy(pAppendEntry);
-          goto _IGNORE;
-        }
-
-        // append
-        code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pAppendEntry);
-        if (code != 0) {
-          char logBuf[128];
-          snprintf(logBuf, sizeof(logBuf), "ignore, log not exist, append error, append-index:%" PRId64, appendIndex);
-          syncLogRecvAppendEntries(ths, pMsg, logBuf);
-
-          if (hLocal) {
-            taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false);
-          } else {
-            syncEntryDestroy(pLocalEntry);
-          }
-
-          if (hAppend) {
-            taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false);
-          } else {
-            syncEntryDestroy(pAppendEntry);
-          }
-
-          goto _IGNORE;
-        }
-
-        syncCacheEntry(ths->pLogStore, pAppendEntry, &hAppend);
-
-      } else {
-        // get local entry success
-        char logBuf[128];
-        snprintf(logBuf, sizeof(logBuf), "ignore, get local entry error, append-index:%" PRId64 " err:%d", appendIndex,
-                 terrno);
-        syncLogRecvAppendEntries(ths, pMsg, logBuf);
-
-        if (hLocal) {
-          taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false);
-        } else {
-          syncEntryDestroy(pLocalEntry);
-        }
-
-        if (hAppend) {
-          taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false);
-        } else {
-          syncEntryDestroy(pAppendEntry);
-        }
-
-        goto _IGNORE;
-      }
-    }
-
-    // update match index
-    pReply->matchIndex = pAppendEntry->index;
-
-    if (hLocal) {
-      taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false);
-    } else {
-      syncEntryDestroy(pLocalEntry);
-    }
-
-    if (hAppend) {
-      taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false);
-    } else {
-      syncEntryDestroy(pAppendEntry);
-    }
-
-  } else {
-    // no append entries, do nothing
-    // maybe has extra entries, no harm
-
-    // update match index
-    pReply->matchIndex = pMsg->prevLogIndex;
-  }
-
-  // maybe update commit index, leader notice me
-  syncNodeFollowerCommit(ths, pMsg->commitIndex);
-
-  syncLogRecvAppendEntries(ths, pMsg, "accept");
-  goto _SEND_RESPONSE;
-
-_IGNORE:
-  rpcFreeCont(rpcRsp.pCont);
-  return 0;
-
-_SEND_RESPONSE:
-  // msg event log
-  syncLogSendAppendEntriesReply(ths, pReply, "");
-
-  // send response
-  syncNodeSendMsgById(&pReply->destId, ths, &rpcRsp);
-  return 0;
-}
--- a/source/libs/sync/src/syncAppendEntriesReply.c
+++ b/source/libs/sync/src/syncAppendEntriesReply.c
@ -89,63 +89,3 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  }
  return 0;
 }
-
-int32_t syncNodeOnAppendEntriesReplyOld(SSyncNode* ths, SyncAppendEntriesReply* pMsg) {
-  int32_t ret = 0;
-
-  // if already drop replica, do not process
-  if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) {
-    syncLogRecvAppendEntriesReply(ths, pMsg, "not in my config");
-    return 0;
-  }
-
-  // drop stale response
-  if (pMsg->term < ths->raftStore.currentTerm) {
-    syncLogRecvAppendEntriesReply(ths, pMsg, "drop stale response");
-    return 0;
-  }
-
-  if (ths->state == TAOS_SYNC_STATE_LEADER) {
-    if (pMsg->term > ths->raftStore.currentTerm) {
-      syncLogRecvAppendEntriesReply(ths, pMsg, "error term");
-      syncNodeStepDown(ths, pMsg->term);
-      return -1;
-    }
-
-    ASSERT(pMsg->term == ths->raftStore.currentTerm);
-
-    if (pMsg->success) {
-      SyncIndex oldMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId));
-      if (pMsg->matchIndex > oldMatchIndex) {
-        syncIndexMgrSetIndex(ths->pMatchIndex, &(pMsg->srcId), pMsg->matchIndex);
-        syncMaybeAdvanceCommitIndex(ths);
-
-        // maybe update minMatchIndex
-        ths->minMatchIndex = syncMinMatchIndex(ths);
-      }
-      syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), pMsg->matchIndex + 1);
-
-    } else {
-      SyncIndex nextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId));
-      if (nextIndex > SYNC_INDEX_BEGIN) {
-        --nextIndex;
-      }
-      syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), nextIndex);
-    }
-
-    // send next append entries
-    SPeerState* pState = syncNodeGetPeerState(ths, &(pMsg->srcId));
-    ASSERT(pState != NULL);
-
-    if (pMsg->lastSendIndex == pState->lastSendIndex) {
-      int64_t timeNow = taosGetTimestampMs();
-      int64_t elapsed = timeNow - pState->lastSendTime;
-      sNTrace(ths, "sync-append-entries rtt elapsed:%" PRId64 ", index:%" PRId64, elapsed, pState->lastSendIndex);
-
-      syncNodeReplicateOne(ths, &(pMsg->srcId), true);
-    }
-  }
-
-  syncLogRecvAppendEntriesReply(ths, pMsg, "process");
-  return 0;
-}
--- a/source/libs/sync/src/syncCommit.c
+++ b/source/libs/sync/src/syncCommit.c
@ -43,148 +43,6 @@
 //        IN commitIndex' = [commitIndex EXCEPT ![i] = newCommitIndex]
 //     /\ UNCHANGED <<messages, serverVars, candidateVars, leaderVars, log>>
 //
-void syncOneReplicaAdvance(SSyncNode* pSyncNode) {
-  ASSERT(false && "deprecated");
-  if (pSyncNode == NULL) {
-    sError("pSyncNode is NULL");
-    return;
-  }
-
-  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
-    sNError(pSyncNode, "not leader, can not advance commit index");
-    return;
-  }
-
-  if (pSyncNode->replicaNum != 1) {
-    sNError(pSyncNode, "not one replica, can not advance commit index");
-    return;
-  }
-
-  // advance commit index to snapshot first
-  SSnapshot snapshot;
-  pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
-  if (snapshot.lastApplyIndex > 0 && snapshot.lastApplyIndex > pSyncNode->commitIndex) {
-    SyncIndex commitBegin = pSyncNode->commitIndex;
-    SyncIndex commitEnd = snapshot.lastApplyIndex;
-    pSyncNode->commitIndex = snapshot.lastApplyIndex;
-    sNTrace(pSyncNode, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, commitBegin, commitEnd);
-  }
-
-  // advance commit index as large as possible
-  SyncIndex lastIndex = syncNodeGetLastIndex(pSyncNode);
-  if (lastIndex > pSyncNode->commitIndex) {
-    sNTrace(pSyncNode, "commit by wal from index:%" PRId64 " to index:%" PRId64, pSyncNode->commitIndex + 1, lastIndex);
-    pSyncNode->commitIndex = lastIndex;
-  }
-
-  // call back Wal
-  SyncIndex walCommitVer = logStoreWalCommitVer(pSyncNode->pLogStore);
-  if (pSyncNode->commitIndex > walCommitVer) {
-    pSyncNode->pLogStore->syncLogUpdateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex);
-  }
-}
-
-void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) {
-  ASSERTS(false, "deprecated");
-  if (pSyncNode == NULL) {
-    sError("pSyncNode is NULL");
-    return;
-  }
-
-  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
-    sNError(pSyncNode, "not leader, can not advance commit index");
-    return;
-  }
-
-  // advance commit index to sanpshot first
-  SSnapshot snapshot;
-  pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot);
-  if (snapshot.lastApplyIndex > 0 && snapshot.lastApplyIndex > pSyncNode->commitIndex) {
-    SyncIndex commitBegin = pSyncNode->commitIndex;
-    SyncIndex commitEnd = snapshot.lastApplyIndex;
-    pSyncNode->commitIndex = snapshot.lastApplyIndex;
-    sNTrace(pSyncNode, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, commitBegin, commitEnd);
-  }
-
-  // update commit index
-  SyncIndex newCommitIndex = pSyncNode->commitIndex;
-  for (SyncIndex index = syncNodeGetLastIndex(pSyncNode); index > pSyncNode->commitIndex; --index) {
-    bool agree = syncAgree(pSyncNode, index);
-
-    if (agree) {
-      // term
-      SSyncRaftEntry* pEntry = NULL;
-      SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
-      LRUHandle*      h = taosLRUCacheLookup(pCache, &index, sizeof(index));
-      if (h) {
-        pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
-
-        pSyncNode->pLogStore->cacheHit++;
-        sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", index, pEntry->bytes, pEntry);
-
-      } else {
-        pSyncNode->pLogStore->cacheMiss++;
-        sNTrace(pSyncNode, "miss cache index:%" PRId64, index);
-
-        int32_t code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry);
-        if (code != 0) {
-          sNError(pSyncNode, "advance commit index error, read wal index:%" PRId64, index);
-          return;
-        }
-      }
-      // cannot commit, even if quorum agree. need check term!
-      if (pEntry->term <= pSyncNode->raftStore.currentTerm) {
-        // update commit index
-        newCommitIndex = index;
-
-        if (h) {
-          taosLRUCacheRelease(pCache, h, false);
-        } else {
-          syncEntryDestroy(pEntry);
-        }
-
-        break;
-      } else {
-        sNTrace(pSyncNode, "can not commit due to term not equal, index:%" PRId64 ", term:%" PRIu64, pEntry->index,
-                pEntry->term);
-      }
-
-      if (h) {
-        taosLRUCacheRelease(pCache, h, false);
-      } else {
-        syncEntryDestroy(pEntry);
-      }
-    }
-  }
-
-  // advance commit index as large as possible
-  SyncIndex walCommitVer = logStoreWalCommitVer(pSyncNode->pLogStore);
-  if (walCommitVer > newCommitIndex) {
-    newCommitIndex = walCommitVer;
-  }
-
-  // maybe execute fsm
-  if (newCommitIndex > pSyncNode->commitIndex) {
-    SyncIndex beginIndex = pSyncNode->commitIndex + 1;
-    SyncIndex endIndex = newCommitIndex;
-
-    // update commit index
-    pSyncNode->commitIndex = newCommitIndex;
-
-    // call back Wal
-    pSyncNode->pLogStore->syncLogUpdateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex);
-
-    // execute fsm
-    if (pSyncNode != NULL && pSyncNode->pFsm != NULL) {
-      int32_t code = syncNodeDoCommit(pSyncNode, beginIndex, endIndex, pSyncNode->state);
-      if (code != 0) {
-        sNError(pSyncNode, "advance commit index error, do commit begin:%" PRId64 ", end:%" PRId64, beginIndex,
-                endIndex);
-        return;
-      }
-    }
-  }
-}

 bool syncAgreeIndex(SSyncNode* pSyncNode, SRaftId* pRaftId, SyncIndex index) {
  // I am leader, I agree
@ -210,83 +68,7 @@ static inline int64_t syncNodeAbs64(int64_t a, int64_t b) {
  return c;
 }

-int32_t syncNodeDynamicQuorum(const SSyncNode* pSyncNode) {
-  return pSyncNode->quorum;
-
-#if 0
-  int32_t quorum = 1;  // self
-
-  int64_t timeNow = taosGetTimestampMs();
-  for (int i = 0; i < pSyncNode->peersNum; ++i) {
-    int64_t   peerStartTime = syncIndexMgrGetStartTime(pSyncNode->pNextIndex, &(pSyncNode->peersId)[i]);
-    int64_t   peerRecvTime = syncIndexMgrGetRecvTime(pSyncNode->pNextIndex, &(pSyncNode->peersId)[i]);
-    SyncIndex peerMatchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId)[i]);
-
-    int64_t recvTimeDiff = TABS(peerRecvTime - timeNow);
-    int64_t startTimeDiff = TABS(peerStartTime - pSyncNode->startTime);
-    int64_t logDiff = TABS(peerMatchIndex - syncNodeGetLastIndex(pSyncNode));
-
-    /*
-        int64_t recvTimeDiff = syncNodeAbs64(peerRecvTime, timeNow);
-        int64_t startTimeDiff = syncNodeAbs64(peerStartTime, pSyncNode->startTime);
-        int64_t logDiff = syncNodeAbs64(peerMatchIndex, syncNodeGetLastIndex(pSyncNode));
-    */
-
-    int32_t addQuorum = 0;
-
-    if (recvTimeDiff < SYNC_MAX_RECV_TIME_RANGE_MS) {
-      if (startTimeDiff < SYNC_MAX_START_TIME_RANGE_MS) {
-        addQuorum = 1;
-      } else {
-        if (logDiff < SYNC_ADD_QUORUM_COUNT) {
-          addQuorum = 1;
-        } else {
-          addQuorum = 0;
-        }
-      }
-    } else {
-      addQuorum = 0;
-    }
-
-    /*
-        if (recvTimeDiff < SYNC_MAX_RECV_TIME_RANGE_MS) {
-          addQuorum = 1;
-        } else {
-          addQuorum = 0;
-        }
-
-        if (startTimeDiff > SYNC_MAX_START_TIME_RANGE_MS) {
-          addQuorum = 0;
-        }
-    */
-
-    quorum += addQuorum;
-  }
-
-  ASSERT(quorum <= pSyncNode->replicaNum);
-
-  if (quorum < pSyncNode->quorum) {
-    quorum = pSyncNode->quorum;
-  }
-
-  return quorum;
-#endif
-}
-
-/*
-bool syncAgree(SSyncNode* pSyncNode, SyncIndex index) {
-  int agreeCount = 0;
-  for (int i = 0; i < pSyncNode->replicaNum; ++i) {
-    if (syncAgreeIndex(pSyncNode, &(pSyncNode->replicasId[i]), index)) {
-      ++agreeCount;
-    }
-    if (agreeCount >= syncNodeDynamicQuorum(pSyncNode)) {
-      return true;
-    }
-  }
-  return false;
-}
-*/
+int32_t syncNodeDynamicQuorum(const SSyncNode* pSyncNode) { return pSyncNode->quorum; }

 bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index) {
  int            count = 0;
--- a/source/libs/sync/src/syncElection.c
+++ b/source/libs/sync/src/syncElection.c
@ -43,7 +43,10 @@ static int32_t syncNodeRequestVotePeers(SSyncNode* pNode) {
  for (int i = 0; i < pNode->peersNum; ++i) {
    SRpcMsg rpcMsg = {0};
    ret = syncBuildRequestVote(&rpcMsg, pNode->vgId);
-    ASSERT(ret == 0);
+    if (ret < 0) {
+      sError("vgId:%d, failed to build request-vote msg since %s", pNode->vgId, terrstr());
+      continue;
+    }

    SyncRequestVote* pMsg = rpcMsg.pCont;
    pMsg->srcId = pNode->myRaftId;
@ -51,13 +54,20 @@ static int32_t syncNodeRequestVotePeers(SSyncNode* pNode) {
    pMsg->term = pNode->raftStore.currentTerm;

    ret = syncNodeGetLastIndexTerm(pNode, &pMsg->lastLogIndex, &pMsg->lastLogTerm);
-    ASSERT(ret == 0);
-
-    ret = syncNodeSendMsgById(&pNode->peersId[i], pNode, &rpcMsg);
-    ASSERT(ret == 0);
+    if (ret < 0) {
+      sError("vgId:%d, failed to get index and term of last log since %s", pNode->vgId, terrstr());
+      // The request-vote message was already built above; release it before skipping this peer.
+      rpcFreeCont(rpcMsg.pCont);
+      continue;
    }

-  return ret;
+    ret = syncNodeSendMsgById(&pNode->peersId[i], pNode, &rpcMsg);
+    if (ret < 0) {
+      sError("vgId:%d, failed to send msg to peerId:%" PRId64, pNode->vgId, pNode->peersId[i].addr);
+      continue;
+    }
+  }
+  return 0;
 }

 int32_t syncNodeElect(SSyncNode* pSyncNode) {
--- a/source/libs/sync/src/syncMain.c
+++ b/source/libs/sync/src/syncMain.c
@ -292,8 +292,6 @@ int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) {
    goto _DEL_WAL;

  } else {
-    lastApplyIndex -= SYNC_VNODE_LOG_RETENTION;
-
    SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
    SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
    bool      isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore);
@ -308,6 +306,8 @@ int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) {
    if (pSyncNode->replicaNum > 1) {
      // multi replicas

+      lastApplyIndex = TMAX(lastApplyIndex - SYNC_VNODE_LOG_RETENTION, beginIndex - 1);
+
      if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) {
        pSyncNode->minMatchIndex = syncMinMatchIndex(pSyncNode);

@ -586,78 +586,6 @@ SSyncState syncGetState(int64_t rid) {
  return state;
 }

-#if 0
-int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapshot) {
-  if (index < SYNC_INDEX_BEGIN) {
-    return -1;
-  }
-
-  SSyncNode* pSyncNode = syncNodeAcquire(rid);
-  if (pSyncNode == NULL) {
-    return -1;
-  }
-  ASSERT(rid == pSyncNode->rid);
-
-  SSyncRaftEntry* pEntry = NULL;
-  int32_t         code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry);
-  if (code != 0) {
-    if (pEntry != NULL) {
-      syncEntryDestroy(pEntry);
-    }
-    syncNodeRelease(pSyncNode);
-    return -1;
-  }
-  ASSERT(pEntry != NULL);
-
-  pSnapshot->data = NULL;
-  pSnapshot->lastApplyIndex = index;
-  pSnapshot->lastApplyTerm = pEntry->term;
-  pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pSyncNode, index);
-
-  syncEntryDestroy(pEntry);
-  syncNodeRelease(pSyncNode);
-  return 0;
-}
-
-int32_t syncGetSnapshotMeta(int64_t rid, struct SSnapshotMeta* sMeta) {
-  SSyncNode* pSyncNode = syncNodeAcquire(rid);
-  if (pSyncNode == NULL) {
-    return -1;
-  }
-  ASSERT(rid == pSyncNode->rid);
-  sMeta->lastConfigIndex = pSyncNode->raftCfg.lastConfigIndex;
-
-  sTrace("vgId:%d, get snapshot meta, lastConfigIndex:%" PRId64, pSyncNode->vgId, pSyncNode->raftCfg.lastConfigIndex);
-
-  syncNodeRelease(pSyncNode);
-  return 0;
-}
-
-int32_t syncGetSnapshotMetaByIndex(int64_t rid, SyncIndex snapshotIndex, struct SSnapshotMeta* sMeta) {
-  SSyncNode* pSyncNode = syncNodeAcquire(rid);
-  if (pSyncNode == NULL) {
-    return -1;
-  }
-  ASSERT(rid == pSyncNode->rid);
-
-  ASSERT(pSyncNode->raftCfg.configIndexCount >= 1);
-  SyncIndex lastIndex = (pSyncNode->raftCfg.configIndexArr)[0];
-
-  for (int32_t i = 0; i < pSyncNode->raftCfg.configIndexCount; ++i) {
-    if ((pSyncNode->raftCfg.configIndexArr)[i] > lastIndex &&
-        (pSyncNode->raftCfg.configIndexArr)[i] <= snapshotIndex) {
-      lastIndex = (pSyncNode->raftCfg.configIndexArr)[i];
-    }
-  }
-  sMeta->lastConfigIndex = lastIndex;
-  sTrace("vgId:%d, get snapshot meta by index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId, snapshotIndex,
-         sMeta->lastConfigIndex);
-
-  syncNodeRelease(pSyncNode);
-  return 0;
-}
-#endif
-
 SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex) {
  ASSERT(pSyncNode->raftCfg.configIndexCount >= 1);
  SyncIndex lastIndex = (pSyncNode->raftCfg.configIndexArr)[0];
@ -898,7 +826,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
  sInfo("vgId:%d, start to open sync node, replica:%d selfIndex:%d", pSyncNode->vgId, pCfg->replicaNum, pCfg->myIndex);
  for (int32_t i = 0; i < pCfg->replicaNum; ++i) {
    SNodeInfo* pNode = &pCfg->nodeInfo[i];
-    (void)tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
+    tmsgUpdateDnodeInfo(&pNode->nodeId, &pNode->clusterId, pNode->nodeFqdn, &pNode->nodePort);
    sInfo("vgId:%d, index:%d ep:%s:%u dnode:%d cluster:%" PRId64, pSyncNode->vgId, i, pNode->nodeFqdn, pNode->nodePort,
          pNode->nodeId, pNode->clusterId);
  }
@ -1031,9 +959,12 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
  pSyncNode->commitIndex = commitIndex;
  sInfo("vgId:%d, sync node commitIndex initialized as %" PRId64, pSyncNode->vgId, pSyncNode->commitIndex);

+  // restore log store on need
  if (syncNodeLogStoreRestoreOnNeed(pSyncNode) < 0) {
+    sError("vgId:%d, failed to restore log store since %s.", pSyncNode->vgId, terrstr());
    goto _error;
  }
+
  // timer ms init
  pSyncNode->pingBaseLine = PING_TIMER_MS;
  pSyncNode->electBaseLine = tsElectInterval;
@ -1096,10 +1027,16 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) {
  pSyncNode->changing = false;

  // replication mgr
-  syncNodeLogReplMgrInit(pSyncNode);
+  if (syncNodeLogReplMgrInit(pSyncNode) < 0) {
+    sError("vgId:%d, failed to init repl mgr since %s.", pSyncNode->vgId, terrstr());
+    goto _error;
+  }

  // peer state
-  syncNodePeerStateInit(pSyncNode);
+  if (syncNodePeerStateInit(pSyncNode) < 0) {
+    sError("vgId:%d, failed to init peer stat since %s.", pSyncNode->vgId, terrstr());
+    goto _error;
+  }

  //
  // min match index
@ -1194,29 +1131,12 @@ int32_t syncNodeStart(SSyncNode* pSyncNode) {

  int32_t ret = 0;
  ret = syncNodeStartPingTimer(pSyncNode);
-  ASSERT(ret == 0);
+  if (ret != 0) {
+    sError("vgId:%d, failed to start ping timer since %s", pSyncNode->vgId, terrstr());
+  }
  return ret;
 }

-void syncNodeStartOld(SSyncNode* pSyncNode) {
-  // start raft
-  if (pSyncNode->replicaNum == 1) {
-    raftStoreNextTerm(pSyncNode);
-    syncNodeBecomeLeader(pSyncNode, "one replica start");
-
-    // Raft 3.6.2 Committing entries from previous terms
-    syncNodeAppendNoop(pSyncNode);
-    syncMaybeAdvanceCommitIndex(pSyncNode);
-
-  } else {
-    syncNodeBecomeFollower(pSyncNode, "first start");
-  }
-
-  int32_t ret = 0;
-  ret = syncNodeStartPingTimer(pSyncNode);
-  ASSERT(ret == 0);
-}
-
 int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) {
  // state change
  pSyncNode->state = TAOS_SYNC_STATE_FOLLOWER;
@ -1225,11 +1145,16 @@ int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) {
  // reset elect timer, long enough
  int32_t electMS = TIMER_MAX_MS;
  int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS);
-  ASSERT(ret == 0);
+  if (ret < 0) {
+    sError("vgId:%d, failed to restart elect timer since %s", pSyncNode->vgId, terrstr());
+    return -1;
+  }

-  ret = 0;
  ret = syncNodeStartPingTimer(pSyncNode);
-  ASSERT(ret == 0);
+  if (ret < 0) {
+    sError("vgId:%d, failed to start ping timer since %s", pSyncNode->vgId, terrstr());
+    return -1;
+  }
  return ret;
 }

@ -1703,8 +1628,7 @@ void syncNodeDoConfigChange(SSyncNode* pSyncNode, SSyncCfg* pNewConfig, SyncInde

 _END:
  // log end config change
-  sNInfo(pSyncNode, "end do config change, from %d to %d", pSyncNode->vgId, oldConfig.replicaNum,
-         pNewConfig->replicaNum);
+  sNInfo(pSyncNode, "end do config change, from %d to %d", oldConfig.replicaNum, pNewConfig->replicaNum);
 }

 // raft state change --------------
@ -1819,12 +1743,6 @@ void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) {
  pSyncNode->leaderCache = pSyncNode->myRaftId;

  for (int32_t i = 0; i < pSyncNode->pNextIndex->replicaNum; ++i) {
-    // maybe overwrite myself, no harm
-    // just do it!
-
-    // pSyncNode->pNextIndex->index[i] = pSyncNode->pLogStore->getLastIndex(pSyncNode->pLogStore) + 1;
-
-    // maybe wal is deleted
    SyncIndex lastIndex;
    SyncTerm  lastTerm;
    int32_t   code = syncNodeGetLastIndexTerm(pSyncNode, &lastIndex, &lastTerm);
@ -1886,7 +1804,11 @@ void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) {

 void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
-  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));
+  bool granted = voteGrantedMajority(pSyncNode->pVotesGranted);
+  if (!granted) {
+    sError("vgId:%d, not granted by majority.", pSyncNode->vgId);
+    return;
+  }
  syncNodeBecomeLeader(pSyncNode, "candidate to leader");

  sNTrace(pSyncNode, "state change syncNodeCandidate2Leader");
@ -1902,20 +1824,6 @@ void syncNodeCandidate2Leader(SSyncNode* pSyncNode) {
        pSyncNode->vgId, pSyncNode->raftStore.currentTerm, pSyncNode->commitIndex, lastIndex);
 }

-void syncNodeCandidate2LeaderOld(SSyncNode* pSyncNode) {
-  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE);
-  ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted));
-  syncNodeBecomeLeader(pSyncNode, "candidate to leader");
-
-  // Raft 3.6.2 Committing entries from previous terms
-  syncNodeAppendNoop(pSyncNode);
-  syncMaybeAdvanceCommitIndex(pSyncNode);
-
-  if (pSyncNode->replicaNum > 1) {
-    syncNodeReplicate(pSyncNode);
-  }
-}
-
 bool syncNodeIsMnode(SSyncNode* pSyncNode) { return (pSyncNode->vgId == 1); }

 int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) {
@ -1961,7 +1869,8 @@ void syncNodeCandidate2Follower(SSyncNode* pSyncNode) {
 // need assert
 void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId) {
  ASSERT(term == pSyncNode->raftStore.currentTerm);
-  ASSERT(!raftStoreHasVoted(pSyncNode));
+  bool voted = raftStoreHasVoted(pSyncNode);
+  ASSERT(!voted);

  raftStoreVote(pSyncNode, pRaftId);
 }
@ -2478,7 +2387,7 @@ static int32_t syncNodeAppendNoopOld(SSyncNode* ths) {
  LRUHandle* h = NULL;

  if (ths->state == TAOS_SYNC_STATE_LEADER) {
-    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
+    int32_t code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry, false);
    if (code != 0) {
      sError("append noop error");
      return -1;
@ -2639,24 +2548,6 @@ int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
  return 0;
 }

-int32_t syncNodeOnLocalCmdOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) {
-  ASSERT(false && "deprecated");
-  SyncLocalCmd* pMsg = pRpcMsg->pCont;
-  syncLogRecvLocalCmd(ths, pMsg, "");
-
-  if (pMsg->cmd == SYNC_LOCAL_CMD_STEP_DOWN) {
-    syncNodeStepDown(ths, pMsg->currentTerm);
-
-  } else if (pMsg->cmd == SYNC_LOCAL_CMD_FOLLOWER_CMT) {
-    syncNodeFollowerCommit(ths, pMsg->commitIndex);
-
-  } else {
-    sError("error local cmd");
-  }
-
-  return 0;
-}
-
 // TLA+ Spec
 // ClientRequest(i, v) ==
 //     /\ state[i] = Leader
@ -2701,96 +2592,6 @@ int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIn
  }
 }

-int32_t syncNodeOnClientRequestOld(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) {
-  sNTrace(ths, "on client request");
-
-  int32_t ret = 0;
-  int32_t code = 0;
-
-  SyncIndex       index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore);
-  SyncTerm        term = ths->raftStore.currentTerm;
-  SSyncRaftEntry* pEntry;
-
-  if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) {
-    pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index);
-  } else {
-    pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index);
-  }
-
-  LRUHandle* h = NULL;
-
-  if (ths->state == TAOS_SYNC_STATE_LEADER) {
-    // append entry
-    code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry);
-    if (code != 0) {
-      if (ths->replicaNum == 1) {
-        if (h) {
-          taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
-        } else {
-          syncEntryDestroy(pEntry);
-        }
-
-        return -1;
-
-      } else {
-        // del resp mgr, call FpCommitCb
-        SFsmCbMeta cbMeta = {
-            .index = pEntry->index,
-            .lastConfigIndex = SYNC_INDEX_INVALID,
-            .isWeak = pEntry->isWeak,
-            .code = -1,
-            .state = ths->state,
-            .seqNum = pEntry->seqNum,
-            .term = pEntry->term,
-            .currentTerm = ths->raftStore.currentTerm,
-            .flag = 0,
-        };
-        ths->pFsm->FpCommitCb(ths->pFsm, pMsg, &cbMeta);
-
-        if (h) {
-          taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
-        } else {
-          syncEntryDestroy(pEntry);
-        }
-
-        return -1;
-      }
-    }
-
-    syncCacheEntry(ths->pLogStore, pEntry, &h);
-
-    // if mulit replica, start replicate right now
-    if (ths->replicaNum > 1) {
-      syncNodeReplicate(ths);
-    }
-
-    // if only myself, maybe commit right now
-    if (ths->replicaNum == 1) {
-      if (syncNodeIsMnode(ths)) {
-        syncMaybeAdvanceCommitIndex(ths);
-      } else {
-        syncOneReplicaAdvance(ths);
-      }
-    }
-  }
-
-  if (pRetIndex != NULL) {
-    if (ret == 0 && pEntry != NULL) {
-      *pRetIndex = pEntry->index;
-    } else {
-      *pRetIndex = SYNC_INDEX_INVALID;
-    }
-  }
-
-  if (h) {
-    taosLRUCacheRelease(ths->pLogStore->pCache, h, false);
-  } else {
-    syncEntryDestroy(pEntry);
-  }
-
-  return ret;
-}
-
 const char* syncStr(ESyncState state) {
  switch (state) {
    case TAOS_SYNC_STATE_FOLLOWER:
@ -2895,129 +2696,6 @@ bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) {
  return (ths->replicaNum == 1 && syncUtilUserCommit(pMsg->msgType) && ths->vgId != 1);
 }

-int32_t syncNodeDoCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag) {
-  ASSERT(false);
-  if (beginIndex > endIndex) {
-    return 0;
-  }
-
-  if (ths == NULL) {
-    return -1;
-  }
-
-  if (ths->pFsm != NULL && ths->pFsm->FpGetSnapshotInfo != NULL) {
-    // advance commit index to sanpshot first
-    SSnapshot snapshot = {0};
-    ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &snapshot);
-    if (snapshot.lastApplyIndex >= 0 && snapshot.lastApplyIndex >= beginIndex) {
-      sNTrace(ths, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, beginIndex, snapshot.lastApplyIndex);
-
-      // update begin index
-      beginIndex = snapshot.lastApplyIndex + 1;
-    }
-  }
-
-  int32_t    code = 0;
-  ESyncState state = flag;
-
-  sNTrace(ths, "commit by wal from index:%" PRId64 " to index:%" PRId64, beginIndex, endIndex);
-
-  // execute fsm
-  if (ths->pFsm != NULL) {
-    for (SyncIndex i = beginIndex; i <= endIndex; ++i) {
-      if (i != SYNC_INDEX_INVALID) {
-        SSyncRaftEntry* pEntry;
-        SLRUCache*      pCache = ths->pLogStore->pCache;
-        LRUHandle*      h = taosLRUCacheLookup(pCache, &i, sizeof(i));
-        if (h) {
-          pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
-
-          ths->pLogStore->cacheHit++;
-          sNTrace(ths, "hit cache index:%" PRId64 ", bytes:%u, %p", i, pEntry->bytes, pEntry);
-
-        } else {
-          ths->pLogStore->cacheMiss++;
-          sNTrace(ths, "miss cache index:%" PRId64, i);
-
-          code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, i, &pEntry);
-          // ASSERT(code == 0);
-          // ASSERT(pEntry != NULL);
-          if (code != 0 || pEntry == NULL) {
-            sNError(ths, "get log entry error");
-            sFatal("vgId:%d, get log entry %" PRId64 " error when commit since %s", ths->vgId, i, terrstr());
-            continue;
-          }
-        }
-
-        SRpcMsg rpcMsg = {0};
-        syncEntry2OriginalRpc(pEntry, &rpcMsg);
-
-        sTrace("do commit index:%" PRId64 ", type:%s", i, TMSG_INFO(pEntry->msgType));
-
-        // user commit
-        if ((ths->pFsm->FpCommitCb != NULL) && syncUtilUserCommit(pEntry->originalRpcType)) {
-          bool internalExecute = true;
-          if ((ths->replicaNum == 1) && ths->restoreFinish && ths->vgId != 1) {
-            internalExecute = false;
-          }
-
-          sNTrace(ths, "user commit index:%" PRId64 ", internal:%d, type:%s", i, internalExecute,
-                  TMSG_INFO(pEntry->msgType));
-
-          // execute fsm in apply thread, or execute outside syncPropose
-          if (internalExecute) {
-            SFsmCbMeta cbMeta = {
-                .index = pEntry->index,
-                .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index),
-                .isWeak = pEntry->isWeak,
-                .code = 0,
-                .state = ths->state,
-                .seqNum = pEntry->seqNum,
-                .term = pEntry->term,
-                .currentTerm = ths->raftStore.currentTerm,
-                .flag = flag,
-            };
-
-            syncRespMgrGetAndDel(ths->pSyncRespMgr, cbMeta.seqNum, &rpcMsg.info);
-            ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, &cbMeta);
-          }
-        }
-
-#if 0
-        // execute in pre-commit
-        // leader transfer
-        if (pEntry->originalRpcType == TDMT_SYNC_LEADER_TRANSFER) {
-          code = syncDoLeaderTransfer(ths, &rpcMsg, pEntry);
-          ASSERT(code == 0);
-        }
-#endif
-
-        // restore finish
-        // if only snapshot, a noop entry will be append, so syncLogLastIndex is always ok
-        if (pEntry->index == ths->pLogStore->syncLogLastIndex(ths->pLogStore)) {
-          if (ths->restoreFinish == false) {
-            if (ths->pFsm->FpRestoreFinishCb != NULL) {
-              ths->pFsm->FpRestoreFinishCb(ths->pFsm);
-            }
-            ths->restoreFinish = true;
-
-            int64_t restoreDelay = taosGetTimestampMs() - ths->leaderTime;
-            sNTrace(ths, "restore finish, index:%" PRId64 ", elapsed:%" PRId64 " ms", pEntry->index, restoreDelay);
-          }
-        }
-
-        rpcFreeCont(rpcMsg.pCont);
-        if (h) {
-          taosLRUCacheRelease(pCache, h, false);
-        } else {
-          syncEntryDestroy(pEntry);
-        }
-      }
-    }
-  }
-  return 0;
-}
-
 bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) {
  for (int32_t i = 0; i < ths->replicaNum; ++i) {
    if (syncUtilSameId(&((ths->replicasId)[i]), pRaftId)) {
--- a/source/libs/sync/src/syncPipeline.c
+++ b/source/libs/sync/src/syncPipeline.c
@ -364,7 +364,11 @@ _out:
  return ret;
 }

-int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) {
+// Decides the fsync flag handed to syncLogAppendEntry: true only when there is
+// more than one replica and the entry's original rpc type is TDMT_VND_COMMIT.
+static inline bool syncLogStoreNeedFlush(SSyncRaftEntry* pEntry, int32_t replicaNum) {
+  if (replicaNum <= 1) {
+    return false;
+  }
+  return pEntry->originalRpcType == TDMT_VND_COMMIT;
+}
+
+int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncNode* pNode, SSyncRaftEntry* pEntry) {
  ASSERT(pEntry->index >= 0);
  SyncIndex lastVer = pLogStore->syncLogLastIndex(pLogStore);
  if (lastVer >= pEntry->index && pLogStore->syncLogTruncate(pLogStore, pEntry->index) < 0) {
@ -374,7 +378,8 @@ int32_t syncLogStorePersist(SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) {
  lastVer = pLogStore->syncLogLastIndex(pLogStore);
  ASSERT(pEntry->index == lastVer + 1);

-  if (pLogStore->syncLogAppendEntry(pLogStore, pEntry) < 0) {
+  bool doFsync = syncLogStoreNeedFlush(pEntry, pNode->replicaNum);
+  if (pLogStore->syncLogAppendEntry(pLogStore, pEntry, doFsync) < 0) {
    sError("failed to append sync log entry since %s. index:%" PRId64 ", term:%" PRId64 "", terrstr(), pEntry->index,
           pEntry->term);
    return -1;
@ -436,7 +441,7 @@ int64_t syncLogBufferProceed(SSyncLogBuffer* pBuf, SSyncNode* pNode, SyncTerm* p
    (void)syncNodeReplicateWithoutLock(pNode);

    // persist
-    if (syncLogStorePersist(pLogStore, pEntry) < 0) {
+    if (syncLogStorePersist(pLogStore, pNode, pEntry) < 0) {
      sError("vgId:%d, failed to persist sync log entry from buffer since %s. index:%" PRId64, pNode->vgId, terrstr(),
             pEntry->index);
      goto _out;
@ -940,8 +945,11 @@ int32_t syncNodeLogReplMgrInit(SSyncNode* pNode) {
  for (int i = 0; i < TSDB_MAX_REPLICA; i++) {
    ASSERT(pNode->logReplMgrs[i] == NULL);
    pNode->logReplMgrs[i] = syncLogReplMgrCreate();
+    if (pNode->logReplMgrs[i] == NULL) {
+      terrno = TSDB_CODE_OUT_OF_MEMORY;
+      return -1;
+    }
    pNode->logReplMgrs[i]->peerId = i;
-    ASSERTS(pNode->logReplMgrs[i] != NULL, "Out of memory.");
  }
  return 0;
 }
--- a/source/libs/sync/src/syncRaftLog.c
+++ b/source/libs/sync/src/syncRaftLog.c
@ -23,7 +23,7 @@

 // public function
 static int32_t   raftLogRestoreFromSnapshot(struct SSyncLogStore* pLogStore, SyncIndex snapshotIndex);
-static int32_t   raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry);
+static int32_t   raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, bool forceSync);
 static int32_t   raftLogTruncate(struct SSyncLogStore* pLogStore, SyncIndex fromIndex);
 static bool      raftLogExist(struct SSyncLogStore* pLogStore, SyncIndex index);
 static int32_t   raftLogUpdateCommitIndex(SSyncLogStore* pLogStore, SyncIndex index);
@ -192,7 +192,7 @@ SyncTerm raftLogLastTerm(struct SSyncLogStore* pLogStore) {
  return SYNC_TERM_INVALID;
 }

-static int32_t raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry) {
+static int32_t raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntry* pEntry, bool forceSync) {
  SSyncLogStoreData* pData = pLogStore->data;
  SWal*              pWal = pData->pWal;

@ -219,9 +219,7 @@ static int32_t raftLogAppendEntry(struct SSyncLogStore* pLogStore, SSyncRaftEntr

  ASSERT(pEntry->index == index);

-  if (pEntry->originalRpcType == TDMT_VND_COMMIT) {
-    walFsync(pWal, true);
-  }
+  walFsync(pWal, forceSync);

  sNTrace(pData->pSyncNode, "write index:%" PRId64 ", type:%s, origin type:%s, elapsed:%" PRId64, pEntry->index,
          TMSG_INFO(pEntry->msgType), TMSG_INFO(pEntry->originalRpcType), tsElapsed);
--- a/source/libs/sync/src/syncReplication.c
+++ b/source/libs/sync/src/syncReplication.c
@ -48,92 +48,6 @@

 int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg);

-int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapshot) {
-  ASSERT(false && "deprecated");
-  // next index
-  SyncIndex nextIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pDestId);
-
-  if (snapshot) {
-    // maybe start snapshot
-    SyncIndex logStartIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore);
-    SyncIndex logEndIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore);
-    if (nextIndex < logStartIndex || nextIndex - 1 > logEndIndex) {
-      sNTrace(pSyncNode, "maybe start snapshot for next-index:%" PRId64 ", start:%" PRId64 ", end:%" PRId64, nextIndex,
-              logStartIndex, logEndIndex);
-      // start snapshot
-      int32_t code = syncNodeStartSnapshot(pSyncNode, pDestId);
-    }
-  }
-
-  // pre index, pre term
-  SyncIndex preLogIndex = syncNodeGetPreIndex(pSyncNode, nextIndex);
-  SyncTerm  preLogTerm = syncNodeGetPreTerm(pSyncNode, nextIndex);
-
-  // prepare entry
-  SRpcMsg            rpcMsg = {0};
-  SyncAppendEntries* pMsg = NULL;
-
-  SSyncRaftEntry* pEntry = NULL;
-  SLRUCache*      pCache = pSyncNode->pLogStore->pCache;
-  LRUHandle*      h = taosLRUCacheLookup(pCache, &nextIndex, sizeof(nextIndex));
-  int32_t         code = 0;
-  if (h) {
-    pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h);
-    code = 0;
-
-    pSyncNode->pLogStore->cacheHit++;
-    sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", nextIndex, pEntry->bytes, pEntry);
-
-  } else {
-    pSyncNode->pLogStore->cacheMiss++;
-    sNTrace(pSyncNode, "miss cache index:%" PRId64, nextIndex);
-
-    code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, nextIndex, &pEntry);
-  }
-
-  if (code == 0) {
-    ASSERT(pEntry != NULL);
-
-    code = syncBuildAppendEntries(&rpcMsg, (int32_t)(pEntry->bytes), pSyncNode->vgId);
-    ASSERT(code == 0);
-
-    pMsg = rpcMsg.pCont;
-    memcpy(pMsg->data, pEntry, pEntry->bytes);
-  } else {
-    if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) {
-      // no entry in log
-      code = syncBuildAppendEntries(&rpcMsg, 0, pSyncNode->vgId);
-      ASSERT(code == 0);
-
-      pMsg = rpcMsg.pCont;
-    } else {
-      sNError(pSyncNode, "replicate to dnode:%d error, next-index:%" PRId64, DID(pDestId), nextIndex);
-      return -1;
-    }
-  }
-
-  if (h) {
-    taosLRUCacheRelease(pCache, h, false);
-  } else {
-    syncEntryDestroy(pEntry);
-  }
-
-  // prepare msg
-  ASSERT(pMsg != NULL);
-  pMsg->srcId = pSyncNode->myRaftId;
-  pMsg->destId = *pDestId;
-  pMsg->term = pSyncNode->raftStore.currentTerm;
-  pMsg->prevLogIndex = preLogIndex;
-  pMsg->prevLogTerm = preLogTerm;
-  pMsg->commitIndex = pSyncNode->commitIndex;
-  pMsg->privateTerm = 0;
-  // pMsg->privateTerm = syncIndexMgrGetTerm(pSyncNode->pNextIndex, pDestId);
-
-  // send msg
-  syncNodeMaybeSendAppendEntries(pSyncNode, pDestId, &rpcMsg);
-  return 0;
-}
-
 int32_t syncNodeReplicate(SSyncNode* pNode) {
  SSyncLogBuffer* pBuf = pNode->pLogBuf;
  taosThreadMutexLock(&pBuf->mutex);
@ -156,25 +70,6 @@ int32_t syncNodeReplicateWithoutLock(SSyncNode* pNode) {
  return 0;
 }

-int32_t syncNodeReplicateOld(SSyncNode* pSyncNode) {
-  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
-    return -1;
-  }
-
-  sNTrace(pSyncNode, "do replicate");
-
-  int32_t ret = 0;
-  for (int i = 0; i < pSyncNode->peersNum; ++i) {
-    SRaftId* pDestId = &(pSyncNode->peersId[i]);
-    ret = syncNodeReplicateOne(pSyncNode, pDestId, true);
-    if (ret != 0) {
-      sError("vgId:%d, do append entries error for dnode:%d", pSyncNode->vgId, DID(pDestId));
-    }
-  }
-
-  return 0;
-}
-
 int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) {
  SyncAppendEntries* pMsg = pRpcMsg->pCont;
  pMsg->destId = *destRaftId;
@ -182,39 +77,6 @@ int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftI
  return 0;
 }

-int32_t syncNodeSendAppendEntriesOld(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) {
-  int32_t            ret = 0;
-  SyncAppendEntries* pMsg = pRpcMsg->pCont;
-  if (pMsg == NULL) {
-    sError("vgId:%d, sync-append-entries msg is NULL", pSyncNode->vgId);
-    return 0;
-  }
-
-  SPeerState* pState = syncNodeGetPeerState(pSyncNode, destRaftId);
-  if (pState == NULL) {
-    sError("vgId:%d, replica maybe dropped", pSyncNode->vgId);
-    return 0;
-  }
-
-  // save index, otherwise pMsg will be free by rpc
-  SyncIndex saveLastSendIndex = pState->lastSendIndex;
-  bool      update = false;
-  if (pMsg->dataLen > 0) {
-    saveLastSendIndex = pMsg->prevLogIndex + 1;
-    update = true;
-  }
-
-  syncLogSendAppendEntries(pSyncNode, pMsg, "");
-  syncNodeSendMsgById(destRaftId, pSyncNode, pRpcMsg);
-
-  if (update) {
-    pState->lastSendIndex = saveLastSendIndex;
-    pState->lastSendTime = taosGetTimestampMs();
-  }
-
-  return ret;
-}
-
 int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) {
  int32_t            ret = 0;
  SyncAppendEntries* pMsg = pRpcMsg->pCont;
--- a/source/libs/tdb/src/db/tdbBtree.c
+++ b/source/libs/tdb/src/db/tdbBtree.c
@ -1063,11 +1063,11 @@ static int tdbBtreeEncodePayload(SPage *pPage, SCell *pCell, int nHeader, const
    } else {
      int nLeftKey = kLen;
      // pack partial key and nextPgno
-      memcpy(pCell + nHeader, pKey, nLocal - 4);
-      nLeft -= nLocal - 4;
-      nLeftKey -= nLocal - 4;
+      memcpy(pCell + nHeader, pKey, nLocal - nHeader - sizeof(pgno));
+      nLeft -= nLocal - nHeader - sizeof(pgno);
+      nLeftKey -= nLocal - nHeader - sizeof(pgno);

-      memcpy(pCell + nHeader + nLocal - 4, &pgno, sizeof(pgno));
+      memcpy(pCell + nLocal - sizeof(pgno), &pgno, sizeof(pgno));

      int lastKeyPageSpace = 0;
      // pack left key & val to ovpages
@ -1087,9 +1087,12 @@ static int tdbBtreeEncodePayload(SPage *pPage, SCell *pCell, int nHeader, const

        if (lastKeyPage) {
          if (lastKeyPageSpace >= vLen) {
+            if (vLen > 0) {
              memcpy(pBuf + kLen - nLeftKey, pVal, vLen);

              nLeft -= vLen;
+            }
+
            pgno = 0;
          } else {
            memcpy(pBuf + kLen - nLeftKey, pVal, lastKeyPageSpace);
@ -1111,7 +1114,7 @@ static int tdbBtreeEncodePayload(SPage *pPage, SCell *pCell, int nHeader, const
          }
        }

-        memcpy(pBuf + kLen - nLeft, &pgno, sizeof(pgno));
+        memcpy(pBuf + bytes, &pgno, sizeof(pgno));

        ret = tdbPageInsertCell(ofp, 0, pBuf, bytes + sizeof(pgno), 0);
        if (ret < 0) {
@ -1313,11 +1316,11 @@ static int tdbBtreeDecodePayload(SPage *pPage, const SCell *pCell, int nHeader,
      }
      TDB_CELLDECODER_SET_FREE_KEY(pDecoder);

-      memcpy(pDecoder->pKey, pCell + nHeader, nLocal - 4);
-      nLeft -= nLocal - 4;
-      nLeftKey -= nLocal - 4;
+      memcpy(pDecoder->pKey, pCell + nHeader, nLocal - nHeader - sizeof(pgno));
+      nLeft -= nLocal - nHeader - sizeof(pgno);
+      nLeftKey -= nLocal - nHeader - sizeof(pgno);

-      memcpy(&pgno, pCell + nHeader + nLocal - 4, sizeof(pgno));
+      memcpy(&pgno, pCell + nLocal - sizeof(pgno), sizeof(pgno));

      int lastKeyPageSpace = 0;
      // load left key & val to ovpages
@ -1343,9 +1346,11 @@ static int tdbBtreeDecodePayload(SPage *pPage, const SCell *pCell, int nHeader,

        if (lastKeyPage) {
          if (lastKeyPageSpace >= vLen) {
+            if (vLen > 0) {
              pDecoder->pVal = ofpCell + kLen - nLeftKey;

              nLeft -= vLen;
+            }
            pgno = 0;
          } else {
            // read partial val to local
--- a/source/libs/transport/src/tmsgcb.c
+++ b/source/libs/transport/src/tmsgcb.c
@ -59,6 +59,12 @@ void tmsgReleaseHandle(SRpcHandleInfo* pHandle, int8_t type) { (*defaultMsgCb.re

 void tmsgReportStartup(const char* name, const char* desc) { (*defaultMsgCb.reportStartupFp)(name, desc); }

-int32_t tmsgUpdateDnodeInfo(int32_t* dnodeId, int64_t* clusterId, char* fqdn, uint16_t* port) {
-  return (*defaultMsgCb.updateDnodeInfoFp)(defaultMsgCb.data, dnodeId, clusterId, fqdn, port);
+void tmsgUpdateDnodeInfo(int32_t* dnodeId, int64_t* clusterId, char* fqdn, uint16_t* port) {
+  (*defaultMsgCb.updateDnodeInfoFp)(defaultMsgCb.data, dnodeId, clusterId, fqdn, port);
+}
+
+// Calls tmsgUpdateDnodeInfo once per endpoint in epset, handing over only the
+// endpoint's fqdn and port (dnodeId and clusterId are passed as NULL).
+void tmsgUpdateDnodeEpSet(SEpSet* epset) {
+  int32_t idx = 0;
+  while (idx < epset->numOfEps) {
+    tmsgUpdateDnodeInfo(NULL, NULL, epset->eps[idx].fqdn, &epset->eps[idx].port);
+    idx++;
+  }
 }
--- a/source/libs/wal/src/walMeta.c
+++ b/source/libs/wal/src/walMeta.c
@ -325,6 +325,35 @@ bool walLogEntriesComplete(const SWal* pWal) {
  return complete;
 }

+/**
+ * Cut the idx file of the WAL file at position fileIdx back to the size implied
+ * by its file info, i.e. (lastVer - firstVer + 1) entries of sizeof(SWalIdxEntry).
+ *
+ * @param pWal    WAL handle; pWal->fileInfoSet supplies firstVer/lastVer.
+ * @param fileIdx position of the file in pWal->fileInfoSet.
+ * @return 0 if nothing had to be trimmed or the file was truncated;
+ *         -1 with terrno set if the idx file could not be opened or truncated.
+ */
+int walTrimIdxFile(SWal* pWal, int32_t fileIdx) {
+  SWalFileInfo* pFileInfo = taosArrayGet(pWal->fileInfoSet, fileIdx);
+  ASSERT(pFileInfo != NULL);
+  char fnameStr[WAL_FILE_LEN];
+  walBuildIdxName(pWal, pFileInfo->firstVer, fnameStr);
+
+  int64_t fileSize = 0;
+  if (taosStatFile(fnameStr, &fileSize, NULL) != 0) {
+    // size unknown (e.g. no idx file yet): there is nothing this function can trim
+    return 0;
+  }
+  int64_t records = TMAX(0, pFileInfo->lastVer - pFileInfo->firstVer + 1);
+  int64_t lastEndOffset = records * sizeof(SWalIdxEntry);
+
+  if (fileSize <= lastEndOffset) {
+    return 0;
+  }
+
+  TdFilePtr pFile = taosOpenFile(fnameStr, TD_FILE_READ | TD_FILE_WRITE);
+  if (pFile == NULL) {
+    terrno = TAOS_SYSTEM_ERROR(errno);
+    return -1;
+  }
+
+  wInfo("vgId:%d, trim idx file. file: %s, size: %" PRId64 ", offset: %" PRId64, pWal->cfg.vgId, fnameStr, fileSize,
+        lastEndOffset);
+
+  int code = 0;
+  if (taosFtruncateFile(pFile, lastEndOffset) < 0) {
+    // capture the error before closing, since close may overwrite errno
+    terrno = TAOS_SYSTEM_ERROR(errno);
+    wError("vgId:%d, failed to truncate idx file since %s. file:%s", pWal->cfg.vgId, terrstr(), fnameStr);
+    code = -1;
+  }
+
+  taosCloseFile(&pFile);
+  return code;
+}
+
 int walCheckAndRepairMeta(SWal* pWal) {
  // load log files, get first/snapshot/last version info
  const char* logPattern = "^[0-9]+.log$";
@ -402,6 +431,8 @@ int walCheckAndRepairMeta(SWal* pWal) {
    }
    updateMeta = true;

+    (void)walTrimIdxFile(pWal, fileIdx);
+
    int64_t lastVer = walScanLogGetLastVer(pWal, fileIdx);
    if (lastVer < 0) {
      if (terrno != TSDB_CODE_WAL_LOG_NOT_EXIST) {
@ -567,6 +598,7 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) {
    goto _err;
  }

+  int64_t count = 0;
  while (idxEntry.ver < pFileInfo->lastVer) {
    ASSERT(idxEntry.ver == ckHead.head.version);

@ -578,11 +610,11 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) {
             idxEntry.offset, fLogNameStr);
      goto _err;
    }
-    wWarn("vgId:%d, wal idx append new entry %" PRId64 " %" PRId64, pWal->cfg.vgId, idxEntry.ver, idxEntry.offset);
    if (taosWriteFile(pIdxFile, &idxEntry, sizeof(SWalIdxEntry)) < 0) {
      wError("vgId:%d, failed to append file since %s. file:%s", pWal->cfg.vgId, terrstr(), fnameStr);
      goto _err;
    }
+    count++;
  }

  if (taosFsyncFile(pIdxFile) < 0) {
@ -590,6 +622,11 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) {
    goto _err;
  }

+  if (count > 0) {
+    wInfo("vgId:%d, rebuilt %" PRId64 " wal idx entries until lastVer: %" PRId64, pWal->cfg.vgId, count,
+          pFileInfo->lastVer);
+  }
+
  (void)taosCloseFile(&pLogFile);
  (void)taosCloseFile(&pIdxFile);
  return 0;
--- a/source/libs/wal/src/walRef.c
+++ b/source/libs/wal/src/walRef.c
@ -77,14 +77,41 @@ void walUnrefVer(SWalRef *pRef) {
 }
 #endif

-SWalRef *walRefCommittedVer(SWal *pWal) {
-  SWalRef *pRef = walOpenRef(pWal);
+SWalRef *walRefFirstVer(SWal *pWal, SWalRef *pRef) {
+  if (pRef == NULL) {
+    pRef = walOpenRef(pWal);
    if (pRef == NULL) {
      return NULL;
    }
+  }
  taosThreadMutexLock(&pWal->mutex);

-  int64_t ver = walGetCommittedVer(pWal);
+  int64_t ver = walGetFirstVer(pWal);
+
+  wDebug("vgId:%d, wal ref version %" PRId64 " for first", pWal->cfg.vgId, ver);
+
+  pRef->refVer = ver;
+  // bsearch in fileSet
+  SWalFileInfo tmpInfo;
+  tmpInfo.firstVer = ver;
+  SWalFileInfo *pRet = taosArraySearch(pWal->fileInfoSet, &tmpInfo, compareWalFileInfo, TD_LE);
+  ASSERT(pRet != NULL);
+  pRef->refFile = pRet->firstVer;
+
+  taosThreadMutexUnlock(&pWal->mutex);
+  return pRef;
+}
+
+SWalRef *walRefCommittedVer(SWal *pWal) {
+  SWalRef *pRef = walOpenRef(pWal);
+  if (pRef == NULL) {
+    return NULL;
+  }
+  taosThreadMutexLock(&pWal->mutex);
+
+  int64_t ver = walGetCommittedVer(pWal);
+
+  wDebug("vgId:%d, wal ref version %" PRId64 " for committed", pWal->cfg.vgId, ver);

  pRef->refVer = ver;
  // bsearch in fileSet
--- a/source/libs/wal/src/walWrite.c
+++ b/source/libs/wal/src/walWrite.c
@ -637,11 +637,6 @@ int32_t walWrite(SWal *pWal, int64_t index, tmsg_t msgType, const void *body, in
 void walFsync(SWal *pWal, bool forceFsync) {
  taosThreadMutexLock(&pWal->mutex);
  if (forceFsync || (pWal->cfg.level == TAOS_WAL_FSYNC && pWal->cfg.fsyncPeriod == 0)) {
-    wTrace("vgId:%d, fileId:%" PRId64 ".idx, do fsync", pWal->cfg.vgId, walGetCurFileFirstVer(pWal));
-    if (taosFsyncFile(pWal->pIdxFile) < 0) {
-      wError("vgId:%d, file:%" PRId64 ".idx, fsync failed since %s", pWal->cfg.vgId, walGetCurFileFirstVer(pWal),
-             strerror(errno));
-    }
    wTrace("vgId:%d, fileId:%" PRId64 ".log, do fsync", pWal->cfg.vgId, walGetCurFileFirstVer(pWal));
    if (taosFsyncFile(pWal->pLogFile) < 0) {
      wError("vgId:%d, file:%" PRId64 ".log, fsync failed since %s", pWal->cfg.vgId, walGetCurFileFirstVer(pWal),
--- a/source/os/src/osMath.c
+++ b/source/os/src/osMath.c
@ -32,7 +32,18 @@ void swapStr(char* j, char* J, int width) {
 }
 #endif

-// todo refactor: 1) move away; 2) use merge sort instead; 3) qsort is not a stable sort actually.
-void taosSort(void* arr, int64_t sz, int64_t width, __compar_fn_t compar) {
-  qsort(arr, sz, width, compar);
+// Adapter for sort routines whose comparator takes an extra parameter: param
+// carries a plain two-argument comparator, which is invoked on p1 and p2.
+int qsortHelper(const void* p1, const void* p2, const void* param) {
+  return ((__compar_fn_t)param)(p1, p2);
 }
+
+// todo refactor: 1) move away; 2) use merge sort instead; 3) qsort is not a stable sort actually.
+// Sorts the sz elements of width bytes each that start at base, ordered by compar.
+// Builds with _ALPINE defined go through taosqsort with qsortHelper as adapter; all others call qsort.
+// NOTE(review): taosqsort's parameter order is not visible here -- confirm that (width, sz) is what it expects.
+void taosSort(void* base, int64_t sz, int64_t width, __compar_fn_t compar) {
+#ifndef _ALPINE
+  qsort(base, sz, width, compar);
+#else
+  taosqsort(base, width, sz, (void*)compar, qsortHelper);
+#endif
+}
+
--- a/source/os/src/osSysinfo.c
+++ b/source/os/src/osSysinfo.c
@ -834,7 +834,11 @@ int32_t taosGetSystemUUID(char *uid, int32_t uidlen) {
  uuid_generate(uuid);
  // it's caller's responsibility to make enough space for `uid`, that's 36-char + 1-null
  uuid_unparse_lower(uuid, buf);
-  memcpy(uid, buf, uidlen);
+  int n = snprintf(uid, uidlen, "%.*s", (int)sizeof(buf), buf);  // though less performance, much safer
+  if (n >= uidlen) {
+    // target buffer is too small
+    return -1;
+  }
  return 0;
 #else
  int len = 0;
--- a/source/os/src/osTime.c
+++ b/source/os/src/osTime.c
@ -33,6 +33,11 @@
 #include <time.h>
 //#define TM_YEAR_BASE 1970 //origin
 #define TM_YEAR_BASE 1900  // slguan
+
+// This magic number is the number of 100 nanosecond intervals since January 1, 1601 (UTC)
+// until 00:00:00 January 1, 1970
+static const uint64_t TIMEEPOCH = ((uint64_t)116444736000000000ULL);
+
 /*
 * We do not implement alternate representations. However, we always
 * check whether a given modifier is allowed for a certain conversion.
@ -341,15 +346,17 @@ char *taosStrpTime(const char *buf, const char *fmt, struct tm *tm) {

 int32_t taosGetTimeOfDay(struct timeval *tv) {
 #ifdef WINDOWS
-  time_t t;
-  t = taosGetTimestampSec();
-  SYSTEMTIME st;
-  GetLocalTime(&st);
+  LARGE_INTEGER t;
+  FILETIME      f;

-  tv->tv_sec = (long)t;
-  tv->tv_usec = st.wMilliseconds * 1000;
+  GetSystemTimeAsFileTime(&f);
+  t.QuadPart = f.dwHighDateTime;
+  t.QuadPart <<= 32;
+  t.QuadPart |= f.dwLowDateTime;

-  return 0;
+  t.QuadPart -= TIMEEPOCH;
+  tv->tv_sec = t.QuadPart / 10000000;
+  tv->tv_usec = (t.QuadPart % 10000000) / 10;
 #else
  return gettimeofday(tv, NULL);
 #endif
@ -550,37 +557,13 @@ int32_t taosClockGetTime(int clock_id, struct timespec *pTS) {
 #ifdef WINDOWS
  LARGE_INTEGER        t;
  FILETIME             f;
-  static FILETIME      ff;
-  static SYSTEMTIME    ss;
-  static LARGE_INTEGER offset;
-
-  static int8_t        offsetInit = 0;
-  static volatile bool offsetInitFinished = false;
-  int8_t               old = atomic_val_compare_exchange_8(&offsetInit, 0, 1);
-  if (0 == old) {
-    ss.wYear = 1970;
-    ss.wMonth = 1;
-    ss.wDay = 1;
-    ss.wHour = 0;
-    ss.wMinute = 0;
-    ss.wSecond = 0;
-    ss.wMilliseconds = 0;
-    SystemTimeToFileTime(&ss, &ff);
-    offset.QuadPart = ff.dwHighDateTime;
-    offset.QuadPart <<= 32;
-    offset.QuadPart |= ff.dwLowDateTime;
-    offsetInitFinished = true;
-  } else {
-    while (!offsetInitFinished)
-      ;  // Ensure initialization is completed.
-  }

  GetSystemTimeAsFileTime(&f);
  t.QuadPart = f.dwHighDateTime;
  t.QuadPart <<= 32;
  t.QuadPart |= f.dwLowDateTime;

-  t.QuadPart -= offset.QuadPart;
+  t.QuadPart -= TIMEEPOCH;
  pTS->tv_sec = t.QuadPart / 10000000;
  pTS->tv_nsec = (t.QuadPart % 10000000) * 100;
  return (0);
--- a/source/util/src/talgo.c
+++ b/source/util/src/talgo.c
@ -41,12 +41,6 @@ static void median(void *src, int64_t size, int64_t s, int64_t e, const void *pa

  ASSERT(comparFn(elePtrAt(src, size, mid), elePtrAt(src, size, s), param) <= 0 &&
         comparFn(elePtrAt(src, size, s), elePtrAt(src, size, e), param) <= 0);
-
-#ifdef _DEBUG_VIEW
-//  tTagsPrints(src[s], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx);
-//  tTagsPrints(src[mid], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx);
-//  tTagsPrints(src[e], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx);
-#endif
 }

 static void tInsertSort(void *src, int64_t size, int32_t s, int32_t e, const void *param, __ext_compar_fn_t comparFn,
@ -278,14 +272,4 @@ void taosheapsort(void *base, int32_t size, int32_t len, const void *parcompar,
  }

  taosMemoryFree(buf);
-  /*
-    char *buf = taosMemoryCalloc(1, size);
-
-    for (i = len - 1; i > 0; i--) {
-      doswap(elePtrAt(base, size, 0), elePtrAt(base, size, i));
-      taosheapadjust(base, size, 0, i - 1, parcompar, compar, parswap, swap, maxroot);
-    }
-
-    taosMemoryFreeClear(buf);
-  */
 }
--- a/source/util/src/tarray.c
+++ b/source/util/src/tarray.c
@ -20,7 +20,10 @@
 // todo refactor API

 SArray* taosArrayInit(size_t size, size_t elemSize) {
-  assert(elemSize > 0);
+  if (elemSize == 0) {
+    terrno = TSDB_CODE_INVALID_PARA;
+    return NULL;
+  }

  if (size < TARRAY_MIN_SIZE) {
    size = TARRAY_MIN_SIZE;
@ -116,8 +119,6 @@ void* taosArrayAddBatch(SArray* pArray, const void* pData, int32_t nEles) {
 }

 void taosArrayRemoveDuplicate(SArray* pArray, __compar_fn_t comparFn, void (*fp)(void*)) {
-  assert(pArray);
-
  size_t size = pArray->size;
  if (size <= 1) {
    return;
@ -156,8 +157,6 @@ void taosArrayRemoveDuplicate(SArray* pArray, __compar_fn_t comparFn, void (*fp)
 }

 void taosArrayRemoveDuplicateP(SArray* pArray, __compar_fn_t comparFn, void (*fp)(void*)) {
-  assert(pArray);
-
  size_t size = pArray->size;
  if (size <= 1) {
    return;
@ -215,11 +214,10 @@ void* taosArrayReserve(SArray* pArray, int32_t num) {
 }

 void* taosArrayPop(SArray* pArray) {
-  assert(pArray != NULL);
-
  if (pArray->size == 0) {
    return NULL;
  }
+
  pArray->size -= 1;
  return TARRAY_GET_ELEM(pArray, pArray->size);
 }
@ -228,16 +226,22 @@ void* taosArrayGet(const SArray* pArray, size_t index) {
   if (NULL == pArray) {
     return NULL;
   }
-  assert(index < pArray->size);
+
+  // Out-of-range access is logged and rejected; report the size the index was checked against.
+  if (index >= pArray->size) {
+    uError("index is out of range, current:%"PRIzu" size:%"PRIzu, index, pArray->size);
+    return NULL;
+  }
+
   return TARRAY_GET_ELEM(pArray, index);
 }

 void* taosArrayGetP(const SArray* pArray, size_t index) {
-  assert(index < pArray->size);
-
-  void* d = TARRAY_GET_ELEM(pArray, index);
-
-  return *(void**)d;
+  void** p = taosArrayGet(pArray, index);
+  if (p == NULL) {
+    return NULL;
+  }
+  return *p;
 }

 void* taosArrayGetLast(const SArray* pArray) {
@ -322,9 +325,17 @@ void taosArrayRemove(SArray* pArray, size_t index) {
 }

 SArray* taosArrayFromList(const void* src, size_t size, size_t elemSize) {
-  assert(src != NULL && elemSize > 0);
-  SArray* pDst = taosArrayInit(size, elemSize);
+  // With the assert gone, reject a missing source list as well as a zero element size.
+  if (src == NULL || elemSize == 0) {
+    terrno = TSDB_CODE_INVALID_PARA;
+    return NULL;
+  }

+  SArray* pDst = taosArrayInit(size, elemSize);
+  if (pDst == NULL) {  // taosArrayInit can return NULL; do not dereference it
+    return NULL;
+  }
+
   memcpy(pDst->pData, src, elemSize * size);
   pDst->size = size;

@ -332,8 +338,6 @@ SArray* taosArrayFromList(const void* src, size_t size, size_t elemSize) {
 }

 SArray* taosArrayDup(const SArray* pSrc, __array_item_dup_fn_t fn) {
-  assert(pSrc != NULL);
-
  if (pSrc->size == 0) {  // empty array list
    return taosArrayInit(8, pSrc->elemSize);
  }
@ -425,14 +429,10 @@ void taosArrayDestroyEx(SArray* pArray, FDelete fp) {
 }

 void taosArraySort(SArray* pArray, __compar_fn_t compar) {
-  ASSERT(pArray != NULL && compar != NULL);
  taosSort(pArray->pData, pArray->size, pArray->elemSize, compar);
 }

 void* taosArraySearch(const SArray* pArray, const void* key, __compar_fn_t comparFn, int32_t flags) {
-  assert(pArray != NULL && comparFn != NULL);
-  assert(key != NULL);
-
  return taosbsearch(key, pArray->pData, pArray->size, pArray->elemSize, comparFn, flags);
 }

--- a/source/util/src/tcache.c
+++ b/source/util/src/tcache.c
@ -921,7 +921,7 @@ void taosCacheRefresh(SCacheObj *pCacheObj, __cache_trav_fn_t fp, void *param1)
 void taosStopCacheRefreshWorker(void) {
  stopRefreshWorker = true;
  TdThreadOnce tmp = PTHREAD_ONCE_INIT;
-  if (memcmp(&cacheRefreshWorker, &tmp, sizeof(TdThreadOnce)) != 0) taosThreadJoin(cacheRefreshWorker, NULL);
+  if (memcmp(&cacheThreadInit, &tmp, sizeof(TdThreadOnce)) != 0) taosThreadJoin(cacheRefreshWorker, NULL);
  taosArrayDestroy(pCacheArrayList);
 }

--- a/Show More
+++ b/Show More