From fb9376bea77ea94ce7bbd3ec69419b0b12fea641 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Tue, 20 Oct 2020 07:13:00 +0000 Subject: [PATCH 01/16] Introduced TdeInstance concept into crash_gen tool, ready to run clusters next --- tests/pytest/crash_gen/crash_gen.py | 368 +++++++++++++++++----------- 1 file changed, 220 insertions(+), 148 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 48196ab383..b1d79f54c3 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -44,6 +44,7 @@ import traceback import resource from guppy import hpy import gc +import subprocess try: import psutil @@ -59,12 +60,13 @@ if sys.version_info[0] < 3: # Command-line/Environment Configurations, will set a bit later # ConfigNameSpace = argparse.Namespace -gConfig = argparse.Namespace() # Dummy value, will be replaced later -gSvcMgr = None # TODO: refactor this hack, use dep injection -logger = None # type: Logger +gConfig: argparse.Namespace +gSvcMgr: ServiceManager # TODO: refactor this hack, use dep injection +logger: logging.Logger +gContainer: Container -def runThread(wt: WorkerThread): - wt.run() +# def runThread(wt: WorkerThread): +# wt.run() class CrashGenError(Exception): def __init__(self, msg=None, errno=None): @@ -74,7 +76,6 @@ class CrashGenError(Exception): def __str__(self): return self.msg - class WorkerThread: def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator, # te: TaskExecutor, @@ -84,7 +85,8 @@ class WorkerThread: self._tid = tid self._tc = tc # type: ThreadCoordinator # self.threadIdent = threading.get_ident() - self._thread = threading.Thread(target=runThread, args=(self,)) + # self._thread = threading.Thread(target=runThread, args=(self,)) + self._thread = threading.Thread(target=self.run) self._stepGate = threading.Event() # Let us have a DB connection of our own @@ -253,7 +255,7 @@ class WorkerThread: class ThreadCoordinator: - WORKER_THREAD_TIMEOUT = 60 # one minute + WORKER_THREAD_TIMEOUT = 180 # one minute def __init__(self, pool: ThreadPool, dbManager: DbManager): self._curStep = -1 # first step is 0 @@ -882,20 +884,15 @@ class MyTDSql: raise return self.affectedRows +class TdeInstance(): + """ + A class to capture the *static* information of a TDengine instance, + including the location of the various files/directories, and basica + configuration. 
+ """ -class DbConnNative(DbConn): - # Class variables - _lock = threading.Lock() - _connInfoDisplayed = False - totalConnections = 0 # Not private - - def __init__(self): - super().__init__() - self._type = self.TYPE_NATIVE - self._conn = None - # self._cursor = None - - def getBuildPath(self): + @classmethod + def _getBuildPath(cls): selfPath = os.path.dirname(os.path.realpath(__file__)) if ("community" in selfPath): projPath = selfPath[:selfPath.find("communit")] @@ -914,10 +911,118 @@ class DbConnNative(DbConn): .format(selfPath, projPath)) return buildPath + def __init__(self, subdir='test'): + self._buildDir = self._getBuildPath() + self._subdir = '/' + subdir # TODO: tolerate "/" + + def __repr__(self): + return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) + def generateCfgFile(self): + # buildPath = self.getBuildPath() + # taosdPath = self._buildPath + "/build/bin/taosd" + + cfgDir = self.getCfgDir() + cfgFile = cfgDir + "/taos.cfg" # TODO: inquire if this is fixed + if os.path.exists(cfgFile): + if os.path.isfile(cfgFile): + logger.warning("Config file exists already, skip creation: {}".format(cfgFile)) + return # cfg file already exists, nothing to do + else: + raise CrashGenError("Invalid config file: {}".format(cfgFile)) + # Now that the cfg file doesn't exist + if os.path.exists(cfgDir): + if not os.path.isdir(cfgDir): + raise CrashGenError("Invalid config dir: {}".format(cfgDir)) + # else: good path + else: + os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p" + # Now we have a good cfg dir + cfgValues = { + 'runDir': self.getRunDir(), + 'ip': '127.0.0.1', # TODO: change to a network addressable ip + 'port': 6030, + } + cfgTemplate = """ +dataDir {runDir}/data +logDir {runDir}/log + +charset UTF-8 + +firstEp {ip}:{port} +fqdn {ip} +serverPort {port} + +# was all 135 below +dDebugFlag 135 +cDebugFlag 135 +rpcDebugFlag 135 +qDebugFlag 135 +# httpDebugFlag 143 +# asyncLog 0 +# tables 10 +maxtablesPerVnode 10 +rpcMaxTime 101 +# cache 2 +keep 36500 +# walLevel 2 +walLevel 1 +# +# maxConnections 100 +""" + cfgContent = cfgTemplate.format_map(cfgValues) + f = open(cfgFile, "w") + f.write(cfgContent) + f.close() + + def rotateLogs(self): + logPath = self.getLogDir() + # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397 + if os.path.exists(logPath): + logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S') + logger.info("Saving old log files to: {}".format(logPathSaved)) + os.rename(logPath, logPathSaved) + # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms + + + def getExecFile(self): # .../taosd + return self._buildDir + "/build/bin/taosd" + + def getRunDir(self): # TODO: rename to "root dir" ?! 
+ return self._buildDir + self._subdir + + def getCfgDir(self): # path, not file + return self.getRunDir() + "/cfg" + + def getLogDir(self): + return self.getRunDir() + "/log" + + def getHostAddr(self): + return "127.0.0.1" + + def getServiceCommand(self): # to start the instance + return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen() + + + +class DbConnNative(DbConn): + # Class variables + _lock = threading.Lock() + _connInfoDisplayed = False + totalConnections = 0 # Not private + + def __init__(self): + super().__init__() + self._type = self.TYPE_NATIVE + self._conn = None + # self._cursor = None + def openByType(self): # Open connection - cfgPath = self.getBuildPath() + "/test/cfg" - hostAddr = "127.0.0.1" + global gContainer + tdeInstance = gContainer.defTdeInstance # set up in ClientManager, type: TdeInstance + # cfgPath = self.getBuildPath() + "/test/cfg" + cfgPath = tdeInstance.getCfgDir() + hostAddr = tdeInstance.getHostAddr() cls = self.__class__ # Get the class, to access class variables with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! @@ -1662,7 +1767,7 @@ class Task(): 0x503, 0x510, # vnode not in ready state 0x14, # db not ready, errno changed - 0x600, + 0x600, # Invalid table ID, why? 1000 # REST catch-all error ]: return True # These are the ALWAYS-ACCEPTABLE ones @@ -1824,7 +1929,7 @@ class ExecutionStats: "FAILED (reason: {})".format( self._failureReason) if self._failed else "SUCCEEDED")) logger.info("| Task Execution Times (success/total):") - execTimesAny = 0 + execTimesAny = 0.001 # avoid div by zero for k, n in self._execTimes.items(): execTimesAny += n[0] errStr = None @@ -2343,7 +2448,9 @@ class MyLoggingAdapter(logging.LoggerAdapter): # return '[%s] %s' % (self.extra['connid'], msg), kwargs -class SvcManager: +class ServiceManager: + PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process + def __init__(self): print("Starting TDengine Service Manager") # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec @@ -2384,10 +2491,8 @@ class SvcManager: self.inSigHandler = True choice = self._doMenu() - if choice == "1": - # TODO: can the sub-process be blocked due to us not reading from - # queue? - self.sigHandlerResume() + if choice == "1": + self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue? elif choice == "2": self.stopTaosService() elif choice == "3": # Restart @@ -2398,20 +2503,20 @@ class SvcManager: self.inSigHandler = False def sigIntHandler(self, signalNumber, frame): - print("SvcManager: INT Signal Handler starting...") + print("ServiceManager: INT Signal Handler starting...") if self.inSigHandler: print("Ignoring repeated SIG_INT...") return self.inSigHandler = True self.stopTaosService() - print("SvcManager: INT Signal Handler returning...") + print("ServiceManager: INT Signal Handler returning...") self.inSigHandler = False def sigHandlerResume(self): - print("Resuming TDengine service manager thread (main thread)...\n\n") + print("Resuming TDengine service manager (main thread)...\n\n") - def _checkServiceManagerThread(self): + def _updateThreadStatus(self): if self.svcMgrThread: # valid svc mgr thread if self.svcMgrThread.isStopped(): # done? self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? 
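
The hunks above introduce the TdeInstance abstraction (build path, per-instance run directory, config-file generation, service command) and re-point DbConnNative at it through the global container. A minimal usage sketch follows; it is illustrative only, the function name is made up, and it assumes TdeInstance and gContainer are visible in the current module:

    import subprocess

    def start_one_taosd():
        inst = TdeInstance(subdir='test')   # run-time files live under <build dir>/test
        inst.generateCfgFile()              # writes <run dir>/cfg/taos.cfg, skipped if it already exists
        inst.rotateLogs()                   # renames <run dir>/log to a timestamped copy
        gContainer.defTdeInstance = inst    # DbConnNative.openByType() reads cfg dir and host from here
        return subprocess.Popen(
            inst.getServiceCommand(),       # [<build dir>/build/bin/taosd, '-c', <cfg dir>]
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

This mirrors what TdeSubProcess.start() and ClientManager.run() do further down in the patch, with the service side generating the config file and the client side only consuming the instance.
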
@@ -2419,14 +2524,13 @@ class SvcManager: def _procIpcAll(self): while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here - if self.isRunning(): + if self.isRunning(): self.svcMgrThread.procIpcBatch() # regular processing, - self._checkServiceManagerThread() + self._updateThreadStatus() elif self.isRetarting(): print("Service restarting...") - time.sleep(0.5) # pause, before next round - print( - "Service Manager Thread (with subprocess) has ended, main thread now exiting...") + time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round + print("Service Manager Thread (with subprocess) ended, main thread exiting...") def startTaosService(self): with self._lock: @@ -2440,7 +2544,6 @@ class SvcManager: time.sleep(2.0) proc.kill() # print("Process: {}".format(proc.name())) - self.svcMgrThread = ServiceManagerThread() # create the object print("Attempting to start TAOS service started, printing out output...") @@ -2491,10 +2594,17 @@ class SvcManager: return self._isRestarting class ServiceManagerThread: + """ + A class representing a dedicated thread which manages the "sub process" + of the TDengine service, interacting with its STDOUT/ERR. + + It takes a TdeInstance parameter at creation time, or create a default + """ MAX_QUEUE_SIZE = 10000 - def __init__(self): + def __init__(self, tInst : TdeInstance = None): self._tdeSubProcess = None # type: TdeSubProcess + self._tInst = tInst or TdeInstance() # Need an instance self._thread = None self._status = None @@ -2521,7 +2631,7 @@ class ServiceManagerThread: self._status = MainExec.STATUS_STARTING - self._tdeSubProcess = TdeSubProcess() + self._tdeSubProcess = TdeSubProcess(self._tInst) self._tdeSubProcess.start() self._ipcQueue = Queue() @@ -2681,8 +2791,19 @@ class ServiceManagerThread: class TdeSubProcess: - def __init__(self): + """ + A class to to represent the actual sub process that is the run-time + of a TDengine instance. + + It takes a TdeInstance object as its parameter, with the rationale being + "a sub process runs an instance". 
+ """ + + def __init__(self, tInst : TdeInstance): self.subProcess = None + if tInst is None: + raise CrashGenError("Empty instance not allowed in TdeSubProcess") + self._tInst = tInst # Default create at ServiceManagerThread def getStdOut(self): return self.subProcess.stdout @@ -2696,50 +2817,39 @@ class TdeSubProcess: def getPid(self): return self.subProcess.pid - def getBuildPath(self): - selfPath = os.path.dirname(os.path.realpath(__file__)) - if ("community" in selfPath): - projPath = selfPath[:selfPath.find("communit")] - else: - projPath = selfPath[:selfPath.find("tests")] + # Repalced by TdeInstance class + # def getBuildPath(self): + # selfPath = os.path.dirname(os.path.realpath(__file__)) + # if ("community" in selfPath): + # projPath = selfPath[:selfPath.find("communit")] + # else: + # projPath = selfPath[:selfPath.find("tests")] - for root, dirs, files in os.walk(projPath): - if ("taosd" in files): - rootRealPath = os.path.dirname(os.path.realpath(root)) - if ("packaging" not in rootRealPath): - buildPath = root[:len(root) - len("/build/bin")] - break - return buildPath + # for root, dirs, files in os.walk(projPath): + # if ("taosd" in files): + # rootRealPath = os.path.dirname(os.path.realpath(root)) + # if ("packaging" not in rootRealPath): + # buildPath = root[:len(root) - len("/build/bin")] + # break + # return buildPath def start(self): ON_POSIX = 'posix' in sys.builtin_module_names - taosdPath = self.getBuildPath() + "/build/bin/taosd" - cfgPath = self.getBuildPath() + "/test/cfg" - - # Delete the log files - logPath = self.getBuildPath() + "/test/log" - # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397 - # filelist = [ f for f in os.listdir(logPath) ] # if f.endswith(".bak") ] - # for f in filelist: - # filePath = os.path.join(logPath, f) - # print("Removing log file: {}".format(filePath)) - # os.remove(filePath) - if os.path.exists(logPath): - logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S') - logger.info("Saving old log files to: {}".format(logPathSaved)) - os.rename(logPath, logPathSaved) - # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms - - svcCmd = [taosdPath, '-c', cfgPath] - # svcCmdSingle = "{} -c {}".format(taosdPath, cfgPath) - # svcCmd = ['vmstat', '1'] + # Sanity check if self.subProcess: # already there raise RuntimeError("Corrupt process state") - # print("Starting service: {}".format(svcCmd)) + # global gContainer + # tInst = gContainer.defTdeInstance = TdeInstance('test3') # creae the instance + self._tInst.generateCfgFile() # service side generates config file, client does not + + self._tInst.rotateLogs() + + print("Starting TDengine instance: {}".format(self._tInst)) self.subProcess = subprocess.Popen( - svcCmd, shell=False, + self._tInst.getServiceCommand(), + shell=False, # svcCmdSingle, shell=True, # capture core dump? 
stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -2898,10 +3008,15 @@ class ClientManager: # self._printLastNumbers() global gConfig + # Prepare Tde Instance + global gContainer + tInst = gContainer.defTdeInstance = TdeInstance() # "subdir to hold the instance" + dbManager = DbManager() # Regular function thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps) self.tc = ThreadCoordinator(thPool, dbManager) + print("Starting client instance to: {}".format(tInst)) self.tc.run() # print("exec stats: {}".format(self.tc.getExecStats())) # print("TC failed = {}".format(self.tc.isFailed())) @@ -2936,9 +3051,6 @@ class ClientManager: # self.tc.getDbManager().cleanUp() # clean up first, so we can show ZERO db connections self.tc.printStats() - - - class MainExec: STATUS_STARTING = 1 STATUS_RUNNING = 2 @@ -2968,7 +3080,7 @@ class MainExec: def runClient(self): global gSvcMgr if gConfig.auto_start_service: - self._svcMgr = SvcManager() + self._svcMgr = ServiceManager() gSvcMgr = self._svcMgr # hack alert self._svcMgr.startTaosService() # we start, don't run @@ -2983,55 +3095,13 @@ class MainExec: def runService(self): global gSvcMgr - self._svcMgr = SvcManager() + self._svcMgr = ServiceManager() gSvcMgr = self._svcMgr # save it in a global variable TODO: hack alert self._svcMgr.run() # run to some end state self._svcMgr = None gSvcMgr = None - def runTemp(self): # for debugging purposes - # # Hack to exercise reading from disk, imcreasing coverage. TODO: fix - # dbc = dbState.getDbConn() - # sTbName = dbState.getFixedSuperTableName() - # dbc.execute("create database if not exists db") - # if not dbState.getState().equals(StateEmpty()): - # dbc.execute("use db") - - # rTables = None - # try: # the super table may not exist - # sql = "select TBNAME from db.{}".format(sTbName) - # logger.info("Finding out tables in super table: {}".format(sql)) - # dbc.query(sql) # TODO: analyze result set later - # logger.info("Fetching result") - # rTables = dbc.getQueryResult() - # logger.info("Result: {}".format(rTables)) - # except taos.error.ProgrammingError as err: - # logger.info("Initial Super table OPS error: {}".format(err)) - - # # sys.exit() - # if ( not rTables == None): - # # print("rTables[0] = {}, type = {}".format(rTables[0], type(rTables[0]))) - # try: - # for rTbName in rTables : # regular tables - # ds = dbState - # logger.info("Inserting into table: {}".format(rTbName[0])) - # sql = "insert into db.{} values ('{}', {});".format( - # rTbName[0], - # ds.getNextTick(), ds.getNextInt()) - # dbc.execute(sql) - # for rTbName in rTables : # regular tables - # dbc.query("select * from db.{}".format(rTbName[0])) # TODO: check success failure - # logger.info("Initial READING operation is successful") - # except taos.error.ProgrammingError as err: - # logger.info("Initial WRITE/READ error: {}".format(err)) - - # Sandbox testing code - # dbc = dbState.getDbConn() - # while True: - # rows = dbc.query("show databases") - # print("Rows: {}, time={}".format(rows, time.time())) - return def main(): @@ -3045,28 +3115,7 @@ def main(): 1. You build TDengine in the top level ./build directory, as described in offical docs 2. 
You run the server there before this script: ./build/bin/taosd -c test/cfg - ''')) - - # parser.add_argument('-a', '--auto-start-service', action='store_true', - # help='Automatically start/stop the TDengine service (default: false)') - # parser.add_argument('-c', '--connector-type', action='store', default='native', type=str, - # help='Connector type to use: native, rest, or mixed (default: 10)') - # parser.add_argument('-d', '--debug', action='store_true', - # help='Turn on DEBUG mode for more logging (default: false)') - # parser.add_argument('-e', '--run-tdengine', action='store_true', - # help='Run TDengine service in foreground (default: false)') - # parser.add_argument('-l', '--larger-data', action='store_true', - # help='Write larger amount of data during write operations (default: false)') - # parser.add_argument('-p', '--per-thread-db-connection', action='store_true', - # help='Use a single shared db connection (default: false)') - # parser.add_argument('-r', '--record-ops', action='store_true', - # help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)') - # parser.add_argument('-s', '--max-steps', action='store', default=1000, type=int, - # help='Maximum number of steps to run (default: 100)') - # parser.add_argument('-t', '--num-threads', action='store', default=5, type=int, - # help='Number of threads to run (default: 10)') - # parser.add_argument('-x', '--continue-on-exception', action='store_true', - # help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)') + ''')) parser.add_argument( '-a', @@ -3171,8 +3220,31 @@ def main(): else: return mExec.runClient() +class Container(): + _propertyList = {'defTdeInstance'} + + def __init__(self): + self._cargo = {} # No cargo at the beginning + + def _verifyValidProperty(self, name): + if not name in self._propertyList: + raise CrashGenError("Invalid container property: {}".format(name)) + + # Called for an attribute, when other mechanisms fail (compare to __getattribute__) + def __getattr__(self, name): + self._verifyValidProperty(name) + return self._cargo[name] # just a simple lookup + + def __setattr__(self, name, value): + if name == '_cargo' : # reserved vars + super().__setattr__(name, value) + return + self._verifyValidProperty(name) + self._cargo[name] = value if __name__ == "__main__": + gContainer = Container() # micky-mouse DI + exitCode = main() # print("Exiting with code: {}".format(exitCode)) - sys.exit(exitCode) + sys.exit(exitCode) \ No newline at end of file From c6a5706f662c3000371fa0a2f827749e0948ccb0 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Wed, 21 Oct 2020 00:02:27 +0000 Subject: [PATCH 02/16] Multi-instance code working for single instance case, ready to refactor crash_gen tool into multiple files --- tests/pytest/crash_gen/crash_gen.py | 182 ++++++++++++++++++---------- 1 file changed, 120 insertions(+), 62 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index b1d79f54c3..3f662fac73 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -1780,8 +1780,8 @@ class Task(): return True elif msg.find("duplicated column names") != -1: # also alter table tag issues return True - elif (gSvcMgr!=None) and gSvcMgr.isRestarting(): - logger.info("Ignoring error when service is restarting: errno = {}, msg = {}".format(errno, msg)) + elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ... 
+ logger.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg)) return True return False # Not an acceptable error @@ -2451,8 +2451,9 @@ class MyLoggingAdapter(logging.LoggerAdapter): class ServiceManager: PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process - def __init__(self): - print("Starting TDengine Service Manager") + def __init__(self, numDnodes = 1): + logger.info("TDengine Service Manager (TSM) created") + self._numDnodes = numDnodes # >1 means we have a cluster # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec # signal.signal(signal.SIGINT, self.sigIntHandler) # signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler! @@ -2460,9 +2461,12 @@ class ServiceManager: self.inSigHandler = False # self._status = MainExec.STATUS_RUNNING # set inside # _startTaosService() - self.svcMgrThread = None # type: ServiceManagerThread + self.svcMgrThreads = [] # type: List[ServiceManagerThread] + for i in range(0, numDnodes): + self.svcMgrThreads.append(ServiceManagerThread(i)) + self._lock = threading.Lock() - self._isRestarting = False + # self._isRestarting = False def _doMenu(self): choice = "" @@ -2494,7 +2498,7 @@ class ServiceManager: if choice == "1": self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue? elif choice == "2": - self.stopTaosService() + self.stopTaosServices() elif choice == "3": # Restart self.restart() else: @@ -2509,33 +2513,70 @@ class ServiceManager: return self.inSigHandler = True - self.stopTaosService() + self.stopTaosServices() print("ServiceManager: INT Signal Handler returning...") self.inSigHandler = False def sigHandlerResume(self): print("Resuming TDengine service manager (main thread)...\n\n") - def _updateThreadStatus(self): - if self.svcMgrThread: # valid svc mgr thread - if self.svcMgrThread.isStopped(): # done? - self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? - self.svcMgrThread = None # no more + # def _updateThreadStatus(self): + # if self.svcMgrThread: # valid svc mgr thread + # if self.svcMgrThread.isStopped(): # done? + # self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? + # self.svcMgrThread = None # no more + + def isActive(self): + """ + Determine if the service/cluster is active at all, i.e. at least + one thread is not "stopped". + """ + for thread in self.svcMgrThreads: + if not thread.isStopped(): + return True + return False + + # def isRestarting(self): + # """ + # Determine if the service/cluster is being "restarted", i.e., at least + # one thread is in "restarting" status + # """ + # for thread in self.svcMgrThreads: + # if thread.isRestarting(): + # return True + # return False + + def isStable(self): + """ + Determine if the service/cluster is "stable", i.e. all of the + threads are in "stable" status. 
+ """ + for thread in self.svcMgrThreads: + if not thread.isStable(): + return False + return True def _procIpcAll(self): - while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here - if self.isRunning(): - self.svcMgrThread.procIpcBatch() # regular processing, - self._updateThreadStatus() - elif self.isRetarting(): - print("Service restarting...") + while self.isActive(): + for thread in self.svcMgrThreads: # all thread objects should always be valid + # while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here + if thread.isRunning(): + thread.procIpcBatch() # regular processing, + if thread.isStopped(): + thread.procIpcBatch() # one last time? + # self._updateThreadStatus() + elif thread.isRetarting(): + print("Service restarting...") + # else this thread is stopped + time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round + # raise CrashGenError("dummy") print("Service Manager Thread (with subprocess) ended, main thread exiting...") - def startTaosService(self): + def startTaosServices(self): with self._lock: - if self.svcMgrThread: - raise RuntimeError("Cannot start TAOS service when one may already be running") + if self.isActive(): + raise RuntimeError("Cannot start TAOS service(s) when one/some may already be running") # Find if there's already a taosd service, and then kill it for proc in psutil.process_iter(): @@ -2545,53 +2586,45 @@ class ServiceManager: proc.kill() # print("Process: {}".format(proc.name())) - self.svcMgrThread = ServiceManagerThread() # create the object - print("Attempting to start TAOS service started, printing out output...") - self.svcMgrThread.start() - self.svcMgrThread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines - print("TAOS service started") + # self.svcMgrThread = ServiceManagerThread() # create the object + for thread in self.svcMgrThreads: + thread.start() + thread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines - def stopTaosService(self, outputLines=20): + def stopTaosServices(self): with self._lock: - if not self.isRunning(): - logger.warning("Cannot stop TAOS service, not running") + if not self.isActive(): + logger.warning("Cannot stop TAOS service(s), already not active") return - print("Terminating Service Manager Thread (SMT) execution...") - self.svcMgrThread.stop() - if self.svcMgrThread.isStopped(): - self.svcMgrThread.procIpcBatch(outputLines) # one last time - self.svcMgrThread = None - print("End of TDengine Service Output") - print("----- TDengine Service (managed by SMT) is now terminated -----\n") - else: - print("WARNING: SMT did not terminate as expected") - + for thread in self.svcMgrThreads: + thread.stop() + def run(self): - self.startTaosService() + self.startTaosServices() self._procIpcAll() # pump/process all the messages, may encounter SIG + restart - if self.isRunning(): # if sig handler hasn't destroyed it by now - self.stopTaosService() # should have started already + if self.isActive(): # if sig handler hasn't destroyed it by now + self.stopTaosServices() # should have started already def restart(self): - if self._isRestarting: - logger.warning("Cannot restart service when it's already restarting") + if not self.isStable(): + logger.warning("Cannot restart service/cluster, when not stable") return - self._isRestarting = True - if self.isRunning(): - self.stopTaosService() + # self._isRestarting = True + if self.isActive(): + self.stopTaosServices() else: - 
logger.warning("Service not running when restart requested") + logger.warning("Service not active when restart requested") self.startTaosService() - self._isRestarting = False + # self._isRestarting = False - def isRunning(self): - return self.svcMgrThread != None + # def isRunning(self): + # return self.svcMgrThread != None - def isRestarting(self): - return self._isRestarting + # def isRestarting(self): + # return self._isRestarting class ServiceManagerThread: """ @@ -2602,15 +2635,26 @@ class ServiceManagerThread: """ MAX_QUEUE_SIZE = 10000 - def __init__(self, tInst : TdeInstance = None): + def __init__(self, tInstNum = 0, tInst : TdeInstance = None): + # Set the sub process self._tdeSubProcess = None # type: TdeSubProcess - self._tInst = tInst or TdeInstance() # Need an instance - self._thread = None - self._status = None + + # Arrange the TDengine instance + self._tInstNum = tInstNum # instance serial number in cluster, ZERO based + self._tInst = tInst or TdeInstance() # Need an instance + + self._thread = None # The actual thread, # type: threading.Thread + self._status = MainExec.STATUS_STOPPED # The status of the underlying service, actually. + + def __repr__(self): + return "[SvcMgrThread: tInstNum={}]".format(self._tInstNum) def getStatus(self): return self._status + def isStarting(self): + return self._status == MainExec.STATUS_STARTING + def isRunning(self): # return self._thread and self._thread.is_alive() return self._status == MainExec.STATUS_RUNNING @@ -2621,6 +2665,9 @@ class ServiceManagerThread: def isStopped(self): return self._status == MainExec.STATUS_STOPPED + def isStable(self): + return self.isRunning() or self.isStopped() + # Start the thread (with sub process), and wait for the sub service # to become fully operational def start(self): @@ -2629,8 +2676,9 @@ class ServiceManagerThread: if self._tdeSubProcess: raise RuntimeError("TDengine sub process already created/running") - self._status = MainExec.STATUS_STARTING + logger.info("Attempting to start TAOS service: {}".format(self)) + self._status = MainExec.STATUS_STARTING self._tdeSubProcess = TdeSubProcess(self._tInst) self._tdeSubProcess.start() @@ -2654,10 +2702,11 @@ class ServiceManagerThread: print("_zz_", end="", flush=True) if self._status == MainExec.STATUS_RUNNING: logger.info("[] TDengine service READY to process requests") + logger.info("[] TAOS service started: {}".format(self)) return # now we've started - # TODO: handle this better? + # TODO: handle failure-to-start better? self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output - raise RuntimeError("TDengine service did not start successfully") + raise RuntimeError("TDengine service did not start successfully: {}".format(self)) def stop(self): # can be called from both main thread or signal handler @@ -2687,6 +2736,15 @@ class ServiceManagerThread: self._tdeSubProcess = None # not running any more self.join() # stop the thread, change the status, etc. 
+ # Check if it's really stopped + outputLines = 20 # for last output + if self.isStopped(): + self.procIpcBatch(outputLines) # one last time + print("End of TDengine Service Output: {}".format(self)) + print("----- TDengine Service (managed by SMT) is now terminated -----\n") + else: + print("WARNING: SMT did not terminate as expected: {}".format(self)) + def join(self): # TODO: sanity check if not self.isStopping(): @@ -2770,7 +2828,7 @@ class ServiceManagerThread: if line.find(self.TD_READY_MSG) != -1: # found logger.info("Waiting for the service to become FULLY READY") time.sleep(1.0) # wait for the server to truly start. TODO: remove this - logger.info("Service is now FULLY READY") + logger.info("Service instance #{} is now FULLY READY".format(self._tInstNum)) self._status = MainExec.STATUS_RUNNING # Trim the queue if necessary: TODO: try this 1 out of 10 times From e011827fd4ea39b1826babeed74d318c5f78d64d Mon Sep 17 00:00:00 2001 From: Steven Li Date: Wed, 21 Oct 2020 00:18:49 +0000 Subject: [PATCH 03/16] Finished refactoring crash_gen tool into a Python modular structure --- tests/pytest/crash_gen.sh | 7 +- tests/pytest/crash_gen/crash_gen.py | 221 ++++++++++++++-------------- tests/pytest/crash_gen_bootstrap.py | 23 +++ 3 files changed, 135 insertions(+), 116 deletions(-) create mode 100644 tests/pytest/crash_gen_bootstrap.py diff --git a/tests/pytest/crash_gen.sh b/tests/pytest/crash_gen.sh index 4ffe35fc3c..9cca23ac79 100755 --- a/tests/pytest/crash_gen.sh +++ b/tests/pytest/crash_gen.sh @@ -54,6 +54,7 @@ export PYTHONPATH=$(pwd)/../../src/connector/python/linux/python3:$(pwd) export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIB_DIR # Now we are all let, and let's see if we can find a crash. Note we pass all params +CRASH_GEN_EXEC=crash_gen_bootstrap.py if [[ $1 == '--valgrind' ]]; then shift export PYTHONMALLOC=malloc @@ -66,14 +67,14 @@ if [[ $1 == '--valgrind' ]]; then --leak-check=yes \ --suppressions=crash_gen/valgrind_taos.supp \ $PYTHON_EXEC \ - ./crash_gen/crash_gen.py $@ > $VALGRIND_OUT 2> $VALGRIND_ERR + $CRASH_GEN_EXEC $@ > $VALGRIND_OUT 2> $VALGRIND_ERR elif [[ $1 == '--helgrind' ]]; then shift valgrind \ --tool=helgrind \ $PYTHON_EXEC \ - ./crash_gen/crash_gen.py $@ + $CRASH_GEN_EXEC $@ else - $PYTHON_EXEC ./crash_gen/crash_gen.py $@ + $PYTHON_EXEC $CRASH_GEN_EXEC $@ fi diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 3f662fac73..dbd4eab9e7 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -3160,123 +3160,124 @@ class MainExec: self._svcMgr = None gSvcMgr = None + def init(self): # TODO: refactor + global gContainer + gContainer = Container() # micky-mouse DI + # Super cool Python argument library: + # https://docs.python.org/3/library/argparse.html + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent('''\ + TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below) + --------------------------------------------------------------------- + 1. You build TDengine in the top level ./build directory, as described in offical docs + 2. 
You run the server there before this script: ./build/bin/taosd -c test/cfg -def main(): - # Super cool Python argument library: - # https://docs.python.org/3/library/argparse.html - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.dedent('''\ - TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below) - --------------------------------------------------------------------- - 1. You build TDengine in the top level ./build directory, as described in offical docs - 2. You run the server there before this script: ./build/bin/taosd -c test/cfg + ''')) - ''')) + parser.add_argument( + '-a', + '--auto-start-service', + action='store_true', + help='Automatically start/stop the TDengine service (default: false)') + parser.add_argument( + '-b', + '--max-dbs', + action='store', + default=0, + type=int, + help='Maximum number of DBs to keep, set to disable dropping DB. (default: 0)') + parser.add_argument( + '-c', + '--connector-type', + action='store', + default='native', + type=str, + help='Connector type to use: native, rest, or mixed (default: 10)') + parser.add_argument( + '-d', + '--debug', + action='store_true', + help='Turn on DEBUG mode for more logging (default: false)') + parser.add_argument( + '-e', + '--run-tdengine', + action='store_true', + help='Run TDengine service in foreground (default: false)') + parser.add_argument( + '-i', + '--max-replicas', + action='store', + default=1, + type=int, + help='Maximum number of replicas to use, when testing against clusters. (default: 1)') + parser.add_argument( + '-l', + '--larger-data', + action='store_true', + help='Write larger amount of data during write operations (default: false)') + parser.add_argument( + '-p', + '--per-thread-db-connection', + action='store_true', + help='Use a single shared db connection (default: false)') + parser.add_argument( + '-r', + '--record-ops', + action='store_true', + help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)') + parser.add_argument( + '-s', + '--max-steps', + action='store', + default=1000, + type=int, + help='Maximum number of steps to run (default: 100)') + parser.add_argument( + '-t', + '--num-threads', + action='store', + default=5, + type=int, + help='Number of threads to run (default: 10)') + parser.add_argument( + '-v', + '--verify-data', + action='store_true', + help='Verify data written in a number of places by reading back (default: false)') + parser.add_argument( + '-x', + '--continue-on-exception', + action='store_true', + help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)') - parser.add_argument( - '-a', - '--auto-start-service', - action='store_true', - help='Automatically start/stop the TDengine service (default: false)') - parser.add_argument( - '-b', - '--max-dbs', - action='store', - default=0, - type=int, - help='Maximum number of DBs to keep, set to disable dropping DB. 
(default: 0)') - parser.add_argument( - '-c', - '--connector-type', - action='store', - default='native', - type=str, - help='Connector type to use: native, rest, or mixed (default: 10)') - parser.add_argument( - '-d', - '--debug', - action='store_true', - help='Turn on DEBUG mode for more logging (default: false)') - parser.add_argument( - '-e', - '--run-tdengine', - action='store_true', - help='Run TDengine service in foreground (default: false)') - parser.add_argument( - '-i', - '--max-replicas', - action='store', - default=1, - type=int, - help='Maximum number of replicas to use, when testing against clusters. (default: 1)') - parser.add_argument( - '-l', - '--larger-data', - action='store_true', - help='Write larger amount of data during write operations (default: false)') - parser.add_argument( - '-p', - '--per-thread-db-connection', - action='store_true', - help='Use a single shared db connection (default: false)') - parser.add_argument( - '-r', - '--record-ops', - action='store_true', - help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)') - parser.add_argument( - '-s', - '--max-steps', - action='store', - default=1000, - type=int, - help='Maximum number of steps to run (default: 100)') - parser.add_argument( - '-t', - '--num-threads', - action='store', - default=5, - type=int, - help='Number of threads to run (default: 10)') - parser.add_argument( - '-v', - '--verify-data', - action='store_true', - help='Verify data written in a number of places by reading back (default: false)') - parser.add_argument( - '-x', - '--continue-on-exception', - action='store_true', - help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)') + global gConfig + gConfig = parser.parse_args() - global gConfig - gConfig = parser.parse_args() + # Logging Stuff + global logger + _logger = logging.getLogger('CrashGen') # real logger + _logger.addFilter(LoggingFilter()) + ch = logging.StreamHandler() + _logger.addHandler(ch) - # Logging Stuff - global logger - _logger = logging.getLogger('CrashGen') # real logger - _logger.addFilter(LoggingFilter()) - ch = logging.StreamHandler() - _logger.addHandler(ch) + # Logging adapter, to be used as a logger + logger = MyLoggingAdapter(_logger, []) - # Logging adapter, to be used as a logger - logger = MyLoggingAdapter(_logger, []) + if (gConfig.debug): + logger.setLevel(logging.DEBUG) # default seems to be INFO + else: + logger.setLevel(logging.INFO) - if (gConfig.debug): - logger.setLevel(logging.DEBUG) # default seems to be INFO - else: - logger.setLevel(logging.INFO) + Dice.seed(0) # initial seeding of dice - Dice.seed(0) # initial seeding of dice + def run(self): + if gConfig.run_tdengine: # run server + self.runService() + else: + return self.runClient() - # Run server or client - mExec = MainExec() - if gConfig.run_tdengine: # run server - mExec.runService() - else: - return mExec.runClient() class Container(): _propertyList = {'defTdeInstance'} @@ -3300,9 +3301,3 @@ class Container(): self._verifyValidProperty(name) self._cargo[name] = value -if __name__ == "__main__": - gContainer = Container() # micky-mouse DI - - exitCode = main() - # print("Exiting with code: {}".format(exitCode)) - sys.exit(exitCode) \ No newline at end of file diff --git a/tests/pytest/crash_gen_bootstrap.py b/tests/pytest/crash_gen_bootstrap.py new file mode 100644 index 0000000000..a3417d21a8 --- /dev/null +++ b/tests/pytest/crash_gen_bootstrap.py @@ -0,0 +1,23 @@ +# 
-----!/usr/bin/python3.7 +################################################################### +# Copyright (c) 2016 by TAOS Technologies, Inc. +# All rights reserved. +# +# This file is proprietary and confidential to TAOS Technologies. +# No part of this file may be reproduced, stored, transmitted, +# disclosed or used in any form or by any means other than as +# expressly provided by the written permission from Jianhui Tao +# +################################################################### + +import sys +from crash_gen.crash_gen import MainExec + +if __name__ == "__main__": + + mExec = MainExec() + mExec.init() + exitCode = mExec.run() + + print("Exiting with code: {}".format(exitCode)) + sys.exit(exitCode) From dc72a1a60c7c206f1e2d03a14fea3d756997a8de Mon Sep 17 00:00:00 2001 From: Steven Li Date: Wed, 21 Oct 2020 07:54:47 +0000 Subject: [PATCH 04/16] Split crash_gen tool into different functional files/modules --- tests/pytest/crash_gen/crash_gen.py | 914 +++------------------- tests/pytest/crash_gen/misc.py | 133 ++++ tests/pytest/crash_gen/service_manager.py | 633 +++++++++++++++ 3 files changed, 873 insertions(+), 807 deletions(-) create mode 100644 tests/pytest/crash_gen/misc.py create mode 100644 tests/pytest/crash_gen/service_manager.py diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index dbd4eab9e7..f369f5a3e8 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -19,17 +19,15 @@ from util.sql import * from util.cases import * from util.dnodes import * from util.log import * -from queue import Queue, Empty -from typing import IO from typing import Set from typing import Dict from typing import List from requests.auth import HTTPBasicAuth import textwrap -import datetime -import logging import time +import datetime import random +import logging import threading import requests import copy @@ -38,19 +36,14 @@ import getopt import sys import os -import io import signal import traceback import resource from guppy import hpy import gc -import subprocess -try: - import psutil -except: - print("Psutil module needed, please install: sudo pip3 install psutil") - sys.exit(-1) +from .service_manager import ServiceManager, TdeInstance +from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress # Require Python 3 if sys.version_info[0] < 3: @@ -62,19 +55,12 @@ if sys.version_info[0] < 3: # ConfigNameSpace = argparse.Namespace gConfig: argparse.Namespace gSvcMgr: ServiceManager # TODO: refactor this hack, use dep injection -logger: logging.Logger +# logger: logging.Logger gContainer: Container # def runThread(wt: WorkerThread): # wt.run() -class CrashGenError(Exception): - def __init__(self, msg=None, errno=None): - self.msg = msg - self.errno = errno - - def __str__(self): - return self.msg class WorkerThread: def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator, @@ -107,10 +93,10 @@ class WorkerThread: # self._dbInUse = False # if "use db" was executed already def logDebug(self, msg): - logger.debug(" TRD[{}] {}".format(self._tid, msg)) + Logging.debug(" TRD[{}] {}".format(self._tid, msg)) def logInfo(self, msg): - logger.info(" TRD[{}] {}".format(self._tid, msg)) + Logging.info(" TRD[{}] {}".format(self._tid, msg)) # def dbInUse(self): # return self._dbInUse @@ -129,10 +115,10 @@ class WorkerThread: def run(self): # initialization after thread starts, in the thread context # self.isSleeping = False - logger.info("Starting to run thread: {}".format(self._tid)) + 
Logging.info("Starting to run thread: {}".format(self._tid)) if (gConfig.per_thread_db_connection): # type: ignore - logger.debug("Worker thread openning database connection") + Logging.debug("Worker thread openning database connection") self._dbConn.open() self._doTaskLoop() @@ -142,7 +128,7 @@ class WorkerThread: if self._dbConn.isOpen: #sometimes it is not open self._dbConn.close() else: - logger.warning("Cleaning up worker thread, dbConn already closed") + Logging.warning("Cleaning up worker thread, dbConn already closed") def _doTaskLoop(self): # while self._curStep < self._pool.maxSteps: @@ -153,15 +139,15 @@ class WorkerThread: tc.crossStepBarrier() # shared barrier first, INCLUDING the last one except threading.BrokenBarrierError as err: # main thread timed out print("_bto", end="") - logger.debug("[TRD] Worker thread exiting due to main thread barrier time-out") + Logging.debug("[TRD] Worker thread exiting due to main thread barrier time-out") break - logger.debug("[TRD] Worker thread [{}] exited barrier...".format(self._tid)) + Logging.debug("[TRD] Worker thread [{}] exited barrier...".format(self._tid)) self.crossStepGate() # then per-thread gate, after being tapped - logger.debug("[TRD] Worker thread [{}] exited step gate...".format(self._tid)) + Logging.debug("[TRD] Worker thread [{}] exited step gate...".format(self._tid)) if not self._tc.isRunning(): print("_wts", end="") - logger.debug("[TRD] Thread Coordinator not running any more, worker thread now stopping...") + Logging.debug("[TRD] Thread Coordinator not running any more, worker thread now stopping...") break # Before we fetch the task and run it, let's ensure we properly "use" the database (not needed any more) @@ -180,15 +166,15 @@ class WorkerThread: raise # Fetch a task from the Thread Coordinator - logger.debug( "[TRD] Worker thread [{}] about to fetch task".format(self._tid)) + Logging.debug( "[TRD] Worker thread [{}] about to fetch task".format(self._tid)) task = tc.fetchTask() # Execute such a task - logger.debug("[TRD] Worker thread [{}] about to execute task: {}".format( + Logging.debug("[TRD] Worker thread [{}] about to execute task: {}".format( self._tid, task.__class__.__name__)) task.execute(self) tc.saveExecutedTask(task) - logger.debug("[TRD] Worker thread [{}] finished executing task".format(self._tid)) + Logging.debug("[TRD] Worker thread [{}] finished executing task".format(self._tid)) # self._dbInUse = False # there may be changes between steps # print("_wtd", end=None) # worker thread died @@ -211,7 +197,7 @@ class WorkerThread: self.verifyThreadSelf() # only allowed by ourselves # Wait again at the "gate", waiting to be "tapped" - logger.debug( + Logging.debug( "[TRD] Worker thread {} about to cross the step gate".format( self._tid)) self._stepGate.wait() @@ -224,7 +210,7 @@ class WorkerThread: self.verifyThreadMain() # only allowed for main thread if self._thread.is_alive(): - logger.debug("[TRD] Tapping worker thread {}".format(self._tid)) + Logging.debug("[TRD] Tapping worker thread {}".format(self._tid)) self._stepGate.set() # wake up! 
time.sleep(0) # let the released thread run a bit else: @@ -269,7 +255,7 @@ class ThreadCoordinator: self._stepBarrier = threading.Barrier( self._pool.numThreads + 1) # one barrier for all threads self._execStats = ExecutionStats() - self._runStatus = MainExec.STATUS_RUNNING + self._runStatus = Status.STATUS_RUNNING self._initDbs() def getTaskExecutor(self): @@ -282,14 +268,14 @@ class ThreadCoordinator: self._stepBarrier.wait(timeout) def requestToStop(self): - self._runStatus = MainExec.STATUS_STOPPING + self._runStatus = Status.STATUS_STOPPING self._execStats.registerFailure("User Interruption") def _runShouldEnd(self, transitionFailed, hasAbortedTask, workerTimeout): maxSteps = gConfig.max_steps # type: ignore if self._curStep >= (maxSteps - 1): # maxStep==10, last curStep should be 9 return True - if self._runStatus != MainExec.STATUS_RUNNING: + if self._runStatus != Status.STATUS_RUNNING: return True if transitionFailed: return True @@ -310,7 +296,7 @@ class ThreadCoordinator: def _releaseAllWorkerThreads(self, transitionFailed): self._curStep += 1 # we are about to get into next step. TODO: race condition here! # Now not all threads had time to go to sleep - logger.debug( + Logging.debug( "--\r\n\n--> Step {} starts with main thread waking up".format(self._curStep)) # A new TE for the new step @@ -318,7 +304,7 @@ class ThreadCoordinator: if not transitionFailed: # only if not failed self._te = TaskExecutor(self._curStep) - logger.debug("[TRD] Main thread waking up at step {}, tapping worker threads".format( + Logging.debug("[TRD] Main thread waking up at step {}, tapping worker threads".format( self._curStep)) # Now not all threads had time to go to sleep # Worker threads will wake up at this point, and each execute it's own task self.tapAllThreads() # release all worker thread from their "gates" @@ -327,10 +313,10 @@ class ThreadCoordinator: # Now main thread (that's us) is ready to enter a step # let other threads go past the pool barrier, but wait at the # thread gate - logger.debug("[TRD] Main thread about to cross the barrier") + Logging.debug("[TRD] Main thread about to cross the barrier") self.crossStepBarrier(timeout=self.WORKER_THREAD_TIMEOUT) self._stepBarrier.reset() # Other worker threads should now be at the "gate" - logger.debug("[TRD] Main thread finished crossing the barrier") + Logging.debug("[TRD] Main thread finished crossing the barrier") def _doTransition(self): transitionFailed = False @@ -338,11 +324,11 @@ class ThreadCoordinator: for x in self._dbs: db = x # type: Database sm = db.getStateMachine() - logger.debug("[STT] starting transitions for DB: {}".format(db.getName())) + Logging.debug("[STT] starting transitions for DB: {}".format(db.getName())) # at end of step, transiton the DB state tasksForDb = db.filterTasks(self._executedTasks) sm.transition(tasksForDb, self.getDbManager().getDbConn()) - logger.debug("[STT] transition ended for DB: {}".format(db.getName())) + Logging.debug("[STT] transition ended for DB: {}".format(db.getName())) # Due to limitation (or maybe not) of the TD Python library, # we cannot share connections across threads @@ -350,14 +336,14 @@ class ThreadCoordinator: # Moving below to task loop # if sm.hasDatabase(): # for t in self._pool.threadList: - # logger.debug("[DB] use db for all worker threads") + # Logging.debug("[DB] use db for all worker threads") # t.useDb() # t.execSql("use db") # main thread executing "use # db" on behalf of every worker thread except taos.error.ProgrammingError as err: if (err.msg == 'network 
unavailable'): # broken DB connection - logger.info("DB connection broken, execution failed") + Logging.info("DB connection broken, execution failed") traceback.print_stack() transitionFailed = True self._te = None # Not running any more @@ -370,7 +356,7 @@ class ThreadCoordinator: self.resetExecutedTasks() # clear the tasks after we are done # Get ready for next step - logger.debug("<-- Step {} finished, trasition failed = {}".format(self._curStep, transitionFailed)) + Logging.debug("<-- Step {} finished, trasition failed = {}".format(self._curStep, transitionFailed)) return transitionFailed def run(self): @@ -384,8 +370,9 @@ class ThreadCoordinator: hasAbortedTask = False workerTimeout = False while not self._runShouldEnd(transitionFailed, hasAbortedTask, workerTimeout): - if not gConfig.debug: # print this only if we are not in debug mode - print(".", end="", flush=True) + if not gConfig.debug: # print this only if we are not in debug mode + Progress.emit(Progress.STEP_BOUNDARY) + # print(".", end="", flush=True) # if (self._curStep % 2) == 0: # print memory usage once every 10 steps # memUsage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss # print("[m:{}]".format(memUsage), end="", flush=True) # print memory usage @@ -397,8 +384,9 @@ class ThreadCoordinator: try: self._syncAtBarrier() # For now just cross the barrier + Progress.emit(Progress.END_THREAD_STEP) except threading.BrokenBarrierError as err: - logger.info("Main loop aborted, caused by worker thread time-out") + Logging.info("Main loop aborted, caused by worker thread time-out") self._execStats.registerFailure("Aborted due to worker thread timeout") print("\n\nWorker Thread time-out detected, important thread info:") ts = ThreadStacks() @@ -411,7 +399,7 @@ class ThreadCoordinator: # threads are QUIET. 
hasAbortedTask = self._hasAbortedTask() # from previous step if hasAbortedTask: - logger.info("Aborted task encountered, exiting test program") + Logging.info("Aborted task encountered, exiting test program") self._execStats.registerFailure("Aborted Task Encountered") break # do transition only if tasks are error free @@ -422,29 +410,30 @@ class ThreadCoordinator: transitionFailed = True errno2 = Helper.convertErrno(err.errno) # correct error scheme errMsg = "Transition failed: errno=0x{:X}, msg: {}".format(errno2, err) - logger.info(errMsg) + Logging.info(errMsg) traceback.print_exc() self._execStats.registerFailure(errMsg) # Then we move on to the next step + Progress.emit(Progress.BEGIN_THREAD_STEP) self._releaseAllWorkerThreads(transitionFailed) if hasAbortedTask or transitionFailed : # abnormal ending, workers waiting at "gate" - logger.debug("Abnormal ending of main thraed") + Logging.debug("Abnormal ending of main thraed") elif workerTimeout: - logger.debug("Abnormal ending of main thread, due to worker timeout") + Logging.debug("Abnormal ending of main thread, due to worker timeout") else: # regular ending, workers waiting at "barrier" - logger.debug("Regular ending, main thread waiting for all worker threads to stop...") + Logging.debug("Regular ending, main thread waiting for all worker threads to stop...") self._syncAtBarrier() self._te = None # No more executor, time to end - logger.debug("Main thread tapping all threads one last time...") + Logging.debug("Main thread tapping all threads one last time...") self.tapAllThreads() # Let the threads run one last time - logger.debug("\r\n\n--> Main thread ready to finish up...") - logger.debug("Main thread joining all threads") + Logging.debug("\r\n\n--> Main thread ready to finish up...") + Logging.debug("Main thread joining all threads") self._pool.joinAll() # Get all threads to finish - logger.info("\nAll worker threads finished") + Logging.info("\nAll worker threads finished") self._execStats.endExec() def cleanup(self): # free resources @@ -476,7 +465,7 @@ class ThreadCoordinator: wakeSeq.append(i) else: wakeSeq.insert(0, i) - logger.debug( + Logging.debug( "[TRD] Main thread waking up worker threads: {}".format( str(wakeSeq))) # TODO: set dice seed to a deterministic value @@ -524,13 +513,6 @@ class ThreadCoordinator: with self._lock: self._executedTasks.append(task) -# We define a class to run a number of threads in locking steps. - -class Helper: - @classmethod - def convertErrno(cls, errno): - return errno if (errno > 0) else 0x80000000 + errno - class ThreadPool: def __init__(self, numThreads, maxSteps): self.numThreads = numThreads @@ -548,7 +530,7 @@ class ThreadPool: def joinAll(self): for workerThread in self.threadList: - logger.debug("Joining thread...") + Logging.debug("Joining thread...") workerThread._thread.join() def cleanup(self): @@ -605,7 +587,7 @@ class LinearQueue(): def allocate(self, i): with self._lock: - # logger.debug("LQ allocating item {}".format(i)) + # Logging.debug("LQ allocating item {}".format(i)) if (i in self.inUse): raise RuntimeError( "Cannot re-use same index in queue: {}".format(i)) @@ -613,7 +595,7 @@ class LinearQueue(): def release(self, i): with self._lock: - # logger.debug("LQ releasing item {}".format(i)) + # Logging.debug("LQ releasing item {}".format(i)) self.inUse.remove(i) # KeyError possible, TODO: why? 
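
Stepping back from the individual hunks, the coordination protocol that ThreadCoordinator and WorkerThread implement above (crossStepBarrier, tapAllThreads, crossStepGate) reduces to one shared Barrier for the step boundary plus one Event per worker as its step gate. The following self-contained sketch, with made-up names and counts, shows the same handshake in isolation:

    import threading

    NUM_WORKERS = 3
    barrier = threading.Barrier(NUM_WORKERS + 1)             # all workers plus the coordinator
    gates = [threading.Event() for _ in range(NUM_WORKERS)]  # one step gate per worker
    keep_running = True

    def worker(i):
        while True:
            barrier.wait(timeout=30)   # end of step: workers and coordinator meet here
            gates[i].wait()            # per-thread gate: sleep until tapped for the next step
            gates[i].clear()
            if not keep_running:
                break
            print("worker {} runs one task".format(i))

    workers = [threading.Thread(target=worker, args=(i,)) for i in range(NUM_WORKERS)]
    for t in workers:
        t.start()

    for step in range(5):              # the coordinator drives a fixed number of steps
        barrier.wait(timeout=30)       # cross the step boundary together with the workers
        for g in gates:
            g.set()                    # "tap" each worker into the new step

    barrier.wait(timeout=30)           # wait for the last step to finish
    keep_running = False
    for g in gates:
        g.set()                        # one final tap so the workers can observe the stop flag
    for t in workers:
        t.join()

The timeout on the barrier is what lets the real tool detect a hung worker (WORKER_THREAD_TIMEOUT) and abort the run via BrokenBarrierError instead of blocking forever.
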
def size(self): @@ -673,9 +655,12 @@ class DbConn: # below implemented by child classes self.openByType() - logger.debug("[DB] data connection opened, type = {}".format(self._type)) + Logging.debug("[DB] data connection opened, type = {}".format(self._type)) self.isOpen = True + def close(self): + raise RuntimeError("Unexpected execution, should be overriden") + def queryScalar(self, sql) -> int: return self._queryAny(sql) @@ -755,7 +740,7 @@ class DbConnRest(DbConn): if (not self.isOpen): raise RuntimeError("Cannot clean up database until connection is open") # Do nothing for REST - logger.debug("[DB] REST Database connection closed") + Logging.debug("[DB] REST Database connection closed") self.isOpen = False def _doSql(self, sql): @@ -793,9 +778,9 @@ class DbConnRest(DbConn): if (not self.isOpen): raise RuntimeError( "Cannot execute database commands until connection is open") - logger.debug("[SQL-REST] Executing SQL: {}".format(sql)) + Logging.debug("[SQL-REST] Executing SQL: {}".format(sql)) nRows = self._doSql(sql) - logger.debug( + Logging.debug( "[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql)) return nRows @@ -884,127 +869,6 @@ class MyTDSql: raise return self.affectedRows -class TdeInstance(): - """ - A class to capture the *static* information of a TDengine instance, - including the location of the various files/directories, and basica - configuration. - """ - - @classmethod - def _getBuildPath(cls): - selfPath = os.path.dirname(os.path.realpath(__file__)) - if ("community" in selfPath): - projPath = selfPath[:selfPath.find("communit")] - else: - projPath = selfPath[:selfPath.find("tests")] - - buildPath = None - for root, dirs, files in os.walk(projPath): - if ("taosd" in files): - rootRealPath = os.path.dirname(os.path.realpath(root)) - if ("packaging" not in rootRealPath): - buildPath = root[:len(root) - len("/build/bin")] - break - if buildPath == None: - raise RuntimeError("Failed to determine buildPath, selfPath={}, projPath={}" - .format(selfPath, projPath)) - return buildPath - - def __init__(self, subdir='test'): - self._buildDir = self._getBuildPath() - self._subdir = '/' + subdir # TODO: tolerate "/" - - def __repr__(self): - return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) - - def generateCfgFile(self): - # buildPath = self.getBuildPath() - # taosdPath = self._buildPath + "/build/bin/taosd" - - cfgDir = self.getCfgDir() - cfgFile = cfgDir + "/taos.cfg" # TODO: inquire if this is fixed - if os.path.exists(cfgFile): - if os.path.isfile(cfgFile): - logger.warning("Config file exists already, skip creation: {}".format(cfgFile)) - return # cfg file already exists, nothing to do - else: - raise CrashGenError("Invalid config file: {}".format(cfgFile)) - # Now that the cfg file doesn't exist - if os.path.exists(cfgDir): - if not os.path.isdir(cfgDir): - raise CrashGenError("Invalid config dir: {}".format(cfgDir)) - # else: good path - else: - os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p" - # Now we have a good cfg dir - cfgValues = { - 'runDir': self.getRunDir(), - 'ip': '127.0.0.1', # TODO: change to a network addressable ip - 'port': 6030, - } - cfgTemplate = """ -dataDir {runDir}/data -logDir {runDir}/log - -charset UTF-8 - -firstEp {ip}:{port} -fqdn {ip} -serverPort {port} - -# was all 135 below -dDebugFlag 135 -cDebugFlag 135 -rpcDebugFlag 135 -qDebugFlag 135 -# httpDebugFlag 143 -# asyncLog 0 -# tables 10 -maxtablesPerVnode 10 -rpcMaxTime 101 -# cache 2 -keep 36500 -# walLevel 2 -walLevel 1 -# -# maxConnections 100 
-""" - cfgContent = cfgTemplate.format_map(cfgValues) - f = open(cfgFile, "w") - f.write(cfgContent) - f.close() - - def rotateLogs(self): - logPath = self.getLogDir() - # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397 - if os.path.exists(logPath): - logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S') - logger.info("Saving old log files to: {}".format(logPathSaved)) - os.rename(logPath, logPathSaved) - # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms - - - def getExecFile(self): # .../taosd - return self._buildDir + "/build/bin/taosd" - - def getRunDir(self): # TODO: rename to "root dir" ?! - return self._buildDir + self._subdir - - def getCfgDir(self): # path, not file - return self.getRunDir() + "/cfg" - - def getLogDir(self): - return self.getRunDir() + "/log" - - def getHostAddr(self): - return "127.0.0.1" - - def getServiceCommand(self): # to start the instance - return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen() - - - class DbConnNative(DbConn): # Class variables _lock = threading.Lock() @@ -1028,7 +892,7 @@ class DbConnNative(DbConn): with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! if not cls._connInfoDisplayed: cls._connInfoDisplayed = True # updating CLASS variable - logger.info("Initiating TAOS native connection to {}, using config at {}".format(hostAddr, cfgPath)) + Logging.info("Initiating TAOS native connection to {}, using config at {}".format(hostAddr, cfgPath)) # Make the connection # self._conn = taos.connect(host=hostAddr, config=cfgPath) # TODO: make configurable # self._cursor = self._conn.cursor() @@ -1052,16 +916,16 @@ class DbConnNative(DbConn): with cls._lock: cls.totalConnections -= 1 - logger.debug("[DB] Database connection closed") + Logging.debug("[DB] Database connection closed") self.isOpen = False def execute(self, sql): if (not self.isOpen): raise RuntimeError("Cannot execute database commands until connection is open") - logger.debug("[SQL] Executing SQL: {}".format(sql)) + Logging.debug("[SQL] Executing SQL: {}".format(sql)) self._lastSql = sql nRows = self._tdSql.execute(sql) - logger.debug( + Logging.debug( "[SQL] Execution Result, nRows = {}, SQL = {}".format( nRows, sql)) return nRows @@ -1070,10 +934,10 @@ class DbConnNative(DbConn): if (not self.isOpen): raise RuntimeError( "Cannot query database until connection is open") - logger.debug("[SQL] Executing SQL: {}".format(sql)) + Logging.debug("[SQL] Executing SQL: {}".format(sql)) self._lastSql = sql nRows = self._tdSql.query(sql) - logger.debug( + Logging.debug( "[SQL] Query Result, nRows = {}, SQL = {}".format( nRows, sql)) return nRows @@ -1337,7 +1201,7 @@ class StateMechine: def init(self, dbc: DbConn): # late initailization, don't save the dbConn self._curState = self._findCurrentState(dbc) # starting state - logger.debug("Found Starting State: {}".format(self._curState)) + Logging.debug("Found Starting State: {}".format(self._curState)) # TODO: seems no lnoger used, remove? 
def getCurrentState(self): @@ -1375,7 +1239,7 @@ class StateMechine: raise RuntimeError( "No suitable task types found for state: {}".format( self._curState)) - logger.debug( + Logging.debug( "[OPS] Tasks found for state {}: {}".format( self._curState, typesToStrings(taskTypes))) @@ -1385,27 +1249,27 @@ class StateMechine: ts = time.time() # we use this to debug how fast/slow it is to do the various queries to find the current DB state dbName =self._db.getName() if not dbc.existsDatabase(dbName): # dbc.hasDatabases(): # no database?! - logger.debug( "[STT] empty database found, between {} and {}".format(ts, time.time())) + Logging.debug( "[STT] empty database found, between {} and {}".format(ts, time.time())) return StateEmpty() # did not do this when openning connection, and this is NOT the worker # thread, which does this on their own dbc.use(dbName) if not dbc.hasTables(): # no tables - logger.debug("[STT] DB_ONLY found, between {} and {}".format(ts, time.time())) + Logging.debug("[STT] DB_ONLY found, between {} and {}".format(ts, time.time())) return StateDbOnly() sTable = self._db.getFixedSuperTable() if sTable.hasRegTables(dbc, dbName): # no regular tables - logger.debug("[STT] SUPER_TABLE_ONLY found, between {} and {}".format(ts, time.time())) + Logging.debug("[STT] SUPER_TABLE_ONLY found, between {} and {}".format(ts, time.time())) return StateSuperTableOnly() else: # has actual tables - logger.debug("[STT] HAS_DATA found, between {} and {}".format(ts, time.time())) + Logging.debug("[STT] HAS_DATA found, between {} and {}".format(ts, time.time())) return StateHasData() # We transition the system to a new state by examining the current state itself def transition(self, tasks, dbc: DbConn): if (len(tasks) == 0): # before 1st step, or otherwise empty - logger.debug("[STT] Starting State: {}".format(self._curState)) + Logging.debug("[STT] Starting State: {}".format(self._curState)) return # do nothing # this should show up in the server log, separating steps @@ -1441,7 +1305,7 @@ class StateMechine: # Nothing for sure newState = self._findCurrentState(dbc) - logger.debug("[STT] New DB state determined: {}".format(newState)) + Logging.debug("[STT] New DB state determined: {}".format(newState)) # can old state move to new state through the tasks? 
self._curState.verifyTasksToState(tasks, newState) self._curState = newState @@ -1459,7 +1323,7 @@ class StateMechine: # read data task, default to 10: TODO: change to a constant weights.append(10) i = self._weighted_choice_sub(weights) - # logger.debug(" (weighted random:{}/{}) ".format(i, len(taskTypes))) + # Logging.debug(" (weighted random:{}/{}) ".format(i, len(taskTypes))) return taskTypes[i] # ref: @@ -1538,7 +1402,7 @@ class Database: t3 = datetime.datetime(2012, 1, 1) # default "keep" is 10 years t4 = datetime.datetime.fromtimestamp( t3.timestamp() + elSec2) # see explanation above - logger.info("Setting up TICKS to start from: {}".format(t4)) + Logging.info("Setting up TICKS to start from: {}".format(t4)) return t4 @classmethod @@ -1689,10 +1553,10 @@ class TaskExecutor(): self._boundedList.add(n) # def logInfo(self, msg): - # logger.info(" T[{}.x]: ".format(self._curStep) + msg) + # Logging.info(" T[{}.x]: ".format(self._curStep) + msg) # def logDebug(self, msg): - # logger.debug(" T[{}.x]: ".format(self._curStep) + msg) + # Logging.debug(" T[{}.x]: ".format(self._curStep) + msg) class Task(): @@ -1705,7 +1569,7 @@ class Task(): @classmethod def allocTaskNum(cls): Task.taskSn += 1 # IMPORTANT: cannot use cls.taskSn, since each sub class will have a copy - # logger.debug("Allocating taskSN: {}".format(Task.taskSn)) + # Logging.debug("Allocating taskSN: {}".format(Task.taskSn)) return Task.taskSn def __init__(self, execStats: ExecutionStats, db: Database): @@ -1717,7 +1581,7 @@ class Task(): # Assign an incremental task serial number self._taskNum = self.allocTaskNum() - # logger.debug("Creating new task {}...".format(self._taskNum)) + # Logging.debug("Creating new task {}...".format(self._taskNum)) self._execStats = execStats self._db = db # A task is always associated/for a specific DB @@ -1781,7 +1645,7 @@ class Task(): elif msg.find("duplicated column names") != -1: # also alter table tag issues return True elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ... - logger.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg)) + Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg)) return True return False # Not an acceptable error @@ -1922,13 +1786,13 @@ class ExecutionStats: self._failureReason = reason def printStats(self): - logger.info( + Logging.info( "----------------------------------------------------------------------") - logger.info( + Logging.info( "| Crash_Gen test {}, with the following stats:". 
format( "FAILED (reason: {})".format( self._failureReason) if self._failed else "SUCCEEDED")) - logger.info("| Task Execution Times (success/total):") + Logging.info("| Task Execution Times (success/total):") execTimesAny = 0.001 # avoid div by zero for k, n in self._execTimes.items(): execTimesAny += n[0] @@ -1939,28 +1803,28 @@ class ExecutionStats: errStrs = ["0x{:X}:{}".format(eno, n) for (eno, n) in errors.items()] # print("error strings = {}".format(errStrs)) errStr = ", ".join(errStrs) - logger.info("| {0:<24}: {1}/{2} (Errors: {3})".format(k, n[1], n[0], errStr)) + Logging.info("| {0:<24}: {1}/{2} (Errors: {3})".format(k, n[1], n[0], errStr)) - logger.info( + Logging.info( "| Total Tasks Executed (success or not): {} ".format(execTimesAny)) - logger.info( + Logging.info( "| Total Tasks In Progress at End: {}".format( self._tasksInProgress)) - logger.info( + Logging.info( "| Total Task Busy Time (elapsed time when any task is in progress): {:.3f} seconds".format( self._accRunTime)) - logger.info( + Logging.info( "| Average Per-Task Execution Time: {:.3f} seconds".format(self._accRunTime / execTimesAny)) - logger.info( + Logging.info( "| Total Elapsed Time (from wall clock): {:.3f} seconds".format( self._elapsedTime)) - logger.info("| Top numbers written: {}".format(TaskExecutor.getBoundedList())) - logger.info("| Active DB Native Connections (now): {}".format(DbConnNative.totalConnections)) - logger.info("| Longest native query time: {:.3f} seconds, started: {}". + Logging.info("| Top numbers written: {}".format(TaskExecutor.getBoundedList())) + Logging.info("| Active DB Native Connections (now): {}".format(DbConnNative.totalConnections)) + Logging.info("| Longest native query time: {:.3f} seconds, started: {}". format(MyTDSql.longestQueryTime, time.strftime("%x %X", time.localtime(MyTDSql.lqStartTime))) ) - logger.info("| Longest native query: {}".format(MyTDSql.longestQuery)) - logger.info( + Logging.info("| Longest native query: {}".format(MyTDSql.longestQuery)) + Logging.info( "----------------------------------------------------------------------") @@ -2030,7 +1894,7 @@ class TaskDropDb(StateTransitionTask): def _executeInternal(self, te: TaskExecutor, wt: WorkerThread): self.execWtSql(wt, "drop database {}".format(self._db.getName())) - logger.debug("[OPS] database dropped at {}".format(time.time())) + Logging.debug("[OPS] database dropped at {}".format(time.time())) class TaskCreateSuperTable(StateTransitionTask): @classmethod @@ -2043,7 +1907,7 @@ class TaskCreateSuperTable(StateTransitionTask): def _executeInternal(self, te: TaskExecutor, wt: WorkerThread): if not self._db.exists(wt.getDbConn()): - logger.debug("Skipping task, no DB yet") + Logging.debug("Skipping task, no DB yet") return sTable = self._db.getFixedSuperTable() # type: TdSuperTable @@ -2078,7 +1942,7 @@ class TdSuperTable: dbc.query("select TBNAME from {}.{}".format(dbName, self._stName)) # TODO: analyze result set later except taos.error.ProgrammingError as err: errno2 = Helper.convertErrno(err.errno) - logger.debug("[=] Failed to get tables from super table: errno=0x{:X}, msg: {}".format(errno2, err)) + Logging.debug("[=] Failed to get tables from super table: errno=0x{:X}, msg: {}".format(errno2, err)) raise qr = dbc.getQueryResult() @@ -2193,7 +2057,7 @@ class TaskReadData(StateTransitionTask): dbc.execute("select {} from {}.{}".format(aggExpr, dbName, sTable.getName())) except taos.error.ProgrammingError as err: errno2 = Helper.convertErrno(err.errno) - logger.debug("[=] Read Failure: errno=0x{:X}, msg: 
{}, SQL: {}".format(errno2, err, dbc.getLastSql())) + Logging.debug("[=] Read Failure: errno=0x{:X}, msg: {}, SQL: {}".format(errno2, err, dbc.getLastSql())) raise class TaskDropSuperTable(StateTransitionTask): @@ -2224,7 +2088,7 @@ class TaskDropSuperTable(StateTransitionTask): errno2 = Helper.convertErrno(err.errno) if (errno2 in [0x362]): # mnode invalid table name isSuccess = False - logger.debug("[DB] Acceptable error when dropping a table") + Logging.debug("[DB] Acceptable error when dropping a table") continue # try to delete next regular table if (not tickOutput): @@ -2304,20 +2168,19 @@ class TaskAddData(StateTransitionTask): # Track which table is being actively worked on activeTable: Set[int] = set() - # We use these two files to record operations to DB, useful for power-off - # tests - fAddLogReady = None - fAddLogDone = None + # We use these two files to record operations to DB, useful for power-off tests + fAddLogReady = None # type: TextIOWrapper + fAddLogDone = None # type: TextIOWrapper @classmethod def prepToRecordOps(cls): if gConfig.record_ops: if (cls.fAddLogReady is None): - logger.info( + Logging.info( "Recording in a file operations to be performed...") cls.fAddLogReady = open("add_log_ready.txt", "w") if (cls.fAddLogDone is None): - logger.info("Recording in a file operations completed...") + Logging.info("Recording in a file operations completed...") cls.fAddLogDone = open("add_log_done.txt", "w") @classmethod @@ -2393,553 +2256,8 @@ class TaskAddData(StateTransitionTask): self.activeTable.discard(i) # not raising an error, unlike remove -# Deterministic random number generator -class Dice(): - seeded = False # static, uninitialized - @classmethod - def seed(cls, s): # static - if (cls.seeded): - raise RuntimeError( - "Cannot seed the random generator more than once") - cls.verifyRNG() - random.seed(s) - cls.seeded = True # TODO: protect against multi-threading - @classmethod - def verifyRNG(cls): # Verify that the RNG is determinstic - random.seed(0) - x1 = random.randrange(0, 1000) - x2 = random.randrange(0, 1000) - x3 = random.randrange(0, 1000) - if (x1 != 864 or x2 != 394 or x3 != 776): - raise RuntimeError("System RNG is not deterministic") - - @classmethod - def throw(cls, stop): # get 0 to stop-1 - return cls.throwRange(0, stop) - - @classmethod - def throwRange(cls, start, stop): # up to stop-1 - if (not cls.seeded): - raise RuntimeError("Cannot throw dice before seeding it") - return random.randrange(start, stop) - - @classmethod - def choice(cls, cList): - return random.choice(cList) - - -class LoggingFilter(logging.Filter): - def filter(self, record: logging.LogRecord): - if (record.levelno >= logging.INFO): - return True # info or above always log - - # Commenting out below to adjust... - - # if msg.startswith("[TRD]"): - # return False - return True - - -class MyLoggingAdapter(logging.LoggerAdapter): - def process(self, msg, kwargs): - return "[{}]{}".format(threading.get_ident() % 10000, msg), kwargs - # return '[%s] %s' % (self.extra['connid'], msg), kwargs - - -class ServiceManager: - PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process - - def __init__(self, numDnodes = 1): - logger.info("TDengine Service Manager (TSM) created") - self._numDnodes = numDnodes # >1 means we have a cluster - # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec - # signal.signal(signal.SIGINT, self.sigIntHandler) - # signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler! 
- - self.inSigHandler = False - # self._status = MainExec.STATUS_RUNNING # set inside - # _startTaosService() - self.svcMgrThreads = [] # type: List[ServiceManagerThread] - for i in range(0, numDnodes): - self.svcMgrThreads.append(ServiceManagerThread(i)) - - self._lock = threading.Lock() - # self._isRestarting = False - - def _doMenu(self): - choice = "" - while True: - print("\nInterrupting Service Program, Choose an Action: ") - print("1: Resume") - print("2: Terminate") - print("3: Restart") - # Remember to update the if range below - # print("Enter Choice: ", end="", flush=True) - while choice == "": - choice = input("Enter Choice: ") - if choice != "": - break # done with reading repeated input - if choice in ["1", "2", "3"]: - break # we are done with whole method - print("Invalid choice, please try again.") - choice = "" # reset - return choice - - def sigUsrHandler(self, signalNumber, frame): - print("Interrupting main thread execution upon SIGUSR1") - if self.inSigHandler: # already - print("Ignoring repeated SIG...") - return # do nothing if it's already not running - self.inSigHandler = True - - choice = self._doMenu() - if choice == "1": - self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue? - elif choice == "2": - self.stopTaosServices() - elif choice == "3": # Restart - self.restart() - else: - raise RuntimeError("Invalid menu choice: {}".format(choice)) - - self.inSigHandler = False - - def sigIntHandler(self, signalNumber, frame): - print("ServiceManager: INT Signal Handler starting...") - if self.inSigHandler: - print("Ignoring repeated SIG_INT...") - return - self.inSigHandler = True - - self.stopTaosServices() - print("ServiceManager: INT Signal Handler returning...") - self.inSigHandler = False - - def sigHandlerResume(self): - print("Resuming TDengine service manager (main thread)...\n\n") - - # def _updateThreadStatus(self): - # if self.svcMgrThread: # valid svc mgr thread - # if self.svcMgrThread.isStopped(): # done? - # self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? - # self.svcMgrThread = None # no more - - def isActive(self): - """ - Determine if the service/cluster is active at all, i.e. at least - one thread is not "stopped". - """ - for thread in self.svcMgrThreads: - if not thread.isStopped(): - return True - return False - - # def isRestarting(self): - # """ - # Determine if the service/cluster is being "restarted", i.e., at least - # one thread is in "restarting" status - # """ - # for thread in self.svcMgrThreads: - # if thread.isRestarting(): - # return True - # return False - - def isStable(self): - """ - Determine if the service/cluster is "stable", i.e. all of the - threads are in "stable" status. - """ - for thread in self.svcMgrThreads: - if not thread.isStable(): - return False - return True - - def _procIpcAll(self): - while self.isActive(): - for thread in self.svcMgrThreads: # all thread objects should always be valid - # while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here - if thread.isRunning(): - thread.procIpcBatch() # regular processing, - if thread.isStopped(): - thread.procIpcBatch() # one last time? 
- # self._updateThreadStatus() - elif thread.isRetarting(): - print("Service restarting...") - # else this thread is stopped - - time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round - # raise CrashGenError("dummy") - print("Service Manager Thread (with subprocess) ended, main thread exiting...") - - def startTaosServices(self): - with self._lock: - if self.isActive(): - raise RuntimeError("Cannot start TAOS service(s) when one/some may already be running") - - # Find if there's already a taosd service, and then kill it - for proc in psutil.process_iter(): - if proc.name() == 'taosd': - print("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupe") - time.sleep(2.0) - proc.kill() - # print("Process: {}".format(proc.name())) - - # self.svcMgrThread = ServiceManagerThread() # create the object - for thread in self.svcMgrThreads: - thread.start() - thread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines - - def stopTaosServices(self): - with self._lock: - if not self.isActive(): - logger.warning("Cannot stop TAOS service(s), already not active") - return - - for thread in self.svcMgrThreads: - thread.stop() - - def run(self): - self.startTaosServices() - self._procIpcAll() # pump/process all the messages, may encounter SIG + restart - if self.isActive(): # if sig handler hasn't destroyed it by now - self.stopTaosServices() # should have started already - - def restart(self): - if not self.isStable(): - logger.warning("Cannot restart service/cluster, when not stable") - return - - # self._isRestarting = True - if self.isActive(): - self.stopTaosServices() - else: - logger.warning("Service not active when restart requested") - - self.startTaosService() - # self._isRestarting = False - - # def isRunning(self): - # return self.svcMgrThread != None - - # def isRestarting(self): - # return self._isRestarting - -class ServiceManagerThread: - """ - A class representing a dedicated thread which manages the "sub process" - of the TDengine service, interacting with its STDOUT/ERR. - - It takes a TdeInstance parameter at creation time, or create a default - """ - MAX_QUEUE_SIZE = 10000 - - def __init__(self, tInstNum = 0, tInst : TdeInstance = None): - # Set the sub process - self._tdeSubProcess = None # type: TdeSubProcess - - # Arrange the TDengine instance - self._tInstNum = tInstNum # instance serial number in cluster, ZERO based - self._tInst = tInst or TdeInstance() # Need an instance - - self._thread = None # The actual thread, # type: threading.Thread - self._status = MainExec.STATUS_STOPPED # The status of the underlying service, actually. 
- - def __repr__(self): - return "[SvcMgrThread: tInstNum={}]".format(self._tInstNum) - - def getStatus(self): - return self._status - - def isStarting(self): - return self._status == MainExec.STATUS_STARTING - - def isRunning(self): - # return self._thread and self._thread.is_alive() - return self._status == MainExec.STATUS_RUNNING - - def isStopping(self): - return self._status == MainExec.STATUS_STOPPING - - def isStopped(self): - return self._status == MainExec.STATUS_STOPPED - - def isStable(self): - return self.isRunning() or self.isStopped() - - # Start the thread (with sub process), and wait for the sub service - # to become fully operational - def start(self): - if self._thread: - raise RuntimeError("Unexpected _thread") - if self._tdeSubProcess: - raise RuntimeError("TDengine sub process already created/running") - - logger.info("Attempting to start TAOS service: {}".format(self)) - - self._status = MainExec.STATUS_STARTING - self._tdeSubProcess = TdeSubProcess(self._tInst) - self._tdeSubProcess.start() - - self._ipcQueue = Queue() - self._thread = threading.Thread( # First thread captures server OUTPUT - target=self.svcOutputReader, - args=(self._tdeSubProcess.getStdOut(), self._ipcQueue)) - self._thread.daemon = True # thread dies with the program - self._thread.start() - - self._thread2 = threading.Thread( # 2nd thread captures server ERRORs - target=self.svcErrorReader, - args=(self._tdeSubProcess.getStdErr(), self._ipcQueue)) - self._thread2.daemon = True # thread dies with the program - self._thread2.start() - - # wait for service to start - for i in range(0, 100): - time.sleep(1.0) - # self.procIpcBatch() # don't pump message during start up - print("_zz_", end="", flush=True) - if self._status == MainExec.STATUS_RUNNING: - logger.info("[] TDengine service READY to process requests") - logger.info("[] TAOS service started: {}".format(self)) - return # now we've started - # TODO: handle failure-to-start better? - self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output - raise RuntimeError("TDengine service did not start successfully: {}".format(self)) - - def stop(self): - # can be called from both main thread or signal handler - print("Terminating TDengine service running as the sub process...") - if self.isStopped(): - print("Service already stopped") - return - if self.isStopping(): - print("Service is already being stopped") - return - # Linux will send Control-C generated SIGINT to the TDengine process - # already, ref: - # https://unix.stackexchange.com/questions/176235/fork-and-how-signals-are-delivered-to-processes - if not self._tdeSubProcess: - raise RuntimeError("sub process object missing") - - self._status = MainExec.STATUS_STOPPING - retCode = self._tdeSubProcess.stop() - print("Attempted to stop sub process, got return code: {}".format(retCode)) - if (retCode==-11): # SGV - logger.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)") - - if self._tdeSubProcess.isRunning(): # still running - print("FAILED to stop sub process, it is still running... pid = {}".format( - self._tdeSubProcess.getPid())) - else: - self._tdeSubProcess = None # not running any more - self.join() # stop the thread, change the status, etc. 
- - # Check if it's really stopped - outputLines = 20 # for last output - if self.isStopped(): - self.procIpcBatch(outputLines) # one last time - print("End of TDengine Service Output: {}".format(self)) - print("----- TDengine Service (managed by SMT) is now terminated -----\n") - else: - print("WARNING: SMT did not terminate as expected: {}".format(self)) - - def join(self): - # TODO: sanity check - if not self.isStopping(): - raise RuntimeError( - "Unexpected status when ending svc mgr thread: {}".format( - self._status)) - - if self._thread: - self._thread.join() - self._thread = None - self._status = MainExec.STATUS_STOPPED - # STD ERR thread - self._thread2.join() - self._thread2 = None - else: - print("Joining empty thread, doing nothing") - - def _trimQueue(self, targetSize): - if targetSize <= 0: - return # do nothing - q = self._ipcQueue - if (q.qsize() <= targetSize): # no need to trim - return - - logger.debug("Triming IPC queue to target size: {}".format(targetSize)) - itemsToTrim = q.qsize() - targetSize - for i in range(0, itemsToTrim): - try: - q.get_nowait() - except Empty: - break # break out of for loop, no more trimming - - TD_READY_MSG = "TDengine is initialized successfully" - - def procIpcBatch(self, trimToTarget=0, forceOutput=False): - self._trimQueue(trimToTarget) # trim if necessary - # Process all the output generated by the underlying sub process, - # managed by IO thread - print("<", end="", flush=True) - while True: - try: - line = self._ipcQueue.get_nowait() # getting output at fast speed - self._printProgress("_o") - except Empty: - # time.sleep(2.3) # wait only if there's no output - # no more output - print(".>", end="", flush=True) - return # we are done with THIS BATCH - else: # got line, printing out - if forceOutput: - logger.info(line) - else: - logger.debug(line) - print(">", end="", flush=True) - - _ProgressBars = ["--", "//", "||", "\\\\"] - - def _printProgress(self, msg): # TODO: assuming 2 chars - print(msg, end="", flush=True) - pBar = self._ProgressBars[Dice.throw(4)] - print(pBar, end="", flush=True) - print('\b\b\b\b', end="", flush=True) - - def svcOutputReader(self, out: IO, queue): - # Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python - # print("This is the svcOutput Reader...") - # for line in out : - for line in iter(out.readline, b''): - # print("Finished reading a line: {}".format(line)) - # print("Adding item to queue...") - try: - line = line.decode("utf-8").rstrip() - except UnicodeError: - print("\nNon-UTF8 server output: {}\n".format(line)) - - # This might block, and then causing "out" buffer to block - queue.put(line) - self._printProgress("_i") - - if self._status == MainExec.STATUS_STARTING: # we are starting, let's see if we have started - if line.find(self.TD_READY_MSG) != -1: # found - logger.info("Waiting for the service to become FULLY READY") - time.sleep(1.0) # wait for the server to truly start. 
TODO: remove this - logger.info("Service instance #{} is now FULLY READY".format(self._tInstNum)) - self._status = MainExec.STATUS_RUNNING - - # Trim the queue if necessary: TODO: try this 1 out of 10 times - self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10) # trim to 90% size - - if self.isStopping(): # TODO: use thread status instead - # WAITING for stopping sub process to finish its outptu - print("_w", end="", flush=True) - - # queue.put(line) - # meaning sub process must have died - print("\nNo more output from IO thread managing TDengine service") - out.close() - - def svcErrorReader(self, err: IO, queue): - for line in iter(err.readline, b''): - print("\nTDengine Service (taosd) ERROR (from stderr): {}".format(line)) - - -class TdeSubProcess: - """ - A class to to represent the actual sub process that is the run-time - of a TDengine instance. - - It takes a TdeInstance object as its parameter, with the rationale being - "a sub process runs an instance". - """ - - def __init__(self, tInst : TdeInstance): - self.subProcess = None - if tInst is None: - raise CrashGenError("Empty instance not allowed in TdeSubProcess") - self._tInst = tInst # Default create at ServiceManagerThread - - def getStdOut(self): - return self.subProcess.stdout - - def getStdErr(self): - return self.subProcess.stderr - - def isRunning(self): - return self.subProcess is not None - - def getPid(self): - return self.subProcess.pid - - # Repalced by TdeInstance class - # def getBuildPath(self): - # selfPath = os.path.dirname(os.path.realpath(__file__)) - # if ("community" in selfPath): - # projPath = selfPath[:selfPath.find("communit")] - # else: - # projPath = selfPath[:selfPath.find("tests")] - - # for root, dirs, files in os.walk(projPath): - # if ("taosd" in files): - # rootRealPath = os.path.dirname(os.path.realpath(root)) - # if ("packaging" not in rootRealPath): - # buildPath = root[:len(root) - len("/build/bin")] - # break - # return buildPath - - def start(self): - ON_POSIX = 'posix' in sys.builtin_module_names - - # Sanity check - if self.subProcess: # already there - raise RuntimeError("Corrupt process state") - - # global gContainer - # tInst = gContainer.defTdeInstance = TdeInstance('test3') # creae the instance - self._tInst.generateCfgFile() # service side generates config file, client does not - - self._tInst.rotateLogs() - - print("Starting TDengine instance: {}".format(self._tInst)) - self.subProcess = subprocess.Popen( - self._tInst.getServiceCommand(), - shell=False, - # svcCmdSingle, shell=True, # capture core dump? 
- stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - # bufsize=1, # not supported in binary mode - close_fds=ON_POSIX - ) # had text=True, which interferred with reading EOF - - def stop(self): - if not self.subProcess: - print("Sub process already stopped") - return -1 - - retCode = self.subProcess.poll() # contains real sub process return code - if retCode: # valid return code, process ended - self.subProcess = None - else: # process still alive, let's interrupt it - print( - "Sub process is running, sending SIG_INT and waiting for it to terminate...") - # sub process should end, then IPC queue should end, causing IO - # thread to end - self.subProcess.send_signal(signal.SIGINT) - try: - self.subProcess.wait(10) - retCode = self.subProcess.returncode - except subprocess.TimeoutExpired as err: - print("Time out waiting for TDengine service process to exit") - retCode = -3 - else: - print("TDengine service process terminated successfully from SIG_INT") - retCode = -4 - self.subProcess = None - return retCode class ThreadStacks: # stack info for all threads def __init__(self): @@ -2976,17 +2294,17 @@ class ClientManager: # signal.signal(signal.SIGTERM, self.sigIntHandler) # signal.signal(signal.SIGINT, self.sigIntHandler) - self._status = MainExec.STATUS_RUNNING + self._status = Status.STATUS_RUNNING self.tc = None self.inSigHandler = False def sigIntHandler(self, signalNumber, frame): - if self._status != MainExec.STATUS_RUNNING: + if self._status != Status.STATUS_RUNNING: print("Repeated SIGINT received, forced exit...") # return # do nothing if it's already not running sys.exit(-1) - self._status = MainExec.STATUS_STOPPING # immediately set our status + self._status = Status.STATUS_STOPPING # immediately set our status print("ClientManager: Terminating program...") self.tc.requestToStop() @@ -3110,11 +2428,6 @@ class ClientManager: self.tc.printStats() class MainExec: - STATUS_STARTING = 1 - STATUS_RUNNING = 2 - STATUS_STOPPING = 3 - STATUS_STOPPED = 4 - def __init__(self): self._clientMgr = None self._svcMgr = None @@ -3147,7 +2460,7 @@ class MainExec: try: ret = self._clientMgr.run(self._svcMgr) # stop TAOS service inside except requests.exceptions.ConnectionError as err: - logger.warning("Failed to open REST connection to DB: {}".format(err.getMessage())) + Logging.warning("Failed to open REST connection to DB: {}".format(err.getMessage())) # don't raise return ret @@ -3255,20 +2568,7 @@ class MainExec: global gConfig gConfig = parser.parse_args() - # Logging Stuff - global logger - _logger = logging.getLogger('CrashGen') # real logger - _logger.addFilter(LoggingFilter()) - ch = logging.StreamHandler() - _logger.addHandler(ch) - - # Logging adapter, to be used as a logger - logger = MyLoggingAdapter(_logger, []) - - if (gConfig.debug): - logger.setLevel(logging.DEBUG) # default seems to be INFO - else: - logger.setLevel(logging.INFO) + Logging.clsInit(gConfig) Dice.seed(0) # initial seeding of dice diff --git a/tests/pytest/crash_gen/misc.py b/tests/pytest/crash_gen/misc.py new file mode 100644 index 0000000000..08e50e5070 --- /dev/null +++ b/tests/pytest/crash_gen/misc.py @@ -0,0 +1,133 @@ +import threading +import random +import logging + + +class CrashGenError(Exception): + def __init__(self, msg=None, errno=None): + self.msg = msg + self.errno = errno + + def __str__(self): + return self.msg + + +class LoggingFilter(logging.Filter): + def filter(self, record: logging.LogRecord): + if (record.levelno >= logging.INFO): + return True # info or above always log + + # Commenting out 
below to adjust... + + # if msg.startswith("[TRD]"): + # return False + return True + + +class MyLoggingAdapter(logging.LoggerAdapter): + def process(self, msg, kwargs): + return "[{}]{}".format(threading.get_ident() % 10000, msg), kwargs + # return '[%s] %s' % (self.extra['connid'], msg), kwargs + + +class Logging: + logger = None + + @classmethod + def getLogger(cls): + return logger + + @classmethod + def clsInit(cls, gConfig): # TODO: refactor away gConfig + if cls.logger: + return + + # Logging Stuff + # global misc.logger + _logger = logging.getLogger('CrashGen') # real logger + _logger.addFilter(LoggingFilter()) + ch = logging.StreamHandler() + _logger.addHandler(ch) + + # Logging adapter, to be used as a logger + print("setting logger variable") + # global logger + cls.logger = MyLoggingAdapter(_logger, []) + + if (gConfig.debug): + cls.logger.setLevel(logging.DEBUG) # default seems to be INFO + else: + cls.logger.setLevel(logging.INFO) + + @classmethod + def info(cls, msg): + cls.logger.info(msg) + + @classmethod + def debug(cls, msg): + cls.logger.debug(msg) + + @classmethod + def warning(cls, msg): + cls.logger.warning(msg) + +class Status: + STATUS_STARTING = 1 + STATUS_RUNNING = 2 + STATUS_STOPPING = 3 + STATUS_STOPPED = 4 + +# Deterministic random number generator +class Dice(): + seeded = False # static, uninitialized + + @classmethod + def seed(cls, s): # static + if (cls.seeded): + raise RuntimeError( + "Cannot seed the random generator more than once") + cls.verifyRNG() + random.seed(s) + cls.seeded = True # TODO: protect against multi-threading + + @classmethod + def verifyRNG(cls): # Verify that the RNG is determinstic + random.seed(0) + x1 = random.randrange(0, 1000) + x2 = random.randrange(0, 1000) + x3 = random.randrange(0, 1000) + if (x1 != 864 or x2 != 394 or x3 != 776): + raise RuntimeError("System RNG is not deterministic") + + @classmethod + def throw(cls, stop): # get 0 to stop-1 + return cls.throwRange(0, stop) + + @classmethod + def throwRange(cls, start, stop): # up to stop-1 + if (not cls.seeded): + raise RuntimeError("Cannot throw dice before seeding it") + return random.randrange(start, stop) + + @classmethod + def choice(cls, cList): + return random.choice(cList) + +class Helper: + @classmethod + def convertErrno(cls, errno): + return errno if (errno > 0) else 0x80000000 + errno + +class Progress: + STEP_BOUNDARY = 0 + BEGIN_THREAD_STEP = 1 + END_THREAD_STEP = 2 + tokens = { + STEP_BOUNDARY: '.', + BEGIN_THREAD_STEP: '[', + END_THREAD_STEP: '] ' + } + + @classmethod + def emit(cls, token): + print(cls.tokens[token], end="", flush=True) diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py new file mode 100644 index 0000000000..cdb12303a2 --- /dev/null +++ b/tests/pytest/crash_gen/service_manager.py @@ -0,0 +1,633 @@ +import os +import io +import sys +import threading +import signal +import logging +import time +import subprocess + +from typing import IO + +try: + import psutil +except: + print("Psutil module needed, please install: sudo pip3 install psutil") + sys.exit(-1) + +from queue import Queue, Empty +from .misc import Logging, Status, CrashGenError, Dice + +class TdeInstance(): + """ + A class to capture the *static* information of a TDengine instance, + including the location of the various files/directories, and basica + configuration. 
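+
+    Directory layout implied by the getters below (illustrative sketch; 'test'
+    is merely the default sub-directory name):
+
+        <buildDir>/test/             # getRunDir()
+            cfg/taos.cfg             # getCfgDir(), written by generateCfgFile()
+            log/                     # getLogDir(), renamed aside by rotateLogs()
+            data/                    # dataDir set in the generated taos.cfg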
+ """ + + @classmethod + def _getBuildPath(cls): + selfPath = os.path.dirname(os.path.realpath(__file__)) + if ("community" in selfPath): + projPath = selfPath[:selfPath.find("communit")] + else: + projPath = selfPath[:selfPath.find("tests")] + + buildPath = None + for root, dirs, files in os.walk(projPath): + if ("taosd" in files): + rootRealPath = os.path.dirname(os.path.realpath(root)) + if ("packaging" not in rootRealPath): + buildPath = root[:len(root) - len("/build/bin")] + break + if buildPath == None: + raise RuntimeError("Failed to determine buildPath, selfPath={}, projPath={}" + .format(selfPath, projPath)) + return buildPath + + def __init__(self, subdir='test'): + self._buildDir = self._getBuildPath() + self._subdir = '/' + subdir # TODO: tolerate "/" + + def __repr__(self): + return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) + + def generateCfgFile(self): + # print("Logger = {}".format(logger)) + # buildPath = self.getBuildPath() + # taosdPath = self._buildPath + "/build/bin/taosd" + + cfgDir = self.getCfgDir() + cfgFile = cfgDir + "/taos.cfg" # TODO: inquire if this is fixed + if os.path.exists(cfgFile): + if os.path.isfile(cfgFile): + Logging.warning("Config file exists already, skip creation: {}".format(cfgFile)) + return # cfg file already exists, nothing to do + else: + raise CrashGenError("Invalid config file: {}".format(cfgFile)) + # Now that the cfg file doesn't exist + if os.path.exists(cfgDir): + if not os.path.isdir(cfgDir): + raise CrashGenError("Invalid config dir: {}".format(cfgDir)) + # else: good path + else: + os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p" + # Now we have a good cfg dir + cfgValues = { + 'runDir': self.getRunDir(), + 'ip': '127.0.0.1', # TODO: change to a network addressable ip + 'port': 6030, + } + cfgTemplate = """ +dataDir {runDir}/data +logDir {runDir}/log + +charset UTF-8 + +firstEp {ip}:{port} +fqdn {ip} +serverPort {port} + +# was all 135 below +dDebugFlag 135 +cDebugFlag 135 +rpcDebugFlag 135 +qDebugFlag 135 +# httpDebugFlag 143 +# asyncLog 0 +# tables 10 +maxtablesPerVnode 10 +rpcMaxTime 101 +# cache 2 +keep 36500 +# walLevel 2 +walLevel 1 +# +# maxConnections 100 +""" + cfgContent = cfgTemplate.format_map(cfgValues) + f = open(cfgFile, "w") + f.write(cfgContent) + f.close() + + def rotateLogs(self): + logPath = self.getLogDir() + # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397 + if os.path.exists(logPath): + logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S') + Logging.info("Saving old log files to: {}".format(logPathSaved)) + os.rename(logPath, logPathSaved) + # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms + + + def getExecFile(self): # .../taosd + return self._buildDir + "/build/bin/taosd" + + def getRunDir(self): # TODO: rename to "root dir" ?! + return self._buildDir + self._subdir + + def getCfgDir(self): # path, not file + return self.getRunDir() + "/cfg" + + def getLogDir(self): + return self.getRunDir() + "/log" + + def getHostAddr(self): + return "127.0.0.1" + + def getServiceCommand(self): # to start the instance + return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen() + + +class TdeSubProcess: + """ + A class to to represent the actual sub process that is the run-time + of a TDengine instance. + + It takes a TdeInstance object as its parameter, with the rationale being + "a sub process runs an instance". 
+ """ + + def __init__(self, tInst : TdeInstance): + self.subProcess = None + if tInst is None: + raise CrashGenError("Empty instance not allowed in TdeSubProcess") + self._tInst = tInst # Default create at ServiceManagerThread + + def getStdOut(self): + return self.subProcess.stdout + + def getStdErr(self): + return self.subProcess.stderr + + def isRunning(self): + return self.subProcess is not None + + def getPid(self): + return self.subProcess.pid + + # Repalced by TdeInstance class + # def getBuildPath(self): + # selfPath = os.path.dirname(os.path.realpath(__file__)) + # if ("community" in selfPath): + # projPath = selfPath[:selfPath.find("communit")] + # else: + # projPath = selfPath[:selfPath.find("tests")] + + # for root, dirs, files in os.walk(projPath): + # if ("taosd" in files): + # rootRealPath = os.path.dirname(os.path.realpath(root)) + # if ("packaging" not in rootRealPath): + # buildPath = root[:len(root) - len("/build/bin")] + # break + # return buildPath + + def start(self): + ON_POSIX = 'posix' in sys.builtin_module_names + + # Sanity check + if self.subProcess: # already there + raise RuntimeError("Corrupt process state") + + # global gContainer + # tInst = gContainer.defTdeInstance = TdeInstance('test3') # creae the instance + self._tInst.generateCfgFile() # service side generates config file, client does not + + self._tInst.rotateLogs() + + print("Starting TDengine instance: {}".format(self._tInst)) + self.subProcess = subprocess.Popen( + self._tInst.getServiceCommand(), + shell=False, + # svcCmdSingle, shell=True, # capture core dump? + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + # bufsize=1, # not supported in binary mode + close_fds=ON_POSIX + ) # had text=True, which interferred with reading EOF + + def stop(self): + if not self.subProcess: + print("Sub process already stopped") + return -1 + + retCode = self.subProcess.poll() # contains real sub process return code + if retCode: # valid return code, process ended + self.subProcess = None + else: # process still alive, let's interrupt it + print( + "Sub process is running, sending SIG_INT and waiting for it to terminate...") + # sub process should end, then IPC queue should end, causing IO + # thread to end + self.subProcess.send_signal(signal.SIGINT) + try: + self.subProcess.wait(10) + retCode = self.subProcess.returncode + except subprocess.TimeoutExpired as err: + print("Time out waiting for TDengine service process to exit") + retCode = -3 + else: + print("TDengine service process terminated successfully from SIG_INT") + retCode = -4 + self.subProcess = None + return retCode + + +class ServiceManager: + PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process + + def __init__(self, numDnodes = 1): + Logging.info("TDengine Service Manager (TSM) created") + self._numDnodes = numDnodes # >1 means we have a cluster + # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec + # signal.signal(signal.SIGINT, self.sigIntHandler) + # signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler! 
+ + self.inSigHandler = False + # self._status = MainExec.STATUS_RUNNING # set inside + # _startTaosService() + self.svcMgrThreads = [] # type: List[ServiceManagerThread] + for i in range(0, numDnodes): + self.svcMgrThreads.append(ServiceManagerThread(i)) + + self._lock = threading.Lock() + # self._isRestarting = False + + def _doMenu(self): + choice = "" + while True: + print("\nInterrupting Service Program, Choose an Action: ") + print("1: Resume") + print("2: Terminate") + print("3: Restart") + # Remember to update the if range below + # print("Enter Choice: ", end="", flush=True) + while choice == "": + choice = input("Enter Choice: ") + if choice != "": + break # done with reading repeated input + if choice in ["1", "2", "3"]: + break # we are done with whole method + print("Invalid choice, please try again.") + choice = "" # reset + return choice + + def sigUsrHandler(self, signalNumber, frame): + print("Interrupting main thread execution upon SIGUSR1") + if self.inSigHandler: # already + print("Ignoring repeated SIG...") + return # do nothing if it's already not running + self.inSigHandler = True + + choice = self._doMenu() + if choice == "1": + self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue? + elif choice == "2": + self.stopTaosServices() + elif choice == "3": # Restart + self.restart() + else: + raise RuntimeError("Invalid menu choice: {}".format(choice)) + + self.inSigHandler = False + + def sigIntHandler(self, signalNumber, frame): + print("ServiceManager: INT Signal Handler starting...") + if self.inSigHandler: + print("Ignoring repeated SIG_INT...") + return + self.inSigHandler = True + + self.stopTaosServices() + print("ServiceManager: INT Signal Handler returning...") + self.inSigHandler = False + + def sigHandlerResume(self): + print("Resuming TDengine service manager (main thread)...\n\n") + + # def _updateThreadStatus(self): + # if self.svcMgrThread: # valid svc mgr thread + # if self.svcMgrThread.isStopped(): # done? + # self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? + # self.svcMgrThread = None # no more + + def isActive(self): + """ + Determine if the service/cluster is active at all, i.e. at least + one thread is not "stopped". + """ + for thread in self.svcMgrThreads: + if not thread.isStopped(): + return True + return False + + # def isRestarting(self): + # """ + # Determine if the service/cluster is being "restarted", i.e., at least + # one thread is in "restarting" status + # """ + # for thread in self.svcMgrThreads: + # if thread.isRestarting(): + # return True + # return False + + def isStable(self): + """ + Determine if the service/cluster is "stable", i.e. all of the + threads are in "stable" status. + """ + for thread in self.svcMgrThreads: + if not thread.isStable(): + return False + return True + + def _procIpcAll(self): + while self.isActive(): + for thread in self.svcMgrThreads: # all thread objects should always be valid + # while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here + if thread.isRunning(): + thread.procIpcBatch() # regular processing, + if thread.isStopped(): + thread.procIpcBatch() # one last time? 
+ # self._updateThreadStatus() + elif thread.isRetarting(): + print("Service restarting...") + # else this thread is stopped + + time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round + # raise CrashGenError("dummy") + print("Service Manager Thread (with subprocess) ended, main thread exiting...") + + def startTaosServices(self): + with self._lock: + if self.isActive(): + raise RuntimeError("Cannot start TAOS service(s) when one/some may already be running") + + # Find if there's already a taosd service, and then kill it + for proc in psutil.process_iter(): + if proc.name() == 'taosd': + print("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupe") + time.sleep(2.0) + proc.kill() + # print("Process: {}".format(proc.name())) + + # self.svcMgrThread = ServiceManagerThread() # create the object + for thread in self.svcMgrThreads: + thread.start() + thread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines + + def stopTaosServices(self): + with self._lock: + if not self.isActive(): + Logging.warning("Cannot stop TAOS service(s), already not active") + return + + for thread in self.svcMgrThreads: + thread.stop() + + def run(self): + self.startTaosServices() + self._procIpcAll() # pump/process all the messages, may encounter SIG + restart + if self.isActive(): # if sig handler hasn't destroyed it by now + self.stopTaosServices() # should have started already + + def restart(self): + if not self.isStable(): + Logging.warning("Cannot restart service/cluster, when not stable") + return + + # self._isRestarting = True + if self.isActive(): + self.stopTaosServices() + else: + Logging.warning("Service not active when restart requested") + + self.startTaosService() + # self._isRestarting = False + + # def isRunning(self): + # return self.svcMgrThread != None + + # def isRestarting(self): + # return self._isRestarting + +class ServiceManagerThread: + """ + A class representing a dedicated thread which manages the "sub process" + of the TDengine service, interacting with its STDOUT/ERR. + + It takes a TdeInstance parameter at creation time, or create a default + """ + MAX_QUEUE_SIZE = 10000 + + def __init__(self, tInstNum = 0, tInst : TdeInstance = None): + # Set the sub process + self._tdeSubProcess = None # type: TdeSubProcess + + # Arrange the TDengine instance + self._tInstNum = tInstNum # instance serial number in cluster, ZERO based + self._tInst = tInst or TdeInstance() # Need an instance + + self._thread = None # The actual thread, # type: threading.Thread + self._status = Status.STATUS_STOPPED # The status of the underlying service, actually. 
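+    # Status lifecycle, as driven by the methods below:
+    #   STATUS_STOPPED --start()--> STATUS_STARTING
+    #     --svcOutputReader() sees TD_READY_MSG--> STATUS_RUNNING
+    #     --stop()--> STATUS_STOPPING --join()--> STATUS_STOPPED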
+ + def __repr__(self): + return "[SvcMgrThread: tInstNum={}]".format(self._tInstNum) + + def getStatus(self): + return self._status + + def isStarting(self): + return self._status == Status.STATUS_STARTING + + def isRunning(self): + # return self._thread and self._thread.is_alive() + return self._status == Status.STATUS_RUNNING + + def isStopping(self): + return self._status == Status.STATUS_STOPPING + + def isStopped(self): + return self._status == Status.STATUS_STOPPED + + def isStable(self): + return self.isRunning() or self.isStopped() + + # Start the thread (with sub process), and wait for the sub service + # to become fully operational + def start(self): + if self._thread: + raise RuntimeError("Unexpected _thread") + if self._tdeSubProcess: + raise RuntimeError("TDengine sub process already created/running") + + Logging.info("Attempting to start TAOS service: {}".format(self)) + + self._status = Status.STATUS_STARTING + self._tdeSubProcess = TdeSubProcess(self._tInst) + self._tdeSubProcess.start() + + self._ipcQueue = Queue() + self._thread = threading.Thread( # First thread captures server OUTPUT + target=self.svcOutputReader, + args=(self._tdeSubProcess.getStdOut(), self._ipcQueue)) + self._thread.daemon = True # thread dies with the program + self._thread.start() + + self._thread2 = threading.Thread( # 2nd thread captures server ERRORs + target=self.svcErrorReader, + args=(self._tdeSubProcess.getStdErr(), self._ipcQueue)) + self._thread2.daemon = True # thread dies with the program + self._thread2.start() + + # wait for service to start + for i in range(0, 100): + time.sleep(1.0) + # self.procIpcBatch() # don't pump message during start up + print("_zz_", end="", flush=True) + if self._status == Status.STATUS_RUNNING: + Logging.info("[] TDengine service READY to process requests") + Logging.info("[] TAOS service started: {}".format(self)) + return # now we've started + # TODO: handle failure-to-start better? + self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output + raise RuntimeError("TDengine service did not start successfully: {}".format(self)) + + def stop(self): + # can be called from both main thread or signal handler + print("Terminating TDengine service running as the sub process...") + if self.isStopped(): + print("Service already stopped") + return + if self.isStopping(): + print("Service is already being stopped") + return + # Linux will send Control-C generated SIGINT to the TDengine process + # already, ref: + # https://unix.stackexchange.com/questions/176235/fork-and-how-signals-are-delivered-to-processes + if not self._tdeSubProcess: + raise RuntimeError("sub process object missing") + + self._status = Status.STATUS_STOPPING + retCode = self._tdeSubProcess.stop() + print("Attempted to stop sub process, got return code: {}".format(retCode)) + if (retCode==-11): # SGV + Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)") + + if self._tdeSubProcess.isRunning(): # still running + print("FAILED to stop sub process, it is still running... pid = {}".format( + self._tdeSubProcess.getPid())) + else: + self._tdeSubProcess = None # not running any more + self.join() # stop the thread, change the status, etc. 
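+            # join() above also moves _status to STATUS_STOPPED, which the
+            # isStopped() check below relies on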
+ + # Check if it's really stopped + outputLines = 20 # for last output + if self.isStopped(): + self.procIpcBatch(outputLines) # one last time + print("End of TDengine Service Output: {}".format(self)) + print("----- TDengine Service (managed by SMT) is now terminated -----\n") + else: + print("WARNING: SMT did not terminate as expected: {}".format(self)) + + def join(self): + # TODO: sanity check + if not self.isStopping(): + raise RuntimeError( + "Unexpected status when ending svc mgr thread: {}".format( + self._status)) + + if self._thread: + self._thread.join() + self._thread = None + self._status = Status.STATUS_STOPPED + # STD ERR thread + self._thread2.join() + self._thread2 = None + else: + print("Joining empty thread, doing nothing") + + def _trimQueue(self, targetSize): + if targetSize <= 0: + return # do nothing + q = self._ipcQueue + if (q.qsize() <= targetSize): # no need to trim + return + + Logging.debug("Triming IPC queue to target size: {}".format(targetSize)) + itemsToTrim = q.qsize() - targetSize + for i in range(0, itemsToTrim): + try: + q.get_nowait() + except Empty: + break # break out of for loop, no more trimming + + TD_READY_MSG = "TDengine is initialized successfully" + + def procIpcBatch(self, trimToTarget=0, forceOutput=False): + self._trimQueue(trimToTarget) # trim if necessary + # Process all the output generated by the underlying sub process, + # managed by IO thread + print("<", end="", flush=True) + while True: + try: + line = self._ipcQueue.get_nowait() # getting output at fast speed + self._printProgress("_o") + except Empty: + # time.sleep(2.3) # wait only if there's no output + # no more output + print(".>", end="", flush=True) + return # we are done with THIS BATCH + else: # got line, printing out + if forceOutput: + Logging.info(line) + else: + Logging.debug(line) + print(">", end="", flush=True) + + _ProgressBars = ["--", "//", "||", "\\\\"] + + def _printProgress(self, msg): # TODO: assuming 2 chars + print(msg, end="", flush=True) + pBar = self._ProgressBars[Dice.throw(4)] + print(pBar, end="", flush=True) + print('\b\b\b\b', end="", flush=True) + + def svcOutputReader(self, out: IO, queue): + # Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python + # print("This is the svcOutput Reader...") + # for line in out : + for line in iter(out.readline, b''): + # print("Finished reading a line: {}".format(line)) + # print("Adding item to queue...") + try: + line = line.decode("utf-8").rstrip() + except UnicodeError: + print("\nNon-UTF8 server output: {}\n".format(line)) + + # This might block, and then causing "out" buffer to block + queue.put(line) + self._printProgress("_i") + + if self._status == Status.STATUS_STARTING: # we are starting, let's see if we have started + if line.find(self.TD_READY_MSG) != -1: # found + Logging.info("Waiting for the service to become FULLY READY") + time.sleep(1.0) # wait for the server to truly start. 
TODO: remove this + Logging.info("Service instance #{} is now FULLY READY".format(self._tInstNum)) + self._status = Status.STATUS_RUNNING + + # Trim the queue if necessary: TODO: try this 1 out of 10 times + self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10) # trim to 90% size + + if self.isStopping(): # TODO: use thread status instead + # WAITING for stopping sub process to finish its outptu + print("_w", end="", flush=True) + + # queue.put(line) + # meaning sub process must have died + print("\nNo more output from IO thread managing TDengine service") + out.close() + + def svcErrorReader(self, err: IO, queue): + for line in iter(err.readline, b''): + print("\nTDengine Service (taosd) ERROR (from stderr): {}".format(line)) From 44b6dd9a3f9d7b5b59e33f3ebf609edd5049f16f Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Wed, 21 Oct 2020 22:57:49 +0000 Subject: [PATCH 05/16] TD-1720 --- src/query/src/qExtbuffer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/query/src/qExtbuffer.c b/src/query/src/qExtbuffer.c index fc9c60b39b..17be294531 100644 --- a/src/query/src/qExtbuffer.c +++ b/src/query/src/qExtbuffer.c @@ -344,8 +344,6 @@ static FORCE_INLINE int32_t primaryKeyComparator(int64_t f1, int64_t f2, int32_t return 0; } - assert(colIdx == 0); - if (tsOrder == TSDB_ORDER_DESC) { // primary column desc order return (f1 < f2) ? 1 : -1; } else { // asc From 3c81c340323a3406e8e4917b505b070c2cbb2c69 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Thu, 22 Oct 2020 06:05:18 +0000 Subject: [PATCH 06/16] Enhanced crash_gen tool to run multiple instances concurrently, by using dynamic names for tables and databases --- tests/pytest/crash_gen/crash_gen.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index f369f5a3e8..8b1d79b811 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -484,8 +484,10 @@ class ThreadCoordinator: if gConfig.max_dbs == 0: self._dbs.append(Database(0, dbc)) else: + baseDbNumber = 0 if gConfig.dynamic_db_table_names else int(datetime.datetime.now( + ).timestamp()) % 888 # Don't use Dice/random, as they are deterministic for i in range(gConfig.max_dbs): - self._dbs.append(Database(i, dbc)) + self._dbs.append(Database(baseDbNumber + i, dbc)) def pickDatabase(self): idxDb = 0 @@ -1793,7 +1795,7 @@ class ExecutionStats: "FAILED (reason: {})".format( self._failureReason) if self._failed else "SUCCEEDED")) Logging.info("| Task Execution Times (success/total):") - execTimesAny = 0.001 # avoid div by zero + execTimesAny = 0.0 for k, n in self._execTimes.items(): execTimesAny += n[0] errStr = None @@ -1834,11 +1836,14 @@ class StateTransitionTask(Task): LARGE_NUMBER_OF_RECORDS = 50 SMALL_NUMBER_OF_RECORDS = 3 + _baseTableNumber = None + + _endState = None + @classmethod def getInfo(cls): # each sub class should supply their own information raise RuntimeError("Overriding method expected") - - _endState = None + @classmethod def getEndState(cls): # TODO: optimize by calling it fewer times raise RuntimeError("Overriding method expected") @@ -1858,7 +1863,9 @@ class StateTransitionTask(Task): @classmethod def getRegTableName(cls, i): - return "reg_table_{}".format(i) + if ( StateTransitionTask._baseTableNumber is None): + StateTransitionTask._baseTableNumber = 0 if gConfig.dynamic_db_table_names else Dice.throw(999) + return "reg_table_{}".format(StateTransitionTask._baseTableNumber + i) def execute(self, wt: WorkerThread): 
super().execute(wt) @@ -2477,6 +2484,9 @@ class MainExec: global gContainer gContainer = Container() # micky-mouse DI + global gSvcMgr # TODO: refactor away + gSvcMgr = None + # Super cool Python argument library: # https://docs.python.org/3/library/argparse.html parser = argparse.ArgumentParser( @@ -2530,6 +2540,12 @@ class MainExec: '--larger-data', action='store_true', help='Write larger amount of data during write operations (default: false)') + parser.add_argument( + '-n', + '--dynamic-db-table-names', + action='store_true', + help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)') + parser.add_argument( '-p', '--per-thread-db-connection', From 871b9d47ec42f59ee1d7e8618bdaae99f599bf4f Mon Sep 17 00:00:00 2001 From: Steven Li Date: Thu, 22 Oct 2020 06:19:31 +0000 Subject: [PATCH 07/16] Minor crash_gen tool tweaks --- tests/pytest/crash_gen/crash_gen.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 8b1d79b811..ee4aa5bb77 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -483,9 +483,9 @@ class ThreadCoordinator: dbc = self.getDbManager().getDbConn() if gConfig.max_dbs == 0: self._dbs.append(Database(0, dbc)) - else: - baseDbNumber = 0 if gConfig.dynamic_db_table_names else int(datetime.datetime.now( - ).timestamp()) % 888 # Don't use Dice/random, as they are deterministic + else: + baseDbNumber = int(datetime.datetime.now().timestamp( # Don't use Dice/random, as they are deterministic + )) % 888 if gConfig.dynamic_db_table_names else 0 for i in range(gConfig.max_dbs): self._dbs.append(Database(baseDbNumber + i, dbc)) @@ -1864,7 +1864,8 @@ class StateTransitionTask(Task): @classmethod def getRegTableName(cls, i): if ( StateTransitionTask._baseTableNumber is None): - StateTransitionTask._baseTableNumber = 0 if gConfig.dynamic_db_table_names else Dice.throw(999) + StateTransitionTask._baseTableNumber = Dice.throw( + 999) if gConfig.dynamic_db_table_names else 0 return "reg_table_{}".format(StateTransitionTask._baseTableNumber + i) def execute(self, wt: WorkerThread): From 7eb3d6e33ac5c7f7f1255fe8999706911663e6d6 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Thu, 22 Oct 2020 09:12:24 +0000 Subject: [PATCH 08/16] Enhanced crash_gen tool to accept/tolerate additional errors based on command line input --- tests/pytest/crash_gen/crash_gen.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index ee4aa5bb77..ccfee43ba5 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -1620,11 +1620,12 @@ class Task(): if errno in [ 0x05, # TSDB_CODE_RPC_NOT_READY 0x0B, # Unable to establish connection, more details in TD-1648 - # 0x200, # invalid SQL, TODO: re-examine with TD-934 + 0x200, # invalid SQL, TODO: re-examine with TD-934 0x217, # "db not selected", client side defined error code - 0x218, # "Table does not exist" client side defined error code - 0x360, 0x362, - 0x369, # tag already exists + # 0x218, # "Table does not exist" client side defined error code + 0x360, # Table already exists + 0x362, + # 0x369, # tag already exists 0x36A, 0x36B, 0x36D, 0x381, 0x380, # "db not selected" @@ -1637,8 +1638,13 @@ class Task(): 1000 # REST catch-all error ]: return True # These are the ALWAYS-ACCEPTABLE ones - elif (errno in [ 0x0B ]) and 
gConfig.auto_start_service: - return True # We may get "network unavilable" when restarting service + # This case handled below already. + # elif (errno in [ 0x0B ]) and gConfig.auto_start_service: + # return True # We may get "network unavilable" when restarting service + elif gConfig.ignore_errors: # something is specified on command line + moreErrnos = [int(v, 0) for v in gConfig.ignore_errors.split(',')] + if errno in moreErrnos: + return True elif errno == 0x200 : # invalid SQL, we need to div in a bit more if msg.find("invalid column name") != -1: return True @@ -2529,6 +2535,13 @@ class MainExec: '--run-tdengine', action='store_true', help='Run TDengine service in foreground (default: false)') + parser.add_argument( + '-g', + '--ignore-errors', + action='store', + default=None, + type=str, + help='Ignore error codes, comma separated, 0x supported (default: None)') parser.add_argument( '-i', '--max-replicas', @@ -2545,8 +2558,7 @@ class MainExec: '-n', '--dynamic-db-table-names', action='store_true', - help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)') - + help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)') parser.add_argument( '-p', '--per-thread-db-connection', From 69cf5cf56e9902432bc3a502bbda334dbdb2d602 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Thu, 22 Oct 2020 09:17:48 +0000 Subject: [PATCH 09/16] Minor fix to crash_gen tool, allowing simultaneous start of multiple executions --- tests/pytest/crash_gen/crash_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index ccfee43ba5..4ec8e48582 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -485,7 +485,7 @@ class ThreadCoordinator: self._dbs.append(Database(0, dbc)) else: baseDbNumber = int(datetime.datetime.now().timestamp( # Don't use Dice/random, as they are deterministic - )) % 888 if gConfig.dynamic_db_table_names else 0 + )*333) % 888 if gConfig.dynamic_db_table_names else 0 for i in range(gConfig.max_dbs): self._dbs.append(Database(baseDbNumber + i, dbc)) From f7a0b6b89b81e43e2730201e9c9bb7387bdfcd97 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Fri, 23 Oct 2020 06:57:36 +0000 Subject: [PATCH 10/16] Enhanced crash_gen tool to verify dnode being in ready status, plus additional refactoring --- tests/pytest/crash_gen/crash_gen.py | 418 +-------------------- tests/pytest/crash_gen/db.py | 426 ++++++++++++++++++++++ tests/pytest/crash_gen/service_manager.py | 62 +++- 3 files changed, 490 insertions(+), 416 deletions(-) create mode 100644 tests/pytest/crash_gen/db.py diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 4ec8e48582..2d52d274c3 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -14,22 +14,17 @@ # For type hinting before definition, ref: # https://stackoverflow.com/questions/33533148/how-do-i-specify-that-the-return-type-of-a-method-is-the-same-as-the-class-itsel from __future__ import annotations -import taos -from util.sql import * -from util.cases import * -from util.dnodes import * -from util.log import * + from typing import Set from typing import Dict from typing import List -from requests.auth import HTTPBasicAuth + import textwrap import time import datetime import random import logging import threading -import requests import copy import argparse import getopt @@ -44,6 +39,10 @@ import gc from 
.service_manager import ServiceManager, TdeInstance from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress +from .db import DbConn, MyTDSql, DbConnNative, DbManager + +import taos +import requests # Require Python 3 if sys.version_info[0] < 3: @@ -78,10 +77,11 @@ class WorkerThread: # Let us have a DB connection of our own if (gConfig.per_thread_db_connection): # type: ignore # print("connector_type = {}".format(gConfig.connector_type)) - if gConfig.connector_type == 'native': - self._dbConn = DbConn.createNative() + tInst = gContainer.defTdeInstance + if gConfig.connector_type == 'native': + self._dbConn = DbConn.createNative(tInst.getDbTarget()) elif gConfig.connector_type == 'rest': - self._dbConn = DbConn.createRest() + self._dbConn = DbConn.createRest(tInst.getDbTarget()) elif gConfig.connector_type == 'mixed': if Dice.throw(2) == 0: # 1/2 chance self._dbConn = DbConn.createNative() @@ -505,7 +505,7 @@ class ThreadCoordinator: # pick a task type for current state db = self.pickDatabase() - taskType = db.getStateMachine().pickTaskType() # type: Task + taskType = db.getStateMachine().pickTaskType() # dynamic name of class return taskType(self._execStats, db) # create a task from it def resetExecutedTasks(self): @@ -619,342 +619,6 @@ class LinearQueue(): return ret -class DbConn: - TYPE_NATIVE = "native-c" - TYPE_REST = "rest-api" - TYPE_INVALID = "invalid" - - @classmethod - def create(cls, connType): - if connType == cls.TYPE_NATIVE: - return DbConnNative() - elif connType == cls.TYPE_REST: - return DbConnRest() - else: - raise RuntimeError( - "Unexpected connection type: {}".format(connType)) - - @classmethod - def createNative(cls): - return cls.create(cls.TYPE_NATIVE) - - @classmethod - def createRest(cls): - return cls.create(cls.TYPE_REST) - - def __init__(self): - self.isOpen = False - self._type = self.TYPE_INVALID - self._lastSql = None - - def getLastSql(self): - return self._lastSql - - def open(self): - if (self.isOpen): - raise RuntimeError("Cannot re-open an existing DB connection") - - # below implemented by child classes - self.openByType() - - Logging.debug("[DB] data connection opened, type = {}".format(self._type)) - self.isOpen = True - - def close(self): - raise RuntimeError("Unexpected execution, should be overriden") - - def queryScalar(self, sql) -> int: - return self._queryAny(sql) - - def queryString(self, sql) -> str: - return self._queryAny(sql) - - def _queryAny(self, sql): # actual query result as an int - if (not self.isOpen): - raise RuntimeError("Cannot query database until connection is open") - nRows = self.query(sql) - if nRows != 1: - raise taos.error.ProgrammingError( - "Unexpected result for query: {}, rows = {}".format(sql, nRows), - (0x991 if nRows==0 else 0x992) - ) - if self.getResultRows() != 1 or self.getResultCols() != 1: - raise RuntimeError("Unexpected result set for query: {}".format(sql)) - return self.getQueryResult()[0][0] - - def use(self, dbName): - self.execute("use {}".format(dbName)) - - def existsDatabase(self, dbName: str): - ''' Check if a certain database exists ''' - self.query("show databases") - dbs = [v[0] for v in self.getQueryResult()] # ref: https://stackoverflow.com/questions/643823/python-list-transformation - # ret2 = dbName in dbs - # print("dbs = {}, str = {}, ret2={}, type2={}".format(dbs, dbName,ret2, type(dbName))) - return dbName in dbs # TODO: super weird type mangling seen, once here - - def hasTables(self): - return self.query("show tables") > 0 - - def execute(self, sql): - ''' Return the 
number of rows affected''' - raise RuntimeError("Unexpected execution, should be overriden") - - def safeExecute(self, sql): - '''Safely execute any SQL query, returning True/False upon success/failure''' - try: - self.execute(sql) - return True # ignore num of results, return success - except taos.error.ProgrammingError as err: - return False # failed, for whatever TAOS reason - # Not possile to reach here, non-TAOS exception would have been thrown - - def query(self, sql) -> int: # return num rows returned - ''' Return the number of rows affected''' - raise RuntimeError("Unexpected execution, should be overriden") - - def openByType(self): - raise RuntimeError("Unexpected execution, should be overriden") - - def getQueryResult(self): - raise RuntimeError("Unexpected execution, should be overriden") - - def getResultRows(self): - raise RuntimeError("Unexpected execution, should be overriden") - - def getResultCols(self): - raise RuntimeError("Unexpected execution, should be overriden") - -# Sample: curl -u root:taosdata -d "show databases" localhost:6020/rest/sql - - -class DbConnRest(DbConn): - def __init__(self): - super().__init__() - self._type = self.TYPE_REST - self._url = "http://localhost:6041/rest/sql" # fixed for now - self._result = None - - def openByType(self): # Open connection - pass # do nothing, always open - - def close(self): - if (not self.isOpen): - raise RuntimeError("Cannot clean up database until connection is open") - # Do nothing for REST - Logging.debug("[DB] REST Database connection closed") - self.isOpen = False - - def _doSql(self, sql): - self._lastSql = sql # remember this, last SQL attempted - try: - r = requests.post(self._url, - data = sql, - auth = HTTPBasicAuth('root', 'taosdata')) - except: - print("REST API Failure (TODO: more info here)") - raise - rj = r.json() - # Sanity check for the "Json Result" - if ('status' not in rj): - raise RuntimeError("No status in REST response") - - if rj['status'] == 'error': # clearly reported error - if ('code' not in rj): # error without code - raise RuntimeError("REST error return without code") - errno = rj['code'] # May need to massage this in the future - # print("Raising programming error with REST return: {}".format(rj)) - raise taos.error.ProgrammingError( - rj['desc'], errno) # todo: check existance of 'desc' - - if rj['status'] != 'succ': # better be this - raise RuntimeError( - "Unexpected REST return status: {}".format( - rj['status'])) - - nRows = rj['rows'] if ('rows' in rj) else 0 - self._result = rj - return nRows - - def execute(self, sql): - if (not self.isOpen): - raise RuntimeError( - "Cannot execute database commands until connection is open") - Logging.debug("[SQL-REST] Executing SQL: {}".format(sql)) - nRows = self._doSql(sql) - Logging.debug( - "[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql)) - return nRows - - def query(self, sql): # return rows affected - return self.execute(sql) - - def getQueryResult(self): - return self._result['data'] - - def getResultRows(self): - print(self._result) - raise RuntimeError("TBD") - # return self._tdSql.queryRows - - def getResultCols(self): - print(self._result) - raise RuntimeError("TBD") - - # Duplicate code from TDMySQL, TODO: merge all this into DbConnNative - - -class MyTDSql: - # Class variables - _clsLock = threading.Lock() # class wide locking - longestQuery = None # type: str - longestQueryTime = 0.0 # seconds - lqStartTime = 0.0 - # lqEndTime = 0.0 # Not needed, as we have the two above already - - def __init__(self, 
hostAddr, cfgPath): - # Make the DB connection - self._conn = taos.connect(host=hostAddr, config=cfgPath) - self._cursor = self._conn.cursor() - - self.queryRows = 0 - self.queryCols = 0 - self.affectedRows = 0 - - # def init(self, cursor, log=True): - # self.cursor = cursor - # if (log): - # caller = inspect.getframeinfo(inspect.stack()[1][0]) - # self.cursor.log(caller.filename + ".sql") - - def close(self): - self._cursor.close() # can we double close? - self._conn.close() # TODO: very important, cursor close does NOT close DB connection! - self._cursor.close() - - def _execInternal(self, sql): - startTime = time.time() - ret = self._cursor.execute(sql) - # print("\nSQL success: {}".format(sql)) - queryTime = time.time() - startTime - # Record the query time - cls = self.__class__ - if queryTime > (cls.longestQueryTime + 0.01) : - with cls._clsLock: - cls.longestQuery = sql - cls.longestQueryTime = queryTime - cls.lqStartTime = startTime - return ret - - def query(self, sql): - self.sql = sql - try: - self._execInternal(sql) - self.queryResult = self._cursor.fetchall() - self.queryRows = len(self.queryResult) - self.queryCols = len(self._cursor.description) - except Exception as e: - # caller = inspect.getframeinfo(inspect.stack()[1][0]) - # args = (caller.filename, caller.lineno, sql, repr(e)) - # tdLog.exit("%s(%d) failed: sql:%s, %s" % args) - raise - return self.queryRows - - def execute(self, sql): - self.sql = sql - try: - self.affectedRows = self._execInternal(sql) - except Exception as e: - # caller = inspect.getframeinfo(inspect.stack()[1][0]) - # args = (caller.filename, caller.lineno, sql, repr(e)) - # tdLog.exit("%s(%d) failed: sql:%s, %s" % args) - raise - return self.affectedRows - -class DbConnNative(DbConn): - # Class variables - _lock = threading.Lock() - _connInfoDisplayed = False - totalConnections = 0 # Not private - - def __init__(self): - super().__init__() - self._type = self.TYPE_NATIVE - self._conn = None - # self._cursor = None - - def openByType(self): # Open connection - global gContainer - tdeInstance = gContainer.defTdeInstance # set up in ClientManager, type: TdeInstance - # cfgPath = self.getBuildPath() + "/test/cfg" - cfgPath = tdeInstance.getCfgDir() - hostAddr = tdeInstance.getHostAddr() - - cls = self.__class__ # Get the class, to access class variables - with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! 
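For reference, the REST connector being relocated here talks to taosd's REST gateway by POSTing raw SQL with HTTP basic auth and checking the JSON reply for status == 'succ'. A self-contained sketch of that exchange, using the same endpoint and default credentials that appear in the code above (only run it against a disposable test instance with the REST service enabled):

```python
import requests
from requests.auth import HTTPBasicAuth

def rest_sql(sql: str, host: str = "localhost", port: int = 6041):
    """POST one SQL statement to the TDengine REST gateway and return the data rows."""
    url = "http://{}:{}/rest/sql".format(host, port)
    r = requests.post(url, data=sql, auth=HTTPBasicAuth("root", "taosdata"))
    rj = r.json()
    if rj.get("status") != "succ":   # anything else is treated as an error by the connector
        raise RuntimeError("REST call failed: {}".format(rj))
    return rj.get("data", [])

# Example (requires a running taosd with its REST service on port 6041):
# print(rest_sql("show databases"))
```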
- if not cls._connInfoDisplayed: - cls._connInfoDisplayed = True # updating CLASS variable - Logging.info("Initiating TAOS native connection to {}, using config at {}".format(hostAddr, cfgPath)) - # Make the connection - # self._conn = taos.connect(host=hostAddr, config=cfgPath) # TODO: make configurable - # self._cursor = self._conn.cursor() - # Record the count in the class - self._tdSql = MyTDSql(hostAddr, cfgPath) # making DB connection - cls.totalConnections += 1 - - self._tdSql.execute('reset query cache') - # self._cursor.execute('use db') # do this at the beginning of every - - # Open connection - # self._tdSql = MyTDSql() - # self._tdSql.init(self._cursor) - - def close(self): - if (not self.isOpen): - raise RuntimeError("Cannot clean up database until connection is open") - self._tdSql.close() - # Decrement the class wide counter - cls = self.__class__ # Get the class, to access class variables - with cls._lock: - cls.totalConnections -= 1 - - Logging.debug("[DB] Database connection closed") - self.isOpen = False - - def execute(self, sql): - if (not self.isOpen): - raise RuntimeError("Cannot execute database commands until connection is open") - Logging.debug("[SQL] Executing SQL: {}".format(sql)) - self._lastSql = sql - nRows = self._tdSql.execute(sql) - Logging.debug( - "[SQL] Execution Result, nRows = {}, SQL = {}".format( - nRows, sql)) - return nRows - - def query(self, sql): # return rows affected - if (not self.isOpen): - raise RuntimeError( - "Cannot query database until connection is open") - Logging.debug("[SQL] Executing SQL: {}".format(sql)) - self._lastSql = sql - nRows = self._tdSql.query(sql) - Logging.debug( - "[SQL] Query Result, nRows = {}, SQL = {}".format( - nRows, sql)) - return nRows - # results are in: return self._tdSql.queryResult - - def getQueryResult(self): - return self._tdSql.queryResult - - def getResultRows(self): - return self._tdSql.queryRows - - def getResultCols(self): - return self._tdSql.queryCols - - class AnyState: STATE_INVALID = -1 STATE_EMPTY = 0 # nothing there, no even a DB @@ -1439,64 +1103,6 @@ class Database: return ret -class DbManager(): - ''' This is a wrapper around DbConn(), to make it easier to use. - - TODO: rename this to DbConnManager - ''' - def __init__(self): - self.tableNumQueue = LinearQueue() # TODO: delete? 
- # self.openDbServerConnection() - self._dbConn = DbConn.createNative() if ( - gConfig.connector_type == 'native') else DbConn.createRest() - try: - self._dbConn.open() # may throw taos.error.ProgrammingError: disconnected - except taos.error.ProgrammingError as err: - # print("Error type: {}, msg: {}, value: {}".format(type(err), err.msg, err)) - if (err.msg == 'client disconnected'): # cannot open DB connection - print( - "Cannot establish DB connection, please re-run script without parameter, and follow the instructions.") - sys.exit(2) - else: - print("Failed to connect to DB, errno = {}, msg: {}" - .format(Helper.convertErrno(err.errno), err.msg)) - raise - except BaseException: - print("[=] Unexpected exception") - raise - - # Do this after dbConn is in proper shape - # Moved to Database() - # self._stateMachine = StateMechine(self._dbConn) - - def getDbConn(self): - return self._dbConn - - # TODO: not used any more, to delete - def pickAndAllocateTable(self): # pick any table, and "use" it - return self.tableNumQueue.pickAndAllocate() - - # TODO: Not used any more, to delete - def addTable(self): - with self._lock: - tIndex = self.tableNumQueue.push() - return tIndex - - # Not used any more, to delete - def releaseTable(self, i): # return the table back, so others can use it - self.tableNumQueue.release(i) - - # TODO: not used any more, delete - def getTableNameToDelete(self): - tblNum = self.tableNumQueue.pop() # TODO: race condition! - if (not tblNum): # maybe false - return False - - return "table_{}".format(tblNum) - - def cleanUp(self): - self._dbConn.close() - class TaskExecutor(): class BoundedList: def __init__(self, size=10): @@ -2402,7 +2008,7 @@ class ClientManager: global gContainer tInst = gContainer.defTdeInstance = TdeInstance() # "subdir to hold the instance" - dbManager = DbManager() # Regular function + dbManager = DbManager(gConfig.connector_type, tInst.getDbTarget()) # Regular function thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps) self.tc = ThreadCoordinator(thPool, dbManager) diff --git a/tests/pytest/crash_gen/db.py b/tests/pytest/crash_gen/db.py new file mode 100644 index 0000000000..5404382bf0 --- /dev/null +++ b/tests/pytest/crash_gen/db.py @@ -0,0 +1,426 @@ +from __future__ import annotations + +import sys +import time +import threading +import requests +from requests.auth import HTTPBasicAuth + +import taos +from util.sql import * +from util.cases import * +from util.dnodes import * +from util.log import * + +from .misc import Logging, CrashGenError, Helper +# from .service_manager import TdeInstance + +class DbConn: + TYPE_NATIVE = "native-c" + TYPE_REST = "rest-api" + TYPE_INVALID = "invalid" + + @classmethod + def create(cls, connType, dbTarget): + if connType == cls.TYPE_NATIVE: + return DbConnNative(dbTarget) + elif connType == cls.TYPE_REST: + return DbConnRest(dbTarget) + else: + raise RuntimeError( + "Unexpected connection type: {}".format(connType)) + + @classmethod + def createNative(cls, dbTarget) -> DbConn: + return cls.create(cls.TYPE_NATIVE, dbTarget) + + @classmethod + def createRest(cls, dbTarget) -> DbConn: + return cls.create(cls.TYPE_REST, dbTarget) + + def __init__(self, dbTarget): + self.isOpen = False + self._type = self.TYPE_INVALID + self._lastSql = None + self._dbTarget = dbTarget + + def getLastSql(self): + return self._lastSql + + def open(self): + if (self.isOpen): + raise RuntimeError("Cannot re-open an existing DB connection") + + # below implemented by child classes + self.openByType() + + Logging.debug("[DB] 
data connection opened, type = {}".format(self._type)) + self.isOpen = True + + def close(self): + raise RuntimeError("Unexpected execution, should be overriden") + + def queryScalar(self, sql) -> int: + return self._queryAny(sql) + + def queryString(self, sql) -> str: + return self._queryAny(sql) + + def _queryAny(self, sql): # actual query result as an int + if (not self.isOpen): + raise RuntimeError("Cannot query database until connection is open") + nRows = self.query(sql) + if nRows != 1: + raise taos.error.ProgrammingError( + "Unexpected result for query: {}, rows = {}".format(sql, nRows), + (0x991 if nRows==0 else 0x992) + ) + if self.getResultRows() != 1 or self.getResultCols() != 1: + raise RuntimeError("Unexpected result set for query: {}".format(sql)) + return self.getQueryResult()[0][0] + + def use(self, dbName): + self.execute("use {}".format(dbName)) + + def existsDatabase(self, dbName: str): + ''' Check if a certain database exists ''' + self.query("show databases") + dbs = [v[0] for v in self.getQueryResult()] # ref: https://stackoverflow.com/questions/643823/python-list-transformation + # ret2 = dbName in dbs + # print("dbs = {}, str = {}, ret2={}, type2={}".format(dbs, dbName,ret2, type(dbName))) + return dbName in dbs # TODO: super weird type mangling seen, once here + + def hasTables(self): + return self.query("show tables") > 0 + + def execute(self, sql): + ''' Return the number of rows affected''' + raise RuntimeError("Unexpected execution, should be overriden") + + def safeExecute(self, sql): + '''Safely execute any SQL query, returning True/False upon success/failure''' + try: + self.execute(sql) + return True # ignore num of results, return success + except taos.error.ProgrammingError as err: + return False # failed, for whatever TAOS reason + # Not possile to reach here, non-TAOS exception would have been thrown + + def query(self, sql) -> int: # return num rows returned + ''' Return the number of rows affected''' + raise RuntimeError("Unexpected execution, should be overriden") + + def openByType(self): + raise RuntimeError("Unexpected execution, should be overriden") + + def getQueryResult(self): + raise RuntimeError("Unexpected execution, should be overriden") + + def getResultRows(self): + raise RuntimeError("Unexpected execution, should be overriden") + + def getResultCols(self): + raise RuntimeError("Unexpected execution, should be overriden") + +# Sample: curl -u root:taosdata -d "show databases" localhost:6020/rest/sql + + +class DbConnRest(DbConn): + REST_PORT_INCREMENT = 11 + + def __init__(self, dbTarget: DbTarget): + super().__init__(dbTarget) + self._type = self.TYPE_REST + restPort = dbTarget.port + 11 + self._url = "http://{}:{}/rest/sql".format( + dbTarget.hostAddr, dbTarget.port + self.REST_PORT_INCREMENT) + self._result = None + + def openByType(self): # Open connection + pass # do nothing, always open + + def close(self): + if (not self.isOpen): + raise RuntimeError("Cannot clean up database until connection is open") + # Do nothing for REST + Logging.debug("[DB] REST Database connection closed") + self.isOpen = False + + def _doSql(self, sql): + self._lastSql = sql # remember this, last SQL attempted + try: + r = requests.post(self._url, + data = sql, + auth = HTTPBasicAuth('root', 'taosdata')) + except: + print("REST API Failure (TODO: more info here)") + raise + rj = r.json() + # Sanity check for the "Json Result" + if ('status' not in rj): + raise RuntimeError("No status in REST response") + + if rj['status'] == 'error': # clearly reported 
error + if ('code' not in rj): # error without code + raise RuntimeError("REST error return without code") + errno = rj['code'] # May need to massage this in the future + # print("Raising programming error with REST return: {}".format(rj)) + raise taos.error.ProgrammingError( + rj['desc'], errno) # todo: check existance of 'desc' + + if rj['status'] != 'succ': # better be this + raise RuntimeError( + "Unexpected REST return status: {}".format( + rj['status'])) + + nRows = rj['rows'] if ('rows' in rj) else 0 + self._result = rj + return nRows + + def execute(self, sql): + if (not self.isOpen): + raise RuntimeError( + "Cannot execute database commands until connection is open") + Logging.debug("[SQL-REST] Executing SQL: {}".format(sql)) + nRows = self._doSql(sql) + Logging.debug( + "[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql)) + return nRows + + def query(self, sql): # return rows affected + return self.execute(sql) + + def getQueryResult(self): + return self._result['data'] + + def getResultRows(self): + print(self._result) + raise RuntimeError("TBD") # TODO: finish here to support -v under -c rest + # return self._tdSql.queryRows + + def getResultCols(self): + print(self._result) + raise RuntimeError("TBD") + + # Duplicate code from TDMySQL, TODO: merge all this into DbConnNative + + +class MyTDSql: + # Class variables + _clsLock = threading.Lock() # class wide locking + longestQuery = None # type: str + longestQueryTime = 0.0 # seconds + lqStartTime = 0.0 + # lqEndTime = 0.0 # Not needed, as we have the two above already + + def __init__(self, hostAddr, cfgPath): + # Make the DB connection + self._conn = taos.connect(host=hostAddr, config=cfgPath) + self._cursor = self._conn.cursor() + + self.queryRows = 0 + self.queryCols = 0 + self.affectedRows = 0 + + # def init(self, cursor, log=True): + # self.cursor = cursor + # if (log): + # caller = inspect.getframeinfo(inspect.stack()[1][0]) + # self.cursor.log(caller.filename + ".sql") + + def close(self): + self._cursor.close() # can we double close? + self._conn.close() # TODO: very important, cursor close does NOT close DB connection! 
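One detail worth calling out in MyTDSql above: it remembers the slowest SQL statement seen by any worker thread, using class-level fields guarded by a class-wide lock and a small 0.01 s guard band. A condensed, standalone sketch of that bookkeeping (the executor passed in is a stand-in, not the real cursor):

```python
import threading
import time

class SlowQueryTracker:
    _cls_lock = threading.Lock()   # class-wide lock, shared by every thread
    longest_query = None           # slowest SQL text seen so far
    longest_query_time = 0.0       # and how long it took, in seconds

    @classmethod
    def timed(cls, sql, execute):
        """Run execute(sql) and remember it if it is clearly the slowest statement so far."""
        start = time.time()
        ret = execute(sql)
        elapsed = time.time() - start
        if elapsed > cls.longest_query_time + 0.01:   # ignore ties within 10 ms
            with cls._cls_lock:
                cls.longest_query = sql
                cls.longest_query_time = elapsed
        return ret

# Stand-in executor, just to show the call shape:
print(SlowQueryTracker.timed("select 1", lambda s: len(s)))
```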
+ self._cursor.close() + + def _execInternal(self, sql): + startTime = time.time() + ret = self._cursor.execute(sql) + # print("\nSQL success: {}".format(sql)) + queryTime = time.time() - startTime + # Record the query time + cls = self.__class__ + if queryTime > (cls.longestQueryTime + 0.01) : + with cls._clsLock: + cls.longestQuery = sql + cls.longestQueryTime = queryTime + cls.lqStartTime = startTime + return ret + + def query(self, sql): + self.sql = sql + try: + self._execInternal(sql) + self.queryResult = self._cursor.fetchall() + self.queryRows = len(self.queryResult) + self.queryCols = len(self._cursor.description) + except Exception as e: + # caller = inspect.getframeinfo(inspect.stack()[1][0]) + # args = (caller.filename, caller.lineno, sql, repr(e)) + # tdLog.exit("%s(%d) failed: sql:%s, %s" % args) + raise + return self.queryRows + + def execute(self, sql): + self.sql = sql + try: + self.affectedRows = self._execInternal(sql) + except Exception as e: + # caller = inspect.getframeinfo(inspect.stack()[1][0]) + # args = (caller.filename, caller.lineno, sql, repr(e)) + # tdLog.exit("%s(%d) failed: sql:%s, %s" % args) + raise + return self.affectedRows + +class DbTarget: + def __init__(self, cfgPath, hostAddr, port): + self.cfgPath = cfgPath + self.hostAddr = hostAddr + self.port = port + + def __repr__(self): + return "[DbTarget: cfgPath={}, host={}:{}]".format( + self.cfgPath, self.hostAddr, self.port) + +class DbConnNative(DbConn): + # Class variables + _lock = threading.Lock() + _connInfoDisplayed = False + totalConnections = 0 # Not private + + def __init__(self, dbTarget): + super().__init__(dbTarget) + self._type = self.TYPE_NATIVE + self._conn = None + # self._cursor = None + + def openByType(self): # Open connection + # global gContainer + # tInst = tInst or gContainer.defTdeInstance # set up in ClientManager, type: TdeInstance + # cfgPath = self.getBuildPath() + "/test/cfg" + # cfgPath = tInst.getCfgDir() + # hostAddr = tInst.getHostAddr() + + cls = self.__class__ # Get the class, to access class variables + with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! 
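The new DbTarget above is what lets connections be created without reaching into a global TdeInstance: it carries the config directory (for the native connector) plus host and port (for both connectors), and the REST URL is derived from it by adding the 11-port offset shown earlier. A rough usage sketch with an illustrative path (not the real class):

```python
class DbTargetSketch:
    """Illustrative stand-in for DbTarget: everything a connector needs to reach one instance."""
    REST_PORT_INCREMENT = 11   # REST gateway sits 11 ports above the native server port

    def __init__(self, cfg_path: str, host_addr: str, port: int):
        self.cfg_path = cfg_path       # used by the native (taos.connect) path
        self.host_addr = host_addr
        self.port = port

    def ep(self) -> str:               # native end point, "host:port"
        return "{}:{}".format(self.host_addr, self.port)

    def rest_url(self) -> str:         # derived REST endpoint
        return "http://{}:{}/rest/sql".format(self.host_addr, self.port + self.REST_PORT_INCREMENT)

target = DbTargetSketch("build/cluster_dnode_0/cfg", "127.0.0.1", 6030)
print(target.ep(), target.rest_url())   # 127.0.0.1:6030 http://127.0.0.1:6041/rest/sql
```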
+ dbTarget = self._dbTarget + if not cls._connInfoDisplayed: + cls._connInfoDisplayed = True # updating CLASS variable + Logging.info("Initiating TAOS native connection to {}".format(dbTarget)) + # Make the connection + # self._conn = taos.connect(host=hostAddr, config=cfgPath) # TODO: make configurable + # self._cursor = self._conn.cursor() + # Record the count in the class + self._tdSql = MyTDSql(dbTarget.hostAddr, dbTarget.cfgPath) # making DB connection + cls.totalConnections += 1 + + self._tdSql.execute('reset query cache') + # self._cursor.execute('use db') # do this at the beginning of every + + # Open connection + # self._tdSql = MyTDSql() + # self._tdSql.init(self._cursor) + + def close(self): + if (not self.isOpen): + raise RuntimeError("Cannot clean up database until connection is open") + self._tdSql.close() + # Decrement the class wide counter + cls = self.__class__ # Get the class, to access class variables + with cls._lock: + cls.totalConnections -= 1 + + Logging.debug("[DB] Database connection closed") + self.isOpen = False + + def execute(self, sql): + if (not self.isOpen): + raise RuntimeError("Cannot execute database commands until connection is open") + Logging.debug("[SQL] Executing SQL: {}".format(sql)) + self._lastSql = sql + nRows = self._tdSql.execute(sql) + Logging.debug( + "[SQL] Execution Result, nRows = {}, SQL = {}".format( + nRows, sql)) + return nRows + + def query(self, sql): # return rows affected + if (not self.isOpen): + raise RuntimeError( + "Cannot query database until connection is open") + Logging.debug("[SQL] Executing SQL: {}".format(sql)) + self._lastSql = sql + nRows = self._tdSql.query(sql) + Logging.debug( + "[SQL] Query Result, nRows = {}, SQL = {}".format( + nRows, sql)) + return nRows + # results are in: return self._tdSql.queryResult + + def getQueryResult(self): + return self._tdSql.queryResult + + def getResultRows(self): + return self._tdSql.queryRows + + def getResultCols(self): + return self._tdSql.queryCols + + +class DbManager(): + ''' This is a wrapper around DbConn(), to make it easier to use. + + TODO: rename this to DbConnManager + ''' + def __init__(self, cType, dbTarget): + # self.tableNumQueue = LinearQueue() # TODO: delete? 
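Connection creation is now driven purely by the connector type plus a DbTarget; in "mixed" mode the worker threads flip a coin per connection. A small sketch of that dispatch, with the factories passed in as parameters so it stands alone (the real code calls DbConn.createNative / DbConn.createRest):

```python
import random

def create_conn(connector_type: str, db_target, create_native, create_rest):
    """Pick a connector the same way WorkerThread/DbManager do."""
    if connector_type == "native":
        return create_native(db_target)
    if connector_type == "rest":
        return create_rest(db_target)
    if connector_type == "mixed":
        # 1/2 chance each, decided per connection
        return create_native(db_target) if random.randrange(2) == 0 else create_rest(db_target)
    raise RuntimeError("Unexpected connector type: {}".format(connector_type))

# Demo with stand-in factories:
print(create_conn("mixed", ("127.0.0.1", 6030),
                  create_native=lambda t: ("native", t),
                  create_rest=lambda t: ("rest", t)))
```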
+ # self.openDbServerConnection() + self._dbConn = DbConn.createNative(dbTarget) if ( + cType == 'native') else DbConn.createRest(dbTarget) + try: + self._dbConn.open() # may throw taos.error.ProgrammingError: disconnected + except taos.error.ProgrammingError as err: + # print("Error type: {}, msg: {}, value: {}".format(type(err), err.msg, err)) + if (err.msg == 'client disconnected'): # cannot open DB connection + print( + "Cannot establish DB connection, please re-run script without parameter, and follow the instructions.") + sys.exit(2) + else: + print("Failed to connect to DB, errno = {}, msg: {}" + .format(Helper.convertErrno(err.errno), err.msg)) + raise + except BaseException: + print("[=] Unexpected exception") + raise + + # Do this after dbConn is in proper shape + # Moved to Database() + # self._stateMachine = StateMechine(self._dbConn) + + def getDbConn(self): + return self._dbConn + + # TODO: not used any more, to delete + def pickAndAllocateTable(self): # pick any table, and "use" it + return self.tableNumQueue.pickAndAllocate() + + # TODO: Not used any more, to delete + def addTable(self): + with self._lock: + tIndex = self.tableNumQueue.push() + return tIndex + + # Not used any more, to delete + def releaseTable(self, i): # return the table back, so others can use it + self.tableNumQueue.release(i) + + # TODO: not used any more, delete + def getTableNameToDelete(self): + tblNum = self.tableNumQueue.pop() # TODO: race condition! + if (not tblNum): # maybe false + return False + + return "table_{}".format(tblNum) + + def cleanUp(self): + self._dbConn.close() diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py index cdb12303a2..11e35b6de8 100644 --- a/tests/pytest/crash_gen/service_manager.py +++ b/tests/pytest/crash_gen/service_manager.py @@ -16,7 +16,9 @@ except: sys.exit(-1) from queue import Queue, Empty + from .misc import Logging, Status, CrashGenError, Dice +from .db import DbConn, DbTarget class TdeInstance(): """ @@ -45,9 +47,17 @@ class TdeInstance(): .format(selfPath, projPath)) return buildPath - def __init__(self, subdir='test'): - self._buildDir = self._getBuildPath() - self._subdir = '/' + subdir # TODO: tolerate "/" + def __init__(self, subdir='test', port=6030, fepPort=6030): + self._buildDir = self._getBuildPath() + self._subdir = '/' + subdir # TODO: tolerate "/" + self._port = port # TODO: support different IP address too + self._fepPort = fepPort + + def getDbTarget(self): + return DbTarget(self.getCfgDir(), self.getHostAddr(), self._port) + + def getPort(self): + return self._port def __repr__(self): return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) @@ -74,9 +84,10 @@ class TdeInstance(): os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p" # Now we have a good cfg dir cfgValues = { - 'runDir': self.getRunDir(), - 'ip': '127.0.0.1', # TODO: change to a network addressable ip - 'port': 6030, + 'runDir': self.getRunDir(), + 'ip': '127.0.0.1', # TODO: change to a network addressable ip + 'port': self._port, + 'fepPort': self._fepPort, } cfgTemplate = """ dataDir {runDir}/data @@ -84,7 +95,7 @@ logDir {runDir}/log charset UTF-8 -firstEp {ip}:{port} +firstEp {ip}:{fepPort} fqdn {ip} serverPort {port} @@ -236,9 +247,10 @@ class TdeSubProcess: class ServiceManager: PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process - def __init__(self, numDnodes = 1): + def __init__(self, numDnodes = 1): # Otherwise we run a cluster Logging.info("TDengine Service Manager (TSM) 
created") self._numDnodes = numDnodes # >1 means we have a cluster + self._lock = threading.Lock() # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec # signal.signal(signal.SIGINT, self.sigIntHandler) # signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler! @@ -246,12 +258,20 @@ class ServiceManager: self.inSigHandler = False # self._status = MainExec.STATUS_RUNNING # set inside # _startTaosService() + self._runCluster = (numDnodes >= 1) self.svcMgrThreads = [] # type: List[ServiceManagerThread] for i in range(0, numDnodes): self.svcMgrThreads.append(ServiceManagerThread(i)) - self._lock = threading.Lock() - # self._isRestarting = False + def _createThread(self, dnIndex): + if not self._runCluster: # single instance + return ServiceManagerThread(0) + # Create all threads in a cluster + subdir = 'cluster_dnode_{}'.format(dnIndex) + fepPort= 6030 # firstEP Port + port = fepPort + dnIndex * 100 + ti = TdeInstance(subdir, port, fepPort) + return ServiceManagerThread(dnIndex, ti) def _doMenu(self): choice = "" @@ -488,11 +508,33 @@ class ServiceManagerThread: if self._status == Status.STATUS_RUNNING: Logging.info("[] TDengine service READY to process requests") Logging.info("[] TAOS service started: {}".format(self)) + self._verifyDnode(self._tInst) # query and ensure dnode is ready return # now we've started # TODO: handle failure-to-start better? self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output raise RuntimeError("TDengine service did not start successfully: {}".format(self)) + def _verifyDnode(self, tInst: TdeInstance): + dbc = DbConn.createNative(tInst.getDbTarget()) + dbc.open() + dbc.query("show dnodes") + # dbc.query("DESCRIBE {}.{}".format(dbName, self._stName)) + cols = dbc.getQueryResult() # id,end_point,vnodes,cores,status,role,create_time,offline reason + # ret = {row[0]:row[1] for row in stCols if row[3]=='TAG'} # name:type + isValid = False + for col in cols: + print("col = {}".format(col)) + ep = col[1].split(':') # 10.1.30.2:6030 + print("ep={}".format(ep)) + if tInst.getPort() == int(ep[1]): # That's us + print("Valid Dnode matched!") + isValid = True # now we are valid + break + if not isValid: + raise RuntimeError("Failed to start Dnode, port = {}, expected: {}". 
+ format(ep[1], tInst.getPort())) + dbc.close() + def stop(self): # can be called from both main thread or signal handler print("Terminating TDengine service running as the sub process...") From efac9a1de702a9c78f5a0feff4c16d2931b7cf8d Mon Sep 17 00:00:00 2001 From: Hui Li Date: Sat, 24 Oct 2020 10:10:20 +0800 Subject: [PATCH 11/16] modify release scripts --- packaging/tools/makeclient_power.sh | 2 +- packaging/tools/makepkg_power.sh | 2 +- packaging/tools/post.sh | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/packaging/tools/makeclient_power.sh b/packaging/tools/makeclient_power.sh index b4416a68bb..faa5a03f52 100755 --- a/packaging/tools/makeclient_power.sh +++ b/packaging/tools/makeclient_power.sh @@ -123,7 +123,7 @@ if [[ "$pagMode" != "lite" ]] && [[ "$cpuType" != "aarch32" ]]; then cp -r ${examples_dir}/R ${install_dir}/examples sed -i '/password/ {s/taosdata/powerdb/g}' ${install_dir}/examples/R/command.txt cp -r ${examples_dir}/go ${install_dir}/examples - sed -i '/root/ {s/taosdata/powerdb/g}' ${install_dir}/examples/go/src/taosapp/taosapp.go + sed -i '/root/ {s/taosdata/powerdb/g}' ${install_dir}/examples/go/taosdemo.go fi # Copy driver mkdir -p ${install_dir}/driver diff --git a/packaging/tools/makepkg_power.sh b/packaging/tools/makepkg_power.sh index 3d625900c9..2c02b99787 100755 --- a/packaging/tools/makepkg_power.sh +++ b/packaging/tools/makepkg_power.sh @@ -146,7 +146,7 @@ if [[ "$pagMode" != "lite" ]] && [[ "$cpuType" != "aarch32" ]]; then cp -r ${examples_dir}/R ${install_dir}/examples sed -i '/password/ {s/taosdata/powerdb/g}' ${install_dir}/examples/R/command.txt cp -r ${examples_dir}/go ${install_dir}/examples - sed -i '/root/ {s/taosdata/powerdb/g}' ${install_dir}/examples/go/src/taosapp/taosapp.go + sed -i '/root/ {s/taosdata/powerdb/g}' ${install_dir}/examples/go/taosdemo.go fi # Copy driver mkdir -p ${install_dir}/driver diff --git a/packaging/tools/post.sh b/packaging/tools/post.sh index 0feb64c795..726eda69d0 100755 --- a/packaging/tools/post.sh +++ b/packaging/tools/post.sh @@ -134,6 +134,29 @@ function install_config() { else break fi + done + + # user email + #EMAIL_PATTERN='^[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+$' + #EMAIL_PATTERN='^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)+$' + #EMAIL_PATTERN="^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)+$" + echo + echo -e -n "${GREEN}Enter your email address for priority support or enter empty to skip${NC}: " + read emailAddr + while true; do + if [ ! 
-z "$emailAddr" ]; then + # check the format of the emailAddr + #if [[ "$emailAddr" =~ $EMAIL_PATTERN ]]; then + # Write the email address to temp file + email_file="${install_main_dir}/email" + ${csudo} bash -c "echo $emailAddr > ${email_file}" + break + #else + # read -p "Please enter the correct email address: " emailAddr + #fi + else + break + fi done } From 5961df09b527a40876ccd819d6bf89ae05bd52c1 Mon Sep 17 00:00:00 2001 From: Hui Li Date: Sat, 24 Oct 2020 10:15:36 +0800 Subject: [PATCH 12/16] modify release script --- packaging/tools/post.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/packaging/tools/post.sh b/packaging/tools/post.sh index 726eda69d0..d91daaa5c4 100755 --- a/packaging/tools/post.sh +++ b/packaging/tools/post.sh @@ -10,6 +10,7 @@ data_dir="/var/lib/taos" log_dir="/var/log/taos" data_link_dir="/usr/local/taos/data" log_link_dir="/usr/local/taos/log" +install_main_dir="/usr/local/taos" # static directory cfg_dir="/usr/local/taos/cfg" From fbb406e4af51e5ba6f70fb71f1c6e603f21cceca Mon Sep 17 00:00:00 2001 From: Hui Li Date: Sat, 24 Oct 2020 11:16:48 +0800 Subject: [PATCH 13/16] [add proxy for go] --- tests/gotest/batchtest.bat | 3 +++ tests/gotest/batchtest.sh | 3 +++ 2 files changed, 6 insertions(+) mode change 100644 => 100755 tests/gotest/batchtest.bat mode change 100644 => 100755 tests/gotest/batchtest.sh diff --git a/tests/gotest/batchtest.bat b/tests/gotest/batchtest.bat old mode 100644 new mode 100755 index abe9a58f31..efd8961bb0 --- a/tests/gotest/batchtest.bat +++ b/tests/gotest/batchtest.bat @@ -7,6 +7,9 @@ set serverPort=%2 if "%severIp%"=="" (set severIp=127.0.0.1) if "%serverPort%"=="" (set serverPort=6030) +go env -w GO111MODULE=on +go env -w GOPROXY=https://goproxy.io,direct + cd case001 case001.bat %severIp% %serverPort% diff --git a/tests/gotest/batchtest.sh b/tests/gotest/batchtest.sh old mode 100644 new mode 100755 index e8ed9ecbed..0fbbf40714 --- a/tests/gotest/batchtest.sh +++ b/tests/gotest/batchtest.sh @@ -13,6 +13,9 @@ if [ ! -n "$serverPort" ]; then serverPort=6030 fi +go env -w GO111MODULE=on +go env -w GOPROXY=https://goproxy.io,direct + bash ./case001/case001.sh $severIp $serverPort #bash ./case002/case002.sh $severIp $serverPort #bash ./case003/case003.sh $severIp $serverPort From 5a92c415a2b04f31dd1ad7f230cb04b31a95a0c3 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Sat, 24 Oct 2020 08:06:59 +0000 Subject: [PATCH 14/16] Enhanced crash_gen tool to run clusters, with a new README file --- tests/pytest/crash_gen/README.md | 130 +++++++++ tests/pytest/crash_gen/crash_gen.py | 57 ++-- tests/pytest/crash_gen/db.py | 25 +- tests/pytest/crash_gen/misc.py | 46 ++- tests/pytest/crash_gen/service_manager.py | 330 +++++++++++++--------- 5 files changed, 419 insertions(+), 169 deletions(-) create mode 100644 tests/pytest/crash_gen/README.md diff --git a/tests/pytest/crash_gen/README.md b/tests/pytest/crash_gen/README.md new file mode 100644 index 0000000000..6788ab1a63 --- /dev/null +++ b/tests/pytest/crash_gen/README.md @@ -0,0 +1,130 @@ +
User's Guide to the Crash_Gen Tool
+ +# Introduction + +To effectively test and debug our TDengine product, we have developed a simple tool to +exercise various functions of the system in a randomized fashion, hoping to expose +maximum number of problems, hopefully without a pre-determined scenario. + +# Preparation + +To run this tool, please ensure the followed preparation work is done first. + +1. Fetch a copy of the TDengine source code, and build it successfully in the `build/` + directory +1. Ensure that the system has Python3.8 or above properly installed. We use + Ubuntu 20.04LTS as our own development environment, and suggest you also use such + an environment if possible. + +# Simple Execution + +To run the tool with the simplest method, follow the steps below: + +1. Open a terminal window, start the `taosd` service in the `build/` directory + (or however you prefer to start the `taosd` service) +1. Open another terminal window, go into the `tests/pytest/` directory, and + run `./crash_gen.sh -p -t 3 -s 10` (change the two parameters here as you wish) +1. Watch the output to the end and see if you get a `SUCCESS` or `FAILURE` + +That's it! + +# Running Clusters + +This tool also makes it easy to test/verify the clustering capabilities of TDengine. You +can start a cluster quite easily with the following command: + +``` +$ cd tests/pytest/ +$ ./crash_gen.sh -e -o 3 +``` + +The `-e` option above tells the tool to start the service, and do not run any tests, while +the `-o 3` option tells the tool to start 3 DNodes and join them together in a cluster. +Obviously you can adjust the the number here. + +## Behind the Scenes + +When the tool runs a cluster, it users a number of directories, each holding the information +for a single DNode, see: + +``` +$ ls build/cluster* +build/cluster_dnode_0: +cfg data log + +build/cluster_dnode_1: +cfg data log + +build/cluster_dnode_2: +cfg data log +``` + +Therefore, when something goes wrong and you want to reset everything with the cluster, simple +erase all the files: + +``` +$ rm -rf build/cluster_dnode_* +``` + +## Addresses and Ports + +The DNodes in the cluster all binds the the `127.0.0.1` IP address (for now anyway), and +uses port 6030 for the first DNode, and 6130 for the 2nd one, and so on. + +## Testing Against a Cluster + +In a separate terminal window, you can invoke the tool in client mode and test against +a cluster, such as: + +``` +$ ./crash_gen.sh -p -t 10 -s 100 -i 3 +``` + +Here the `-i` option tells the tool to always create tables with 3 replicas, and run +all tests against such tables. + +# Additional Features + +The exhaustive features of the tool is available through the `-h` option: + +``` +$ ./crash_gen.sh -h +usage: crash_gen_bootstrap.py [-h] [-a] [-b MAX_DBS] [-c CONNECTOR_TYPE] [-d] [-e] [-g IGNORE_ERRORS] [-i MAX_REPLICAS] [-l] [-n] [-o NUM_DNODES] [-p] [-r] + [-s MAX_STEPS] [-t NUM_THREADS] [-v] [-x] + +TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below) +--------------------------------------------------------------------- +1. You build TDengine in the top level ./build directory, as described in offical docs +2. You run the server there before this script: ./build/bin/taosd -c test/cfg + +optional arguments: + -h, --help show this help message and exit + -a, --auto-start-service + Automatically start/stop the TDengine service (default: false) + -b MAX_DBS, --max-dbs MAX_DBS + Maximum number of DBs to keep, set to disable dropping DB. 
(default: 0) + -c CONNECTOR_TYPE, --connector-type CONNECTOR_TYPE + Connector type to use: native, rest, or mixed (default: 10) + -d, --debug Turn on DEBUG mode for more logging (default: false) + -e, --run-tdengine Run TDengine service in foreground (default: false) + -g IGNORE_ERRORS, --ignore-errors IGNORE_ERRORS + Ignore error codes, comma separated, 0x supported (default: None) + -i MAX_REPLICAS, --max-replicas MAX_REPLICAS + Maximum number of replicas to use, when testing against clusters. (default: 1) + -l, --larger-data Write larger amount of data during write operations (default: false) + -n, --dynamic-db-table-names + Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false) + -o NUM_DNODES, --num-dnodes NUM_DNODES + Number of Dnodes to initialize, used with -e option. (default: 1) + -p, --per-thread-db-connection + Use a single shared db connection (default: false) + -r, --record-ops Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false) + -s MAX_STEPS, --max-steps MAX_STEPS + Maximum number of steps to run (default: 100) + -t NUM_THREADS, --num-threads NUM_THREADS + Number of threads to run (default: 10) + -v, --verify-data Verify data written in a number of places by reading back (default: false) + -x, --continue-on-exception + Continue execution after encountering unexpected/disallowed errors/exceptions (default: false) +``` + diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 2d52d274c3..74e3964d5a 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -18,6 +18,7 @@ from __future__ import annotations from typing import Set from typing import Dict from typing import List +from typing import Optional # Type hinting, ref: https://stackoverflow.com/questions/19202633/python-3-type-hinting-for-none import textwrap import time @@ -62,9 +63,10 @@ gContainer: Container class WorkerThread: - def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator, - # te: TaskExecutor, - ): # note: main thread context! + def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator): + """ + Note: this runs in the main thread context + """ # self._curStep = -1 self._pool = pool self._tid = tid @@ -1007,6 +1009,8 @@ class Database: possibly in a cluster environment. For now we use it to manage state transitions in that database + + TODO: consider moving, but keep in mind it contains "StateMachine" ''' _clsLock = threading.Lock() # class wide lock _lastInt = 101 # next one is initial integer @@ -1182,7 +1186,7 @@ class Task(): def __init__(self, execStats: ExecutionStats, db: Database): self._workerThread = None - self._err = None # type: Exception + self._err: Optional[Exception] = None self._aborted = False self._curStep = None self._numRows = None # Number of rows affected @@ -1318,10 +1322,11 @@ class Task(): self._aborted = True traceback.print_exc() except BaseException: # TODO: what is this again??!! 
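The directory and port scheme described in the README falls straight out of how the service manager creates one TdeInstance per dnode: subdirectory cluster_dnode_N under build/, firstEp port 6030, and a native port 100 higher for each additional node (with the REST port a further +11, per the connector code). A small sketch of that mapping, mirroring the _createThread() logic from patch 10:

```python
def cluster_layout(num_dnodes: int, fep_port: int = 6030):
    """(subdir, native port) per dnode, matching the per-dnode scheme used in this series."""
    layout = []
    for i in range(num_dnodes):
        subdir = "cluster_dnode_{}".format(i)   # holds cfg/, data/ and log/ under build/
        port = fep_port + i * 100               # 6030, 6130, 6230, ...
        layout.append((subdir, port))
    return layout

for subdir, port in cluster_layout(3):
    print("{:18} -> 127.0.0.1:{}".format(subdir, port))
```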
- self.logDebug( - "[=] Unexpected exception, SQL: {}".format( - wt.getDbConn().getLastSql())) - raise + raise RuntimeError("Punt") + # self.logDebug( + # "[=] Unexpected exception, SQL: {}".format( + # wt.getDbConn().getLastSql())) + # raise self._execStats.endTaskType(self.__class__.__name__, self.isSuccess()) self.logDebug("[X] task execution completed, {}, status: {}".format( @@ -1498,7 +1503,8 @@ class TaskCreateDb(StateTransitionTask): # was: self.execWtSql(wt, "create database db") repStr = "" if gConfig.max_replicas != 1: - numReplica = Dice.throw(gConfig.max_replicas) + 1 # 1,2 ... N + # numReplica = Dice.throw(gConfig.max_replicas) + 1 # 1,2 ... N + numReplica = gConfig.max_replicas # fixed, always repStr = "replica {}".format(numReplica) self.execWtSql(wt, "create database {} {}" .format(self._db.getName(), repStr) ) @@ -2050,7 +2056,7 @@ class ClientManager: class MainExec: def __init__(self): self._clientMgr = None - self._svcMgr = None + self._svcMgr = None # type: ServiceManager signal.signal(signal.SIGTERM, self.sigIntHandler) signal.signal(signal.SIGINT, self.sigIntHandler) @@ -2063,17 +2069,16 @@ class MainExec: self._svcMgr.sigUsrHandler(signalNumber, frame) def sigIntHandler(self, signalNumber, frame): - if self._svcMgr: + if self._svcMgr: self._svcMgr.sigIntHandler(signalNumber, frame) - if self._clientMgr: + if self._clientMgr: self._clientMgr.sigIntHandler(signalNumber, frame) def runClient(self): global gSvcMgr if gConfig.auto_start_service: - self._svcMgr = ServiceManager() - gSvcMgr = self._svcMgr # hack alert - self._svcMgr.startTaosService() # we start, don't run + gSvcMgr = self._svcMgr = ServiceManager() # hack alert + gSvcMgr.startTaosService() # we start, don't run self._clientMgr = ClientManager() ret = None @@ -2086,12 +2091,10 @@ class MainExec: def runService(self): global gSvcMgr - self._svcMgr = ServiceManager() - gSvcMgr = self._svcMgr # save it in a global variable TODO: hack alert + gSvcMgr = self._svcMgr = ServiceManager(gConfig.num_dnodes) # save it in a global variable TODO: hack alert - self._svcMgr.run() # run to some end state - self._svcMgr = None - gSvcMgr = None + gSvcMgr.run() # run to some end state + gSvcMgr = self._svcMgr = None def init(self): # TODO: refactor global gContainer @@ -2165,6 +2168,13 @@ class MainExec: '--dynamic-db-table-names', action='store_true', help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)') + parser.add_argument( + '-o', + '--num-dnodes', + action='store', + default=1, + type=int, + help='Number of Dnodes to initialize, used with -e option. 
(default: 1)') parser.add_argument( '-p', '--per-thread-db-connection', @@ -2209,7 +2219,12 @@ class MainExec: def run(self): if gConfig.run_tdengine: # run server - self.runService() + try: + self.runService() + return 0 # success + except ConnectionError as err: + Logging.error("Failed to make DB connection, please check DB instance manually") + return -1 # failure else: return self.runClient() diff --git a/tests/pytest/crash_gen/db.py b/tests/pytest/crash_gen/db.py index 5404382bf0..43c855647c 100644 --- a/tests/pytest/crash_gen/db.py +++ b/tests/pytest/crash_gen/db.py @@ -12,7 +12,9 @@ from util.cases import * from util.dnodes import * from util.log import * -from .misc import Logging, CrashGenError, Helper +from .misc import Logging, CrashGenError, Helper, Dice +import os +import datetime # from .service_manager import TdeInstance class DbConn: @@ -44,6 +46,9 @@ class DbConn: self._lastSql = None self._dbTarget = dbTarget + def __repr__(self): + return "[DbConn: type={}, target={}]".format(self._type, self._dbTarget) + def getLastSql(self): return self._lastSql @@ -54,7 +59,7 @@ class DbConn: # below implemented by child classes self.openByType() - Logging.debug("[DB] data connection opened, type = {}".format(self._type)) + Logging.debug("[DB] data connection opened: {}".format(self)) self.isOpen = True def close(self): @@ -277,15 +282,18 @@ class DbTarget: self.cfgPath = cfgPath self.hostAddr = hostAddr self.port = port - + def __repr__(self): return "[DbTarget: cfgPath={}, host={}:{}]".format( - self.cfgPath, self.hostAddr, self.port) + Helper.getFriendlyPath(self.cfgPath), self.hostAddr, self.port) + + def getEp(self): + return "{}:{}".format(self.hostAddr, self.port) class DbConnNative(DbConn): # Class variables _lock = threading.Lock() - _connInfoDisplayed = False + # _connInfoDisplayed = False # TODO: find another way to display this totalConnections = 0 # Not private def __init__(self, dbTarget): @@ -304,9 +312,9 @@ class DbConnNative(DbConn): cls = self.__class__ # Get the class, to access class variables with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! 
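Patch 10's headline change, verifying that a freshly started dnode is actually ready, combines with the createDnode() helper added to TdeInstance elsewhere in this patch: query `show dnodes` through the firstEp, register the new end point if it is missing, and check its status column. A sketch of that flow written against the tool's DbConn-style API (dbc is any object exposing execute/query/getQueryResult; a real check would poll and retry rather than query once):

```python
def ensure_dnode_ready(dbc, ep: str) -> bool:
    """Register a dnode if needed and report whether it shows up as ready.

    `ep` is the "host:port" end point of the dnode being checked.
    Rows of `show dnodes`: id, end_point, vnodes, cores, status, role, create_time, ...
    """
    dbc.query("show dnodes")
    dnodes = {row[1]: row[4] for row in dbc.getQueryResult()}   # end_point -> status
    if ep not in dnodes:
        dbc.execute('CREATE DNODE "{}"'.format(ep))             # ask the firstEp to admit it
        dbc.query("show dnodes")                                # re-read; may need retries
        dnodes = {row[1]: row[4] for row in dbc.getQueryResult()}
    return dnodes.get(ep) == "ready"
```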
dbTarget = self._dbTarget - if not cls._connInfoDisplayed: - cls._connInfoDisplayed = True # updating CLASS variable - Logging.info("Initiating TAOS native connection to {}".format(dbTarget)) + # if not cls._connInfoDisplayed: + # cls._connInfoDisplayed = True # updating CLASS variable + Logging.debug("Initiating TAOS native connection to {}".format(dbTarget)) # Make the connection # self._conn = taos.connect(host=hostAddr, config=cfgPath) # TODO: make configurable # self._cursor = self._conn.cursor() @@ -424,3 +432,4 @@ class DbManager(): def cleanUp(self): self._dbConn.close() + diff --git a/tests/pytest/crash_gen/misc.py b/tests/pytest/crash_gen/misc.py index 08e50e5070..8a2817b389 100644 --- a/tests/pytest/crash_gen/misc.py +++ b/tests/pytest/crash_gen/misc.py @@ -1,6 +1,7 @@ import threading import random import logging +import os class CrashGenError(Exception): @@ -26,7 +27,7 @@ class LoggingFilter(logging.Filter): class MyLoggingAdapter(logging.LoggerAdapter): def process(self, msg, kwargs): - return "[{}]{}".format(threading.get_ident() % 10000, msg), kwargs + return "[{}] {}".format(threading.get_ident() % 10000, msg), kwargs # return '[%s] %s' % (self.extra['connid'], msg), kwargs @@ -71,12 +72,44 @@ class Logging: def warning(cls, msg): cls.logger.warning(msg) + @classmethod + def error(cls, msg): + cls.logger.error(msg) + class Status: STATUS_STARTING = 1 STATUS_RUNNING = 2 STATUS_STOPPING = 3 STATUS_STOPPED = 4 + def __init__(self, status): + self.set(status) + + def __repr__(self): + return "[Status: v={}]".format(self._status) + + def set(self, status): + self._status = status + + def get(self): + return self._status + + def isStarting(self): + return self._status == Status.STATUS_STARTING + + def isRunning(self): + # return self._thread and self._thread.is_alive() + return self._status == Status.STATUS_RUNNING + + def isStopping(self): + return self._status == Status.STATUS_STOPPING + + def isStopped(self): + return self._status == Status.STATUS_STOPPED + + def isStable(self): + return self.isRunning() or self.isStopped() + # Deterministic random number generator class Dice(): seeded = False # static, uninitialized @@ -118,14 +151,23 @@ class Helper: def convertErrno(cls, errno): return errno if (errno > 0) else 0x80000000 + errno + @classmethod + def getFriendlyPath(cls, path): # returns .../xxx/yyy + ht1 = os.path.split(path) + ht2 = os.path.split(ht1[0]) + return ".../" + ht2[1] + '/' + ht1[1] + + class Progress: STEP_BOUNDARY = 0 BEGIN_THREAD_STEP = 1 END_THREAD_STEP = 2 + SERVICE_HEART_BEAT= 3 tokens = { STEP_BOUNDARY: '.', BEGIN_THREAD_STEP: '[', - END_THREAD_STEP: '] ' + END_THREAD_STEP: '] ', + SERVICE_HEART_BEAT: '.Y.' 
} @classmethod diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py index 11e35b6de8..c85f64fde4 100644 --- a/tests/pytest/crash_gen/service_manager.py +++ b/tests/pytest/crash_gen/service_manager.py @@ -7,7 +7,7 @@ import logging import time import subprocess -from typing import IO +from typing import IO, List try: import psutil @@ -17,7 +17,7 @@ except: from queue import Queue, Empty -from .misc import Logging, Status, CrashGenError, Dice +from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress from .db import DbConn, DbTarget class TdeInstance(): @@ -47,12 +47,15 @@ class TdeInstance(): .format(selfPath, projPath)) return buildPath - def __init__(self, subdir='test', port=6030, fepPort=6030): + def __init__(self, subdir='test', tInstNum=0, port=6030, fepPort=6030): self._buildDir = self._getBuildPath() self._subdir = '/' + subdir # TODO: tolerate "/" self._port = port # TODO: support different IP address too self._fepPort = fepPort + self._tInstNum = tInstNum + self._smThread = ServiceManagerThread() + def getDbTarget(self): return DbTarget(self.getCfgDir(), self.getHostAddr(), self._port) @@ -60,7 +63,8 @@ class TdeInstance(): return self._port def __repr__(self): - return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) + return "[TdeInstance: {}, subdir={}]".format( + self._buildDir, Helper.getFriendlyPath(self._subdir)) def generateCfgFile(self): # print("Logger = {}".format(logger)) @@ -146,8 +150,52 @@ walLevel 1 def getHostAddr(self): return "127.0.0.1" - def getServiceCommand(self): # to start the instance + def getServiceCmdLine(self): # to start the instance return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen() + + def _getDnodes(self, dbc): + dbc.query("show dnodes") + cols = dbc.getQueryResult() # id,end_point,vnodes,cores,status,role,create_time,offline reason + return {c[1]:c[4] for c in cols} # {'xxx:6030':'ready', 'xxx:6130':'ready'} + + def createDnode(self, dbt: DbTarget): + """ + With a connection to the "first" EP, let's create a dnode for someone else who + wants to join. + """ + dbc = DbConn.createNative(self.getDbTarget()) + dbc.open() + + if dbt.getEp() in self._getDnodes(dbc): + Logging.info("Skipping DNode creation for: {}".format(dbt)) + dbc.close() + return + + sql = "CREATE DNODE \"{}\"".format(dbt.getEp()) + dbc.execute(sql) + dbc.close() + + def getStatus(self): + return self._smThread.getStatus() + + def getSmThread(self): + return self._smThread + + def start(self): + if not self.getStatus().isStopped(): + raise CrashGenError("Cannot start instance from status: {}".format(self.getStatus())) + + Logging.info("Starting TDengine instance: {}".format(self)) + self.generateCfgFile() # service side generates config file, client does not + self.rotateLogs() + + self._smThread.start(self.getServiceCmdLine()) + + def stop(self): + self._smThread.stop() + + def isFirst(self): + return self._tInstNum == 0 class TdeSubProcess: @@ -159,11 +207,15 @@ class TdeSubProcess: "a sub process runs an instance". 
""" - def __init__(self, tInst : TdeInstance): + # RET_ALREADY_STOPPED = -1 + # RET_TIME_OUT = -3 + # RET_SUCCESS = -4 + + def __init__(self): self.subProcess = None - if tInst is None: - raise CrashGenError("Empty instance not allowed in TdeSubProcess") - self._tInst = tInst # Default create at ServiceManagerThread + # if tInst is None: + # raise CrashGenError("Empty instance not allowed in TdeSubProcess") + # self._tInst = tInst # Default create at ServiceManagerThread def getStdOut(self): return self.subProcess.stdout @@ -177,38 +229,15 @@ class TdeSubProcess: def getPid(self): return self.subProcess.pid - # Repalced by TdeInstance class - # def getBuildPath(self): - # selfPath = os.path.dirname(os.path.realpath(__file__)) - # if ("community" in selfPath): - # projPath = selfPath[:selfPath.find("communit")] - # else: - # projPath = selfPath[:selfPath.find("tests")] - - # for root, dirs, files in os.walk(projPath): - # if ("taosd" in files): - # rootRealPath = os.path.dirname(os.path.realpath(root)) - # if ("packaging" not in rootRealPath): - # buildPath = root[:len(root) - len("/build/bin")] - # break - # return buildPath - - def start(self): + def start(self, cmdLine): ON_POSIX = 'posix' in sys.builtin_module_names # Sanity check if self.subProcess: # already there raise RuntimeError("Corrupt process state") - - # global gContainer - # tInst = gContainer.defTdeInstance = TdeInstance('test3') # creae the instance - self._tInst.generateCfgFile() # service side generates config file, client does not - - self._tInst.rotateLogs() - - print("Starting TDengine instance: {}".format(self._tInst)) + self.subProcess = subprocess.Popen( - self._tInst.getServiceCommand(), + cmdLine, shell=False, # svcCmdSingle, shell=True, # capture core dump? stdout=subprocess.PIPE, @@ -218,31 +247,50 @@ class TdeSubProcess: ) # had text=True, which interferred with reading EOF def stop(self): + """ + Stop a sub process, and try to return a meaningful return code. 
+ + Common POSIX signal values (from man -7 signal): + SIGHUP 1 + SIGINT 2 + SIGQUIT 3 + SIGILL 4 + SIGTRAP 5 + SIGABRT 6 + SIGIOT 6 + SIGBUS 7 + SIGEMT - + SIGFPE 8 + SIGKILL 9 + SIGUSR1 10 + SIGSEGV 11 + SIGUSR2 12 + """ if not self.subProcess: print("Sub process already stopped") - return -1 + return # -1 - retCode = self.subProcess.poll() # contains real sub process return code + retCode = self.subProcess.poll() # ret -N means killed with signal N, otherwise it's from exit(N) if retCode: # valid return code, process ended + retCode = -retCode # only if valid + Logging.warning("TSP.stop(): process ended itself") self.subProcess = None - else: # process still alive, let's interrupt it - print( - "Sub process is running, sending SIG_INT and waiting for it to terminate...") - # sub process should end, then IPC queue should end, causing IO - # thread to end - self.subProcess.send_signal(signal.SIGINT) - try: - self.subProcess.wait(10) - retCode = self.subProcess.returncode - except subprocess.TimeoutExpired as err: - print("Time out waiting for TDengine service process to exit") - retCode = -3 - else: - print("TDengine service process terminated successfully from SIG_INT") - retCode = -4 - self.subProcess = None - return retCode + return retCode + # process still alive, let's interrupt it + print("Terminate running process, send SIG_INT and wait...") + # sub process should end, then IPC queue should end, causing IO thread to end + self.subProcess.send_signal(signal.SIGINT) + self.subProcess.wait(20) + retCode = self.subProcess.returncode # should always be there + # May throw subprocess.TimeoutExpired exception above, therefore + # The process is guranteed to have ended by now + self.subProcess = None + if retCode != 0: # != (- signal.SIGINT): + Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG_INT, retCode={}".format(retCode)) + else: + Logging.info("TSP.stop(): sub proc successfully terminated with SIG_INT") + return - retCode class ServiceManager: PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process @@ -259,19 +307,25 @@ class ServiceManager: # self._status = MainExec.STATUS_RUNNING # set inside # _startTaosService() self._runCluster = (numDnodes >= 1) - self.svcMgrThreads = [] # type: List[ServiceManagerThread] + self._tInsts : List[TdeInstance] = [] for i in range(0, numDnodes): - self.svcMgrThreads.append(ServiceManagerThread(i)) + ti = self._createTdeInstance(i) # construct tInst + self._tInsts.append(ti) - def _createThread(self, dnIndex): - if not self._runCluster: # single instance - return ServiceManagerThread(0) + # self.svcMgrThreads : List[ServiceManagerThread] = [] + # for i in range(0, numDnodes): + # thread = self._createThread(i) # construct tInst + # self.svcMgrThreads.append(thread) + + def _createTdeInstance(self, dnIndex): + # if not self._runCluster: # single instance + # return ServiceManagerThread(0) # Create all threads in a cluster subdir = 'cluster_dnode_{}'.format(dnIndex) fepPort= 6030 # firstEP Port port = fepPort + dnIndex * 100 - ti = TdeInstance(subdir, port, fepPort) - return ServiceManagerThread(dnIndex, ti) + return TdeInstance(subdir, dnIndex, port, fepPort) + # return ServiceManagerThread(dnIndex, ti) def _doMenu(self): choice = "" @@ -336,8 +390,8 @@ class ServiceManager: Determine if the service/cluster is active at all, i.e. at least one thread is not "stopped". 
""" - for thread in self.svcMgrThreads: - if not thread.isStopped(): + for ti in self._tInsts: + if not ti.getStatus().isStopped(): return True return False @@ -356,28 +410,31 @@ class ServiceManager: Determine if the service/cluster is "stable", i.e. all of the threads are in "stable" status. """ - for thread in self.svcMgrThreads: - if not thread.isStable(): + for ti in self._tInsts: + if not ti.isStable(): return False return True def _procIpcAll(self): while self.isActive(): - for thread in self.svcMgrThreads: # all thread objects should always be valid + Progress.emit(Progress.SERVICE_HEART_BEAT) + for ti in self._tInsts: # all thread objects should always be valid # while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here - if thread.isRunning(): - thread.procIpcBatch() # regular processing, - if thread.isStopped(): - thread.procIpcBatch() # one last time? + status = ti.getStatus() + if status.isRunning(): + th = ti.getSmThread() + th.procIpcBatch() # regular processing, + if status.isStopped(): + th.procIpcBatch() # one last time? # self._updateThreadStatus() - elif thread.isRetarting(): - print("Service restarting...") - # else this thread is stopped time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round # raise CrashGenError("dummy") print("Service Manager Thread (with subprocess) ended, main thread exiting...") + def _getFirstInstance(self): + return self._tInsts[0] + def startTaosServices(self): with self._lock: if self.isActive(): @@ -386,15 +443,19 @@ class ServiceManager: # Find if there's already a taosd service, and then kill it for proc in psutil.process_iter(): if proc.name() == 'taosd': - print("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupe") + print("Killing an existing TAOSD process in 2 seconds... 
press CTRL-C to interrupt") time.sleep(2.0) proc.kill() # print("Process: {}".format(proc.name())) # self.svcMgrThread = ServiceManagerThread() # create the object - for thread in self.svcMgrThreads: - thread.start() - thread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines + + for ti in self._tInsts: + ti.start() + if not ti.isFirst(): + tFirst = self._getFirstInstance() + tFirst.createDnode(ti.getDbTarget()) + ti.getSmThread().procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines def stopTaosServices(self): with self._lock: @@ -402,8 +463,8 @@ class ServiceManager: Logging.warning("Cannot stop TAOS service(s), already not active") return - for thread in self.svcMgrThreads: - thread.stop() + for ti in self._tInsts: + ti.stop() def run(self): self.startTaosServices() @@ -412,7 +473,7 @@ class ServiceManager: self.stopTaosServices() # should have started already def restart(self): - if not self.isStable(): + if not self.getStatus().isStable(): Logging.warning("Cannot restart service/cluster, when not stable") return @@ -440,42 +501,27 @@ class ServiceManagerThread: """ MAX_QUEUE_SIZE = 10000 - def __init__(self, tInstNum = 0, tInst : TdeInstance = None): + def __init__(self): # Set the sub process self._tdeSubProcess = None # type: TdeSubProcess # Arrange the TDengine instance - self._tInstNum = tInstNum # instance serial number in cluster, ZERO based - self._tInst = tInst or TdeInstance() # Need an instance + # self._tInstNum = tInstNum # instance serial number in cluster, ZERO based + # self._tInst = tInst or TdeInstance() # Need an instance self._thread = None # The actual thread, # type: threading.Thread - self._status = Status.STATUS_STOPPED # The status of the underlying service, actually. + self._status = Status(Status.STATUS_STOPPED) # The status of the underlying service, actually. 
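# [Editor's sketch -- not part of the patch] startTaosServices() above starts
# every TdeInstance and lets the *first* endpoint register each later joiner
# with a "CREATE DNODE" statement. A simplified rendering of that bring-up
# order; Instance and start_cluster are illustrative stand-ins only:
class Instance:
    def __init__(self, ep):
        self.ep = ep              # "host:port" endpoint of this dnode
        self.known = set()        # dnodes this instance has registered

    def start(self):
        print("starting", self.ep)

    def create_dnode(self, ep):   # what the firstEP does for late joiners
        print('CREATE DNODE "{}"'.format(ep))
        self.known.add(ep)

def start_cluster(instances):
    first = instances[0]
    for inst in instances:
        inst.start()
        if inst is not first:     # everyone but the firstEP joins via the firstEP
            first.create_dnode(inst.ep)

start_cluster([Instance("127.0.0.1:6030"), Instance("127.0.0.1:6130")])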
def __repr__(self): - return "[SvcMgrThread: tInstNum={}]".format(self._tInstNum) + return "[SvcMgrThread: status={}, subProc={}]".format( + self.getStatus(), self._tdeSubProcess) def getStatus(self): return self._status - def isStarting(self): - return self._status == Status.STATUS_STARTING - - def isRunning(self): - # return self._thread and self._thread.is_alive() - return self._status == Status.STATUS_RUNNING - - def isStopping(self): - return self._status == Status.STATUS_STOPPING - - def isStopped(self): - return self._status == Status.STATUS_STOPPED - - def isStable(self): - return self.isRunning() or self.isStopped() - # Start the thread (with sub process), and wait for the sub service # to become fully operational - def start(self): + def start(self, cmdLine): if self._thread: raise RuntimeError("Unexpected _thread") if self._tdeSubProcess: @@ -483,9 +529,9 @@ class ServiceManagerThread: Logging.info("Attempting to start TAOS service: {}".format(self)) - self._status = Status.STATUS_STARTING - self._tdeSubProcess = TdeSubProcess(self._tInst) - self._tdeSubProcess.start() + self._status.set(Status.STATUS_STARTING) + self._tdeSubProcess = TdeSubProcess() + self._tdeSubProcess.start(cmdLine) self._ipcQueue = Queue() self._thread = threading.Thread( # First thread captures server OUTPUT @@ -505,10 +551,11 @@ class ServiceManagerThread: time.sleep(1.0) # self.procIpcBatch() # don't pump message during start up print("_zz_", end="", flush=True) - if self._status == Status.STATUS_RUNNING: + if self._status.isRunning(): Logging.info("[] TDengine service READY to process requests") Logging.info("[] TAOS service started: {}".format(self)) - self._verifyDnode(self._tInst) # query and ensure dnode is ready + # self._verifyDnode(self._tInst) # query and ensure dnode is ready + # Logging.debug("[] TAOS Dnode verified: {}".format(self)) return # now we've started # TODO: handle failure-to-start better? self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output @@ -523,25 +570,27 @@ class ServiceManagerThread: # ret = {row[0]:row[1] for row in stCols if row[3]=='TAG'} # name:type isValid = False for col in cols: - print("col = {}".format(col)) + # print("col = {}".format(col)) ep = col[1].split(':') # 10.1.30.2:6030 - print("ep={}".format(ep)) + print("Found ep={}".format(ep)) if tInst.getPort() == int(ep[1]): # That's us - print("Valid Dnode matched!") + # print("Valid Dnode matched!") isValid = True # now we are valid break if not isValid: - raise RuntimeError("Failed to start Dnode, port = {}, expected: {}". - format(ep[1], tInst.getPort())) + print("Failed to start dnode, sleep for a while") + time.sleep(600) + raise RuntimeError("Failed to start Dnode, expected port not found: {}". 
+ format(tInst.getPort())) dbc.close() def stop(self): # can be called from both main thread or signal handler print("Terminating TDengine service running as the sub process...") - if self.isStopped(): + if self.getStatus().isStopped(): print("Service already stopped") return - if self.isStopping(): + if self.getStatus().isStopping(): print("Service is already being stopped") return # Linux will send Control-C generated SIGINT to the TDengine process @@ -550,39 +599,42 @@ class ServiceManagerThread: if not self._tdeSubProcess: raise RuntimeError("sub process object missing") - self._status = Status.STATUS_STOPPING - retCode = self._tdeSubProcess.stop() - print("Attempted to stop sub process, got return code: {}".format(retCode)) - if (retCode==-11): # SGV - Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)") - - if self._tdeSubProcess.isRunning(): # still running - print("FAILED to stop sub process, it is still running... pid = {}".format( + self._status.set(Status.STATUS_STOPPING) + # retCode = self._tdeSubProcess.stop() + try: + retCode = self._tdeSubProcess.stop() + # print("Attempted to stop sub process, got return code: {}".format(retCode)) + if retCode == signal.SIGSEGV : # SGV + Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)") + except subprocess.TimeoutExpired as err: + print("Time out waiting for TDengine service process to exit") + else: + if self._tdeSubProcess.isRunning(): # still running, should now never happen + print("FAILED to stop sub process, it is still running... pid = {}".format( self._tdeSubProcess.getPid())) - else: - self._tdeSubProcess = None # not running any more - self.join() # stop the thread, change the status, etc. + else: + self._tdeSubProcess = None # not running any more + self.join() # stop the thread, change the status, etc. # Check if it's really stopped - outputLines = 20 # for last output - if self.isStopped(): + outputLines = 10 # for last output + if self.getStatus().isStopped(): self.procIpcBatch(outputLines) # one last time - print("End of TDengine Service Output: {}".format(self)) - print("----- TDengine Service (managed by SMT) is now terminated -----\n") + Logging.debug("End of TDengine Service Output: {}".format(self)) + Logging.info("----- TDengine Service (managed by SMT) is now terminated -----\n") else: print("WARNING: SMT did not terminate as expected: {}".format(self)) def join(self): # TODO: sanity check - if not self.isStopping(): + if not self.getStatus().isStopping(): raise RuntimeError( - "Unexpected status when ending svc mgr thread: {}".format( - self._status)) + "SMT.Join(): Unexpected status: {}".format(self._status)) if self._thread: self._thread.join() self._thread = None - self._status = Status.STATUS_STOPPED + self._status.set(Status.STATUS_STOPPED) # STD ERR thread self._thread2.join() self._thread2 = None @@ -651,25 +703,27 @@ class ServiceManagerThread: queue.put(line) self._printProgress("_i") - if self._status == Status.STATUS_STARTING: # we are starting, let's see if we have started + if self._status.isStarting(): # we are starting, let's see if we have started if line.find(self.TD_READY_MSG) != -1: # found Logging.info("Waiting for the service to become FULLY READY") time.sleep(1.0) # wait for the server to truly start. TODO: remove this - Logging.info("Service instance #{} is now FULLY READY".format(self._tInstNum)) - self._status = Status.STATUS_RUNNING + Logging.info("Service is now FULLY READY") # TODO: more ID info here? 
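# [Editor's sketch -- not part of the patch] svcOutputReader() above drains the
# child's stdout on a dedicated thread, queues each line for later batch
# processing, and flips the status once a "ready" banner appears. The same
# pattern in miniature; the child command and banner text are hypothetical:
import subprocess, sys, threading
from queue import Queue

READY_MSG = "service ready"               # stand-in for the real readiness banner

def reader(out, queue, state):
    for raw in iter(out.readline, b''):   # b'' means the child closed its stdout
        line = raw.decode().rstrip()
        queue.put(line)                   # keep lines for later batch processing
        if READY_MSG in line:
            state["running"] = True       # analogous to Status.STATUS_RUNNING
    out.close()

proc = subprocess.Popen([sys.executable, "-c", "print('service ready')"],
                        stdout=subprocess.PIPE)
q, state = Queue(), {"running": False}
t = threading.Thread(target=reader, args=(proc.stdout, q, state))
t.start(); t.join(); proc.wait()
print("running:", state["running"], "lines captured:", q.qsize())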
+ self._status.set(Status.STATUS_RUNNING) # Trim the queue if necessary: TODO: try this 1 out of 10 times self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10) # trim to 90% size - if self.isStopping(): # TODO: use thread status instead + if self._status.isStopping(): # TODO: use thread status instead # WAITING for stopping sub process to finish its outptu print("_w", end="", flush=True) # queue.put(line) # meaning sub process must have died - print("\nNo more output from IO thread managing TDengine service") + Logging.info("\nEnd of stream detected for TDengine STDOUT: {}".format(self)) out.close() def svcErrorReader(self, err: IO, queue): for line in iter(err.readline, b''): print("\nTDengine Service (taosd) ERROR (from stderr): {}".format(line)) + Logging.info("\nEnd of stream detected for TDengine STDERR: {}".format(self)) + err.close() \ No newline at end of file From 87cd1cc0f67a210aecbd5b5a8cf84735cf8f9ae1 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Sat, 24 Oct 2020 08:42:38 +0000 Subject: [PATCH 15/16] Fixed travis build failure caused by crash_gen tool, sorry --- tests/pytest/crash_gen/crash_gen.py | 6 +++--- tests/pytest/crash_gen/service_manager.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 74e3964d5a..102d7d9bdd 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -2023,7 +2023,7 @@ class ClientManager: # print("exec stats: {}".format(self.tc.getExecStats())) # print("TC failed = {}".format(self.tc.isFailed())) if svcMgr: # gConfig.auto_start_service: - svcMgr.stopTaosService() + svcMgr.stopTaosServices() svcMgr = None # Print exec status, etc., AFTER showing messages from the server self.conclude() @@ -2077,8 +2077,8 @@ class MainExec: def runClient(self): global gSvcMgr if gConfig.auto_start_service: - gSvcMgr = self._svcMgr = ServiceManager() # hack alert - gSvcMgr.startTaosService() # we start, don't run + gSvcMgr = self._svcMgr = ServiceManager(1) # hack alert + gSvcMgr.startTaosServices() # we start, don't run self._clientMgr = ClientManager() ret = None diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py index c85f64fde4..bb2becb55b 100644 --- a/tests/pytest/crash_gen/service_manager.py +++ b/tests/pytest/crash_gen/service_manager.py @@ -295,7 +295,7 @@ class TdeSubProcess: class ServiceManager: PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process - def __init__(self, numDnodes = 1): # Otherwise we run a cluster + def __init__(self, numDnodes): # >1 when we run a cluster Logging.info("TDengine Service Manager (TSM) created") self._numDnodes = numDnodes # >1 means we have a cluster self._lock = threading.Lock() @@ -306,7 +306,7 @@ class ServiceManager: self.inSigHandler = False # self._status = MainExec.STATUS_RUNNING # set inside # _startTaosService() - self._runCluster = (numDnodes >= 1) + self._runCluster = (numDnodes > 1) self._tInsts : List[TdeInstance] = [] for i in range(0, numDnodes): ti = self._createTdeInstance(i) # construct tInst @@ -318,10 +318,10 @@ class ServiceManager: # self.svcMgrThreads.append(thread) def _createTdeInstance(self, dnIndex): - # if not self._runCluster: # single instance - # return ServiceManagerThread(0) - # Create all threads in a cluster - subdir = 'cluster_dnode_{}'.format(dnIndex) + if not self._runCluster: # single instance + subdir = 'test' + else: # Create all threads in a cluster + subdir = 
'cluster_dnode_{}'.format(dnIndex) fepPort= 6030 # firstEP Port port = fepPort + dnIndex * 100 return TdeInstance(subdir, dnIndex, port, fepPort) @@ -411,7 +411,7 @@ class ServiceManager: threads are in "stable" status. """ for ti in self._tInsts: - if not ti.isStable(): + if not ti.getStatus().isStable(): return False return True @@ -473,7 +473,7 @@ class ServiceManager: self.stopTaosServices() # should have started already def restart(self): - if not self.getStatus().isStable(): + if not self.isStable(): Logging.warning("Cannot restart service/cluster, when not stable") return @@ -483,7 +483,7 @@ class ServiceManager: else: Logging.warning("Service not active when restart requested") - self.startTaosService() + self.startTaosServices() # self._isRestarting = False # def isRunning(self): From aa995b99a374cd2d1c5188b8828e4d959742f220 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 24 Oct 2020 17:01:37 +0000 Subject: [PATCH 16/16] add query sort test cases --- tests/pytest/query/querySort.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/pytest/query/querySort.py b/tests/pytest/query/querySort.py index e5d3c8ce1f..649e0dc1cb 100644 --- a/tests/pytest/query/querySort.py +++ b/tests/pytest/query/querySort.py @@ -96,6 +96,12 @@ class TDTestCase: tdSql.query("select * from st order by ts desc") self.checkColumnSorted(0, "desc") + print("======= step 2: verify order for special column =========") + + tdSql.query("select tbcol1 from st order by ts desc") + + tdSql.query("select tbcol6 from st order by ts desc") + for i in range(1, 10): tdSql.error("select * from st order by tbcol%d" % i) tdSql.error("select * from st order by tbcol%d asc" % i)
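# [Editor's sketch -- not part of the patch series] PATCH 15 derives each
# dnode's working subdir and serverPort from its index: a single instance keeps
# the 'test' subdir, while cluster dnodes get 'cluster_dnode_N' and a port
# offset of 100 per index from the firstEP port. A tiny stand-alone rendering
# of that mapping; the function name is illustrative only:
def dnode_layout(num_dnodes, fep_port=6030):
    layout = []
    for idx in range(num_dnodes):
        subdir = 'test' if num_dnodes == 1 else 'cluster_dnode_{}'.format(idx)
        port = fep_port + idx * 100   # 6030, 6130, 6230, ...
        layout.append((subdir, port, fep_port))
    return layout

# e.g. a 3-dnode cluster:
# [('cluster_dnode_0', 6030, 6030), ('cluster_dnode_1', 6130, 6030), ('cluster_dnode_2', 6230, 6030)]
print(dnode_layout(3))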