From fb9376bea77ea94ce7bbd3ec69419b0b12fea641 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Tue, 20 Oct 2020 07:13:00 +0000 Subject: [PATCH 01/16] Introduced TdeInstance concept into crash_gen tool, ready to run clusters next --- tests/pytest/crash_gen/crash_gen.py | 368 +++++++++++++++++----------- 1 file changed, 220 insertions(+), 148 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 48196ab383..b1d79f54c3 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -44,6 +44,7 @@ import traceback import resource from guppy import hpy import gc +import subprocess try: import psutil @@ -59,12 +60,13 @@ if sys.version_info[0] < 3: # Command-line/Environment Configurations, will set a bit later # ConfigNameSpace = argparse.Namespace -gConfig = argparse.Namespace() # Dummy value, will be replaced later -gSvcMgr = None # TODO: refactor this hack, use dep injection -logger = None # type: Logger +gConfig: argparse.Namespace +gSvcMgr: ServiceManager # TODO: refactor this hack, use dep injection +logger: logging.Logger +gContainer: Container -def runThread(wt: WorkerThread): - wt.run() +# def runThread(wt: WorkerThread): +# wt.run() class CrashGenError(Exception): def __init__(self, msg=None, errno=None): @@ -74,7 +76,6 @@ class CrashGenError(Exception): def __str__(self): return self.msg - class WorkerThread: def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator, # te: TaskExecutor, @@ -84,7 +85,8 @@ class WorkerThread: self._tid = tid self._tc = tc # type: ThreadCoordinator # self.threadIdent = threading.get_ident() - self._thread = threading.Thread(target=runThread, args=(self,)) + # self._thread = threading.Thread(target=runThread, args=(self,)) + self._thread = threading.Thread(target=self.run) self._stepGate = threading.Event() # Let us have a DB connection of our own @@ -253,7 +255,7 @@ class WorkerThread: class ThreadCoordinator: - WORKER_THREAD_TIMEOUT = 60 # one minute + WORKER_THREAD_TIMEOUT = 180 # one minute def __init__(self, pool: ThreadPool, dbManager: DbManager): self._curStep = -1 # first step is 0 @@ -882,20 +884,15 @@ class MyTDSql: raise return self.affectedRows +class TdeInstance(): + """ + A class to capture the *static* information of a TDengine instance, + including the location of the various files/directories, and basica + configuration. 
+ """ -class DbConnNative(DbConn): - # Class variables - _lock = threading.Lock() - _connInfoDisplayed = False - totalConnections = 0 # Not private - - def __init__(self): - super().__init__() - self._type = self.TYPE_NATIVE - self._conn = None - # self._cursor = None - - def getBuildPath(self): + @classmethod + def _getBuildPath(cls): selfPath = os.path.dirname(os.path.realpath(__file__)) if ("community" in selfPath): projPath = selfPath[:selfPath.find("communit")] @@ -914,10 +911,118 @@ class DbConnNative(DbConn): .format(selfPath, projPath)) return buildPath + def __init__(self, subdir='test'): + self._buildDir = self._getBuildPath() + self._subdir = '/' + subdir # TODO: tolerate "/" + + def __repr__(self): + return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) + def generateCfgFile(self): + # buildPath = self.getBuildPath() + # taosdPath = self._buildPath + "/build/bin/taosd" + + cfgDir = self.getCfgDir() + cfgFile = cfgDir + "/taos.cfg" # TODO: inquire if this is fixed + if os.path.exists(cfgFile): + if os.path.isfile(cfgFile): + logger.warning("Config file exists already, skip creation: {}".format(cfgFile)) + return # cfg file already exists, nothing to do + else: + raise CrashGenError("Invalid config file: {}".format(cfgFile)) + # Now that the cfg file doesn't exist + if os.path.exists(cfgDir): + if not os.path.isdir(cfgDir): + raise CrashGenError("Invalid config dir: {}".format(cfgDir)) + # else: good path + else: + os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p" + # Now we have a good cfg dir + cfgValues = { + 'runDir': self.getRunDir(), + 'ip': '127.0.0.1', # TODO: change to a network addressable ip + 'port': 6030, + } + cfgTemplate = """ +dataDir {runDir}/data +logDir {runDir}/log + +charset UTF-8 + +firstEp {ip}:{port} +fqdn {ip} +serverPort {port} + +# was all 135 below +dDebugFlag 135 +cDebugFlag 135 +rpcDebugFlag 135 +qDebugFlag 135 +# httpDebugFlag 143 +# asyncLog 0 +# tables 10 +maxtablesPerVnode 10 +rpcMaxTime 101 +# cache 2 +keep 36500 +# walLevel 2 +walLevel 1 +# +# maxConnections 100 +""" + cfgContent = cfgTemplate.format_map(cfgValues) + f = open(cfgFile, "w") + f.write(cfgContent) + f.close() + + def rotateLogs(self): + logPath = self.getLogDir() + # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397 + if os.path.exists(logPath): + logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S') + logger.info("Saving old log files to: {}".format(logPathSaved)) + os.rename(logPath, logPathSaved) + # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms + + + def getExecFile(self): # .../taosd + return self._buildDir + "/build/bin/taosd" + + def getRunDir(self): # TODO: rename to "root dir" ?! 
+ return self._buildDir + self._subdir + + def getCfgDir(self): # path, not file + return self.getRunDir() + "/cfg" + + def getLogDir(self): + return self.getRunDir() + "/log" + + def getHostAddr(self): + return "127.0.0.1" + + def getServiceCommand(self): # to start the instance + return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen() + + + +class DbConnNative(DbConn): + # Class variables + _lock = threading.Lock() + _connInfoDisplayed = False + totalConnections = 0 # Not private + + def __init__(self): + super().__init__() + self._type = self.TYPE_NATIVE + self._conn = None + # self._cursor = None + def openByType(self): # Open connection - cfgPath = self.getBuildPath() + "/test/cfg" - hostAddr = "127.0.0.1" + global gContainer + tdeInstance = gContainer.defTdeInstance # set up in ClientManager, type: TdeInstance + # cfgPath = self.getBuildPath() + "/test/cfg" + cfgPath = tdeInstance.getCfgDir() + hostAddr = tdeInstance.getHostAddr() cls = self.__class__ # Get the class, to access class variables with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! @@ -1662,7 +1767,7 @@ class Task(): 0x503, 0x510, # vnode not in ready state 0x14, # db not ready, errno changed - 0x600, + 0x600, # Invalid table ID, why? 1000 # REST catch-all error ]: return True # These are the ALWAYS-ACCEPTABLE ones @@ -1824,7 +1929,7 @@ class ExecutionStats: "FAILED (reason: {})".format( self._failureReason) if self._failed else "SUCCEEDED")) logger.info("| Task Execution Times (success/total):") - execTimesAny = 0 + execTimesAny = 0.001 # avoid div by zero for k, n in self._execTimes.items(): execTimesAny += n[0] errStr = None @@ -2343,7 +2448,9 @@ class MyLoggingAdapter(logging.LoggerAdapter): # return '[%s] %s' % (self.extra['connid'], msg), kwargs -class SvcManager: +class ServiceManager: + PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process + def __init__(self): print("Starting TDengine Service Manager") # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec @@ -2384,10 +2491,8 @@ class SvcManager: self.inSigHandler = True choice = self._doMenu() - if choice == "1": - # TODO: can the sub-process be blocked due to us not reading from - # queue? - self.sigHandlerResume() + if choice == "1": + self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue? elif choice == "2": self.stopTaosService() elif choice == "3": # Restart @@ -2398,20 +2503,20 @@ class SvcManager: self.inSigHandler = False def sigIntHandler(self, signalNumber, frame): - print("SvcManager: INT Signal Handler starting...") + print("ServiceManager: INT Signal Handler starting...") if self.inSigHandler: print("Ignoring repeated SIG_INT...") return self.inSigHandler = True self.stopTaosService() - print("SvcManager: INT Signal Handler returning...") + print("ServiceManager: INT Signal Handler returning...") self.inSigHandler = False def sigHandlerResume(self): - print("Resuming TDengine service manager thread (main thread)...\n\n") + print("Resuming TDengine service manager (main thread)...\n\n") - def _checkServiceManagerThread(self): + def _updateThreadStatus(self): if self.svcMgrThread: # valid svc mgr thread if self.svcMgrThread.isStopped(): # done? self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? 
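
The hunks above introduce the TdeInstance abstraction (build path, per-instance run directory, config-file generation, service command) and re-point DbConnNative at it through the global container. A minimal usage sketch follows; it is illustrative only, the function name is made up, and it assumes TdeInstance and gContainer are visible in the current module:

    import subprocess

    def start_one_taosd():
        inst = TdeInstance(subdir='test')   # run-time files live under <build dir>/test
        inst.generateCfgFile()              # writes <run dir>/cfg/taos.cfg, skipped if it already exists
        inst.rotateLogs()                   # renames <run dir>/log to a timestamped copy
        gContainer.defTdeInstance = inst    # DbConnNative.openByType() reads cfg dir and host from here
        return subprocess.Popen(
            inst.getServiceCommand(),       # [<build dir>/build/bin/taosd, '-c', <cfg dir>]
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

This mirrors what TdeSubProcess.start() and ClientManager.run() do further down in the patch, with the service side generating the config file and the client side only consuming the instance.
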
@@ -2419,14 +2524,13 @@ class SvcManager: def _procIpcAll(self): while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here - if self.isRunning(): + if self.isRunning(): self.svcMgrThread.procIpcBatch() # regular processing, - self._checkServiceManagerThread() + self._updateThreadStatus() elif self.isRetarting(): print("Service restarting...") - time.sleep(0.5) # pause, before next round - print( - "Service Manager Thread (with subprocess) has ended, main thread now exiting...") + time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round + print("Service Manager Thread (with subprocess) ended, main thread exiting...") def startTaosService(self): with self._lock: @@ -2440,7 +2544,6 @@ class SvcManager: time.sleep(2.0) proc.kill() # print("Process: {}".format(proc.name())) - self.svcMgrThread = ServiceManagerThread() # create the object print("Attempting to start TAOS service started, printing out output...") @@ -2491,10 +2594,17 @@ class SvcManager: return self._isRestarting class ServiceManagerThread: + """ + A class representing a dedicated thread which manages the "sub process" + of the TDengine service, interacting with its STDOUT/ERR. + + It takes a TdeInstance parameter at creation time, or create a default + """ MAX_QUEUE_SIZE = 10000 - def __init__(self): + def __init__(self, tInst : TdeInstance = None): self._tdeSubProcess = None # type: TdeSubProcess + self._tInst = tInst or TdeInstance() # Need an instance self._thread = None self._status = None @@ -2521,7 +2631,7 @@ class ServiceManagerThread: self._status = MainExec.STATUS_STARTING - self._tdeSubProcess = TdeSubProcess() + self._tdeSubProcess = TdeSubProcess(self._tInst) self._tdeSubProcess.start() self._ipcQueue = Queue() @@ -2681,8 +2791,19 @@ class ServiceManagerThread: class TdeSubProcess: - def __init__(self): + """ + A class to to represent the actual sub process that is the run-time + of a TDengine instance. + + It takes a TdeInstance object as its parameter, with the rationale being + "a sub process runs an instance". 
+ """ + + def __init__(self, tInst : TdeInstance): self.subProcess = None + if tInst is None: + raise CrashGenError("Empty instance not allowed in TdeSubProcess") + self._tInst = tInst # Default create at ServiceManagerThread def getStdOut(self): return self.subProcess.stdout @@ -2696,50 +2817,39 @@ class TdeSubProcess: def getPid(self): return self.subProcess.pid - def getBuildPath(self): - selfPath = os.path.dirname(os.path.realpath(__file__)) - if ("community" in selfPath): - projPath = selfPath[:selfPath.find("communit")] - else: - projPath = selfPath[:selfPath.find("tests")] + # Repalced by TdeInstance class + # def getBuildPath(self): + # selfPath = os.path.dirname(os.path.realpath(__file__)) + # if ("community" in selfPath): + # projPath = selfPath[:selfPath.find("communit")] + # else: + # projPath = selfPath[:selfPath.find("tests")] - for root, dirs, files in os.walk(projPath): - if ("taosd" in files): - rootRealPath = os.path.dirname(os.path.realpath(root)) - if ("packaging" not in rootRealPath): - buildPath = root[:len(root) - len("/build/bin")] - break - return buildPath + # for root, dirs, files in os.walk(projPath): + # if ("taosd" in files): + # rootRealPath = os.path.dirname(os.path.realpath(root)) + # if ("packaging" not in rootRealPath): + # buildPath = root[:len(root) - len("/build/bin")] + # break + # return buildPath def start(self): ON_POSIX = 'posix' in sys.builtin_module_names - taosdPath = self.getBuildPath() + "/build/bin/taosd" - cfgPath = self.getBuildPath() + "/test/cfg" - - # Delete the log files - logPath = self.getBuildPath() + "/test/log" - # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397 - # filelist = [ f for f in os.listdir(logPath) ] # if f.endswith(".bak") ] - # for f in filelist: - # filePath = os.path.join(logPath, f) - # print("Removing log file: {}".format(filePath)) - # os.remove(filePath) - if os.path.exists(logPath): - logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S') - logger.info("Saving old log files to: {}".format(logPathSaved)) - os.rename(logPath, logPathSaved) - # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms - - svcCmd = [taosdPath, '-c', cfgPath] - # svcCmdSingle = "{} -c {}".format(taosdPath, cfgPath) - # svcCmd = ['vmstat', '1'] + # Sanity check if self.subProcess: # already there raise RuntimeError("Corrupt process state") - # print("Starting service: {}".format(svcCmd)) + # global gContainer + # tInst = gContainer.defTdeInstance = TdeInstance('test3') # creae the instance + self._tInst.generateCfgFile() # service side generates config file, client does not + + self._tInst.rotateLogs() + + print("Starting TDengine instance: {}".format(self._tInst)) self.subProcess = subprocess.Popen( - svcCmd, shell=False, + self._tInst.getServiceCommand(), + shell=False, # svcCmdSingle, shell=True, # capture core dump? 
stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -2898,10 +3008,15 @@ class ClientManager: # self._printLastNumbers() global gConfig + # Prepare Tde Instance + global gContainer + tInst = gContainer.defTdeInstance = TdeInstance() # "subdir to hold the instance" + dbManager = DbManager() # Regular function thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps) self.tc = ThreadCoordinator(thPool, dbManager) + print("Starting client instance to: {}".format(tInst)) self.tc.run() # print("exec stats: {}".format(self.tc.getExecStats())) # print("TC failed = {}".format(self.tc.isFailed())) @@ -2936,9 +3051,6 @@ class ClientManager: # self.tc.getDbManager().cleanUp() # clean up first, so we can show ZERO db connections self.tc.printStats() - - - class MainExec: STATUS_STARTING = 1 STATUS_RUNNING = 2 @@ -2968,7 +3080,7 @@ class MainExec: def runClient(self): global gSvcMgr if gConfig.auto_start_service: - self._svcMgr = SvcManager() + self._svcMgr = ServiceManager() gSvcMgr = self._svcMgr # hack alert self._svcMgr.startTaosService() # we start, don't run @@ -2983,55 +3095,13 @@ class MainExec: def runService(self): global gSvcMgr - self._svcMgr = SvcManager() + self._svcMgr = ServiceManager() gSvcMgr = self._svcMgr # save it in a global variable TODO: hack alert self._svcMgr.run() # run to some end state self._svcMgr = None gSvcMgr = None - def runTemp(self): # for debugging purposes - # # Hack to exercise reading from disk, imcreasing coverage. TODO: fix - # dbc = dbState.getDbConn() - # sTbName = dbState.getFixedSuperTableName() - # dbc.execute("create database if not exists db") - # if not dbState.getState().equals(StateEmpty()): - # dbc.execute("use db") - - # rTables = None - # try: # the super table may not exist - # sql = "select TBNAME from db.{}".format(sTbName) - # logger.info("Finding out tables in super table: {}".format(sql)) - # dbc.query(sql) # TODO: analyze result set later - # logger.info("Fetching result") - # rTables = dbc.getQueryResult() - # logger.info("Result: {}".format(rTables)) - # except taos.error.ProgrammingError as err: - # logger.info("Initial Super table OPS error: {}".format(err)) - - # # sys.exit() - # if ( not rTables == None): - # # print("rTables[0] = {}, type = {}".format(rTables[0], type(rTables[0]))) - # try: - # for rTbName in rTables : # regular tables - # ds = dbState - # logger.info("Inserting into table: {}".format(rTbName[0])) - # sql = "insert into db.{} values ('{}', {});".format( - # rTbName[0], - # ds.getNextTick(), ds.getNextInt()) - # dbc.execute(sql) - # for rTbName in rTables : # regular tables - # dbc.query("select * from db.{}".format(rTbName[0])) # TODO: check success failure - # logger.info("Initial READING operation is successful") - # except taos.error.ProgrammingError as err: - # logger.info("Initial WRITE/READ error: {}".format(err)) - - # Sandbox testing code - # dbc = dbState.getDbConn() - # while True: - # rows = dbc.query("show databases") - # print("Rows: {}, time={}".format(rows, time.time())) - return def main(): @@ -3045,28 +3115,7 @@ def main(): 1. You build TDengine in the top level ./build directory, as described in offical docs 2. 
You run the server there before this script: ./build/bin/taosd -c test/cfg - ''')) - - # parser.add_argument('-a', '--auto-start-service', action='store_true', - # help='Automatically start/stop the TDengine service (default: false)') - # parser.add_argument('-c', '--connector-type', action='store', default='native', type=str, - # help='Connector type to use: native, rest, or mixed (default: 10)') - # parser.add_argument('-d', '--debug', action='store_true', - # help='Turn on DEBUG mode for more logging (default: false)') - # parser.add_argument('-e', '--run-tdengine', action='store_true', - # help='Run TDengine service in foreground (default: false)') - # parser.add_argument('-l', '--larger-data', action='store_true', - # help='Write larger amount of data during write operations (default: false)') - # parser.add_argument('-p', '--per-thread-db-connection', action='store_true', - # help='Use a single shared db connection (default: false)') - # parser.add_argument('-r', '--record-ops', action='store_true', - # help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)') - # parser.add_argument('-s', '--max-steps', action='store', default=1000, type=int, - # help='Maximum number of steps to run (default: 100)') - # parser.add_argument('-t', '--num-threads', action='store', default=5, type=int, - # help='Number of threads to run (default: 10)') - # parser.add_argument('-x', '--continue-on-exception', action='store_true', - # help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)') + ''')) parser.add_argument( '-a', @@ -3171,8 +3220,31 @@ def main(): else: return mExec.runClient() +class Container(): + _propertyList = {'defTdeInstance'} + + def __init__(self): + self._cargo = {} # No cargo at the beginning + + def _verifyValidProperty(self, name): + if not name in self._propertyList: + raise CrashGenError("Invalid container property: {}".format(name)) + + # Called for an attribute, when other mechanisms fail (compare to __getattribute__) + def __getattr__(self, name): + self._verifyValidProperty(name) + return self._cargo[name] # just a simple lookup + + def __setattr__(self, name, value): + if name == '_cargo' : # reserved vars + super().__setattr__(name, value) + return + self._verifyValidProperty(name) + self._cargo[name] = value if __name__ == "__main__": + gContainer = Container() # micky-mouse DI + exitCode = main() # print("Exiting with code: {}".format(exitCode)) - sys.exit(exitCode) + sys.exit(exitCode) \ No newline at end of file From c6a5706f662c3000371fa0a2f827749e0948ccb0 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Wed, 21 Oct 2020 00:02:27 +0000 Subject: [PATCH 02/16] Multi-instance code working for single instance case, ready to refactor crash_gen tool into multiple files --- tests/pytest/crash_gen/crash_gen.py | 182 ++++++++++++++++++---------- 1 file changed, 120 insertions(+), 62 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index b1d79f54c3..3f662fac73 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -1780,8 +1780,8 @@ class Task(): return True elif msg.find("duplicated column names") != -1: # also alter table tag issues return True - elif (gSvcMgr!=None) and gSvcMgr.isRestarting(): - logger.info("Ignoring error when service is restarting: errno = {}, msg = {}".format(errno, msg)) + elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ... 
+ logger.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg)) return True return False # Not an acceptable error @@ -2451,8 +2451,9 @@ class MyLoggingAdapter(logging.LoggerAdapter): class ServiceManager: PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process - def __init__(self): - print("Starting TDengine Service Manager") + def __init__(self, numDnodes = 1): + logger.info("TDengine Service Manager (TSM) created") + self._numDnodes = numDnodes # >1 means we have a cluster # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec # signal.signal(signal.SIGINT, self.sigIntHandler) # signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler! @@ -2460,9 +2461,12 @@ class ServiceManager: self.inSigHandler = False # self._status = MainExec.STATUS_RUNNING # set inside # _startTaosService() - self.svcMgrThread = None # type: ServiceManagerThread + self.svcMgrThreads = [] # type: List[ServiceManagerThread] + for i in range(0, numDnodes): + self.svcMgrThreads.append(ServiceManagerThread(i)) + self._lock = threading.Lock() - self._isRestarting = False + # self._isRestarting = False def _doMenu(self): choice = "" @@ -2494,7 +2498,7 @@ class ServiceManager: if choice == "1": self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue? elif choice == "2": - self.stopTaosService() + self.stopTaosServices() elif choice == "3": # Restart self.restart() else: @@ -2509,33 +2513,70 @@ class ServiceManager: return self.inSigHandler = True - self.stopTaosService() + self.stopTaosServices() print("ServiceManager: INT Signal Handler returning...") self.inSigHandler = False def sigHandlerResume(self): print("Resuming TDengine service manager (main thread)...\n\n") - def _updateThreadStatus(self): - if self.svcMgrThread: # valid svc mgr thread - if self.svcMgrThread.isStopped(): # done? - self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? - self.svcMgrThread = None # no more + # def _updateThreadStatus(self): + # if self.svcMgrThread: # valid svc mgr thread + # if self.svcMgrThread.isStopped(): # done? + # self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? + # self.svcMgrThread = None # no more + + def isActive(self): + """ + Determine if the service/cluster is active at all, i.e. at least + one thread is not "stopped". + """ + for thread in self.svcMgrThreads: + if not thread.isStopped(): + return True + return False + + # def isRestarting(self): + # """ + # Determine if the service/cluster is being "restarted", i.e., at least + # one thread is in "restarting" status + # """ + # for thread in self.svcMgrThreads: + # if thread.isRestarting(): + # return True + # return False + + def isStable(self): + """ + Determine if the service/cluster is "stable", i.e. all of the + threads are in "stable" status. 
+ """ + for thread in self.svcMgrThreads: + if not thread.isStable(): + return False + return True def _procIpcAll(self): - while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here - if self.isRunning(): - self.svcMgrThread.procIpcBatch() # regular processing, - self._updateThreadStatus() - elif self.isRetarting(): - print("Service restarting...") + while self.isActive(): + for thread in self.svcMgrThreads: # all thread objects should always be valid + # while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here + if thread.isRunning(): + thread.procIpcBatch() # regular processing, + if thread.isStopped(): + thread.procIpcBatch() # one last time? + # self._updateThreadStatus() + elif thread.isRetarting(): + print("Service restarting...") + # else this thread is stopped + time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round + # raise CrashGenError("dummy") print("Service Manager Thread (with subprocess) ended, main thread exiting...") - def startTaosService(self): + def startTaosServices(self): with self._lock: - if self.svcMgrThread: - raise RuntimeError("Cannot start TAOS service when one may already be running") + if self.isActive(): + raise RuntimeError("Cannot start TAOS service(s) when one/some may already be running") # Find if there's already a taosd service, and then kill it for proc in psutil.process_iter(): @@ -2545,53 +2586,45 @@ class ServiceManager: proc.kill() # print("Process: {}".format(proc.name())) - self.svcMgrThread = ServiceManagerThread() # create the object - print("Attempting to start TAOS service started, printing out output...") - self.svcMgrThread.start() - self.svcMgrThread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines - print("TAOS service started") + # self.svcMgrThread = ServiceManagerThread() # create the object + for thread in self.svcMgrThreads: + thread.start() + thread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines - def stopTaosService(self, outputLines=20): + def stopTaosServices(self): with self._lock: - if not self.isRunning(): - logger.warning("Cannot stop TAOS service, not running") + if not self.isActive(): + logger.warning("Cannot stop TAOS service(s), already not active") return - print("Terminating Service Manager Thread (SMT) execution...") - self.svcMgrThread.stop() - if self.svcMgrThread.isStopped(): - self.svcMgrThread.procIpcBatch(outputLines) # one last time - self.svcMgrThread = None - print("End of TDengine Service Output") - print("----- TDengine Service (managed by SMT) is now terminated -----\n") - else: - print("WARNING: SMT did not terminate as expected") - + for thread in self.svcMgrThreads: + thread.stop() + def run(self): - self.startTaosService() + self.startTaosServices() self._procIpcAll() # pump/process all the messages, may encounter SIG + restart - if self.isRunning(): # if sig handler hasn't destroyed it by now - self.stopTaosService() # should have started already + if self.isActive(): # if sig handler hasn't destroyed it by now + self.stopTaosServices() # should have started already def restart(self): - if self._isRestarting: - logger.warning("Cannot restart service when it's already restarting") + if not self.isStable(): + logger.warning("Cannot restart service/cluster, when not stable") return - self._isRestarting = True - if self.isRunning(): - self.stopTaosService() + # self._isRestarting = True + if self.isActive(): + self.stopTaosServices() else: - 
logger.warning("Service not running when restart requested") + logger.warning("Service not active when restart requested") self.startTaosService() - self._isRestarting = False + # self._isRestarting = False - def isRunning(self): - return self.svcMgrThread != None + # def isRunning(self): + # return self.svcMgrThread != None - def isRestarting(self): - return self._isRestarting + # def isRestarting(self): + # return self._isRestarting class ServiceManagerThread: """ @@ -2602,15 +2635,26 @@ class ServiceManagerThread: """ MAX_QUEUE_SIZE = 10000 - def __init__(self, tInst : TdeInstance = None): + def __init__(self, tInstNum = 0, tInst : TdeInstance = None): + # Set the sub process self._tdeSubProcess = None # type: TdeSubProcess - self._tInst = tInst or TdeInstance() # Need an instance - self._thread = None - self._status = None + + # Arrange the TDengine instance + self._tInstNum = tInstNum # instance serial number in cluster, ZERO based + self._tInst = tInst or TdeInstance() # Need an instance + + self._thread = None # The actual thread, # type: threading.Thread + self._status = MainExec.STATUS_STOPPED # The status of the underlying service, actually. + + def __repr__(self): + return "[SvcMgrThread: tInstNum={}]".format(self._tInstNum) def getStatus(self): return self._status + def isStarting(self): + return self._status == MainExec.STATUS_STARTING + def isRunning(self): # return self._thread and self._thread.is_alive() return self._status == MainExec.STATUS_RUNNING @@ -2621,6 +2665,9 @@ class ServiceManagerThread: def isStopped(self): return self._status == MainExec.STATUS_STOPPED + def isStable(self): + return self.isRunning() or self.isStopped() + # Start the thread (with sub process), and wait for the sub service # to become fully operational def start(self): @@ -2629,8 +2676,9 @@ class ServiceManagerThread: if self._tdeSubProcess: raise RuntimeError("TDengine sub process already created/running") - self._status = MainExec.STATUS_STARTING + logger.info("Attempting to start TAOS service: {}".format(self)) + self._status = MainExec.STATUS_STARTING self._tdeSubProcess = TdeSubProcess(self._tInst) self._tdeSubProcess.start() @@ -2654,10 +2702,11 @@ class ServiceManagerThread: print("_zz_", end="", flush=True) if self._status == MainExec.STATUS_RUNNING: logger.info("[] TDengine service READY to process requests") + logger.info("[] TAOS service started: {}".format(self)) return # now we've started - # TODO: handle this better? + # TODO: handle failure-to-start better? self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output - raise RuntimeError("TDengine service did not start successfully") + raise RuntimeError("TDengine service did not start successfully: {}".format(self)) def stop(self): # can be called from both main thread or signal handler @@ -2687,6 +2736,15 @@ class ServiceManagerThread: self._tdeSubProcess = None # not running any more self.join() # stop the thread, change the status, etc. 
+ # Check if it's really stopped + outputLines = 20 # for last output + if self.isStopped(): + self.procIpcBatch(outputLines) # one last time + print("End of TDengine Service Output: {}".format(self)) + print("----- TDengine Service (managed by SMT) is now terminated -----\n") + else: + print("WARNING: SMT did not terminate as expected: {}".format(self)) + def join(self): # TODO: sanity check if not self.isStopping(): @@ -2770,7 +2828,7 @@ class ServiceManagerThread: if line.find(self.TD_READY_MSG) != -1: # found logger.info("Waiting for the service to become FULLY READY") time.sleep(1.0) # wait for the server to truly start. TODO: remove this - logger.info("Service is now FULLY READY") + logger.info("Service instance #{} is now FULLY READY".format(self._tInstNum)) self._status = MainExec.STATUS_RUNNING # Trim the queue if necessary: TODO: try this 1 out of 10 times From e011827fd4ea39b1826babeed74d318c5f78d64d Mon Sep 17 00:00:00 2001 From: Steven Li Date: Wed, 21 Oct 2020 00:18:49 +0000 Subject: [PATCH 03/16] Finished refactoring crash_gen tool into a Python modular structure --- tests/pytest/crash_gen.sh | 7 +- tests/pytest/crash_gen/crash_gen.py | 221 ++++++++++++++-------------- tests/pytest/crash_gen_bootstrap.py | 23 +++ 3 files changed, 135 insertions(+), 116 deletions(-) create mode 100644 tests/pytest/crash_gen_bootstrap.py diff --git a/tests/pytest/crash_gen.sh b/tests/pytest/crash_gen.sh index 4ffe35fc3c..9cca23ac79 100755 --- a/tests/pytest/crash_gen.sh +++ b/tests/pytest/crash_gen.sh @@ -54,6 +54,7 @@ export PYTHONPATH=$(pwd)/../../src/connector/python/linux/python3:$(pwd) export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIB_DIR # Now we are all let, and let's see if we can find a crash. Note we pass all params +CRASH_GEN_EXEC=crash_gen_bootstrap.py if [[ $1 == '--valgrind' ]]; then shift export PYTHONMALLOC=malloc @@ -66,14 +67,14 @@ if [[ $1 == '--valgrind' ]]; then --leak-check=yes \ --suppressions=crash_gen/valgrind_taos.supp \ $PYTHON_EXEC \ - ./crash_gen/crash_gen.py $@ > $VALGRIND_OUT 2> $VALGRIND_ERR + $CRASH_GEN_EXEC $@ > $VALGRIND_OUT 2> $VALGRIND_ERR elif [[ $1 == '--helgrind' ]]; then shift valgrind \ --tool=helgrind \ $PYTHON_EXEC \ - ./crash_gen/crash_gen.py $@ + $CRASH_GEN_EXEC $@ else - $PYTHON_EXEC ./crash_gen/crash_gen.py $@ + $PYTHON_EXEC $CRASH_GEN_EXEC $@ fi diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 3f662fac73..dbd4eab9e7 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -3160,123 +3160,124 @@ class MainExec: self._svcMgr = None gSvcMgr = None + def init(self): # TODO: refactor + global gContainer + gContainer = Container() # micky-mouse DI + # Super cool Python argument library: + # https://docs.python.org/3/library/argparse.html + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent('''\ + TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below) + --------------------------------------------------------------------- + 1. You build TDengine in the top level ./build directory, as described in offical docs + 2. 
You run the server there before this script: ./build/bin/taosd -c test/cfg -def main(): - # Super cool Python argument library: - # https://docs.python.org/3/library/argparse.html - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.dedent('''\ - TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below) - --------------------------------------------------------------------- - 1. You build TDengine in the top level ./build directory, as described in offical docs - 2. You run the server there before this script: ./build/bin/taosd -c test/cfg + ''')) - ''')) + parser.add_argument( + '-a', + '--auto-start-service', + action='store_true', + help='Automatically start/stop the TDengine service (default: false)') + parser.add_argument( + '-b', + '--max-dbs', + action='store', + default=0, + type=int, + help='Maximum number of DBs to keep, set to disable dropping DB. (default: 0)') + parser.add_argument( + '-c', + '--connector-type', + action='store', + default='native', + type=str, + help='Connector type to use: native, rest, or mixed (default: 10)') + parser.add_argument( + '-d', + '--debug', + action='store_true', + help='Turn on DEBUG mode for more logging (default: false)') + parser.add_argument( + '-e', + '--run-tdengine', + action='store_true', + help='Run TDengine service in foreground (default: false)') + parser.add_argument( + '-i', + '--max-replicas', + action='store', + default=1, + type=int, + help='Maximum number of replicas to use, when testing against clusters. (default: 1)') + parser.add_argument( + '-l', + '--larger-data', + action='store_true', + help='Write larger amount of data during write operations (default: false)') + parser.add_argument( + '-p', + '--per-thread-db-connection', + action='store_true', + help='Use a single shared db connection (default: false)') + parser.add_argument( + '-r', + '--record-ops', + action='store_true', + help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)') + parser.add_argument( + '-s', + '--max-steps', + action='store', + default=1000, + type=int, + help='Maximum number of steps to run (default: 100)') + parser.add_argument( + '-t', + '--num-threads', + action='store', + default=5, + type=int, + help='Number of threads to run (default: 10)') + parser.add_argument( + '-v', + '--verify-data', + action='store_true', + help='Verify data written in a number of places by reading back (default: false)') + parser.add_argument( + '-x', + '--continue-on-exception', + action='store_true', + help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)') - parser.add_argument( - '-a', - '--auto-start-service', - action='store_true', - help='Automatically start/stop the TDengine service (default: false)') - parser.add_argument( - '-b', - '--max-dbs', - action='store', - default=0, - type=int, - help='Maximum number of DBs to keep, set to disable dropping DB. 
(default: 0)') - parser.add_argument( - '-c', - '--connector-type', - action='store', - default='native', - type=str, - help='Connector type to use: native, rest, or mixed (default: 10)') - parser.add_argument( - '-d', - '--debug', - action='store_true', - help='Turn on DEBUG mode for more logging (default: false)') - parser.add_argument( - '-e', - '--run-tdengine', - action='store_true', - help='Run TDengine service in foreground (default: false)') - parser.add_argument( - '-i', - '--max-replicas', - action='store', - default=1, - type=int, - help='Maximum number of replicas to use, when testing against clusters. (default: 1)') - parser.add_argument( - '-l', - '--larger-data', - action='store_true', - help='Write larger amount of data during write operations (default: false)') - parser.add_argument( - '-p', - '--per-thread-db-connection', - action='store_true', - help='Use a single shared db connection (default: false)') - parser.add_argument( - '-r', - '--record-ops', - action='store_true', - help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)') - parser.add_argument( - '-s', - '--max-steps', - action='store', - default=1000, - type=int, - help='Maximum number of steps to run (default: 100)') - parser.add_argument( - '-t', - '--num-threads', - action='store', - default=5, - type=int, - help='Number of threads to run (default: 10)') - parser.add_argument( - '-v', - '--verify-data', - action='store_true', - help='Verify data written in a number of places by reading back (default: false)') - parser.add_argument( - '-x', - '--continue-on-exception', - action='store_true', - help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)') + global gConfig + gConfig = parser.parse_args() - global gConfig - gConfig = parser.parse_args() + # Logging Stuff + global logger + _logger = logging.getLogger('CrashGen') # real logger + _logger.addFilter(LoggingFilter()) + ch = logging.StreamHandler() + _logger.addHandler(ch) - # Logging Stuff - global logger - _logger = logging.getLogger('CrashGen') # real logger - _logger.addFilter(LoggingFilter()) - ch = logging.StreamHandler() - _logger.addHandler(ch) + # Logging adapter, to be used as a logger + logger = MyLoggingAdapter(_logger, []) - # Logging adapter, to be used as a logger - logger = MyLoggingAdapter(_logger, []) + if (gConfig.debug): + logger.setLevel(logging.DEBUG) # default seems to be INFO + else: + logger.setLevel(logging.INFO) - if (gConfig.debug): - logger.setLevel(logging.DEBUG) # default seems to be INFO - else: - logger.setLevel(logging.INFO) + Dice.seed(0) # initial seeding of dice - Dice.seed(0) # initial seeding of dice + def run(self): + if gConfig.run_tdengine: # run server + self.runService() + else: + return self.runClient() - # Run server or client - mExec = MainExec() - if gConfig.run_tdengine: # run server - mExec.runService() - else: - return mExec.runClient() class Container(): _propertyList = {'defTdeInstance'} @@ -3300,9 +3301,3 @@ class Container(): self._verifyValidProperty(name) self._cargo[name] = value -if __name__ == "__main__": - gContainer = Container() # micky-mouse DI - - exitCode = main() - # print("Exiting with code: {}".format(exitCode)) - sys.exit(exitCode) \ No newline at end of file diff --git a/tests/pytest/crash_gen_bootstrap.py b/tests/pytest/crash_gen_bootstrap.py new file mode 100644 index 0000000000..a3417d21a8 --- /dev/null +++ b/tests/pytest/crash_gen_bootstrap.py @@ -0,0 +1,23 @@ +# 
-----!/usr/bin/python3.7 +################################################################### +# Copyright (c) 2016 by TAOS Technologies, Inc. +# All rights reserved. +# +# This file is proprietary and confidential to TAOS Technologies. +# No part of this file may be reproduced, stored, transmitted, +# disclosed or used in any form or by any means other than as +# expressly provided by the written permission from Jianhui Tao +# +################################################################### + +import sys +from crash_gen.crash_gen import MainExec + +if __name__ == "__main__": + + mExec = MainExec() + mExec.init() + exitCode = mExec.run() + + print("Exiting with code: {}".format(exitCode)) + sys.exit(exitCode) From dc72a1a60c7c206f1e2d03a14fea3d756997a8de Mon Sep 17 00:00:00 2001 From: Steven Li Date: Wed, 21 Oct 2020 07:54:47 +0000 Subject: [PATCH 04/16] Split crash_gen tool into different functional files/modules --- tests/pytest/crash_gen/crash_gen.py | 914 +++------------------- tests/pytest/crash_gen/misc.py | 133 ++++ tests/pytest/crash_gen/service_manager.py | 633 +++++++++++++++ 3 files changed, 873 insertions(+), 807 deletions(-) create mode 100644 tests/pytest/crash_gen/misc.py create mode 100644 tests/pytest/crash_gen/service_manager.py diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index dbd4eab9e7..f369f5a3e8 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -19,17 +19,15 @@ from util.sql import * from util.cases import * from util.dnodes import * from util.log import * -from queue import Queue, Empty -from typing import IO from typing import Set from typing import Dict from typing import List from requests.auth import HTTPBasicAuth import textwrap -import datetime -import logging import time +import datetime import random +import logging import threading import requests import copy @@ -38,19 +36,14 @@ import getopt import sys import os -import io import signal import traceback import resource from guppy import hpy import gc -import subprocess -try: - import psutil -except: - print("Psutil module needed, please install: sudo pip3 install psutil") - sys.exit(-1) +from .service_manager import ServiceManager, TdeInstance +from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress # Require Python 3 if sys.version_info[0] < 3: @@ -62,19 +55,12 @@ if sys.version_info[0] < 3: # ConfigNameSpace = argparse.Namespace gConfig: argparse.Namespace gSvcMgr: ServiceManager # TODO: refactor this hack, use dep injection -logger: logging.Logger +# logger: logging.Logger gContainer: Container # def runThread(wt: WorkerThread): # wt.run() -class CrashGenError(Exception): - def __init__(self, msg=None, errno=None): - self.msg = msg - self.errno = errno - - def __str__(self): - return self.msg class WorkerThread: def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator, @@ -107,10 +93,10 @@ class WorkerThread: # self._dbInUse = False # if "use db" was executed already def logDebug(self, msg): - logger.debug(" TRD[{}] {}".format(self._tid, msg)) + Logging.debug(" TRD[{}] {}".format(self._tid, msg)) def logInfo(self, msg): - logger.info(" TRD[{}] {}".format(self._tid, msg)) + Logging.info(" TRD[{}] {}".format(self._tid, msg)) # def dbInUse(self): # return self._dbInUse @@ -129,10 +115,10 @@ class WorkerThread: def run(self): # initialization after thread starts, in the thread context # self.isSleeping = False - logger.info("Starting to run thread: {}".format(self._tid)) + 
Logging.info("Starting to run thread: {}".format(self._tid)) if (gConfig.per_thread_db_connection): # type: ignore - logger.debug("Worker thread openning database connection") + Logging.debug("Worker thread openning database connection") self._dbConn.open() self._doTaskLoop() @@ -142,7 +128,7 @@ class WorkerThread: if self._dbConn.isOpen: #sometimes it is not open self._dbConn.close() else: - logger.warning("Cleaning up worker thread, dbConn already closed") + Logging.warning("Cleaning up worker thread, dbConn already closed") def _doTaskLoop(self): # while self._curStep < self._pool.maxSteps: @@ -153,15 +139,15 @@ class WorkerThread: tc.crossStepBarrier() # shared barrier first, INCLUDING the last one except threading.BrokenBarrierError as err: # main thread timed out print("_bto", end="") - logger.debug("[TRD] Worker thread exiting due to main thread barrier time-out") + Logging.debug("[TRD] Worker thread exiting due to main thread barrier time-out") break - logger.debug("[TRD] Worker thread [{}] exited barrier...".format(self._tid)) + Logging.debug("[TRD] Worker thread [{}] exited barrier...".format(self._tid)) self.crossStepGate() # then per-thread gate, after being tapped - logger.debug("[TRD] Worker thread [{}] exited step gate...".format(self._tid)) + Logging.debug("[TRD] Worker thread [{}] exited step gate...".format(self._tid)) if not self._tc.isRunning(): print("_wts", end="") - logger.debug("[TRD] Thread Coordinator not running any more, worker thread now stopping...") + Logging.debug("[TRD] Thread Coordinator not running any more, worker thread now stopping...") break # Before we fetch the task and run it, let's ensure we properly "use" the database (not needed any more) @@ -180,15 +166,15 @@ class WorkerThread: raise # Fetch a task from the Thread Coordinator - logger.debug( "[TRD] Worker thread [{}] about to fetch task".format(self._tid)) + Logging.debug( "[TRD] Worker thread [{}] about to fetch task".format(self._tid)) task = tc.fetchTask() # Execute such a task - logger.debug("[TRD] Worker thread [{}] about to execute task: {}".format( + Logging.debug("[TRD] Worker thread [{}] about to execute task: {}".format( self._tid, task.__class__.__name__)) task.execute(self) tc.saveExecutedTask(task) - logger.debug("[TRD] Worker thread [{}] finished executing task".format(self._tid)) + Logging.debug("[TRD] Worker thread [{}] finished executing task".format(self._tid)) # self._dbInUse = False # there may be changes between steps # print("_wtd", end=None) # worker thread died @@ -211,7 +197,7 @@ class WorkerThread: self.verifyThreadSelf() # only allowed by ourselves # Wait again at the "gate", waiting to be "tapped" - logger.debug( + Logging.debug( "[TRD] Worker thread {} about to cross the step gate".format( self._tid)) self._stepGate.wait() @@ -224,7 +210,7 @@ class WorkerThread: self.verifyThreadMain() # only allowed for main thread if self._thread.is_alive(): - logger.debug("[TRD] Tapping worker thread {}".format(self._tid)) + Logging.debug("[TRD] Tapping worker thread {}".format(self._tid)) self._stepGate.set() # wake up! 
time.sleep(0) # let the released thread run a bit else: @@ -269,7 +255,7 @@ class ThreadCoordinator: self._stepBarrier = threading.Barrier( self._pool.numThreads + 1) # one barrier for all threads self._execStats = ExecutionStats() - self._runStatus = MainExec.STATUS_RUNNING + self._runStatus = Status.STATUS_RUNNING self._initDbs() def getTaskExecutor(self): @@ -282,14 +268,14 @@ class ThreadCoordinator: self._stepBarrier.wait(timeout) def requestToStop(self): - self._runStatus = MainExec.STATUS_STOPPING + self._runStatus = Status.STATUS_STOPPING self._execStats.registerFailure("User Interruption") def _runShouldEnd(self, transitionFailed, hasAbortedTask, workerTimeout): maxSteps = gConfig.max_steps # type: ignore if self._curStep >= (maxSteps - 1): # maxStep==10, last curStep should be 9 return True - if self._runStatus != MainExec.STATUS_RUNNING: + if self._runStatus != Status.STATUS_RUNNING: return True if transitionFailed: return True @@ -310,7 +296,7 @@ class ThreadCoordinator: def _releaseAllWorkerThreads(self, transitionFailed): self._curStep += 1 # we are about to get into next step. TODO: race condition here! # Now not all threads had time to go to sleep - logger.debug( + Logging.debug( "--\r\n\n--> Step {} starts with main thread waking up".format(self._curStep)) # A new TE for the new step @@ -318,7 +304,7 @@ class ThreadCoordinator: if not transitionFailed: # only if not failed self._te = TaskExecutor(self._curStep) - logger.debug("[TRD] Main thread waking up at step {}, tapping worker threads".format( + Logging.debug("[TRD] Main thread waking up at step {}, tapping worker threads".format( self._curStep)) # Now not all threads had time to go to sleep # Worker threads will wake up at this point, and each execute it's own task self.tapAllThreads() # release all worker thread from their "gates" @@ -327,10 +313,10 @@ class ThreadCoordinator: # Now main thread (that's us) is ready to enter a step # let other threads go past the pool barrier, but wait at the # thread gate - logger.debug("[TRD] Main thread about to cross the barrier") + Logging.debug("[TRD] Main thread about to cross the barrier") self.crossStepBarrier(timeout=self.WORKER_THREAD_TIMEOUT) self._stepBarrier.reset() # Other worker threads should now be at the "gate" - logger.debug("[TRD] Main thread finished crossing the barrier") + Logging.debug("[TRD] Main thread finished crossing the barrier") def _doTransition(self): transitionFailed = False @@ -338,11 +324,11 @@ class ThreadCoordinator: for x in self._dbs: db = x # type: Database sm = db.getStateMachine() - logger.debug("[STT] starting transitions for DB: {}".format(db.getName())) + Logging.debug("[STT] starting transitions for DB: {}".format(db.getName())) # at end of step, transiton the DB state tasksForDb = db.filterTasks(self._executedTasks) sm.transition(tasksForDb, self.getDbManager().getDbConn()) - logger.debug("[STT] transition ended for DB: {}".format(db.getName())) + Logging.debug("[STT] transition ended for DB: {}".format(db.getName())) # Due to limitation (or maybe not) of the TD Python library, # we cannot share connections across threads @@ -350,14 +336,14 @@ class ThreadCoordinator: # Moving below to task loop # if sm.hasDatabase(): # for t in self._pool.threadList: - # logger.debug("[DB] use db for all worker threads") + # Logging.debug("[DB] use db for all worker threads") # t.useDb() # t.execSql("use db") # main thread executing "use # db" on behalf of every worker thread except taos.error.ProgrammingError as err: if (err.msg == 'network 
unavailable'): # broken DB connection - logger.info("DB connection broken, execution failed") + Logging.info("DB connection broken, execution failed") traceback.print_stack() transitionFailed = True self._te = None # Not running any more @@ -370,7 +356,7 @@ class ThreadCoordinator: self.resetExecutedTasks() # clear the tasks after we are done # Get ready for next step - logger.debug("<-- Step {} finished, trasition failed = {}".format(self._curStep, transitionFailed)) + Logging.debug("<-- Step {} finished, trasition failed = {}".format(self._curStep, transitionFailed)) return transitionFailed def run(self): @@ -384,8 +370,9 @@ class ThreadCoordinator: hasAbortedTask = False workerTimeout = False while not self._runShouldEnd(transitionFailed, hasAbortedTask, workerTimeout): - if not gConfig.debug: # print this only if we are not in debug mode - print(".", end="", flush=True) + if not gConfig.debug: # print this only if we are not in debug mode + Progress.emit(Progress.STEP_BOUNDARY) + # print(".", end="", flush=True) # if (self._curStep % 2) == 0: # print memory usage once every 10 steps # memUsage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss # print("[m:{}]".format(memUsage), end="", flush=True) # print memory usage @@ -397,8 +384,9 @@ class ThreadCoordinator: try: self._syncAtBarrier() # For now just cross the barrier + Progress.emit(Progress.END_THREAD_STEP) except threading.BrokenBarrierError as err: - logger.info("Main loop aborted, caused by worker thread time-out") + Logging.info("Main loop aborted, caused by worker thread time-out") self._execStats.registerFailure("Aborted due to worker thread timeout") print("\n\nWorker Thread time-out detected, important thread info:") ts = ThreadStacks() @@ -411,7 +399,7 @@ class ThreadCoordinator: # threads are QUIET. 
hasAbortedTask = self._hasAbortedTask() # from previous step if hasAbortedTask: - logger.info("Aborted task encountered, exiting test program") + Logging.info("Aborted task encountered, exiting test program") self._execStats.registerFailure("Aborted Task Encountered") break # do transition only if tasks are error free @@ -422,29 +410,30 @@ class ThreadCoordinator: transitionFailed = True errno2 = Helper.convertErrno(err.errno) # correct error scheme errMsg = "Transition failed: errno=0x{:X}, msg: {}".format(errno2, err) - logger.info(errMsg) + Logging.info(errMsg) traceback.print_exc() self._execStats.registerFailure(errMsg) # Then we move on to the next step + Progress.emit(Progress.BEGIN_THREAD_STEP) self._releaseAllWorkerThreads(transitionFailed) if hasAbortedTask or transitionFailed : # abnormal ending, workers waiting at "gate" - logger.debug("Abnormal ending of main thraed") + Logging.debug("Abnormal ending of main thraed") elif workerTimeout: - logger.debug("Abnormal ending of main thread, due to worker timeout") + Logging.debug("Abnormal ending of main thread, due to worker timeout") else: # regular ending, workers waiting at "barrier" - logger.debug("Regular ending, main thread waiting for all worker threads to stop...") + Logging.debug("Regular ending, main thread waiting for all worker threads to stop...") self._syncAtBarrier() self._te = None # No more executor, time to end - logger.debug("Main thread tapping all threads one last time...") + Logging.debug("Main thread tapping all threads one last time...") self.tapAllThreads() # Let the threads run one last time - logger.debug("\r\n\n--> Main thread ready to finish up...") - logger.debug("Main thread joining all threads") + Logging.debug("\r\n\n--> Main thread ready to finish up...") + Logging.debug("Main thread joining all threads") self._pool.joinAll() # Get all threads to finish - logger.info("\nAll worker threads finished") + Logging.info("\nAll worker threads finished") self._execStats.endExec() def cleanup(self): # free resources @@ -476,7 +465,7 @@ class ThreadCoordinator: wakeSeq.append(i) else: wakeSeq.insert(0, i) - logger.debug( + Logging.debug( "[TRD] Main thread waking up worker threads: {}".format( str(wakeSeq))) # TODO: set dice seed to a deterministic value @@ -524,13 +513,6 @@ class ThreadCoordinator: with self._lock: self._executedTasks.append(task) -# We define a class to run a number of threads in locking steps. - -class Helper: - @classmethod - def convertErrno(cls, errno): - return errno if (errno > 0) else 0x80000000 + errno - class ThreadPool: def __init__(self, numThreads, maxSteps): self.numThreads = numThreads @@ -548,7 +530,7 @@ class ThreadPool: def joinAll(self): for workerThread in self.threadList: - logger.debug("Joining thread...") + Logging.debug("Joining thread...") workerThread._thread.join() def cleanup(self): @@ -605,7 +587,7 @@ class LinearQueue(): def allocate(self, i): with self._lock: - # logger.debug("LQ allocating item {}".format(i)) + # Logging.debug("LQ allocating item {}".format(i)) if (i in self.inUse): raise RuntimeError( "Cannot re-use same index in queue: {}".format(i)) @@ -613,7 +595,7 @@ class LinearQueue(): def release(self, i): with self._lock: - # logger.debug("LQ releasing item {}".format(i)) + # Logging.debug("LQ releasing item {}".format(i)) self.inUse.remove(i) # KeyError possible, TODO: why? 
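
Stepping back from the individual hunks, the coordination protocol that ThreadCoordinator and WorkerThread implement above (crossStepBarrier, tapAllThreads, crossStepGate) reduces to one shared Barrier for the step boundary plus one Event per worker as its step gate. The following self-contained sketch, with made-up names and counts, shows the same handshake in isolation:

    import threading

    NUM_WORKERS = 3
    barrier = threading.Barrier(NUM_WORKERS + 1)             # all workers plus the coordinator
    gates = [threading.Event() for _ in range(NUM_WORKERS)]  # one step gate per worker
    keep_running = True

    def worker(i):
        while True:
            barrier.wait(timeout=30)   # end of step: workers and coordinator meet here
            gates[i].wait()            # per-thread gate: sleep until tapped for the next step
            gates[i].clear()
            if not keep_running:
                break
            print("worker {} runs one task".format(i))

    workers = [threading.Thread(target=worker, args=(i,)) for i in range(NUM_WORKERS)]
    for t in workers:
        t.start()

    for step in range(5):              # the coordinator drives a fixed number of steps
        barrier.wait(timeout=30)       # cross the step boundary together with the workers
        for g in gates:
            g.set()                    # "tap" each worker into the new step

    barrier.wait(timeout=30)           # wait for the last step to finish
    keep_running = False
    for g in gates:
        g.set()                        # one final tap so the workers can observe the stop flag
    for t in workers:
        t.join()

The timeout on the barrier is what lets the real tool detect a hung worker (WORKER_THREAD_TIMEOUT) and abort the run via BrokenBarrierError instead of blocking forever.
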
def size(self): @@ -673,9 +655,12 @@ class DbConn: # below implemented by child classes self.openByType() - logger.debug("[DB] data connection opened, type = {}".format(self._type)) + Logging.debug("[DB] data connection opened, type = {}".format(self._type)) self.isOpen = True + def close(self): + raise RuntimeError("Unexpected execution, should be overriden") + def queryScalar(self, sql) -> int: return self._queryAny(sql) @@ -755,7 +740,7 @@ class DbConnRest(DbConn): if (not self.isOpen): raise RuntimeError("Cannot clean up database until connection is open") # Do nothing for REST - logger.debug("[DB] REST Database connection closed") + Logging.debug("[DB] REST Database connection closed") self.isOpen = False def _doSql(self, sql): @@ -793,9 +778,9 @@ class DbConnRest(DbConn): if (not self.isOpen): raise RuntimeError( "Cannot execute database commands until connection is open") - logger.debug("[SQL-REST] Executing SQL: {}".format(sql)) + Logging.debug("[SQL-REST] Executing SQL: {}".format(sql)) nRows = self._doSql(sql) - logger.debug( + Logging.debug( "[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql)) return nRows @@ -884,127 +869,6 @@ class MyTDSql: raise return self.affectedRows -class TdeInstance(): - """ - A class to capture the *static* information of a TDengine instance, - including the location of the various files/directories, and basica - configuration. - """ - - @classmethod - def _getBuildPath(cls): - selfPath = os.path.dirname(os.path.realpath(__file__)) - if ("community" in selfPath): - projPath = selfPath[:selfPath.find("communit")] - else: - projPath = selfPath[:selfPath.find("tests")] - - buildPath = None - for root, dirs, files in os.walk(projPath): - if ("taosd" in files): - rootRealPath = os.path.dirname(os.path.realpath(root)) - if ("packaging" not in rootRealPath): - buildPath = root[:len(root) - len("/build/bin")] - break - if buildPath == None: - raise RuntimeError("Failed to determine buildPath, selfPath={}, projPath={}" - .format(selfPath, projPath)) - return buildPath - - def __init__(self, subdir='test'): - self._buildDir = self._getBuildPath() - self._subdir = '/' + subdir # TODO: tolerate "/" - - def __repr__(self): - return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) - - def generateCfgFile(self): - # buildPath = self.getBuildPath() - # taosdPath = self._buildPath + "/build/bin/taosd" - - cfgDir = self.getCfgDir() - cfgFile = cfgDir + "/taos.cfg" # TODO: inquire if this is fixed - if os.path.exists(cfgFile): - if os.path.isfile(cfgFile): - logger.warning("Config file exists already, skip creation: {}".format(cfgFile)) - return # cfg file already exists, nothing to do - else: - raise CrashGenError("Invalid config file: {}".format(cfgFile)) - # Now that the cfg file doesn't exist - if os.path.exists(cfgDir): - if not os.path.isdir(cfgDir): - raise CrashGenError("Invalid config dir: {}".format(cfgDir)) - # else: good path - else: - os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p" - # Now we have a good cfg dir - cfgValues = { - 'runDir': self.getRunDir(), - 'ip': '127.0.0.1', # TODO: change to a network addressable ip - 'port': 6030, - } - cfgTemplate = """ -dataDir {runDir}/data -logDir {runDir}/log - -charset UTF-8 - -firstEp {ip}:{port} -fqdn {ip} -serverPort {port} - -# was all 135 below -dDebugFlag 135 -cDebugFlag 135 -rpcDebugFlag 135 -qDebugFlag 135 -# httpDebugFlag 143 -# asyncLog 0 -# tables 10 -maxtablesPerVnode 10 -rpcMaxTime 101 -# cache 2 -keep 36500 -# walLevel 2 -walLevel 1 -# -# maxConnections 100 
-""" - cfgContent = cfgTemplate.format_map(cfgValues) - f = open(cfgFile, "w") - f.write(cfgContent) - f.close() - - def rotateLogs(self): - logPath = self.getLogDir() - # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397 - if os.path.exists(logPath): - logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S') - logger.info("Saving old log files to: {}".format(logPathSaved)) - os.rename(logPath, logPathSaved) - # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms - - - def getExecFile(self): # .../taosd - return self._buildDir + "/build/bin/taosd" - - def getRunDir(self): # TODO: rename to "root dir" ?! - return self._buildDir + self._subdir - - def getCfgDir(self): # path, not file - return self.getRunDir() + "/cfg" - - def getLogDir(self): - return self.getRunDir() + "/log" - - def getHostAddr(self): - return "127.0.0.1" - - def getServiceCommand(self): # to start the instance - return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen() - - - class DbConnNative(DbConn): # Class variables _lock = threading.Lock() @@ -1028,7 +892,7 @@ class DbConnNative(DbConn): with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! if not cls._connInfoDisplayed: cls._connInfoDisplayed = True # updating CLASS variable - logger.info("Initiating TAOS native connection to {}, using config at {}".format(hostAddr, cfgPath)) + Logging.info("Initiating TAOS native connection to {}, using config at {}".format(hostAddr, cfgPath)) # Make the connection # self._conn = taos.connect(host=hostAddr, config=cfgPath) # TODO: make configurable # self._cursor = self._conn.cursor() @@ -1052,16 +916,16 @@ class DbConnNative(DbConn): with cls._lock: cls.totalConnections -= 1 - logger.debug("[DB] Database connection closed") + Logging.debug("[DB] Database connection closed") self.isOpen = False def execute(self, sql): if (not self.isOpen): raise RuntimeError("Cannot execute database commands until connection is open") - logger.debug("[SQL] Executing SQL: {}".format(sql)) + Logging.debug("[SQL] Executing SQL: {}".format(sql)) self._lastSql = sql nRows = self._tdSql.execute(sql) - logger.debug( + Logging.debug( "[SQL] Execution Result, nRows = {}, SQL = {}".format( nRows, sql)) return nRows @@ -1070,10 +934,10 @@ class DbConnNative(DbConn): if (not self.isOpen): raise RuntimeError( "Cannot query database until connection is open") - logger.debug("[SQL] Executing SQL: {}".format(sql)) + Logging.debug("[SQL] Executing SQL: {}".format(sql)) self._lastSql = sql nRows = self._tdSql.query(sql) - logger.debug( + Logging.debug( "[SQL] Query Result, nRows = {}, SQL = {}".format( nRows, sql)) return nRows @@ -1337,7 +1201,7 @@ class StateMechine: def init(self, dbc: DbConn): # late initailization, don't save the dbConn self._curState = self._findCurrentState(dbc) # starting state - logger.debug("Found Starting State: {}".format(self._curState)) + Logging.debug("Found Starting State: {}".format(self._curState)) # TODO: seems no lnoger used, remove? 
def getCurrentState(self): @@ -1375,7 +1239,7 @@ class StateMechine: raise RuntimeError( "No suitable task types found for state: {}".format( self._curState)) - logger.debug( + Logging.debug( "[OPS] Tasks found for state {}: {}".format( self._curState, typesToStrings(taskTypes))) @@ -1385,27 +1249,27 @@ class StateMechine: ts = time.time() # we use this to debug how fast/slow it is to do the various queries to find the current DB state dbName =self._db.getName() if not dbc.existsDatabase(dbName): # dbc.hasDatabases(): # no database?! - logger.debug( "[STT] empty database found, between {} and {}".format(ts, time.time())) + Logging.debug( "[STT] empty database found, between {} and {}".format(ts, time.time())) return StateEmpty() # did not do this when openning connection, and this is NOT the worker # thread, which does this on their own dbc.use(dbName) if not dbc.hasTables(): # no tables - logger.debug("[STT] DB_ONLY found, between {} and {}".format(ts, time.time())) + Logging.debug("[STT] DB_ONLY found, between {} and {}".format(ts, time.time())) return StateDbOnly() sTable = self._db.getFixedSuperTable() if sTable.hasRegTables(dbc, dbName): # no regular tables - logger.debug("[STT] SUPER_TABLE_ONLY found, between {} and {}".format(ts, time.time())) + Logging.debug("[STT] SUPER_TABLE_ONLY found, between {} and {}".format(ts, time.time())) return StateSuperTableOnly() else: # has actual tables - logger.debug("[STT] HAS_DATA found, between {} and {}".format(ts, time.time())) + Logging.debug("[STT] HAS_DATA found, between {} and {}".format(ts, time.time())) return StateHasData() # We transition the system to a new state by examining the current state itself def transition(self, tasks, dbc: DbConn): if (len(tasks) == 0): # before 1st step, or otherwise empty - logger.debug("[STT] Starting State: {}".format(self._curState)) + Logging.debug("[STT] Starting State: {}".format(self._curState)) return # do nothing # this should show up in the server log, separating steps @@ -1441,7 +1305,7 @@ class StateMechine: # Nothing for sure newState = self._findCurrentState(dbc) - logger.debug("[STT] New DB state determined: {}".format(newState)) + Logging.debug("[STT] New DB state determined: {}".format(newState)) # can old state move to new state through the tasks? 
self._curState.verifyTasksToState(tasks, newState) self._curState = newState @@ -1459,7 +1323,7 @@ class StateMechine: # read data task, default to 10: TODO: change to a constant weights.append(10) i = self._weighted_choice_sub(weights) - # logger.debug(" (weighted random:{}/{}) ".format(i, len(taskTypes))) + # Logging.debug(" (weighted random:{}/{}) ".format(i, len(taskTypes))) return taskTypes[i] # ref: @@ -1538,7 +1402,7 @@ class Database: t3 = datetime.datetime(2012, 1, 1) # default "keep" is 10 years t4 = datetime.datetime.fromtimestamp( t3.timestamp() + elSec2) # see explanation above - logger.info("Setting up TICKS to start from: {}".format(t4)) + Logging.info("Setting up TICKS to start from: {}".format(t4)) return t4 @classmethod @@ -1689,10 +1553,10 @@ class TaskExecutor(): self._boundedList.add(n) # def logInfo(self, msg): - # logger.info(" T[{}.x]: ".format(self._curStep) + msg) + # Logging.info(" T[{}.x]: ".format(self._curStep) + msg) # def logDebug(self, msg): - # logger.debug(" T[{}.x]: ".format(self._curStep) + msg) + # Logging.debug(" T[{}.x]: ".format(self._curStep) + msg) class Task(): @@ -1705,7 +1569,7 @@ class Task(): @classmethod def allocTaskNum(cls): Task.taskSn += 1 # IMPORTANT: cannot use cls.taskSn, since each sub class will have a copy - # logger.debug("Allocating taskSN: {}".format(Task.taskSn)) + # Logging.debug("Allocating taskSN: {}".format(Task.taskSn)) return Task.taskSn def __init__(self, execStats: ExecutionStats, db: Database): @@ -1717,7 +1581,7 @@ class Task(): # Assign an incremental task serial number self._taskNum = self.allocTaskNum() - # logger.debug("Creating new task {}...".format(self._taskNum)) + # Logging.debug("Creating new task {}...".format(self._taskNum)) self._execStats = execStats self._db = db # A task is always associated/for a specific DB @@ -1781,7 +1645,7 @@ class Task(): elif msg.find("duplicated column names") != -1: # also alter table tag issues return True elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ... - logger.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg)) + Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg)) return True return False # Not an acceptable error @@ -1922,13 +1786,13 @@ class ExecutionStats: self._failureReason = reason def printStats(self): - logger.info( + Logging.info( "----------------------------------------------------------------------") - logger.info( + Logging.info( "| Crash_Gen test {}, with the following stats:". 
format( "FAILED (reason: {})".format( self._failureReason) if self._failed else "SUCCEEDED")) - logger.info("| Task Execution Times (success/total):") + Logging.info("| Task Execution Times (success/total):") execTimesAny = 0.001 # avoid div by zero for k, n in self._execTimes.items(): execTimesAny += n[0] @@ -1939,28 +1803,28 @@ class ExecutionStats: errStrs = ["0x{:X}:{}".format(eno, n) for (eno, n) in errors.items()] # print("error strings = {}".format(errStrs)) errStr = ", ".join(errStrs) - logger.info("| {0:<24}: {1}/{2} (Errors: {3})".format(k, n[1], n[0], errStr)) + Logging.info("| {0:<24}: {1}/{2} (Errors: {3})".format(k, n[1], n[0], errStr)) - logger.info( + Logging.info( "| Total Tasks Executed (success or not): {} ".format(execTimesAny)) - logger.info( + Logging.info( "| Total Tasks In Progress at End: {}".format( self._tasksInProgress)) - logger.info( + Logging.info( "| Total Task Busy Time (elapsed time when any task is in progress): {:.3f} seconds".format( self._accRunTime)) - logger.info( + Logging.info( "| Average Per-Task Execution Time: {:.3f} seconds".format(self._accRunTime / execTimesAny)) - logger.info( + Logging.info( "| Total Elapsed Time (from wall clock): {:.3f} seconds".format( self._elapsedTime)) - logger.info("| Top numbers written: {}".format(TaskExecutor.getBoundedList())) - logger.info("| Active DB Native Connections (now): {}".format(DbConnNative.totalConnections)) - logger.info("| Longest native query time: {:.3f} seconds, started: {}". + Logging.info("| Top numbers written: {}".format(TaskExecutor.getBoundedList())) + Logging.info("| Active DB Native Connections (now): {}".format(DbConnNative.totalConnections)) + Logging.info("| Longest native query time: {:.3f} seconds, started: {}". format(MyTDSql.longestQueryTime, time.strftime("%x %X", time.localtime(MyTDSql.lqStartTime))) ) - logger.info("| Longest native query: {}".format(MyTDSql.longestQuery)) - logger.info( + Logging.info("| Longest native query: {}".format(MyTDSql.longestQuery)) + Logging.info( "----------------------------------------------------------------------") @@ -2030,7 +1894,7 @@ class TaskDropDb(StateTransitionTask): def _executeInternal(self, te: TaskExecutor, wt: WorkerThread): self.execWtSql(wt, "drop database {}".format(self._db.getName())) - logger.debug("[OPS] database dropped at {}".format(time.time())) + Logging.debug("[OPS] database dropped at {}".format(time.time())) class TaskCreateSuperTable(StateTransitionTask): @classmethod @@ -2043,7 +1907,7 @@ class TaskCreateSuperTable(StateTransitionTask): def _executeInternal(self, te: TaskExecutor, wt: WorkerThread): if not self._db.exists(wt.getDbConn()): - logger.debug("Skipping task, no DB yet") + Logging.debug("Skipping task, no DB yet") return sTable = self._db.getFixedSuperTable() # type: TdSuperTable @@ -2078,7 +1942,7 @@ class TdSuperTable: dbc.query("select TBNAME from {}.{}".format(dbName, self._stName)) # TODO: analyze result set later except taos.error.ProgrammingError as err: errno2 = Helper.convertErrno(err.errno) - logger.debug("[=] Failed to get tables from super table: errno=0x{:X}, msg: {}".format(errno2, err)) + Logging.debug("[=] Failed to get tables from super table: errno=0x{:X}, msg: {}".format(errno2, err)) raise qr = dbc.getQueryResult() @@ -2193,7 +2057,7 @@ class TaskReadData(StateTransitionTask): dbc.execute("select {} from {}.{}".format(aggExpr, dbName, sTable.getName())) except taos.error.ProgrammingError as err: errno2 = Helper.convertErrno(err.errno) - logger.debug("[=] Read Failure: errno=0x{:X}, msg: 
{}, SQL: {}".format(errno2, err, dbc.getLastSql())) + Logging.debug("[=] Read Failure: errno=0x{:X}, msg: {}, SQL: {}".format(errno2, err, dbc.getLastSql())) raise class TaskDropSuperTable(StateTransitionTask): @@ -2224,7 +2088,7 @@ class TaskDropSuperTable(StateTransitionTask): errno2 = Helper.convertErrno(err.errno) if (errno2 in [0x362]): # mnode invalid table name isSuccess = False - logger.debug("[DB] Acceptable error when dropping a table") + Logging.debug("[DB] Acceptable error when dropping a table") continue # try to delete next regular table if (not tickOutput): @@ -2304,20 +2168,19 @@ class TaskAddData(StateTransitionTask): # Track which table is being actively worked on activeTable: Set[int] = set() - # We use these two files to record operations to DB, useful for power-off - # tests - fAddLogReady = None - fAddLogDone = None + # We use these two files to record operations to DB, useful for power-off tests + fAddLogReady = None # type: TextIOWrapper + fAddLogDone = None # type: TextIOWrapper @classmethod def prepToRecordOps(cls): if gConfig.record_ops: if (cls.fAddLogReady is None): - logger.info( + Logging.info( "Recording in a file operations to be performed...") cls.fAddLogReady = open("add_log_ready.txt", "w") if (cls.fAddLogDone is None): - logger.info("Recording in a file operations completed...") + Logging.info("Recording in a file operations completed...") cls.fAddLogDone = open("add_log_done.txt", "w") @classmethod @@ -2393,553 +2256,8 @@ class TaskAddData(StateTransitionTask): self.activeTable.discard(i) # not raising an error, unlike remove -# Deterministic random number generator -class Dice(): - seeded = False # static, uninitialized - @classmethod - def seed(cls, s): # static - if (cls.seeded): - raise RuntimeError( - "Cannot seed the random generator more than once") - cls.verifyRNG() - random.seed(s) - cls.seeded = True # TODO: protect against multi-threading - @classmethod - def verifyRNG(cls): # Verify that the RNG is determinstic - random.seed(0) - x1 = random.randrange(0, 1000) - x2 = random.randrange(0, 1000) - x3 = random.randrange(0, 1000) - if (x1 != 864 or x2 != 394 or x3 != 776): - raise RuntimeError("System RNG is not deterministic") - - @classmethod - def throw(cls, stop): # get 0 to stop-1 - return cls.throwRange(0, stop) - - @classmethod - def throwRange(cls, start, stop): # up to stop-1 - if (not cls.seeded): - raise RuntimeError("Cannot throw dice before seeding it") - return random.randrange(start, stop) - - @classmethod - def choice(cls, cList): - return random.choice(cList) - - -class LoggingFilter(logging.Filter): - def filter(self, record: logging.LogRecord): - if (record.levelno >= logging.INFO): - return True # info or above always log - - # Commenting out below to adjust... - - # if msg.startswith("[TRD]"): - # return False - return True - - -class MyLoggingAdapter(logging.LoggerAdapter): - def process(self, msg, kwargs): - return "[{}]{}".format(threading.get_ident() % 10000, msg), kwargs - # return '[%s] %s' % (self.extra['connid'], msg), kwargs - - -class ServiceManager: - PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process - - def __init__(self, numDnodes = 1): - logger.info("TDengine Service Manager (TSM) created") - self._numDnodes = numDnodes # >1 means we have a cluster - # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec - # signal.signal(signal.SIGINT, self.sigIntHandler) - # signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler! 
- - self.inSigHandler = False - # self._status = MainExec.STATUS_RUNNING # set inside - # _startTaosService() - self.svcMgrThreads = [] # type: List[ServiceManagerThread] - for i in range(0, numDnodes): - self.svcMgrThreads.append(ServiceManagerThread(i)) - - self._lock = threading.Lock() - # self._isRestarting = False - - def _doMenu(self): - choice = "" - while True: - print("\nInterrupting Service Program, Choose an Action: ") - print("1: Resume") - print("2: Terminate") - print("3: Restart") - # Remember to update the if range below - # print("Enter Choice: ", end="", flush=True) - while choice == "": - choice = input("Enter Choice: ") - if choice != "": - break # done with reading repeated input - if choice in ["1", "2", "3"]: - break # we are done with whole method - print("Invalid choice, please try again.") - choice = "" # reset - return choice - - def sigUsrHandler(self, signalNumber, frame): - print("Interrupting main thread execution upon SIGUSR1") - if self.inSigHandler: # already - print("Ignoring repeated SIG...") - return # do nothing if it's already not running - self.inSigHandler = True - - choice = self._doMenu() - if choice == "1": - self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue? - elif choice == "2": - self.stopTaosServices() - elif choice == "3": # Restart - self.restart() - else: - raise RuntimeError("Invalid menu choice: {}".format(choice)) - - self.inSigHandler = False - - def sigIntHandler(self, signalNumber, frame): - print("ServiceManager: INT Signal Handler starting...") - if self.inSigHandler: - print("Ignoring repeated SIG_INT...") - return - self.inSigHandler = True - - self.stopTaosServices() - print("ServiceManager: INT Signal Handler returning...") - self.inSigHandler = False - - def sigHandlerResume(self): - print("Resuming TDengine service manager (main thread)...\n\n") - - # def _updateThreadStatus(self): - # if self.svcMgrThread: # valid svc mgr thread - # if self.svcMgrThread.isStopped(): # done? - # self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? - # self.svcMgrThread = None # no more - - def isActive(self): - """ - Determine if the service/cluster is active at all, i.e. at least - one thread is not "stopped". - """ - for thread in self.svcMgrThreads: - if not thread.isStopped(): - return True - return False - - # def isRestarting(self): - # """ - # Determine if the service/cluster is being "restarted", i.e., at least - # one thread is in "restarting" status - # """ - # for thread in self.svcMgrThreads: - # if thread.isRestarting(): - # return True - # return False - - def isStable(self): - """ - Determine if the service/cluster is "stable", i.e. all of the - threads are in "stable" status. - """ - for thread in self.svcMgrThreads: - if not thread.isStable(): - return False - return True - - def _procIpcAll(self): - while self.isActive(): - for thread in self.svcMgrThreads: # all thread objects should always be valid - # while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here - if thread.isRunning(): - thread.procIpcBatch() # regular processing, - if thread.isStopped(): - thread.procIpcBatch() # one last time? 
- # self._updateThreadStatus() - elif thread.isRetarting(): - print("Service restarting...") - # else this thread is stopped - - time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round - # raise CrashGenError("dummy") - print("Service Manager Thread (with subprocess) ended, main thread exiting...") - - def startTaosServices(self): - with self._lock: - if self.isActive(): - raise RuntimeError("Cannot start TAOS service(s) when one/some may already be running") - - # Find if there's already a taosd service, and then kill it - for proc in psutil.process_iter(): - if proc.name() == 'taosd': - print("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupe") - time.sleep(2.0) - proc.kill() - # print("Process: {}".format(proc.name())) - - # self.svcMgrThread = ServiceManagerThread() # create the object - for thread in self.svcMgrThreads: - thread.start() - thread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines - - def stopTaosServices(self): - with self._lock: - if not self.isActive(): - logger.warning("Cannot stop TAOS service(s), already not active") - return - - for thread in self.svcMgrThreads: - thread.stop() - - def run(self): - self.startTaosServices() - self._procIpcAll() # pump/process all the messages, may encounter SIG + restart - if self.isActive(): # if sig handler hasn't destroyed it by now - self.stopTaosServices() # should have started already - - def restart(self): - if not self.isStable(): - logger.warning("Cannot restart service/cluster, when not stable") - return - - # self._isRestarting = True - if self.isActive(): - self.stopTaosServices() - else: - logger.warning("Service not active when restart requested") - - self.startTaosService() - # self._isRestarting = False - - # def isRunning(self): - # return self.svcMgrThread != None - - # def isRestarting(self): - # return self._isRestarting - -class ServiceManagerThread: - """ - A class representing a dedicated thread which manages the "sub process" - of the TDengine service, interacting with its STDOUT/ERR. - - It takes a TdeInstance parameter at creation time, or create a default - """ - MAX_QUEUE_SIZE = 10000 - - def __init__(self, tInstNum = 0, tInst : TdeInstance = None): - # Set the sub process - self._tdeSubProcess = None # type: TdeSubProcess - - # Arrange the TDengine instance - self._tInstNum = tInstNum # instance serial number in cluster, ZERO based - self._tInst = tInst or TdeInstance() # Need an instance - - self._thread = None # The actual thread, # type: threading.Thread - self._status = MainExec.STATUS_STOPPED # The status of the underlying service, actually. 
- - def __repr__(self): - return "[SvcMgrThread: tInstNum={}]".format(self._tInstNum) - - def getStatus(self): - return self._status - - def isStarting(self): - return self._status == MainExec.STATUS_STARTING - - def isRunning(self): - # return self._thread and self._thread.is_alive() - return self._status == MainExec.STATUS_RUNNING - - def isStopping(self): - return self._status == MainExec.STATUS_STOPPING - - def isStopped(self): - return self._status == MainExec.STATUS_STOPPED - - def isStable(self): - return self.isRunning() or self.isStopped() - - # Start the thread (with sub process), and wait for the sub service - # to become fully operational - def start(self): - if self._thread: - raise RuntimeError("Unexpected _thread") - if self._tdeSubProcess: - raise RuntimeError("TDengine sub process already created/running") - - logger.info("Attempting to start TAOS service: {}".format(self)) - - self._status = MainExec.STATUS_STARTING - self._tdeSubProcess = TdeSubProcess(self._tInst) - self._tdeSubProcess.start() - - self._ipcQueue = Queue() - self._thread = threading.Thread( # First thread captures server OUTPUT - target=self.svcOutputReader, - args=(self._tdeSubProcess.getStdOut(), self._ipcQueue)) - self._thread.daemon = True # thread dies with the program - self._thread.start() - - self._thread2 = threading.Thread( # 2nd thread captures server ERRORs - target=self.svcErrorReader, - args=(self._tdeSubProcess.getStdErr(), self._ipcQueue)) - self._thread2.daemon = True # thread dies with the program - self._thread2.start() - - # wait for service to start - for i in range(0, 100): - time.sleep(1.0) - # self.procIpcBatch() # don't pump message during start up - print("_zz_", end="", flush=True) - if self._status == MainExec.STATUS_RUNNING: - logger.info("[] TDengine service READY to process requests") - logger.info("[] TAOS service started: {}".format(self)) - return # now we've started - # TODO: handle failure-to-start better? - self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output - raise RuntimeError("TDengine service did not start successfully: {}".format(self)) - - def stop(self): - # can be called from both main thread or signal handler - print("Terminating TDengine service running as the sub process...") - if self.isStopped(): - print("Service already stopped") - return - if self.isStopping(): - print("Service is already being stopped") - return - # Linux will send Control-C generated SIGINT to the TDengine process - # already, ref: - # https://unix.stackexchange.com/questions/176235/fork-and-how-signals-are-delivered-to-processes - if not self._tdeSubProcess: - raise RuntimeError("sub process object missing") - - self._status = MainExec.STATUS_STOPPING - retCode = self._tdeSubProcess.stop() - print("Attempted to stop sub process, got return code: {}".format(retCode)) - if (retCode==-11): # SGV - logger.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)") - - if self._tdeSubProcess.isRunning(): # still running - print("FAILED to stop sub process, it is still running... pid = {}".format( - self._tdeSubProcess.getPid())) - else: - self._tdeSubProcess = None # not running any more - self.join() # stop the thread, change the status, etc. 
- - # Check if it's really stopped - outputLines = 20 # for last output - if self.isStopped(): - self.procIpcBatch(outputLines) # one last time - print("End of TDengine Service Output: {}".format(self)) - print("----- TDengine Service (managed by SMT) is now terminated -----\n") - else: - print("WARNING: SMT did not terminate as expected: {}".format(self)) - - def join(self): - # TODO: sanity check - if not self.isStopping(): - raise RuntimeError( - "Unexpected status when ending svc mgr thread: {}".format( - self._status)) - - if self._thread: - self._thread.join() - self._thread = None - self._status = MainExec.STATUS_STOPPED - # STD ERR thread - self._thread2.join() - self._thread2 = None - else: - print("Joining empty thread, doing nothing") - - def _trimQueue(self, targetSize): - if targetSize <= 0: - return # do nothing - q = self._ipcQueue - if (q.qsize() <= targetSize): # no need to trim - return - - logger.debug("Triming IPC queue to target size: {}".format(targetSize)) - itemsToTrim = q.qsize() - targetSize - for i in range(0, itemsToTrim): - try: - q.get_nowait() - except Empty: - break # break out of for loop, no more trimming - - TD_READY_MSG = "TDengine is initialized successfully" - - def procIpcBatch(self, trimToTarget=0, forceOutput=False): - self._trimQueue(trimToTarget) # trim if necessary - # Process all the output generated by the underlying sub process, - # managed by IO thread - print("<", end="", flush=True) - while True: - try: - line = self._ipcQueue.get_nowait() # getting output at fast speed - self._printProgress("_o") - except Empty: - # time.sleep(2.3) # wait only if there's no output - # no more output - print(".>", end="", flush=True) - return # we are done with THIS BATCH - else: # got line, printing out - if forceOutput: - logger.info(line) - else: - logger.debug(line) - print(">", end="", flush=True) - - _ProgressBars = ["--", "//", "||", "\\\\"] - - def _printProgress(self, msg): # TODO: assuming 2 chars - print(msg, end="", flush=True) - pBar = self._ProgressBars[Dice.throw(4)] - print(pBar, end="", flush=True) - print('\b\b\b\b', end="", flush=True) - - def svcOutputReader(self, out: IO, queue): - # Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python - # print("This is the svcOutput Reader...") - # for line in out : - for line in iter(out.readline, b''): - # print("Finished reading a line: {}".format(line)) - # print("Adding item to queue...") - try: - line = line.decode("utf-8").rstrip() - except UnicodeError: - print("\nNon-UTF8 server output: {}\n".format(line)) - - # This might block, and then causing "out" buffer to block - queue.put(line) - self._printProgress("_i") - - if self._status == MainExec.STATUS_STARTING: # we are starting, let's see if we have started - if line.find(self.TD_READY_MSG) != -1: # found - logger.info("Waiting for the service to become FULLY READY") - time.sleep(1.0) # wait for the server to truly start. 
TODO: remove this - logger.info("Service instance #{} is now FULLY READY".format(self._tInstNum)) - self._status = MainExec.STATUS_RUNNING - - # Trim the queue if necessary: TODO: try this 1 out of 10 times - self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10) # trim to 90% size - - if self.isStopping(): # TODO: use thread status instead - # WAITING for stopping sub process to finish its outptu - print("_w", end="", flush=True) - - # queue.put(line) - # meaning sub process must have died - print("\nNo more output from IO thread managing TDengine service") - out.close() - - def svcErrorReader(self, err: IO, queue): - for line in iter(err.readline, b''): - print("\nTDengine Service (taosd) ERROR (from stderr): {}".format(line)) - - -class TdeSubProcess: - """ - A class to to represent the actual sub process that is the run-time - of a TDengine instance. - - It takes a TdeInstance object as its parameter, with the rationale being - "a sub process runs an instance". - """ - - def __init__(self, tInst : TdeInstance): - self.subProcess = None - if tInst is None: - raise CrashGenError("Empty instance not allowed in TdeSubProcess") - self._tInst = tInst # Default create at ServiceManagerThread - - def getStdOut(self): - return self.subProcess.stdout - - def getStdErr(self): - return self.subProcess.stderr - - def isRunning(self): - return self.subProcess is not None - - def getPid(self): - return self.subProcess.pid - - # Repalced by TdeInstance class - # def getBuildPath(self): - # selfPath = os.path.dirname(os.path.realpath(__file__)) - # if ("community" in selfPath): - # projPath = selfPath[:selfPath.find("communit")] - # else: - # projPath = selfPath[:selfPath.find("tests")] - - # for root, dirs, files in os.walk(projPath): - # if ("taosd" in files): - # rootRealPath = os.path.dirname(os.path.realpath(root)) - # if ("packaging" not in rootRealPath): - # buildPath = root[:len(root) - len("/build/bin")] - # break - # return buildPath - - def start(self): - ON_POSIX = 'posix' in sys.builtin_module_names - - # Sanity check - if self.subProcess: # already there - raise RuntimeError("Corrupt process state") - - # global gContainer - # tInst = gContainer.defTdeInstance = TdeInstance('test3') # creae the instance - self._tInst.generateCfgFile() # service side generates config file, client does not - - self._tInst.rotateLogs() - - print("Starting TDengine instance: {}".format(self._tInst)) - self.subProcess = subprocess.Popen( - self._tInst.getServiceCommand(), - shell=False, - # svcCmdSingle, shell=True, # capture core dump? 
- stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - # bufsize=1, # not supported in binary mode - close_fds=ON_POSIX - ) # had text=True, which interferred with reading EOF - - def stop(self): - if not self.subProcess: - print("Sub process already stopped") - return -1 - - retCode = self.subProcess.poll() # contains real sub process return code - if retCode: # valid return code, process ended - self.subProcess = None - else: # process still alive, let's interrupt it - print( - "Sub process is running, sending SIG_INT and waiting for it to terminate...") - # sub process should end, then IPC queue should end, causing IO - # thread to end - self.subProcess.send_signal(signal.SIGINT) - try: - self.subProcess.wait(10) - retCode = self.subProcess.returncode - except subprocess.TimeoutExpired as err: - print("Time out waiting for TDengine service process to exit") - retCode = -3 - else: - print("TDengine service process terminated successfully from SIG_INT") - retCode = -4 - self.subProcess = None - return retCode class ThreadStacks: # stack info for all threads def __init__(self): @@ -2976,17 +2294,17 @@ class ClientManager: # signal.signal(signal.SIGTERM, self.sigIntHandler) # signal.signal(signal.SIGINT, self.sigIntHandler) - self._status = MainExec.STATUS_RUNNING + self._status = Status.STATUS_RUNNING self.tc = None self.inSigHandler = False def sigIntHandler(self, signalNumber, frame): - if self._status != MainExec.STATUS_RUNNING: + if self._status != Status.STATUS_RUNNING: print("Repeated SIGINT received, forced exit...") # return # do nothing if it's already not running sys.exit(-1) - self._status = MainExec.STATUS_STOPPING # immediately set our status + self._status = Status.STATUS_STOPPING # immediately set our status print("ClientManager: Terminating program...") self.tc.requestToStop() @@ -3110,11 +2428,6 @@ class ClientManager: self.tc.printStats() class MainExec: - STATUS_STARTING = 1 - STATUS_RUNNING = 2 - STATUS_STOPPING = 3 - STATUS_STOPPED = 4 - def __init__(self): self._clientMgr = None self._svcMgr = None @@ -3147,7 +2460,7 @@ class MainExec: try: ret = self._clientMgr.run(self._svcMgr) # stop TAOS service inside except requests.exceptions.ConnectionError as err: - logger.warning("Failed to open REST connection to DB: {}".format(err.getMessage())) + Logging.warning("Failed to open REST connection to DB: {}".format(err.getMessage())) # don't raise return ret @@ -3255,20 +2568,7 @@ class MainExec: global gConfig gConfig = parser.parse_args() - # Logging Stuff - global logger - _logger = logging.getLogger('CrashGen') # real logger - _logger.addFilter(LoggingFilter()) - ch = logging.StreamHandler() - _logger.addHandler(ch) - - # Logging adapter, to be used as a logger - logger = MyLoggingAdapter(_logger, []) - - if (gConfig.debug): - logger.setLevel(logging.DEBUG) # default seems to be INFO - else: - logger.setLevel(logging.INFO) + Logging.clsInit(gConfig) Dice.seed(0) # initial seeding of dice diff --git a/tests/pytest/crash_gen/misc.py b/tests/pytest/crash_gen/misc.py new file mode 100644 index 0000000000..08e50e5070 --- /dev/null +++ b/tests/pytest/crash_gen/misc.py @@ -0,0 +1,133 @@ +import threading +import random +import logging + + +class CrashGenError(Exception): + def __init__(self, msg=None, errno=None): + self.msg = msg + self.errno = errno + + def __str__(self): + return self.msg + + +class LoggingFilter(logging.Filter): + def filter(self, record: logging.LogRecord): + if (record.levelno >= logging.INFO): + return True # info or above always log + + # Commenting out 
below to adjust... + + # if msg.startswith("[TRD]"): + # return False + return True + + +class MyLoggingAdapter(logging.LoggerAdapter): + def process(self, msg, kwargs): + return "[{}]{}".format(threading.get_ident() % 10000, msg), kwargs + # return '[%s] %s' % (self.extra['connid'], msg), kwargs + + +class Logging: + logger = None + + @classmethod + def getLogger(cls): + return logger + + @classmethod + def clsInit(cls, gConfig): # TODO: refactor away gConfig + if cls.logger: + return + + # Logging Stuff + # global misc.logger + _logger = logging.getLogger('CrashGen') # real logger + _logger.addFilter(LoggingFilter()) + ch = logging.StreamHandler() + _logger.addHandler(ch) + + # Logging adapter, to be used as a logger + print("setting logger variable") + # global logger + cls.logger = MyLoggingAdapter(_logger, []) + + if (gConfig.debug): + cls.logger.setLevel(logging.DEBUG) # default seems to be INFO + else: + cls.logger.setLevel(logging.INFO) + + @classmethod + def info(cls, msg): + cls.logger.info(msg) + + @classmethod + def debug(cls, msg): + cls.logger.debug(msg) + + @classmethod + def warning(cls, msg): + cls.logger.warning(msg) + +class Status: + STATUS_STARTING = 1 + STATUS_RUNNING = 2 + STATUS_STOPPING = 3 + STATUS_STOPPED = 4 + +# Deterministic random number generator +class Dice(): + seeded = False # static, uninitialized + + @classmethod + def seed(cls, s): # static + if (cls.seeded): + raise RuntimeError( + "Cannot seed the random generator more than once") + cls.verifyRNG() + random.seed(s) + cls.seeded = True # TODO: protect against multi-threading + + @classmethod + def verifyRNG(cls): # Verify that the RNG is determinstic + random.seed(0) + x1 = random.randrange(0, 1000) + x2 = random.randrange(0, 1000) + x3 = random.randrange(0, 1000) + if (x1 != 864 or x2 != 394 or x3 != 776): + raise RuntimeError("System RNG is not deterministic") + + @classmethod + def throw(cls, stop): # get 0 to stop-1 + return cls.throwRange(0, stop) + + @classmethod + def throwRange(cls, start, stop): # up to stop-1 + if (not cls.seeded): + raise RuntimeError("Cannot throw dice before seeding it") + return random.randrange(start, stop) + + @classmethod + def choice(cls, cList): + return random.choice(cList) + +class Helper: + @classmethod + def convertErrno(cls, errno): + return errno if (errno > 0) else 0x80000000 + errno + +class Progress: + STEP_BOUNDARY = 0 + BEGIN_THREAD_STEP = 1 + END_THREAD_STEP = 2 + tokens = { + STEP_BOUNDARY: '.', + BEGIN_THREAD_STEP: '[', + END_THREAD_STEP: '] ' + } + + @classmethod + def emit(cls, token): + print(cls.tokens[token], end="", flush=True) diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py new file mode 100644 index 0000000000..cdb12303a2 --- /dev/null +++ b/tests/pytest/crash_gen/service_manager.py @@ -0,0 +1,633 @@ +import os +import io +import sys +import threading +import signal +import logging +import time +import subprocess + +from typing import IO + +try: + import psutil +except: + print("Psutil module needed, please install: sudo pip3 install psutil") + sys.exit(-1) + +from queue import Queue, Empty +from .misc import Logging, Status, CrashGenError, Dice + +class TdeInstance(): + """ + A class to capture the *static* information of a TDengine instance, + including the location of the various files/directories, and basica + configuration. 
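+
+    Directory layout implied by the getters below (illustrative sketch; 'test'
+    is merely the default sub-directory name):
+
+        <buildDir>/test/             # getRunDir()
+            cfg/taos.cfg             # getCfgDir(), written by generateCfgFile()
+            log/                     # getLogDir(), renamed aside by rotateLogs()
+            data/                    # dataDir set in the generated taos.cfg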
+ """ + + @classmethod + def _getBuildPath(cls): + selfPath = os.path.dirname(os.path.realpath(__file__)) + if ("community" in selfPath): + projPath = selfPath[:selfPath.find("communit")] + else: + projPath = selfPath[:selfPath.find("tests")] + + buildPath = None + for root, dirs, files in os.walk(projPath): + if ("taosd" in files): + rootRealPath = os.path.dirname(os.path.realpath(root)) + if ("packaging" not in rootRealPath): + buildPath = root[:len(root) - len("/build/bin")] + break + if buildPath == None: + raise RuntimeError("Failed to determine buildPath, selfPath={}, projPath={}" + .format(selfPath, projPath)) + return buildPath + + def __init__(self, subdir='test'): + self._buildDir = self._getBuildPath() + self._subdir = '/' + subdir # TODO: tolerate "/" + + def __repr__(self): + return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) + + def generateCfgFile(self): + # print("Logger = {}".format(logger)) + # buildPath = self.getBuildPath() + # taosdPath = self._buildPath + "/build/bin/taosd" + + cfgDir = self.getCfgDir() + cfgFile = cfgDir + "/taos.cfg" # TODO: inquire if this is fixed + if os.path.exists(cfgFile): + if os.path.isfile(cfgFile): + Logging.warning("Config file exists already, skip creation: {}".format(cfgFile)) + return # cfg file already exists, nothing to do + else: + raise CrashGenError("Invalid config file: {}".format(cfgFile)) + # Now that the cfg file doesn't exist + if os.path.exists(cfgDir): + if not os.path.isdir(cfgDir): + raise CrashGenError("Invalid config dir: {}".format(cfgDir)) + # else: good path + else: + os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p" + # Now we have a good cfg dir + cfgValues = { + 'runDir': self.getRunDir(), + 'ip': '127.0.0.1', # TODO: change to a network addressable ip + 'port': 6030, + } + cfgTemplate = """ +dataDir {runDir}/data +logDir {runDir}/log + +charset UTF-8 + +firstEp {ip}:{port} +fqdn {ip} +serverPort {port} + +# was all 135 below +dDebugFlag 135 +cDebugFlag 135 +rpcDebugFlag 135 +qDebugFlag 135 +# httpDebugFlag 143 +# asyncLog 0 +# tables 10 +maxtablesPerVnode 10 +rpcMaxTime 101 +# cache 2 +keep 36500 +# walLevel 2 +walLevel 1 +# +# maxConnections 100 +""" + cfgContent = cfgTemplate.format_map(cfgValues) + f = open(cfgFile, "w") + f.write(cfgContent) + f.close() + + def rotateLogs(self): + logPath = self.getLogDir() + # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397 + if os.path.exists(logPath): + logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S') + Logging.info("Saving old log files to: {}".format(logPathSaved)) + os.rename(logPath, logPathSaved) + # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms + + + def getExecFile(self): # .../taosd + return self._buildDir + "/build/bin/taosd" + + def getRunDir(self): # TODO: rename to "root dir" ?! + return self._buildDir + self._subdir + + def getCfgDir(self): # path, not file + return self.getRunDir() + "/cfg" + + def getLogDir(self): + return self.getRunDir() + "/log" + + def getHostAddr(self): + return "127.0.0.1" + + def getServiceCommand(self): # to start the instance + return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen() + + +class TdeSubProcess: + """ + A class to to represent the actual sub process that is the run-time + of a TDengine instance. + + It takes a TdeInstance object as its parameter, with the rationale being + "a sub process runs an instance". 
+ """ + + def __init__(self, tInst : TdeInstance): + self.subProcess = None + if tInst is None: + raise CrashGenError("Empty instance not allowed in TdeSubProcess") + self._tInst = tInst # Default create at ServiceManagerThread + + def getStdOut(self): + return self.subProcess.stdout + + def getStdErr(self): + return self.subProcess.stderr + + def isRunning(self): + return self.subProcess is not None + + def getPid(self): + return self.subProcess.pid + + # Repalced by TdeInstance class + # def getBuildPath(self): + # selfPath = os.path.dirname(os.path.realpath(__file__)) + # if ("community" in selfPath): + # projPath = selfPath[:selfPath.find("communit")] + # else: + # projPath = selfPath[:selfPath.find("tests")] + + # for root, dirs, files in os.walk(projPath): + # if ("taosd" in files): + # rootRealPath = os.path.dirname(os.path.realpath(root)) + # if ("packaging" not in rootRealPath): + # buildPath = root[:len(root) - len("/build/bin")] + # break + # return buildPath + + def start(self): + ON_POSIX = 'posix' in sys.builtin_module_names + + # Sanity check + if self.subProcess: # already there + raise RuntimeError("Corrupt process state") + + # global gContainer + # tInst = gContainer.defTdeInstance = TdeInstance('test3') # creae the instance + self._tInst.generateCfgFile() # service side generates config file, client does not + + self._tInst.rotateLogs() + + print("Starting TDengine instance: {}".format(self._tInst)) + self.subProcess = subprocess.Popen( + self._tInst.getServiceCommand(), + shell=False, + # svcCmdSingle, shell=True, # capture core dump? + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + # bufsize=1, # not supported in binary mode + close_fds=ON_POSIX + ) # had text=True, which interferred with reading EOF + + def stop(self): + if not self.subProcess: + print("Sub process already stopped") + return -1 + + retCode = self.subProcess.poll() # contains real sub process return code + if retCode: # valid return code, process ended + self.subProcess = None + else: # process still alive, let's interrupt it + print( + "Sub process is running, sending SIG_INT and waiting for it to terminate...") + # sub process should end, then IPC queue should end, causing IO + # thread to end + self.subProcess.send_signal(signal.SIGINT) + try: + self.subProcess.wait(10) + retCode = self.subProcess.returncode + except subprocess.TimeoutExpired as err: + print("Time out waiting for TDengine service process to exit") + retCode = -3 + else: + print("TDengine service process terminated successfully from SIG_INT") + retCode = -4 + self.subProcess = None + return retCode + + +class ServiceManager: + PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process + + def __init__(self, numDnodes = 1): + Logging.info("TDengine Service Manager (TSM) created") + self._numDnodes = numDnodes # >1 means we have a cluster + # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec + # signal.signal(signal.SIGINT, self.sigIntHandler) + # signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler! 
+ + self.inSigHandler = False + # self._status = MainExec.STATUS_RUNNING # set inside + # _startTaosService() + self.svcMgrThreads = [] # type: List[ServiceManagerThread] + for i in range(0, numDnodes): + self.svcMgrThreads.append(ServiceManagerThread(i)) + + self._lock = threading.Lock() + # self._isRestarting = False + + def _doMenu(self): + choice = "" + while True: + print("\nInterrupting Service Program, Choose an Action: ") + print("1: Resume") + print("2: Terminate") + print("3: Restart") + # Remember to update the if range below + # print("Enter Choice: ", end="", flush=True) + while choice == "": + choice = input("Enter Choice: ") + if choice != "": + break # done with reading repeated input + if choice in ["1", "2", "3"]: + break # we are done with whole method + print("Invalid choice, please try again.") + choice = "" # reset + return choice + + def sigUsrHandler(self, signalNumber, frame): + print("Interrupting main thread execution upon SIGUSR1") + if self.inSigHandler: # already + print("Ignoring repeated SIG...") + return # do nothing if it's already not running + self.inSigHandler = True + + choice = self._doMenu() + if choice == "1": + self.sigHandlerResume() # TODO: can the sub-process be blocked due to us not reading from queue? + elif choice == "2": + self.stopTaosServices() + elif choice == "3": # Restart + self.restart() + else: + raise RuntimeError("Invalid menu choice: {}".format(choice)) + + self.inSigHandler = False + + def sigIntHandler(self, signalNumber, frame): + print("ServiceManager: INT Signal Handler starting...") + if self.inSigHandler: + print("Ignoring repeated SIG_INT...") + return + self.inSigHandler = True + + self.stopTaosServices() + print("ServiceManager: INT Signal Handler returning...") + self.inSigHandler = False + + def sigHandlerResume(self): + print("Resuming TDengine service manager (main thread)...\n\n") + + # def _updateThreadStatus(self): + # if self.svcMgrThread: # valid svc mgr thread + # if self.svcMgrThread.isStopped(): # done? + # self.svcMgrThread.procIpcBatch() # one last time. TODO: appropriate? + # self.svcMgrThread = None # no more + + def isActive(self): + """ + Determine if the service/cluster is active at all, i.e. at least + one thread is not "stopped". + """ + for thread in self.svcMgrThreads: + if not thread.isStopped(): + return True + return False + + # def isRestarting(self): + # """ + # Determine if the service/cluster is being "restarted", i.e., at least + # one thread is in "restarting" status + # """ + # for thread in self.svcMgrThreads: + # if thread.isRestarting(): + # return True + # return False + + def isStable(self): + """ + Determine if the service/cluster is "stable", i.e. all of the + threads are in "stable" status. + """ + for thread in self.svcMgrThreads: + if not thread.isStable(): + return False + return True + + def _procIpcAll(self): + while self.isActive(): + for thread in self.svcMgrThreads: # all thread objects should always be valid + # while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here + if thread.isRunning(): + thread.procIpcBatch() # regular processing, + if thread.isStopped(): + thread.procIpcBatch() # one last time? 
+ # self._updateThreadStatus() + elif thread.isRetarting(): + print("Service restarting...") + # else this thread is stopped + + time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round + # raise CrashGenError("dummy") + print("Service Manager Thread (with subprocess) ended, main thread exiting...") + + def startTaosServices(self): + with self._lock: + if self.isActive(): + raise RuntimeError("Cannot start TAOS service(s) when one/some may already be running") + + # Find if there's already a taosd service, and then kill it + for proc in psutil.process_iter(): + if proc.name() == 'taosd': + print("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupe") + time.sleep(2.0) + proc.kill() + # print("Process: {}".format(proc.name())) + + # self.svcMgrThread = ServiceManagerThread() # create the object + for thread in self.svcMgrThreads: + thread.start() + thread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines + + def stopTaosServices(self): + with self._lock: + if not self.isActive(): + Logging.warning("Cannot stop TAOS service(s), already not active") + return + + for thread in self.svcMgrThreads: + thread.stop() + + def run(self): + self.startTaosServices() + self._procIpcAll() # pump/process all the messages, may encounter SIG + restart + if self.isActive(): # if sig handler hasn't destroyed it by now + self.stopTaosServices() # should have started already + + def restart(self): + if not self.isStable(): + Logging.warning("Cannot restart service/cluster, when not stable") + return + + # self._isRestarting = True + if self.isActive(): + self.stopTaosServices() + else: + Logging.warning("Service not active when restart requested") + + self.startTaosService() + # self._isRestarting = False + + # def isRunning(self): + # return self.svcMgrThread != None + + # def isRestarting(self): + # return self._isRestarting + +class ServiceManagerThread: + """ + A class representing a dedicated thread which manages the "sub process" + of the TDengine service, interacting with its STDOUT/ERR. + + It takes a TdeInstance parameter at creation time, or create a default + """ + MAX_QUEUE_SIZE = 10000 + + def __init__(self, tInstNum = 0, tInst : TdeInstance = None): + # Set the sub process + self._tdeSubProcess = None # type: TdeSubProcess + + # Arrange the TDengine instance + self._tInstNum = tInstNum # instance serial number in cluster, ZERO based + self._tInst = tInst or TdeInstance() # Need an instance + + self._thread = None # The actual thread, # type: threading.Thread + self._status = Status.STATUS_STOPPED # The status of the underlying service, actually. 
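+    # Status lifecycle, as driven by the methods below:
+    #   STATUS_STOPPED --start()--> STATUS_STARTING
+    #     --svcOutputReader() sees TD_READY_MSG--> STATUS_RUNNING
+    #     --stop()--> STATUS_STOPPING --join()--> STATUS_STOPPED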
+ + def __repr__(self): + return "[SvcMgrThread: tInstNum={}]".format(self._tInstNum) + + def getStatus(self): + return self._status + + def isStarting(self): + return self._status == Status.STATUS_STARTING + + def isRunning(self): + # return self._thread and self._thread.is_alive() + return self._status == Status.STATUS_RUNNING + + def isStopping(self): + return self._status == Status.STATUS_STOPPING + + def isStopped(self): + return self._status == Status.STATUS_STOPPED + + def isStable(self): + return self.isRunning() or self.isStopped() + + # Start the thread (with sub process), and wait for the sub service + # to become fully operational + def start(self): + if self._thread: + raise RuntimeError("Unexpected _thread") + if self._tdeSubProcess: + raise RuntimeError("TDengine sub process already created/running") + + Logging.info("Attempting to start TAOS service: {}".format(self)) + + self._status = Status.STATUS_STARTING + self._tdeSubProcess = TdeSubProcess(self._tInst) + self._tdeSubProcess.start() + + self._ipcQueue = Queue() + self._thread = threading.Thread( # First thread captures server OUTPUT + target=self.svcOutputReader, + args=(self._tdeSubProcess.getStdOut(), self._ipcQueue)) + self._thread.daemon = True # thread dies with the program + self._thread.start() + + self._thread2 = threading.Thread( # 2nd thread captures server ERRORs + target=self.svcErrorReader, + args=(self._tdeSubProcess.getStdErr(), self._ipcQueue)) + self._thread2.daemon = True # thread dies with the program + self._thread2.start() + + # wait for service to start + for i in range(0, 100): + time.sleep(1.0) + # self.procIpcBatch() # don't pump message during start up + print("_zz_", end="", flush=True) + if self._status == Status.STATUS_RUNNING: + Logging.info("[] TDengine service READY to process requests") + Logging.info("[] TAOS service started: {}".format(self)) + return # now we've started + # TODO: handle failure-to-start better? + self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output + raise RuntimeError("TDengine service did not start successfully: {}".format(self)) + + def stop(self): + # can be called from both main thread or signal handler + print("Terminating TDengine service running as the sub process...") + if self.isStopped(): + print("Service already stopped") + return + if self.isStopping(): + print("Service is already being stopped") + return + # Linux will send Control-C generated SIGINT to the TDengine process + # already, ref: + # https://unix.stackexchange.com/questions/176235/fork-and-how-signals-are-delivered-to-processes + if not self._tdeSubProcess: + raise RuntimeError("sub process object missing") + + self._status = Status.STATUS_STOPPING + retCode = self._tdeSubProcess.stop() + print("Attempted to stop sub process, got return code: {}".format(retCode)) + if (retCode==-11): # SGV + Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)") + + if self._tdeSubProcess.isRunning(): # still running + print("FAILED to stop sub process, it is still running... pid = {}".format( + self._tdeSubProcess.getPid())) + else: + self._tdeSubProcess = None # not running any more + self.join() # stop the thread, change the status, etc. 
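+            # join() above also moves _status to STATUS_STOPPED, which the
+            # isStopped() check below relies on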
+ + # Check if it's really stopped + outputLines = 20 # for last output + if self.isStopped(): + self.procIpcBatch(outputLines) # one last time + print("End of TDengine Service Output: {}".format(self)) + print("----- TDengine Service (managed by SMT) is now terminated -----\n") + else: + print("WARNING: SMT did not terminate as expected: {}".format(self)) + + def join(self): + # TODO: sanity check + if not self.isStopping(): + raise RuntimeError( + "Unexpected status when ending svc mgr thread: {}".format( + self._status)) + + if self._thread: + self._thread.join() + self._thread = None + self._status = Status.STATUS_STOPPED + # STD ERR thread + self._thread2.join() + self._thread2 = None + else: + print("Joining empty thread, doing nothing") + + def _trimQueue(self, targetSize): + if targetSize <= 0: + return # do nothing + q = self._ipcQueue + if (q.qsize() <= targetSize): # no need to trim + return + + Logging.debug("Triming IPC queue to target size: {}".format(targetSize)) + itemsToTrim = q.qsize() - targetSize + for i in range(0, itemsToTrim): + try: + q.get_nowait() + except Empty: + break # break out of for loop, no more trimming + + TD_READY_MSG = "TDengine is initialized successfully" + + def procIpcBatch(self, trimToTarget=0, forceOutput=False): + self._trimQueue(trimToTarget) # trim if necessary + # Process all the output generated by the underlying sub process, + # managed by IO thread + print("<", end="", flush=True) + while True: + try: + line = self._ipcQueue.get_nowait() # getting output at fast speed + self._printProgress("_o") + except Empty: + # time.sleep(2.3) # wait only if there's no output + # no more output + print(".>", end="", flush=True) + return # we are done with THIS BATCH + else: # got line, printing out + if forceOutput: + Logging.info(line) + else: + Logging.debug(line) + print(">", end="", flush=True) + + _ProgressBars = ["--", "//", "||", "\\\\"] + + def _printProgress(self, msg): # TODO: assuming 2 chars + print(msg, end="", flush=True) + pBar = self._ProgressBars[Dice.throw(4)] + print(pBar, end="", flush=True) + print('\b\b\b\b', end="", flush=True) + + def svcOutputReader(self, out: IO, queue): + # Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python + # print("This is the svcOutput Reader...") + # for line in out : + for line in iter(out.readline, b''): + # print("Finished reading a line: {}".format(line)) + # print("Adding item to queue...") + try: + line = line.decode("utf-8").rstrip() + except UnicodeError: + print("\nNon-UTF8 server output: {}\n".format(line)) + + # This might block, and then causing "out" buffer to block + queue.put(line) + self._printProgress("_i") + + if self._status == Status.STATUS_STARTING: # we are starting, let's see if we have started + if line.find(self.TD_READY_MSG) != -1: # found + Logging.info("Waiting for the service to become FULLY READY") + time.sleep(1.0) # wait for the server to truly start. 
TODO: remove this + Logging.info("Service instance #{} is now FULLY READY".format(self._tInstNum)) + self._status = Status.STATUS_RUNNING + + # Trim the queue if necessary: TODO: try this 1 out of 10 times + self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10) # trim to 90% size + + if self.isStopping(): # TODO: use thread status instead + # WAITING for stopping sub process to finish its outptu + print("_w", end="", flush=True) + + # queue.put(line) + # meaning sub process must have died + print("\nNo more output from IO thread managing TDengine service") + out.close() + + def svcErrorReader(self, err: IO, queue): + for line in iter(err.readline, b''): + print("\nTDengine Service (taosd) ERROR (from stderr): {}".format(line)) From 44b6dd9a3f9d7b5b59e33f3ebf609edd5049f16f Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Wed, 21 Oct 2020 22:57:49 +0000 Subject: [PATCH 05/16] TD-1720 --- src/query/src/qExtbuffer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/query/src/qExtbuffer.c b/src/query/src/qExtbuffer.c index fc9c60b39b..17be294531 100644 --- a/src/query/src/qExtbuffer.c +++ b/src/query/src/qExtbuffer.c @@ -344,8 +344,6 @@ static FORCE_INLINE int32_t primaryKeyComparator(int64_t f1, int64_t f2, int32_t return 0; } - assert(colIdx == 0); - if (tsOrder == TSDB_ORDER_DESC) { // primary column desc order return (f1 < f2) ? 1 : -1; } else { // asc From 3c81c340323a3406e8e4917b505b070c2cbb2c69 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Thu, 22 Oct 2020 06:05:18 +0000 Subject: [PATCH 06/16] Enhanced crash_gen tool to run multiple instances concurrently, by using dynamic names for tables and databases --- tests/pytest/crash_gen/crash_gen.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index f369f5a3e8..8b1d79b811 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -484,8 +484,10 @@ class ThreadCoordinator: if gConfig.max_dbs == 0: self._dbs.append(Database(0, dbc)) else: + baseDbNumber = 0 if gConfig.dynamic_db_table_names else int(datetime.datetime.now( + ).timestamp()) % 888 # Don't use Dice/random, as they are deterministic for i in range(gConfig.max_dbs): - self._dbs.append(Database(i, dbc)) + self._dbs.append(Database(baseDbNumber + i, dbc)) def pickDatabase(self): idxDb = 0 @@ -1793,7 +1795,7 @@ class ExecutionStats: "FAILED (reason: {})".format( self._failureReason) if self._failed else "SUCCEEDED")) Logging.info("| Task Execution Times (success/total):") - execTimesAny = 0.001 # avoid div by zero + execTimesAny = 0.0 for k, n in self._execTimes.items(): execTimesAny += n[0] errStr = None @@ -1834,11 +1836,14 @@ class StateTransitionTask(Task): LARGE_NUMBER_OF_RECORDS = 50 SMALL_NUMBER_OF_RECORDS = 3 + _baseTableNumber = None + + _endState = None + @classmethod def getInfo(cls): # each sub class should supply their own information raise RuntimeError("Overriding method expected") - - _endState = None + @classmethod def getEndState(cls): # TODO: optimize by calling it fewer times raise RuntimeError("Overriding method expected") @@ -1858,7 +1863,9 @@ class StateTransitionTask(Task): @classmethod def getRegTableName(cls, i): - return "reg_table_{}".format(i) + if ( StateTransitionTask._baseTableNumber is None): + StateTransitionTask._baseTableNumber = 0 if gConfig.dynamic_db_table_names else Dice.throw(999) + return "reg_table_{}".format(StateTransitionTask._baseTableNumber + i) def execute(self, wt: WorkerThread): 
super().execute(wt) @@ -2477,6 +2484,9 @@ class MainExec: global gContainer gContainer = Container() # micky-mouse DI + global gSvcMgr # TODO: refactor away + gSvcMgr = None + # Super cool Python argument library: # https://docs.python.org/3/library/argparse.html parser = argparse.ArgumentParser( @@ -2530,6 +2540,12 @@ class MainExec: '--larger-data', action='store_true', help='Write larger amount of data during write operations (default: false)') + parser.add_argument( + '-n', + '--dynamic-db-table-names', + action='store_true', + help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)') + parser.add_argument( '-p', '--per-thread-db-connection', From 871b9d47ec42f59ee1d7e8618bdaae99f599bf4f Mon Sep 17 00:00:00 2001 From: Steven Li Date: Thu, 22 Oct 2020 06:19:31 +0000 Subject: [PATCH 07/16] Minor crash_gen tool tweaks --- tests/pytest/crash_gen/crash_gen.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 8b1d79b811..ee4aa5bb77 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -483,9 +483,9 @@ class ThreadCoordinator: dbc = self.getDbManager().getDbConn() if gConfig.max_dbs == 0: self._dbs.append(Database(0, dbc)) - else: - baseDbNumber = 0 if gConfig.dynamic_db_table_names else int(datetime.datetime.now( - ).timestamp()) % 888 # Don't use Dice/random, as they are deterministic + else: + baseDbNumber = int(datetime.datetime.now().timestamp( # Don't use Dice/random, as they are deterministic + )) % 888 if gConfig.dynamic_db_table_names else 0 for i in range(gConfig.max_dbs): self._dbs.append(Database(baseDbNumber + i, dbc)) @@ -1864,7 +1864,8 @@ class StateTransitionTask(Task): @classmethod def getRegTableName(cls, i): if ( StateTransitionTask._baseTableNumber is None): - StateTransitionTask._baseTableNumber = 0 if gConfig.dynamic_db_table_names else Dice.throw(999) + StateTransitionTask._baseTableNumber = Dice.throw( + 999) if gConfig.dynamic_db_table_names else 0 return "reg_table_{}".format(StateTransitionTask._baseTableNumber + i) def execute(self, wt: WorkerThread): From 7eb3d6e33ac5c7f7f1255fe8999706911663e6d6 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Thu, 22 Oct 2020 09:12:24 +0000 Subject: [PATCH 08/16] Enhanced crash_gen tool to accept/tolerate additional errors based on command line input --- tests/pytest/crash_gen/crash_gen.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index ee4aa5bb77..ccfee43ba5 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -1620,11 +1620,12 @@ class Task(): if errno in [ 0x05, # TSDB_CODE_RPC_NOT_READY 0x0B, # Unable to establish connection, more details in TD-1648 - # 0x200, # invalid SQL, TODO: re-examine with TD-934 + 0x200, # invalid SQL, TODO: re-examine with TD-934 0x217, # "db not selected", client side defined error code - 0x218, # "Table does not exist" client side defined error code - 0x360, 0x362, - 0x369, # tag already exists + # 0x218, # "Table does not exist" client side defined error code + 0x360, # Table already exists + 0x362, + # 0x369, # tag already exists 0x36A, 0x36B, 0x36D, 0x381, 0x380, # "db not selected" @@ -1637,8 +1638,13 @@ class Task(): 1000 # REST catch-all error ]: return True # These are the ALWAYS-ACCEPTABLE ones - elif (errno in [ 0x0B ]) and 
gConfig.auto_start_service: - return True # We may get "network unavilable" when restarting service + # This case handled below already. + # elif (errno in [ 0x0B ]) and gConfig.auto_start_service: + # return True # We may get "network unavilable" when restarting service + elif gConfig.ignore_errors: # something is specified on command line + moreErrnos = [int(v, 0) for v in gConfig.ignore_errors.split(',')] + if errno in moreErrnos: + return True elif errno == 0x200 : # invalid SQL, we need to div in a bit more if msg.find("invalid column name") != -1: return True @@ -2529,6 +2535,13 @@ class MainExec: '--run-tdengine', action='store_true', help='Run TDengine service in foreground (default: false)') + parser.add_argument( + '-g', + '--ignore-errors', + action='store', + default=None, + type=str, + help='Ignore error codes, comma separated, 0x supported (default: None)') parser.add_argument( '-i', '--max-replicas', @@ -2545,8 +2558,7 @@ class MainExec: '-n', '--dynamic-db-table-names', action='store_true', - help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)') - + help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)') parser.add_argument( '-p', '--per-thread-db-connection', From 69cf5cf56e9902432bc3a502bbda334dbdb2d602 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Thu, 22 Oct 2020 09:17:48 +0000 Subject: [PATCH 09/16] Minor fix to crash_gen tool, allowing simultaneous start of multiple executions --- tests/pytest/crash_gen/crash_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index ccfee43ba5..4ec8e48582 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -485,7 +485,7 @@ class ThreadCoordinator: self._dbs.append(Database(0, dbc)) else: baseDbNumber = int(datetime.datetime.now().timestamp( # Don't use Dice/random, as they are deterministic - )) % 888 if gConfig.dynamic_db_table_names else 0 + )*333) % 888 if gConfig.dynamic_db_table_names else 0 for i in range(gConfig.max_dbs): self._dbs.append(Database(baseDbNumber + i, dbc)) From f7a0b6b89b81e43e2730201e9c9bb7387bdfcd97 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Fri, 23 Oct 2020 06:57:36 +0000 Subject: [PATCH 10/16] Enhanced crash_gen tool to verify dnode being in ready status, plus additional refactoring --- tests/pytest/crash_gen/crash_gen.py | 418 +-------------------- tests/pytest/crash_gen/db.py | 426 ++++++++++++++++++++++ tests/pytest/crash_gen/service_manager.py | 62 +++- 3 files changed, 490 insertions(+), 416 deletions(-) create mode 100644 tests/pytest/crash_gen/db.py diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 4ec8e48582..2d52d274c3 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -14,22 +14,17 @@ # For type hinting before definition, ref: # https://stackoverflow.com/questions/33533148/how-do-i-specify-that-the-return-type-of-a-method-is-the-same-as-the-class-itsel from __future__ import annotations -import taos -from util.sql import * -from util.cases import * -from util.dnodes import * -from util.log import * + from typing import Set from typing import Dict from typing import List -from requests.auth import HTTPBasicAuth + import textwrap import time import datetime import random import logging import threading -import requests import copy import argparse import getopt @@ -44,6 +39,10 @@ import gc from 
.service_manager import ServiceManager, TdeInstance from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress +from .db import DbConn, MyTDSql, DbConnNative, DbManager + +import taos +import requests # Require Python 3 if sys.version_info[0] < 3: @@ -78,10 +77,11 @@ class WorkerThread: # Let us have a DB connection of our own if (gConfig.per_thread_db_connection): # type: ignore # print("connector_type = {}".format(gConfig.connector_type)) - if gConfig.connector_type == 'native': - self._dbConn = DbConn.createNative() + tInst = gContainer.defTdeInstance + if gConfig.connector_type == 'native': + self._dbConn = DbConn.createNative(tInst.getDbTarget()) elif gConfig.connector_type == 'rest': - self._dbConn = DbConn.createRest() + self._dbConn = DbConn.createRest(tInst.getDbTarget()) elif gConfig.connector_type == 'mixed': if Dice.throw(2) == 0: # 1/2 chance self._dbConn = DbConn.createNative() @@ -505,7 +505,7 @@ class ThreadCoordinator: # pick a task type for current state db = self.pickDatabase() - taskType = db.getStateMachine().pickTaskType() # type: Task + taskType = db.getStateMachine().pickTaskType() # dynamic name of class return taskType(self._execStats, db) # create a task from it def resetExecutedTasks(self): @@ -619,342 +619,6 @@ class LinearQueue(): return ret -class DbConn: - TYPE_NATIVE = "native-c" - TYPE_REST = "rest-api" - TYPE_INVALID = "invalid" - - @classmethod - def create(cls, connType): - if connType == cls.TYPE_NATIVE: - return DbConnNative() - elif connType == cls.TYPE_REST: - return DbConnRest() - else: - raise RuntimeError( - "Unexpected connection type: {}".format(connType)) - - @classmethod - def createNative(cls): - return cls.create(cls.TYPE_NATIVE) - - @classmethod - def createRest(cls): - return cls.create(cls.TYPE_REST) - - def __init__(self): - self.isOpen = False - self._type = self.TYPE_INVALID - self._lastSql = None - - def getLastSql(self): - return self._lastSql - - def open(self): - if (self.isOpen): - raise RuntimeError("Cannot re-open an existing DB connection") - - # below implemented by child classes - self.openByType() - - Logging.debug("[DB] data connection opened, type = {}".format(self._type)) - self.isOpen = True - - def close(self): - raise RuntimeError("Unexpected execution, should be overriden") - - def queryScalar(self, sql) -> int: - return self._queryAny(sql) - - def queryString(self, sql) -> str: - return self._queryAny(sql) - - def _queryAny(self, sql): # actual query result as an int - if (not self.isOpen): - raise RuntimeError("Cannot query database until connection is open") - nRows = self.query(sql) - if nRows != 1: - raise taos.error.ProgrammingError( - "Unexpected result for query: {}, rows = {}".format(sql, nRows), - (0x991 if nRows==0 else 0x992) - ) - if self.getResultRows() != 1 or self.getResultCols() != 1: - raise RuntimeError("Unexpected result set for query: {}".format(sql)) - return self.getQueryResult()[0][0] - - def use(self, dbName): - self.execute("use {}".format(dbName)) - - def existsDatabase(self, dbName: str): - ''' Check if a certain database exists ''' - self.query("show databases") - dbs = [v[0] for v in self.getQueryResult()] # ref: https://stackoverflow.com/questions/643823/python-list-transformation - # ret2 = dbName in dbs - # print("dbs = {}, str = {}, ret2={}, type2={}".format(dbs, dbName,ret2, type(dbName))) - return dbName in dbs # TODO: super weird type mangling seen, once here - - def hasTables(self): - return self.query("show tables") > 0 - - def execute(self, sql): - ''' Return the 
number of rows affected''' - raise RuntimeError("Unexpected execution, should be overriden") - - def safeExecute(self, sql): - '''Safely execute any SQL query, returning True/False upon success/failure''' - try: - self.execute(sql) - return True # ignore num of results, return success - except taos.error.ProgrammingError as err: - return False # failed, for whatever TAOS reason - # Not possile to reach here, non-TAOS exception would have been thrown - - def query(self, sql) -> int: # return num rows returned - ''' Return the number of rows affected''' - raise RuntimeError("Unexpected execution, should be overriden") - - def openByType(self): - raise RuntimeError("Unexpected execution, should be overriden") - - def getQueryResult(self): - raise RuntimeError("Unexpected execution, should be overriden") - - def getResultRows(self): - raise RuntimeError("Unexpected execution, should be overriden") - - def getResultCols(self): - raise RuntimeError("Unexpected execution, should be overriden") - -# Sample: curl -u root:taosdata -d "show databases" localhost:6020/rest/sql - - -class DbConnRest(DbConn): - def __init__(self): - super().__init__() - self._type = self.TYPE_REST - self._url = "http://localhost:6041/rest/sql" # fixed for now - self._result = None - - def openByType(self): # Open connection - pass # do nothing, always open - - def close(self): - if (not self.isOpen): - raise RuntimeError("Cannot clean up database until connection is open") - # Do nothing for REST - Logging.debug("[DB] REST Database connection closed") - self.isOpen = False - - def _doSql(self, sql): - self._lastSql = sql # remember this, last SQL attempted - try: - r = requests.post(self._url, - data = sql, - auth = HTTPBasicAuth('root', 'taosdata')) - except: - print("REST API Failure (TODO: more info here)") - raise - rj = r.json() - # Sanity check for the "Json Result" - if ('status' not in rj): - raise RuntimeError("No status in REST response") - - if rj['status'] == 'error': # clearly reported error - if ('code' not in rj): # error without code - raise RuntimeError("REST error return without code") - errno = rj['code'] # May need to massage this in the future - # print("Raising programming error with REST return: {}".format(rj)) - raise taos.error.ProgrammingError( - rj['desc'], errno) # todo: check existance of 'desc' - - if rj['status'] != 'succ': # better be this - raise RuntimeError( - "Unexpected REST return status: {}".format( - rj['status'])) - - nRows = rj['rows'] if ('rows' in rj) else 0 - self._result = rj - return nRows - - def execute(self, sql): - if (not self.isOpen): - raise RuntimeError( - "Cannot execute database commands until connection is open") - Logging.debug("[SQL-REST] Executing SQL: {}".format(sql)) - nRows = self._doSql(sql) - Logging.debug( - "[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql)) - return nRows - - def query(self, sql): # return rows affected - return self.execute(sql) - - def getQueryResult(self): - return self._result['data'] - - def getResultRows(self): - print(self._result) - raise RuntimeError("TBD") - # return self._tdSql.queryRows - - def getResultCols(self): - print(self._result) - raise RuntimeError("TBD") - - # Duplicate code from TDMySQL, TODO: merge all this into DbConnNative - - -class MyTDSql: - # Class variables - _clsLock = threading.Lock() # class wide locking - longestQuery = None # type: str - longestQueryTime = 0.0 # seconds - lqStartTime = 0.0 - # lqEndTime = 0.0 # Not needed, as we have the two above already - - def __init__(self, 
hostAddr, cfgPath): - # Make the DB connection - self._conn = taos.connect(host=hostAddr, config=cfgPath) - self._cursor = self._conn.cursor() - - self.queryRows = 0 - self.queryCols = 0 - self.affectedRows = 0 - - # def init(self, cursor, log=True): - # self.cursor = cursor - # if (log): - # caller = inspect.getframeinfo(inspect.stack()[1][0]) - # self.cursor.log(caller.filename + ".sql") - - def close(self): - self._cursor.close() # can we double close? - self._conn.close() # TODO: very important, cursor close does NOT close DB connection! - self._cursor.close() - - def _execInternal(self, sql): - startTime = time.time() - ret = self._cursor.execute(sql) - # print("\nSQL success: {}".format(sql)) - queryTime = time.time() - startTime - # Record the query time - cls = self.__class__ - if queryTime > (cls.longestQueryTime + 0.01) : - with cls._clsLock: - cls.longestQuery = sql - cls.longestQueryTime = queryTime - cls.lqStartTime = startTime - return ret - - def query(self, sql): - self.sql = sql - try: - self._execInternal(sql) - self.queryResult = self._cursor.fetchall() - self.queryRows = len(self.queryResult) - self.queryCols = len(self._cursor.description) - except Exception as e: - # caller = inspect.getframeinfo(inspect.stack()[1][0]) - # args = (caller.filename, caller.lineno, sql, repr(e)) - # tdLog.exit("%s(%d) failed: sql:%s, %s" % args) - raise - return self.queryRows - - def execute(self, sql): - self.sql = sql - try: - self.affectedRows = self._execInternal(sql) - except Exception as e: - # caller = inspect.getframeinfo(inspect.stack()[1][0]) - # args = (caller.filename, caller.lineno, sql, repr(e)) - # tdLog.exit("%s(%d) failed: sql:%s, %s" % args) - raise - return self.affectedRows - -class DbConnNative(DbConn): - # Class variables - _lock = threading.Lock() - _connInfoDisplayed = False - totalConnections = 0 # Not private - - def __init__(self): - super().__init__() - self._type = self.TYPE_NATIVE - self._conn = None - # self._cursor = None - - def openByType(self): # Open connection - global gContainer - tdeInstance = gContainer.defTdeInstance # set up in ClientManager, type: TdeInstance - # cfgPath = self.getBuildPath() + "/test/cfg" - cfgPath = tdeInstance.getCfgDir() - hostAddr = tdeInstance.getHostAddr() - - cls = self.__class__ # Get the class, to access class variables - with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! 
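For reference, the REST connector being relocated here talks to taosd's REST gateway by POSTing raw SQL with HTTP basic auth and checking the JSON reply for status == 'succ'. A self-contained sketch of that exchange, using the same endpoint and default credentials that appear in the code above (only run it against a disposable test instance with the REST service enabled):

```python
import requests
from requests.auth import HTTPBasicAuth

def rest_sql(sql: str, host: str = "localhost", port: int = 6041):
    """POST one SQL statement to the TDengine REST gateway and return the data rows."""
    url = "http://{}:{}/rest/sql".format(host, port)
    r = requests.post(url, data=sql, auth=HTTPBasicAuth("root", "taosdata"))
    rj = r.json()
    if rj.get("status") != "succ":   # anything else is treated as an error by the connector
        raise RuntimeError("REST call failed: {}".format(rj))
    return rj.get("data", [])

# Example (requires a running taosd with its REST service on port 6041):
# print(rest_sql("show databases"))
```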
- if not cls._connInfoDisplayed: - cls._connInfoDisplayed = True # updating CLASS variable - Logging.info("Initiating TAOS native connection to {}, using config at {}".format(hostAddr, cfgPath)) - # Make the connection - # self._conn = taos.connect(host=hostAddr, config=cfgPath) # TODO: make configurable - # self._cursor = self._conn.cursor() - # Record the count in the class - self._tdSql = MyTDSql(hostAddr, cfgPath) # making DB connection - cls.totalConnections += 1 - - self._tdSql.execute('reset query cache') - # self._cursor.execute('use db') # do this at the beginning of every - - # Open connection - # self._tdSql = MyTDSql() - # self._tdSql.init(self._cursor) - - def close(self): - if (not self.isOpen): - raise RuntimeError("Cannot clean up database until connection is open") - self._tdSql.close() - # Decrement the class wide counter - cls = self.__class__ # Get the class, to access class variables - with cls._lock: - cls.totalConnections -= 1 - - Logging.debug("[DB] Database connection closed") - self.isOpen = False - - def execute(self, sql): - if (not self.isOpen): - raise RuntimeError("Cannot execute database commands until connection is open") - Logging.debug("[SQL] Executing SQL: {}".format(sql)) - self._lastSql = sql - nRows = self._tdSql.execute(sql) - Logging.debug( - "[SQL] Execution Result, nRows = {}, SQL = {}".format( - nRows, sql)) - return nRows - - def query(self, sql): # return rows affected - if (not self.isOpen): - raise RuntimeError( - "Cannot query database until connection is open") - Logging.debug("[SQL] Executing SQL: {}".format(sql)) - self._lastSql = sql - nRows = self._tdSql.query(sql) - Logging.debug( - "[SQL] Query Result, nRows = {}, SQL = {}".format( - nRows, sql)) - return nRows - # results are in: return self._tdSql.queryResult - - def getQueryResult(self): - return self._tdSql.queryResult - - def getResultRows(self): - return self._tdSql.queryRows - - def getResultCols(self): - return self._tdSql.queryCols - - class AnyState: STATE_INVALID = -1 STATE_EMPTY = 0 # nothing there, no even a DB @@ -1439,64 +1103,6 @@ class Database: return ret -class DbManager(): - ''' This is a wrapper around DbConn(), to make it easier to use. - - TODO: rename this to DbConnManager - ''' - def __init__(self): - self.tableNumQueue = LinearQueue() # TODO: delete? 
- # self.openDbServerConnection() - self._dbConn = DbConn.createNative() if ( - gConfig.connector_type == 'native') else DbConn.createRest() - try: - self._dbConn.open() # may throw taos.error.ProgrammingError: disconnected - except taos.error.ProgrammingError as err: - # print("Error type: {}, msg: {}, value: {}".format(type(err), err.msg, err)) - if (err.msg == 'client disconnected'): # cannot open DB connection - print( - "Cannot establish DB connection, please re-run script without parameter, and follow the instructions.") - sys.exit(2) - else: - print("Failed to connect to DB, errno = {}, msg: {}" - .format(Helper.convertErrno(err.errno), err.msg)) - raise - except BaseException: - print("[=] Unexpected exception") - raise - - # Do this after dbConn is in proper shape - # Moved to Database() - # self._stateMachine = StateMechine(self._dbConn) - - def getDbConn(self): - return self._dbConn - - # TODO: not used any more, to delete - def pickAndAllocateTable(self): # pick any table, and "use" it - return self.tableNumQueue.pickAndAllocate() - - # TODO: Not used any more, to delete - def addTable(self): - with self._lock: - tIndex = self.tableNumQueue.push() - return tIndex - - # Not used any more, to delete - def releaseTable(self, i): # return the table back, so others can use it - self.tableNumQueue.release(i) - - # TODO: not used any more, delete - def getTableNameToDelete(self): - tblNum = self.tableNumQueue.pop() # TODO: race condition! - if (not tblNum): # maybe false - return False - - return "table_{}".format(tblNum) - - def cleanUp(self): - self._dbConn.close() - class TaskExecutor(): class BoundedList: def __init__(self, size=10): @@ -2402,7 +2008,7 @@ class ClientManager: global gContainer tInst = gContainer.defTdeInstance = TdeInstance() # "subdir to hold the instance" - dbManager = DbManager() # Regular function + dbManager = DbManager(gConfig.connector_type, tInst.getDbTarget()) # Regular function thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps) self.tc = ThreadCoordinator(thPool, dbManager) diff --git a/tests/pytest/crash_gen/db.py b/tests/pytest/crash_gen/db.py new file mode 100644 index 0000000000..5404382bf0 --- /dev/null +++ b/tests/pytest/crash_gen/db.py @@ -0,0 +1,426 @@ +from __future__ import annotations + +import sys +import time +import threading +import requests +from requests.auth import HTTPBasicAuth + +import taos +from util.sql import * +from util.cases import * +from util.dnodes import * +from util.log import * + +from .misc import Logging, CrashGenError, Helper +# from .service_manager import TdeInstance + +class DbConn: + TYPE_NATIVE = "native-c" + TYPE_REST = "rest-api" + TYPE_INVALID = "invalid" + + @classmethod + def create(cls, connType, dbTarget): + if connType == cls.TYPE_NATIVE: + return DbConnNative(dbTarget) + elif connType == cls.TYPE_REST: + return DbConnRest(dbTarget) + else: + raise RuntimeError( + "Unexpected connection type: {}".format(connType)) + + @classmethod + def createNative(cls, dbTarget) -> DbConn: + return cls.create(cls.TYPE_NATIVE, dbTarget) + + @classmethod + def createRest(cls, dbTarget) -> DbConn: + return cls.create(cls.TYPE_REST, dbTarget) + + def __init__(self, dbTarget): + self.isOpen = False + self._type = self.TYPE_INVALID + self._lastSql = None + self._dbTarget = dbTarget + + def getLastSql(self): + return self._lastSql + + def open(self): + if (self.isOpen): + raise RuntimeError("Cannot re-open an existing DB connection") + + # below implemented by child classes + self.openByType() + + Logging.debug("[DB] 
data connection opened, type = {}".format(self._type)) + self.isOpen = True + + def close(self): + raise RuntimeError("Unexpected execution, should be overriden") + + def queryScalar(self, sql) -> int: + return self._queryAny(sql) + + def queryString(self, sql) -> str: + return self._queryAny(sql) + + def _queryAny(self, sql): # actual query result as an int + if (not self.isOpen): + raise RuntimeError("Cannot query database until connection is open") + nRows = self.query(sql) + if nRows != 1: + raise taos.error.ProgrammingError( + "Unexpected result for query: {}, rows = {}".format(sql, nRows), + (0x991 if nRows==0 else 0x992) + ) + if self.getResultRows() != 1 or self.getResultCols() != 1: + raise RuntimeError("Unexpected result set for query: {}".format(sql)) + return self.getQueryResult()[0][0] + + def use(self, dbName): + self.execute("use {}".format(dbName)) + + def existsDatabase(self, dbName: str): + ''' Check if a certain database exists ''' + self.query("show databases") + dbs = [v[0] for v in self.getQueryResult()] # ref: https://stackoverflow.com/questions/643823/python-list-transformation + # ret2 = dbName in dbs + # print("dbs = {}, str = {}, ret2={}, type2={}".format(dbs, dbName,ret2, type(dbName))) + return dbName in dbs # TODO: super weird type mangling seen, once here + + def hasTables(self): + return self.query("show tables") > 0 + + def execute(self, sql): + ''' Return the number of rows affected''' + raise RuntimeError("Unexpected execution, should be overriden") + + def safeExecute(self, sql): + '''Safely execute any SQL query, returning True/False upon success/failure''' + try: + self.execute(sql) + return True # ignore num of results, return success + except taos.error.ProgrammingError as err: + return False # failed, for whatever TAOS reason + # Not possile to reach here, non-TAOS exception would have been thrown + + def query(self, sql) -> int: # return num rows returned + ''' Return the number of rows affected''' + raise RuntimeError("Unexpected execution, should be overriden") + + def openByType(self): + raise RuntimeError("Unexpected execution, should be overriden") + + def getQueryResult(self): + raise RuntimeError("Unexpected execution, should be overriden") + + def getResultRows(self): + raise RuntimeError("Unexpected execution, should be overriden") + + def getResultCols(self): + raise RuntimeError("Unexpected execution, should be overriden") + +# Sample: curl -u root:taosdata -d "show databases" localhost:6020/rest/sql + + +class DbConnRest(DbConn): + REST_PORT_INCREMENT = 11 + + def __init__(self, dbTarget: DbTarget): + super().__init__(dbTarget) + self._type = self.TYPE_REST + restPort = dbTarget.port + 11 + self._url = "http://{}:{}/rest/sql".format( + dbTarget.hostAddr, dbTarget.port + self.REST_PORT_INCREMENT) + self._result = None + + def openByType(self): # Open connection + pass # do nothing, always open + + def close(self): + if (not self.isOpen): + raise RuntimeError("Cannot clean up database until connection is open") + # Do nothing for REST + Logging.debug("[DB] REST Database connection closed") + self.isOpen = False + + def _doSql(self, sql): + self._lastSql = sql # remember this, last SQL attempted + try: + r = requests.post(self._url, + data = sql, + auth = HTTPBasicAuth('root', 'taosdata')) + except: + print("REST API Failure (TODO: more info here)") + raise + rj = r.json() + # Sanity check for the "Json Result" + if ('status' not in rj): + raise RuntimeError("No status in REST response") + + if rj['status'] == 'error': # clearly reported 
error + if ('code' not in rj): # error without code + raise RuntimeError("REST error return without code") + errno = rj['code'] # May need to massage this in the future + # print("Raising programming error with REST return: {}".format(rj)) + raise taos.error.ProgrammingError( + rj['desc'], errno) # todo: check existance of 'desc' + + if rj['status'] != 'succ': # better be this + raise RuntimeError( + "Unexpected REST return status: {}".format( + rj['status'])) + + nRows = rj['rows'] if ('rows' in rj) else 0 + self._result = rj + return nRows + + def execute(self, sql): + if (not self.isOpen): + raise RuntimeError( + "Cannot execute database commands until connection is open") + Logging.debug("[SQL-REST] Executing SQL: {}".format(sql)) + nRows = self._doSql(sql) + Logging.debug( + "[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql)) + return nRows + + def query(self, sql): # return rows affected + return self.execute(sql) + + def getQueryResult(self): + return self._result['data'] + + def getResultRows(self): + print(self._result) + raise RuntimeError("TBD") # TODO: finish here to support -v under -c rest + # return self._tdSql.queryRows + + def getResultCols(self): + print(self._result) + raise RuntimeError("TBD") + + # Duplicate code from TDMySQL, TODO: merge all this into DbConnNative + + +class MyTDSql: + # Class variables + _clsLock = threading.Lock() # class wide locking + longestQuery = None # type: str + longestQueryTime = 0.0 # seconds + lqStartTime = 0.0 + # lqEndTime = 0.0 # Not needed, as we have the two above already + + def __init__(self, hostAddr, cfgPath): + # Make the DB connection + self._conn = taos.connect(host=hostAddr, config=cfgPath) + self._cursor = self._conn.cursor() + + self.queryRows = 0 + self.queryCols = 0 + self.affectedRows = 0 + + # def init(self, cursor, log=True): + # self.cursor = cursor + # if (log): + # caller = inspect.getframeinfo(inspect.stack()[1][0]) + # self.cursor.log(caller.filename + ".sql") + + def close(self): + self._cursor.close() # can we double close? + self._conn.close() # TODO: very important, cursor close does NOT close DB connection! 
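One detail worth calling out in MyTDSql above: it remembers the slowest SQL statement seen by any worker thread, using class-level fields guarded by a class-wide lock and a small 0.01 s guard band. A condensed, standalone sketch of that bookkeeping (the executor passed in is a stand-in, not the real cursor):

```python
import threading
import time

class SlowQueryTracker:
    _cls_lock = threading.Lock()   # class-wide lock, shared by every thread
    longest_query = None           # slowest SQL text seen so far
    longest_query_time = 0.0       # and how long it took, in seconds

    @classmethod
    def timed(cls, sql, execute):
        """Run execute(sql) and remember it if it is clearly the slowest statement so far."""
        start = time.time()
        ret = execute(sql)
        elapsed = time.time() - start
        if elapsed > cls.longest_query_time + 0.01:   # ignore ties within 10 ms
            with cls._cls_lock:
                cls.longest_query = sql
                cls.longest_query_time = elapsed
        return ret

# Stand-in executor, just to show the call shape:
print(SlowQueryTracker.timed("select 1", lambda s: len(s)))
```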
+ self._cursor.close() + + def _execInternal(self, sql): + startTime = time.time() + ret = self._cursor.execute(sql) + # print("\nSQL success: {}".format(sql)) + queryTime = time.time() - startTime + # Record the query time + cls = self.__class__ + if queryTime > (cls.longestQueryTime + 0.01) : + with cls._clsLock: + cls.longestQuery = sql + cls.longestQueryTime = queryTime + cls.lqStartTime = startTime + return ret + + def query(self, sql): + self.sql = sql + try: + self._execInternal(sql) + self.queryResult = self._cursor.fetchall() + self.queryRows = len(self.queryResult) + self.queryCols = len(self._cursor.description) + except Exception as e: + # caller = inspect.getframeinfo(inspect.stack()[1][0]) + # args = (caller.filename, caller.lineno, sql, repr(e)) + # tdLog.exit("%s(%d) failed: sql:%s, %s" % args) + raise + return self.queryRows + + def execute(self, sql): + self.sql = sql + try: + self.affectedRows = self._execInternal(sql) + except Exception as e: + # caller = inspect.getframeinfo(inspect.stack()[1][0]) + # args = (caller.filename, caller.lineno, sql, repr(e)) + # tdLog.exit("%s(%d) failed: sql:%s, %s" % args) + raise + return self.affectedRows + +class DbTarget: + def __init__(self, cfgPath, hostAddr, port): + self.cfgPath = cfgPath + self.hostAddr = hostAddr + self.port = port + + def __repr__(self): + return "[DbTarget: cfgPath={}, host={}:{}]".format( + self.cfgPath, self.hostAddr, self.port) + +class DbConnNative(DbConn): + # Class variables + _lock = threading.Lock() + _connInfoDisplayed = False + totalConnections = 0 # Not private + + def __init__(self, dbTarget): + super().__init__(dbTarget) + self._type = self.TYPE_NATIVE + self._conn = None + # self._cursor = None + + def openByType(self): # Open connection + # global gContainer + # tInst = tInst or gContainer.defTdeInstance # set up in ClientManager, type: TdeInstance + # cfgPath = self.getBuildPath() + "/test/cfg" + # cfgPath = tInst.getCfgDir() + # hostAddr = tInst.getHostAddr() + + cls = self.__class__ # Get the class, to access class variables + with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! 
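The new DbTarget above is what lets connections be created without reaching into a global TdeInstance: it carries the config directory (for the native connector) plus host and port (for both connectors), and the REST URL is derived from it by adding the 11-port offset shown earlier. A rough usage sketch with an illustrative path (not the real class):

```python
class DbTargetSketch:
    """Illustrative stand-in for DbTarget: everything a connector needs to reach one instance."""
    REST_PORT_INCREMENT = 11   # REST gateway sits 11 ports above the native server port

    def __init__(self, cfg_path: str, host_addr: str, port: int):
        self.cfg_path = cfg_path       # used by the native (taos.connect) path
        self.host_addr = host_addr
        self.port = port

    def ep(self) -> str:               # native end point, "host:port"
        return "{}:{}".format(self.host_addr, self.port)

    def rest_url(self) -> str:         # derived REST endpoint
        return "http://{}:{}/rest/sql".format(self.host_addr, self.port + self.REST_PORT_INCREMENT)

target = DbTargetSketch("build/cluster_dnode_0/cfg", "127.0.0.1", 6030)
print(target.ep(), target.rest_url())   # 127.0.0.1:6030 http://127.0.0.1:6041/rest/sql
```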
+ dbTarget = self._dbTarget + if not cls._connInfoDisplayed: + cls._connInfoDisplayed = True # updating CLASS variable + Logging.info("Initiating TAOS native connection to {}".format(dbTarget)) + # Make the connection + # self._conn = taos.connect(host=hostAddr, config=cfgPath) # TODO: make configurable + # self._cursor = self._conn.cursor() + # Record the count in the class + self._tdSql = MyTDSql(dbTarget.hostAddr, dbTarget.cfgPath) # making DB connection + cls.totalConnections += 1 + + self._tdSql.execute('reset query cache') + # self._cursor.execute('use db') # do this at the beginning of every + + # Open connection + # self._tdSql = MyTDSql() + # self._tdSql.init(self._cursor) + + def close(self): + if (not self.isOpen): + raise RuntimeError("Cannot clean up database until connection is open") + self._tdSql.close() + # Decrement the class wide counter + cls = self.__class__ # Get the class, to access class variables + with cls._lock: + cls.totalConnections -= 1 + + Logging.debug("[DB] Database connection closed") + self.isOpen = False + + def execute(self, sql): + if (not self.isOpen): + raise RuntimeError("Cannot execute database commands until connection is open") + Logging.debug("[SQL] Executing SQL: {}".format(sql)) + self._lastSql = sql + nRows = self._tdSql.execute(sql) + Logging.debug( + "[SQL] Execution Result, nRows = {}, SQL = {}".format( + nRows, sql)) + return nRows + + def query(self, sql): # return rows affected + if (not self.isOpen): + raise RuntimeError( + "Cannot query database until connection is open") + Logging.debug("[SQL] Executing SQL: {}".format(sql)) + self._lastSql = sql + nRows = self._tdSql.query(sql) + Logging.debug( + "[SQL] Query Result, nRows = {}, SQL = {}".format( + nRows, sql)) + return nRows + # results are in: return self._tdSql.queryResult + + def getQueryResult(self): + return self._tdSql.queryResult + + def getResultRows(self): + return self._tdSql.queryRows + + def getResultCols(self): + return self._tdSql.queryCols + + +class DbManager(): + ''' This is a wrapper around DbConn(), to make it easier to use. + + TODO: rename this to DbConnManager + ''' + def __init__(self, cType, dbTarget): + # self.tableNumQueue = LinearQueue() # TODO: delete? 
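Connection creation is now driven purely by the connector type plus a DbTarget; in "mixed" mode the worker threads flip a coin per connection. A small sketch of that dispatch, with the factories passed in as parameters so it stands alone (the real code calls DbConn.createNative / DbConn.createRest):

```python
import random

def create_conn(connector_type: str, db_target, create_native, create_rest):
    """Pick a connector the same way WorkerThread/DbManager do."""
    if connector_type == "native":
        return create_native(db_target)
    if connector_type == "rest":
        return create_rest(db_target)
    if connector_type == "mixed":
        # 1/2 chance each, decided per connection
        return create_native(db_target) if random.randrange(2) == 0 else create_rest(db_target)
    raise RuntimeError("Unexpected connector type: {}".format(connector_type))

# Demo with stand-in factories:
print(create_conn("mixed", ("127.0.0.1", 6030),
                  create_native=lambda t: ("native", t),
                  create_rest=lambda t: ("rest", t)))
```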
+ # self.openDbServerConnection() + self._dbConn = DbConn.createNative(dbTarget) if ( + cType == 'native') else DbConn.createRest(dbTarget) + try: + self._dbConn.open() # may throw taos.error.ProgrammingError: disconnected + except taos.error.ProgrammingError as err: + # print("Error type: {}, msg: {}, value: {}".format(type(err), err.msg, err)) + if (err.msg == 'client disconnected'): # cannot open DB connection + print( + "Cannot establish DB connection, please re-run script without parameter, and follow the instructions.") + sys.exit(2) + else: + print("Failed to connect to DB, errno = {}, msg: {}" + .format(Helper.convertErrno(err.errno), err.msg)) + raise + except BaseException: + print("[=] Unexpected exception") + raise + + # Do this after dbConn is in proper shape + # Moved to Database() + # self._stateMachine = StateMechine(self._dbConn) + + def getDbConn(self): + return self._dbConn + + # TODO: not used any more, to delete + def pickAndAllocateTable(self): # pick any table, and "use" it + return self.tableNumQueue.pickAndAllocate() + + # TODO: Not used any more, to delete + def addTable(self): + with self._lock: + tIndex = self.tableNumQueue.push() + return tIndex + + # Not used any more, to delete + def releaseTable(self, i): # return the table back, so others can use it + self.tableNumQueue.release(i) + + # TODO: not used any more, delete + def getTableNameToDelete(self): + tblNum = self.tableNumQueue.pop() # TODO: race condition! + if (not tblNum): # maybe false + return False + + return "table_{}".format(tblNum) + + def cleanUp(self): + self._dbConn.close() diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py index cdb12303a2..11e35b6de8 100644 --- a/tests/pytest/crash_gen/service_manager.py +++ b/tests/pytest/crash_gen/service_manager.py @@ -16,7 +16,9 @@ except: sys.exit(-1) from queue import Queue, Empty + from .misc import Logging, Status, CrashGenError, Dice +from .db import DbConn, DbTarget class TdeInstance(): """ @@ -45,9 +47,17 @@ class TdeInstance(): .format(selfPath, projPath)) return buildPath - def __init__(self, subdir='test'): - self._buildDir = self._getBuildPath() - self._subdir = '/' + subdir # TODO: tolerate "/" + def __init__(self, subdir='test', port=6030, fepPort=6030): + self._buildDir = self._getBuildPath() + self._subdir = '/' + subdir # TODO: tolerate "/" + self._port = port # TODO: support different IP address too + self._fepPort = fepPort + + def getDbTarget(self): + return DbTarget(self.getCfgDir(), self.getHostAddr(), self._port) + + def getPort(self): + return self._port def __repr__(self): return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) @@ -74,9 +84,10 @@ class TdeInstance(): os.makedirs(cfgDir, exist_ok=True) # like "mkdir -p" # Now we have a good cfg dir cfgValues = { - 'runDir': self.getRunDir(), - 'ip': '127.0.0.1', # TODO: change to a network addressable ip - 'port': 6030, + 'runDir': self.getRunDir(), + 'ip': '127.0.0.1', # TODO: change to a network addressable ip + 'port': self._port, + 'fepPort': self._fepPort, } cfgTemplate = """ dataDir {runDir}/data @@ -84,7 +95,7 @@ logDir {runDir}/log charset UTF-8 -firstEp {ip}:{port} +firstEp {ip}:{fepPort} fqdn {ip} serverPort {port} @@ -236,9 +247,10 @@ class TdeSubProcess: class ServiceManager: PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process - def __init__(self, numDnodes = 1): + def __init__(self, numDnodes = 1): # Otherwise we run a cluster Logging.info("TDengine Service Manager (TSM) 
created") self._numDnodes = numDnodes # >1 means we have a cluster + self._lock = threading.Lock() # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec # signal.signal(signal.SIGINT, self.sigIntHandler) # signal.signal(signal.SIGUSR1, self.sigUsrHandler) # different handler! @@ -246,12 +258,20 @@ class ServiceManager: self.inSigHandler = False # self._status = MainExec.STATUS_RUNNING # set inside # _startTaosService() + self._runCluster = (numDnodes >= 1) self.svcMgrThreads = [] # type: List[ServiceManagerThread] for i in range(0, numDnodes): self.svcMgrThreads.append(ServiceManagerThread(i)) - self._lock = threading.Lock() - # self._isRestarting = False + def _createThread(self, dnIndex): + if not self._runCluster: # single instance + return ServiceManagerThread(0) + # Create all threads in a cluster + subdir = 'cluster_dnode_{}'.format(dnIndex) + fepPort= 6030 # firstEP Port + port = fepPort + dnIndex * 100 + ti = TdeInstance(subdir, port, fepPort) + return ServiceManagerThread(dnIndex, ti) def _doMenu(self): choice = "" @@ -488,11 +508,33 @@ class ServiceManagerThread: if self._status == Status.STATUS_RUNNING: Logging.info("[] TDengine service READY to process requests") Logging.info("[] TAOS service started: {}".format(self)) + self._verifyDnode(self._tInst) # query and ensure dnode is ready return # now we've started # TODO: handle failure-to-start better? self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output raise RuntimeError("TDengine service did not start successfully: {}".format(self)) + def _verifyDnode(self, tInst: TdeInstance): + dbc = DbConn.createNative(tInst.getDbTarget()) + dbc.open() + dbc.query("show dnodes") + # dbc.query("DESCRIBE {}.{}".format(dbName, self._stName)) + cols = dbc.getQueryResult() # id,end_point,vnodes,cores,status,role,create_time,offline reason + # ret = {row[0]:row[1] for row in stCols if row[3]=='TAG'} # name:type + isValid = False + for col in cols: + print("col = {}".format(col)) + ep = col[1].split(':') # 10.1.30.2:6030 + print("ep={}".format(ep)) + if tInst.getPort() == int(ep[1]): # That's us + print("Valid Dnode matched!") + isValid = True # now we are valid + break + if not isValid: + raise RuntimeError("Failed to start Dnode, port = {}, expected: {}". 
+ format(ep[1], tInst.getPort())) + dbc.close() + def stop(self): # can be called from both main thread or signal handler print("Terminating TDengine service running as the sub process...") From efac9a1de702a9c78f5a0feff4c16d2931b7cf8d Mon Sep 17 00:00:00 2001 From: Hui Li Date: Sat, 24 Oct 2020 10:10:20 +0800 Subject: [PATCH 11/16] modify release scripts --- packaging/tools/makeclient_power.sh | 2 +- packaging/tools/makepkg_power.sh | 2 +- packaging/tools/post.sh | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/packaging/tools/makeclient_power.sh b/packaging/tools/makeclient_power.sh index b4416a68bb..faa5a03f52 100755 --- a/packaging/tools/makeclient_power.sh +++ b/packaging/tools/makeclient_power.sh @@ -123,7 +123,7 @@ if [[ "$pagMode" != "lite" ]] && [[ "$cpuType" != "aarch32" ]]; then cp -r ${examples_dir}/R ${install_dir}/examples sed -i '/password/ {s/taosdata/powerdb/g}' ${install_dir}/examples/R/command.txt cp -r ${examples_dir}/go ${install_dir}/examples - sed -i '/root/ {s/taosdata/powerdb/g}' ${install_dir}/examples/go/src/taosapp/taosapp.go + sed -i '/root/ {s/taosdata/powerdb/g}' ${install_dir}/examples/go/taosdemo.go fi # Copy driver mkdir -p ${install_dir}/driver diff --git a/packaging/tools/makepkg_power.sh b/packaging/tools/makepkg_power.sh index 3d625900c9..2c02b99787 100755 --- a/packaging/tools/makepkg_power.sh +++ b/packaging/tools/makepkg_power.sh @@ -146,7 +146,7 @@ if [[ "$pagMode" != "lite" ]] && [[ "$cpuType" != "aarch32" ]]; then cp -r ${examples_dir}/R ${install_dir}/examples sed -i '/password/ {s/taosdata/powerdb/g}' ${install_dir}/examples/R/command.txt cp -r ${examples_dir}/go ${install_dir}/examples - sed -i '/root/ {s/taosdata/powerdb/g}' ${install_dir}/examples/go/src/taosapp/taosapp.go + sed -i '/root/ {s/taosdata/powerdb/g}' ${install_dir}/examples/go/taosdemo.go fi # Copy driver mkdir -p ${install_dir}/driver diff --git a/packaging/tools/post.sh b/packaging/tools/post.sh index 0feb64c795..726eda69d0 100755 --- a/packaging/tools/post.sh +++ b/packaging/tools/post.sh @@ -134,6 +134,29 @@ function install_config() { else break fi + done + + # user email + #EMAIL_PATTERN='^[A-Za-z0-9\u4e00-\u9fa5]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+$' + #EMAIL_PATTERN='^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)+$' + #EMAIL_PATTERN="^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)+$" + echo + echo -e -n "${GREEN}Enter your email address for priority support or enter empty to skip${NC}: " + read emailAddr + while true; do + if [ ! 
-z "$emailAddr" ]; then + # check the format of the emailAddr + #if [[ "$emailAddr" =~ $EMAIL_PATTERN ]]; then + # Write the email address to temp file + email_file="${install_main_dir}/email" + ${csudo} bash -c "echo $emailAddr > ${email_file}" + break + #else + # read -p "Please enter the correct email address: " emailAddr + #fi + else + break + fi done } From 5961df09b527a40876ccd819d6bf89ae05bd52c1 Mon Sep 17 00:00:00 2001 From: Hui Li Date: Sat, 24 Oct 2020 10:15:36 +0800 Subject: [PATCH 12/16] modify release script --- packaging/tools/post.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/packaging/tools/post.sh b/packaging/tools/post.sh index 726eda69d0..d91daaa5c4 100755 --- a/packaging/tools/post.sh +++ b/packaging/tools/post.sh @@ -10,6 +10,7 @@ data_dir="/var/lib/taos" log_dir="/var/log/taos" data_link_dir="/usr/local/taos/data" log_link_dir="/usr/local/taos/log" +install_main_dir="/usr/local/taos" # static directory cfg_dir="/usr/local/taos/cfg" From fbb406e4af51e5ba6f70fb71f1c6e603f21cceca Mon Sep 17 00:00:00 2001 From: Hui Li Date: Sat, 24 Oct 2020 11:16:48 +0800 Subject: [PATCH 13/16] [add proxy for go] --- tests/gotest/batchtest.bat | 3 +++ tests/gotest/batchtest.sh | 3 +++ 2 files changed, 6 insertions(+) mode change 100644 => 100755 tests/gotest/batchtest.bat mode change 100644 => 100755 tests/gotest/batchtest.sh diff --git a/tests/gotest/batchtest.bat b/tests/gotest/batchtest.bat old mode 100644 new mode 100755 index abe9a58f31..efd8961bb0 --- a/tests/gotest/batchtest.bat +++ b/tests/gotest/batchtest.bat @@ -7,6 +7,9 @@ set serverPort=%2 if "%severIp%"=="" (set severIp=127.0.0.1) if "%serverPort%"=="" (set serverPort=6030) +go env -w GO111MODULE=on +go env -w GOPROXY=https://goproxy.io,direct + cd case001 case001.bat %severIp% %serverPort% diff --git a/tests/gotest/batchtest.sh b/tests/gotest/batchtest.sh old mode 100644 new mode 100755 index e8ed9ecbed..0fbbf40714 --- a/tests/gotest/batchtest.sh +++ b/tests/gotest/batchtest.sh @@ -13,6 +13,9 @@ if [ ! -n "$serverPort" ]; then serverPort=6030 fi +go env -w GO111MODULE=on +go env -w GOPROXY=https://goproxy.io,direct + bash ./case001/case001.sh $severIp $serverPort #bash ./case002/case002.sh $severIp $serverPort #bash ./case003/case003.sh $severIp $serverPort From 5a92c415a2b04f31dd1ad7f230cb04b31a95a0c3 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Sat, 24 Oct 2020 08:06:59 +0000 Subject: [PATCH 14/16] Enhanced crash_gen tool to run clusters, with a new README file --- tests/pytest/crash_gen/README.md | 130 +++++++++ tests/pytest/crash_gen/crash_gen.py | 57 ++-- tests/pytest/crash_gen/db.py | 25 +- tests/pytest/crash_gen/misc.py | 46 ++- tests/pytest/crash_gen/service_manager.py | 330 +++++++++++++--------- 5 files changed, 419 insertions(+), 169 deletions(-) create mode 100644 tests/pytest/crash_gen/README.md diff --git a/tests/pytest/crash_gen/README.md b/tests/pytest/crash_gen/README.md new file mode 100644 index 0000000000..6788ab1a63 --- /dev/null +++ b/tests/pytest/crash_gen/README.md @@ -0,0 +1,130 @@ +
User's Guide to the Crash_Gen Tool
+ +# Introduction + +To effectively test and debug our TDengine product, we have developed a simple tool to +exercise various functions of the system in a randomized fashion, hoping to expose +maximum number of problems, hopefully without a pre-determined scenario. + +# Preparation + +To run this tool, please ensure the followed preparation work is done first. + +1. Fetch a copy of the TDengine source code, and build it successfully in the `build/` + directory +1. Ensure that the system has Python3.8 or above properly installed. We use + Ubuntu 20.04LTS as our own development environment, and suggest you also use such + an environment if possible. + +# Simple Execution + +To run the tool with the simplest method, follow the steps below: + +1. Open a terminal window, start the `taosd` service in the `build/` directory + (or however you prefer to start the `taosd` service) +1. Open another terminal window, go into the `tests/pytest/` directory, and + run `./crash_gen.sh -p -t 3 -s 10` (change the two parameters here as you wish) +1. Watch the output to the end and see if you get a `SUCCESS` or `FAILURE` + +That's it! + +# Running Clusters + +This tool also makes it easy to test/verify the clustering capabilities of TDengine. You +can start a cluster quite easily with the following command: + +``` +$ cd tests/pytest/ +$ ./crash_gen.sh -e -o 3 +``` + +The `-e` option above tells the tool to start the service, and do not run any tests, while +the `-o 3` option tells the tool to start 3 DNodes and join them together in a cluster. +Obviously you can adjust the the number here. + +## Behind the Scenes + +When the tool runs a cluster, it users a number of directories, each holding the information +for a single DNode, see: + +``` +$ ls build/cluster* +build/cluster_dnode_0: +cfg data log + +build/cluster_dnode_1: +cfg data log + +build/cluster_dnode_2: +cfg data log +``` + +Therefore, when something goes wrong and you want to reset everything with the cluster, simple +erase all the files: + +``` +$ rm -rf build/cluster_dnode_* +``` + +## Addresses and Ports + +The DNodes in the cluster all binds the the `127.0.0.1` IP address (for now anyway), and +uses port 6030 for the first DNode, and 6130 for the 2nd one, and so on. + +## Testing Against a Cluster + +In a separate terminal window, you can invoke the tool in client mode and test against +a cluster, such as: + +``` +$ ./crash_gen.sh -p -t 10 -s 100 -i 3 +``` + +Here the `-i` option tells the tool to always create tables with 3 replicas, and run +all tests against such tables. + +# Additional Features + +The exhaustive features of the tool is available through the `-h` option: + +``` +$ ./crash_gen.sh -h +usage: crash_gen_bootstrap.py [-h] [-a] [-b MAX_DBS] [-c CONNECTOR_TYPE] [-d] [-e] [-g IGNORE_ERRORS] [-i MAX_REPLICAS] [-l] [-n] [-o NUM_DNODES] [-p] [-r] + [-s MAX_STEPS] [-t NUM_THREADS] [-v] [-x] + +TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below) +--------------------------------------------------------------------- +1. You build TDengine in the top level ./build directory, as described in offical docs +2. You run the server there before this script: ./build/bin/taosd -c test/cfg + +optional arguments: + -h, --help show this help message and exit + -a, --auto-start-service + Automatically start/stop the TDengine service (default: false) + -b MAX_DBS, --max-dbs MAX_DBS + Maximum number of DBs to keep, set to disable dropping DB. 
(default: 0) + -c CONNECTOR_TYPE, --connector-type CONNECTOR_TYPE + Connector type to use: native, rest, or mixed (default: 10) + -d, --debug Turn on DEBUG mode for more logging (default: false) + -e, --run-tdengine Run TDengine service in foreground (default: false) + -g IGNORE_ERRORS, --ignore-errors IGNORE_ERRORS + Ignore error codes, comma separated, 0x supported (default: None) + -i MAX_REPLICAS, --max-replicas MAX_REPLICAS + Maximum number of replicas to use, when testing against clusters. (default: 1) + -l, --larger-data Write larger amount of data during write operations (default: false) + -n, --dynamic-db-table-names + Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false) + -o NUM_DNODES, --num-dnodes NUM_DNODES + Number of Dnodes to initialize, used with -e option. (default: 1) + -p, --per-thread-db-connection + Use a single shared db connection (default: false) + -r, --record-ops Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false) + -s MAX_STEPS, --max-steps MAX_STEPS + Maximum number of steps to run (default: 100) + -t NUM_THREADS, --num-threads NUM_THREADS + Number of threads to run (default: 10) + -v, --verify-data Verify data written in a number of places by reading back (default: false) + -x, --continue-on-exception + Continue execution after encountering unexpected/disallowed errors/exceptions (default: false) +``` + diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 2d52d274c3..74e3964d5a 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -18,6 +18,7 @@ from __future__ import annotations from typing import Set from typing import Dict from typing import List +from typing import Optional # Type hinting, ref: https://stackoverflow.com/questions/19202633/python-3-type-hinting-for-none import textwrap import time @@ -62,9 +63,10 @@ gContainer: Container class WorkerThread: - def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator, - # te: TaskExecutor, - ): # note: main thread context! + def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator): + """ + Note: this runs in the main thread context + """ # self._curStep = -1 self._pool = pool self._tid = tid @@ -1007,6 +1009,8 @@ class Database: possibly in a cluster environment. For now we use it to manage state transitions in that database + + TODO: consider moving, but keep in mind it contains "StateMachine" ''' _clsLock = threading.Lock() # class wide lock _lastInt = 101 # next one is initial integer @@ -1182,7 +1186,7 @@ class Task(): def __init__(self, execStats: ExecutionStats, db: Database): self._workerThread = None - self._err = None # type: Exception + self._err: Optional[Exception] = None self._aborted = False self._curStep = None self._numRows = None # Number of rows affected @@ -1318,10 +1322,11 @@ class Task(): self._aborted = True traceback.print_exc() except BaseException: # TODO: what is this again??!! 
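The directory and port scheme described in the README falls straight out of how the service manager creates one TdeInstance per dnode: subdirectory cluster_dnode_N under build/, firstEp port 6030, and a native port 100 higher for each additional node (with the REST port a further +11, per the connector code). A small sketch of that mapping, mirroring the _createThread() logic from patch 10:

```python
def cluster_layout(num_dnodes: int, fep_port: int = 6030):
    """(subdir, native port) per dnode, matching the per-dnode scheme used in this series."""
    layout = []
    for i in range(num_dnodes):
        subdir = "cluster_dnode_{}".format(i)   # holds cfg/, data/ and log/ under build/
        port = fep_port + i * 100               # 6030, 6130, 6230, ...
        layout.append((subdir, port))
    return layout

for subdir, port in cluster_layout(3):
    print("{:18} -> 127.0.0.1:{}".format(subdir, port))
```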
- self.logDebug( - "[=] Unexpected exception, SQL: {}".format( - wt.getDbConn().getLastSql())) - raise + raise RuntimeError("Punt") + # self.logDebug( + # "[=] Unexpected exception, SQL: {}".format( + # wt.getDbConn().getLastSql())) + # raise self._execStats.endTaskType(self.__class__.__name__, self.isSuccess()) self.logDebug("[X] task execution completed, {}, status: {}".format( @@ -1498,7 +1503,8 @@ class TaskCreateDb(StateTransitionTask): # was: self.execWtSql(wt, "create database db") repStr = "" if gConfig.max_replicas != 1: - numReplica = Dice.throw(gConfig.max_replicas) + 1 # 1,2 ... N + # numReplica = Dice.throw(gConfig.max_replicas) + 1 # 1,2 ... N + numReplica = gConfig.max_replicas # fixed, always repStr = "replica {}".format(numReplica) self.execWtSql(wt, "create database {} {}" .format(self._db.getName(), repStr) ) @@ -2050,7 +2056,7 @@ class ClientManager: class MainExec: def __init__(self): self._clientMgr = None - self._svcMgr = None + self._svcMgr = None # type: ServiceManager signal.signal(signal.SIGTERM, self.sigIntHandler) signal.signal(signal.SIGINT, self.sigIntHandler) @@ -2063,17 +2069,16 @@ class MainExec: self._svcMgr.sigUsrHandler(signalNumber, frame) def sigIntHandler(self, signalNumber, frame): - if self._svcMgr: + if self._svcMgr: self._svcMgr.sigIntHandler(signalNumber, frame) - if self._clientMgr: + if self._clientMgr: self._clientMgr.sigIntHandler(signalNumber, frame) def runClient(self): global gSvcMgr if gConfig.auto_start_service: - self._svcMgr = ServiceManager() - gSvcMgr = self._svcMgr # hack alert - self._svcMgr.startTaosService() # we start, don't run + gSvcMgr = self._svcMgr = ServiceManager() # hack alert + gSvcMgr.startTaosService() # we start, don't run self._clientMgr = ClientManager() ret = None @@ -2086,12 +2091,10 @@ class MainExec: def runService(self): global gSvcMgr - self._svcMgr = ServiceManager() - gSvcMgr = self._svcMgr # save it in a global variable TODO: hack alert + gSvcMgr = self._svcMgr = ServiceManager(gConfig.num_dnodes) # save it in a global variable TODO: hack alert - self._svcMgr.run() # run to some end state - self._svcMgr = None - gSvcMgr = None + gSvcMgr.run() # run to some end state + gSvcMgr = self._svcMgr = None def init(self): # TODO: refactor global gContainer @@ -2165,6 +2168,13 @@ class MainExec: '--dynamic-db-table-names', action='store_true', help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)') + parser.add_argument( + '-o', + '--num-dnodes', + action='store', + default=1, + type=int, + help='Number of Dnodes to initialize, used with -e option. 
(default: 1)') parser.add_argument( '-p', '--per-thread-db-connection', @@ -2209,7 +2219,12 @@ class MainExec: def run(self): if gConfig.run_tdengine: # run server - self.runService() + try: + self.runService() + return 0 # success + except ConnectionError as err: + Logging.error("Failed to make DB connection, please check DB instance manually") + return -1 # failure else: return self.runClient() diff --git a/tests/pytest/crash_gen/db.py b/tests/pytest/crash_gen/db.py index 5404382bf0..43c855647c 100644 --- a/tests/pytest/crash_gen/db.py +++ b/tests/pytest/crash_gen/db.py @@ -12,7 +12,9 @@ from util.cases import * from util.dnodes import * from util.log import * -from .misc import Logging, CrashGenError, Helper +from .misc import Logging, CrashGenError, Helper, Dice +import os +import datetime # from .service_manager import TdeInstance class DbConn: @@ -44,6 +46,9 @@ class DbConn: self._lastSql = None self._dbTarget = dbTarget + def __repr__(self): + return "[DbConn: type={}, target={}]".format(self._type, self._dbTarget) + def getLastSql(self): return self._lastSql @@ -54,7 +59,7 @@ class DbConn: # below implemented by child classes self.openByType() - Logging.debug("[DB] data connection opened, type = {}".format(self._type)) + Logging.debug("[DB] data connection opened: {}".format(self)) self.isOpen = True def close(self): @@ -277,15 +282,18 @@ class DbTarget: self.cfgPath = cfgPath self.hostAddr = hostAddr self.port = port - + def __repr__(self): return "[DbTarget: cfgPath={}, host={}:{}]".format( - self.cfgPath, self.hostAddr, self.port) + Helper.getFriendlyPath(self.cfgPath), self.hostAddr, self.port) + + def getEp(self): + return "{}:{}".format(self.hostAddr, self.port) class DbConnNative(DbConn): # Class variables _lock = threading.Lock() - _connInfoDisplayed = False + # _connInfoDisplayed = False # TODO: find another way to display this totalConnections = 0 # Not private def __init__(self, dbTarget): @@ -304,9 +312,9 @@ class DbConnNative(DbConn): cls = self.__class__ # Get the class, to access class variables with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!! 
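Patch 10's headline change, verifying that a freshly started dnode is actually ready, combines with the createDnode() helper added to TdeInstance elsewhere in this patch: query `show dnodes` through the firstEp, register the new end point if it is missing, and check its status column. A sketch of that flow written against the tool's DbConn-style API (dbc is any object exposing execute/query/getQueryResult; a real check would poll and retry rather than query once):

```python
def ensure_dnode_ready(dbc, ep: str) -> bool:
    """Register a dnode if needed and report whether it shows up as ready.

    `ep` is the "host:port" end point of the dnode being checked.
    Rows of `show dnodes`: id, end_point, vnodes, cores, status, role, create_time, ...
    """
    dbc.query("show dnodes")
    dnodes = {row[1]: row[4] for row in dbc.getQueryResult()}   # end_point -> status
    if ep not in dnodes:
        dbc.execute('CREATE DNODE "{}"'.format(ep))             # ask the firstEp to admit it
        dbc.query("show dnodes")                                # re-read; may need retries
        dnodes = {row[1]: row[4] for row in dbc.getQueryResult()}
    return dnodes.get(ep) == "ready"
```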
dbTarget = self._dbTarget - if not cls._connInfoDisplayed: - cls._connInfoDisplayed = True # updating CLASS variable - Logging.info("Initiating TAOS native connection to {}".format(dbTarget)) + # if not cls._connInfoDisplayed: + # cls._connInfoDisplayed = True # updating CLASS variable + Logging.debug("Initiating TAOS native connection to {}".format(dbTarget)) # Make the connection # self._conn = taos.connect(host=hostAddr, config=cfgPath) # TODO: make configurable # self._cursor = self._conn.cursor() @@ -424,3 +432,4 @@ class DbManager(): def cleanUp(self): self._dbConn.close() + diff --git a/tests/pytest/crash_gen/misc.py b/tests/pytest/crash_gen/misc.py index 08e50e5070..8a2817b389 100644 --- a/tests/pytest/crash_gen/misc.py +++ b/tests/pytest/crash_gen/misc.py @@ -1,6 +1,7 @@ import threading import random import logging +import os class CrashGenError(Exception): @@ -26,7 +27,7 @@ class LoggingFilter(logging.Filter): class MyLoggingAdapter(logging.LoggerAdapter): def process(self, msg, kwargs): - return "[{}]{}".format(threading.get_ident() % 10000, msg), kwargs + return "[{}] {}".format(threading.get_ident() % 10000, msg), kwargs # return '[%s] %s' % (self.extra['connid'], msg), kwargs @@ -71,12 +72,44 @@ class Logging: def warning(cls, msg): cls.logger.warning(msg) + @classmethod + def error(cls, msg): + cls.logger.error(msg) + class Status: STATUS_STARTING = 1 STATUS_RUNNING = 2 STATUS_STOPPING = 3 STATUS_STOPPED = 4 + def __init__(self, status): + self.set(status) + + def __repr__(self): + return "[Status: v={}]".format(self._status) + + def set(self, status): + self._status = status + + def get(self): + return self._status + + def isStarting(self): + return self._status == Status.STATUS_STARTING + + def isRunning(self): + # return self._thread and self._thread.is_alive() + return self._status == Status.STATUS_RUNNING + + def isStopping(self): + return self._status == Status.STATUS_STOPPING + + def isStopped(self): + return self._status == Status.STATUS_STOPPED + + def isStable(self): + return self.isRunning() or self.isStopped() + # Deterministic random number generator class Dice(): seeded = False # static, uninitialized @@ -118,14 +151,23 @@ class Helper: def convertErrno(cls, errno): return errno if (errno > 0) else 0x80000000 + errno + @classmethod + def getFriendlyPath(cls, path): # returns .../xxx/yyy + ht1 = os.path.split(path) + ht2 = os.path.split(ht1[0]) + return ".../" + ht2[1] + '/' + ht1[1] + + class Progress: STEP_BOUNDARY = 0 BEGIN_THREAD_STEP = 1 END_THREAD_STEP = 2 + SERVICE_HEART_BEAT= 3 tokens = { STEP_BOUNDARY: '.', BEGIN_THREAD_STEP: '[', - END_THREAD_STEP: '] ' + END_THREAD_STEP: '] ', + SERVICE_HEART_BEAT: '.Y.' 
} @classmethod diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py index 11e35b6de8..c85f64fde4 100644 --- a/tests/pytest/crash_gen/service_manager.py +++ b/tests/pytest/crash_gen/service_manager.py @@ -7,7 +7,7 @@ import logging import time import subprocess -from typing import IO +from typing import IO, List try: import psutil @@ -17,7 +17,7 @@ except: from queue import Queue, Empty -from .misc import Logging, Status, CrashGenError, Dice +from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress from .db import DbConn, DbTarget class TdeInstance(): @@ -47,12 +47,15 @@ class TdeInstance(): .format(selfPath, projPath)) return buildPath - def __init__(self, subdir='test', port=6030, fepPort=6030): + def __init__(self, subdir='test', tInstNum=0, port=6030, fepPort=6030): self._buildDir = self._getBuildPath() self._subdir = '/' + subdir # TODO: tolerate "/" self._port = port # TODO: support different IP address too self._fepPort = fepPort + self._tInstNum = tInstNum + self._smThread = ServiceManagerThread() + def getDbTarget(self): return DbTarget(self.getCfgDir(), self.getHostAddr(), self._port) @@ -60,7 +63,8 @@ class TdeInstance(): return self._port def __repr__(self): - return "[TdeInstance: {}, subdir={}]".format(self._buildDir, self._subdir) + return "[TdeInstance: {}, subdir={}]".format( + self._buildDir, Helper.getFriendlyPath(self._subdir)) def generateCfgFile(self): # print("Logger = {}".format(logger)) @@ -146,8 +150,52 @@ walLevel 1 def getHostAddr(self): return "127.0.0.1" - def getServiceCommand(self): # to start the instance + def getServiceCmdLine(self): # to start the instance return [self.getExecFile(), '-c', self.getCfgDir()] # used in subproce.Popen() + + def _getDnodes(self, dbc): + dbc.query("show dnodes") + cols = dbc.getQueryResult() # id,end_point,vnodes,cores,status,role,create_time,offline reason + return {c[1]:c[4] for c in cols} # {'xxx:6030':'ready', 'xxx:6130':'ready'} + + def createDnode(self, dbt: DbTarget): + """ + With a connection to the "first" EP, let's create a dnode for someone else who + wants to join. + """ + dbc = DbConn.createNative(self.getDbTarget()) + dbc.open() + + if dbt.getEp() in self._getDnodes(dbc): + Logging.info("Skipping DNode creation for: {}".format(dbt)) + dbc.close() + return + + sql = "CREATE DNODE \"{}\"".format(dbt.getEp()) + dbc.execute(sql) + dbc.close() + + def getStatus(self): + return self._smThread.getStatus() + + def getSmThread(self): + return self._smThread + + def start(self): + if not self.getStatus().isStopped(): + raise CrashGenError("Cannot start instance from status: {}".format(self.getStatus())) + + Logging.info("Starting TDengine instance: {}".format(self)) + self.generateCfgFile() # service side generates config file, client does not + self.rotateLogs() + + self._smThread.start(self.getServiceCmdLine()) + + def stop(self): + self._smThread.stop() + + def isFirst(self): + return self._tInstNum == 0 class TdeSubProcess: @@ -159,11 +207,15 @@ class TdeSubProcess: "a sub process runs an instance". 
""" - def __init__(self, tInst : TdeInstance): + # RET_ALREADY_STOPPED = -1 + # RET_TIME_OUT = -3 + # RET_SUCCESS = -4 + + def __init__(self): self.subProcess = None - if tInst is None: - raise CrashGenError("Empty instance not allowed in TdeSubProcess") - self._tInst = tInst # Default create at ServiceManagerThread + # if tInst is None: + # raise CrashGenError("Empty instance not allowed in TdeSubProcess") + # self._tInst = tInst # Default create at ServiceManagerThread def getStdOut(self): return self.subProcess.stdout @@ -177,38 +229,15 @@ class TdeSubProcess: def getPid(self): return self.subProcess.pid - # Repalced by TdeInstance class - # def getBuildPath(self): - # selfPath = os.path.dirname(os.path.realpath(__file__)) - # if ("community" in selfPath): - # projPath = selfPath[:selfPath.find("communit")] - # else: - # projPath = selfPath[:selfPath.find("tests")] - - # for root, dirs, files in os.walk(projPath): - # if ("taosd" in files): - # rootRealPath = os.path.dirname(os.path.realpath(root)) - # if ("packaging" not in rootRealPath): - # buildPath = root[:len(root) - len("/build/bin")] - # break - # return buildPath - - def start(self): + def start(self, cmdLine): ON_POSIX = 'posix' in sys.builtin_module_names # Sanity check if self.subProcess: # already there raise RuntimeError("Corrupt process state") - - # global gContainer - # tInst = gContainer.defTdeInstance = TdeInstance('test3') # creae the instance - self._tInst.generateCfgFile() # service side generates config file, client does not - - self._tInst.rotateLogs() - - print("Starting TDengine instance: {}".format(self._tInst)) + self.subProcess = subprocess.Popen( - self._tInst.getServiceCommand(), + cmdLine, shell=False, # svcCmdSingle, shell=True, # capture core dump? stdout=subprocess.PIPE, @@ -218,31 +247,50 @@ class TdeSubProcess: ) # had text=True, which interferred with reading EOF def stop(self): + """ + Stop a sub process, and try to return a meaningful return code. 
+ + Common POSIX signal values (from man -7 signal): + SIGHUP 1 + SIGINT 2 + SIGQUIT 3 + SIGILL 4 + SIGTRAP 5 + SIGABRT 6 + SIGIOT 6 + SIGBUS 7 + SIGEMT - + SIGFPE 8 + SIGKILL 9 + SIGUSR1 10 + SIGSEGV 11 + SIGUSR2 12 + """ if not self.subProcess: print("Sub process already stopped") - return -1 + return # -1 - retCode = self.subProcess.poll() # contains real sub process return code + retCode = self.subProcess.poll() # ret -N means killed with signal N, otherwise it's from exit(N) if retCode: # valid return code, process ended + retCode = -retCode # only if valid + Logging.warning("TSP.stop(): process ended itself") self.subProcess = None - else: # process still alive, let's interrupt it - print( - "Sub process is running, sending SIG_INT and waiting for it to terminate...") - # sub process should end, then IPC queue should end, causing IO - # thread to end - self.subProcess.send_signal(signal.SIGINT) - try: - self.subProcess.wait(10) - retCode = self.subProcess.returncode - except subprocess.TimeoutExpired as err: - print("Time out waiting for TDengine service process to exit") - retCode = -3 - else: - print("TDengine service process terminated successfully from SIG_INT") - retCode = -4 - self.subProcess = None - return retCode + return retCode + # process still alive, let's interrupt it + print("Terminate running process, send SIG_INT and wait...") + # sub process should end, then IPC queue should end, causing IO thread to end + self.subProcess.send_signal(signal.SIGINT) + self.subProcess.wait(20) + retCode = self.subProcess.returncode # should always be there + # May throw subprocess.TimeoutExpired exception above, therefore + # The process is guranteed to have ended by now + self.subProcess = None + if retCode != 0: # != (- signal.SIGINT): + Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG_INT, retCode={}".format(retCode)) + else: + Logging.info("TSP.stop(): sub proc successfully terminated with SIG_INT") + return - retCode class ServiceManager: PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process @@ -259,19 +307,25 @@ class ServiceManager: # self._status = MainExec.STATUS_RUNNING # set inside # _startTaosService() self._runCluster = (numDnodes >= 1) - self.svcMgrThreads = [] # type: List[ServiceManagerThread] + self._tInsts : List[TdeInstance] = [] for i in range(0, numDnodes): - self.svcMgrThreads.append(ServiceManagerThread(i)) + ti = self._createTdeInstance(i) # construct tInst + self._tInsts.append(ti) - def _createThread(self, dnIndex): - if not self._runCluster: # single instance - return ServiceManagerThread(0) + # self.svcMgrThreads : List[ServiceManagerThread] = [] + # for i in range(0, numDnodes): + # thread = self._createThread(i) # construct tInst + # self.svcMgrThreads.append(thread) + + def _createTdeInstance(self, dnIndex): + # if not self._runCluster: # single instance + # return ServiceManagerThread(0) # Create all threads in a cluster subdir = 'cluster_dnode_{}'.format(dnIndex) fepPort= 6030 # firstEP Port port = fepPort + dnIndex * 100 - ti = TdeInstance(subdir, port, fepPort) - return ServiceManagerThread(dnIndex, ti) + return TdeInstance(subdir, dnIndex, port, fepPort) + # return ServiceManagerThread(dnIndex, ti) def _doMenu(self): choice = "" @@ -336,8 +390,8 @@ class ServiceManager: Determine if the service/cluster is active at all, i.e. at least one thread is not "stopped". 
""" - for thread in self.svcMgrThreads: - if not thread.isStopped(): + for ti in self._tInsts: + if not ti.getStatus().isStopped(): return True return False @@ -356,28 +410,31 @@ class ServiceManager: Determine if the service/cluster is "stable", i.e. all of the threads are in "stable" status. """ - for thread in self.svcMgrThreads: - if not thread.isStable(): + for ti in self._tInsts: + if not ti.isStable(): return False return True def _procIpcAll(self): while self.isActive(): - for thread in self.svcMgrThreads: # all thread objects should always be valid + Progress.emit(Progress.SERVICE_HEART_BEAT) + for ti in self._tInsts: # all thread objects should always be valid # while self.isRunning() or self.isRestarting() : # for as long as the svc mgr thread is still here - if thread.isRunning(): - thread.procIpcBatch() # regular processing, - if thread.isStopped(): - thread.procIpcBatch() # one last time? + status = ti.getStatus() + if status.isRunning(): + th = ti.getSmThread() + th.procIpcBatch() # regular processing, + if status.isStopped(): + th.procIpcBatch() # one last time? # self._updateThreadStatus() - elif thread.isRetarting(): - print("Service restarting...") - # else this thread is stopped time.sleep(self.PAUSE_BETWEEN_IPC_CHECK) # pause, before next round # raise CrashGenError("dummy") print("Service Manager Thread (with subprocess) ended, main thread exiting...") + def _getFirstInstance(self): + return self._tInsts[0] + def startTaosServices(self): with self._lock: if self.isActive(): @@ -386,15 +443,19 @@ class ServiceManager: # Find if there's already a taosd service, and then kill it for proc in psutil.process_iter(): if proc.name() == 'taosd': - print("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupe") + print("Killing an existing TAOSD process in 2 seconds... 
press CTRL-C to interrupt") time.sleep(2.0) proc.kill() # print("Process: {}".format(proc.name())) # self.svcMgrThread = ServiceManagerThread() # create the object - for thread in self.svcMgrThreads: - thread.start() - thread.procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines + + for ti in self._tInsts: + ti.start() + if not ti.isFirst(): + tFirst = self._getFirstInstance() + tFirst.createDnode(ti.getDbTarget()) + ti.getSmThread().procIpcBatch(trimToTarget=10, forceOutput=True) # for printing 10 lines def stopTaosServices(self): with self._lock: @@ -402,8 +463,8 @@ class ServiceManager: Logging.warning("Cannot stop TAOS service(s), already not active") return - for thread in self.svcMgrThreads: - thread.stop() + for ti in self._tInsts: + ti.stop() def run(self): self.startTaosServices() @@ -412,7 +473,7 @@ class ServiceManager: self.stopTaosServices() # should have started already def restart(self): - if not self.isStable(): + if not self.getStatus().isStable(): Logging.warning("Cannot restart service/cluster, when not stable") return @@ -440,42 +501,27 @@ class ServiceManagerThread: """ MAX_QUEUE_SIZE = 10000 - def __init__(self, tInstNum = 0, tInst : TdeInstance = None): + def __init__(self): # Set the sub process self._tdeSubProcess = None # type: TdeSubProcess # Arrange the TDengine instance - self._tInstNum = tInstNum # instance serial number in cluster, ZERO based - self._tInst = tInst or TdeInstance() # Need an instance + # self._tInstNum = tInstNum # instance serial number in cluster, ZERO based + # self._tInst = tInst or TdeInstance() # Need an instance self._thread = None # The actual thread, # type: threading.Thread - self._status = Status.STATUS_STOPPED # The status of the underlying service, actually. + self._status = Status(Status.STATUS_STOPPED) # The status of the underlying service, actually. 
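# [Editor's sketch -- not part of the patch] startTaosServices() above starts
# every TdeInstance and lets the *first* endpoint register each later joiner
# with a "CREATE DNODE" statement. A simplified rendering of that bring-up
# order; Instance and start_cluster are illustrative stand-ins only:
class Instance:
    def __init__(self, ep):
        self.ep = ep              # "host:port" endpoint of this dnode
        self.known = set()        # dnodes this instance has registered

    def start(self):
        print("starting", self.ep)

    def create_dnode(self, ep):   # what the firstEP does for late joiners
        print('CREATE DNODE "{}"'.format(ep))
        self.known.add(ep)

def start_cluster(instances):
    first = instances[0]
    for inst in instances:
        inst.start()
        if inst is not first:     # everyone but the firstEP joins via the firstEP
            first.create_dnode(inst.ep)

start_cluster([Instance("127.0.0.1:6030"), Instance("127.0.0.1:6130")])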
def __repr__(self): - return "[SvcMgrThread: tInstNum={}]".format(self._tInstNum) + return "[SvcMgrThread: status={}, subProc={}]".format( + self.getStatus(), self._tdeSubProcess) def getStatus(self): return self._status - def isStarting(self): - return self._status == Status.STATUS_STARTING - - def isRunning(self): - # return self._thread and self._thread.is_alive() - return self._status == Status.STATUS_RUNNING - - def isStopping(self): - return self._status == Status.STATUS_STOPPING - - def isStopped(self): - return self._status == Status.STATUS_STOPPED - - def isStable(self): - return self.isRunning() or self.isStopped() - # Start the thread (with sub process), and wait for the sub service # to become fully operational - def start(self): + def start(self, cmdLine): if self._thread: raise RuntimeError("Unexpected _thread") if self._tdeSubProcess: @@ -483,9 +529,9 @@ class ServiceManagerThread: Logging.info("Attempting to start TAOS service: {}".format(self)) - self._status = Status.STATUS_STARTING - self._tdeSubProcess = TdeSubProcess(self._tInst) - self._tdeSubProcess.start() + self._status.set(Status.STATUS_STARTING) + self._tdeSubProcess = TdeSubProcess() + self._tdeSubProcess.start(cmdLine) self._ipcQueue = Queue() self._thread = threading.Thread( # First thread captures server OUTPUT @@ -505,10 +551,11 @@ class ServiceManagerThread: time.sleep(1.0) # self.procIpcBatch() # don't pump message during start up print("_zz_", end="", flush=True) - if self._status == Status.STATUS_RUNNING: + if self._status.isRunning(): Logging.info("[] TDengine service READY to process requests") Logging.info("[] TAOS service started: {}".format(self)) - self._verifyDnode(self._tInst) # query and ensure dnode is ready + # self._verifyDnode(self._tInst) # query and ensure dnode is ready + # Logging.debug("[] TAOS Dnode verified: {}".format(self)) return # now we've started # TODO: handle failure-to-start better? self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output @@ -523,25 +570,27 @@ class ServiceManagerThread: # ret = {row[0]:row[1] for row in stCols if row[3]=='TAG'} # name:type isValid = False for col in cols: - print("col = {}".format(col)) + # print("col = {}".format(col)) ep = col[1].split(':') # 10.1.30.2:6030 - print("ep={}".format(ep)) + print("Found ep={}".format(ep)) if tInst.getPort() == int(ep[1]): # That's us - print("Valid Dnode matched!") + # print("Valid Dnode matched!") isValid = True # now we are valid break if not isValid: - raise RuntimeError("Failed to start Dnode, port = {}, expected: {}". - format(ep[1], tInst.getPort())) + print("Failed to start dnode, sleep for a while") + time.sleep(600) + raise RuntimeError("Failed to start Dnode, expected port not found: {}". 
+ format(tInst.getPort())) dbc.close() def stop(self): # can be called from both main thread or signal handler print("Terminating TDengine service running as the sub process...") - if self.isStopped(): + if self.getStatus().isStopped(): print("Service already stopped") return - if self.isStopping(): + if self.getStatus().isStopping(): print("Service is already being stopped") return # Linux will send Control-C generated SIGINT to the TDengine process @@ -550,39 +599,42 @@ class ServiceManagerThread: if not self._tdeSubProcess: raise RuntimeError("sub process object missing") - self._status = Status.STATUS_STOPPING - retCode = self._tdeSubProcess.stop() - print("Attempted to stop sub process, got return code: {}".format(retCode)) - if (retCode==-11): # SGV - Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)") - - if self._tdeSubProcess.isRunning(): # still running - print("FAILED to stop sub process, it is still running... pid = {}".format( + self._status.set(Status.STATUS_STOPPING) + # retCode = self._tdeSubProcess.stop() + try: + retCode = self._tdeSubProcess.stop() + # print("Attempted to stop sub process, got return code: {}".format(retCode)) + if retCode == signal.SIGSEGV : # SGV + Logging.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)") + except subprocess.TimeoutExpired as err: + print("Time out waiting for TDengine service process to exit") + else: + if self._tdeSubProcess.isRunning(): # still running, should now never happen + print("FAILED to stop sub process, it is still running... pid = {}".format( self._tdeSubProcess.getPid())) - else: - self._tdeSubProcess = None # not running any more - self.join() # stop the thread, change the status, etc. + else: + self._tdeSubProcess = None # not running any more + self.join() # stop the thread, change the status, etc. # Check if it's really stopped - outputLines = 20 # for last output - if self.isStopped(): + outputLines = 10 # for last output + if self.getStatus().isStopped(): self.procIpcBatch(outputLines) # one last time - print("End of TDengine Service Output: {}".format(self)) - print("----- TDengine Service (managed by SMT) is now terminated -----\n") + Logging.debug("End of TDengine Service Output: {}".format(self)) + Logging.info("----- TDengine Service (managed by SMT) is now terminated -----\n") else: print("WARNING: SMT did not terminate as expected: {}".format(self)) def join(self): # TODO: sanity check - if not self.isStopping(): + if not self.getStatus().isStopping(): raise RuntimeError( - "Unexpected status when ending svc mgr thread: {}".format( - self._status)) + "SMT.Join(): Unexpected status: {}".format(self._status)) if self._thread: self._thread.join() self._thread = None - self._status = Status.STATUS_STOPPED + self._status.set(Status.STATUS_STOPPED) # STD ERR thread self._thread2.join() self._thread2 = None @@ -651,25 +703,27 @@ class ServiceManagerThread: queue.put(line) self._printProgress("_i") - if self._status == Status.STATUS_STARTING: # we are starting, let's see if we have started + if self._status.isStarting(): # we are starting, let's see if we have started if line.find(self.TD_READY_MSG) != -1: # found Logging.info("Waiting for the service to become FULLY READY") time.sleep(1.0) # wait for the server to truly start. TODO: remove this - Logging.info("Service instance #{} is now FULLY READY".format(self._tInstNum)) - self._status = Status.STATUS_RUNNING + Logging.info("Service is now FULLY READY") # TODO: more ID info here? 
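# [Editor's sketch -- not part of the patch] svcOutputReader() above drains the
# child's stdout on a dedicated thread, queues each line for later batch
# processing, and flips the status once a "ready" banner appears. The same
# pattern in miniature; the child command and banner text are hypothetical:
import subprocess, sys, threading
from queue import Queue

READY_MSG = "service ready"               # stand-in for the real readiness banner

def reader(out, queue, state):
    for raw in iter(out.readline, b''):   # b'' means the child closed its stdout
        line = raw.decode().rstrip()
        queue.put(line)                   # keep lines for later batch processing
        if READY_MSG in line:
            state["running"] = True       # analogous to Status.STATUS_RUNNING
    out.close()

proc = subprocess.Popen([sys.executable, "-c", "print('service ready')"],
                        stdout=subprocess.PIPE)
q, state = Queue(), {"running": False}
t = threading.Thread(target=reader, args=(proc.stdout, q, state))
t.start(); t.join(); proc.wait()
print("running:", state["running"], "lines captured:", q.qsize())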
+ self._status.set(Status.STATUS_RUNNING) # Trim the queue if necessary: TODO: try this 1 out of 10 times self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10) # trim to 90% size - if self.isStopping(): # TODO: use thread status instead + if self._status.isStopping(): # TODO: use thread status instead # WAITING for stopping sub process to finish its outptu print("_w", end="", flush=True) # queue.put(line) # meaning sub process must have died - print("\nNo more output from IO thread managing TDengine service") + Logging.info("\nEnd of stream detected for TDengine STDOUT: {}".format(self)) out.close() def svcErrorReader(self, err: IO, queue): for line in iter(err.readline, b''): print("\nTDengine Service (taosd) ERROR (from stderr): {}".format(line)) + Logging.info("\nEnd of stream detected for TDengine STDERR: {}".format(self)) + err.close() \ No newline at end of file From 87cd1cc0f67a210aecbd5b5a8cf84735cf8f9ae1 Mon Sep 17 00:00:00 2001 From: Steven Li Date: Sat, 24 Oct 2020 08:42:38 +0000 Subject: [PATCH 15/16] Fixed travis build failure caused by crash_gen tool, sorry --- tests/pytest/crash_gen/crash_gen.py | 6 +++--- tests/pytest/crash_gen/service_manager.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/pytest/crash_gen/crash_gen.py b/tests/pytest/crash_gen/crash_gen.py index 74e3964d5a..102d7d9bdd 100755 --- a/tests/pytest/crash_gen/crash_gen.py +++ b/tests/pytest/crash_gen/crash_gen.py @@ -2023,7 +2023,7 @@ class ClientManager: # print("exec stats: {}".format(self.tc.getExecStats())) # print("TC failed = {}".format(self.tc.isFailed())) if svcMgr: # gConfig.auto_start_service: - svcMgr.stopTaosService() + svcMgr.stopTaosServices() svcMgr = None # Print exec status, etc., AFTER showing messages from the server self.conclude() @@ -2077,8 +2077,8 @@ class MainExec: def runClient(self): global gSvcMgr if gConfig.auto_start_service: - gSvcMgr = self._svcMgr = ServiceManager() # hack alert - gSvcMgr.startTaosService() # we start, don't run + gSvcMgr = self._svcMgr = ServiceManager(1) # hack alert + gSvcMgr.startTaosServices() # we start, don't run self._clientMgr = ClientManager() ret = None diff --git a/tests/pytest/crash_gen/service_manager.py b/tests/pytest/crash_gen/service_manager.py index c85f64fde4..bb2becb55b 100644 --- a/tests/pytest/crash_gen/service_manager.py +++ b/tests/pytest/crash_gen/service_manager.py @@ -295,7 +295,7 @@ class TdeSubProcess: class ServiceManager: PAUSE_BETWEEN_IPC_CHECK = 1.2 # seconds between checks on STDOUT of sub process - def __init__(self, numDnodes = 1): # Otherwise we run a cluster + def __init__(self, numDnodes): # >1 when we run a cluster Logging.info("TDengine Service Manager (TSM) created") self._numDnodes = numDnodes # >1 means we have a cluster self._lock = threading.Lock() @@ -306,7 +306,7 @@ class ServiceManager: self.inSigHandler = False # self._status = MainExec.STATUS_RUNNING # set inside # _startTaosService() - self._runCluster = (numDnodes >= 1) + self._runCluster = (numDnodes > 1) self._tInsts : List[TdeInstance] = [] for i in range(0, numDnodes): ti = self._createTdeInstance(i) # construct tInst @@ -318,10 +318,10 @@ class ServiceManager: # self.svcMgrThreads.append(thread) def _createTdeInstance(self, dnIndex): - # if not self._runCluster: # single instance - # return ServiceManagerThread(0) - # Create all threads in a cluster - subdir = 'cluster_dnode_{}'.format(dnIndex) + if not self._runCluster: # single instance + subdir = 'test' + else: # Create all threads in a cluster + subdir = 
'cluster_dnode_{}'.format(dnIndex) fepPort= 6030 # firstEP Port port = fepPort + dnIndex * 100 return TdeInstance(subdir, dnIndex, port, fepPort) @@ -411,7 +411,7 @@ class ServiceManager: threads are in "stable" status. """ for ti in self._tInsts: - if not ti.isStable(): + if not ti.getStatus().isStable(): return False return True @@ -473,7 +473,7 @@ class ServiceManager: self.stopTaosServices() # should have started already def restart(self): - if not self.getStatus().isStable(): + if not self.isStable(): Logging.warning("Cannot restart service/cluster, when not stable") return @@ -483,7 +483,7 @@ class ServiceManager: else: Logging.warning("Service not active when restart requested") - self.startTaosService() + self.startTaosServices() # self._isRestarting = False # def isRunning(self): From aa995b99a374cd2d1c5188b8828e4d959742f220 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Sat, 24 Oct 2020 17:01:37 +0000 Subject: [PATCH 16/16] add query sort test cases --- tests/pytest/query/querySort.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/pytest/query/querySort.py b/tests/pytest/query/querySort.py index e5d3c8ce1f..649e0dc1cb 100644 --- a/tests/pytest/query/querySort.py +++ b/tests/pytest/query/querySort.py @@ -96,6 +96,12 @@ class TDTestCase: tdSql.query("select * from st order by ts desc") self.checkColumnSorted(0, "desc") + print("======= step 2: verify order for special column =========") + + tdSql.query("select tbcol1 from st order by ts desc") + + tdSql.query("select tbcol6 from st order by ts desc") + for i in range(1, 10): tdSql.error("select * from st order by tbcol%d" % i) tdSql.error("select * from st order by tbcol%d asc" % i)
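# [Editor's sketch -- not part of the patch series] PATCH 15 derives each
# dnode's working subdir and serverPort from its index: a single instance keeps
# the 'test' subdir, while cluster dnodes get 'cluster_dnode_N' and a port
# offset of 100 per index from the firstEP port. A tiny stand-alone rendering
# of that mapping; the function name is illustrative only:
def dnode_layout(num_dnodes, fep_port=6030):
    layout = []
    for idx in range(num_dnodes):
        subdir = 'test' if num_dnodes == 1 else 'cluster_dnode_{}'.format(idx)
        port = fep_port + idx * 100   # 6030, 6130, 6230, ...
        layout.append((subdir, port, fep_port))
    return layout

# e.g. a 3-dnode cluster:
# [('cluster_dnode_0', 6030, 6030), ('cluster_dnode_1', 6130, 6030), ('cluster_dnode_2', 6230, 6030)]
print(dnode_layout(3))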