Enhanced crash_gen to use SIG_KILL instead of SIG_INT when restarting services

2020-10-28 06:38:41 +00:00 · 2020-10-28 06:38:41 +00:00 · a0b83d4754
parent 4b4e842204
commit a0b83d4754
2 changed files with 43 additions and 9 deletions
--- a/tests/pytest/crash_gen/crash_gen.py
+++ b/tests/pytest/crash_gen/crash_gen.py
@ -1226,6 +1226,11 @@ class Task():
            "To be implemeted by child classes, class name: {}".format(
                self.__class__.__name__))
    def _isServiceStable(self):
        """Report whether the managed service is currently stable.

        When no service manager is configured (``gSvcMgr`` is falsy) there is
        nothing to inspect, so the service is assumed to be stable. Otherwise
        the answer is delegated to ``gSvcMgr.isStable()``.
        """
        if gSvcMgr:
            # A service manager exists, so ask it directly
            return gSvcMgr.isStable()
        # No managed service in this run, treat it as stable
        return True
    def _isErrAcceptable(self, errno, msg):
        if errno in [
                0x05,  # TSDB_CODE_RPC_NOT_READY
@ -1263,7 +1268,7 @@ class Task():
                return True
            elif msg.find("duplicated column names") != -1: # also alter table tag issues
                return True
-        elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ...
+        elif not self._isServiceStable(): # We are managing service, and ...
            Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg))
            return True
@ -1641,14 +1646,34 @@ class TaskReadData(StateTransitionTask):
    def canBeginFrom(cls, state: AnyState):
        return state.canReadData()
    # def _canRestartService(self):
    #     if not gSvcMgr:
    #         return True # always
    #     return gSvcMgr.isActive() # only if it's running TODO: race condition here
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
        sTable = self._db.getFixedSuperTable()
-        # 1 in 5 chance, simulate a broken connection. 
+        # 1 in 5 chance, simulate a broken connection, only if service stable (not restarting)
-        if random.randrange(5) == 0:  # TODO: break connection in all situations
+        if random.randrange(20)==0: # and self._canRestartService():  # TODO: break connection in all situations
            Logging.info("Attempting to reconnect to server") # TODO: change to DEBUG
            try:
                wt.getDbConn().close()
                wt.getDbConn().open()
            except ConnectionError as err: # may fail
                if not gSvcMgr:
                    Logging.error("Failed to reconnect in client-only mode")
                    raise # Not OK if we are running in client-only mode
                if gSvcMgr.isRunning(): # may have race conditon, but low prob, due to 
                    Logging.error("Failed to reconnect when managed server is running")
                    raise # Not OK if we are running normally
                Logging.info("Ignoring DB reconnect error")
            print("_r", end="", flush=True)
            # The above might have taken a lot of time, service might be running
            # by now, causing error below to be incorrectly handled due to timing issue
            return # TODO: fix server restart status race condtion
        dbc = wt.getDbConn()
        dbName = self._db.getName()
--- a/tests/pytest/crash_gen/service_manager.py
+++ b/tests/pytest/crash_gen/service_manager.py
@ -280,16 +280,18 @@ class TdeSubProcess:
        # process still alive, let's interrupt it
        print("Terminate running process, send SIG_INT and wait...")
        # sub process should end, then IPC queue should end, causing IO thread to end
-        self.subProcess.send_signal(signal.SIGINT)
+        # sig = signal.SIGINT
        sig = signal.SIGKILL
        self.subProcess.send_signal(sig) # SIGNINT or SIGKILL
        self.subProcess.wait(20)
        retCode = self.subProcess.returncode # should always be there
        # May throw subprocess.TimeoutExpired exception above, therefore
        # The process is guaranteed to have ended by now
        self.subProcess = None        
        if retCode != 0: # != (- signal.SIGINT):
-            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG_INT, retCode={}".format(retCode))
+            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(sig, retCode))
        else:
-            Logging.info("TSP.stop(): sub proc successfully terminated with SIG_INT")
+            Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(sig))
        return - retCode
 class ServiceManager:
@ -395,6 +397,13 @@ class ServiceManager:
                return True
        return False
    def isRunning(self):
        """Return True only if every instance in ``self._tInsts`` is running.

        Each instance is asked for its status via ``getStatus()``, and that
        status's ``isRunning()`` decides the outcome. Evaluation stops at the
        first instance that is not running. With no instances the result is
        True.
        """
        return all(inst.getStatus().isRunning() for inst in self._tInsts)
    # def isRestarting(self):
    #     """
    #     Determine if the service/cluster is being "restarted", i.e., at least