Merge pull request #4028 from taosdata/feature/crash_gen
Enhanced crash_gen tool
Commit c380e8f229
@@ -70,10 +70,12 @@ if [[ $1 == '--valgrind' ]]; then
         $CRASH_GEN_EXEC $@ > $VALGRIND_OUT 2> $VALGRIND_ERR
 elif [[ $1 == '--helgrind' ]]; then
     shift
+    HELGRIND_OUT=helgrind.out
+    HELGRIND_ERR=helgrind.err
     valgrind \
         --tool=helgrind \
         $PYTHON_EXEC \
-        $CRASH_GEN_EXEC $@
+        $CRASH_GEN_EXEC $@ > $HELGRIND_OUT 2> $HELGRIND_ERR
 else
     $PYTHON_EXEC $CRASH_GEN_EXEC $@
 fi
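For readers reproducing the race-check run outside the shell wrapper, here is a minimal Python sketch of the same invocation pattern (valgrind --tool=helgrind wrapping the Python interpreter, with stdout/stderr captured to helgrind.out / helgrind.err). The script path below is a placeholder, not the actual $CRASH_GEN_EXEC value.

    # Sketch only: mirrors what the shell wrapper does for --helgrind.
    import subprocess
    import sys

    def run_under_helgrind(script_path, extra_args):
        # Capture the tool's output the same way the shell wrapper does.
        with open("helgrind.out", "w") as out, open("helgrind.err", "w") as err:
            cmd = ["valgrind", "--tool=helgrind", sys.executable, script_path] + list(extra_args)
            return subprocess.call(cmd, stdout=out, stderr=err)

    if __name__ == "__main__":
        # "my_crash_gen.py" is a stand-in for the real crash_gen entry script.
        exit_code = run_under_helgrind("my_crash_gen.py", sys.argv[1:])
        print("helgrind run exited with", exit_code)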
@@ -1226,6 +1226,11 @@ class Task():
             "To be implemeted by child classes, class name: {}".format(
                 self.__class__.__name__))
 
+    def _isServiceStable(self):
+        if not gSvcMgr:
+            return True # we don't run service, so let's assume it's stable
+        return gSvcMgr.isStable() # otherwise let's examine the service
+
     def _isErrAcceptable(self, errno, msg):
         if errno in [
                 0x05, # TSDB_CODE_RPC_NOT_READY
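A self-contained sketch (stand-in names, not the tool's real classes) of the guard that _isServiceStable() introduces: with no service manager configured the task assumes stability, otherwise it defers to the manager.

    # Stand-in service manager to illustrate the _isServiceStable() guard.
    class StubSvcMgr:
        def __init__(self, stable):
            self._stable = stable
        def isStable(self):
            return self._stable

    gSvcMgr = None  # None when the tool runs in client-only mode

    def is_service_stable():
        if not gSvcMgr:
            return True             # no managed service, assume stable
        return gSvcMgr.isStable()   # otherwise ask the service manager

    print(is_service_stable())      # True: no manager configured
    gSvcMgr = StubSvcMgr(stable=False)
    print(is_service_stable())      # False: manager reports the service as unstable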
@@ -1263,7 +1268,7 @@ class Task():
             return True
         elif msg.find("duplicated column names") != -1: # also alter table tag issues
             return True
-        elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ...
+        elif not self._isServiceStable(): # We are managing service, and ...
             Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg))
             return True
 
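Shown in isolation with hypothetical inputs, the effect of the one-line change above: an error is tolerated whenever the managed service is not stable, without repeating the gSvcMgr check at the call site. This is a condensed illustration, not the real method.

    # Condensed, illustrative version of the acceptance check.
    def is_err_acceptable(errno, msg, service_stable):
        if "duplicated column names" in msg:    # known benign case kept from the original
            return True
        if not service_stable:                  # service starting/stopping: tolerate it
            print("Ignoring error during service start/stop: errno={}, msg={}".format(errno, msg))
            return True
        return False

    print(is_err_acceptable(0x0B, "some transient failure", service_stable=False))  # True
    print(is_err_acceptable(0x0B, "some transient failure", service_stable=True))   # False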
@@ -1641,15 +1646,39 @@ class TaskReadData(StateTransitionTask):
     def canBeginFrom(cls, state: AnyState):
         return state.canReadData()
 
+    # def _canRestartService(self):
+    #     if not gSvcMgr:
+    #         return True # always
+    #     return gSvcMgr.isActive() # only if it's running TODO: race condition here
+
     def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
         sTable = self._db.getFixedSuperTable()
 
-        # 1 in 5 chance, simulate a broken connection.
-        if random.randrange(5) == 0: # TODO: break connection in all situations
-            wt.getDbConn().close()
-            wt.getDbConn().open()
-            print("_r", end="", flush=True)
+        # 1 in 5 chance, simulate a broken connection, only if service stable (not restarting)
+        if random.randrange(20)==0: # and self._canRestartService(): # TODO: break connection in all situations
+            # Logging.info("Attempting to reconnect to server") # TODO: change to DEBUG
+            Progress.emit(Progress.SERVICE_RECONNECT_START)
+            try:
+                wt.getDbConn().close()
+                wt.getDbConn().open()
+            except ConnectionError as err: # may fail
+                if not gSvcMgr:
+                    Logging.error("Failed to reconnect in client-only mode")
+                    raise # Not OK if we are running in client-only mode
+                if gSvcMgr.isRunning(): # may have race conditon, but low prob, due to
+                    Logging.error("Failed to reconnect when managed server is running")
+                    raise # Not OK if we are running normally
+
+                Progress.emit(Progress.SERVICE_RECONNECT_FAILURE)
+                # Logging.info("Ignoring DB reconnect error")
+
+            # print("_r", end="", flush=True)
+            Progress.emit(Progress.SERVICE_RECONNECT_SUCCESS)
+            # The above might have taken a lot of time, service might be running
+            # by now, causing error below to be incorrectly handled due to timing issue
+            return # TODO: fix server restart status race condtion
+
 
         dbc = wt.getDbConn()
         dbName = self._db.getName()
         for rTbName in sTable.getRegTables(dbc, dbName): # regular tables
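A stripped-down sketch of the reconnect drill the rewritten block performs, using a stand-in connection object; the real task additionally emits the Progress tokens and consults gSvcMgr as shown above.

    # Illustrative only: simulate an occasional forced reconnect against a flaky server.
    import random

    class FlakyConn:
        def close(self):
            pass
        def open(self):
            if random.randrange(3) == 0:        # pretend the server is briefly unreachable
                raise ConnectionError("server not ready")

    def maybe_reconnect(conn, client_only):
        if random.randrange(20) != 0:           # 1-in-20 chance, as in the new code
            return "skipped"
        try:
            conn.close()
            conn.open()
            return "reconnected"
        except ConnectionError:
            if client_only:
                raise                           # nobody will restart the server for us
            return "failed-but-tolerated"       # a managed server may just be restarting

    print(maybe_reconnect(FlakyConn(), client_only=False))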
@@ -163,11 +163,17 @@ class Progress:
     BEGIN_THREAD_STEP = 1
     END_THREAD_STEP = 2
     SERVICE_HEART_BEAT= 3
+    SERVICE_RECONNECT_START = 4
+    SERVICE_RECONNECT_SUCCESS = 5
+    SERVICE_RECONNECT_FAILURE = 6
     tokens = {
         STEP_BOUNDARY: '.',
         BEGIN_THREAD_STEP: '[',
         END_THREAD_STEP: '] ',
-        SERVICE_HEART_BEAT: '.Y.'
+        SERVICE_HEART_BEAT: '.Y.',
+        SERVICE_RECONNECT_START: '<r.',
+        SERVICE_RECONNECT_SUCCESS: '.r>',
+        SERVICE_RECONNECT_FAILURE: '.xr>',
     }
 
     @classmethod
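The emit() body itself is outside this hunk; assuming it simply prints the mapped token (which matches how the single-character progress markers appear on the console), a minimal shape would be:

    # Assumed shape of Progress.emit(); the real implementation may differ.
    class Progress:
        SERVICE_RECONNECT_START = 4
        SERVICE_RECONNECT_SUCCESS = 5
        SERVICE_RECONNECT_FAILURE = 6
        tokens = {
            SERVICE_RECONNECT_START: '<r.',
            SERVICE_RECONNECT_SUCCESS: '.r>',
            SERVICE_RECONNECT_FAILURE: '.xr>',
        }

        @classmethod
        def emit(cls, event):
            print(cls.tokens[event], end="", flush=True)

    Progress.emit(Progress.SERVICE_RECONNECT_START)    # prints "<r."
    Progress.emit(Progress.SERVICE_RECONNECT_SUCCESS)  # prints ".r>"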
@@ -280,16 +280,18 @@ class TdeSubProcess:
         # process still alive, let's interrupt it
         print("Terminate running process, send SIG_INT and wait...")
         # sub process should end, then IPC queue should end, causing IO thread to end
-        self.subProcess.send_signal(signal.SIGINT)
+        # sig = signal.SIGINT
+        sig = signal.SIGKILL
+        self.subProcess.send_signal(sig) # SIGNINT or SIGKILL
         self.subProcess.wait(20)
         retCode = self.subProcess.returncode # should always be there
         # May throw subprocess.TimeoutExpired exception above, therefore
         # The process is guranteed to have ended by now
         self.subProcess = None
         if retCode != 0: # != (- signal.SIGINT):
-            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG_INT, retCode={}".format(retCode))
+            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(sig, retCode))
         else:
-            Logging.info("TSP.stop(): sub proc successfully terminated with SIG_INT")
+            Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(sig))
         return - retCode
 
 class ServiceManager:
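One consequence of switching from SIGINT to SIGKILL: on POSIX, a child killed by signal N reports a returncode of -N, so retCode is -9 here, the retCode != 0 branch logs an error even when the kill succeeded, and stop() ultimately returns 9. A small sketch of that behavior:

    # Demonstrates the return code a SIGKILL'ed child reports (POSIX only).
    import signal
    import subprocess
    import sys

    child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(30)"])
    child.send_signal(signal.SIGKILL)   # what the updated stop() now sends
    child.wait(20)
    print(child.returncode)             # -9: negative signal number
    print(-child.returncode)            # 9: what stop() ultimately returns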
@@ -395,6 +397,13 @@ class ServiceManager:
                 return True
         return False
 
+    def isRunning(self):
+        for ti in self._tInsts:
+            if not ti.getStatus().isRunning():
+                return False
+        return True
+
+
     # def isRestarting(self):
     #     """
     #     Determine if the service/cluster is being "restarted", i.e., at least
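isRunning() reports True only when every tracked instance is up; the loop is equivalent to an all() over the instances. A minimal sketch with stand-in classes:

    # Stand-in classes illustrating the all-instances-running check.
    class StubStatus:
        def __init__(self, running):
            self._running = running
        def isRunning(self):
            return self._running

    class StubInstance:
        def __init__(self, running):
            self._status = StubStatus(running)
        def getStatus(self):
            return self._status

    def is_running(instances):
        return all(ti.getStatus().isRunning() for ti in instances)

    print(is_running([StubInstance(True), StubInstance(True)]))    # True
    print(is_running([StubInstance(True), StubInstance(False)]))   # False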