Enhanced crash_gen to use SIG_KILL instead of SIG_INT when restarting services

This commit is contained in:
Steven Li 2020-10-28 06:38:41 +00:00
parent 4b4e842204
commit a0b83d4754
2 changed files with 43 additions and 9 deletions

View File

@ -1226,6 +1226,11 @@ class Task():
"To be implemeted by child classes, class name: {}".format( "To be implemeted by child classes, class name: {}".format(
self.__class__.__name__)) self.__class__.__name__))
def _isServiceStable(self):
    """Report whether the service can be treated as stable.

    When no service manager is in place (``gSvcMgr`` is falsy) the service
    is not run from here, so it is assumed to be stable. Otherwise the
    answer is whatever ``gSvcMgr.isStable()`` returns.
    """
    return gSvcMgr.isStable() if gSvcMgr else True
def _isErrAcceptable(self, errno, msg): def _isErrAcceptable(self, errno, msg):
if errno in [ if errno in [
0x05, # TSDB_CODE_RPC_NOT_READY 0x05, # TSDB_CODE_RPC_NOT_READY
@ -1263,7 +1268,7 @@ class Task():
return True return True
elif msg.find("duplicated column names") != -1: # also alter table tag issues elif msg.find("duplicated column names") != -1: # also alter table tag issues
return True return True
elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ... elif not self._isServiceStable(): # We are managing service, and ...
Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg)) Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg))
return True return True
@ -1641,14 +1646,34 @@ class TaskReadData(StateTransitionTask):
def canBeginFrom(cls, state: AnyState): def canBeginFrom(cls, state: AnyState):
return state.canReadData() return state.canReadData()
# def _canRestartService(self):
# if not gSvcMgr:
# return True # always
# return gSvcMgr.isActive() # only if it's running TODO: race condition here
def _executeInternal(self, te: TaskExecutor, wt: WorkerThread): def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
sTable = self._db.getFixedSuperTable() sTable = self._db.getFixedSuperTable()
# 1 in 5 chance, simulate a broken connection. # 1 in 5 chance, simulate a broken connection, only if service stable (not restarting)
if random.randrange(5) == 0: # TODO: break connection in all situations if random.randrange(20)==0: # and self._canRestartService(): # TODO: break connection in all situations
wt.getDbConn().close() Logging.info("Attempting to reconnect to server") # TODO: change to DEBUG
wt.getDbConn().open() try:
wt.getDbConn().close()
wt.getDbConn().open()
except ConnectionError as err: # may fail
if not gSvcMgr:
Logging.error("Failed to reconnect in client-only mode")
raise # Not OK if we are running in client-only mode
if gSvcMgr.isRunning(): # may have race conditon, but low prob, due to
Logging.error("Failed to reconnect when managed server is running")
raise # Not OK if we are running normally
Logging.info("Ignoring DB reconnect error")
print("_r", end="", flush=True) print("_r", end="", flush=True)
# The above might have taken a lot of time, service might be running
# by now, causing error below to be incorrectly handled due to timing issue
return # TODO: fix server restart status race condtion
dbc = wt.getDbConn() dbc = wt.getDbConn()
dbName = self._db.getName() dbName = self._db.getName()

View File

@ -280,16 +280,18 @@ class TdeSubProcess:
# process still alive, let's interrupt it # process still alive, let's interrupt it
print("Terminate running process, send SIG_INT and wait...") print("Terminate running process, send SIG_INT and wait...")
# sub process should end, then IPC queue should end, causing IO thread to end # sub process should end, then IPC queue should end, causing IO thread to end
self.subProcess.send_signal(signal.SIGINT) # sig = signal.SIGINT
sig = signal.SIGKILL
self.subProcess.send_signal(sig) # SIGNINT or SIGKILL
self.subProcess.wait(20) self.subProcess.wait(20)
retCode = self.subProcess.returncode # should always be there retCode = self.subProcess.returncode # should always be there
# May throw subprocess.TimeoutExpired exception above, therefore # May throw subprocess.TimeoutExpired exception above, therefore
# The process is guranteed to have ended by now # The process is guranteed to have ended by now
self.subProcess = None self.subProcess = None
if retCode != 0: # != (- signal.SIGINT): if retCode != 0: # != (- signal.SIGINT):
Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG_INT, retCode={}".format(retCode)) Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(sig, retCode))
else: else:
Logging.info("TSP.stop(): sub proc successfully terminated with SIG_INT") Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(sig))
return - retCode return - retCode
class ServiceManager: class ServiceManager:
@ -395,6 +397,13 @@ class ServiceManager:
return True return True
return False return False
def isRunning(self):
    """Return True only if every instance in ``self._tInsts`` is running.

    Each instance is asked for its status via ``getStatus()`` and that
    status for ``isRunning()``. Evaluation stops at the first instance
    that is not running. With no instances at all the result is True.
    """
    return all(inst.getStatus().isRunning() for inst in self._tInsts)
# def isRestarting(self): # def isRestarting(self):
# """ # """
# Determine if the service/cluster is being "restarted", i.e., at least # Determine if the service/cluster is being "restarted", i.e., at least