Merge pull request #4028 from taosdata/feature/crash_gen
Enhanced crash_gen tool
Commit: c380e8f229
@@ -70,10 +70,12 @@ if [[ $1 == '--valgrind' ]]; then
     $CRASH_GEN_EXEC $@ > $VALGRIND_OUT 2> $VALGRIND_ERR
 elif [[ $1 == '--helgrind' ]]; then
     shift
+    HELGRIND_OUT=helgrind.out
+    HELGRIND_ERR=helgrind.err
     valgrind \
         --tool=helgrind \
         $PYTHON_EXEC \
-        $CRASH_GEN_EXEC $@
+        $CRASH_GEN_EXEC $@ > $HELGRIND_OUT 2> $HELGRIND_ERR
 else
     $PYTHON_EXEC $CRASH_GEN_EXEC $@
 fi
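Note: with the --helgrind branch now redirecting output to helgrind.out and helgrind.err, matching the existing --valgrind branch, the race report can be inspected after a run instead of scrolling past on the console. Below is a minimal post-run check sketch; the helper name is illustrative and not part of this PR, and it assumes only the standard valgrind/helgrind "ERROR SUMMARY:" line format.

```python
# Hypothetical post-run check (not part of this PR): scan helgrind.err,
# which crash_gen.sh now produces under --helgrind, for reported errors.
from pathlib import Path

def helgrind_reported_errors(err_file: str = "helgrind.err") -> int:
    """Return the error count from helgrind's 'ERROR SUMMARY:' line, or 0 if absent."""
    path = Path(err_file)
    if not path.exists():
        return 0
    for line in path.read_text(errors="replace").splitlines():
        if "ERROR SUMMARY:" in line:
            # Format: "==12345== ERROR SUMMARY: 2 errors from 2 contexts ..."
            count = line.split("ERROR SUMMARY:")[1].strip().split()[0]
            return int(count) if count.isdigit() else 0
    return 0

if __name__ == "__main__":
    print("helgrind reported {} error(s)".format(helgrind_reported_errors()))
```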
@@ -1226,6 +1226,11 @@ class Task():
             "To be implemeted by child classes, class name: {}".format(
                 self.__class__.__name__))

+    def _isServiceStable(self):
+        if not gSvcMgr:
+            return True # we don't run service, so let's assume it's stable
+        return gSvcMgr.isStable() # otherwise let's examine the service
+
     def _isErrAcceptable(self, errno, msg):
         if errno in [
             0x05, # TSDB_CODE_RPC_NOT_READY
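Note: the new Task._isServiceStable() treats "no managed service" as stable and otherwise defers to gSvcMgr; the next hunk switches _isErrAcceptable() to this helper so errors raised while a managed service is starting or stopping are tolerated. A self-contained sketch of the same guard pattern, using a stub in place of the real gSvcMgr and an arbitrary illustrative error code:

```python
class SvcMgrStub:
    """Stands in for the crash_gen service manager (gSvcMgr); illustrative only."""
    def __init__(self, stable: bool):
        self._stable = stable
    def isStable(self) -> bool:
        return self._stable

gSvcMgr = SvcMgrStub(stable=False)  # pretend the managed service is mid-restart

def is_service_stable() -> bool:
    if not gSvcMgr:
        return True              # not managing a service, assume it's stable
    return gSvcMgr.isStable()    # otherwise ask the service manager

def is_err_acceptable(errno: int, msg: str) -> bool:
    if not is_service_stable():
        print("Ignoring error when service starting/stopping: "
              "errno = {}, msg = {}".format(errno, msg))
        return True
    return False

print(is_err_acceptable(0x99, "query failed"))  # True while the service restarts
```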
@@ -1263,7 +1268,7 @@ class Task():
             return True
         elif msg.find("duplicated column names") != -1: # also alter table tag issues
             return True
-        elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ...
+        elif not self._isServiceStable(): # We are managing service, and ...
             Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg))
             return True

@@ -1641,14 +1646,38 @@ class TaskReadData(StateTransitionTask):
     def canBeginFrom(cls, state: AnyState):
         return state.canReadData()

+    # def _canRestartService(self):
+    #     if not gSvcMgr:
+    #         return True # always
+    #     return gSvcMgr.isActive() # only if it's running TODO: race condition here
+
     def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
         sTable = self._db.getFixedSuperTable()

-        # 1 in 5 chance, simulate a broken connection.
-        if random.randrange(5) == 0: # TODO: break connection in all situations
+        # 1 in 5 chance, simulate a broken connection, only if service stable (not restarting)
+        if random.randrange(20)==0: # and self._canRestartService(): # TODO: break connection in all situations
+            # Logging.info("Attempting to reconnect to server") # TODO: change to DEBUG
+            Progress.emit(Progress.SERVICE_RECONNECT_START)
+            try:
                 wt.getDbConn().close()
                 wt.getDbConn().open()
-            print("_r", end="", flush=True)
+            except ConnectionError as err: # may fail
+                if not gSvcMgr:
+                    Logging.error("Failed to reconnect in client-only mode")
+                    raise # Not OK if we are running in client-only mode
+                if gSvcMgr.isRunning(): # may have race conditon, but low prob, due to
+                    Logging.error("Failed to reconnect when managed server is running")
+                    raise # Not OK if we are running normally
+
+                Progress.emit(Progress.SERVICE_RECONNECT_FAILURE)
+                # Logging.info("Ignoring DB reconnect error")
+
+            # print("_r", end="", flush=True)
+            Progress.emit(Progress.SERVICE_RECONNECT_SUCCESS)
+            # The above might have taken a lot of time, service might be running
+            # by now, causing error below to be incorrectly handled due to timing issue
+            return # TODO: fix server restart status race condtion
+

         dbc = wt.getDbConn()
         dbName = self._db.getName()
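Note: the reworked read task lowers the simulated-disconnect probability from 1-in-5 to 1-in-20, brackets the attempt with reconnect progress markers, and, when the reopen fails, raises in client-only mode or when the managed server claims to be running; otherwise it swallows the failure because a restart is presumably in flight. A stripped-down sketch of that decision flow, with stub connection and service-manager objects standing in for the crash_gen classes:

```python
import random

class DbConnStub:
    """Stands in for the worker thread's DB connection; can fail to reopen on demand."""
    def __init__(self, reopen_ok: bool = True):
        self.reopen_ok = reopen_ok
    def close(self):
        pass
    def open(self):
        if not self.reopen_ok:
            raise ConnectionError("simulated reconnect failure")

class SvcMgrStub:
    """Stands in for gSvcMgr; illustrative only."""
    def __init__(self, running: bool):
        self._running = running
    def isRunning(self) -> bool:
        return self._running

def maybe_simulate_reconnect(db_conn, svc_mgr) -> None:
    if random.randrange(20) != 0:        # 1-in-20 chance, as in the diff
        return
    try:
        db_conn.close()
        db_conn.open()
    except ConnectionError:
        if svc_mgr is None:
            raise                        # client-only mode: failure is fatal
        if svc_mgr.isRunning():
            raise                        # server is supposedly up: a real failure
        return                           # server is restarting: tolerate it

maybe_simulate_reconnect(DbConnStub(reopen_ok=False), SvcMgrStub(running=False))
```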
@@ -163,11 +163,17 @@ class Progress:
     BEGIN_THREAD_STEP = 1
     END_THREAD_STEP = 2
     SERVICE_HEART_BEAT= 3
+    SERVICE_RECONNECT_START = 4
+    SERVICE_RECONNECT_SUCCESS = 5
+    SERVICE_RECONNECT_FAILURE = 6
     tokens = {
         STEP_BOUNDARY: '.',
         BEGIN_THREAD_STEP: '[',
         END_THREAD_STEP: '] ',
-        SERVICE_HEART_BEAT: '.Y.'
+        SERVICE_HEART_BEAT: '.Y.',
+        SERVICE_RECONNECT_START: '<r.',
+        SERVICE_RECONNECT_SUCCESS: '.r>',
+        SERVICE_RECONNECT_FAILURE: '.xr>',
     }

     @classmethod
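Note: three new status codes and their console tokens ('<r.', '.r>', '.xr>') bracket reconnect attempts in the progress strip. The emit() implementation is not shown in this diff; below is a minimal sketch of how a token map of this shape is typically consumed, under the assumption that emit() simply prints the token inline:

```python
class Progress:
    STEP_BOUNDARY = 0
    BEGIN_THREAD_STEP = 1
    END_THREAD_STEP = 2
    SERVICE_HEART_BEAT = 3
    SERVICE_RECONNECT_START = 4
    SERVICE_RECONNECT_SUCCESS = 5
    SERVICE_RECONNECT_FAILURE = 6
    tokens = {
        STEP_BOUNDARY: '.',
        BEGIN_THREAD_STEP: '[',
        END_THREAD_STEP: '] ',
        SERVICE_HEART_BEAT: '.Y.',
        SERVICE_RECONNECT_START: '<r.',
        SERVICE_RECONNECT_SUCCESS: '.r>',
        SERVICE_RECONNECT_FAILURE: '.xr>',
    }

    @classmethod
    def emit(cls, status):
        # Print the compact token without a newline so the whole run
        # produces a single progress strip on the console.
        print(cls.tokens[status], end="", flush=True)

Progress.emit(Progress.SERVICE_RECONNECT_START)
Progress.emit(Progress.SERVICE_RECONNECT_SUCCESS)  # prints "<r..r>"
```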
@@ -280,16 +280,18 @@ class TdeSubProcess:
         # process still alive, let's interrupt it
         print("Terminate running process, send SIG_INT and wait...")
         # sub process should end, then IPC queue should end, causing IO thread to end
-        self.subProcess.send_signal(signal.SIGINT)
+        # sig = signal.SIGINT
+        sig = signal.SIGKILL
+        self.subProcess.send_signal(sig) # SIGNINT or SIGKILL
         self.subProcess.wait(20)
         retCode = self.subProcess.returncode # should always be there
         # May throw subprocess.TimeoutExpired exception above, therefore
         # The process is guranteed to have ended by now
         self.subProcess = None
         if retCode != 0: # != (- signal.SIGINT):
-            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG_INT, retCode={}".format(retCode))
+            Logging.error("TSP.stop(): Failed to stop sub proc properly w/ SIG {}, retCode={}".format(sig, retCode))
         else:
-            Logging.info("TSP.stop(): sub proc successfully terminated with SIG_INT")
+            Logging.info("TSP.stop(): sub proc successfully terminated with SIG {}".format(sig))
         return - retCode

 class ServiceManager:
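Note: stop() now sends SIGKILL instead of SIGINT (the SIGINT line is kept commented out), waits up to 20 seconds, and logs which signal was used. A self-contained sketch of the same send-signal-then-wait pattern using the standard subprocess module; stop_subprocess() is an illustrative name, not the crash_gen method:

```python
import signal
import subprocess

def stop_subprocess(proc: subprocess.Popen, timeout: float = 20.0) -> int:
    """Send SIGKILL to a running child, wait for it, and return the negated return code."""
    sig = signal.SIGKILL            # the diff switches from SIGINT to SIGKILL
    proc.send_signal(sig)
    proc.wait(timeout)              # raises subprocess.TimeoutExpired if it never exits
    ret = proc.returncode           # negative of the killing signal, e.g. -9
    if ret != 0:
        print("child did not exit cleanly with {}, retCode={}".format(sig, ret))
    else:
        print("child terminated cleanly with {}".format(sig))
    return -ret

child = subprocess.Popen(["sleep", "60"])
print(stop_subprocess(child))       # typically prints 9 (killed by SIGKILL)
```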
@@ -395,6 +397,13 @@ class ServiceManager:
             return True
         return False

+    def isRunning(self):
+        for ti in self._tInsts:
+            if not ti.getStatus().isRunning():
+                return False
+        return True
+
+
     # def isRestarting(self):
     #     """
     #     Determine if the service/cluster is being "restarted", i.e., at least
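Note: the new ServiceManager.isRunning() reports True only when every managed instance reports a running status, which the reconnect path above uses to decide whether a failed reconnect is a real error. A small sketch of that all-instances check, with stub instances in place of the crash_gen instance objects:

```python
class InstanceStub:
    """Stands in for a managed service instance; illustrative only."""
    def __init__(self, running: bool):
        self._running = running
    def getStatus(self):
        return self
    def isRunning(self) -> bool:
        return self._running

def is_running(instances) -> bool:
    # True only if every instance in the (possibly multi-node) cluster is up.
    return all(inst.getStatus().isRunning() for inst in instances)

print(is_running([InstanceStub(True), InstanceStub(True)]))   # True
print(is_running([InstanceStub(True), InstanceStub(False)]))  # False
```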