Enhanced crash_gen tool to report better error for deadlocks
This commit is contained in:
parent
4d8fc705d8
commit
203e54c068
|
@ -371,7 +371,9 @@ class ThreadCoordinator:
|
||||||
if isinstance(err, CrashGenError): # our own transition failure
|
if isinstance(err, CrashGenError): # our own transition failure
|
||||||
Logging.info("State transition error")
|
Logging.info("State transition error")
|
||||||
# TODO: saw an error here once, let's print out stack info for err?
|
# TODO: saw an error here once, let's print out stack info for err?
|
||||||
traceback.print_stack()
|
traceback.print_stack() # Stack frame to here.
|
||||||
|
Logging.info("Caused by:")
|
||||||
|
traceback.print_exception(*sys.exc_info()) # Ref: https://www.geeksforgeeks.org/how-to-print-exception-stack-trace-in-python/
|
||||||
transitionFailed = True
|
transitionFailed = True
|
||||||
self._te = None # Not running any more
|
self._te = None # Not running any more
|
||||||
self._execStats.registerFailure("State transition error: {}".format(err))
|
self._execStats.registerFailure("State transition error: {}".format(err))
|
||||||
|
@ -741,7 +743,8 @@ class AnyState:
|
||||||
sCnt += 1
|
sCnt += 1
|
||||||
if (sCnt >= 2):
|
if (sCnt >= 2):
|
||||||
raise CrashGenError(
|
raise CrashGenError(
|
||||||
"Unexpected more than 1 success with task: {}, in task set: {}".format(
|
"Unexpected more than 1 success at state: {}, with task: {}, in task set: {}".format(
|
||||||
|
self.__class__.__name__,
|
||||||
cls.__name__, # verified just now that isinstance(task, cls)
|
cls.__name__, # verified just now that isinstance(task, cls)
|
||||||
[c.__class__.__name__ for c in tasks]
|
[c.__class__.__name__ for c in tasks]
|
||||||
))
|
))
|
||||||
|
@ -756,8 +759,11 @@ class AnyState:
|
||||||
if task.isSuccess():
|
if task.isSuccess():
|
||||||
sCnt += 1
|
sCnt += 1
|
||||||
if (exists and sCnt <= 0):
|
if (exists and sCnt <= 0):
|
||||||
raise CrashGenError("Unexpected zero success for task type: {}, from tasks: {}"
|
raise CrashGenError("Unexpected zero success at state: {}, with task: {}, in task set: {}".format(
|
||||||
.format(cls, tasks))
|
self.__class__.__name__,
|
||||||
|
cls.__name__, # verified just now that isinstance(task, cls)
|
||||||
|
[c.__class__.__name__ for c in tasks]
|
||||||
|
))
|
||||||
|
|
||||||
def assertNoTask(self, tasks, cls):
|
def assertNoTask(self, tasks, cls):
|
||||||
for task in tasks:
|
for task in tasks:
|
||||||
|
@ -809,8 +815,6 @@ class StateEmpty(AnyState):
|
||||||
]
|
]
|
||||||
|
|
||||||
def verifyTasksToState(self, tasks, newState):
|
def verifyTasksToState(self, tasks, newState):
|
||||||
if Config.getConfig().ignore_errors: # if we are asked to ignore certain errors, let's not verify CreateDB success.
|
|
||||||
return
|
|
||||||
if (self.hasSuccess(tasks, TaskCreateDb)
|
if (self.hasSuccess(tasks, TaskCreateDb)
|
||||||
): # at EMPTY, if there's succes in creating DB
|
): # at EMPTY, if there's succes in creating DB
|
||||||
if (not self.hasTask(tasks, TaskDropDb)): # and no drop_db tasks
|
if (not self.hasTask(tasks, TaskDropDb)): # and no drop_db tasks
|
||||||
|
@ -995,16 +999,17 @@ class StateMechine:
|
||||||
dbc.execute("show dnodes")
|
dbc.execute("show dnodes")
|
||||||
|
|
||||||
# Generic Checks, first based on the start state
|
# Generic Checks, first based on the start state
|
||||||
if self._curState.canCreateDb():
|
if not Config.getConfig().ignore_errors: # verify state, only if we are asked not to ignore certain errors.
|
||||||
self._curState.assertIfExistThenSuccess(tasks, TaskCreateDb)
|
if self._curState.canCreateDb():
|
||||||
# self.assertAtMostOneSuccess(tasks, CreateDbTask) # not really, in
|
self._curState.assertIfExistThenSuccess(tasks, TaskCreateDb)
|
||||||
# case of multiple creation and drops
|
# self.assertAtMostOneSuccess(tasks, CreateDbTask) # not really, in
|
||||||
|
# case of multiple creation and drops
|
||||||
|
|
||||||
if self._curState.canDropDb():
|
if self._curState.canDropDb():
|
||||||
if gSvcMgr == None: # only if we are running as client-only
|
if gSvcMgr == None: # only if we are running as client-only
|
||||||
self._curState.assertIfExistThenSuccess(tasks, TaskDropDb)
|
self._curState.assertIfExistThenSuccess(tasks, TaskDropDb)
|
||||||
# self.assertAtMostOneSuccess(tasks, DropDbTask) # not really in
|
# self.assertAtMostOneSuccess(tasks, DropDbTask) # not really in
|
||||||
# case of drop-create-drop
|
# case of drop-create-drop
|
||||||
|
|
||||||
# if self._state.canCreateFixedTable():
|
# if self._state.canCreateFixedTable():
|
||||||
# self.assertIfExistThenSuccess(tasks, CreateFixedTableTask) # Not true, DB may be dropped
|
# self.assertIfExistThenSuccess(tasks, CreateFixedTableTask) # Not true, DB may be dropped
|
||||||
|
@ -1026,7 +1031,8 @@ class StateMechine:
|
||||||
newState = self._findCurrentState(dbc)
|
newState = self._findCurrentState(dbc)
|
||||||
Logging.debug("[STT] New DB state determined: {}".format(newState))
|
Logging.debug("[STT] New DB state determined: {}".format(newState))
|
||||||
# can old state move to new state through the tasks?
|
# can old state move to new state through the tasks?
|
||||||
self._curState.verifyTasksToState(tasks, newState)
|
if not Config.getConfig().ignore_errors: # verify state, only if we are asked not to ignore certain errors.
|
||||||
|
self._curState.verifyTasksToState(tasks, newState)
|
||||||
self._curState = newState
|
self._curState = newState
|
||||||
|
|
||||||
def pickTaskType(self):
|
def pickTaskType(self):
|
||||||
|
@ -2231,16 +2237,14 @@ class TaskAddData(StateTransitionTask):
|
||||||
class ThreadStacks: # stack info for all threads
|
class ThreadStacks: # stack info for all threads
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._allStacks = {}
|
self._allStacks = {}
|
||||||
allFrames = sys._current_frames() # All current stack frames
|
allFrames = sys._current_frames() # All current stack frames, keyed with "ident"
|
||||||
for th in threading.enumerate(): # For each thread
|
for th in threading.enumerate(): # For each thread
|
||||||
if th.ident is None:
|
stack = traceback.extract_stack(allFrames[th.ident]) #type: ignore # Get stack for a thread
|
||||||
continue
|
shortTid = th.native_id % 10000 #type: ignore
|
||||||
stack = traceback.extract_stack(allFrames[th.ident]) # Get stack for a thread
|
|
||||||
shortTid = th.ident % 10000
|
|
||||||
self._allStacks[shortTid] = stack # Was using th.native_id
|
self._allStacks[shortTid] = stack # Was using th.native_id
|
||||||
|
|
||||||
def print(self, filteredEndName = None, filterInternal = False):
|
def print(self, filteredEndName = None, filterInternal = False):
|
||||||
for tIdent, stack in self._allStacks.items(): # for each thread, stack frames top to bottom
|
for shortTid, stack in self._allStacks.items(): # for each thread, stack frames top to bottom
|
||||||
lastFrame = stack[-1]
|
lastFrame = stack[-1]
|
||||||
if filteredEndName: # we need to filter out stacks that match this name
|
if filteredEndName: # we need to filter out stacks that match this name
|
||||||
if lastFrame.name == filteredEndName : # end did not match
|
if lastFrame.name == filteredEndName : # end did not match
|
||||||
|
@ -2252,7 +2256,9 @@ class ThreadStacks: # stack info for all threads
|
||||||
'__init__']: # the thread that extracted the stack
|
'__init__']: # the thread that extracted the stack
|
||||||
continue # ignore
|
continue # ignore
|
||||||
# Now print
|
# Now print
|
||||||
print("\n<----- Thread Info for LWP/ID: {} (most recent call last) <-----".format(tIdent))
|
print("\n<----- Thread Info for LWP/ID: {} (most recent call last) <-----".format(shortTid))
|
||||||
|
lastSqlForThread = DbConn.fetchSqlForThread(shortTid)
|
||||||
|
print("Last SQL statement attempted from thread {} is: {}".format(shortTid, lastSqlForThread))
|
||||||
stackFrame = 0
|
stackFrame = 0
|
||||||
for frame in stack: # was using: reversed(stack)
|
for frame in stack: # was using: reversed(stack)
|
||||||
# print(frame)
|
# print(frame)
|
||||||
|
|
|
@ -27,6 +27,26 @@ class DbConn:
|
||||||
TYPE_REST = "rest-api"
|
TYPE_REST = "rest-api"
|
||||||
TYPE_INVALID = "invalid"
|
TYPE_INVALID = "invalid"
|
||||||
|
|
||||||
|
# class variables
|
||||||
|
lastSqlFromThreads : dict[int, str] = {} # stored by thread id, obtained from threading.current_thread().ident%10000
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def saveSqlForCurrentThread(cls, sql: str):
|
||||||
|
'''
|
||||||
|
Let us save the last SQL statement on a per-thread basis, so that when later we
|
||||||
|
run into a dead-lock situation, we can pick out the deadlocked thread, and use
|
||||||
|
that information to find what what SQL statement is stuck.
|
||||||
|
'''
|
||||||
|
th = threading.current_thread()
|
||||||
|
shortTid = th.native_id % 10000 #type: ignore
|
||||||
|
cls.lastSqlFromThreads[shortTid] = sql # Save this for later
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def fetchSqlForThread(cls, shortTid : int) -> str :
|
||||||
|
if shortTid not in cls.lastSqlFromThreads:
|
||||||
|
raise CrashGenError("No last-attempted-SQL found for thread id: {}".format(shortTid))
|
||||||
|
return cls.lastSqlFromThreads[shortTid]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create(cls, connType, dbTarget):
|
def create(cls, connType, dbTarget):
|
||||||
if connType == cls.TYPE_NATIVE:
|
if connType == cls.TYPE_NATIVE:
|
||||||
|
@ -163,6 +183,7 @@ class DbConnRest(DbConn):
|
||||||
|
|
||||||
def _doSql(self, sql):
|
def _doSql(self, sql):
|
||||||
self._lastSql = sql # remember this, last SQL attempted
|
self._lastSql = sql # remember this, last SQL attempted
|
||||||
|
self.saveSqlForCurrentThread(sql) # Save in global structure too. #TODO: combine with above
|
||||||
try:
|
try:
|
||||||
r = requests.post(self._url,
|
r = requests.post(self._url,
|
||||||
data = sql,
|
data = sql,
|
||||||
|
@ -392,6 +413,7 @@ class DbConnNative(DbConn):
|
||||||
"Cannot exec SQL unless db connection is open", CrashGenError.DB_CONNECTION_NOT_OPEN)
|
"Cannot exec SQL unless db connection is open", CrashGenError.DB_CONNECTION_NOT_OPEN)
|
||||||
Logging.debug("[SQL] Executing SQL: {}".format(sql))
|
Logging.debug("[SQL] Executing SQL: {}".format(sql))
|
||||||
self._lastSql = sql
|
self._lastSql = sql
|
||||||
|
self.saveSqlForCurrentThread(sql) # Save in global structure too. #TODO: combine with above
|
||||||
nRows = self._tdSql.execute(sql)
|
nRows = self._tdSql.execute(sql)
|
||||||
cls = self.__class__
|
cls = self.__class__
|
||||||
cls.totalRequests += 1
|
cls.totalRequests += 1
|
||||||
|
@ -407,6 +429,7 @@ class DbConnNative(DbConn):
|
||||||
"Cannot query database until connection is open, restarting?", CrashGenError.DB_CONNECTION_NOT_OPEN)
|
"Cannot query database until connection is open, restarting?", CrashGenError.DB_CONNECTION_NOT_OPEN)
|
||||||
Logging.debug("[SQL] Executing SQL: {}".format(sql))
|
Logging.debug("[SQL] Executing SQL: {}".format(sql))
|
||||||
self._lastSql = sql
|
self._lastSql = sql
|
||||||
|
self.saveSqlForCurrentThread(sql) # Save in global structure too. #TODO: combine with above
|
||||||
nRows = self._tdSql.query(sql)
|
nRows = self._tdSql.query(sql)
|
||||||
cls = self.__class__
|
cls = self.__class__
|
||||||
cls.totalRequests += 1
|
cls.totalRequests += 1
|
||||||
|
|
Loading…
Reference in New Issue