From 752ef0cf1d0f4b32a0c5736928d44e517cea34c3 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 5 Sep 2024 17:41:36 +0800 Subject: [PATCH 1/2] refactor: add random error generator for stream trans. --- source/dnode/mnode/impl/src/mndStreamUtil.c | 70 +++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 3bb5617a9c..5de2a1e182 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -1858,4 +1858,74 @@ int32_t setTaskAttrInResBlock(SStreamObj *pStream, SStreamTask *pTask, SSDataBlo mError("error happens during build task attr result blocks, lino:%d, code:%s", lino, tstrerror(code)); } return code; +} + +uint32_t seed = 0; +static SRpcMsg createRpcMsg(STransAction* pAction, int64_t traceId, int64_t signature) { + SRpcMsg rpcMsg = {.msgType = pAction->msgType, .contLen = pAction->contLen, .info.ahandle = (void *)signature}; + rpcMsg.pCont = rpcMallocCont(pAction->contLen); + if (rpcMsg.pCont == NULL) { + return rpcMsg; + } + + rpcMsg.info.traceId.rootId = traceId; + rpcMsg.info.notFreeAhandle = 1; + + memcpy(rpcMsg.pCont, pAction->pCont, pAction->contLen); + return rpcMsg; +} + +void streamTransRandomErrorGen(STransAction *pAction, STrans *pTrans, int64_t signature) { + if ((pAction->msgType == TDMT_STREAM_TASK_UPDATE_CHKPT && pAction->id > 2) || + (pAction->msgType == TDMT_STREAM_CONSEN_CHKPT) || + (pAction->msgType == TDMT_VND_STREAM_CHECK_POINT_SOURCE && pAction->id > 2)) { + if (seed == 0) { + seed = taosGetTimestampSec(); + } + + uint32_t v = taosRandR(&seed); + int32_t choseItem = v % 5; + + if (choseItem == 0) { + // 1. one of update-checkpoint not send, restart and send it again + taosMsleep(5000); + if (pAction->msgType == TDMT_STREAM_TASK_UPDATE_CHKPT) { + mError( + "***sleep 5s and core dump, following tasks will not recv update-checkpoint info, so the checkpoint will " + "rollback***"); + ASSERT(0); + } else if (pAction->msgType == TDMT_STREAM_CONSEN_CHKPT) { // pAction->msgType == TDMT_STREAM_CONSEN_CHKPT + mError( + "***sleep 5s and core dump, following tasks will not recv consen-checkpoint info, so the tasks will " + "not started***"); + } else { // pAction->msgType == TDMT_VND_STREAM_CHECK_POINT_SOURCE + mError( + "***sleep 5s and core dump, following tasks will not recv checkpoint-source info, so the tasks will " + "started after restart***"); + ASSERT(0); + } + } else if (choseItem == 1) { + // 2. repeat send update chkpt msg + mError("***repeat send update-checkpoint/consensus/checkpoint trans msg 3times to vnode***"); + + mError("***repeat 1***"); + SRpcMsg rpcMsg1 = createRpcMsg(pAction, pTrans->mTraceId, signature); + int32_t code = tmsgSendReq(&pAction->epSet, &rpcMsg1); + + mError("***repeat 2***"); + SRpcMsg rpcMsg2 = createRpcMsg(pAction, pTrans->mTraceId, signature); + code = tmsgSendReq(&pAction->epSet, &rpcMsg2); + + mError("***repeat 3***"); + SRpcMsg rpcMsg3 = createRpcMsg(pAction, pTrans->mTraceId, signature); + code = tmsgSendReq(&pAction->epSet, &rpcMsg3); + } else if (choseItem == 2) { + // 3. sleep 40s and then send msg + mError("***idle for 30s, and then send msg***"); + taosMsleep(30000); + } else { + // do nothing + // mInfo("no error triggered"); + } + } } \ No newline at end of file From 9e33244209aa530441cc6a75177bf263a81a08fd Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 6 Sep 2024 16:06:30 +0800 Subject: [PATCH 2/2] fix(stream):remove assert. --- source/dnode/mnode/impl/src/mndStreamUtil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 5de2a1e182..c4e03e7951 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -1893,7 +1893,7 @@ void streamTransRandomErrorGen(STransAction *pAction, STrans *pTrans, int64_t si mError( "***sleep 5s and core dump, following tasks will not recv update-checkpoint info, so the checkpoint will " "rollback***"); - ASSERT(0); + exit(-1); } else if (pAction->msgType == TDMT_STREAM_CONSEN_CHKPT) { // pAction->msgType == TDMT_STREAM_CONSEN_CHKPT mError( "***sleep 5s and core dump, following tasks will not recv consen-checkpoint info, so the tasks will " @@ -1902,7 +1902,7 @@ void streamTransRandomErrorGen(STransAction *pAction, STrans *pTrans, int64_t si mError( "***sleep 5s and core dump, following tasks will not recv checkpoint-source info, so the tasks will " "started after restart***"); - ASSERT(0); + exit(-1); } } else if (choseItem == 1) { // 2. repeat send update chkpt msg