From 406e8777005e02a38b330e15fa818f192cba0998 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 6 Dec 2024 00:11:51 +0800 Subject: [PATCH] fix(stream): send chkpt-source rsp to mnd even if it is failed. --- source/dnode/vnode/src/tq/tq.c | 116 ++++++++++++--------------------- 1 file changed, 41 insertions(+), 75 deletions(-) diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index bd78f62cae..937bc6e268 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1108,91 +1108,76 @@ _OVER: return code; } +// always return success to mnode +//todo: handle failure of build and send msg to mnode +static void doSendChkptSourceRsp(SStreamCheckpointSourceReq* pReq, SRpcHandleInfo* pRpcInfo, int32_t code, + int32_t taskId) { + SRpcMsg rsp = {0}; + int32_t ret = streamTaskBuildCheckpointSourceRsp(pReq, pRpcInfo, &rsp, code); + if (ret) { // suppress the error in build checkpoint source rsp + tqError("s-task:0x%x failed to build checkpoint-source rsp, code:%s", taskId, tstrerror(ret)); + } + tmsgSendRsp(&rsp); // error occurs +} + // no matter what kinds of error happened, make sure the mnode will receive the success execution code. int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) { - int32_t vgId = TD_VID(pTq->pVnode); - SStreamMeta* pMeta = pTq->pStreamMeta; - char* msg = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead)); - int32_t len = pMsg->contLen - sizeof(SMsgHead); - int32_t code = 0; + int32_t vgId = TD_VID(pTq->pVnode); + SStreamMeta* pMeta = pTq->pStreamMeta; + char* msg = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead)); + int32_t len = pMsg->contLen - sizeof(SMsgHead); + int32_t code = 0; + SStreamCheckpointSourceReq req = {0}; + SDecoder decoder = {0}; + SStreamTask* pTask = NULL; + int64_t checkpointId = 0; // disable auto rsp to mnode pRsp->info.handle = NULL; - SStreamCheckpointSourceReq req = {0}; - SDecoder decoder; tDecoderInit(&decoder, (uint8_t*)msg, len); if (tDecodeStreamCheckpointSourceReq(&decoder, &req) < 0) { code = TSDB_CODE_MSG_DECODE_ERROR; tDecoderClear(&decoder); tqError("vgId:%d failed to decode checkpoint-source msg, code:%s", vgId, tstrerror(code)); - - SRpcMsg rsp = {0}; - int32_t ret = streamTaskBuildCheckpointSourceRsp(&req, &pMsg->info, &rsp, TSDB_CODE_SUCCESS); - if (ret) { // suppress the error in build checkpointsource rsp - tqError("s-task:0x%x failed to build checkpoint-source rsp, code:%s", req.taskId, tstrerror(code)); - } - - tmsgSendRsp(&rsp); // error occurs - return TSDB_CODE_SUCCESS; // always return success to mnode, todo: handle failure of build and send msg to mnode + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); + return TSDB_CODE_SUCCESS; // always return success to mnode, } + tDecoderClear(&decoder); if (!vnodeIsRoleLeader(pTq->pVnode)) { tqDebug("vgId:%d not leader, ignore checkpoint-source msg, s-task:0x%x", vgId, req.taskId); - SRpcMsg rsp = {0}; - int32_t ret = streamTaskBuildCheckpointSourceRsp(&req, &pMsg->info, &rsp, TSDB_CODE_SUCCESS); - if (ret) { // suppress the error in build checkpointsource rsp - tqError("s-task:0x%x failed to build checkpoint-source rsp, code:%s", req.taskId, tstrerror(code)); - } - - tmsgSendRsp(&rsp); // error occurs - return TSDB_CODE_SUCCESS; // always return success to mnode, todo: handle failure of build and send msg to mnode + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); + return TSDB_CODE_SUCCESS; // always return success to mnode } if (!pTq->pVnode->restored) { tqDebug("vgId:%d checkpoint-source msg received during restoring, checkpointId:%" PRId64 ", transId:%d s-task:0x%x ignore it", vgId, req.checkpointId, req.transId, req.taskId); - SRpcMsg rsp = {0}; - int32_t ret = streamTaskBuildCheckpointSourceRsp(&req, &pMsg->info, &rsp, TSDB_CODE_SUCCESS); - if (ret) { // suppress the error in build checkpointsource rsp - tqError("s-task:0x%x failed to build checkpoint-source rsp, code:%s", req.taskId, tstrerror(code)); - } - - tmsgSendRsp(&rsp); // error occurs - return TSDB_CODE_SUCCESS; // always return success to mnode, , todo: handle failure of build and send msg to mnode + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); + return TSDB_CODE_SUCCESS; // always return success to mnode } - SStreamTask* pTask = NULL; code = streamMetaAcquireTask(pMeta, req.streamId, req.taskId, &pTask); if (pTask == NULL || code != 0) { tqError("vgId:%d failed to find s-task:0x%x, ignore checkpoint msg. checkpointId:%" PRId64 " transId:%d it may have been destroyed", vgId, req.taskId, req.checkpointId, req.transId); - SRpcMsg rsp = {0}; - int32_t ret = streamTaskBuildCheckpointSourceRsp(&req, &pMsg->info, &rsp, TSDB_CODE_SUCCESS); - if (ret) { // suppress the error in build checkpointsource rsp - tqError("s-task:%s failed to build checkpoint-source rsp, code:%s", pTask->id.idStr, tstrerror(code)); - } - tmsgSendRsp(&rsp); // error occurs + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); return TSDB_CODE_SUCCESS; } if (pTask->status.downstreamReady != 1) { - streamTaskSetFailedChkptInfo(pTask, req.transId, req.checkpointId); // record the latest failed checkpoint id + // record the latest failed checkpoint id + streamTaskSetFailedChkptInfo(pTask, req.transId, req.checkpointId); tqError("s-task:%s not ready for checkpoint, since downstream not ready, ignore this checkpointId:%" PRId64 ", transId:%d set it failed", pTask->id.idStr, req.checkpointId, req.transId); + streamMetaReleaseTask(pMeta, pTask); - - SRpcMsg rsp = {0}; - int32_t ret = streamTaskBuildCheckpointSourceRsp(&req, &pMsg->info, &rsp, TSDB_CODE_SUCCESS); - if (ret) { // suppress the error in build checkpointsource rsp - tqError("s-task:%s failed to build checkpoint-source rsp, code:%s", pTask->id.idStr, tstrerror(code)); - } - - tmsgSendRsp(&rsp); // error occurs + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); return TSDB_CODE_SUCCESS; // todo retry handle error } @@ -1207,14 +1192,7 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) streamMutexUnlock(&pTask->lock); streamMetaReleaseTask(pMeta, pTask); - - SRpcMsg rsp = {0}; - int32_t ret = streamTaskBuildCheckpointSourceRsp(&req, &pMsg->info, &rsp, TSDB_CODE_SUCCESS); - if (ret) { // suppress the error in build checkpointsource rsp - tqError("s-task:%s failed to build checkpoint-source rsp, code:%s", pTask->id.idStr, tstrerror(code)); - } - - tmsgSendRsp(&rsp); // error occurs + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); return TSDB_CODE_SUCCESS; } } else { @@ -1226,7 +1204,6 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) // check if the checkpoint msg already sent or not. if (status == TASK_STATUS__CK) { - int64_t checkpointId = 0; streamTaskGetActiveCheckpointInfo(pTask, NULL, &checkpointId); tqWarn("s-task:%s repeatly recv checkpoint-source msg checkpointId:%" PRId64 @@ -1235,7 +1212,7 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) streamMutexUnlock(&pTask->lock); streamMetaReleaseTask(pMeta, pTask); - + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SYN_PROPOSE_NOT_READY, req.taskId); return TSDB_CODE_SUCCESS; } else { // checkpoint already finished, and not in checkpoint status if (req.checkpointId <= pTask->chkInfo.checkpointId) { @@ -1245,15 +1222,7 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) streamMutexUnlock(&pTask->lock); streamMetaReleaseTask(pMeta, pTask); - - SRpcMsg rsp = {0}; - int32_t ret = streamTaskBuildCheckpointSourceRsp(&req, &pMsg->info, &rsp, TSDB_CODE_SUCCESS); - if (ret) { // suppress the error in build checkpointsource rsp - tqError("s-task:%s failed to build checkpoint-source rsp, code:%s", pTask->id.idStr, tstrerror(code)); - } - - tmsgSendRsp(&rsp); // error occurs - + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); return TSDB_CODE_SUCCESS; } } @@ -1264,7 +1233,9 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) if (code) { qError("s-task:%s (vgId:%d) failed to process checkpoint-source req, code:%s", pTask->id.idStr, vgId, tstrerror(code)); - return code; + streamMetaReleaseTask(pMeta, pTask); + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); + return TSDB_CODE_SUCCESS; } if (req.mndTrigger) { @@ -1279,13 +1250,8 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) code = streamAddCheckpointSourceRspMsg(&req, &pMsg->info, pTask); if (code != TSDB_CODE_SUCCESS) { - SRpcMsg rsp = {0}; - int32_t ret = streamTaskBuildCheckpointSourceRsp(&req, &pMsg->info, &rsp, TSDB_CODE_SUCCESS); - if (ret) { // suppress the error in build checkpointsource rsp - tqError("s-task:%s failed to build checkpoint-source rsp, code:%s", pTask->id.idStr, tstrerror(code)); - } - tmsgSendRsp(&rsp); // error occurs - return TSDB_CODE_SUCCESS; + streamTaskSetCheckpointFailed(pTask); // set the checkpoint failed + doSendChkptSourceRsp(&req, &pMsg->info, TSDB_CODE_SUCCESS, req.taskId); } streamMetaReleaseTask(pMeta, pTask);