fix(stream): refactor the tasks restart mechanism.

This commit is contained in:
Haojun Liao 2023-08-18 22:36:38 +08:00
parent 41fe441dad
commit b8101afbe2
2 changed files with 42 additions and 24 deletions

View File

@ -390,6 +390,7 @@ typedef struct SStreamMeta {
tmr_h hbTmr; tmr_h hbTmr;
SMgmtInfo mgmtInfo; SMgmtInfo mgmtInfo;
int32_t closedTask;
int32_t chkptNotReadyTasks; int32_t chkptNotReadyTasks;
int64_t chkpId; int64_t chkpId;

View File

@ -1855,12 +1855,11 @@ int32_t tqProcessStreamTaskCheckpointReadyMsg(STQ* pTq, SRpcMsg* pMsg) {
} }
int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) { int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) {
bool restartTasks = false;
SStreamMeta* pMeta = pTq->pStreamMeta; SStreamMeta* pMeta = pTq->pStreamMeta;
int32_t vgId = TD_VID(pTq->pVnode); int32_t vgId = TD_VID(pTq->pVnode);
char* msg = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead)); char* msg = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead));
int32_t len = pMsg->contLen - sizeof(SMsgHead); int32_t len = pMsg->contLen - sizeof(SMsgHead);
bool startTask = vnodeIsRoleLeader(pTq->pVnode); // in case of follower, do not launch task
SRpcMsg rsp = {.info = pMsg->info, .code = TSDB_CODE_SUCCESS}; SRpcMsg rsp = {.info = pMsg->info, .code = TSDB_CODE_SUCCESS};
SStreamTaskNodeUpdateMsg req = {0}; SStreamTaskNodeUpdateMsg req = {0};
@ -1884,34 +1883,52 @@ int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) {
tqDebug("s-task:%s receive task nodeEp update msg from mnode", pTask->id.idStr); tqDebug("s-task:%s receive task nodeEp update msg from mnode", pTask->id.idStr);
streamTaskUpdateEpsetInfo(pTask, req.pNodeList); streamTaskUpdateEpsetInfo(pTask, req.pNodeList);
streamTaskStop(pTask);
SStreamTask* pHistoryTask = NULL; taosWLockLatch(&pMeta->lock);
if (pTask->historyTaskId.taskId != 0) {
pHistoryTask = streamMetaAcquireTask(pMeta, pTask->historyTaskId.streamId, pTask->historyTaskId.taskId);
if (pHistoryTask != NULL) {
tqDebug("s-task:%s fill-history task handle task update along with related stream task", pHistoryTask->id.idStr);
streamTaskUpdateEpsetInfo(pHistoryTask, req.pNodeList);
streamTaskRestart(pHistoryTask, NULL, startTask); int32_t numOfCount = streamMetaGetNumOfTasks(pMeta);
streamMetaReleaseTask(pMeta, pHistoryTask); pMeta->closedTask += 1;
} else {
tqError("vgId:%d failed to get fill-history task:0x%x when handling task update, it may have been dropped",
pMeta->vgId, pTask->historyTaskId.taskId);
streamMetaReleaseTask(pMeta, pTask);
}
}
streamTaskRestart(pTask, NULL, startTask); tqDebug("s-task:%s task nodeEp update completed", pTask->id.idStr);
streamMetaReleaseTask(pMeta, pTask); streamMetaReleaseTask(pMeta, pTask);
int32_t code = rsp.code; // all tasks are closed, now let's restart the stream meta
if (pMeta->closedTask == numOfCount) {
if (streamMetaCommit(pMeta) < 0) {
// persist to disk
}
restartTasks = true;
pMeta->closedTask = 0; // reset value
}
taosWUnLockLatch(&pMeta->lock);
_end: _end:
tDecoderClear(&decoder); tDecoderClear(&decoder);
tmsgSendRsp(&rsp); tmsgSendRsp(&rsp);
tqDebug("s-task:%s task nodeEp update completed", pTask->id.idStr); if (restartTasks) {
return code; tqDebug("vgId:%d all tasks are stopped, restart them", vgId);
streamMetaClose(pTask->pMeta);
pTq->pStreamMeta = streamMetaOpen(pTq->path, pTq, (FTaskExpand*)tqExpandTask, pTq->pVnode->config.vgId, -1);
if (pTq->pStreamMeta == NULL) {
return -1;
}
if (streamLoadTasks(pTq->pStreamMeta) < 0) {
return -1;
}
if (vnodeIsRoleLeader(pTq->pVnode) && !tsDisableStream) {
vInfo("vgId:%d, restart to all stream tasks", vgId);
tqCheckStreamStatus(pTq);
}
}
return rsp.code;
} }
int32_t tqProcessTaskStopReq(STQ* pTq, SRpcMsg* pMsg) { int32_t tqProcessTaskStopReq(STQ* pTq, SRpcMsg* pMsg) {