diff --git a/include/dnode/vnode/tqCommon.h b/include/dnode/vnode/tqCommon.h index 39e0c07406..50458d684f 100644 --- a/include/dnode/vnode/tqCommon.h +++ b/include/dnode/vnode/tqCommon.h @@ -32,8 +32,9 @@ int32_t tqStreamTaskProcessDeployReq(SStreamMeta* pMeta, SMsgCb* cb, int64_t sve bool isLeader, bool restored); int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen); int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLeader); -int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta, int32_t* numOfTasks); +int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta); int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta); +int32_t tqStreamTasksGetTotalNum(SStreamMeta* pMeta); int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, SRpcMsg* pMsg); int32_t tqStreamTaskProcessTaskPauseReq(SStreamMeta* pMeta, char* pMsg); int32_t tqStreamTaskProcessTaskResumeReq(void* handle, int64_t sversion, char* pMsg, bool fromVnode); diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index a0cff35623..f1b5ec19b0 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -874,6 +874,8 @@ void streamMetaStartHb(SStreamMeta* pMeta); bool streamMetaTaskInTimer(SStreamMeta* pMeta); int32_t streamMetaAddTaskLaunchResult(SStreamMeta* pMeta, int64_t streamId, int32_t taskId, int64_t startTs, int64_t endTs, bool ready); +int32_t streamMetaResetTaskStatus(SStreamMeta* pMeta); + void streamMetaRLock(SStreamMeta* pMeta); void streamMetaRUnLock(SStreamMeta* pMeta); void streamMetaWLock(SStreamMeta* pMeta); diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index 4815dfe65c..4284b0838e 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -145,8 +145,7 @@ FAIL: } int32_t sndInit(SSnode *pSnode) { - int32_t numOfTasks = 0; - tqStreamTaskResetStatus(pSnode->pMeta, &numOfTasks); + streamMetaResetTaskStatus(pSnode->pMeta); streamMetaStartAllTasks(pSnode->pMeta); return 0; } diff --git a/source/dnode/vnode/src/tq/tqStreamTask.c b/source/dnode/vnode/src/tq/tqStreamTask.c index 2a600d08ef..74aea0cdfb 100644 --- a/source/dnode/vnode/src/tq/tqStreamTask.c +++ b/source/dnode/vnode/src/tq/tqStreamTask.c @@ -175,11 +175,11 @@ int32_t tqStopStreamTasksAsync(STQ* pTq) { SStreamTaskRunReq* pRunReq = rpcMallocCont(sizeof(SStreamTaskRunReq)); if (pRunReq == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - tqError("vgId:%d failed to create msg to stop tasks, code:%s", vgId, terrstr()); + tqError("vgId:%d failed to create msg to stop tasks async, code:%s", vgId, terrstr()); return -1; } - tqDebug("vgId:%d create msg to stop tasks", vgId); + tqDebug("vgId:%d create msg to stop all tasks async", vgId); pRunReq->head.vgId = vgId; pRunReq->streamId = 0; diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index e2dc9fa679..497faa8e1a 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -103,8 +103,7 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM STaskId id = {.streamId = req.streamId, .taskId = req.taskId}; SStreamTask** ppTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (ppTask == NULL || *ppTask == NULL) { - tqError("vgId:%d failed to acquire task:0x%x when handling update, it may have been dropped already", vgId, - req.taskId); + tqError("vgId:%d failed to acquire task:0x%x when handling update, it may have been dropped", vgId, req.taskId); rsp.code = TSDB_CODE_SUCCESS; streamMetaWUnLock(pMeta); @@ -124,6 +123,7 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM tqDebug("s-task:%s recv trans to update nodeEp from mnode, transId:%d", idstr, req.transId); } + // duplicate update epset msg received, discard this redundant message STaskUpdateEntry entry = {.streamId = req.streamId, .taskId = req.taskId, .transId = req.transId}; void* pReqTask = taosHashGet(pMeta->updateInfo.pTasks, &entry, sizeof(STaskUpdateEntry)); @@ -135,7 +135,6 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM return rsp.code; } - // the following two functions should not be executed within the scope of meta lock to avoid deadlock streamTaskUpdateEpsetInfo(pTask, req.pNodeList); streamTaskResetStatus(pTask); @@ -159,11 +158,6 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM streamMetaSaveTask(pMeta, *ppHTask); } -#if 0 - if (streamMetaCommit(pMeta) < 0) { - // persist to disk - } -#endif } else { tqDebug("s-task:%s vgId:%d not save since restore not finish", idstr, vgId); } @@ -171,7 +165,7 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM tqDebug("s-task:%s vgId:%d start to stop task after save task", idstr, vgId); streamTaskStop(pTask); - // keep the already handled info + // keep the already updated info taosHashPut(pMeta->updateInfo.pTasks, &entry, sizeof(entry), NULL, 0); if (ppHTask != NULL) { @@ -200,6 +194,10 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM updateTasks, (numOfTasks - updateTasks)); streamMetaWUnLock(pMeta); } else { + if (streamMetaCommit(pMeta) < 0) { + // persist to disk + } + if (!restored) { tqDebug("vgId:%d vnode restore not completed, not start the tasks, clear the start after nodeUpdate flag", vgId); pMeta->startInfo.tasksWillRestart = 0; @@ -686,16 +684,16 @@ int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen return 0; } -int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta, int32_t* numOfTasks) { +int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta) { int32_t vgId = pMeta->vgId; - *numOfTasks = taosArrayGetSize(pMeta->pTaskList); + int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); - tqDebug("vgId:%d reset all %d stream task(s) status to be uninit", vgId, *numOfTasks); - if (*numOfTasks == 0) { + tqDebug("vgId:%d reset all %d stream task(s) status to be uninit", vgId, numOfTasks); + if (numOfTasks == 0) { return TSDB_CODE_SUCCESS; } - for (int32_t i = 0; i < (*numOfTasks); ++i) { + for (int32_t i = 0; i < numOfTasks; ++i) { SStreamTaskId* pTaskId = taosArrayGet(pMeta->pTaskList, i); STaskId id = {.streamId = pTaskId->streamId, .taskId = pTaskId->taskId}; @@ -754,8 +752,7 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { } if (isLeader && !tsDisableStream) { - int32_t numOfTasks = 0; - tqStreamTaskResetStatus(pMeta, &numOfTasks); + streamMetaResetTaskStatus(pMeta); streamMetaWUnLock(pMeta); streamMetaStartAllTasks(pMeta); @@ -965,8 +962,7 @@ static int32_t tqProcessTaskResumeImpl(void* handle, SStreamTask* pTask, int64_t } else if (status == TASK_STATUS__UNINIT) { // todo: fill-history task init ? if (pTask->info.fillHistory == 0) { - EStreamTaskEvent event = /*HAS_RELATED_FILLHISTORY_TASK(pTask) ? TASK_EVENT_INIT_STREAM_SCANHIST : */TASK_EVENT_INIT; - streamTaskHandleEvent(pTask->status.pSM, event); + streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_INIT); } } @@ -990,4 +986,11 @@ int32_t tqStreamTaskProcessTaskResumeReq(void* handle, int64_t sversion, char* m } return code; +} + +int32_t tqStreamTasksGetTotalNum(SStreamMeta* pMeta) { + streamMetaRLock(pMeta); + int32_t num = taosArrayGetSize(pMeta->pTaskList); + streamMetaRUnLock(pMeta); + return num; } \ No newline at end of file diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index 42d57ca391..1d094539a0 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -570,9 +570,7 @@ static void vnodeRestoreFinish(const SSyncFSM *pFsm, const SyncIndex commitIdx) vInfo("vgId:%d, sync restore finished, not launch stream tasks, since stream tasks are disabled", vgId); } else { vInfo("vgId:%d sync restore finished, start to launch stream task(s)", pVnode->config.vgId); - int32_t numOfTasks = 0; - tqStreamTaskResetStatus(pMeta, &numOfTasks); - + int32_t numOfTasks = tqStreamTasksGetTotalNum(pMeta); if (numOfTasks > 0) { if (pMeta->startInfo.taskStarting == 1) { pMeta->startInfo.restartCount += 1; @@ -580,6 +578,7 @@ static void vnodeRestoreFinish(const SSyncFSM *pFsm, const SyncIndex commitIdx) pMeta->startInfo.restartCount); } else { pMeta->startInfo.taskStarting = 1; + streamMetaWUnLock(pMeta); tqStreamTaskStartAsync(pMeta, &pVnode->msgCb, false); return; diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 1f45e9b94c..9edbe22168 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1413,31 +1413,43 @@ void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader) } } +static SArray* prepareBeforeStartTasks(SStreamMeta* pMeta) { + streamMetaWLock(pMeta); + + SArray* pTaskList = taosArrayDup(pMeta->pTaskList, NULL); + taosHashClear(pMeta->startInfo.pReadyTaskSet); + taosHashClear(pMeta->startInfo.pFailedTaskSet); + pMeta->startInfo.startTs = taosGetTimestampMs(); + + streamMetaResetTaskStatus(pMeta); + streamMetaWUnLock(pMeta); + + return pTaskList; +} + int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { int32_t code = TSDB_CODE_SUCCESS; int32_t vgId = pMeta->vgId; int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); stInfo("vgId:%d start to check all %d stream task(s) downstream status", vgId, numOfTasks); + if (numOfTasks == 0) { stInfo("vgId:%d start tasks completed", pMeta->vgId); return TSDB_CODE_SUCCESS; } - SArray* pTaskList = NULL; - streamMetaWLock(pMeta); - pTaskList = taosArrayDup(pMeta->pTaskList, NULL); - taosHashClear(pMeta->startInfo.pReadyTaskSet); - taosHashClear(pMeta->startInfo.pFailedTaskSet); - pMeta->startInfo.startTs = taosGetTimestampMs(); - streamMetaWUnLock(pMeta); - - // broadcast the check downstream tasks msg int64_t now = taosGetTimestampMs(); + SArray* pTaskList = prepareBeforeStartTasks(pMeta); + numOfTasks = taosArrayGetSize(pTaskList); + + // broadcast the check downstream tasks msg for (int32_t i = 0; i < numOfTasks; ++i) { SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); + // todo: may be we should find the related fill-history task and set it failed. + // todo: use hashTable instead SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); if (pTask == NULL) { stError("vgId:%d failed to acquire task:0x%x during start tasks", pMeta->vgId, pTaskId->taskId); @@ -1445,8 +1457,6 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { continue; } - // todo: may be we should find the related fill-history task and set it failed. - // fill-history task can only be launched by related stream tasks. STaskExecStatisInfo* pInfo = &pTask->execInfo; if (pTask->info.fillHistory == 1) { @@ -1493,10 +1503,13 @@ int32_t streamMetaStopAllTasks(SStreamMeta* pMeta) { int32_t num = taosArrayGetSize(pMeta->pTaskList); stDebug("vgId:%d stop all %d stream task(s)", pMeta->vgId, num); if (num == 0) { + stDebug("vgId:%d stop all %d task(s) completed, elapsed time:0 Sec.", pMeta->vgId, num); streamMetaRUnLock(pMeta); return TSDB_CODE_SUCCESS; } + int64_t st = taosGetTimestampMs(); + // send hb msg to mnode before closing all tasks. SArray* pTaskList = streamMetaSendMsgBeforeCloseTasks(pMeta); int32_t numOfTasks = taosArrayGetSize(pTaskList); @@ -1514,6 +1527,9 @@ int32_t streamMetaStopAllTasks(SStreamMeta* pMeta) { taosArrayDestroy(pTaskList); + double el = (taosGetTimestampMs() - st) / 1000.0; + stDebug("vgId:%d stop all %d task(s) completed, elapsed time:%.2f Sec.", pMeta->vgId, num, el); + streamMetaRUnLock(pMeta); return 0; } @@ -1640,4 +1656,23 @@ int32_t streamMetaAddTaskLaunchResult(SStreamMeta* pMeta, int64_t streamId, int3 } return TSDB_CODE_SUCCESS; +} + +int32_t streamMetaResetTaskStatus(SStreamMeta* pMeta) { + int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); + + stDebug("vgId:%d reset all %d stream task(s) status to be uninit", pMeta->vgId, numOfTasks); + if (numOfTasks == 0) { + return TSDB_CODE_SUCCESS; + } + + for (int32_t i = 0; i < numOfTasks; ++i) { + SStreamTaskId* pTaskId = taosArrayGet(pMeta->pTaskList, i); + + STaskId id = {.streamId = pTaskId->streamId, .taskId = pTaskId->taskId}; + SStreamTask** pTask = taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); + streamTaskResetStatus(*pTask); + } + + return 0; } \ No newline at end of file