diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 5e145d8fbb..2e4204ab34 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -762,7 +762,6 @@ int32_t streamQueueGetNumOfItems(const SStreamQueue* pQueue); int32_t streamRestoreParam(SStreamTask* pTask); void streamTaskPause(SStreamTask* pTask, SStreamMeta* pMeta); void streamTaskResume(SStreamTask* pTask); -void streamTaskDisablePause(SStreamTask* pTask); void streamTaskEnablePause(SStreamTask* pTask); int32_t streamTaskSetUpstreamInfo(SStreamTask* pTask, const SStreamTask* pUpstreamTask); void streamTaskUpdateUpstreamInfo(SStreamTask* pTask, int32_t nodeId, const SEpSet* pEpSet); @@ -807,6 +806,10 @@ void streamMetaStartHb(SStreamMeta* pMeta); void streamMetaInitForSnode(SStreamMeta* pMeta); bool streamMetaTaskInTimer(SStreamMeta* pMeta); int32_t streamMetaUpdateTaskReadyInfo(SStreamTask* pTask); +void streamMetaRLock(SStreamMeta* pMeta); +void streamMetaRUnLock(SStreamMeta* pMeta); +void streamMetaWLock(SStreamMeta* pMeta); +void streamMetaWUnLock(SStreamMeta* pMeta); // checkpoint int32_t streamProcessCheckpointSourceReq(SStreamTask* pTask, SStreamCheckpointSourceReq* pReq); diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 6094dbf635..db809f1af2 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -819,6 +819,8 @@ int32_t* taosGetErrno(); // stream #define TSDB_CODE_STREAM_TASK_NOT_EXIST TAOS_DEF_ERROR_CODE(0, 0x4100) #define TSDB_CODE_STREAM_EXEC_CANCELLED TAOS_DEF_ERROR_CODE(0, 0x4102) +#define TSDB_CODE_STREAM_INVALID_STATETRANS TAOS_DEF_ERROR_CODE(0, 0x4103) +#define TSDB_CODE_STREAM_TASK_IVLD_STATUS TAOS_DEF_ERROR_CODE(0, 0x4104) // TDLite #define TSDB_CODE_TDLITE_IVLD_OPEN_FLAGS TAOS_DEF_ERROR_CODE(0, 0x5100) diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index 4e84b4cd26..f2ef00c534 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -165,17 +165,17 @@ int32_t sndProcessTaskDeployReq(SSnode *pSnode, char *msg, int32_t msgLen) { ASSERT(pTask->info.taskLevel == TASK_LEVEL__AGG); // 2.save task - taosWLockLatch(&pSnode->pMeta->lock); + streamMetaWLock(pSnode->pMeta); bool added = false; code = streamMetaRegisterTask(pSnode->pMeta, -1, pTask, &added); if (code < 0) { - taosWUnLockLatch(&pSnode->pMeta->lock); + streamMetaWUnLock(pSnode->pMeta); return -1; } int32_t numOfTasks = streamMetaGetNumOfTasks(pSnode->pMeta); - taosWUnLockLatch(&pSnode->pMeta->lock); + streamMetaWUnLock(pSnode->pMeta); char* p = NULL; streamTaskGetStatus(pTask, &p); @@ -195,14 +195,14 @@ int32_t sndProcessTaskDropReq(SSnode *pSnode, char *msg, int32_t msgLen) { streamMetaUnregisterTask(pSnode->pMeta, pReq->streamId, pReq->taskId); // commit the update - taosWLockLatch(&pSnode->pMeta->lock); + streamMetaWLock(pSnode->pMeta); int32_t numOfTasks = streamMetaGetNumOfTasks(pSnode->pMeta); qDebug("vgId:%d task:0x%x dropped, remain tasks:%d", pSnode->pMeta->vgId, pReq->taskId, numOfTasks); if (streamMetaCommit(pSnode->pMeta) < 0) { // persist to disk } - taosWUnLockLatch(&pSnode->pMeta->lock); + streamMetaWUnLock(pSnode->pMeta); return 0; } diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index e1b27049af..fff58feeb1 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1023,10 +1023,10 @@ int32_t tqProcessTaskDeployReq(STQ* pTq, int64_t sversion, char* msg, int32_t ms int64_t streamId = pTask->id.streamId; bool added = false; - taosWLockLatch(&pStreamMeta->lock); + streamMetaWLock(pStreamMeta); code = streamMetaRegisterTask(pStreamMeta, sversion, pTask, &added); int32_t numOfTasks = streamMetaGetNumOfTasks(pStreamMeta); - taosWUnLockLatch(&pStreamMeta->lock); + streamMetaWUnLock(pStreamMeta); if (code < 0) { tqError("failed to add s-task:0x%x into vgId:%d meta, total:%d, code:%s", vgId, taskId, numOfTasks, tstrerror(code)); @@ -1064,6 +1064,47 @@ int32_t tqProcessTaskDeployReq(STQ* pTq, int64_t sversion, char* msg, int32_t ms return code; } +static void doStartStep2(SStreamTask* pTask, SStreamTask* pStreamTask, STQ* pTq) { + const char* id = pTask->id.idStr; + int64_t nextProcessedVer = pStreamTask->hTaskInfo.haltVer; + + // if it's an source task, extract the last version in wal. + SVersionRange *pRange = &pTask->dataRange.range; + + bool done = streamHistoryTaskSetVerRangeStep2(pTask, nextProcessedVer); + pTask->execInfo.step2Start = taosGetTimestampMs(); + + if (done) { + qDebug("s-task:%s scan-history from WAL stage(step 2) ended, elapsed time:%.2fs", id, 0.0); + streamTaskPutTranstateIntoInputQ(pTask); + streamExecTask(pTask); // exec directly + } else { + STimeWindow* pWindow = &pTask->dataRange.window; + tqDebug("s-task:%s level:%d verRange:%" PRId64 " - %" PRId64 " window:%" PRId64 "-%" PRId64 + ", do secondary scan-history from WAL after halt the related stream task:%s", + id, pTask->info.taskLevel, pRange->minVer, pRange->maxVer, pWindow->skey, pWindow->ekey, + pStreamTask->id.idStr); + ASSERT(pTask->status.schedStatus == TASK_SCHED_STATUS__WAITING); + + streamSetParamForStreamScannerStep2(pTask, pRange, pWindow); + + int64_t dstVer = pTask->dataRange.range.minVer; + pTask->chkInfo.nextProcessVer = dstVer; + + walReaderSetSkipToVersion(pTask->exec.pWalReader, dstVer); + tqDebug("s-task:%s wal reader start scan WAL verRange:%" PRId64 "-%" PRId64 ", set sched-status:%d", id, dstVer, + pTask->dataRange.range.maxVer, TASK_SCHED_STATUS__INACTIVE); + + /*int8_t status = */streamTaskSetSchedStatusInactive(pTask); + + // now the fill-history task starts to scan data from wal files. + int32_t code = streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_SCANHIST_DONE); + if (code == TSDB_CODE_SUCCESS) { + tqScanWalAsync(pTq, false); + } + } +} + // this function should be executed by only one thread int32_t tqProcessTaskScanHistory(STQ* pTq, SRpcMsg* pMsg) { SStreamScanHistoryReq* pReq = (SStreamScanHistoryReq*)pMsg->pCont; @@ -1126,10 +1167,6 @@ int32_t tqProcessTaskScanHistory(STQ* pTq, SRpcMsg* pMsg) { return 0; } - if (pTask->info.fillHistory == 1) { - ASSERT(pTask->status.pauseAllowed == true); - } - streamScanHistoryData(pTask); double el = (taosGetTimestampMs() - pTask->execInfo.step1Start) / 1000.0; @@ -1146,18 +1183,15 @@ int32_t tqProcessTaskScanHistory(STQ* pTq, SRpcMsg* pMsg) { tqDebug("s-task:%s scan-history(step 1) ended, elapsed time:%.2fs", id, el); if (pTask->info.fillHistory) { - SVersionRange* pRange = NULL; SStreamTask* pStreamTask = NULL; // 1. get the related stream task pStreamTask = streamMetaAcquireTask(pMeta, pTask->streamTaskId.streamId, pTask->streamTaskId.taskId); if (pStreamTask == NULL) { - // todo delete this task, if the related stream task is dropped - qError("failed to find s-task:0x%"PRIx64", it may have been destroyed, drop fill-history task:%s", + tqError("failed to find s-task:0x%"PRIx64", it may have been destroyed, drop related fill-history task:%s", pTask->streamTaskId.taskId, pTask->id.idStr); tqDebug("s-task:%s fill-history task set status to be dropping", id); - streamBuildAndSendDropTaskMsg(pTask->pMsgCb, pMeta->vgId, &pTask->id); atomic_store_32(&pTask->status.inScanHistorySentinel, 0); @@ -1166,112 +1200,14 @@ int32_t tqProcessTaskScanHistory(STQ* pTq, SRpcMsg* pMsg) { } ASSERT(pStreamTask->info.taskLevel == TASK_LEVEL__SOURCE); -#if 0 - // 2. it cannot be paused, when the stream task in TASK_STATUS__SCAN_HISTORY status. Let's wait for the - // stream task get ready for scan history data - while (streamTaskGetStatus(pStreamTask, NULL) == TASK_STATUS__SCAN_HISTORY) { - tqDebug( - "s-task:%s level:%d related stream task:%s(status:%s) not ready for halt, wait for it and recheck in 100ms", - id, pTask->info.taskLevel, pStreamTask->id.idStr, streamGetTaskStatusStr(pStreamTask->status.taskStatus)); - taosMsleep(100); - } - // now we can stop the stream task execution - int64_t nextProcessedVer = 0; - - while (1) { - taosThreadMutexLock(&pStreamTask->lock); - int8_t status = pStreamTask->status.taskStatus; - if (status == TASK_STATUS__DROPPING || status == TASK_STATUS__STOP) { - // return; do nothing - } - - if (status == TASK_STATUS__HALT) { -// tqDebug("s-task:%s level:%d sched-status:%d is halt by fill-history task:%s", pStreamTask->id.idStr, -// pStreamTask->info.taskLevel, pStreamTask->status.schedStatus, id); -// latestVer = walReaderGetCurrentVer(pStreamTask->exec.pWalReader); -// -// taosThreadMutexUnlock(&pStreamTask->lock); -// break; - } - - if (pStreamTask->status.taskStatus == TASK_STATUS__CK) { - qDebug("s-task:%s status:%s during generating checkpoint, wait for 1sec and retry set status:halt", - pStreamTask->id.idStr, streamGetTaskStatusStr(TASK_STATUS__CK)); - taosThreadMutexUnlock(&pStreamTask->lock); - taosMsleep(1000); - continue; - } - - // upgrade to halt status - if (status == TASK_STATUS__PAUSE) { - qDebug("s-task:%s upgrade status to %s from %s", pStreamTask->id.idStr, streamGetTaskStatusStr(TASK_STATUS__HALT), - streamGetTaskStatusStr(TASK_STATUS__PAUSE)); - } else { - qDebug("s-task:%s halt task, prev status:%s", pStreamTask->id.idStr, streamGetTaskStatusStr(status)); - } - - pStreamTask->status.keepTaskStatus = status; - pStreamTask->status.taskStatus = TASK_STATUS__HALT; - - // wal scan not start yet, reset it to be the start position - nextProcessedVer = walReaderGetCurrentVer(pStreamTask->exec.pWalReader); - if (nextProcessedVer == -1) { - nextProcessedVer = pStreamTask->dataRange.range.maxVer + 1; - } - - tqDebug("s-task:%s level:%d nextProcessedVer:%" PRId64 ", sched-status:%d is halt by fill-history task:%s", - pStreamTask->id.idStr, pStreamTask->info.taskLevel, nextProcessedVer, pStreamTask->status.schedStatus, - id); - - taosThreadMutexUnlock(&pStreamTask->lock); - break; - } -#endif - - streamTaskHandleEvent(pStreamTask->status.pSM, TASK_EVENT_HALT); - int64_t nextProcessedVer = pStreamTask->hTaskInfo.haltVer; - - // if it's an source task, extract the last version in wal. - pRange = &pTask->dataRange.range; - bool done = streamHistoryTaskSetVerRangeStep2(pTask, nextProcessedVer); - pTask->execInfo.step2Start = taosGetTimestampMs(); - - if (done) { - qDebug("s-task:%s scan-history from WAL stage(step 2) ended, elapsed time:%.2fs", id, 0.0); - streamTaskPutTranstateIntoInputQ(pTask); -// streamTaskRestoreStatus(pTask); - -// if (pTask->status.taskStatus == TASK_STATUS__PAUSE) { -// pTask->status.keepTaskStatus = TASK_STATUS__READY; -// qDebug("s-task:%s prev status is %s, update the kept status to be:%s when after step 2", id, -// streamGetTaskStatusStr(TASK_STATUS__PAUSE), streamGetTaskStatusStr(pTask->status.keepTaskStatus)); -// } - - streamExecTask(pTask); // exec directly + code = streamTaskHandleEvent(pStreamTask->status.pSM, TASK_EVENT_HALT); + if (code == TSDB_CODE_SUCCESS) { + doStartStep2(pTask, pStreamTask, pTq); } else { - STimeWindow* pWindow = &pTask->dataRange.window; - tqDebug("s-task:%s level:%d verRange:%" PRId64 " - %" PRId64 " window:%" PRId64 "-%" PRId64 - ", do secondary scan-history from WAL after halt the related stream task:%s", - id, pTask->info.taskLevel, pRange->minVer, pRange->maxVer, pWindow->skey, pWindow->ekey, - pStreamTask->id.idStr); - ASSERT(pTask->status.schedStatus == TASK_SCHED_STATUS__WAITING); - - streamSetParamForStreamScannerStep2(pTask, pRange, pWindow); - - int64_t dstVer = pTask->dataRange.range.minVer; - pTask->chkInfo.nextProcessVer = dstVer; - - walReaderSetSkipToVersion(pTask->exec.pWalReader, dstVer); - tqDebug("s-task:%s wal reader start scan WAL verRange:%" PRId64 "-%" PRId64 ", set sched-status:%d", id, dstVer, - pTask->dataRange.range.maxVer, TASK_SCHED_STATUS__INACTIVE); - - /*int8_t status = */streamTaskSetSchedStatusInactive(pTask); - - // now the fill-history task starts to scan data from wal files. - streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_SCANHIST_DONE); - tqScanWalAsync(pTq, false); + tqError("s-task:%s failed to halt s-task:%s, not launch step2", id, pStreamTask->id.idStr); } + streamMetaReleaseTask(pMeta, pStreamTask); } else { @@ -1467,14 +1403,14 @@ int32_t tqProcessTaskDropReq(STQ* pTq, char* msg, int32_t msgLen) { streamMetaUnregisterTask(pMeta, pReq->streamId, pReq->taskId); // commit the update - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); int32_t numOfTasks = streamMetaGetNumOfTasks(pMeta); tqDebug("vgId:%d task:0x%x dropped, remain tasks:%d", vgId, pReq->taskId, numOfTasks); if (streamMetaCommit(pMeta) < 0) { // persist to disk } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return 0; } @@ -1785,7 +1721,7 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) taosThreadMutexUnlock(&pTask->lock); int32_t total = 0; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); // set the initial value for generating check point // set the mgmt epset info according to the checkout source msg from mnode, todo update mgmt epset if needed @@ -1794,7 +1730,7 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) } total = pMeta->numOfStreamTasks; - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); qInfo("s-task:%s (vgId:%d) level:%d receive checkpoint-source msg chkpt:%" PRId64 ", total checkpoint reqs:%d", pTask->id.idStr, vgId, pTask->info.taskLevel, req.checkpointId, total); @@ -1865,7 +1801,7 @@ int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) { tDecoderClear(&decoder); // update the nodeEpset when it exists - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); // the task epset may be updated again and again, when replaying the WAL, the task may be in stop status. STaskId id = {.streamId = req.streamId, .taskId = req.taskId}; @@ -1874,7 +1810,8 @@ int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) { tqError("vgId:%d failed to acquire task:0x%x when handling update, it may have been dropped already", pMeta->vgId, req.taskId); rsp.code = TSDB_CODE_SUCCESS; - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); + taosArrayDestroy(req.pNodeList); return rsp.code; } @@ -1883,7 +1820,7 @@ int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) { if (pMeta->updateInfo.transId != req.transId) { pMeta->updateInfo.transId = req.transId; - tqDebug("s-task:%s receive new trans to update nodeEp msg from mnode, transId:%d", pTask->id.idStr, req.transId); + tqInfo("s-task:%s receive new trans to update nodeEp msg from mnode, transId:%d", pTask->id.idStr, req.transId); // info needs to be kept till the new trans to update the nodeEp arrived. taosHashClear(pMeta->updateInfo.pTasks); } else { @@ -1896,19 +1833,19 @@ int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) { tqDebug("s-task:%s (vgId:%d) already update in trans:%d, discard the nodeEp update msg", pTask->id.idStr, vgId, req.transId); rsp.code = TSDB_CODE_SUCCESS; - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); taosArrayDestroy(req.pNodeList); return rsp.code; } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); // the following two functions should not be executed within the scope of meta lock to avoid deadlock streamTaskUpdateEpsetInfo(pTask, req.pNodeList); streamTaskResetStatus(pTask); // continue after lock the meta again - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); SStreamTask** ppHTask = NULL; if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { @@ -1958,47 +1895,49 @@ int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) { if (updateTasks < numOfTasks) { tqDebug("vgId:%d closed tasks:%d, unclosed:%d, all tasks will be started when nodeEp update completed", vgId, updateTasks, (numOfTasks - updateTasks)); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); } else { if (!pTq->pVnode->restored) { tqDebug("vgId:%d vnode restore not completed, not restart the tasks, clear the start after nodeUpdate flag", vgId); pMeta->startInfo.startAllTasksFlag = 0; - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); } else { - tqDebug("vgId:%d tasks are all updated and stopped, restart them", vgId); + tqInfo("vgId:%d tasks are all updated and stopped, restart them", vgId); terrno = 0; - taosWUnLockLatch(&pMeta->lock); + + streamMetaWUnLock(pMeta); while (streamMetaTaskInTimer(pMeta)) { - qDebug("vgId:%d some tasks in timer, wait for 100ms and recheck", pMeta->vgId); + tqDebug("vgId:%d some tasks in timer, wait for 100ms and recheck", pMeta->vgId); taosMsleep(100); } - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); + int32_t code = streamMetaReopen(pMeta); if (code != 0) { tqError("vgId:%d failed to reopen stream meta", vgId); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); taosArrayDestroy(req.pNodeList); return -1; } if (streamMetaLoadAllTasks(pTq->pStreamMeta) < 0) { tqError("vgId:%d failed to load stream tasks", vgId); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); taosArrayDestroy(req.pNodeList); return -1; } if (vnodeIsRoleLeader(pTq->pVnode) && !tsDisableStream) { - vInfo("vgId:%d restart all stream tasks after all tasks being updated", vgId); + tqInfo("vgId:%d restart all stream tasks after all tasks being updated", vgId); tqResetStreamTaskStatus(pTq); tqLaunchStreamTaskAsync(pTq); } else { - vInfo("vgId:%d, follower node not start stream tasks", vgId); + tqInfo("vgId:%d, follower node not start stream tasks", vgId); } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); } } diff --git a/source/dnode/vnode/src/tq/tqPush.c b/source/dnode/vnode/src/tq/tqPush.c index 50ee52f45b..f367bc96f8 100644 --- a/source/dnode/vnode/src/tq/tqPush.c +++ b/source/dnode/vnode/src/tq/tqPush.c @@ -35,9 +35,9 @@ int32_t tqPushMsg(STQ* pTq, tmsg_t msgType) { tqProcessSubmitReqForSubscribe(pTq); } - taosRLockLatch(&pTq->pStreamMeta->lock); + streamMetaRLock(pTq->pStreamMeta); int32_t numOfTasks = streamMetaGetNumOfTasks(pTq->pStreamMeta); - taosRUnLockLatch(&pTq->pStreamMeta->lock); + streamMetaRUnLock(pTq->pStreamMeta); // tqTrace("vgId:%d handle submit, restore:%d, numOfTasks:%d", TD_VID(pTq->pVnode), pTq->pVnode->restored, numOfTasks); diff --git a/source/dnode/vnode/src/tq/tqRead.c b/source/dnode/vnode/src/tq/tqRead.c index dadbd30808..bd2a591a98 100644 --- a/source/dnode/vnode/src/tq/tqRead.c +++ b/source/dnode/vnode/src/tq/tqRead.c @@ -1111,7 +1111,7 @@ int32_t tqUpdateTbUidList(STQ* pTq, const SArray* tbUidList, bool isAdd) { taosWUnLockLatch(&pTq->lock); // update the table list handle for each stream scanner/wal reader - taosWLockLatch(&pTq->pStreamMeta->lock); + streamMetaWLock(pTq->pStreamMeta); while (1) { pIter = taosHashIterate(pTq->pStreamMeta->pTasksMap, pIter); if (pIter == NULL) { @@ -1128,6 +1128,6 @@ int32_t tqUpdateTbUidList(STQ* pTq, const SArray* tbUidList, bool isAdd) { } } - taosWUnLockLatch(&pTq->pStreamMeta->lock); + streamMetaWUnLock(pTq->pStreamMeta); return 0; } diff --git a/source/dnode/vnode/src/tq/tqStreamTask.c b/source/dnode/vnode/src/tq/tqStreamTask.c index 1f1dd61c3c..2d94f23009 100644 --- a/source/dnode/vnode/src/tq/tqStreamTask.c +++ b/source/dnode/vnode/src/tq/tqStreamTask.c @@ -38,18 +38,16 @@ int32_t tqScanWal(STQ* pTq) { doScanWalForAllTasks(pTq->pStreamMeta, &shouldIdle); if (shouldIdle) { - taosWLockLatch(&pMeta->lock); - + streamMetaWLock(pMeta); int32_t times = (--pMeta->walScanCounter); ASSERT(pMeta->walScanCounter >= 0); + streamMetaWUnLock(pMeta); - if (pMeta->walScanCounter <= 0) { - taosWUnLockLatch(&pMeta->lock); + if (times <= 0) { break; + } else { + tqDebug("vgId:%d scan wal for stream tasks for %d times in %dms", vgId, times, SCAN_WAL_IDLE_DURATION); } - - taosWUnLockLatch(&pMeta->lock); - tqDebug("vgId:%d scan wal for stream tasks for %d times in %dms", vgId, times, SCAN_WAL_IDLE_DURATION); } taosMsleep(SCAN_WAL_IDLE_DURATION); @@ -61,6 +59,7 @@ int32_t tqScanWal(STQ* pTq) { } int32_t tqStartStreamTask(STQ* pTq) { + int32_t code = TSDB_CODE_SUCCESS; int32_t vgId = TD_VID(pTq->pVnode); SStreamMeta* pMeta = pTq->pStreamMeta; @@ -71,11 +70,11 @@ int32_t tqStartStreamTask(STQ* pTq) { } SArray* pTaskList = NULL; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); pTaskList = taosArrayDup(pMeta->pTaskList, NULL); taosHashClear(pMeta->startInfo.pReadyTaskSet); pMeta->startInfo.startTs = taosGetTimestampMs(); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); // broadcast the check downstream tasks msg for (int32_t i = 0; i < numOfTasks; ++i) { @@ -104,12 +103,16 @@ int32_t tqStartStreamTask(STQ* pTq) { } EStreamTaskEvent event = (HAS_RELATED_FILLHISTORY_TASK(pTask)) ? TASK_EVENT_INIT_STREAM_SCANHIST : TASK_EVENT_INIT; - streamTaskHandleEvent(pTask->status.pSM, event); + int32_t ret = streamTaskHandleEvent(pTask->status.pSM, event); + if (ret != TSDB_CODE_SUCCESS) { + code = ret; + } + streamMetaReleaseTask(pMeta, pTask); } taosArrayDestroy(pTaskList); - return 0; + return code; } int32_t tqLaunchStreamTaskAsync(STQ* pTq) { @@ -148,12 +151,12 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { return TSDB_CODE_SUCCESS; } - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); if (numOfTasks == 0) { tqDebug("vgId:%d no stream tasks existed to run", vgId); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return 0; } @@ -164,7 +167,7 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { if (pMeta->walScanCounter > 1) { tqDebug("vgId:%d wal read task has been launched, remain scan times:%d", vgId, pMeta->walScanCounter); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return 0; } @@ -174,7 +177,7 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { // reset the counter value, since we do not launch the scan wal operation. pMeta->walScanCounter = 0; - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return 0; } @@ -182,7 +185,7 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { if (pRunReq == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; tqError("vgId:%d failed to create msg to start wal scanning to launch stream tasks, code:%s", vgId, terrstr()); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return -1; } @@ -193,7 +196,7 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { SRpcMsg msg = {.msgType = TDMT_STREAM_TASK_RUN, .pCont = pRunReq, .contLen = sizeof(SStreamTaskRunReq)}; tmsgPutToQueue(&pTq->pVnode->msgCb, STREAM_QUEUE, &msg); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return 0; } @@ -209,9 +212,9 @@ int32_t tqStopStreamTasks(STQ* pTq) { } SArray* pTaskList = NULL; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); pTaskList = taosArrayDup(pMeta->pTaskList, NULL); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); for (int32_t i = 0; i < numOfTasks; ++i) { SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); @@ -412,9 +415,9 @@ int32_t doScanWalForAllTasks(SStreamMeta* pStreamMeta, bool* pScanIdle) { // clone the task list, to avoid the task update during scan wal files SArray* pTaskList = NULL; - taosWLockLatch(&pStreamMeta->lock); + streamMetaWLock(pStreamMeta); pTaskList = taosArrayDup(pStreamMeta->pTaskList, NULL); - taosWUnLockLatch(&pStreamMeta->lock); + streamMetaWUnLock(pStreamMeta); tqDebug("vgId:%d start to check wal to extract new submit block for %d tasks", vgId, numOfTasks); @@ -446,6 +449,7 @@ int32_t doScanWalForAllTasks(SStreamMeta* pStreamMeta, bool* pScanIdle) { int64_t maxVer = (pTask->info.fillHistory == 1) ? pTask->dataRange.range.maxVer : INT64_MAX; taosThreadMutexLock(&pTask->lock); + tqDebug("s-task:%s lock", pTask->id.idStr); char* p = NULL; ETaskStatus status = streamTaskGetStatus(pTask, &p); diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index dcc31526f7..4a0c987e57 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -554,10 +554,12 @@ static void vnodeRestoreFinish(const SSyncFSM *pFsm, const SyncIndex commitIdx) walApplyVer(pVnode->pWal, commitIdx); pVnode->restored = true; - taosWLockLatch(&pVnode->pTq->pStreamMeta->lock); - if (pVnode->pTq->pStreamMeta->startInfo.startAllTasksFlag) { + SStreamMeta* pMeta = pVnode->pTq->pStreamMeta; + streamMetaWLock(pMeta); + + if (pMeta->startInfo.startAllTasksFlag) { vInfo("vgId:%d, sync restore finished, stream tasks will be launched by other thread", vgId); - taosWUnLockLatch(&pVnode->pTq->pStreamMeta->lock); + streamMetaWUnLock(pMeta); return; } @@ -574,7 +576,7 @@ static void vnodeRestoreFinish(const SSyncFSM *pFsm, const SyncIndex commitIdx) vInfo("vgId:%d, sync restore finished, not launch stream tasks since not leader", vgId); } - taosWUnLockLatch(&pVnode->pTq->pStreamMeta->lock); + streamMetaWUnLock(pMeta); } static void vnodeBecomeFollower(const SSyncFSM *pFsm) { diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 9eaa9fcb92..81840aaeb7 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -184,13 +184,13 @@ int32_t streamProcessCheckpointBlock(SStreamTask* pTask, SStreamDataBlock* pBloc { // todo: remove this when the pipeline checkpoint generating is used. SStreamMeta* pMeta = pTask->pMeta; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); if (pMeta->chkptNotReadyTasks == 0) { pMeta->chkptNotReadyTasks = pMeta->numOfStreamTasks; } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); } //todo fix race condition: set the status and append checkpoint block @@ -281,8 +281,9 @@ void streamTaskClearCheckInfo(SStreamTask* pTask) { int32_t streamSaveAllTaskStatus(SStreamMeta* pMeta, int64_t checkpointId) { int32_t vgId = pMeta->vgId; + int32_t code = 0; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); for (int32_t i = 0; i < taosArrayGetSize(pMeta->pTaskList); ++i) { STaskId* pId = taosArrayGet(pMeta->pTaskList, i); @@ -304,10 +305,10 @@ int32_t streamSaveAllTaskStatus(SStreamMeta* pMeta, int64_t checkpointId) { char* str = NULL; streamTaskGetStatus(p, &str); - int32_t code = streamTaskHandleEvent(p->status.pSM, TASK_EVENT_CHECKPOINT_DONE); + code = streamTaskHandleEvent(p->status.pSM, TASK_EVENT_CHECKPOINT_DONE); if (code != TSDB_CODE_SUCCESS) { stDebug("s-task:%s vgId:%d save task status failed, since handle event failed", p->id.idStr, vgId); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return -1; } else { // save the task streamMetaSaveTask(pMeta, p); @@ -320,17 +321,16 @@ int32_t streamSaveAllTaskStatus(SStreamMeta* pMeta, int64_t checkpointId) { str); } - if (streamMetaCommit(pMeta) < 0) { - taosWUnLockLatch(&pMeta->lock); + code = streamMetaCommit(pMeta); + if (code < 0) { stError("vgId:%d failed to commit stream meta after do checkpoint, checkpointId:%" PRId64 ", since %s", pMeta->vgId, checkpointId, terrstr()); - return -1; } else { - taosWUnLockLatch(&pMeta->lock); stInfo("vgId:%d commit stream meta after do checkpoint, checkpointId:%" PRId64 " DONE", pMeta->vgId, checkpointId); } - return TSDB_CODE_SUCCESS; + streamMetaWUnLock(pMeta); + return code; } int32_t streamTaskBuildCheckpoint(SStreamTask* pTask) { diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index cd8bbacb98..edfc66762d 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -41,9 +41,9 @@ static int32_t tInitStreamDispatchReq(SStreamDispatchReq* pReq, const SStreamTas int32_t numOfBlocks, int64_t dstTaskId, int32_t type); void initRpcMsg(SRpcMsg* pMsg, int32_t msgType, void* pCont, int32_t contLen) { - pMsg->msgType = msgType; - pMsg->pCont = pCont; - pMsg->contLen = contLen; + pMsg->msgType = msgType; + pMsg->pCont = pCont; + pMsg->contLen = contLen; } int32_t tEncodeStreamDispatchReq(SEncoder* pEncoder, const SStreamDispatchReq* pReq) { @@ -429,6 +429,7 @@ static void doRetryDispatchData(void* param, void* tmrId) { ASSERT(pTask->outputq.status == TASK_OUTPUT_STATUS__WAIT); int32_t code = 0; + { SArray* pList = taosArrayDup(pTask->msgInfo.pRetryList, NULL); taosArrayClear(pTask->msgInfo.pRetryList); @@ -440,7 +441,7 @@ static void doRetryDispatchData(void* param, void* tmrId) { int32_t numOfVgroups = taosArrayGetSize(vgInfo); int32_t numOfFailed = taosArrayGetSize(pList); - stDebug("s-task:%s (child taskId:%d) re-try shuffle-dispatch blocks to %d vgroup(s), msgId:%d", + stDebug("s-task:%s (child taskId:%d) retry shuffle-dispatch blocks to %d vgroup(s), msgId:%d", id, pTask->info.selfChildId, numOfFailed, msgId); for (int32_t i = 0; i < numOfFailed; i++) { @@ -471,6 +472,8 @@ static void doRetryDispatchData(void* param, void* tmrId) { code = doSendDispatchMsg(pTask, pReq, vgId, pEpSet); } + + taosArrayDestroy(pList); } if (code != TSDB_CODE_SUCCESS) { @@ -1004,6 +1007,8 @@ int32_t streamAddEndScanHistoryMsg(SStreamTask* pTask, SRpcHandleInfo* pRpcInfo, info.msg.info = *pRpcInfo; taosThreadMutexLock(&pTask->lock); + stDebug("s-task:%s lock", pTask->id.idStr); + if (pTask->pRspMsgList == NULL) { pTask->pRspMsgList = taosArrayInit(4, sizeof(SStreamContinueExecInfo)); } diff --git a/source/libs/stream/src/streamExec.c b/source/libs/stream/src/streamExec.c index 15b809d6bd..49f691c558 100644 --- a/source/libs/stream/src/streamExec.c +++ b/source/libs/stream/src/streamExec.c @@ -285,33 +285,31 @@ static void waitForTaskIdle(SStreamTask* pTask, SStreamTask* pStreamTask) { int32_t streamDoTransferStateToStreamTask(SStreamTask* pTask) { SStreamMeta* pMeta = pTask->pMeta; + const char* id = pTask->id.idStr; SStreamTask* pStreamTask = streamMetaAcquireTask(pMeta, pTask->streamTaskId.streamId, pTask->streamTaskId.taskId); if (pStreamTask == NULL) { stError( "s-task:%s failed to find related stream task:0x%x, it may have been destroyed or closed, destroy the related " "fill-history task", - pTask->id.idStr, (int32_t) pTask->streamTaskId.taskId); + id, (int32_t) pTask->streamTaskId.taskId); // 1. free it and remove fill-history task from disk meta-store streamBuildAndSendDropTaskMsg(pTask->pMsgCb, pMeta->vgId, &pTask->id); // 2. save to disk - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); if (streamMetaCommit(pMeta) < 0) { // persist to disk } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return TSDB_CODE_STREAM_TASK_NOT_EXIST; } else { - stDebug("s-task:%s fill-history task end, update related stream task:%s info, transfer exec state", pTask->id.idStr, + stDebug("s-task:%s fill-history task end, update related stream task:%s info, transfer exec state", id, pStreamTask->id.idStr); } ETaskStatus status = streamTaskGetStatus(pStreamTask, NULL); - ASSERT(((status == TASK_STATUS__DROPPING) || (pStreamTask->hTaskInfo.id.taskId == pTask->id.taskId)) && - pTask->status.appendTranstateBlock == true); - STimeWindow* pTimeWindow = &pStreamTask->dataRange.window; // It must be halted for a source stream task, since when the related scan-history-data task start scan the history @@ -321,7 +319,7 @@ int32_t streamDoTransferStateToStreamTask(SStreamTask* pTask) { } else { ASSERT(status == TASK_STATUS__READY|| status == TASK_STATUS__DROPPING || status == TASK_STATUS__STOP); streamTaskHandleEvent(pStreamTask->status.pSM, TASK_EVENT_HALT); - stDebug("s-task:%s halt by related fill-history task:%s", pStreamTask->id.idStr, pTask->id.idStr); + stDebug("s-task:%s halt by related fill-history task:%s", pStreamTask->id.idStr, id); } // wait for the stream task to handle all in the inputQ, and to be idle @@ -331,7 +329,13 @@ int32_t streamDoTransferStateToStreamTask(SStreamTask* pTask) { // In case of source tasks and agg tasks, we should HALT them, and wait for them to be idle. And then, it's safe to // start the task state transfer procedure. char* p = NULL; - streamTaskGetStatus(pStreamTask, &p); + status = streamTaskGetStatus(pStreamTask, &p); + if (status == TASK_STATUS__STOP || status == TASK_STATUS__DROPPING) { + stError("s-task:%s failed to transfer state from fill-history task:%s, status:%s", id, pStreamTask->id.idStr, p); + streamMetaReleaseTask(pMeta, pStreamTask); + return TSDB_CODE_STREAM_TASK_IVLD_STATUS; + } + if (pStreamTask->info.taskLevel == TASK_LEVEL__SOURCE) { // update the scan data range for source task. stDebug("s-task:%s level:%d stream task window %" PRId64 " - %" PRId64 " update to %" PRId64 " - %" PRId64 @@ -350,30 +354,18 @@ int32_t streamDoTransferStateToStreamTask(SStreamTask* pTask) { streamTaskReleaseState(pTask); streamTaskReloadState(pStreamTask); - // 3. resume the state of stream task, after this function, the stream task will run immidately. But it can not be - // pause, since the pause allowed attribute is not set yet. - streamTaskResume(pStreamTask); // todo refactor: use streamTaskResume. + // 3. resume the state of stream task, after this function, the stream task will run immediately. + streamTaskResume(pStreamTask); - stDebug("s-task:%s fill-history task set status to be dropping, save the state into disk", pTask->id.idStr); + stDebug("s-task:%s fill-history task set status to be dropping, save the state into disk", id); // 4. free it and remove fill-history task from disk meta-store streamBuildAndSendDropTaskMsg(pTask->pMsgCb, pMeta->vgId, &pTask->id); - // 5. clear the link between fill-history task and stream task info -// CLEAR_RELATED_FILLHISTORY_TASK(pStreamTask); - - // 6. save to disk - taosWLockLatch(&pMeta->lock); - + // 5. save to disk pStreamTask->status.taskStatus = streamTaskGetStatus(pStreamTask, NULL); -// streamMetaSaveTask(pMeta, pStreamTask); -// if (streamMetaCommit(pMeta) < 0) { - // persist to disk -// } - taosWUnLockLatch(&pMeta->lock); - // 7. pause allowed. - streamTaskEnablePause(pStreamTask); + // 6. pause allowed. if ((pStreamTask->info.taskLevel == TASK_LEVEL__SOURCE) && taosQueueEmpty(pStreamTask->inputq.queue->pQueue)) { SStreamRefDataBlock* pItem = taosAllocateQitem(sizeof(SStreamRefDataBlock), DEF_QITEM, 0); diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index f788e244cd..76945f17a9 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -260,13 +260,9 @@ int32_t streamMetaReopen(SStreamMeta* pMeta) { } } - pMeta->streamBackend = streamBackendInit(pMeta->path, pMeta->chkpId); - while (pMeta->streamBackend == NULL) { + while ((pMeta->streamBackend = streamBackendInit(pMeta->path, pMeta->chkpId)) == NULL) { + stInfo("vgId:%d failed to init stream backend, retry in 100ms", pMeta->vgId); taosMsleep(100); - pMeta->streamBackend = streamBackendInit(pMeta->path, pMeta->chkpId); - if (pMeta->streamBackend == NULL) { - stInfo("vgId:%d failed to init stream backend, retry in 100ms", pMeta->vgId); - } } pMeta->streamBackendRid = taosAddRef(streamBackendId, pMeta->streamBackend); @@ -447,20 +443,20 @@ int32_t streamMetaGetNumOfTasks(SStreamMeta* pMeta) { } SStreamTask* streamMetaAcquireTask(SStreamMeta* pMeta, int64_t streamId, int32_t taskId) { - taosRLockLatch(&pMeta->lock); + streamMetaRLock(pMeta); STaskId id = {.streamId = streamId, .taskId = taskId}; SStreamTask** ppTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (ppTask != NULL) { if (!streamTaskShouldStop(*ppTask)) { int32_t ref = atomic_add_fetch_32(&(*ppTask)->refCnt, 1); - taosRUnLockLatch(&pMeta->lock); + streamMetaRUnLock(pMeta); stTrace("s-task:%s acquire task, ref:%d", (*ppTask)->id.idStr, ref); return *ppTask; } } - taosRUnLockLatch(&pMeta->lock); + streamMetaRUnLock(pMeta); return NULL; } @@ -491,7 +487,7 @@ int32_t streamMetaUnregisterTask(SStreamMeta* pMeta, int64_t streamId, int32_t t SStreamTask* pTask = NULL; // pre-delete operation - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); STaskId id = {.streamId = streamId, .taskId = taskId}; SStreamTask** ppTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); @@ -508,34 +504,35 @@ int32_t streamMetaUnregisterTask(SStreamMeta* pMeta, int64_t streamId, int32_t t streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_DROPPING); } else { stDebug("vgId:%d failed to find the task:0x%x, it may be dropped already", pMeta->vgId, taskId); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return 0; } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); stDebug("s-task:0x%x set task status:dropping and start to unregister it", taskId); while (1) { - taosRLockLatch(&pMeta->lock); + streamMetaRLock(pMeta); ppTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (ppTask) { if ((*ppTask)->status.timerActive == 0) { - taosRUnLockLatch(&pMeta->lock); + streamMetaRUnLock(pMeta); break; } taosMsleep(10); stDebug("s-task:%s wait for quit from timer", (*ppTask)->id.idStr); - taosRUnLockLatch(&pMeta->lock); + streamMetaRUnLock(pMeta); } else { - taosRUnLockLatch(&pMeta->lock); + streamMetaRUnLock(pMeta); break; } } // let's do delete of stream task - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); + ppTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (ppTask) { pTask = *ppTask; @@ -565,19 +562,16 @@ int32_t streamMetaUnregisterTask(SStreamMeta* pMeta, int64_t streamId, int32_t t stDebug("vgId:%d failed to find the task:0x%x, it may have been dropped already", pMeta->vgId, taskId); } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return 0; } int32_t streamMetaBegin(SStreamMeta* pMeta) { - taosWLockLatch(&pMeta->lock); - if (tdbBegin(pMeta->db, &pMeta->txn, tdbDefaultMalloc, tdbDefaultFree, NULL, - TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED) < 0) { - taosWUnLockLatch(&pMeta->lock); - return -1; - } - taosWUnLockLatch(&pMeta->lock); - return 0; + streamMetaWLock(pMeta); + int32_t code = tdbBegin(pMeta->db, &pMeta->txn, tdbDefaultMalloc, tdbDefaultFree, NULL, + TDB_TXN_WRITE | TDB_TXN_READ_UNCOMMITTED); + streamMetaWUnLock(pMeta); + return code; } // todo add error log @@ -846,6 +840,12 @@ static bool waitForEnoughDuration(SMetaHbInfo* pInfo) { return false; } +static void clearHbMsg(SStreamHbMsg* pMsg, SArray* pIdList) { + taosArrayDestroy(pMsg->pTaskStatus); + taosArrayDestroy(pMsg->pUpdateNodes); + taosArrayDestroy(pIdList); +} + void metaHbToMnode(void* param, void* tmrId) { int64_t rid = *(int64_t*)param; @@ -884,19 +884,33 @@ void metaHbToMnode(void* param, void* tmrId) { stDebug("vgId:%d build stream task hb, leader:%d", pMeta->vgId, (pMeta->role == NODE_ROLE_LEADER)); SStreamHbMsg hbMsg = {0}; - taosRLockLatch(&pMeta->lock); + SEpSet epset = {0}; + bool hasMnodeEpset = false; + int32_t stage = 0; + + streamMetaRLock(pMeta); + int32_t numOfTasks = streamMetaGetNumOfTasks(pMeta); - - SEpSet epset = {0}; - bool hasMnodeEpset = false; - hbMsg.vgId = pMeta->vgId; + stage = pMeta->stage; + + SArray* pIdList = taosArrayDup(pMeta->pTaskList, NULL); + + streamMetaRUnLock(pMeta); + hbMsg.pTaskStatus = taosArrayInit(numOfTasks, sizeof(STaskStatusEntry)); hbMsg.pUpdateNodes = taosArrayInit(numOfTasks, sizeof(int32_t)); for (int32_t i = 0; i < numOfTasks; ++i) { - STaskId* pId = taosArrayGet(pMeta->pTaskList, i); + STaskId* pId = taosArrayGet(pIdList, i); + + streamMetaRLock(pMeta); SStreamTask** pTask = taosHashGet(pMeta->pTasksMap, pId, sizeof(*pId)); + streamMetaRUnLock(pMeta); + + if (pTask == NULL) { + continue; + } // not report the status of fill-history task if ((*pTask)->info.fillHistory == 1) { @@ -906,12 +920,12 @@ void metaHbToMnode(void* param, void* tmrId) { STaskStatusEntry entry = { .id = *pId, .status = streamTaskGetStatus(*pTask, NULL), - .nodeId = pMeta->vgId, - .stage = pMeta->stage, + .nodeId = hbMsg.vgId, + .stage = stage, .inputQUsed = SIZE_IN_MiB(streamQueueGetItemSize((*pTask)->inputq.queue)), }; - entry.inputRate = entry.inputQUsed*100.0/STREAM_TASK_QUEUE_CAPACITY_IN_SIZE; + entry.inputRate = entry.inputQUsed * 100.0 / STREAM_TASK_QUEUE_CAPACITY_IN_SIZE; if ((*pTask)->info.taskLevel == TASK_LEVEL__SINK) { entry.sinkQuota = (*pTask)->outputInfo.pTokenBucket->quotaRate; entry.sinkDataSize = SIZE_IN_MiB((*pTask)->execInfo.sink.dataSize); @@ -930,9 +944,9 @@ void metaHbToMnode(void* param, void* tmrId) { taosThreadMutexLock(&(*pTask)->lock); int32_t num = taosArrayGetSize((*pTask)->outputInfo.pDownstreamUpdateList); for (int j = 0; j < num; ++j) { - int32_t *pNodeId = taosArrayGet((*pTask)->outputInfo.pDownstreamUpdateList, j); + int32_t* pNodeId = taosArrayGet((*pTask)->outputInfo.pDownstreamUpdateList, j); - bool exist = false; + bool exist = false; int32_t numOfExisted = taosArrayGetSize(hbMsg.pUpdateNodes); for (int k = 0; k < numOfExisted; ++k) { if (*pNodeId == *(int32_t*)taosArrayGet(hbMsg.pUpdateNodes, k)) { @@ -957,7 +971,6 @@ void metaHbToMnode(void* param, void* tmrId) { } hbMsg.numOfTasks = taosArrayGetSize(hbMsg.pTaskStatus); - taosRUnLockLatch(&pMeta->lock); if (hasMnodeEpset) { int32_t code = 0; @@ -966,17 +979,13 @@ void metaHbToMnode(void* param, void* tmrId) { tEncodeSize(tEncodeStreamHbMsg, &hbMsg, tlen, code); if (code < 0) { stError("vgId:%d encode stream hb msg failed, code:%s", pMeta->vgId, tstrerror(code)); - taosArrayDestroy(hbMsg.pTaskStatus); - taosReleaseRef(streamMetaId, rid); - return; + goto _end; } void* buf = rpcMallocCont(tlen); if (buf == NULL) { stError("vgId:%d encode stream hb msg failed, code:%s", pMeta->vgId, tstrerror(TSDB_CODE_OUT_OF_MEMORY)); - taosArrayDestroy(hbMsg.pTaskStatus); - taosReleaseRef(streamMetaId, rid); - return; + goto _end; } SEncoder encoder; @@ -984,15 +993,12 @@ void metaHbToMnode(void* param, void* tmrId) { if ((code = tEncodeStreamHbMsg(&encoder, &hbMsg)) < 0) { rpcFreeCont(buf); stError("vgId:%d encode stream hb msg failed, code:%s", pMeta->vgId, tstrerror(code)); - taosArrayDestroy(hbMsg.pTaskStatus); - taosReleaseRef(streamMetaId, rid); - return; + goto _end; } tEncoderClear(&encoder); - SRpcMsg msg = {0}; + SRpcMsg msg = {.info.noResp = 1,}; initRpcMsg(&msg, TDMT_MND_STREAM_HEARTBEAT, buf, tlen); - msg.info.noResp = 1; pMeta->pHbInfo->hbCount += 1; @@ -1003,16 +1009,15 @@ void metaHbToMnode(void* param, void* tmrId) { stDebug("vgId:%d no tasks and no mnd epset, not send stream hb to mnode", pMeta->vgId); } - taosArrayDestroy(hbMsg.pTaskStatus); - taosArrayDestroy(hbMsg.pUpdateNodes); - + _end: + clearHbMsg(&hbMsg, pIdList); taosTmrReset(metaHbToMnode, META_HB_CHECK_INTERVAL, param, streamEnv.timer, &pMeta->pHbInfo->hbTmr); taosReleaseRef(streamMetaId, rid); } bool streamMetaTaskInTimer(SStreamMeta* pMeta) { bool inTimer = false; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); void* pIter = NULL; while (1) { @@ -1027,7 +1032,7 @@ bool streamMetaTaskInTimer(SStreamMeta* pMeta) { } } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return inTimer; } @@ -1037,7 +1042,7 @@ void streamMetaNotifyClose(SStreamMeta* pMeta) { stDebug("vgId:%d notify all stream tasks that the vnode is closing. isLeader:%d startHb%" PRId64 ", totalHb:%d", vgId, (pMeta->role == NODE_ROLE_LEADER), pMeta->pHbInfo->hbStart, pMeta->pHbInfo->hbCount); - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); void* pIter = NULL; while (1) { @@ -1051,7 +1056,7 @@ void streamMetaNotifyClose(SStreamMeta* pMeta) { streamTaskStop(pTask); } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); // wait for the stream meta hb function stopping if (pMeta->role == NODE_ROLE_LEADER) { @@ -1090,4 +1095,23 @@ void streamMetaResetStartInfo(STaskStartInfo* pStartInfo) { taosHashClear(pStartInfo->pReadyTaskSet); pStartInfo->startAllTasksFlag = 0; pStartInfo->readyTs = 0; -} \ No newline at end of file +} + +void streamMetaRLock(SStreamMeta* pMeta) { + stTrace("vgId:%d meta-rlock", pMeta->vgId); + taosRLockLatch(&pMeta->lock); +} +void streamMetaRUnLock(SStreamMeta* pMeta) { + stTrace("vgId:%d meta-runlock", pMeta->vgId); + taosRUnLockLatch(&pMeta->lock); + +} +void streamMetaWLock(SStreamMeta* pMeta) { + stTrace("vgId:%d meta-wlock", pMeta->vgId); + taosWLockLatch(&pMeta->lock); +} +void streamMetaWUnLock(SStreamMeta* pMeta) { + stTrace("vgId:%d meta-wunlock", pMeta->vgId); + taosWUnLockLatch(&pMeta->lock); +} + diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 2d951147d0..5ebc60dc13 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -580,13 +580,13 @@ int32_t streamProcessScanHistoryFinishRsp(SStreamTask* pTask) { SStreamMeta* pMeta = pTask->pMeta; // execute in the scan history complete call back msg, ready to process data from inputQ - streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_SCANHIST_DONE); + int32_t code = streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_SCANHIST_DONE); streamTaskSetSchedStatusInactive(pTask); - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); streamMetaSaveTask(pMeta, pTask); streamMetaCommit(pMeta); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); // history data scan in the stream time window finished, now let's enable the pause streamTaskEnablePause(pTask); @@ -624,7 +624,8 @@ static void tryLaunchHistoryTask(void* param, void* tmrId) { SLaunchHTaskInfo* pInfo = param; SStreamMeta* pMeta = pInfo->pMeta; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); + SStreamTask** ppTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &pInfo->id, sizeof(pInfo->id)); if (ppTask) { ASSERT((*ppTask)->status.timerActive >= 1); @@ -637,11 +638,11 @@ static void tryLaunchHistoryTask(void* param, void* tmrId) { (*ppTask)->id.idStr, p, (*ppTask)->hTaskInfo.retryTimes, ref); taosMemoryFree(pInfo); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return; } } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); SStreamTask* pTask = streamMetaAcquireTask(pMeta, pInfo->id.streamId, pInfo->id.taskId); if (pTask != NULL) { @@ -934,66 +935,6 @@ void streamTaskSetRangeStreamCalc(SStreamTask* pTask) { } void streamTaskPause(SStreamTask* pTask, SStreamMeta* pMeta) { -#if 0 - int8_t status = pTask->status.taskStatus; - if (status == TASK_STATUS__DROPPING) { - stDebug("vgId:%d s-task:%s task already dropped, do nothing", pMeta->vgId, pTask->id.idStr); - return; - } - - const char* str = streamGetTaskStatusStr(status); - if (status == TASK_STATUS__STOP || status == TASK_STATUS__PAUSE) { - stDebug("vgId:%d s-task:%s task already stopped/paused, status:%s, do nothing", pMeta->vgId, pTask->id.idStr, str); - return; - } - - if(pTask->info.taskLevel == TASK_LEVEL__SINK) { - int32_t num = atomic_add_fetch_32(&pMeta->numOfPausedTasks, 1); - stInfo("vgId:%d s-task:%s pause stream sink task. pause task num:%d", pMeta->vgId, pTask->id.idStr, num); - return; - } - - while (!pTask->status.pauseAllowed || (pTask->status.taskStatus == TASK_STATUS__HALT)) { - status = pTask->status.taskStatus; - if (status == TASK_STATUS__DROPPING) { - stDebug("vgId:%d s-task:%s task already dropped, do nothing", pMeta->vgId, pTask->id.idStr); - return; - } - - if (status == TASK_STATUS__STOP || status == TASK_STATUS__PAUSE) { - stDebug("vgId:%d s-task:%s task already stopped/paused, status:%s, do nothing", pMeta->vgId, pTask->id.idStr, str); - return; - } -// -// if (pTask->status.downstreamReady == 0) { -// ASSERT(pTask->execInfo.start == 0); -// stDebug("s-task:%s in check downstream procedure, abort and paused", pTask->id.idStr); -// break; -// } - - const char* pStatus = streamGetTaskStatusStr(status); - stDebug("s-task:%s wait for the task can be paused, status:%s, vgId:%d", pTask->id.idStr, pStatus, pMeta->vgId); - taosMsleep(100); - } - - // todo: use the task lock, stead of meta lock - taosWLockLatch(&pMeta->lock); - - status = pTask->status.taskStatus; - if (status == TASK_STATUS__DROPPING || status == TASK_STATUS__STOP) { - taosWUnLockLatch(&pMeta->lock); - stDebug("vgId:%d s-task:%s task already dropped/stopped/paused, do nothing", pMeta->vgId, pTask->id.idStr); - return; - } - - atomic_store_8(&pTask->status.keepTaskStatus, pTask->status.taskStatus); - atomic_store_8(&pTask->status.taskStatus, TASK_STATUS__PAUSE); - int32_t num = atomic_add_fetch_32(&pMeta->numOfPausedTasks, 1); - stInfo("vgId:%d s-task:%s pause stream task. pause task num:%d", pMeta->vgId, pTask->id.idStr, num); - taosWUnLockLatch(&pMeta->lock); - -#endif - streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_PAUSE); int32_t num = atomic_add_fetch_32(&pMeta->numOfPausedTasks, 1); @@ -1029,19 +970,6 @@ void streamTaskResume(SStreamTask* pTask) { } } -// todo fix race condition -void streamTaskDisablePause(SStreamTask* pTask) { - // pre-condition check -// const char* id = pTask->id.idStr; -// while (pTask->status.taskStatus == TASK_STATUS__PAUSE) { -// stDebug("s-task:%s already in pause, wait for pause being cancelled, and set pause disabled, recheck in 100ms", id); -// taosMsleep(100); -// } -// -// stDebug("s-task:%s disable task pause", id); -// pTask->status.pauseAllowed = 0; -} - void streamTaskEnablePause(SStreamTask* pTask) { stDebug("s-task:%s enable task pause", pTask->id.idStr); pTask->status.pauseAllowed = 1; @@ -1050,7 +978,7 @@ void streamTaskEnablePause(SStreamTask* pTask) { int32_t streamMetaUpdateTaskReadyInfo(SStreamTask* pTask) { SStreamMeta* pMeta = pTask->pMeta; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); STaskId id = streamTaskExtractKey(pTask); taosHashPut(pMeta->startInfo.pReadyTaskSet, &id, sizeof(id), NULL, 0); @@ -1071,6 +999,6 @@ int32_t streamMetaUpdateTaskReadyInfo(SStreamTask* pTask) { pStartInfo->elapsedTime / 1000.0); } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); return TSDB_CODE_SUCCESS; } diff --git a/source/libs/stream/src/streamState.c b/source/libs/stream/src/streamState.c index e9960a0059..5079b341d6 100644 --- a/source/libs/stream/src/streamState.c +++ b/source/libs/stream/src/streamState.c @@ -121,7 +121,7 @@ SStreamState* streamStateOpen(char* path, void* pTask, bool specPath, int32_t sz #ifdef USE_ROCKSDB SStreamMeta* pMeta = pStreamTask->pMeta; pState->streamBackendRid = pMeta->streamBackendRid; - // taosWLockLatch(&pMeta->lock); + // streamMetaWLock(pMeta); taosThreadMutexLock(&pMeta->backendMutex); void* uniqueId = taosHashGet(pMeta->pTaskBackendUnique, pState->pTdbState->idstr, strlen(pState->pTdbState->idstr) + 1); diff --git a/source/libs/stream/src/streamTaskSm.c b/source/libs/stream/src/streamTaskSm.c index 1aa4dd8e13..09656cbe97 100644 --- a/source/libs/stream/src/streamTaskSm.c +++ b/source/libs/stream/src/streamTaskSm.c @@ -20,6 +20,8 @@ #include "ttimer.h" #include "wal.h" +#define GET_EVT_NAME(_ev) (StreamTaskEventList[(_ev)].name) + SStreamTaskState StreamTaskStatusList[9] = { {.state = TASK_STATUS__READY, .name = "ready"}, {.state = TASK_STATUS__DROPPING, .name = "dropped"}, @@ -66,7 +68,7 @@ static int32_t attachEvent(SStreamTask* pTask, SAttachedEventInfo* pEvtInfo) { streamTaskGetStatus(pTask, &p); stDebug("s-task:%s status:%s attach event:%s required status:%s, since not allowed to handle it", pTask->id.idStr, p, - StreamTaskEventList[pEvtInfo->event].name, StreamTaskStatusList[pEvtInfo->status].name); + GET_EVT_NAME(pEvtInfo->event), StreamTaskStatusList[pEvtInfo->status].name); taosArrayPush(pTask->status.pSM->pWaitingEventList, pEvtInfo); return 0; } @@ -80,13 +82,6 @@ int32_t streamTaskInitStatus(SStreamTask* pTask) { return 0; } -int32_t streamTaskSetReadyForWal(SStreamTask* pTask) { - if (pTask->info.taskLevel == TASK_LEVEL__SOURCE) { - stDebug("s-task:%s ready for extract data from wal", pTask->id.idStr); - } - return TSDB_CODE_SUCCESS; -} - static int32_t streamTaskDoCheckpoint(SStreamTask* pTask) { stDebug("s-task:%s start to do checkpoint", pTask->id.idStr); return 0; @@ -105,6 +100,39 @@ int32_t streamTaskKeepCurrentVerInWal(SStreamTask* pTask) { return TSDB_CODE_SUCCESS; } +// todo check rsp code for handle Event:TASK_EVENT_SCANHIST_DONE +static bool isInvalidStateTransfer(ETaskStatus state, const EStreamTaskEvent event) { + if (event == TASK_EVENT_INIT_STREAM_SCANHIST || event == TASK_EVENT_INIT || event == TASK_EVENT_INIT_SCANHIST) { + return (state != TASK_STATUS__UNINIT); + } + + if (event == TASK_EVENT_SCANHIST_DONE) { + return (state != TASK_STATUS__SCAN_HISTORY && state != TASK_STATUS__STREAM_SCAN_HISTORY); + } + + if (event == TASK_EVENT_GEN_CHECKPOINT) { + return (state != TASK_STATUS__READY); + } + + if (event == TASK_EVENT_CHECKPOINT_DONE) { + return (state != TASK_STATUS__CK); + } + + // todo refactor later + if (event == TASK_EVENT_RESUME) { + return true; + } + + if (event == TASK_EVENT_HALT) { + if (state == TASK_STATUS__DROPPING || state == TASK_STATUS__UNINIT || state == TASK_STATUS__STOP || + state == TASK_STATUS__SCAN_HISTORY) { + return true; + } + } + + return false; +} + // todo optimize the perf of find the trans objs by using hash table static STaskStateTrans* streamTaskFindTransform(ETaskStatus state, const EStreamTaskEvent event) { int32_t numOfTrans = taosArrayGetSize(streamTaskSMTrans); @@ -115,10 +143,8 @@ static STaskStateTrans* streamTaskFindTransform(ETaskStatus state, const EStream } } - if (event == TASK_EVENT_CHECKPOINT_DONE && state == TASK_STATUS__STOP) { - - } else if (event == TASK_EVENT_GEN_CHECKPOINT && state == TASK_STATUS__UNINIT) { - // the task is set to uninit due to nodeEpset update, during processing checkpoint-trigger block. + if (isInvalidStateTransfer(state, event)) { + return NULL; } else { ASSERT(0); } @@ -128,9 +154,10 @@ static STaskStateTrans* streamTaskFindTransform(ETaskStatus state, const EStream void streamTaskRestoreStatus(SStreamTask* pTask) { SStreamTaskSM* pSM = pTask->status.pSM; - taosThreadMutexLock(&pTask->lock); - ASSERT(pSM->pActiveTrans == NULL); + taosThreadMutexLock(&pTask->lock); + + ASSERT(pSM->pActiveTrans == NULL); ASSERT(pSM->current.state == TASK_STATUS__PAUSE || pSM->current.state == TASK_STATUS__HALT); SStreamTaskState state = pSM->current; @@ -199,14 +226,14 @@ static int32_t doHandleEvent(SStreamTaskSM* pSM, EStreamTaskEvent event, STaskSt taosThreadMutexUnlock(&pTask->lock); if ((s == pTrans->next.state) && (pSM->prev.evt == pTrans->event)) { - stDebug("s-task:%s attached event:%s handled", id, StreamTaskEventList[pTrans->event].name); + stDebug("s-task:%s attached event:%s handled", id, GET_EVT_NAME(pTrans->event)); return TSDB_CODE_SUCCESS; } else if (s != TASK_STATUS__DROPPING && s != TASK_STATUS__STOP) { // this event has been handled already - stDebug("s-task:%s not handle event:%s yet, wait for 100ms and recheck", id, StreamTaskEventList[event].name); + stDebug("s-task:%s not handle event:%s yet, wait for 100ms and recheck", id, GET_EVT_NAME(event)); taosMsleep(100); } else { stDebug("s-task:%s is dropped or stopped already, not wait.", id); - return TSDB_CODE_INVALID_PARA; + return TSDB_CODE_STREAM_INVALID_STATETRANS; } } @@ -227,37 +254,41 @@ static int32_t doHandleEvent(SStreamTaskSM* pSM, EStreamTaskEvent event, STaskSt } int32_t streamTaskHandleEvent(SStreamTaskSM* pSM, EStreamTaskEvent event) { + int32_t code = TSDB_CODE_SUCCESS; SStreamTask* pTask = pSM->pTask; STaskStateTrans* pTrans = NULL; while (1) { taosThreadMutexLock(&pTask->lock); + if (pSM->pActiveTrans != NULL && pSM->pActiveTrans->autoInvokeEndFn) { + EStreamTaskEvent evt = pSM->pActiveTrans->event; taosThreadMutexUnlock(&pTask->lock); - taosMsleep(100); + stDebug("s-task:%s status:%s handling event:%s by some other thread, wait for 100ms and check if completed", - pTask->id.idStr, pSM->current.name, StreamTaskEventList[pSM->pActiveTrans->event].name); + pTask->id.idStr, pSM->current.name, GET_EVT_NAME(evt)); + taosMsleep(100); } else { pTrans = streamTaskFindTransform(pSM->current.state, event); if (pTrans == NULL) { - stDebug("s-task:%s failed to handle event:%s", pTask->id.idStr, StreamTaskEventList[event].name); + stDebug("s-task:%s failed to handle event:%s", pTask->id.idStr, GET_EVT_NAME(event)); taosThreadMutexUnlock(&pTask->lock); - return TSDB_CODE_INVALID_PARA; // todo: set new error code// failed to handle the event. + return TSDB_CODE_STREAM_INVALID_STATETRANS; } if (pSM->pActiveTrans != NULL) { // currently in some state transfer procedure, not auto invoke transfer, abort it stDebug("s-task:%s event:%s handle procedure quit, status %s -> %s failed, handle event %s now", - pTask->id.idStr, StreamTaskEventList[pSM->pActiveTrans->event].name, pSM->current.name, - pSM->pActiveTrans->next.name, StreamTaskEventList[event].name); + pTask->id.idStr, GET_EVT_NAME(pSM->pActiveTrans->event), pSM->current.name, + pSM->pActiveTrans->next.name, GET_EVT_NAME(event)); } - doHandleEvent(pSM, event, pTrans); + code = doHandleEvent(pSM, event, pTrans); break; } } - return TSDB_CODE_SUCCESS; + return code; } static void keepPrevInfo(SStreamTaskSM* pSM) { @@ -272,24 +303,27 @@ int32_t streamTaskOnHandleEventSuccess(SStreamTaskSM* pSM, EStreamTaskEvent even // do update the task status taosThreadMutexLock(&pTask->lock); - STaskStateTrans* pTrans = pSM->pActiveTrans; + STaskStateTrans* pTrans = pSM->pActiveTrans; if (pTrans == NULL) { ETaskStatus s = pSM->current.state; - ASSERT(s == TASK_STATUS__DROPPING || s == TASK_STATUS__PAUSE || s == TASK_STATUS__STOP); + ASSERT(s == TASK_STATUS__DROPPING || s == TASK_STATUS__PAUSE || s == TASK_STATUS__STOP || + s == TASK_STATUS__UNINIT || s == TASK_STATUS__READY); + // the pSM->prev.evt may be 0, so print string is not appropriate. stDebug("s-task:%s event:%s handled failed, current status:%s, trigger event:%s", pTask->id.idStr, - StreamTaskEventList[event].name, pSM->current.name, StreamTaskEventList[pSM->prev.evt].name); + GET_EVT_NAME(event), pSM->current.name, GET_EVT_NAME(pSM->prev.evt)); taosThreadMutexUnlock(&pTask->lock); - return TSDB_CODE_INVALID_PARA; + stDebug("s-task:%s unlockx", pTask->id.idStr); + return TSDB_CODE_STREAM_INVALID_STATETRANS; } if (pTrans->event != event) { stWarn("s-task:%s handle event:%s failed, current status:%s, active trans evt:%s", pTask->id.idStr, - StreamTaskEventList[event].name, pSM->current.name, StreamTaskEventList[pTrans->event].name); + GET_EVT_NAME(event), pSM->current.name, GET_EVT_NAME(pTrans->event)); taosThreadMutexUnlock(&pTask->lock); - return TSDB_CODE_INVALID_PARA; + return TSDB_CODE_STREAM_INVALID_STATETRANS; } keepPrevInfo(pSM); @@ -303,14 +337,17 @@ int32_t streamTaskOnHandleEventSuccess(SStreamTaskSM* pSM, EStreamTaskEvent even if (taosArrayGetSize(pSM->pWaitingEventList) > 0) { int64_t el = (taosGetTimestampMs() - pSM->startTs); stDebug("s-task:%s handle event:%s completed, elapsed time:%" PRId64 "ms state:%s -> %s", pTask->id.idStr, - StreamTaskEventList[pTrans->event].name, el, pSM->prev.state.name, pSM->current.name); + GET_EVT_NAME(pTrans->event), el, pSM->prev.state.name, pSM->current.name); - SAttachedEventInfo* pEvtInfo = taosArrayPop(pSM->pWaitingEventList); + SAttachedEventInfo* pEvtInfo = taosArrayGet(pSM->pWaitingEventList, 0); // OK, let's handle the attached event, since the task has reached the required status now if (pSM->current.state == pEvtInfo->status) { - stDebug("s-task:%s handle the attached event:%s, state:%s", pTask->id.idStr, - StreamTaskEventList[pEvtInfo->event].name, pSM->current.name); + stDebug("s-task:%s handle the event:%s in waiting list, state:%s", pTask->id.idStr, + GET_EVT_NAME(pEvtInfo->event), pSM->current.name); + + // remove it + taosArrayPop(pSM->pWaitingEventList); STaskStateTrans* pNextTrans = streamTaskFindTransform(pSM->current.state, pEvtInfo->event); ASSERT(pSM->pActiveTrans == NULL && pNextTrans != NULL); @@ -325,13 +362,18 @@ int32_t streamTaskOnHandleEventSuccess(SStreamTaskSM* pSM, EStreamTaskEvent even } else { return code; } + } else { + taosThreadMutexUnlock(&pTask->lock); + stDebug("s-task:%s state:%s event:%s in waiting list, req state:%s not fulfilled, put it back", pTask->id.idStr, + pSM->current.name, GET_EVT_NAME(pEvtInfo->event), + StreamTaskStatusList[pEvtInfo->status].name); } } else { taosThreadMutexUnlock(&pTask->lock); int64_t el = (taosGetTimestampMs() - pSM->startTs); stDebug("s-task:%s handle event:%s completed, elapsed time:%" PRId64 "ms state:%s -> %s", pTask->id.idStr, - StreamTaskEventList[pTrans->event].name, el, pSM->prev.state.name, pSM->current.name); + GET_EVT_NAME(pTrans->event), el, pSM->prev.state.name, pSM->current.name); } return TSDB_CODE_SUCCESS; @@ -440,6 +482,10 @@ void doInitStateTransferTable(void) { streamTaskKeepCurrentVerInWal, NULL, true); taosArrayPush(streamTaskSMTrans, &trans); + trans = createStateTransform(TASK_STATUS__HALT, TASK_STATUS__HALT, TASK_EVENT_HALT, NULL, + streamTaskKeepCurrentVerInWal, NULL, true); + taosArrayPush(streamTaskSMTrans, &trans); + SAttachedEventInfo info = {.status = TASK_STATUS__READY, .event = TASK_EVENT_HALT}; trans = createStateTransform(TASK_STATUS__STREAM_SCAN_HISTORY, TASK_STATUS__HALT, TASK_EVENT_HALT, NULL, streamTaskKeepCurrentVerInWal, &info, true); diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 1714126a32..58022cb8f1 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -680,6 +680,9 @@ TAOS_DEFINE_ERROR(TSDB_CODE_TMQ_REPLAY_NOT_SUPPORT, "Replay is disabled // stream TAOS_DEFINE_ERROR(TSDB_CODE_STREAM_TASK_NOT_EXIST, "Stream task not exist") TAOS_DEFINE_ERROR(TSDB_CODE_STREAM_EXEC_CANCELLED, "Stream task exec cancelled") +TAOS_DEFINE_ERROR(TSDB_CODE_STREAM_INVALID_STATETRANS, "Invalid task state to handle event") +TAOS_DEFINE_ERROR(TSDB_CODE_STREAM_TASK_IVLD_STATUS,"Invalid task status to proceed") + // TDLite TAOS_DEFINE_ERROR(TSDB_CODE_TDLITE_IVLD_OPEN_FLAGS, "Invalid TDLite open flags") TAOS_DEFINE_ERROR(TSDB_CODE_TDLITE_IVLD_OPEN_DIR, "Invalid TDLite open directory")