From ea6e78cbaaa8d433a79d16024a8bd1937fb03461 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 18 Dec 2023 16:33:44 +0800 Subject: [PATCH 01/38] fix(stream): add async call restart, instead of sync wait. --- include/dnode/vnode/tqCommon.h | 1 + include/libs/stream/tstream.h | 12 ++++--- source/dnode/snode/src/snode.c | 2 +- source/dnode/vnode/src/tq/tq.c | 3 +- source/dnode/vnode/src/tqCommon/tqCommon.c | 42 ++++++++++++++++------ source/libs/stream/src/streamMeta.c | 8 +++-- source/libs/stream/src/streamStart.c | 3 ++ 7 files changed, 52 insertions(+), 19 deletions(-) diff --git a/include/dnode/vnode/tqCommon.h b/include/dnode/vnode/tqCommon.h index 75dafcdbff..32ac38cee6 100644 --- a/include/dnode/vnode/tqCommon.h +++ b/include/dnode/vnode/tqCommon.h @@ -32,5 +32,6 @@ int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLeader); int32_t startStreamTasks(SStreamMeta* pMeta); int32_t resetStreamTaskStatus(SStreamMeta* pMeta); +int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta); #endif // TDENGINE_TQ_COMMON_H diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 6730d211df..812b57b01f 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -460,14 +460,18 @@ struct SStreamTask { char reserve[256]; }; +typedef int32_t (*startComplete_fn_t)(struct SStreamMeta*); + typedef struct STaskStartInfo { int64_t startTs; int64_t readyTs; int32_t tasksWillRestart; - int32_t taskStarting; // restart flag, sentinel to guard the restart procedure. - SHashObj* pReadyTaskSet; // tasks that are all ready for running stream processing - SHashObj* pFailedTaskSet; // tasks that are done the check downstream process, may be successful or failed + int32_t taskStarting; // restart flag, sentinel to guard the restart procedure. + SHashObj* pReadyTaskSet; // tasks that are all ready for running stream processing + SHashObj* pFailedTaskSet; // tasks that are done the check downstream process, may be successful or failed int64_t elapsedTime; + int32_t restartCount; // restart task counter + startComplete_fn_t completeFn; // complete callback function } STaskStartInfo; typedef struct STaskUpdateInfo { @@ -827,7 +831,7 @@ int32_t streamProcessScanHistoryFinishRsp(SStreamTask* pTask); // stream task meta void streamMetaInit(); void streamMetaCleanup(); -SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandFunc, int32_t vgId, int64_t stage); +SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandFunc, int32_t vgId, int64_t stage, startComplete_fn_t fn); void streamMetaClose(SStreamMeta* streamMeta); int32_t streamMetaSaveTask(SStreamMeta* pMeta, SStreamTask* pTask); // save to stream meta store int32_t streamMetaRemoveTask(SStreamMeta* pMeta, STaskId* pKey); diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index 380be1dd38..432f80824a 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -127,7 +127,7 @@ SSnode *sndOpen(const char *path, const SSnodeOpt *pOption) { } pSnode->msgCb = pOption->msgCb; - pSnode->pMeta = streamMetaOpen(path, pSnode, (FTaskExpand *)sndExpandTask, SNODE_HANDLE, taosGetTimestampMs()); + pSnode->pMeta = streamMetaOpen(path, pSnode, (FTaskExpand *)sndExpandTask, SNODE_HANDLE, taosGetTimestampMs(), tqStartTaskCompleteCallback); if (pSnode->pMeta == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; goto FAIL; diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index 138c58b45f..76049d21b1 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -96,7 +96,8 @@ int32_t tqInitialize(STQ* pTq) { return -1; } - pTq->pStreamMeta = streamMetaOpen(pTq->path, pTq, (FTaskExpand*)tqExpandTask, pTq->pVnode->config.vgId, -1); + int32_t vgId = TD_VID(pTq->pVnode); + pTq->pStreamMeta = streamMetaOpen(pTq->path, pTq, (FTaskExpand*)tqExpandTask, vgId, -1, tqStartTaskCompleteCallback); if (pTq->pStreamMeta == NULL) { return -1; } diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 480ed7fd38..004eb240b3 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -699,8 +699,8 @@ int32_t startStreamTasks(SStreamMeta* pMeta) { } int32_t resetStreamTaskStatus(SStreamMeta* pMeta) { - int32_t vgId = pMeta->vgId; - int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); + int32_t vgId = pMeta->vgId; + int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); tqDebug("vgId:%d reset all %d stream task(s) status to be uninit", vgId, numOfTasks); if (numOfTasks == 0) { @@ -723,16 +723,17 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { int32_t code = 0; int64_t st = taosGetTimestampMs(); - while(1) { - int32_t startVal = atomic_val_compare_exchange_32(&pMeta->startInfo.taskStarting, 0, 1); - if (startVal == 0) { - break; - } - - tqDebug("vgId:%d in start stream tasks procedure, wait for 500ms and recheck", vgId); - taosMsleep(500); + streamMetaWLock(pMeta); + if (pMeta->startInfo.taskStarting == 1) { + pMeta->startInfo.restartCount += 1; + tqDebug("vgId:%d in start tasks procedure, inc restartCounter by 1, %d", vgId, pMeta->startInfo.restartCount); + streamMetaWUnLock(pMeta); + return TSDB_CODE_SUCCESS; } + pMeta->startInfo.taskStarting = 1; + streamMetaWUnLock(pMeta); + terrno = 0; tqInfo("vgId:%d tasks are all updated and stopped, restart all tasks, triggered by transId:%d", vgId, pMeta->updateInfo.transId); @@ -791,7 +792,7 @@ int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLead char* p = NULL; if (streamTaskReadyToRun(pTask, &p)) { tqDebug("vgId:%d s-task:%s start to process block from inputQ, next checked ver:%" PRId64, vgId, pTask->id.idStr, - pTask->chkInfo.nextProcessVer); + pTask->chkInfo.nextProcessVer); streamExecTask(pTask); } else { int8_t status = streamTaskSetSchedStatusInactive(pTask); @@ -808,4 +809,23 @@ int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLead } } +int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta) { + STaskStartInfo* pStartInfo = &pMeta->startInfo; + taosWLockLatch(&pMeta->lock); + if (pStartInfo->restartCount > 0) { + pStartInfo->restartCount -= 1; + + ASSERT(pStartInfo->taskStarting == 0); + tqDebug("vgId:%d role:%d need to restart all tasks again, restartCounter:%d", pMeta->vgId, pMeta->role, + pStartInfo->restartCount); + + taosWUnLockLatch(&pMeta->lock); + restartStreamTasks(pMeta, (pMeta->role == NODE_ROLE_LEADER)); + } else { + taosWUnLockLatch(&pMeta->lock); + tqDebug("vgId:%d start all tasks completed", pMeta->vgId); + } + + return TSDB_CODE_SUCCESS; +} \ No newline at end of file diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 23cb6f5a35..8868a5719c 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -277,7 +277,9 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, void* arg, char* key) { stDebug("s-task:0x%x set backend %p", pTask->id.taskId, pBackend); return 0; } -SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandFunc, int32_t vgId, int64_t stage) { + +SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandFunc, int32_t vgId, int64_t stage, + startComplete_fn_t fn) { int32_t code = -1; SStreamMeta* pMeta = taosMemoryCalloc(1, sizeof(SStreamMeta)); if (pMeta == NULL) { @@ -343,6 +345,7 @@ SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandF pMeta->stage = stage; pMeta->role = (vgId == SNODE_HANDLE) ? NODE_ROLE_LEADER : NODE_ROLE_UNINIT; + pMeta->startInfo.completeFn = fn; pMeta->pTaskDbUnique = taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), false, HASH_ENTRY_LOCK); // pMeta->chkpId = streamGetLatestCheckpointId(pMeta); @@ -1258,8 +1261,9 @@ void streamMetaResetStartInfo(STaskStartInfo* pStartInfo) { taosHashClear(pStartInfo->pFailedTaskSet); pStartInfo->tasksWillRestart = 0; pStartInfo->readyTs = 0; + // reset the sentinel flag value to be 0 - atomic_store_32(&pStartInfo->taskStarting, 0); + pStartInfo->taskStarting = 0; } void streamMetaRLock(SStreamMeta* pMeta) { diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 53f87591e8..dd713290f3 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -1083,6 +1083,7 @@ int32_t streamMetaUpdateTaskDownstreamStatus(SStreamMeta* pMeta, int64_t streamI int64_t endTs, bool ready) { STaskStartInfo* pStartInfo = &pMeta->startInfo; STaskId id = {.streamId = streamId, .taskId = taskId}; + bool restart = true; streamMetaWLock(pMeta); SHashObj* pDst = ready ? pStartInfo->pReadyTaskSet : pStartInfo->pFailedTaskSet; @@ -1106,6 +1107,8 @@ int32_t streamMetaUpdateTaskDownstreamStatus(SStreamMeta* pMeta, int64_t streamI displayStatusInfo(pMeta, pStartInfo->pReadyTaskSet, true); displayStatusInfo(pMeta, pStartInfo->pFailedTaskSet, false); streamMetaResetStartInfo(pStartInfo); + + pStartInfo->completeFn(pMeta); } else { stDebug("vgId:%d recv check down results:%d, total:%d", pMeta->vgId, numOfRecv, numOfTotal); } From e734569de07850ef02c7077fd95b702debb44a1b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 18 Dec 2023 18:42:57 +0800 Subject: [PATCH 02/38] fix(stream): fix dead lock. --- source/dnode/vnode/src/tqCommon/tqCommon.c | 6 +++--- source/libs/stream/src/streamMeta.c | 12 ------------ source/libs/stream/src/streamStart.c | 3 ++- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 004eb240b3..a42f81edab 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -811,7 +811,7 @@ int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLead int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta) { STaskStartInfo* pStartInfo = &pMeta->startInfo; - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); if (pStartInfo->restartCount > 0) { pStartInfo->restartCount -= 1; @@ -820,10 +820,10 @@ int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta) { tqDebug("vgId:%d role:%d need to restart all tasks again, restartCounter:%d", pMeta->vgId, pMeta->role, pStartInfo->restartCount); - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); restartStreamTasks(pMeta, (pMeta->role == NODE_ROLE_LEADER)); } else { - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); tqDebug("vgId:%d start all tasks completed", pMeta->vgId); } diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 8868a5719c..e94c1b7596 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -348,18 +348,6 @@ SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandF pMeta->startInfo.completeFn = fn; pMeta->pTaskDbUnique = taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), false, HASH_ENTRY_LOCK); - // pMeta->chkpId = streamGetLatestCheckpointId(pMeta); - // pMeta->streamBackend = streamBackendInit(pMeta->path, pMeta->chkpId); - // while (pMeta->streamBackend == NULL) { - // qError("vgId:%d failed to init stream backend", pMeta->vgId); - // taosMsleep(2 * 1000); - // qInfo("vgId:%d retry to init stream backend", pMeta->vgId); - // pMeta->streamBackend = streamBackendInit(pMeta->path, pMeta->chkpId); - // if (pMeta->streamBackend == NULL) { - // } - // } - // pMeta->streamBackendRid = taosAddRef(streamBackendId, pMeta->streamBackend); - pMeta->numOfPausedTasks = 0; pMeta->numOfStreamTasks = 0; stInfo("vgId:%d open stream meta successfully, latest checkpoint:%" PRId64 ", stage:%" PRId64, vgId, pMeta->chkpId, diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index dd713290f3..8bd7984d29 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -1107,12 +1107,13 @@ int32_t streamMetaUpdateTaskDownstreamStatus(SStreamMeta* pMeta, int64_t streamI displayStatusInfo(pMeta, pStartInfo->pReadyTaskSet, true); displayStatusInfo(pMeta, pStartInfo->pFailedTaskSet, false); streamMetaResetStartInfo(pStartInfo); + streamMetaWUnLock(pMeta); pStartInfo->completeFn(pMeta); } else { + streamMetaWUnLock(pMeta); stDebug("vgId:%d recv check down results:%d, total:%d", pMeta->vgId, numOfRecv, numOfTotal); } - streamMetaWUnLock(pMeta); return TSDB_CODE_SUCCESS; } From fee198f9ada4cdf223810f3c28512f11867e0348 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 19 Dec 2023 16:21:11 +0800 Subject: [PATCH 03/38] refactor: do some internal refactor. --- include/dnode/vnode/tqCommon.h | 2 +- source/dnode/snode/src/snode.c | 2 +- source/dnode/vnode/src/tqCommon/tqCommon.c | 6 +++--- source/dnode/vnode/src/vnd/vnodeSync.c | 2 +- source/libs/stream/src/streamMeta.c | 4 ++++ source/libs/stream/src/streamStart.c | 1 - source/os/test/osTests.cpp | 2 +- 7 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/dnode/vnode/tqCommon.h b/include/dnode/vnode/tqCommon.h index 32ac38cee6..0ad3023cbe 100644 --- a/include/dnode/vnode/tqCommon.h +++ b/include/dnode/vnode/tqCommon.h @@ -31,7 +31,7 @@ int32_t tqStreamTaskProcessDeployReq(SStreamMeta* pMeta, int64_t sversion, char* int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen); int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLeader); int32_t startStreamTasks(SStreamMeta* pMeta); -int32_t resetStreamTaskStatus(SStreamMeta* pMeta); +int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta); int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta); #endif // TDENGINE_TQ_COMMON_H diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index 853dc9d1aa..31d923169a 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -150,7 +150,7 @@ FAIL: } int32_t sndInit(SSnode * pSnode) { - resetStreamTaskStatus(pSnode->pMeta); + tqStreamTaskResetStatus(pSnode->pMeta); startStreamTasks(pSnode->pMeta); return 0; } diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index bfae691dbf..011ce674ec 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -218,7 +218,7 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM if (vnodeIsRoleLeader(pTq->pVnode) && !tsDisableStream) { tqInfo("vgId:%d start all stream tasks after all being updated", vgId); - resetStreamTaskStatus(pTq->pStreamMeta); + tqStreamTaskResetStatus(pTq->pStreamMeta); tqStartStreamTaskAsync(pTq, false); } else { tqInfo("vgId:%d, follower node not start stream tasks", vgId); @@ -703,7 +703,7 @@ int32_t startStreamTasks(SStreamMeta* pMeta) { return code; } -int32_t resetStreamTaskStatus(SStreamMeta* pMeta) { +int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta) { int32_t vgId = pMeta->vgId; int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); @@ -763,7 +763,7 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { } if (isLeader && !tsDisableStream) { - resetStreamTaskStatus(pMeta); + tqStreamTaskResetStatus(pMeta); streamMetaWUnLock(pMeta); tqInfo("vgId:%d restart all stream tasks after all tasks being updated", vgId); diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index 5871a60c9e..48811b2925 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -570,7 +570,7 @@ static void vnodeRestoreFinish(const SSyncFSM *pFsm, const SyncIndex commitIdx) vInfo("vgId:%d, sync restore finished, not launch stream tasks, since stream tasks are disabled", vgId); } else { vInfo("vgId:%d sync restore finished, start to launch stream tasks", pVnode->config.vgId); - resetStreamTaskStatus(pVnode->pTq->pStreamMeta); + tqStreamTaskResetStatus(pVnode->pTq->pStreamMeta); tqStreamTaskStartAsync(pMeta, &pVnode->msgCb, false); } } else { diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 59178336d8..c263968828 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1270,18 +1270,22 @@ void streamMetaRLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-rlock", pMeta->vgId); taosRLockLatch(&pMeta->lock); } + void streamMetaRUnLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-runlock", pMeta->vgId); taosRUnLockLatch(&pMeta->lock); } + void streamMetaWLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-wlock", pMeta->vgId); taosWLockLatch(&pMeta->lock); } + void streamMetaWUnLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-wunlock", pMeta->vgId); taosWUnLockLatch(&pMeta->lock); } + static void execHelper(struct SSchedMsg* pSchedMsg) { __async_exec_fn_t execFn = (__async_exec_fn_t)pSchedMsg->ahandle; int32_t code = execFn(pSchedMsg->thandle); diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 8bd7984d29..0ae85024c0 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -1083,7 +1083,6 @@ int32_t streamMetaUpdateTaskDownstreamStatus(SStreamMeta* pMeta, int64_t streamI int64_t endTs, bool ready) { STaskStartInfo* pStartInfo = &pMeta->startInfo; STaskId id = {.streamId = streamId, .taskId = taskId}; - bool restart = true; streamMetaWLock(pMeta); SHashObj* pDst = ready ? pStartInfo->pReadyTaskSet : pStartInfo->pFailedTaskSet; diff --git a/source/os/test/osTests.cpp b/source/os/test/osTests.cpp index 16660a9477..4d9ad9b5bd 100644 --- a/source/os/test/osTests.cpp +++ b/source/os/test/osTests.cpp @@ -72,7 +72,7 @@ TEST(osTest, osSystem) { const int sysLen = 64; char osSysName[sysLen]; int ret = taosGetOsReleaseName(osSysName, NULL, NULL, sysLen); - printf("os systeme name:%s\n", osSysName); + printf("os system name:%s\n", osSysName); ASSERT_EQ(ret, 0); } From 92045f54856713bff0ef6801f8cc4bc412afce69 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 19 Dec 2023 18:34:30 +0800 Subject: [PATCH 04/38] fix(stream): add snode check, and handle the task status reset from checkpoint when it is in the snode. --- include/dnode/vnode/tqCommon.h | 1 + source/dnode/mgmt/mgmt_snode/src/smHandle.c | 2 +- source/dnode/mnode/impl/src/mndStream.c | 127 ++++++++++++++------ source/dnode/snode/src/snode.c | 2 + source/dnode/vnode/src/tq/tq.c | 21 +--- source/dnode/vnode/src/tqCommon/tqCommon.c | 22 ++++ 6 files changed, 120 insertions(+), 55 deletions(-) diff --git a/include/dnode/vnode/tqCommon.h b/include/dnode/vnode/tqCommon.h index 0ad3023cbe..da9dcf1c04 100644 --- a/include/dnode/vnode/tqCommon.h +++ b/include/dnode/vnode/tqCommon.h @@ -33,5 +33,6 @@ int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLead int32_t startStreamTasks(SStreamMeta* pMeta); int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta); int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta); +int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, SRpcMsg* pMsg); #endif // TDENGINE_TQ_COMMON_H diff --git a/source/dnode/mgmt/mgmt_snode/src/smHandle.c b/source/dnode/mgmt/mgmt_snode/src/smHandle.c index 6de29f8513..444739e461 100644 --- a/source/dnode/mgmt/mgmt_snode/src/smHandle.c +++ b/source/dnode/mgmt/mgmt_snode/src/smHandle.c @@ -90,7 +90,7 @@ SArray *smGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_SCAN_HISTORY_FINISH, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_SCAN_HISTORY_FINISH_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_CHECKPOINT_READY, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; - + if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_TASK_RESET, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER; code = 0; _OVER: diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 62840f7e1f..b8ecf66686 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -685,6 +685,40 @@ _OVER: return -1; } +static int32_t extractNodeEpset(SMnode *pMnode, SEpSet *pEpSet, bool* hasEpset, int32_t taskId, int32_t nodeId) { + *hasEpset = false; + + if (nodeId == SNODE_HANDLE) { + SSnodeObj *pObj = NULL; + void *pIter = NULL; + + pIter = sdbFetch(pMnode->pSdb, SDB_SNODE, pIter, (void **)&pObj); + if (pIter != NULL) { + addEpIntoEpSet(pEpSet, pObj->pDnode->fqdn, pObj->pDnode->port); + sdbRelease(pMnode->pSdb, pObj); + sdbCancelFetch(pMnode->pSdb, pIter); + *hasEpset = true; + return TSDB_CODE_SUCCESS; + } else { + mError("failed to acquire snode epset"); + return TSDB_CODE_INVALID_PARA; + } + } else { + SVgObj *pVgObj = mndAcquireVgroup(pMnode, nodeId); + if (pVgObj != NULL) { + SEpSet epset = mndGetVgroupEpset(pMnode, pVgObj); + mndReleaseVgroup(pMnode, pVgObj); + + epsetAssign(pEpSet, &epset); + *hasEpset = true; + return TSDB_CODE_SUCCESS; + } else { + mDebug("orphaned task:0x%x need to be dropped, nodeId:%d, no redo action", taskId, nodeId); + return TSDB_CODE_SUCCESS; + } + } +} + static int32_t mndPersistTaskDropReq(SMnode *pMnode, STrans *pTrans, SStreamTask *pTask) { SVDropStreamTaskReq *pReq = taosMemoryCalloc(1, sizeof(SVDropStreamTaskReq)); if (pReq == NULL) { @@ -698,28 +732,17 @@ static int32_t mndPersistTaskDropReq(SMnode *pMnode, STrans *pTrans, SStreamTask STransAction action = {0}; SEpSet epset = {0}; - if (pTask->info.nodeId == SNODE_HANDLE) { - SSnodeObj *pObj = NULL; - void *pIter = NULL; - while (1) { - pIter = sdbFetch(pMnode->pSdb, SDB_SNODE, pIter, (void **)&pObj); - if (pIter == NULL) { - break; - } + bool hasEpset = false; - addEpIntoEpSet(&epset, pObj->pDnode->fqdn, pObj->pDnode->port); - sdbRelease(pMnode->pSdb, pObj); - } - } else { - SVgObj *pVgObj = mndAcquireVgroup(pMnode, pTask->info.nodeId); - if (pVgObj != NULL) { - epset = mndGetVgroupEpset(pMnode, pVgObj); - mndReleaseVgroup(pMnode, pVgObj); - } else { - mDebug("orphaned task:0x%x need to be dropped, nodeId:%d, no redo action", pTask->id.taskId, pTask->info.nodeId); - taosMemoryFree(pReq); - return 0; - } + int32_t code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); + if (code != TSDB_CODE_SUCCESS) { + terrno = code; + return -1; + } + + // no valid epset, return directly without redoAction + if (!hasEpset) { + return TSDB_CODE_SUCCESS; } // The epset of nodeId of this task may have been expired now, let's use the newest epset from mnode. @@ -1776,9 +1799,20 @@ static int32_t mndPauseStreamTask(SMnode *pMnode, STrans *pTrans, SStreamTask *p pReq->taskId = pTask->id.taskId; pReq->streamId = pTask->id.streamId; - SVgObj *pVgObj = mndAcquireVgroup(pMnode, pTask->info.nodeId); - SEpSet epset = mndGetVgroupEpset(pMnode, pVgObj); - mndReleaseVgroup(pMnode, pVgObj); + SEpSet epset; + bool hasEpset = false; + int32_t code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); + if (code != TSDB_CODE_SUCCESS) { + terrno = code; + taosMemoryFree(pReq); + return -1; + } + + // no valid epset, return directly without redoAction + if (!hasEpset) { + taosMemoryFree(pReq); + return TSDB_CODE_SUCCESS; + } STransAction action = {0}; initTransAction(&action, pReq, sizeof(SVPauseStreamTaskReq), TDMT_STREAM_TASK_PAUSE, &epset, 0); @@ -1920,17 +1954,25 @@ static int32_t mndProcessPauseStreamReq(SRpcMsg *pReq) { static int32_t mndResumeStreamTask(STrans *pTrans, SMnode *pMnode, SStreamTask *pTask, int8_t igUntreated) { SVResumeStreamTaskReq *pReq = taosMemoryCalloc(1, sizeof(SVResumeStreamTaskReq)); if (pReq == NULL) { + mError("failed to malloc in resume stream, size:%" PRIzu ", code:%s", sizeof(SVResumeStreamTaskReq), + tstrerror(TSDB_CODE_OUT_OF_MEMORY)); terrno = TSDB_CODE_OUT_OF_MEMORY; return -1; } + pReq->head.vgId = htonl(pTask->info.nodeId); pReq->taskId = pTask->id.taskId; pReq->streamId = pTask->id.streamId; pReq->igUntreated = igUntreated; - SVgObj *pVgObj = mndAcquireVgroup(pMnode, pTask->info.nodeId); - SEpSet epset = mndGetVgroupEpset(pMnode, pVgObj); - mndReleaseVgroup(pMnode, pVgObj); + SEpSet epset; + bool hasEpset = false; + int32_t code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); + if (code != TSDB_CODE_SUCCESS) { + terrno = code; + taosMemoryFree(pReq); + return -1; + } STransAction action = {0}; initTransAction(&action, pReq, sizeof(SVResumeStreamTaskReq), TDMT_STREAM_TASK_RESUME, &epset, 0); @@ -2208,6 +2250,8 @@ static SVgroupChangeInfo mndFindChangedNodeInfo(SMnode *pMnode, const SArray *pP epsetAssign(&updateInfo.newEp, &pCurrent->epset); taosArrayPush(info.pUpdateNodeList, &updateInfo); } + + // todo handle the snode info if (pCurrent->nodeId != SNODE_HANDLE) { SVgObj *pVgroup = mndAcquireVgroup(pMnode, pCurrent->nodeId); taosHashPut(info.pDBMap, pVgroup->dbName, strlen(pVgroup->dbName), NULL, 0); @@ -2704,9 +2748,18 @@ int32_t createStreamResetStatusTrans(SMnode *pMnode, SStreamObj *pStream) { pReq->taskId = pTask->id.taskId; pReq->streamId = pTask->id.streamId; - SVgObj *pVgObj = mndAcquireVgroup(pMnode, pTask->info.nodeId); - SEpSet epset = mndGetVgroupEpset(pMnode, pVgObj); - mndReleaseVgroup(pMnode, pVgObj); + SEpSet epset; + bool hasEpset = false; + int32_t code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); + if (code != TSDB_CODE_SUCCESS) { + taosMemoryFree(pReq); + continue; + } + + if (!hasEpset) { + taosMemoryFree(pReq); + continue; + } STransAction action = {0}; initTransAction(&action, pReq, sizeof(SVResetStreamTaskReq), TDMT_VND_STREAM_TASK_RESET, &epset, 0); @@ -2878,11 +2931,17 @@ static int32_t mndDropRelatedFillhistoryTask(SMnode *pMnode, STaskStatusEntry *p mDebug("build and send drop related fill-history task for task:0x%x", pTask->id.taskId); - SVgObj *pVgObj = mndAcquireVgroup(pMnode, pTask->info.nodeId); - SEpSet epset = mndGetVgroupEpset(pMnode, pVgObj); - mndReleaseVgroup(pMnode, pVgObj); + SEpSet epset; + bool hasEpset = false; + int32_t code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); + if (code != TSDB_CODE_SUCCESS) { + return code; + } + + if (hasEpset) { + tmsgSendReq(&epset, &msg); + } - tmsgSendReq(&epset, &msg); return TSDB_CODE_SUCCESS; } diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index 31d923169a..f0fe662025 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -205,6 +205,8 @@ int32_t sndProcessWriteMsg(SSnode *pSnode, SRpcMsg *pMsg, SRpcMsg *pRsp) { return tqStreamTaskProcessDropReq(pSnode->pMeta, pMsg->pCont, pMsg->contLen); case TDMT_VND_STREAM_TASK_UPDATE: return tqStreamTaskProcessUpdateReq(pSnode->pMeta, &pSnode->msgCb, pMsg, true); + case TDMT_VND_STREAM_TASK_RESET: + return tqStreamTaskProcessTaskResetReq(pSnode->pMeta, pMsg); default: ASSERT(0); } diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index 76049d21b1..c33becf5b9 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1336,26 +1336,7 @@ int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) { } int32_t tqProcessTaskResetReq(STQ* pTq, SRpcMsg* pMsg) { - SVPauseStreamTaskReq* pReq = (SVPauseStreamTaskReq*)pMsg->pCont; - - SStreamMeta* pMeta = pTq->pStreamMeta; - SStreamTask* pTask = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId); - if (pTask == NULL) { - tqError("vgId:%d process task-reset req, failed to acquire task:0x%x, it may have been dropped already", - pMeta->vgId, pReq->taskId); - return TSDB_CODE_SUCCESS; - } - - tqDebug("s-task:%s receive task-reset msg from mnode, reset status and ready for data processing", pTask->id.idStr); - - // clear flag set during do checkpoint, and open inputQ for all upstream tasks - if (streamTaskGetStatus(pTask, NULL) == TASK_STATUS__CK) { - streamTaskClearCheckInfo(pTask, true); - streamTaskSetStatusReady(pTask); - } - - streamMetaReleaseTask(pMeta, pTask); - return TSDB_CODE_SUCCESS; + return tqStreamTaskProcessTaskResetReq(pTq->pStreamMeta, pMsg); } int32_t tqProcessTaskDropHTask(STQ* pTq, SRpcMsg* pMsg) { diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 011ce674ec..0d932e6e72 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -832,5 +832,27 @@ int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta) { tqDebug("vgId:%d start all tasks completed", pMeta->vgId); } + return TSDB_CODE_SUCCESS; +} + +int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { + SVPauseStreamTaskReq* pReq = (SVPauseStreamTaskReq*)pMsg->pCont; + + SStreamTask* pTask = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId); + if (pTask == NULL) { + tqError("vgId:%d process task-reset req, failed to acquire task:0x%x, it may have been dropped already", + pMeta->vgId, pReq->taskId); + return TSDB_CODE_SUCCESS; + } + + tqDebug("s-task:%s receive task-reset msg from mnode, reset status and ready for data processing", pTask->id.idStr); + + // clear flag set during do checkpoint, and open inputQ for all upstream tasks + if (streamTaskGetStatus(pTask, NULL) == TASK_STATUS__CK) { + streamTaskClearCheckInfo(pTask, true); + streamTaskSetStatusReady(pTask); + } + + streamMetaReleaseTask(pMeta, pTask); return TSDB_CODE_SUCCESS; } \ No newline at end of file From 2de0f6e9713fb56c04cfa34ba505cfe3e2cf8d3d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 19 Dec 2023 18:56:15 +0800 Subject: [PATCH 05/38] test(stream): lock the update procedure. --- source/dnode/vnode/src/tqCommon/tqCommon.c | 63 +++------------------- 1 file changed, 8 insertions(+), 55 deletions(-) diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 0d932e6e72..c39e653596 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -106,14 +106,15 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM return rsp.code; } - streamMetaWUnLock(pMeta); +// streamMetaWUnLock(pMeta); + // todo for test purpose // the following two functions should not be executed within the scope of meta lock to avoid deadlock streamTaskUpdateEpsetInfo(pTask, req.pNodeList); streamTaskResetStatus(pTask); // continue after lock the meta again - streamMetaWLock(pMeta); +// streamMetaWLock(pMeta); SStreamTask** ppHTask = NULL; if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { @@ -166,65 +167,17 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM streamMetaWUnLock(pMeta); } else { if (!restored) { - tqDebug("vgId:%d vnode restore not completed, not restart the tasks, clear the start after nodeUpdate flag", - vgId); + tqDebug("vgId:%d vnode restore not completed, not start the tasks, clear the start after nodeUpdate flag", vgId); pMeta->startInfo.tasksWillRestart = 0; streamMetaWUnLock(pMeta); } else { tqDebug("vgId:%d all %d task(s) nodeEp updated and closed", vgId, numOfTasks); -#if 1 +#if 0 + // for test purpose, to trigger the leader election + taosMSleep(5000); +#endif tqStreamTaskStartAsync(pMeta, cb, true); streamMetaWUnLock(pMeta); -#else - streamMetaWUnLock(pMeta); - - // For debug purpose. - // the following procedure consume many CPU resource, result in the re-election of leader - // with high probability. So we employ it as a test case for the stream processing framework, with - // checkpoint/restart/nodeUpdate etc. - while (1) { - int32_t startVal = atomic_val_compare_exchange_32(&pMeta->startInfo.taskStarting, 0, 1); - if (startVal == 0) { - break; - } - - tqDebug("vgId:%d in start stream tasks procedure, wait for 500ms and recheck", vgId); - taosMsleep(500); - } - - while (streamMetaTaskInTimer(pMeta)) { - tqDebug("vgId:%d some tasks in timer, wait for 100ms and recheck", pMeta->vgId); - taosMsleep(100); - } - - streamMetaWLock(pMeta); - - int32_t code = streamMetaReopen(pMeta); - if (code != 0) { - tqError("vgId:%d failed to reopen stream meta", vgId); - streamMetaWUnLock(pMeta); - taosArrayDestroy(req.pNodeList); - return -1; - } - - streamMetaInitBackend(pMeta); - - if (streamMetaLoadAllTasks(pTq->pStreamMeta) < 0) { - tqError("vgId:%d failed to load stream tasks", vgId); - streamMetaWUnLock(pMeta); - taosArrayDestroy(req.pNodeList); - return -1; - } - - if (vnodeIsRoleLeader(pTq->pVnode) && !tsDisableStream) { - tqInfo("vgId:%d start all stream tasks after all being updated", vgId); - tqStreamTaskResetStatus(pTq->pStreamMeta); - tqStartStreamTaskAsync(pTq, false); - } else { - tqInfo("vgId:%d, follower node not start stream tasks", vgId); - } - streamMetaWUnLock(pMeta); -#endif } } From 541967e99ebf6fd85ec60064abff3e1bc08e3acf Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 10:24:03 +0800 Subject: [PATCH 06/38] refactor: do some internal refactor. --- source/dnode/vnode/src/tsdb/tsdbRead2.c | 42 +++++++++++++++---------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbRead2.c b/source/dnode/vnode/src/tsdb/tsdbRead2.c index fd179dba2e..773ce9e513 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRead2.c +++ b/source/dnode/vnode/src/tsdb/tsdbRead2.c @@ -1360,7 +1360,8 @@ static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pB static bool nextRowFromSttBlocks(SSttBlockReader* pSttBlockReader, STableBlockScanInfo* pScanInfo, SVersionRange* pVerRange) { - int32_t step = ASCENDING_TRAVERSE(pSttBlockReader->order) ? 1 : -1; + int32_t order = pSttBlockReader->order; + int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1; while (1) { bool hasVal = tMergeTreeNext(&pSttBlockReader->mergeTree); @@ -1377,10 +1378,12 @@ static bool nextRowFromSttBlocks(SSttBlockReader* pSttBlockReader, STableBlockSc pSttBlockReader->currentKey = key; pScanInfo->sttKeyInfo.nextProcKey = key; - if (!hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->sttBlockDelIndex, key, ver, pSttBlockReader->order, - pVerRange)) { - pScanInfo->sttKeyInfo.status = STT_FILE_HAS_DATA; - return true; + if (pScanInfo->delSkyline != NULL && TARRAY_SIZE(pScanInfo->delSkyline) > 0) { + bool dropped = hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->sttBlockDelIndex, key, ver, order, pVerRange); + if (!dropped) { + pScanInfo->sttKeyInfo.status = STT_FILE_HAS_DATA; + return true; + } } } } @@ -2054,9 +2057,12 @@ static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDum return false; } - if (hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, ts, ver, pReader->info.order, - &pReader->info.verRange)) { - return false; + if (pBlockScanInfo->delSkyline != NULL && TARRAY_SIZE(pBlockScanInfo->delSkyline) > 0) { + bool deleted = hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, ts, ver, + pReader->info.order, &pReader->info.verRange); + if (deleted) { + return false; + } } return true; @@ -3219,7 +3225,7 @@ SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_ bool hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order, SVersionRange* pVerRange) { - if (pDelList == NULL || (taosArrayGetSize(pDelList) == 0)) { + if (pDelList == NULL || (TARRAY_SIZE(pDelList) == 0)) { return false; } @@ -3327,16 +3333,18 @@ TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* p TSDBROW* pRow = tsdbTbDataIterGet(pIter->iter); TSDBKEY key = TSDBROW_KEY(pRow); - + int32_t order = pReader->info.order; if (outOfTimeWindow(key.ts, &pReader->info.window)) { pIter->hasVal = false; return NULL; } // it is a valid data version - if ((key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) && - (!hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, pReader->info.order, &pReader->info.verRange))) { - return pRow; + if ((key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer)) { + bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange); + if (!dropped) { + return pRow; + } } while (1) { @@ -3353,9 +3361,11 @@ TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* p return NULL; } - if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer && - (!hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, pReader->info.order, &pReader->info.verRange))) { - return pRow; + if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) { + bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange); + if (!dropped) { + return pRow; + } } } } From 81d8b2434f8b5c8e4a2c36be00115d3e0d25672b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 10:25:31 +0800 Subject: [PATCH 07/38] refactor: do some internal refactor. --- source/dnode/vnode/src/tsdb/tsdbRead2.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbRead2.c b/source/dnode/vnode/src/tsdb/tsdbRead2.c index 773ce9e513..751569bf90 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRead2.c +++ b/source/dnode/vnode/src/tsdb/tsdbRead2.c @@ -3340,7 +3340,8 @@ TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* p } // it is a valid data version - if ((key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer)) { + if ((key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) && + pDelList != NULL && TARRAY_SIZE(pDelList) > 0) { bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange); if (!dropped) { return pRow; @@ -3361,7 +3362,8 @@ TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* p return NULL; } - if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) { + if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer && + pDelList != NULL && TARRAY_SIZE(pDelList) > 0) { bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange); if (!dropped) { return pRow; From 94b260a1f9431d21a4776d1c7e078de13781368e Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 10:29:04 +0800 Subject: [PATCH 08/38] refactor: do some internal refactor. --- source/libs/stream/src/streamStart.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 743661bb59..68380e4ba4 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -209,7 +209,11 @@ void streamTaskCheckDownstream(SStreamTask* pTask) { int32_t numOfVgs = taosArrayGetSize(vgInfo); pTask->notReadyTasks = numOfVgs; - pTask->checkReqIds = taosArrayInit(numOfVgs, sizeof(int64_t)); + if (pTask->checkReqIds == NULL) { + pTask->checkReqIds = taosArrayInit(numOfVgs, sizeof(int64_t)); + } else { + taosArrayClear(pTask->checkReqIds); + } stDebug("s-task:%s check %d downstream tasks, ver:%" PRId64 "-%" PRId64 " window:%" PRId64 "-%" PRId64, pTask->id.idStr, numOfVgs, pRange->range.minVer, pRange->range.maxVer, pWindow->skey, pWindow->ekey); @@ -436,8 +440,7 @@ int32_t streamProcessCheckRsp(SStreamTask* pTask, const SStreamTaskCheckRsp* pRs ASSERT(left >= 0); if (left == 0) { - taosArrayDestroy(pTask->checkReqIds); - pTask->checkReqIds = NULL; + pTask->checkReqIds = taosArrayDestroy(pTask->checkReqIds);; doProcessDownstreamReadyRsp(pTask); } else { From d25a323a4c1e801999e3c197fbbf85811f622829 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 13:55:12 +0800 Subject: [PATCH 09/38] fix(stream): record the launch failure. --- include/libs/stream/tstream.h | 1 + source/dnode/vnode/src/tq/tq.c | 1 + source/dnode/vnode/src/tqCommon/tqCommon.c | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 866bb4dc29..e38a677c8b 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -324,6 +324,7 @@ typedef struct SStreamStatus { int8_t keepTaskStatus; bool appendTranstateBlock; // has append the transfer state data block already, todo: remove it int32_t timerActive; // timer is active + int8_t allowedAddInTimer; // allowed to add into timer int32_t inScanHistorySentinel; } SStreamStatus; diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index c33becf5b9..7e2bd1c5e0 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1071,6 +1071,7 @@ int32_t tqProcessTaskRunReq(STQ* pTq, SRpcMsg* pMsg) { tqScanWal(pTq); return 0; } + int32_t code = tqStreamTaskProcessRunReq(pTq->pStreamMeta, pMsg, vnodeIsRoleLeader(pTq->pVnode)); if(code == 0 && taskId > 0){ tqScanWalAsync(pTq, false); diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index c39e653596..2201f0744e 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -621,6 +621,8 @@ int32_t startStreamTasks(SStreamMeta* pMeta) { SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); if (pTask == NULL) { + streamMetaUpdateTaskDownstreamStatus(pMeta, pTaskId->streamId, pTaskId->taskId, 0, + taosGetTimestampMs(), false); continue; } @@ -637,7 +639,7 @@ int32_t startStreamTasks(SStreamMeta* pMeta) { streamLaunchFillHistoryTask(pTask); } - streamMetaUpdateTaskDownstreamStatus(pTask->pMeta, pTask->id.streamId, pTask->id.taskId, pTask->execInfo.init, + streamMetaUpdateTaskDownstreamStatus(pMeta, pTask->id.streamId, pTask->id.taskId, pTask->execInfo.init, pTask->execInfo.start, true); streamMetaReleaseTask(pMeta, pTask); continue; From 1a8583887f471ca7e87128a7c6c4e17517d8d8a1 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 15:35:43 +0800 Subject: [PATCH 10/38] fix(stream): remove clear backendcache. --- source/dnode/vnode/src/tqCommon/tqCommon.c | 5 +++-- source/libs/stream/src/streamMeta.c | 4 ++-- source/libs/stream/src/streamStart.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 2201f0744e..0864afdf2c 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -686,7 +686,8 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { streamMetaWLock(pMeta); if (pMeta->startInfo.taskStarting == 1) { pMeta->startInfo.restartCount += 1; - tqDebug("vgId:%d in start tasks procedure, inc restartCounter by 1, %d", vgId, pMeta->startInfo.restartCount); + tqDebug("vgId:%d in start tasks procedure, inc restartCounter by 1, remaining restart:%d", vgId, + pMeta->startInfo.restartCount); streamMetaWUnLock(pMeta); return TSDB_CODE_SUCCESS; } @@ -720,7 +721,7 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { if (isLeader && !tsDisableStream) { tqStreamTaskResetStatus(pMeta); streamMetaWUnLock(pMeta); - tqInfo("vgId:%d restart all stream tasks after all tasks being updated", vgId); + tqInfo("vgId:%d start all stream tasks after reload tasks from disk", vgId); startStreamTasks(pMeta); } else { diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 2bbc1607b3..b67b81b611 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -266,11 +266,12 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, void* arg, char* key) { if (pBackend == NULL) { taosThreadMutexUnlock(&pMeta->backendMutex); taosMsleep(1000); - stDebug("backed holded by other task, restart later, path: %s, key: %s", pMeta->path, key); + stDebug("backend held by other task, restart later, path:%s, key:%s", pMeta->path, key); } else { taosThreadMutexUnlock(&pMeta->backendMutex); break; } + taosThreadMutexLock(&pMeta->backendMutex); pBackend = taskDbOpen(pMeta->path, key, chkpId); } @@ -448,7 +449,6 @@ void streamMetaClear(SStreamMeta* pMeta) { taosRemoveRef(streamBackendId, pMeta->streamBackendRid); taosHashClear(pMeta->pTasksMap); - taosHashClear(pMeta->pTaskDbUnique); taosArrayClear(pMeta->pTaskList); taosArrayClear(pMeta->chkpSaved); diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 68380e4ba4..5ce7668048 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -1113,7 +1113,7 @@ int32_t streamMetaUpdateTaskDownstreamStatus(SStreamMeta* pMeta, int64_t streamI pStartInfo->completeFn(pMeta); } else { streamMetaWUnLock(pMeta); - stDebug("vgId:%d recv check down results:%d, total:%d", pMeta->vgId, numOfRecv, numOfTotal); + stDebug("vgId:%d recv check downstream results:%d, total:%d", pMeta->vgId, numOfRecv, numOfTotal); } return TSDB_CODE_SUCCESS; From 43fb7cf14ade07c3b4d6ed3baf4f71e538662ca1 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 16:04:20 +0800 Subject: [PATCH 11/38] other: add log info. --- source/libs/stream/src/streamDispatch.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 5fb7db233f..f0ee03810b 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -372,8 +372,8 @@ static int32_t doBuildDispatchMsg(SStreamTask* pTask, const SStreamDataBlock* pD pTask->msgInfo.pData = pReqs; } - stDebug("s-task:%s build dispatch msg success, msgId:%d, stage:%" PRId64, pTask->id.idStr, pTask->execInfo.dispatch, - pTask->pMeta->stage); + stDebug("s-task:%s build dispatch msg success, msgId:%d, stage:%"PRId64" %p", pTask->id.idStr, pTask->execInfo.dispatch, + pTask->pMeta->stage, pTask->msgInfo.pData); return code; } @@ -1081,6 +1081,7 @@ int32_t streamNotifyUpstreamContinue(SStreamTask* pTask) { // this message has been sent successfully, let's try next one. static int32_t handleDispatchSuccessRsp(SStreamTask* pTask, int32_t downstreamId) { + stDebug("s-task:%s destroy dispatch msg:%p", pTask->id.idStr, pTask->msgInfo.pData); destroyDispatchMsg(pTask->msgInfo.pData, getNumOfDispatchBranch(pTask)); bool delayDispatch = (pTask->msgInfo.dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); From 849aaf8d4fb70b73c03bbd85f9e068d0b2d74aa1 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 18:07:53 +0800 Subject: [PATCH 12/38] fix(stream): avoid dead lock. --- source/libs/stream/src/streamMeta.c | 1 + source/libs/stream/src/streamStart.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index b67b81b611..d6e1286093 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1215,6 +1215,7 @@ bool streamMetaTaskInTimer(SStreamMeta* pMeta) { SStreamTask* pTask = *(SStreamTask**)pIter; if (pTask->status.timerActive >= 1) { + stDebug("s-task:%s in timer, blocking tasks in vgId:%d restart", pTask->id.idStr, pMeta->vgId); inTimer = true; } } diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 5ce7668048..94f635a480 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -48,9 +48,13 @@ static void tryLaunchHistoryTask(void* param, void* tmrId); static void doProcessDownstreamReadyRsp(SStreamTask* pTask); int32_t streamTaskSetReady(SStreamTask* pTask) { - char* p = NULL; - int32_t numOfDowns = streamTaskGetNumOfDownstream(pTask); - ETaskStatus status = streamTaskGetStatus(pTask, &p); + SStreamMeta* pMeta = pTask->pMeta; + SStreamTaskId id = pTask->id; + int64_t initTs = pTask->execInfo.init; + int64_t startTs = pTask->execInfo.start; + char* p = NULL; + int32_t numOfDowns = streamTaskGetNumOfDownstream(pTask); + ETaskStatus status = streamTaskGetStatus(pTask, &p); if ((status == TASK_STATUS__SCAN_HISTORY || status == TASK_STATUS__STREAM_SCAN_HISTORY) && pTask->info.taskLevel != TASK_LEVEL__SOURCE) { @@ -67,8 +71,12 @@ int32_t streamTaskSetReady(SStreamTask* pTask) { stDebug("s-task:%s all %d downstream ready, init completed, elapsed time:%" PRId64 "ms, task status:%s", pTask->id.idStr, numOfDowns, el, p); - streamMetaUpdateTaskDownstreamStatus(pTask->pMeta, pTask->id.streamId, pTask->id.taskId, pTask->execInfo.init, - pTask->execInfo.start, true); + taosThreadMutexUnlock(&pTask->lock); + + // to avoid deadlock + streamMetaUpdateTaskDownstreamStatus(pMeta, id.streamId, id.taskId, initTs, startTs, true); + + taosThreadMutexLock(&pTask->lock); return TSDB_CODE_SUCCESS; } @@ -461,8 +469,7 @@ int32_t streamProcessCheckRsp(SStreamTask* pTask, const SStreamTaskCheckRsp* pRs if (pRsp->status == TASK_UPSTREAM_NEW_STAGE) { stError("s-task:%s vgId:%d self vnode-transfer/leader-change/restart detected, old stage:%" PRId64 ", current stage:%" PRId64 - ", " - "not check wait for downstream task nodeUpdate, and all tasks restart", + ", not check wait for downstream task nodeUpdate, and all tasks restart", id, pRsp->upstreamNodeId, pRsp->oldStage, pTask->pMeta->stage); addIntoNodeUpdateList(pTask, pRsp->upstreamNodeId); } else { From 64eee4bbd2f799f1cb843d0b944fbbeef9863d77 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 19:06:32 +0800 Subject: [PATCH 13/38] fix(stream): add conflict check for nodeUpdate --- source/dnode/mnode/impl/src/mndStream.c | 30 ++++++++++++++++---- source/dnode/mnode/impl/src/mndStreamTrans.c | 4 +-- source/libs/stream/src/streamStart.c | 2 +- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index b8ecf66686..0e593a5109 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2330,12 +2330,28 @@ static SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { } static int32_t mndProcessVgroupChange(SMnode *pMnode, SVgroupChangeInfo *pChangeInfo) { - SSdb *pSdb = pMnode->pSdb; - - // check all streams that involved this vnode should update the epset info + SSdb *pSdb = pMnode->pSdb; SStreamObj *pStream = NULL; - void * pIter = NULL; - STrans * pTrans = NULL; + void *pIter = NULL; + STrans *pTrans = NULL; + + // conflict check for nodeUpdate trans, here we randomly chose one stream to add into the trans pool + while(1) { + pIter = sdbFetch(pSdb, SDB_STREAM, pIter, (void **)&pStream); + if (pIter == NULL) { + break; + } + + bool conflict = streamTransConflictOtherTrans(pMnode, pStream->uid, MND_STREAM_TASK_UPDATE_NAME, true); + sdbRelease(pSdb, pStream); + + if (conflict) { + mWarn("nodeUpdate trans in progress, current nodeUpdate ignored"); + sdbCancelFetch(pSdb, pIter); + return -1; + } + } + while (1) { pIter = sdbFetch(pSdb, SDB_STREAM, pIter, (void **)&pStream); @@ -2351,6 +2367,8 @@ static int32_t mndProcessVgroupChange(SMnode *pMnode, SVgroupChangeInfo *pChange sdbCancelFetch(pSdb, pIter); return terrno; } + + mndStreamRegisterTrans(pTrans, MND_STREAM_TASK_RESET_NAME, pStream->uid); } void *p = taosHashGet(pChangeInfo->pDBMap, pStream->targetDb, strlen(pStream->targetDb)); @@ -2597,7 +2615,7 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg) { code = mndProcessVgroupChange(pMnode, &changeInfo); - // keep the new vnode snapshot + // keep the new vnode snapshot if success if (code == TSDB_CODE_SUCCESS || code == TSDB_CODE_ACTION_IN_PROGRESS) { mDebug("create trans successfully, update cached node list"); taosArrayDestroy(execInfo.pNodeList); diff --git a/source/dnode/mnode/impl/src/mndStreamTrans.c b/source/dnode/mnode/impl/src/mndStreamTrans.c index 0db3cb0a8a..a7d4703c8c 100644 --- a/source/dnode/mnode/impl/src/mndStreamTrans.c +++ b/source/dnode/mnode/impl/src/mndStreamTrans.c @@ -94,8 +94,8 @@ bool streamTransConflictOtherTrans(SMnode* pMnode, int64_t streamUid, const char tInfo.name); return true; } - } else if ((strcmp(tInfo.name, MND_STREAM_CREATE_NAME) == 0) || - (strcmp(tInfo.name, MND_STREAM_DROP_NAME) == 0)) { + } else if ((strcmp(tInfo.name, MND_STREAM_CREATE_NAME) == 0) || (strcmp(tInfo.name, MND_STREAM_DROP_NAME) == 0) || + (strcmp(tInfo.name, MND_STREAM_TASK_RESET_NAME) == 0)) { mWarn("conflict with other transId:%d streamUid:%" PRIx64 ", trans:%s", tInfo.transId, tInfo.streamUid, tInfo.name); return true; diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 94f635a480..d94aa80369 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -73,7 +73,7 @@ int32_t streamTaskSetReady(SStreamTask* pTask) { taosThreadMutexUnlock(&pTask->lock); - // to avoid deadlock + // todo: fix it, to avoid deadlock in: tqStreamTaskProcessUpdateReq streamMetaUpdateTaskDownstreamStatus(pMeta, id.streamId, id.taskId, initTs, startTs, true); taosThreadMutexLock(&pTask->lock); From ad5d78a6ca95d5fdb503bdbed9d0c7a392e700b2 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 19:12:47 +0800 Subject: [PATCH 14/38] fix(stream): fix deadlock. --- source/dnode/mnode/impl/src/mndStream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 0e593a5109..d5bb8b2b92 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2342,7 +2342,7 @@ static int32_t mndProcessVgroupChange(SMnode *pMnode, SVgroupChangeInfo *pChange break; } - bool conflict = streamTransConflictOtherTrans(pMnode, pStream->uid, MND_STREAM_TASK_UPDATE_NAME, true); + bool conflict = streamTransConflictOtherTrans(pMnode, pStream->uid, MND_STREAM_TASK_UPDATE_NAME, false); sdbRelease(pSdb, pStream); if (conflict) { From 32ce4b6a4ce9452f6fd4b14bfb1fc59295850932 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 20 Dec 2023 23:12:59 +0800 Subject: [PATCH 15/38] fix(stream): fix race condition. --- source/libs/stream/src/streamStart.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index d94aa80369..d62990f181 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -48,10 +48,6 @@ static void tryLaunchHistoryTask(void* param, void* tmrId); static void doProcessDownstreamReadyRsp(SStreamTask* pTask); int32_t streamTaskSetReady(SStreamTask* pTask) { - SStreamMeta* pMeta = pTask->pMeta; - SStreamTaskId id = pTask->id; - int64_t initTs = pTask->execInfo.init; - int64_t startTs = pTask->execInfo.start; char* p = NULL; int32_t numOfDowns = streamTaskGetNumOfDownstream(pTask); ETaskStatus status = streamTaskGetStatus(pTask, &p); @@ -70,13 +66,6 @@ int32_t streamTaskSetReady(SStreamTask* pTask) { int64_t el = (pTask->execInfo.start - pTask->execInfo.init); stDebug("s-task:%s all %d downstream ready, init completed, elapsed time:%" PRId64 "ms, task status:%s", pTask->id.idStr, numOfDowns, el, p); - - taosThreadMutexUnlock(&pTask->lock); - - // todo: fix it, to avoid deadlock in: tqStreamTaskProcessUpdateReq - streamMetaUpdateTaskDownstreamStatus(pMeta, id.streamId, id.taskId, initTs, startTs, true); - - taosThreadMutexLock(&pTask->lock); return TSDB_CODE_SUCCESS; } @@ -392,6 +381,10 @@ void doProcessDownstreamReadyRsp(SStreamTask* pTask) { } streamTaskOnHandleEventSuccess(pTask->status.pSM, event); + + int64_t initTs = pTask->execInfo.init; + int64_t startTs = pTask->execInfo.start; + streamMetaUpdateTaskDownstreamStatus(pTask->pMeta, pTask->id.streamId, pTask->id.taskId, initTs, startTs, true); } static void addIntoNodeUpdateList(SStreamTask* pTask, int32_t nodeId) { From 8a8ddf56997ccfc5db8133217b3ebb241a689ee8 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 21 Dec 2023 10:30:44 +0800 Subject: [PATCH 16/38] fix(stream): record the checkpoint failure when nodeEp changed. --- source/libs/stream/inc/streamInt.h | 12 +----------- source/libs/stream/src/streamCheckpoint.c | 5 ++--- source/libs/stream/src/streamStart.c | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/source/libs/stream/inc/streamInt.h b/source/libs/stream/inc/streamInt.h index f709741b57..341706ff3b 100644 --- a/source/libs/stream/inc/streamInt.h +++ b/source/libs/stream/inc/streamInt.h @@ -116,6 +116,7 @@ int32_t streamSendCheckMsg(SStreamTask* pTask, const SStreamTaskCheckReq* pReq, int32_t streamAddCheckpointReadyMsg(SStreamTask* pTask, int32_t srcTaskId, int32_t index, int64_t checkpointId); int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask); int32_t streamTaskSendCheckpointSourceRsp(SStreamTask* pTask); +void streamTaskSetCheckpointFailedId(SStreamTask* pTask); int32_t streamTaskGetNumOfDownstream(const SStreamTask* pTask); int32_t streamTaskInitTokenBucket(STokenBucket* pBucket, int32_t numCap, int32_t numRate, float quotaRate, const char*); STaskId streamTaskExtractKey(const SStreamTask* pTask); @@ -137,17 +138,6 @@ int32_t streamAddEndScanHistoryMsg(SStreamTask* pTask, SRpcHandleInfo* pRpcInfo, int32_t streamNotifyUpstreamContinue(SStreamTask* pTask); int32_t streamTransferStateToStreamTask(SStreamTask* pTask); -// <<<<<<< HEAD -// void streamClearChkptReadyMsg(SStreamTask* pTask); - -// int32_t streamTaskInitTokenBucket(STokenBucket* pBucket, int32_t numCap, int32_t numRate, float quotaRate, const -// char*); STaskId streamTaskExtractKey(const SStreamTask* pTask); void streamTaskInitForLaunchHTask(SHistoryTaskInfo* -// pInfo); void streamTaskSetRetryInfoForLaunch(SHistoryTaskInfo* pInfo); - -// void streamMetaResetStartInfo(STaskStartInfo* pMeta); - -// ======= -// >>>>>>> 3.0 SStreamQueue* streamQueueOpen(int64_t cap); void streamQueueClose(SStreamQueue* pQueue, int32_t taskId); void streamQueueProcessSuccess(SStreamQueue* queue); diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index cf0682f037..4a99d54de1 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -341,9 +341,8 @@ int32_t streamSaveTaskCheckpointInfo(SStreamTask* p, int64_t checkpointId) { return code; } -void streamTaskSetFailedId(SStreamTask* pTask) { +void streamTaskSetCheckpointFailedId(SStreamTask* pTask) { pTask->chkInfo.failedId = pTask->chkInfo.checkpointingId; - pTask->chkInfo.checkpointId = pTask->chkInfo.checkpointingId; } int32_t getChkpMeta(char* id, char* path, SArray* list) { @@ -485,7 +484,7 @@ int32_t streamTaskBuildCheckpoint(SStreamTask* pTask) { code = streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_CHECKPOINT_DONE); taosThreadMutexUnlock(&pTask->lock); - streamTaskSetFailedId(pTask); + streamTaskSetCheckpointFailedId(pTask); stDebug("s-task:%s clear checkpoint flag since gen checkpoint failed, checkpointId:%" PRId64, pTask->id.idStr, ckId); } diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index d62990f181..3bef6adf57 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -313,9 +313,24 @@ int32_t streamTaskCheckStatus(SStreamTask* pTask, int32_t upstreamTaskId, int32_ stError("s-task:%s receive check msg from upstream task:0x%x(vgId:%d), new stage received:%" PRId64 ", prev:%" PRId64, id, upstreamTaskId, vgId, stage, pInfo->stage); + // record the checkpoint failure id and sent to mnode + taosThreadMutexLock(&pTask->lock); + ETaskStatus status = streamTaskGetStatus(pTask, NULL); + if (status == TASK_STATUS__CK) { + streamTaskSetCheckpointFailedId(pTask); + } + taosThreadMutexUnlock(&pTask->lock); } if (pInfo->stage != stage) { + + taosThreadMutexLock(&pTask->lock); + ETaskStatus status = streamTaskGetStatus(pTask, NULL); + if (status == TASK_STATUS__CK) { + streamTaskSetCheckpointFailedId(pTask); + } + taosThreadMutexUnlock(&pTask->lock); + return TASK_UPSTREAM_NEW_STAGE; } else if (pTask->status.downstreamReady != 1) { stDebug("s-task:%s vgId:%d leader:%d, downstream not ready", id, vgId, (pTask->pMeta->role == NODE_ROLE_LEADER)); From 71f6d9f06fd57b95ce938f5796d1d5698779f611 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 21 Dec 2023 11:52:45 +0800 Subject: [PATCH 17/38] fix(stream): handle the case that recvs drop msg twice. --- source/dnode/vnode/src/tq/tq.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index 7e2bd1c5e0..02c9b61432 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1340,6 +1340,7 @@ int32_t tqProcessTaskResetReq(STQ* pTq, SRpcMsg* pMsg) { return tqStreamTaskProcessTaskResetReq(pTq->pStreamMeta, pMsg); } +// NOTE: here we may receive this message more than once, so need to handle this case int32_t tqProcessTaskDropHTask(STQ* pTq, SRpcMsg* pMsg) { SVDropHTaskReq* pReq = (SVDropHTaskReq*)pMsg->pCont; @@ -1358,14 +1359,17 @@ int32_t tqProcessTaskDropHTask(STQ* pTq, SRpcMsg* pMsg) { return TSDB_CODE_SUCCESS; } + taosThreadMutexLock(&pTask->lock); ETaskStatus status = streamTaskGetStatus(pTask, NULL); - ASSERT(status == TASK_STATUS__STREAM_SCAN_HISTORY); - - streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_SCANHIST_DONE); + if (status == TASK_STATUS__STREAM_SCAN_HISTORY) { + streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_SCANHIST_DONE); + } SStreamTaskId id = {.streamId = pTask->hTaskInfo.id.streamId, .taskId = pTask->hTaskInfo.id.taskId}; streamBuildAndSendDropTaskMsg(pTask->pMsgCb, pMeta->vgId, &id); + taosThreadMutexUnlock(&pTask->lock); + // clear the scheduler status streamTaskSetSchedStatusInactive(pTask); tqDebug("s-task:%s set scheduler status:%d after drop fill-history task", pTask->id.idStr, pTask->status.schedStatus); From 2acc8423c5263fe57fc205a9e2f26447926e56e0 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 21 Dec 2023 16:00:50 +0800 Subject: [PATCH 18/38] fix(stream): send msg to mnode before closing all tasks. --- include/libs/stream/tstream.h | 3 + source/dnode/vnode/src/tq/tqStreamTask.c | 13 ++- source/dnode/vnode/src/tq/tqUtil.c | 6 ++ source/libs/stream/src/streamMeta.c | 128 +++++++++++++++-------- 4 files changed, 101 insertions(+), 49 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index e38a677c8b..9b96a41516 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -494,6 +494,7 @@ typedef struct SStreamMeta { int32_t vgId; int64_t stage; int32_t role; + bool sendMsgBeforeClosing; // send hb to mnode before close all tasks when switch to follower. STaskStartInfo startInfo; SRWLatch lock; int32_t walScanCounter; @@ -782,6 +783,8 @@ int32_t streamTaskCheckStatus(SStreamTask* pTask, int32_t upstreamTaskId, int32_ int64_t* oldStage); int32_t streamTaskUpdateEpsetInfo(SStreamTask* pTask, SArray* pNodeList); void streamTaskResetUpstreamStageInfo(SStreamTask* pTask); +SArray* streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta); + bool streamTaskAllUpstreamClosed(SStreamTask* pTask); bool streamTaskSetSchedStatusWait(SStreamTask* pTask); int8_t streamTaskSetSchedStatusActive(SStreamTask* pTask); diff --git a/source/dnode/vnode/src/tq/tqStreamTask.c b/source/dnode/vnode/src/tq/tqStreamTask.c index 1b0a76e81c..99cc3a36ea 100644 --- a/source/dnode/vnode/src/tq/tqStreamTask.c +++ b/source/dnode/vnode/src/tq/tqStreamTask.c @@ -126,17 +126,16 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { int32_t tqStopStreamTasks(STQ* pTq) { SStreamMeta* pMeta = pTq->pStreamMeta; int32_t vgId = TD_VID(pTq->pVnode); - int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); + int32_t num = taosArrayGetSize(pMeta->pTaskList); - tqDebug("vgId:%d stop all %d stream task(s)", vgId, numOfTasks); - if (numOfTasks == 0) { + tqDebug("vgId:%d stop all %d stream task(s)", vgId, num); + if (num == 0) { return TSDB_CODE_SUCCESS; } - SArray* pTaskList = NULL; - streamMetaWLock(pMeta); - pTaskList = taosArrayDup(pMeta->pTaskList, NULL); - streamMetaWUnLock(pMeta); + // send hb msg to mnode before closing all tasks. + SArray* pTaskList = streamMetaSendMsgBeforeCloseTasks(pMeta); + int32_t numOfTasks = taosArrayGetSize(pTaskList); for (int32_t i = 0; i < numOfTasks; ++i) { SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); diff --git a/source/dnode/vnode/src/tq/tqUtil.c b/source/dnode/vnode/src/tq/tqUtil.c index 110cf79b4e..d3ea1f19e5 100644 --- a/source/dnode/vnode/src/tq/tqUtil.c +++ b/source/dnode/vnode/src/tq/tqUtil.c @@ -40,6 +40,12 @@ void tqUpdateNodeStage(STQ* pTq, bool isLeader) { int64_t stage = pMeta->stage; pMeta->stage = state.term; + + // mark the sign to send msg before close all tasks + if ((!isLeader) && (pMeta->role == NODE_ROLE_LEADER)) { + pMeta->sendMsgBeforeClosing = true; + } + pMeta->role = (isLeader)? NODE_ROLE_LEADER:NODE_ROLE_FOLLOWER; if (isLeader) { tqInfo("vgId:%d update meta stage:%" PRId64 ", prev:%" PRId64 " leader:%d, start to send Hb", pMeta->vgId, diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index d6e1286093..355e17db9a 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1051,43 +1051,7 @@ static void addUpdateNodeIntoHbMsg(SStreamTask* pTask, SStreamHbMsg* pMsg) { taosThreadMutexUnlock(&pTask->lock); } -void metaHbToMnode(void* param, void* tmrId) { - int64_t rid = *(int64_t*)param; - - SStreamMeta* pMeta = taosAcquireRef(streamMetaId, rid); - if (pMeta == NULL) { - return; - } - - // need to stop, stop now - if (pMeta->pHbInfo->stopFlag == STREAM_META_WILL_STOP) { - pMeta->pHbInfo->stopFlag = STREAM_META_OK_TO_STOP; - stDebug("vgId:%d jump out of meta timer", pMeta->vgId); - taosReleaseRef(streamMetaId, rid); - return; - } - - // not leader not send msg - if (pMeta->role != NODE_ROLE_LEADER) { - stInfo("vgId:%d role:%d not leader not send hb to mnode", pMeta->vgId, pMeta->role); - taosReleaseRef(streamMetaId, rid); - pMeta->pHbInfo->hbStart = 0; - return; - } - - // set the hb start time - if (pMeta->pHbInfo->hbStart == 0) { - pMeta->pHbInfo->hbStart = taosGetTimestampMs(); - } - - if (!waitForEnoughDuration(pMeta->pHbInfo)) { - taosTmrReset(metaHbToMnode, META_HB_CHECK_INTERVAL, param, streamTimer, &pMeta->pHbInfo->hbTmr); - taosReleaseRef(streamMetaId, rid); - return; - } - - stDebug("vgId:%d build stream task hb, leader:%d", pMeta->vgId, (pMeta->role == NODE_ROLE_LEADER)); - +static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { SStreamHbMsg hbMsg = {0}; SEpSet epset = {0}; bool hasMnodeEpset = false; @@ -1181,23 +1145,62 @@ void metaHbToMnode(void* param, void* tmrId) { } tEncoderClear(&encoder); - SRpcMsg msg = { - .info.noResp = 1, - }; + SRpcMsg msg = {.info.noResp = 1}; initRpcMsg(&msg, TDMT_MND_STREAM_HEARTBEAT, buf, tlen); pMeta->pHbInfo->hbCount += 1; - stDebug("vgId:%d build and send hb to mnode, numOfTasks:%d total:%d", pMeta->vgId, hbMsg.numOfTasks, pMeta->pHbInfo->hbCount); + tmsgSendReq(&epset, &msg); } else { stDebug("vgId:%d no tasks and no mnd epset, not send stream hb to mnode", pMeta->vgId); } -_end: + _end: streamMetaClearHbMsg(&hbMsg); taosArrayDestroy(pIdList); + return TSDB_CODE_SUCCESS; +} + +void metaHbToMnode(void* param, void* tmrId) { + int64_t rid = *(int64_t*)param; + + SStreamMeta* pMeta = taosAcquireRef(streamMetaId, rid); + if (pMeta == NULL) { + return; + } + + // need to stop, stop now + if (pMeta->pHbInfo->stopFlag == STREAM_META_WILL_STOP) { + pMeta->pHbInfo->stopFlag = STREAM_META_OK_TO_STOP; + stDebug("vgId:%d jump out of meta timer", pMeta->vgId); + taosReleaseRef(streamMetaId, rid); + return; + } + + // not leader not send msg + if (pMeta->role != NODE_ROLE_LEADER) { + stInfo("vgId:%d role:%d not leader not send hb to mnode", pMeta->vgId, pMeta->role); + taosReleaseRef(streamMetaId, rid); + pMeta->pHbInfo->hbStart = 0; + return; + } + + // set the hb start time + if (pMeta->pHbInfo->hbStart == 0) { + pMeta->pHbInfo->hbStart = taosGetTimestampMs(); + } + + if (!waitForEnoughDuration(pMeta->pHbInfo)) { + taosTmrReset(metaHbToMnode, META_HB_CHECK_INTERVAL, param, streamTimer, &pMeta->pHbInfo->hbTmr); + taosReleaseRef(streamMetaId, rid); + return; + } + + stDebug("vgId:%d build stream task hb, leader:%d", pMeta->vgId, (pMeta->role == NODE_ROLE_LEADER)); + metaHeartbeatToMnodeImpl(pMeta); + taosTmrReset(metaHbToMnode, META_HB_CHECK_INTERVAL, param, streamTimer, &pMeta->pHbInfo->hbTmr); taosReleaseRef(streamMetaId, rid); } @@ -1320,3 +1323,44 @@ int32_t streamMetaAsyncExec(SStreamMeta* pMeta, __stream_async_exec_fn_t fn, voi schedMsg.msg = code; return taosScheduleTask(pMeta->qHandle, &schedMsg); } + +SArray* streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta) { + SArray* pTaskList = NULL; + bool sendMsg = false; + + streamMetaWLock(pMeta); + pTaskList = taosArrayDup(pMeta->pTaskList, NULL); + sendMsg = pMeta->sendMsgBeforeClosing; + streamMetaWUnLock(pMeta); + + if (!sendMsg) { + stDebug("vgId:%d no need to send msg to mnode before closing tasks", pMeta->vgId); + return pTaskList; + } + + stDebug("vgId:%d send msg to mnode before closing all tasks", pMeta->vgId); + + // send hb msg to mnode before closing all tasks. + int32_t numOfTasks = taosArrayGetSize(pTaskList); + for (int32_t i = 0; i < numOfTasks; ++i) { + SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); + SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); + if (pTask == NULL) { + continue; + } + + taosThreadMutexLock(&pTask->lock); + ETaskStatus s = streamTaskGetStatus(pTask, NULL); + if (s == TASK_STATUS__CK) { + streamTaskSetCheckpointFailedId(pTask); + stDebug("s-task:%s mark the checkpoint:%"PRId64" failed", pTask->id.idStr, pTask->chkInfo.checkpointingId); + } + + taosThreadMutexUnlock(&pTask->lock); + streamMetaReleaseTask(pMeta, pTask); + } + + metaHeartbeatToMnodeImpl(pMeta); + pMeta->sendMsgBeforeClosing = false; + return pTaskList; +} From 84583580e25dd55d787bd524b14ee521b8da7d0b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 21 Dec 2023 19:12:49 +0800 Subject: [PATCH 19/38] fix(stream): transfer the checkpoint trans id. --- include/libs/stream/tstream.h | 4 +- source/dnode/mnode/impl/inc/mndStream.h | 7 ++- source/dnode/mnode/impl/src/mndStream.c | 61 ++++++++++++----------- source/libs/stream/src/streamCheckpoint.c | 7 ++- source/libs/stream/src/streamMeta.c | 8 ++- source/libs/stream/src/streamTask.c | 1 + 6 files changed, 51 insertions(+), 37 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 9b96a41516..9137965849 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -314,6 +314,7 @@ typedef struct SCheckpointInfo { int32_t checkpointNotReadyTasks; bool dispatchCheckpointTrigger; int64_t msgVer; + int32_t transId; } SCheckpointInfo; typedef struct SStreamStatus { @@ -635,6 +636,7 @@ typedef struct { int32_t nodeId; SEpSet mgmtEps; int32_t mnodeId; + int32_t transId; int64_t expireTime; } SStreamCheckpointSourceReq; @@ -677,8 +679,8 @@ typedef struct STaskStatusEntry { int64_t verStart; // start version in WAL, only valid for source task int64_t verEnd; // end version in WAL, only valid for source task int64_t processedVer; // only valid for source task - int32_t relatedHTask; // has related fill-history task int64_t activeCheckpointId; // current active checkpoint id + int32_t chkpointTransId; // checkpoint trans id bool checkpointFailed; // denote if the checkpoint is failed or not bool inputQChanging; // inputQ is changing or not int64_t inputQUnchangeCounter; diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index b0f6273acc..4d06b16297 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -44,12 +44,11 @@ typedef struct SStreamTransMgmt { } SStreamTransMgmt; typedef struct SStreamExecInfo { - SArray * pNodeList; + SArray *pNodeList; int64_t ts; // snapshot ts SStreamTransMgmt transMgmt; - int64_t activeCheckpoint; // active check point id - SHashObj * pTaskMap; - SArray * pTaskList; + SHashObj *pTaskMap; + SArray *pTaskList; TdThreadMutex lock; } SStreamExecInfo; diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index d5bb8b2b92..b04a707728 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -62,7 +62,7 @@ static void mndCancelGetNextStreamTask(SMnode *pMnode, void *pIter); static int32_t mndProcessPauseStreamReq(SRpcMsg *pReq); static int32_t mndProcessResumeStreamReq(SRpcMsg *pReq); static int32_t mndBuildStreamCheckpointSourceReq2(void **pBuf, int32_t *pLen, int32_t nodeId, int64_t checkpointId, - int64_t streamId, int32_t taskId); + int64_t streamId, int32_t taskId, int32_t transId); static int32_t mndProcessNodeCheck(SRpcMsg *pReq); static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg); static SArray *extractNodeListFromStream(SMnode *pMnode); @@ -997,13 +997,14 @@ static int32_t mndProcessStreamRemainChkptTmr(SRpcMsg *pReq) { } static int32_t mndBuildStreamCheckpointSourceReq2(void **pBuf, int32_t *pLen, int32_t nodeId, int64_t checkpointId, - int64_t streamId, int32_t taskId) { + int64_t streamId, int32_t taskId, int32_t transId) { SStreamCheckpointSourceReq req = {0}; req.checkpointId = checkpointId; req.nodeId = nodeId; req.expireTime = -1; req.streamId = streamId; // pTask->id.streamId; req.taskId = taskId; // pTask->id.taskId; + req.transId = transId; int32_t code; int32_t blen; @@ -1093,7 +1094,7 @@ static int32_t mndProcessStreamCheckpointTrans(SMnode *pMnode, SStreamObj *pStre void * buf; int32_t tlen; if (mndBuildStreamCheckpointSourceReq2(&buf, &tlen, pTask->info.nodeId, checkpointId, pTask->id.streamId, - pTask->id.taskId) < 0) { + pTask->id.taskId, pTrans->id) < 0) { mndReleaseVgroup(pMnode, pVgObj); taosWUnLockLatch(&pStream->lock); goto _ERR; @@ -1162,7 +1163,7 @@ static int32_t mndAddStreamCheckpointToTrans(STrans *pTrans, SStreamObj *pStream void * buf; int32_t tlen; if (mndBuildStreamCheckpointSourceReq2(&buf, &tlen, pTask->info.nodeId, chkptId, pTask->id.streamId, - pTask->id.taskId) < 0) { + pTask->id.taskId, pTrans->id) < 0) { mndReleaseVgroup(pMnode, pVgObj); taosWUnLockLatch(&pStream->lock); return -1; @@ -2837,39 +2838,36 @@ int32_t killActiveCheckpointTrans(SMnode *pMnode, const char *pDBName, size_t le return TSDB_CODE_SUCCESS; } -static int32_t mndResetStatusFromCheckpoint(SMnode *pMnode, int32_t transId) { +static int32_t mndResetStatusFromCheckpoint(SMnode *pMnode, int64_t streamId, int32_t transId) { + int32_t code = TSDB_CODE_SUCCESS; + STrans *pTrans = mndAcquireTrans(pMnode, transId); if (pTrans != NULL) { mInfo("kill checkpoint transId:%d to reset task status", transId); mndKillTrans(pMnode, pTrans); mndReleaseTrans(pMnode, pTrans); + } else { + mError("failed to acquire checkpoint trans:%d", transId); } - // set all tasks status to be normal, refactor later to be stream level, instead of vnode level. - SSdb * pSdb = pMnode->pSdb; - SStreamObj *pStream = NULL; - void * pIter = NULL; - while (1) { - pIter = sdbFetch(pSdb, SDB_STREAM, pIter, (void **)&pStream); - if (pIter == NULL) { - break; - } - + SStreamObj *pStream = mndGetStreamObj(pMnode, streamId); + if (pStream == NULL) { + code = TSDB_CODE_STREAM_TASK_NOT_EXIST; + mError("failed to acquire the streamObj:0x%" PRIx64 " to reset checkpoint, may have been dropped", pStream->uid); + } else { bool conflict = streamTransConflictOtherTrans(pMnode, pStream->uid, MND_STREAM_TASK_RESET_NAME, false); if (conflict) { - mError("stream:%s other trans exists in DB:%s & %s failed to start reset-status trans", pStream->name, - pStream->sourceDb, pStream->targetDb); - continue; - } - - mDebug("stream:%s (0x%" PRIx64 ") reset checkpoint procedure, create reset trans", pStream->name, pStream->uid); - int32_t code = createStreamResetStatusTrans(pMnode, pStream); - if (code != TSDB_CODE_SUCCESS) { - sdbCancelFetch(pSdb, pIter); - return code; + mError("stream:%s other trans exists in DB:%s, dstTable:%s failed to start reset-status trans", pStream->name, + pStream->sourceDb, pStream->targetSTbName); + } else { + mDebug("stream:%s (0x%" PRIx64 ") reset checkpoint procedure, transId:%d, create reset trans", pStream->name, + pStream->uid, transId); + code = createStreamResetStatusTrans(pMnode, pStream); + mndReleaseStream(pMnode, pStream); } } - return 0; + + return code; } static SStreamTask *mndGetStreamTask(STaskId *pId, SStreamObj *pStream) { @@ -3007,6 +3005,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { bool checkpointFailed = false; int64_t activeCheckpointId = 0; + int64_t streamId = 0; SDecoder decoder = {0}; tDecoderInit(&decoder, pReq->pCont, pReq->contLen); @@ -3049,7 +3048,9 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { if (pTaskEntry->stage != p->stage && pTaskEntry->stage != -1) { updateStageInfo(pTaskEntry, p->stage); - if (pTaskEntry->nodeId == SNODE_HANDLE) snodeChanged = true; + if (pTaskEntry->nodeId == SNODE_HANDLE) { + snodeChanged = true; + } } else { // task is idle for more than 50 sec. if (fabs(pTaskEntry->inputQUsed - p->inputQUsed) <= DBL_EPSILON) { @@ -3073,6 +3074,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { if (p->checkpointFailed) { checkpointFailed = p->checkpointFailed; + streamId = p->id.streamId; } } } @@ -3111,9 +3113,8 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { if (allReady || snodeChanged) { // if the execInfo.activeCheckpoint == 0, the checkpoint is restoring from wal - mInfo("checkpointId:%" PRId64 " failed, issue task-reset trans to reset all tasks status", - execInfo.activeCheckpoint); - mndResetStatusFromCheckpoint(pMnode, activeCheckpointId); + mInfo("checkpointId:%" PRId64 " failed, issue task-reset trans to reset all tasks status", activeCheckpointId); + mndResetStatusFromCheckpoint(pMnode, streamId, activeCheckpointId); } else { mInfo("not all vgroups are ready, wait for next HB from stream tasks"); } diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 4a99d54de1..2a766f21ec 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -25,6 +25,7 @@ typedef struct { SStreamTask* pTask; } SAsyncUploadArg; + int32_t tEncodeStreamCheckpointSourceReq(SEncoder* pEncoder, const SStreamCheckpointSourceReq* pReq) { if (tStartEncode(pEncoder) < 0) return -1; if (tEncodeI64(pEncoder, pReq->streamId) < 0) return -1; @@ -34,6 +35,7 @@ int32_t tEncodeStreamCheckpointSourceReq(SEncoder* pEncoder, const SStreamCheckp if (tEncodeSEpSet(pEncoder, &pReq->mgmtEps) < 0) return -1; if (tEncodeI32(pEncoder, pReq->mnodeId) < 0) return -1; if (tEncodeI64(pEncoder, pReq->expireTime) < 0) return -1; + if (tEncodeI32(pEncoder, pReq->transId) < 0) return -1; tEndEncode(pEncoder); return pEncoder->pos; } @@ -47,6 +49,7 @@ int32_t tDecodeStreamCheckpointSourceReq(SDecoder* pDecoder, SStreamCheckpointSo if (tDecodeSEpSet(pDecoder, &pReq->mgmtEps) < 0) return -1; if (tDecodeI32(pDecoder, &pReq->mnodeId) < 0) return -1; if (tDecodeI64(pDecoder, &pReq->expireTime) < 0) return -1; + if (tDecodeI32(pDecoder, &pReq->transId) < 0) return -1; tEndDecode(pDecoder); return 0; } @@ -149,6 +152,7 @@ int32_t streamProcessCheckpointSourceReq(SStreamTask* pTask, SStreamCheckpointSo // 1. set task status to be prepared for check point, no data are allowed to put into inputQ. streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_GEN_CHECKPOINT); + pTask->chkInfo.transId = pReq->transId; pTask->chkInfo.checkpointingId = pReq->checkpointId; pTask->chkInfo.checkpointNotReadyTasks = streamTaskGetNumOfDownstream(pTask); pTask->chkInfo.startTs = taosGetTimestampMs(); @@ -273,8 +277,9 @@ void streamTaskClearCheckInfo(SStreamTask* pTask, bool clearChkpReadyMsg) { pTask->chkInfo.failedId = 0; pTask->chkInfo.startTs = 0; // clear the recorded start time pTask->chkInfo.checkpointNotReadyTasks = 0; - // pTask->chkInfo.checkpointAlignCnt = 0; + pTask->chkInfo.transId = 0; pTask->chkInfo.dispatchCheckpointTrigger = false; + streamTaskOpenAllUpstreamInput(pTask); // open inputQ for all upstream tasks if (clearChkpReadyMsg) { streamClearChkptReadyMsg(pTask); diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 355e17db9a..d1df74b8d3 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -940,6 +940,7 @@ int32_t tEncodeStreamHbMsg(SEncoder* pEncoder, const SStreamHbMsg* pReq) { if (tEncodeI64(pEncoder, ps->verEnd) < 0) return -1; if (tEncodeI64(pEncoder, ps->activeCheckpointId) < 0) return -1; if (tEncodeI8(pEncoder, ps->checkpointFailed) < 0) return -1; + if (tEncodeI32(pEncoder, ps->chkpointTransId) < 0) return -1; } int32_t numOfVgs = taosArrayGetSize(pReq->pUpdateNodes); @@ -978,6 +979,7 @@ int32_t tDecodeStreamHbMsg(SDecoder* pDecoder, SStreamHbMsg* pReq) { if (tDecodeI64(pDecoder, &entry.verEnd) < 0) return -1; if (tDecodeI64(pDecoder, &entry.activeCheckpointId) < 0) return -1; if (tDecodeI8(pDecoder, (int8_t*)&entry.checkpointFailed) < 0) return -1; + if (tDecodeI32(pDecoder, &entry.chkpointTransId) < 0) return -1; entry.id.taskId = taskId; taosArrayPush(pReq->pTaskStatus, &entry); @@ -1103,6 +1105,7 @@ static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { if ((*pTask)->chkInfo.checkpointingId != 0) { entry.checkpointFailed = ((*pTask)->chkInfo.failedId >= (*pTask)->chkInfo.checkpointingId); entry.activeCheckpointId = (*pTask)->chkInfo.checkpointingId; + entry.chkpointTransId = (*pTask)->chkInfo.transId; } if ((*pTask)->exec.pWalReader != NULL) { @@ -1350,10 +1353,13 @@ SArray* streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta) { } taosThreadMutexLock(&pTask->lock); - ETaskStatus s = streamTaskGetStatus(pTask, NULL); + char* p = NULL; + ETaskStatus s = streamTaskGetStatus(pTask, &p); if (s == TASK_STATUS__CK) { streamTaskSetCheckpointFailedId(pTask); stDebug("s-task:%s mark the checkpoint:%"PRId64" failed", pTask->id.idStr, pTask->chkInfo.checkpointingId); + } else { + stDebug("s-task:%s status:%s not reset the checkpoint", pTask->id.idStr, p); } taosThreadMutexUnlock(&pTask->lock); diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index 335f9d27d5..4d343a484d 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -796,5 +796,6 @@ void streamTaskStatusCopy(STaskStatusEntry* pDst, const STaskStatusEntry* pSrc) pDst->sinkDataSize = pSrc->sinkDataSize; pDst->activeCheckpointId = pSrc->activeCheckpointId; pDst->checkpointFailed = pSrc->checkpointFailed; + pDst->chkpointTransId = pSrc->chkpointTransId; } From 0fab9a1827da20da21d12856c89329f7868e1f4a Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 22 Dec 2023 00:29:49 +0800 Subject: [PATCH 20/38] other: fix syntax error. --- source/dnode/mnode/impl/src/mndStream.c | 2 ++ source/dnode/mnode/impl/src/mndStreamTrans.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index b04a707728..3ba6280f59 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -3006,6 +3006,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { bool checkpointFailed = false; int64_t activeCheckpointId = 0; int64_t streamId = 0; + int32_t transId = 0; SDecoder decoder = {0}; tDecoderInit(&decoder, pReq->pCont, pReq->contLen); @@ -3075,6 +3076,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { if (p->checkpointFailed) { checkpointFailed = p->checkpointFailed; streamId = p->id.streamId; + transId = p->chkpointTransId; } } } diff --git a/source/dnode/mnode/impl/src/mndStreamTrans.c b/source/dnode/mnode/impl/src/mndStreamTrans.c index a7d4703c8c..b8a1c3d4ae 100644 --- a/source/dnode/mnode/impl/src/mndStreamTrans.c +++ b/source/dnode/mnode/impl/src/mndStreamTrans.c @@ -93,6 +93,8 @@ bool streamTransConflictOtherTrans(SMnode* pMnode, int64_t streamUid, const char mWarn("conflict with other transId:%d streamUid:%" PRIx64 ", trans:%s", tInfo.transId, tInfo.streamUid, tInfo.name); return true; + } else { + mDebug("not conflict with checkpoint trans, name:%s, continue create trans", pTransName); } } else if ((strcmp(tInfo.name, MND_STREAM_CREATE_NAME) == 0) || (strcmp(tInfo.name, MND_STREAM_DROP_NAME) == 0) || (strcmp(tInfo.name, MND_STREAM_TASK_RESET_NAME) == 0)) { @@ -100,6 +102,8 @@ bool streamTransConflictOtherTrans(SMnode* pMnode, int64_t streamUid, const char tInfo.name); return true; } + } else { + mDebug("stream:0x%"PRIx64" no conflict trans existed, continue create trans", streamUid); } if (lock) { From c9475060de3d834be1153e4dc6584b73441c070f Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 22 Dec 2023 09:34:49 +0800 Subject: [PATCH 21/38] fix(stream): fix race condition. --- source/dnode/mnode/impl/src/mndStreamTrans.c | 4 ++-- source/dnode/vnode/src/tq/tqStreamTask.c | 7 ++++++- source/dnode/vnode/src/tq/tqUtil.c | 8 ++++++-- source/libs/stream/src/streamMeta.c | 9 ++------- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamTrans.c b/source/dnode/mnode/impl/src/mndStreamTrans.c index b8a1c3d4ae..8f94843500 100644 --- a/source/dnode/mnode/impl/src/mndStreamTrans.c +++ b/source/dnode/mnode/impl/src/mndStreamTrans.c @@ -90,7 +90,7 @@ bool streamTransConflictOtherTrans(SMnode* pMnode, int64_t streamUid, const char if (strcmp(tInfo.name, MND_STREAM_CHECKPOINT_NAME) == 0) { if (strcmp(pTransName, MND_STREAM_DROP_NAME) != 0) { - mWarn("conflict with other transId:%d streamUid:%" PRIx64 ", trans:%s", tInfo.transId, tInfo.streamUid, + mWarn("conflict with other transId:%d streamUid:0x%" PRIx64 ", trans:%s", tInfo.transId, tInfo.streamUid, tInfo.name); return true; } else { @@ -98,7 +98,7 @@ bool streamTransConflictOtherTrans(SMnode* pMnode, int64_t streamUid, const char } } else if ((strcmp(tInfo.name, MND_STREAM_CREATE_NAME) == 0) || (strcmp(tInfo.name, MND_STREAM_DROP_NAME) == 0) || (strcmp(tInfo.name, MND_STREAM_TASK_RESET_NAME) == 0)) { - mWarn("conflict with other transId:%d streamUid:%" PRIx64 ", trans:%s", tInfo.transId, tInfo.streamUid, + mWarn("conflict with other transId:%d streamUid:0x%" PRIx64 ", trans:%s", tInfo.transId, tInfo.streamUid, tInfo.name); return true; } diff --git a/source/dnode/vnode/src/tq/tqStreamTask.c b/source/dnode/vnode/src/tq/tqStreamTask.c index 99cc3a36ea..4fc1c6cc01 100644 --- a/source/dnode/vnode/src/tq/tqStreamTask.c +++ b/source/dnode/vnode/src/tq/tqStreamTask.c @@ -126,10 +126,13 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { int32_t tqStopStreamTasks(STQ* pTq) { SStreamMeta* pMeta = pTq->pStreamMeta; int32_t vgId = TD_VID(pTq->pVnode); - int32_t num = taosArrayGetSize(pMeta->pTaskList); + streamMetaRLock(pMeta); + + int32_t num = taosArrayGetSize(pMeta->pTaskList); tqDebug("vgId:%d stop all %d stream task(s)", vgId, num); if (num == 0) { + streamMetaRUnLock(pMeta); return TSDB_CODE_SUCCESS; } @@ -149,6 +152,8 @@ int32_t tqStopStreamTasks(STQ* pTq) { } taosArrayDestroy(pTaskList); + + streamMetaRUnLock(pMeta); return 0; } diff --git a/source/dnode/vnode/src/tq/tqUtil.c b/source/dnode/vnode/src/tq/tqUtil.c index d3ea1f19e5..64c733d63d 100644 --- a/source/dnode/vnode/src/tq/tqUtil.c +++ b/source/dnode/vnode/src/tq/tqUtil.c @@ -39,6 +39,8 @@ void tqUpdateNodeStage(STQ* pTq, bool isLeader) { SStreamMeta* pMeta = pTq->pStreamMeta; int64_t stage = pMeta->stage; + streamMetaWLock(pMeta); + pMeta->stage = state.term; // mark the sign to send msg before close all tasks @@ -52,9 +54,11 @@ void tqUpdateNodeStage(STQ* pTq, bool isLeader) { state.term, stage, isLeader); streamMetaStartHb(pMeta); } else { - tqInfo("vgId:%d update meta stage:%" PRId64 " prev:%" PRId64 " leader:%d", pMeta->vgId, state.term, stage, - isLeader); + tqInfo("vgId:%d update meta stage:%" PRId64 " prev:%" PRId64 " leader:%d sendMsg beforeClosing:%d", pMeta->vgId, + state.term, stage, isLeader, pMeta->sendMsgBeforeClosing); } + + streamMetaWUnLock(pMeta); } static int32_t tqInitTaosxRsp(STaosxRsp* pRsp, STqOffsetVal pOffset) { diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index d1df74b8d3..3504c97398 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1328,14 +1328,9 @@ int32_t streamMetaAsyncExec(SStreamMeta* pMeta, __stream_async_exec_fn_t fn, voi } SArray* streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta) { - SArray* pTaskList = NULL; - bool sendMsg = false; - - streamMetaWLock(pMeta); - pTaskList = taosArrayDup(pMeta->pTaskList, NULL); - sendMsg = pMeta->sendMsgBeforeClosing; - streamMetaWUnLock(pMeta); + SArray* pTaskList = taosArrayDup(pMeta->pTaskList, NULL); + bool sendMsg = pMeta->sendMsgBeforeClosing; if (!sendMsg) { stDebug("vgId:%d no need to send msg to mnode before closing tasks", pMeta->vgId); return pTaskList; From 90898a16e0393da66938541560585d8e3679d6ea Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 22 Dec 2023 13:57:15 +0800 Subject: [PATCH 22/38] refactor: do some internal refactor. --- source/dnode/vnode/src/tq/tqStreamTask.c | 4 ++-- source/libs/stream/src/streamMeta.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/source/dnode/vnode/src/tq/tqStreamTask.c b/source/dnode/vnode/src/tq/tqStreamTask.c index 4fc1c6cc01..fde5082212 100644 --- a/source/dnode/vnode/src/tq/tqStreamTask.c +++ b/source/dnode/vnode/src/tq/tqStreamTask.c @@ -141,8 +141,8 @@ int32_t tqStopStreamTasks(STQ* pTq) { int32_t numOfTasks = taosArrayGetSize(pTaskList); for (int32_t i = 0; i < numOfTasks; ++i) { - SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); - SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); + SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); + SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); if (pTask == NULL) { continue; } diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 3504c97398..151605493a 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1210,7 +1210,7 @@ void metaHbToMnode(void* param, void* tmrId) { bool streamMetaTaskInTimer(SStreamMeta* pMeta) { bool inTimer = false; - streamMetaWLock(pMeta); + streamMetaRLock(pMeta); void* pIter = NULL; while (1) { @@ -1226,7 +1226,7 @@ bool streamMetaTaskInTimer(SStreamMeta* pMeta) { } } - streamMetaWUnLock(pMeta); + streamMetaRUnLock(pMeta); return inTimer; } @@ -1303,6 +1303,7 @@ void streamMetaRUnLock(SStreamMeta* pMeta) { void streamMetaWLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-wlock", pMeta->vgId); taosWLockLatch(&pMeta->lock); + stTrace("vgId:%d meta-wlock completed", pMeta->vgId); } void streamMetaWUnLock(SStreamMeta* pMeta) { From e91afa7df2b948cfa3a69edf5a75e9fa3c1ded19 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 22 Dec 2023 14:51:04 +0800 Subject: [PATCH 23/38] fix(stream): fix deadlock. --- include/libs/stream/tstream.h | 1 + source/dnode/vnode/src/tq/tqStreamTask.c | 3 ++- source/libs/stream/src/streamMeta.c | 25 +++++++++++++----------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 9137965849..9772c7ecf3 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -845,6 +845,7 @@ int32_t streamMetaRemoveTask(SStreamMeta* pMeta, STaskId* pKey); int32_t streamMetaRegisterTask(SStreamMeta* pMeta, int64_t ver, SStreamTask* pTask, bool* pAdded); int32_t streamMetaUnregisterTask(SStreamMeta* pMeta, int64_t streamId, int32_t taskId); int32_t streamMetaGetNumOfTasks(SStreamMeta* pMeta); +SStreamTask* streamMetaAcquireTaskNoLock(SStreamMeta* pMeta, int64_t streamId, int32_t taskId); SStreamTask* streamMetaAcquireTask(SStreamMeta* pMeta, int64_t streamId, int32_t taskId); void streamMetaReleaseTask(SStreamMeta* pMeta, SStreamTask* pTask); void streamMetaClear(SStreamMeta* pMeta); diff --git a/source/dnode/vnode/src/tq/tqStreamTask.c b/source/dnode/vnode/src/tq/tqStreamTask.c index fde5082212..63e71f8ee8 100644 --- a/source/dnode/vnode/src/tq/tqStreamTask.c +++ b/source/dnode/vnode/src/tq/tqStreamTask.c @@ -123,6 +123,7 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { return 0; } +// todo: createMsg to invoke this function in stream threads, to avoid blocking the syn thread int32_t tqStopStreamTasks(STQ* pTq) { SStreamMeta* pMeta = pTq->pStreamMeta; int32_t vgId = TD_VID(pTq->pVnode); @@ -142,7 +143,7 @@ int32_t tqStopStreamTasks(STQ* pTq) { for (int32_t i = 0; i < numOfTasks; ++i) { SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); - SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); + SStreamTask* pTask = streamMetaAcquireTaskNoLock(pMeta, pTaskId->streamId, pTaskId->taskId); if (pTask == NULL) { continue; } diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 151605493a..686cf841e1 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -607,22 +607,23 @@ int32_t streamMetaGetNumOfTasks(SStreamMeta* pMeta) { return (int32_t)size; } -SStreamTask* streamMetaAcquireTask(SStreamMeta* pMeta, int64_t streamId, int32_t taskId) { - streamMetaRLock(pMeta); - +SStreamTask* streamMetaAcquireTaskNoLock(SStreamMeta* pMeta, int64_t streamId, int32_t taskId) { STaskId id = {.streamId = streamId, .taskId = taskId}; SStreamTask** ppTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); - if (ppTask != NULL) { - if (!streamTaskShouldStop(*ppTask)) { - int32_t ref = atomic_add_fetch_32(&(*ppTask)->refCnt, 1); - streamMetaRUnLock(pMeta); - stTrace("s-task:%s acquire task, ref:%d", (*ppTask)->id.idStr, ref); - return *ppTask; - } + if (ppTask == NULL || streamTaskShouldStop(*ppTask)) { + return NULL; } + int32_t ref = atomic_add_fetch_32(&(*ppTask)->refCnt, 1); + stTrace("s-task:%s acquire task, ref:%d", (*ppTask)->id.idStr, ref); + return *ppTask; +} + +SStreamTask* streamMetaAcquireTask(SStreamMeta* pMeta, int64_t streamId, int32_t taskId) { + streamMetaRLock(pMeta); + SStreamTask* p = streamMetaAcquireTaskNoLock(pMeta, streamId, taskId); streamMetaRUnLock(pMeta); - return NULL; + return p; } void streamMetaReleaseTask(SStreamMeta* UNUSED_PARAM(pMeta), SStreamTask* pTask) { @@ -1293,6 +1294,7 @@ void streamMetaResetStartInfo(STaskStartInfo* pStartInfo) { void streamMetaRLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-rlock", pMeta->vgId); taosRLockLatch(&pMeta->lock); + ASSERT(pMeta->lock != 0x40000001); } void streamMetaRUnLock(SStreamMeta* pMeta) { @@ -1303,6 +1305,7 @@ void streamMetaRUnLock(SStreamMeta* pMeta) { void streamMetaWLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-wlock", pMeta->vgId); taosWLockLatch(&pMeta->lock); + ASSERT(pMeta->lock != 0x40000001); stTrace("vgId:%d meta-wlock completed", pMeta->vgId); } From dce0a6c74a3218a98f0c3146ebd43e47b3ef92e0 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 22 Dec 2023 15:24:37 +0800 Subject: [PATCH 24/38] fix(stream): remove invalid assert --- source/libs/stream/src/streamMeta.c | 1 - 1 file changed, 1 deletion(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 686cf841e1..b82021abf6 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1294,7 +1294,6 @@ void streamMetaResetStartInfo(STaskStartInfo* pStartInfo) { void streamMetaRLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-rlock", pMeta->vgId); taosRLockLatch(&pMeta->lock); - ASSERT(pMeta->lock != 0x40000001); } void streamMetaRUnLock(SStreamMeta* pMeta) { From 7ce545bcd417b4d63531fc3a54b76e9bffeb2aeb Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 22 Dec 2023 16:14:49 +0800 Subject: [PATCH 25/38] refactor: refactor the lock type. --- include/libs/stream/tstream.h | 2 +- source/dnode/vnode/src/sma/smaRollup.c | 20 ++++++++++---------- source/dnode/vnode/src/tq/tqStreamTaskSnap.c | 12 ++++++------ source/libs/stream/src/streamMeta.c | 9 ++++----- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 9772c7ecf3..e7ac0e99c5 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -497,7 +497,7 @@ typedef struct SStreamMeta { int32_t role; bool sendMsgBeforeClosing; // send hb to mnode before close all tasks when switch to follower. STaskStartInfo startInfo; - SRWLatch lock; + TdThreadRwlock lock; int32_t walScanCounter; void* streamBackend; int64_t streamBackendRid; diff --git a/source/dnode/vnode/src/sma/smaRollup.c b/source/dnode/vnode/src/sma/smaRollup.c index abe4c3f2fc..4963a0aa3b 100644 --- a/source/dnode/vnode/src/sma/smaRollup.c +++ b/source/dnode/vnode/src/sma/smaRollup.c @@ -240,23 +240,23 @@ int32_t tdFetchTbUidList(SSma *pSma, STbUidStore **ppStore, tb_uid_t suid, tb_ui static void tdRSmaTaskInit(SStreamMeta *pMeta, SRSmaInfoItem *pItem, SStreamTaskId *pId) { STaskId id = {.streamId = pId->streamId, .taskId = pId->taskId}; - taosRLockLatch(&pMeta->lock); + streamMetaRLock(pMeta); SStreamTask **ppTask = (SStreamTask **)taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (ppTask && *ppTask) { pItem->submitReqVer = (*ppTask)->chkInfo.checkpointVer; pItem->fetchResultVer = (*ppTask)->info.triggerParam; } - taosRUnLockLatch(&pMeta->lock); + streamMetaRUnLock(pMeta); } static void tdRSmaTaskRemove(SStreamMeta *pMeta, int64_t streamId, int32_t taskId) { streamMetaUnregisterTask(pMeta, streamId, taskId); - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); int32_t numOfTasks = streamMetaGetNumOfTasks(pMeta); if (streamMetaCommit(pMeta) < 0) { // persist to disk } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); smaDebug("vgId:%d, rsma task:%" PRIi64 ",%d dropped, remain tasks:%d", pMeta->vgId, streamId, taskId, numOfTasks); } @@ -1301,14 +1301,14 @@ _checkpoint: checkpointBuilt = true; } - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); if (streamMetaSaveTask(pMeta, pTask)) { - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); code = terrno ? terrno : TSDB_CODE_OUT_OF_MEMORY; taosHashCancelIterate(pInfoHash, infoHash); TSDB_CHECK_CODE(code, lino, _exit); } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); smaDebug("vgId:%d, rsma commit, succeed to commit task:%p, submitReqVer:%" PRIi64 ", fetchResultVer:%" PRIi64 ", table:%" PRIi64 ", level:%d", TD_VID(pVnode), pTask, pItem->submitReqVer, pItem->fetchResultVer, pRSmaInfo->suid, i + 1); @@ -1316,13 +1316,13 @@ _checkpoint: } } if (pMeta) { - taosWLockLatch(&pMeta->lock); + streamMetaWLock(pMeta); if (streamMetaCommit(pMeta)) { - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); code = terrno ? terrno : TSDB_CODE_OUT_OF_MEMORY; TSDB_CHECK_CODE(code, lino, _exit); } - taosWUnLockLatch(&pMeta->lock); + streamMetaWUnLock(pMeta); } if (checkpointBuilt) { smaInfo("vgId:%d, rsma commit, succeed to commit checkpoint:%" PRIi64, TD_VID(pVnode), checkpointId); diff --git a/source/dnode/vnode/src/tq/tqStreamTaskSnap.c b/source/dnode/vnode/src/tq/tqStreamTaskSnap.c index f966e90b9a..dda5173ad9 100644 --- a/source/dnode/vnode/src/tq/tqStreamTaskSnap.c +++ b/source/dnode/vnode/src/tq/tqStreamTaskSnap.c @@ -196,7 +196,7 @@ int32_t streamTaskSnapWriterClose(SStreamTaskWriter* pWriter, int8_t rollback) { int32_t code = 0; STQ* pTq = pWriter->pTq; - taosWLockLatch(&pTq->pStreamMeta->lock); + streamMetaWLock(pTq->pStreamMeta); tqDebug("vgId:%d, vnode stream-task snapshot writer closed", TD_VID(pTq->pVnode)); if (rollback) { tdbAbort(pTq->pStreamMeta->db, pTq->pStreamMeta->txn); @@ -212,14 +212,14 @@ int32_t streamTaskSnapWriterClose(SStreamTaskWriter* pWriter, int8_t rollback) { taosMemoryFree(pWriter); goto _err; } - taosWUnLockLatch(&pTq->pStreamMeta->lock); + streamMetaWUnLock(pTq->pStreamMeta); taosMemoryFree(pWriter); return code; _err: tqError("vgId:%d, vnode stream-task snapshot writer failed to close since %s", TD_VID(pWriter->pTq->pVnode), tstrerror(code)); - taosWUnLockLatch(&pTq->pStreamMeta->lock); + streamMetaWUnLock(pTq->pStreamMeta); return code; } @@ -240,13 +240,13 @@ int32_t streamTaskSnapWrite(SStreamTaskWriter* pWriter, uint8_t* pData, uint32_t tDecoderClear(&decoder); int64_t key[2] = {taskId.streamId, taskId.taskId}; - taosWLockLatch(&pTq->pStreamMeta->lock); + streamMetaWLock(pTq->pStreamMeta); if (tdbTbUpsert(pTq->pStreamMeta->pTaskDb, key, sizeof(int64_t) << 1, (uint8_t*)pData + sizeof(SSnapDataHdr), nData - sizeof(SSnapDataHdr), pTq->pStreamMeta->txn) < 0) { - taosWUnLockLatch(&pTq->pStreamMeta->lock); + streamMetaWUnLock(pTq->pStreamMeta); return -1; } - taosWUnLockLatch(&pTq->pStreamMeta->lock); + streamMetaWUnLock(pTq->pStreamMeta); } else if (pHdr->type == SNAP_DATA_STREAM_TASK_CHECKPOINT) { // do nothing } diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index b82021abf6..665288cf7b 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1293,24 +1293,23 @@ void streamMetaResetStartInfo(STaskStartInfo* pStartInfo) { void streamMetaRLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-rlock", pMeta->vgId); - taosRLockLatch(&pMeta->lock); + taosThreadRwlockRdlock(&pMeta->lock); } void streamMetaRUnLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-runlock", pMeta->vgId); - taosRUnLockLatch(&pMeta->lock); + taosThreadRwlockUnlock(&pMeta->lock); } void streamMetaWLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-wlock", pMeta->vgId); - taosWLockLatch(&pMeta->lock); - ASSERT(pMeta->lock != 0x40000001); + taosThreadRwlockWrlock(&pMeta->lock); stTrace("vgId:%d meta-wlock completed", pMeta->vgId); } void streamMetaWUnLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-wunlock", pMeta->vgId); - taosWUnLockLatch(&pMeta->lock); + taosThreadRwlockUnlock(&pMeta->lock); } static void execHelper(struct SSchedMsg* pSchedMsg) { From b175a4b7b3cc7db8400e00547d421d898e486d0f Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 22 Dec 2023 17:56:16 +0800 Subject: [PATCH 26/38] refactor(stream): async stop tasks, and do some internal refactor. --- include/dnode/vnode/tqCommon.h | 1 - include/libs/stream/tstream.h | 11 +- source/dnode/mnode/impl/src/mndStream.c | 2 +- source/dnode/snode/src/snode.c | 2 +- source/dnode/vnode/src/inc/tq.h | 3 - source/dnode/vnode/src/inc/vnodeInt.h | 1 + source/dnode/vnode/src/tq/tq.c | 1 + source/dnode/vnode/src/tq/tqStreamTask.c | 43 +++----- source/dnode/vnode/src/tq/tqUtil.c | 26 +---- source/dnode/vnode/src/tqCommon/tqCommon.c | 67 +----------- source/dnode/vnode/src/vnd/vnodeSync.c | 2 +- source/libs/stream/src/streamMeta.c | 116 +++++++++++++++++++++ 12 files changed, 150 insertions(+), 125 deletions(-) diff --git a/include/dnode/vnode/tqCommon.h b/include/dnode/vnode/tqCommon.h index da9dcf1c04..da2325f006 100644 --- a/include/dnode/vnode/tqCommon.h +++ b/include/dnode/vnode/tqCommon.h @@ -30,7 +30,6 @@ int32_t tqStreamTaskProcessCheckpointReadyMsg(SStreamMeta* pMeta, SRpcMsg* pMsg) int32_t tqStreamTaskProcessDeployReq(SStreamMeta* pMeta, int64_t sversion, char* msg, int32_t msgLen, bool isLeader, bool restored); int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen); int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLeader); -int32_t startStreamTasks(SStreamMeta* pMeta); int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta); int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta); int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, SRpcMsg* pMsg); diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index e7ac0e99c5..2d2db5c1dc 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -53,6 +53,7 @@ extern "C" { #define STREAM_EXEC_EXTRACT_DATA_IN_WAL_ID (-1) #define STREAM_EXEC_START_ALL_TASKS_ID (-2) #define STREAM_EXEC_RESTART_ALL_TASKS_ID (-3) +#define STREAM_EXEC_STOP_ALL_TASKS_ID (-4) typedef struct SStreamTask SStreamTask; typedef struct SStreamQueue SStreamQueue; @@ -785,8 +786,6 @@ int32_t streamTaskCheckStatus(SStreamTask* pTask, int32_t upstreamTaskId, int32_ int64_t* oldStage); int32_t streamTaskUpdateEpsetInfo(SStreamTask* pTask, SArray* pNodeList); void streamTaskResetUpstreamStageInfo(SStreamTask* pTask); -SArray* streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta); - bool streamTaskAllUpstreamClosed(SStreamTask* pTask); bool streamTaskSetSchedStatusWait(SStreamTask* pTask); int8_t streamTaskSetSchedStatusActive(SStreamTask* pTask); @@ -821,6 +820,7 @@ int32_t streamTaskReleaseState(SStreamTask* pTask); int32_t streamTaskReloadState(SStreamTask* pTask); void streamTaskCloseUpstreamInput(SStreamTask* pTask, int32_t taskId); void streamTaskOpenAllUpstreamInput(SStreamTask* pTask); +int32_t streamTaskSetDb(SStreamMeta* pMeta, void* pTask, char* key); void streamTaskStatusInit(STaskStatusEntry* pEntry, const SStreamTask* pTask); void streamTaskStatusCopy(STaskStatusEntry* pDst, const STaskStatusEntry* pSrc); @@ -851,10 +851,8 @@ void streamMetaReleaseTask(SStreamMeta* pMeta, SStreamTask* pTask); void streamMetaClear(SStreamMeta* pMeta); void streamMetaInitBackend(SStreamMeta* pMeta); int32_t streamMetaCommit(SStreamMeta* pMeta); -int32_t streamMetaLoadAllTasks(SStreamMeta* pMeta); int64_t streamMetaGetLatestCheckpointId(SStreamMeta* pMeta); void streamMetaNotifyClose(SStreamMeta* pMeta); -int32_t streamTaskSetDb(SStreamMeta* pMeta, void* pTask, char* key); void streamMetaStartHb(SStreamMeta* pMeta); bool streamMetaTaskInTimer(SStreamMeta* pMeta); int32_t streamMetaUpdateTaskDownstreamStatus(SStreamMeta* pMeta, int64_t streamId, int32_t taskId, int64_t startTs, @@ -864,6 +862,11 @@ void streamMetaRUnLock(SStreamMeta* pMeta); void streamMetaWLock(SStreamMeta* pMeta); void streamMetaWUnLock(SStreamMeta* pMeta); void streamMetaResetStartInfo(STaskStartInfo* pMeta); +SArray* streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta); +void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader); +int32_t streamMetaLoadAllTasks(SStreamMeta* pMeta); +int32_t streamMetaStartAllTasks(SStreamMeta* pMeta); +int32_t streamMetaStopAllTasks(SStreamMeta* pMeta); // checkpoint int32_t streamProcessCheckpointSourceReq(SStreamTask* pTask, SStreamCheckpointSourceReq* pReq); diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 3ba6280f59..25f51b85df 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -3116,7 +3116,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { if (allReady || snodeChanged) { // if the execInfo.activeCheckpoint == 0, the checkpoint is restoring from wal mInfo("checkpointId:%" PRId64 " failed, issue task-reset trans to reset all tasks status", activeCheckpointId); - mndResetStatusFromCheckpoint(pMnode, streamId, activeCheckpointId); + mndResetStatusFromCheckpoint(pMnode, streamId, transId); } else { mInfo("not all vgroups are ready, wait for next HB from stream tasks"); } diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index f0fe662025..2225a07d3a 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -151,7 +151,7 @@ FAIL: int32_t sndInit(SSnode * pSnode) { tqStreamTaskResetStatus(pSnode->pMeta); - startStreamTasks(pSnode->pMeta); + streamMetaStartAllTasks(pSnode->pMeta); return 0; } diff --git a/source/dnode/vnode/src/inc/tq.h b/source/dnode/vnode/src/inc/tq.h index cf57623a43..0ef29fcb3a 100644 --- a/source/dnode/vnode/src/inc/tq.h +++ b/source/dnode/vnode/src/inc/tq.h @@ -152,9 +152,6 @@ void tqSinkDataIntoDstTable(SStreamTask* pTask, void* vnode, void* data); char* tqOffsetBuildFName(const char* path, int32_t fVer); int32_t tqOffsetRestoreFromFile(STqOffsetStore* pStore, const char* fname); -// tqStream -int32_t tqStopStreamTasks(STQ* pTq); - // tq util int32_t extractDelDataBlock(const void* pData, int32_t len, int64_t ver, void** pRefBlock, int32_t type); int32_t tqExtractDataForMq(STQ* pTq, STqHandle* pHandle, const SMqPollReq* pRequest, SRpcMsg* pMsg); diff --git a/source/dnode/vnode/src/inc/vnodeInt.h b/source/dnode/vnode/src/inc/vnodeInt.h index f3b495675d..84f51827ba 100644 --- a/source/dnode/vnode/src/inc/vnodeInt.h +++ b/source/dnode/vnode/src/inc/vnodeInt.h @@ -232,6 +232,7 @@ int tqPushMsg(STQ*, tmsg_t msgType); int tqRegisterPushHandle(STQ* pTq, void* handle, SRpcMsg* pMsg); int tqUnregisterPushHandle(STQ* pTq, void* pHandle); int tqScanWalAsync(STQ* pTq, bool ckPause); +int32_t tqStopStreamTasksAsync(STQ* pTq); int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp); int32_t tqProcessTaskCheckpointReadyMsg(STQ* pTq, SRpcMsg* pMsg); int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg); diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index 02c9b61432..e8e88e52f7 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1076,6 +1076,7 @@ int32_t tqProcessTaskRunReq(STQ* pTq, SRpcMsg* pMsg) { if(code == 0 && taskId > 0){ tqScanWalAsync(pTq, false); } + return code; } diff --git a/source/dnode/vnode/src/tq/tqStreamTask.c b/source/dnode/vnode/src/tq/tqStreamTask.c index 63e71f8ee8..2bb2609ae9 100644 --- a/source/dnode/vnode/src/tq/tqStreamTask.c +++ b/source/dnode/vnode/src/tq/tqStreamTask.c @@ -109,8 +109,8 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { return -1; } - tqDebug("vgId:%d create msg to start wal scan to launch stream tasks, numOfTasks:%d, restored:%d", vgId, numOfTasks, - alreadyRestored); + tqDebug("vgId:%d create msg to start wal scan to launch stream tasks, numOfTasks:%d, vnd restored:%d", vgId, + numOfTasks, alreadyRestored); pRunReq->head.vgId = vgId; pRunReq->streamId = 0; @@ -123,38 +123,25 @@ int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { return 0; } -// todo: createMsg to invoke this function in stream threads, to avoid blocking the syn thread -int32_t tqStopStreamTasks(STQ* pTq) { +int32_t tqStopStreamTasksAsync(STQ* pTq) { SStreamMeta* pMeta = pTq->pStreamMeta; - int32_t vgId = TD_VID(pTq->pVnode); + int32_t vgId = pMeta->vgId; - streamMetaRLock(pMeta); - - int32_t num = taosArrayGetSize(pMeta->pTaskList); - tqDebug("vgId:%d stop all %d stream task(s)", vgId, num); - if (num == 0) { - streamMetaRUnLock(pMeta); - return TSDB_CODE_SUCCESS; + SStreamTaskRunReq* pRunReq = rpcMallocCont(sizeof(SStreamTaskRunReq)); + if (pRunReq == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + tqError("vgId:%d failed to create msg to stop tasks, code:%s", vgId, terrstr()); + return -1; } - // send hb msg to mnode before closing all tasks. - SArray* pTaskList = streamMetaSendMsgBeforeCloseTasks(pMeta); - int32_t numOfTasks = taosArrayGetSize(pTaskList); + tqDebug("vgId:%d create msg to stop tasks", vgId); - for (int32_t i = 0; i < numOfTasks; ++i) { - SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); - SStreamTask* pTask = streamMetaAcquireTaskNoLock(pMeta, pTaskId->streamId, pTaskId->taskId); - if (pTask == NULL) { - continue; - } + pRunReq->head.vgId = vgId; + pRunReq->streamId = 0; + pRunReq->taskId = STREAM_EXEC_STOP_ALL_TASKS_ID; - streamTaskStop(pTask); - streamMetaReleaseTask(pMeta, pTask); - } - - taosArrayDestroy(pTaskList); - - streamMetaRUnLock(pMeta); + SRpcMsg msg = {.msgType = TDMT_STREAM_TASK_RUN, .pCont = pRunReq, .contLen = sizeof(SStreamTaskRunReq)}; + tmsgPutToQueue(&pTq->pVnode->msgCb, STREAM_QUEUE, &msg); return 0; } diff --git a/source/dnode/vnode/src/tq/tqUtil.c b/source/dnode/vnode/src/tq/tqUtil.c index 64c733d63d..d18455d221 100644 --- a/source/dnode/vnode/src/tq/tqUtil.c +++ b/source/dnode/vnode/src/tq/tqUtil.c @@ -35,30 +35,8 @@ int32_t tqInitDataRsp(SMqDataRsp* pRsp, STqOffsetVal pOffset) { } void tqUpdateNodeStage(STQ* pTq, bool isLeader) { - SSyncState state = syncGetState(pTq->pVnode->sync); - SStreamMeta* pMeta = pTq->pStreamMeta; - int64_t stage = pMeta->stage; - - streamMetaWLock(pMeta); - - pMeta->stage = state.term; - - // mark the sign to send msg before close all tasks - if ((!isLeader) && (pMeta->role == NODE_ROLE_LEADER)) { - pMeta->sendMsgBeforeClosing = true; - } - - pMeta->role = (isLeader)? NODE_ROLE_LEADER:NODE_ROLE_FOLLOWER; - if (isLeader) { - tqInfo("vgId:%d update meta stage:%" PRId64 ", prev:%" PRId64 " leader:%d, start to send Hb", pMeta->vgId, - state.term, stage, isLeader); - streamMetaStartHb(pMeta); - } else { - tqInfo("vgId:%d update meta stage:%" PRId64 " prev:%" PRId64 " leader:%d sendMsg beforeClosing:%d", pMeta->vgId, - state.term, stage, isLeader, pMeta->sendMsgBeforeClosing); - } - - streamMetaWUnLock(pMeta); + SSyncState state = syncGetState(pTq->pVnode->sync); + streamMetaUpdateStageRole(pTq->pStreamMeta, state.term, isLeader); } static int32_t tqInitTaosxRsp(STaosxRsp* pRsp, STqOffsetVal pOffset) { diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 0864afdf2c..af9363e363 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -598,66 +598,6 @@ int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen return 0; } -int32_t startStreamTasks(SStreamMeta* pMeta) { - int32_t code = TSDB_CODE_SUCCESS; - int32_t vgId = pMeta->vgId; - - int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); - tqDebug("vgId:%d start to check all %d stream task(s) downstream status", vgId, numOfTasks); - if (numOfTasks == 0) { - return TSDB_CODE_SUCCESS; - } - - SArray* pTaskList = NULL; - streamMetaWLock(pMeta); - pTaskList = taosArrayDup(pMeta->pTaskList, NULL); - taosHashClear(pMeta->startInfo.pReadyTaskSet); - taosHashClear(pMeta->startInfo.pFailedTaskSet); - pMeta->startInfo.startTs = taosGetTimestampMs(); - streamMetaWUnLock(pMeta); - - // broadcast the check downstream tasks msg - for (int32_t i = 0; i < numOfTasks; ++i) { - SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); - SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); - if (pTask == NULL) { - streamMetaUpdateTaskDownstreamStatus(pMeta, pTaskId->streamId, pTaskId->taskId, 0, - taosGetTimestampMs(), false); - continue; - } - - // fill-history task can only be launched by related stream tasks. - if (pTask->info.fillHistory == 1) { - streamMetaReleaseTask(pMeta, pTask); - continue; - } - - if (pTask->status.downstreamReady == 1) { - if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { - tqDebug("s-task:%s downstream ready, no need to check downstream, check only related fill-history task", - pTask->id.idStr); - streamLaunchFillHistoryTask(pTask); - } - - streamMetaUpdateTaskDownstreamStatus(pMeta, pTask->id.streamId, pTask->id.taskId, pTask->execInfo.init, - pTask->execInfo.start, true); - streamMetaReleaseTask(pMeta, pTask); - continue; - } - - EStreamTaskEvent event = (HAS_RELATED_FILLHISTORY_TASK(pTask)) ? TASK_EVENT_INIT_STREAM_SCANHIST : TASK_EVENT_INIT; - int32_t ret = streamTaskHandleEvent(pTask->status.pSM, event); - if (ret != TSDB_CODE_SUCCESS) { - code = ret; - } - - streamMetaReleaseTask(pMeta, pTask); - } - - taosArrayDestroy(pTaskList); - return code; -} - int32_t tqStreamTaskResetStatus(SStreamMeta* pMeta) { int32_t vgId = pMeta->vgId; int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); @@ -723,7 +663,7 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { streamMetaWUnLock(pMeta); tqInfo("vgId:%d start all stream tasks after reload tasks from disk", vgId); - startStreamTasks(pMeta); + streamMetaStartAllTasks(pMeta); } else { streamMetaResetStartInfo(&pMeta->startInfo); streamMetaWUnLock(pMeta); @@ -741,11 +681,14 @@ int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLead int32_t vgId = pMeta->vgId; if (taskId == STREAM_EXEC_START_ALL_TASKS_ID) { - startStreamTasks(pMeta); + streamMetaStartAllTasks(pMeta); return 0; } else if (taskId == STREAM_EXEC_RESTART_ALL_TASKS_ID) { restartStreamTasks(pMeta, isLeader); return 0; + } else if (taskId == STREAM_EXEC_STOP_ALL_TASKS_ID) { + streamMetaStopAllTasks(pMeta); + return 0; } SStreamTask* pTask = streamMetaAcquireTask(pMeta, pReq->streamId, taskId); diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index 48811b2925..048092131d 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -594,7 +594,7 @@ static void vnodeBecomeFollower(const SSyncFSM *pFsm) { if (pVnode->pTq) { tqUpdateNodeStage(pVnode->pTq, false); - tqStopStreamTasks(pVnode->pTq); + tqStopStreamTasksAsync(pVnode->pTq); } } diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 665288cf7b..0702a908ab 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1367,3 +1367,119 @@ SArray* streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta) { pMeta->sendMsgBeforeClosing = false; return pTaskList; } + +void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader) { + streamMetaWLock(pMeta); + + int64_t prevStage = pMeta->stage; + pMeta->stage = stage; + + // mark the sign to send msg before close all tasks + if ((!isLeader) && (pMeta->role == NODE_ROLE_LEADER)) { + pMeta->sendMsgBeforeClosing = true; + } + + pMeta->role = (isLeader)? NODE_ROLE_LEADER:NODE_ROLE_FOLLOWER; + + if (isLeader) { + stInfo("vgId:%d update meta stage:%" PRId64 ", prev:%" PRId64 " leader:%d, start to send Hb", pMeta->vgId, + prevStage, stage, isLeader); + streamMetaStartHb(pMeta); + } else { + stInfo("vgId:%d update meta stage:%" PRId64 " prev:%" PRId64 " leader:%d sendMsg beforeClosing:%d", pMeta->vgId, + prevStage, stage, isLeader, pMeta->sendMsgBeforeClosing); + } + + streamMetaWUnLock(pMeta); +} + +int32_t streamMetaStopAllTasks(SStreamMeta* pMeta) { + streamMetaRLock(pMeta); + + int32_t num = taosArrayGetSize(pMeta->pTaskList); + stDebug("vgId:%d stop all %d stream task(s)", pMeta->vgId, num); + if (num == 0) { + streamMetaRUnLock(pMeta); + return TSDB_CODE_SUCCESS; + } + + // send hb msg to mnode before closing all tasks. + SArray* pTaskList = streamMetaSendMsgBeforeCloseTasks(pMeta); + int32_t numOfTasks = taosArrayGetSize(pTaskList); + + for (int32_t i = 0; i < numOfTasks; ++i) { + SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); + SStreamTask* pTask = streamMetaAcquireTaskNoLock(pMeta, pTaskId->streamId, pTaskId->taskId); + if (pTask == NULL) { + continue; + } + + streamTaskStop(pTask); + streamMetaReleaseTask(pMeta, pTask); + } + + taosArrayDestroy(pTaskList); + + streamMetaRUnLock(pMeta); + return 0; +} + +int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { + int32_t code = TSDB_CODE_SUCCESS; + int32_t vgId = pMeta->vgId; + + int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); + stDebug("vgId:%d start to check all %d stream task(s) downstream status", vgId, numOfTasks); + if (numOfTasks == 0) { + return TSDB_CODE_SUCCESS; + } + + SArray* pTaskList = NULL; + streamMetaWLock(pMeta); + pTaskList = taosArrayDup(pMeta->pTaskList, NULL); + taosHashClear(pMeta->startInfo.pReadyTaskSet); + taosHashClear(pMeta->startInfo.pFailedTaskSet); + pMeta->startInfo.startTs = taosGetTimestampMs(); + streamMetaWUnLock(pMeta); + + // broadcast the check downstream tasks msg + for (int32_t i = 0; i < numOfTasks; ++i) { + SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); + SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); + if (pTask == NULL) { + streamMetaUpdateTaskDownstreamStatus(pMeta, pTaskId->streamId, pTaskId->taskId, 0, + taosGetTimestampMs(), false); + continue; + } + + // fill-history task can only be launched by related stream tasks. + if (pTask->info.fillHistory == 1) { + streamMetaReleaseTask(pMeta, pTask); + continue; + } + + if (pTask->status.downstreamReady == 1) { + if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { + stDebug("s-task:%s downstream ready, no need to check downstream, check only related fill-history task", + pTask->id.idStr); + streamLaunchFillHistoryTask(pTask); + } + + streamMetaUpdateTaskDownstreamStatus(pMeta, pTask->id.streamId, pTask->id.taskId, pTask->execInfo.init, + pTask->execInfo.start, true); + streamMetaReleaseTask(pMeta, pTask); + continue; + } + + EStreamTaskEvent event = (HAS_RELATED_FILLHISTORY_TASK(pTask)) ? TASK_EVENT_INIT_STREAM_SCANHIST : TASK_EVENT_INIT; + int32_t ret = streamTaskHandleEvent(pTask->status.pSM, event); + if (ret != TSDB_CODE_SUCCESS) { + code = ret; + } + + streamMetaReleaseTask(pMeta, pTask); + } + + taosArrayDestroy(pTaskList); + return code; +} \ No newline at end of file From 80604ec8e29c83313da061d9b2d868d558158aaa Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 22 Dec 2023 23:31:51 +0800 Subject: [PATCH 27/38] fix(stream): init rwlock. --- source/libs/stream/src/streamMeta.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 0702a908ab..292bd8cb38 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -375,6 +375,7 @@ SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandF stage); pMeta->rid = taosAddRef(streamMetaId, pMeta); + taosThreadRwlockInit(&pMeta->lock, NULL); int64_t* pRid = taosMemoryMalloc(sizeof(int64_t)); memcpy(pRid, &pMeta->rid, sizeof(pMeta->rid)); From 66bdace9eb70058bd80476a9185d090edb1ee980 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 23 Dec 2023 00:09:55 +0800 Subject: [PATCH 28/38] fix(stream): check rsp. --- source/libs/stream/src/streamMeta.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 292bd8cb38..eebde87b28 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1299,7 +1299,12 @@ void streamMetaRLock(SStreamMeta* pMeta) { void streamMetaRUnLock(SStreamMeta* pMeta) { stTrace("vgId:%d meta-runlock", pMeta->vgId); - taosThreadRwlockUnlock(&pMeta->lock); + int32_t code = taosThreadRwlockUnlock(&pMeta->lock); + if (code != TSDB_CODE_SUCCESS) { + stError("vgId:%d meta-runlock failed, code:%d", pMeta->vgId, code); + } else { + stDebug("vgId:%d meta-runlock completed", pMeta->vgId); + } } void streamMetaWLock(SStreamMeta* pMeta) { From 5c1ff76082d393fae04374e09e679004a07dc7ad Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 23 Dec 2023 00:58:52 +0800 Subject: [PATCH 29/38] fix(stream): set rwlock attr. --- source/libs/stream/src/streamMeta.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index eebde87b28..25f0fbf991 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -375,7 +375,13 @@ SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandF stage); pMeta->rid = taosAddRef(streamMetaId, pMeta); - taosThreadRwlockInit(&pMeta->lock, NULL); + + TdThreadRwlockAttr attr; + taosThreadRwlockAttrInit(&attr); + + pthread_rwlockattr_setkind_np(&attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); + taosThreadRwlockInit(&pMeta->lock, &attr); + taosThreadRwlockAttrDestroy(&attr); int64_t* pRid = taosMemoryMalloc(sizeof(int64_t)); memcpy(pRid, &pMeta->rid, sizeof(pMeta->rid)); @@ -1386,6 +1392,7 @@ void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader) } pMeta->role = (isLeader)? NODE_ROLE_LEADER:NODE_ROLE_FOLLOWER; + streamMetaWUnLock(pMeta); if (isLeader) { stInfo("vgId:%d update meta stage:%" PRId64 ", prev:%" PRId64 " leader:%d, start to send Hb", pMeta->vgId, @@ -1395,8 +1402,6 @@ void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader) stInfo("vgId:%d update meta stage:%" PRId64 " prev:%" PRId64 " leader:%d sendMsg beforeClosing:%d", pMeta->vgId, prevStage, stage, isLeader, pMeta->sendMsgBeforeClosing); } - - streamMetaWUnLock(pMeta); } int32_t streamMetaStopAllTasks(SStreamMeta* pMeta) { From 8a714db17df323425fb4ca85b5edefda625a3cc4 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 23 Dec 2023 15:33:50 +0800 Subject: [PATCH 30/38] fix: fix deadlock. --- source/libs/stream/src/streamMeta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 25f0fbf991..7217a49ff8 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1356,7 +1356,7 @@ SArray* streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta) { int32_t numOfTasks = taosArrayGetSize(pTaskList); for (int32_t i = 0; i < numOfTasks; ++i) { SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); - SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); + SStreamTask* pTask = streamMetaAcquireTaskNoLock(pMeta, pTaskId->streamId, pTaskId->taskId); if (pTask == NULL) { continue; } From c61203b55a98707c12675358c2e6e56b175395e3 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 23 Dec 2023 16:37:21 +0800 Subject: [PATCH 31/38] fix: fix deadlock. --- source/libs/stream/src/streamMeta.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 7217a49ff8..46e3dded11 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1065,28 +1065,16 @@ static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { SStreamHbMsg hbMsg = {0}; SEpSet epset = {0}; bool hasMnodeEpset = false; - int64_t stage = 0; + int32_t numOfTasks = streamMetaGetNumOfTasks(pMeta); - streamMetaRLock(pMeta); - - int32_t numOfTasks = streamMetaGetNumOfTasks(pMeta); hbMsg.vgId = pMeta->vgId; - stage = pMeta->stage; - - SArray* pIdList = taosArrayDup(pMeta->pTaskList, NULL); - - streamMetaRUnLock(pMeta); - hbMsg.pTaskStatus = taosArrayInit(numOfTasks, sizeof(STaskStatusEntry)); hbMsg.pUpdateNodes = taosArrayInit(numOfTasks, sizeof(int32_t)); for (int32_t i = 0; i < numOfTasks; ++i) { - STaskId* pId = taosArrayGet(pIdList, i); + STaskId* pId = taosArrayGet(pMeta->pTaskList, i); - streamMetaRLock(pMeta); SStreamTask** pTask = taosHashGet(pMeta->pTasksMap, pId, sizeof(*pId)); - streamMetaRUnLock(pMeta); - if (pTask == NULL) { continue; } @@ -1100,7 +1088,7 @@ static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { .id = *pId, .status = streamTaskGetStatus(*pTask, NULL), .nodeId = hbMsg.vgId, - .stage = stage, + .stage = pMeta->stage, .inputQUsed = SIZE_IN_MiB(streamQueueGetItemSize((*pTask)->inputq.queue)), }; @@ -1170,7 +1158,6 @@ static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { _end: streamMetaClearHbMsg(&hbMsg); - taosArrayDestroy(pIdList); return TSDB_CODE_SUCCESS; } @@ -1210,7 +1197,9 @@ void metaHbToMnode(void* param, void* tmrId) { } stDebug("vgId:%d build stream task hb, leader:%d", pMeta->vgId, (pMeta->role == NODE_ROLE_LEADER)); + streamMetaRLock(pMeta); metaHeartbeatToMnodeImpl(pMeta); + streamMetaRUnLock(pMeta); taosTmrReset(metaHbToMnode, META_HB_CHECK_INTERVAL, param, streamTimer, &pMeta->pHbInfo->hbTmr); taosReleaseRef(streamMetaId, rid); From 25f798e42f24c90917192ce30c705053d156d43b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 23 Dec 2023 23:29:30 +0800 Subject: [PATCH 32/38] fix(stream): record the start failure tasks --- source/dnode/vnode/src/tqCommon/tqCommon.c | 2 -- source/libs/stream/src/streamMeta.c | 20 ++++++++++++++------ source/libs/stream/src/streamStart.c | 7 ++++--- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index af9363e363..f576345f64 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -661,8 +661,6 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { if (isLeader && !tsDisableStream) { tqStreamTaskResetStatus(pMeta); streamMetaWUnLock(pMeta); - tqInfo("vgId:%d start all stream tasks after reload tasks from disk", vgId); - streamMetaStartAllTasks(pMeta); } else { streamMetaResetStartInfo(&pMeta->startInfo); diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 46e3dded11..a46853fc26 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1429,8 +1429,9 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { int32_t vgId = pMeta->vgId; int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); - stDebug("vgId:%d start to check all %d stream task(s) downstream status", vgId, numOfTasks); + stInfo("vgId:%d start to check all %d stream task(s) downstream status", vgId, numOfTasks); if (numOfTasks == 0) { + stInfo("vgId:%d start tasks completed", pMeta->vgId); return TSDB_CODE_SUCCESS; } @@ -1443,16 +1444,20 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { streamMetaWUnLock(pMeta); // broadcast the check downstream tasks msg + int64_t now = taosGetTimestampMs(); + for (int32_t i = 0; i < numOfTasks; ++i) { SStreamTaskId* pTaskId = taosArrayGet(pTaskList, i); - SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); + + SStreamTask* pTask = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId); if (pTask == NULL) { - streamMetaUpdateTaskDownstreamStatus(pMeta, pTaskId->streamId, pTaskId->taskId, 0, - taosGetTimestampMs(), false); + stError("vgId:%d failed to acquire task:0x%x during start tasks", pMeta->vgId, pTaskId->taskId); + streamMetaUpdateTaskDownstreamStatus(pMeta, pTaskId->streamId, pTaskId->taskId, 0, now, false); continue; } // fill-history task can only be launched by related stream tasks. + STaskExecStatisInfo* pInfo = &pTask->execInfo; if (pTask->info.fillHistory == 1) { streamMetaReleaseTask(pMeta, pTask); continue; @@ -1465,8 +1470,7 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { streamLaunchFillHistoryTask(pTask); } - streamMetaUpdateTaskDownstreamStatus(pMeta, pTask->id.streamId, pTask->id.taskId, pTask->execInfo.init, - pTask->execInfo.start, true); + streamMetaUpdateTaskDownstreamStatus(pMeta, pTaskId->streamId, pTaskId->taskId, pInfo->init, pInfo->start, true); streamMetaReleaseTask(pMeta, pTask); continue; } @@ -1474,12 +1478,16 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { EStreamTaskEvent event = (HAS_RELATED_FILLHISTORY_TASK(pTask)) ? TASK_EVENT_INIT_STREAM_SCANHIST : TASK_EVENT_INIT; int32_t ret = streamTaskHandleEvent(pTask->status.pSM, event); if (ret != TSDB_CODE_SUCCESS) { + stError("vgId:%d failed to handle event:%d", pMeta->vgId, event); code = ret; + + streamMetaUpdateTaskDownstreamStatus(pMeta, pTaskId->streamId, pTaskId->taskId, pInfo->init, pInfo->start, false); } streamMetaReleaseTask(pMeta, pTask); } + stInfo("vgId:%d start tasks completed", pMeta->vgId); taosArrayDestroy(pTaskList); return code; } \ No newline at end of file diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 3bef6adf57..267dadf109 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -1114,9 +1114,9 @@ int32_t streamMetaUpdateTaskDownstreamStatus(SStreamMeta* pMeta, int64_t streamI pStartInfo->readyTs = taosGetTimestampMs(); pStartInfo->elapsedTime = (pStartInfo->startTs != 0) ? pStartInfo->readyTs - pStartInfo->startTs : 0; - stDebug("vgId:%d all %d task(s) check downstream completed, last completed task:0x%x startTs:%" PRId64 + stDebug("vgId:%d all %d task(s) check downstream completed, last completed task:0x%x (succ:%d) startTs:%" PRId64 ", readyTs:%" PRId64 " total elapsed time:%.2fs", - pMeta->vgId, numOfTotal, taskId, pStartInfo->startTs, pStartInfo->readyTs, + pMeta->vgId, numOfTotal, taskId, ready, pStartInfo->startTs, pStartInfo->readyTs, pStartInfo->elapsedTime / 1000.0); // print the initialization elapsed time and info @@ -1128,7 +1128,8 @@ int32_t streamMetaUpdateTaskDownstreamStatus(SStreamMeta* pMeta, int64_t streamI pStartInfo->completeFn(pMeta); } else { streamMetaWUnLock(pMeta); - stDebug("vgId:%d recv check downstream results:%d, total:%d", pMeta->vgId, numOfRecv, numOfTotal); + stDebug("vgId:%d recv check downstream results, s-task:0x%x succ:%d, received:%d, total:%d", pMeta->vgId, taskId, + ready, numOfRecv, numOfTotal); } return TSDB_CODE_SUCCESS; From 30498a40285565dfdfa7f4afcfac29906162619c Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sun, 24 Dec 2023 02:24:46 +0800 Subject: [PATCH 33/38] refactor: do some internal refactor. --- source/dnode/vnode/src/tq/tq.c | 5 +++-- source/libs/stream/src/streamMeta.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index e8e88e52f7..c8cc4f9d54 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1259,10 +1259,11 @@ int32_t tqProcessTaskCheckPointSourceReq(STQ* pTq, SRpcMsg* pMsg, SRpcMsg* pRsp) if (pTask->status.downstreamReady != 1) { pTask->chkInfo.failedId = req.checkpointId; // record the latest failed checkpoint id pTask->chkInfo.checkpointingId = req.checkpointId; + pTask->chkInfo.transId = req.transId; tqError("s-task:%s not ready for checkpoint, since downstream not ready, ignore this checkpoint:%" PRId64 - ", set it failure", - pTask->id.idStr, req.checkpointId); + ", transId:%d set it failed", + pTask->id.idStr, req.checkpointId, req.transId); streamMetaReleaseTask(pMeta, pTask); SRpcMsg rsp = {0}; diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index a46853fc26..9e20a2997c 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1102,6 +1102,7 @@ static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { entry.checkpointFailed = ((*pTask)->chkInfo.failedId >= (*pTask)->chkInfo.checkpointingId); entry.activeCheckpointId = (*pTask)->chkInfo.checkpointingId; entry.chkpointTransId = (*pTask)->chkInfo.transId; + stInfo("s-task:%s send kill checkpoint trans info, transId:%d", (*pTask)->id.idStr, (*pTask)->chkInfo.transId); } if ((*pTask)->exec.pWalReader != NULL) { From bf895fadcb144cb31951cb54f3e0874290f8c8d2 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sun, 24 Dec 2023 03:05:29 +0800 Subject: [PATCH 34/38] fix(stream): fix invalid read. --- source/libs/stream/src/streamDispatch.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index f0ee03810b..bfa269b0d5 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -352,6 +352,7 @@ static int32_t doBuildDispatchMsg(SStreamTask* pTask, const SStreamDataBlock* pD return code; } + // it's a new vnode to receive dispatch msg, so add one if (pReqs[j].blockNum == 0) { atomic_add_fetch_32(&pTask->outputInfo.shuffleDispatcher.waitingRspCnt, 1); } @@ -372,8 +373,15 @@ static int32_t doBuildDispatchMsg(SStreamTask* pTask, const SStreamDataBlock* pD pTask->msgInfo.pData = pReqs; } - stDebug("s-task:%s build dispatch msg success, msgId:%d, stage:%"PRId64" %p", pTask->id.idStr, pTask->execInfo.dispatch, - pTask->pMeta->stage, pTask->msgInfo.pData); + if (pTask->outputInfo.type == TASK_OUTPUT__FIXED_DISPATCH) { + stDebug("s-task:%s build dispatch msg success, msgId:%d, stage:%" PRId64 " %p", pTask->id.idStr, + pTask->execInfo.dispatch, pTask->pMeta->stage, pTask->msgInfo.pData); + } else { + stDebug("s-task:%s build dispatch msg success, msgId:%d, stage:%" PRId64 " dstVgNum:%d %p", pTask->id.idStr, + pTask->execInfo.dispatch, pTask->pMeta->stage, pTask->outputInfo.shuffleDispatcher.waitingRspCnt, + pTask->msgInfo.pData); + } + return code; } @@ -395,9 +403,11 @@ static int32_t sendDispatchMsg(SStreamTask* pTask, SStreamDispatchReq* pDispatch SArray* vgInfo = pTask->outputInfo.shuffleDispatcher.dbInfo.pVgroupInfos; int32_t numOfVgroups = taosArrayGetSize(vgInfo); - stDebug("s-task:%s (child taskId:%d) start to shuffle-dispatch blocks to %d vgroup(s), msgId:%d", id, - pTask->info.selfChildId, numOfVgroups, msgId); + int32_t actualVgroups = pTask->outputInfo.shuffleDispatcher.waitingRspCnt; + stDebug("s-task:%s (child taskId:%d) start to shuffle-dispatch blocks to %d/%d vgroup(s), msgId:%d", id, + pTask->info.selfChildId, actualVgroups, numOfVgroups, msgId); + int32_t numOfSend = 0; for (int32_t i = 0; i < numOfVgroups; i++) { if (pDispatchMsg[i].blockNum > 0) { SVgroupInfo* pVgInfo = taosArrayGet(vgInfo, i); @@ -408,6 +418,11 @@ static int32_t sendDispatchMsg(SStreamTask* pTask, SStreamDispatchReq* pDispatch if (code < 0) { break; } + + // no need to try remain, all already send. + if (++numOfSend == actualVgroups) { + break; + } } } From cd85dae3fddc45f58eb29c85832013462c6c806b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sun, 24 Dec 2023 23:30:00 +0800 Subject: [PATCH 35/38] fix(stream): fix the dead lock. --- source/libs/stream/inc/streamInt.h | 2 -- source/libs/stream/src/streamCheckpoint.c | 1 + source/libs/stream/src/streamMeta.c | 5 ++++- source/libs/stream/src/streamStart.c | 16 ++++++++++++---- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/source/libs/stream/inc/streamInt.h b/source/libs/stream/inc/streamInt.h index 341706ff3b..9b5134d449 100644 --- a/source/libs/stream/inc/streamInt.h +++ b/source/libs/stream/inc/streamInt.h @@ -109,8 +109,6 @@ int32_t streamBroadcastToChildren(SStreamTask* pTask, const SSDataBlock* pBlock) int32_t tEncodeStreamRetrieveReq(SEncoder* pEncoder, const SStreamRetrieveReq* pReq); int32_t streamSaveTaskCheckpointInfo(SStreamTask* p, int64_t checkpointId); -int32_t streamTaskBuildCheckpoint(SStreamTask* pTask); -int32_t streamSaveAllTaskStatus(SStreamMeta* pMeta, int64_t checkpointId); int32_t streamSendCheckMsg(SStreamTask* pTask, const SStreamTaskCheckReq* pReq, int32_t nodeId, SEpSet* pEpSet); int32_t streamAddCheckpointReadyMsg(SStreamTask* pTask, int32_t srcTaskId, int32_t index, int64_t checkpointId); diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 2a766f21ec..b54adb0f96 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -114,6 +114,7 @@ static int32_t streamAlignCheckpoint(SStreamTask* pTask) { return atomic_sub_fetch_32(&pTask->chkInfo.downstreamAlignNum, 1); } +// todo handle down the transId of checkpoint to sink/agg tasks. static int32_t appendCheckpointIntoInputQ(SStreamTask* pTask, int32_t checkpointType) { SStreamDataBlock* pChkpoint = taosAllocateQitem(sizeof(SStreamDataBlock), DEF_QITEM, sizeof(SSDataBlock)); if (pChkpoint == NULL) { diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 9e20a2997c..8b2330fda0 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1102,7 +1102,10 @@ static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { entry.checkpointFailed = ((*pTask)->chkInfo.failedId >= (*pTask)->chkInfo.checkpointingId); entry.activeCheckpointId = (*pTask)->chkInfo.checkpointingId; entry.chkpointTransId = (*pTask)->chkInfo.transId; - stInfo("s-task:%s send kill checkpoint trans info, transId:%d", (*pTask)->id.idStr, (*pTask)->chkInfo.transId); + + if (entry.checkpointFailed) { + stInfo("s-task:%s send kill checkpoint trans info, transId:%d", (*pTask)->id.idStr, (*pTask)->chkInfo.transId); + } } if ((*pTask)->exec.pWalReader != NULL) { diff --git a/source/libs/stream/src/streamStart.c b/source/libs/stream/src/streamStart.c index 267dadf109..ea5e2edc09 100644 --- a/source/libs/stream/src/streamStart.c +++ b/source/libs/stream/src/streamStart.c @@ -379,10 +379,11 @@ int32_t streamTaskOnScanhistoryTaskReady(SStreamTask* pTask) { stDebug("s-task:%s enter into scan-history data stage, status:%s", id, p); streamTaskStartScanHistory(pTask); - // start the related fill-history task, when current task is ready - if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { - streamLaunchFillHistoryTask(pTask); - } + // NOTE: there will be an deadlock if launch fill history here. +// // start the related fill-history task, when current task is ready +// if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { +// streamLaunchFillHistoryTask(pTask); +// } return TSDB_CODE_SUCCESS; } @@ -400,6 +401,13 @@ void doProcessDownstreamReadyRsp(SStreamTask* pTask) { int64_t initTs = pTask->execInfo.init; int64_t startTs = pTask->execInfo.start; streamMetaUpdateTaskDownstreamStatus(pTask->pMeta, pTask->id.streamId, pTask->id.taskId, initTs, startTs, true); + + // start the related fill-history task, when current task is ready + // not invoke in success callback due to the deadlock. + if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { + stDebug("s-task:%s try to launch related fill-history task", pTask->id.idStr); + streamLaunchFillHistoryTask(pTask); + } } static void addIntoNodeUpdateList(SStreamTask* pTask, int32_t nodeId) { From 00fac852e8e22cb5c05d7e6405f1f5040af3be7d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 25 Dec 2023 09:55:24 +0800 Subject: [PATCH 36/38] fix(tsdb): fix invalid delete check. --- source/dnode/vnode/src/tsdb/tsdbRead2.c | 26 +++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbRead2.c b/source/dnode/vnode/src/tsdb/tsdbRead2.c index 751569bf90..5dafe91d3d 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRead2.c +++ b/source/dnode/vnode/src/tsdb/tsdbRead2.c @@ -2058,9 +2058,9 @@ static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDum } if (pBlockScanInfo->delSkyline != NULL && TARRAY_SIZE(pBlockScanInfo->delSkyline) > 0) { - bool deleted = hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, ts, ver, + bool dropped = hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, ts, ver, pReader->info.order, &pReader->info.verRange); - if (deleted) { + if (dropped) { return false; } } @@ -3340,11 +3340,14 @@ TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* p } // it is a valid data version - if ((key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) && - pDelList != NULL && TARRAY_SIZE(pDelList) > 0) { - bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange); - if (!dropped) { + if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) { + if (pDelList == NULL || TARRAY_SIZE(pDelList) == 0) { return pRow; + } else { + bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange); + if (!dropped) { + return pRow; + } } } @@ -3362,11 +3365,14 @@ TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* p return NULL; } - if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer && - pDelList != NULL && TARRAY_SIZE(pDelList) > 0) { - bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange); - if (!dropped) { + if (key.version <= pReader->info.verRange.maxVer && key.version >= pReader->info.verRange.minVer) { + if (pDelList == NULL || TARRAY_SIZE(pDelList) == 0) { return pRow; + } else { + bool dropped = hasBeenDropped(pDelList, &pIter->index, key.ts, key.version, order, &pReader->info.verRange); + if (!dropped) { + return pRow; + } } } } From e5087f6e23ae34c69c9df9f942b5bc8acc490c07 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 25 Dec 2023 10:40:36 +0800 Subject: [PATCH 37/38] fix(tsdb): fix invalid delete check. --- source/dnode/vnode/src/tsdb/tsdbRead2.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbRead2.c b/source/dnode/vnode/src/tsdb/tsdbRead2.c index 5dafe91d3d..5ad067c113 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRead2.c +++ b/source/dnode/vnode/src/tsdb/tsdbRead2.c @@ -1379,11 +1379,13 @@ static bool nextRowFromSttBlocks(SSttBlockReader* pSttBlockReader, STableBlockSc pScanInfo->sttKeyInfo.nextProcKey = key; if (pScanInfo->delSkyline != NULL && TARRAY_SIZE(pScanInfo->delSkyline) > 0) { - bool dropped = hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->sttBlockDelIndex, key, ver, order, pVerRange); - if (!dropped) { + if (!hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->sttBlockDelIndex, key, ver, order, pVerRange)) { pScanInfo->sttKeyInfo.status = STT_FILE_HAS_DATA; return true; } + } else { + pScanInfo->sttKeyInfo.status = STT_FILE_HAS_DATA; + return true; } } } From 60977d56bd2f6756542e5eb1d773cb2b04976b1f Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 25 Dec 2023 12:47:16 +0800 Subject: [PATCH 38/38] refactor: set attribute only on Linux --- source/libs/stream/src/streamMeta.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 3ee51ffd03..d09d370a36 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -376,12 +376,16 @@ SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskExpand expandF pMeta->rid = taosAddRef(streamMetaId, pMeta); + // set the attribute when running on Linux OS +#if defined LINUX TdThreadRwlockAttr attr; taosThreadRwlockAttrInit(&attr); pthread_rwlockattr_setkind_np(&attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); taosThreadRwlockInit(&pMeta->lock, &attr); + taosThreadRwlockAttrDestroy(&attr); +#endif int64_t* pRid = taosMemoryMalloc(sizeof(int64_t)); memcpy(pRid, &pMeta->rid, sizeof(pMeta->rid));