From 8b269ca955660a96907f3e0f9e123a61ac692745 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 12 Jul 2024 15:12:43 +0800 Subject: [PATCH 01/34] refactor: do some internal refactor. --- source/libs/stream/src/streamCheckpoint.c | 49 ++--------------------- 1 file changed, 4 insertions(+), 45 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index b490b0e02a..25974375e1 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -433,8 +433,8 @@ int32_t streamTaskUpdateTaskCheckpointInfo(SStreamTask* pTask, bool restored, SV taosThreadMutexLock(&pTask->lock); if (pReq->checkpointId <= pInfo->checkpointId) { - stDebug("s-task:%s vgId:%d latest checkpointId:%" PRId64 " checkpointVer:%" PRId64 - " no need to update the checkpoint info, updated checkpointId:%" PRId64 " checkpointVer:%" PRId64 + stDebug("s-task:%s vgId:%d latest checkpointId:%" PRId64 " Ver:%" PRId64 + " no need to update checkpoint info, updated checkpointId:%" PRId64 " Ver:%" PRId64 " transId:%d ignored", id, vgId, pInfo->checkpointId, pInfo->checkpointVer, pReq->checkpointId, pReq->checkpointVer, pReq->transId); @@ -1114,12 +1114,7 @@ int32_t deleteCheckpointFile(const char* id, const char* name) { } int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { - int32_t code; - int32_t tlen = 0; - int32_t vgId = pTask->pMeta->vgId; - const char* id = pTask->id.idStr; - SCheckpointInfo* pInfo = &pTask->chkInfo; - + const char* id = pTask->id.idStr; taosThreadMutexLock(&pTask->lock); if (pTask->status.sendConsensusChkptId == true) { stDebug("s-task:%s already start to consensus-checkpointId, not start again before it completed", id); @@ -1133,44 +1128,8 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { ASSERT(pTask->pBackend == NULL); pTask->status.requireConsensusChkptId = true; -#if 0 - SRestoreCheckpointInfo req = { - .streamId = pTask->id.streamId, - .taskId = 
pTask->id.taskId, - .nodeId = vgId, - .checkpointId = pInfo->checkpointId, - .startTs = pTask->execInfo.created, - }; - tEncodeSize(tEncodeRestoreCheckpointInfo, &req, tlen, code); - if (code < 0) { - stError("s-task:%s vgId:%d encode stream task latest-checkpoint-id failed, code:%s", id, vgId, tstrerror(code)); - return -1; - } - - void* buf = rpcMallocCont(tlen); - if (buf == NULL) { - stError("s-task:%s vgId:%d encode stream task latest-checkpoint-id msg failed, code:%s", id, vgId, - tstrerror(TSDB_CODE_OUT_OF_MEMORY)); - return -1; - } - - SEncoder encoder; - tEncoderInit(&encoder, buf, tlen); - if ((code = tEncodeRestoreCheckpointInfo(&encoder, &req)) < 0) { - rpcFreeCont(buf); - stError("s-task:%s vgId:%d encode stream task latest-checkpoint-id msg failed, code:%s", id, vgId, tstrerror(code)); - return -1; - } - tEncoderClear(&encoder); - - SRpcMsg msg = {0}; - initRpcMsg(&msg, TDMT_MND_STREAM_REQ_CONSEN_CHKPT, buf, tlen); - stDebug("s-task:%s vgId:%d send latest checkpointId:%" PRId64 " to mnode to get the consensus checkpointId", id, vgId, - pInfo->checkpointId); - - tmsgSendReq(&pTask->info.mnodeEpset, &msg); -#endif + stDebug("s-task:%s set the require consensus-checkpointId flag", id); return 0; } From 142f9132a51ffc42937ffa9a518bb37c6868ee83 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 12 Jul 2024 12:08:57 +0000 Subject: [PATCH 02/34] fix failed to load task --- source/dnode/vnode/src/inc/vnodeInt.h | 2 +- source/dnode/vnode/src/tq/tqStreamTaskSnap.c | 6 +++++- source/dnode/vnode/src/vnd/vnodeSnapshot.c | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/source/dnode/vnode/src/inc/vnodeInt.h b/source/dnode/vnode/src/inc/vnodeInt.h index 4a47e08730..4b16a076cc 100644 --- a/source/dnode/vnode/src/inc/vnodeInt.h +++ b/source/dnode/vnode/src/inc/vnodeInt.h @@ -381,7 +381,7 @@ int32_t streamTaskSnapReaderClose(SStreamTaskReader* pReader); int32_t streamTaskSnapRead(SStreamTaskReader* pReader, uint8_t** ppData); int32_t 
streamTaskSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamTaskWriter** ppWriter); -int32_t streamTaskSnapWriterClose(SStreamTaskWriter* ppWriter, int8_t rollback); +int32_t streamTaskSnapWriterClose(SStreamTaskWriter* ppWriter, int8_t rollback, int8_t loadTask); int32_t streamTaskSnapWrite(SStreamTaskWriter* pWriter, uint8_t* pData, uint32_t nData); int32_t streamStateSnapReaderOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamStateReader** ppReader); diff --git a/source/dnode/vnode/src/tq/tqStreamTaskSnap.c b/source/dnode/vnode/src/tq/tqStreamTaskSnap.c index dda5173ad9..167d20ef54 100644 --- a/source/dnode/vnode/src/tq/tqStreamTaskSnap.c +++ b/source/dnode/vnode/src/tq/tqStreamTaskSnap.c @@ -192,7 +192,7 @@ _err: return 0; } -int32_t streamTaskSnapWriterClose(SStreamTaskWriter* pWriter, int8_t rollback) { +int32_t streamTaskSnapWriterClose(SStreamTaskWriter* pWriter, int8_t rollback, int8_t loadTask) { int32_t code = 0; STQ* pTq = pWriter->pTq; @@ -214,6 +214,10 @@ int32_t streamTaskSnapWriterClose(SStreamTaskWriter* pWriter, int8_t rollback) { } streamMetaWUnLock(pTq->pStreamMeta); taosMemoryFree(pWriter); + + if (loadTask == 1) { + streamMetaLoadAllTasks(pTq->pStreamMeta); + } return code; _err: diff --git a/source/dnode/vnode/src/vnd/vnodeSnapshot.c b/source/dnode/vnode/src/vnd/vnodeSnapshot.c index 611a603c63..0951e56d66 100644 --- a/source/dnode/vnode/src/vnd/vnodeSnapshot.c +++ b/source/dnode/vnode/src/vnd/vnodeSnapshot.c @@ -752,7 +752,8 @@ int32_t vnodeSnapWriterClose(SVSnapWriter *pWriter, int8_t rollback, SSnapshot * } if (pWriter->pStreamTaskWriter) { - code = streamTaskSnapWriterClose(pWriter->pStreamTaskWriter, rollback); + code = streamTaskSnapWriterClose(pWriter->pStreamTaskWriter, rollback, pWriter->pStreamStateWriter == NULL ? 
1 : 0); + if (code) goto _exit; } From c4cde6f26881069675baa944ffed26eef029b516 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 15 Jul 2024 14:23:33 +0800 Subject: [PATCH 03/34] fix(stream): mark the timer launched by which checkpoint procedure. --- source/libs/stream/inc/streamInt.h | 8 ++------ source/libs/stream/src/streamDispatch.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/source/libs/stream/inc/streamInt.h b/source/libs/stream/inc/streamInt.h index 008d066717..d31f720411 100644 --- a/source/libs/stream/inc/streamInt.h +++ b/source/libs/stream/inc/streamInt.h @@ -63,11 +63,7 @@ struct SActiveCheckpointInfo { tmr_h pChkptTriggerTmr; int32_t sendReadyCheckCounter; tmr_h pSendReadyMsgTmr; -}; - -struct SConsensusCheckpoint { - int8_t inProcess; - + int64_t sendReadyTmrChkptId; }; typedef struct { @@ -227,7 +223,7 @@ int32_t streamMetaSendHbHelper(SStreamMeta* pMeta); ECHECKPOINT_BACKUP_TYPE streamGetCheckpointBackupType(); -int32_t streamTaskDownloadCheckpointData(const char* id, char* path); +int32_t streamTaskDownloadCheckpointData(const char* id, char* path, int64_t checkpointId); int32_t streamTaskOnNormalTaskReady(SStreamTask* pTask); int32_t streamTaskOnScanHistoryTaskReady(SStreamTask* pTask); diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 617adaa016..1948b04186 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -815,6 +815,16 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { SArray* pList = pActiveInfo->pReadyMsgList; int32_t num = taosArrayGetSize(pList); + if (pActiveInfo->sendReadyTmrChkptId < pActiveInfo->activeId) { + taosThreadMutexUnlock(&pActiveInfo->lock); + int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + stWarn("s-task:%s vgId:%d tmr launched by previous checkpoint procedure, checkpointId:%" PRId64 ", quit, ref:%d", + id, vgId, 
pActiveInfo->sendReadyTmrChkptId, ref); + + streamMetaReleaseTask(pTask->pMeta, pTask); + return; + } + // active checkpoint info is cleared for now if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { taosThreadMutexUnlock(&pActiveInfo->lock); @@ -902,7 +912,6 @@ int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask) { pInfo->upstreamTaskId); } - taosThreadMutexUnlock(&pActiveInfo->lock); stDebug("s-task:%s level:%d checkpoint-ready msg sent to all %d upstreams", id, pTask->info.taskLevel, num); // start to check if checkpoint ready msg has successfully received by upstream tasks. @@ -916,8 +925,12 @@ int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask) { } else { taosTmrReset(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pSendReadyMsgTmr); } + + // mark the timer monitor checkpointId + pActiveInfo->sendReadyTmrChkptId = pActiveInfo->activeId; } + taosThreadMutexUnlock(&pActiveInfo->lock); return TSDB_CODE_SUCCESS; } From a88635129841692ac9cd4be75d058215c33901bb Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 15 Jul 2024 14:52:28 +0800 Subject: [PATCH 04/34] fix(stream): update checkpoint into different dir. 
--- include/common/rsync.h | 4 +- source/common/src/rsync.c | 74 ++++++++++++++----- source/libs/stream/src/streamBackendRocksdb.c | 4 +- source/libs/stream/src/streamCheckpoint.c | 12 +-- 4 files changed, 64 insertions(+), 30 deletions(-) diff --git a/include/common/rsync.h b/include/common/rsync.h index 0840b51793..4221fb432f 100644 --- a/include/common/rsync.h +++ b/include/common/rsync.h @@ -13,8 +13,8 @@ extern "C" { void stopRsync(); void startRsync(); -int32_t uploadByRsync(const char* id, const char* path); -int32_t downloadRsync(const char* id, const char* path); +int32_t uploadByRsync(const char* id, const char* path, int64_t checkpointId); +int32_t downloadByRsync(const char* id, const char* path, int64_t checkpointId); int32_t deleteRsync(const char* id); #ifdef __cplusplus diff --git a/source/common/src/rsync.c b/source/common/src/rsync.c index d0b10b7f41..36d634c305 100644 --- a/source/common/src/rsync.c +++ b/source/common/src/rsync.c @@ -157,7 +157,7 @@ void startRsync() { } } -int32_t uploadByRsync(const char* id, const char* path) { +int32_t uploadByRsync(const char* id, const char* path, int64_t checkpointId) { int64_t st = taosGetTimestampMs(); char command[PATH_MAX] = {0}; @@ -197,11 +197,11 @@ int32_t uploadByRsync(const char* id, const char* path) { // prepare the data directory int32_t code = execCommand(command); if (code != 0) { - uError("[rsync] s-task:%s prepare checkpoint data in %s to %s failed, code:%d," ERRNO_ERR_FORMAT, id, path, + uError("[rsync] s-task:%s prepare checkpoint dir in %s to %s failed, code:%d," ERRNO_ERR_FORMAT, id, path, tsSnodeAddress, code, ERRNO_ERR_DATA); } else { int64_t el = (taosGetTimestampMs() - st); - uDebug("[rsync] s-task:%s prepare checkpoint data in:%s to %s successfully, elapsed time:%" PRId64 "ms", id, path, + uDebug("[rsync] s-task:%s prepare checkpoint dir in:%s to %s successfully, elapsed time:%" PRId64 "ms", id, path, tsSnodeAddress, el); } @@ -215,7 +215,7 @@ int32_t uploadByRsync(const char* id, 
const char* path) { #endif snprintf(command, PATH_MAX, "rsync -av --debug=all --log-file=%s/rsynclog --delete --timeout=10 --bwlimit=100000 %s/ " - "rsync://%s/checkpoint/%s/data/", + "rsync://%s/checkpoint/%s/%" PRId64 "/", tsLogDir, #ifdef WINDOWS pathTransform @@ -223,11 +223,11 @@ int32_t uploadByRsync(const char* id, const char* path) { path #endif , - tsSnodeAddress, id); + tsSnodeAddress, id, checkpointId); } else { snprintf(command, PATH_MAX, "rsync -av --debug=all --log-file=%s/rsynclog --delete --timeout=10 --bwlimit=100000 %s " - "rsync://%s/checkpoint/%s/data/", + "rsync://%s/checkpoint/%s/%" PRId64 "/", tsLogDir, #ifdef WINDOWS pathTransform @@ -235,7 +235,7 @@ int32_t uploadByRsync(const char* id, const char* path) { path #endif , - tsSnodeAddress, id); + tsSnodeAddress, id, checkpointId); } code = execCommand(command); @@ -252,7 +252,7 @@ int32_t uploadByRsync(const char* id, const char* path) { } // abort from retry if quit -int32_t downloadRsync(const char* id, const char* path) { +int32_t downloadByRsync(const char* id, const char* path, int64_t checkpointId) { int64_t st = taosGetTimestampMs(); int32_t MAX_RETRY = 10; int32_t times = 0; @@ -264,6 +264,42 @@ int32_t downloadRsync(const char* id, const char* path) { #endif char command[PATH_MAX] = {0}; + snprintf( + command, PATH_MAX, + "rsync -av --debug=all --log-file=%s/rsynclog --timeout=10 --bwlimit=100000 rsync://%s/checkpoint/%s/%" PRId64 + "/ %s", + tsLogDir, tsSnodeAddress, id, checkpointId, +#ifdef WINDOWS + pathTransform +#else + path +#endif + ); + + uDebug("[rsync] %s start to sync data from remote to:%s, cmd:%s", id, path, command); + +// while (times++ < MAX_RETRY) { + code = execCommand(command); + if (code != TSDB_CODE_SUCCESS) { + uError("[rsync] %s download checkpointId:%" PRId64 + " data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, + id, checkpointId, path, times, code, ERRNO_ERR_DATA); +// taosSsleep(1); + } else { + int32_t el = taosGetTimestampMs() - st; 
+ uDebug("[rsync] %s download checkpointId:%" PRId64 " data:%s successfully, elapsed time:%dms", id, checkpointId, + path, el); +// break; + } +// } + + // if failed, try to load it from data directory +#ifdef WINDOWS + memset(pathTransform, 0, PATH_MAX); + changeDirFromWindowsToLinux(path, pathTransform); +#endif + + memset(command, 0, PATH_MAX); snprintf( command, PATH_MAX, "rsync -av --debug=all --log-file=%s/rsynclog --timeout=10 --bwlimit=100000 rsync://%s/checkpoint/%s/data/ %s", @@ -275,19 +311,17 @@ int32_t downloadRsync(const char* id, const char* path) { #endif ); - uDebug("[rsync] %s start to sync data from remote to:%s, %s", id, path, command); + uDebug("[rsync] %s start to sync data from remote data dir to:%s, cmd:%s", id, path, command); - while (times++ < MAX_RETRY) { - code = execCommand(command); - if (code != TSDB_CODE_SUCCESS) { - uError("[rsync] %s download checkpoint data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, id, - path, times, code, ERRNO_ERR_DATA); - taosSsleep(1); - } else { - int32_t el = taosGetTimestampMs() - st; - uDebug("[rsync] %s download checkpoint data:%s successfully, elapsed time:%dms", id, path, el); - break; - } + code = execCommand(command); + if (code != TSDB_CODE_SUCCESS) { + uError("[rsync] %s download checkpointId:%" PRId64 + " data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, + id, checkpointId, path, times, code, ERRNO_ERR_DATA); + } else { + int32_t el = taosGetTimestampMs() - st; + uDebug("[rsync] %s download checkpointId:%" PRId64 " data:%s successfully, elapsed time:%dms", id, checkpointId, + path, el); } return code; diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 8b87019ee0..15a8be6eaa 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -447,7 +447,7 @@ int32_t rebuildFromRemoteChkp_rsync(const char* key, char* checkpointPath, int64 
cleanDir(defaultPath, key); stDebug("clear local default dir before downloading checkpoint data:%s succ", defaultPath); - code = streamTaskDownloadCheckpointData(key, checkpointPath); + code = streamTaskDownloadCheckpointData(key, checkpointPath, checkpointId); if (code != 0) { stError("failed to download checkpoint data:%s", key); return code; @@ -482,7 +482,7 @@ int32_t rebuildDataFromS3(char* chkpPath, int64_t chkpId) { int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId, char* defaultPath) { int8_t rename = 0; - int32_t code = streamTaskDownloadCheckpointData(key, chkpPath); + int32_t code = streamTaskDownloadCheckpointData(key, chkpPath, chkpId); if (code != 0) { return code; } diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index bcdd1a047c..87c2af5207 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -20,7 +20,7 @@ static int32_t downloadCheckpointDataByName(const char* id, const char* fname, const char* dstName); static int32_t deleteCheckpointFile(const char* id, const char* name); -static int32_t streamTaskUploadCheckpoint(const char* id, const char* path); +static int32_t streamTaskUploadCheckpoint(const char* id, const char* path, int64_t checkpointId); static int32_t deleteCheckpoint(const char* id); static int32_t downloadCheckpointByNameS3(const char* id, const char* fname, const char* dstName); static int32_t continueDispatchCheckpointTriggerBlock(SStreamDataBlock* pBlock, SStreamTask* pTask); @@ -601,7 +601,7 @@ int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t d } if (code == TSDB_CODE_SUCCESS) { - code = streamTaskUploadCheckpoint(idStr, path); + code = streamTaskUploadCheckpoint(idStr, path, checkpointId); if (code == TSDB_CODE_SUCCESS) { stDebug("s-task:%s upload checkpointId:%" PRId64 " to remote succ", idStr, checkpointId); } else { @@ -1082,7 +1082,7 @@ ECHECKPOINT_BACKUP_TYPE 
streamGetCheckpointBackupType() { } } -int32_t streamTaskUploadCheckpoint(const char* id, const char* path) { +int32_t streamTaskUploadCheckpoint(const char* id, const char* path, int64_t checkpointId) { int32_t code = 0; if (id == NULL || path == NULL || strlen(id) == 0 || strlen(path) == 0 || strlen(path) >= PATH_MAX) { stError("invalid parameters in upload checkpoint, %s", id); @@ -1090,7 +1090,7 @@ int32_t streamTaskUploadCheckpoint(const char* id, const char* path) { } if (strlen(tsSnodeAddress) != 0) { - code = uploadByRsync(id, path); + code = uploadByRsync(id, path, checkpointId); if (code != 0) { return TAOS_SYSTEM_ERROR(errno); } @@ -1117,14 +1117,14 @@ int32_t downloadCheckpointDataByName(const char* id, const char* fname, const ch return 0; } -int32_t streamTaskDownloadCheckpointData(const char* id, char* path) { +int32_t streamTaskDownloadCheckpointData(const char* id, char* path, int64_t checkpointId) { if (id == NULL || path == NULL || strlen(id) == 0 || strlen(path) == 0 || strlen(path) >= PATH_MAX) { stError("down checkpoint data parameters invalid"); return -1; } if (strlen(tsSnodeAddress) != 0) { - return downloadRsync(id, path); + return downloadByRsync(id, path, checkpointId); } else if (tsS3StreamEnabled) { return s3GetObjectsByPrefix(id, path); } From 14a7cebc56cb2034d383cb6a3ecc6a230878322b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 15 Jul 2024 18:51:58 +0800 Subject: [PATCH 05/34] fix(stream): add explicit create table into sink cache. 
--- source/dnode/vnode/src/tq/tqSink.c | 41 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/source/dnode/vnode/src/tq/tqSink.c b/source/dnode/vnode/src/tq/tqSink.c index 5f3e1e3d14..34bd39f6e7 100644 --- a/source/dnode/vnode/src/tq/tqSink.c +++ b/source/dnode/vnode/src/tq/tqSink.c @@ -46,6 +46,7 @@ static int32_t initCreateTableMsg(SVCreateTbReq* pCreateTableReq, uint64_t suid, static SArray* createDefaultTagColName(); static void setCreateTableMsgTableName(SVCreateTbReq* pCreateTableReq, SSDataBlock* pDataBlock, const char* stbFullName, int64_t gid, bool newSubTableRule); +static int32_t doCreateSinkInfo(const char* pDstTableName, STableSinkInfo** pInfo); int32_t tqBuildDeleteReq(STQ* pTq, const char* stbFullName, const SSDataBlock* pDataBlock, SBatchDeleteReq* deleteReq, const char* pIdStr, bool newSubTableRule) { @@ -269,6 +270,14 @@ static int32_t doBuildAndSendCreateTableMsg(SVnode* pVnode, char* stbFullName, S pTask->ver >= SSTREAM_TASK_SUBTABLE_CHANGED_VER && pTask->subtableWithoutMd5 != 1); taosArrayPush(reqs.pArray, pCreateTbReq); + + STableSinkInfo* pInfo = NULL; + bool alreadyCached = tqGetTableInfo(pTask->outputInfo.tbSink.pTblInfo, gid, &pInfo); + if (!alreadyCached) { + code = doCreateSinkInfo(pCreateTbReq->name, &pInfo); + doPutIntoCache(pTask->outputInfo.tbSink.pTblInfo, pInfo, gid, pTask->id.idStr); + } + tqDebug("s-task:%s build create table:%s msg complete", pTask->id.idStr, pCreateTbReq->name); } @@ -631,6 +640,18 @@ int32_t doWaitForDstTableCreated(SVnode* pVnode, SStreamTask* pTask, STableSinkI return TSDB_CODE_SUCCESS; } +int32_t doCreateSinkInfo(const char* pDstTableName, STableSinkInfo** pInfo) { + int32_t nameLen = strlen(pDstTableName); + (*pInfo) = taosMemoryCalloc(1, sizeof(STableSinkInfo) + nameLen + 1); + if (*pInfo == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + (*pInfo)->name.len = nameLen; + memcpy((*pInfo)->name.data, pDstTableName, nameLen); + return TSDB_CODE_SUCCESS; +} + 
int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDataBlock, char* stbFullName, SSubmitTbData* pTableData) { uint64_t groupId = pDataBlock->info.id.groupId; @@ -667,22 +688,15 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat if (pTask->subtableWithoutMd5 != 1 && !isAutoTableName(dstTableName) && !alreadyAddGroupId(dstTableName, groupId) && groupId != 0) { tqDebug("s-task:%s append groupId:%" PRId64 " for generated dstTable:%s", id, groupId, dstTableName); - if(pTask->ver == SSTREAM_TASK_SUBTABLE_CHANGED_VER){ + if (pTask->ver == SSTREAM_TASK_SUBTABLE_CHANGED_VER) { buildCtbNameAddGroupId(NULL, dstTableName, groupId); - }else if(pTask->ver > SSTREAM_TASK_SUBTABLE_CHANGED_VER && stbFullName) { + } else if (pTask->ver > SSTREAM_TASK_SUBTABLE_CHANGED_VER && stbFullName) { buildCtbNameAddGroupId(stbFullName, dstTableName, groupId); } } } - int32_t nameLen = strlen(dstTableName); - pTableSinkInfo = taosMemoryCalloc(1, sizeof(STableSinkInfo) + nameLen + 1); - if (pTableSinkInfo == NULL) { - return TSDB_CODE_OUT_OF_MEMORY; - } - - pTableSinkInfo->name.len = nameLen; - memcpy(pTableSinkInfo->name.data, dstTableName, nameLen); + int32_t code = doCreateSinkInfo(dstTableName, &pTableSinkInfo); tqDebug("s-task:%s build new sinkTableInfo to add cache, dstTable:%s", id, dstTableName); } @@ -690,7 +704,7 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat pTableData->uid = pTableSinkInfo->uid; if (pTableData->uid == 0) { - tqTrace("s-task:%s cached tableInfo uid is invalid, acquire it from meta", id); + tqTrace("s-task:%s cached tableInfo:%s uid is invalid, acquire it from meta", id, pTableSinkInfo->name.data); return doWaitForDstTableCreated(pVnode, pTask, pTableSinkInfo, dstTableName, &pTableData->uid); } else { tqTrace("s-task:%s set the dstTable uid from cache:%" PRId64, id, pTableData->uid); @@ -926,11 +940,6 @@ bool hasOnlySubmitData(const SArray* pBlocks, int32_t numOfBlocks) { } 
int32_t doPutIntoCache(SSHashObj* pSinkTableMap, STableSinkInfo* pTableSinkInfo, uint64_t groupId, const char* id) { - if (tSimpleHashGetSize(pSinkTableMap) > MAX_CACHE_TABLE_INFO_NUM) { - taosMemoryFreeClear(pTableSinkInfo); // too many items, failed to cache it - return TSDB_CODE_FAILED; - } - int32_t code = tSimpleHashPut(pSinkTableMap, &groupId, sizeof(uint64_t), &pTableSinkInfo, POINTER_BYTES); if (code != TSDB_CODE_SUCCESS) { taosMemoryFreeClear(pTableSinkInfo); From a46b7b3a414dacc3417934c92f81d00a7965d414 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 15 Jul 2024 19:19:58 +0800 Subject: [PATCH 06/34] fix(stream): adjust the time to free task backend. --- source/libs/stream/src/streamCheckpoint.c | 8 ++++++++ source/libs/stream/src/streamMeta.c | 1 - source/libs/stream/src/streamTaskSm.c | 22 ++++++++-------------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 87c2af5207..d1ea72370d 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -1163,7 +1163,10 @@ int32_t deleteCheckpointFile(const char* id, const char* name) { int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { const char* id = pTask->id.idStr; + taosThreadMutexLock(&pTask->lock); + ETaskStatus p = streamTaskGetStatus(pTask)->state; + if (pTask->status.sendConsensusChkptId == true) { stDebug("s-task:%s already start to consensus-checkpointId, not start again before it completed", id); taosThreadMutexUnlock(&pTask->lock); @@ -1174,6 +1177,11 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { taosThreadMutexUnlock(&pTask->lock); + if (pTask->pBackend != NULL) { + streamFreeTaskState(pTask, p); + pTask->pBackend = NULL; + } + ASSERT(pTask->pBackend == NULL); pTask->status.requireConsensusChkptId = true; diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index d0b1f6ca93..d2c957422b 
100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1265,7 +1265,6 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { } // negotiate the consensus checkpoint id for current task - ASSERT(pTask->pBackend == NULL); code = streamTaskSendRestoreChkptMsg(pTask); // this task may has no checkpoint, but others tasks may generate checkpoint already? diff --git a/source/libs/stream/src/streamTaskSm.c b/source/libs/stream/src/streamTaskSm.c index f2bd99cdaf..85d3e0068a 100644 --- a/source/libs/stream/src/streamTaskSm.c +++ b/source/libs/stream/src/streamTaskSm.c @@ -79,12 +79,6 @@ static int32_t attachWaitedEvent(SStreamTask* pTask, SFutureHandleEventInfo* pEv return 0; } -static int32_t stopTaskSuccFn(SStreamTask* pTask) { - SStreamTaskSM* pSM = pTask->status.pSM; - streamFreeTaskState(pTask, pSM->current.state); - return TSDB_CODE_SUCCESS; -} - int32_t streamTaskInitStatus(SStreamTask* pTask) { pTask->execInfo.checkTs = taosGetTimestampMs(); stDebug("s-task:%s start init, and check downstream tasks, set the init ts:%" PRId64, pTask->id.idStr, @@ -640,21 +634,21 @@ void doInitStateTransferTable(void) { // resume is completed by restore status of state-machine // stop related event - trans = createStateTransform(TASK_STATUS__READY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__READY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__DROPPING, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__DROPPING, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__UNINIT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__UNINIT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); 
taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__STOP, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__STOP, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__SCAN_HISTORY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__SCAN_HISTORY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__HALT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__HALT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__PAUSE, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__PAUSE, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__CK, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__CK, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); // dropping related event From 9aadc5e4acf886120094c87d47cffe5c7c9576ed Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 17 Jul 2024 10:18:58 +0800 Subject: [PATCH 07/34] fix(stream): fix race condition for dispatch msg. 
--- source/libs/stream/src/streamDispatch.c | 56 +++++++++++++++---------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 1948b04186..9615ed49e0 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -1175,10 +1175,10 @@ void streamClearChkptReadyMsg(SActiveCheckpointInfo* pActiveInfo) { static int32_t handleDispatchSuccessRsp(SStreamTask* pTask, int32_t downstreamId, int32_t downstreamNodeId) { stDebug("s-task:%s destroy dispatch msg:%p", pTask->id.idStr, pTask->msgInfo.pData); - bool delayDispatch = (pTask->msgInfo.dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); - clearBufferedDispatchMsg(pTask); - int64_t el = taosGetTimestampMs() - pTask->msgInfo.startTs; + bool delayDispatch = (pTask->msgInfo.dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); + + clearBufferedDispatchMsg(pTask); // put data into inputQ of current task is also allowed if (pTask->inputq.status == TASK_INPUT_STATUS__BLOCKED) { @@ -1202,13 +1202,24 @@ static int32_t handleDispatchSuccessRsp(SStreamTask* pTask, int32_t downstreamId return 0; } -static int32_t setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t code, int64_t now, const char* id) { +static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t code, int64_t now, int32_t* pNotRsp, const char* id) { int32_t numOfRsp = 0; bool alreadySet = false; bool updated = false; + bool allRsp = false; + *pNotRsp = 0; taosThreadMutexLock(&pMsgInfo->lock); - for (int32_t j = 0; j < taosArrayGetSize(pMsgInfo->pSendInfo); ++j) { + int32_t numOfDispatchBranch = taosArrayGetSize(pMsgInfo->pSendInfo); + + for(int32_t i = 0; i < numOfDispatchBranch; ++i) { + SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, i); + if (pEntry->rspTs != -1) { + numOfRsp += 1; + } + } + + for (int32_t j = 0; j < numOfDispatchBranch; ++j) { SDispatchEntry* pEntry = 
taosArrayGet(pMsgInfo->pSendInfo, j); if (pEntry->nodeId == vgId) { ASSERT(!alreadySet); @@ -1216,18 +1227,20 @@ static int32_t setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int3 pEntry->status = code; alreadySet = true; updated = true; - stDebug("s-task:%s record the rsp recv, ts:%" PRId64 " code:%d, idx:%d", id, now, code, j); - } - - if (pEntry->rspTs != -1) { numOfRsp += 1; + + stDebug("s-task:%s record the rsp recv, ts:%" PRId64 " code:%d, idx:%d, total recv:%d/%d", id, now, code, j, + numOfRsp, numOfDispatchBranch); } } - taosThreadMutexUnlock(&pMsgInfo->lock); - ASSERT(updated); + *pNotRsp = numOfDispatchBranch - numOfRsp; + allRsp = (numOfRsp == numOfDispatchBranch); - return numOfRsp; + taosThreadMutexUnlock(&pMsgInfo->lock); + + ASSERT(updated); + return allRsp; } bool isDispatchRspTimeout(SDispatchEntry* pEntry, int64_t now) { @@ -1253,7 +1266,8 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i int32_t vgId = pTask->pMeta->vgId; SDispatchMsgInfo* pMsgInfo = &pTask->msgInfo; int64_t now = taosGetTimestampMs(); - int32_t totalRsp = 0; + bool allRsp = false; + int32_t notRsp = 0; taosThreadMutexLock(&pMsgInfo->lock); int32_t msgId = pMsgInfo->msgId; @@ -1282,18 +1296,18 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i if (code == TSDB_CODE_STREAM_TASK_NOT_EXIST) { // destination task does not exist, not retry anymore stError("s-task:%s failed to dispatch msg to task:0x%x(vgId:%d), msgId:%d no retry, since task destroyed already", id, pRsp->downstreamTaskId, pRsp->downstreamNodeId, msgId); - totalRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, ¬Rsp, id); } else { stError("s-task:%s failed to dispatch msgId:%d to task:0x%x(vgId:%d), code:%s, add to retry list", id, msgId, pRsp->downstreamTaskId, pRsp->downstreamNodeId, tstrerror(code)); - totalRsp = 
setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, code, now, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, code, now, &notRsp, id); } } else { // code == 0 if (pRsp->inputStatus == TASK_INPUT_STATUS__BLOCKED) { pTask->inputq.status = TASK_INPUT_STATUS__BLOCKED; // block the input of current task, to push pressure to upstream - totalRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, pRsp->inputStatus, now, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, pRsp->inputStatus, now, &notRsp, id); stTrace("s-task:%s inputQ of downstream task:0x%x(vgId:%d) is full, wait for retry dispatch", id, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } else { @@ -1305,7 +1319,7 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i id, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } - totalRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, &notRsp, id); { bool delayDispatch = (pMsgInfo->dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); @@ -1330,13 +1344,11 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i } } - int32_t notRsp = taosArrayGetSize(pMsgInfo->pSendInfo) - totalRsp; if (pTask->outputInfo.type == TASK_OUTPUT__SHUFFLE_DISPATCH) { - if (notRsp > 0) { + if (!allRsp) { stDebug( "s-task:%s recv dispatch rsp, msgId:%d from 0x%x(vgId:%d), downstream task input status:%d code:%s, " - "waiting " - "for %d rsp", + "waiting for %d rsp", id, msgId, pRsp->downstreamTaskId, pRsp->downstreamNodeId, pRsp->inputStatus, tstrerror(code), notRsp); } else { stDebug( @@ -1350,7 +1362,7 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i } // all msg rsp already, continue - if (notRsp == 0) { + if (allRsp) { ASSERT(pTask->outputq.status == TASK_OUTPUT_STATUS__WAIT); // we need to re-try send dispatch msg to downstream tasks
From fb3fe03c1fb118a0435bc1134e226f8a7bb63b66 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 18 Jul 2024 14:57:39 +0800 Subject: [PATCH 08/34] fix(stream): to avoid repeatly start checkpoint timer if previous timer is not started yet. --- source/libs/stream/inc/streamInt.h | 36 +++++++------ source/libs/stream/src/streamCheckpoint.c | 44 ++++++++++------ source/libs/stream/src/streamDispatch.c | 61 +++++++++++++---------- source/libs/stream/src/streamTask.c | 14 +++--- source/libs/stream/src/streamTimer.c | 10 ++++ 5 files changed, 102 insertions(+), 63 deletions(-) diff --git a/source/libs/stream/inc/streamInt.h b/source/libs/stream/inc/streamInt.h index d31f720411..dc19d8c5b0 100644 --- a/source/libs/stream/inc/streamInt.h +++ b/source/libs/stream/inc/streamInt.h @@ -48,24 +48,30 @@ extern "C" { #define stTrace(...) do { if (stDebugFlag & DEBUG_TRACE) { taosPrintLog("STM ", DEBUG_TRACE, stDebugFlag, __VA_ARGS__); }} while(0) // clang-format on +typedef struct SStreamTmrInfo { + int32_t activeCounter; // make sure only launch one checkpoint trigger check tmr + tmr_h tmrHandle; + int64_t launchChkptId; + int8_t isActive; +} SStreamTmrInfo; + struct SActiveCheckpointInfo { - TdThreadMutex lock; - int32_t transId; - int64_t firstRecvTs; // first time to recv checkpoint trigger info - int64_t activeId; // current active checkpoint id - int64_t failedId; - bool dispatchTrigger; - SArray* pDispatchTriggerList; // SArray - SArray* pReadyMsgList; // SArray - int8_t allUpstreamTriggerRecv; - SArray* pCheckpointReadyRecvList; // SArray - int32_t checkCounter; - tmr_h pChkptTriggerTmr; - int32_t sendReadyCheckCounter; - tmr_h pSendReadyMsgTmr; - int64_t sendReadyTmrChkptId; + TdThreadMutex lock; + int32_t transId; + int64_t firstRecvTs; // first time to recv checkpoint trigger info + int64_t activeId; // current active checkpoint id + int64_t failedId; + bool dispatchTrigger; + SArray* pDispatchTriggerList; // SArray + SArray* pReadyMsgList; // SArray + int8_t 
allUpstreamTriggerRecv; + SArray* pCheckpointReadyRecvList; // SArray + SStreamTmrInfo chkptTriggerMsgTmr; + SStreamTmrInfo chkptReadyMsgTmr; }; +int32_t streamCleanBeforeQuitTmr(SStreamTmrInfo* pInfo, SStreamTask* pTask); + typedef struct { int8_t type; SSDataBlock* pBlock; diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index d1ea72370d..96a614f6a4 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -265,14 +265,26 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock return code; } - int32_t ref = atomic_add_fetch_32(&pTask->status.timerActive, 1); - stDebug("s-task:%s start checkpoint-trigger monitor in 10s, ref:%d ", pTask->id.idStr, ref); - streamMetaAcquireOneTask(pTask); + // if previous launched timer not started yet, not start a new timer + // todo: fix this bug: previous set checkpoint-trigger check tmr is running, while we happen to try to launch + // a new checkpoint-trigger timer right now. + // And if we don't start a new timer, and the lost of checkpoint-trigger message may cause the whole checkpoint + // procedure to be stucked. 
+ SStreamTmrInfo* pTmrInfo = &pActiveInfo->chkptTriggerMsgTmr; + int8_t old = atomic_val_compare_exchange_8(&pTmrInfo->isActive, 0, 1); + if (old == 0) { + int32_t ref = atomic_add_fetch_32(&pTask->status.timerActive, 1); + stDebug("s-task:%s start checkpoint-trigger monitor in 10s, ref:%d ", pTask->id.idStr, ref); + streamMetaAcquireOneTask(pTask); - if (pActiveInfo->pChkptTriggerTmr == NULL) { - pActiveInfo->pChkptTriggerTmr = taosTmrStart(checkpointTriggerMonitorFn, 100, pTask, streamTimer); - } else { - taosTmrReset(checkpointTriggerMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pChkptTriggerTmr); + if (pTmrInfo->tmrHandle == NULL) { + pTmrInfo->tmrHandle = taosTmrStart(checkpointTriggerMonitorFn, 200, pTask, streamTimer); + } else { + taosTmrReset(checkpointTriggerMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); + } + pTmrInfo->launchChkptId = pActiveInfo->activeId; + } else { // already launched, do nothing + stError("s-task:%s previous checkpoint-trigger monitor tmr is set, not start new one", pTask->id.idStr); } } @@ -741,27 +753,28 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { const char* id = pTask->id.idStr; SActiveCheckpointInfo* pActiveInfo = pTask->chkInfo.pActiveInfo; + SStreamTmrInfo* pTmrInfo = &pActiveInfo->chkptTriggerMsgTmr; // check the status every 100ms if (streamTaskShouldStop(pTask)) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d quit from monitor checkpoint-trigger, ref:%d", id, vgId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); return; } - if (++pActiveInfo->checkCounter < 100) { - taosTmrReset(checkpointTriggerMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pChkptTriggerTmr); + if (++pTmrInfo->activeCounter < 50) { + taosTmrReset(checkpointTriggerMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); return; } - pActiveInfo->checkCounter = 0; + pTmrInfo->activeCounter = 0; 
stDebug("s-task:%s vgId:%d checkpoint-trigger monitor in tmr, ts:%" PRId64, id, vgId, now); taosThreadMutexLock(&pTask->lock); SStreamTaskState* pState = streamTaskGetStatus(pTask); if (pState->state != TASK_STATUS__CK) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d not in checkpoint status, quit from monitor checkpoint-trigger, ref:%d", id, vgId, ref); taosThreadMutexUnlock(&pTask->lock); @@ -771,7 +784,7 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { // checkpoint-trigger recv flag is set, quit if (pActiveInfo->allUpstreamTriggerRecv) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d all checkpoint-trigger recv, quit from monitor checkpoint-trigger, ref:%d", id, vgId, ref); @@ -779,7 +792,6 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { streamMetaReleaseTask(pTask->pMeta, pTask); return; } - taosThreadMutexUnlock(&pTask->lock); taosThreadMutexLock(&pActiveInfo->lock); @@ -820,9 +832,9 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { // check every 100ms if (size > 0) { stDebug("s-task:%s start to monitor checkpoint-trigger in 10s", id); - taosTmrReset(checkpointTriggerMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pChkptTriggerTmr); + taosTmrReset(checkpointTriggerMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); } else { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s all checkpoint-trigger recved, quit from monitor checkpoint-trigger tmr, ref:%d", id, ref); streamMetaReleaseTask(pTask->pMeta, pTask); } diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 9615ed49e0..006e55374e 100644 --- a/source/libs/stream/src/streamDispatch.c +++ 
b/source/libs/stream/src/streamDispatch.c @@ -777,31 +777,32 @@ int32_t initCheckpointReadyMsg(SStreamTask* pTask, int32_t upstreamNodeId, int32 } static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { - SStreamTask* pTask = param; - int32_t vgId = pTask->pMeta->vgId; - const char* id = pTask->id.idStr; + SStreamTask* pTask = param; + int32_t vgId = pTask->pMeta->vgId; + const char* id = pTask->id.idStr; + SActiveCheckpointInfo* pActiveInfo = pTask->chkInfo.pActiveInfo; + SStreamTmrInfo* pTmrInfo = &pActiveInfo->chkptReadyMsgTmr; // check the status every 100ms if (streamTaskShouldStop(pTask)) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d status:stop, quit from monitor checkpoint-trigger, ref:%d", id, vgId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); return; } - SActiveCheckpointInfo* pActiveInfo = pTask->chkInfo.pActiveInfo; - if (++pActiveInfo->sendReadyCheckCounter < 100) { - taosTmrReset(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pSendReadyMsgTmr); + if (++pTmrInfo->activeCounter < 50) { + taosTmrReset(checkpointReadyMsgSendMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); return; } - pActiveInfo->sendReadyCheckCounter = 0; - stDebug("s-task:%s in sending checkpoint-ready msg monitor timer", id); + pTmrInfo->activeCounter = 0; + stDebug("s-task:%s in sending checkpoint-ready msg monitor tmr", id); taosThreadMutexLock(&pTask->lock); SStreamTaskState* pState = streamTaskGetStatus(pTask); if (pState->state != TASK_STATUS__CK) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d status:%s not in checkpoint, quit from monitor checkpoint-ready send, ref:%d", id, vgId, pState->name, ref); taosThreadMutexUnlock(&pTask->lock); @@ -815,11 +816,12 @@ static void checkpointReadyMsgSendMonitorFn(void* 
param, void* tmrId) { SArray* pList = pActiveInfo->pReadyMsgList; int32_t num = taosArrayGetSize(pList); - if (pActiveInfo->sendReadyTmrChkptId < pActiveInfo->activeId) { + if (pTmrInfo->launchChkptId < pActiveInfo->activeId) { taosThreadMutexUnlock(&pActiveInfo->lock); - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); - stWarn("s-task:%s vgId:%d tmr launched by previous checkpoint procedure, checkpointId:%" PRId64 ", quit, ref:%d", - id, vgId, pActiveInfo->sendReadyTmrChkptId, ref); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); + stWarn("s-task:%s vgId:%d ready-msg send tmr launched by previous checkpoint procedure, checkpointId:%" PRId64 + ", quit, ref:%d", + id, vgId, pTmrInfo->launchChkptId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); return; @@ -828,7 +830,7 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { // active checkpoint info is cleared for now if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { taosThreadMutexUnlock(&pActiveInfo->lock); - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stWarn("s-task:%s vgId:%d active checkpoint may be cleared, quit from readyMsg send tmr, ref:%d", id, vgId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); @@ -871,10 +873,10 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { } } - taosTmrReset(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pSendReadyMsgTmr); + taosTmrReset(checkpointReadyMsgSendMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); taosThreadMutexUnlock(&pActiveInfo->lock); } else { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug( "s-task:%s vgId:%d recv of checkpoint-ready msg confirmed by all upstream task(s), clear checkpoint-ready msg " "and quit from timer, 
ref:%d", @@ -916,18 +918,25 @@ int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask) { // start to check if checkpoint ready msg has successfully received by upstream tasks. if (pTask->info.taskLevel == TASK_LEVEL__SINK || pTask->info.taskLevel == TASK_LEVEL__AGG) { - int32_t ref = atomic_add_fetch_32(&pTask->status.timerActive, 1); - stDebug("s-task:%s start checkpoint-ready monitor in 10s, ref:%d ", pTask->id.idStr, ref); - streamMetaAcquireOneTask(pTask); + SStreamTmrInfo* pTmrInfo = &pActiveInfo->chkptReadyMsgTmr; - if (pActiveInfo->pSendReadyMsgTmr == NULL) { - pActiveInfo->pSendReadyMsgTmr = taosTmrStart(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer); + int8_t old = atomic_val_compare_exchange_8(&pTmrInfo->isActive, 0, 1); + if (old == 0) { + int32_t ref = atomic_add_fetch_32(&pTask->status.timerActive, 1); + stDebug("s-task:%s start checkpoint-ready monitor in 10s, ref:%d ", pTask->id.idStr, ref); + streamMetaAcquireOneTask(pTask); + + if (pTmrInfo->tmrHandle == NULL) { + pTmrInfo->tmrHandle = taosTmrStart(checkpointReadyMsgSendMonitorFn, 200, pTask, streamTimer); + } else { + taosTmrReset(checkpointReadyMsgSendMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); + } + + // mark the timer monitor checkpointId + pTmrInfo->launchChkptId = pActiveInfo->activeId; } else { - taosTmrReset(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pSendReadyMsgTmr); + stError("s-task:%s previous checkpoint-ready monitor tmr is set, not start new one", pTask->id.idStr); } - - // mark the timer monitor checkpointId - pActiveInfo->sendReadyTmrChkptId = pActiveInfo->activeId; } taosThreadMutexUnlock(&pActiveInfo->lock); diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index 4cbe0cb136..d63b6ea935 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -1064,14 +1064,16 @@ void streamTaskDestroyActiveChkptInfo(SActiveCheckpointInfo* pInfo) { 
taosArrayDestroy(pInfo->pCheckpointReadyRecvList); pInfo->pCheckpointReadyRecvList = NULL; - if (pInfo->pChkptTriggerTmr != NULL) { - taosTmrStop(pInfo->pChkptTriggerTmr); - pInfo->pChkptTriggerTmr = NULL; + SStreamTmrInfo* pTriggerTmr = &pInfo->chkptTriggerMsgTmr; + if (pTriggerTmr->tmrHandle != NULL) { + taosTmrStop(pTriggerTmr->tmrHandle); + pTriggerTmr->tmrHandle = NULL; } - if (pInfo->pSendReadyMsgTmr != NULL) { - taosTmrStop(pInfo->pSendReadyMsgTmr); - pInfo->pSendReadyMsgTmr = NULL; + SStreamTmrInfo* pReadyTmr = &pInfo->chkptReadyMsgTmr; + if (pReadyTmr->tmrHandle != NULL) { + taosTmrStop(pReadyTmr->tmrHandle); + pReadyTmr->tmrHandle = NULL; } taosMemoryFree(pInfo); diff --git a/source/libs/stream/src/streamTimer.c b/source/libs/stream/src/streamTimer.c index 6e956e2682..4838d76fe0 100644 --- a/source/libs/stream/src/streamTimer.c +++ b/source/libs/stream/src/streamTimer.c @@ -38,3 +38,13 @@ void streamTimerCleanUp() { tmr_h streamTimerGetInstance() { return streamTimer; } + +int32_t streamCleanBeforeQuitTmr(SStreamTmrInfo* pInfo, SStreamTask* pTask) { + pInfo->activeCounter = 0; + pInfo->launchChkptId = 0; + atomic_store_8(&pInfo->isActive, 0); + + int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + ASSERT(ref >= 0); + return ref; +} \ No newline at end of file From 0cca12ab52aa1506b80154d8784ed3b6b1da3daa Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 18 Jul 2024 15:49:49 +0800 Subject: [PATCH 09/34] fix(stream): add some logs. 
--- source/common/src/rsync.c | 67 ++++++++++++++--------------- source/libs/stream/src/streamMeta.c | 14 +++++- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/source/common/src/rsync.c b/source/common/src/rsync.c index 36d634c305..f2f6796fb0 100644 --- a/source/common/src/rsync.c +++ b/source/common/src/rsync.c @@ -278,41 +278,6 @@ int32_t downloadByRsync(const char* id, const char* path, int64_t checkpointId) uDebug("[rsync] %s start to sync data from remote to:%s, cmd:%s", id, path, command); -// while (times++ < MAX_RETRY) { - code = execCommand(command); - if (code != TSDB_CODE_SUCCESS) { - uError("[rsync] %s download checkpointId:%" PRId64 - " data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, - id, checkpointId, path, times, code, ERRNO_ERR_DATA); -// taosSsleep(1); - } else { - int32_t el = taosGetTimestampMs() - st; - uDebug("[rsync] %s download checkpointId:%" PRId64 " data:%s successfully, elapsed time:%dms", id, checkpointId, - path, el); -// break; - } -// } - - // if failed, try to load it from data directory -#ifdef WINDOWS - memset(pathTransform, 0, PATH_MAX); - changeDirFromWindowsToLinux(path, pathTransform); -#endif - - memset(command, 0, PATH_MAX); - snprintf( - command, PATH_MAX, - "rsync -av --debug=all --log-file=%s/rsynclog --timeout=10 --bwlimit=100000 rsync://%s/checkpoint/%s/data/ %s", - tsLogDir, tsSnodeAddress, id, -#ifdef WINDOWS - pathTransform -#else - path -#endif - ); - - uDebug("[rsync] %s start to sync data from remote data dir to:%s, cmd:%s", id, path, command); - code = execCommand(command); if (code != TSDB_CODE_SUCCESS) { uError("[rsync] %s download checkpointId:%" PRId64 @@ -324,6 +289,38 @@ int32_t downloadByRsync(const char* id, const char* path, int64_t checkpointId) path, el); } + if (code != TSDB_CODE_SUCCESS) { // if failed, try to load it from data directory +#ifdef WINDOWS + memset(pathTransform, 0, PATH_MAX); + changeDirFromWindowsToLinux(path, pathTransform); +#endif + + 
memset(command, 0, PATH_MAX); + snprintf( + command, PATH_MAX, + "rsync -av --debug=all --log-file=%s/rsynclog --timeout=10 --bwlimit=100000 rsync://%s/checkpoint/%s/data/ %s", + tsLogDir, tsSnodeAddress, id, +#ifdef WINDOWS + pathTransform +#else + path +#endif + ); + + uDebug("[rsync] %s start to sync data from remote data dir to:%s, cmd:%s", id, path, command); + + code = execCommand(command); + if (code != TSDB_CODE_SUCCESS) { + uError("[rsync] %s download checkpointId:%" PRId64 + " data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, + id, checkpointId, path, times, code, ERRNO_ERR_DATA); + } else { + int32_t el = taosGetTimestampMs() - st; + uDebug("[rsync] %s download checkpointId:%" PRId64 " data:%s successfully, elapsed time:%dms", id, checkpointId, + path, el); + } + } + return code; } diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index d2c957422b..ebc0a864fc 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -273,7 +273,19 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, SStreamTask* pTask, const char* key) pBackend->pTask = pTask; pBackend->pMeta = pMeta; - if (processVer != -1) pTask->chkInfo.processedVer = processVer; + if (processVer != -1) { + if (pTask->chkInfo.processedVer != processVer) { + stWarn("s-task:%s vgId:%d update checkpointVer:%" PRId64 "->%" PRId64 " for checkpointId:%" PRId64, + pTask->id.idStr, pTask->pMeta->vgId, pTask->chkInfo.processedVer, processVer, pTask->chkInfo.checkpointId); + pTask->chkInfo.processedVer = processVer; + pTask->chkInfo.checkpointVer = processVer; + pTask->chkInfo.nextProcessVer = processVer + 1; + } else { + stInfo("s-task:%s vgId:%d processedVer:%" PRId64 + " in task meta equals to data in checkpoint data for checkpointId:%" PRId64, + pTask->id.idStr, pTask->pMeta->vgId, pTask->chkInfo.processedVer, pTask->chkInfo.checkpointId); + } + } taosHashPut(pMeta->pTaskDbUnique, key, strlen(key), &pBackend, 
sizeof(void*)); taosThreadMutexUnlock(&pMeta->backendMutex); From aaf67a42eb06bf9bd52eb3acb7d7459378f74d8d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 19 Jul 2024 16:41:39 +0800 Subject: [PATCH 10/34] fix(stream): fix race condition in handling dispatch rsp. --- source/libs/stream/src/streamCheckpoint.c | 15 +-- source/libs/stream/src/streamDispatch.c | 113 ++++++++++++---------- 2 files changed, 70 insertions(+), 58 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 96a614f6a4..59075c47b2 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -799,10 +799,15 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { // send msg to retrieve checkpoint trigger msg SArray* pList = pTask->upstreamInfo.pList; ASSERT(pTask->info.taskLevel > TASK_LEVEL__SOURCE); + SArray* pNotSendList = taosArrayInit(4, sizeof(SStreamUpstreamEpInfo)); if (pNotSendList == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - stDebug("s-task:%s start to triggerMonitor, reason:%s", id, tstrerror(terrno)); + stError("s-task:%s quit tmr function due to out of memory", id); + taosThreadMutexUnlock(&pActiveInfo->lock); + + stDebug("s-task:%s start to monitor checkpoint-trigger in 10s", id); + taosTmrReset(checkpointTriggerMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); return; } @@ -967,18 +972,14 @@ void streamTaskInitTriggerDispatchInfo(SStreamTask* pTask) { taosThreadMutexUnlock(&pInfo->lock); } -int32_t streamTaskGetNumOfConfirmed(SStreamTask* pTask) { - SActiveCheckpointInfo* pInfo = pTask->chkInfo.pActiveInfo; - +int32_t streamTaskGetNumOfConfirmed(SActiveCheckpointInfo* pInfo) { int32_t num = 0; - taosThreadMutexLock(&pInfo->lock); for (int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { STaskTriggerSendInfo* p = taosArrayGet(pInfo->pDispatchTriggerList, i); if (p->recved) { num++; } } - taosThreadMutexUnlock(&pInfo->lock); return num; 
} @@ -1000,9 +1001,9 @@ void streamTaskSetTriggerDispatchConfirmed(SStreamTask* pTask, int32_t vgId) { } } + int32_t numOfConfirmed = streamTaskGetNumOfConfirmed(pInfo); taosThreadMutexUnlock(&pInfo->lock); - int32_t numOfConfirmed = streamTaskGetNumOfConfirmed(pTask); int32_t total = streamTaskGetNumOfDownstream(pTask); stDebug("s-task:%s set downstream:0x%x(vgId:%d) checkpoint-trigger dispatch confirmed, total confirmed:%d/%d", pTask->id.idStr, taskId, vgId, numOfConfirmed, total); diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 006e55374e..6fec79eb04 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -501,7 +501,7 @@ static void doMonitorDispatchData(void* param, void* tmrId) { int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); stDebug("s-task:%s not in dispatch procedure, abort from timer, ref:%d", pTask->id.idStr, ref); - pTask->msgInfo.inMonitor = 0; + pTask->msgInfo.inMonitor = 0; // set not in dispatch monitor taosThreadMutexUnlock(&pMsgInfo->lock); return; } @@ -1211,44 +1211,51 @@ static int32_t handleDispatchSuccessRsp(SStreamTask* pTask, int32_t downstreamId return 0; } -static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t code, int64_t now, int32_t* pNotRsp, const char* id) { +static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t code, int64_t now, int32_t* pNotRsp, + int32_t* pFailed, const char* id) { int32_t numOfRsp = 0; - bool alreadySet = false; - bool updated = false; - bool allRsp = false; - *pNotRsp = 0; + int32_t numOfFailed = 0; - taosThreadMutexLock(&pMsgInfo->lock); + bool allRsp = false; int32_t numOfDispatchBranch = taosArrayGetSize(pMsgInfo->pSendInfo); - for(int32_t i = 0; i < numOfDispatchBranch; ++i) { + *pNotRsp = 0; + *pFailed = 0; + + for (int32_t i = 0; i < numOfDispatchBranch; ++i) { SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, i); if 
(pEntry->rspTs != -1) { numOfRsp += 1; + } else { + if (pEntry->status != TSDB_CODE_SUCCESS || isDispatchRspTimeout(pEntry, now)) { + numOfFailed += 1; + } } } for (int32_t j = 0; j < numOfDispatchBranch; ++j) { SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, j); if (pEntry->nodeId == vgId) { - ASSERT(!alreadySet); - pEntry->rspTs = now; - pEntry->status = code; - alreadySet = true; - updated = true; - numOfRsp += 1; + if (pEntry->rspTs != -1) { + stDebug("s-task:%s dispatch rsp has already recved at:%" PRId64 ", ignore this rsp, msgId:%d", id, + pEntry->rspTs, pMsgInfo->msgId); + allRsp = false; + } else { + pEntry->rspTs = now; + pEntry->status = code; + numOfRsp += 1; + allRsp = (numOfRsp == numOfDispatchBranch); - stDebug("s-task:%s record the rsp recv, ts:%" PRId64 " code:%d, idx:%d, total recv:%d/%d", id, now, code, j, - numOfRsp, numOfDispatchBranch); + stDebug("s-task:%s record the rsp recv, ts:%" PRId64 " code:%d, idx:%d, total recv:%d/%d", id, now, code, j, + numOfRsp, numOfDispatchBranch); + } + break; } } + *pFailed = numOfFailed; *pNotRsp = numOfDispatchBranch - numOfRsp; - allRsp = (numOfRsp == numOfDispatchBranch); - taosThreadMutexUnlock(&pMsgInfo->lock); - - ASSERT(updated); return allRsp; } @@ -1277,15 +1284,23 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i int64_t now = taosGetTimestampMs(); bool allRsp = false; int32_t notRsp = 0; + int32_t numOfFailed = 0; + bool triggerDispatchRsp = false; + + // we only set the dispatch msg info for current checkpoint trans + taosThreadMutexLock(&pTask->lock); + triggerDispatchRsp = (streamTaskGetStatus(pTask)->state == TASK_STATUS__CK) && + (pTask->chkInfo.pActiveInfo->activeId == pMsgInfo->checkpointId); + taosThreadMutexUnlock(&pTask->lock); taosThreadMutexLock(&pMsgInfo->lock); - int32_t msgId = pMsgInfo->msgId; - taosThreadMutexUnlock(&pMsgInfo->lock); + int32_t msgId = pMsgInfo->msgId; // follower not handle the dispatch rsp if ((pTask->pMeta->role == 
NODE_ROLE_FOLLOWER) || (pTask->status.downstreamReady != 1)) { stError("s-task:%s vgId:%d is follower or task just re-launched, not handle the dispatch rsp, discard it", id, vgId); + taosThreadMutexUnlock(&pMsgInfo->lock); return TSDB_CODE_STREAM_TASK_NOT_EXIST; } @@ -1294,6 +1309,7 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i stError("s-task:%s vgId:%d not expect rsp, expected: msgId:%d, stage:%" PRId64 " actual msgId:%d, stage:%" PRId64 " discard it", id, vgId, msgId, pTask->pMeta->stage, pRsp->msgId, pRsp->stage); + taosThreadMutexUnlock(&pMsgInfo->lock); return TSDB_CODE_INVALID_MSG; } @@ -1305,18 +1321,18 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i if (code == TSDB_CODE_STREAM_TASK_NOT_EXIST) { // destination task does not exist, not retry anymore stError("s-task:%s failed to dispatch msg to task:0x%x(vgId:%d), msgId:%d no retry, since task destroyed already", id, pRsp->downstreamTaskId, pRsp->downstreamNodeId, msgId); - allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, &notRsp, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, &notRsp, &numOfFailed, id); } else { stError("s-task:%s failed to dispatch msgId:%d to task:0x%x(vgId:%d), code:%s, add to retry list", id, msgId, pRsp->downstreamTaskId, pRsp->downstreamNodeId, tstrerror(code)); - allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, code, now, &notRsp, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, code, now, &notRsp, &numOfFailed, id); } } else { // code == 0 if (pRsp->inputStatus == TASK_INPUT_STATUS__BLOCKED) { pTask->inputq.status = TASK_INPUT_STATUS__BLOCKED; // block the input of current task, to push pressure to upstream - allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, pRsp->inputStatus, now, &notRsp, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, pRsp->inputStatus, now, &notRsp,
&numOfFailed, id); stTrace("s-task:%s inputQ of downstream task:0x%x(vgId:%d) is full, wait for retry dispatch", id, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } else { @@ -1328,15 +1344,13 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i id, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } - allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, &notRsp, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, &notRsp, &numOfFailed, id); { bool delayDispatch = (pMsgInfo->dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); if (delayDispatch) { - taosThreadMutexLock(&pTask->lock); // we only set the dispatch msg info for current checkpoint trans - if (streamTaskGetStatus(pTask)->state == TASK_STATUS__CK && - pTask->chkInfo.pActiveInfo->activeId == pMsgInfo->checkpointId) { + if (triggerDispatchRsp) { ASSERT(pTask->chkInfo.pActiveInfo->transId == pMsgInfo->transId); stDebug("s-task:%s checkpoint-trigger msg to 0x%x rsp for checkpointId:%" PRId64 " transId:%d confirmed", pTask->id.idStr, pRsp->downstreamTaskId, pMsgInfo->checkpointId, pMsgInfo->transId); @@ -1347,12 +1361,13 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i " transId:%d discard, since expired", pTask->id.idStr, pMsgInfo->checkpointId, pMsgInfo->transId); } - taosThreadMutexUnlock(&pTask->lock); } } } } + taosThreadMutexUnlock(&pMsgInfo->lock); + if (pTask->outputInfo.type == TASK_OUTPUT__SHUFFLE_DISPATCH) { if (!allRsp) { stDebug( @@ -1371,29 +1386,25 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i } // all msg rsp already, continue - if (allRsp) { - ASSERT(pTask->outputq.status == TASK_OUTPUT_STATUS__WAIT); + // we need to re-try send dispatch msg to downstream tasks + if (allRsp && (numOfFailed == 0)) { + // trans-state msg has been sent to downstream successfully.
let's transfer the fill-history task state + if (pMsgInfo->dispatchMsgType == STREAM_INPUT__TRANS_STATE) { + stDebug("s-task:%s dispatch trans-state msgId:%d to downstream successfully, start to prepare transfer state", id, + msgId); + ASSERT(pTask->info.fillHistory == 1); - // we need to re-try send dispatch msg to downstream tasks - int32_t numOfFailed = getFailedDispatchInfo(pMsgInfo, now); - if (numOfFailed == 0) { // this message has been sent successfully, let's try next one. - // trans-state msg has been sent to downstream successfully. let's transfer the fill-history task state - if (pMsgInfo->dispatchMsgType == STREAM_INPUT__TRANS_STATE) { - stDebug("s-task:%s dispatch trans-state msgId:%d to downstream successfully, start to prepare transfer state", - id, msgId); - ASSERT(pTask->info.fillHistory == 1); - - code = streamTransferStatePrepare(pTask); - if (code != TSDB_CODE_SUCCESS) { // todo: do nothing if error happens - } - - clearBufferedDispatchMsg(pTask); - - // now ready for next data output - atomic_store_8(&pTask->outputq.status, TASK_OUTPUT_STATUS__NORMAL); - } else { - handleDispatchSuccessRsp(pTask, pRsp->downstreamTaskId, pRsp->downstreamNodeId); + code = streamTransferStatePrepare(pTask); + if (code != TSDB_CODE_SUCCESS) { // todo: do nothing if error happens } + + clearBufferedDispatchMsg(pTask); + + // now ready for next data output + atomic_store_8(&pTask->outputq.status, TASK_OUTPUT_STATUS__NORMAL); + } else { + // this message has been sent successfully, let's try next one. + handleDispatchSuccessRsp(pTask, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } } From da4018931b73736f373ac74af7ac6bfc86e5550b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 20 Jul 2024 15:35:15 +0800 Subject: [PATCH 11/34] fix(stream): calculate the error code after set current rsp status. 
--- source/libs/stream/src/streamDispatch.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 6fec79eb04..dd55884689 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -1226,10 +1226,6 @@ static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, i); if (pEntry->rspTs != -1) { numOfRsp += 1; - } else { - if (pEntry->status != TSDB_CODE_SUCCESS || isDispatchRspTimeout(pEntry, now)) { - numOfFailed += 1; - } } } @@ -1253,6 +1249,14 @@ static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t } } + // this code may be error code. + for (int32_t i = 0; i < numOfDispatchBranch; ++i) { + SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, i); + if (pEntry->status != TSDB_CODE_SUCCESS || isDispatchRspTimeout(pEntry, now)) { + numOfFailed += 1; + } + } + *pFailed = numOfFailed; *pNotRsp = numOfDispatchBranch - numOfRsp; From ad96333336a8e16b1e8c3bdfbbbb845061399505 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 23 Jul 2024 17:16:40 +0800 Subject: [PATCH 12/34] fix(stream): discard the processed hbmsg in the mnode. --- source/dnode/mnode/impl/inc/mndStream.h | 1 + source/dnode/mnode/impl/src/mndStreamHb.c | 24 +++++++++++++++++++++ source/dnode/mnode/impl/src/mndStreamUtil.c | 3 +++ 3 files changed, 28 insertions(+) diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index 0b6b6a9ef2..b7aa398e59 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -73,6 +73,7 @@ typedef struct SNodeEntry { bool stageUpdated; // the stage has been updated due to the leader/follower change or node reboot. SEpSet epset; // compare the epset to identify the vgroup tranferring between different dnodes. 
int64_t hbTimestamp; // second + int32_t lastHbMsgId; // latest hb msgId } SNodeEntry; typedef struct SOrphanTask { diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index bc10ec211d..04dd135320 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -265,6 +265,30 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { return -1; } + for(int32_t i = 0; i < taosArrayGetSize(execInfo.pNodeList); ++i) { + SNodeEntry* pEntry = taosArrayGet(execInfo.pNodeList, i); + if (pEntry == NULL) { + continue; + } + + if (pEntry->nodeId != req.vgId) { + continue; + } + + if (pEntry->lastHbMsgId == req.msgId) { + mError("vgId:%d Hb msgId:%d already handled, discard", pEntry->nodeId, req.msgId); + + terrno = TSDB_CODE_INVALID_MSG; + doSendHbMsgRsp(terrno, &pReq->info, req.vgId, req.msgId); + + taosThreadMutexUnlock(&execInfo.lock); + cleanupAfterProcessHbMsg(&req, pFailedChkpt, pOrphanTasks); + return -1; + } else { + pEntry->lastHbMsgId = req.msgId; + } + } + int32_t numOfUpdated = taosArrayGetSize(req.pUpdateNodes); if (numOfUpdated > 0) { mDebug("%d stream node(s) need updated from hbMsg(vgId:%d)", numOfUpdated, req.vgId); diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index e4e30bdf10..23eb3656da 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -607,7 +607,10 @@ void removeExpiredNodeInfo(const SArray *pNodeSnapshot) { for (int32_t j = 0; j < size; ++j) { SNodeEntry *pEntry = taosArrayGet(pNodeSnapshot, j); if (pEntry->nodeId == p->nodeId) { + p->hbTimestamp = pEntry->hbTimestamp; + taosArrayPush(pValidList, p); + mDebug("vgId:%d ts:%"PRId64" HbMsgId:%d is valid", p->nodeId, p->hbTimestamp, p->lastHbMsgId); break; } } From b57b263534b4d0368ea69cffc1e4586a7b2e7d0c Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 14:40:10 +0800 Subject: [PATCH 13/34] 
fix(stream): add check. --- source/libs/stream/src/streamTask.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index 979d1960f1..e8ff1552e8 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -602,7 +602,7 @@ int32_t streamTaskStop(SStreamTask* pTask) { stError("failed to handle STOP event, s-task:%s", id); } - if (pTask->info.taskLevel != TASK_LEVEL__SINK) { + if ((pTask->info.taskLevel != TASK_LEVEL__SINK) && (pTask->exec.pExecutor != NULL)) { code = qKillTask(pTask->exec.pExecutor, TSDB_CODE_SUCCESS); if (code) { stError("s-task:%s failed to kill task related query handle", id); From aefb9d275e295e446ee621d71a13adb8fcb192a2 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 15:22:37 +0800 Subject: [PATCH 14/34] fix(stream): add ts in HbMsg. --- include/libs/stream/streamMsg.h | 1 + source/dnode/mnode/impl/inc/mndStream.h | 1 + source/dnode/mnode/impl/src/mndStream.c | 4 ++-- source/dnode/mnode/impl/src/mndStreamHb.c | 11 +++++++---- source/libs/stream/src/streamHb.c | 7 ++++--- source/libs/stream/src/streamMsg.c | 2 ++ tests/script/tsim/stream/checkpointInterval0.sim | 2 ++ 7 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/libs/stream/streamMsg.h b/include/libs/stream/streamMsg.h index 34921daac3..0ceaa93a72 100644 --- a/include/libs/stream/streamMsg.h +++ b/include/libs/stream/streamMsg.h @@ -164,6 +164,7 @@ int32_t tDecodeStreamTaskCheckpointReq(SDecoder* pDecoder, SStreamTaskCheckpoint typedef struct SStreamHbMsg { int32_t vgId; int32_t msgId; + int64_t ts; int32_t numOfTasks; SArray* pTaskStatus; // SArray SArray* pUpdateNodes; // SArray, needs update the epsets in stream tasks for those nodes. 
diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index d253e58703..89343ce37c 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -80,6 +80,7 @@ typedef struct SNodeEntry { SEpSet epset; // compare the epset to identify the vgroup tranferring between different dnodes. int64_t hbTimestamp; // second int32_t lastHbMsgId; // latest hb msgId + int64_t lastHbMsgTs; } SNodeEntry; typedef struct { diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index b7ab76984a..a1fd75c774 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2085,7 +2085,7 @@ static int32_t refreshNodeListFromExistedStreams(SMnode *pMnode, SArray *pNodeLi break; } - SNodeEntry entry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId}; + SNodeEntry entry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId, .lastHbMsgId = -1}; epsetAssign(&entry.epset, &pTask->info.epSet); (void)taosHashPut(pHash, &entry.nodeId, sizeof(entry.nodeId), &entry, sizeof(entry)); } @@ -2265,7 +2265,7 @@ void saveTaskAndNodeInfoIntoBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode) } if (!exist) { - SNodeEntry nodeEntry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId}; + SNodeEntry nodeEntry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId, .lastHbMsgId = -1}; epsetAssign(&nodeEntry.epset, &pTask->info.epSet); void* px = taosArrayPush(pExecNode->pNodeList, &nodeEntry); diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index bba39d0c98..50db903520 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -333,7 +333,8 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { } tDecoderClear(&decoder); - mDebug("receive stream-meta hb from vgId:%d, active numOfTasks:%d, msgId:%d", req.vgId, req.numOfTasks, req.msgId); + mDebug("receive stream-meta hb 
from vgId:%d, active numOfTasks:%d, HbMsgId:%d, HbMsgTs:%" PRId64, req.vgId, + req.numOfTasks, req.msgId, req.ts); pFailedChkpt = taosArrayInit(4, sizeof(SFailedCheckpointInfo)); pOrphanTasks = taosArrayInit(4, sizeof(SOrphanTask)); @@ -366,17 +367,18 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { continue; } - if (pEntry->lastHbMsgId == req.msgId) { - mError("vgId:%d Hb msgId:%d already handled, discard", pEntry->nodeId, req.msgId); + if ((pEntry->lastHbMsgId == req.msgId) && (pEntry->lastHbMsgTs == req.ts)) { + mError("vgId:%d HbMsgId:%d already handled, bh msg discard", pEntry->nodeId, req.msgId); terrno = TSDB_CODE_INVALID_MSG; doSendHbMsgRsp(terrno, &pReq->info, req.vgId, req.msgId); streamMutexUnlock(&execInfo.lock); cleanupAfterProcessHbMsg(&req, pFailedChkpt, pOrphanTasks); - return -1; + return terrno; } else { pEntry->lastHbMsgId = req.msgId; + pEntry->lastHbMsgTs = req.ts; } } @@ -417,6 +419,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { SStreamObj *pStream = NULL; code = mndGetStreamObj(pMnode, p->id.streamId, &pStream); if (code) { + mError("stream obj not exist, failed to handle consensus checkpoint-info req, code:%s", tstrerror(code)); continue; } diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index 9804943ec2..a158d6e4bb 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -142,11 +142,12 @@ int32_t streamMetaSendHbHelper(SStreamMeta* pMeta) { } SStreamHbMsg* pMsg = &pInfo->hbMsg; - stDebug("vgId:%d build stream hbMsg, leader:%d msgId:%d", pMeta->vgId, (pMeta->role == NODE_ROLE_LEADER), - pMeta->pHbInfo->hbCount); - pMsg->vgId = pMeta->vgId; pMsg->msgId = pMeta->pHbInfo->hbCount; + pMsg->ts = taosGetTimestampMs(); + + stDebug("vgId:%d build stream hbMsg, leader:%d HbMsgId:%d, HbMsgTs:%" PRId64, pMeta->vgId, + (pMeta->role == NODE_ROLE_LEADER), pMsg->msgId, pMsg->ts); pMsg->pTaskStatus = taosArrayInit(numOfTasks, sizeof(STaskStatusEntry)); pMsg->pUpdateNodes = 
taosArrayInit(numOfTasks, sizeof(int32_t)); diff --git a/source/libs/stream/src/streamMsg.c b/source/libs/stream/src/streamMsg.c index bc0faacb32..75cb0e6683 100644 --- a/source/libs/stream/src/streamMsg.c +++ b/source/libs/stream/src/streamMsg.c @@ -382,6 +382,7 @@ int32_t tEncodeStreamHbMsg(SEncoder* pEncoder, const SStreamHbMsg* pReq) { } if (tEncodeI32(pEncoder, pReq->msgId) < 0) return -1; + if (tEncodeI64(pEncoder, pReq->ts) < 0) return -1; tEndEncode(pEncoder); return pEncoder->pos; } @@ -454,6 +455,7 @@ int32_t tDecodeStreamHbMsg(SDecoder* pDecoder, SStreamHbMsg* pReq) { } if (tDecodeI32(pDecoder, &pReq->msgId) < 0) return -1; + if (tDecodeI64(pDecoder, &pReq->ts) < 0) return -1; tEndDecode(pDecoder); return 0; diff --git a/tests/script/tsim/stream/checkpointInterval0.sim b/tests/script/tsim/stream/checkpointInterval0.sim index a548f05c82..a5e5c87704 100644 --- a/tests/script/tsim/stream/checkpointInterval0.sim +++ b/tests/script/tsim/stream/checkpointInterval0.sim @@ -76,6 +76,8 @@ system sh/stop_dnodes.sh system sh/exec.sh -n dnode1 -s start +run tsim/stream/checkTaskStatus.sim + sql insert into t1 values(1648791213002,3,2,3,1.1); $loop_count = 0 From 6539760c647292f9c98988120eb43dc95962dc42 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 31 Jul 2024 19:15:17 +0800 Subject: [PATCH 15/34] fix(stream): fix dead lock caused by refactor. 
--- source/libs/stream/src/streamCheckpoint.c | 13 +++++++++---- source/libs/stream/src/streamDispatch.c | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 12453f8b0e..d777883015 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -402,6 +402,7 @@ int32_t streamProcessCheckpointReadyMsg(SStreamTask* pTask, int64_t checkpointId for (int32_t i = 0; i < size; ++i) { STaskDownstreamReadyInfo* p = taosArrayGet(pInfo->pCheckpointReadyRecvList, i); if (p == NULL) { + streamMutexUnlock(&pInfo->lock); return TSDB_CODE_INVALID_PARA; } @@ -445,6 +446,7 @@ int32_t streamTaskProcessCheckpointReadyRsp(SStreamTask* pTask, int32_t upstream for (int32_t i = 0; i < taosArrayGetSize(pInfo->pReadyMsgList); ++i) { STaskCheckpointReadyInfo* pReadyInfo = taosArrayGet(pInfo->pReadyMsgList, i); if (pReadyInfo == NULL) { + streamMutexUnlock(&pInfo->lock); return TSDB_CODE_INVALID_PARA; } @@ -459,6 +461,7 @@ int32_t streamTaskProcessCheckpointReadyRsp(SStreamTask* pTask, int32_t upstream for (int32_t i = 0; i < taosArrayGetSize(pInfo->pReadyMsgList); ++i) { STaskCheckpointReadyInfo* pReadyInfo = taosArrayGet(pInfo->pReadyMsgList, i); if (pReadyInfo == NULL) { + streamMutexUnlock(&pInfo->lock); return TSDB_CODE_INVALID_PARA; } @@ -843,7 +846,7 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { SArray* pNotSendList = taosArrayInit(4, sizeof(SStreamUpstreamEpInfo)); if (pNotSendList == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - stError("s-task:%s quit tmr function due to out of memory", id); + stDebug("s-task:%s start to triggerMonitor, reason:%s", id, tstrerror(terrno)); streamMutexUnlock(&pActiveInfo->lock); stDebug("s-task:%s start to monitor checkpoint-trigger in 10s", id); @@ -956,13 +959,14 @@ bool streamTaskAlreadySendTrigger(SStreamTask* pTask, int32_t downstreamNodeId) streamMutexLock(&pInfo->lock); if 
(!pInfo->dispatchTrigger) { - streamMutexUnlock(&pTask->lock); + streamMutexUnlock(&pInfo->lock); return false; } for (int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { STaskTriggerSendInfo* pSendInfo = taosArrayGet(pInfo->pDispatchTriggerList, i); if (pSendInfo == NULL) { + streamMutexUnlock(&pInfo->lock); return TSDB_CODE_INVALID_PARA; } @@ -982,11 +986,11 @@ bool streamTaskAlreadySendTrigger(SStreamTask* pTask, int32_t downstreamNodeId) id, pSendInfo->sendTs, before, pInfo->activeId, pInfo->transId); } - streamMutexUnlock(&pTask->lock); + streamMutexUnlock(&pInfo->lock); return true; } - ASSERT(0); + streamMutexUnlock(&pInfo->lock); return false; } @@ -1043,6 +1047,7 @@ int32_t streamTaskGetNumOfConfirmed(SActiveCheckpointInfo* pInfo) { for (int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { STaskTriggerSendInfo* p = taosArrayGet(pInfo->pDispatchTriggerList, i); if (p == NULL) { + streamMutexUnlock(&pInfo->lock); return num; } diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 96a9f2d297..f6e827b745 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -919,8 +919,8 @@ int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask) { STaskCheckpointReadyInfo* pInfo = taosArrayGet(pList, i); SRpcMsg msg = {0}; - int32_t code = initCheckpointReadyMsg(pTask, pInfo->upstreamNodeId, pInfo->upstreamTaskId, pInfo->childId, pInfo->checkpointId, - &msg); + int32_t code = initCheckpointReadyMsg(pTask, pInfo->upstreamNodeId, pInfo->upstreamTaskId, pInfo->childId, + pInfo->checkpointId, &msg); if (code == TSDB_CODE_SUCCESS) { code = tmsgSendReq(&pInfo->upstreamNodeEpset, &msg); if (code == TSDB_CODE_SUCCESS) { From e8f6454d179fa3f754350b80b77ccb12c9cb431b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 31 Jul 2024 19:24:41 +0800 Subject: [PATCH 16/34] fix(stream): compare vg replica according to different db. 
--- source/dnode/mnode/impl/src/mndStreamUtil.c | 114 +++++++++++++------- 1 file changed, 74 insertions(+), 40 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index c375b46627..b5a612f058 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -88,18 +88,48 @@ void destroyStreamTaskIter(SStreamTaskIter* pIter) { taosMemoryFree(pIter); } -int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { - SSdb *pSdb = pMnode->pSdb; - void *pIter = NULL; - SVgObj *pVgroup = NULL; - int32_t replica = -1; // do the replica check - int32_t code = 0; +static bool checkStatusForEachReplica(SVgObj *pVgroup) { + for (int32_t i = 0; i < pVgroup->replica; ++i) { + if (!pVgroup->vnodeGid[i].syncRestore) { + mInfo("vgId:%d not restored, not ready for checkpoint or other operations", pVgroup->vgId); + return false; + } + + ESyncState state = pVgroup->vnodeGid[i].syncState; + if (state == TAOS_SYNC_STATE_OFFLINE || state == TAOS_SYNC_STATE_ERROR || state == TAOS_SYNC_STATE_LEARNER || + state == TAOS_SYNC_STATE_CANDIDATE) { + mInfo("vgId:%d state:%d , not ready for checkpoint or other operations, not check other vgroups", pVgroup->vgId, + state); + return false; + } + } + + return true; +} + +int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { + SSdb *pSdb = pMnode->pSdb; + void *pIter = NULL; + SVgObj *pVgroup = NULL; + int32_t code = 0; + SArray *pVgroupList = NULL; + SHashObj *pHash = NULL; + + pVgroupList = taosArrayInit(4, sizeof(SNodeEntry)); + if (pVgroupList == NULL) { + mError("failed to prepare arraylist during take vgroup snapshot, code:%s", tstrerror(terrno)); + code = terrno; + goto _err; + } + + pHash = taosHashInit(10, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_NO_LOCK); + if (pHash == NULL) { + mError("failed to prepare hashmap during take vgroup snapshot, code:%s", 
tstrerror(terrno)); + code = terrno; + goto _err; + } *allReady = true; - SArray *pVgroupList = taosArrayInit(4, sizeof(SNodeEntry)); - if (pVgroupList == NULL) { - return terrno; - } while (1) { pIter = sdbFetch(pSdb, SDB_VGROUP, pIter, (void **)&pVgroup); @@ -110,44 +140,37 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { SNodeEntry entry = {.nodeId = pVgroup->vgId, .hbTimestamp = pVgroup->updateTime}; entry.epset = mndGetVgroupEpset(pMnode, pVgroup); - if (replica == -1) { - replica = pVgroup->replica; - } else { - if (replica != pVgroup->replica) { - mInfo("vgId:%d replica:%d inconsistent with other vgroups replica:%d, not ready for stream operations", - pVgroup->vgId, pVgroup->replica, replica); - *allReady = false; + int8_t *pReplica = taosHashGet(pHash, &pVgroup->dbUid, sizeof(pVgroup->dbUid)); + if (pReplica == NULL) { // not exist, add it into hash map + code = taosHashPut(pHash, &pVgroup->dbUid, sizeof(pVgroup->dbUid), &pVgroup->replica, sizeof(pVgroup->replica)); + if (code) { + mError("failed to put info into hashmap during task vgroup snapshot, code:%s", tstrerror(code)); sdbRelease(pSdb, pVgroup); - break; + goto _err; // take snapshot failed, and not all ready + } + } else { + if (*pReplica != pVgroup->replica) { + mInfo("vgId:%d replica:%d inconsistent with other vgroups replica:%d, not ready for stream operations", + pVgroup->vgId, pVgroup->replica, *pReplica); + *allReady = false; // task snap success, but not all ready } } // if not all ready till now, no need to check the remaining vgroups. 
+ // but still we need to put the info of the existed vgroups into the snapshot list if (*allReady) { - for (int32_t i = 0; i < pVgroup->replica; ++i) { - if (!pVgroup->vnodeGid[i].syncRestore) { - mInfo("vgId:%d not restored, not ready for checkpoint or other operations", pVgroup->vgId); - *allReady = false; - break; - } - - ESyncState state = pVgroup->vnodeGid[i].syncState; - if (state == TAOS_SYNC_STATE_OFFLINE || state == TAOS_SYNC_STATE_ERROR || state == TAOS_SYNC_STATE_LEARNER || - state == TAOS_SYNC_STATE_CANDIDATE) { - mInfo("vgId:%d state:%d , not ready for checkpoint or other operations, not check other vgroups", - pVgroup->vgId, state); - *allReady = false; - break; - } - } + *allReady = checkStatusForEachReplica(pVgroup); } char buf[256] = {0}; - (void) epsetToStr(&entry.epset, buf, tListLen(buf)); + (void)epsetToStr(&entry.epset, buf, tListLen(buf)); - void* p = taosArrayPush(pVgroupList, &entry); + void *p = taosArrayPush(pVgroupList, &entry); if (p == NULL) { mError("failed to put entry in vgroup list, nodeId:%d code:out of memory", entry.nodeId); + code = terrno; + sdbRelease(pSdb, pVgroup); + goto _err; } else { mDebug("take node snapshot, nodeId:%d %s", entry.nodeId, buf); } @@ -166,15 +189,19 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { code = addEpIntoEpSet(&entry.epset, pObj->pDnode->fqdn, pObj->pDnode->port); if (code) { sdbRelease(pSdb, pObj); - continue; + mError("failed to extract epset for fqdn:%s during task vgroup snapshot", pObj->pDnode->fqdn); + goto _err; } char buf[256] = {0}; - (void) epsetToStr(&entry.epset, buf, tListLen(buf)); + (void)epsetToStr(&entry.epset, buf, tListLen(buf)); - void* p = taosArrayPush(pVgroupList, &entry); + void *p = taosArrayPush(pVgroupList, &entry); if (p == NULL) { - mError("failed to put entry in vgroup list, nodeId:%d code:out of memory", entry.nodeId); + code = terrno; + sdbRelease(pSdb, pObj); + mError("failed to put entry in vgroup list, nodeId:%d code:%s", 
entry.nodeId, tstrerror(code)); + goto _err; } else { mDebug("take snode snapshot, nodeId:%d %s", entry.nodeId, buf); } @@ -184,6 +211,13 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { *pList = pVgroupList; return code; + +_err: + *allReady = false; + taosArrayDestroy(pVgroupList); + taosHashCleanup(pHash); + + return code; } int32_t mndGetStreamObj(SMnode *pMnode, int64_t streamId, SStreamObj **pStream) { From 400ed18c6ae9e442482a221b8f1ecb014e69d14d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 31 Jul 2024 19:55:26 +0800 Subject: [PATCH 17/34] fix(stream): fix memory leak. --- source/dnode/mnode/impl/src/mndStreamUtil.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index b5a612f058..7b4b82fcfe 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -210,6 +210,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { } *pList = pVgroupList; + taosHashCleanup(pHash); return code; _err: From 3a1a528028bc284bda5479b342b7ea2b12a8f1bc Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 09:21:54 +0800 Subject: [PATCH 18/34] fix(stream): cancel fetch --- source/dnode/mnode/impl/src/mndStreamUtil.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 7b4b82fcfe..030b14ea0d 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -146,6 +146,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { if (code) { mError("failed to put info into hashmap during task vgroup snapshot, code:%s", tstrerror(code)); sdbRelease(pSdb, pVgroup); + sdbCancelFetch(pSdb, pIter); goto _err; // take snapshot failed, and not all ready } } else { @@ -170,6 +171,7 @@ int32_t 
mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { mError("failed to put entry in vgroup list, nodeId:%d code:out of memory", entry.nodeId); code = terrno; sdbRelease(pSdb, pVgroup); + sdbCancelFetch(pSdb, pIter); goto _err; } else { mDebug("take node snapshot, nodeId:%d %s", entry.nodeId, buf); @@ -189,6 +191,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { code = addEpIntoEpSet(&entry.epset, pObj->pDnode->fqdn, pObj->pDnode->port); if (code) { sdbRelease(pSdb, pObj); + sdbCancelFetch(pSdb, pIter); mError("failed to extract epset for fqdn:%s during task vgroup snapshot", pObj->pDnode->fqdn); goto _err; } @@ -200,6 +203,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { if (p == NULL) { code = terrno; sdbRelease(pSdb, pObj); + sdbCancelFetch(pSdb, pIter); mError("failed to put entry in vgroup list, nodeId:%d code:%s", entry.nodeId, tstrerror(code)); goto _err; } else { From ede7f23b0e1e40887d2a6a398192a8c4bc3bbe7a Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 09:31:51 +0800 Subject: [PATCH 19/34] fix(stream): fix dead lock caused by refactor. --- source/libs/stream/src/streamCheckpoint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index d777883015..7b205a16a1 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -253,6 +253,7 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock for (int32_t i = 0; i < taosArrayGetSize(pActiveInfo->pReadyMsgList); ++i) { STaskCheckpointReadyInfo* p = taosArrayGet(pActiveInfo->pReadyMsgList, i); if (p == NULL) { + streamMutexUnlock(&pTask->lock); return TSDB_CODE_INVALID_PARA; } From 2dae0bf423604ca118f4a5e56407c04ee9d209ae Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 14:15:08 +0800 Subject: [PATCH 20/34] fix(stream): add more check. 
--- source/libs/stream/src/streamTask.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index e8ff1552e8..040bbb4f00 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -602,9 +602,9 @@ int32_t streamTaskStop(SStreamTask* pTask) { stError("failed to handle STOP event, s-task:%s", id); } - if ((pTask->info.taskLevel != TASK_LEVEL__SINK) && (pTask->exec.pExecutor != NULL)) { + if (pTask->info.taskLevel != TASK_LEVEL__SINK && pTask->exec.pExecutor != NULL) { code = qKillTask(pTask->exec.pExecutor, TSDB_CODE_SUCCESS); - if (code) { + if (code != TSDB_CODE_SUCCESS) { stError("s-task:%s failed to kill task related query handle", id); } } From 3e1074aea37aed264cbf79d1bb4f5d167912367d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 15:48:09 +0800 Subject: [PATCH 21/34] fix(stream): set correct return value. --- source/libs/stream/src/streamMeta.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 8c5faf006f..fe4b626325 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1211,14 +1211,17 @@ void streamMetaWUnLock(SStreamMeta* pMeta) { } int32_t streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta, SArray** pList) { - *pList = NULL; + QRY_OPTR_CHECK(pList); + int32_t code = 0; SArray* pTaskList = taosArrayDup(pMeta->pTaskList, NULL); if (pTaskList == NULL) { stError("failed to generate the task list during send hbMsg to mnode, vgId:%d, code: out of memory", pMeta->vgId); - return TSDB_CODE_OUT_OF_MEMORY; + return terrno; } + *pList = pTaskList; + bool sendMsg = pMeta->sendMsgBeforeClosing; if (!sendMsg) { stDebug("vgId:%d no need to send msg to mnode before closing tasks", pMeta->vgId); @@ -1251,9 +1254,9 @@ int32_t streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta, 
SArray** pList) { streamMetaReleaseTask(pMeta, pTask); } - code = streamMetaSendHbHelper(pMeta); + (void)streamMetaSendHbHelper(pMeta); pMeta->sendMsgBeforeClosing = false; - return code; + return TSDB_CODE_SUCCESS; // always return true } void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader) { From c5bac71b3224a479c1d20dba3101d3800fabc460 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 15:59:57 +0800 Subject: [PATCH 22/34] fix(stream): remove invalid return code check. --- source/dnode/mnode/impl/src/mndStreamUtil.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 030b14ea0d..512862b37e 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -638,12 +638,9 @@ static int32_t doBuildStreamTaskUpdateMsg(void **pBuf, int32_t *pLen, SVgroupCha static int32_t doSetUpdateTaskAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTask, SVgroupChangeInfo *pInfo) { void *pBuf = NULL; int32_t len = 0; - int32_t code = streamTaskUpdateEpsetInfo(pTask, pInfo->pUpdateNodeList); - if (code) { - return code; - } + (void)streamTaskUpdateEpsetInfo(pTask, pInfo->pUpdateNodeList); - code = doBuildStreamTaskUpdateMsg(&pBuf, &len, pInfo, pTask->info.nodeId, &pTask->id, pTrans->id); + int32_t code = doBuildStreamTaskUpdateMsg(&pBuf, &len, pInfo, pTask->info.nodeId, &pTask->id, pTrans->id); if (code) { return code; } From 3b2d1ae101e2e97df84500bdf4287f4297d5bad1 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 18:32:01 +0800 Subject: [PATCH 23/34] fix(test): wait for a little longer. 
--- tests/system-test/8-stream/state_window_case.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system-test/8-stream/state_window_case.py b/tests/system-test/8-stream/state_window_case.py index 5ecf8d7832..3015b0db42 100644 --- a/tests/system-test/8-stream/state_window_case.py +++ b/tests/system-test/8-stream/state_window_case.py @@ -30,14 +30,14 @@ class TDTestCase: tdSql.execute("CREATE STREAM stream_device_alarm2 TRIGGER AT_ONCE DELETE_MARK 30d INTO st_device_alarm2 tags(factory_id varchar(20), device_code varchar(80), var_name varchar(200))\ as select _wstart start_time, last(load_time) end_time, first(var_value) var_value, 1 state_flag from st_variable_data\ PARTITION BY tbname tname, factory_id, device_code, var_name STATE_WINDOW(case when lower(var_value)=lower(trigger_value) then '1' else '0' end)") - time.sleep(2) + time.sleep(5) def insert_data(self): try: tdSql.execute("insert into aaa values('2024-07-15 14:00:00', '2024-07-15 14:00:00', 'a8')", queryTimes=5, show=True) time.sleep(0.01) tdSql.execute("insert into aaa values('2024-07-15 14:10:00', '2024-07-15 14:10:00', 'a9')", queryTimes=5, show=True) - time.sleep(1) + time.sleep(5) except Exception as error: tdLog.exit(f"insert data failed {error}") From 02b59d0b33abf19562c3b92a4561b8822c94c168 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 3 Aug 2024 16:34:26 +0800 Subject: [PATCH 24/34] fix(stream): add more check in tmr. 
--- source/libs/stream/src/streamCheckStatus.c | 7 ------- source/libs/stream/src/streamCheckpoint.c | 22 ++++++++++++++++++++++ source/libs/stream/src/streamDispatch.c | 4 ++-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/source/libs/stream/src/streamCheckStatus.c b/source/libs/stream/src/streamCheckStatus.c index c9ba6ffcfe..b7661e72d4 100644 --- a/source/libs/stream/src/streamCheckStatus.c +++ b/source/libs/stream/src/streamCheckStatus.c @@ -74,13 +74,6 @@ int32_t streamTaskCheckStatus(SStreamTask* pTask, int32_t upstreamTaskId, int32_ } if (pInfo->stage != stage) { - streamMutexLock(&pTask->lock); - ETaskStatus status = streamTaskGetStatus(pTask).state; - if (status == TASK_STATUS__CK) { - streamTaskSetFailedCheckpointId(pTask); - } - streamMutexUnlock(&pTask->lock); - return TASK_UPSTREAM_NEW_STAGE; } else if (pTask->status.downstreamReady != 1) { stDebug("s-task:%s vgId:%d leader:%d, downstream not ready", id, vgId, (pTask->pMeta->role == NODE_ROLE_LEADER)); diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 7b205a16a1..d638e28c8d 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -855,6 +855,28 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { return; } + if ((pTmrInfo->launchChkptId != pActiveInfo->activeId) || (pActiveInfo->activeId == 0)) { + streamMutexUnlock(&pActiveInfo->lock); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); + stWarn("s-task:%s vgId:%d checkpoint-trigger retrieve by previous checkpoint procedure, checkpointId:%" PRId64 + ", quit, ref:%d", + id, vgId, pTmrInfo->launchChkptId, ref); + + streamMetaReleaseTask(pTask->pMeta, pTask); + return; + } + + // active checkpoint info is cleared for now + if ((pActiveInfo->activeId == 0) || (pActiveInfo->transId == 0) || (pTask->chkInfo.startTs == 0)) { + streamMutexUnlock(&pActiveInfo->lock); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); 
+ stWarn("s-task:%s vgId:%d active checkpoint may be cleared, quit from retrieve checkpoint-trigger send tmr, ref:%d", + id, vgId, ref); + + streamMetaReleaseTask(pTask->pMeta, pTask); + return; + } + for (int32_t i = 0; i < taosArrayGetSize(pList); ++i) { SStreamUpstreamEpInfo* pInfo = taosArrayGetP(pList, i); diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index f6e827b745..010f6f006f 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -820,7 +820,7 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { SArray* pList = pActiveInfo->pReadyMsgList; int32_t num = taosArrayGetSize(pList); - if (pTmrInfo->launchChkptId < pActiveInfo->activeId) { + if (pTmrInfo->launchChkptId != pActiveInfo->activeId) { streamMutexUnlock(&pActiveInfo->lock); int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stWarn("s-task:%s vgId:%d ready-msg send tmr launched by previous checkpoint procedure, checkpointId:%" PRId64 @@ -832,7 +832,7 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { } // active checkpoint info is cleared for now - if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { + if ((pActiveInfo->activeId == 0) || (pActiveInfo->transId == 0) || (num == 0) || (pTask->chkInfo.startTs == 0)) { streamMutexUnlock(&pActiveInfo->lock); int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stWarn("s-task:%s vgId:%d active checkpoint may be cleared, quit from readyMsg send tmr, ref:%d", id, vgId, ref); From 170a074de829155ba336dabca09caaaa0f3c75a5 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sun, 4 Aug 2024 11:37:23 +0800 Subject: [PATCH 25/34] fix(stream): add check for checkpointId in retrieve-checkpoint id msg. 
--- source/dnode/vnode/src/tqCommon/tqCommon.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index b56c474ed5..11d38dde87 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -989,7 +989,12 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) int64_t checkpointId = 0; streamTaskGetActiveCheckpointInfo(pTask, &transId, &checkpointId); - ASSERT(checkpointId == pReq->checkpointId); + if (checkpointId != pReq->checkpointId) { + tqError("s-task:%s invalid checkpoint-trigger retrieve msg from %x, current checkpointId:%"PRId64" req:%"PRId64, + pTask->id.idStr, pReq->downstreamTaskId, checkpointId, pReq->checkpointId); + streamMetaReleaseTask(pMeta, pTask); + return TSDB_CODE_INVALID_MSG; + } if (streamTaskAlreadySendTrigger(pTask, pReq->downstreamNodeId)) { // re-send the lost checkpoint-trigger msg to downstream task From 79d4596b72f1c55b2e073c4faff0be4b092556dc Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 5 Aug 2024 19:26:42 +0800 Subject: [PATCH 26/34] fix(stream): fix syntax error. 
--- source/dnode/vnode/src/tqCommon/tqCommon.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index a77d3462de..ba911fa76d 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -997,8 +997,9 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) streamTaskGetActiveCheckpointInfo(pTask, &transId, &checkpointId); if (checkpointId != pReq->checkpointId) { - tqError("s-task:%s invalid checkpoint-trigger retrieve msg from %x, current checkpointId:%"PRId64" req:%"PRId64, - pTask->id.idStr, pReq->downstreamTaskId, checkpointId, pReq->checkpointId); + tqError("s-task:%s invalid checkpoint-trigger retrieve msg from 0x%" PRIx64 ", current checkpointId:%" PRId64 + " req:%" PRId64, + pTask->id.idStr, pReq->downstreamTaskId, checkpointId, pReq->checkpointId); streamMetaReleaseTask(pMeta, pTask); return TSDB_CODE_INVALID_MSG; } From 94a2ea1ad5835a8dbef7595752ac5720bb6bacad Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 5 Aug 2024 19:32:29 +0800 Subject: [PATCH 27/34] fix(stream): clear the freed ptr --- source/libs/stream/src/streamBackendRocksdb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 207bcdcac5..a408ef5872 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -910,6 +910,7 @@ void streamBackendCleanup(void* arg) { if (pHandle->db) { rocksdb_close(pHandle->db); + pHandle->db = NULL; } rocksdb_options_destroy(pHandle->dbOpt); rocksdb_env_destroy(pHandle->env); @@ -2508,6 +2509,7 @@ STaskDbWrapper* taskDbOpenImpl(const char* key, char* statePath, char* dbPath) { } rocksdb_close(pTaskDb->db); + pTaskDb->db = NULL; if (cfNames != NULL) { rocksdb_list_column_families_destroy(cfNames, nCf); @@ -2617,6 +2619,7 @@ 
void taskDbDestroy(void* pDb, bool flush) { if (wrapper->db) { rocksdb_close(wrapper->db); + wrapper->db = NULL; } rocksdb_options_destroy(wrapper->dbOpt); From 255faa0eac8688dd0fa07e19970640736c08cd26 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 7 Aug 2024 11:43:38 +0800 Subject: [PATCH 28/34] fix(stream): update acceptable code. --- source/dnode/mnode/impl/src/mndStreamUtil.c | 2 +- source/libs/stream/src/streamBackendRocksdb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 512862b37e..739bb0ca37 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -542,7 +542,7 @@ static int32_t doSetDropActionFromId(SMnode *pMnode, STrans *pTrans, SOrphanTask } // The epset of nodeId of this task may have been expired now, let's use the newest epset from mnode. - code = setTransAction(pTrans, pReq, sizeof(SVDropStreamTaskReq), TDMT_STREAM_TASK_DROP, &epset, 0, 0); + code = setTransAction(pTrans, pReq, sizeof(SVDropStreamTaskReq), TDMT_STREAM_TASK_DROP, &epset, 0, TSDB_CODE_VND_INVALID_VGROUP_ID); if (code != 0) { taosMemoryFree(pReq); return code; diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index a80b8ef4eb..fa09191854 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -683,7 +683,7 @@ static int32_t rebuildFromLocalCheckpoint(const char* pTaskIdStr, const char* ch defaultPath); } } else { - code = TSDB_CODE_FAILED; + code = terrno; stError("%s no valid data for checkpointId:%" PRId64 " in %s", pTaskIdStr, checkpointId, checkpointPath); } From 1fbb3a63bcdc5311e250598fe71de466576a3bdc Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 7 Aug 2024 14:59:39 +0800 Subject: [PATCH 29/34] refactor: do some internal refactor. 
--- source/libs/stream/src/streamBackendRocksdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index fa09191854..41bacab667 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -763,7 +763,7 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId } if (code != 0) { - stError("failed to start stream backend at %s, restart from default defaultPath:%s, reason:%s", checkpointPath, + stError("failed to start stream backend at %s, restart from defaultPath:%s, reason:%s", checkpointPath, defaultPath, tstrerror(code)); code = 0; // reset the error code } From 26a770f61edb44ea131e7cc15eb19a2950077e35 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 7 Aug 2024 15:54:48 +0800 Subject: [PATCH 30/34] fix(stream):update log. --- source/libs/stream/src/streamHb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index a158d6e4bb..8513a8ba06 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -293,7 +293,7 @@ void streamMetaHbToMnode(void* param, void* tmrId) { streamMetaRLock(pMeta); code = streamMetaSendHbHelper(pMeta); if (code) { - stError("vgId:%d failed to send hmMsg to mnode, try again in 5s, code:%s", pMeta->vgId, strerror(code)); + stError("vgId:%d failed to send hmMsg to mnode, try again in 5s, code:%s", pMeta->vgId, tstrerror(code)); } streamMetaRUnLock(pMeta); From 31c21f6f6d8d27281c783904300f6901bd1834ed Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 7 Aug 2024 16:01:28 +0800 Subject: [PATCH 31/34] fix(rpc): update log. 
--- source/dnode/mgmt/node_mgmt/src/dmTransport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dnode/mgmt/node_mgmt/src/dmTransport.c b/source/dnode/mgmt/node_mgmt/src/dmTransport.c index 3d758e1fd3..cc57b04d47 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmTransport.c +++ b/source/dnode/mgmt/node_mgmt/src/dmTransport.c @@ -109,7 +109,7 @@ static void dmProcessRpcMsg(SDnode *pDnode, SRpcMsg *pRpc, SEpSet *pEpSet) { int32_t svrVer = 0; (void)taosVersionStrToInt(version, &svrVer); if ((code = taosCheckVersionCompatible(pRpc->info.cliVer, svrVer, 3)) != 0) { - dError("Version not compatible, cli ver: %d, svr ver: %d", pRpc->info.cliVer, svrVer); + dError("Version not compatible, cli ver: %d, svr ver: %d, ip:0x%x", pRpc->info.cliVer, svrVer, pRpc->info.conn.clientIp); goto _OVER; } From ea715a21b1b96dd64d98cfaa6b78e014f4b759bf Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Wed, 7 Aug 2024 15:25:27 +0800 Subject: [PATCH 32/34] fix invalid remove --- source/libs/stream/src/streamBackendRocksdb.c | 3 +++ source/libs/stream/src/streamTask.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 41bacab667..537aa72d91 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1144,6 +1144,8 @@ int32_t chkpMayDelObsolete(void* arg, int64_t chkpId, char* path) { int64_t id = *(int64_t*)taosArrayGet(chkpDel, i); char tbuf[256] = {0}; sprintf(tbuf, "%s%scheckpoint%" PRId64 "", path, TD_DIRSEP, id); + + stInfo("backend remove obsolete checkpoint: %s", tbuf); if (taosIsDir(tbuf)) { taosRemoveDir(tbuf); } @@ -2661,6 +2663,7 @@ void taskDbDestroy(void* pDb, bool flush) { if (wrapper->removeAllFiles) { char* err = NULL; + stInfo("drop task remove backend dat:%s", wrapper->path); taosRemoveDir(wrapper->path); } taosMemoryFree(wrapper->path); diff --git a/source/libs/stream/src/streamTask.c 
b/source/libs/stream/src/streamTask.c index c7f3bd264d..c0b2b16d30 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -321,7 +321,7 @@ void streamFreeTaskState(SStreamTask* pTask, int8_t remove) { stDebug("s-task:0x%x start to free task state", pTask->id.taskId); streamStateClose(pTask->pState, remove); - taskDbSetClearFileFlag(pTask->pBackend); + if (remove)taskDbSetClearFileFlag(pTask->pBackend); taskDbRemoveRef(pTask->pBackend); pTask->pBackend = NULL; pTask->pState = NULL; From dfa74f82d78b4bfbfcbba4768cf8d0794de45ccc Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 00:40:05 +0800 Subject: [PATCH 33/34] fix(stream): avoid repeat send checkpoint-report msg. --- source/dnode/mnode/impl/inc/mndStream.h | 12 ++- source/dnode/mnode/impl/src/mndStream.c | 104 ++++++++++++++------ source/dnode/mnode/impl/src/mndStreamHb.c | 6 +- source/dnode/mnode/impl/src/mndStreamUtil.c | 68 +++++++++---- source/dnode/vnode/src/tqCommon/tqCommon.c | 2 +- source/libs/stream/src/streamCheckpoint.c | 23 +++-- 6 files changed, 155 insertions(+), 60 deletions(-) diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index 89343ce37c..a5d91c8aa8 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -57,6 +57,12 @@ typedef struct SStreamTaskResetMsg { int32_t transId; } SStreamTaskResetMsg; +typedef struct SChkptReportInfo { + SArray* pTaskList; + int64_t reportChkpt; + int64_t streamId; +} SChkptReportInfo; + typedef struct SStreamExecInfo { bool initTaskList; SArray *pNodeList; @@ -66,9 +72,9 @@ typedef struct SStreamExecInfo { SArray *pTaskList; TdThreadMutex lock; SHashObj *pTransferStateStreams; - SHashObj *pChkptStreams; + SHashObj *pChkptStreams; // use to update the checkpoint info, if all tasks send the checkpoint-report msgs SHashObj *pStreamConsensus; - SArray *pKilledChkptTrans; // SArray + SArray *pKilledChkptTrans; // SArray } 
SStreamExecInfo; extern SStreamExecInfo execInfo; @@ -153,6 +159,8 @@ int32_t mndGetConsensusInfo(SHashObj *pHash, int64_t streamId, int32_t numOfTask void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo); void mndClearConsensusRspEntry(SCheckpointConsensusInfo *pInfo); int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId); +int64_t mndClearChkptReportInfo(SHashObj* pHash, int64_t streamId); +int32_t mndResetChkptReportInfo(SHashObj* pHash, int64_t streamId); int32_t setStreamAttrInResBlock(SStreamObj *pStream, SSDataBlock *pBlock, int32_t numOfRows); int32_t setTaskAttrInResBlock(SStreamObj *pStream, SStreamTask *pTask, SSDataBlock *pBlock, int32_t numOfRows); diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 90ef7daa60..a8d35993c7 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2454,8 +2454,45 @@ int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq) { return 0; } -static void doAddReportStreamTask(SArray* pList, const SCheckpointReport* pReport) { - bool existed = false; +// valid the info according to the HbMsg +static bool validateChkptReport(const SCheckpointReport *pReport, int64_t reportChkptId) { + STaskId id = {.streamId = pReport->streamId, .taskId = pReport->taskId}; + STaskStatusEntry *pTaskEntry = taosHashGet(execInfo.pTaskMap, &id, sizeof(id)); + if (pTaskEntry == NULL) { + mError("invalid checkpoint-report msg from task:0x%x, discard", pReport->taskId); + return false; + } + + if (pTaskEntry->checkpointInfo.latestId >= pReport->checkpointId) { + mError("s-task:0x%x invalid checkpoint-report msg, checkpointId:%" PRId64 " saved checkpointId:%" PRId64 " discard", + pReport->taskId, pReport->checkpointId, pTaskEntry->checkpointInfo.activeId); + return false; + } + + // now the task in checkpoint procedure + if ((pTaskEntry->checkpointInfo.activeId != 0) && 
(pTaskEntry->checkpointInfo.activeId > pReport->checkpointId)) { + mError("s-task:0x%x invalid checkpoint-report msg, checkpointId:%" PRId64 " active checkpointId:%" PRId64 + " discard", + pReport->taskId, pReport->checkpointId, pTaskEntry->checkpointInfo.activeId); + return false; + } + + if (reportChkptId >= pReport->checkpointId) { + mError("s-task:0x%x expired checkpoint-report msg, checkpointId:%" PRId64 " already update checkpointId:%" PRId64 + " discard", + pReport->taskId, pReport->checkpointId, reportChkptId); + return false; + } + + return true; +} + +static void doAddReportStreamTask(SArray *pList, int64_t reportChkptId, const SCheckpointReport *pReport) { + bool valid = validateChkptReport(pReport, reportChkptId); + if (!valid) { + return; + } + for (int32_t i = 0; i < taosArrayGetSize(pList); ++i) { STaskChkptInfo *p = taosArrayGet(pList, i); if (p == NULL) { @@ -2463,27 +2500,38 @@ static void doAddReportStreamTask(SArray* pList, const SCheckpointReport* pRepor } if (p->taskId == pReport->taskId) { - existed = true; - break; + if (p->checkpointId > pReport->checkpointId) { + mError("s-task:0x%x invalid checkpoint-report msg, existed:%" PRId64 " req checkpointId:%" PRId64 ", discard", + pReport->taskId, p->checkpointId, pReport->checkpointId); + } else if (p->checkpointId < pReport->checkpointId) { // expired checkpoint-report msg, update it + mDebug("s-task:0x%x expired checkpoint-report msg in checkpoint-report list update from %" PRId64 "->%" PRId64, + pReport->taskId, p->checkpointId, pReport->checkpointId); + + memcpy(p, pReport, sizeof(STaskChkptInfo)); + } else { + mWarn("taskId:0x%x already in checkpoint-report list", pReport->taskId); + } + return; } } - if (!existed) { - STaskChkptInfo info = { - .streamId = pReport->streamId, - .taskId = pReport->taskId, - .transId = pReport->transId, - .dropHTask = pReport->dropHTask, - .version = pReport->checkpointVer, - .ts = pReport->checkpointTs, - .checkpointId = pReport->checkpointId, - .nodeId = 
pReport->nodeId, - }; + STaskChkptInfo info = { + .streamId = pReport->streamId, + .taskId = pReport->taskId, + .transId = pReport->transId, + .dropHTask = pReport->dropHTask, + .version = pReport->checkpointVer, + .ts = pReport->checkpointTs, + .checkpointId = pReport->checkpointId, + .nodeId = pReport->nodeId, + }; - void* p = taosArrayPush(pList, &info); - if (p == NULL) { - mError("failed to put into task list, taskId:0x%x", pReport->taskId); - } + void *p = taosArrayPush(pList, &info); + if (p == NULL) { + mError("failed to put into task list, taskId:0x%x", pReport->taskId); + } else { + int32_t size = taosArrayGetSize(pList); + mDebug("stream:0x%"PRIx64" %d tasks has send checkpoint-report", pReport->streamId, size); } } @@ -2530,23 +2578,23 @@ int32_t mndProcessCheckpointReport(SRpcMsg *pReq) { int32_t numOfTasks = (pStream == NULL) ? 0 : mndGetNumOfStreamTasks(pStream); - SArray **pReqTaskList = (SArray **)taosHashGet(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId)); - if (pReqTaskList == NULL) { - SArray *pList = taosArrayInit(4, sizeof(STaskChkptInfo)); - if (pList != NULL) { - doAddReportStreamTask(pList, &req); - code = taosHashPut(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId), &pList, POINTER_BYTES); + SChkptReportInfo *pInfo = (SChkptReportInfo*)taosHashGet(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId)); + if (pInfo == NULL) { + SChkptReportInfo info = {.pTaskList = taosArrayInit(4, sizeof(STaskChkptInfo)), .streamId = req.streamId}; + if (info.pTaskList != NULL) { + doAddReportStreamTask(info.pTaskList, info.reportChkpt, &req); + code = taosHashPut(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId), &info, sizeof(info)); if (code) { mError("stream:0x%" PRIx64 " failed to put into checkpoint stream", req.streamId); } - pReqTaskList = (SArray **)taosHashGet(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId)); + pInfo = (SChkptReportInfo *)taosHashGet(execInfo.pChkptStreams, &req.streamId, 
sizeof(req.streamId)); } } else { - doAddReportStreamTask(*pReqTaskList, &req); + doAddReportStreamTask(pInfo->pTaskList, pInfo->reportChkpt, &req); } - int32_t total = taosArrayGetSize(*pReqTaskList); + int32_t total = taosArrayGetSize(pInfo->pTaskList); if (total == numOfTasks) { // all tasks has send the reqs mInfo("stream:0x%" PRIx64 " %s all %d tasks send checkpoint-report, checkpoint meta-info for checkpointId:%" PRId64 " will be issued soon", diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index 50db903520..59f07ce977 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -211,6 +211,10 @@ int32_t mndProcessResetStatusReq(SRpcMsg *pReq) { SStreamTaskResetMsg* pMsg = pReq->pCont; mndKillTransImpl(pMnode, pMsg->transId, ""); + streamMutexLock(&execInfo.lock); + (void) mndResetChkptReportInfo(execInfo.pChkptStreams, pMsg->streamId); + streamMutexUnlock(&execInfo.lock); + code = mndGetStreamObj(pMnode, pMsg->streamId, &pStream); if (pStream == NULL || code != 0) { code = TSDB_CODE_STREAM_TASK_NOT_EXIST; @@ -453,7 +457,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { addIntoCheckpointList(pFailedChkpt, &info); // remove failed trans from pChkptStreams - code = taosHashRemove(execInfo.pChkptStreams, &p->id.streamId, sizeof(p->id.streamId)); + code = mndResetChkptReportInfo(execInfo.pChkptStreams, p->id.streamId); if (code) { mError("failed to remove stream:0x%"PRIx64" in checkpoint stream list", p->id.streamId); } diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 739bb0ca37..649cab91c1 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -904,8 +904,9 @@ void removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode) { ASSERT(taosHashGetSize(pExecNode->pTaskMap) == taosArrayGetSize(pExecNode->pTaskList)); - // 2. 
remove stream entry in consensus hash table + // 2. remove stream entry in consensus hash table and checkpoint-report hash table (void) mndClearConsensusCheckpointId(execInfo.pStreamConsensus, pStream->uid); + (void) mndClearChkptReportInfo(execInfo.pChkptStreams, pStream->uid); streamMutexUnlock(&pExecNode->lock); destroyStreamTaskIter(pIter); @@ -973,9 +974,8 @@ int32_t removeExpiredNodeEntryAndTaskInBuf(SArray *pNodeSnapshot) { static int32_t doSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTask) { SVUpdateCheckpointInfoReq *pReq = taosMemoryCalloc(1, sizeof(SVUpdateCheckpointInfoReq)); if (pReq == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; mError("failed to malloc in reset stream, size:%" PRIzu ", code:%s", sizeof(SVUpdateCheckpointInfoReq), - tstrerror(TSDB_CODE_OUT_OF_MEMORY)); + tstrerror(terrno)); return terrno; } @@ -983,12 +983,14 @@ static int32_t doSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamTas pReq->taskId = pTask->id.taskId; pReq->streamId = pTask->id.streamId; - SArray **pReqTaskList = (SArray **)taosHashGet(execInfo.pChkptStreams, &pTask->id.streamId, sizeof(pTask->id.streamId)); - ASSERT(pReqTaskList); + SChkptReportInfo *pStreamItem = (SChkptReportInfo*)taosHashGet(execInfo.pChkptStreams, &pTask->id.streamId, sizeof(pTask->id.streamId)); + if (pStreamItem == NULL) { + return TSDB_CODE_INVALID_PARA; + } - int32_t size = taosArrayGetSize(*pReqTaskList); + int32_t size = taosArrayGetSize(pStreamItem->pTaskList); for(int32_t i = 0; i < size; ++i) { - STaskChkptInfo* pInfo = taosArrayGet(*pReqTaskList, i); + STaskChkptInfo* pInfo = taosArrayGet(pStreamItem->pTaskList, i); if (pInfo == NULL) { continue; } @@ -1063,11 +1065,12 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { } mDebug("start to scan checkpoint report info"); + streamMutexLock(&execInfo.lock); while ((pIter = taosHashIterate(execInfo.pChkptStreams, pIter)) != NULL) { - SArray *pList = *(SArray **)pIter; + SChkptReportInfo* px = (SChkptReportInfo 
*)pIter; - STaskChkptInfo *pInfo = taosArrayGet(pList, 0); + STaskChkptInfo *pInfo = taosArrayGet(px->pTaskList, 0); if (pInfo == NULL) { continue; } @@ -1080,12 +1083,11 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { if (p == NULL) { mError("failed to put stream into drop list:0x%" PRIx64, pInfo->streamId); } - continue; } int32_t total = mndGetNumOfStreamTasks(pStream); - int32_t existed = (int32_t)taosArrayGetSize(pList); + int32_t existed = (int32_t)taosArrayGetSize(px->pTaskList); if (total == existed) { mDebug("stream:0x%" PRIx64 " %s all %d tasks send checkpoint-report, start to update checkpoint-info", @@ -1093,14 +1095,11 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { bool conflict = mndStreamTransConflictCheck(pMnode, pStream->uid, MND_STREAM_CHKPT_UPDATE_NAME, false); if (!conflict) { - code = mndCreateStreamChkptInfoUpdateTrans(pMnode, pStream, pList); + code = mndCreateStreamChkptInfoUpdateTrans(pMnode, pStream, px->pTaskList); if (code == TSDB_CODE_SUCCESS || code == TSDB_CODE_ACTION_IN_PROGRESS) { // remove this entry - void* p = taosArrayPush(pDropped, &pInfo->streamId); - if (p == NULL) { - mError("failed to remove stream:0x%" PRIx64, pInfo->streamId); - } else { - mDebug("stream:0x%" PRIx64 " removed", pInfo->streamId); - } + taosArrayClear(px->pTaskList); + px->reportChkpt = pInfo->checkpointId; + mDebug("stream:0x%" PRIx64 " clear checkpoint-report list", pInfo->streamId); } else { mDebug("stream:0x%" PRIx64 " not launch chkpt-meta update trans, due to checkpoint not finished yet", pInfo->streamId); @@ -1135,6 +1134,8 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { mDebug("drop %d stream(s) in checkpoint-report list, remain:%d", size, numOfStreams); } + streamMutexUnlock(&execInfo.lock); + taosArrayDestroy(pDropped); return TSDB_CODE_SUCCESS; } @@ -1319,7 +1320,7 @@ int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId) { int32_t code = 0; int32_t numOfStreams = taosHashGetSize(pHash); if (numOfStreams 
== 0) { - return TSDB_CODE_SUCCESS; + return code; } code = taosHashRemove(pHash, &streamId, sizeof(streamId)); @@ -1332,6 +1333,35 @@ int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId) { return code; } +int64_t mndClearChkptReportInfo(SHashObj* pHash, int64_t streamId) { + int32_t code = 0; + int32_t numOfStreams = taosHashGetSize(pHash); + if (numOfStreams == 0) { + return code; + } + + code = taosHashRemove(pHash, &streamId, sizeof(streamId)); + if (code == 0) { + mDebug("drop stream:0x%" PRIx64 " in chkpt-report list, remain:%d", streamId, numOfStreams); + } else { + mError("failed to remove stream:0x%"PRIx64" in chkpt-report list, remain:%d", streamId, numOfStreams); + } + + return code; +} + +int32_t mndResetChkptReportInfo(SHashObj* pHash, int64_t streamId) { + SChkptReportInfo* pInfo = taosHashGet(pHash, &streamId, sizeof(streamId)); + if (pInfo != NULL) { + taosArrayClear(pInfo->pTaskList); + mDebug("stream:0x%" PRIx64 " checkpoint-report list cleared, prev report checkpointId:%" PRId64, streamId, + pInfo->reportChkpt); + return 0; + } + + return TSDB_CODE_MND_STREAM_NOT_EXIST; +} + static void mndShowStreamStatus(char *dst, SStreamObj *pStream) { int8_t status = atomic_load_8(&pStream->status); if (status == STREAM_STATUS__NORMAL) { diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index ba911fa76d..faca2020c5 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -563,7 +563,7 @@ int32_t tqStreamTaskProcessCheckpointReadyMsg(SStreamMeta* pMeta, SRpcMsg* pMsg) pTask->id.idStr, req.downstreamTaskId, req.downstreamNodeId); } - code = streamProcessCheckpointReadyMsg(pTask, req.checkpointId, req.downstreamTaskId, req.downstreamNodeId); + code = streamProcessCheckpointReadyMsg(pTask, req.checkpointId, req.downstreamNodeId, req.downstreamTaskId); streamMetaReleaseTask(pMeta, pTask); if (code) { return code; diff --git 
a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 6c0f8ec6cb..65e5c475b4 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -361,7 +361,6 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock (void)streamTaskBuildCheckpoint(pTask); // todo: not handle error yet } else { // source & agg tasks need to forward the checkpoint msg downwards stDebug("s-task:%s process checkpoint-trigger block, all %d upstreams sent, forwards to downstream", id, num); - flushStateDataInExecutor(pTask, (SStreamQueueItem*)pBlock); // Put the checkpoint-trigger block into outputQ, to make sure all blocks with less version have been handled by @@ -376,8 +375,8 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock // only when all downstream tasks are send checkpoint rsp, we can start the checkpoint procedure for the agg task static int32_t processCheckpointReadyHelp(SActiveCheckpointInfo* pInfo, int32_t numOfDownstream, int32_t downstreamNodeId, int64_t streamId, int32_t downstreamTaskId, - const char* id, int32_t* pNotReady, int32_t* pTransId) { - bool received = false; + const char* id, int32_t* pNotReady, int32_t* pTransId, bool* alreadyRecv) { + *alreadyRecv = false; int32_t size = taosArrayGetSize(pInfo->pCheckpointReadyRecvList); for (int32_t i = 0; i < size; ++i) { STaskDownstreamReadyInfo* p = taosArrayGet(pInfo->pCheckpointReadyRecvList, i); @@ -386,12 +385,12 @@ static int32_t processCheckpointReadyHelp(SActiveCheckpointInfo* pInfo, int32_t } if (p->downstreamTaskId == downstreamTaskId) { - received = true; + (*alreadyRecv) = true; break; } } - if (received) { + if (*alreadyRecv) { stDebug("s-task:%s already recv checkpoint-ready msg from downstream:0x%x, ignore. 
%d/%d downstream not ready", id, downstreamTaskId, (int32_t)(numOfDownstream - taosArrayGetSize(pInfo->pCheckpointReadyRecvList)), numOfDownstream); @@ -427,6 +426,7 @@ int32_t streamProcessCheckpointReadyMsg(SStreamTask* pTask, int64_t checkpointId int32_t code = 0; int32_t notReady = 0; int32_t transId = 0; + bool alreadyHandled = false; // 1. not in checkpoint status now SStreamTaskState pStat = streamTaskGetStatus(pTask); @@ -445,12 +445,17 @@ int32_t streamProcessCheckpointReadyMsg(SStreamTask* pTask, int64_t checkpointId streamMutexLock(&pInfo->lock); code = processCheckpointReadyHelp(pInfo, total, downstreamNodeId, pTask->id.streamId, downstreamTaskId, id, &notReady, - &transId); + &transId, &alreadyHandled); streamMutexUnlock(&pInfo->lock); - if ((notReady == 0) && (code == 0)) { - stDebug("s-task:%s all downstream tasks have completed build checkpoint, do checkpoint for current task", id); - (void)appendCheckpointIntoInputQ(pTask, STREAM_INPUT__CHECKPOINT, checkpointId, transId, -1); + if (alreadyHandled) { + stDebug("s-task:%s checkpoint-ready msg checkpointId:%" PRId64 " from task:0x%x already handled, not handle again", + id, checkpointId, downstreamTaskId); + } else { + if ((notReady == 0) && (code == 0) && (!alreadyHandled)) { + stDebug("s-task:%s all downstream tasks have completed build checkpoint, do checkpoint for current task", id); + (void)appendCheckpointIntoInputQ(pTask, STREAM_INPUT__CHECKPOINT, checkpointId, transId, -1); + } } return code; From 506a72d50f317db5ff3f5295b5c19ae672065504 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 02:02:41 +0800 Subject: [PATCH 34/34] fix(stream): update checkpoint info only it is in ck status.
--- source/libs/stream/src/streamCheckpoint.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 65e5c475b4..c555da9865 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -567,14 +567,13 @@ int32_t streamTaskUpdateTaskCheckpointInfo(SStreamTask* pTask, bool restored, SV ASSERT(pInfo->checkpointId <= pReq->checkpointId && pInfo->checkpointVer <= pReq->checkpointVer && pInfo->processedVer <= pReq->checkpointVer); - pInfo->checkpointId = pReq->checkpointId; - pInfo->checkpointVer = pReq->checkpointVer; - pInfo->checkpointTime = pReq->checkpointTs; - - streamTaskClearCheckInfo(pTask, true); - + // update only it is in checkpoint status. if (pStatus.state == TASK_STATUS__CK) { - // todo handle error + pInfo->checkpointId = pReq->checkpointId; + pInfo->checkpointVer = pReq->checkpointVer; + pInfo->checkpointTime = pReq->checkpointTs; + + streamTaskClearCheckInfo(pTask, true); code = streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_CHECKPOINT_DONE); } else { stDebug("s-task:0x%x vgId:%d not handle checkpoint-done event, status:%s", pReq->taskId, vgId, pStatus.name);