Merge pull request #28900 from taosdata/fix/liaohj

fix(stream): check whether dispatch trigger block is expired or not
This commit is contained in:
Shengliang Guan 2024-11-27 15:47:47 +08:00 committed by GitHub
commit 642a5152f7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 43 additions and 25 deletions

View File

@ -714,7 +714,7 @@ int32_t streamTaskSetActiveCheckpointInfo(SStreamTask* pTask, int64_t activeChec
void streamTaskSetFailedChkptInfo(SStreamTask* pTask, int32_t transId, int64_t checkpointId);
bool streamTaskAlreadySendTrigger(SStreamTask* pTask, int32_t downstreamNodeId);
void streamTaskGetTriggerRecvStatus(SStreamTask* pTask, int32_t* pRecved, int32_t* pTotal);
int32_t streamTaskInitTriggerDispatchInfo(SStreamTask* pTask);
int32_t streamTaskInitTriggerDispatchInfo(SStreamTask* pTask, int64_t sendingChkptId);
void streamTaskSetTriggerDispatchConfirmed(SStreamTask* pTask, int32_t vgId);
int32_t streamTaskSendCheckpointTriggerMsg(SStreamTask* pTask, int32_t dstTaskId, int32_t downstreamNodeId,
SRpcHandleInfo* pInfo, int32_t code);

View File

@ -379,9 +379,9 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) {
}
if ((pEntry->lastHbMsgId == req.msgId) && (pEntry->lastHbMsgTs == req.ts)) {
mError("vgId:%d HbMsgId:%d already handled, bh msg discard", pEntry->nodeId, req.msgId);
mError("vgId:%d HbMsgId:%d already handled, bh msg discard, and send HbRsp", pEntry->nodeId, req.msgId);
terrno = TSDB_CODE_INVALID_MSG;
terrno = TSDB_CODE_SUCCESS;
doSendHbMsgRsp(terrno, &pReq->info, req.vgId, req.msgId);
streamMutexUnlock(&execInfo.lock);

View File

@ -652,7 +652,7 @@ int32_t tqStreamTaskProcessDeployReq(SStreamMeta* pMeta, SMsgCb* cb, int64_t sve
streamMetaWUnLock(pMeta);
if (code < 0) {
tqError("failed to add s-task:0x%x into vgId:%d meta, existed:%d, code:%s", vgId, taskId, numOfTasks,
tqError("vgId:%d failed to register s-task:0x%x into meta, existed tasks:%d, code:%s", vgId, taskId, numOfTasks,
tstrerror(code));
return code;
}

View File

@ -1208,34 +1208,37 @@ void streamTaskGetTriggerRecvStatus(SStreamTask* pTask, int32_t* pRecved, int32_
// record the dispatch checkpoint trigger info in the list
// memory insufficient may cause the stream computing stopped
int32_t streamTaskInitTriggerDispatchInfo(SStreamTask* pTask) {
int32_t streamTaskInitTriggerDispatchInfo(SStreamTask* pTask, int64_t sendingChkptId) {
SActiveCheckpointInfo* pInfo = pTask->chkInfo.pActiveInfo;
int64_t now = taosGetTimestampMs();
int32_t code = 0;
streamMutexLock(&pInfo->lock);
pInfo->dispatchTrigger = true;
if (pTask->outputInfo.type == TASK_OUTPUT__FIXED_DISPATCH) {
STaskDispatcherFixed* pDispatch = &pTask->outputInfo.fixedDispatcher;
if (sendingChkptId > pInfo->failedId) {
pInfo->dispatchTrigger = true;
if (pTask->outputInfo.type == TASK_OUTPUT__FIXED_DISPATCH) {
STaskDispatcherFixed* pDispatch = &pTask->outputInfo.fixedDispatcher;
STaskTriggerSendInfo p = {.sendTs = now, .recved = false, .nodeId = pDispatch->nodeId, .taskId = pDispatch->taskId};
void* px = taosArrayPush(pInfo->pDispatchTriggerList, &p);
if (px == NULL) { // pause the stream task, if memory not enough
code = terrno;
}
} else {
for (int32_t i = 0; i < streamTaskGetNumOfDownstream(pTask); ++i) {
SVgroupInfo* pVgInfo = taosArrayGet(pTask->outputInfo.shuffleDispatcher.dbInfo.pVgroupInfos, i);
if (pVgInfo == NULL) {
continue;
}
STaskTriggerSendInfo p = {.sendTs = now, .recved = false, .nodeId = pVgInfo->vgId, .taskId = pVgInfo->taskId};
void* px = taosArrayPush(pInfo->pDispatchTriggerList, &p);
STaskTriggerSendInfo p = {
.sendTs = now, .recved = false, .nodeId = pDispatch->nodeId, .taskId = pDispatch->taskId};
void* px = taosArrayPush(pInfo->pDispatchTriggerList, &p);
if (px == NULL) { // pause the stream task, if memory not enough
code = terrno;
break;
}
} else {
for (int32_t i = 0; i < streamTaskGetNumOfDownstream(pTask); ++i) {
SVgroupInfo* pVgInfo = taosArrayGet(pTask->outputInfo.shuffleDispatcher.dbInfo.pVgroupInfos, i);
if (pVgInfo == NULL) {
continue;
}
STaskTriggerSendInfo p = {.sendTs = now, .recved = false, .nodeId = pVgInfo->vgId, .taskId = pVgInfo->taskId};
void* px = taosArrayPush(pInfo->pDispatchTriggerList, &p);
if (px == NULL) { // pause the stream task, if memory not enough
code = terrno;
break;
}
}
}
}

View File

@ -845,7 +845,15 @@ int32_t streamDispatchStreamBlock(SStreamTask* pTask) {
streamMutexUnlock(&pTask->msgInfo.lock);
code = doBuildDispatchMsg(pTask, pBlock);
int64_t chkptId = 0;
if (code == 0) {
if (type == STREAM_INPUT__CHECKPOINT_TRIGGER) {
SSDataBlock* p = taosArrayGet(pBlock->blocks, 0);
if (pBlock != NULL) {
chkptId = p->info.version;
}
}
destroyStreamDataBlock(pBlock);
} else { // todo handle build dispatch msg failed
}
@ -862,7 +870,7 @@ int32_t streamDispatchStreamBlock(SStreamTask* pTask) {
continue;
}
code = streamTaskInitTriggerDispatchInfo(pTask);
code = streamTaskInitTriggerDispatchInfo(pTask, chkptId);
if (code != TSDB_CODE_SUCCESS) { // todo handle error
}
}

View File

@ -723,8 +723,9 @@ int32_t streamMetaRegisterTask(SStreamMeta* pMeta, int64_t ver, SStreamTask* pTa
pTask->id.refId = refId = taosAddRef(streamTaskRefPool, pTask);
code = taosHashPut(pMeta->pTasksMap, &id, sizeof(id), &pTask->id.refId, sizeof(int64_t));
if (code) { // todo remove it from task list
if (code) {
stError("s-task:0x%" PRIx64 " failed to register task into meta-list, code: out of memory", id.taskId);
void* pUnused = taosArrayPop(pMeta->pTaskList);
int32_t ret = taosRemoveRef(streamTaskRefPool, refId);
if (ret != 0) {
@ -734,6 +735,9 @@ int32_t streamMetaRegisterTask(SStreamMeta* pMeta, int64_t ver, SStreamTask* pTa
}
if ((code = streamMetaSaveTask(pMeta, pTask)) != 0) {
int32_t unused = taosHashRemove(pMeta->pTasksMap, &id, sizeof(id));
void* pUnused = taosArrayPop(pMeta->pTaskList);
int32_t ret = taosRemoveRef(streamTaskRefPool, refId);
if (ret) {
stError("vgId:%d remove task refId failed, refId:%" PRId64, pMeta->vgId, refId);
@ -742,6 +746,9 @@ int32_t streamMetaRegisterTask(SStreamMeta* pMeta, int64_t ver, SStreamTask* pTa
}
if ((code = streamMetaCommit(pMeta)) != 0) {
int32_t unused = taosHashRemove(pMeta->pTasksMap, &id, sizeof(id));
void* pUnused = taosArrayPop(pMeta->pTaskList);
int32_t ret = taosRemoveRef(streamTaskRefPool, refId);
if (ret) {
stError("vgId:%d remove task refId failed, refId:%" PRId64, pMeta->vgId, refId);