diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index dff212b15c..5f322be99b 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -686,7 +686,7 @@ int32_t streamExecScanHistoryInFuture(SStreamTask* pTask, int32_t idleDuration); bool streamHistoryTaskSetVerRangeStep2(SStreamTask* pTask, int64_t latestVer); // checkpoint related -int32_t streamTaskGetActiveCheckpointInfo(const SStreamTask* pTask, int32_t* pTransId, int64_t* pCheckpointId); +void streamTaskGetActiveCheckpointInfo(const SStreamTask* pTask, int32_t* pTransId, int64_t* pCheckpointId); int32_t streamTaskSetActiveCheckpointInfo(SStreamTask* pTask, int64_t activeCheckpointId); int32_t streamTaskSetFailedChkptInfo(SStreamTask* pTask, int32_t transId, int64_t checkpointId); bool streamTaskAlreadySendTrigger(SStreamTask* pTask, int32_t downstreamNodeId); @@ -770,9 +770,9 @@ bool streamMetaAllTasksReady(const SStreamMeta* pMeta); int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask); // timer -tmr_h streamTimerGetInstance(); -void streamTmrReset(TAOS_TMR_CALLBACK fp, int32_t mseconds, void* param, void* handle, tmr_h* pTmrId, int32_t vgId, - const char* pMsg); +int32_t streamTimerGetInstance(tmr_h* pTmr); +void streamTmrReset(TAOS_TMR_CALLBACK fp, int32_t mseconds, void* param, void* handle, tmr_h* pTmrId, int32_t vgId, + const char* pMsg); // checkpoint int32_t streamProcessCheckpointSourceReq(SStreamTask* pTask, SStreamCheckpointSourceReq* pReq); @@ -809,6 +809,9 @@ void streamTaskSendRetrieveRsp(SStreamRetrieveReq* pReq, SRpcMsg* pRsp); int32_t streamProcessHeartbeatRsp(SStreamMeta* pMeta, SMStreamHbRspMsg* pRsp); int32_t streamTaskSendCheckpointsourceRsp(SStreamTask* pTask); +void streamMutexLock(TdThreadMutex *pMutex); +void streamMutexUnlock(TdThreadMutex *pMutex); +void streamMutexDestroy(TdThreadMutex *pMutex); #ifdef __cplusplus } diff --git a/source/common/src/tdatablock.c b/source/common/src/tdatablock.c index d12db23a43..6746d0343b 100644 --- a/source/common/src/tdatablock.c +++ b/source/common/src/tdatablock.c @@ -2491,7 +2491,7 @@ int32_t buildCtbNameByGroupIdImpl(const char* stbFullName, uint64_t groupId, cha .tags = tags, .stbFullName = stbFullName, .stbFullNameLen = strlen(stbFullName), .ctbShortName = cname}; int32_t code = buildChildTableName(&rname); - if(code != TSDB_CODE_SUCCESS){ + if (code != TSDB_CODE_SUCCESS) { return code; } taosArrayDestroy(tags); diff --git a/source/common/src/tmisce.c b/source/common/src/tmisce.c index 7b349e91b0..154fcc3f6b 100644 --- a/source/common/src/tmisce.c +++ b/source/common/src/tmisce.c @@ -150,7 +150,6 @@ int32_t epsetToStr(const SEpSet* pEpSet, char* pBuf, int32_t cap) { cap -= nwrite; for (int _i = 0; (_i < pEpSet->numOfEps) && (cap > 0); _i++) { - int32_t ret = 0; if (_i == pEpSet->numOfEps - 1) { ret = snprintf(pBuf + nwrite, cap, "%d. %s:%d", _i, pEpSet->eps[_i].fqdn, pEpSet->eps[_i].port); } else { diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index 0b6b6a9ef2..bd0d97e34d 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -96,57 +96,58 @@ typedef struct STaskChkptInfo { int8_t dropHTask; }STaskChkptInfo; -int32_t mndInitStream(SMnode *pMnode); -void mndCleanupStream(SMnode *pMnode); -SStreamObj *mndAcquireStream(SMnode *pMnode, char *streamName); -void mndReleaseStream(SMnode *pMnode, SStreamObj *pStream); -int32_t mndDropStreamByDb(SMnode *pMnode, STrans *pTrans, SDbObj *pDb); -int32_t mndPersistStream(STrans *pTrans, SStreamObj *pStream); -int32_t mndStreamRegisterTrans(STrans *pTrans, const char *pTransName, int64_t streamId); -int32_t mndStreamClearFinishedTrans(SMnode *pMnode, int32_t *pNumOfActiveChkpt); -bool mndStreamTransConflictCheck(SMnode *pMnode, int64_t streamId, const char *pTransName, bool lock); -int32_t mndStreamGetRelTrans(SMnode *pMnode, int64_t streamId); +int32_t mndInitStream(SMnode *pMnode); +void mndCleanupStream(SMnode *pMnode); +int32_t mndAcquireStream(SMnode *pMnode, char *streamName, SStreamObj **pStream); +void mndReleaseStream(SMnode *pMnode, SStreamObj *pStream); +int32_t mndDropStreamByDb(SMnode *pMnode, STrans *pTrans, SDbObj *pDb); +int32_t mndPersistStream(STrans *pTrans, SStreamObj *pStream); +int32_t mndStreamRegisterTrans(STrans *pTrans, const char *pTransName, int64_t streamId); +int32_t mndStreamClearFinishedTrans(SMnode *pMnode, int32_t *pNumOfActiveChkpt); +bool mndStreamTransConflictCheck(SMnode *pMnode, int64_t streamId, const char *pTransName, bool lock); +int32_t mndStreamGetRelTrans(SMnode *pMnode, int64_t streamId); int32_t mndGetNumOfStreams(SMnode *pMnode, char *dbName, int32_t *pNumOfStreams); int32_t mndGetNumOfStreamTasks(const SStreamObj *pStream); -SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady); +int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList); void mndKillTransImpl(SMnode *pMnode, int32_t transId, const char *pDbName); int32_t setTransAction(STrans *pTrans, void *pCont, int32_t contLen, int32_t msgType, const SEpSet *pEpset, int32_t retryCode, int32_t acceptCode); -STrans *doCreateTrans(SMnode *pMnode, SStreamObj *pStream, SRpcMsg *pReq, ETrnConflct conflict, const char *name, const char *pMsg); +int32_t doCreateTrans(SMnode *pMnode, SStreamObj *pStream, SRpcMsg *pReq, ETrnConflct conflict, const char *name, + const char *pMsg, STrans **pTrans1); int32_t mndPersistTransLog(SStreamObj *pStream, STrans *pTrans, int32_t status); SSdbRaw *mndStreamActionEncode(SStreamObj *pStream); void killAllCheckpointTrans(SMnode *pMnode, SVgroupChangeInfo *pChangeInfo); int32_t mndStreamSetUpdateEpsetAction(SMnode *pMnode, SStreamObj *pStream, SVgroupChangeInfo *pInfo, STrans *pTrans); -SStreamObj *mndGetStreamObj(SMnode *pMnode, int64_t streamId); -int32_t extractNodeEpset(SMnode *pMnode, SEpSet *pEpSet, bool *hasEpset, int32_t taskId, int32_t nodeId); -int32_t mndProcessStreamHb(SRpcMsg *pReq); -void saveTaskAndNodeInfoIntoBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode); -int32_t extractStreamNodeList(SMnode *pMnode); -int32_t mndStreamSetResumeAction(STrans *pTrans, SMnode *pMnode, SStreamObj *pStream, int8_t igUntreated); -int32_t mndStreamSetPauseAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); -int32_t mndStreamSetDropAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); -int32_t mndStreamSetDropActionFromList(SMnode *pMnode, STrans *pTrans, SArray *pList); -int32_t mndStreamSetResetTaskAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); -int32_t mndCreateStreamResetStatusTrans(SMnode *pMnode, SStreamObj *pStream); -int32_t mndStreamSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); -int32_t mndCreateStreamChkptInfoUpdateTrans(SMnode *pMnode, SStreamObj *pStream, SArray *pChkptInfoList); -int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq); -int32_t mndCreateSetConsensusChkptIdTrans(SMnode *pMnode, SStreamObj *pStream, int32_t taskId, int64_t checkpointId, - int64_t ts); -void removeTasksInBuf(SArray *pTaskIds, SStreamExecInfo *pExecInfo); +int32_t mndGetStreamObj(SMnode *pMnode, int64_t streamId, SStreamObj** pStream); +int32_t extractNodeEpset(SMnode *pMnode, SEpSet *pEpSet, bool *hasEpset, int32_t taskId, int32_t nodeId); +int32_t mndProcessStreamHb(SRpcMsg *pReq); +void saveTaskAndNodeInfoIntoBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode); +int32_t extractStreamNodeList(SMnode *pMnode); +int32_t mndStreamSetResumeAction(STrans *pTrans, SMnode *pMnode, SStreamObj *pStream, int8_t igUntreated); +int32_t mndStreamSetPauseAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); +int32_t mndStreamSetDropAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); +int32_t mndStreamSetDropActionFromList(SMnode *pMnode, STrans *pTrans, SArray *pList); +int32_t mndStreamSetResetTaskAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); +int32_t mndCreateStreamResetStatusTrans(SMnode *pMnode, SStreamObj *pStream); +int32_t mndStreamSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); +int32_t mndCreateStreamChkptInfoUpdateTrans(SMnode *pMnode, SStreamObj *pStream, SArray *pChkptInfoList); +int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq); +int32_t mndCreateSetConsensusChkptIdTrans(SMnode *pMnode, SStreamObj *pStream, int32_t taskId, int64_t checkpointId, + int64_t ts); +void removeTasksInBuf(SArray *pTaskIds, SStreamExecInfo *pExecInfo); -SStreamTaskIter *createStreamTaskIter(SStreamObj *pStream); -void destroyStreamTaskIter(SStreamTaskIter *pIter); -bool streamTaskIterNextTask(SStreamTaskIter *pIter); -SStreamTask *streamTaskIterGetCurrent(SStreamTaskIter *pIter); -void mndInitExecInfo(); -void mndInitStreamExecInfo(SMnode *pMnode, SStreamExecInfo *pExecInfo); -int32_t removeExpiredNodeEntryAndTaskInBuf(SArray *pNodeSnapshot); -void removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode); +int32_t createStreamTaskIter(SStreamObj *pStream, SStreamTaskIter **pIter); +void destroyStreamTaskIter(SStreamTaskIter *pIter); +bool streamTaskIterNextTask(SStreamTaskIter *pIter); +int32_t streamTaskIterGetCurrent(SStreamTaskIter *pIter, SStreamTask **pTask); +int32_t mndInitExecInfo(); +void mndInitStreamExecInfo(SMnode *pMnode, SStreamExecInfo *pExecInfo); +int32_t removeExpiredNodeEntryAndTaskInBuf(SArray *pNodeSnapshot); +void removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode); -SCheckpointConsensusInfo *mndGetConsensusInfo(SHashObj *pHash, int64_t streamId, int32_t numOfTasks); +int32_t mndGetConsensusInfo(SHashObj *pHash, int64_t streamId, int32_t numOfTasks, SCheckpointConsensusInfo **pInfo); void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo); void mndClearConsensusRspEntry(SCheckpointConsensusInfo *pInfo); int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId); diff --git a/source/dnode/mnode/impl/src/mndSma.c b/source/dnode/mnode/impl/src/mndSma.c index 108bafeb09..704b4a5ea9 100644 --- a/source/dnode/mnode/impl/src/mndSma.c +++ b/source/dnode/mnode/impl/src/mndSma.c @@ -815,8 +815,8 @@ static int32_t mndProcessCreateSmaReq(SRpcMsg *pReq) { char streamName[TSDB_TABLE_FNAME_LEN] = {0}; mndGetStreamNameFromSmaName(streamName, createReq.name); - pStream = mndAcquireStream(pMnode, streamName); - if (pStream != NULL) { + code = mndAcquireStream(pMnode, streamName, &pStream); + if (pStream != NULL || code == 0) { mError("sma:%s, failed to create since stream:%s already exist", createReq.name, streamName); code = TSDB_CODE_MND_STREAM_ALREADY_EXIST; goto _OVER; @@ -991,8 +991,10 @@ static int32_t mndDropSma(SMnode *pMnode, SRpcMsg *pReq, SDbObj *pDb, SSmaObj *p char streamName[TSDB_TABLE_FNAME_LEN] = {0}; mndGetStreamNameFromSmaName(streamName, pSma->name); - SStreamObj *pStream = mndAcquireStream(pMnode, streamName); - if (pStream == NULL || pStream->smaId != pSma->uid) { + SStreamObj *pStream = NULL; + + code = mndAcquireStream(pMnode, streamName, &pStream); + if (pStream == NULL || pStream->smaId != pSma->uid || code != 0) { sdbRelease(pMnode->pSdb, pStream); goto _OVER; } else { @@ -1050,10 +1052,11 @@ int32_t mndDropSmasByStb(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SStbObj *p char streamName[TSDB_TABLE_FNAME_LEN] = {0}; mndGetStreamNameFromSmaName(streamName, pSma->name); - SStreamObj *pStream = mndAcquireStream(pMnode, streamName); - if (pStream != NULL && pStream->smaId == pSma->uid) { + SStreamObj *pStream = NULL; + code = mndAcquireStream(pMnode, streamName, &pStream); + if ((pStream != NULL && pStream->smaId == pSma->uid) || code != 0) { if ((code = mndStreamSetDropAction(pMnode, pTrans, pStream)) < 0) { - mError("stream:%s, failed to drop task since %s", pStream->name, tstrerror(code)); + mError("stream:%s, failed to drop task since %s", pStream->name, terrstr()); mndReleaseStream(pMnode, pStream); goto _OVER; } @@ -1800,6 +1803,7 @@ static int32_t mndProcessCreateTSMAReq(SRpcMsg* pReq) { code = 0; goto _OVER; } + if (pSma) { code = TSDB_CODE_MND_SMA_ALREADY_EXIST; goto _OVER; @@ -1813,8 +1817,8 @@ static int32_t mndProcessCreateTSMAReq(SRpcMsg* pReq) { goto _OVER; } - pStream = mndAcquireStream(pMnode, streamName); - if (pStream != NULL) { + code = mndAcquireStream(pMnode, streamName, &pStream); + if (pStream != NULL || code != TSDB_CODE_MND_STREAM_NOT_EXIST) { mError("tsma:%s, failed to create since stream:%s already exist", createReq.name, streamName); code = TSDB_CODE_MND_SMA_ALREADY_EXIST; goto _OVER; @@ -2292,7 +2296,7 @@ static int32_t mndGetSomeTsmas(SMnode* pMnode, STableTSMAInfoRsp* pRsp, tsmaFilt SSmaObj * pBaseTsma = NULL; SSdb * pSdb = pMnode->pSdb; void * pIter = NULL; - SStreamObj * pStreamObj = NULL; + SStreamObj * pStream = NULL; SStbObj * pStb = NULL; while (1) { @@ -2314,14 +2318,16 @@ static int32_t mndGetSomeTsmas(SMnode* pMnode, STableTSMAInfoRsp* pRsp, tsmaFilt char streamName[TSDB_TABLE_FNAME_LEN] = {0}; tNameFromString(&smaName, pSma->name, T_NAME_ACCT | T_NAME_DB | T_NAME_TABLE); sprintf(streamName, "%d.%s", smaName.acctId, smaName.tname); - pStreamObj = mndAcquireStream(pMnode, streamName); - if (!pStreamObj) { + pStream = NULL; + + code = mndAcquireStream(pMnode, streamName, &pStream); + if (!pStream || (code != 0)) { sdbRelease(pSdb, pSma); continue; } - int64_t streamId = pStreamObj->uid; - mndReleaseStream(pMnode, pStreamObj); + int64_t streamId = pStream->uid; + mndReleaseStream(pMnode, pStream); STableTSMAInfo *pTsma = taosMemoryCalloc(1, sizeof(STableTSMAInfo)); if (!pTsma) { diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index d57dc6e52e..df8800aee4 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -134,17 +134,18 @@ int32_t mndInitStream(SMnode *pMnode) { mndAddShowRetrieveHandle(pMnode, TSDB_MGMT_TABLE_STREAM_TASKS, mndRetrieveStreamTask); mndAddShowFreeIterHandle(pMnode, TSDB_MGMT_TABLE_STREAM_TASKS, mndCancelGetNextStreamTask); - mndInitExecInfo(); - - if (sdbSetTable(pMnode->pSdb, table) != 0) { - return -1; + int32_t code = mndInitExecInfo(); + if (code) { + return code; } - if (sdbSetTable(pMnode->pSdb, tableSeq) != 0) { - return -1; + code = sdbSetTable(pMnode->pSdb, table); + if (code) { + return terrno; } - return 0; + code = sdbSetTable(pMnode->pSdb, tableSeq); + return code; } void mndCleanupStream(SMnode *pMnode) { @@ -251,13 +252,15 @@ static int32_t mndStreamActionUpdate(SSdb *pSdb, SStreamObj *pOldStream, SStream return 0; } -SStreamObj *mndAcquireStream(SMnode *pMnode, char *streamName) { - SSdb *pSdb = pMnode->pSdb; - SStreamObj *pStream = sdbAcquire(pSdb, SDB_STREAM, streamName); - if (pStream == NULL && terrno == TSDB_CODE_SDB_OBJ_NOT_THERE) { +int32_t mndAcquireStream(SMnode *pMnode, char *streamName, SStreamObj **pStream) { + terrno = 0; + + SSdb *pSdb = pMnode->pSdb; + (*pStream) = sdbAcquire(pSdb, SDB_STREAM, streamName); + if ((*pStream) == NULL && terrno == TSDB_CODE_SDB_OBJ_NOT_THERE) { terrno = TSDB_CODE_MND_STREAM_NOT_EXIST; } - return pStream; + return terrno; } void mndReleaseStream(SMnode *pMnode, SStreamObj *pStream) { @@ -530,9 +533,21 @@ int32_t mndPersistTaskDeployReq(STrans *pTrans, SStreamTask *pTask) { } int32_t mndPersistStreamTasks(STrans *pTrans, SStreamObj *pStream) { - SStreamTaskIter *pIter = createStreamTaskIter(pStream); + SStreamTaskIter *pIter = NULL; + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + mError("failed to create task iter for stream:%s", pStream->name); + return code; + } + while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { + destroyStreamTaskIter(pIter); + return code; + } + if (mndPersistTaskDeployReq(pTrans, pTask) < 0) { destroyStreamTaskIter(pIter); return -1; @@ -706,7 +721,7 @@ static int32_t mndProcessCreateStreamReq(SRpcMsg *pReq) { char *sql = NULL; int32_t sqlLen = 0; const char *pMsg = "create stream tasks on dnodes"; - + int32_t code = 0; terrno = TSDB_CODE_SUCCESS; SCMCreateStreamReq createReq = {0}; @@ -726,8 +741,8 @@ static int32_t mndProcessCreateStreamReq(SRpcMsg *pReq) { goto _OVER; } - pStream = mndAcquireStream(pMnode, createReq.name); - if (pStream != NULL) { + code = mndAcquireStream(pMnode, createReq.name, &pStream); + if (pStream != NULL || code == 0) { if (createReq.igExists) { mInfo("stream:%s, already exist, ignore exist is set", createReq.name); goto _OVER; @@ -760,8 +775,9 @@ static int32_t mndProcessCreateStreamReq(SRpcMsg *pReq) { goto _OVER; } - STrans *pTrans = doCreateTrans(pMnode, &streamObj, pReq, TRN_CONFLICT_DB, MND_STREAM_CREATE_NAME, pMsg); - if (pTrans == NULL) { + STrans *pTrans = NULL; + code = doCreateTrans(pMnode, &streamObj, pReq, TRN_CONFLICT_DB, MND_STREAM_CREATE_NAME, pMsg, &pTrans); + if (pTrans == NULL || code) { goto _OVER; } @@ -802,11 +818,10 @@ static int32_t mndProcessCreateStreamReq(SRpcMsg *pReq) { // add into buffer firstly // to make sure when the hb from vnode arrived, the newly created tasks have been in the task map already. - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); mDebug("stream stream:%s start to register tasks into task nodeList and set initial checkpointId", createReq.name); saveTaskAndNodeInfoIntoBuf(&streamObj, &execInfo); -// mndRegisterConsensusChkptId(execInfo.pStreamConsensus, streamObj.uid); - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); // execute creation if (mndTransPrepare(pMnode, pTrans) != 0) { @@ -867,7 +882,7 @@ int64_t mndStreamGenChkptId(SMnode *pMnode, bool lock) { { // check the max checkpoint id from all vnodes. int64_t maxCheckpointId = -1; if (lock) { - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); } for (int32_t i = 0; i < taosArrayGetSize(execInfo.pTaskList); ++i) { @@ -888,7 +903,7 @@ int64_t mndStreamGenChkptId(SMnode *pMnode, bool lock) { } if (lock) { - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); } if (maxCheckpointId > maxChkptId) { @@ -989,11 +1004,13 @@ static int32_t mndProcessStreamCheckpointTrans(SMnode *pMnode, SStreamObj *pStre return -1; } - STrans *pTrans = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_CHECKPOINT_NAME, - "gen checkpoint for stream"); - if (pTrans == NULL) { + STrans *pTrans = NULL; + code = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_CHECKPOINT_NAME, + "gen checkpoint for stream", &pTrans); + if (pTrans == NULL || code) { + code = TSDB_CODE_MND_TRANS_CONFLICT; mError("failed to checkpoint of stream name%s, checkpointId: %" PRId64 ", reason:%s", pStream->name, checkpointId, - tstrerror(TSDB_CODE_MND_TRANS_CONFLICT)); + tstrerror(code)); goto _ERR; } @@ -1033,7 +1050,7 @@ static int32_t mndProcessStreamCheckpointTrans(SMnode *pMnode, SStreamObj *pStre taosWUnLockLatch(&pStream->lock); if ((code = mndPersistTransLog(pStream, pTrans, SDB_STATUS_READY)) != TSDB_CODE_SUCCESS) { - return code; + goto _ERR; } if ((code = mndTransPrepare(pMnode, pTrans)) != TSDB_CODE_SUCCESS) { @@ -1057,13 +1074,13 @@ int32_t extractStreamNodeList(SMnode *pMnode) { static bool taskNodeIsUpdated(SMnode *pMnode) { // check if the node update happens or not - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); int32_t numOfNodes = extractStreamNodeList(pMnode); if (numOfNodes == 0) { mDebug("stream task node change checking done, no vgroups exist, do nothing"); execInfo.ts = taosGetTimestampSec(); - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); return false; } @@ -1071,17 +1088,22 @@ static bool taskNodeIsUpdated(SMnode *pMnode) { SNodeEntry *pNodeEntry = taosArrayGet(execInfo.pNodeList, i); if (pNodeEntry->stageUpdated) { mDebug("stream task not ready due to node update detected, checkpoint not issued"); - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); return true; } } bool allReady = true; - SArray *pNodeSnapshot = mndTakeVgroupSnapshot(pMnode, &allReady); + SArray *pNodeSnapshot = NULL; + + int32_t code = mndTakeVgroupSnapshot(pMnode, &allReady, &pNodeSnapshot); + if (code) { + mError("failed to get the vgroup snapshot, ignore it and continue"); + } if (!allReady) { mWarn("not all vnodes ready, quit from vnodes status check"); taosArrayDestroy(pNodeSnapshot); - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); return true; } @@ -1097,7 +1119,7 @@ static bool taskNodeIsUpdated(SMnode *pMnode) { mDebug("stream tasks not ready due to node update"); } - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); return nodeUpdated; } @@ -1107,7 +1129,7 @@ static int32_t mndCheckTaskAndNodeStatus(SMnode *pMnode) { return -1; } - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); if (taosArrayGetSize(execInfo.pNodeList) == 0) { mDebug("stream task node change checking done, no vgroups exist, do nothing"); ASSERT(taosArrayGetSize(execInfo.pTaskList) == 0); @@ -1152,7 +1174,7 @@ static int32_t mndCheckTaskAndNodeStatus(SMnode *pMnode) { removeTasksInBuf(pInvalidList, &execInfo); taosArrayDestroy(pInvalidList); - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); return ready ? 0 : -1; } @@ -1215,14 +1237,14 @@ static int32_t mndProcessStreamCheckpoint(SRpcMsg *pReq) { continue; } - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); int64_t startTs = getStreamTaskLastReadyState(execInfo.pTaskList, pStream->uid); if (startTs != -1 && (now - startTs) < tsStreamCheckpointInterval * 1000) { - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); sdbRelease(pSdb, pStream); continue; } - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); SCheckpointInterval in = {.streamId = pStream->uid, .duration = duration}; taosArrayPush(pList, &in); @@ -1265,8 +1287,9 @@ static int32_t mndProcessStreamCheckpoint(SRpcMsg *pReq) { for (int32_t i = 0; i < numOfQual; ++i) { SCheckpointInterval *pCheckpointInfo = taosArrayGet(pList, i); - SStreamObj *p = mndGetStreamObj(pMnode, pCheckpointInfo->streamId); - if (p != NULL) { + SStreamObj *p = NULL; + code = mndGetStreamObj(pMnode, pCheckpointInfo->streamId, &p); + if (p != NULL && code == 0) { code = mndProcessStreamCheckpointTrans(pMnode, p, checkpointId, 1, true); sdbRelease(pSdb, p); @@ -1289,6 +1312,7 @@ static int32_t mndProcessStreamCheckpoint(SRpcMsg *pReq) { static int32_t mndProcessDropStreamReq(SRpcMsg *pReq) { SMnode *pMnode = pReq->info.node; SStreamObj *pStream = NULL; + int32_t code = 0; SMDropStreamReq dropReq = {0}; if (tDeserializeSMDropStreamReq(pReq->pCont, pReq->contLen, &dropReq) < 0) { @@ -1299,8 +1323,8 @@ static int32_t mndProcessDropStreamReq(SRpcMsg *pReq) { mDebug("recv drop stream:%s msg", dropReq.name); - pStream = mndAcquireStream(pMnode, dropReq.name); - if (pStream == NULL) { + code = mndAcquireStream(pMnode, dropReq.name, &pStream); + if (pStream == NULL || code != 0) { if (dropReq.igNotExists) { mInfo("stream:%s not exist, ignore not exist is set, drop stream exec done with success", dropReq.name); sdbRelease(pMnode->pSdb, pStream); @@ -1356,15 +1380,16 @@ static int32_t mndProcessDropStreamReq(SRpcMsg *pReq) { return -1; } - STrans *pTrans = doCreateTrans(pMnode, pStream, pReq, TRN_CONFLICT_NOTHING, MND_STREAM_DROP_NAME, "drop stream"); - if (pTrans == NULL) { + STrans *pTrans = NULL; + code = doCreateTrans(pMnode, pStream, pReq, TRN_CONFLICT_NOTHING, MND_STREAM_DROP_NAME, "drop stream", &pTrans); + if (pTrans == NULL || code) { mError("stream:%s uid:0x%" PRIx64 " failed to drop since %s", dropReq.name, pStream->uid, terrstr()); sdbRelease(pMnode->pSdb, pStream); tFreeMDropStreamReq(&dropReq); return -1; } - int32_t code = mndStreamRegisterTrans(pTrans, MND_STREAM_DROP_NAME, pStream->uid); + code = mndStreamRegisterTrans(pTrans, MND_STREAM_DROP_NAME, pStream->uid); // drop all tasks if (mndStreamSetDropAction(pMnode, pTrans, pStream) < 0) { @@ -1857,9 +1882,9 @@ static int32_t mndRetrieveStreamTask(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock int32_t numOfRows = 0; SStreamObj *pStream = NULL; - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); mndInitStreamExecInfo(pMnode, &execInfo); - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); while (numOfRows < rowsCapacity) { pShow->pIter = sdbFetch(pSdb, SDB_STREAM, pShow->pIter, (void **)&pStream); @@ -1876,11 +1901,24 @@ static int32_t mndRetrieveStreamTask(SRpcMsg *pReq, SShowObj *pShow, SSDataBlock } // add row for each task - SStreamTaskIter *pIter = createStreamTaskIter(pStream); - while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); + SStreamTaskIter *pIter = NULL; + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + taosRUnLockLatch(&pStream->lock); + sdbRelease(pSdb, pStream); + mError("failed to create task iter for stream:%s", pStream->name); + continue; + } - int32_t code = setTaskAttrInResBlock(pStream, pTask, pBlock, numOfRows); + while (streamTaskIterNextTask(pIter)) { + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { + destroyStreamTaskIter(pIter); + break; + } + + code = setTaskAttrInResBlock(pStream, pTask, pBlock, numOfRows); if (code == TSDB_CODE_SUCCESS) { numOfRows++; } @@ -1906,6 +1944,7 @@ static void mndCancelGetNextStreamTask(SMnode *pMnode, void *pIter) { static int32_t mndProcessPauseStreamReq(SRpcMsg *pReq) { SMnode *pMnode = pReq->info.node; SStreamObj *pStream = NULL; + int32_t code = 0; SMPauseStreamReq pauseReq = {0}; if (tDeserializeSMPauseStreamReq(pReq->pCont, pReq->contLen, &pauseReq) < 0) { @@ -1913,9 +1952,8 @@ static int32_t mndProcessPauseStreamReq(SRpcMsg *pReq) { return -1; } - pStream = mndAcquireStream(pMnode, pauseReq.name); - - if (pStream == NULL) { + code = mndAcquireStream(pMnode, pauseReq.name, &pStream); + if (pStream == NULL || code != 0) { if (pauseReq.igNotExists) { mInfo("stream:%s, not exist, not pause stream", pauseReq.name); return 0; @@ -1955,7 +1993,7 @@ static int32_t mndProcessPauseStreamReq(SRpcMsg *pReq) { { // check for tasks, if tasks are not ready, not allowed to pause bool found = false; bool readyToPause = true; - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); for (int32_t i = 0; i < taosArrayGetSize(execInfo.pTaskList); ++i) { STaskId *p = taosArrayGet(execInfo.pTaskList, i); @@ -1978,7 +2016,7 @@ static int32_t mndProcessPauseStreamReq(SRpcMsg *pReq) { found = true; } - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); if (!found) { mError("stream:%s task not report status yet, not ready for pause", pauseReq.name); sdbRelease(pMnode->pSdb, pStream); @@ -1992,42 +2030,49 @@ static int32_t mndProcessPauseStreamReq(SRpcMsg *pReq) { } } - STrans *pTrans = - doCreateTrans(pMnode, pStream, pReq, TRN_CONFLICT_NOTHING, MND_STREAM_PAUSE_NAME, "pause the stream"); - if (pTrans == NULL) { + STrans *pTrans = NULL; + code = doCreateTrans(pMnode, pStream, pReq, TRN_CONFLICT_NOTHING, MND_STREAM_PAUSE_NAME, "pause the stream", &pTrans); + if (pTrans == NULL || code) { mError("stream:%s failed to pause stream since %s", pauseReq.name, terrstr()); sdbRelease(pMnode->pSdb, pStream); - return -1; + return code; } - int32_t code = mndStreamRegisterTrans(pTrans, MND_STREAM_PAUSE_NAME, pStream->uid); + code = mndStreamRegisterTrans(pTrans, MND_STREAM_PAUSE_NAME, pStream->uid); + if (code) { + sdbRelease(pMnode->pSdb, pStream); + mndTransDrop(pTrans); + return code; + } // if nodeUpdate happened, not send pause trans - if (mndStreamSetPauseAction(pMnode, pTrans, pStream) < 0) { + code = mndStreamSetPauseAction(pMnode, pTrans, pStream); + if (code) { mError("stream:%s, failed to pause task since %s", pauseReq.name, terrstr()); sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } // pause stream taosWLockLatch(&pStream->lock); pStream->status = STREAM_STATUS__PAUSE; - if (mndPersistTransLog(pStream, pTrans, SDB_STATUS_READY) < 0) { + code = mndPersistTransLog(pStream, pTrans, SDB_STATUS_READY); + if (code) { taosWUnLockLatch(&pStream->lock); - sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } taosWUnLockLatch(&pStream->lock); - if (mndTransPrepare(pMnode, pTrans) != 0) { + code = mndTransPrepare(pMnode, pTrans); + if (code) { mError("trans:%d, failed to prepare pause stream trans since %s", pTrans->id, terrstr()); sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } sdbRelease(pMnode->pSdb, pStream); @@ -2039,6 +2084,7 @@ static int32_t mndProcessPauseStreamReq(SRpcMsg *pReq) { static int32_t mndProcessResumeStreamReq(SRpcMsg *pReq) { SMnode *pMnode = pReq->info.node; SStreamObj *pStream = NULL; + int32_t code = 0; if ((terrno = grantCheckExpire(TSDB_GRANT_STREAMS)) < 0) { return -1; @@ -2050,9 +2096,8 @@ static int32_t mndProcessResumeStreamReq(SRpcMsg *pReq) { return -1; } - pStream = mndAcquireStream(pMnode, resumeReq.name); - - if (pStream == NULL) { + code = mndAcquireStream(pMnode, resumeReq.name, &pStream); + if (pStream == NULL || code != 0) { if (resumeReq.igNotExists) { mInfo("stream:%s not exist, not resume stream", resumeReq.name); sdbRelease(pMnode->pSdb, pStream); @@ -2081,22 +2126,28 @@ static int32_t mndProcessResumeStreamReq(SRpcMsg *pReq) { return -1; } - STrans *pTrans = - doCreateTrans(pMnode, pStream, pReq, TRN_CONFLICT_NOTHING, MND_STREAM_RESUME_NAME, "resume the stream"); - if (pTrans == NULL) { + STrans *pTrans = NULL; + code = + doCreateTrans(pMnode, pStream, pReq, TRN_CONFLICT_NOTHING, MND_STREAM_RESUME_NAME, "resume the stream", &pTrans); + if (pTrans == NULL || code) { mError("stream:%s, failed to resume stream since %s", resumeReq.name, terrstr()); sdbRelease(pMnode->pSdb, pStream); - return -1; + return code; } - int32_t code = mndStreamRegisterTrans(pTrans, MND_STREAM_RESUME_NAME, pStream->uid); + code = mndStreamRegisterTrans(pTrans, MND_STREAM_RESUME_NAME, pStream->uid); + if (code) { + sdbRelease(pMnode->pSdb, pStream); + mndTransDrop(pTrans); + return code; + } // set the resume action if (mndStreamSetResumeAction(pTrans, pMnode, pStream, resumeReq.igUntreated) < 0) { mError("stream:%s, failed to drop task since %s", resumeReq.name, terrstr()); sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } // resume stream @@ -2107,7 +2158,7 @@ static int32_t mndProcessResumeStreamReq(SRpcMsg *pReq) { sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } taosWUnLockLatch(&pStream->lock); @@ -2115,7 +2166,7 @@ static int32_t mndProcessResumeStreamReq(SRpcMsg *pReq) { mError("trans:%d, failed to prepare pause stream trans since %s", pTrans->id, terrstr()); sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } sdbRelease(pMnode->pSdb, pStream); @@ -2189,6 +2240,7 @@ static int32_t mndProcessVgroupChange(SMnode *pMnode, SVgroupChangeInfo *pChange SStreamObj *pStream = NULL; void *pIter = NULL; STrans *pTrans = NULL; + int32_t code = 0; // conflict check for nodeUpdate trans, here we randomly chose one stream to add into the trans pool while (1) { @@ -2215,12 +2267,11 @@ static int32_t mndProcessVgroupChange(SMnode *pMnode, SVgroupChangeInfo *pChange // here create only one trans if (pTrans == NULL) { - pTrans = - doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_TASK_UPDATE_NAME, "update task epsets"); - if (pTrans == NULL) { + code = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_TASK_UPDATE_NAME, "update task epsets", &pTrans); + if (pTrans == NULL || code) { sdbRelease(pSdb, pStream); sdbCancelFetch(pSdb, pIter); - return terrno; + return terrno = code; } mndStreamRegisterTrans(pTrans, MND_STREAM_TASK_UPDATE_NAME, pStream->uid); @@ -2237,7 +2288,7 @@ static int32_t mndProcessVgroupChange(SMnode *pMnode, SVgroupChangeInfo *pChange mDebug("stream:0x%" PRIx64 " %s involved node changed, create update trans, transId:%d", pStream->uid, pStream->name, pTrans->id); - int32_t code = mndStreamSetUpdateEpsetAction(pMnode, pStream, pChangeInfo, pTrans); + code = mndStreamSetUpdateEpsetAction(pMnode, pStream, pChangeInfo, pTrans); // todo: not continue, drop all and retry again if (code != TSDB_CODE_SUCCESS) { @@ -2252,7 +2303,7 @@ static int32_t mndProcessVgroupChange(SMnode *pMnode, SVgroupChangeInfo *pChange if (code != TSDB_CODE_SUCCESS) { sdbCancelFetch(pSdb, pIter); - return -1; + return code; } } @@ -2261,16 +2312,17 @@ static int32_t mndProcessVgroupChange(SMnode *pMnode, SVgroupChangeInfo *pChange return 0; } - if (mndTransPrepare(pMnode, pTrans) != 0) { + code = mndTransPrepare(pMnode, pTrans); + if (code) { mError("trans:%d, failed to prepare update stream trans since %s", pTrans->id, terrstr()); sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return 0; + return code; } static int32_t extractNodeListFromStream(SMnode *pMnode, SArray *pNodeList) { @@ -2287,9 +2339,21 @@ static int32_t extractNodeListFromStream(SMnode *pMnode, SArray *pNodeList) { taosWLockLatch(&pStream->lock); - SStreamTaskIter *pTaskIter = createStreamTaskIter(pStream); + SStreamTaskIter *pTaskIter = NULL; + int32_t code = createStreamTaskIter(pStream, &pTaskIter); + if (code) { + taosWUnLockLatch(&pStream->lock); + sdbRelease(pSdb, pStream); + mError("failed to create task iter for stream:%s", pStream->name); + continue; + } + while (streamTaskIterNextTask(pTaskIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pTaskIter); + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pTaskIter, &pTask); + if (code) { + break; + } SNodeEntry entry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId}; epsetAssign(&entry.epset, &pTask->info.epSet); @@ -2336,9 +2400,9 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg) { SMnode *pMnode = pMsg->info.node; - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); int32_t numOfNodes = extractStreamNodeList(pMnode); - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); if (numOfNodes == 0) { mDebug("end to do stream task(s) node change checking, no stream tasks exist, do nothing"); @@ -2348,7 +2412,13 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg) { } bool allReady = true; - SArray *pNodeSnapshot = mndTakeVgroupSnapshot(pMnode, &allReady); + SArray *pNodeSnapshot = NULL; + + code = mndTakeVgroupSnapshot(pMnode, &allReady, &pNodeSnapshot); + if (code) { + mError("failed to take the vgroup snapshot, ignore it and continue"); + } + if (!allReady) { taosArrayDestroy(pNodeSnapshot); atomic_store_32(&mndNodeCheckSentinel, 0); @@ -2356,7 +2426,7 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg) { return 0; } - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); removeExpiredNodeEntryAndTaskInBuf(pNodeSnapshot); @@ -2380,7 +2450,7 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg) { } taosArrayDestroy(pNodeSnapshot); - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); taosArrayDestroy(changeInfo.pUpdateNodeList); taosHashCleanup(changeInfo.pDBMap); @@ -2406,9 +2476,19 @@ static int32_t mndProcessNodeCheck(SRpcMsg *pReq) { } void saveTaskAndNodeInfoIntoBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode) { - SStreamTaskIter *pIter = createStreamTaskIter(pStream); + SStreamTaskIter *pIter = NULL; + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + mError("failed to create task iter for stream:%s", pStream->name); + return; + } + while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { + break; + } STaskId id = {.streamId = pTask->id.streamId, .taskId = pTask->id.taskId}; void *p = taosHashGet(pExecNode->pTaskMap, &id, sizeof(id)); @@ -2478,10 +2558,11 @@ int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq) { mDebug("receive stream task checkpoint req msg, vgId:%d, s-task:0x%x", req.nodeId, req.taskId); // register to the stream task done map, if all tasks has sent this kinds of message, start the checkpoint trans. - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); - SStreamObj *pStream = mndGetStreamObj(pMnode, req.streamId); - if (pStream == NULL) { + SStreamObj *pStream = NULL; + int32_t code = mndGetStreamObj(pMnode, req.streamId, &pStream); + if (pStream == NULL || code != 0) { mWarn("failed to find the stream:0x%" PRIx64 ", not handle the checkpoint req, try to acquire in buf", req.streamId); @@ -2492,7 +2573,7 @@ int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq) { if (p == NULL) { mError("failed to find the stream:0x%" PRIx64 " in buf, not handle the checkpoint req", req.streamId); terrno = TSDB_CODE_MND_STREAM_NOT_EXIST; - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); return -1; } else { mDebug("s-task:0x%" PRIx64 "-0x%x in buf not in mnode/meta, create stream trans may not complete yet", @@ -2537,7 +2618,7 @@ int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq) { mndReleaseStream(pMnode, pStream); } - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); { SRpcMsg rsp = {.code = 0, .info = pReq->info, .contLen = sizeof(SMStreamReqCheckpointRsp)}; @@ -2597,10 +2678,11 @@ int32_t mndProcessCheckpointReport(SRpcMsg *pReq) { req.nodeId, req.taskId, req.checkpointId, req.checkpointVer, req.transId); // register to the stream task done map, if all tasks has sent this kinds of message, start the checkpoint trans. - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); - SStreamObj *pStream = mndGetStreamObj(pMnode, req.streamId); - if (pStream == NULL) { + SStreamObj *pStream = NULL; + int32_t code = mndGetStreamObj(pMnode, req.streamId, &pStream); + if (pStream == NULL || code != 0) { mWarn("failed to find the stream:0x%" PRIx64 ", not handle checkpoint-report, try to acquire in buf", req.streamId); // not in meta-store yet, try to acquire the task in exec buffer @@ -2610,7 +2692,7 @@ int32_t mndProcessCheckpointReport(SRpcMsg *pReq) { if (p == NULL) { mError("failed to find the stream:0x%" PRIx64 " in buf, not handle the checkpoint-report", req.streamId); terrno = TSDB_CODE_MND_STREAM_NOT_EXIST; - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); return -1; } else { mDebug("s-task:0x%" PRIx64 "-0x%x in buf not in mnode/meta, create stream trans may not complete yet", @@ -2642,7 +2724,7 @@ int32_t mndProcessCheckpointReport(SRpcMsg *pReq) { mndReleaseStream(pMnode, pStream); } - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); doSendQuickRsp(&pReq->info, sizeof(SMStreamUpdateChkptRsp), req.nodeId, TSDB_CODE_SUCCESS); return 0; @@ -2707,7 +2789,7 @@ static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, // req.nodeId, req.streamId, req.taskId, req.checkpointId); // // // register to the stream task done map, if all tasks has sent this kinds of message, start the checkpoint trans. -// taosThreadMutexLock(&execInfo.lock); +// streamMutexLock(&execInfo.lock); // // // mnode handle the create stream transaction too slow may cause this problem // SStreamObj *pStream = mndGetStreamObj(pMnode, req.streamId); @@ -2721,7 +2803,7 @@ static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, // if (p == NULL) { // mError("failed to find the stream:0x%" PRIx64 " in buf, not handle consensus-checkpointId", req.streamId); // terrno = TSDB_CODE_MND_STREAM_NOT_EXIST; -// taosThreadMutexUnlock(&execInfo.lock); +// streamMutexUnlock(&execInfo.lock); // // doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); // return -1; @@ -2737,7 +2819,7 @@ static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, // // int32_t numOfTasks = (pStream == NULL) ? 0 : mndGetNumOfStreamTasks(pStream); // if ((pStream != NULL) && (pStream->checkpointId == 0)) { // not generated checkpoint yet, return 0 directly -// taosThreadMutexUnlock(&execInfo.lock); +// streamMutexUnlock(&execInfo.lock); // mndCreateSetConsensusChkptIdTrans(pMnode, pStream, req.taskId, 0, req.startTs); // // doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); @@ -2754,7 +2836,7 @@ static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, // SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); // mndAddConsensusTasks(pInfo, &req); // -// taosThreadMutexUnlock(&execInfo.lock); +// streamMutexUnlock(&execInfo.lock); // doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); // return 0; // } @@ -2764,7 +2846,7 @@ static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, // req.nodeId, req.streamId, pStream->name, chkId, pStream->checkpointId); // mndCreateSetConsensusChkptIdTrans(pMnode, pStream, req.taskId, chkId, req.startTs); // -// taosThreadMutexUnlock(&execInfo.lock); +// streamMutexUnlock(&execInfo.lock); // doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); // return 0; // } @@ -2777,7 +2859,7 @@ static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, // mndReleaseStream(pMnode, pStream); // } // -// taosThreadMutexUnlock(&execInfo.lock); +// streamMutexUnlock(&execInfo.lock); // doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); // return 0; //} @@ -2790,15 +2872,21 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { mDebug("start to process consensus-checkpointId in tmr"); bool allReady = true; - SArray *pNodeSnapshot = mndTakeVgroupSnapshot(pMnode, &allReady); + SArray *pNodeSnapshot = NULL; + + int32_t code = mndTakeVgroupSnapshot(pMnode, &allReady, &pNodeSnapshot); taosArrayDestroy(pNodeSnapshot); + if (code) { + mError("failed to get the vgroup snapshot, ignore it and continue"); + } + if (!allReady) { mWarn("not all vnodes are ready, end to process the consensus-checkpointId in tmr process"); taosArrayDestroy(pStreamList); return 0; } - taosThreadMutexLock(&execInfo.lock); + streamMutexLock(&execInfo.lock); void *pIter = NULL; while ((pIter = taosHashIterate(execInfo.pStreamConsensus, pIter)) != NULL) { @@ -2808,8 +2896,9 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { int32_t num = taosArrayGetSize(pInfo->pTaskList); SArray *pList = taosArrayInit(4, sizeof(int32_t)); - SStreamObj *pStream = mndGetStreamObj(pMnode, pInfo->streamId); - if (pStream == NULL) { // stream has been dropped already + SStreamObj *pStream = NULL; + code = mndGetStreamObj(pMnode, pInfo->streamId, &pStream); + if (pStream == NULL || code != 0) { // stream has been dropped already mDebug("stream:0x%" PRIx64 " dropped already, continue", pInfo->streamId); taosArrayDestroy(pList); continue; @@ -2868,14 +2957,14 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { for (int32_t i = 0; i < taosArrayGetSize(pStreamList); ++i) { int64_t *pStreamId = (int64_t *)taosArrayGet(pStreamList, i); - mndClearConsensusCheckpointId(execInfo.pStreamConsensus, *pStreamId); + code = mndClearConsensusCheckpointId(execInfo.pStreamConsensus, *pStreamId); } - taosThreadMutexUnlock(&execInfo.lock); + streamMutexUnlock(&execInfo.lock); taosArrayDestroy(pStreamList); mDebug("end to process consensus-checkpointId in tmr"); - return TSDB_CODE_SUCCESS; + return code; } static int32_t mndProcessCreateStreamReqFromMNode(SRpcMsg *pReq) { @@ -2926,32 +3015,41 @@ void addAllStreamTasksIntoBuf(SMnode *pMnode, SStreamExecInfo *pExecInfo) { } int32_t mndCreateStreamChkptInfoUpdateTrans(SMnode *pMnode, SStreamObj *pStream, SArray *pChkptInfoList) { - STrans *pTrans = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_CHKPT_UPDATE_NAME, - "update checkpoint-info"); - if (pTrans == NULL) { - return terrno; + STrans *pTrans = NULL; + int32_t code = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_CHKPT_UPDATE_NAME, + "update checkpoint-info", &pTrans); + if (pTrans == NULL || code) { + sdbRelease(pMnode->pSdb, pStream); + return code; } - /*int32_t code = */ mndStreamRegisterTrans(pTrans, MND_STREAM_CHKPT_UPDATE_NAME, pStream->uid); - int32_t code = mndStreamSetUpdateChkptAction(pMnode, pTrans, pStream); - if (code != 0) { + code = mndStreamRegisterTrans(pTrans, MND_STREAM_CHKPT_UPDATE_NAME, pStream->uid); + if (code){ + sdbRelease(pMnode->pSdb, pStream); + mndTransDrop(pTrans); + return code; + } + + code = mndStreamSetUpdateChkptAction(pMnode, pTrans, pStream); + if (code) { sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); return code; } code = mndPersistTransLog(pStream, pTrans, SDB_STATUS_READY); - if (code != TSDB_CODE_SUCCESS) { + if (code) { sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } - if (mndTransPrepare(pMnode, pTrans) != 0) { + code = mndTransPrepare(pMnode, pTrans); + if (code) { mError("trans:%d, failed to prepare update checkpoint-info meta trans since %s", pTrans->id, terrstr()); sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } sdbRelease(pMnode->pSdb, pStream); diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index bc10ec211d..c5297b5ba8 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -61,15 +61,23 @@ void addIntoCheckpointList(SArray *pList, const SFailedCheckpointInfo *pInfo) { } int32_t mndCreateStreamResetStatusTrans(SMnode *pMnode, SStreamObj *pStream) { - STrans *pTrans = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_TASK_RESET_NAME, - " reset from failed checkpoint"); - if (pTrans == NULL) { + STrans *pTrans = NULL; + int32_t code = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_TASK_RESET_NAME, + " reset from failed checkpoint", &pTrans); + if (pTrans == NULL || code) { + sdbRelease(pMnode->pSdb, pStream); return terrno; } - /*int32_t code = */ mndStreamRegisterTrans(pTrans, MND_STREAM_TASK_RESET_NAME, pStream->uid); - int32_t code = mndStreamSetResetTaskAction(pMnode, pTrans, pStream); - if (code != 0) { + code = mndStreamRegisterTrans(pTrans, MND_STREAM_TASK_RESET_NAME, pStream->uid); + if (code) { + sdbRelease(pMnode->pSdb, pStream); + mndTransDrop(pTrans); + return code; + } + + code = mndStreamSetResetTaskAction(pMnode, pTrans, pStream); + if (code) { sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); return code; @@ -79,14 +87,15 @@ int32_t mndCreateStreamResetStatusTrans(SMnode *pMnode, SStreamObj *pStream) { if (code != TSDB_CODE_SUCCESS) { sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } - if (mndTransPrepare(pMnode, pTrans) != 0) { + code = mndTransPrepare(pMnode, pTrans); + if (code != 0) { mError("trans:%d, failed to prepare update stream trans since %s", pTrans->id, terrstr()); sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } sdbRelease(pMnode->pSdb, pStream); @@ -99,8 +108,9 @@ int32_t mndResetStatusFromCheckpoint(SMnode *pMnode, int64_t streamId, int32_t t int32_t code = TSDB_CODE_SUCCESS; mndKillTransImpl(pMnode, transId, ""); - SStreamObj *pStream = mndGetStreamObj(pMnode, streamId); - if (pStream == NULL) { + SStreamObj *pStream = NULL; + code = mndGetStreamObj(pMnode, streamId, &pStream); + if (pStream == NULL || code != 0) { code = TSDB_CODE_STREAM_TASK_NOT_EXIST; mError("failed to acquire the streamObj:0x%" PRIx64 " to reset checkpoint, may have been dropped", pStream->uid); } else { @@ -159,34 +169,39 @@ int32_t mndDropOrphanTasks(SMnode *pMnode, SArray *pList) { } SStreamObj dummyObj = {.uid = pTask->streamId, .sourceDb = "", .targetSTbName = ""}; - STrans *pTrans = doCreateTrans(pMnode, &dummyObj, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_DROP_NAME, "drop stream"); - if (pTrans == NULL) { + STrans *pTrans = NULL; + int32_t code = + doCreateTrans(pMnode, &dummyObj, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_DROP_NAME, "drop stream", &pTrans); + if (pTrans == NULL || code != 0) { mError("failed to create trans to drop orphan tasks since %s", terrstr()); - return -1; + return code; } - int32_t code = mndStreamRegisterTrans(pTrans, MND_STREAM_DROP_NAME, pTask->streamId); - + code = mndStreamRegisterTrans(pTrans, MND_STREAM_DROP_NAME, pTask->streamId); + if (code) { + return code; + } // drop all tasks - if (mndStreamSetDropActionFromList(pMnode, pTrans, pList) < 0) { + if ((code = mndStreamSetDropActionFromList(pMnode, pTrans, pList)) < 0) { mError("failed to create trans to drop orphan tasks since %s", terrstr()); mndTransDrop(pTrans); - return -1; + return code; } // drop stream - if (mndPersistTransLog(&dummyObj, pTrans, SDB_STATUS_DROPPED) < 0) { + if ((code = mndPersistTransLog(&dummyObj, pTrans, SDB_STATUS_DROPPED)) < 0) { mndTransDrop(pTrans); - return -1; + return code; } - if (mndTransPrepare(pMnode, pTrans) != 0) { + if ((code = mndTransPrepare(pMnode, pTrans)) != 0) { mError("trans:%d, failed to prepare drop stream trans since %s", pTrans->id, terrstr()); mndTransDrop(pTrans); - return -1; + return code; } + mndTransDrop(pTrans); - return 0; + return code; } int32_t suspendAllStreams(SMnode *pMnode, SRpcHandleInfo *info) { @@ -228,10 +243,11 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { SStreamHbMsg req = {0}; SArray *pFailedChkpt = NULL; SArray *pOrphanTasks = NULL; + int32_t code = 0; - if ((terrno = grantCheckExpire(TSDB_GRANT_STREAMS)) < 0) { + if ((code = grantCheckExpire(TSDB_GRANT_STREAMS)) < 0) { if (suspendAllStreams(pMnode, &pReq->info) < 0) { - return -1; + return code; } } @@ -241,8 +257,8 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { if (tDecodeStreamHbMsg(&decoder, &req) < 0) { tCleanupStreamHbMsg(&req); tDecoderClear(&decoder); - terrno = TSDB_CODE_INVALID_MSG; - return -1; + code = terrno = TSDB_CODE_INVALID_MSG; + return code; } tDecoderClear(&decoder); @@ -257,12 +273,12 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { if (!validateHbMsg(execInfo.pNodeList, req.vgId)) { mError("vgId:%d not exists in nodeList buf, discarded", req.vgId); - terrno = TSDB_CODE_INVALID_MSG; + code = terrno = TSDB_CODE_INVALID_MSG; doSendHbMsgRsp(terrno, &pReq->info, req.vgId, req.msgId); taosThreadMutexUnlock(&execInfo.lock); cleanupAfterProcessHbMsg(&req, pFailedChkpt, pOrphanTasks); - return -1; + return code; } int32_t numOfUpdated = taosArrayGetSize(req.pUpdateNodes); @@ -293,11 +309,23 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { .startTs = pChkInfo->consensusTs, }; - SStreamObj *pStream = mndGetStreamObj(pMnode, p->id.streamId); - int32_t numOfTasks = mndGetNumOfStreamTasks(pStream); + SStreamObj *pStream = NULL; + code = mndGetStreamObj(pMnode, p->id.streamId, &pStream); + if (code) { + code = TSDB_CODE_STREAM_TASK_NOT_EXIST; + continue; + } + + int32_t numOfTasks = mndGetNumOfStreamTasks(pStream); + SCheckpointConsensusInfo *pInfo = NULL; + + code = mndGetConsensusInfo(execInfo.pStreamConsensus, p->id.streamId, numOfTasks, &pInfo); + if (code == 0) { + mndAddConsensusTasks(pInfo, &cp); + } else { + mError("failed to get consensus checkpoint-info"); + } - SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, p->id.streamId, numOfTasks); - mndAddConsensusTasks(pInfo, &cp); mndReleaseStream(pMnode, pStream); } @@ -338,9 +366,15 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { // kill the checkpoint trans and then set all tasks status to be normal if (taosArrayGetSize(pFailedChkpt) > 0) { bool allReady = true; + if (pMnode != NULL) { - SArray *p = mndTakeVgroupSnapshot(pMnode, &allReady); + SArray *p = NULL; + + code = mndTakeVgroupSnapshot(pMnode, &allReady, &p); taosArrayDestroy(p); + if (code) { + mError("failed to get the vgroup snapshot, ignore it and continue"); + } } else { allReady = false; } @@ -374,7 +408,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { doSendHbMsgRsp(terrno, &pReq->info, req.vgId, req.msgId); cleanupAfterProcessHbMsg(&req, pFailedChkpt, pOrphanTasks); - return TSDB_CODE_SUCCESS; + return terrno; } void mndStreamStartUpdateCheckpointInfo(SMnode *pMnode) { // here reuse the doCheckpointmsg diff --git a/source/dnode/mnode/impl/src/mndStreamTrans.c b/source/dnode/mnode/impl/src/mndStreamTrans.c index f252791618..c0a869fb77 100644 --- a/source/dnode/mnode/impl/src/mndStreamTrans.c +++ b/source/dnode/mnode/impl/src/mndStreamTrans.c @@ -153,27 +153,30 @@ int32_t mndStreamGetRelTrans(SMnode *pMnode, int64_t streamId) { return 0; } -STrans *doCreateTrans(SMnode *pMnode, SStreamObj *pStream, SRpcMsg *pReq, ETrnConflct conflict, const char *name, - const char *pMsg) { - STrans *pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, conflict, pReq, name); - if (pTrans == NULL) { +int32_t doCreateTrans(SMnode *pMnode, SStreamObj *pStream, SRpcMsg *pReq, ETrnConflct conflict, const char *name, + const char *pMsg, STrans ** pTrans1) { + *pTrans1 = NULL; + terrno = 0; + + STrans *p = mndTransCreate(pMnode, TRN_POLICY_RETRY, conflict, pReq, name); + if (p == NULL) { mError("failed to build trans:%s, reason: %s", name, tstrerror(TSDB_CODE_OUT_OF_MEMORY)); terrno = TSDB_CODE_OUT_OF_MEMORY; - return NULL; + return terrno; } - mInfo("stream:0x%" PRIx64 " start to build trans %s, transId:%d", pStream->uid, pMsg, pTrans->id); + mInfo("stream:0x%" PRIx64 " start to build trans %s, transId:%d", pStream->uid, pMsg, p->id); - mndTransSetDbName(pTrans, pStream->sourceDb, pStream->targetSTbName); - if (mndTransCheckConflict(pMnode, pTrans) != 0) { + mndTransSetDbName(p, pStream->sourceDb, pStream->targetSTbName); + if (mndTransCheckConflict(pMnode, p) != 0) { terrno = TSDB_CODE_MND_TRANS_CONFLICT; mError("failed to build trans:%s for stream:0x%" PRIx64 " code:%s", name, pStream->uid, tstrerror(terrno)); - mndTransDrop(pTrans); - return NULL; + mndTransDrop(p); + return terrno; } - terrno = 0; - return pTrans; + *pTrans1 = p; + return 0; } SSdbRaw *mndStreamActionEncode(SStreamObj *pStream) { @@ -272,8 +275,9 @@ int32_t doKillCheckpointTrans(SMnode *pMnode, const char *pDBName, size_t len) { continue; } - SStreamObj *pStream = mndGetStreamObj(pMnode, pTransInfo->streamId); - if (pStream != NULL) { + SStreamObj *pStream = NULL; + int32_t code = mndGetStreamObj(pMnode, pTransInfo->streamId, &pStream); + if (pStream != NULL && code == 0) { if (identicalName(pStream->sourceDb, pDBName, len)) { mndKillTransImpl(pMnode, pTransInfo->transId, pStream->sourceDb); } else if (identicalName(pStream->targetDb, pDBName, len)) { diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 843c024286..0b96626536 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -28,20 +28,20 @@ struct SStreamTaskIter { int32_t doRemoveTasks(SStreamExecInfo *pExecNode, STaskId *pRemovedId); -SStreamTaskIter* createStreamTaskIter(SStreamObj* pStream) { - SStreamTaskIter* pIter = taosMemoryCalloc(1, sizeof(SStreamTaskIter)); - if (pIter == NULL) { +int32_t createStreamTaskIter(SStreamObj* pStream, SStreamTaskIter** pIter) { + *pIter = taosMemoryCalloc(1, sizeof(SStreamTaskIter)); + if (*pIter == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return NULL; + return terrno; } - pIter->level = -1; - pIter->ordinalIndex = 0; - pIter->pStream = pStream; - pIter->totalLevel = taosArrayGetSize(pStream->tasks); - pIter->pTask = NULL; + (*pIter)->level = -1; + (*pIter)->ordinalIndex = 0; + (*pIter)->pStream = pStream; + (*pIter)->totalLevel = taosArrayGetSize(pStream->tasks); + (*pIter)->pTask = NULL; - return pIter; + return 0; } bool streamTaskIterNextTask(SStreamTaskIter* pIter) { @@ -72,19 +72,27 @@ bool streamTaskIterNextTask(SStreamTaskIter* pIter) { return false; } -SStreamTask* streamTaskIterGetCurrent(SStreamTaskIter* pIter) { - return pIter->pTask; +int32_t streamTaskIterGetCurrent(SStreamTaskIter* pIter, SStreamTask** pTask) { + if (pTask) { + *pTask = pIter->pTask; + if (*pTask != NULL) { + return TSDB_CODE_SUCCESS; + } + } + + return TSDB_CODE_INVALID_PARA; } void destroyStreamTaskIter(SStreamTaskIter* pIter) { taosMemoryFree(pIter); } -SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { +int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { SSdb *pSdb = pMnode->pSdb; void *pIter = NULL; SVgObj *pVgroup = NULL; int32_t replica = -1; // do the replica check + int32_t code = 0; *allReady = true; SArray *pVgroupList = taosArrayInit(4, sizeof(SNodeEntry)); @@ -131,10 +139,15 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { } char buf[256] = {0}; - epsetToStr(&entry.epset, buf, tListLen(buf)); + (void) epsetToStr(&entry.epset, buf, tListLen(buf)); + + void* p = taosArrayPush(pVgroupList, &entry); + if (p == NULL) { + mError("failed to put entry in vgroup list, nodeId:%d code:out of memory", entry.nodeId); + } else { + mDebug("take node snapshot, nodeId:%d %s", entry.nodeId, buf); + } - mDebug("take node snapshot, nodeId:%d %s", entry.nodeId, buf); - taosArrayPush(pVgroupList, &entry); sdbRelease(pSdb, pVgroup); } @@ -145,43 +158,57 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { break; } - SNodeEntry entry = {0}; - addEpIntoEpSet(&entry.epset, pObj->pDnode->fqdn, pObj->pDnode->port); - entry.nodeId = SNODE_HANDLE; + SNodeEntry entry = {.nodeId = SNODE_HANDLE}; + code = addEpIntoEpSet(&entry.epset, pObj->pDnode->fqdn, pObj->pDnode->port); + if (code) { + sdbRelease(pSdb, pObj); + continue; + } char buf[256] = {0}; - epsetToStr(&entry.epset, buf, tListLen(buf)); - mDebug("take snode snapshot, nodeId:%d %s", entry.nodeId, buf); + (void) epsetToStr(&entry.epset, buf, tListLen(buf)); + + void* p = taosArrayPush(pVgroupList, &entry); + if (p == NULL) { + mError("failed to put entry in vgroup list, nodeId:%d code:out of memory", entry.nodeId); + } else { + mDebug("take snode snapshot, nodeId:%d %s", entry.nodeId, buf); + } - taosArrayPush(pVgroupList, &entry); sdbRelease(pSdb, pObj); } - return pVgroupList; + *pList = pVgroupList; + return code; } -SStreamObj *mndGetStreamObj(SMnode *pMnode, int64_t streamId) { - void *pIter = NULL; - SSdb *pSdb = pMnode->pSdb; - SStreamObj *pStream = NULL; +int32_t mndGetStreamObj(SMnode *pMnode, int64_t streamId, SStreamObj **pStream) { + void *pIter = NULL; + SSdb *pSdb = pMnode->pSdb; + *pStream = NULL; - while ((pIter = sdbFetch(pSdb, SDB_STREAM, pIter, (void **)&pStream)) != NULL) { - if (pStream->uid == streamId) { + SStreamObj *p = NULL; + while ((pIter = sdbFetch(pSdb, SDB_STREAM, pIter, (void **)&p)) != NULL) { + if (p->uid == streamId) { sdbCancelFetch(pSdb, pIter); - return pStream; + *pStream = p; + return TSDB_CODE_SUCCESS; } - sdbRelease(pSdb, pStream); + sdbRelease(pSdb, p); } - return NULL; + return TSDB_CODE_STREAM_TASK_NOT_EXIST; } void mndKillTransImpl(SMnode *pMnode, int32_t transId, const char *pDbName) { STrans *pTrans = mndAcquireTrans(pMnode, transId); if (pTrans != NULL) { mInfo("kill active transId:%d in Db:%s", transId, pDbName); - mndKillTrans(pMnode, pTrans); + int32_t code = mndKillTrans(pMnode, pTrans); mndReleaseTrans(pMnode, pTrans); + if (code) { + mError("failed to kill trans:%d", pTrans->id); + } } else { mError("failed to acquire trans in Db:%s, transId:%d", pDbName, transId); } @@ -197,11 +224,16 @@ int32_t extractNodeEpset(SMnode *pMnode, SEpSet *pEpSet, bool *hasEpset, int32_t pIter = sdbFetch(pMnode->pSdb, SDB_SNODE, pIter, (void **)&pObj); if (pIter != NULL) { - addEpIntoEpSet(pEpSet, pObj->pDnode->fqdn, pObj->pDnode->port); + int32_t code = addEpIntoEpSet(pEpSet, pObj->pDnode->fqdn, pObj->pDnode->port); sdbRelease(pMnode->pSdb, pObj); sdbCancelFetch(pMnode->pSdb, pIter); - *hasEpset = true; - return TSDB_CODE_SUCCESS; + if (code) { + *hasEpset = false; + mError("failed to set epset"); + } else { + *hasEpset = true; + } + return code; } else { mError("failed to acquire snode epset"); return TSDB_CODE_INVALID_PARA; @@ -223,12 +255,14 @@ int32_t extractNodeEpset(SMnode *pMnode, SEpSet *pEpSet, bool *hasEpset, int32_t } static int32_t doSetResumeAction(STrans *pTrans, SMnode *pMnode, SStreamTask *pTask, int8_t igUntreated) { + terrno = 0; + SVResumeStreamTaskReq *pReq = taosMemoryCalloc(1, sizeof(SVResumeStreamTaskReq)); if (pReq == NULL) { mError("failed to malloc in resume stream, size:%" PRIzu ", code:%s", sizeof(SVResumeStreamTaskReq), tstrerror(TSDB_CODE_OUT_OF_MEMORY)); terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno; } pReq->head.vgId = htonl(pTask->info.nodeId); @@ -242,31 +276,45 @@ static int32_t doSetResumeAction(STrans *pTrans, SMnode *pMnode, SStreamTask *pT if (code != TSDB_CODE_SUCCESS || (!hasEpset)) { terrno = code; taosMemoryFree(pReq); - return -1; + return terrno; } code = setTransAction(pTrans, pReq, sizeof(SVResumeStreamTaskReq), TDMT_STREAM_TASK_RESUME, &epset, 0, 0); if (code != 0) { taosMemoryFree(pReq); - return -1; + return terrno; } mDebug("set the resume action for trans:%d", pTrans->id); return 0; } -SStreamTask *mndGetStreamTask(STaskId *pId, SStreamObj *pStream) { - SStreamTaskIter *pIter = createStreamTaskIter(pStream); +int32_t mndGetStreamTask(STaskId *pId, SStreamObj *pStream, SStreamTask **pTask) { + *pTask = NULL; + + SStreamTask *p = NULL; + SStreamTaskIter *pIter = NULL; + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + mError("failed to create stream task iter:%s", pStream->name); + return code; + } + while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); - if (pTask->id.taskId == pId->taskId) { + code = streamTaskIterGetCurrent(pIter, &p); + if (code) { + continue; + } + + if (p->id.taskId == pId->taskId) { destroyStreamTaskIter(pIter); - return pTask; + *pTask = p; + return 0; } } destroyStreamTaskIter(pIter); - return NULL; + return TSDB_CODE_FAILED; } int32_t mndGetNumOfStreamTasks(const SStreamObj *pStream) { @@ -280,13 +328,25 @@ int32_t mndGetNumOfStreamTasks(const SStreamObj *pStream) { } int32_t mndStreamSetResumeAction(STrans *pTrans, SMnode *pMnode, SStreamObj *pStream, int8_t igUntreated) { - SStreamTaskIter *pIter = createStreamTaskIter(pStream); + SStreamTaskIter *pIter = NULL; + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + mError("failed to create stream task iter:%s", pStream->name); + return code; + } while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); - if (doSetResumeAction(pTrans, pMnode, pTask, igUntreated) < 0) { + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code || pTask == NULL) { destroyStreamTaskIter(pIter); - return -1; + return code; + } + + code = doSetResumeAction(pTrans, pMnode, pTask, igUntreated); + if (code) { + destroyStreamTaskIter(pIter); + return code; } if (atomic_load_8(&pTask->status.taskStatus) == TASK_STATUS__PAUSE) { @@ -303,7 +363,7 @@ static int32_t doSetPauseAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTa mError("failed to malloc in pause stream, size:%" PRIzu ", code:%s", sizeof(SVPauseStreamTaskReq), tstrerror(TSDB_CODE_OUT_OF_MEMORY)); terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno; } pReq->head.vgId = htonl(pTask->info.nodeId); @@ -320,25 +380,38 @@ static int32_t doSetPauseAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTa } char buf[256] = {0}; - epsetToStr(&epset, buf, tListLen(buf)); + (void) epsetToStr(&epset, buf, tListLen(buf)); mDebug("pause stream task in node:%d, epset:%s", pTask->info.nodeId, buf); code = setTransAction(pTrans, pReq, sizeof(SVPauseStreamTaskReq), TDMT_STREAM_TASK_PAUSE, &epset, 0, 0); if (code != 0) { taosMemoryFree(pReq); - return -1; + return code; } return 0; } int32_t mndStreamSetPauseAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream) { - SStreamTaskIter *pIter = createStreamTaskIter(pStream); + SStreamTaskIter *pIter = NULL; + + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + mError("failed to create stream task iter:%s", pStream->name); + return code; + } while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); - if (doSetPauseAction(pMnode, pTrans, pTask) < 0) { + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { destroyStreamTaskIter(pIter); - return -1; + return code; + } + + code = doSetPauseAction(pMnode, pTrans, pTask); + if (code) { + destroyStreamTaskIter(pIter); + return code; } if (atomic_load_8(&pTask->status.taskStatus) != TASK_STATUS__PAUSE) { @@ -348,14 +421,14 @@ int32_t mndStreamSetPauseAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStr } destroyStreamTaskIter(pIter); - return 0; + return code; } static int32_t doSetDropAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTask) { SVDropStreamTaskReq *pReq = taosMemoryCalloc(1, sizeof(SVDropStreamTaskReq)); if (pReq == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno; } pReq->head.vgId = htonl(pTask->info.nodeId); @@ -366,28 +439,40 @@ static int32_t doSetDropAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTas bool hasEpset = false; int32_t code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); if (code != TSDB_CODE_SUCCESS || !hasEpset) { // no valid epset, return directly without redoAction - terrno = code; - return -1; + return code; } // The epset of nodeId of this task may have been expired now, let's use the newest epset from mnode. code = setTransAction(pTrans, pReq, sizeof(SVDropStreamTaskReq), TDMT_STREAM_TASK_DROP, &epset, 0, 0); if (code != 0) { taosMemoryFree(pReq); - return -1; + return code; } return 0; } int32_t mndStreamSetDropAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream) { - SStreamTaskIter *pIter = createStreamTaskIter(pStream); + SStreamTaskIter *pIter = NULL; + + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + mError("failed to create stream task iter:%s", pStream->name); + return code; + } while(streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); - if (doSetDropAction(pMnode, pTrans, pTask) < 0) { + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { destroyStreamTaskIter(pIter); - return -1; + return code; + } + + code = doSetDropAction(pMnode, pTrans, pTask); + if (code) { + destroyStreamTaskIter(pIter); + return code; } } destroyStreamTaskIter(pIter); @@ -398,7 +483,7 @@ static int32_t doSetDropActionFromId(SMnode *pMnode, STrans *pTrans, SOrphanTask SVDropStreamTaskReq *pReq = taosMemoryCalloc(1, sizeof(SVDropStreamTaskReq)); if (pReq == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno; } pReq->head.vgId = htonl(pTask->nodeId); @@ -409,16 +494,15 @@ static int32_t doSetDropActionFromId(SMnode *pMnode, STrans *pTrans, SOrphanTask bool hasEpset = false; int32_t code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->taskId, pTask->nodeId); if (code != TSDB_CODE_SUCCESS || (!hasEpset)) { // no valid epset, return directly without redoAction - terrno = code; taosMemoryFree(pReq); - return -1; + return code; } // The epset of nodeId of this task may have been expired now, let's use the newest epset from mnode. code = setTransAction(pTrans, pReq, sizeof(SVDropStreamTaskReq), TDMT_STREAM_TASK_DROP, &epset, 0, 0); if (code != 0) { taosMemoryFree(pReq); - return -1; + return code; } return 0; @@ -427,19 +511,35 @@ static int32_t doSetDropActionFromId(SMnode *pMnode, STrans *pTrans, SOrphanTask int32_t mndStreamSetDropActionFromList(SMnode *pMnode, STrans *pTrans, SArray* pList) { for(int32_t i = 0; i < taosArrayGetSize(pList); ++i) { SOrphanTask* pTask = taosArrayGet(pList, i); - mDebug("add drop task:0x%x action to drop orphan task", pTask->taskId); - doSetDropActionFromId(pMnode, pTrans, pTask); + int32_t code = doSetDropActionFromId(pMnode, pTrans, pTask); + if (code != 0) { + return code; + } else { + mDebug("add drop task:0x%x action to drop orphan task", pTask->taskId); + } } return 0; } static void initNodeUpdateMsg(SStreamTaskNodeUpdateMsg *pMsg, const SVgroupChangeInfo *pInfo, SStreamTaskId *pId, int32_t transId) { + int32_t code = 0; + pMsg->streamId = pId->streamId; pMsg->taskId = pId->taskId; pMsg->transId = transId; pMsg->pNodeList = taosArrayInit(taosArrayGetSize(pInfo->pUpdateNodeList), sizeof(SNodeUpdateInfo)); - taosArrayAddAll(pMsg->pNodeList, pInfo->pUpdateNodeList); + if (pMsg->pNodeList == NULL) { + mError("failed to prepare node list, code:out of memory"); + code = TSDB_CODE_OUT_OF_MEMORY; + } + + if (code == 0) { + void *p = taosArrayAddAll(pMsg->pNodeList, pInfo->pUpdateNodeList); + if (p == NULL) { + mError("failed to add update node list into nodeList"); + } + } } static int32_t doBuildStreamTaskUpdateMsg(void **pBuf, int32_t *pLen, SVgroupChangeInfo *pInfo, int32_t nodeId, @@ -454,7 +554,7 @@ static int32_t doBuildStreamTaskUpdateMsg(void **pBuf, int32_t *pLen, SVgroupCha if (code < 0) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosArrayDestroy(req.pNodeList); - return -1; + return terrno; } int32_t tlen = sizeof(SMsgHead) + blen; @@ -463,13 +563,18 @@ static int32_t doBuildStreamTaskUpdateMsg(void **pBuf, int32_t *pLen, SVgroupCha if (buf == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosArrayDestroy(req.pNodeList); - return -1; + return terrno; } void *abuf = POINTER_SHIFT(buf, sizeof(SMsgHead)); SEncoder encoder; tEncoderInit(&encoder, abuf, tlen); - tEncodeStreamTaskUpdateMsg(&encoder, &req); + code = tEncodeStreamTaskUpdateMsg(&encoder, &req); + if (code == -1) { + tEncoderClear(&encoder); + taosArrayDestroy(req.pNodeList); + return code; + } SMsgHead *pMsgHead = (SMsgHead *)buf; pMsgHead->contLen = htonl(tlen); @@ -487,15 +592,20 @@ static int32_t doBuildStreamTaskUpdateMsg(void **pBuf, int32_t *pLen, SVgroupCha static int32_t doSetUpdateTaskAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTask, SVgroupChangeInfo *pInfo) { void *pBuf = NULL; int32_t len = 0; - streamTaskUpdateEpsetInfo(pTask, pInfo->pUpdateNodeList); + int32_t code = streamTaskUpdateEpsetInfo(pTask, pInfo->pUpdateNodeList); + if (code) { + return code; + } - doBuildStreamTaskUpdateMsg(&pBuf, &len, pInfo, pTask->info.nodeId, &pTask->id, pTrans->id); + code = doBuildStreamTaskUpdateMsg(&pBuf, &len, pInfo, pTask->info.nodeId, &pTask->id, pTrans->id); + if (code) { + return code; + } SEpSet epset = {0}; bool hasEpset = false; - int32_t code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); + code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); if (code != TSDB_CODE_SUCCESS || !hasEpset) { - terrno = code; return code; } @@ -510,16 +620,30 @@ static int32_t doSetUpdateTaskAction(SMnode *pMnode, STrans *pTrans, SStreamTask // build trans to update the epset int32_t mndStreamSetUpdateEpsetAction(SMnode *pMnode, SStreamObj *pStream, SVgroupChangeInfo *pInfo, STrans *pTrans) { mDebug("stream:0x%" PRIx64 " set tasks epset update action", pStream->uid); - taosWLockLatch(&pStream->lock); + SStreamTaskIter *pIter = NULL; + + taosWLockLatch(&pStream->lock); + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + taosWUnLockLatch(&pStream->lock); + mError("failed to create stream task iter:%s", pStream->name); + return code; + } - SStreamTaskIter *pIter = createStreamTaskIter(pStream); while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); - int32_t code = doSetUpdateTaskAction(pMnode, pTrans, pTask, pInfo); + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { + destroyStreamTaskIter(pIter); + taosWUnLockLatch(&pStream->lock); + return code; + } + + code = doSetUpdateTaskAction(pMnode, pTrans, pTask, pInfo); if (code != TSDB_CODE_SUCCESS) { destroyStreamTaskIter(pIter); taosWUnLockLatch(&pStream->lock); - return -1; + return code; } } @@ -558,16 +682,30 @@ static int32_t doSetResetAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTa } int32_t mndStreamSetResetTaskAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream) { - taosWLockLatch(&pStream->lock); + SStreamTaskIter *pIter = NULL; + + taosWLockLatch(&pStream->lock); + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + taosWUnLockLatch(&pStream->lock); + mError("failed to create stream task iter:%s", pStream->name); + return code; + } - SStreamTaskIter *pIter = createStreamTaskIter(pStream); while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); - int32_t code = doSetResetAction(pMnode, pTrans, pTask); + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { + destroyStreamTaskIter(pIter); + taosWUnLockLatch(&pStream->lock); + return code; + } + + code = doSetResetAction(pMnode, pTrans, pTask); if (code != TSDB_CODE_SUCCESS) { destroyStreamTaskIter(pIter); taosWUnLockLatch(&pStream->lock); - return -1; + return code; } } @@ -581,8 +719,12 @@ static void freeTaskList(void* param) { taosArrayDestroy(*pList); } -void mndInitExecInfo() { - taosThreadMutexInit(&execInfo.lock, NULL); +int32_t mndInitExecInfo() { + int32_t code = taosThreadMutexInit(&execInfo.lock, NULL); + if (code) { + return code; + } + _hash_fn_t fn = taosGetDefaultHashFunction(TSDB_DATA_TYPE_VARCHAR); execInfo.pTaskList = taosArrayInit(4, sizeof(STaskId)); @@ -596,6 +738,7 @@ void mndInitExecInfo() { taosHashSetFreeFp(execInfo.pTransferStateStreams, freeTaskList); taosHashSetFreeFp(execInfo.pChkptStreams, freeTaskList); taosHashSetFreeFp(execInfo.pStreamConsensus, freeTaskList); + return 0; } void removeExpiredNodeInfo(const SArray *pNodeSnapshot) { @@ -608,7 +751,10 @@ void removeExpiredNodeInfo(const SArray *pNodeSnapshot) { for (int32_t j = 0; j < size; ++j) { SNodeEntry *pEntry = taosArrayGet(pNodeSnapshot, j); if (pEntry->nodeId == p->nodeId) { - taosArrayPush(pValidList, p); + void* px = taosArrayPush(pValidList, p); + if (px == NULL) { + mError("failed to put node into list, nodeId:%d", p->nodeId); + } break; } } @@ -626,7 +772,10 @@ int32_t doRemoveTasks(SStreamExecInfo *pExecNode, STaskId *pRemovedId) { return TSDB_CODE_SUCCESS; } - taosHashRemove(pExecNode->pTaskMap, pRemovedId, sizeof(*pRemovedId)); + int32_t code = taosHashRemove(pExecNode->pTaskMap, pRemovedId, sizeof(*pRemovedId)); + if (code) { + return code; + } for (int32_t k = 0; k < taosArrayGetSize(pExecNode->pTaskList); ++k) { STaskId *pId = taosArrayGet(pExecNode->pTaskList, k); @@ -645,28 +794,45 @@ int32_t doRemoveTasks(SStreamExecInfo *pExecNode, STaskId *pRemovedId) { void removeTasksInBuf(SArray *pTaskIds, SStreamExecInfo* pExecInfo) { for (int32_t i = 0; i < taosArrayGetSize(pTaskIds); ++i) { STaskId *pId = taosArrayGet(pTaskIds, i); - doRemoveTasks(pExecInfo, pId); + int32_t code = doRemoveTasks(pExecInfo, pId); + if (code) { + mError("failed to remove task in buffer list, 0x%"PRIx64, pId->taskId); + } } } void removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode) { - taosThreadMutexLock(&pExecNode->lock); + SStreamTaskIter *pIter = NULL; + streamMutexLock(&pExecNode->lock); // 1. remove task entries - SStreamTaskIter *pIter = createStreamTaskIter(pStream); + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + streamMutexUnlock(&pExecNode->lock); + mError("failed to create stream task iter:%s", pStream->name); + return; + } + while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { + continue; + } STaskId id = {.streamId = pTask->id.streamId, .taskId = pTask->id.taskId}; - doRemoveTasks(pExecNode, &id); + code = doRemoveTasks(pExecNode, &id); + if (code) { + mError("failed to remove task in buffer list, 0x%"PRIx64, id.taskId); + } } ASSERT(taosHashGetSize(pExecNode->pTaskMap) == taosArrayGetSize(pExecNode->pTaskList)); // 2. remove stream entry in consensus hash table - mndClearConsensusCheckpointId(execInfo.pStreamConsensus, pStream->uid); + (void) mndClearConsensusCheckpointId(execInfo.pStreamConsensus, pStream->uid); - taosThreadMutexUnlock(&pExecNode->lock); + streamMutexUnlock(&pExecNode->lock); destroyStreamTaskIter(pIter); } @@ -697,7 +863,10 @@ int32_t removeExpiredNodeEntryAndTaskInBuf(SArray *pNodeSnapshot) { bool existed = taskNodeExists(pNodeSnapshot, pEntry->nodeId); if (!existed) { - taosArrayPush(pRemovedTasks, pId); + void* p = taosArrayPush(pRemovedTasks, pId); + if (p == NULL) { + mError("failed to put task entry into remove list, taskId:0x%" PRIx64, pId->taskId); + } } } @@ -759,45 +928,64 @@ static int32_t doSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamTas } int32_t mndStreamSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream) { + SStreamTaskIter *pIter = NULL; + taosWLockLatch(&pStream->lock); + int32_t code = createStreamTaskIter(pStream, &pIter); + if (code) { + taosWUnLockLatch(&pStream->lock); + mError("failed to create stream task iter:%s", pStream->name); + return code; + } - SStreamTaskIter *pIter = createStreamTaskIter(pStream); while (streamTaskIterNextTask(pIter)) { - SStreamTask *pTask = streamTaskIterGetCurrent(pIter); + SStreamTask *pTask = NULL; + code = streamTaskIterGetCurrent(pIter, &pTask); + if (code) { + destroyStreamTaskIter(pIter); + taosWUnLockLatch(&pStream->lock); + return code; + } - int32_t code = doSetUpdateChkptAction(pMnode, pTrans, pTask); + code = doSetUpdateChkptAction(pMnode, pTrans, pTask); if (code != TSDB_CODE_SUCCESS) { destroyStreamTaskIter(pIter); taosWUnLockLatch(&pStream->lock); - return -1; + return code; } } destroyStreamTaskIter(pIter); taosWUnLockLatch(&pStream->lock); - return 0; + return code; } int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { SMnode *pMnode = pReq->info.node; void *pIter = NULL; SArray *pDropped = taosArrayInit(4, sizeof(int64_t)); + int32_t code = 0; mDebug("start to scan checkpoint report info"); while ((pIter = taosHashIterate(execInfo.pChkptStreams, pIter)) != NULL) { SArray *pList = *(SArray **)pIter; - STaskChkptInfo* pInfo = taosArrayGet(pList, 0); - SStreamObj* pStream = mndGetStreamObj(pMnode, pInfo->streamId); - if (pStream == NULL) { + STaskChkptInfo *pInfo = taosArrayGet(pList, 0); + SStreamObj *pStream = NULL; + code = mndGetStreamObj(pMnode, pInfo->streamId, &pStream); + if (pStream == NULL || code != 0) { mDebug("failed to acquire stream:0x%" PRIx64 " remove it from checkpoint-report list", pInfo->streamId); - taosArrayPush(pDropped, &pInfo->streamId); + void* p = taosArrayPush(pDropped, &pInfo->streamId); + if (p == NULL) { + mError("failed to put stream into drop list:0x%" PRIx64, pInfo->streamId); + } + continue; } int32_t total = mndGetNumOfStreamTasks(pStream); - int32_t existed = (int32_t) taosArrayGetSize(pList); + int32_t existed = (int32_t)taosArrayGetSize(pList); if (total == existed) { mDebug("stream:0x%" PRIx64 " %s all %d tasks send checkpoint-report, start to update checkpoint-info", @@ -805,17 +993,21 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { bool conflict = mndStreamTransConflictCheck(pMnode, pStream->uid, MND_STREAM_CHKPT_UPDATE_NAME, false); if (!conflict) { - int32_t code = mndCreateStreamChkptInfoUpdateTrans(pMnode, pStream, pList); - if (code == TSDB_CODE_SUCCESS || code == TSDB_CODE_ACTION_IN_PROGRESS) { // remove this entry - taosArrayPush(pDropped, &pInfo->streamId); - mDebug("stream:0x%" PRIx64 " removed", pInfo->streamId); + code = mndCreateStreamChkptInfoUpdateTrans(pMnode, pStream, pList); + if (code == TSDB_CODE_SUCCESS || code == TSDB_CODE_ACTION_IN_PROGRESS) { // remove this entry + void* p = taosArrayPush(pDropped, &pInfo->streamId); + if (p == NULL) { + mError("failed to remove stream:0x%" PRIx64, pInfo->streamId); + } else { + mDebug("stream:0x%" PRIx64 " removed", pInfo->streamId); + } } else { mDebug("stream:0x%" PRIx64 " not launch chkpt-meta update trans, due to checkpoint not finished yet", pInfo->streamId); } break; } else { - mDebug("stream:0x%"PRIx64" active checkpoint trans not finished yet, wait", pInfo->streamId); + mDebug("stream:0x%" PRIx64 " active checkpoint trans not finished yet, wait", pInfo->streamId); } } else { mDebug("stream:0x%" PRIx64 " %s %d/%d tasks send checkpoint-report, %d not send", pInfo->streamId, pStream->name, @@ -829,7 +1021,10 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { if (size > 0) { for (int32_t i = 0; i < size; ++i) { int64_t streamId = *(int64_t *)taosArrayGet(pDropped, i); - taosHashRemove(execInfo.pChkptStreams, &streamId, sizeof(streamId)); + code = taosHashRemove(execInfo.pChkptStreams, &streamId, sizeof(streamId)); + if (code) { + mError("failed to remove stream in buf:0x%"PRIx64, streamId); + } } int32_t numOfStreams = taosHashGetSize(execInfo.pChkptStreams); @@ -854,29 +1049,30 @@ static int32_t mndStreamSetChkptIdAction(SMnode *pMnode, STrans *pTrans, SStream int32_t blen; tEncodeSize(tEncodeRestoreCheckpointInfo, &req, blen, code); if (code < 0) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno = TSDB_CODE_OUT_OF_MEMORY; } int32_t tlen = sizeof(SMsgHead) + blen; void *pBuf = taosMemoryMalloc(tlen); if (pBuf == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno = TSDB_CODE_OUT_OF_MEMORY; } void *abuf = POINTER_SHIFT(pBuf, sizeof(SMsgHead)); SEncoder encoder; tEncoderInit(&encoder, abuf, tlen); - tEncodeRestoreCheckpointInfo(&encoder, &req); + code = tEncodeRestoreCheckpointInfo(&encoder, &req); + tEncoderClear(&encoder); + if (code == -1) { + taosMemoryFree(pBuf); + return code; + } SMsgHead *pMsgHead = (SMsgHead *)pBuf; pMsgHead->contLen = htonl(tlen); pMsgHead->vgId = htonl(pTask->info.nodeId); - tEncoderClear(&encoder); - SEpSet epset = {0}; bool hasEpset = false; code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); @@ -898,17 +1094,28 @@ int32_t mndCreateSetConsensusChkptIdTrans(SMnode *pMnode, SStreamObj *pStream, i char msg[128] = {0}; snprintf(msg, tListLen(msg), "set consen-chkpt-id for task:0x%x", taskId); - STrans *pTrans = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_CHKPT_CONSEN_NAME, msg); - if (pTrans == NULL) { + STrans *pTrans = NULL; + int32_t code = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_CHKPT_CONSEN_NAME, msg, &pTrans); + if (pTrans == NULL || code != 0) { return terrno; } STaskId id = {.streamId = pStream->uid, .taskId = taskId}; - SStreamTask *pTask = mndGetStreamTask(&id, pStream); - ASSERT(pTask); + SStreamTask *pTask = NULL; + code = mndGetStreamTask(&id, pStream, &pTask); + if (code) { + mError("failed to get task:0x%x in stream:%s, failed to create consensus-checkpointId", taskId, pStream->name); + sdbRelease(pMnode->pSdb, pStream); + return code; + } - /*int32_t code = */ mndStreamRegisterTrans(pTrans, MND_STREAM_CHKPT_CONSEN_NAME, pStream->uid); - int32_t code = mndStreamSetChkptIdAction(pMnode, pTrans, pTask, checkpointId, ts); + code = mndStreamRegisterTrans(pTrans, MND_STREAM_CHKPT_CONSEN_NAME, pStream->uid); + if (code) { + sdbRelease(pMnode->pSdb, pStream); + return code; + } + + code = mndStreamSetChkptIdAction(pMnode, pTrans, pTask, checkpointId, ts); if (code != 0) { sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); @@ -916,17 +1123,18 @@ int32_t mndCreateSetConsensusChkptIdTrans(SMnode *pMnode, SStreamObj *pStream, i } code = mndPersistTransLog(pStream, pTrans, SDB_STATUS_READY); - if (code != TSDB_CODE_SUCCESS) { + if (code) { sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } - if (mndTransPrepare(pMnode, pTrans) != 0) { + code = mndTransPrepare(pMnode, pTrans); + if (code) { mError("trans:%d, failed to prepare set consensus-chkptId trans since %s", pTrans->id, terrstr()); sdbRelease(pMnode->pSdb, pStream); mndTransDrop(pTrans); - return -1; + return code; } sdbRelease(pMnode->pSdb, pStream); @@ -935,10 +1143,13 @@ int32_t mndCreateSetConsensusChkptIdTrans(SMnode *pMnode, SStreamObj *pStream, i return TSDB_CODE_ACTION_IN_PROGRESS; } -SCheckpointConsensusInfo* mndGetConsensusInfo(SHashObj* pHash, int64_t streamId, int32_t numOfTasks) { - void* pInfo = taosHashGet(pHash, &streamId, sizeof(streamId)); - if (pInfo != NULL) { - return (SCheckpointConsensusInfo*)pInfo; +int32_t mndGetConsensusInfo(SHashObj* pHash, int64_t streamId, int32_t numOfTasks, SCheckpointConsensusInfo **pInfo) { + *pInfo = NULL; + + void* px = taosHashGet(pHash, &streamId, sizeof(streamId)); + if (px != NULL) { + *pInfo = px; + return 0; } SCheckpointConsensusInfo p = { @@ -947,10 +1158,14 @@ SCheckpointConsensusInfo* mndGetConsensusInfo(SHashObj* pHash, int64_t streamId, .streamId = streamId, }; - taosHashPut(pHash, &streamId, sizeof(streamId), &p, sizeof(p)); - - void* pChkptInfo = (SCheckpointConsensusInfo*)taosHashGet(pHash, &streamId, sizeof(streamId)); - return pChkptInfo; + int32_t code = taosHashPut(pHash, &streamId, sizeof(streamId), &p, sizeof(p)); + if (code == 0) { + void *pChkptInfo = (SCheckpointConsensusInfo *)taosHashGet(pHash, &streamId, sizeof(streamId)); + *pInfo = pChkptInfo; + } else { + *pInfo = NULL; + } + return code; } // no matter existed or not, add the request into info list anyway, since we need to send rsp mannually @@ -971,11 +1186,15 @@ void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpo } } - taosArrayPush(pInfo->pTaskList, &info); - int32_t num = taosArrayGetSize(pInfo->pTaskList); - mDebug("s-task:0x%x checkpointId:%" PRId64 " added into consensus-checkpointId list, stream:0x%" PRIx64 - " waiting tasks:%d", - pRestoreInfo->taskId, pRestoreInfo->checkpointId, pRestoreInfo->streamId, num); + void *p = taosArrayPush(pInfo->pTaskList, &info); + if (p == NULL) { + mError("s-task:0x%x failed to put task into consensus-checkpointId list, code: out of memory", info.req.taskId); + } else { + int32_t num = taosArrayGetSize(pInfo->pTaskList); + mDebug("s-task:0x%x checkpointId:%" PRId64 " added into consensus-checkpointId list, stream:0x%" PRIx64 + " waiting tasks:%d", + pRestoreInfo->taskId, pRestoreInfo->checkpointId, pRestoreInfo->streamId, num); + } } void mndClearConsensusRspEntry(SCheckpointConsensusInfo* pInfo) { @@ -984,22 +1203,14 @@ void mndClearConsensusRspEntry(SCheckpointConsensusInfo* pInfo) { } int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId) { - taosHashRemove(pHash, &streamId, sizeof(streamId)); - int32_t numOfStreams = taosHashGetSize(pHash); - mDebug("drop stream:0x%" PRIx64 " in consensus-checkpointId list after new checkpoint generated, remain:%d", streamId, - numOfStreams); - return TSDB_CODE_SUCCESS; -} + int32_t code = taosHashRemove(pHash, &streamId, sizeof(streamId)); + if (code == 0) { + int32_t numOfStreams = taosHashGetSize(pHash); + mDebug("drop stream:0x%" PRIx64 " in consensus-checkpointId list after new checkpoint generated, remain:%d", + streamId, numOfStreams); + } else { + mError("failed to remove stream:0x%"PRIx64" in consensus-checkpointId list", streamId); + } -//int32_t mndRegisterConsensusChkptId(SHashObj* pHash, int64_t streamId) { -// void* pInfo = taosHashGet(pHash, &streamId, sizeof(streamId)); -// ASSERT(pInfo == NULL); -// -// SCheckpointConsensusInfo p = {.genTs = taosGetTimestampMs(), .checkpointId = 0, .pTaskList = NULL}; -// taosHashPut(pHash, &streamId, sizeof(streamId), &p, sizeof(p)); -// -// SCheckpointConsensusInfo* pChkptInfo = (SCheckpointConsensusInfo*)taosHashGet(pHash, &streamId, sizeof(streamId)); -// ASSERT(pChkptInfo->genTs > 0 && pChkptInfo->checkpointId == 0); -// mDebug("s-task:0x%" PRIx64 " set the initial consensus-checkpointId:0", streamId); -// return TSDB_CODE_SUCCESS; -//} \ No newline at end of file + return code; +} \ No newline at end of file diff --git a/source/dnode/vnode/src/inc/tq.h b/source/dnode/vnode/src/inc/tq.h index d3582ab8f3..56a3c00fee 100644 --- a/source/dnode/vnode/src/inc/tq.h +++ b/source/dnode/vnode/src/inc/tq.h @@ -155,8 +155,8 @@ int32_t tqSetDstTableDataPayload(uint64_t suid, const STSchema* pTSchema, int32_ SSubmitTbData* pTableData, int64_t earlyTs, const char* id); int32_t doMergeExistedRows(SSubmitTbData* pExisted, const SSubmitTbData* pNew, const char* id); -SVCreateTbReq* buildAutoCreateTableReq(const char* stbFullName, int64_t suid, int32_t numOfCols, - SSDataBlock* pDataBlock, SArray* pTagArray, bool newSubTableRule); +int32_t buildAutoCreateTableReq(const char* stbFullName, int64_t suid, int32_t numOfCols, SSDataBlock* pDataBlock, + SArray* pTagArray, bool newSubTableRule, SVCreateTbReq** pReq); #define TQ_ERR_GO_TO_END(c) \ do { \ diff --git a/source/dnode/vnode/src/sma/smaTimeRange.c b/source/dnode/vnode/src/sma/smaTimeRange.c index fe3117de49..201e496140 100644 --- a/source/dnode/vnode/src/sma/smaTimeRange.c +++ b/source/dnode/vnode/src/sma/smaTimeRange.c @@ -201,7 +201,12 @@ int32_t smaBlockToSubmit(SVnode *pVnode, const SArray *pBlocks, const STSchema * SSubmitTbData tbData = {.suid = suid, .uid = 0, .sver = pTSchema->version, .flags = SUBMIT_REQ_AUTO_CREATE_TABLE}; int32_t cid = taosArrayGetSize(pDataBlock->pDataBlock) + 1; - tbData.pCreateTbReq = buildAutoCreateTableReq(stbFullName, suid, cid, pDataBlock, tagArray, true); + + code = buildAutoCreateTableReq(stbFullName, suid, cid, pDataBlock, tagArray, true, &tbData.pCreateTbReq); + if (code) { + smaError("failed to build create-table req, code:%d", code); + continue; + } { uint64_t groupId = pDataBlock->info.id.groupId; diff --git a/source/dnode/vnode/src/tq/tqSink.c b/source/dnode/vnode/src/tq/tqSink.c index 3efc653f64..45212df1dd 100644 --- a/source/dnode/vnode/src/tq/tqSink.c +++ b/source/dnode/vnode/src/tq/tqSink.c @@ -18,8 +18,6 @@ #include "tmsg.h" #include "tq.h" -#define MAX_CACHE_TABLE_INFO_NUM 10240 - typedef struct STableSinkInfo { uint64_t uid; tstr name; @@ -43,7 +41,7 @@ static int32_t doRemoveFromCache(SSHashObj* pSinkTableMap, uint64_t groupId, con static bool isValidDstChildTable(SMetaReader* pReader, int32_t vgId, const char* ctbName, int64_t suid); static int32_t initCreateTableMsg(SVCreateTbReq* pCreateTableReq, uint64_t suid, const char* stbFullName, int32_t numOfTags); -static SArray* createDefaultTagColName(); +static int32_t createDefaultTagColName(SArray** pList); static void setCreateTableMsgTableName(SVCreateTbReq* pCreateTableReq, SSDataBlock* pDataBlock, const char* stbFullName, int64_t gid, bool newSubTableRule); static int32_t doCreateSinkInfo(const char* pDstTableName, STableSinkInfo** pInfo); @@ -95,7 +93,10 @@ int32_t tqBuildDeleteReq(STQ* pTq, const char* stbFullName, const SSDataBlock* p SSingleDeleteReq req = {.startTs = skey, .endTs = ekey}; strncpy(req.tbname, name, TSDB_TABLE_NAME_LEN - 1); - taosArrayPush(deleteReq->deleteReqs, &req); + void* p = taosArrayPush(deleteReq->deleteReqs, &req); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } if (originName) name = originName; taosMemoryFreeClear(name); @@ -149,14 +150,20 @@ static bool tqGetTableInfo(SSHashObj* pTableInfoMap, uint64_t groupId, STableSin static int32_t tqPutReqToQueue(SVnode* pVnode, SVCreateTbBatchReq* pReqs) { void* buf = NULL; int32_t tlen = 0; - encodeCreateChildTableForRPC(pReqs, TD_VID(pVnode), &buf, &tlen); + + int32_t code = encodeCreateChildTableForRPC(pReqs, TD_VID(pVnode), &buf, &tlen); + if (code) { + tqError("vgId:%d failed to encode create table msg, create table failed, code:%s", TD_VID(pVnode), tstrerror(code)); + return code; + } SRpcMsg msg = {.msgType = TDMT_VND_CREATE_TABLE, .pCont = buf, .contLen = tlen}; - if (tmsgPutToQueue(&pVnode->msgCb, WRITE_QUEUE, &msg) != 0) { + code = tmsgPutToQueue(&pVnode->msgCb, WRITE_QUEUE, &msg); + if (code) { tqError("failed to put into write-queue since %s", terrstr()); } - return TSDB_CODE_SUCCESS; + return code; } int32_t initCreateTableMsg(SVCreateTbReq* pCreateTableReq, uint64_t suid, const char* stbFullName, int32_t numOfTags) { @@ -166,18 +173,36 @@ int32_t initCreateTableMsg(SVCreateTbReq* pCreateTableReq, uint64_t suid, const // set super table name SName name = {0}; - tNameFromString(&name, stbFullName, T_NAME_ACCT | T_NAME_DB | T_NAME_TABLE); - pCreateTableReq->ctb.stbName = taosStrdup((char*)tNameGetTableName(&name)); + + int32_t code = tNameFromString(&name, stbFullName, T_NAME_ACCT | T_NAME_DB | T_NAME_TABLE); + if (code == 0) { + pCreateTableReq->ctb.stbName = taosStrdup((char*)tNameGetTableName(&name)); + if (pCreateTableReq->ctb.stbName == NULL) { // ignore this error code + tqError("failed to duplicate the stb name:%s, failed to init create-table msg and create req table", stbFullName); + } + } pCreateTableReq->ctb.tagNum = numOfTags; - return TSDB_CODE_SUCCESS; + return code; } -SArray* createDefaultTagColName() { +int32_t createDefaultTagColName(SArray** pColNameList) { + *pColNameList = NULL; + SArray* pTagColNameList = taosArrayInit(1, TSDB_COL_NAME_LEN); - char tagNameStr[TSDB_COL_NAME_LEN] = "group_id"; - taosArrayPush(pTagColNameList, tagNameStr); - return pTagColNameList; + if (pTagColNameList == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + char tagNameStr[TSDB_COL_NAME_LEN] = "group_id"; + void* p = taosArrayPush(pTagColNameList, tagNameStr); + if (p == NULL) { + taosArrayDestroy(pTagColNameList); + return TSDB_CODE_OUT_OF_MEMORY; + } + + *pColNameList = pTagColNameList; + return TSDB_CODE_SUCCESS; } void setCreateTableMsgTableName(SVCreateTbReq* pCreateTableReq, SSDataBlock* pDataBlock, const char* stbFullName, @@ -201,18 +226,20 @@ void setCreateTableMsgTableName(SVCreateTbReq* pCreateTableReq, SSDataBlock* pDa static int32_t doBuildAndSendCreateTableMsg(SVnode* pVnode, char* stbFullName, SSDataBlock* pDataBlock, SStreamTask* pTask, int64_t suid) { - STSchema* pTSchema = pTask->outputInfo.tbSink.pTSchema; - int32_t rows = pDataBlock->info.rows; - SArray* tagArray = taosArrayInit(4, sizeof(STagVal)); + STSchema* pTSchema = pTask->outputInfo.tbSink.pTSchema; + int32_t rows = pDataBlock->info.rows; + SArray* tagArray = taosArrayInit(4, sizeof(STagVal)); + const char* id = pTask->id.idStr; + int32_t vgId = pTask->pMeta->vgId; - tqDebug("s-task:%s build create %d table(s) msg", pTask->id.idStr, rows); + tqDebug("s-task:%s build create %d table(s) msg", id, rows); int32_t code = 0; SVCreateTbBatchReq reqs = {0}; SArray* crTblArray = reqs.pArray = taosArrayInit(1, sizeof(SVCreateTbReq)); if (NULL == reqs.pArray) { - tqError("s-task:%s failed to init create table msg, code:%s", pTask->id.idStr, tstrerror(terrno)); + tqError("s-task:%s failed to init create table msg, code:%s", id, tstrerror(terrno)); goto _end; } @@ -222,15 +249,26 @@ static int32_t doBuildAndSendCreateTableMsg(SVnode* pVnode, char* stbFullName, S int32_t size = taosArrayGetSize(pDataBlock->pDataBlock); int32_t numOfTags = TMAX(size - UD_TAG_COLUMN_INDEX, 1); - initCreateTableMsg(pCreateTbReq, suid, stbFullName, numOfTags); + code = initCreateTableMsg(pCreateTbReq, suid, stbFullName, numOfTags); + if (code) { + tqError("s-task:%s vgId:%d failed to init create table msg", id, vgId); + continue; + } taosArrayClear(tagArray); if (size == 2) { STagVal tagVal = { .cid = pTSchema->numOfCols + 1, .type = TSDB_DATA_TYPE_UBIGINT, .i64 = pDataBlock->info.id.groupId}; - taosArrayPush(tagArray, &tagVal); - pCreateTbReq->ctb.tagName = createDefaultTagColName(); + void* p = taosArrayPush(tagArray, &tagVal); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + code = createDefaultTagColName(&pCreateTbReq->ctb.tagName); + if (code) { + return code; + } } else { for (int32_t tagId = UD_TAG_COLUMN_INDEX, step = 1; tagId < size; tagId++, step++) { SColumnInfoData* pTagData = taosArrayGet(pDataBlock->pDataBlock, tagId); @@ -245,14 +283,19 @@ static int32_t doBuildAndSendCreateTableMsg(SVnode* pVnode, char* stbFullName, S } else { memcpy(&tagVal.i64, pData, pTagData->info.bytes); } - taosArrayPush(tagArray, &tagVal); + void* p = taosArrayPush(tagArray, &tagVal); + if (p == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + goto _end; + } } } - tTagNew(tagArray, 1, false, (STag**)&pCreateTbReq->ctb.pTag); + code = tTagNew(tagArray, 1, false, (STag**)&pCreateTbReq->ctb.pTag); taosArrayDestroy(tagArray); tagArray = NULL; - if (pCreateTbReq->ctb.pTag == NULL) { + + if (pCreateTbReq->ctb.pTag == NULL || (code != 0)) { tdDestroySVCreateTbReq(pCreateTbReq); code = TSDB_CODE_OUT_OF_MEMORY; goto _end; @@ -270,22 +313,34 @@ static int32_t doBuildAndSendCreateTableMsg(SVnode* pVnode, char* stbFullName, S setCreateTableMsgTableName(pCreateTbReq, pDataBlock, stbFullName, gid, pTask->ver >= SSTREAM_TASK_SUBTABLE_CHANGED_VER && pTask->subtableWithoutMd5 != 1); - taosArrayPush(reqs.pArray, pCreateTbReq); + void* p = taosArrayPush(reqs.pArray, pCreateTbReq); + if (p == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + goto _end; + } STableSinkInfo* pInfo = NULL; bool alreadyCached = tqGetTableInfo(pTask->outputInfo.tbSink.pTblInfo, gid, &pInfo); if (!alreadyCached) { code = doCreateSinkInfo(pCreateTbReq->name, &pInfo); - doPutIntoCache(pTask->outputInfo.tbSink.pTblInfo, pInfo, gid, pTask->id.idStr); + if (code) { + tqError("vgId:%d failed to create sink tableInfo for table:%s, s-task:%s", vgId, pCreateTbReq->name, id); + continue; + } + + code = doPutIntoCache(pTask->outputInfo.tbSink.pTblInfo, pInfo, gid, id); + if (code) { + tqError("vgId:%d failed to put sink tableInfo:%s into cache, s-task:%s", vgId, pCreateTbReq->name, id); + } } - tqDebug("s-task:%s build create table:%s msg complete", pTask->id.idStr, pCreateTbReq->name); + tqDebug("s-task:%s build create table:%s msg complete", id, pCreateTbReq->name); } reqs.nReqs = taosArrayGetSize(reqs.pArray); code = tqPutReqToQueue(pVnode, &reqs); if (code != TSDB_CODE_SUCCESS) { - tqError("s-task:%s failed to send create table msg", pTask->id.idStr); + tqError("s-task:%s failed to send create table msg", id); } _end: @@ -348,15 +403,26 @@ int32_t doMergeExistedRows(SSubmitTbData* pExisted, const SSubmitTbData* pNew, c SRow* pOldRow = *(SRow**)TARRAY_GET_ELEM(pExisted->aRowP, k); if (pNewRow->ts < pOldRow->ts) { - taosArrayPush(pFinal, &pNewRow); + void* p = taosArrayPush(pFinal, &pNewRow); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } j += 1; } else if (pNewRow->ts > pOldRow->ts) { - taosArrayPush(pFinal, &pOldRow); + void* p = taosArrayPush(pFinal, &pOldRow); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + k += 1; } else { // check for the existance of primary key if (pNewRow->numOfPKs == 0) { - taosArrayPush(pFinal, &pNewRow); + void* p = taosArrayPush(pFinal, &pNewRow); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + k += 1; j += 1; tRowDestroy(pOldRow); @@ -369,7 +435,11 @@ int32_t doMergeExistedRows(SSubmitTbData* pExisted, const SSubmitTbData* pNew, c int32_t ret = tRowKeyCompare(&kNew, &kOld); if (ret <= 0) { - taosArrayPush(pFinal, &pNewRow); + void* p = taosArrayPush(pFinal, &pNewRow); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + j += 1; if (ret == 0) { @@ -377,7 +447,11 @@ int32_t doMergeExistedRows(SSubmitTbData* pExisted, const SSubmitTbData* pNew, c tRowDestroy(pOldRow); } } else { - taosArrayPush(pFinal, &pOldRow); + void* p = taosArrayPush(pFinal, &pOldRow); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + k += 1; } } @@ -386,12 +460,18 @@ int32_t doMergeExistedRows(SSubmitTbData* pExisted, const SSubmitTbData* pNew, c while (j < newLen) { SRow* pRow = *(SRow**)TARRAY_GET_ELEM(pNew->aRowP, j++); - taosArrayPush(pFinal, &pRow); + void* p = taosArrayPush(pFinal, &pRow); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } while (k < oldLen) { SRow* pRow = *(SRow**)TARRAY_GET_ELEM(pExisted->aRowP, k++); - taosArrayPush(pFinal, &pRow); + void* p = taosArrayPush(pFinal, &pRow); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } taosArrayDestroy(pNew->aRowP); @@ -425,34 +505,40 @@ bool isValidDstChildTable(SMetaReader* pReader, int32_t vgId, const char* ctbNam return true; } -SVCreateTbReq* buildAutoCreateTableReq(const char* stbFullName, int64_t suid, int32_t numOfCols, - SSDataBlock* pDataBlock, SArray* pTagArray, bool newSubTableRule) { +int32_t buildAutoCreateTableReq(const char* stbFullName, int64_t suid, int32_t numOfCols, SSDataBlock* pDataBlock, + SArray* pTagArray, bool newSubTableRule, SVCreateTbReq** pReq) { + *pReq = NULL; + SVCreateTbReq* pCreateTbReq = taosMemoryCalloc(1, sizeof(SVCreateTbReq)); if (pCreateTbReq == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return NULL; + return TSDB_CODE_OUT_OF_MEMORY; } taosArrayClear(pTagArray); - initCreateTableMsg(pCreateTbReq, suid, stbFullName, 1); - - STagVal tagVal = {.cid = numOfCols, .type = TSDB_DATA_TYPE_UBIGINT, .i64 = pDataBlock->info.id.groupId}; - taosArrayPush(pTagArray, &tagVal); - - tTagNew(pTagArray, 1, false, (STag**)&pCreateTbReq->ctb.pTag); - - if (pCreateTbReq->ctb.pTag == NULL) { - tdDestroySVCreateTbReq(pCreateTbReq); - taosMemoryFreeClear(pCreateTbReq); - terrno = TSDB_CODE_OUT_OF_MEMORY; - return NULL; + int32_t code = initCreateTableMsg(pCreateTbReq, suid, stbFullName, 1); + if (code != 0) { + return code; } - pCreateTbReq->ctb.tagName = createDefaultTagColName(); + STagVal tagVal = {.cid = numOfCols, .type = TSDB_DATA_TYPE_UBIGINT, .i64 = pDataBlock->info.id.groupId}; + void* p = taosArrayPush(pTagArray, &tagVal); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + code = tTagNew(pTagArray, 1, false, (STag**)&pCreateTbReq->ctb.pTag); + if (pCreateTbReq->ctb.pTag == NULL || (code != 0)) { + tdDestroySVCreateTbReq(pCreateTbReq); + taosMemoryFreeClear(pCreateTbReq); + return code; + } + + code = createDefaultTagColName(&pCreateTbReq->ctb.tagName); // set table name setCreateTableMsgTableName(pCreateTbReq, pDataBlock, stbFullName, pDataBlock->info.id.groupId, newSubTableRule); - return pCreateTbReq; + *pReq = pCreateTbReq; + return code; } int32_t buildSubmitMsgImpl(SSubmitReq2* pSubmitReq, int32_t vgId, void** pMsg, int32_t* msgLen) { @@ -555,7 +641,10 @@ int32_t doConvertRows(SSubmitTbData* pTableData, const STSchema* pTSchema, SSDat break; } SColVal cv = COL_VAL_NULL(pCol->colId, pCol->type); - taosArrayPush(pVals, &cv); + void* p = taosArrayPush(pVals, &cv); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } else { SColumnInfoData* pColData = taosArrayGet(pDataBlock->pDataBlock, dataIndex); if (colDataIsNull_s(pColData, j)) { @@ -566,7 +655,11 @@ int32_t doConvertRows(SSubmitTbData* pTableData, const STSchema* pTSchema, SSDat } SColVal cv = COL_VAL_NULL(pCol->colId, pCol->type); - taosArrayPush(pVals, &cv); + void* p = taosArrayPush(pVals, &cv); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + dataIndex++; } else { void* colData = colDataGetData(pColData, j); @@ -574,12 +667,18 @@ int32_t doConvertRows(SSubmitTbData* pTableData, const STSchema* pTSchema, SSDat SValue sv = (SValue){.type = pCol->type, .nData = varDataLen(colData), .pData = (uint8_t*)varDataVal(colData)}; SColVal cv = COL_VAL_VALUE(pCol->colId, sv); - taosArrayPush(pVals, &cv); + void* p = taosArrayPush(pVals, &cv); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } else { SValue sv = {.type = pCol->type}; memcpy(&sv.val, colData, tDataTypes[pCol->type].bytes); SColVal cv = COL_VAL_VALUE(pCol->colId, sv); - taosArrayPush(pVals, &cv); + void* p = taosArrayPush(pVals, &cv); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } dataIndex++; } @@ -596,7 +695,10 @@ int32_t doConvertRows(SSubmitTbData* pTableData, const STSchema* pTSchema, SSDat } ASSERT(pRow); - taosArrayPush(pTableData->aRowP, &pRow); + void* p = taosArrayPush(pTableData->aRowP, &pRow); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } taosArrayDestroy(pVals); @@ -665,6 +767,7 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat STSchema* pTSchema = pTask->outputInfo.tbSink.pTSchema; int32_t vgId = TD_VID(pVnode); STableSinkInfo* pTableSinkInfo = NULL; + int32_t code = 0; bool alreadyCached = tqGetTableInfo(pTask->outputInfo.tbSink.pTblInfo, groupId, &pTableSinkInfo); @@ -686,7 +789,11 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat } else { // this groupId has not been kept in cache yet if (dstTableName[0] == 0) { memset(dstTableName, 0, TSDB_TABLE_NAME_LEN); - buildCtbNameByGroupIdImpl(stbFullName, groupId, dstTableName); + code = buildCtbNameByGroupIdImpl(stbFullName, groupId, dstTableName); + if (code) { + tqDebug("s-task:%s failed to build auto create table-name:%s, groupId:0x%" PRId64, id, dstTableName, groupId); + return code; + } } else { if (pTask->subtableWithoutMd5 != 1 && !isAutoTableName(dstTableName) && !alreadyAddGroupId(dstTableName, groupId) && groupId != 0) { @@ -699,8 +806,13 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat } } - int32_t code = doCreateSinkInfo(dstTableName, &pTableSinkInfo); - tqDebug("s-task:%s build new sinkTableInfo to add cache, dstTable:%s", id, dstTableName); + code = doCreateSinkInfo(dstTableName, &pTableSinkInfo); + if (code == 0) { + tqDebug("s-task:%s build new sinkTableInfo to add cache, dstTable:%s", id, dstTableName); + } else { + tqDebug("s-task:%s failed to build new sinkTableInfo, dstTable:%s", id, dstTableName); + return code; + } } if (alreadyCached) { @@ -731,20 +843,20 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat SArray* pTagArray = taosArrayInit(pTSchema->numOfCols + 1, sizeof(STagVal)); pTableData->flags = SUBMIT_REQ_AUTO_CREATE_TABLE; - pTableData->pCreateTbReq = + code = buildAutoCreateTableReq(stbFullName, suid, pTSchema->numOfCols + 1, pDataBlock, pTagArray, - pTask->ver >= SSTREAM_TASK_SUBTABLE_CHANGED_VER && pTask->subtableWithoutMd5 != 1); + (pTask->ver >= SSTREAM_TASK_SUBTABLE_CHANGED_VER && pTask->subtableWithoutMd5 != 1), + &pTableData->pCreateTbReq); taosArrayDestroy(pTagArray); - if (pTableData->pCreateTbReq == NULL) { - tqError("s-task:%s failed to build auto create dst-table req:%s, code:%s", id, dstTableName, - tstrerror(terrno)); + if (code) { + tqError("s-task:%s failed to build auto create dst-table req:%s, code:%s", id, dstTableName, tstrerror(code)); taosMemoryFree(pTableSinkInfo); - return terrno; + return code; } pTableSinkInfo->uid = 0; - doPutIntoCache(pTask->outputInfo.tbSink.pTblInfo, pTableSinkInfo, groupId, id); + code = doPutIntoCache(pTask->outputInfo.tbSink.pTblInfo, pTableSinkInfo, groupId, id); } else { metaReaderClear(&mr); @@ -765,12 +877,12 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat pTableSinkInfo->uid = mr.me.uid; metaReaderClear(&mr); - doPutIntoCache(pTask->outputInfo.tbSink.pTblInfo, pTableSinkInfo, groupId, id); + code = doPutIntoCache(pTask->outputInfo.tbSink.pTblInfo, pTableSinkInfo, groupId, id); } } } - return TDB_CODE_SUCCESS; + return code; } int32_t tqSetDstTableDataPayload(uint64_t suid, const STSchema *pTSchema, int32_t blockIndex, SSDataBlock* pDataBlock, @@ -864,14 +976,21 @@ void tqSinkDataIntoDstTable(SStreamTask* pTask, void* vnode, void* data) { if (code != TSDB_CODE_SUCCESS || tbData.aRowP == NULL) { if (tbData.pCreateTbReq != NULL) { tdDestroySVCreateTbReq(tbData.pCreateTbReq); - doRemoveFromCache(pTask->outputInfo.tbSink.pTblInfo, pDataBlock->info.id.groupId, id); + (void) doRemoveFromCache(pTask->outputInfo.tbSink.pTblInfo, pDataBlock->info.id.groupId, id); tbData.pCreateTbReq = NULL; } continue; } - taosArrayPush(submitReq.aSubmitTbData, &tbData); + void* p = taosArrayPush(submitReq.aSubmitTbData, &tbData); + if (p == NULL) { + tqDebug("vgId:%d, s-task:%s failed to build submit msg, data lost", vgId, id); + } + code = doBuildAndSendSubmitMsg(pVnode, pTask, &submitReq, 1); + if (code) { // failed and continue + tqDebug("vgId:%d, s-task:%s submit msg failed, data lost", vgId, id); + } } } } else { @@ -918,16 +1037,24 @@ void tqSinkDataIntoDstTable(SStreamTask* pTask, void* vnode, void* data) { if (code != TSDB_CODE_SUCCESS || tbData.aRowP == NULL) { if (tbData.pCreateTbReq != NULL) { tdDestroySVCreateTbReq(tbData.pCreateTbReq); - doRemoveFromCache(pTask->outputInfo.tbSink.pTblInfo, groupId, id); + (void) doRemoveFromCache(pTask->outputInfo.tbSink.pTblInfo, groupId, id); tbData.pCreateTbReq = NULL; } continue; } - taosArrayPush(submitReq.aSubmitTbData, &tbData); + void* p = taosArrayPush(submitReq.aSubmitTbData, &tbData); + if (p == NULL) { + tqError("vgId:%d, s-task:%s failed to build submit msg, data lost", vgId, id); + continue; + } int32_t size = (int32_t)taosArrayGetSize(submitReq.aSubmitTbData) - 1; - taosHashPut(pTableIndexMap, &groupId, sizeof(groupId), &size, sizeof(size)); + code = taosHashPut(pTableIndexMap, &groupId, sizeof(groupId), &size, sizeof(size)); + if (code) { + tqError("vgId:%d, s-task:%s failed to put group into index map, code:%s", vgId, id, tstrerror(code)); + continue; + } } else { code = tqSetDstTableDataPayload(suid, pTSchema, i, pDataBlock, &tbData, earlyTs, id); if (code != TSDB_CODE_SUCCESS || tbData.aRowP == NULL) { @@ -951,7 +1078,10 @@ void tqSinkDataIntoDstTable(SStreamTask* pTask, void* vnode, void* data) { taosHashCleanup(pTableIndexMap); if (hasSubmit) { - doBuildAndSendSubmitMsg(pVnode, pTask, &submitReq, numOfBlocks); + code = doBuildAndSendSubmitMsg(pVnode, pTask, &submitReq, numOfBlocks); + if (code) { // failed and continue + tqError("vgId:%d failed to build and send submit msg", vgId); + } } else { tDestroySubmitReq(&submitReq, TSDB_MSG_FLG_ENCODE); tqDebug("vgId:%d, s-task:%s write results completed", vgId, id); @@ -989,7 +1119,11 @@ int32_t doRemoveFromCache(SSHashObj* pSinkTableMap, uint64_t groupId, const char } int32_t code = tSimpleHashRemove(pSinkTableMap, &groupId, sizeof(groupId)); - tqDebug("s-task:%s remove cached table meta for groupId:%" PRId64, id, groupId); + if (code == 0) { + tqDebug("s-task:%s remove cached table meta for groupId:%" PRId64, id, groupId); + } else { + tqError("s-task:%s failed to remove table meta from hashmap, groupId:%" PRId64, id, groupId); + } return code; } @@ -1019,10 +1153,14 @@ int32_t doBuildAndSendDeleteMsg(SVnode* pVnode, char* stbFullName, SSDataBlock* void* serializedDeleteReq = rpcMallocCont(len + sizeof(SMsgHead)); void* abuf = POINTER_SHIFT(serializedDeleteReq, sizeof(SMsgHead)); tEncoderInit(&encoder, abuf, len); - tEncodeSBatchDeleteReq(&encoder, &deleteReq); + code = tEncodeSBatchDeleteReq(&encoder, &deleteReq); tEncoderClear(&encoder); taosArrayDestroy(deleteReq.deleteReqs); + if (code) { + return code; + } + ((SMsgHead*)serializedDeleteReq)->vgId = TD_VID(pVnode); SRpcMsg msg = {.msgType = TDMT_VND_BATCH_DEL, .pCont = serializedDeleteReq, .contLen = len + sizeof(SMsgHead)}; diff --git a/source/dnode/vnode/src/tq/tqStreamTask.c b/source/dnode/vnode/src/tq/tqStreamTask.c index 0e5b1b6fb7..c84e016459 100644 --- a/source/dnode/vnode/src/tq/tqStreamTask.c +++ b/source/dnode/vnode/src/tq/tqStreamTask.c @@ -37,7 +37,12 @@ int32_t tqScanWal(STQ* pTq) { // check all tasks int32_t numOfTasks = 0; bool shouldIdle = true; - doScanWalForAllTasks(pMeta, &shouldIdle); + + int32_t code = doScanWalForAllTasks(pMeta, &shouldIdle); + if (code) { + tqError("vgId:%d failed to start all tasks, try next time", vgId); + return code; + } streamMetaWLock(pMeta); int32_t times = (--pMeta->scanInfo.scanCounter); @@ -51,9 +56,13 @@ int32_t tqScanWal(STQ* pTq) { if (times > 0) { tqDebug("vgId:%d scan wal for stream tasks for %d times in %dms", vgId, times, SCAN_WAL_IDLE_DURATION); - tqScanWalInFuture(pTq, numOfTasks, SCAN_WAL_IDLE_DURATION); + code = tqScanWalInFuture(pTq, numOfTasks, SCAN_WAL_IDLE_DURATION); + if (code) { + tqError("vgId:%d sched scan wal in %dms failed, ignore this failure", vgId, SCAN_WAL_IDLE_DURATION); + } } - return 0; + + return code; } typedef struct SBuildScanWalMsgParam { @@ -69,28 +78,44 @@ static void doStartScanWal(void* param, void* tmrId) { tqDebug("vgId:%d create msg to start wal scan, numOfTasks:%d, vnd restored:%d", vgId, pParam->numOfTasks, pTq->pVnode->restored); - /*int32_t code = */ streamTaskSchedTask(&pTq->pVnode->msgCb, vgId, 0, 0, STREAM_EXEC_T_EXTRACT_WAL_DATA); + int32_t code = streamTaskSchedTask(&pTq->pVnode->msgCb, vgId, 0, 0, STREAM_EXEC_T_EXTRACT_WAL_DATA); taosMemoryFree(pParam); + + if (code) { + tqError("vgId:%d failed sched task to scan wal", vgId); + } } int32_t tqScanWalInFuture(STQ* pTq, int32_t numOfTasks, int32_t idleDuration) { SStreamMeta* pMeta = pTq->pStreamMeta; + int32_t code = 0; + int32_t vgId = TD_VID(pTq->pVnode); SBuildScanWalMsgParam* pParam = taosMemoryMalloc(sizeof(SBuildScanWalMsgParam)); + if (pParam == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } pParam->pTq = pTq; pParam->numOfTasks = numOfTasks; - tmr_h pTimer = streamTimerGetInstance(); - ASSERT(pTimer); + tmr_h pTimer = NULL; + code = streamTimerGetInstance(&pTimer); + if (code) { + tqError("vgId:%d failed to get tmr ctrl during sched scan wal", vgId); + return code; + } if (pMeta->scanInfo.scanTimer == NULL) { pMeta->scanInfo.scanTimer = taosTmrStart(doStartScanWal, idleDuration, pParam, pTimer); } else { - taosTmrReset(doStartScanWal, idleDuration, pParam, pTimer, &pMeta->scanInfo.scanTimer); + code = taosTmrReset(doStartScanWal, idleDuration, pParam, pTimer, &pMeta->scanInfo.scanTimer); + if (code) { + tqError("vgId:%d failed to start scan wal in:%dms", vgId, idleDuration); + } } - return TSDB_CODE_SUCCESS; + return code; } int32_t tqScanWalAsync(STQ* pTq, bool ckPause) { @@ -207,7 +232,11 @@ bool handleFillhistoryScanComplete(SStreamTask* pTask, int64_t ver) { double el = (taosGetTimestampMs() - pTask->execInfo.step2Start) / 1000.0; qDebug("s-task:%s scan-history from WAL stage(step 2) ended, range:%" PRId64 "-%" PRId64 ", elapsed time:%.2fs", id, pTask->step2Range.minVer, maxVer, el); - /*int32_t code = */streamTaskPutTranstateIntoInputQ(pTask); + int32_t code = streamTaskPutTranstateIntoInputQ(pTask); + if (code) { + qError("s-task:%s failed to put trans-state into inputQ", id); + } + return true; } else { qWarn("s-task:%s fill-history scan WAL, nextProcessVer:%" PRId64 " out of the ver range:%" PRId64 "-%" PRId64 @@ -290,8 +319,12 @@ bool doPutDataIntoInputQ(SStreamTask* pTask, int64_t maxVer, int32_t* numOfItems break; } } else { - walReaderSeekVer(pTask->exec.pWalReader, pTask->chkInfo.nextProcessVer); tqTrace("s-task:%s append input queue failed, code:too many items, ver:%" PRId64, id, pTask->chkInfo.nextProcessVer); + code = walReaderSeekVer(pTask->exec.pWalReader, pTask->chkInfo.nextProcessVer); + if (code) { + tqError("s-task:%s failed to seek ver to:%"PRId64 " in wal", id, pTask->chkInfo.nextProcessVer); + } + break; } } @@ -347,18 +380,18 @@ int32_t doScanWalForAllTasks(SStreamMeta* pStreamMeta, bool* pScanIdle) { int32_t numOfItems = streamQueueGetNumOfItems(pTask->inputq.queue); int64_t maxVer = (pTask->info.fillHistory == 1) ? pTask->step2Range.maxVer : INT64_MAX; - taosThreadMutexLock(&pTask->lock); + streamMutexLock(&pTask->lock); SStreamTaskState pState = streamTaskGetStatus(pTask); if (pState.state != TASK_STATUS__READY) { tqDebug("s-task:%s not ready for submit block from wal, status:%s", pTask->id.idStr, pState.name); - taosThreadMutexUnlock(&pTask->lock); + streamMutexUnlock(&pTask->lock); streamMetaReleaseTask(pStreamMeta, pTask); continue; } bool hasNewData = doPutDataIntoInputQ(pTask, maxVer, &numOfItems); - taosThreadMutexUnlock(&pTask->lock); + streamMutexUnlock(&pTask->lock); if ((numOfItems > 0) || hasNewData) { noDataInWal = false; @@ -366,7 +399,7 @@ int32_t doScanWalForAllTasks(SStreamMeta* pStreamMeta, bool* pScanIdle) { if (code != TSDB_CODE_SUCCESS) { streamMetaReleaseTask(pStreamMeta, pTask); taosArrayDestroy(pTaskList); - return -1; + return code; } } @@ -379,5 +412,5 @@ int32_t doScanWalForAllTasks(SStreamMeta* pStreamMeta, bool* pScanIdle) { } taosArrayDestroy(pTaskList); - return 0; + return TSDB_CODE_SUCCESS; } diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 3c6100a8f4..b56c474ed5 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -160,6 +160,7 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM SRpcMsg rsp = {.info = pMsg->info, .code = TSDB_CODE_SUCCESS}; int64_t st = taosGetTimestampMs(); bool updated = false; + int32_t code = 0; SStreamTaskNodeUpdateMsg req = {0}; @@ -258,26 +259,40 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM } } - // save + // stream do update the nodeEp info, write it into stream meta. if (updated) { tqDebug("s-task:%s vgId:%d save task after update epset, and stop task", idstr, vgId); - streamMetaSaveTask(pMeta, pTask); + code = streamMetaSaveTask(pMeta, pTask); + if (code) { + tqError("s-task:%s vgId:%d failed to save task, code:%s", idstr, vgId, tstrerror(code)); + } + if (ppHTask != NULL) { - streamMetaSaveTask(pMeta, *ppHTask); + code = streamMetaSaveTask(pMeta, *ppHTask); + if (code) { + tqError("s-task:%s vgId:%d failed to save related history task, code:%s", idstr, vgId, tstrerror(code)); + } } } else { tqDebug("s-task:%s vgId:%d not save task since not update epset actually, stop task", idstr, vgId); } - streamTaskStop(pTask); + code = streamTaskStop(pTask); + if (code) { + tqError("s-task:%s vgId:%d failed to stop task, code:%s", idstr, vgId, tstrerror(code)); + } + if (ppHTask != NULL) { - streamTaskStop(*ppHTask); + code = streamTaskStop(*ppHTask); + if (code) { + tqError("s-task:%s vgId:%d failed to stop related history task, code:%s", idstr, vgId, tstrerror(code)); + } } // keep info streamMetaAddIntoUpdateTaskList(pMeta, pTask, (ppHTask != NULL) ? (*ppHTask) : NULL, req.transId, st); - rsp.code = 0; + rsp.code = TSDB_CODE_SUCCESS; // possibly only handle the stream task. int32_t numOfTasks = streamMetaGetNumOfTasks(pMeta); @@ -305,13 +320,16 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM #if 0 taosMSleep(5000);// for test purpose, to trigger the leader election #endif - tqStreamTaskStartAsync(pMeta, cb, true); + code = tqStreamTaskStartAsync(pMeta, cb, true); + if (code) { + tqError("vgId:%d async start all tasks, failed, code:%s", vgId, tstrerror(code)); + } } } streamMetaWUnLock(pMeta); taosArrayDestroy(req.pNodeList); - return rsp.code; + return rsp.code; // always return true } int32_t tqStreamTaskProcessDispatchReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { @@ -333,7 +351,7 @@ int32_t tqStreamTaskProcessDispatchReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, req.streamId, req.taskId, &pTask); - if (pTask) { + if (pTask && (code == 0)) { SRpcMsg rsp = {.info = pMsg->info, .code = 0}; if (streamProcessDispatchMsg(pTask, &req, &rsp) != 0) { return -1; @@ -393,14 +411,14 @@ int32_t tqStreamTaskProcessDispatchRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pRsp->streamId, pRsp->upstreamTaskId, &pTask); - if (pTask) { - streamProcessDispatchRsp(pTask, pRsp, pMsg->code); + if (pTask && (code == 0)) { + code = streamProcessDispatchRsp(pTask, pRsp, pMsg->code); streamMetaReleaseTask(pMeta, pTask); - return TSDB_CODE_SUCCESS; + return code; } else { tqDebug("vgId:%d failed to handle the dispatch rsp, since find task:0x%x failed", vgId, pRsp->upstreamTaskId); terrno = TSDB_CODE_STREAM_TASK_NOT_EXIST; - return terrno; + return TSDB_CODE_STREAM_TASK_NOT_EXIST; } } @@ -408,16 +426,22 @@ int32_t tqStreamTaskProcessRetrieveReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { char* msgStr = pMsg->pCont; char* msgBody = POINTER_SHIFT(msgStr, sizeof(SMsgHead)); int32_t msgLen = pMsg->contLen - sizeof(SMsgHead); + int32_t code = 0; SDecoder decoder; SStreamRetrieveReq req; tDecoderInit(&decoder, (uint8_t*)msgBody, msgLen); - tDecodeStreamRetrieveReq(&decoder, &req); + code = tDecodeStreamRetrieveReq(&decoder, &req); tDecoderClear(&decoder); + if (code) { + tqError("vgId:%d failed to decode retrieve msg, quit handling it", pMeta->vgId); + return code; + } + SStreamTask* pTask = NULL; - int32_t code = streamMetaAcquireTask(pMeta, req.streamId, req.dstTaskId, &pTask); - if (pTask == NULL) { + code = streamMetaAcquireTask(pMeta, req.streamId, req.dstTaskId, &pTask); + if (pTask == NULL || code != 0) { tqError("vgId:%d process retrieve req, failed to acquire task:0x%x, it may have been dropped already", pMeta->vgId, req.dstTaskId); tCleanupStreamRetrieveReq(&req); @@ -446,6 +470,7 @@ int32_t tqStreamTaskProcessCheckReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { char* msgStr = pMsg->pCont; char* msgBody = POINTER_SHIFT(msgStr, sizeof(SMsgHead)); int32_t msgLen = pMsg->contLen - sizeof(SMsgHead); + int32_t code = 0; SStreamTaskCheckReq req; SStreamTaskCheckRsp rsp = {0}; @@ -453,9 +478,14 @@ int32_t tqStreamTaskProcessCheckReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { SDecoder decoder; tDecoderInit(&decoder, (uint8_t*)msgBody, msgLen); - tDecodeStreamTaskCheckReq(&decoder, &req); + code = tDecodeStreamTaskCheckReq(&decoder, &req); tDecoderClear(&decoder); + if (code) { + tqError("vgId:%d decode check msg failed, not handle this msg", pMeta->vgId); + return code; + } + streamTaskProcessCheckMsg(pMeta, &req, &rsp); return streamTaskSendCheckRsp(pMeta, req.upstreamNodeId, &rsp, &pMsg->info, req.upstreamTaskId); } @@ -490,7 +520,7 @@ int32_t tqStreamTaskProcessCheckRsp(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLe SStreamTask* pTask = NULL; code = streamMetaAcquireTask(pMeta, rsp.streamId, rsp.upstreamTaskId, &pTask); - if (pTask == NULL) { + if ((pTask == NULL) || (code != 0)) { return streamMetaAddFailedTask(pMeta, rsp.streamId, rsp.upstreamTaskId); } @@ -518,19 +548,25 @@ int32_t tqStreamTaskProcessCheckpointReadyMsg(SStreamMeta* pMeta, SRpcMsg* pMsg) SStreamTask* pTask = NULL; code = streamMetaAcquireTask(pMeta, req.streamId, req.upstreamTaskId, &pTask); - if (pTask == NULL) { + if (code != 0) { tqError("vgId:%d failed to find s-task:0x%x, it may have been destroyed already", vgId, req.downstreamTaskId); - return TSDB_CODE_STREAM_TASK_NOT_EXIST; + return code; } tqDebug("vgId:%d s-task:%s received the checkpoint-ready msg from task:0x%x (vgId:%d), handle it", vgId, pTask->id.idStr, req.downstreamTaskId, req.downstreamNodeId); - streamProcessCheckpointReadyMsg(pTask, req.checkpointId, req.downstreamTaskId, req.downstreamNodeId); + code = streamProcessCheckpointReadyMsg(pTask, req.checkpointId, req.downstreamTaskId, req.downstreamNodeId); streamMetaReleaseTask(pMeta, pTask); + if (code) { + return code; + } { // send checkpoint ready rsp SMStreamCheckpointReadyRspMsg* pReadyRsp = rpcMallocCont(sizeof(SMStreamCheckpointReadyRspMsg)); + if (pReadyRsp == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } pReadyRsp->upstreamTaskId = req.upstreamTaskId; pReadyRsp->upstreamNodeId = req.upstreamNodeId; @@ -606,8 +642,8 @@ int32_t tqStreamTaskProcessDeployReq(SStreamMeta* pMeta, SMsgCb* cb, int64_t sve if (restored) { SStreamTask* p = NULL; code = streamMetaAcquireTask(pMeta, streamId, taskId, &p); - if ((p != NULL) && (p->info.fillHistory == 0)) { - tqStreamStartOneTaskAsync(pMeta, cb, streamId, taskId); + if ((p != NULL) && (code == 0) && (p->info.fillHistory == 0)) { + code = tqStreamStartOneTaskAsync(pMeta, cb, streamId, taskId); } if (p != NULL) { @@ -631,6 +667,7 @@ int32_t tqStreamTaskProcessDeployReq(SStreamMeta* pMeta, SMsgCb* cb, int64_t sve int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen) { SVDropStreamTaskReq* pReq = (SVDropStreamTaskReq*)msg; + int32_t code = 0; int32_t vgId = pMeta->vgId; STaskId hTaskId = {0}; tqDebug("vgId:%d receive msg to drop s-task:0x%x", vgId, pReq->taskId); @@ -649,8 +686,12 @@ int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen } streamTaskSetRemoveBackendFiles(pTask); - streamTaskClearHTaskAttr(pTask, pReq->resetRelHalt); + code = streamTaskClearHTaskAttr(pTask, pReq->resetRelHalt); streamMetaReleaseTask(pMeta, pTask); + + if (code) { + tqError("s-task:0x%x failed to clear related fill-history info, still exists", pReq->taskId); + } } streamMetaWUnLock(pMeta); @@ -658,11 +699,17 @@ int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen // drop the related fill-history task firstly if (hTaskId.taskId != 0 && hTaskId.streamId != 0) { tqDebug("s-task:0x%x vgId:%d drop rel fill-history task:0x%x firstly", pReq->taskId, vgId, (int32_t)hTaskId.taskId); - streamMetaUnregisterTask(pMeta, hTaskId.streamId, hTaskId.taskId); + code = streamMetaUnregisterTask(pMeta, hTaskId.streamId, hTaskId.taskId); + if (code) { + tqDebug("s-task:0x%x vgId:%d drop rel fill-history task:0x%x failed", pReq->taskId, vgId, (int32_t)hTaskId.taskId); + } } // drop the stream task now - streamMetaUnregisterTask(pMeta, pReq->streamId, pReq->taskId); + code = streamMetaUnregisterTask(pMeta, pReq->streamId, pReq->taskId); + if (code) { + tqDebug("s-task:0x%x vgId:%d drop task failed", pReq->taskId, vgId); + } // commit the update streamMetaWLock(pMeta); @@ -674,12 +721,13 @@ int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen } streamMetaWUnLock(pMeta); - return 0; + return 0; // always return success } int32_t tqStreamTaskProcessUpdateCheckpointReq(SStreamMeta* pMeta, bool restored, char* msg) { SVUpdateCheckpointInfoReq* pReq = (SVUpdateCheckpointInfoReq*)msg; + int32_t code = 0; int32_t vgId = pMeta->vgId; tqDebug("vgId:%d receive msg to update-checkpoint-info for s-task:0x%x", vgId, pReq->taskId); @@ -689,7 +737,7 @@ int32_t tqStreamTaskProcessUpdateCheckpointReq(SStreamMeta* pMeta, bool restored SStreamTask** ppTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (ppTask != NULL && (*ppTask) != NULL) { - streamTaskUpdateTaskCheckpointInfo(*ppTask, restored, pReq); + code = streamTaskUpdateTaskCheckpointInfo(*ppTask, restored, pReq); } else { // failed to get the task. int32_t numOfTasks = streamMetaGetNumOfTasks(pMeta); tqError( @@ -700,7 +748,7 @@ int32_t tqStreamTaskProcessUpdateCheckpointReq(SStreamMeta* pMeta, bool restored streamMetaWUnLock(pMeta); // always return success when handling the requirement issued by mnode during transaction. - return TSDB_CODE_SUCCESS; + return code; } static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { @@ -746,7 +794,7 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { if (isLeader && !tsDisableStream) { streamMetaWUnLock(pMeta); - streamMetaStartAllTasks(pMeta); + code = streamMetaStartAllTasks(pMeta); } else { streamMetaResetStartInfo(&pMeta->startInfo, pMeta->vgId); pMeta->startInfo.restartCount = 0; @@ -765,16 +813,16 @@ int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLead int32_t vgId = pMeta->vgId; if (type == STREAM_EXEC_T_START_ONE_TASK) { - streamMetaStartOneTask(pMeta, pReq->streamId, pReq->taskId); + (void) streamMetaStartOneTask(pMeta, pReq->streamId, pReq->taskId); return 0; } else if (type == STREAM_EXEC_T_START_ALL_TASKS) { - streamMetaStartAllTasks(pMeta); + (void) streamMetaStartAllTasks(pMeta); return 0; } else if (type == STREAM_EXEC_T_RESTART_ALL_TASKS) { - restartStreamTasks(pMeta, isLeader); + (void) restartStreamTasks(pMeta, isLeader); return 0; } else if (type == STREAM_EXEC_T_STOP_ALL_TASKS) { - streamMetaStopAllTasks(pMeta); + (void) streamMetaStopAllTasks(pMeta); return 0; } else if (type == STREAM_EXEC_T_ADD_FAILED_TASK) { int32_t code = streamMetaAddFailedTask(pMeta, pReq->streamId, pReq->taskId); @@ -783,7 +831,7 @@ int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLead SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId, &pTask); - if (pTask != NULL) { + if (pTask != NULL && (code == 0)) { char* pStatus = NULL; if (streamTaskReadyToRun(pTask, &pStatus)) { int64_t execTs = pTask->status.lastExecTs; @@ -804,12 +852,12 @@ int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLead SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId, &pTask); - if (pTask != NULL) { // even in halt status, the data in inputQ must be processed + if ((pTask != NULL) && (code == 0)) { // even in halt status, the data in inputQ must be processed char* p = NULL; if (streamTaskReadyToRun(pTask, &p)) { tqDebug("vgId:%d s-task:%s status:%s start to process block from inputQ, next checked ver:%" PRId64, vgId, pTask->id.idStr, p, pTask->chkInfo.nextProcessVer); - streamExecTask(pTask); + (void) streamExecTask(pTask); } else { int8_t status = streamTaskSetSchedStatusInactive(pTask); tqDebug("vgId:%d s-task:%s ignore run req since not in ready state, status:%s, sched-status:%d", vgId, @@ -829,6 +877,7 @@ int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta) { STaskStartInfo* pStartInfo = &pMeta->startInfo; int32_t vgId = pMeta->vgId; bool scanWal = false; + int32_t code = 0; streamMetaWLock(pMeta); if (pStartInfo->startAllTasks == 1) { @@ -844,8 +893,7 @@ int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta) { pStartInfo->restartCount); streamMetaWUnLock(pMeta); - restartStreamTasks(pMeta, (pMeta->role == NODE_ROLE_LEADER)); - return TSDB_CODE_SUCCESS; + return restartStreamTasks(pMeta, (pMeta->role == NODE_ROLE_LEADER)); } else { if (pStartInfo->restartCount == 0) { tqDebug("vgId:%d start all tasks completed in callbackFn, restartCount is 0", pMeta->vgId); @@ -862,10 +910,10 @@ int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta) { if (scanWal && (vgId != SNODE_HANDLE)) { tqDebug("vgId:%d start scan wal for executing tasks", vgId); - tqScanWalAsync(pMeta->ahandle, true); + code = tqScanWalAsync(pMeta->ahandle, true); } - return TSDB_CODE_SUCCESS; + return code; } int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, char* pMsg) { @@ -873,7 +921,7 @@ int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, char* pMsg) { SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId, &pTask); - if (pTask == NULL) { + if (pTask == NULL || (code != 0)) { tqError("vgId:%d process task-reset req, failed to acquire task:0x%x, it may have been dropped already", pMeta->vgId, pReq->taskId); return TSDB_CODE_SUCCESS; @@ -881,7 +929,7 @@ int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, char* pMsg) { tqDebug("s-task:%s receive task-reset msg from mnode, reset status and ready for data processing", pTask->id.idStr); - taosThreadMutexLock(&pTask->lock); + streamMutexLock(&pTask->lock); streamTaskClearCheckInfo(pTask, true); // clear flag set during do checkpoint, and open inputQ for all upstream tasks @@ -904,7 +952,7 @@ int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, char* pMsg) { tqDebug("s-task:%s status:%s do nothing after receiving reset-task from mnode", pTask->id.idStr, pState.name); } - taosThreadMutexUnlock(&pTask->lock); + streamMutexUnlock(&pTask->lock); streamMetaReleaseTask(pMeta, pTask); return TSDB_CODE_SUCCESS; @@ -915,7 +963,7 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->upstreamTaskId, &pTask); - if (pTask == NULL) { + if (pTask == NULL || (code != 0)) { tqError("vgId:%d process retrieve checkpoint trigger, checkpointId:%" PRId64 " from s-task:0x%x, failed to acquire task:0x%x, it may have been dropped already", pMeta->vgId, pReq->checkpointId, (int32_t)pReq->downstreamTaskId, pReq->upstreamTaskId); @@ -929,11 +977,10 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) tqError("s-task:%s not ready for checkpoint-trigger retrieve from 0x%x, since downstream not ready", pTask->id.idStr, (int32_t)pReq->downstreamTaskId); - streamTaskSendCheckpointTriggerMsg(pTask, pReq->downstreamTaskId, pReq->downstreamNodeId, &pMsg->info, + code = streamTaskSendCheckpointTriggerMsg(pTask, pReq->downstreamTaskId, pReq->downstreamNodeId, &pMsg->info, TSDB_CODE_STREAM_TASK_IVLD_STATUS); streamMetaReleaseTask(pMeta, pTask); - - return TSDB_CODE_SUCCESS; + return code; } SStreamTaskState pState = streamTaskGetStatus(pTask); @@ -948,7 +995,7 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) // re-send the lost checkpoint-trigger msg to downstream task tqDebug("s-task:%s re-send checkpoint-trigger to:0x%x, checkpointId:%" PRId64 ", transId:%d", pTask->id.idStr, (int32_t)pReq->downstreamTaskId, checkpointId, transId); - streamTaskSendCheckpointTriggerMsg(pTask, pReq->downstreamTaskId, pReq->downstreamNodeId, &pMsg->info, + code = streamTaskSendCheckpointTriggerMsg(pTask, pReq->downstreamTaskId, pReq->downstreamNodeId, &pMsg->info, TSDB_CODE_SUCCESS); } else { // not send checkpoint-trigger yet, wait int32_t recv = 0, total = 0; @@ -962,7 +1009,7 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) "sending checkpoint-source/trigger", pTask->id.idStr, recv, total); } - streamTaskSendCheckpointTriggerMsg(pTask, pReq->downstreamTaskId, pReq->downstreamNodeId, &pMsg->info, + code = streamTaskSendCheckpointTriggerMsg(pTask, pReq->downstreamTaskId, pReq->downstreamNodeId, &pMsg->info, TSDB_CODE_ACTION_IN_PROGRESS); } } else { // upstream not recv the checkpoint-source/trigger till now @@ -971,12 +1018,12 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) "s-task:%s not recv checkpoint-source from mnode or checkpoint-trigger from upstream yet, wait for all " "upstream sending checkpoint-source/trigger", pTask->id.idStr); - streamTaskSendCheckpointTriggerMsg(pTask, pReq->downstreamTaskId, pReq->downstreamNodeId, &pMsg->info, + code = streamTaskSendCheckpointTriggerMsg(pTask, pReq->downstreamTaskId, pReq->downstreamNodeId, &pMsg->info, TSDB_CODE_ACTION_IN_PROGRESS); } streamMetaReleaseTask(pMeta, pTask); - return TSDB_CODE_SUCCESS; + return code; } int32_t tqStreamTaskProcessRetrieveTriggerRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { @@ -984,19 +1031,19 @@ int32_t tqStreamTaskProcessRetrieveTriggerRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pRsp->streamId, pRsp->taskId, &pTask); - if (pTask == NULL) { + if (pTask == NULL || (code != 0)) { tqError( "vgId:%d process retrieve checkpoint-trigger, failed to acquire task:0x%x, it may have been dropped already", pMeta->vgId, pRsp->taskId); - return TSDB_CODE_STREAM_TASK_NOT_EXIST; + return code; } tqDebug("s-task:%s recv re-send checkpoint-trigger msg from upstream:0x%x, checkpointId:%" PRId64 ", transId:%d", pTask->id.idStr, pRsp->upstreamTaskId, pRsp->checkpointId, pRsp->transId); - streamTaskProcessCheckpointTriggerRsp(pTask, pRsp); + code = streamTaskProcessCheckpointTriggerRsp(pTask, pRsp); streamMetaReleaseTask(pMeta, pTask); - return TSDB_CODE_SUCCESS; + return code; } int32_t tqStreamTaskProcessTaskPauseReq(SStreamMeta* pMeta, char* pMsg) { @@ -1004,7 +1051,7 @@ int32_t tqStreamTaskProcessTaskPauseReq(SStreamMeta* pMeta, char* pMsg) { SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId, &pTask); - if (pTask == NULL) { + if (pTask == NULL || (code != 0)) { tqError("vgId:%d process pause req, failed to acquire task:0x%x, it may have been dropped already", pMeta->vgId, pReq->taskId); // since task is in [STOP|DROPPING] state, it is safe to assume the pause is active @@ -1018,7 +1065,7 @@ int32_t tqStreamTaskProcessTaskPauseReq(SStreamMeta* pMeta, char* pMsg) { if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { pHistoryTask = NULL; code = streamMetaAcquireTask(pMeta, pTask->hTaskInfo.id.streamId, pTask->hTaskInfo.id.taskId, &pHistoryTask); - if (pHistoryTask == NULL) { + if (pHistoryTask == NULL || (code != 0)) { tqError("vgId:%d process pause req, failed to acquire fill-history task:0x%" PRIx64 ", it may have been dropped already", pMeta->vgId, pTask->hTaskInfo.id.taskId); @@ -1042,6 +1089,8 @@ static int32_t tqProcessTaskResumeImpl(void* handle, SStreamTask* pTask, int64_t bool fromVnode) { SStreamMeta* pMeta = fromVnode ? ((STQ*)handle)->pStreamMeta : handle; int32_t vgId = pMeta->vgId; + int32_t code = 0; + if (pTask == NULL) { return -1; } @@ -1065,18 +1114,18 @@ static int32_t tqProcessTaskResumeImpl(void* handle, SStreamTask* pTask, int64_t if (level == TASK_LEVEL__SOURCE && pTask->info.fillHistory && status == TASK_STATUS__SCAN_HISTORY) { pTask->hTaskInfo.operatorOpen = false; - streamStartScanHistoryAsync(pTask, igUntreated); + code = streamStartScanHistoryAsync(pTask, igUntreated); } else if (level == TASK_LEVEL__SOURCE && (streamQueueGetNumOfItems(pTask->inputq.queue) == 0)) { - tqScanWalAsync((STQ*)handle, false); + code = tqScanWalAsync((STQ*)handle, false); } else { - streamTrySchedExec(pTask); + code = streamTrySchedExec(pTask); } } /*else { ASSERT(status != TASK_STATUS__UNINIT); }*/ streamMetaReleaseTask(pMeta, pTask); - return 0; + return code; } int32_t tqStreamTaskProcessTaskResumeReq(void* handle, int64_t sversion, char* msg, bool fromVnode) { @@ -1086,15 +1135,15 @@ int32_t tqStreamTaskProcessTaskResumeReq(void* handle, int64_t sversion, char* m SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId, &pTask); - if (pTask == NULL) { + if (pTask == NULL || (code != 0)) { tqError("s-task:0x%x failed to acquire task to resume, it may have been dropped or stopped", pReq->taskId); return TSDB_CODE_STREAM_TASK_IVLD_STATUS; } - taosThreadMutexLock(&pTask->lock); + streamMutexLock(&pTask->lock); SStreamTaskState pState = streamTaskGetStatus(pTask); tqDebug("s-task:%s start to resume from paused, current status:%s", pTask->id.idStr, pState.name); - taosThreadMutexUnlock(&pTask->lock); + streamMutexUnlock(&pTask->lock); code = tqProcessTaskResumeImpl(handle, pTask, sversion, pReq->igUntreated, fromVnode); if (code != 0) { @@ -1104,11 +1153,11 @@ int32_t tqStreamTaskProcessTaskResumeReq(void* handle, int64_t sversion, char* m STaskId* pHTaskId = &pTask->hTaskInfo.id; SStreamTask* pHTask = NULL; code = streamMetaAcquireTask(pMeta, pHTaskId->streamId, pHTaskId->taskId, &pHTask); - if (pHTask) { - taosThreadMutexLock(&pHTask->lock); + if (pHTask && (code == 0)) { + streamMutexLock(&pHTask->lock); SStreamTaskState p = streamTaskGetStatus(pHTask); tqDebug("s-task:%s related history task start to resume from paused, current status:%s", pHTask->id.idStr, p.name); - taosThreadMutexUnlock(&pHTask->lock); + streamMutexUnlock(&pHTask->lock); code = tqProcessTaskResumeImpl(handle, pHTask, sversion, pReq->igUntreated, fromVnode); } @@ -1139,15 +1188,15 @@ int32_t tqStreamProcessCheckpointReadyRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { SStreamTask* pTask = NULL; int32_t code = streamMetaAcquireTask(pMeta, pRsp->streamId, pRsp->downstreamTaskId, &pTask); - if (pTask == NULL) { + if (pTask == NULL || (code != 0)) { tqError("vgId:%d failed to acquire task:0x%x when handling checkpoint-ready msg, it may have been dropped", pRsp->downstreamNodeId, pRsp->downstreamTaskId); - return TSDB_CODE_STREAM_TASK_NOT_EXIST; + return code; } - streamTaskProcessCheckpointReadyRsp(pTask, pRsp->upstreamTaskId, pRsp->checkpointId); + code = streamTaskProcessCheckpointReadyRsp(pTask, pRsp->upstreamTaskId, pRsp->checkpointId); streamMetaReleaseTask(pMeta, pTask); - return TSDB_CODE_SUCCESS; + return code; } int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { @@ -1173,11 +1222,11 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { SStreamTask* pTask = NULL; code = streamMetaAcquireTask(pMeta, req.streamId, req.taskId, &pTask); - if (pTask == NULL) { + if (pTask == NULL || (code != 0)) { tqError("vgId:%d process set consensus checkpointId req, failed to acquire task:0x%x, it may have been dropped already", pMeta->vgId, req.taskId); - streamMetaAddFailedTask(pMeta, req.streamId, req.taskId); - return TSDB_CODE_SUCCESS; + (void)streamMetaAddFailedTask(pMeta, req.streamId, req.taskId); + return code; } // discard the rsp, since it is expired. @@ -1193,13 +1242,13 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { tqDebug("s-task:%s vgId:%d checkpointId:%" PRId64 " restore to consensus-checkpointId:%" PRId64 " from mnode", pTask->id.idStr, vgId, pTask->chkInfo.checkpointId, req.checkpointId); - taosThreadMutexLock(&pTask->lock); + streamMutexLock(&pTask->lock); ASSERT(pTask->chkInfo.checkpointId >= req.checkpointId); if (pTask->chkInfo.consensusTransId >= req.transId) { tqDebug("s-task:%s vgId:%d latest consensus transId:%d, expired consensus trans:%d, discard", pTask->id.idStr, vgId, pTask->chkInfo.consensusTransId, req.transId); - taosThreadMutexUnlock(&pTask->lock); + streamMutexUnlock(&pTask->lock); streamMetaReleaseTask(pMeta, pTask); return TSDB_CODE_SUCCESS; } @@ -1215,14 +1264,14 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { } pTask->chkInfo.consensusTransId = req.transId; - taosThreadMutexUnlock(&pTask->lock); + streamMutexUnlock(&pTask->lock); if (pMeta->role == NODE_ROLE_LEADER) { - /*code = */ tqStreamStartOneTaskAsync(pMeta, pTask->pMsgCb, req.streamId, req.taskId); + code = tqStreamStartOneTaskAsync(pMeta, pTask->pMsgCb, req.streamId, req.taskId); } else { tqDebug("vgId:%d follower not start task:%s", vgId, pTask->id.idStr); } streamMetaReleaseTask(pMeta, pTask); - return TSDB_CODE_SUCCESS; + return code; } \ No newline at end of file diff --git a/source/libs/stream/inc/streamInt.h b/source/libs/stream/inc/streamInt.h index 2fe86817e3..93d2edd639 100644 --- a/source/libs/stream/inc/streamInt.h +++ b/source/libs/stream/inc/streamInt.h @@ -231,14 +231,8 @@ void initCheckpointReadyInfo(STaskCheckpointReadyInfo* pReadyInfo, int32_t up int32_t initCheckpointReadyMsg(SStreamTask* pTask, int32_t upstreamNodeId, int32_t upstreamTaskId, int32_t childId, int64_t checkpointId, SRpcMsg* pMsg); -typedef int32_t (*__stream_async_exec_fn_t)(void* param); - -int32_t streamMetaAsyncExec(SStreamMeta* pMeta, __stream_async_exec_fn_t fn, void* param, int32_t* code); void flushStateDataInExecutor(SStreamTask* pTask, SStreamQueueItem* pCheckpointBlock); -void streamMutexLock(TdThreadMutex *pMutex); -void streamMutexUnlock(TdThreadMutex *pMutex); -void streamMutexDestroy(TdThreadMutex *pMutex); #ifdef __cplusplus } diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 74fff23c6b..869877c9a8 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1540,7 +1540,7 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) // compatible with previous version *processId = -1; code = 0; - stError("failed to open file to load extra info, file:%s, reason:%s", pDst, tstrerror(TAOS_SYSTEM_ERROR(errno))); + stWarn("failed to open file to load extra info, file:%s, reason:%s", pDst, tstrerror(TAOS_SYSTEM_ERROR(errno))); goto _EXIT; } @@ -2308,6 +2308,7 @@ _EXIT: taosMemoryFree(cfHandle); return code; } + void* taskDbAddRef(void* pTaskDb) { STaskDbWrapper* pBackend = pTaskDb; return taosAcquireRef(taskDbWrapperId, pBackend->refId); diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 6f3b7d8b32..1283f8e20b 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -45,7 +45,7 @@ typedef struct STaskInitTs { SMetaRefMgt gMetaRefMgt; -void metaRefMgtInit(); +int32_t metaRefMgtInit(); void metaRefMgtCleanup(); int32_t metaRefMgtAdd(int64_t vgId, int64_t* rid); @@ -56,9 +56,14 @@ static void streamMetaEnvInit() { streamMetaId = taosOpenRef(64, streamMetaCloseImpl); - metaRefMgtInit(); - int32_t code = streamTimerInit(); - if (code != 0) { + int32_t code = metaRefMgtInit(); + if (code) { + stError("failed to init stream meta mgmt env, start failed"); + return; + } + + code = streamTimerInit(); + if (code) { stError("failed to init stream meta env, start failed"); } } @@ -66,17 +71,29 @@ static void streamMetaEnvInit() { void streamMetaInit() { (void) taosThreadOnce(&streamMetaModuleInit, streamMetaEnvInit); } void streamMetaCleanup() { - taosCloseRef(streamBackendId); - taosCloseRef(streamBackendCfWrapperId); - taosCloseRef(streamMetaId); + (void) taosCloseRef(streamBackendId); + (void) taosCloseRef(streamBackendCfWrapperId); + (void) taosCloseRef(streamMetaId); metaRefMgtCleanup(); streamTimerCleanUp(); } -void metaRefMgtInit() { - taosThreadMutexInit(&(gMetaRefMgt.mutex), NULL); - gMetaRefMgt.pTable = taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_ENTRY_LOCK); +int32_t metaRefMgtInit() { + int32_t code = taosThreadMutexInit(&(gMetaRefMgt.mutex), NULL); + if (code) { + return code; + } + + if (code == 0) { + gMetaRefMgt.pTable = taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_ENTRY_LOCK); + } + + if (gMetaRefMgt.pTable == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } else { + return code; + } } void metaRefMgtCleanup() { @@ -96,20 +113,34 @@ void metaRefMgtCleanup() { } int32_t metaRefMgtAdd(int64_t vgId, int64_t* rid) { + int32_t code = 0; + void* p = NULL; + streamMutexLock(&gMetaRefMgt.mutex); - void* p = taosHashGet(gMetaRefMgt.pTable, &vgId, sizeof(vgId)); + p = taosHashGet(gMetaRefMgt.pTable, &vgId, sizeof(vgId)); if (p == NULL) { SArray* list = taosArrayInit(8, sizeof(void*)); - taosArrayPush(list, &rid); - taosHashPut(gMetaRefMgt.pTable, &vgId, sizeof(vgId), &list, sizeof(void*)); + p = taosArrayPush(list, &rid); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + code = taosHashPut(gMetaRefMgt.pTable, &vgId, sizeof(vgId), &list, sizeof(void*)); + if (code) { + stError("vgId:%d failed to put into metaRef table, rid:%" PRId64, (int32_t) vgId, *rid); + return code; + } } else { SArray* list = *(SArray**)p; - taosArrayPush(list, &rid); + void* px = taosArrayPush(list, &rid); + if (px == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + } } streamMutexUnlock(&gMetaRefMgt.mutex); - return 0; + return code; } int32_t streamMetaOpenTdb(SStreamMeta* pMeta) { @@ -141,19 +172,25 @@ enum STREAM_STATE_VER { }; int32_t streamMetaCheckBackendCompatible(SStreamMeta* pMeta) { - int8_t ret = STREAM_STATA_COMPATIBLE; - TBC* pCur = NULL; - - if (tdbTbcOpen(pMeta->pTaskDb, &pCur, NULL) < 0) { // no task info, no stream - return ret; - } - + int8_t ret = STREAM_STATA_COMPATIBLE; + TBC* pCur = NULL; + int32_t code = 0; void* pKey = NULL; int32_t kLen = 0; void* pVal = NULL; int32_t vLen = 0; - tdbTbcMoveToFirst(pCur); + if (tdbTbcOpen(pMeta->pTaskDb, &pCur, NULL) < 0) { // no task info, no stream + return ret; + } + + code = tdbTbcMoveToFirst(pCur); + if (code) { + (void) tdbTbcClose(pCur); + stError("vgId:%d failed to open stream meta file cursor, not perform compatible check", pMeta->vgId); + return ret; + } + while (tdbTbcNext(pCur, &pKey, &kLen, &pVal, &vLen) == 0) { if (pVal == NULL || vLen == 0) { break; @@ -178,7 +215,7 @@ int32_t streamMetaCheckBackendCompatible(SStreamMeta* pMeta) { tdbFree(pKey); tdbFree(pVal); - tdbTbcClose(pCur); + (void) tdbTbcClose(pCur); return ret; } @@ -244,7 +281,11 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, SStreamTask* pTask, const char* key) streamMutexLock(&pMeta->backendMutex); void** ppBackend = taosHashGet(pMeta->pTaskDbUnique, key, strlen(key)); if ((ppBackend != NULL) && (*ppBackend != NULL)) { - taskDbAddRef(*ppBackend); + void* p = taskDbAddRef(*ppBackend); + if (p == NULL) { + stError("s-task:0x%x failed to ref backend", pTask->id.taskId); + return TSDB_CODE_FAILED; + } STaskDbWrapper* pBackend = *ppBackend; pBackend->pMeta = pMeta; @@ -278,7 +319,10 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, SStreamTask* pTask, const char* key) if (processVer != -1) pTask->chkInfo.processedVer = processVer; - taosHashPut(pMeta->pTaskDbUnique, key, strlen(key), &pBackend, sizeof(void*)); + int32_t code = taosHashPut(pMeta->pTaskDbUnique, key, strlen(key), &pBackend, sizeof(void*)); + if (code) { + stError("s-task:0x%x failed to put taskDb backend, code:out of memory", pTask->id.taskId); + } streamMutexUnlock(&pMeta->backendMutex); stDebug("s-task:0x%x set backend %p", pTask->id.taskId, pBackend); @@ -290,7 +334,10 @@ void streamMetaRemoveDB(void* arg, char* key) { SStreamMeta* pMeta = arg; streamMutexLock(&pMeta->backendMutex); - taosHashRemove(pMeta->pTaskDbUnique, key, strlen(key)); + int32_t code = taosHashRemove(pMeta->pTaskDbUnique, key, strlen(key)); + if (code) { + stError("vgId:%d failed to remove key:%s in taskDbUnique map", pMeta->vgId, key); + } streamMutexUnlock(&pMeta->backendMutex); } @@ -398,12 +445,22 @@ int32_t streamMetaOpen(const char* path, void* ahandle, FTaskBuild buildTaskFn, } #endif - taosThreadRwlockInit(&pMeta->lock, &attr); - taosThreadRwlockAttrDestroy(&attr); + code = taosThreadRwlockInit(&pMeta->lock, &attr); + if (code) { + goto _err; + } + + code = taosThreadRwlockAttrDestroy(&attr); + if (code) { + goto _err; + } int64_t* pRid = taosMemoryMalloc(sizeof(int64_t)); memcpy(pRid, &pMeta->rid, sizeof(pMeta->rid)); - metaRefMgtAdd(pMeta->vgId, pRid); + code = metaRefMgtAdd(pMeta->vgId, pRid); + if (code) { + goto _err; + } code = createMetaHbInfo(pRid, &pMeta->pHbInfo); if (code != TSDB_CODE_SUCCESS) { @@ -416,7 +473,8 @@ int32_t streamMetaOpen(const char* path, void* ahandle, FTaskBuild buildTaskFn, if (pMeta->bkdChkptMgt == NULL) { goto _err; } - taosThreadMutexInit(&pMeta->backendMutex, NULL); + + code = taosThreadMutexInit(&pMeta->backendMutex, NULL); *p = pMeta; return code; @@ -425,9 +483,9 @@ _err: taosMemoryFree(pMeta->path); if (pMeta->pTasksMap) taosHashCleanup(pMeta->pTasksMap); if (pMeta->pTaskList) taosArrayDestroy(pMeta->pTaskList); - if (pMeta->pTaskDb) tdbTbClose(pMeta->pTaskDb); - if (pMeta->pCheckpointDb) tdbTbClose(pMeta->pCheckpointDb); - if (pMeta->db) tdbClose(pMeta->db); + if (pMeta->pTaskDb) (void)tdbTbClose(pMeta->pTaskDb); + if (pMeta->pCheckpointDb) (void)tdbTbClose(pMeta->pCheckpointDb); + if (pMeta->db) (void) tdbClose(pMeta->db); if (pMeta->pHbInfo) taosMemoryFreeClear(pMeta->pHbInfo); if (pMeta->updateInfo.pTasks) taosHashCleanup(pMeta->updateInfo.pTasks); if (pMeta->startInfo.pReadyTaskSet) taosHashCleanup(pMeta->startInfo.pReadyTaskSet); @@ -473,7 +531,7 @@ void streamMetaClear(SStreamMeta* pMeta) { // release the ref by timer if (p->info.delaySchedParam != 0 && p->info.fillHistory == 0) { // one more ref in timer stDebug("s-task:%s stop schedTimer, and (before) desc ref:%d", p->id.idStr, p->refCnt); - taosTmrStop(p->schedInfo.pDelayTimer); + (void) taosTmrStop(p->schedInfo.pDelayTimer); p->info.delaySchedParam = 0; streamMetaReleaseTask(pMeta, p); } @@ -481,7 +539,11 @@ void streamMetaClear(SStreamMeta* pMeta) { streamMetaReleaseTask(pMeta, p); } - taosRemoveRef(streamBackendId, pMeta->streamBackendRid); + int32_t code = taosRemoveRef(streamBackendId, pMeta->streamBackendRid); + if (code) { + stError("vgId:%d remove stream backend Ref failed, rid:%"PRId64, pMeta->vgId, pMeta->streamBackendRid); + } + taosHashClear(pMeta->pTasksMap); taosArrayClear(pMeta->pTaskList); @@ -502,14 +564,7 @@ void streamMetaClose(SStreamMeta* pMeta) { if (pMeta == NULL) { return; } - - // int64_t rid = *(int64_t*)pMeta->pRid; - // if (taosTmrStop(pMeta->hbInfo.hbTmr)) { - // taosMemoryFree(pMeta->pRid); - // } else { - // // do nothing, stop by timer thread - // } - taosRemoveRef(streamMetaId, pMeta->rid); + (void) taosRemoveRef(streamMetaId, pMeta->rid); } void streamMetaCloseImpl(void* arg) { @@ -525,10 +580,11 @@ void streamMetaCloseImpl(void* arg) { streamMetaClear(pMeta); streamMetaWUnLock(pMeta); - tdbAbort(pMeta->db, pMeta->txn); - tdbTbClose(pMeta->pTaskDb); - tdbTbClose(pMeta->pCheckpointDb); - tdbClose(pMeta->db); + // already log the error, ignore here + (void) tdbAbort(pMeta->db, pMeta->txn); + (void) tdbTbClose(pMeta->pTaskDb); + (void) tdbTbClose(pMeta->pCheckpointDb); + (void) tdbClose(pMeta->db); taosArrayDestroy(pMeta->pTaskList); taosArrayDestroy(pMeta->chkpSaved); @@ -552,7 +608,7 @@ void streamMetaCloseImpl(void* arg) { bkdMgtDestroy(pMeta->bkdChkptMgt); pMeta->role = NODE_ROLE_UNINIT; - taosThreadRwlockDestroy(&pMeta->lock); + (void) taosThreadRwlockDestroy(&pMeta->lock); taosMemoryFree(pMeta); stDebug("vgId:%d end to close stream meta", vgId); @@ -568,9 +624,10 @@ int32_t streamMetaSaveTask(SStreamMeta* pMeta, SStreamTask* pTask) { if (code < 0) { return -1; } + buf = taosMemoryCalloc(1, len); if (buf == NULL) { - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } if (pTask->ver < SSTREAM_TASK_SUBTABLE_CHANGED_VER) { @@ -579,13 +636,19 @@ int32_t streamMetaSaveTask(SStreamMeta* pMeta, SStreamTask* pTask) { SEncoder encoder = {0}; tEncoderInit(&encoder, buf, len); - tEncodeStreamTask(&encoder, pTask); + code = tEncodeStreamTask(&encoder, pTask); tEncoderClear(&encoder); + if (code == -1) { + stError("s-task:%s vgId:%d task meta encode failed, code:%s", pTask->id.idStr, vgId, tstrerror(code)); + return TSDB_CODE_INVALID_MSG; + } + int64_t id[2] = {pTask->id.streamId, pTask->id.taskId}; code = tdbTbUpsert(pMeta->pTaskDb, id, STREAM_TASK_KEY_LEN, buf, len, pMeta->txn); if (code != TSDB_CODE_SUCCESS) { + code = terrno; stError("s-task:%s vgId:%d task meta save to disk failed, code:%s", pTask->id.idStr, vgId, tstrerror(terrno)); } else { stDebug("s-task:%s vgId:%d task meta save to disk", pTask->id.idStr, vgId); @@ -612,33 +675,44 @@ int32_t streamMetaRemoveTask(SStreamMeta* pMeta, STaskId* pTaskId) { int32_t streamMetaRegisterTask(SStreamMeta* pMeta, int64_t ver, SStreamTask* pTask, bool* pAdded) { *pAdded = false; + int32_t code = 0; STaskId id = streamTaskGetTaskId(pTask); void* p = taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (p != NULL) { - return 0; + stDebug("s-task:%" PRIx64 " already exist in meta, no need to register", id.taskId); + return code; } - if (pMeta->buildTaskFn(pMeta->ahandle, pTask, ver) < 0) { - return -1; + if ((code = pMeta->buildTaskFn(pMeta->ahandle, pTask, ver)) != 0) { + return code; } - taosArrayPush(pMeta->pTaskList, &pTask->id); - taosHashPut(pMeta->pTasksMap, &id, sizeof(id), &pTask, POINTER_BYTES); - - if (streamMetaSaveTask(pMeta, pTask) < 0) { - return -1; + p = taosArrayPush(pMeta->pTaskList, &pTask->id); + if (p == NULL) { + stError("s-task:0x%"PRIx64" failed to register task into meta-list, code: out of memory", id.taskId); + return TSDB_CODE_OUT_OF_MEMORY; } - if (streamMetaCommit(pMeta) < 0) { - return -1; + code = taosHashPut(pMeta->pTasksMap, &id, sizeof(id), &pTask, POINTER_BYTES); + if (code) { + stError("s-task:0x%"PRIx64" failed to register task into meta-list, code: out of memory", id.taskId); + return code; + } + + if ((code = streamMetaSaveTask(pMeta, pTask)) != 0) { + return code; + } + + if ((code = streamMetaCommit(pMeta)) != 0) { + return code; } if (pTask->info.fillHistory == 0) { - atomic_add_fetch_32(&pMeta->numOfStreamTasks, 1); + (void) atomic_add_fetch_32(&pMeta->numOfStreamTasks, 1); } *pAdded = true; - return 0; + return code; } int32_t streamMetaGetNumOfTasks(SStreamMeta* pMeta) { @@ -703,7 +777,7 @@ static void doRemoveIdFromList(SArray* pTaskList, int32_t num, SStreamTaskId* id static int32_t streamTaskSendTransSuccessMsg(SStreamTask* pTask, void* param) { if (pTask->info.taskLevel == TASK_LEVEL__SOURCE) { - streamTaskSendCheckpointSourceRsp(pTask); + (void) streamTaskSendCheckpointSourceRsp(pTask); } return 0; } @@ -726,7 +800,7 @@ int32_t streamMetaUnregisterTask(SStreamMeta* pMeta, int64_t streamId, int32_t t } // handle the dropping event - streamTaskHandleEventAsync(pTask->status.pSM, TASK_EVENT_DROPPING, streamTaskSendTransSuccessMsg, NULL); + (void) streamTaskHandleEventAsync(pTask->status.pSM, TASK_EVENT_DROPPING, streamTaskSendTransSuccessMsg, NULL); } else { stDebug("vgId:%d failed to find the task:0x%x, it may be dropped already", pMeta->vgId, taskId); streamMetaWUnLock(pMeta); @@ -765,12 +839,12 @@ int32_t streamMetaUnregisterTask(SStreamMeta* pMeta, int64_t streamId, int32_t t pTask = *ppTask; // it is an fill-history task, remove the related stream task's id that points to it if (pTask->info.fillHistory == 0) { - atomic_sub_fetch_32(&pMeta->numOfStreamTasks, 1); + (void) atomic_sub_fetch_32(&pMeta->numOfStreamTasks, 1); } - taosHashRemove(pMeta->pTasksMap, &id, sizeof(id)); + (void) taosHashRemove(pMeta->pTasksMap, &id, sizeof(id)); doRemoveIdFromList(pMeta->pTaskList, (int32_t)taosArrayGetSize(pMeta->pTaskList), &pTask->id); - streamMetaRemoveTask(pMeta, &id); + (void) streamMetaRemoveTask(pMeta, &id); ASSERT(taosHashGetSize(pMeta->pTasksMap) == taosArrayGetSize(pMeta->pTaskList)); streamMetaWUnLock(pMeta); @@ -778,7 +852,7 @@ int32_t streamMetaUnregisterTask(SStreamMeta* pMeta, int64_t streamId, int32_t t ASSERT(pTask->status.timerActive == 0); if (pTask->info.delaySchedParam != 0 && pTask->info.fillHistory == 0) { stDebug("s-task:%s stop schedTimer, and (before) desc ref:%d", pTask->id.idStr, pTask->refCnt); - taosTmrStop(pTask->schedInfo.pDelayTimer); + (void) taosTmrStop(pTask->schedInfo.pDelayTimer); pTask->info.delaySchedParam = 0; streamMetaReleaseTask(pMeta, pTask); } @@ -823,9 +897,11 @@ int32_t streamMetaCommit(SStreamMeta* pMeta) { int64_t streamMetaGetLatestCheckpointId(SStreamMeta* pMeta) { int64_t checkpointId = 0; + int32_t code = 0; TBC* pCur = NULL; if (tdbTbcOpen(pMeta->pTaskDb, &pCur, NULL) < 0) { + stError("failed to open stream meta file, the latest checkpointId is 0, vgId:%d", pMeta->vgId); return checkpointId; } @@ -835,7 +911,13 @@ int64_t streamMetaGetLatestCheckpointId(SStreamMeta* pMeta) { int32_t vLen = 0; SDecoder decoder; - tdbTbcMoveToFirst(pCur); + code = tdbTbcMoveToFirst(pCur); + if (code) { + (void) tdbTbcClose(pCur); + stError("failed to open stream meta file cursor, the latest checkpointId is 0, vgId:%d", pMeta->vgId); + return checkpointId; + } + while (tdbTbcNext(pCur, &pKey, &kLen, &pVal, &vLen) == 0) { if (pVal == NULL || vLen == 0) { break; @@ -854,8 +936,8 @@ int64_t streamMetaGetLatestCheckpointId(SStreamMeta* pMeta) { tdbFree(pKey); tdbFree(pVal); - tdbTbcClose(pCur); + (void)tdbTbcClose(pCur); return checkpointId; } @@ -867,23 +949,34 @@ void streamMetaLoadAllTasks(SStreamMeta* pMeta) { void* pVal = NULL; int32_t vLen = 0; SDecoder decoder; + int32_t vgId = 0; + int32_t code = 0; + SArray* pRecycleList = NULL; if (pMeta == NULL) { return; } - SArray* pRecycleList = taosArrayInit(4, sizeof(STaskId)); - int32_t vgId = pMeta->vgId; + pRecycleList = taosArrayInit(4, sizeof(STaskId)); + + vgId = pMeta->vgId; stInfo("vgId:%d load stream tasks from meta files", vgId); - int32_t code = tdbTbcOpen(pMeta->pTaskDb, &pCur, NULL); + code = tdbTbcOpen(pMeta->pTaskDb, &pCur, NULL); if (code != TSDB_CODE_SUCCESS) { stError("vgId:%d failed to open stream meta, code:%s, not load any stream tasks", vgId, tstrerror(terrno)); taosArrayDestroy(pRecycleList); return; } - tdbTbcMoveToFirst(pCur); + code = tdbTbcMoveToFirst(pCur); + if (code) { + stError("vgId:%d failed to open stream meta cursor, code:%s, not load any stream tasks", vgId, tstrerror(terrno)); + taosArrayDestroy(pRecycleList); + (void) tdbTbcClose(pCur); + return; + } + while (tdbTbcNext(pCur, &pKey, &kLen, &pVal, &vLen) == 0) { if (pVal == NULL || vLen == 0) { break; @@ -913,7 +1006,7 @@ void streamMetaLoadAllTasks(SStreamMeta* pMeta) { tFreeStreamTask(pTask); STaskId id = streamTaskGetTaskId(pTask); - taosArrayPush(pRecycleList, &id); + (void) taosArrayPush(pRecycleList, &id); int32_t total = taosArrayGetSize(pRecycleList); stDebug("s-task:0x%x is already dropped, add into recycle list, total:%d", taskId, total); @@ -934,7 +1027,7 @@ void streamMetaLoadAllTasks(SStreamMeta* pMeta) { continue; } - taosArrayPush(pMeta->pTaskList, &pTask->id); + (void) taosArrayPush(pMeta->pTaskList, &pTask->id); } else { // todo this should replace the existed object put by replay creating stream task msg from mnode stError("s-task:0x%x already added into table meta by replaying WAL, need check", pTask->id.taskId); @@ -944,17 +1037,17 @@ void streamMetaLoadAllTasks(SStreamMeta* pMeta) { if (taosHashPut(pMeta->pTasksMap, &id, sizeof(id), &pTask, POINTER_BYTES) != 0) { stError("s-task:0x%x failed to put into hashTable, code:%s, continue", pTask->id.taskId, tstrerror(terrno)); - taosArrayPop(pMeta->pTaskList); + (void) taosArrayPop(pMeta->pTaskList); tFreeStreamTask(pTask); continue; } if (pTask->info.fillHistory == 0) { - atomic_add_fetch_32(&pMeta->numOfStreamTasks, 1); + (void) atomic_add_fetch_32(&pMeta->numOfStreamTasks, 1); } if (streamTaskShouldPause(pTask)) { - atomic_add_fetch_32(&pMeta->numOfPausedTasks, 1); + (void) atomic_add_fetch_32(&pMeta->numOfPausedTasks, 1); } ASSERT(pTask->status.downstreamReady == 0); @@ -970,7 +1063,7 @@ void streamMetaLoadAllTasks(SStreamMeta* pMeta) { if (taosArrayGetSize(pRecycleList) > 0) { for (int32_t i = 0; i < taosArrayGetSize(pRecycleList); ++i) { STaskId* pId = taosArrayGet(pRecycleList, i); - streamMetaRemoveTask(pMeta, pId); + (void) streamMetaRemoveTask(pMeta, pId); } } @@ -998,7 +1091,7 @@ bool streamMetaTaskInTimer(SStreamMeta* pMeta) { SStreamTask* pTask = *(SStreamTask**)pIter; if (pTask->status.timerActive >= 1) { stDebug("s-task:%s in timer, blocking tasks in vgId:%d restart, set closing again", pTask->id.idStr, pMeta->vgId); - streamTaskStop(pTask); + (void) streamTaskStop(pTask); inTimer = true; } } @@ -1031,7 +1124,7 @@ void streamMetaNotifyClose(SStreamMeta* pMeta) { SStreamTask* pTask = *(SStreamTask**)pIter; stDebug("vgId:%d s-task:%s set task closing flag", vgId, pTask->id.idStr); - streamTaskStop(pTask); + (void) streamTaskStop(pTask); } streamMetaWUnLock(pMeta); @@ -1050,7 +1143,16 @@ void streamMetaNotifyClose(SStreamMeta* pMeta) { void streamMetaStartHb(SStreamMeta* pMeta) { int64_t* pRid = taosMemoryMalloc(sizeof(int64_t)); - metaRefMgtAdd(pMeta->vgId, pRid); + if (pRid == NULL) { + stError("vgId:%d failed to prepare the metaHb to mnode, hbMsg will not started, code: out of memory", pMeta->vgId); + return; + } + + int32_t code = metaRefMgtAdd(pMeta->vgId, pRid); + if (code) { + return; + } + *pRid = pMeta->rid; streamMetaHbToMnode(pRid, NULL); } @@ -1069,7 +1171,7 @@ void streamMetaResetStartInfo(STaskStartInfo* pStartInfo, int32_t vgId) { void streamMetaRLock(SStreamMeta* pMeta) { // stTrace("vgId:%d meta-rlock", pMeta->vgId); - taosThreadRwlockRdlock(&pMeta->lock); + (void) taosThreadRwlockRdlock(&pMeta->lock); } void streamMetaRUnLock(SStreamMeta* pMeta) { @@ -1084,30 +1186,13 @@ void streamMetaRUnLock(SStreamMeta* pMeta) { void streamMetaWLock(SStreamMeta* pMeta) { // stTrace("vgId:%d meta-wlock", pMeta->vgId); - taosThreadRwlockWrlock(&pMeta->lock); + (void) taosThreadRwlockWrlock(&pMeta->lock); // stTrace("vgId:%d meta-wlock completed", pMeta->vgId); } void streamMetaWUnLock(SStreamMeta* pMeta) { // stTrace("vgId:%d meta-wunlock", pMeta->vgId); - taosThreadRwlockUnlock(&pMeta->lock); -} - -static void execHelper(struct SSchedMsg* pSchedMsg) { - __async_exec_fn_t execFn = (__async_exec_fn_t)pSchedMsg->ahandle; - int32_t code = execFn(pSchedMsg->thandle); - if (code != 0 && pSchedMsg->msg != NULL) { - *(int32_t*)pSchedMsg->msg = code; - } -} - -int32_t streamMetaAsyncExec(SStreamMeta* pMeta, __stream_async_exec_fn_t fn, void* param, int32_t* code) { - SSchedMsg schedMsg = {0}; - schedMsg.fp = execHelper; - schedMsg.ahandle = fn; - schedMsg.thandle = param; - schedMsg.msg = code; - return taosScheduleTask(pMeta->qHandle, &schedMsg); + (void) taosThreadRwlockUnlock(&pMeta->lock); } int32_t streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta, SArray** pList) { @@ -1195,10 +1280,10 @@ static int32_t prepareBeforeStartTasks(SStreamMeta* pMeta, SArray** pList, int64 taosHashClear(pMeta->startInfo.pFailedTaskSet); pMeta->startInfo.startTs = now; - streamMetaResetTaskStatus(pMeta); + int32_t code = streamMetaResetTaskStatus(pMeta); streamMetaWUnLock(pMeta); - return TSDB_CODE_SUCCESS; + return code; } // restore the checkpoint id by negotiating the latest consensus checkpoint id @@ -1233,7 +1318,7 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { code = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId, &pTask); if (pTask == NULL) { stError("vgId:%d failed to acquire task:0x%x during start tasks", pMeta->vgId, pTaskId->taskId); - streamMetaAddFailedTask(pMeta, pTaskId->streamId, pTaskId->taskId); + (void) streamMetaAddFailedTask(pMeta, pTaskId->streamId, pTaskId->taskId); continue; } @@ -1256,7 +1341,7 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { code = streamMetaAcquireTask(pMeta, pTaskId->streamId, pTaskId->taskId, &pTask); if (pTask == NULL) { stError("vgId:%d failed to acquire task:0x%x during start tasks", pMeta->vgId, pTaskId->taskId); - streamMetaAddFailedTask(pMeta, pTaskId->streamId, pTaskId->taskId); + (void) streamMetaAddFailedTask(pMeta, pTaskId->streamId, pTaskId->taskId); continue; } @@ -1274,7 +1359,7 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { if (HAS_RELATED_FILLHISTORY_TASK(pTask)) { stDebug("s-task:%s downstream ready, no need to check downstream, check only related fill-history task", pTask->id.idStr); - streamLaunchFillHistoryTask(pTask); + (void) streamLaunchFillHistoryTask(pTask); // todo: how about retry launch fill-history task? } (void) streamMetaAddTaskLaunchResult(pMeta, pTaskId->streamId, pTaskId->taskId, pInfo->checkTs, pInfo->readyTs, true); @@ -1340,7 +1425,7 @@ int32_t streamMetaStopAllTasks(SStreamMeta* pMeta) { continue; } - streamTaskStop(pTask); + (void) streamTaskStop(pTask); streamMetaReleaseTask(pMeta, pTask); } @@ -1380,7 +1465,7 @@ int32_t streamMetaStartOneTask(SStreamMeta* pMeta, int64_t streamId, int32_t tas code = streamMetaAcquireTask(pMeta, streamId, taskId, &pTask); if (pTask == NULL) { stError("vgId:%d failed to acquire task:0x%x when starting task", pMeta->vgId, taskId); - streamMetaAddFailedTask(pMeta, streamId, taskId); + (void) streamMetaAddFailedTask(pMeta, streamId, taskId); return TSDB_CODE_STREAM_TASK_IVLD_STATUS; } @@ -1471,7 +1556,10 @@ int32_t streamMetaAddTaskLaunchResult(SStreamMeta* pMeta, int64_t streamId, int3 SHashObj* pDst = ready ? pStartInfo->pReadyTaskSet : pStartInfo->pFailedTaskSet; STaskInitTs initTs = {.start = startTs, .end = endTs, .success = ready}; - taosHashPut(pDst, &id, sizeof(id), &initTs, sizeof(STaskInitTs)); + int32_t code = taosHashPut(pDst, &id, sizeof(id), &initTs, sizeof(STaskInitTs)); + if (code) { + + } int32_t numOfTotal = streamMetaGetNumOfTasks(pMeta); int32_t numOfRecv = taosHashGetSize(pStartInfo->pReadyTaskSet) + taosHashGetSize(pStartInfo->pFailedTaskSet); @@ -1491,14 +1579,14 @@ int32_t streamMetaAddTaskLaunchResult(SStreamMeta* pMeta, int64_t streamId, int3 streamMetaResetStartInfo(pStartInfo, pMeta->vgId); streamMetaWUnLock(pMeta); - pStartInfo->completeFn(pMeta); + code = pStartInfo->completeFn(pMeta); } else { streamMetaWUnLock(pMeta); stDebug("vgId:%d recv check downstream results, s-task:0x%x succ:%d, received:%d, total:%d", pMeta->vgId, taskId, ready, numOfRecv, numOfTotal); } - return TSDB_CODE_SUCCESS; + return code; } int32_t streamMetaResetTaskStatus(SStreamMeta* pMeta) { @@ -1572,19 +1660,26 @@ void streamMetaAddIntoUpdateTaskList(SStreamMeta* pMeta, SStreamTask* pTask, SSt int64_t startTs) { const char* id = pTask->id.idStr; int32_t vgId = pTask->pMeta->vgId; + int32_t code = 0; // keep the already updated info STaskUpdateEntry entry = {.streamId = pTask->id.streamId, .taskId = pTask->id.taskId, .transId = transId}; - taosHashPut(pMeta->updateInfo.pTasks, &entry, sizeof(entry), NULL, 0); + code = taosHashPut(pMeta->updateInfo.pTasks, &entry, sizeof(entry), NULL, 0); + if (code != 0) { + stError("s-task:%s failed to put updateTask into update list", id); + } int64_t el = taosGetTimestampMs() - startTs; if (pHTask != NULL) { STaskUpdateEntry hEntry = {.streamId = pHTask->id.streamId, .taskId = pHTask->id.taskId, .transId = transId}; - taosHashPut(pMeta->updateInfo.pTasks, &hEntry, sizeof(hEntry), NULL, 0); - - stDebug("s-task:%s vgId:%d transId:%d task nodeEp update completed, streamTask/hTask closed, elapsed:%" PRId64 - " ms", - id, vgId, transId, el); + code = taosHashPut(pMeta->updateInfo.pTasks, &hEntry, sizeof(hEntry), NULL, 0); + if (code != 0) { + stError("s-task:%s failed to put updateTask into update list", id); + } else { + stDebug("s-task:%s vgId:%d transId:%d task nodeEp update completed, streamTask/hTask closed, elapsed:%" PRId64 + " ms", + id, vgId, transId, el); + } } else { stDebug("s-task:%s vgId:%d transId:%d task nodeEp update completed, streamTask closed, elapsed time:%" PRId64 "ms", id, vgId, transId, el); diff --git a/source/libs/stream/src/streamSnapshot.c b/source/libs/stream/src/streamSnapshot.c index 02e4ed8d8b..3c27210a23 100644 --- a/source/libs/stream/src/streamSnapshot.c +++ b/source/libs/stream/src/streamSnapshot.c @@ -130,7 +130,6 @@ int32_t streamGetFileSize(char* path, char* name, int64_t* sz) { int32_t ret = 0; char* fullname = taosMemoryCalloc(1, strlen(path) + 32); - sprintf(fullname, "%s%s%s", path, TD_DIRSEP, name); ret = taosStatFile(fullname, sz, NULL, NULL); @@ -185,48 +184,89 @@ void snapFileDebugInfo(SBackendSnapFile2* pSnapFile) { } int32_t snapFileGenMeta(SBackendSnapFile2* pSnapFile) { + void* p = NULL; SBackendFileItem item = {0}; item.ref = 1; + // current item.name = pSnapFile->pCurrent; item.type = ROCKSDB_CURRENT_TYPE; - streamGetFileSize(pSnapFile->path, item.name, &item.size); - taosArrayPush(pSnapFile->pFileList, &item); + int32_t code = streamGetFileSize(pSnapFile->path, item.name, &item.size); + if (code) { + stError("failed to get file size"); + return code; + } + + p = taosArrayPush(pSnapFile->pFileList, &item); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } // mainfest item.name = pSnapFile->pMainfest; item.type = ROCKSDB_MAINFEST_TYPE; - streamGetFileSize(pSnapFile->path, item.name, &item.size); - taosArrayPush(pSnapFile->pFileList, &item); + code = streamGetFileSize(pSnapFile->path, item.name, &item.size); + if (code) { + return code; + } + + p = taosArrayPush(pSnapFile->pFileList, &item); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } // options item.name = pSnapFile->pOptions; item.type = ROCKSDB_OPTIONS_TYPE; - streamGetFileSize(pSnapFile->path, item.name, &item.size); - taosArrayPush(pSnapFile->pFileList, &item); + code = streamGetFileSize(pSnapFile->path, item.name, &item.size); + if (code) { + return code; + } + + p = taosArrayPush(pSnapFile->pFileList, &item); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + // sst for (int32_t i = 0; i < taosArrayGetSize(pSnapFile->pSst); i++) { char* sst = taosArrayGetP(pSnapFile->pSst, i); item.name = sst; item.type = ROCKSDB_SST_TYPE; - streamGetFileSize(pSnapFile->path, item.name, &item.size); - taosArrayPush(pSnapFile->pFileList, &item); + code = streamGetFileSize(pSnapFile->path, item.name, &item.size); + if (code) { + return code; + } + + p = taosArrayPush(pSnapFile->pFileList, &item); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } + // meta item.name = pSnapFile->pCheckpointMeta; item.type = ROCKSDB_CHECKPOINT_META_TYPE; if (streamGetFileSize(pSnapFile->path, item.name, &item.size) == 0) { - taosArrayPush(pSnapFile->pFileList, &item); + p = taosArrayPush(pSnapFile->pFileList, &item); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } item.name = pSnapFile->pCheckpointSelfcheck; item.type = ROCKSDB_CHECKPOINT_SELFCHECK_TYPE; if (streamGetFileSize(pSnapFile->path, item.name, &item.size) == 0) { - taosArrayPush(pSnapFile->pFileList, &item); + p = taosArrayPush(pSnapFile->pFileList, &item); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } } + return 0; } + int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { int32_t code = 0; TdDirPtr pDir = taosOpenDir(pSnapFile->path); @@ -288,12 +328,18 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { code = TSDB_CODE_OUT_OF_MEMORY; break; } - taosArrayPush(pSnapFile->pSst, &sst); + + void* p = taosArrayPush(pSnapFile->pSst, &sst); + if (p == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + break; + } } } - taosCloseDir(&pDir); - return code; + + return taosCloseDir(&pDir); } + int32_t streamBackendSnapInitFile(char* metaPath, SStreamTaskSnap* pSnap, SBackendSnapFile2* pSnapFile) { int32_t code = 0; int32_t nBytes = 0; @@ -359,13 +405,16 @@ void snapFileDestroy(SBackendSnapFile2* pSnap) { } taosArrayDestroy(pSnap->pFileList); taosArrayDestroy(pSnap->pSst); - taosCloseFile(&pSnap->fd); - - return; + int32_t code = taosCloseFile(&pSnap->fd); + if (code) { + stError("failed to close snapshot fd"); + } } + int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta) { - // impl later int32_t code = 0; + SArray* pDbSnapSet = NULL; + SArray* pSnapInfoSet = taosArrayInit(4, sizeof(SStreamTaskSnap)); if (pSnapInfoSet == NULL) { return TSDB_CODE_OUT_OF_MEMORY; @@ -374,15 +423,13 @@ int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta code = streamCreateTaskDbSnapInfo(pMeta, path, pSnapInfoSet); if (code != 0) { stError("failed to do task db snap info, reason:%s", tstrerror(code)); - taosArrayDestroy(pSnapInfoSet); - return code; + goto _err; } - SArray* pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); + pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); if (pDbSnapSet == NULL) { - taosArrayDestroy(pSnapInfoSet); code = TSDB_CODE_OUT_OF_MEMORY; - return code; + goto _err; } for (int32_t i = 0; i < taosArrayGetSize(pSnapInfoSet); i++) { @@ -391,16 +438,24 @@ int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta SBackendSnapFile2 snapFile = {0}; code = streamBackendSnapInitFile(path, pSnap, &snapFile); ASSERT(code == 0); - taosArrayPush(pDbSnapSet, &snapFile); + + void* p = taosArrayPush(pDbSnapSet, &snapFile); + if (p == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + goto _err; + } } pHandle->pDbSnapSet = pDbSnapSet; pHandle->pSnapInfoSet = pSnapInfoSet; pHandle->currIdx = 0; pHandle->pMeta = pMeta; - return 0; + + return code; _err: + taosArrayDestroy(pSnapInfoSet); + taosArrayDestroy(pDbSnapSet); streamSnapHandleDestroy(pHandle); return code; } @@ -414,7 +469,8 @@ void streamSnapHandleDestroy(SStreamSnapHandle* handle) { } taosArrayDestroy(handle->pDbSnapSet); } - streamDestroyTaskDbSnapInfo(handle->pMeta, handle->pSnapInfoSet); + + (void) streamDestroyTaskDbSnapInfo(handle->pMeta, handle->pSnapInfoSet); if (handle->pSnapInfoSet) { for (int32_t i = 0; i < taosArrayGetSize(handle->pSnapInfoSet); i++) { SStreamTaskSnap* pSnap = taosArrayGet(handle->pSnapInfoSet, i); @@ -422,8 +478,8 @@ void streamSnapHandleDestroy(SStreamSnapHandle* handle) { } taosArrayDestroy(handle->pSnapInfoSet); } + taosMemoryFree(handle->metaPath); - return; } int32_t streamSnapReaderOpen(void* pMeta, int64_t sver, int64_t chkpId, char* path, SStreamSnapReader** ppReader) { @@ -506,14 +562,22 @@ _NEXT: item->name, (int64_t)pSnapFile->offset, item->size, pSnapFile->currFileIdx); pSnapFile->offset += nread; if (pSnapFile->offset >= item->size || nread < kBlockSize) { - taosCloseFile(&pSnapFile->fd); + code = taosCloseFile(&pSnapFile->fd); + if (code) { + stError("failed to close snapshot fd"); + } + pSnapFile->offset = 0; pSnapFile->currFileIdx += 1; } } else { stDebug("%s no data read, close file no.%d, move to next file, open and read", STREAM_STATE_TRANSFER, pSnapFile->currFileIdx); - taosCloseFile(&pSnapFile->fd); + code = taosCloseFile(&pSnapFile->fd); + if (code) { + stError("failed to close snapshot fd"); + } + pSnapFile->offset = 0; pSnapFile->currFileIdx += 1; @@ -577,14 +641,22 @@ int32_t streamSnapWriterOpen(void* pMeta, int64_t sver, int64_t ever, char* path pHandle->pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); if (pHandle->pDbSnapSet == NULL) { - streamSnapWriterClose(pWriter, 0); + int32_t c = streamSnapWriterClose(pWriter, 0); // not override the error code, and igore this error code + if (c) { + stError("failed close snaphost writer"); + } + code = TSDB_CODE_OUT_OF_MEMORY; return code; } SBackendSnapFile2 snapFile = {0}; if (taosArrayPush(pHandle->pDbSnapSet, &snapFile) == NULL) { - streamSnapWriterClose(pWriter, 0); + int32_t c = streamSnapWriterClose(pWriter, 0); + if (c) { + stError("failed close snaphost writer"); + } + code = TSDB_CODE_OUT_OF_MEMORY; return code; } @@ -614,46 +686,62 @@ int32_t streamSnapWriteImpl(SStreamSnapWriter* pWriter, uint8_t* pData, uint32_t pHdr->name, tstrerror(code)); } } + if (strlen(pHdr->name) == strlen(pItem->name) && strcmp(pHdr->name, pItem->name) == 0) { int64_t bytes = taosPWriteFile(pSnapFile->fd, pHdr->data, pHdr->size, pSnapFile->offset); if (bytes != pHdr->size) { code = TAOS_SYSTEM_ERROR(errno); stError("%s failed to write snap, file name:%s, reason:%s", STREAM_STATE_TRANSFER, pHdr->name, tstrerror(code)); - return code; + goto _err; } else { stInfo("succ to write data %s", pItem->name); } pSnapFile->offset += bytes; } else { - taosCloseFile(&pSnapFile->fd); + code = taosCloseFile(&pSnapFile->fd); + if (code) { + stError("failed to close snapshot fd"); + } + pSnapFile->offset = 0; pSnapFile->currFileIdx += 1; SBackendFileItem item = {0}; item.name = taosStrdup(pHdr->name); item.type = pHdr->type; + if (item.name == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } - taosArrayPush(pSnapFile->pFileList, &item); + void* p = taosArrayPush(pSnapFile->pFileList, &item); + if (p == NULL) { // can NOT goto _err here. + return TSDB_CODE_OUT_OF_MEMORY; + } - SBackendFileItem* pItem = taosArrayGet(pSnapFile->pFileList, pSnapFile->currFileIdx); - pSnapFile->fd = streamOpenFile(pSnapFile->path, pItem->name, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND); + SBackendFileItem* pItem2 = taosArrayGet(pSnapFile->pFileList, pSnapFile->currFileIdx); + pSnapFile->fd = streamOpenFile(pSnapFile->path, pItem2->name, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND); if (pSnapFile->fd == NULL) { code = TAOS_SYSTEM_ERROR(errno); stError("%s failed to open file name:%s%s%s, reason:%s", STREAM_STATE_TRANSFER, pSnapFile->path, TD_DIRSEP, pHdr->name, tstrerror(code)); + return code; } + // open fd again, let's close fd during handle errors. if (taosPWriteFile(pSnapFile->fd, pHdr->data, pHdr->size, pSnapFile->offset) != pHdr->size) { code = TAOS_SYSTEM_ERROR(errno); stError("%s failed to write snap, file name:%s, reason:%s", STREAM_STATE_TRANSFER, pHdr->name, tstrerror(code)); - return code; + goto _err; } - stInfo("succ to write data %s", pItem->name); + + stInfo("succ to write data %s", pItem2->name); pSnapFile->offset += pHdr->size; } - code = 0; -_EXIT: + return TSDB_CODE_SUCCESS; + +_err: + (void) taosCloseFile(&pSnapFile->fd); return code; } @@ -688,7 +776,10 @@ int32_t streamSnapWrite(SStreamSnapWriter* pWriter, uint8_t* pData, uint32_t nDa item.name = taosStrdup((char*)ROCKSDB_CURRENT); item.type = ROCKSDB_CURRENT_TYPE; - taosArrayPush(pDbSnapFile->pFileList, &item); + void* p = taosArrayPush(pDbSnapFile->pFileList, &item); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } pDbSnapFile->inited = 1; return streamSnapWriteImpl(pWriter, pData, nData, pDbSnapFile); @@ -697,9 +788,12 @@ int32_t streamSnapWrite(SStreamSnapWriter* pWriter, uint8_t* pData, uint32_t nDa return streamSnapWriteImpl(pWriter, pData, nData, pDbSnapFile); } else { SBackendSnapFile2 snapFile = {0}; - taosArrayPush(pHandle->pDbSnapSet, &snapFile); - pHandle->currIdx += 1; + void* p = taosArrayPush(pHandle->pDbSnapSet, &snapFile); + if (p == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + pHandle->currIdx += 1; return streamSnapWrite(pWriter, pData, nData); } } diff --git a/source/libs/stream/src/streamStartHistory.c b/source/libs/stream/src/streamStartHistory.c index 1efb2af381..db0784d572 100644 --- a/source/libs/stream/src/streamStartHistory.c +++ b/source/libs/stream/src/streamStartHistory.c @@ -29,19 +29,19 @@ typedef struct SLaunchHTaskInfo { STaskId hTaskId; } SLaunchHTaskInfo; -static int32_t streamSetParamForScanHistory(SStreamTask* pTask); -static void streamTaskSetRangeStreamCalc(SStreamTask* pTask); -static int32_t initScanHistoryReq(SStreamTask* pTask, SStreamScanHistoryReq* pReq, int8_t igUntreated); -static SLaunchHTaskInfo* createHTaskLaunchInfo(SStreamMeta* pMeta, STaskId* pTaskId, int64_t hStreamId, - int32_t hTaskId); -static void tryLaunchHistoryTask(void* param, void* tmrId); -static void doExecScanhistoryInFuture(void* param, void* tmrId); -static int32_t doStartScanHistoryTask(SStreamTask* pTask); -static int32_t streamTaskStartScanHistory(SStreamTask* pTask); -static void checkFillhistoryTaskStatus(SStreamTask* pTask, SStreamTask* pHTask); -static int32_t launchNotBuiltFillHistoryTask(SStreamTask* pTask); -static void doRetryLaunchFillHistoryTask(SStreamTask* pTask, SLaunchHTaskInfo* pInfo, int64_t now); -static void notRetryLaunchFillHistoryTask(SStreamTask* pTask, SLaunchHTaskInfo* pInfo, int64_t now); +static int32_t streamSetParamForScanHistory(SStreamTask* pTask); +static int32_t streamTaskSetRangeStreamCalc(SStreamTask* pTask); +static void initScanHistoryReq(SStreamTask* pTask, SStreamScanHistoryReq* pReq, int8_t igUntreated); +static int32_t createHTaskLaunchInfo(SStreamMeta* pMeta, STaskId* pTaskId, int64_t hStreamId, int32_t hTaskId, + SLaunchHTaskInfo** pInfo); +static void tryLaunchHistoryTask(void* param, void* tmrId); +static void doExecScanhistoryInFuture(void* param, void* tmrId); +static int32_t doStartScanHistoryTask(SStreamTask* pTask); +static int32_t streamTaskStartScanHistory(SStreamTask* pTask); +static void checkFillhistoryTaskStatus(SStreamTask* pTask, SStreamTask* pHTask); +static int32_t launchNotBuiltFillHistoryTask(SStreamTask* pTask); +static void doRetryLaunchFillHistoryTask(SStreamTask* pTask, SLaunchHTaskInfo* pInfo, int64_t now); +static void notRetryLaunchFillHistoryTask(SStreamTask* pTask, SLaunchHTaskInfo* pInfo, int64_t now); static int32_t streamTaskSetReady(SStreamTask* pTask) { int32_t numOfDowns = streamTaskGetNumOfDownstream(pTask); @@ -65,22 +65,19 @@ static int32_t streamTaskSetReady(SStreamTask* pTask) { int32_t streamStartScanHistoryAsync(SStreamTask* pTask, int8_t igUntreated) { SStreamScanHistoryReq req; + int32_t code = 0; initScanHistoryReq(pTask, &req, igUntreated); int32_t len = sizeof(SStreamScanHistoryReq); void* serializedReq = rpcMallocCont(len); if (serializedReq == NULL) { - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } memcpy(serializedReq, &req, len); SRpcMsg rpcMsg = {.contLen = len, .pCont = serializedReq, .msgType = TDMT_VND_STREAM_SCAN_HISTORY}; - if (tmsgPutToQueue(pTask->pMsgCb, STREAM_QUEUE, &rpcMsg) < 0) { - /*ASSERT(0);*/ - } - - return 0; + return tmsgPutToQueue(pTask->pMsgCb, STREAM_QUEUE, &rpcMsg); } int32_t streamExecScanHistoryInFuture(SStreamTask* pTask, int32_t idleDuration) { @@ -109,8 +106,8 @@ int32_t streamExecScanHistoryInFuture(SStreamTask* pTask, int32_t idleDuration) pTask->schedHistoryInfo.pTimer = taosTmrStart(doExecScanhistoryInFuture, SCANHISTORY_IDLE_TIME_SLICE, pTask, streamTimer); } else { - taosTmrReset(doExecScanhistoryInFuture, SCANHISTORY_IDLE_TIME_SLICE, pTask, streamTimer, - &pTask->schedHistoryInfo.pTimer); + streamTmrReset(doExecScanhistoryInFuture, SCANHISTORY_IDLE_TIME_SLICE, pTask, streamTimer, + &pTask->schedHistoryInfo.pTimer, pTask->pMeta->vgId, " start-history-task-tmr"); } return TSDB_CODE_SUCCESS; @@ -135,9 +132,19 @@ int32_t streamTaskStartScanHistory(SStreamTask* pTask) { int32_t streamTaskOnNormalTaskReady(SStreamTask* pTask) { const char* id = pTask->id.idStr; + int32_t code = 0; - streamTaskSetReady(pTask); - streamTaskSetRangeStreamCalc(pTask); + code = streamTaskSetReady(pTask); + if (code) { + stError("s-task:%s failed to set task status ready", id); + return code; + } + + code = streamTaskSetRangeStreamCalc(pTask); + if (code) { + stError("s-task:%s failed to set the time range for stream task", id); + return code; + } SStreamTaskState p = streamTaskGetStatus(pTask); ASSERT(p.state == TASK_STATUS__READY); @@ -155,19 +162,23 @@ int32_t streamTaskOnNormalTaskReady(SStreamTask* pTask) { stDebug("s-task:%s level:%d status:%s sched-status:%d", id, pTask->info.taskLevel, p.name, schedStatus); } - return TSDB_CODE_SUCCESS; + return code; } int32_t streamTaskOnScanHistoryTaskReady(SStreamTask* pTask) { // set the state to be ready - streamTaskSetReady(pTask); - streamTaskSetRangeStreamCalc(pTask); + int32_t code = streamTaskSetReady(pTask); + if (code == 0) { + code = streamTaskSetRangeStreamCalc(pTask); + } - SStreamTaskState p = streamTaskGetStatus(pTask); - ASSERT((p.state == TASK_STATUS__SCAN_HISTORY) && (pTask->info.fillHistory == 1)); + if (code == 0) { + SStreamTaskState p = streamTaskGetStatus(pTask); + ASSERT((p.state == TASK_STATUS__SCAN_HISTORY) && (pTask->info.fillHistory == 1)); - stDebug("s-task:%s fill-history task enters into scan-history data stage, status:%s", pTask->id.idStr, p.name); - streamTaskStartScanHistory(pTask); + stDebug("s-task:%s fill-history task enters into scan-history data stage, status:%s", pTask->id.idStr, p.name); + code = streamTaskStartScanHistory(pTask); + } // NOTE: there will be an deadlock if launch fill history here. // start the related fill-history task, when current task is ready @@ -175,7 +186,7 @@ int32_t streamTaskOnScanHistoryTaskReady(SStreamTask* pTask) { // streamLaunchFillHistoryTask(pTask); // } - return TSDB_CODE_SUCCESS; + return code; } // common @@ -212,8 +223,7 @@ int32_t streamLaunchFillHistoryTask(SStreamTask* pTask) { stDebug("s-task:%s not launch related fill-history task:0x%" PRIx64 "-0x%x, status:%s", idStr, hStreamId, hTaskId, pStatus.name); - (void) streamMetaAddTaskLaunchResult(pMeta, hStreamId, hTaskId, pExecInfo->checkTs, pExecInfo->readyTs, false); - return -1; // todo set the correct error code + return streamMetaAddTaskLaunchResult(pMeta, hStreamId, hTaskId, pExecInfo->checkTs, pExecInfo->readyTs, false); } stDebug("s-task:%s start to launch related fill-history task:0x%" PRIx64 "-0x%x", idStr, hStreamId, hTaskId); @@ -257,12 +267,11 @@ int32_t streamLaunchFillHistoryTask(SStreamTask* pTask) { } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -int32_t initScanHistoryReq(SStreamTask* pTask, SStreamScanHistoryReq* pReq, int8_t igUntreated) { +void initScanHistoryReq(SStreamTask* pTask, SStreamScanHistoryReq* pReq, int8_t igUntreated) { pReq->msgHead.vgId = pTask->info.nodeId; pReq->streamId = pTask->id.streamId; pReq->taskId = pTask->id.taskId; pReq->igUntreated = igUntreated; - return 0; } void checkFillhistoryTaskStatus(SStreamTask* pTask, SStreamTask* pHTask) { @@ -281,7 +290,10 @@ void checkFillhistoryTaskStatus(SStreamTask* pTask, SStreamTask* pHTask) { } // check if downstream tasks have been ready - streamTaskHandleEvent(pHTask->status.pSM, TASK_EVENT_INIT_SCANHIST); + int32_t code = streamTaskHandleEvent(pHTask->status.pSM, TASK_EVENT_INIT_SCANHIST); + if (code) { + stError("s-task:%s handle event init_scanhist failed", pTask->id.idStr); + } } void notRetryLaunchFillHistoryTask(SStreamTask* pTask, SLaunchHTaskInfo* pInfo, int64_t now) { @@ -316,7 +328,8 @@ void doRetryLaunchFillHistoryTask(SStreamTask* pTask, SLaunchHTaskInfo* pInfo, i stDebug("s-task:%s status:%s failed to launch fill-history task:0x%x, retry launch:%dms, retryCount:%d", pTask->id.idStr, p, hTaskId, pHTaskInfo->waitInterval, pHTaskInfo->retryTimes); - taosTmrReset(tryLaunchHistoryTask, LAUNCH_HTASK_INTERVAL, pInfo, streamTimer, &pHTaskInfo->pTimer); + streamTmrReset(tryLaunchHistoryTask, LAUNCH_HTASK_INTERVAL, pInfo, streamTimer, &pHTaskInfo->pTimer, + pTask->pMeta->vgId, " start-history-task-tmr"); } } @@ -367,7 +380,8 @@ void tryLaunchHistoryTask(void* param, void* tmrId) { pHTaskInfo->tickCount -= 1; if (pHTaskInfo->tickCount > 0) { - taosTmrReset(tryLaunchHistoryTask, LAUNCH_HTASK_INTERVAL, pInfo, streamTimer, &pHTaskInfo->pTimer); + streamTmrReset(tryLaunchHistoryTask, LAUNCH_HTASK_INTERVAL, pInfo, streamTimer, &pHTaskInfo->pTimer, + pTask->pMeta->vgId, " start-history-task-tmr"); streamMetaReleaseTask(pMeta, pTask); return; } @@ -417,21 +431,21 @@ void tryLaunchHistoryTask(void* param, void* tmrId) { taosMemoryFree(pInfo); } -SLaunchHTaskInfo* createHTaskLaunchInfo(SStreamMeta* pMeta, STaskId* pTaskId, int64_t hStreamId, int32_t hTaskId) { - SLaunchHTaskInfo* pInfo = taosMemoryCalloc(1, sizeof(SLaunchHTaskInfo)); - if (pInfo == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return NULL; +int32_t createHTaskLaunchInfo(SStreamMeta* pMeta, STaskId* pTaskId, int64_t hStreamId, int32_t hTaskId, + SLaunchHTaskInfo** pInfo) { + *pInfo = taosMemoryCalloc(1, sizeof(SLaunchHTaskInfo)); + if ((*pInfo) == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; } - pInfo->id.streamId = pTaskId->streamId; - pInfo->id.taskId = pTaskId->taskId; + (*pInfo)->id.streamId = pTaskId->streamId; + (*pInfo)->id.taskId = pTaskId->taskId; - pInfo->hTaskId.streamId = hStreamId; - pInfo->hTaskId.taskId = hTaskId; + (*pInfo)->hTaskId.streamId = hStreamId; + (*pInfo)->hTaskId.taskId = hTaskId; - pInfo->pMeta = pMeta; - return pInfo; + (*pInfo)->pMeta = pMeta; + return TSDB_CODE_SUCCESS; } int32_t launchNotBuiltFillHistoryTask(SStreamTask* pTask) { @@ -440,16 +454,18 @@ int32_t launchNotBuiltFillHistoryTask(SStreamTask* pTask) { const char* idStr = pTask->id.idStr; int64_t hStreamId = pTask->hTaskInfo.id.streamId; int32_t hTaskId = pTask->hTaskInfo.id.taskId; + SLaunchHTaskInfo* pInfo = NULL; + ASSERT(hTaskId != 0); stWarn("s-task:%s vgId:%d failed to launch history task:0x%x, since not built yet", idStr, pMeta->vgId, hTaskId); - STaskId id = streamTaskGetTaskId(pTask); - SLaunchHTaskInfo* pInfo = createHTaskLaunchInfo(pMeta, &id, hStreamId, hTaskId); - if (pInfo == NULL) { + STaskId id = streamTaskGetTaskId(pTask); + int32_t code = createHTaskLaunchInfo(pMeta, &id, hStreamId, hTaskId, &pInfo); + if (code) { stError("s-task:%s failed to launch related fill-history task, since Out Of Memory", idStr); - (void) streamMetaAddTaskLaunchResult(pMeta, hStreamId, hTaskId, pExecInfo->checkTs, pExecInfo->readyTs, false); - return terrno; + (void)streamMetaAddTaskLaunchResult(pMeta, hStreamId, hTaskId, pExecInfo->checkTs, pExecInfo->readyTs, false); + return code; } // set the launch time info @@ -475,7 +491,8 @@ int32_t launchNotBuiltFillHistoryTask(SStreamTask* pTask) { } else { // timer exists ASSERT(pTask->status.timerActive >= 1); stDebug("s-task:%s set timer active flag, task timer not null", idStr); - taosTmrReset(tryLaunchHistoryTask, WAIT_FOR_MINIMAL_INTERVAL, pInfo, streamTimer, &pTask->hTaskInfo.pTimer); + streamTmrReset(tryLaunchHistoryTask, WAIT_FOR_MINIMAL_INTERVAL, pInfo, streamTimer, &pTask->hTaskInfo.pTimer, + pTask->pMeta->vgId, " start-history-task-tmr"); } return TSDB_CODE_SUCCESS; @@ -510,7 +527,7 @@ bool streamHistoryTaskSetVerRangeStep2(SStreamTask* pTask, int64_t nextProcessVe } } -void streamTaskSetRangeStreamCalc(SStreamTask* pTask) { +int32_t streamTaskSetRangeStreamCalc(SStreamTask* pTask) { SDataRange* pRange = &pTask->dataRange; if (!HAS_RELATED_FILLHISTORY_TASK(pTask)) { @@ -523,10 +540,12 @@ void streamTaskSetRangeStreamCalc(SStreamTask* pTask) { "window:%" PRId64 "-%" PRId64 ", verRange:%" PRId64 "-%" PRId64, pTask->id.idStr, pRange->window.skey, pRange->window.ekey, pRange->range.minVer, pRange->range.maxVer); } + + return TSDB_CODE_SUCCESS; } else { ASSERT(pTask->info.fillHistory == 0); if (pTask->info.taskLevel >= TASK_LEVEL__AGG) { - return; + return TSDB_CODE_SUCCESS; } stDebug("s-task:%s level:%d related fill-history task exists, stream task timeWindow:%" PRId64 " - %" PRId64 @@ -536,7 +555,7 @@ void streamTaskSetRangeStreamCalc(SStreamTask* pTask) { SVersionRange verRange = pRange->range; STimeWindow win = pRange->window; - streamSetParamForStreamScannerStep2(pTask, &verRange, &win); + return streamSetParamForStreamScannerStep2(pTask, &verRange, &win); } } @@ -554,7 +573,10 @@ void doExecScanhistoryInFuture(void* param, void* tmrId) { } if (pTask->schedHistoryInfo.numOfTicks <= 0) { - streamStartScanHistoryAsync(pTask, 0); + int32_t code = streamStartScanHistoryAsync(pTask, 0); + if (code) { + stError("s-task:%s async start history task failed", pTask->id.idStr); + } int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); stDebug("s-task:%s fill-history:%d start scan-history data, out of tmr, ref:%d", pTask->id.idStr, @@ -563,18 +585,26 @@ void doExecScanhistoryInFuture(void* param, void* tmrId) { // release the task. streamMetaReleaseTask(pTask->pMeta, pTask); } else { - taosTmrReset(doExecScanhistoryInFuture, SCANHISTORY_IDLE_TIME_SLICE, pTask, streamTimer, - &pTask->schedHistoryInfo.pTimer); + streamTmrReset(doExecScanhistoryInFuture, SCANHISTORY_IDLE_TIME_SLICE, pTask, streamTimer, + &pTask->schedHistoryInfo.pTimer, pTask->pMeta->vgId, " start-history-task-tmr"); } } int32_t doStartScanHistoryTask(SStreamTask* pTask) { + int32_t code = 0; SVersionRange* pRange = &pTask->dataRange.range; + if (pTask->info.fillHistory) { - streamSetParamForScanHistory(pTask); + code = streamSetParamForScanHistory(pTask); + if (code) { + return code; + } } - streamSetParamForStreamScannerStep1(pTask, pRange, &pTask->dataRange.window); - int32_t code = streamStartScanHistoryAsync(pTask, 0); - return code; + code = streamSetParamForStreamScannerStep1(pTask, pRange, &pTask->dataRange.window); + if (code) { + return code; + } + + return streamStartScanHistoryAsync(pTask, 0); } diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index 7c2d0b3556..0110a9825c 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -29,20 +29,20 @@ static void streamTaskDestroyActiveChkptInfo(SActiveCheckpointInfo* pInfo); static int32_t addToTaskset(SArray* pArray, SStreamTask* pTask) { int32_t childId = taosArrayGetSize(pArray); pTask->info.selfChildId = childId; - taosArrayPush(pArray, &pTask); - return 0; + void* p = taosArrayPush(pArray, &pTask); + return (p == NULL)? TSDB_CODE_OUT_OF_MEMORY:TSDB_CODE_SUCCESS; } static int32_t doUpdateTaskEpset(SStreamTask* pTask, int32_t nodeId, SEpSet* pEpSet, bool* pUpdated) { char buf[512] = {0}; if (pTask->info.nodeId == nodeId) { // execution task should be moved away bool isEqual = isEpsetEqual(&pTask->info.epSet, pEpSet); - epsetToStr(pEpSet, buf, tListLen(buf)); + (void)epsetToStr(pEpSet, buf, tListLen(buf)); if (!isEqual) { (*pUpdated) = true; char tmp[512] = {0}; - epsetToStr(&pTask->info.epSet, tmp, tListLen(tmp)); + (void) epsetToStr(&pTask->info.epSet, tmp, tListLen(tmp)); // only for log file, ignore errors epsetAssign(&pTask->info.epSet, pEpSet); stDebug("s-task:0x%x (vgId:%d) self node epset is updated %s, old:%s", pTask->id.taskId, nodeId, buf, tmp); @@ -127,7 +127,10 @@ int32_t tNewStreamTask(int64_t streamId, int8_t taskLevel, SEpSet* pEpset, bool pTask->outputq.status = TASK_OUTPUT_STATUS__NORMAL; pTask->taskCheckInfo.pList = taosArrayInit(4, sizeof(SDownstreamStatusInfo)); - taosThreadMutexInit(&pTask->taskCheckInfo.checkInfoLock, NULL); + code = taosThreadMutexInit(&pTask->taskCheckInfo.checkInfoLock, NULL); + if (code) { + return code; + } if (fillHistory) { ASSERT(hasFillhistory); @@ -135,7 +138,7 @@ int32_t tNewStreamTask(int64_t streamId, int8_t taskLevel, SEpSet* pEpset, bool epsetAssign(&(pTask->info.mnodeEpset), pEpset); - addToTaskset(pTaskList, pTask); + code = addToTaskset(pTaskList, pTask); *p = pTask; return code; @@ -221,17 +224,17 @@ void tFreeStreamTask(SStreamTask* pTask) { } if (pTask->schedInfo.pDelayTimer != NULL) { - taosTmrStop(pTask->schedInfo.pDelayTimer); + (void) taosTmrStop(pTask->schedInfo.pDelayTimer); pTask->schedInfo.pDelayTimer = NULL; } if (pTask->hTaskInfo.pTimer != NULL) { - /*bool ret = */ taosTmrStop(pTask->hTaskInfo.pTimer); + (void) taosTmrStop(pTask->hTaskInfo.pTimer); pTask->hTaskInfo.pTimer = NULL; } if (pTask->msgInfo.pRetryTmr != NULL) { - /*bool ret = */ taosTmrStop(pTask->msgInfo.pRetryTmr); + (void) taosTmrStop(pTask->msgInfo.pRetryTmr); pTask->msgInfo.pRetryTmr = NULL; } @@ -394,10 +397,12 @@ int32_t streamTaskInit(SStreamTask* pTask, SStreamMeta* pMeta, SMsgCb* pMsgCb, i return terrno; } - taosThreadMutexInit(&pTask->msgInfo.lock, NULL); + code = taosThreadMutexInit(&pTask->msgInfo.lock, NULL); + if (code) { + return code; + } TdThreadMutexAttr attr = {0}; - code = taosThreadMutexAttrInit(&attr); if (code != 0) { stError("s-task:%s initElapsed mutex attr failed, code:%s", pTask->id.idStr, tstrerror(code)); @@ -410,8 +415,16 @@ int32_t streamTaskInit(SStreamTask* pTask, SStreamMeta* pMeta, SMsgCb* pMsgCb, i return code; } - taosThreadMutexInit(&pTask->lock, &attr); - taosThreadMutexAttrDestroy(&attr); + code = taosThreadMutexInit(&pTask->lock, &attr); + if (code) { + return code; + } + + code = taosThreadMutexAttrDestroy(&attr); + if (code) { + return code; + } + streamTaskOpenAllUpstreamInput(pTask); STaskOutputInfo* pOutputInfo = &pTask->outputInfo; @@ -424,7 +437,11 @@ int32_t streamTaskInit(SStreamTask* pTask, SStreamMeta* pMeta, SMsgCb* pMsgCb, i // 2MiB per second for sink task // 50 times sink operator per second - streamTaskInitTokenBucket(pOutputInfo->pTokenBucket, 35, 35, tsSinkDataRate, pTask->id.idStr); + code = streamTaskInitTokenBucket(pOutputInfo->pTokenBucket, 35, 35, tsSinkDataRate, pTask->id.idStr); + if (code) { + return code; + } + pOutputInfo->pNodeEpsetUpdateList = taosArrayInit(4, sizeof(SDownstreamTaskEpset)); if (pOutputInfo->pNodeEpsetUpdateList == NULL) { stError("s-task:%s failed to prepare downstreamUpdateList, code:%s", pTask->id.idStr, @@ -474,13 +491,13 @@ int32_t streamTaskSetUpstreamInfo(SStreamTask* pTask, const SStreamTask* pUpstre pTask->upstreamInfo.pList = taosArrayInit(4, POINTER_BYTES); } - taosArrayPush(pTask->upstreamInfo.pList, &pEpInfo); - return TSDB_CODE_SUCCESS; + void* p = taosArrayPush(pTask->upstreamInfo.pList, &pEpInfo); + return (p == NULL)? TSDB_CODE_OUT_OF_MEMORY:TSDB_CODE_SUCCESS; } void streamTaskUpdateUpstreamInfo(SStreamTask* pTask, int32_t nodeId, const SEpSet* pEpSet, bool* pUpdated) { char buf[512] = {0}; - epsetToStr(pEpSet, buf, tListLen(buf)); + (void) epsetToStr(pEpSet, buf, tListLen(buf)); // ignore error since it is only for log file. int32_t numOfUpstream = taosArrayGetSize(pTask->upstreamInfo.pList); for (int32_t i = 0; i < numOfUpstream; ++i) { @@ -491,7 +508,7 @@ void streamTaskUpdateUpstreamInfo(SStreamTask* pTask, int32_t nodeId, const SEpS *pUpdated = true; char tmp[512] = {0}; - epsetToStr(&pInfo->epSet, tmp, tListLen(tmp)); + (void) epsetToStr(&pInfo->epSet, tmp, tListLen(tmp)); epsetAssign(&pInfo->epSet, pEpSet); stDebug("s-task:0x%x update the upstreamInfo taskId:0x%x(nodeId:%d) newEpset:%s old:%s", pTask->id.taskId, @@ -526,7 +543,7 @@ void streamTaskSetFixedDownstreamInfo(SStreamTask* pTask, const SStreamTask* pDo void streamTaskUpdateDownstreamInfo(SStreamTask* pTask, int32_t nodeId, const SEpSet* pEpSet, bool* pUpdated) { char buf[512] = {0}; - epsetToStr(pEpSet, buf, tListLen(buf)); + (void) epsetToStr(pEpSet, buf, tListLen(buf)); // ignore the error since only for log files. int32_t id = pTask->id.taskId; int8_t type = pTask->outputInfo.type; @@ -542,7 +559,7 @@ void streamTaskUpdateDownstreamInfo(SStreamTask* pTask, int32_t nodeId, const SE if (!isEqual) { *pUpdated = true; char tmp[512] = {0}; - epsetToStr(&pVgInfo->epSet, tmp, tListLen(tmp)); + (void) epsetToStr(&pVgInfo->epSet, tmp, tListLen(tmp)); epsetAssign(&pVgInfo->epSet, pEpSet); stDebug("s-task:0x%x update dispatch info, task:0x%x(nodeId:%d) newEpset:%s old:%s", id, pVgInfo->taskId, @@ -562,7 +579,7 @@ void streamTaskUpdateDownstreamInfo(SStreamTask* pTask, int32_t nodeId, const SE *pUpdated = true; char tmp[512] = {0}; - epsetToStr(&pDispatcher->epSet, tmp, tListLen(tmp)); + (void) epsetToStr(&pDispatcher->epSet, tmp, tListLen(tmp)); epsetAssign(&pDispatcher->epSet, pEpSet); stDebug("s-task:0x%x update dispatch info, task:0x%x(nodeId:%d) newEpset:%s old:%s", id, pDispatcher->taskId, @@ -580,8 +597,16 @@ int32_t streamTaskStop(SStreamTask* pTask) { int64_t st = taosGetTimestampMs(); const char* id = pTask->id.idStr; - streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_STOP); - qKillTask(pTask->exec.pExecutor, TSDB_CODE_SUCCESS); + int32_t code = streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_STOP); + if (code) { + stError("failed to handle STOP event, s-task:%s", id); + } + + code = qKillTask(pTask->exec.pExecutor, TSDB_CODE_SUCCESS); + if (code) { + stError("s-task:%s failed to kill task related query handle", id); + } + while (!streamTaskIsIdle(pTask)) { stDebug("s-task:%s level:%d wait for task to be idle and then close, check again in 100ms", id, pTask->info.taskLevel); @@ -590,7 +615,7 @@ int32_t streamTaskStop(SStreamTask* pTask) { int64_t el = taosGetTimestampMs() - st; stDebug("vgId:%d s-task:%s is closed in %" PRId64 " ms", vgId, id, el); - return 0; + return code; } bool streamTaskUpdateEpsetInfo(SStreamTask* pTask, SArray* pNodeList) { @@ -607,7 +632,10 @@ bool streamTaskUpdateEpsetInfo(SStreamTask* pTask, SArray* pNodeList) { bool updated = false; for (int32_t i = 0; i < taosArrayGetSize(pNodeList); ++i) { SNodeUpdateInfo* pInfo = taosArrayGet(pNodeList, i); - doUpdateTaskEpset(pTask, pInfo->nodeId, &pInfo->newEp, &updated); + int32_t code = doUpdateTaskEpset(pTask, pInfo->nodeId, &pInfo->newEp, &updated); + if (code) { + stError("s-task:0x%x failed to update the task nodeEp epset, code:%s", pTask->id.taskId, tstrerror(code)); + } } return updated; @@ -704,10 +732,11 @@ int8_t streamTaskSetSchedStatusInactive(SStreamTask* pTask) { } int32_t streamTaskClearHTaskAttr(SStreamTask* pTask, int32_t resetRelHalt) { + int32_t code = 0; SStreamMeta* pMeta = pTask->pMeta; STaskId sTaskId = {.streamId = pTask->streamTaskId.streamId, .taskId = pTask->streamTaskId.taskId}; if (pTask->info.fillHistory == 0) { - return TSDB_CODE_SUCCESS; + return code; } SStreamTask** ppStreamTask = (SStreamTask**)taosHashGet(pMeta->pTasksMap, &sTaskId, sizeof(sTaskId)); @@ -725,11 +754,11 @@ int32_t streamTaskClearHTaskAttr(SStreamTask* pTask, int32_t resetRelHalt) { (*ppStreamTask)->status.taskStatus = TASK_STATUS__READY; } - streamMetaSaveTask(pMeta, *ppStreamTask); + code = streamMetaSaveTask(pMeta, *ppStreamTask); streamMutexUnlock(&(*ppStreamTask)->lock); } - return TSDB_CODE_SUCCESS; + return code; } int32_t streamBuildAndSendDropTaskMsg(SMsgCb* pMsgCb, int32_t vgId, SStreamTaskId* pTaskId, int64_t resetRelHalt) { @@ -797,8 +826,7 @@ int32_t streamSendChkptReportMsg(SStreamTask* pTask, SCheckpointInfo* pCheckpoin initRpcMsg(&msg, TDMT_MND_STREAM_CHKPT_REPORT, buf, tlen); stDebug("s-task:%s vgId:%d build and send task checkpoint-report to mnode", id, vgId); - tmsgSendReq(&pTask->info.mnodeEpset, &msg); - return 0; + return tmsgSendReq(&pTask->info.mnodeEpset, &msg); } STaskId streamTaskGetTaskId(const SStreamTask* pTask) { @@ -880,6 +908,7 @@ STaskStatusEntry streamTaskGetStatusEntry(SStreamTask* pTask) { static int32_t taskPauseCallback(SStreamTask* pTask, void* param) { SStreamMeta* pMeta = pTask->pMeta; + int32_t code = 0; int32_t num = atomic_add_fetch_32(&pMeta->numOfPausedTasks, 1); stInfo("vgId:%d s-task:%s pause stream task. paused task num:%d", pMeta->vgId, pTask->id.idStr, num); @@ -887,15 +916,15 @@ static int32_t taskPauseCallback(SStreamTask* pTask, void* param) { // in case of fill-history task, stop the tsdb file scan operation. if (pTask->info.fillHistory == 1) { void* pExecutor = pTask->exec.pExecutor; - qKillTask(pExecutor, TSDB_CODE_SUCCESS); + code = qKillTask(pExecutor, TSDB_CODE_SUCCESS); } stDebug("vgId:%d s-task:%s set pause flag and pause task", pMeta->vgId, pTask->id.idStr); - return TSDB_CODE_SUCCESS; + return code; } void streamTaskPause(SStreamTask* pTask) { - streamTaskHandleEventAsync(pTask->status.pSM, TASK_EVENT_PAUSE, taskPauseCallback, NULL); + (void) streamTaskHandleEventAsync(pTask->status.pSM, TASK_EVENT_PAUSE, taskPauseCallback, NULL); } void streamTaskResume(SStreamTask* pTask) { @@ -949,8 +978,7 @@ int32_t streamTaskSendCheckpointReq(SStreamTask* pTask) { initRpcMsg(&msg, TDMT_MND_STREAM_REQ_CHKPT, buf, tlen); stDebug("s-task:%s vgId:%d build and send task checkpoint req", id, vgId); - tmsgSendReq(&pTask->info.mnodeEpset, &msg); - return 0; + return tmsgSendReq(&pTask->info.mnodeEpset, &msg); } void streamTaskGetUpstreamTaskEpInfo(SStreamTask* pTask, int32_t taskId, SStreamUpstreamEpInfo** pEpInfo) { @@ -1044,7 +1072,7 @@ int32_t streamProcessRetrieveReq(SStreamTask* pTask, SStreamRetrieveReq* pReq) { void streamTaskSetRemoveBackendFiles(SStreamTask* pTask) { pTask->status.removeBackendFiles = true; } -int32_t streamTaskGetActiveCheckpointInfo(const SStreamTask* pTask, int32_t* pTransId, int64_t* pCheckpointId) { +void streamTaskGetActiveCheckpointInfo(const SStreamTask* pTask, int32_t* pTransId, int64_t* pCheckpointId) { if (pTransId != NULL) { *pTransId = pTask->chkInfo.pActiveInfo->transId; } @@ -1052,8 +1080,6 @@ int32_t streamTaskGetActiveCheckpointInfo(const SStreamTask* pTask, int32_t* pTr if (pCheckpointId != NULL) { *pCheckpointId = pTask->chkInfo.pActiveInfo->activeId; } - - return TSDB_CODE_SUCCESS; } int32_t streamTaskSetActiveCheckpointInfo(SStreamTask* pTask, int64_t activeCheckpointId) { @@ -1084,7 +1110,7 @@ int32_t streamTaskCreateActiveChkptInfo(SActiveCheckpointInfo** pRes) { pInfo->pCheckpointReadyRecvList = taosArrayInit(4, sizeof(STaskDownstreamReadyInfo)); *pRes = pInfo; - return TSDB_CODE_SUCCESS; + return code; } void streamTaskDestroyActiveChkptInfo(SActiveCheckpointInfo* pInfo) { @@ -1101,12 +1127,12 @@ void streamTaskDestroyActiveChkptInfo(SActiveCheckpointInfo* pInfo) { pInfo->pCheckpointReadyRecvList = NULL; if (pInfo->pChkptTriggerTmr != NULL) { - taosTmrStop(pInfo->pChkptTriggerTmr); + (void) taosTmrStop(pInfo->pChkptTriggerTmr); pInfo->pChkptTriggerTmr = NULL; } if (pInfo->pSendReadyMsgTmr != NULL) { - taosTmrStop(pInfo->pSendReadyMsgTmr); + (void) taosTmrStop(pInfo->pSendReadyMsgTmr); pInfo->pSendReadyMsgTmr = NULL; } diff --git a/source/libs/stream/src/streamTimer.c b/source/libs/stream/src/streamTimer.c index 931de397cc..fb1740ae0a 100644 --- a/source/libs/stream/src/streamTimer.c +++ b/source/libs/stream/src/streamTimer.c @@ -35,8 +35,9 @@ void streamTimerCleanUp() { streamTimer = NULL; } -tmr_h streamTimerGetInstance() { - return streamTimer; +int32_t streamTimerGetInstance(tmr_h* pTmr) { + *pTmr = streamTimer; + return TSDB_CODE_SUCCESS; } void streamTmrReset(TAOS_TMR_CALLBACK fp, int32_t mseconds, void* param, void* handle, tmr_h* pTmrId, int32_t vgId, diff --git a/source/libs/stream/src/streamUpdate.c b/source/libs/stream/src/streamUpdate.c index 76b0d6a561..6a2c85323a 100644 --- a/source/libs/stream/src/streamUpdate.c +++ b/source/libs/stream/src/streamUpdate.c @@ -36,7 +36,6 @@ static int64_t adjustExpEntries(int64_t entries) { return TMIN(DEFAULT_EXPECTED_ int compareKeyTs(void* pTs1, void* pTs2, void* pPkVal, __compar_fn_t cmpPkFn) { return compareInt64Val(pTs1, pTs2); - ; } int compareKeyTsAndPk(void* pValue1, void* pTs, void* pPkVal, __compar_fn_t cmpPkFn) {