From 7cab27110ad2b350df76600d59065565b99a2d8a Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Mon, 24 Jun 2024 12:26:10 +0000 Subject: [PATCH 01/92] add self check info --- source/libs/stream/inc/streamBackendRocksdb.h | 5 +- source/libs/stream/src/streamBackendRocksdb.c | 159 ++++++++++++++++-- source/libs/stream/src/streamCheckpoint.c | 63 +++---- source/libs/stream/test/backendTest.cpp | 38 ++--- 4 files changed, 197 insertions(+), 68 deletions(-) diff --git a/source/libs/stream/inc/streamBackendRocksdb.h b/source/libs/stream/inc/streamBackendRocksdb.h index 6b81ac87ee..ebeedcb5d2 100644 --- a/source/libs/stream/inc/streamBackendRocksdb.h +++ b/source/libs/stream/inc/streamBackendRocksdb.h @@ -136,7 +136,7 @@ void* streamBackendInit(const char* path, int64_t chkpId, int32_t vgId); void streamBackendCleanup(void* arg); void streamBackendHandleCleanup(void* arg); int32_t streamBackendLoadCheckpointInfo(void* pMeta); -int32_t streamBackendDoCheckpoint(void* pMeta, int64_t checkpointId); +int32_t streamBackendDoCheckpoint(void* pMeta, int64_t checkpointId, int64_t processver); SListNode* streamBackendAddCompare(void* backend, void* arg); void streamBackendDelCompare(void* backend, void* arg); int32_t streamStateCvtDataFormat(char* path, char* key, void* cfInst); @@ -144,7 +144,6 @@ int32_t streamStateCvtDataFormat(char* path, char* key, void* cfInst); STaskDbWrapper* taskDbOpen(const char* path, const char* key, int64_t chkptId); void taskDbDestroy(void* pBackend, bool flush); void taskDbDestroy2(void* pBackend); -int32_t taskDbDoCheckpoint(void* arg, int64_t chkpId); void taskDbUpdateChkpId(void* pTaskDb, int64_t chkpId); @@ -249,7 +248,7 @@ int32_t streamBackendDelInUseChkp(void* arg, int64_t chkpId); int32_t taskDbBuildSnap(void* arg, SArray* pSnap); int32_t taskDbDestroySnap(void* arg, SArray* pSnapInfo); -int32_t taskDbDoCheckpoint(void* arg, int64_t chkpId); +int32_t taskDbDoCheckpoint(void* arg, int64_t chkpId, int64_t processId); SBkdMgt* bkdMgtCreate(char* 
path); int32_t bkdMgtAddChkp(SBkdMgt* bm, char* task, char* path); diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index c151193284..4915d4b122 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -140,7 +140,7 @@ int32_t valueDecode(void* value, int32_t vlen, int64_t* ttl, char** dest); int32_t valueToString(void* k, char* buf); int32_t valueIsStale(void* k, int64_t ts); -void destroyCompare(void* arg); +void destroyCompare(void* arg); static void cleanDir(const char* pPath, const char* id); static bool streamStateIterSeekAndValid(rocksdb_iterator_t* iter, char* buf, size_t len); @@ -194,9 +194,7 @@ int32_t getCfIdx(const char* cfName) { return idx; } -bool isValidCheckpoint(const char* dir) { - return true; -} +bool isValidCheckpoint(const char* dir) { return true; } int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { // impl later @@ -486,9 +484,7 @@ _ERROR: return code; } -int32_t backendCopyFiles(const char* src, const char* dst) { - return backendFileCopyFilesImpl(src, dst); -} +int32_t backendCopyFiles(const char* src, const char* dst) { return backendFileCopyFilesImpl(src, dst); } static int32_t rebuildFromLocalCheckpoint(const char* pTaskIdStr, const char* checkpointPath, int64_t checkpointId, const char* defaultPath) { @@ -540,7 +536,8 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId char* chkptPath = taosMemoryCalloc(1, pathLen); if (chkptId > 0) { - snprintf(chkptPath, pathLen, "%s%s%s%s%s%" PRId64 "", prefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkptId); + snprintf(chkptPath, pathLen, "%s%s%s%s%s%" PRId64 "", prefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", + chkptId); code = rebuildFromLocalCheckpoint(key, chkptPath, chkptId, defaultPath); if (code != 0) { @@ -549,11 +546,12 @@ int32_t restoreCheckpointData(const char* path, const char* 
key, int64_t chkptId if (code != 0) { stError("failed to start stream backend at %s, reason: %s, restart from default defaultPath:%s", chkptPath, - tstrerror(code), defaultPath); - code = 0; // reset the error code + tstrerror(code), defaultPath); + code = 0; // reset the error code } } else { // no valid checkpoint id - stInfo("%s no valid checkpoint ever generated, no need to copy checkpoint data, clean defaultPath:%s", key, defaultPath); + stInfo("%s no valid checkpoint ever generated, no need to copy checkpoint data, clean defaultPath:%s", key, + defaultPath); cleanDir(defaultPath, key); } @@ -1142,7 +1140,7 @@ int32_t taskDbBuildSnap(void* arg, SArray* pSnap) { int64_t chkpId = pTaskDb->chkpId; taskDbRefChkp(pTaskDb, chkpId); - code = taskDbDoCheckpoint(pTaskDb, chkpId); + code = taskDbDoCheckpoint(pTaskDb, chkpId, 0); if (code != 0) { taskDbUnRefChkp(pTaskDb, chkpId); } @@ -1230,7 +1228,106 @@ int64_t taskGetDBRef(void* arg) { return pDb->refId; } -int32_t taskDbDoCheckpoint(void* arg, int64_t chkpId) { +int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) { + TdFilePtr pFile = NULL; + int32_t code = -1; + + int32_t len = strlen(pChkpIdDir); + if (len == 0) { + terrno = TSDB_CODE_INVALID_PARA; + stError("failed to load extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(terrno)); + return -1; + } + + char* pDst = taosMemoryCalloc(1, len + 64); + if (pDst == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + stError("failed to alloc memory to load extra info, dir:%s", pChkpIdDir); + goto _EXIT; + } + + if (sprintf(pDst, "%s%sinfo", pChkpIdDir, TD_DIRSEP) <= 0) { + code = -1; + stError("failed to build dst to load extra info, dir:%s", pChkpIdDir); + goto _EXIT; + } + + pFile = taosOpenFile(pDst, TD_FILE_READ); + if (pFile == NULL) { + terrno = TAOS_SYSTEM_ERROR(errno); + stError("failed to open file to load extra info, file:%s", pDst); + goto _EXIT; + } + + char buf[256] = {0}; + if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { + 
terrno = TAOS_SYSTEM_ERROR(errno); + stError("failed to read file to load extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); + code = -1; + goto _EXIT; + } + + if (sscanf(buf, "%" PRId64 " %" PRId64 "", chkpId, processId) < 2) { + terrno = TSDB_CODE_INVALID_PARA; + stError("failed to read file content to load extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); + } + code = 0; +_EXIT: + taosMemoryFree(pDst); + taosCloseFile(&pFile); + return code; +} +int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { + TdFilePtr pFile = NULL; + int32_t code = -1; + + int32_t len = strlen(pChkpIdDir); + if (len == 0) { + terrno = TSDB_CODE_INVALID_PARA; + stError("failed to add extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(terrno)); + return -1; + } + + char* pDst = taosMemoryCalloc(1, len + 64); + if (pDst == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + stError("failed to alloc memory to add extra info, dir:%s", pChkpIdDir); + goto _EXIT; + } + + if (sprintf(pDst, "%s%sinfo", pChkpIdDir, TD_DIRSEP) < 0) { + stError("failed to build dst to add extra info, dir:%s", pChkpIdDir); + goto _EXIT; + } + + pFile = taosOpenFile(pDst, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); + if (pFile == NULL) { + terrno = TAOS_SYSTEM_ERROR(errno); + stError("failed to open file to add extra info, file:%s", pDst); + goto _EXIT; + } + + char buf[256] = {0}; + int n = snprintf(buf, sizeof(buf), "%" PRId64 " %" PRId64 "", chkpId, processId); + if (n <= 0 || n >= sizeof(buf)) { + code = -1; + stError("failed to build content to add extra info, dir:%s", pChkpIdDir); + goto _EXIT; + } + + if (taosWriteFile(pFile, buf, strlen(buf)) <= 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + stError("failed to write file to add extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); + goto _EXIT; + } + code = 0; + +_EXIT: + taosCloseFile(&pFile); + taosMemoryFree(pDst); + return code; +} +int32_t taskDbDoCheckpoint(void* arg, int64_t chkpId, int64_t processId) { 
STaskDbWrapper* pTaskDb = arg; int64_t st = taosGetTimestampMs(); int32_t code = -1; @@ -1254,32 +1351,58 @@ int32_t taskDbDoCheckpoint(void* arg, int64_t chkpId) { int64_t written = atomic_load_64(&pTaskDb->dataWritten); + // flush db if (written > 0) { stDebug("stream backend:%p start to flush db at:%s, data written:%" PRId64 "", pTaskDb, pChkpIdDir, written); code = chkpPreFlushDb(pTaskDb->db, ppCf, nCf); + if (code != 0) goto _EXIT; } else { stDebug("stream backend:%p not need flush db at:%s, data written:%" PRId64 "", pTaskDb, pChkpIdDir, written); } + + // do checkpoint if ((code = chkpDoDbCheckpoint(pTaskDb->db, pChkpIdDir)) != 0) { stError("stream backend:%p failed to do checkpoint at:%s", pTaskDb, pChkpIdDir); + goto _EXIT; } else { stDebug("stream backend:%p end to do checkpoint at:%s, time cost:%" PRId64 "ms", pTaskDb, pChkpIdDir, taosGetTimestampMs() - st); } + // add extra info to checkpoint + if ((code = chkpAddExtraInfo(pChkpIdDir, chkpId, processId)) != 0) { + stError("stream backend:%p failed to add extra info to checkpoint at:%s", pTaskDb, pChkpIdDir); + goto _EXIT; + } + + // delete ttl checkpoint code = chkpMayDelObsolete(pTaskDb, chkpId, pChkpDir); + if (code < 0) { + goto _EXIT; + } + atomic_store_64(&pTaskDb->dataWritten, 0); pTaskDb->chkpId = chkpId; _EXIT: - taosMemoryFree(pChkpDir); + + // clear checkpoint dir if failed + if (code != 0 && pChkpDir != NULL) { + if (taosDirExist(pChkpIdDir)) { + taosRemoveDir(pChkpIdDir); + } + } taosMemoryFree(pChkpIdDir); + taosMemoryFree(pChkpDir); + taosReleaseRef(taskDbWrapperId, refId); taosMemoryFree(ppCf); return code; } -int32_t streamBackendDoCheckpoint(void* arg, int64_t chkpId) { return taskDbDoCheckpoint(arg, chkpId); } +int32_t streamBackendDoCheckpoint(void* arg, int64_t chkpId, int64_t processVer) { + return taskDbDoCheckpoint(arg, chkpId, processVer); +} SListNode* streamBackendAddCompare(void* backend, void* arg) { SBackendWrapper* pHandle = (SBackendWrapper*)backend; @@ -2205,7 +2328,8 @@ 
int32_t taskDbGenChkpUploadData__rsync(STaskDbWrapper* pDb, int64_t chkpId, char return code; } -int32_t taskDbGenChkpUploadData__s3(STaskDbWrapper* pDb, void* bkdChkpMgt, int64_t chkpId, char** path, SArray* list, const char* idStr) { +int32_t taskDbGenChkpUploadData__s3(STaskDbWrapper* pDb, void* bkdChkpMgt, int64_t chkpId, char** path, SArray* list, + const char* idStr) { int32_t code = 0; SBkdMgt* p = (SBkdMgt*)bkdChkpMgt; @@ -2224,7 +2348,8 @@ int32_t taskDbGenChkpUploadData__s3(STaskDbWrapper* pDb, void* bkdChkpMgt, int64 return code; } -int32_t taskDbGenChkpUploadData(void* arg, void* mgt, int64_t chkpId, int8_t type, char** path, SArray* list, const char* idStr) { +int32_t taskDbGenChkpUploadData(void* arg, void* mgt, int64_t chkpId, int8_t type, char** path, SArray* list, + const char* idStr) { int32_t code = -1; STaskDbWrapper* pDb = arg; ECHECKPOINT_BACKUP_TYPE utype = type; diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 1fddb5a97d..af7e969c07 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -24,11 +24,13 @@ static int32_t streamTaskUploadCheckpoint(const char* id, const char* path); static int32_t deleteCheckpoint(const char* id); static int32_t downloadCheckpointByNameS3(const char* id, const char* fname, const char* dstName); static int32_t continueDispatchCheckpointTriggerBlock(SStreamDataBlock* pBlock, SStreamTask* pTask); -static int32_t appendCheckpointIntoInputQ(SStreamTask* pTask, int32_t checkpointType, int64_t checkpointId, int32_t transId); +static int32_t appendCheckpointIntoInputQ(SStreamTask* pTask, int32_t checkpointType, int64_t checkpointId, + int32_t transId); static int32_t doSendRetrieveTriggerMsg(SStreamTask* pTask, SArray* pNotSendList); static void checkpointTriggerMonitorFn(void* param, void* tmrId); -static SStreamDataBlock* createChkptTriggerBlock(SStreamTask* pTask, int32_t checkpointType, int64_t checkpointId, 
int32_t transId); +static SStreamDataBlock* createChkptTriggerBlock(SStreamTask* pTask, int32_t checkpointType, int64_t checkpointId, + int32_t transId); SStreamDataBlock* createChkptTriggerBlock(SStreamTask* pTask, int32_t checkpointType, int64_t checkpointId, int32_t transId) { @@ -96,7 +98,7 @@ int32_t streamTaskProcessCheckpointTriggerRsp(SStreamTask* pTask, SCheckpointTri if (pRsp->rspCode != TSDB_CODE_SUCCESS) { stDebug("s-task:%s retrieve checkpoint-trgger rsp from upstream:0x%x invalid, code:%s", pTask->id.idStr, - pRsp->upstreamTaskId, tstrerror(pRsp->rspCode)); + pRsp->upstreamTaskId, tstrerror(pRsp->rspCode)); return TSDB_CODE_SUCCESS; } @@ -108,7 +110,7 @@ int32_t streamTaskSendCheckpointTriggerMsg(SStreamTask* pTask, int32_t dstTaskId SRpcHandleInfo* pRpcInfo, int32_t code) { int32_t size = sizeof(SMsgHead) + sizeof(SCheckpointTriggerRsp); - void* pBuf = rpcMallocCont(size); + void* pBuf = rpcMallocCont(size); SCheckpointTriggerRsp* pRsp = POINTER_SHIFT(pBuf, sizeof(SMsgHead)); ((SMsgHead*)pBuf)->vgId = htonl(downstreamNodeId); @@ -162,15 +164,15 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock taosThreadMutexLock(&pTask->lock); if (pTask->chkInfo.checkpointId > checkpointId) { stError("s-task:%s vgId:%d current checkpointId:%" PRId64 - " recv expired checkpoint-trigger block, checkpointId:%" PRId64 " transId:%d, discard", - id, vgId, pTask->chkInfo.checkpointId, checkpointId, transId); + " recv expired checkpoint-trigger block, checkpointId:%" PRId64 " transId:%d, discard", + id, vgId, pTask->chkInfo.checkpointId, checkpointId, transId); taosThreadMutexUnlock(&pTask->lock); return TSDB_CODE_SUCCESS; } if (pTask->chkInfo.checkpointId == checkpointId) { { // send checkpoint-ready msg to upstream - SRpcMsg msg ={0}; + SRpcMsg msg = {0}; SStreamUpstreamEpInfo* pInfo = streamTaskGetUpstreamTaskEpInfo(pTask, pBlock->srcTaskId); initCheckpointReadyMsg(pTask, pInfo->nodeId, pBlock->srcTaskId, pInfo->childId, checkpointId, 
&msg); @@ -362,7 +364,8 @@ int32_t streamProcessCheckpointReadyMsg(SStreamTask* pTask, int64_t checkpointId taosThreadMutexUnlock(&pInfo->lock); if (notReady == 0) { - stDebug("s-task:%s all downstream task(s) have completed build checkpoint, start to do checkpoint for current task", id); + stDebug("s-task:%s all downstream task(s) have completed build checkpoint, start to do checkpoint for current task", + id); appendCheckpointIntoInputQ(pTask, STREAM_INPUT__CHECKPOINT, checkpointId, transId); } @@ -371,11 +374,11 @@ int32_t streamProcessCheckpointReadyMsg(SStreamTask* pTask, int64_t checkpointId int32_t streamTaskProcessCheckpointReadyRsp(SStreamTask* pTask, int32_t upstreamTaskId, int64_t checkpointId) { SActiveCheckpointInfo* pInfo = pTask->chkInfo.pActiveInfo; - int64_t now = taosGetTimestampMs(); - int32_t numOfConfirmed = 0; + int64_t now = taosGetTimestampMs(); + int32_t numOfConfirmed = 0; taosThreadMutexLock(&pInfo->lock); - for(int32_t i = 0; i < taosArrayGetSize(pInfo->pReadyMsgList); ++i) { + for (int32_t i = 0; i < taosArrayGetSize(pInfo->pReadyMsgList); ++i) { STaskCheckpointReadyInfo* pReadyInfo = taosArrayGet(pInfo->pReadyMsgList, i); if (pReadyInfo->upstreamTaskId == upstreamTaskId && pReadyInfo->checkpointId == checkpointId) { pReadyInfo->sendCompleted = 1; @@ -385,7 +388,7 @@ int32_t streamTaskProcessCheckpointReadyRsp(SStreamTask* pTask, int32_t upstream } } - for(int32_t i = 0; i < taosArrayGetSize(pInfo->pReadyMsgList); ++i) { + for (int32_t i = 0; i < taosArrayGetSize(pInfo->pReadyMsgList); ++i) { STaskCheckpointReadyInfo* pReadyInfo = taosArrayGet(pInfo->pReadyMsgList, i); if (pReadyInfo->sendCompleted == 1) { numOfConfirmed += 1; @@ -568,12 +571,12 @@ static int32_t getCheckpointDataMeta(const char* id, const char* path, SArray* l } int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t dbRefId, ECHECKPOINT_BACKUP_TYPE type) { - char* path = NULL; - int32_t code = 0; - SArray* toDelFiles = taosArrayInit(4, 
POINTER_BYTES); - int64_t now = taosGetTimestampMs(); - SStreamMeta* pMeta = pTask->pMeta; - const char* idStr = pTask->id.idStr; + char* path = NULL; + int32_t code = 0; + SArray* toDelFiles = taosArrayInit(4, POINTER_BYTES); + int64_t now = taosGetTimestampMs(); + SStreamMeta* pMeta = pTask->pMeta; + const char* idStr = pTask->id.idStr; if ((code = taskDbGenChkpUploadData(pTask->pBackend, pMeta->bkdChkptMgt, checkpointId, type, &path, toDelFiles, pTask->id.idStr)) != 0) { @@ -619,8 +622,8 @@ int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t d idStr, checkpointId, el, path); taosRemoveDir(path); } else { - stDebug("s-task:%s failed to upload checkpointId:%" PRId64 " keep local checkpoint data, elapsed time:%.2fs", - idStr, checkpointId, el); + stDebug("s-task:%s failed to upload checkpointId:%" PRId64 " keep local checkpoint data, elapsed time:%.2fs", idStr, + checkpointId, el); } taosMemoryFree(path); @@ -639,9 +642,10 @@ int32_t streamTaskRemoteBackupCheckpoint(SStreamTask* pTask, int64_t checkpointI } int64_t dbRefId = taskGetDBRef(pTask->pBackend); - void* pBackend = taskAcquireDb(dbRefId); + void* pBackend = taskAcquireDb(dbRefId); if (pBackend == NULL) { - stError("s-task:%s failed to acquire db during update checkpoint data, failed to upload checkpointData", pTask->id.idStr); + stError("s-task:%s failed to acquire db during update checkpoint data, failed to upload checkpointData", + pTask->id.idStr); return -1; } @@ -663,7 +667,8 @@ int32_t streamTaskBuildCheckpoint(SStreamTask* pTask) { if (pTask->info.taskLevel != TASK_LEVEL__SINK) { stDebug("s-task:%s level:%d start gen checkpoint, checkpointId:%" PRId64, id, pTask->info.taskLevel, ckId); - code = streamBackendDoCheckpoint(pTask->pBackend, ckId); + int64_t ver = 0; + code = streamBackendDoCheckpoint(pTask->pBackend, ckId, ver); if (code != TSDB_CODE_SUCCESS) { stError("s-task:%s gen checkpoint:%" PRId64 " failed, code:%s", id, ckId, tstrerror(terrno)); } @@ -773,11 +778,11 @@ 
void checkpointTriggerMonitorFn(void* param, void* tmrId) { ASSERT(pTask->info.taskLevel > TASK_LEVEL__SOURCE); SArray* pNotSendList = taosArrayInit(4, sizeof(SStreamUpstreamEpInfo)); - for(int32_t i = 0; i < taosArrayGetSize(pList); ++i) { + for (int32_t i = 0; i < taosArrayGetSize(pList); ++i) { SStreamUpstreamEpInfo* pInfo = taosArrayGetP(pList, i); bool recved = false; - for(int32_t j = 0; j < taosArrayGetSize(pActiveInfo->pReadyMsgList); ++j) { + for (int32_t j = 0; j < taosArrayGetSize(pActiveInfo->pReadyMsgList); ++j) { STaskCheckpointReadyInfo* pReady = taosArrayGet(pActiveInfo->pReadyMsgList, j); if (pInfo->nodeId == pReady->upstreamNodeId) { recved = true; @@ -785,7 +790,7 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { } } - if (!recved) { // make sure the inputQ is opened for not recv upstream checkpoint-trigger message + if (!recved) { // make sure the inputQ is opened for not recv upstream checkpoint-trigger message streamTaskOpenUpstreamInput(pTask, pInfo->taskId); taosArrayPush(pNotSendList, pInfo); } @@ -870,7 +875,7 @@ bool streamTaskAlreadySendTrigger(SStreamTask* pTask, int32_t downstreamNodeId) return false; } - for(int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { + for (int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { STaskTriggerSendInfo* pSendInfo = taosArrayGet(pInfo->pDispatchTriggerList, i); if (pSendInfo->nodeId != downstreamNodeId) { continue; @@ -939,10 +944,10 @@ int32_t streamTaskGetNumOfConfirmed(SStreamTask* pTask) { int32_t num = 0; taosThreadMutexLock(&pInfo->lock); - for(int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { + for (int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { STaskTriggerSendInfo* p = taosArrayGet(pInfo->pDispatchTriggerList, i); if (p->recved) { - num ++; + num++; } } taosThreadMutexUnlock(&pInfo->lock); diff --git a/source/libs/stream/test/backendTest.cpp b/source/libs/stream/test/backendTest.cpp index 
2fb257fe4e..38d48a2a32 100644 --- a/source/libs/stream/test/backendTest.cpp +++ b/source/libs/stream/test/backendTest.cpp @@ -29,7 +29,7 @@ class BackendEnv : public ::testing::Test { void *backendCreate() { const char *streamPath = "/tmp"; - void * p = NULL; + void *p = NULL; // char *absPath = NULL; // // SBackendWrapper *p = (SBackendWrapper *)streamBackendInit(streamPath, -1, 2); @@ -52,7 +52,7 @@ SStreamState *stateCreate(const char *path) { } void *backendOpen() { streamMetaInit(); - const char * path = "/tmp/backend"; + const char *path = "/tmp/backend"; SStreamState *p = stateCreate(path); ASSERT(p != NULL); @@ -79,7 +79,7 @@ void *backendOpen() { const char *val = "value data"; int32_t len = 0; - char * newVal = NULL; + char *newVal = NULL; streamStateGet_rocksdb(p, &key, (void **)&newVal, &len); ASSERT(len == strlen(val)); } @@ -100,7 +100,7 @@ void *backendOpen() { const char *val = "value data"; int32_t len = 0; - char * newVal = NULL; + char *newVal = NULL; int32_t code = streamStateGet_rocksdb(p, &key, (void **)&newVal, &len); ASSERT(code != 0); } @@ -130,7 +130,7 @@ void *backendOpen() { winkey.groupId = 0; winkey.ts = tsArray[0]; - char * val = NULL; + char *val = NULL; int32_t len = 0; pCurr = streamStateSeekKeyNext_rocksdb(p, &winkey); @@ -157,7 +157,7 @@ void *backendOpen() { key.ts = tsArray[i]; key.exprIdx = i; - char * val = NULL; + char *val = NULL; int32_t len = 0; streamStateFuncGet_rocksdb(p, &key, (void **)&val, &len); ASSERT(len == strlen("Value")); @@ -168,7 +168,7 @@ void *backendOpen() { key.ts = tsArray[i]; key.exprIdx = i; - char * val = NULL; + char *val = NULL; int32_t len = 0; streamStateFuncDel_rocksdb(p, &key); } @@ -213,7 +213,7 @@ void *backendOpen() { { SSessionKey key; memset(&key, 0, sizeof(key)); - char * val = NULL; + char *val = NULL; int32_t vlen = 0; code = streamStateSessionGetKVByCur_rocksdb(pCurr, &key, (void **)&val, &vlen); ASSERT(code == 0); @@ -260,7 +260,7 @@ void *backendOpen() { SWinKey key = {0}; // 
{.groupId = (uint64_t)(i), .ts = tsArray[i]}; key.groupId = (uint64_t)(i); key.ts = tsArray[i]; - char * val = NULL; + char *val = NULL; int32_t vlen = 0; ASSERT(streamStateFillGet_rocksdb(p, &key, (void **)&val, &vlen) == 0); taosMemoryFreeClear(val); @@ -272,7 +272,7 @@ void *backendOpen() { SStreamStateCur *pCurr = streamStateFillGetCur_rocksdb(p, &key); ASSERT(pCurr != NULL); - char * val = NULL; + char *val = NULL; int32_t vlen = 0; ASSERT(0 == streamStateFillGetKVByCur_rocksdb(pCurr, &key, (const void **)&val, &vlen)); ASSERT(vlen == strlen("Value")); @@ -296,7 +296,7 @@ void *backendOpen() { SWinKey key = {0}; // {.groupId = (uint64_t)(i), .ts = tsArray[i]}; key.groupId = (uint64_t)(i); key.ts = tsArray[i]; - char * val = NULL; + char *val = NULL; int32_t vlen = 0; ASSERT(streamStateFillDel_rocksdb(p, &key) == 0); taosMemoryFreeClear(val); @@ -338,7 +338,7 @@ void *backendOpen() { char key[128] = {0}; sprintf(key, "tbname_%d", i); - char * val = NULL; + char *val = NULL; int32_t len = 0; code = streamDefaultGet_rocksdb(p, key, (void **)&val, &len); ASSERT(code == 0); @@ -354,7 +354,7 @@ TEST_F(BackendEnv, checkOpen) { SStreamState *p = (SStreamState *)backendOpen(); int64_t tsStart = taosGetTimestampMs(); { - void * pBatch = streamStateCreateBatch(); + void *pBatch = streamStateCreateBatch(); int32_t size = 0; for (int i = 0; i < size; i++) { char key[128] = {0}; @@ -368,7 +368,7 @@ TEST_F(BackendEnv, checkOpen) { streamStateDestroyBatch(pBatch); } { - void * pBatch = streamStateCreateBatch(); + void *pBatch = streamStateCreateBatch(); int32_t size = 0; char valBuf[256] = {0}; for (int i = 0; i < size; i++) { @@ -383,9 +383,9 @@ TEST_F(BackendEnv, checkOpen) { streamStateDestroyBatch(pBatch); } // do checkpoint 2 - taskDbDoCheckpoint(p->pTdbState->pOwner->pBackend, 2); + taskDbDoCheckpoint(p->pTdbState->pOwner->pBackend, 2, 0); { - void * pBatch = streamStateCreateBatch(); + void *pBatch = streamStateCreateBatch(); int32_t size = 0; char valBuf[256] = {0}; 
for (int i = 0; i < size; i++) { @@ -400,17 +400,17 @@ TEST_F(BackendEnv, checkOpen) { streamStateDestroyBatch(pBatch); } - taskDbDoCheckpoint(p->pTdbState->pOwner->pBackend, 3); + taskDbDoCheckpoint(p->pTdbState->pOwner->pBackend, 3, 0); const char *path = "/tmp/backend/stream"; const char *dump = "/tmp/backend/stream/dump"; // taosMkDir(dump); taosMulMkDir(dump); SBkdMgt *mgt = bkdMgtCreate((char *)path); - SArray * result = taosArrayInit(4, sizeof(void *)); + SArray *result = taosArrayInit(4, sizeof(void *)); bkdMgtGetDelta(mgt, p->pTdbState->idstr, 3, result, (char *)dump); - taskDbDoCheckpoint(p->pTdbState->pOwner->pBackend, 4); + taskDbDoCheckpoint(p->pTdbState->pOwner->pBackend, 4, 0); taosArrayClear(result); bkdMgtGetDelta(mgt, p->pTdbState->idstr, 4, result, (char *)dump); From 33aef6ddc550808185e720e2093d920ab35228ac Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Tue, 25 Jun 2024 07:12:09 +0000 Subject: [PATCH 02/92] add self check info --- source/libs/stream/inc/streamBackendRocksdb.h | 2 +- source/libs/stream/src/streamBackendRocksdb.c | 105 +++++++++++++++--- source/libs/stream/src/streamMeta.c | 28 +++-- 3 files changed, 104 insertions(+), 31 deletions(-) diff --git a/source/libs/stream/inc/streamBackendRocksdb.h b/source/libs/stream/inc/streamBackendRocksdb.h index ebeedcb5d2..24cd861550 100644 --- a/source/libs/stream/inc/streamBackendRocksdb.h +++ b/source/libs/stream/inc/streamBackendRocksdb.h @@ -141,7 +141,7 @@ SListNode* streamBackendAddCompare(void* backend, void* arg); void streamBackendDelCompare(void* backend, void* arg); int32_t streamStateCvtDataFormat(char* path, char* key, void* cfInst); -STaskDbWrapper* taskDbOpen(const char* path, const char* key, int64_t chkptId); +STaskDbWrapper* taskDbOpen(const char* path, const char* key, int64_t chkptId, int64_t* processVer); void taskDbDestroy(void* pBackend, bool flush); void taskDbDestroy2(void* pBackend); diff --git a/source/libs/stream/src/streamBackendRocksdb.c 
b/source/libs/stream/src/streamBackendRocksdb.c index 4915d4b122..4278757136 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -326,9 +326,11 @@ void cleanDir(const char* pPath, const char* id) { } } -void validateDir(const char* pPath) { +int32_t createDirIfNotExist(const char* pPath) { if (!taosIsDir(pPath)) { - taosMulMkDir(pPath); + return taosMulMkDir(pPath); + } else { + return 0; } } @@ -419,6 +421,9 @@ int32_t backendFileCopyFilesImpl(const char* src, const char* dst) { const char* current = "CURRENT"; size_t currLen = strlen(current); + const char* info = "info"; + size_t infoLen = strlen(info); + int32_t code = 0; int32_t sLen = strlen(src); int32_t dLen = strlen(dst); @@ -455,6 +460,14 @@ int32_t backendFileCopyFilesImpl(const char* src, const char* dst) { stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(code)); goto _ERROR; } + } else if (strncmp(name, info, strlen(name) <= infoLen ? 
strlen(name) : infoLen) == 0) { + code = copyFiles_create(srcName, dstName, 0); + if (code != 0) { + code = TAOS_SYSTEM_ERROR(code); + stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(code)); + goto _ERROR; + } + } else { code = copyFiles_hardlink(srcName, dstName, 0); if (code != 0) { @@ -487,7 +500,7 @@ _ERROR: int32_t backendCopyFiles(const char* src, const char* dst) { return backendFileCopyFilesImpl(src, dst); } static int32_t rebuildFromLocalCheckpoint(const char* pTaskIdStr, const char* checkpointPath, int64_t checkpointId, - const char* defaultPath) { + const char* defaultPath, int64_t* processVer) { int32_t code = 0; cleanDir(defaultPath, pTaskIdStr); @@ -512,34 +525,67 @@ static int32_t rebuildFromLocalCheckpoint(const char* pTaskIdStr, const char* ch return code; } -int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId, char** dbPrefixPath, char** dbPath) { - int32_t code = 0; +int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId, char** dbPrefixPath, char** dbPath, + int64_t* processVer) { + int32_t code = -1; + + size_t pathLen = strlen(path); + char* prefixPath = NULL; + char* defaultPath = NULL; + + // alloc buf + prefixPath = taosMemoryCalloc(1, pathLen + 64); + if (prefixPath == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } - char* prefixPath = taosMemoryCalloc(1, strlen(path) + 128); sprintf(prefixPath, "%s%s%s", path, TD_DIRSEP, key); + code = createDirIfNotExist(prefixPath); + if (code != 0) { + goto _EXIT; + } - validateDir(prefixPath); + defaultPath = taosMemoryCalloc(1, pathLen + 128); + if (defaultPath == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } - char* defaultPath = taosMemoryCalloc(1, strlen(path) + 256); sprintf(defaultPath, "%s%s%s", prefixPath, TD_DIRSEP, "state"); + code = createDirIfNotExist(defaultPath); + if (code != 0) { + goto _EXIT; + } - validateDir(defaultPath); - int32_t pathLen = strlen(path) + 
256; + // int32_t pathLen = strlen(path) + 48; + char* checkpointRoot = taosMemoryCalloc(1, pathLen + 48); + if (checkpointRoot == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } - char* checkpointRoot = taosMemoryCalloc(1, pathLen); sprintf(checkpointRoot, "%s%s%s", prefixPath, TD_DIRSEP, "checkpoints"); - - validateDir(checkpointRoot); - taosMemoryFree(checkpointRoot); + code = createDirIfNotExist(checkpointRoot); + if (code != 0) { + taosMemoryFreeClear(checkpointRoot); + goto _EXIT; + } + taosMemoryFreeClear(checkpointRoot); stDebug("%s check local backend dir:%s, checkpointId:%" PRId64 " succ", key, defaultPath, chkptId); char* chkptPath = taosMemoryCalloc(1, pathLen); + if (chkptPath == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } + if (chkptId > 0) { snprintf(chkptPath, pathLen, "%s%s%s%s%s%" PRId64 "", prefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkptId); - code = rebuildFromLocalCheckpoint(key, chkptPath, chkptId, defaultPath); + code = rebuildFromLocalCheckpoint(key, chkptPath, chkptId, defaultPath, processVer); if (code != 0) { code = rebuildFromRemoteCheckpoint(key, chkptPath, chkptId, defaultPath); } @@ -559,7 +605,11 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId *dbPath = defaultPath; *dbPrefixPath = prefixPath; + return 0; +_EXIT: + taosMemoryFree(defaultPath); + taosMemoryFree(prefixPath); return code; } @@ -2216,15 +2266,33 @@ _EXIT: return NULL; } -STaskDbWrapper* taskDbOpen(const char* path, const char* key, int64_t chkptId) { +STaskDbWrapper* taskDbOpen(const char* path, const char* key, int64_t chkptId, int64_t* processVer) { char* statePath = NULL; char* dbPath = NULL; - if (restoreCheckpointData(path, key, chkptId, &statePath, &dbPath) != 0) { + if (restoreCheckpointData(path, key, chkptId, &statePath, &dbPath, processVer) != 0) { + stError("failed to restore checkpoint data, path:%s, key:%s, checkpointId: %" PRId64 "reason:%s", path, key, + chkptId, 
tstrerror(terrno)); return NULL; } STaskDbWrapper* pTaskDb = taskDbOpenImpl(key, statePath, dbPath); + if (pTaskDb != NULL) { + int64_t chkpId = -1, ver = -1; + if (chkpLoadExtraInfo(dbPath, &chkpId, &ver) == 0) { + *processVer = ver; + } else { + if (terrno == TSDB_CODE_OUT_OF_MEMORY) { + taskDbDestroy(pTaskDb, false); + return NULL; + } else { + // not info file exists, caller handle this situation + terrno = 0; + *processVer = -1; + } + } + } + taosMemoryFree(dbPath); taosMemoryFree(statePath); return pTaskDb; @@ -2435,7 +2503,8 @@ int32_t streamStateCvtDataFormat(char* path, char* key, void* pCfInst) { int32_t code = 0; - STaskDbWrapper* pTaskDb = taskDbOpen(path, key, 0); + int64_t processVer = -1; + STaskDbWrapper* pTaskDb = taskDbOpen(path, key, 0, &processVer); RocksdbCfInst* pSrcBackend = pCfInst; for (int i = 0; i < nCf; i++) { diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 03c7b93f91..864f9514da 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -151,7 +151,7 @@ int32_t streamMetaCheckBackendCompatible(SStreamMeta* pMeta) { int8_t ret = STREAM_STATA_COMPATIBLE; TBC* pCur = NULL; - if (tdbTbcOpen(pMeta->pTaskDb, &pCur, NULL) < 0) { // no task info, no stream + if (tdbTbcOpen(pMeta->pTaskDb, &pCur, NULL) < 0) { // no task info, no stream return ret; } @@ -262,8 +262,9 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, SStreamTask* pTask, const char* key) } STaskDbWrapper* pBackend = NULL; + int64_t processVer = -1; while (1) { - pBackend = taskDbOpen(pMeta->path, key, chkpId); + pBackend = taskDbOpen(pMeta->path, key, chkpId, &processVer); if (pBackend != NULL) { break; } @@ -557,7 +558,7 @@ int32_t streamMetaSaveTask(SStreamMeta* pMeta, SStreamTask* pTask) { return -1; } - if (pTask->ver < SSTREAM_TASK_SUBTABLE_CHANGED_VER){ + if (pTask->ver < SSTREAM_TASK_SUBTABLE_CHANGED_VER) { pTask->ver = SSTREAM_TASK_VER; } @@ -907,7 +908,7 @@ void streamMetaLoadAllTasks(SStreamMeta* 
pMeta) { if (p == NULL) { code = pMeta->buildTaskFn(pMeta->ahandle, pTask, pTask->chkInfo.checkpointVer + 1); if (code < 0) { - stError("failed to expand s-task:0x%"PRIx64", code:%s, continue", id.taskId, tstrerror(terrno)); + stError("failed to expand s-task:0x%" PRIx64 ", code:%s, continue", id.taskId, tstrerror(terrno)); tFreeStreamTask(pTask); continue; } @@ -1012,7 +1013,7 @@ static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { for (int32_t i = 0; i < numOfTasks; ++i) { SStreamTaskId* pId = taosArrayGet(pMeta->pTaskList, i); - STaskId id = {.streamId = pId->streamId, .taskId = pId->taskId}; + STaskId id = {.streamId = pId->streamId, .taskId = pId->taskId}; SStreamTask** pTask = taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (pTask == NULL) { continue; @@ -1052,12 +1053,14 @@ static int32_t metaHeartbeatToMnodeImpl(SStreamMeta* pMeta) { } if ((*pTask)->chkInfo.pActiveInfo->activeId != 0) { - entry.checkpointInfo.failed = ((*pTask)->chkInfo.pActiveInfo->failedId >= (*pTask)->chkInfo.pActiveInfo->activeId) ? 1 : 0; + entry.checkpointInfo.failed = + ((*pTask)->chkInfo.pActiveInfo->failedId >= (*pTask)->chkInfo.pActiveInfo->activeId) ? 
1 : 0; entry.checkpointInfo.activeId = (*pTask)->chkInfo.pActiveInfo->activeId; entry.checkpointInfo.activeTransId = (*pTask)->chkInfo.pActiveInfo->transId; if (entry.checkpointInfo.failed) { - stInfo("s-task:%s set kill checkpoint trans in hb, transId:%d", (*pTask)->id.idStr, (*pTask)->chkInfo.pActiveInfo->transId); + stInfo("s-task:%s set kill checkpoint trans in hb, transId:%d", (*pTask)->id.idStr, + (*pTask)->chkInfo.pActiveInfo->transId); } } @@ -1384,7 +1387,7 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta, __stream_task_expand_fn expa int64_t now = taosGetTimestampMs(); int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); - stInfo("vgId:%d start to check all %d stream task(s) downstream status, start ts:%"PRId64, vgId, numOfTasks, now); + stInfo("vgId:%d start to check all %d stream task(s) downstream status, start ts:%" PRId64, vgId, numOfTasks, now); if (numOfTasks == 0) { stInfo("vgId:%d no tasks to be started", pMeta->vgId); @@ -1513,8 +1516,8 @@ bool streamMetaAllTasksReady(const SStreamMeta* pMeta) { int32_t num = taosArrayGetSize(pMeta->pTaskList); for (int32_t i = 0; i < num; ++i) { SStreamTaskId* pId = taosArrayGet(pMeta->pTaskList, i); - STaskId id = {.streamId = pId->streamId, .taskId = pId->taskId}; - SStreamTask** ppTask = taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); + STaskId id = {.streamId = pId->streamId, .taskId = pId->taskId}; + SStreamTask** ppTask = taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); if (ppTask == NULL) { continue; } @@ -1598,7 +1601,7 @@ int32_t streamMetaAddTaskLaunchResult(SStreamMeta* pMeta, int64_t streamId, int3 if (pStartInfo->startAllTasks != 1) { int64_t el = endTs - startTs; stDebug("vgId:%d not start all task(s), not record status, s-task:0x%x launch succ:%d elapsed time:%" PRId64 "ms", - pMeta->vgId, taskId, ready, el); + pMeta->vgId, taskId, ready, el); streamMetaWUnLock(pMeta); return 0; } @@ -1725,7 +1728,8 @@ void streamMetaAddIntoUpdateTaskList(SStreamMeta* pMeta, SStreamTask* pTask, SSt 
taosHashPut(pMeta->updateInfo.pTasks, &hEntry, sizeof(hEntry), NULL, 0); stDebug("s-task:%s vgId:%d transId:%d task nodeEp update completed, streamTask/hTask closed, elapsed:%" PRId64 - " ms", id, vgId, transId, el); + " ms", + id, vgId, transId, el); } else { stDebug("s-task:%s vgId:%d transId:%d task nodeEp update completed, streamTask closed, elapsed time:%" PRId64 "ms", id, vgId, transId, el); From 49ba8132c0f7388aad1c81be1e3b61e0acdfdcef Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Tue, 25 Jun 2024 12:04:10 +0000 Subject: [PATCH 03/92] add self check info --- source/libs/stream/src/streamBackendRocksdb.c | 134 +++++++++++++++--- source/libs/stream/src/streamSnapshot.c | 25 +++- 2 files changed, 139 insertions(+), 20 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 4278757136..2642f608d9 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1178,6 +1178,7 @@ int32_t chkpPreBuildDir(char* path, int64_t chkpId, char** chkpDir, char** chkpI } int32_t taskDbBuildSnap(void* arg, SArray* pSnap) { + // vnode task->db SStreamMeta* pMeta = arg; taosThreadMutexLock(&pMeta->backendMutex); @@ -1186,27 +1187,44 @@ int32_t taskDbBuildSnap(void* arg, SArray* pSnap) { while (pIter) { STaskDbWrapper* pTaskDb = *(STaskDbWrapper**)pIter; - taskDbAddRef(pTaskDb); - int64_t chkpId = pTaskDb->chkpId; - taskDbRefChkp(pTaskDb, chkpId); - code = taskDbDoCheckpoint(pTaskDb, chkpId, 0); - if (code != 0) { - taskDbUnRefChkp(pTaskDb, chkpId); + void* p = taskDbAddRef(pTaskDb); + if (p == NULL) { + terrno = 0; + pIter = taosHashIterate(pMeta->pTaskDbUnique, pIter); + continue; } - taskDbRemoveRef(pTaskDb); + // add chkpId to in-use-ckpkIdSet + taskDbRefChkp(pTaskDb, pTaskDb->chkpId); + + code = taskDbDoCheckpoint(pTaskDb, pTaskDb->chkpId, ((SStreamTask*)pTaskDb->pTask)->chkInfo.processedVer); + if (code != 0) { + // remove chkpId from in-use-ckpkIdSet 
+ taskDbUnRefChkp(pTaskDb, pTaskDb->chkpId); + taskDbRemoveRef(pTaskDb); + code = -1; + break; + } SStreamTask* pTask = pTaskDb->pTask; SStreamTaskSnap snap = {.streamId = pTask->id.streamId, .taskId = pTask->id.taskId, .chkpId = pTaskDb->chkpId, .dbPrefixPath = taosStrdup(pTaskDb->path)}; + if (snap.dbPrefixPath == NULL) { + // remove chkpid from chkp-in-use set + taskDbUnRefChkp(pTaskDb, pTaskDb->chkpId); + taskDbRemoveRef(pTaskDb); + terrno = TSDB_CODE_OUT_OF_MEMORY; + code = -1; + break; + } taosArrayPush(pSnap, &snap); + pIter = taosHashIterate(pMeta->pTaskDbUnique, pIter); } taosThreadMutexUnlock(&pMeta->backendMutex); - return code; } int32_t taskDbDestroySnap(void* arg, SArray* pSnapInfo) { @@ -2172,23 +2190,35 @@ void taskDbDestroyChkpOpt(STaskDbWrapper* pTaskDb) { int32_t taskDbBuildFullPath(char* path, char* key, char** dbFullPath, char** stateFullPath) { int32_t code = 0; + char* statePath = taosMemoryCalloc(1, strlen(path) + 128); + if (statePath == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } - char* statePath = taosMemoryCalloc(1, strlen(path) + 128); sprintf(statePath, "%s%s%s", path, TD_DIRSEP, key); if (!taosDirExist(statePath)) { code = taosMulMkDir(statePath); if (code != 0) { - stError("failed to create dir: %s, reason:%s", statePath, tstrerror(code)); + terrno = errno; + stError("failed to create dir: %s, reason:%s", statePath, tstrerror(terrno)); taosMemoryFree(statePath); return code; } } char* dbPath = taosMemoryCalloc(1, strlen(statePath) + 128); + if (dbPath == NULL) { + taosMemoryFree(statePath); + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + sprintf(dbPath, "%s%s%s", statePath, TD_DIRSEP, "state"); if (!taosDirExist(dbPath)) { code = taosMulMkDir(dbPath); if (code != 0) { + terrno = errno; stError("failed to create dir: %s, reason:%s", dbPath, tstrerror(code)); taosMemoryFree(statePath); taosMemoryFree(dbPath); @@ -2384,6 +2414,11 @@ int32_t taskDbGenChkpUploadData__rsync(STaskDbWrapper* pDb, int64_t chkpId, char 
} char* buf = taosMemoryCalloc(1, strlen(pDb->path) + 128); + if (buf == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + sprintf(buf, "%s%s%s%s%s%" PRId64 "", pDb->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkpId); if (taosIsDir(buf)) { code = 0; @@ -2402,6 +2437,11 @@ int32_t taskDbGenChkpUploadData__s3(STaskDbWrapper* pDb, void* bkdChkpMgt, int64 SBkdMgt* p = (SBkdMgt*)bkdChkpMgt; char* temp = taosMemoryCalloc(1, strlen(pDb->path) + 32); + if (temp == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + sprintf(temp, "%s%s%s%" PRId64, pDb->path, TD_DIRSEP, "tmp", chkpId); if (taosDirExist(temp)) { @@ -4239,14 +4279,12 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { if (strlen(name) == currLen && strcmp(name, pCurrent) == 0) { taosMemoryFreeClear(p->pCurrent); p->pCurrent = taosStrdup(name); - // taosHashPut(p->pSstTbl[1 - p->idx], name, strlen(name), &dummy, sizeof(dummy)); continue; } if (strlen(name) >= maniLen && strncmp(name, pManifest, maniLen) == 0) { taosMemoryFreeClear(p->pManifest); p->pManifest = taosStrdup(name); - // taosHashPut(p->pSstTbl[1 - p->idx], name, strlen(name), &dummy, sizeof(dummy)); continue; } if (strlen(name) >= sstLen && strncmp(name + strlen(name) - 4, pSST, sstLen) == 0) { @@ -4301,31 +4339,75 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { return 0; } +void dbChkpDestroy(SDbChkp* pChkp); + SDbChkp* dbChkpCreate(char* path, int64_t initChkpId) { SDbChkp* p = taosMemoryCalloc(1, sizeof(SDbChkp)); + if (p == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } + p->curChkpId = initChkpId; p->preCkptId = -1; p->pSST = taosArrayInit(64, sizeof(void*)); + if (p->pSST == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + dbChkpDestroy(p); + return NULL; + } + p->path = path; p->len = strlen(path) + 128; p->buf = taosMemoryCalloc(1, p->len); + if (p->buf == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } p->idx = 0; p->pSstTbl[0] = 
taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), false, HASH_ENTRY_LOCK); + if (p->pSstTbl[0] == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } + p->pSstTbl[1] = taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), false, HASH_ENTRY_LOCK); + if (p->pSstTbl[1] == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } p->pAdd = taosArrayInit(64, sizeof(void*)); + if (p->pAdd == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } + p->pDel = taosArrayInit(64, sizeof(void*)); + if (p->pDel == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } + p->update = 0; taosThreadRwlockInit(&p->rwLock, NULL); SArray* list = NULL; int32_t code = dbChkpGetDelta(p, initChkpId, list); + if (code != 0) { + goto _EXIT; + } return p; +_EXIT: + dbChkpDestroy(p); + return NULL; } void dbChkpDestroy(SDbChkp* pChkp) { + if (pChkp == NULL) return; + taosMemoryFree(pChkp->buf); taosMemoryFree(pChkp->path); @@ -4357,6 +4439,11 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { char* srcDir = taosMemoryCalloc(1, len); char* dstDir = taosMemoryCalloc(1, len); + if (srcBuf == NULL || dstBuf == NULL || srcDir == NULL || dstDir == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _ERROR; + } + sprintf(srcDir, "%s%s%s%s%s%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", p->curChkpId); sprintf(dstDir, "%s", dname); @@ -4375,7 +4462,8 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { sprintf(dstBuf, "%s%s%s", dstDir, TD_DIRSEP, filename); if (taosCopyFile(srcBuf, dstBuf) < 0) { - stError("failed to copy file from %s to %s", srcBuf, dstBuf); + terrno = errno; + stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(terrno)); goto _ERROR; } } @@ -4392,7 +4480,8 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { sprintf(srcBuf, "%s%s%s", srcDir, TD_DIRSEP, p->pCurrent); sprintf(dstBuf, "%s%s%s_%" PRId64 "", dstDir, TD_DIRSEP, 
p->pCurrent, p->curChkpId); if (taosCopyFile(srcBuf, dstBuf) < 0) { - stError("failed to copy file from %s to %s", srcBuf, dstBuf); + terrno = errno; + stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(terrno)); goto _ERROR; } @@ -4402,7 +4491,8 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { sprintf(srcBuf, "%s%s%s", srcDir, TD_DIRSEP, p->pManifest); sprintf(dstBuf, "%s%s%s_%" PRId64 "", dstDir, TD_DIRSEP, p->pManifest, p->curChkpId); if (taosCopyFile(srcBuf, dstBuf) < 0) { - stError("failed to copy file from %s to %s", srcBuf, dstBuf); + terrno = errno; + stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(terrno)); goto _ERROR; } @@ -4412,17 +4502,21 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { TdFilePtr pFile = taosOpenFile(dstDir, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); if (pFile == NULL) { - stError("chkp failed to create meta file: %s", dstDir); + terrno = errno; + stError("chkp failed to create meta file: %s, reason:%s", dstDir, tstrerror(terrno)); goto _ERROR; } char content[128] = {0}; snprintf(content, sizeof(content), "%s_%" PRId64 "\n%s_%" PRId64 "", p->pCurrent, p->curChkpId, p->pManifest, p->curChkpId); if (taosWriteFile(pFile, content, strlen(content)) <= 0) { - stError("chkp failed to write meta file: %s", dstDir); + terrno = errno; + stError("chkp failed to write meta file: %s,reason:%s", dstDir, tstrerror(terrno)); taosCloseFile(&pFile); + code = -1; goto _ERROR; } + taosCloseFile(&pFile); // clear delta data buf @@ -4471,6 +4565,12 @@ int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, if (pChkp == NULL) { char* path = taosMemoryCalloc(1, strlen(bm->path) + 64); + if (path == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosThreadRwlockUnlock(&bm->rwLock); + return -1; + } + sprintf(path, "%s%s%s", bm->path, TD_DIRSEP, taskId); SDbChkp* p = dbChkpCreate(path, chkpId); diff --git 
a/source/libs/stream/src/streamSnapshot.c b/source/libs/stream/src/streamSnapshot.c index adefe97f1f..868ff002bf 100644 --- a/source/libs/stream/src/streamSnapshot.c +++ b/source/libs/stream/src/streamSnapshot.c @@ -24,6 +24,7 @@ enum SBackendFileType { ROCKSDB_SST_TYPE = 3, ROCKSDB_CURRENT_TYPE = 4, ROCKSDB_CHECKPOINT_META_TYPE = 5, + ROCKSDB_CHECKPOINT_SELFCHECK_TYPE = 6, }; typedef struct SBackendFileItem { @@ -49,6 +50,7 @@ typedef struct SBackendSnapFiles2 { char* pOptions; SArray* pSst; char* pCheckpointMeta; + char* pCheckpointSelfcheck; char* path; int64_t checkpointId; @@ -111,6 +113,7 @@ const char* ROCKSDB_MAINFEST = "MANIFEST"; const char* ROCKSDB_SST = "sst"; const char* ROCKSDB_CURRENT = "CURRENT"; const char* ROCKSDB_CHECKPOINT_META = "CHECKPOINT"; +const char* ROCKSDB_CHECKPOINT_SELF_CHECK = "info"; static int64_t kBlockSize = 64 * 1024; int32_t streamSnapHandleInit(SStreamSnapHandle* handle, char* path, void* pMeta); @@ -127,6 +130,7 @@ int32_t streamGetFileSize(char* path, char* name, int64_t* sz) { int32_t ret = 0; char* fullname = taosMemoryCalloc(1, strlen(path) + 32); + sprintf(fullname, "%s%s%s", path, TD_DIRSEP, name); ret = taosStatFile(fullname, sz, NULL, NULL); @@ -148,7 +152,8 @@ int32_t streamDestroyTaskDbSnapInfo(void* arg, SArray* snap) { return taskDbDest void snapFileDebugInfo(SBackendSnapFile2* pSnapFile) { if (qDebugFlag & DEBUG_DEBUG) { - char* buf = taosMemoryCalloc(1, 512); + int16_t cap = 511; + char* buf = taosMemoryCalloc(1, cap + 1); sprintf(buf + strlen(buf), "["); if (pSnapFile->pCurrent) sprintf(buf, "current: %s,", pSnapFile->pCurrent); @@ -157,10 +162,10 @@ void snapFileDebugInfo(SBackendSnapFile2* pSnapFile) { if (pSnapFile->pSst) { for (int32_t i = 0; i < taosArrayGetSize(pSnapFile->pSst); i++) { char* name = taosArrayGetP(pSnapFile->pSst, i); - sprintf(buf + strlen(buf), "%s,", name); + if (strlen(buf) + strlen(name) < cap) sprintf(buf + strlen(buf), "%s,", name); } } - sprintf(buf + strlen(buf) - 1, "]"); + if 
((strlen(buf)) < cap) sprintf(buf + strlen(buf) - 1, "]"); stInfo("%s %" PRId64 "-%" PRId64 " get file list: %s", STREAM_STATE_TRANSFER, pSnapFile->snapInfo.streamId, pSnapFile->snapInfo.taskId, buf); @@ -199,6 +204,13 @@ int32_t snapFileGenMeta(SBackendSnapFile2* pSnapFile) { // meta item.name = pSnapFile->pCheckpointMeta; item.type = ROCKSDB_CHECKPOINT_META_TYPE; + if (streamGetFileSize(pSnapFile->path, item.name, &item.size) == 0) { + taosArrayPush(pSnapFile->pFileList, &item); + } + + item.name = pSnapFile->pCheckpointSelfcheck; + item.type = ROCKSDB_CHECKPOINT_SELFCHECK_TYPE; + if (streamGetFileSize(pSnapFile->path, item.name, &item.size) == 0) { taosArrayPush(pSnapFile->pFileList, &item); } @@ -231,6 +243,11 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { pSnapFile->pCheckpointMeta = taosStrdup(name); continue; } + if (strlen(name) >= strlen(ROCKSDB_CHECKPOINT_SELF_CHECK) && + 0 == strncmp(name, ROCKSDB_CHECKPOINT_SELF_CHECK, strlen(ROCKSDB_CHECKPOINT_SELF_CHECK))) { + pSnapFile->pCheckpointSelfcheck = taosStrdup(name); + continue; + } if (strlen(name) >= strlen(ROCKSDB_SST) && 0 == strncmp(name + strlen(name) - strlen(ROCKSDB_SST), ROCKSDB_SST, strlen(ROCKSDB_SST))) { char* sst = taosStrdup(name); @@ -276,6 +293,7 @@ void snapFileDestroy(SBackendSnapFile2* pSnap) { taosMemoryFree(pSnap->pMainfest); taosMemoryFree(pSnap->pOptions); taosMemoryFree(pSnap->path); + taosMemoryFree(pSnap->pCheckpointSelfcheck); for (int32_t i = 0; i < taosArrayGetSize(pSnap->pSst); i++) { char* sst = taosArrayGetP(pSnap->pSst, i); taosMemoryFree(sst); @@ -298,6 +316,7 @@ int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta SArray* pSnapInfoSet = taosArrayInit(4, sizeof(SStreamTaskSnap)); int32_t code = streamCreateTaskDbSnapInfo(pMeta, path, pSnapInfoSet); if (code != 0) { + stError("failed to do task db snap info, reason:%s", tstrerror(terrno)); taosArrayDestroy(pSnapInfoSet); return -1; } From 061648071e2b1c07434d6ef7833b778c972f7c84 Mon 
Sep 17 00:00:00 2001 From: Yihao Deng Date: Tue, 25 Jun 2024 12:35:25 +0000 Subject: [PATCH 04/92] add self check info --- source/libs/stream/src/streamBackendRocksdb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 2642f608d9..b5ae2ab3dd 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -4270,7 +4270,13 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { taosArrayClearP(p->pDel, taosMemoryFree); taosHashClear(p->pSstTbl[1 - p->idx]); - TdDirPtr pDir = taosOpenDir(p->buf); + TdDirPtr pDir = taosOpenDir(p->buf); + if (pDir == NULL) { + terrno = errno; + taosThreadRwlockUnlock(&p->rwLock); + return -1; + } + TdDirEntryPtr de = NULL; int8_t dummy = 0; while ((de = taosReadDir(pDir)) != NULL) { From ac351c5b58515e5a3d5b17699c06f9520f715f20 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Wed, 26 Jun 2024 01:26:39 +0000 Subject: [PATCH 05/92] add self check info --- source/libs/stream/src/streamBackendRocksdb.c | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index b5ae2ab3dd..5e8f45e8a2 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -4230,23 +4230,24 @@ void strArrayDebugInfo(SArray* pArr, char** buf) { *buf = p; } void dbChkpDebugInfo(SDbChkp* pDb) { - // stTrace("chkp get file list: curr"); - char* p[4] = {NULL}; + if (stDebugFlag & DEBUG_INFO) { + char* p[4] = {NULL}; - hashTableToDebug(pDb->pSstTbl[pDb->idx], &p[0]); - stTrace("chkp previous file: [%s]", p[0]); + hashTableToDebug(pDb->pSstTbl[pDb->idx], &p[0]); + stTrace("chkp previous file: [%s]", p[0]); - hashTableToDebug(pDb->pSstTbl[1 - pDb->idx], &p[1]); - stTrace("chkp curr file: [%s]", p[1]); + 
hashTableToDebug(pDb->pSstTbl[1 - pDb->idx], &p[1]); + stTrace("chkp curr file: [%s]", p[1]); - strArrayDebugInfo(pDb->pAdd, &p[2]); - stTrace("chkp newly addded file: [%s]", p[2]); + strArrayDebugInfo(pDb->pAdd, &p[2]); + stTrace("chkp newly addded file: [%s]", p[2]); - strArrayDebugInfo(pDb->pDel, &p[3]); - stTrace("chkp newly deleted file: [%s]", p[3]); + strArrayDebugInfo(pDb->pDel, &p[3]); + stTrace("chkp newly deleted file: [%s]", p[3]); - for (int i = 0; i < 4; i++) { - taosMemoryFree(p[i]); + for (int i = 0; i < 4; i++) { + taosMemoryFree(p[i]); + } } } int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { From 0a35d7ef6b9a3f7cb26f0ea90d2c3fb0d46bdd47 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 27 Jun 2024 01:33:25 +0000 Subject: [PATCH 06/92] add self check --- source/libs/stream/src/streamBackendRocksdb.c | 7 +++---- source/libs/stream/src/streamCheckpoint.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 5e8f45e8a2..c12ab68607 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -558,7 +558,6 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId goto _EXIT; } - // int32_t pathLen = strlen(path) + 48; char* checkpointRoot = taosMemoryCalloc(1, pathLen + 48); if (checkpointRoot == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; @@ -575,15 +574,15 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId stDebug("%s check local backend dir:%s, checkpointId:%" PRId64 " succ", key, defaultPath, chkptId); - char* chkptPath = taosMemoryCalloc(1, pathLen); + char* chkptPath = taosMemoryCalloc(1, pathLen + 128); if (chkptPath == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; goto _EXIT; } if (chkptId > 0) { - snprintf(chkptPath, pathLen, "%s%s%s%s%s%" PRId64 "", prefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, 
"checkpoint", - chkptId); + snprintf(chkptPath, pathLen + 127, "%s%s%s%s%s%" PRId64 "", prefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, + "checkpoint", chkptId); code = rebuildFromLocalCheckpoint(key, chkptPath, chkptId, defaultPath, processVer); if (code != 0) { diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index af7e969c07..26df7b1627 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -667,7 +667,7 @@ int32_t streamTaskBuildCheckpoint(SStreamTask* pTask) { if (pTask->info.taskLevel != TASK_LEVEL__SINK) { stDebug("s-task:%s level:%d start gen checkpoint, checkpointId:%" PRId64, id, pTask->info.taskLevel, ckId); - int64_t ver = 0; + int64_t ver = pTask->chkInfo.processedVer; code = streamBackendDoCheckpoint(pTask->pBackend, ckId, ver); if (code != TSDB_CODE_SUCCESS) { stError("s-task:%s gen checkpoint:%" PRId64 " failed, code:%s", id, ckId, tstrerror(terrno)); From f2fc09cd023a9e7340d2cb633dc72fec0f3e45e7 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 27 Jun 2024 01:48:59 +0000 Subject: [PATCH 07/92] add self check --- source/libs/stream/src/streamBackendRocksdb.c | 4 ++++ source/libs/stream/src/streamMeta.c | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index c12ab68607..59916b8c0d 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -615,6 +615,10 @@ _EXIT: bool streamBackendDataIsExist(const char* path, int64_t chkpId, int32_t vgId) { bool exist = true; char* state = taosMemoryCalloc(1, strlen(path) + 32); + if (state == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return false; + } sprintf(state, "%s%s%s", path, TD_DIRSEP, "state"); if (!taosDirExist(state)) { exist = false; diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 
864f9514da..08e373fa56 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -195,7 +195,8 @@ int32_t streamMetaCvtDbFormat(SStreamMeta* pMeta) { bool exist = streamBackendDataIsExist(pMeta->path, chkpId, pMeta->vgId); if (exist == false) { - return code; + stError("failed to check backend data exist, reason:%s", tstrerror(terrno)); + return -1; } SBackendWrapper* pBackend = streamBackendInit(pMeta->path, chkpId, pMeta->vgId); @@ -319,7 +320,8 @@ SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskBuild buildTas } if (streamMetaMayCvtDbFormat(pMeta) < 0) { - stError("vgId:%d convert sub info format failed, open stream meta failed", pMeta->vgId); + stError("vgId:%d convert sub info format failed, open stream meta failed, reason: %s", pMeta->vgId, + tstrerror(terrno)); goto _err; } From 7c328f0cfaed5a8dbfdd57d0ee7d4f86fde19ac9 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 27 Jun 2024 05:28:06 +0000 Subject: [PATCH 08/92] add self check --- source/libs/stream/src/streamBackendRocksdb.c | 36 ++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 59916b8c0d..a710f2531a 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -234,10 +234,25 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { } int32_t remoteChkp_readMetaData(char* path, SArray* list) { - char* metaPath = taosMemoryCalloc(1, strlen(path)); - sprintf(metaPath, "%s%s%s", path, TD_DIRSEP, "META"); + int32_t cap = strlen(path); + char* metaPath = taosMemoryCalloc(1, cap + 32); + if (metaPath == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + if (sprintf(metaPath, "%s%s%s", path, TD_DIRSEP, "META") >= (cap + 32)) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosMemoryFree(metaPath); + return -1; + } TdFilePtr pFile = 
taosOpenFile(path, TD_FILE_READ); + if (pFile == NULL) { + terrno = TAOS_SYSTEM_ERROR(errno); + taosMemoryFree(metaPath); + return -1; + } char buf[128] = {0}; if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { @@ -281,6 +296,10 @@ int32_t remoteChkp_validAndCvtMeta(char* path, SArray* list, int64_t chkpId) { int32_t len = strlen(path) + 32; char* src = taosMemoryCalloc(1, len); char* dst = taosMemoryCalloc(1, len); + if (src == NULL || dst == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } int8_t count = 0; for (int i = 0; i < taosArrayGetSize(list); i++) { @@ -4461,6 +4480,11 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { stError("failed to dump srcDir %s, reason: not exist such dir", srcDir); goto _ERROR; } + int64_t chkpId = 0, processId = -1; + if (chkpLoadExtraInfo(srcDir, &chkpId, &processId) != 0) { + stError("failed to load extra info from %s, reason:%s", srcDir, terrno != 0 ? "unkown" : tstrerror(terrno)); + goto _ERROR; + } // add file to $name dir for (int i = 0; i < taosArrayGetSize(p->pAdd); i++) { @@ -4516,9 +4540,13 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { stError("chkp failed to create meta file: %s, reason:%s", dstDir, tstrerror(terrno)); goto _ERROR; } + // META_ON_S3 + // current_checkpointID + // manifest_checkpointID + // processVer_processID char content[128] = {0}; - snprintf(content, sizeof(content), "%s_%" PRId64 "\n%s_%" PRId64 "", p->pCurrent, p->curChkpId, p->pManifest, - p->curChkpId); + snprintf(content, sizeof(content), "%s_%" PRId64 "\n%s_%" PRId64 "\n%s_%" PRId64 "", p->pCurrent, p->curChkpId, + p->pManifest, p->curChkpId, "processVer", processId); if (taosWriteFile(pFile, content, strlen(content)) <= 0) { terrno = errno; stError("chkp failed to write meta file: %s,reason:%s", dstDir, tstrerror(terrno)); From 1004ac69245014fe63854f6b28bcb4f44956acd9 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 27 Jun 2024 05:28:17 +0000 Subject: [PATCH 09/92] add self check --- 
source/libs/stream/src/streamMeta.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 08e373fa56..c74689fa9e 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -196,7 +196,7 @@ int32_t streamMetaCvtDbFormat(SStreamMeta* pMeta) { bool exist = streamBackendDataIsExist(pMeta->path, chkpId, pMeta->vgId); if (exist == false) { stError("failed to check backend data exist, reason:%s", tstrerror(terrno)); - return -1; + return code; } SBackendWrapper* pBackend = streamBackendInit(pMeta->path, chkpId, pMeta->vgId); @@ -283,6 +283,8 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, SStreamTask* pTask, const char* key) pBackend->pTask = pTask; pBackend->pMeta = pMeta; + pTask->chkInfo.processedVer = processVer; + taosHashPut(pMeta->pTaskDbUnique, key, strlen(key), &pBackend, sizeof(void*)); taosThreadMutexUnlock(&pMeta->backendMutex); From de77ce6480ec6cbd4ffea080a92dcee1639e2e5f Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 27 Jun 2024 09:51:49 +0000 Subject: [PATCH 10/92] add self check --- out | Bin 0 -> 21360 bytes source/libs/stream/src/streamBackendRocksdb.c | 199 ++++++++++++------ source/libs/stream/src/streamCheckpoint.c | 56 +++-- t.c | 12 ++ 4 files changed, 182 insertions(+), 85 deletions(-) create mode 100755 out create mode 100644 t.c diff --git a/out b/out new file mode 100755 index 0000000000000000000000000000000000000000..21f5cbee379517922a226c62b551376a5b0ad2f4 GIT binary patch literal 21360 zcmeHPe{@vUoxg7;Ap;3XApFM4L;*pXA%Q>vK@vh>QWKH|Qh(ORWacG#W-^n$c>{@c z6&KwjC|hdludZ6y(~76nF0O4?yS7DLslU359?vdy&u(eER@hn>Q*==)v!Cz1_d84; z8T7dO$DTcNleypf{oL>Od%yR4?|bjQpEotPuJaj&;N%yV3gWg*P)LKWIMJdDAPr)f z7>|6em?Or3FOrz5Z%_oaDjo5SqydflK}l~K6-J@wDOfON4~ddqz7!m*YM6>j#gkq! 
z6*cdrZQ2i}T%T<(NXke-nP+K{?jO@!J&H|jjgqfjHIfeK_69X&Ju*jnS7^N}v>wx- zZjUL)lVU=@W!j#X)}oFQQ?FjH*7MS}N=-1OHYja+!;O8K_ByoQ;h;1lyp-Ev%I*CL z^e8T$H1RNR)a~UPhr>FaOliC+EsWb;OY0ZLV>NL*kshoWT(-1kX?-Y_3@wuNNZ@c2z9@S0K^(24KuBzuw{@baq9eVEzZ_RrzHc;|N>^mewev=N# zP$GNUMafgS96yp#@wi#ygi(6M`iFj18a>|#c;YBk>+;~#PH+_Yt~~PO|7h}8=8^ww z9{C&d$Um6}xAWjv=fRKW!SBt3zXJRM{LV_I+0&m)m?<~nxTY!0md$Nu%yO(AJLOu= z=C-DIGGT3wbj2-IH+qd3NktN-ov>XIjl|>0s7NIS96RB5iT(NM}w zI#wNn=>%Ifd*aEiNIbhASN?1TVM&eEo^l$fY0t#55v(_}6REv^?_TohV@a2cci`GL}0raS}6+w1qg zdN-!EUyQ}XBP>0`#va`#CecJhTBi4hy#Jb7rSJw1-k>LW*n_{X@eU8p8%3Z&*<<_6 z2R-E5!PWjv)f~vd zd2FcWi5y%X`lZ6b9DJNIA@k>R@X{Roa1K5`2QL&05hz5U5P?Dj3K97K7=b@joc}?1 z$6JB$?Zq!F6e4`0e|@TzAR4*lHucD&GK~MA3lC_mZwF<@bNEYd0GGr zACE8(@8(GF)(Yt#_#cGG+cW&59{$@N{s9kvmxtf&;cxZuf9m1WxqRL4TXw#3ZFuLe z!#n={M91dlx`TBugm6hbsK zV>3r9W^E_AXElZVFQ{_&l2?h$KONqABK-8*tHV#9@`sH>;a5((lVIQ@TuvYZfy~iv z85h>4@!P(Nx+~HPI>S3wy-pV%!aLt^%fh#>`Yo7a_nppUj>Qn{L&g1&GrkDjZ2P2t zd>bmM&T1i_vA;hwvx5soj-d5_2Sr;SurOT92@LPcdwX}LIL-*faG49|5UA99BOT< z6K*Ifs8lPAnf^;~+ArvmLC1e)GJ~KeK=*=*6PXO1bnXG&0y^|gCUY2c{%|I90(3oS zrM`JV_i8B_*KZWYV5Kp$tR%1pb%fJ13G-lME(xqHk+-g9qgapM;kR{L<(2Eor+vPn zbX#D%SUqFK#fxWOK;oo-4Sr$xBozrnBHYEV0r(yQbfUKIz;7SgJ3v@-dF7w`n#Pau z?T1Iomq!5Cp)cp>)_2Nk>FdkD7ovV%ZoR4M2`Z#Q1PT!-M4%9XLIesCC`6zTfkFfd z5jb}Qvd^9A=gIiFGJcLspGJsbZHD=b=F8Z;J_*20E}0ti^K3!Q^K)L+nx|Wzl%^|^ zeI9P6=J}jrhUWSCH#(1?#P#o<&LoLHpdS+BXTlnEfuFrPqU-rYEvP%f&rVGvBS@1K z$>v&34C?2q_}Q-`T7u4ZDDg9AQxz-1x`T`#(2j7w@73+|vtHc3JokYOmisWH`Ui0d z;SQec?{;kucPu5Z&C#i$`Eux zbg-^2R3ECXsY^???#4xnLbdhtH7rkc)imQtFw&bl;6t%6WMx$bjO^25Ei~x{+_W}! 
zl*VCV@&~9jc1Pv1BPs)r!JMx~akvr?=qG+*6&Zm$V#;&GFD~tt zyv!=6J__o)+9=(I21QZj1IUl~NTc#)^x+q>IPElf^Gko}I^Z`|+yl-(W#SjW2xIyi z*itL~^8;7H_=LYec5%fCvNeGo?_VH$7ntEjdW8S8k|`TI4TeK+K}5DW_8G7X*TD+0 z#@H5^s7+|rH}=N>>pqV@Q*CS!Gq<2Sy1@^GW2w8=?B!t#Mi)HQgI(#=mI|-6~$B#GnWJM zH&xt)p3rDdq^h-3S~R3*_l~Vc$j2GQ0hwTI3pAqvl$$aE*sRLCHMg`wBJsP09S7?>LF=mCTF2xm2@HGB5f5i5&fwWL^%?&b#O# z8H&eu{h%?>Ne1^zzI^G?W>M4x)qTbT6qdh|VG%{c@cLT@9gh6p))1_S{l>EdJ*=VM zLh(BWHRk`WhHAk)YEYp3-;uAeZ7fC!d>c=@f$=ok#?!`aT#M}d@kQvaaR>4DQT7mJk5hJ#vY%7- zI%RKCc9JrKoGYhnCS~&|TSD0y%GxQrhBBM7A!M`0R7^-gGf1znjF~#F7)_4#XICV?02Md>A_%S&6TR!%>!6!BakR#Y386fduyFbxt#%MC~s*!OaWA${hOI4+b@|DwL zRb;%BjMrm*RTdD^t?EhS#P|wPS{XyVB+oiW_@QO|(u3pI*a4h-a zPG4JOup{F3wpxiEx0fvEHQZF++;Lg|b)v5=8B52l&d!##WF35Da8tU!KjI8&E9)(H zgEi>3B=8zgI_lzGo{p&KYlOS&<(obmt=RQ=!6@6AyiL~CZEjp|c5b+AL;K|$nh}o( zUIl8k6P9Y2EH=`MN0E(oPp_-($qu5oF$&?iI#gFn)|)yvZEkOaak3`E*lKmVBV3eK zEeY3&w0B)+MY$4fxUvni-|8Q5lHEBrq(dW&TyDqXohfTwB;|G{w%Lgo)cZfqMIH0& zAUo(ph6Ezp;aFSkWI8o6x?Ra+976PxlX@dd&H_OlT-)5(+S=YE`Zijw?N~AIZ0c)v zoTSrcrBaa|YfZYl+j3-h@eWu|kPZgucND{%jSqU!*p2tZHbs*IdJt(2Z0>ceh_bGR zr+E-UcqOZ&3G+{+6DhkVVa0;c-iV{DH9F{1Q!?3Svl(imDVcEXo;12hCA__5IqmIh z)Zj{=WR^8Y9-3zFHtIUXM{hHjapxNQcgVi(KfW^{v301oG|WbC!qqE-6@C^ZM&&nyqFjC z#1es2yA{M0(Y2hYH9#A&=sV*w##=Rm%|Z@x*oq9OCeCbB=Zq|-Huh08;A#`5txo&6 z(TZDLZ`#{}v!dlg1VVkg{U?Wbnca?h0H(1jXtZ82qtVy}CRXvHc)UWC8XlV-8 zbSIKEU1>Y++6l#2gL+I;ig#SsMk(2EyEWZ(ps5*1%Ap6CmV2ujmtF@Y@vU~s?y}>y zJG83Tj>Rl((jJJ^*wJnXM7wLSH*~S1#H3l(osP!=Mp8qGXs?q@V7spInoQ9Fj+K%Q zx&T3Rr(0D-JjGD7_sGH8r&t z((XcLralygq5}hBAw?;+uqPU=sjI8G!rHt%ekF72L|#iE=H}9R{hl?VeYWwBZ~oj8 zSy#n@e0uOEdw=oU_DJKD08d~K}VCPb4#D76npe{8??C8BmAHTr?nK}e7%z8TTn2> zOtH5?A>{KyW!Z_D^?DBzNX;P-MF-;cs*{y&iie+oF+=j*F{ zo$n=$^L5yH;3?&c&+Di&=qU?Y6nfHV{v4AsRoU+ou*!!h7`Xb%*EJcRfIt~yn($uN zqm#s-z}46WmQ-{GaDC}`M8)4oyfXX!jk&tEI*)t}@ZbmwilGY+6MYi|Ur%O{HNYqM zCJR0r!+Kq&%SoPva_F_dgE>4Gug@dD1Na28q?K@fE7P68HKY+m9syoCN_?IKKFUxD zx`s2sHzPYg+1OEy2O1brlr9%l=FDfVeP81bd*n|8A1w~ke52#Fz{&r;9y@fBJDQyv 
z^4M?9BX8%ycS*cbwCG0Y8Udw?$i5GE7qDrMNSmULTNg^w!nVsc>4F}Bl#4Ygo}D`L z!2;9mH>32)0y@cwCH40N%ox5mkTN6bL4n=kK-_YzSg3Y%3Ho+`ZANH4YT|^;84~g+ zXvT0}GX#|!PA;!Uh{~O6?oouE>@rj7E_1C}TU)z)xrtVZZP6@!17Y39#0hBv0u>D295HV5yL@q9Wwt zY+Quoj0ibN8KjWat7mm@4Az)~lDaaWX1FrR=)pFrSv%2cVL(bNE_~#bpGPiv(jrI3sompN5$UCG28wOjc={UY{`K z`;59;%5s_Lh4|qDRaQUWcZx`s5+9!wCj?#7%+>dPuV^+4YRbnShG~71tIz8zra>0e z)T@6v@b_?Uko!*;tSK?&_HpG=Ca?bypt!J>?LXgNW!g)@MIs-m$ohPr(*{OOa{CRq zv`2|)Q0psH+2AtKFX2b`K3SjdcQWPm75mR}OmBof-AiSj@24`YrACmv{wq|7-6&8@ zS)cFkGUfZb+`hN}_h|h!+92OAW_m>9q)%5Bz54rrQHa=n_Bg4Jmo&d~lXv{@M`f;l zQ0p<>o2!g6ABLIk_vjzg6--%<%`ngOfJgtZu3)-{6*Xl&rcZkG`8#h+f9aV&tjA{Q z4p?sgd40*0pJQNoZ~UIq`aJ$3s3kQ$tXU=+W>)+hC2F7b4{HIYYqOH7;BEh30O#sY zt5$MM>Aq%e;`zha8z@jr^k$!3NAvf^yzRrS;eG#L-4rpQ)Wk|Dpz^u!;rq=u@?)i&C<;Q7LTK8~kjrbRUZ6 m10O%>x(d%7rJ?d;&5C}-W8C8o&TU86-xgN-LmmYWEB+hAz@k(D literal 0 HcmV?d00001 diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index a710f2531a..7ff651d190 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -19,6 +19,8 @@ #include "tcommon.h" #include "tref.h" +#define META_ON_S3_FORMATE "%s_%" PRId64 "\n%s_%" PRId64 "\n%s_%" PRId64 "" + typedef struct SCompactFilteFactory { void* status; } SCompactFilteFactory; @@ -233,15 +235,28 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { return 0; } -int32_t remoteChkp_readMetaData(char* path, SArray* list) { - int32_t cap = strlen(path); - char* metaPath = taosMemoryCalloc(1, cap + 32); +typedef struct { + char pCurrName[24]; + int64_t currChkptId; + + char pManifestName[24]; + int64_t manifestChkptId; + + char processName[24]; + int64_t processId; +} SSChkpMetaOnS3; + +int32_t remoteChkp_readMetaData(char* path, SSChkpMetaOnS3** pMeta) { + int32_t cap = strlen(path) + 32; + + char* metaPath = taosMemoryCalloc(1, cap); if (metaPath == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; return -1; } - if 
(sprintf(metaPath, "%s%s%s", path, TD_DIRSEP, "META") >= (cap + 32)) { + int32_t n = sprintf(metaPath, "%s%s%s", path, TD_DIRSEP, "META"); + if (n <= 0 || n >= (cap - 1)) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(metaPath); return -1; @@ -254,23 +269,23 @@ int32_t remoteChkp_readMetaData(char* path, SArray* list) { return -1; } - char buf[128] = {0}; + char buf[256] = {0}; if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { + terrno = TAOS_SYSTEM_ERROR(errno); taosMemoryFree(metaPath); taosCloseFile(&pFile); return -1; } - int32_t len = strlen(buf); - for (int i = 0; i < len; i++) { - if (buf[i] == '\n') { - char* item = taosMemoryCalloc(1, i + 1); - memcpy(item, buf, i); - taosArrayPush(list, &item); - item = taosMemoryCalloc(1, len - i); - memcpy(item, buf + i + 1, len - i - 1); - taosArrayPush(list, &item); - } + SSChkpMetaOnS3* p = taosMemoryCalloc(1, sizeof(SSChkpMetaOnS3)); + n = sscanf(buf, META_ON_S3_FORMATE, p->pCurrName, &p->currChkptId, p->pManifestName, &p->manifestChkptId, + p->processName, &p->processId); + if (n != 6) { + terrno = TSDB_CODE_INVALID_MSG; + taosMemoryFree(p); + taosMemoryFree(metaPath); + taosCloseFile(&pFile); + return -1; } taosCloseFile(&pFile); @@ -291,7 +306,7 @@ int32_t remoteChkp_validMetaFile(char* name, char* prename, int64_t chkpId) { } return valid; } -int32_t remoteChkp_validAndCvtMeta(char* path, SArray* list, int64_t chkpId) { +int32_t remoteChkp_validAndCvtMeta(char* path, SSChkpMetaOnS3* pMeta, int64_t chkpId) { int32_t complete = 1; int32_t len = strlen(path) + 32; char* src = taosMemoryCalloc(1, len); @@ -301,33 +316,38 @@ int32_t remoteChkp_validAndCvtMeta(char* path, SArray* list, int64_t chkpId) { return -1; } + if (pMeta->currChkptId != chkpId || pMeta->manifestChkptId != chkpId) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + int8_t count = 0; - for (int i = 0; i < taosArrayGetSize(list); i++) { - char* p = taosArrayGetP(list, i); - sprintf(src, "%s%s%s", path, TD_DIRSEP, p); + // for (int i = 0; 
i < taosArrayGetSize(list); i++) { + // char* p = taosArrayGetP(list, i); + // sprintf(src, "%s%s%s", path, TD_DIRSEP, p); - // check file exist - if (taosStatFile(src, NULL, NULL, NULL) != 0) { - complete = 0; - break; - } + // // check file exist + // if (taosStatFile(src, NULL, NULL, NULL) != 0) { + // complete = 0; + // break; + // } - // check file name - char temp[64] = {0}; - if (remoteChkp_validMetaFile(p, temp, chkpId)) { - count++; - } + // // check file name + // char temp[64] = {0}; + // if (remoteChkp_validMetaFile(p, temp, chkpId)) { + // count++; + // } - // rename file - sprintf(dst, "%s%s%s", path, TD_DIRSEP, temp); - taosRenameFile(src, dst); + // // rename file + // sprintf(dst, "%s%s%s", path, TD_DIRSEP, temp); + // taosRenameFile(src, dst); - memset(src, 0, len); - memset(dst, 0, len); - } - if (count != taosArrayGetSize(list)) { - complete = 0; - } + // memset(src, 0, len); + // memset(dst, 0, len); + // } + // if (count != taosArrayGetSize(list)) { + // complete = 0; + // } taosMemoryFree(src); taosMemoryFree(dst); @@ -385,12 +405,14 @@ int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId if (taosIsDir(tmp)) taosRemoveDir(tmp); if (taosIsDir(defaultPath)) taosRenameFile(defaultPath, tmp); - SArray* list = taosArrayInit(2, sizeof(void*)); - code = remoteChkp_readMetaData(chkpPath, list); + // SArray* list = taosArrayInit(2, sizeof(void*)); + SSChkpMetaOnS3* pMeta; + code = remoteChkp_readMetaData(chkpPath, &pMeta); if (code == 0) { - code = remoteChkp_validAndCvtMeta(chkpPath, list, chkpId); + code = remoteChkp_validAndCvtMeta(chkpPath, pMeta, chkpId); } - taosArrayDestroyP(list, taosMemoryFree); + taosMemoryFree(pMeta); + // taosArrayDestroyP(list, taosMemoryFree); if (code == 0) { taosMkDir(defaultPath); @@ -1322,6 +1344,9 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) TdFilePtr pFile = NULL; int32_t code = -1; + char buf[256] = {0}; + int32_t nBytes = 0; + int32_t len = 
strlen(pChkpIdDir); if (len == 0) { terrno = TSDB_CODE_INVALID_PARA; @@ -1336,7 +1361,8 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) goto _EXIT; } - if (sprintf(pDst, "%s%sinfo", pChkpIdDir, TD_DIRSEP) <= 0) { + nBytes = snprintf(pDst, len + 64, "%s%sinfo", pChkpIdDir, TD_DIRSEP); + if (nBytes != strlen(pDst)) { code = -1; stError("failed to build dst to load extra info, dir:%s", pChkpIdDir); goto _EXIT; @@ -1349,7 +1375,6 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) goto _EXIT; } - char buf[256] = {0}; if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { terrno = TAOS_SYSTEM_ERROR(errno); stError("failed to read file to load extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); @@ -1368,8 +1393,12 @@ _EXIT: return code; } int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { + int32_t code = -1; + TdFilePtr pFile = NULL; - int32_t code = -1; + + char buf[256] = {0}; + int32_t nBytes = 0; int32_t len = strlen(pChkpIdDir); if (len == 0) { @@ -1385,7 +1414,8 @@ int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { goto _EXIT; } - if (sprintf(pDst, "%s%sinfo", pChkpIdDir, TD_DIRSEP) < 0) { + nBytes = snprintf(pDst, len + 64, "%s%sinfo", pChkpIdDir, TD_DIRSEP); + if (nBytes != strlen(pDst)) { stError("failed to build dst to add extra info, dir:%s", pChkpIdDir); goto _EXIT; } @@ -1397,15 +1427,14 @@ int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { goto _EXIT; } - char buf[256] = {0}; - int n = snprintf(buf, sizeof(buf), "%" PRId64 " %" PRId64 "", chkpId, processId); - if (n <= 0 || n >= sizeof(buf)) { + nBytes = snprintf(buf, sizeof(buf), "%" PRId64 " %" PRId64 "", chkpId, processId); + if (nBytes != strlen(buf)) { code = -1; stError("failed to build content to add extra info, dir:%s", pChkpIdDir); goto _EXIT; } - if (taosWriteFile(pFile, buf, strlen(buf)) <= 0) { + if (nBytes != taosWriteFile(pFile, buf, 
nBytes)) { terrno = TAOS_SYSTEM_ERROR(errno); stError("failed to write file to add extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); goto _EXIT; @@ -2430,18 +2459,27 @@ void taskDbDestroy2(void* pDb) { taskDbDestroy(pDb, true); } int32_t taskDbGenChkpUploadData__rsync(STaskDbWrapper* pDb, int64_t chkpId, char** path) { int32_t code = -1; int64_t refId = pDb->refId; + int32_t nBytes = 0; if (taosAcquireRef(taskDbWrapperId, refId) == NULL) { return -1; } - char* buf = taosMemoryCalloc(1, strlen(pDb->path) + 128); + int32_t cap = strlen(pDb->path) + 128; + + char* buf = taosMemoryCalloc(1, cap); if (buf == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; return -1; } - sprintf(buf, "%s%s%s%s%s%" PRId64 "", pDb->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkpId); + nBytes = + snprintf(buf, cap, "%s%s%s%s%s%" PRId64 "", pDb->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkpId); + if (nBytes != strlen(buf)) { + terrno = TSDB_CODE_OUT_OF_RANGE; + return -1; + } + if (taosIsDir(buf)) { code = 0; *path = buf; @@ -4473,8 +4511,18 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { goto _ERROR; } - sprintf(srcDir, "%s%s%s%s%s%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", p->curChkpId); - sprintf(dstDir, "%s", dname); + int nBytes = snprintf(srcDir, len, "%s%s%s%s%s%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, + "checkpoint", p->curChkpId); + if (nBytes != strlen(srcBuf)) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } + + nBytes = snprintf(dstDir, len, "%s", dname); + if (nBytes != strlen(dstBuf)) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } if (!taosDirExist(srcDir)) { stError("failed to dump srcDir %s, reason: not exist such dir", srcDir); @@ -4540,14 +4588,20 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { stError("chkp failed to create meta file: %s, reason:%s", dstDir, tstrerror(terrno)); goto _ERROR; } - // META_ON_S3 - // current_checkpointID - // 
manifest_checkpointID - // processVer_processID - char content[128] = {0}; - snprintf(content, sizeof(content), "%s_%" PRId64 "\n%s_%" PRId64 "\n%s_%" PRId64 "", p->pCurrent, p->curChkpId, - p->pManifest, p->curChkpId, "processVer", processId); - if (taosWriteFile(pFile, content, strlen(content)) <= 0) { + + char content[256] = {0}; + nBytes = snprintf(content, sizeof(content), META_ON_S3_FORMATE, p->pCurrent, p->curChkpId, p->pManifest, p->curChkpId, + "processVer", processId); + if (nBytes != strlen(content)) { + terrno = TSDB_CODE_INVALID_MSG; + stError("chkp failed to format meta file: %s, reason: invalid msg", dstDir); + taosCloseFile(&pFile); + code = -1; + goto _ERROR; + } + + nBytes = taosWriteFile(pFile, content, strlen(content)); + if (nBytes != strlen(content)) { terrno = errno; stError("chkp failed to write meta file: %s,reason:%s", dstDir, tstrerror(terrno)); taosCloseFile(&pFile); @@ -4612,17 +4666,28 @@ int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, sprintf(path, "%s%s%s", bm->path, TD_DIRSEP, taskId); SDbChkp* p = dbChkpCreate(path, chkpId); - taosHashPut(bm->pDbChkpTbl, taskId, strlen(taskId), &p, sizeof(void*)); + if (p == NULL) { + taosMemoryFree(path); + taosThreadRwlockUnlock(&bm->rwLock); + return -1; + } + + if (taosHashPut(bm->pDbChkpTbl, taskId, strlen(taskId), &p, sizeof(void*)) != 0) { + dbChkpDestroy(p); + taosMemoryFree(path); + taosThreadRwlockUnlock(&bm->rwLock); + return -1; + } pChkp = p; - code = dbChkpDumpTo(pChkp, dname, list); taosThreadRwlockUnlock(&bm->rwLock); return code; - } + } else { + code = dbChkpGetDelta(pChkp, chkpId, NULL); - code = dbChkpGetDelta(pChkp, chkpId, NULL); - code = dbChkpDumpTo(pChkp, dname, list); + if (code == 0) code = dbChkpDumpTo(pChkp, dname, list); + } taosThreadRwlockUnlock(&bm->rwLock); return code; diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 26df7b1627..bc3762a6d5 100644 --- 
a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -527,27 +527,41 @@ void streamTaskSetFailedCheckpointId(SStreamTask* pTask) { } static int32_t getCheckpointDataMeta(const char* id, const char* path, SArray* list) { - char buf[128] = {0}; + TdFilePtr pFile = NULL; + int32_t cap = strlen(path) + 32; + char buf[128] = {0}; + int32_t code = 0; - char* file = taosMemoryCalloc(1, strlen(path) + 32); - sprintf(file, "%s%s%s", path, TD_DIRSEP, "META_TMP"); + char* filePath = taosMemoryCalloc(1, cap); + if (filePath == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } - int32_t code = downloadCheckpointDataByName(id, "META", file); + int32_t nBytes = snprintf(filePath, cap, "%s%s%s", path, TD_DIRSEP, "META_TMP"); + if (nBytes != strlen(filePath)) { + taosMemoryFree(filePath); + terrno = TSDB_CODE_OUT_OF_RANGE; + return -1; + } + + code = downloadCheckpointDataByName(id, "META", filePath); if (code != 0) { - stDebug("%s chkp failed to download meta file:%s", id, file); - taosMemoryFree(file); + stDebug("%s chkp failed to download meta file:%s", id, filePath); + taosMemoryFree(filePath); return code; } - TdFilePtr pFile = taosOpenFile(file, TD_FILE_READ); + pFile = taosOpenFile(filePath, TD_FILE_READ); if (pFile == NULL) { - stError("%s failed to open meta file:%s for checkpoint", id, file); - code = -1; - return code; + terrno = TAOS_SYSTEM_ERROR(errno); + stError("%s failed to open meta file:%s for checkpoint", id, filePath); + taosMemoryFree(filePath); + return -1; } if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { - stError("%s failed to read meta file:%s for checkpoint", id, file); + stError("%s failed to read meta file:%s for checkpoint", id, filePath); code = -1; } else { int32_t len = strnlen(buf, tListLen(buf)); @@ -565,27 +579,33 @@ static int32_t getCheckpointDataMeta(const char* id, const char* path, SArray* l } taosCloseFile(&pFile); - taosRemoveFile(file); - taosMemoryFree(file); + 
taosRemoveFile(filePath); + taosMemoryFree(filePath); return code; } int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t dbRefId, ECHECKPOINT_BACKUP_TYPE type) { char* path = NULL; int32_t code = 0; - SArray* toDelFiles = taosArrayInit(4, POINTER_BYTES); - int64_t now = taosGetTimestampMs(); SStreamMeta* pMeta = pTask->pMeta; const char* idStr = pTask->id.idStr; + int64_t now = taosGetTimestampMs(); + + SArray* toDelFiles = taosArrayInit(4, POINTER_BYTES); + if (toDelFiles == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } if ((code = taskDbGenChkpUploadData(pTask->pBackend, pMeta->bkdChkptMgt, checkpointId, type, &path, toDelFiles, pTask->id.idStr)) != 0) { - stError("s-task:%s failed to gen upload checkpoint:%" PRId64, idStr, checkpointId); + stError("s-task:%s failed to gen upload checkpoint:%" PRId64 ", reason:%s", idStr, checkpointId, tstrerror(terrno)); } if (type == DATA_UPLOAD_S3) { if (code == TSDB_CODE_SUCCESS && (code = getCheckpointDataMeta(idStr, path, toDelFiles)) != 0) { - stError("s-task:%s failed to get checkpointData for checkpointId:%" PRId64 " meta", idStr, checkpointId); + stError("s-task:%s failed to get checkpointData for checkpointId:%" PRId64 ", reason:%s", idStr, checkpointId, + tstrerror(terrno)); } } @@ -594,7 +614,7 @@ int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t d if (code == TSDB_CODE_SUCCESS) { stDebug("s-task:%s upload checkpointId:%" PRId64 " to remote succ", idStr, checkpointId); } else { - stError("s-task:%s failed to upload checkpointId:%" PRId64 " data:%s", idStr, checkpointId, path); + stError("s-task:%s failed to upload checkpointId:%" PRId64 " path:%s,reason:%s", idStr, checkpointId, path); } } diff --git a/t.c b/t.c new file mode 100644 index 0000000000..a79ed4c134 --- /dev/null +++ b/t.c @@ -0,0 +1,12 @@ +#include +#include +#include + +int main() { + char *buf = calloc(1, 4); + int n = snprintf(buf, 4, "size"); + + printf("write size:%d \t buf:%s \t 
len:%d\n", n, buf, (int)(strlen(buf))); + buf[4] = 10; + return 1; +} From 51e4abe2563d5d45f093d04792776386cecfc065 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 28 Jun 2024 02:58:30 +0000 Subject: [PATCH 11/92] add self check --- source/libs/stream/src/streamBackendRocksdb.c | 179 +++++++++++++----- 1 file changed, 130 insertions(+), 49 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 7ff651d190..53b45f13a2 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1354,37 +1354,45 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) return -1; } - char* pDst = taosMemoryCalloc(1, len + 64); + int32_t cap = len + 64; + char* pDst = taosMemoryCalloc(1, cap); if (pDst == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; stError("failed to alloc memory to load extra info, dir:%s", pChkpIdDir); goto _EXIT; } - nBytes = snprintf(pDst, len + 64, "%s%sinfo", pChkpIdDir, TD_DIRSEP); - if (nBytes != strlen(pDst)) { - code = -1; + nBytes = snprintf(pDst, cap, "%s%sinfo", pChkpIdDir, TD_DIRSEP); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; stError("failed to build dst to load extra info, dir:%s", pChkpIdDir); goto _EXIT; } pFile = taosOpenFile(pDst, TD_FILE_READ); if (pFile == NULL) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("failed to open file to load extra info, file:%s", pDst); + if (errno == ENOENT) { + // compatible with previous version + *processId = -1; + code = 0; + goto _EXIT; + } else { + terrno = TAOS_SYSTEM_ERROR(errno); + stError("failed to open file to load extra info, file:%s", pDst); + } goto _EXIT; } if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { terrno = TAOS_SYSTEM_ERROR(errno); stError("failed to read file to load extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); - code = -1; goto _EXIT; } if (sscanf(buf, "%" PRId64 " %" PRId64 "", chkpId, 
processId) < 2) { terrno = TSDB_CODE_INVALID_PARA; stError("failed to read file content to load extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); + goto _EXIT; } code = 0; _EXIT: @@ -1406,16 +1414,16 @@ int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { stError("failed to add extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(terrno)); return -1; } - - char* pDst = taosMemoryCalloc(1, len + 64); + int32_t cap = len + 64; + char* pDst = taosMemoryCalloc(1, cap); if (pDst == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; stError("failed to alloc memory to add extra info, dir:%s", pChkpIdDir); goto _EXIT; } - nBytes = snprintf(pDst, len + 64, "%s%sinfo", pChkpIdDir, TD_DIRSEP); - if (nBytes != strlen(pDst)) { + nBytes = snprintf(pDst, cap, "%s%sinfo", pChkpIdDir, TD_DIRSEP); + if (nBytes <= 0 || nBytes >= cap) { stError("failed to build dst to add extra info, dir:%s", pChkpIdDir); goto _EXIT; } @@ -1428,8 +1436,8 @@ int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { } nBytes = snprintf(buf, sizeof(buf), "%" PRId64 " %" PRId64 "", chkpId, processId); - if (nBytes != strlen(buf)) { - code = -1; + if (nBytes <= 0 || nBytes >= sizeof(buf)) { + terrno = TSDB_CODE_OUT_OF_RANGE; stError("failed to build content to add extra info, dir:%s", pChkpIdDir); goto _EXIT; } @@ -2475,7 +2483,7 @@ int32_t taskDbGenChkpUploadData__rsync(STaskDbWrapper* pDb, int64_t chkpId, char nBytes = snprintf(buf, cap, "%s%s%s%s%s%" PRId64 "", pDb->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkpId); - if (nBytes != strlen(buf)) { + if (nBytes <= 0 || nBytes >= cap) { terrno = TSDB_CODE_OUT_OF_RANGE; return -1; } @@ -4311,6 +4319,7 @@ void dbChkpDebugInfo(SDbChkp* pDb) { } } int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { + int32_t nBytes; taosThreadRwlockWrlock(&p->rwLock); p->preCkptId = p->curChkpId; @@ -4368,6 +4377,11 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { char* name = 
taosHashGetKey(pIter, &len); if (name != NULL && !isBkdDataMeta(name, len)) { char* fname = taosMemoryCalloc(1, len + 1); + if (fname == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosThreadRwlockUnlock(&p->rwLock); + return -1; + } strncpy(fname, name, len); taosArrayPush(p->pAdd, &fname); } @@ -4496,30 +4510,32 @@ int32_t dbChkpInit(SDbChkp* p) { } #endif int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { + static char* chkpMeta = "META"; + int32_t code = -1; + int32_t cap = p->len + 128; + taosThreadRwlockRdlock(&p->rwLock); - int32_t code = -1; - int32_t len = p->len + 128; - char* srcBuf = taosMemoryCalloc(1, len); - char* dstBuf = taosMemoryCalloc(1, len); + char* srcBuf = taosMemoryCalloc(1, cap); + char* dstBuf = taosMemoryCalloc(1, cap); - char* srcDir = taosMemoryCalloc(1, len); - char* dstDir = taosMemoryCalloc(1, len); + char* srcDir = taosMemoryCalloc(1, cap); + char* dstDir = taosMemoryCalloc(1, cap); if (srcBuf == NULL || dstBuf == NULL || srcDir == NULL || dstDir == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; goto _ERROR; } - int nBytes = snprintf(srcDir, len, "%s%s%s%s%s%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, + int nBytes = snprintf(srcDir, cap, "%s%s%s%s%s%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", p->curChkpId); - if (nBytes != strlen(srcBuf)) { + if (nBytes <= 0 || nBytes >= cap) { terrno = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } - nBytes = snprintf(dstDir, len, "%s", dname); - if (nBytes != strlen(dstBuf)) { + nBytes = snprintf(dstDir, cap, "%s", dname); + if (nBytes <= 0 || nBytes >= cap) { terrno = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } @@ -4536,12 +4552,21 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { // add file to $name dir for (int i = 0; i < taosArrayGetSize(p->pAdd); i++) { - memset(srcBuf, 0, len); - memset(dstBuf, 0, len); + memset(srcBuf, 0, cap); + memset(dstBuf, 0, cap); char* filename = taosArrayGetP(p->pAdd, i); - sprintf(srcBuf, "%s%s%s", srcDir, 
TD_DIRSEP, filename); - sprintf(dstBuf, "%s%s%s", dstDir, TD_DIRSEP, filename); + nBytes = snprintf(srcBuf, cap, "%s%s%s", srcDir, TD_DIRSEP, filename); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } + + nBytes = snprintf(dstBuf, cap, "%s%s%s", dstDir, TD_DIRSEP, filename); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } if (taosCopyFile(srcBuf, dstBuf) < 0) { terrno = errno; @@ -4553,14 +4578,29 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { for (int i = 0; i < taosArrayGetSize(p->pDel); i++) { char* filename = taosArrayGetP(p->pDel, i); char* p = taosStrdup(filename); + if (p == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _ERROR; + } taosArrayPush(list, &p); } // copy current file to dst dir - memset(srcBuf, 0, len); - memset(dstBuf, 0, len); - sprintf(srcBuf, "%s%s%s", srcDir, TD_DIRSEP, p->pCurrent); - sprintf(dstBuf, "%s%s%s_%" PRId64 "", dstDir, TD_DIRSEP, p->pCurrent, p->curChkpId); + memset(srcBuf, 0, cap); + memset(dstBuf, 0, cap); + + nBytes = snprintf(srcBuf, cap, "%s%s%s", srcDir, TD_DIRSEP, p->pCurrent); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } + + nBytes = snprintf(dstBuf, cap, "%s%s%s_%" PRId64 "", dstDir, TD_DIRSEP, p->pCurrent, p->curChkpId); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } + if (taosCopyFile(srcBuf, dstBuf) < 0) { terrno = errno; stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(terrno)); @@ -4568,23 +4608,37 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { } // copy manifest file to dst dir - memset(srcBuf, 0, len); - memset(dstBuf, 0, len); - sprintf(srcBuf, "%s%s%s", srcDir, TD_DIRSEP, p->pManifest); - sprintf(dstBuf, "%s%s%s_%" PRId64 "", dstDir, TD_DIRSEP, p->pManifest, p->curChkpId); + memset(srcBuf, 0, cap); + memset(dstBuf, 0, cap); + + nBytes = snprintf(srcBuf, cap, 
"%s%s%s", srcDir, TD_DIRSEP, p->pManifest); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } + + nBytes = snprintf(dstBuf, cap, "%s%s%s_%" PRId64 "", dstDir, TD_DIRSEP, p->pManifest, p->curChkpId); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } + if (taosCopyFile(srcBuf, dstBuf) < 0) { terrno = errno; stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(terrno)); goto _ERROR; } - static char* chkpMeta = "META"; - memset(dstBuf, 0, len); - sprintf(dstDir, "%s%s%s", dstDir, TD_DIRSEP, chkpMeta); + memset(dstBuf, 0, cap); + nBytes = snprintf(dstDir, cap, "%s%s%s", dstDir, TD_DIRSEP, chkpMeta); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } TdFilePtr pFile = taosOpenFile(dstDir, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); if (pFile == NULL) { - terrno = errno; + terrno = TAOS_SYSTEM_ERROR(errno); stError("chkp failed to create meta file: %s, reason:%s", dstDir, tstrerror(terrno)); goto _ERROR; } @@ -4592,23 +4646,20 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { char content[256] = {0}; nBytes = snprintf(content, sizeof(content), META_ON_S3_FORMATE, p->pCurrent, p->curChkpId, p->pManifest, p->curChkpId, "processVer", processId); - if (nBytes != strlen(content)) { - terrno = TSDB_CODE_INVALID_MSG; + if (nBytes <= 0 || nBytes >= sizeof(content)) { + terrno = TSDB_CODE_OUT_OF_RANGE; stError("chkp failed to format meta file: %s, reason: invalid msg", dstDir); taosCloseFile(&pFile); - code = -1; goto _ERROR; } nBytes = taosWriteFile(pFile, content, strlen(content)); if (nBytes != strlen(content)) { - terrno = errno; + terrno = TAOS_SYSTEM_ERROR(errno); stError("chkp failed to write meta file: %s,reason:%s", dstDir, tstrerror(terrno)); taosCloseFile(&pFile); - code = -1; goto _ERROR; } - taosCloseFile(&pFile); // clear delta data buf @@ -4624,11 +4675,34 @@ _ERROR: taosMemoryFree(dstDir); 
return code; } + SBkdMgt* bkdMgtCreate(char* path) { SBkdMgt* p = taosMemoryCalloc(1, sizeof(SBkdMgt)); + if (p == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + p->pDbChkpTbl = taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY), true, HASH_ENTRY_LOCK); + if (p->pDbChkpTbl == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + bkdMgtDestroy(p); + return NULL; + } + p->path = taosStrdup(path); - taosThreadRwlockInit(&p->rwLock, NULL); + if (p->path == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + bkdMgtDestroy(p); + return NULL; + } + + if (taosThreadRwlockInit(&p->rwLock, NULL) != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + bkdMgtDestroy(p); + return NULL; + } + return p; } @@ -4656,14 +4730,21 @@ int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, SDbChkp* pChkp = ppChkp != NULL ? *ppChkp : NULL; if (pChkp == NULL) { - char* path = taosMemoryCalloc(1, strlen(bm->path) + 64); + int32_t cap = strlen(bm->path) + 64; + char* path = taosMemoryCalloc(1, cap); if (path == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosThreadRwlockUnlock(&bm->rwLock); return -1; } - sprintf(path, "%s%s%s", bm->path, TD_DIRSEP, taskId); + int32_t nBytes = snprintf(path, cap, "%s%s%s", bm->path, TD_DIRSEP, taskId); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + taosMemoryFree(path); + taosThreadRwlockUnlock(&bm->rwLock); + return -1; + } SDbChkp* p = dbChkpCreate(path, chkpId); if (p == NULL) { From 8fe57c166902cbd3fce2ae99c457f70f6e4bb225 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 28 Jun 2024 03:08:02 +0000 Subject: [PATCH 12/92] add self check --- source/libs/stream/src/streamCheckpoint.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index bc3762a6d5..bd81ee5b75 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -614,7 +614,8 @@ int32_t 
uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t d if (code == TSDB_CODE_SUCCESS) { stDebug("s-task:%s upload checkpointId:%" PRId64 " to remote succ", idStr, checkpointId); } else { - stError("s-task:%s failed to upload checkpointId:%" PRId64 " path:%s,reason:%s", idStr, checkpointId, path); + stError("s-task:%s failed to upload checkpointId:%" PRId64 " path:%s,reason:%s", idStr, checkpointId, path, + tstrerror(errno)); } } From 2ae54486b513db655b460978cbe935e311144a67 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 28 Jun 2024 07:01:45 +0000 Subject: [PATCH 13/92] add self check --- source/libs/stream/src/streamBackendRocksdb.c | 139 +++++++++++------- 1 file changed, 86 insertions(+), 53 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 53b45f13a2..cbe6dcc886 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -373,24 +373,24 @@ int32_t createDirIfNotExist(const char* pPath) { } } -int32_t rebuildFromRemoteChkp_rsync(const char* key, char* chkptPath, int64_t checkpointId, char* defaultPath) { +int32_t rebuildFromRemoteChkp_rsync(const char* key, char* checkpointPath, int64_t checkpointId, char* defaultPath) { int32_t code = 0; - if (taosIsDir(chkptPath)) { - taosRemoveDir(chkptPath); - stDebug("remove local checkpoint data dir:%s succ", chkptPath); + if (taosIsDir(checkpointPath)) { + taosRemoveDir(checkpointPath); + stDebug("remove local checkpoint data dir:%s succ", checkpointPath); } cleanDir(defaultPath, key); stDebug("clear local default dir before downloading checkpoint data:%s succ", defaultPath); - code = streamTaskDownloadCheckpointData(key, chkptPath); + code = streamTaskDownloadCheckpointData(key, checkpointPath); if (code != 0) { stError("failed to download checkpoint data:%s", key); return code; } stDebug("download remote checkpoint data for checkpointId:%" PRId64 ", %s", checkpointId, 
key); - return backendCopyFiles(chkptPath, defaultPath); + return backendCopyFiles(checkpointPath, defaultPath); } int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId, char* defaultPath) { @@ -399,29 +399,45 @@ int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId return code; } - int32_t len = strlen(defaultPath) + 32; - char* tmp = taosMemoryCalloc(1, len); - sprintf(tmp, "%s%s", defaultPath, "_tmp"); + int32_t nBytes; + int32_t cap = strlen(defaultPath) + 32; + + char* tmp = taosMemoryCalloc(1, cap); + if (tmp == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + nBytes = snprintf(tmp, cap, "%s%s", defaultPath, "_tmp"); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + taosMemoryFree(tmp); + return -1; + } + if (taosIsDir(tmp)) taosRemoveDir(tmp); if (taosIsDir(defaultPath)) taosRenameFile(defaultPath, tmp); // SArray* list = taosArrayInit(2, sizeof(void*)); SSChkpMetaOnS3* pMeta; code = remoteChkp_readMetaData(chkpPath, &pMeta); - if (code == 0) { - code = remoteChkp_validAndCvtMeta(chkpPath, pMeta, chkpId); - } + if (code == 0) code = remoteChkp_validAndCvtMeta(chkpPath, pMeta, chkpId); + taosMemoryFree(pMeta); - // taosArrayDestroyP(list, taosMemoryFree); if (code == 0) { - taosMkDir(defaultPath); + code = taosMkDir(defaultPath); + } + + if (code == 0) { code = backendCopyFiles(chkpPath, defaultPath); } if (code != 0) { if (taosIsDir(defaultPath)) taosRemoveDir(defaultPath); - if (taosIsDir(tmp)) taosRenameFile(tmp, defaultPath); + if (taosIsDir(tmp)) { + code = taosRenameFile(tmp, defaultPath); + } } else { taosRemoveDir(tmp); } @@ -430,12 +446,12 @@ int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId return code; } -int32_t rebuildFromRemoteCheckpoint(const char* key, char* chkptPath, int64_t checkpointId, char* defaultPath) { +int32_t rebuildFromRemoteCheckpoint(const char* key, char* checkpointPath, int64_t checkpointId, char* 
defaultPath) { ECHECKPOINT_BACKUP_TYPE type = streamGetCheckpointBackupType(); if (type == DATA_UPLOAD_S3) { - return rebuildFromRemoteChkp_s3(key, chkptPath, checkpointId, defaultPath); + return rebuildFromRemoteChkp_s3(key, checkpointPath, checkpointId, defaultPath); } else if (type == DATA_UPLOAD_RSYNC) { - return rebuildFromRemoteChkp_rsync(key, chkptPath, checkpointId, defaultPath); + return rebuildFromRemoteChkp_rsync(key, checkpointPath, checkpointId, defaultPath); } else { stError("%s no remote backup checkpoint data for:%" PRId64, key, checkpointId); } @@ -570,69 +586,78 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId int64_t* processVer) { int32_t code = -1; - size_t pathLen = strlen(path); - char* prefixPath = NULL; - char* defaultPath = NULL; + char* prefixPath = NULL; + char* defaultPath = NULL; + char* checkpointPath = NULL; + char* checkpointRoot = NULL; + + int32_t cap = strlen(path) + 128; + int32_t nBytes; // alloc buf - prefixPath = taosMemoryCalloc(1, pathLen + 64); - if (prefixPath == NULL) { + prefixPath = taosMemoryCalloc(1, cap); + defaultPath = taosMemoryCalloc(1, cap); + checkpointPath = taosMemoryCalloc(1, cap); + checkpointRoot = taosMemoryCalloc(1, cap); + if (prefixPath == NULL || defaultPath == NULL || checkpointPath == NULL || checkpointRoot == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; goto _EXIT; } - sprintf(prefixPath, "%s%s%s", path, TD_DIRSEP, key); + nBytes = snprintf(prefixPath, cap, "%s%s%s", path, TD_DIRSEP, key); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _EXIT; + } + code = createDirIfNotExist(prefixPath); if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } - defaultPath = taosMemoryCalloc(1, pathLen + 128); - if (defaultPath == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + nBytes = snprintf(defaultPath, cap, "%s%s%s", prefixPath, TD_DIRSEP, "state"); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; goto _EXIT; } 
- sprintf(defaultPath, "%s%s%s", prefixPath, TD_DIRSEP, "state"); code = createDirIfNotExist(defaultPath); if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } - char* checkpointRoot = taosMemoryCalloc(1, pathLen + 48); - if (checkpointRoot == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + nBytes = snprintf(checkpointRoot, cap, "%s%s%s", prefixPath, TD_DIRSEP, "checkpoints"); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; goto _EXIT; } - sprintf(checkpointRoot, "%s%s%s", prefixPath, TD_DIRSEP, "checkpoints"); code = createDirIfNotExist(checkpointRoot); if (code != 0) { - taosMemoryFreeClear(checkpointRoot); + terrno = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } - taosMemoryFreeClear(checkpointRoot); stDebug("%s check local backend dir:%s, checkpointId:%" PRId64 " succ", key, defaultPath, chkptId); - - char* chkptPath = taosMemoryCalloc(1, pathLen + 128); - if (chkptPath == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - goto _EXIT; - } - if (chkptId > 0) { - snprintf(chkptPath, pathLen + 127, "%s%s%s%s%s%" PRId64 "", prefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, - "checkpoint", chkptId); + nBytes = snprintf(checkpointPath, cap, "%s%s%s%s%s%" PRId64 "", prefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, + "checkpoint", chkptId); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _EXIT; + } - code = rebuildFromLocalCheckpoint(key, chkptPath, chkptId, defaultPath, processVer); + code = rebuildFromLocalCheckpoint(key, checkpointPath, chkptId, defaultPath, processVer); if (code != 0) { - code = rebuildFromRemoteCheckpoint(key, chkptPath, chkptId, defaultPath); + terrno = 0; + code = rebuildFromRemoteCheckpoint(key, checkpointPath, chkptId, defaultPath); } if (code != 0) { - stError("failed to start stream backend at %s, reason: %s, restart from default defaultPath:%s", chkptPath, - tstrerror(code), defaultPath); + stError("failed to start stream backend at %s, reason: %s, restart from default 
defaultPath:%s, reason:%s", + checkpointPath, tstrerror(code), defaultPath, tstrerror(terrno)); code = 0; // reset the error code } } else { // no valid checkpoint id @@ -641,15 +666,18 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId cleanDir(defaultPath, key); } - taosMemoryFree(chkptPath); - *dbPath = defaultPath; *dbPrefixPath = prefixPath; - return 0; + defaultPath = NULL; + prefixPath = NULL; + + code = 0; _EXIT: taosMemoryFree(defaultPath); taosMemoryFree(prefixPath); + taosMemoryFree(checkpointPath); + taosMemoryFree(checkpointRoot); return code; } @@ -4334,7 +4362,14 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { int32_t sstLen = strlen(pSST); memset(p->buf, 0, p->len); - sprintf(p->buf, "%s%s%s%scheckpoint%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, chkpId); + + nBytes = + snprintf(p->buf, p->len, "%s%s%s%scheckpoint%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, chkpId); + if (nBytes <= 0 || nBytes >= p->len) { + terrno = TSDB_CODE_OUT_OF_RANGE; + taosThreadRwlockUnlock(&p->rwLock); + return -1; + } taosArrayClearP(p->pAdd, taosMemoryFree); taosArrayClearP(p->pDel, taosMemoryFree); @@ -4518,10 +4553,8 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { char* srcBuf = taosMemoryCalloc(1, cap); char* dstBuf = taosMemoryCalloc(1, cap); - char* srcDir = taosMemoryCalloc(1, cap); char* dstDir = taosMemoryCalloc(1, cap); - if (srcBuf == NULL || dstBuf == NULL || srcDir == NULL || dstDir == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; goto _ERROR; From 2e59284388e264655dbb27c0db302b512c8a3e8d Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 28 Jun 2024 08:14:34 +0000 Subject: [PATCH 14/92] add self check --- source/libs/stream/src/streamBackendRocksdb.c | 142 +++++++++++------- 1 file changed, 91 insertions(+), 51 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index cbe6dcc886..eff0481d5b 
100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -196,28 +196,54 @@ int32_t getCfIdx(const char* cfName) { return idx; } -bool isValidCheckpoint(const char* dir) { return true; } +bool isValidCheckpoint(const char* dir) { + // not implement yet + return true; +} +/* + *copy pChkpIdDir's file to state dir + */ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { // impl later int32_t code = 0; + int32_t cap = strlen(path) + 64; + int32_t nBytes = 0; + + char* state = taosMemoryCalloc(1, cap); + if (state == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + nBytes = snprintf(state, cap, "%s%s%s", path, TD_DIRSEP, "state"); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + taosMemoryFree(state); + return -1; + } - /*param@1: checkpointId dir - param@2: state - copy pChkpIdDir's file to state dir - opt to set hard link to previous file - */ - char* state = taosMemoryCalloc(1, strlen(path) + 32); - sprintf(state, "%s%s%s", path, TD_DIRSEP, "state"); if (chkpId != 0) { - char* chkp = taosMemoryCalloc(1, strlen(path) + 64); - sprintf(chkp, "%s%s%s%scheckpoint%" PRId64 "", path, TD_DIRSEP, "checkpoints", TD_DIRSEP, chkpId); + char* chkp = taosMemoryCalloc(1, cap); + if (chkp == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosMemoryFree(state); + return -1; + } + + nBytes = snprintf(chkp, cap, "%s%s%s%scheckpoint%" PRId64 "", path, TD_DIRSEP, "checkpoints", TD_DIRSEP, chkpId); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + taosMemoryFree(state); + taosMemoryFree(chkp); + return -1; + } + if (taosIsDir(chkp) && isValidCheckpoint(chkp)) { cleanDir(state, ""); code = backendCopyFiles(chkp, state); - stInfo("copy snap file from %s to %s", chkp, state); if (code != 0) { - stError("failed to restart stream backend from %s, reason: %s", chkp, tstrerror(TAOS_SYSTEM_ERROR(errno))); + stError("failed to restart stream 
backend from %s, reason: %s", chkp, tstrerror(TAOS_SYSTEM_ERROR(terrno))); } else { stInfo("start to restart stream backend at checkpoint path: %s", chkp); } @@ -225,7 +251,10 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { } else { stError("failed to start stream backend at %s, reason: %s, restart from default state dir:%s", chkp, tstrerror(TAOS_SYSTEM_ERROR(errno)), state); - taosMkDir(state); + code = taosMkDir(state); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + } } taosMemoryFree(chkp); @@ -247,7 +276,9 @@ typedef struct { } SSChkpMetaOnS3; int32_t remoteChkp_readMetaData(char* path, SSChkpMetaOnS3** pMeta) { - int32_t cap = strlen(path) + 32; + int32_t code = -1; + int32_t cap = strlen(path) + 32; + TdFilePtr pFile = NULL; char* metaPath = taosMemoryCalloc(1, cap); if (metaPath == NULL) { @@ -256,41 +287,42 @@ int32_t remoteChkp_readMetaData(char* path, SSChkpMetaOnS3** pMeta) { } int32_t n = sprintf(metaPath, "%s%s%s", path, TD_DIRSEP, "META"); - if (n <= 0 || n >= (cap - 1)) { + if (n <= 0 || n >= cap) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(metaPath); return -1; } - TdFilePtr pFile = taosOpenFile(path, TD_FILE_READ); + pFile = taosOpenFile(path, TD_FILE_READ); if (pFile == NULL) { terrno = TAOS_SYSTEM_ERROR(errno); - taosMemoryFree(metaPath); - return -1; + goto _EXIT; } char buf[256] = {0}; if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { terrno = TAOS_SYSTEM_ERROR(errno); - taosMemoryFree(metaPath); - taosCloseFile(&pFile); - return -1; + goto _EXIT; } SSChkpMetaOnS3* p = taosMemoryCalloc(1, sizeof(SSChkpMetaOnS3)); + if (p == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } n = sscanf(buf, META_ON_S3_FORMATE, p->pCurrName, &p->currChkptId, p->pManifestName, &p->manifestChkptId, p->processName, &p->processId); if (n != 6) { terrno = TSDB_CODE_INVALID_MSG; taosMemoryFree(p); - taosMemoryFree(metaPath); - taosCloseFile(&pFile); - return -1; + goto _EXIT; } - + *pMeta = p; + code = 0; 
+_EXIT: taosCloseFile(&pFile); taosMemoryFree(metaPath); - return 0; + return code; } int32_t remoteChkp_validMetaFile(char* name, char* prename, int64_t chkpId) { int8_t valid = 0; @@ -321,7 +353,6 @@ int32_t remoteChkp_validAndCvtMeta(char* path, SSChkpMetaOnS3* pMeta, int64_t ch return -1; } - int8_t count = 0; // for (int i = 0; i < taosArrayGetSize(list); i++) { // char* p = taosArrayGetP(list, i); // sprintf(src, "%s%s%s", path, TD_DIRSEP, p); @@ -419,7 +450,7 @@ int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId if (taosIsDir(defaultPath)) taosRenameFile(defaultPath, tmp); // SArray* list = taosArrayInit(2, sizeof(void*)); - SSChkpMetaOnS3* pMeta; + SSChkpMetaOnS3* pMeta = NULL; code = remoteChkp_readMetaData(chkpPath, &pMeta); if (code == 0) code = remoteChkp_validAndCvtMeta(chkpPath, pMeta, chkpId); @@ -481,76 +512,84 @@ int32_t backendFileCopyFilesImpl(const char* src, const char* dst) { const char* info = "info"; size_t infoLen = strlen(info); - int32_t code = 0; + int32_t code = -1; int32_t sLen = strlen(src); int32_t dLen = strlen(dst); - char* srcName = taosMemoryCalloc(1, sLen + 64); - char* dstName = taosMemoryCalloc(1, dLen + 64); + int32_t cap = TMAX(sLen, dLen) + 64; + int32_t nBytes = 0; + + char* srcName = taosMemoryCalloc(1, cap); + char* dstName = taosMemoryCalloc(1, cap); + if (srcName == NULL || dstName == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } // copy file to dst TdDirPtr pDir = taosOpenDir(src); if (pDir == NULL) { - taosMemoryFree(srcName); - taosMemoryFree(dstName); - code = TAOS_SYSTEM_ERROR(errno); - - errno = 0; - return code; + terrno = TAOS_SYSTEM_ERROR(errno); } errno = 0; TdDirEntryPtr de = NULL; - while ((de = taosReadDir(pDir)) != NULL) { char* name = taosGetDirEntryName(de); if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) { continue; } - sprintf(srcName, "%s%s%s", src, TD_DIRSEP, name); - sprintf(dstName, "%s%s%s", dst, TD_DIRSEP, name); + nBytes = snprintf(srcName, 
cap, "%s%s%s", src, TD_DIRSEP, name); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } + + nBytes = snprintf(dstName, cap, "%s%s%s", dst, TD_DIRSEP, name); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } if (strncmp(name, current, strlen(name) <= currLen ? strlen(name) : currLen) == 0) { code = copyFiles_create(srcName, dstName, 0); if (code != 0) { - code = TAOS_SYSTEM_ERROR(code); - stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(code)); + terrno = TAOS_SYSTEM_ERROR(errno); + stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(terrno)); goto _ERROR; } } else if (strncmp(name, info, strlen(name) <= infoLen ? strlen(name) : infoLen) == 0) { code = copyFiles_create(srcName, dstName, 0); if (code != 0) { - code = TAOS_SYSTEM_ERROR(code); - stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(code)); + terrno = TAOS_SYSTEM_ERROR(errno); + stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(terrno)); goto _ERROR; } } else { code = copyFiles_hardlink(srcName, dstName, 0); if (code != 0) { - code = TAOS_SYSTEM_ERROR(code); - stError("failed to hard link file, detail:%s to %s, reason:%s", srcName, dstName, tstrerror(code)); + terrno = TAOS_SYSTEM_ERROR(errno); + stError("failed to hard link file, detail:%s to %s, reason:%s", srcName, dstName, tstrerror(terrno)); goto _ERROR; } else { stDebug("succ hard link file:%s to %s", srcName, dstName); } } - memset(srcName, 0, sLen + 64); - memset(dstName, 0, dLen + 64); + memset(srcName, 0, cap); + memset(dstName, 0, cap); } taosMemoryFreeClear(srcName); taosMemoryFreeClear(dstName); taosCloseDir(&pDir); - errno = 0; return code; _ERROR: taosMemoryFreeClear(srcName); taosMemoryFreeClear(dstName); taosCloseDir(&pDir); - errno = 0; return code; } @@ -568,7 +607,8 @@ static int32_t 
rebuildFromLocalCheckpoint(const char* pTaskIdStr, const char* ch if (code != TSDB_CODE_SUCCESS) { cleanDir(defaultPath, pTaskIdStr); stError("%s failed to start stream backend from local %s, reason:%s, try download checkpoint from remote", - pTaskIdStr, checkpointPath, tstrerror(TAOS_SYSTEM_ERROR(errno))); + pTaskIdStr, checkpointPath, tstrerror(TAOS_SYSTEM_ERROR(terrno))); + terrno = 0; code = TSDB_CODE_SUCCESS; } else { stInfo("%s copy checkpoint data from:%s to:%s succ, try to start stream backend", pTaskIdStr, checkpointPath, From f023e7780cea869b72a26bf0f1983ff1bddc299e Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Sat, 29 Jun 2024 04:33:14 +0000 Subject: [PATCH 15/92] add self check --- source/libs/stream/src/streamCheckpoint.c | 88 +++++++++++++++++------ 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 9948847ecb..69c2ead7d2 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -56,6 +56,13 @@ SStreamDataBlock* createChkptTriggerBlock(SStreamTask* pTask, int32_t checkpoint pBlock->info.childId = pTask->info.selfChildId; pChkpoint->blocks = taosArrayInit(4, sizeof(SSDataBlock)); // pBlock; + if (pChkpoint->blocks == NULL) { + taosMemoryFree(pBlock); + taosFreeQitem(pChkpoint); + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + taosArrayPush(pChkpoint->blocks, pBlock); taosMemoryFree(pBlock); @@ -110,7 +117,12 @@ int32_t streamTaskSendCheckpointTriggerMsg(SStreamTask* pTask, int32_t dstTaskId SRpcHandleInfo* pRpcInfo, int32_t code) { int32_t size = sizeof(SMsgHead) + sizeof(SCheckpointTriggerRsp); - void* pBuf = rpcMallocCont(size); + void* pBuf = rpcMallocCont(size); + if (pBuf == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + SCheckpointTriggerRsp* pRsp = POINTER_SHIFT(pBuf, sizeof(SMsgHead)); ((SMsgHead*)pBuf)->vgId = htonl(downstreamNodeId); @@ -131,6 +143,7 @@ int32_t 
streamTaskSendCheckpointTriggerMsg(SStreamTask* pTask, int32_t dstTaskId SRpcMsg rspMsg = {.code = 0, .pCont = pRsp, .contLen = size, .info = *pRpcInfo}; tmsgSendRsp(&rspMsg); + return 0; } @@ -1006,52 +1019,78 @@ void streamTaskSetTriggerDispatchConfirmed(SStreamTask* pTask, int32_t vgId) { } static int32_t uploadCheckpointToS3(const char* id, const char* path) { + int32_t code = 0; + int32_t nBytes = 0; + + if (s3Init() != 0) { + return -1; + } + TdDirPtr pDir = taosOpenDir(path); if (pDir == NULL) return -1; TdDirEntryPtr de = NULL; - s3Init(); while ((de = taosReadDir(pDir)) != NULL) { char* name = taosGetDirEntryName(de); if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0 || taosDirEntryIsDir(de)) continue; char filename[PATH_MAX] = {0}; if (path[strlen(path) - 1] == TD_DIRSEP_CHAR) { - snprintf(filename, sizeof(filename), "%s%s", path, name); + nBytes = snprintf(filename, sizeof(filename), "%s%s", path, name); + if (nBytes <= 0 || nBytes >= sizeof(filename)) { + code = -1; + break; + } } else { - snprintf(filename, sizeof(filename), "%s%s%s", path, TD_DIRSEP, name); + nBytes = snprintf(filename, sizeof(filename), "%s%s%s", path, TD_DIRSEP, name); + if (nBytes <= 0 || nBytes >= sizeof(filename)) { + code = -1; + break; + } } char object[PATH_MAX] = {0}; - snprintf(object, sizeof(object), "%s%s%s", id, TD_DIRSEP, name); + nBytes = snprintf(object, sizeof(object), "%s%s%s", id, TD_DIRSEP, name); + if (nBytes <= 0 || nBytes >= sizeof(object)) { + code = -1; + break; + } if (s3PutObjectFromFile2(filename, object, 0) != 0) { - taosCloseDir(&pDir); - return -1; + code = -1; + stError("[s3] failed to upload checkpoint:%s", filename); + } else { + stDebug("[s3] upload checkpoint:%s", filename); } - stDebug("[s3] upload checkpoint:%s", filename); - // break; } - taosCloseDir(&pDir); - return 0; + return code; } int32_t downloadCheckpointByNameS3(const char* id, const char* fname, const char* dstName) { - int32_t code = 0; - char* buf = taosMemoryCalloc(1, strlen(id) 
+ strlen(dstName) + 4); + int32_t nBytes; + int32_t cap = strlen(id) + strlen(dstName) + 16; + + char* buf = taosMemoryCalloc(1, cap); if (buf == NULL) { - code = terrno = TSDB_CODE_OUT_OF_MEMORY; - return code; + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + nBytes = snprintf(buf, cap, "%s/%s", id, fname); + if (nBytes <= 0 || nBytes >= cap) { + taosMemoryFree(buf); + terrno = TSDB_CODE_OUT_OF_RANGE; + return -1; } - sprintf(buf, "%s/%s", id, fname); if (s3GetObjectToFile(buf, dstName) != 0) { - code = errno; + taosMemoryFree(buf); + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; } - taosMemoryFree(buf); - return code; + return 0; } ECHECKPOINT_BACKUP_TYPE streamGetCheckpointBackupType() { @@ -1082,6 +1121,7 @@ int32_t streamTaskUploadCheckpoint(const char* id, const char* path) { // fileName: CURRENT int32_t downloadCheckpointDataByName(const char* id, const char* fname, const char* dstName) { if (id == NULL || fname == NULL || strlen(id) == 0 || strlen(fname) == 0 || strlen(fname) >= PATH_MAX) { + terrno = TSDB_CODE_INVALID_PARA; stError("down load checkpoint data parameters invalid"); return -1; } @@ -1125,9 +1165,13 @@ int32_t deleteCheckpoint(const char* id) { int32_t deleteCheckpointFile(const char* id, const char* name) { char object[128] = {0}; - snprintf(object, sizeof(object), "%s/%s", id, name); + + int32_t nBytes = snprintf(object, sizeof(object), "%s/%s", id, name); + if (nBytes <= 0 || nBytes >= sizeof(object)) { + terrno = TSDB_CODE_OUT_OF_RANGE; + return -1; + } char* tmp = object; - s3DeleteObjects((const char**)&tmp, 1); - return 0; + return s3DeleteObjects((const char**)&tmp, 1); } From 7290920c6ffaea6efc9d158e08c1fab3d5d190b3 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Sat, 29 Jun 2024 04:35:54 +0000 Subject: [PATCH 16/92] add self check --- source/dnode/vnode/src/tq/tqStreamStateSnap.c | 7 ++- source/libs/stream/src/streamCheckpoint.c | 7 +++ source/libs/stream/src/streamSnapshot.c | 61 ++++++++++++++++--- 3 files changed, 64 
insertions(+), 11 deletions(-) diff --git a/source/dnode/vnode/src/tq/tqStreamStateSnap.c b/source/dnode/vnode/src/tq/tqStreamStateSnap.c index 290266d94a..a2b9254db7 100644 --- a/source/dnode/vnode/src/tq/tqStreamStateSnap.c +++ b/source/dnode/vnode/src/tq/tqStreamStateSnap.c @@ -67,7 +67,7 @@ int32_t streamStateSnapReaderOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS _err: tqError("vgId:%d, vnode %s snapshot reader failed to open since %s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, - tstrerror(code)); + tstrerror(terrno)); *ppReader = NULL; return code; } @@ -145,14 +145,15 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS goto _err; } - tqDebug("vgId:%d, vnode %s snapshot writer opened, path:%s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, pTq->pStreamMeta->path); + tqDebug("vgId:%d, vnode %s snapshot writer opened, path:%s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, + pTq->pStreamMeta->path); pWriter->pWriterImpl = pSnapWriter; *ppWriter = pWriter; return code; _err: tqError("vgId:%d, vnode %s snapshot writer failed to open since %s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, - tstrerror(code)); + tstrerror(terrno)); taosMemoryFree(pWriter); *ppWriter = NULL; return -1; diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 69c2ead7d2..bc5067d4d6 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -813,6 +813,11 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { SArray* pList = pTask->upstreamInfo.pList; ASSERT(pTask->info.taskLevel > TASK_LEVEL__SOURCE); SArray* pNotSendList = taosArrayInit(4, sizeof(SStreamUpstreamEpInfo)); + if (pNotSendList == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + stDebug("s-task:%s start to triggerMonitor, reason:%s", id, tstrerror(terrno)); + return; + } for (int32_t i = 0; i < taosArrayGetSize(pList); ++i) { SStreamUpstreamEpInfo* pInfo = taosArrayGetP(pList, i); @@ 
-1057,6 +1062,7 @@ static int32_t uploadCheckpointToS3(const char* id, const char* path) { } if (s3PutObjectFromFile2(filename, object, 0) != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); code = -1; stError("[s3] failed to upload checkpoint:%s", filename); } else { @@ -1152,6 +1158,7 @@ int32_t streamTaskDownloadCheckpointData(const char* id, char* path) { int32_t deleteCheckpoint(const char* id) { if (id == NULL || strlen(id) == 0) { + terrno = TSDB_CODE_INVALID_PARA; stError("deleteCheckpoint parameters invalid"); return -1; } diff --git a/source/libs/stream/src/streamSnapshot.c b/source/libs/stream/src/streamSnapshot.c index 868ff002bf..7ef4e8ec09 100644 --- a/source/libs/stream/src/streamSnapshot.c +++ b/source/libs/stream/src/streamSnapshot.c @@ -130,7 +130,7 @@ int32_t streamGetFileSize(char* path, char* name, int64_t* sz) { int32_t ret = 0; char* fullname = taosMemoryCalloc(1, strlen(path) + 32); - + sprintf(fullname, "%s%s%s", path, TD_DIRSEP, name); ret = taosStatFile(fullname, sz, NULL, NULL); @@ -259,17 +259,33 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { } int32_t streamBackendSnapInitFile(char* metaPath, SStreamTaskSnap* pSnap, SBackendSnapFile2* pSnapFile) { int32_t code = -1; + int32_t nBytes = 0; + int32_t cap = strlen(pSnap->dbPrefixPath) + 256; + + char* path = taosMemoryCalloc(1, cap); + if (path == NULL) { + return -1; + } + + nBytes = snprintf(path, cap, "%s%s%s%s%s%" PRId64 "", pSnap->dbPrefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, + "checkpoint", pSnap->chkpId); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _ERROR; + } - char* path = taosMemoryCalloc(1, strlen(pSnap->dbPrefixPath) + 256); - // char idstr[64] = {0}; - sprintf(path, "%s%s%s%s%s%" PRId64 "", pSnap->dbPrefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", - pSnap->chkpId); if (!taosIsDir(path)) { + terrno = TSDB_CODE_INVALID_MSG; goto _ERROR; } pSnapFile->pSst = taosArrayInit(16, sizeof(void*)); pSnapFile->pFileList = 
taosArrayInit(64, sizeof(SBackendFileItem)); + if (pSnapFile->pSst == NULL || pSnapFile->pFileList == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _ERROR; + } + pSnapFile->path = path; pSnapFile->snapInfo = *pSnap; if ((code = snapFileReadMeta(pSnapFile)) != 0) { @@ -313,8 +329,15 @@ void snapFileDestroy(SBackendSnapFile2* pSnap) { } int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta) { // impl later + int32_t code = 0; + SArray* pSnapInfoSet = taosArrayInit(4, sizeof(SStreamTaskSnap)); - int32_t code = streamCreateTaskDbSnapInfo(pMeta, path, pSnapInfoSet); + if (pSnapInfoSet == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + code = streamCreateTaskDbSnapInfo(pMeta, path, pSnapInfoSet); if (code != 0) { stError("failed to do task db snap info, reason:%s", tstrerror(terrno)); taosArrayDestroy(pSnapInfoSet); @@ -322,6 +345,11 @@ int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta } SArray* pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); + if (pDbSnapSet == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosArrayDestroy(pSnapInfoSet); + return -1; + } for (int32_t i = 0; i < taosArrayGetSize(pSnapInfoSet); i++) { SStreamTaskSnap* pSnap = taosArrayGet(pSnapInfoSet, i); @@ -369,7 +397,8 @@ int32_t streamSnapReaderOpen(void* pMeta, int64_t sver, int64_t chkpId, char* pa // impl later SStreamSnapReader* pReader = taosMemoryCalloc(1, sizeof(SStreamSnapReader)); if (pReader == NULL) { - return TSDB_CODE_OUT_OF_MEMORY; + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; } if (streamSnapHandleInit(&pReader->handle, (char*)path, pMeta) < 0) { @@ -501,11 +530,27 @@ int32_t streamSnapWriterOpen(void* pMeta, int64_t sver, int64_t ever, char* path SStreamSnapHandle* pHandle = &pWriter->handle; pHandle->currIdx = 0; + pHandle->metaPath = taosStrdup(path); + if (pHandle->metaPath == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosMemoryFree(pWriter); + } + pHandle->pDbSnapSet = taosArrayInit(8, 
sizeof(SBackendSnapFile2)); + if (pHandle->pDbSnapSet == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosMemoryFree(pHandle->metaPath); + taosMemoryFree(pWriter); + return -1; + } SBackendSnapFile2 snapFile = {0}; - taosArrayPush(pHandle->pDbSnapSet, &snapFile); + if (taosArrayPush(pHandle->pDbSnapSet, &snapFile) == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + streamSnapWriterClose(pWriter, 0); + return -1; + } *ppWriter = pWriter; return 0; From 6c6bff611a01cd591a29407cfca3d0c2acc8bdaa Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Sat, 29 Jun 2024 12:56:36 +0000 Subject: [PATCH 17/92] add self check --- source/dnode/vnode/src/tq/tqStreamStateSnap.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/source/dnode/vnode/src/tq/tqStreamStateSnap.c b/source/dnode/vnode/src/tq/tqStreamStateSnap.c index a2b9254db7..655778568b 100644 --- a/source/dnode/vnode/src/tq/tqStreamStateSnap.c +++ b/source/dnode/vnode/src/tq/tqStreamStateSnap.c @@ -75,7 +75,7 @@ _err: int32_t streamStateSnapReaderClose(SStreamStateReader* pReader) { int32_t code = 0; tqDebug("vgId:%d, vnode %s snapshot reader closed", TD_VID(pReader->pTq->pVnode), STREAM_STATE_TRANSFER); - streamSnapReaderClose(pReader->pReaderImpl); + code = streamSnapReaderClose(pReader->pReaderImpl); taosMemoryFree(pReader); return code; } @@ -138,7 +138,12 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS pWriter->sver = sver; pWriter->ever = ever; - taosMkDir(pTq->pStreamMeta->path); + if (taosMkDir(pTq->pStreamMeta->path) != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + tqError("vgId:%d, vnode %s snapshot writer failed to create directory %s since %s", TD_VID(pTq->pVnode), + STREAM_STATE_TRANSFER, pTq->pStreamMeta->path, tstrerror(terrno)); + goto _err; + } SStreamSnapWriter* pSnapWriter = NULL; if (streamSnapWriterOpen(pTq, sver, ever, pTq->pStreamMeta->path, &pSnapWriter) < 0) { @@ -151,6 +156,7 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, 
int64_t ever, SStreamS *ppWriter = pWriter; return code; + _err: tqError("vgId:%d, vnode %s snapshot writer failed to open since %s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, tstrerror(terrno)); @@ -160,11 +166,8 @@ _err: } int32_t streamStateSnapWriterClose(SStreamStateWriter* pWriter, int8_t rollback) { - int32_t code = 0; tqDebug("vgId:%d, vnode %s snapshot writer closed", TD_VID(pWriter->pTq->pVnode), STREAM_STATE_TRANSFER); - code = streamSnapWriterClose(pWriter->pWriterImpl, rollback); - - return code; + return streamSnapWriterClose(pWriter->pWriterImpl, rollback); } int32_t streamStateSnapWrite(SStreamStateWriter* pWriter, uint8_t* pData, uint32_t nData) { From d69c5b1840cfdd707c2ac3886290ab0dceb468d3 Mon Sep 17 00:00:00 2001 From: xjzhou Date: Tue, 2 Jul 2024 10:18:56 +0800 Subject: [PATCH 18/92] isStmtBind --- source/libs/parser/src/parInsertSql.c | 16 ++- tests/taosc_test/taoscTest.cpp | 194 +++++++++++++++++++++++--- 2 files changed, 180 insertions(+), 30 deletions(-) diff --git a/source/libs/parser/src/parInsertSql.c b/source/libs/parser/src/parInsertSql.c index 9393a62e26..b053cd95a0 100644 --- a/source/libs/parser/src/parInsertSql.c +++ b/source/libs/parser/src/parInsertSql.c @@ -30,6 +30,7 @@ typedef struct SInsertParseContext { bool forceUpdate; bool needTableTagVal; bool needRequest; // whether or not request server + bool isStmtBind; // whether is stmt } SInsertParseContext; typedef int32_t (*_row_append_fn_t)(SMsgBuf* pMsgBuf, const void* value, int32_t len, void* param); @@ -1978,7 +1979,6 @@ static int32_t parseOneStbRow(SInsertParseContext* pCxt, SVnodeModifyOpStmt* pSt static int parseOneRow(SInsertParseContext* pCxt, const char** pSql, STableDataCxt* pTableCxt, bool* pGotRow, SToken* pToken) { SBoundColInfo* pCols = &pTableCxt->boundColsInfo; - bool isParseBindParam = false; SSchema* pSchemas = getTableColumnSchema(pTableCxt->pMeta); int32_t code = TSDB_CODE_SUCCESS; @@ -1996,7 +1996,7 @@ static int parseOneRow(SInsertParseContext* pCxt, 
const char** pSql, STableDataC SColVal* pVal = taosArrayGet(pTableCxt->pValues, pCols->pColIndex[i]); if (pToken->type == TK_NK_QUESTION) { - isParseBindParam = true; + pCxt->isStmtBind = true; if (NULL == pCxt->pComCxt->pStmtCb) { code = buildSyntaxErrMsg(&pCxt->msg, "? only used in stmt", pToken->z); break; @@ -2007,8 +2007,8 @@ static int parseOneRow(SInsertParseContext* pCxt, const char** pSql, STableDataC break; } - if (isParseBindParam) { - code = buildInvalidOperationMsg(&pCxt->msg, "no mix usage for ? and values"); + if (pCxt->isStmtBind) { + code = buildInvalidOperationMsg(&pCxt->msg, "stmt bind param does not support normal value in sql"); break; } @@ -2025,7 +2025,7 @@ static int parseOneRow(SInsertParseContext* pCxt, const char** pSql, STableDataC } } - if (TSDB_CODE_SUCCESS == code && !isParseBindParam) { + if (TSDB_CODE_SUCCESS == code && !pCxt->isStmtBind) { SRow** pRow = taosArrayReserve(pTableCxt->pData->aRowP, 1); code = tRowBuild(pTableCxt->pValues, pTableCxt->pSchema, pRow); if (TSDB_CODE_SUCCESS == code) { @@ -2035,7 +2035,7 @@ static int parseOneRow(SInsertParseContext* pCxt, const char** pSql, STableDataC } } - if (TSDB_CODE_SUCCESS == code && !isParseBindParam) { + if (TSDB_CODE_SUCCESS == code && !pCxt->isStmtBind) { *pGotRow = true; } @@ -2410,6 +2410,7 @@ static int32_t checkTableClauseFirstToken(SInsertParseContext* pCxt, SVnodeModif } if (TK_NK_QUESTION == pTbName->type) { + pCxt->isStmtBind = true; if (NULL == pCxt->pComCxt->pStmtCb) { return buildSyntaxErrMsg(&pCxt->msg, "? only used in stmt", pTbName->z); } @@ -2935,7 +2936,8 @@ int32_t parseInsertSql(SParseContext* pCxt, SQuery** pQuery, SCatalogReq* pCatal .missCache = false, .usingDuplicateTable = false, .needRequest = true, - .forceUpdate = (NULL != pCatalogReq ? pCatalogReq->forceUpdate : false)}; + .forceUpdate = (NULL != pCatalogReq ? 
pCatalogReq->forceUpdate : false), + .isStmtBind = false}; int32_t code = initInsertQuery(&context, pCatalogReq, pMetaData, pQuery); if (TSDB_CODE_SUCCESS == code) { diff --git a/tests/taosc_test/taoscTest.cpp b/tests/taosc_test/taoscTest.cpp index 3f49b11b70..d3f6f50547 100644 --- a/tests/taosc_test/taoscTest.cpp +++ b/tests/taosc_test/taoscTest.cpp @@ -32,29 +32,29 @@ class taoscTest : public ::testing::Test { protected: static void SetUpTestCase() { - printf("start test setup.\n"); - TAOS* taos = taos_connect("localhost", "root", "taosdata", NULL, 0); - ASSERT_TRUE(taos != nullptr); - - TAOS_RES* res = taos_query(taos, "drop database IF EXISTS taosc_test_db;"); - if (taos_errno(res) != 0) { - printf("error in drop database taosc_test_db, reason:%s\n", taos_errstr(res)); - return; - } - taosSsleep(5); - taos_free_result(res); - printf("drop database taosc_test_db,finished.\n"); - - res = taos_query(taos, "create database taosc_test_db;"); - if (taos_errno(res) != 0) { - printf("error in create database taosc_test_db, reason:%s\n", taos_errstr(res)); - return; - } - taosSsleep(5); - taos_free_result(res); - printf("create database taosc_test_db,finished.\n"); - - taos_close(taos); +// printf("start test setup.\n"); +// TAOS* taos = taos_connect("localhost", "root", "taosdata", NULL, 0); +// ASSERT_TRUE(taos != nullptr); +// +// TAOS_RES* res = taos_query(taos, "drop database IF EXISTS taosc_test_db;"); +// if (taos_errno(res) != 0) { +// printf("error in drop database taosc_test_db, reason:%s\n", taos_errstr(res)); +// return; +// } +// taosSsleep(5); +// taos_free_result(res); +// printf("drop database taosc_test_db,finished.\n"); +// +// res = taos_query(taos, "create database taosc_test_db;"); +// if (taos_errno(res) != 0) { +// printf("error in create database taosc_test_db, reason:%s\n", taos_errstr(res)); +// return; +// } +// taosSsleep(5); +// taos_free_result(res); +// printf("create database taosc_test_db,finished.\n"); +// +// taos_close(taos); } static 
void TearDownTestCase() {} @@ -99,6 +99,154 @@ void queryCallback(void* param, void* res, int32_t code) { taos_fetch_raw_block_a(res, fetchCallback, param); } +/** + * @brief execute sql only. + * + * @param taos + * @param sql + */ +void executeSQL(TAOS *taos, const char *sql) { + TAOS_RES *res = taos_query(taos, sql); + int code = taos_errno(res); + if (code != 0) { + printf("%s\n", taos_errstr(res)); + taos_free_result(res); + taos_close(taos); + exit(EXIT_FAILURE); + } + taos_free_result(res); +} + +/** + * @brief check return status and exit program when error occur. + * + * @param stmt + * @param code + * @param msg + */ +void checkErrorCode(TAOS_STMT *stmt, int code, const char* msg) { + if (code != 0) { + printf("%s. error: %s\n", msg, taos_stmt_errstr(stmt)); + taos_stmt_close(stmt); + exit(EXIT_FAILURE); + } +} + +typedef struct { + int64_t ts; + float current; + int voltage; + float phase; +} Row; + + +/** + * @brief insert data using stmt API + * + * @param taos + */ +void insertData(TAOS *taos) { + // init + TAOS_STMT *stmt = taos_stmt_init(taos); + // prepare +// const char *sql = "INSERT INTO ?.d1001 USING meters TAGS(?, ?) values(?, ?, ?, ?)"; +// const char *sql = "INSERT INTO ?.? USING meters TAGS(?, ?) values(?, ?, ?, ?)"; +// const char *sql = "INSERT INTO power.? USING meters TAGS(?, ?) values(?, ?, ?, ?)"; +// const char *sql = "INSERT INTO ? USING meters TAGS(?, ?) values(?, ?, ?, ?)"; +// const char *sql = "INSERT INTO ? USING meters TAGS(?, ?) values(?, ?, ?, ?)"; + const char *sql = "insert into huawei USING meters TAGS(?, ?) 
values(?, ?, ?, ?)"; + int code = taos_stmt_prepare(stmt, sql, 0); + checkErrorCode(stmt, code, "failed to execute taos_stmt_prepare"); + // bind table name and tags + TAOS_MULTI_BIND tags[2]; + char *location = "California.SanFrancisco"; + int groupId = 2; + tags[0].buffer_type = TSDB_DATA_TYPE_BINARY; + tags[0].buffer_length = strlen(location); + tags[0].length = (int32_t *)&tags[0].buffer_length; + tags[0].buffer = location; + tags[0].is_null = NULL; + + tags[1].buffer_type = TSDB_DATA_TYPE_INT; + tags[1].buffer_length = sizeof(int); + tags[1].length = (int32_t *)&tags[1].buffer_length; + tags[1].buffer = &groupId; + tags[1].is_null = NULL; + +// code = taos_stmt_set_tbname_tags(stmt, "duck", tags); +// checkErrorCode(stmt, code, "failed to execute taos_stmt_set_dbname_tbname_tags"); + + // insert two rows with multi binds + TAOS_MULTI_BIND params[4]; + // values to bind + int64_t ts[] = {1648432611250, 1648432611778}; + float current[] = {10.3, 12.6}; + int voltage[] = {219, 218}; + float phase[] = {0.31, 0.33}; + // is_null array + char is_null[2] = {0}; + // length array + int32_t int64Len[2] = {sizeof(int64_t)}; + int32_t floatLen[2] = {sizeof(float)}; + int32_t intLen[2] = {sizeof(int)}; + + params[0].buffer_type = TSDB_DATA_TYPE_TIMESTAMP; + params[0].buffer_length = sizeof(int64_t); + params[0].buffer = ts; + params[0].length = int64Len; + params[0].is_null = is_null; + params[0].num = 2; + + params[1].buffer_type = TSDB_DATA_TYPE_FLOAT; + params[1].buffer_length = sizeof(float); + params[1].buffer = current; + params[1].length = floatLen; + params[1].is_null = is_null; + params[1].num = 2; + + params[2].buffer_type = TSDB_DATA_TYPE_INT; + params[2].buffer_length = sizeof(int); + params[2].buffer = voltage; + params[2].length = intLen; + params[2].is_null = is_null; + params[2].num = 2; + + params[3].buffer_type = TSDB_DATA_TYPE_FLOAT; + params[3].buffer_length = sizeof(float); + params[3].buffer = phase; + params[3].length = floatLen; + params[3].is_null 
= is_null; + params[3].num = 2; + + code = taos_stmt_bind_param_batch(stmt, params); // bind batch + checkErrorCode(stmt, code, "failed to execute taos_stmt_bind_param_batch"); + code = taos_stmt_add_batch(stmt); // add batch + checkErrorCode(stmt, code, "failed to execute taos_stmt_add_batch"); + // execute + code = taos_stmt_execute(stmt); + checkErrorCode(stmt, code, "failed to execute taos_stmt_execute"); + int affectedRows = taos_stmt_affected_rows(stmt); + printf("successfully inserted %d rows\n", affectedRows); + + // close + taos_stmt_close(stmt); +} + +TEST_F(taoscTest, taos_stmt_test) { + TAOS *taos = taos_connect("localhost", "root", "taosdata", NULL, 6030); + if (taos == NULL) { + printf("failed to connect to server"); + exit(EXIT_FAILURE); + } +// executeSQL(taos, "drop database if exists power"); +// executeSQL(taos, "create database power"); + executeSQL(taos, "use power"); +// executeSQL(taos, "create stable meters (ts timestamp, current float, voltage int, phase float) tags (location binary(64), groupId int)"); + insertData(taos); + taos_close(taos); + taos_cleanup(); +} + TEST_F(taoscTest, taos_query_a_test) { char sql[1024] = {0}; int32_t code = 0; From 88aa15e944f88c9b44d148fcc2efc07520778f7b Mon Sep 17 00:00:00 2001 From: xjzhou Date: Tue, 2 Jul 2024 11:05:54 +0800 Subject: [PATCH 19/92] enh: Enhance error handling for stmt --- include/libs/parser/parser.h | 1 + source/client/inc/clientInt.h | 1 + source/client/src/clientImpl.c | 4 +++- source/client/src/clientStmt.c | 1 + source/libs/parser/src/parInsertSql.c | 2 +- 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/libs/parser/parser.h b/include/libs/parser/parser.h index ad41b9a542..3ac357055e 100644 --- a/include/libs/parser/parser.h +++ b/include/libs/parser/parser.h @@ -89,6 +89,7 @@ typedef struct SParseContext { bool isView; bool isAudit; bool nodeOffline; + bool isStmtBind; const char* svrVer; SArray* pTableMetaPos; // sql table pos => catalog data pos SArray* 
pTableVgroupPos; // sql table pos => catalog data pos diff --git a/source/client/inc/clientInt.h b/source/client/inc/clientInt.h index 7a84215e12..d05abb2051 100644 --- a/source/client/inc/clientInt.h +++ b/source/client/inc/clientInt.h @@ -283,6 +283,7 @@ typedef struct SRequestObj { bool inRetry; bool isSubReq; bool inCallback; + bool isStmtBind; // is statement bind parameter uint32_t prevCode; // previous error code: todo refactor, add update flag for catalog uint32_t retry; int64_t allocatorRefId; diff --git a/source/client/src/clientImpl.c b/source/client/src/clientImpl.c index 11d3797157..080e2dc32a 100644 --- a/source/client/src/clientImpl.c +++ b/source/client/src/clientImpl.c @@ -206,6 +206,7 @@ int32_t buildRequest(uint64_t connId, const char* sql, int sqlLen, void* param, (*pRequest)->sqlstr[sqlLen] = 0; (*pRequest)->sqlLen = sqlLen; (*pRequest)->validateOnly = validateSql; + (*pRequest)->isStmtBind = false; ((SSyncQueryParam*)(*pRequest)->body.interParam)->userParam = param; @@ -266,7 +267,8 @@ int32_t parseSql(SRequestObj* pRequest, bool topicQuery, SQuery** pQuery, SStmtC .isSuperUser = (0 == strcmp(pTscObj->user, TSDB_DEFAULT_USER)), .enableSysInfo = pTscObj->sysInfo, .svrVer = pTscObj->sVer, - .nodeOffline = (pTscObj->pAppInfo->onlineDnodes < pTscObj->pAppInfo->totalDnodes)}; + .nodeOffline = (pTscObj->pAppInfo->onlineDnodes < pTscObj->pAppInfo->totalDnodes), + .isStmtBind = pRequest->isStmtBind}; cxt.mgmtEpSet = getEpSet_s(&pTscObj->pAppInfo->mgmtEp); int32_t code = catalogGetHandle(pTscObj->pAppInfo->clusterId, &cxt.pCatalog); diff --git a/source/client/src/clientStmt.c b/source/client/src/clientStmt.c index e8b76d34c2..38a16d8fbd 100644 --- a/source/client/src/clientStmt.c +++ b/source/client/src/clientStmt.c @@ -72,6 +72,7 @@ static int32_t stmtCreateRequest(STscStmt* pStmt) { } if (TSDB_CODE_SUCCESS == code) { pStmt->exec.pRequest->syncQuery = true; + pStmt->exec.pRequest->isStmtBind = true; } } diff --git 
a/source/libs/parser/src/parInsertSql.c b/source/libs/parser/src/parInsertSql.c index b053cd95a0..7af376f21c 100644 --- a/source/libs/parser/src/parInsertSql.c +++ b/source/libs/parser/src/parInsertSql.c @@ -30,7 +30,7 @@ typedef struct SInsertParseContext { bool forceUpdate; bool needTableTagVal; bool needRequest; // whether or not request server - bool isStmtBind; // whether is stmt + bool isStmtBind; // whether is stmt bind } SInsertParseContext; typedef int32_t (*_row_append_fn_t)(SMsgBuf* pMsgBuf, const void* value, int32_t len, void* param); From ca1562a990059a891e1b893778a05e9c1363d485 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Tue, 2 Jul 2024 08:23:56 +0000 Subject: [PATCH 20/92] add self check --- source/libs/stream/inc/streamBackendRocksdb.h | 3 + source/libs/stream/src/streamBackendRocksdb.c | 228 ++++++++++++------ source/libs/stream/src/streamCheckpoint.c | 38 +-- 3 files changed, 162 insertions(+), 107 deletions(-) diff --git a/source/libs/stream/inc/streamBackendRocksdb.h b/source/libs/stream/inc/streamBackendRocksdb.h index 24cd861550..e4c5787020 100644 --- a/source/libs/stream/inc/streamBackendRocksdb.h +++ b/source/libs/stream/inc/streamBackendRocksdb.h @@ -131,6 +131,8 @@ typedef struct { TdThreadRwlock rwLock; } SBkdMgt; +#define META_ON_S3_FORMATE "%s_%" PRId64 "\n%s_%" PRId64 "\n%s_%" PRId64 "" + bool streamBackendDataIsExist(const char* path, int64_t chkpId, int32_t vgId); void* streamBackendInit(const char* path, int64_t chkpId, int32_t vgId); void streamBackendCleanup(void* arg); @@ -258,6 +260,7 @@ void bkdMgtDestroy(SBkdMgt* bm); int32_t taskDbGenChkpUploadData(void* arg, void* bkdMgt, int64_t chkpId, int8_t type, char** path, SArray* list, const char* id); +int32_t remoteChkpGetDelFile(char* path, SArray* toDel); void* taskAcquireDb(int64_t refId); void taskReleaseDb(int64_t refId); diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 1042e6dfc9..0074251669 100644 --- 
a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -19,8 +19,6 @@ #include "tcommon.h" #include "tref.h" -#define META_ON_S3_FORMATE "%s_%" PRId64 "\n%s_%" PRId64 "\n%s_%" PRId64 "" - typedef struct SCompactFilteFactory { void* status; } SCompactFilteFactory; @@ -152,6 +150,9 @@ static rocksdb_iterator_t* streamStateIterCreate(SStreamState* pState, const cha void taskDbRefChkp(STaskDbWrapper* pTaskDb, int64_t chkp); void taskDbUnRefChkp(STaskDbWrapper* pTaskDb, int64_t chkp); +int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId); +int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId); + #define GEN_COLUMN_FAMILY_NAME(name, idstr, SUFFIX) sprintf(name, "%s_%s", idstr, (SUFFIX)); int32_t copyFiles(const char* src, const char* dst); uint32_t nextPow2(uint32_t x); @@ -286,7 +287,7 @@ int32_t remoteChkp_readMetaData(char* path, SSChkpMetaOnS3** pMeta) { return -1; } - int32_t n = sprintf(metaPath, "%s%s%s", path, TD_DIRSEP, "META"); + int32_t n = snprintf(metaPath, cap, "%s%s%s", path, TD_DIRSEP, "META"); if (n <= 0 || n >= cap) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(metaPath); @@ -317,6 +318,12 @@ int32_t remoteChkp_readMetaData(char* path, SSChkpMetaOnS3** pMeta) { taosMemoryFree(p); goto _EXIT; } + + if (p->currChkptId != p->manifestChkptId) { + terrno = TSDB_CODE_INVALID_MSG; + taosMemoryFree(p); + goto _EXIT; + } *pMeta = p; code = 0; _EXIT: @@ -324,66 +331,100 @@ _EXIT: taosMemoryFree(metaPath); return code; } -int32_t remoteChkp_validMetaFile(char* name, char* prename, int64_t chkpId) { - int8_t valid = 0; - for (int i = 0; i < strlen(name); i++) { - if (name[i] == '_') { - memcpy(prename, name, i); - if (taosStr2int64(name + i + 1) != chkpId) { - break; - } else { - valid = 1; - } - } - } - return valid; -} + int32_t remoteChkp_validAndCvtMeta(char* path, SSChkpMetaOnS3* pMeta, int64_t chkpId) { - int32_t complete = 1; - int32_t len = strlen(path) 
+ 32; - char* src = taosMemoryCalloc(1, len); - char* dst = taosMemoryCalloc(1, len); + int32_t code = -1; + int32_t nBytes = 0; + int32_t cap = strlen(path) + 64; + char* src = taosMemoryCalloc(1, cap); + char* dst = taosMemoryCalloc(1, cap); if (src == NULL || dst == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return code; } if (pMeta->currChkptId != chkpId || pMeta->manifestChkptId != chkpId) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + terrno = TSDB_CODE_INVALID_CFG; + return code; } + // rename current_chkp/mainfest to current + for (int i = 0; i < 2; i++) { + char* key = (i == 0 ? pMeta->pCurrName : pMeta->pManifestName); + if (strlen(key) <= 0) { + terrno = TSDB_CODE_INVALID_PARA; + } - // for (int i = 0; i < taosArrayGetSize(list); i++) { - // char* p = taosArrayGetP(list, i); - // sprintf(src, "%s%s%s", path, TD_DIRSEP, p); + nBytes = snprintf(src, cap, "%s%s%s_%" PRId64 "", path, TD_DIRSEP, key, pMeta->currChkptId); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _EXIT; + } - // // check file exist - // if (taosStatFile(src, NULL, NULL, NULL) != 0) { - // complete = 0; - // break; - // } + if (taosStatFile(src, NULL, NULL, NULL) != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + goto _EXIT; + } - // // check file name - // char temp[64] = {0}; - // if (remoteChkp_validMetaFile(p, temp, chkpId)) { - // count++; - // } + nBytes = snprintf(dst, cap, "%s%s%s", path, TD_DIRSEP, key); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + goto _EXIT; + } - // // rename file - // sprintf(dst, "%s%s%s", path, TD_DIRSEP, temp); - // taosRenameFile(src, dst); + if (taosRenameFile(src, dst) != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + goto _EXIT; + } - // memset(src, 0, len); - // memset(dst, 0, len); - // } - // if (count != taosArrayGetSize(list)) { - // complete = 0; - // } + memset(src, 0, cap); + memset(dst, 0, cap); + } + code = 0; +// rename manifest_chkp to manifest +_EXIT: 
taosMemoryFree(src); taosMemoryFree(dst); + return code; +} +int32_t remoteChkpGetDelFile(char* path, SArray* toDel) { + int32_t code = -1; + int32_t nBytes = 0; - return complete == 1 ? 0 : -1; + SSChkpMetaOnS3* pMeta = NULL; + code = remoteChkp_readMetaData(path, &pMeta); + if (code != 0) { + return code; + } + + for (int i = 0; i < 2; i++) { + char* key = (i == 0 ? pMeta->pCurrName : pMeta->pManifestName); + + int32_t cap = strlen(key) + 32; + char* p = taosMemoryCalloc(1, cap); + if (p == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosMemoryFree(pMeta); + return -1; + } + + nBytes = snprintf(p, cap, "%s_%" PRId64 "", key, pMeta->currChkptId); + if (nBytes <= 0 || nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + taosMemoryFree(pMeta); + taosMemoryFree(p); + return code; + } + if (taosArrayPush(toDel, &p) == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosMemoryFree(pMeta); + taosMemoryFree(p); + return code; + } + } + code = 0; + + return code; } void cleanDir(const char* pPath, const char* id) { @@ -424,56 +465,91 @@ int32_t rebuildFromRemoteChkp_rsync(const char* key, char* checkpointPath, int64 return backendCopyFiles(checkpointPath, defaultPath); } +int32_t rebuildDataFromS3(char* chkpPath, int64_t chkpId) { + SSChkpMetaOnS3* pMeta = NULL; + + int32_t code = remoteChkp_readMetaData(chkpPath, &pMeta); + if (code != 0) { + return -1; + } + + if (pMeta->currChkptId != chkpId || pMeta->manifestChkptId != chkpId) { + taosMemoryFree(pMeta); + terrno = TSDB_CODE_INVALID_PARA; + return -1; + } + + code = remoteChkp_validAndCvtMeta(chkpPath, pMeta, chkpId); + if (code != 0) { + taosMemoryFree(pMeta); + return -1; + } + + return chkpAddExtraInfo(chkpPath, chkpId, pMeta->processId); +} + int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId, char* defaultPath) { + int8_t rename = 0; int32_t code = streamTaskDownloadCheckpointData(key, chkpPath); if (code != 0) { return code; } - int32_t nBytes; int32_t cap = strlen(defaultPath) + 
32; - char* tmp = taosMemoryCalloc(1, cap); - if (tmp == NULL) { + char* defaultTmp = taosMemoryCalloc(1, cap); + if (defaultTmp == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; return -1; } - nBytes = snprintf(tmp, cap, "%s%s", defaultPath, "_tmp"); + int32_t nBytes = snprintf(defaultPath, cap, "%s%s", defaultPath, "_tmp"); if (nBytes <= 0 || nBytes >= cap) { terrno = TSDB_CODE_OUT_OF_RANGE; - taosMemoryFree(tmp); + taosMemoryFree(defaultPath); return -1; } - if (taosIsDir(tmp)) taosRemoveDir(tmp); - if (taosIsDir(defaultPath)) taosRenameFile(defaultPath, tmp); - - // SArray* list = taosArrayInit(2, sizeof(void*)); - SSChkpMetaOnS3* pMeta = NULL; - code = remoteChkp_readMetaData(chkpPath, &pMeta); - if (code == 0) code = remoteChkp_validAndCvtMeta(chkpPath, pMeta, chkpId); - - taosMemoryFree(pMeta); - - if (code == 0) { - code = taosMkDir(defaultPath); - } - - if (code == 0) { - code = backendCopyFiles(chkpPath, defaultPath); - } - - if (code != 0) { - if (taosIsDir(defaultPath)) taosRemoveDir(defaultPath); - if (taosIsDir(tmp)) { - code = taosRenameFile(tmp, defaultPath); + if (taosIsDir(defaultTmp)) taosRemoveDir(defaultTmp); + if (taosIsDir(defaultPath)) { + code = taosRenameFile(defaultPath, defaultTmp); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + goto _EXIT; + } else { + rename = 1; } } else { - taosRemoveDir(tmp); + code = taosMkDir(defaultPath); + if (code != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + goto _EXIT; + } } - taosMemoryFree(tmp); + code = rebuildDataFromS3(chkpPath, chkpId); + if (code != 0) { + goto _EXIT; + } + + code = backendCopyFiles(chkpPath, defaultPath); + if (code != 0) { + goto _EXIT; + } + code = 0; + +_EXIT: + if (code != 0) { + if (rename) { + taosRenameFile(defaultTmp, defaultPath); + } + } + + if (taosIsDir(defaultPath)) { + taosRemoveDir(defaultPath); + } + + taosMemoryFree(defaultTmp); return code; } diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 
bc5067d4d6..8b75e74d3b 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -542,7 +542,7 @@ void streamTaskSetFailedCheckpointId(SStreamTask* pTask) { static int32_t getCheckpointDataMeta(const char* id, const char* path, SArray* list) { TdFilePtr pFile = NULL; - int32_t cap = strlen(path) + 32; + int32_t cap = strlen(path) + 64; char buf[128] = {0}; int32_t code = 0; @@ -553,7 +553,7 @@ static int32_t getCheckpointDataMeta(const char* id, const char* path, SArray* l } int32_t nBytes = snprintf(filePath, cap, "%s%s%s", path, TD_DIRSEP, "META_TMP"); - if (nBytes != strlen(filePath)) { + if (nBytes <= 0 || nBytes >= cap) { taosMemoryFree(filePath); terrno = TSDB_CODE_OUT_OF_RANGE; return -1; @@ -561,41 +561,17 @@ static int32_t getCheckpointDataMeta(const char* id, const char* path, SArray* l code = downloadCheckpointDataByName(id, "META", filePath); if (code != 0) { - stDebug("%s chkp failed to download meta file:%s", id, filePath); + stError("%s chkp failed to download meta file:%s", id, filePath); taosMemoryFree(filePath); return code; } - pFile = taosOpenFile(filePath, TD_FILE_READ); - if (pFile == NULL) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("%s failed to open meta file:%s for checkpoint", id, filePath); + code = remoteChkpGetDelFile(filePath, list); + if (code != 0) { + stError("%s chkp failed to get to del:%s", id, filePath); taosMemoryFree(filePath); - return -1; } - - if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { - stError("%s failed to read meta file:%s for checkpoint", id, filePath); - code = -1; - } else { - int32_t len = strnlen(buf, tListLen(buf)); - for (int i = 0; i < len; i++) { - if (buf[i] == '\n') { - char* item = taosMemoryCalloc(1, i + 1); - memcpy(item, buf, i); - taosArrayPush(list, &item); - - item = taosMemoryCalloc(1, len - i); - memcpy(item, buf + i + 1, len - i - 1); - taosArrayPush(list, &item); - } - } - } - - taosCloseFile(&pFile); - taosRemoveFile(filePath); - 
taosMemoryFree(filePath); - return code; + return 0; } int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t dbRefId, ECHECKPOINT_BACKUP_TYPE type) { From 67217a9bed837cc1dcd05ab35e03a3823523845a Mon Sep 17 00:00:00 2001 From: xjzhou Date: Tue, 2 Jul 2024 18:01:54 +0800 Subject: [PATCH 21/92] Return an error early when an error has already occurred in stmt --- source/client/src/clientStmt.c | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/source/client/src/clientStmt.c b/source/client/src/clientStmt.c index 38a16d8fbd..21d2cbf447 100644 --- a/source/client/src/clientStmt.c +++ b/source/client/src/clientStmt.c @@ -831,6 +831,7 @@ TAOS_STMT* stmtInit(STscObj* taos, int64_t reqid, TAOS_STMT_OPTIONS* pOptions) { pStmt->bInfo.needParse = true; pStmt->sql.status = STMT_INIT; pStmt->reqid = reqid; + pStmt->errCode = TSDB_CODE_SUCCESS; if (NULL != pOptions) { memcpy(&pStmt->options, pOptions, sizeof(pStmt->options)); @@ -883,6 +884,10 @@ int stmtPrepare(TAOS_STMT* stmt, const char* sql, unsigned long length) { STMT_DLOG_E("start to prepare"); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + if (pStmt->sql.status >= STMT_PREPARE) { STMT_ERR_RET(stmtResetStmt(pStmt)); } @@ -954,6 +959,10 @@ int stmtSetTbName(TAOS_STMT* stmt, const char* tbName) { STMT_DLOG("start to set tbName: %s", tbName); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + STMT_ERR_RET(stmtSwitchStatus(pStmt, STMT_SETTBNAME)); int32_t insert = 0; @@ -1000,6 +1009,10 @@ int stmtSetTbTags(TAOS_STMT* stmt, TAOS_MULTI_BIND* tags) { STMT_DLOG_E("start to set tbTags"); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + STMT_ERR_RET(stmtSwitchStatus(pStmt, STMT_SETTAGS)); if (pStmt->bInfo.inExecCache) { @@ -1022,6 +1035,10 @@ int stmtSetTbTags(TAOS_STMT* stmt, TAOS_MULTI_BIND* tags) { } int stmtFetchTagFields(STscStmt* pStmt, int32_t* fieldNum, TAOS_FIELD_E** fields) { + if 
(pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + if (STMT_TYPE_QUERY == pStmt->sql.type) { tscError("invalid operation to get query tag fileds"); STMT_ERR_RET(TSDB_CODE_TSC_STMT_API_ERROR); @@ -1040,6 +1057,10 @@ int stmtFetchTagFields(STscStmt* pStmt, int32_t* fieldNum, TAOS_FIELD_E** fields } int stmtFetchColFields(STscStmt* pStmt, int32_t* fieldNum, TAOS_FIELD_E** fields) { + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + if (STMT_TYPE_QUERY == pStmt->sql.type) { tscError("invalid operation to get query column fileds"); STMT_ERR_RET(TSDB_CODE_TSC_STMT_API_ERROR); @@ -1151,8 +1172,13 @@ int stmtBindBatch(TAOS_STMT* stmt, TAOS_MULTI_BIND* bind, int32_t colIdx) { STMT_DLOG("start to bind stmt data, colIdx: %d", colIdx); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + STMT_ERR_RET(stmtSwitchStatus(pStmt, STMT_BIND)); + if (pStmt->bInfo.needParse && pStmt->sql.runTimes && pStmt->sql.type > 0 && STMT_TYPE_MULTI_INSERT != pStmt->sql.type) { pStmt->bInfo.needParse = false; @@ -1308,6 +1334,10 @@ int stmtAddBatch(TAOS_STMT* stmt) { STMT_DLOG_E("start to add batch"); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + STMT_ERR_RET(stmtSwitchStatus(pStmt, STMT_ADD_BATCH)); if (pStmt->sql.stbInterlaceMode) { @@ -1472,6 +1502,10 @@ int stmtExec(TAOS_STMT* stmt) { STMT_DLOG_E("start to exec"); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + STMT_ERR_RET(stmtSwitchStatus(pStmt, STMT_EXECUTE)); if (STMT_TYPE_QUERY == pStmt->sql.type) { @@ -1600,6 +1634,10 @@ int stmtGetTagFields(TAOS_STMT* stmt, int* nums, TAOS_FIELD_E** fields) { STMT_DLOG_E("start to get tag fields"); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + if (STMT_TYPE_QUERY == pStmt->sql.type) { STMT_ERRI_JRET(TSDB_CODE_TSC_STMT_API_ERROR); } @@ -1638,6 +1676,10 @@ int stmtGetColFields(TAOS_STMT* stmt, int* nums, TAOS_FIELD_E** fields) { STMT_DLOG_E("start to 
get col fields"); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + if (STMT_TYPE_QUERY == pStmt->sql.type) { STMT_ERRI_JRET(TSDB_CODE_TSC_STMT_API_ERROR); } @@ -1675,6 +1717,10 @@ int stmtGetParamNum(TAOS_STMT* stmt, int* nums) { STMT_DLOG_E("start to get param num"); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + STMT_ERR_RET(stmtSwitchStatus(pStmt, STMT_FETCH_FIELDS)); if (pStmt->bInfo.needParse && pStmt->sql.runTimes && pStmt->sql.type > 0 && @@ -1707,6 +1753,10 @@ int stmtGetParam(TAOS_STMT* stmt, int idx, int* type, int* bytes) { STMT_DLOG_E("start to get param"); + if (pStmt->errCode != TSDB_CODE_SUCCESS) { + return pStmt->errCode; + } + if (STMT_TYPE_QUERY == pStmt->sql.type) { STMT_RET(TSDB_CODE_TSC_STMT_API_ERROR); } From 27cb3638c22abf2882bee1686862e759d05731e5 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 3 Jul 2024 09:20:37 +0800 Subject: [PATCH 22/92] fix(stream): add more check before launching update stream task nodeEp --- source/dnode/mnode/impl/src/mndStream.c | 2 +- source/dnode/mnode/impl/src/mndStreamUtil.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index a137c10ed5..54b24fe0b4 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2309,7 +2309,7 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg) { taosThreadMutexUnlock(&execInfo.lock); if (numOfNodes == 0) { - mDebug("end to do stream task node change checking, no vgroup exists, do nothing"); + mDebug("end to do stream task(s) node change checking, no stream tasks exist, do nothing"); execInfo.ts = ts; atomic_store_32(&mndNodeCheckSentinel, 0); return 0; diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 8fb5bc8a99..a15b817784 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ 
b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -84,6 +84,7 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { SSdb *pSdb = pMnode->pSdb; void *pIter = NULL; SVgObj *pVgroup = NULL; + int32_t replica = -1; // do the replica check *allReady = true; SArray *pVgroupListSnapshot = taosArrayInit(4, sizeof(SNodeEntry)); @@ -97,6 +98,17 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { SNodeEntry entry = {.nodeId = pVgroup->vgId, .hbTimestamp = pVgroup->updateTime}; entry.epset = mndGetVgroupEpset(pMnode, pVgroup); + if (replica == -1) { + replica = pVgroup->replica; + } else { + if (replica != pVgroup->replica) { + mInfo("vgId:%d replica:%d inconsistent with other vgroups replica:%d, not ready for stream operations", + pVgroup->vgId); + *allReady = false; + break; + } + } + // if not all ready till now, no need to check the remaining vgroups. if (*allReady) { for (int32_t i = 0; i < pVgroup->replica; ++i) { @@ -107,8 +119,10 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { } ESyncState state = pVgroup->vnodeGid[i].syncState; - if (state == TAOS_SYNC_STATE_OFFLINE || state == TAOS_SYNC_STATE_ERROR) { - mInfo("vgId:%d offline/err, not ready for checkpoint or other operations", pVgroup->vgId); + if (state == TAOS_SYNC_STATE_OFFLINE || state == TAOS_SYNC_STATE_ERROR || state == TAOS_SYNC_STATE_LEARNER || + state == TAOS_SYNC_STATE_CANDIDATE) { + mInfo("vgId:%d state:%d , not ready for checkpoint or other operations, not check other vgroups", + pVgroup->vgId, state); *allReady = false; break; } From 95469124f8ac2b9fac2c801b74798dccc936bf03 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Wed, 3 Jul 2024 06:47:52 +0000 Subject: [PATCH 23/92] fix stream restart crash --- source/libs/stream/src/streamMeta.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 958c3bc00f..2244861bc7 100644 --- a/source/libs/stream/src/streamMeta.c 
+++ b/source/libs/stream/src/streamMeta.c @@ -273,7 +273,7 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, SStreamTask* pTask, const char* key) pBackend->pTask = pTask; pBackend->pMeta = pMeta; - pTask->chkInfo.processedVer = processVer; + if (processVer != -1) pTask->chkInfo.processedVer = processVer; taosHashPut(pMeta->pTaskDbUnique, key, strlen(key), &pBackend, sizeof(void*)); taosThreadMutexUnlock(&pMeta->backendMutex); @@ -905,7 +905,7 @@ void streamMetaLoadAllTasks(SStreamMeta* pMeta) { if (p == NULL) { code = pMeta->buildTaskFn(pMeta->ahandle, pTask, pTask->chkInfo.checkpointVer + 1); if (code < 0) { - stError("failed to load s-task:0x%"PRIx64", code:%s, continue", id.taskId, tstrerror(terrno)); + stError("failed to load s-task:0x%" PRIx64 ", code:%s, continue", id.taskId, tstrerror(terrno)); tFreeStreamTask(pTask); continue; } @@ -990,7 +990,7 @@ void streamMetaNotifyClose(SStreamMeta* pMeta) { streamMetaGetHbSendInfo(pMeta->pHbInfo, &startTs, &sendCount); stInfo("vgId:%d notify all stream tasks that current vnode is closing. 
isLeader:%d startHb:%" PRId64 ", totalHb:%d", - vgId, (pMeta->role == NODE_ROLE_LEADER), startTs, sendCount); + vgId, (pMeta->role == NODE_ROLE_LEADER), startTs, sendCount); // wait for the stream meta hb function stopping streamMetaWaitForHbTmrQuit(pMeta); @@ -1175,7 +1175,7 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { int64_t now = taosGetTimestampMs(); int32_t numOfTasks = taosArrayGetSize(pMeta->pTaskList); - stInfo("vgId:%d start to consensus checkpointId for all %d task(s), start ts:%"PRId64, vgId, numOfTasks, now); + stInfo("vgId:%d start to consensus checkpointId for all %d task(s), start ts:%" PRId64, vgId, numOfTasks, now); if (numOfTasks == 0) { stInfo("vgId:%d no tasks exist, quit from consensus checkpointId", pMeta->vgId); From b88ff0f45505475843284ddfc43360e834013f22 Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Wed, 3 Jul 2024 18:51:50 +0800 Subject: [PATCH 24/92] refact some code --- include/util/taoserror.h | 2 + include/util/tutil.h | 9 +++ source/common/src/tmsg.c | 117 ++++++++++++++++++++++----------------- source/util/src/terror.c | 2 + 4 files changed, 78 insertions(+), 52 deletions(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 2de336d036..4b72810a61 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -46,8 +46,10 @@ const char* terrstr(); char* taosGetErrMsgReturn(); char* taosGetErrMsg(); int32_t* taosGetErrno(); +int32_t* taosGetErrln(); int32_t taosGetErrSize(); #define terrno (*taosGetErrno()) +#define terrln (*taosGetErrln()) #define terrMsg (taosGetErrMsg()) #define SET_ERROR_MSG(MSG, ...) 
\ diff --git a/include/util/tutil.h b/include/util/tutil.h index c049949590..d1a18dc3e8 100644 --- a/include/util/tutil.h +++ b/include/util/tutil.h @@ -117,6 +117,15 @@ static FORCE_INLINE int32_t taosGetTbHashVal(const char *tbname, int32_t tblen, } } +#define TAOS_CHECK_ERRNO(CODE) \ + do { \ + terrno = (CODE); \ + if (terrno != TSDB_CODE_SUCCESS) { \ + terrln = __LINE__; \ + goto _exit; \ + } \ + } while (0) + #define TSDB_CHECK_CODE(CODE, LINO, LABEL) \ do { \ if (TSDB_CODE_SUCCESS != (CODE)) { \ diff --git a/source/common/src/tmsg.c b/source/common/src/tmsg.c index 10719674f5..7e89753241 100644 --- a/source/common/src/tmsg.c +++ b/source/common/src/tmsg.c @@ -69,7 +69,7 @@ pReq->sql = NULL; \ } while (0) -static int32_t tSerializeSMonitorParas(SEncoder *encoder, const SMonitorParas* pMonitorParas) { +static int32_t tSerializeSMonitorParas(SEncoder *encoder, const SMonitorParas *pMonitorParas) { if (tEncodeI8(encoder, pMonitorParas->tsEnableMonitor) < 0) return -1; if (tEncodeI32(encoder, pMonitorParas->tsMonitorInterval) < 0) return -1; if (tEncodeI32(encoder, pMonitorParas->tsSlowLogScope) < 0) return -1; @@ -80,7 +80,7 @@ static int32_t tSerializeSMonitorParas(SEncoder *encoder, const SMonitorParas* p return 0; } -static int32_t tDeserializeSMonitorParas(SDecoder *decoder, SMonitorParas* pMonitorParas){ +static int32_t tDeserializeSMonitorParas(SDecoder *decoder, SMonitorParas *pMonitorParas) { if (tDecodeI8(decoder, (int8_t *)&pMonitorParas->tsEnableMonitor) < 0) return -1; if (tDecodeI32(decoder, &pMonitorParas->tsMonitorInterval) < 0) return -1; if (tDecodeI32(decoder, &pMonitorParas->tsSlowLogScope) < 0) return -1; @@ -1577,7 +1577,7 @@ int32_t tDeserializeSStatisReq(void *buf, int32_t bufLen, SStatisReq *pReq) { if (tDecodeCStrTo(&decoder, pReq->pCont) < 0) return -1; } if (!tDecodeIsEnd(&decoder)) { - if (tDecodeI8(&decoder, (int8_t*)&pReq->type) < 0) return -1; + if (tDecodeI8(&decoder, (int8_t *)&pReq->type) < 0) return -1; } tEndDecode(&decoder); 
tDecoderClear(&decoder); @@ -5737,65 +5737,74 @@ _exit: } int32_t tSerializeSAlterVnodeConfigReq(void *buf, int32_t bufLen, SAlterVnodeConfigReq *pReq) { + int32_t tlen; SEncoder encoder = {0}; + tEncoderInit(&encoder, buf, bufLen); - if (tStartEncode(&encoder) < 0) return -1; - if (tEncodeI32(&encoder, pReq->vgVersion) < 0) return -1; - if (tEncodeI32(&encoder, pReq->buffer) < 0) return -1; - if (tEncodeI32(&encoder, pReq->pageSize) < 0) return -1; - if (tEncodeI32(&encoder, pReq->pages) < 0) return -1; - if (tEncodeI32(&encoder, pReq->cacheLastSize) < 0) return -1; - if (tEncodeI32(&encoder, pReq->daysPerFile) < 0) return -1; - if (tEncodeI32(&encoder, pReq->daysToKeep0) < 0) return -1; - if (tEncodeI32(&encoder, pReq->daysToKeep1) < 0) return -1; - if (tEncodeI32(&encoder, pReq->daysToKeep2) < 0) return -1; - if (tEncodeI32(&encoder, pReq->walFsyncPeriod) < 0) return -1; - if (tEncodeI8(&encoder, pReq->walLevel) < 0) return -1; - if (tEncodeI8(&encoder, pReq->strict) < 0) return -1; - if (tEncodeI8(&encoder, pReq->cacheLast) < 0) return -1; + TAOS_CHECK_ERRNO(tStartEncode(&encoder)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->vgVersion)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->buffer)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->pageSize)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->pages)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->cacheLastSize)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->daysPerFile)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->daysToKeep0)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->daysToKeep1)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->daysToKeep2)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->walFsyncPeriod)); + TAOS_CHECK_ERRNO(tEncodeI8(&encoder, pReq->walLevel)); + TAOS_CHECK_ERRNO(tEncodeI8(&encoder, pReq->strict)); + TAOS_CHECK_ERRNO(tEncodeI8(&encoder, pReq->cacheLast)); for (int32_t i = 0; i < 7; ++i) { - if (tEncodeI64(&encoder, pReq->reserved[i]) < 0) return -1; + 
TAOS_CHECK_ERRNO(tEncodeI64(&encoder, pReq->reserved[i])); } // 1st modification - if (tEncodeI16(&encoder, pReq->sttTrigger) < 0) return -1; - if (tEncodeI32(&encoder, pReq->minRows) < 0) return -1; + TAOS_CHECK_ERRNO(tEncodeI16(&encoder, pReq->sttTrigger)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->minRows)); // 2nd modification - if (tEncodeI32(&encoder, pReq->walRetentionPeriod) < 0) return -1; - if (tEncodeI32(&encoder, pReq->walRetentionSize) < 0) return -1; - if (tEncodeI32(&encoder, pReq->keepTimeOffset) < 0) return -1; + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->walRetentionPeriod)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->walRetentionSize)); + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->keepTimeOffset)); - if (tEncodeI32(&encoder, pReq->s3KeepLocal) < 0) return -1; - if (tEncodeI8(&encoder, pReq->s3Compact) < 0) return -1; + TAOS_CHECK_ERRNO(tEncodeI32(&encoder, pReq->s3KeepLocal)); + TAOS_CHECK_ERRNO(tEncodeI8(&encoder, pReq->s3Compact)); tEndEncode(&encoder); - int32_t tlen = encoder.pos; +_exit: + if (terrno) { + uError("%s failed at line %d since %s", __func__, terrln, terrstr()); + tlen = -1; + } else { + tlen = encoder.pos; + } tEncoderClear(&encoder); return tlen; } int32_t tDeserializeSAlterVnodeConfigReq(void *buf, int32_t bufLen, SAlterVnodeConfigReq *pReq) { SDecoder decoder = {0}; + tDecoderInit(&decoder, buf, bufLen); - if (tStartDecode(&decoder) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->vgVersion) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->buffer) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->pageSize) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->pages) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->cacheLastSize) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->daysPerFile) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->daysToKeep0) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->daysToKeep1) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->daysToKeep2) < 0) return -1; - if (tDecodeI32(&decoder, 
&pReq->walFsyncPeriod) < 0) return -1; - if (tDecodeI8(&decoder, &pReq->walLevel) < 0) return -1; - if (tDecodeI8(&decoder, &pReq->strict) < 0) return -1; - if (tDecodeI8(&decoder, &pReq->cacheLast) < 0) return -1; + TAOS_CHECK_ERRNO(tStartDecode(&decoder)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->vgVersion)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->buffer)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->pageSize)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->pages)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->cacheLastSize)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->daysPerFile)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->daysToKeep0)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->daysToKeep1)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->daysToKeep2)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->walFsyncPeriod)); + TAOS_CHECK_ERRNO(tDecodeI8(&decoder, &pReq->walLevel)); + TAOS_CHECK_ERRNO(tDecodeI8(&decoder, &pReq->strict)); + TAOS_CHECK_ERRNO(tDecodeI8(&decoder, &pReq->cacheLast)); for (int32_t i = 0; i < 7; ++i) { - if (tDecodeI64(&decoder, &pReq->reserved[i]) < 0) return -1; + TAOS_CHECK_ERRNO(tDecodeI64(&decoder, &pReq->reserved[i])); } // 1st modification @@ -5803,8 +5812,8 @@ int32_t tDeserializeSAlterVnodeConfigReq(void *buf, int32_t bufLen, SAlterVnodeC pReq->sttTrigger = -1; pReq->minRows = -1; } else { - if (tDecodeI16(&decoder, &pReq->sttTrigger) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->minRows) < 0) return -1; + TAOS_CHECK_ERRNO(tDecodeI16(&decoder, &pReq->sttTrigger)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->minRows)); } // 2n modification @@ -5812,24 +5821,29 @@ int32_t tDeserializeSAlterVnodeConfigReq(void *buf, int32_t bufLen, SAlterVnodeC pReq->walRetentionPeriod = -1; pReq->walRetentionSize = -1; } else { - if (tDecodeI32(&decoder, &pReq->walRetentionPeriod) < 0) return -1; - if (tDecodeI32(&decoder, &pReq->walRetentionSize) < 0) return -1; + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, 
&pReq->walRetentionPeriod)); + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->walRetentionSize)); } pReq->keepTimeOffset = TSDB_DEFAULT_KEEP_TIME_OFFSET; if (!tDecodeIsEnd(&decoder)) { - if (tDecodeI32(&decoder, &pReq->keepTimeOffset) < 0) return -1; + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->keepTimeOffset)); } pReq->s3KeepLocal = TSDB_DEFAULT_S3_KEEP_LOCAL; pReq->s3Compact = TSDB_DEFAULT_S3_COMPACT; if (!tDecodeIsEnd(&decoder)) { - if (tDecodeI32(&decoder, &pReq->s3KeepLocal) < 0) return -1; - if (tDecodeI8(&decoder, &pReq->s3Compact) < 0) return -1; + TAOS_CHECK_ERRNO(tDecodeI32(&decoder, &pReq->s3KeepLocal) < 0); + TAOS_CHECK_ERRNO(tDecodeI8(&decoder, &pReq->s3Compact) < 0); } tEndDecode(&decoder); + +_exit: tDecoderClear(&decoder); - return 0; + if (terrno) { + uError("%s failed at line %d since %s", __func__, terrln, terrstr()); + } + return terrno; } int32_t tSerializeSAlterVnodeReplicaReq(void *buf, int32_t bufLen, SAlterVnodeReplicaReq *pReq) { @@ -9296,7 +9310,7 @@ int32_t tDecodeSTqCheckInfo(SDecoder *pDecoder, STqCheckInfo *pInfo) { } void tDeleteSTqCheckInfo(STqCheckInfo *pInfo) { taosArrayDestroy(pInfo->colIdList); } -int32_t tEncodeSMqRebVgReq(SEncoder* pCoder, const SMqRebVgReq* pReq) { +int32_t tEncodeSMqRebVgReq(SEncoder *pCoder, const SMqRebVgReq *pReq) { if (tStartEncode(pCoder) < 0) return -1; if (tEncodeI64(pCoder, pReq->leftForVer) < 0) return -1; if (tEncodeI32(pCoder, pReq->vgId) < 0) return -1; @@ -9316,7 +9330,7 @@ int32_t tEncodeSMqRebVgReq(SEncoder* pCoder, const SMqRebVgReq* pReq) { return 0; } -int32_t tDecodeSMqRebVgReq(SDecoder* pCoder, SMqRebVgReq* pReq) { +int32_t tDecodeSMqRebVgReq(SDecoder *pCoder, SMqRebVgReq *pReq) { if (tStartDecode(pCoder) < 0) return -1; if (tDecodeI64(pCoder, &pReq->leftForVer) < 0) return -1; @@ -9341,7 +9355,6 @@ int32_t tDecodeSMqRebVgReq(SDecoder* pCoder, SMqRebVgReq* pReq) { return 0; } - int32_t tEncodeDeleteRes(SEncoder *pCoder, const SDeleteRes *pRes) { int32_t nUid = 
taosArrayGetSize(pRes->uidList); diff --git a/source/util/src/terror.c b/source/util/src/terror.c index c5bba6fa53..688a642542 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -22,10 +22,12 @@ #define TAOS_ERROR_C static threadlocal int32_t tsErrno; +static threadlocal int32_t tsErrln; static threadlocal char tsErrMsgDetail[ERR_MSG_LEN] = {0}; static threadlocal char tsErrMsgReturn[ERR_MSG_LEN] = {0}; int32_t* taosGetErrno() { return &tsErrno; } +int32_t* taosGetErrln() { return &tsErrln; } char* taosGetErrMsg() { return tsErrMsgDetail; } char* taosGetErrMsgReturn() { return tsErrMsgReturn; } From 653f7a1a434403882a2dc94f6ea59875f9665097 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 4 Jul 2024 13:55:50 +0800 Subject: [PATCH 25/92] fix(stream): refactor the checkpoint consensus policy. --- include/common/tmsgdef.h | 1 + include/libs/stream/streamMsg.h | 4 +- include/libs/stream/tstream.h | 5 +- source/dnode/mnode/impl/inc/mndStream.h | 10 +- source/dnode/mnode/impl/src/mndMain.c | 14 +- source/dnode/mnode/impl/src/mndStream.c | 195 +++++++++++++++----- source/dnode/mnode/impl/src/mndStreamUtil.c | 132 ++++--------- source/dnode/snode/src/snode.c | 4 +- source/dnode/vnode/src/tqCommon/tqCommon.c | 8 +- source/libs/stream/src/streamCheckStatus.c | 6 +- source/libs/stream/src/streamCheckpoint.c | 8 - source/libs/stream/src/streamHb.c | 2 + 12 files changed, 223 insertions(+), 166 deletions(-) diff --git a/include/common/tmsgdef.h b/include/common/tmsgdef.h index c92649f1f7..19fe34fe01 100644 --- a/include/common/tmsgdef.h +++ b/include/common/tmsgdef.h @@ -251,6 +251,7 @@ TD_DEF_MSG_TYPE(TDMT_MND_STREAM_UPDATE_CHKPT_EVT, "stream-update-chkpt-evt", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_STREAM_CHKPT_REPORT, "stream-chkpt-report", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_STREAM_CHKPT_CONSEN, "stream-chkpt-consen", NULL, NULL) + TD_DEF_MSG_TYPE(TDMT_MND_STREAM_CONSEN_TIMER, "stream-consen-tmr", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_MAX_MSG, 
"mnd-max", NULL, NULL) TD_CLOSE_MSG_SEG(TDMT_END_MND_MSG) diff --git a/include/libs/stream/streamMsg.h b/include/libs/stream/streamMsg.h index b69032330d..8b6ca2c5cd 100644 --- a/include/libs/stream/streamMsg.h +++ b/include/libs/stream/streamMsg.h @@ -242,8 +242,8 @@ typedef struct { typedef struct SCheckpointConsensusEntry { SRestoreCheckpointInfo req; - SRpcMsg rsp; - int64_t ts; + SRpcHandleInfo rspInfo; + int64_t ts; } SCheckpointConsensusEntry; #ifdef __cplusplus diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index e98039d2fe..66b9db47e2 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -616,8 +616,9 @@ typedef struct SStreamTaskState { typedef struct SCheckpointConsensusInfo { SArray* pTaskList; - int64_t checkpointId; - int64_t genTs; +// int64_t checkpointId; +// int64_t genTs; + int32_t numOfTasks; } SCheckpointConsensusInfo; int32_t streamSetupScheduleTrigger(SStreamTask* pTask); diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index b261f89057..a86e06b486 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -133,7 +133,8 @@ int32_t mndCreateStreamResetStatusTrans(SMnode *pMnode, SStreamObj *pStream) int32_t mndStreamSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); int32_t mndCreateStreamChkptInfoUpdateTrans(SMnode *pMnode, SStreamObj *pStream, SArray *pChkptInfoList); int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq); -int32_t mndSendConsensusCheckpointIdRsp(SArray* pList, int64_t checkpointId); +int32_t mndSendQuickConsensusChkptIdRsp(SRestoreCheckpointInfo *pReq, int32_t code, int64_t streamId, + int64_t checkpointId, SRpcHandleInfo *pRpcInfo); void removeTasksInBuf(SArray *pTaskIds, SStreamExecInfo *pExecInfo); @@ -146,10 +147,9 @@ void mndInitStreamExecInfo(SMnode *pMnode, SStreamExecInfo *pExecInf int32_t removeExpiredNodeEntryAndTaskInBuf(SArray *pNodeSnapshot); void 
removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode); -SCheckpointConsensusInfo *mndGetConsensusInfo(SHashObj *pHash, int64_t streamId); -void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo, SRpcMsg *pMsg); -int64_t mndGetConsensusCheckpointId(SCheckpointConsensusInfo *pInfo, SStreamObj *pStream); -bool mndAllTaskSendCheckpointId(SCheckpointConsensusInfo *pInfo, int32_t numOfTasks, int32_t* pTotal); +SCheckpointConsensusInfo *mndGetConsensusInfo(SHashObj *pHash, int64_t streamId, int32_t numOfTasks); +void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo, + SRpcHandleInfo *pRpcInfo); void mndClearConsensusRspEntry(SCheckpointConsensusInfo *pInfo); int32_t doSendConsensusCheckpointRsp(SRestoreCheckpointInfo *pInfo, SRpcMsg *pMsg, int64_t checkpointId); int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId); diff --git a/source/dnode/mnode/impl/src/mndMain.c b/source/dnode/mnode/impl/src/mndMain.c index cad8c6d745..3acb727910 100644 --- a/source/dnode/mnode/impl/src/mndMain.c +++ b/source/dnode/mnode/impl/src/mndMain.c @@ -177,6 +177,15 @@ static void mndStreamCheckNode(SMnode *pMnode) { } } +static void mndStreamConsensusChkpt(SMnode *pMnode) { + int32_t contLen = 0; + void *pReq = mndBuildTimerMsg(&contLen); + if (pReq != NULL) { + SRpcMsg rpcMsg = {.msgType = TDMT_MND_STREAM_CONSEN_TIMER, .pCont = pReq, .contLen = contLen}; + tmsgPutToQueue(&pMnode->msgCb, WRITE_QUEUE, &rpcMsg); + } +} + static void mndPullupTelem(SMnode *pMnode) { mTrace("pullup telem msg"); int32_t contLen = 0; @@ -308,7 +317,6 @@ static int32_t minCronTime() { min = TMIN(min, tsCompactPullupInterval); min = TMIN(min, tsMqRebalanceInterval); min = TMIN(min, tsStreamCheckpointInterval); - min = TMIN(min, 6); // checkpointRemain min = TMIN(min, tsStreamNodeCheckInterval); min = TMIN(min, tsArbHeartBeatIntervalSec); min = TMIN(min, tsArbCheckSyncIntervalSec); @@ 
-353,6 +361,10 @@ void mndDoTimerPullupTask(SMnode *pMnode, int64_t sec) { mndStreamCheckNode(pMnode); } + if (sec % 5 == 0) { + mndStreamConsensusChkpt(pMnode); + } + if (sec % tsTelemInterval == (TMIN(60, (tsTelemInterval - 1)))) { mndPullupTelem(pMnode); } diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 54b24fe0b4..a4c03ab3e0 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -59,7 +59,8 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg); static int32_t extractNodeListFromStream(SMnode *pMnode, SArray *pNodeList); static int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq); static int32_t mndProcessCheckpointReport(SRpcMsg *pReq); -static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pReq); +static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg); +static int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg); static SVgroupChangeInfo mndFindChangedNodeInfo(SMnode *pMnode, const SArray *pPrevNodeList, const SArray *pNodeList); @@ -123,6 +124,7 @@ int32_t mndInitStream(SMnode *pMnode) { mndSetMsgHandle(pMnode, TDMT_STREAM_TASK_REPORT_CHECKPOINT, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_HEARTBEAT, mndProcessStreamHb); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_NODECHANGE_CHECK, mndProcessNodeCheckReq); + mndSetMsgHandle(pMnode, TDMT_MND_STREAM_CONSEN_TIMER, mndProcessConsensusInTmr); mndSetMsgHandle(pMnode, TDMT_MND_PAUSE_STREAM, mndProcessPauseStreamReq); mndSetMsgHandle(pMnode, TDMT_MND_RESUME_STREAM, mndProcessResumeStreamReq); @@ -803,7 +805,7 @@ static int32_t mndProcessCreateStreamReq(SRpcMsg *pReq) { taosThreadMutexLock(&execInfo.lock); mDebug("stream stream:%s start to register tasks into task nodeList and set initial checkpointId", createReq.name); saveTaskAndNodeInfoIntoBuf(&streamObj, &execInfo); - mndRegisterConsensusChkptId(execInfo.pStreamConsensus, streamObj.uid); +// mndRegisterConsensusChkptId(execInfo.pStreamConsensus, 
streamObj.uid); taosThreadMutexUnlock(&execInfo.lock); // execute creation @@ -2625,12 +2627,42 @@ int32_t mndProcessCheckpointReport(SRpcMsg *pReq) { return 0; } -static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pReq) { - SMnode *pMnode = pReq->info.node; +static int64_t getConsensusId(int64_t streamId, int32_t numOfTasks, bool* pAllEqual) { + int32_t num = 0; + int64_t chkId = INT64_MAX; + *pAllEqual = true; + + for(int32_t i = 0; i < taosArrayGetSize(execInfo.pTaskList); ++i) { + STaskId* p = taosArrayGet(execInfo.pTaskList, i); + if (p->streamId != streamId) { + continue; + } + + num += 1; + STaskStatusEntry* pe = taosHashGet(execInfo.pTaskMap, p, sizeof(*p)); + + if (chkId != INT64_MAX && chkId != pe->checkpointInfo.latestId) { + *pAllEqual = false; + } + + if (chkId > pe->checkpointInfo.latestId) { + chkId = pe->checkpointInfo.latestId; + } + } + + if (num < numOfTasks) { // not all task send info to mnode through hbMsg, no valid checkpoint Id + return -1; + } + + return chkId; +} + +static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { + SMnode *pMnode = pMsg->info.node; SDecoder decoder = {0}; SRestoreCheckpointInfo req = {0}; - tDecoderInit(&decoder, pReq->pCont, pReq->contLen); + tDecoderInit(&decoder, pMsg->pCont, pMsg->contLen); if (tDecodeRestoreCheckpointInfo(&decoder, &req)) { tDecoderClear(&decoder); @@ -2647,80 +2679,155 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pReq) { taosThreadMutexLock(&execInfo.lock); SStreamObj *pStream = mndGetStreamObj(pMnode, req.streamId); + + // mnode handle the create stream transaction too slow may cause this problem if (pStream == NULL) { - mWarn("failed to find the stream:0x%" PRIx64 ", not handle checkpoint-report, try to acquire in buf", req.streamId); + mWarn("failed to find the stream:0x%" PRIx64 ", not handle consensus-checkpointId", req.streamId); // not in meta-store yet, try to acquire the task in exec buffer // the checkpoint req arrives too soon before the completion of the 
create stream trans. STaskId id = {.streamId = req.streamId, .taskId = req.taskId}; void *p = taosHashGet(execInfo.pTaskMap, &id, sizeof(id)); if (p == NULL) { - mError("failed to find the stream:0x%" PRIx64 " in buf, not handle the checkpoint-report", req.streamId); + mError("failed to find the stream:0x%" PRIx64 " in buf, not handle consensus-checkpointId", req.streamId); terrno = TSDB_CODE_MND_STREAM_NOT_EXIST; + + mndSendQuickConsensusChkptIdRsp(&req, terrno, req.streamId, 0, &pMsg->info); + taosThreadMutexUnlock(&execInfo.lock); + pMsg->info.handle = NULL; // disable auto rsp return -1; } else { mDebug("s-task:0x%" PRIx64 "-0x%x in buf not in mnode/meta, create stream trans may not complete yet", req.streamId, req.taskId); + // todo wait for stream is created } } + mInfo("vgId:%d meta-stored checkpointId for stream:0x%" PRIx64 " %s is:%" PRId64, req.nodeId, req.streamId, + pStream->name, pStream->checkpointId); + int32_t numOfTasks = (pStream == NULL) ? 0 : mndGetNumOfStreamTasks(pStream); - - SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId); - - int64_t ckId = mndGetConsensusCheckpointId(pInfo, pStream); - if (ckId != -1) { // consensus checkpoint id already exist - SRpcMsg rsp = {0}; - rsp.code = 0; - rsp.info = pReq->info; - rsp.contLen = sizeof(SRestoreCheckpointInfoRsp) + sizeof(SMsgHead); - rsp.pCont = rpcMallocCont(rsp.contLen); - - SMsgHead *pHead = rsp.pCont; - pHead->vgId = htonl(req.nodeId); - - mDebug("stream:0x%" PRIx64 " consensus-checkpointId:%" PRId64 " exists, return directly", req.streamId, ckId); - doSendConsensusCheckpointRsp(&req, &rsp, ckId); + if ((pStream != NULL) && (pStream->checkpointId == 0)) { // not generated checkpoint yet, return 0 directly + mndSendQuickConsensusChkptIdRsp(&req, TSDB_CODE_SUCCESS, req.streamId, 0, &pMsg->info); taosThreadMutexUnlock(&execInfo.lock); - pReq->info.handle = NULL; // disable auto rsp - + pMsg->info.handle = NULL; // disable auto rsp return 
TSDB_CODE_SUCCESS; } - mndAddConsensusTasks(pInfo, &req, pReq); + bool allEqual = true; + int64_t chkId = getConsensusId(req.streamId, numOfTasks, &allEqual); - int32_t total = 0; - if (mndAllTaskSendCheckpointId(pInfo, numOfTasks, &total)) { // all tasks has send the reqs - // start transaction to set the checkpoint id - int64_t checkpointId = mndGetConsensusCheckpointId(pInfo, pStream); - mInfo("stream:0x%" PRIx64 " %s all %d tasks send latest checkpointId, the consensus-checkpointId is:%" PRId64 - " will be issued soon", - req.streamId, pStream->name, numOfTasks, checkpointId); + // some tasks not send hbMsg to mnode yet, wait for 5s. + if (chkId == -1) { + mDebug( + "not all task send hbMsg yet, add into list and wait for 10s to check the consensus-checkpointId again, " + "s-task:0x%x", req.taskId); + SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); + mndAddConsensusTasks(pInfo, &req, &pMsg->info); + taosThreadMutexUnlock(&execInfo.lock); - // start the checkpoint consensus trans - int32_t code = mndSendConsensusCheckpointIdRsp(pInfo->pTaskList, checkpointId); - if (code == TSDB_CODE_SUCCESS) { - mndClearConsensusRspEntry(pInfo); - mDebug("clear all waiting for rsp entry for stream:0x%" PRIx64, req.streamId); - } else { - mDebug("stream:0x%" PRIx64 " not start send consensus-checkpointId msg, due to not all task ready", req.streamId); - } - } else { - mDebug("stream:0x%" PRIx64 " %d/%d tasks send consensus-checkpointId info", req.streamId, total, numOfTasks); + pMsg->info.handle = NULL; // disable auto rsp + return 0; } + if (chkId == req.checkpointId) { + mDebug("vgId:%d stream:0x%" PRIx64 " %s consensus-checkpointId is:%" PRId64, req.nodeId, req.streamId, + pStream->name, pStream->checkpointId); + mndSendQuickConsensusChkptIdRsp(&req, TSDB_CODE_SUCCESS, req.streamId, chkId, &pMsg->info); + + taosThreadMutexUnlock(&execInfo.lock); + pMsg->info.handle = NULL; // disable auto rsp + return 0; + } + + // 
wait for 5s and check again + SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); + mndAddConsensusTasks(pInfo, &req, &pMsg->info); + if (pStream != NULL) { mndReleaseStream(pMnode, pStream); } taosThreadMutexUnlock(&execInfo.lock); - pReq->info.handle = NULL; // disable auto rsp + pMsg->info.handle = NULL; // disable auto rsp return 0; } +int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { + int64_t now = taosGetTimestampMs(); + int64_t streamId = -1; // todo: fix only one + + mDebug("start to process consensus-checkpointId in tmr"); + taosThreadMutexLock(&execInfo.lock); + + void *pIter = NULL; + while ((pIter = taosHashIterate(execInfo.pStreamConsensus, pIter)) != NULL) { + SCheckpointConsensusInfo *pInfo = (SCheckpointConsensusInfo *)pIter; + + int32_t j = 0; + int32_t num = taosArrayGetSize(pInfo->pTaskList); + + SArray *pList = taosArrayInit(4, sizeof(int32_t)); + + for (; j < num; ++j) { + SCheckpointConsensusEntry *pe = taosArrayGet(pInfo->pTaskList, j); + + if ((now - pe->ts) > 10 * 1000) { + bool allEqual = true; + int64_t chkId = getConsensusId(pe->req.streamId, pInfo->numOfTasks, &allEqual); + if (chkId == -1) { + mDebug("tasks send hbMsg for stream:0x%" PRIx64 ", wait for next round", pe->req.streamId); + break; + } + + if (allEqual) { + mDebug("all has identical checkpointId for stream:0x%"PRIx64" send checkpointId to s-task:0x%x", + pe->req.streamId, pe->req.taskId); + + mndSendQuickConsensusChkptIdRsp(&pe->req, TSDB_CODE_SUCCESS, pe->req.streamId, chkId, &pe->rspInfo); + } else { + ASSERT(chkId <= pe->req.checkpointId); + mndSendQuickConsensusChkptIdRsp(&pe->req, TSDB_CODE_SUCCESS, pe->req.streamId, chkId, &pe->rspInfo); + } + + taosArrayPush(pList, &pe->req.taskId); + streamId = pe->req.streamId; + } else { + mDebug("s-task:0x%x sendTs:%" PRId64 " wait %2.fs already, wait for next round to check", pe->req.taskId, + (now - pe->ts)/ 1000.0, pe->ts); + } + } + + if (taosArrayGetSize(pList) > 0) { + 
for (int32_t i = 0; i < taosArrayGetSize(pList); ++i) { + int32_t *taskId = taosArrayGet(pList, i); + for (int32_t k = 0; k < taosArrayGetSize(pInfo->pTaskList); ++k) { + SCheckpointConsensusEntry *pe = taosArrayGet(pInfo->pTaskList, k); + if (pe->req.taskId == *taskId) { + taosArrayRemove(pInfo->pTaskList, k); + break; + } + } + } + } + + taosArrayDestroy(pList); + + if (taosArrayGetSize(pInfo->pTaskList) == 0) { + mndClearConsensusRspEntry(pInfo); + mndClearConsensusCheckpointId(execInfo.pStreamConsensus, streamId); + } + } + + taosThreadMutexUnlock(&execInfo.lock); + + mDebug("end to process consensus-checkpointId in tmr"); + return TSDB_CODE_SUCCESS; +} + static int32_t mndProcessCreateStreamReqFromMNode(SRpcMsg *pReq) { int32_t code = mndProcessCreateStreamReq(pReq); if (code != 0 && code != TSDB_CODE_ACTION_IN_PROGRESS) { diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index a15b817784..9ee820925c 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -848,7 +848,7 @@ int32_t doSendConsensusCheckpointRsp(SRestoreCheckpointInfo* pInfo, SRpcMsg* pMs return -1; } - int32_t tlen = sizeof(SMsgHead) + blen; + int32_t tlen = sizeof(SMsgHead) + blen; void *abuf = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead)); SEncoder encoder; tEncoderInit(&encoder, abuf, tlen); @@ -863,22 +863,30 @@ int32_t doSendConsensusCheckpointRsp(SRestoreCheckpointInfo* pInfo, SRpcMsg* pMs return code; } -int32_t mndSendConsensusCheckpointIdRsp(SArray* pInfoList, int64_t checkpointId) { - for(int32_t i = 0; i < taosArrayGetSize(pInfoList); ++i) { - SCheckpointConsensusEntry* pInfo = taosArrayGet(pInfoList, i); - doSendConsensusCheckpointRsp(&pInfo->req, &pInfo->rsp, checkpointId); - } - return 0; +int32_t mndSendQuickConsensusChkptIdRsp(SRestoreCheckpointInfo *pReq, int32_t code, int64_t streamId, + int64_t checkpointId, SRpcHandleInfo *pRpcInfo) { + SRpcMsg rsp = {.code = code, .info = 
*pRpcInfo, .contLen = sizeof(SRestoreCheckpointInfoRsp) + sizeof(SMsgHead)}; + rsp.pCont = rpcMallocCont(rsp.contLen); + + SMsgHead *pHead = rsp.pCont; + pHead->vgId = htonl(pReq->nodeId); + + mDebug("stream:0x%" PRIx64 " consensus-checkpointId:%" PRId64 " exists, s-task:0x%x send to vnode", + streamId, checkpointId, pReq->taskId); + return doSendConsensusCheckpointRsp(pReq, &rsp, checkpointId); } -SCheckpointConsensusInfo* mndGetConsensusInfo(SHashObj* pHash, int64_t streamId) { +SCheckpointConsensusInfo* mndGetConsensusInfo(SHashObj* pHash, int64_t streamId, int32_t numOfTasks) { void* pInfo = taosHashGet(pHash, &streamId, sizeof(streamId)); if (pInfo != NULL) { return (SCheckpointConsensusInfo*)pInfo; } SCheckpointConsensusInfo p = { - .genTs = -1, .checkpointId = -1, .pTaskList = taosArrayInit(4, sizeof(SCheckpointConsensusEntry))}; + .pTaskList = taosArrayInit(4, sizeof(SCheckpointConsensusEntry)), + .numOfTasks = numOfTasks, + }; + taosHashPut(pHash, &streamId, sizeof(streamId), &p, sizeof(p)); void* pChkptInfo = (SCheckpointConsensusInfo*)taosHashGet(pHash, &streamId, sizeof(streamId)); @@ -887,87 +895,15 @@ SCheckpointConsensusInfo* mndGetConsensusInfo(SHashObj* pHash, int64_t streamId) // no matter existed or not, add the request into info list anyway, since we need to send rsp mannually // discard the msg may lead to the lost of connections. 
-void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo, SRpcMsg *pMsg) { - SCheckpointConsensusEntry info = {0}; +void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo, SRpcHandleInfo* pRpcInfo) { + SCheckpointConsensusEntry info = {.ts = taosGetTimestampMs(), .rspInfo = *pRpcInfo}; memcpy(&info.req, pRestoreInfo, sizeof(info.req)); - info.rsp.code = 0; - info.rsp.info = pMsg->info; - info.rsp.contLen = sizeof(SRestoreCheckpointInfoRsp) + sizeof(SMsgHead); - info.rsp.pCont = rpcMallocCont(info.rsp.contLen); - - SMsgHead *pHead = info.rsp.pCont; - pHead->vgId = htonl(pRestoreInfo->nodeId); - taosArrayPush(pInfo->pTaskList, &info); -} -static int32_t entryComparFn(const void* p1, const void* p2) { - const SCheckpointConsensusEntry* pe1 = p1; - const SCheckpointConsensusEntry* pe2 = p2; - - if (pe1->req.taskId == pe2->req.taskId) { - return 0; - } - - return pe1->req.taskId < pe2->req.taskId? -1:1; -} - -bool mndAllTaskSendCheckpointId(SCheckpointConsensusInfo* pInfo, int32_t numOfTasks, int32_t* pTotal) { - int32_t numOfExisted = taosArrayGetSize(pInfo->pTaskList); - if (numOfExisted < numOfTasks) { - if (pTotal != NULL) { - *pTotal = numOfExisted; - } - return false; - } - - taosArraySort(pInfo->pTaskList, entryComparFn); - - int32_t num = 1; - int32_t taskId = ((SCheckpointConsensusEntry*)taosArrayGet(pInfo->pTaskList, 0))->req.taskId; - for(int32_t i = 1; i < taosArrayGetSize(pInfo->pTaskList); ++i) { - SCheckpointConsensusEntry* pe = taosArrayGet(pInfo->pTaskList, i); - if (pe->req.taskId != taskId) { - num += 1; - taskId = pe->req.taskId; - } - } - - if (pTotal != NULL) { - *pTotal = num; - } - - ASSERT(num <= numOfTasks); - return num == numOfTasks; -} - -int64_t mndGetConsensusCheckpointId(SCheckpointConsensusInfo* pInfo, SStreamObj* pStream) { - if (pInfo->genTs > 0) { // there is no checkpoint ever generated if the checkpointId is 0. 
- mDebug("existed consensus-checkpointId:%" PRId64 " for stream:0x%" PRIx64 " %s exist, and return", - pInfo->checkpointId, pStream->uid, pStream->name); - return pInfo->checkpointId; - } - - int32_t numOfTasks = mndGetNumOfStreamTasks(pStream); - if (!mndAllTaskSendCheckpointId(pInfo, numOfTasks, NULL)) { - return -1; - } - - int64_t checkpointId = INT64_MAX; - - for (int32_t i = 0; i < taosArrayGetSize(pInfo->pTaskList); ++i) { - SCheckpointConsensusEntry *pEntry = taosArrayGet(pInfo->pTaskList, i); - if (pEntry->req.checkpointId < checkpointId) { - checkpointId = pEntry->req.checkpointId; - mTrace("stream:0x%" PRIx64 " %s task:0x%x vgId:%d latest checkpointId:%" PRId64, pStream->uid, pStream->name, - pEntry->req.taskId, pEntry->req.nodeId, pEntry->req.checkpointId); - } - } - - pInfo->checkpointId = checkpointId; - pInfo->genTs = taosGetTimestampMs(); - return checkpointId; + int32_t num = taosArrayGetSize(pInfo->pTaskList); + mDebug("s-task:0x%x added into consensus-checkpointId list, stream:0x%" PRIx64 " total waiting:%d", + pRestoreInfo->taskId, pRestoreInfo->streamId, num); } void mndClearConsensusRspEntry(SCheckpointConsensusInfo* pInfo) { @@ -982,15 +918,15 @@ int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId) { return TSDB_CODE_SUCCESS; } -int32_t mndRegisterConsensusChkptId(SHashObj* pHash, int64_t streamId) { - void* pInfo = taosHashGet(pHash, &streamId, sizeof(streamId)); - ASSERT(pInfo == NULL); - - SCheckpointConsensusInfo p = {.genTs = taosGetTimestampMs(), .checkpointId = 0, .pTaskList = NULL}; - taosHashPut(pHash, &streamId, sizeof(streamId), &p, sizeof(p)); - - SCheckpointConsensusInfo* pChkptInfo = (SCheckpointConsensusInfo*)taosHashGet(pHash, &streamId, sizeof(streamId)); - ASSERT(pChkptInfo->genTs > 0 && pChkptInfo->checkpointId == 0); - mDebug("s-task:0x%" PRIx64 " set the initial consensus-checkpointId:0", streamId); - return TSDB_CODE_SUCCESS; -} \ No newline at end of file +//int32_t 
mndRegisterConsensusChkptId(SHashObj* pHash, int64_t streamId) { +// void* pInfo = taosHashGet(pHash, &streamId, sizeof(streamId)); +// ASSERT(pInfo == NULL); +// +// SCheckpointConsensusInfo p = {.genTs = taosGetTimestampMs(), .checkpointId = 0, .pTaskList = NULL}; +// taosHashPut(pHash, &streamId, sizeof(streamId), &p, sizeof(p)); +// +// SCheckpointConsensusInfo* pChkptInfo = (SCheckpointConsensusInfo*)taosHashGet(pHash, &streamId, sizeof(streamId)); +// ASSERT(pChkptInfo->genTs > 0 && pChkptInfo->checkpointId == 0); +// mDebug("s-task:0x%" PRIx64 " set the initial consensus-checkpointId:0", streamId); +// return TSDB_CODE_SUCCESS; +//} \ No newline at end of file diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index 9686fd3789..69a7bc7ba4 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -43,14 +43,14 @@ int32_t sndBuildStreamTask(SSnode *pSnode, SStreamTask *pTask, int64_t nextProce char *p = streamTaskGetStatus(pTask)->name; if (pTask->info.fillHistory) { - sndInfo("vgId:%d expand stream task, s-task:%s, checkpointId:%" PRId64 " checkpointVer:%" PRId64 + sndInfo("vgId:%d build stream task, s-task:%s, checkpointId:%" PRId64 " checkpointVer:%" PRId64 " nextProcessVer:%" PRId64 " child id:%d, level:%d, status:%s fill-history:%d, related stream task:0x%x trigger:%" PRId64 " ms", SNODE_HANDLE, pTask->id.idStr, pChkInfo->checkpointId, pChkInfo->checkpointVer, pChkInfo->nextProcessVer, pTask->info.selfChildId, pTask->info.taskLevel, p, pTask->info.fillHistory, (int32_t)pTask->streamTaskId.taskId, pTask->info.delaySchedParam); } else { - sndInfo("vgId:%d expand stream task, s-task:%s, checkpointId:%" PRId64 " checkpointVer:%" PRId64 + sndInfo("vgId:%d build stream task, s-task:%s, checkpointId:%" PRId64 " checkpointVer:%" PRId64 " nextProcessVer:%" PRId64 " child id:%d, level:%d, status:%s fill-history:%d, related fill-task:0x%x trigger:%" PRId64 " ms", SNODE_HANDLE, pTask->id.idStr, 
pChkInfo->checkpointId, pChkInfo->checkpointVer, pChkInfo->nextProcessVer, diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 668e178d2d..04a658a30c 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -279,7 +279,10 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM int32_t numOfTasks = streamMetaGetNumOfTasks(pMeta); int32_t updateTasks = taosHashGetSize(pMeta->updateInfo.pTasks); - pMeta->startInfo.tasksWillRestart = 1; + if (restored) { + tqDebug("vgId:%d s-task:0x%x update epset transId:%d, set the restart flag", vgId, req.taskId, req.transId); + pMeta->startInfo.tasksWillRestart = 1; + } if (updateTasks < numOfTasks) { tqDebug("vgId:%d closed tasks:%d, unclosed:%d, all tasks will be started when nodeEp update completed", vgId, @@ -292,8 +295,7 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM streamMetaClearUpdateTaskList(pMeta); if (!restored) { - tqDebug("vgId:%d vnode restore not completed, not start the tasks, clear the start after nodeUpdate flag", vgId); - pMeta->startInfo.tasksWillRestart = 0; + tqDebug("vgId:%d vnode restore not completed, not start all tasks", vgId); } else { tqDebug("vgId:%d all %d task(s) nodeEp updated and closed, transId:%d", vgId, numOfTasks, req.transId); #if 0 diff --git a/source/libs/stream/src/streamCheckStatus.c b/source/libs/stream/src/streamCheckStatus.c index 8778e3314a..22d336a549 100644 --- a/source/libs/stream/src/streamCheckStatus.c +++ b/source/libs/stream/src/streamCheckStatus.c @@ -666,7 +666,11 @@ void rspMonitorFn(void* param, void* tmrId) { stDebug("s-task:%s status:%s vgId:%d quit from monitor check-rsp tmr, ref:%d", id, pStat->name, vgId, ref); streamTaskCompleteCheckRsp(pInfo, true, id); - addDownstreamFailedStatusResultAsync(pTask->pMsgCb, vgId, pTask->id.streamId, pTask->id.taskId); + + // not record the failed of the current task 
if try to close current vnode + if (!pMeta->closeFlag) { + addDownstreamFailedStatusResultAsync(pTask->pMsgCb, vgId, pTask->id.streamId, pTask->id.taskId); + } streamMetaReleaseTask(pMeta, pTask); return; diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index bc973f17d7..1195362ab3 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -447,14 +447,6 @@ int32_t streamTaskUpdateTaskCheckpointInfo(SStreamTask* pTask, bool restored, SV SStreamTaskState* pStatus = streamTaskGetStatus(pTask); - if (restored && (pStatus->state != TASK_STATUS__CK) && (pMeta->role == NODE_ROLE_LEADER)) { - stDebug("s-task:0x%x vgId:%d restored:%d status:%s not update checkpoint-info, checkpointId:%" PRId64 "->%" PRId64 - " failed", - pReq->taskId, vgId, restored, pStatus->name, pInfo->checkpointId, pReq->checkpointId); - taosThreadMutexUnlock(&pTask->lock); - return TSDB_CODE_STREAM_TASK_IVLD_STATUS; - } - if (!restored) { // during restore procedure, do update checkpoint-info stDebug("s-task:%s vgId:%d status:%s update the checkpoint-info during restore, checkpointId:%" PRId64 "->%" PRId64 " checkpointVer:%" PRId64 "->%" PRId64 " checkpointTs:%" PRId64 "->%" PRId64, diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index d6411e25f2..691ec44672 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -168,7 +168,9 @@ int32_t streamMetaSendHbHelper(SStreamMeta* pMeta) { continue; } + taosThreadMutexLock(&(*pTask)->lock); STaskStatusEntry entry = streamTaskGetStatusEntry(*pTask); + taosThreadMutexUnlock(&(*pTask)->lock); entry.inputRate = entry.inputQUsed * 100.0 / (2 * STREAM_TASK_QUEUE_CAPACITY_IN_SIZE); if ((*pTask)->info.taskLevel == TASK_LEVEL__SINK) { From 0bd51f21333b6753530f3a472581f966418fb27a Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 4 Jul 2024 06:36:36 +0000 Subject: [PATCH 26/92] fix comment --- out | Bin 
21360 -> 0 bytes source/dnode/vnode/src/tq/tqStreamStateSnap.c | 8 +- source/libs/stream/src/streamBackendRocksdb.c | 82 ++++++++++++------ source/libs/stream/src/streamCheckpoint.c | 2 +- source/libs/stream/src/streamSnapshot.c | 51 +++++++++-- t.c | 12 --- 6 files changed, 105 insertions(+), 50 deletions(-) delete mode 100755 out delete mode 100644 t.c diff --git a/out b/out deleted file mode 100755 index 21f5cbee379517922a226c62b551376a5b0ad2f4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 21360 zcmeHPe{@vUoxg7;Ap;3XApFM4L;*pXA%Q>vK@vh>QWKH|Qh(ORWacG#W-^n$c>{@c z6&KwjC|hdludZ6y(~76nF0O4?yS7DLslU359?vdy&u(eER@hn>Q*==)v!Cz1_d84; z8T7dO$DTcNleypf{oL>Od%yR4?|bjQpEotPuJaj&;N%yV3gWg*P)LKWIMJdDAPr)f z7>|6em?Or3FOrz5Z%_oaDjo5SqydflK}l~K6-J@wDOfON4~ddqz7!m*YM6>j#gkq! z6*cdrZQ2i}T%T<(NXke-nP+K{?jO@!J&H|jjgqfjHIfeK_69X&Ju*jnS7^N}v>wx- zZjUL)lVU=@W!j#X)}oFQQ?FjH*7MS}N=-1OHYja+!;O8K_ByoQ;h;1lyp-Ev%I*CL z^e8T$H1RNR)a~UPhr>FaOliC+EsWb;OY0ZLV>NL*kshoWT(-1kX?-Y_3@wuNNZ@c2z9@S0K^(24KuBzuw{@baq9eVEzZ_RrzHc;|N>^mewev=N# zP$GNUMafgS96yp#@wi#ygi(6M`iFj18a>|#c;YBk>+;~#PH+_Yt~~PO|7h}8=8^ww z9{C&d$Um6}xAWjv=fRKW!SBt3zXJRM{LV_I+0&m)m?<~nxTY!0md$Nu%yO(AJLOu= z=C-DIGGT3wbj2-IH+qd3NktN-ov>XIjl|>0s7NIS96RB5iT(NM}w zI#wNn=>%Ifd*aEiNIbhASN?1TVM&eEo^l$fY0t#55v(_}6REv^?_TohV@a2cci`GL}0raS}6+w1qg zdN-!EUyQ}XBP>0`#va`#CecJhTBi4hy#Jb7rSJw1-k>LW*n_{X@eU8p8%3Z&*<<_6 z2R-E5!PWjv)f~vd zd2FcWi5y%X`lZ6b9DJNIA@k>R@X{Roa1K5`2QL&05hz5U5P?Dj3K97K7=b@joc}?1 z$6JB$?Zq!F6e4`0e|@TzAR4*lHucD&GK~MA3lC_mZwF<@bNEYd0GGr zACE8(@8(GF)(Yt#_#cGG+cW&59{$@N{s9kvmxtf&;cxZuf9m1WxqRL4TXw#3ZFuLe z!#n={M91dlx`TBugm6hbsK zV>3r9W^E_AXElZVFQ{_&l2?h$KONqABK-8*tHV#9@`sH>;a5((lVIQ@TuvYZfy~iv z85h>4@!P(Nx+~HPI>S3wy-pV%!aLt^%fh#>`Yo7a_nppUj>Qn{L&g1&GrkDjZ2P2t zd>bmM&T1i_vA;hwvx5soj-d5_2Sr;SurOT92@LPcdwX}LIL-*faG49|5UA99BOT< z6K*Ifs8lPAnf^;~+ArvmLC1e)GJ~KeK=*=*6PXO1bnXG&0y^|gCUY2c{%|I90(3oS zrM`JV_i8B_*KZWYV5Kp$tR%1pb%fJ13G-lME(xqHk+-g9qgapM;kR{L<(2Eor+vPn 
zbX#D%SUqFK#fxWOK;oo-4Sr$xBozrnBHYEV0r(yQbfUKIz;7SgJ3v@-dF7w`n#Pau z?T1Iomq!5Cp)cp>)_2Nk>FdkD7ovV%ZoR4M2`Z#Q1PT!-M4%9XLIesCC`6zTfkFfd z5jb}Qvd^9A=gIiFGJcLspGJsbZHD=b=F8Z;J_*20E}0ti^K3!Q^K)L+nx|Wzl%^|^ zeI9P6=J}jrhUWSCH#(1?#P#o<&LoLHpdS+BXTlnEfuFrPqU-rYEvP%f&rVGvBS@1K z$>v&34C?2q_}Q-`T7u4ZDDg9AQxz-1x`T`#(2j7w@73+|vtHc3JokYOmisWH`Ui0d z;SQec?{;kucPu5Z&C#i$`Eux zbg-^2R3ECXsY^???#4xnLbdhtH7rkc)imQtFw&bl;6t%6WMx$bjO^25Ei~x{+_W}! zl*VCV@&~9jc1Pv1BPs)r!JMx~akvr?=qG+*6&Zm$V#;&GFD~tt zyv!=6J__o)+9=(I21QZj1IUl~NTc#)^x+q>IPElf^Gko}I^Z`|+yl-(W#SjW2xIyi z*itL~^8;7H_=LYec5%fCvNeGo?_VH$7ntEjdW8S8k|`TI4TeK+K}5DW_8G7X*TD+0 z#@H5^s7+|rH}=N>>pqV@Q*CS!Gq<2Sy1@^GW2w8=?B!t#Mi)HQgI(#=mI|-6~$B#GnWJM zH&xt)p3rDdq^h-3S~R3*_l~Vc$j2GQ0hwTI3pAqvl$$aE*sRLCHMg`wBJsP09S7?>LF=mCTF2xm2@HGB5f5i5&fwWL^%?&b#O# z8H&eu{h%?>Ne1^zzI^G?W>M4x)qTbT6qdh|VG%{c@cLT@9gh6p))1_S{l>EdJ*=VM zLh(BWHRk`WhHAk)YEYp3-;uAeZ7fC!d>c=@f$=ok#?!`aT#M}d@kQvaaR>4DQT7mJk5hJ#vY%7- zI%RKCc9JrKoGYhnCS~&|TSD0y%GxQrhBBM7A!M`0R7^-gGf1znjF~#F7)_4#XICV?02Md>A_%S&6TR!%>!6!BakR#Y386fduyFbxt#%MC~s*!OaWA${hOI4+b@|DwL zRb;%BjMrm*RTdD^t?EhS#P|wPS{XyVB+oiW_@QO|(u3pI*a4h-a zPG4JOup{F3wpxiEx0fvEHQZF++;Lg|b)v5=8B52l&d!##WF35Da8tU!KjI8&E9)(H zgEi>3B=8zgI_lzGo{p&KYlOS&<(obmt=RQ=!6@6AyiL~CZEjp|c5b+AL;K|$nh}o( zUIl8k6P9Y2EH=`MN0E(oPp_-($qu5oF$&?iI#gFn)|)yvZEkOaak3`E*lKmVBV3eK zEeY3&w0B)+MY$4fxUvni-|8Q5lHEBrq(dW&TyDqXohfTwB;|G{w%Lgo)cZfqMIH0& zAUo(ph6Ezp;aFSkWI8o6x?Ra+976PxlX@dd&H_OlT-)5(+S=YE`Zijw?N~AIZ0c)v zoTSrcrBaa|YfZYl+j3-h@eWu|kPZgucND{%jSqU!*p2tZHbs*IdJt(2Z0>ceh_bGR zr+E-UcqOZ&3G+{+6DhkVVa0;c-iV{DH9F{1Q!?3Svl(imDVcEXo;12hCA__5IqmIh z)Zj{=WR^8Y9-3zFHtIUXM{hHjapxNQcgVi(KfW^{v301oG|WbC!qqE-6@C^ZM&&nyqFjC z#1es2yA{M0(Y2hYH9#A&=sV*w##=Rm%|Z@x*oq9OCeCbB=Zq|-Huh08;A#`5txo&6 z(TZDLZ`#{}v!dlg1VVkg{U?Wbnca?h0H(1jXtZ82qtVy}CRXvHc)UWC8XlV-8 zbSIKEU1>Y++6l#2gL+I;ig#SsMk(2EyEWZ(ps5*1%Ap6CmV2ujmtF@Y@vU~s?y}>y zJG83Tj>Rl((jJJ^*wJnXM7wLSH*~S1#H3l(osP!=Mp8qGXs?q@V7spInoQ9Fj+K%Q zx&T3Rr(0D-JjGD7_sGH8r&t 
z((XcLralygq5}hBAw?;+uqPU=sjI8G!rHt%ekF72L|#iE=H}9R{hl?VeYWwBZ~oj8 zSy#n@e0uOEdw=oU_DJKD08d~K}VCPb4#D76npe{8??C8BmAHTr?nK}e7%z8TTn2> zOtH5?A>{KyW!Z_D^?DBzNX;P-MF-;cs*{y&iie+oF+=j*F{ zo$n=$^L5yH;3?&c&+Di&=qU?Y6nfHV{v4AsRoU+ou*!!h7`Xb%*EJcRfIt~yn($uN zqm#s-z}46WmQ-{GaDC}`M8)4oyfXX!jk&tEI*)t}@ZbmwilGY+6MYi|Ur%O{HNYqM zCJR0r!+Kq&%SoPva_F_dgE>4Gug@dD1Na28q?K@fE7P68HKY+m9syoCN_?IKKFUxD zx`s2sHzPYg+1OEy2O1brlr9%l=FDfVeP81bd*n|8A1w~ke52#Fz{&r;9y@fBJDQyv z^4M?9BX8%ycS*cbwCG0Y8Udw?$i5GE7qDrMNSmULTNg^w!nVsc>4F}Bl#4Ygo}D`L z!2;9mH>32)0y@cwCH40N%ox5mkTN6bL4n=kK-_YzSg3Y%3Ho+`ZANH4YT|^;84~g+ zXvT0}GX#|!PA;!Uh{~O6?oouE>@rj7E_1C}TU)z)xrtVZZP6@!17Y39#0hBv0u>D295HV5yL@q9Wwt zY+Quoj0ibN8KjWat7mm@4Az)~lDaaWX1FrR=)pFrSv%2cVL(bNE_~#bpGPiv(jrI3sompN5$UCG28wOjc={UY{`K z`;59;%5s_Lh4|qDRaQUWcZx`s5+9!wCj?#7%+>dPuV^+4YRbnShG~71tIz8zra>0e z)T@6v@b_?Uko!*;tSK?&_HpG=Ca?bypt!J>?LXgNW!g)@MIs-m$ohPr(*{OOa{CRq zv`2|)Q0psH+2AtKFX2b`K3SjdcQWPm75mR}OmBof-AiSj@24`YrACmv{wq|7-6&8@ zS)cFkGUfZb+`hN}_h|h!+92OAW_m>9q)%5Bz54rrQHa=n_Bg4Jmo&d~lXv{@M`f;l zQ0p<>o2!g6ABLIk_vjzg6--%<%`ngOfJgtZu3)-{6*Xl&rcZkG`8#h+f9aV&tjA{Q z4p?sgd40*0pJQNoZ~UIq`aJ$3s3kQ$tXU=+W>)+hC2F7b4{HIYYqOH7;BEh30O#sY zt5$MM>Aq%e;`zha8z@jr^k$!3NAvf^yzRrS;eG#L-4rpQ)Wk|Dpz^u!;rq=u@?)i&C<;Q7LTK8~kjrbRUZ6 m10O%>x(d%7rJ?d;&5C}-W8C8o&TU86-xgN-LmmYWEB+hAz@k(D diff --git a/source/dnode/vnode/src/tq/tqStreamStateSnap.c b/source/dnode/vnode/src/tq/tqStreamStateSnap.c index 655778568b..be768e375e 100644 --- a/source/dnode/vnode/src/tq/tqStreamStateSnap.c +++ b/source/dnode/vnode/src/tq/tqStreamStateSnap.c @@ -38,7 +38,8 @@ int32_t streamStateSnapReaderOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS // alloc pReader = (SStreamStateReader*)taosMemoryCalloc(1, sizeof(SStreamStateReader)); if (pReader == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; + terrno = TSDB_CODE_OUT_OF_MEMORY; + code = terrno; goto _err; } @@ -54,7 +55,7 @@ int32_t streamStateSnapReaderOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS if (streamSnapReaderOpen(meta, sver, chkpId, 
meta->path, &pSnapReader) == 0) { pReader->complete = 1; } else { - code = -1; + code = terrno; taosMemoryFree(pReader); goto _err; } @@ -131,7 +132,8 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS // alloc pWriter = (SStreamStateWriter*)taosMemoryCalloc(1, sizeof(*pWriter)); if (pWriter == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; + terrno = TSDB_CODE_OUT_OF_MEMORY; + code = terrno; goto _err; } pWriter->pTq = pTq; diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 4ba409e3f0..54abba8bdc 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -214,14 +214,14 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { char* state = taosMemoryCalloc(1, cap); if (state == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno; } nBytes = snprintf(state, cap, "%s%s%s", path, TD_DIRSEP, "state"); if (nBytes <= 0 || nBytes >= cap) { terrno = TSDB_CODE_OUT_OF_RANGE; taosMemoryFree(state); - return -1; + return terrno; } if (chkpId != 0) { @@ -229,7 +229,7 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { if (chkp == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(state); - return -1; + return terrno; } nBytes = snprintf(chkp, cap, "%s%s%s%scheckpoint%" PRId64 "", path, TD_DIRSEP, "checkpoints", TD_DIRSEP, chkpId); @@ -237,7 +237,7 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { terrno = TSDB_CODE_OUT_OF_RANGE; taosMemoryFree(state); taosMemoryFree(chkp); - return -1; + return terrno; } if (taosIsDir(chkp) && isValidCheckpoint(chkp)) { @@ -255,6 +255,7 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { code = taosMkDir(state); if (code != 0) { terrno = TAOS_SYSTEM_ERROR(errno); + code = terrno; } } @@ -262,7 +263,7 @@ int32_t rebuildDirFromCheckpoint(const char* path, 
int64_t chkpId, char** dst) { } *dst = state; - return 0; + return code; } typedef struct { @@ -284,14 +285,14 @@ int32_t remoteChkp_readMetaData(char* path, SSChkpMetaOnS3** pMeta) { char* metaPath = taosMemoryCalloc(1, cap); if (metaPath == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno; } int32_t n = snprintf(metaPath, cap, "%s%s%s", path, TD_DIRSEP, "META"); if (n <= 0 || n >= cap) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(metaPath); - return -1; + return terrno; } pFile = taosOpenFile(path, TD_FILE_READ); @@ -329,29 +330,32 @@ int32_t remoteChkp_readMetaData(char* path, SSChkpMetaOnS3** pMeta) { _EXIT: taosCloseFile(&pFile); taosMemoryFree(metaPath); + code = terrno; return code; } int32_t remoteChkp_validAndCvtMeta(char* path, SSChkpMetaOnS3* pMeta, int64_t chkpId) { int32_t code = -1; int32_t nBytes = 0; + int32_t cap = strlen(path) + 64; char* src = taosMemoryCalloc(1, cap); char* dst = taosMemoryCalloc(1, cap); if (src == NULL || dst == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return code; + goto _EXIT; } if (pMeta->currChkptId != chkpId || pMeta->manifestChkptId != chkpId) { terrno = TSDB_CODE_INVALID_CFG; - return code; + goto _EXIT; } // rename current_chkp/mainfest to current for (int i = 0; i < 2; i++) { char* key = (i == 0 ? 
pMeta->pCurrName : pMeta->pManifestName); if (strlen(key) <= 0) { terrno = TSDB_CODE_INVALID_PARA; + goto _EXIT; } nBytes = snprintf(src, cap, "%s%s%s_%" PRId64 "", path, TD_DIRSEP, key, pMeta->currChkptId); @@ -385,6 +389,7 @@ int32_t remoteChkp_validAndCvtMeta(char* path, SSChkpMetaOnS3* pMeta, int64_t ch _EXIT: taosMemoryFree(src); taosMemoryFree(dst); + code = terrno; return code; } int32_t remoteChkpGetDelFile(char* path, SArray* toDel) { @@ -1495,7 +1500,7 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) if (len == 0) { terrno = TSDB_CODE_INVALID_PARA; stError("failed to load extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(terrno)); - return -1; + return terrno; } int32_t cap = len + 64; @@ -1542,7 +1547,7 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) _EXIT: taosMemoryFree(pDst); taosCloseFile(&pFile); - return code; + return terrno; } int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { int32_t code = -1; @@ -4399,6 +4404,10 @@ int32_t compareHashTableImpl(SHashObj* p1, SHashObj* p2, SArray* diff) { char* name = taosHashGetKey(pIter, &len); if (!isBkdDataMeta(name, len) && !taosHashGet(p1, name, len)) { char* fname = taosMemoryCalloc(1, len + 1); + if (fname == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return terrno; + } strncpy(fname, name, len); taosArrayPush(diff, &fname); } @@ -4410,7 +4419,9 @@ int32_t compareHashTable(SHashObj* p1, SHashObj* p2, SArray* add, SArray* del) { int32_t code = 0; code = compareHashTableImpl(p1, p2, add); - code = compareHashTableImpl(p2, p1, del); + if (code != 0) { + code = compareHashTableImpl(p2, p1, del); + } return code; } @@ -4493,7 +4504,7 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { if (nBytes <= 0 || nBytes >= p->len) { terrno = TSDB_CODE_OUT_OF_RANGE; taosThreadRwlockUnlock(&p->rwLock); - return -1; + return terrno; } taosArrayClearP(p->pAdd, taosMemoryFree); @@ -4502,9 +4513,9 @@ 
int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { TdDirPtr pDir = taosOpenDir(p->buf); if (pDir == NULL) { - terrno = errno; + terrno = TAOS_SYSTEM_ERROR(errno); taosThreadRwlockUnlock(&p->rwLock); - return -1; + return terrno; } TdDirEntryPtr de = NULL; @@ -4514,21 +4525,36 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) continue; if (strlen(name) == currLen && strcmp(name, pCurrent) == 0) { taosMemoryFreeClear(p->pCurrent); + p->pCurrent = taosStrdup(name); + if (p->pCurrent == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + break; + } continue; } if (strlen(name) >= maniLen && strncmp(name, pManifest, maniLen) == 0) { taosMemoryFreeClear(p->pManifest); p->pManifest = taosStrdup(name); + if (p->pManifest == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + break; + } continue; } if (strlen(name) >= sstLen && strncmp(name + strlen(name) - 4, pSST, sstLen) == 0) { - taosHashPut(p->pSstTbl[1 - p->idx], name, strlen(name), &dummy, sizeof(dummy)); + if (taosHashPut(p->pSstTbl[1 - p->idx], name, strlen(name), &dummy, sizeof(dummy)) != 0) { + break; + } continue; } } taosCloseDir(&pDir); + if (terrno != 0) { + taosThreadRwlockUnlock(&p->rwLock); + return terrno; + } if (p->init == 0) { void* pIter = taosHashIterate(p->pSstTbl[1 - p->idx], NULL); @@ -4542,6 +4568,7 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { taosThreadRwlockUnlock(&p->rwLock); return -1; } + strncpy(fname, name, len); taosArrayPush(p->pAdd, &fname); } @@ -4560,7 +4587,7 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { taosArrayClearP(p->pDel, taosMemoryFree); taosHashClear(p->pSstTbl[1 - p->idx]); p->update = 0; - return code; + return terrno; } if (taosArrayGetSize(p->pAdd) == 0 && taosArrayGetSize(p->pDel) == 0) { @@ -4831,7 +4858,8 @@ _ERROR: taosMemoryFree(dstBuf); taosMemoryFree(srcDir); taosMemoryFree(dstDir); - return code; + + return terrno; } SBkdMgt* 
bkdMgtCreate(char* path) { @@ -4893,7 +4921,7 @@ int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, if (path == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosThreadRwlockUnlock(&bm->rwLock); - return -1; + return terrno; } int32_t nBytes = snprintf(path, cap, "%s%s%s", bm->path, TD_DIRSEP, taskId); @@ -4901,21 +4929,20 @@ int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, terrno = TSDB_CODE_OUT_OF_RANGE; taosMemoryFree(path); taosThreadRwlockUnlock(&bm->rwLock); - return -1; + return terrno; } SDbChkp* p = dbChkpCreate(path, chkpId); if (p == NULL) { taosMemoryFree(path); taosThreadRwlockUnlock(&bm->rwLock); - return -1; + return terrno; } if (taosHashPut(bm->pDbChkpTbl, taskId, strlen(taskId), &p, sizeof(void*)) != 0) { dbChkpDestroy(p); - taosMemoryFree(path); taosThreadRwlockUnlock(&bm->rwLock); - return -1; + return terrno; } pChkp = p; @@ -4923,13 +4950,14 @@ int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, taosThreadRwlockUnlock(&bm->rwLock); return code; } else { - code = dbChkpGetDelta(pChkp, chkpId, NULL); - - if (code == 0) code = dbChkpDumpTo(pChkp, dname, list); + terrno = dbChkpGetDelta(pChkp, chkpId, NULL); + if (code == 0) { + terrno = dbChkpDumpTo(pChkp, dname, list); + } } taosThreadRwlockUnlock(&bm->rwLock); - return code; + return terrno; } #ifdef BUILD_NO_CALL diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index d6612cd4d8..df73b9f1c8 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -123,7 +123,7 @@ int32_t streamTaskSendCheckpointTriggerMsg(SStreamTask* pTask, int32_t dstTaskId void* pBuf = rpcMallocCont(size); if (pBuf == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno; } SCheckpointTriggerRsp* pRsp = POINTER_SHIFT(pBuf, sizeof(SMsgHead)); diff --git a/source/libs/stream/src/streamSnapshot.c 
b/source/libs/stream/src/streamSnapshot.c index 7ef4e8ec09..160bd3525d 100644 --- a/source/libs/stream/src/streamSnapshot.c +++ b/source/libs/stream/src/streamSnapshot.c @@ -152,9 +152,20 @@ int32_t streamDestroyTaskDbSnapInfo(void* arg, SArray* snap) { return taskDbDest void snapFileDebugInfo(SBackendSnapFile2* pSnapFile) { if (qDebugFlag & DEBUG_DEBUG) { - int16_t cap = 511; - char* buf = taosMemoryCalloc(1, cap + 1); - sprintf(buf + strlen(buf), "["); + int16_t cap = 512; + + char* buf = taosMemoryCalloc(1, cap); + if (buf == NULL) { + stError("%s failed to alloc memory", STREAM_STATE_TRANSFER, tstrerror(TSDB_CODE_OUT_OF_MEMORY)); + return; + } + + int32_t nBytes = snprintf(buf + strlen(buf), cap, "["); + if (nBytes <= 0 || nBytes >= cap) { + taosMemoryFree(buf); + stError("%s failed to write buf, reason:%s", STREAM_STATE_TRANSFER, tstrerror(TSDB_CODE_OUT_OF_RANGE)); + return; + } if (pSnapFile->pCurrent) sprintf(buf, "current: %s,", pSnapFile->pCurrent); if (pSnapFile->pMainfest) sprintf(buf + strlen(buf), "MANIFEST: %s,", pSnapFile->pMainfest); @@ -219,8 +230,9 @@ int32_t snapFileGenMeta(SBackendSnapFile2* pSnapFile) { int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { TdDirPtr pDir = taosOpenDir(pSnapFile->path); if (NULL == pDir) { + terrno = TAOS_SYSTEM_ERROR(errno); stError("%s failed to open %s", STREAM_STATE_TRANSFER, pSnapFile->path); - return -1; + return terrno; } TdDirEntryPtr pDirEntry; @@ -228,34 +240,58 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { char* name = taosGetDirEntryName(pDirEntry); if (strlen(name) >= strlen(ROCKSDB_CURRENT) && 0 == strncmp(name, ROCKSDB_CURRENT, strlen(ROCKSDB_CURRENT))) { pSnapFile->pCurrent = taosStrdup(name); + if (pSnapFile->pCurrent == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + break; + } continue; } if (strlen(name) >= strlen(ROCKSDB_MAINFEST) && 0 == strncmp(name, ROCKSDB_MAINFEST, strlen(ROCKSDB_MAINFEST))) { pSnapFile->pMainfest = taosStrdup(name); + if (pSnapFile->pMainfest == NULL) { + 
terrno = TSDB_CODE_OUT_OF_MEMORY; + break; + } continue; } if (strlen(name) >= strlen(ROCKSDB_OPTIONS) && 0 == strncmp(name, ROCKSDB_OPTIONS, strlen(ROCKSDB_OPTIONS))) { pSnapFile->pOptions = taosStrdup(name); + if (pSnapFile->pOptions == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + break; + } continue; } if (strlen(name) >= strlen(ROCKSDB_CHECKPOINT_META) && 0 == strncmp(name, ROCKSDB_CHECKPOINT_META, strlen(ROCKSDB_CHECKPOINT_META))) { pSnapFile->pCheckpointMeta = taosStrdup(name); + if (pSnapFile->pCheckpointMeta == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + break; + } continue; } if (strlen(name) >= strlen(ROCKSDB_CHECKPOINT_SELF_CHECK) && 0 == strncmp(name, ROCKSDB_CHECKPOINT_SELF_CHECK, strlen(ROCKSDB_CHECKPOINT_SELF_CHECK))) { pSnapFile->pCheckpointSelfcheck = taosStrdup(name); + if (pSnapFile->pCheckpointSelfcheck == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + break; + } continue; } if (strlen(name) >= strlen(ROCKSDB_SST) && 0 == strncmp(name + strlen(name) - strlen(ROCKSDB_SST), ROCKSDB_SST, strlen(ROCKSDB_SST))) { char* sst = taosStrdup(name); + if (sst == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + break; + } taosArrayPush(pSnapFile->pSst, &sst); } } taosCloseDir(&pDir); - return 0; + return terrno; } int32_t streamBackendSnapInitFile(char* metaPath, SStreamTaskSnap* pSnap, SBackendSnapFile2* pSnapFile) { int32_t code = -1; @@ -535,6 +571,7 @@ int32_t streamSnapWriterOpen(void* pMeta, int64_t sver, int64_t ever, char* path if (pHandle->metaPath == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(pWriter); + return terrno; } pHandle->pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); @@ -542,14 +579,14 @@ int32_t streamSnapWriterOpen(void* pMeta, int64_t sver, int64_t ever, char* path terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(pHandle->metaPath); taosMemoryFree(pWriter); - return -1; + return terrno; } SBackendSnapFile2 snapFile = {0}; if (taosArrayPush(pHandle->pDbSnapSet, &snapFile) == NULL) { terrno = 
TSDB_CODE_OUT_OF_MEMORY; streamSnapWriterClose(pWriter, 0); - return -1; + return terrno; } *ppWriter = pWriter; diff --git a/t.c b/t.c deleted file mode 100644 index a79ed4c134..0000000000 --- a/t.c +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include -#include - -int main() { - char *buf = calloc(1, 4); - int n = snprintf(buf, 4, "size"); - - printf("write size:%d \t buf:%s \t len:%d\n", n, buf, (int)(strlen(buf))); - buf[4] = 10; - return 1; -} From ea01f1eb85dcc83a1cafd8946a224bcc1b0646b3 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 4 Jul 2024 06:38:14 +0000 Subject: [PATCH 27/92] fix comment --- source/libs/stream/src/streamSnapshot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/libs/stream/src/streamSnapshot.c b/source/libs/stream/src/streamSnapshot.c index 160bd3525d..bbf7f5499d 100644 --- a/source/libs/stream/src/streamSnapshot.c +++ b/source/libs/stream/src/streamSnapshot.c @@ -577,8 +577,7 @@ int32_t streamSnapWriterOpen(void* pMeta, int64_t sver, int64_t ever, char* path pHandle->pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); if (pHandle->pDbSnapSet == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - taosMemoryFree(pHandle->metaPath); - taosMemoryFree(pWriter); + streamSnapWriterClose(pWriter, 0); return terrno; } From 64e7c4c84266ac4ef74353eff09863cb66767242 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Thu, 4 Jul 2024 14:56:56 +0800 Subject: [PATCH 28/92] fix:[TS-4921] send data to queue error if monitor thread starts later or failed --- include/libs/monitor/clientMonitor.h | 2 +- source/client/src/clientEnv.c | 6 ++- source/client/src/clientMonitor.c | 64 ++++++++++++++++------------ source/client/src/clientMsgHandler.c | 13 +++--- 4 files changed, 48 insertions(+), 37 deletions(-) diff --git a/include/libs/monitor/clientMonitor.h b/include/libs/monitor/clientMonitor.h index 4c7ab6f65a..1e6db8c00a 100644 --- a/include/libs/monitor/clientMonitor.h +++ b/include/libs/monitor/clientMonitor.h @@ -65,7 +65,7 @@ 
typedef struct { } MonitorSlowLogData; void monitorClose(); -void monitorInit(); +int32_t monitorInit(); void monitorClientSQLReqInit(int64_t clusterKey); void monitorClientSlowQueryInit(int64_t clusterId); diff --git a/source/client/src/clientEnv.c b/source/client/src/clientEnv.c index 3a821768f8..b227a6bd96 100644 --- a/source/client/src/clientEnv.c +++ b/source/client/src/clientEnv.c @@ -867,7 +867,10 @@ void taos_init_imp(void) { tscError("failed to init conv"); return; } - + if (monitorInit() != 0){ + tscError("failed to init monitor"); + return; + } rpcInit(); SCatalogCfg cfg = {.maxDBCacheNum = 100, .maxTblCacheNum = 100}; @@ -891,7 +894,6 @@ void taos_init_imp(void) { taosThreadMutexInit(&appInfo.mutex, NULL); tscCrashReportInit(); - monitorInit(); tscDebug("client is initialized successfully"); } diff --git a/source/client/src/clientMonitor.c b/source/client/src/clientMonitor.c index 479ea76fe3..c3345bf58d 100644 --- a/source/client/src/clientMonitor.c +++ b/source/client/src/clientMonitor.c @@ -18,6 +18,7 @@ int32_t quitCnt = 0; tsem2_t monitorSem; STaosQueue* monitorQueue; SHashObj* monitorSlowLogHash; +char tmpSlowLogPath[PATH_MAX] = {0}; static int32_t getSlowLogTmpDir(char* tmpPath, int32_t size){ if (tsTempDir == NULL) { @@ -690,28 +691,6 @@ static void* monitorThreadFunc(void *param){ } #endif - char tmpPath[PATH_MAX] = {0}; - if (getSlowLogTmpDir(tmpPath, sizeof(tmpPath)) < 0){ - return NULL; - } - - if (taosMulModeMkDir(tmpPath, 0777, true) != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); - printf("failed to create dir:%s since %s", tmpPath, terrstr()); - return NULL; - } - - if (tsem2_init(&monitorSem, 0, 0) != 0) { - tscError("sem init error since %s", terrstr()); - return NULL; - } - - monitorQueue = taosOpenQueue(); - if(monitorQueue == NULL){ - tscError("open queue error since %s", terrstr()); - return NULL; - } - if (-1 != atomic_val_compare_exchange_32(&slowLogFlag, -1, 0)) { return NULL; } @@ -747,7 +726,7 @@ static void* 
monitorThreadFunc(void *param){ monitorSendAllSlowLogFromTempDir(slowLogData->clusterId); } } else if(slowLogData->type == SLOW_LOG_WRITE){ - monitorWriteSlowLog2File(slowLogData, tmpPath); + monitorWriteSlowLog2File(slowLogData, tmpSlowLogPath); } else if(slowLogData->type == SLOW_LOG_READ_RUNNING){ monitorSendSlowLogAtRunning(slowLogData->clusterId); } else if(slowLogData->type == SLOW_LOG_READ_QUIT){ @@ -799,27 +778,59 @@ static void tscMonitorStop() { } } -void monitorInit() { +int32_t monitorInit() { tscInfo("[monitor] tscMonitor init"); monitorCounterHash = (SHashObj*)taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_ENTRY_LOCK); if (monitorCounterHash == NULL) { tscError("failed to create monitorCounterHash"); + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; } taosHashSetFreeFp(monitorCounterHash, destroyMonitorClient); monitorSlowLogHash = (SHashObj*)taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_ENTRY_LOCK); if (monitorSlowLogHash == NULL) { tscError("failed to create monitorSlowLogHash"); + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; } taosHashSetFreeFp(monitorSlowLogHash, destroySlowLogClient); monitorTimer = taosTmrInit(0, 0, 0, "MONITOR"); if (monitorTimer == NULL) { tscError("failed to create monitor timer"); + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + if (getSlowLogTmpDir(tmpSlowLogPath, sizeof(tmpSlowLogPath)) < 0){ + terrno = TSDB_CODE_TSC_INTERNAL_ERROR; + return -1; + } + + if (taosMulModeMkDir(tmpSlowLogPath, 0777, true) != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + tscError("failed to create dir:%s since %s", tmpSlowLogPath, terrstr()); + return -1; + } + + if (tsem2_init(&monitorSem, 0, 0) != 0) { + terrno = TAOS_SYSTEM_ERROR(errno); + tscError("sem init error since %s", terrstr()); + return -1; + } + + monitorQueue = taosOpenQueue(); + if(monitorQueue == NULL){ + tscError("open queue error since %s", terrstr()); + return -1; } taosInitRWLatch(&monitorLock); - 
tscMonitortInit(); + if (tscMonitortInit() != 0){ + return -1; + } + return 0; } void monitorClose() { @@ -845,9 +856,6 @@ int32_t monitorPutData2MonitorQueue(MonitorSlowLogData data){ } *slowLogData = data; tscDebug("[monitor] write slow log to queue, clusterId:%"PRIx64 " type:%d", slowLogData->clusterId, slowLogData->type); - while (atomic_load_32(&slowLogFlag) == -1) { - taosMsleep(5); - } if (taosWriteQitem(monitorQueue, slowLogData) == 0){ tsem2_post(&monitorSem); }else{ diff --git a/source/client/src/clientMsgHandler.c b/source/client/src/clientMsgHandler.c index e5baa7137e..d587deffc5 100644 --- a/source/client/src/clientMsgHandler.c +++ b/source/client/src/clientMsgHandler.c @@ -154,13 +154,14 @@ int32_t processConnectRsp(void* param, SDataBuf* pMsg, int32_t code) { if(taosHashGet(appInfo.pInstMapByClusterId, &connectRsp.clusterId, LONG_BYTES) == NULL){ if(taosHashPut(appInfo.pInstMapByClusterId, &connectRsp.clusterId, LONG_BYTES, &pTscObj->pAppInfo, POINTER_BYTES) != 0){ tscError("failed to put appInfo into appInfo.pInstMapByClusterId"); + }else{ + MonitorSlowLogData data = {0}; + data.clusterId = pTscObj->pAppInfo->clusterId; + data.type = SLOW_LOG_READ_BEGINNIG; + monitorPutData2MonitorQueue(data); + monitorClientSlowQueryInit(connectRsp.clusterId); + monitorClientSQLReqInit(connectRsp.clusterId); } - MonitorSlowLogData data = {0}; - data.clusterId = pTscObj->pAppInfo->clusterId; - data.type = SLOW_LOG_READ_BEGINNIG; - monitorPutData2MonitorQueue(data); - monitorClientSlowQueryInit(connectRsp.clusterId); - monitorClientSQLReqInit(connectRsp.clusterId); } taosThreadMutexLock(&clientHbMgr.lock); From 47b0a0464e62e1458a87799ac5800b451929fd4d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 4 Jul 2024 15:21:16 +0800 Subject: [PATCH 29/92] fix(stream): send checkpoint-source-rsp to mnode before reset task status. 
--- include/libs/stream/tstream.h | 1 + source/dnode/mnode/impl/src/mndStream.c | 3 --- source/dnode/vnode/src/tqCommon/tqCommon.c | 4 +++- source/libs/stream/src/streamCheckpoint.c | 16 ++++++++++++++++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 66b9db47e2..f24f7da7c3 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -801,6 +801,7 @@ int32_t streamTaskBroadcastRetrieveReq(SStreamTask* pTask, SStreamRetrieveReq* r void streamTaskSendRetrieveRsp(SStreamRetrieveReq* pReq, SRpcMsg* pRsp); int32_t streamProcessHeartbeatRsp(SStreamMeta* pMeta, SMStreamHbRspMsg* pRsp); +int32_t streamTaskSendPreparedCheckpointsourceRsp(SStreamTask* pTask); #ifdef __cplusplus diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index a4c03ab3e0..536dfab331 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -1239,9 +1239,6 @@ static int32_t mndProcessStreamCheckpoint(SRpcMsg *pReq) { code = mndProcessStreamCheckpointTrans(pMnode, p, checkpointId, 1, true); sdbRelease(pSdb, p); - // clear the consensus checkpoint info - mndClearConsensusCheckpointId(execInfo.pStreamConsensus, p->uid); - if (code != -1) { started += 1; diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 04a658a30c..1999134754 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -228,6 +228,9 @@ int32_t tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM } updated = streamTaskUpdateEpsetInfo(pTask, req.pNodeList); + + // send the checkpoint-source-rsp for source task to end the checkpoint trans in mnode + streamTaskSendPreparedCheckpointsourceRsp(pTask); streamTaskResetStatus(pTask); streamTaskStopMonitorCheckRsp(&pTask->taskCheckInfo, pTask->id.idStr); @@ -264,7 +267,6 @@ int32_t 
tqStreamTaskProcessUpdateReq(SStreamMeta* pMeta, SMsgCb* cb, SRpcMsg* pM tqDebug("s-task:%s vgId:%d not save task since not update epset actually, stop task", idstr, vgId); } - // stop streamTaskStop(pTask); if (ppHTask != NULL) { streamTaskStop(*ppHTask); diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 1195362ab3..969c2e1795 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -1146,4 +1146,20 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { tmsgSendReq(&pTask->info.mnodeEpset, &msg); return 0; +} + +int32_t streamTaskSendPreparedCheckpointsourceRsp(SStreamTask* pTask) { + int32_t code = 0; + if (pTask->info.taskLevel != TASK_LEVEL__SOURCE) { + return code; + } + + taosThreadMutexLock(&pTask->lock); + SStreamTaskState* p = streamTaskGetStatus(pTask); + if (p->state == TASK_STATUS__CK) { + code = streamTaskSendCheckpointSourceRsp(pTask); + } + taosThreadMutexUnlock(&pTask->lock); + + return code; } \ No newline at end of file From eb1a5e3cc64ab939252835c78398229338a8edf2 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 4 Jul 2024 08:05:51 +0000 Subject: [PATCH 30/92] fix comment --- source/libs/stream/src/streamSnapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamSnapshot.c b/source/libs/stream/src/streamSnapshot.c index bbf7f5499d..3514ad218d 100644 --- a/source/libs/stream/src/streamSnapshot.c +++ b/source/libs/stream/src/streamSnapshot.c @@ -156,7 +156,7 @@ void snapFileDebugInfo(SBackendSnapFile2* pSnapFile) { char* buf = taosMemoryCalloc(1, cap); if (buf == NULL) { - stError("%s failed to alloc memory", STREAM_STATE_TRANSFER, tstrerror(TSDB_CODE_OUT_OF_MEMORY)); + stError("%s failed to alloc memory, reason:%s", STREAM_STATE_TRANSFER, tstrerror(TSDB_CODE_OUT_OF_MEMORY)); return; } From 5b1dddf4d55c6adba643fc42ea40bc813c0faf38 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 4 Jul 
2024 08:14:50 +0000 Subject: [PATCH 31/92] fix comment --- source/libs/stream/src/streamCheckpoint.c | 10 ++++--- source/libs/stream/src/streamSnapshot.c | 35 +++++++++++------------ 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index df73b9f1c8..66dcfec86e 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -1150,12 +1150,14 @@ int32_t deleteCheckpointFile(const char* id, const char* name) { int32_t nBytes = snprintf(object, sizeof(object), "%s/%s", id, name); if (nBytes <= 0 || nBytes >= sizeof(object)) { - terrno = TSDB_CODE_OUT_OF_RANGE; - return -1; + return TSDB_CODE_OUT_OF_RANGE; } - char* tmp = object; - return s3DeleteObjects((const char**)&tmp, 1); + char* tmp = object; + int32_t code = s3DeleteObjects((const char**)&tmp, 1); + if (code != 0) { + return code; + } } int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { diff --git a/source/libs/stream/src/streamSnapshot.c b/source/libs/stream/src/streamSnapshot.c index 3514ad218d..57723132d8 100644 --- a/source/libs/stream/src/streamSnapshot.c +++ b/source/libs/stream/src/streamSnapshot.c @@ -228,6 +228,7 @@ int32_t snapFileGenMeta(SBackendSnapFile2* pSnapFile) { return 0; } int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { + terrno = 0; TdDirPtr pDir = taosOpenDir(pSnapFile->path); if (NULL == pDir) { terrno = TAOS_SYSTEM_ERROR(errno); @@ -294,13 +295,13 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { return terrno; } int32_t streamBackendSnapInitFile(char* metaPath, SStreamTaskSnap* pSnap, SBackendSnapFile2* pSnapFile) { - int32_t code = -1; + terrno = 0; int32_t nBytes = 0; int32_t cap = strlen(pSnap->dbPrefixPath) + 256; char* path = taosMemoryCalloc(1, cap); if (path == NULL) { - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } nBytes = snprintf(path, cap, "%s%s%s%s%s%" PRId64 "", pSnap->dbPrefixPath, TD_DIRSEP, 
"checkpoints", TD_DIRSEP, @@ -324,20 +325,19 @@ int32_t streamBackendSnapInitFile(char* metaPath, SStreamTaskSnap* pSnap, SBacke pSnapFile->path = path; pSnapFile->snapInfo = *pSnap; - if ((code = snapFileReadMeta(pSnapFile)) != 0) { + if ((terrno = snapFileReadMeta(pSnapFile)) != 0) { goto _ERROR; } - if ((code = snapFileGenMeta(pSnapFile)) != 0) { + if ((terrno = snapFileGenMeta(pSnapFile)) != 0) { goto _ERROR; } snapFileDebugInfo(pSnapFile); path = NULL; - code = 0; _ERROR: taosMemoryFree(path); - return code; + return terrno; } void snapFileDestroy(SBackendSnapFile2* pSnap) { taosMemoryFree(pSnap->pCheckpointMeta); @@ -365,19 +365,19 @@ void snapFileDestroy(SBackendSnapFile2* pSnap) { } int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta) { // impl later - int32_t code = 0; + terrno = 0; SArray* pSnapInfoSet = taosArrayInit(4, sizeof(SStreamTaskSnap)); if (pSnapInfoSet == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return terrno; } - code = streamCreateTaskDbSnapInfo(pMeta, path, pSnapInfoSet); - if (code != 0) { + terrno = streamCreateTaskDbSnapInfo(pMeta, path, pSnapInfoSet); + if (terrno != 0) { stError("failed to do task db snap info, reason:%s", tstrerror(terrno)); taosArrayDestroy(pSnapInfoSet); - return -1; + return terrno; } SArray* pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); @@ -391,8 +391,8 @@ int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta SStreamTaskSnap* pSnap = taosArrayGet(pSnapInfoSet, i); SBackendSnapFile2 snapFile = {0}; - code = streamBackendSnapInitFile(path, pSnap, &snapFile); - ASSERT(code == 0); + terrno = streamBackendSnapInitFile(path, pSnap, &snapFile); + ASSERT(terrno == 0); taosArrayPush(pDbSnapSet, &snapFile); } pHandle->pDbSnapSet = pDbSnapSet; @@ -403,9 +403,7 @@ int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta _err: streamSnapHandleDestroy(pHandle); - - code = -1; - return code; + return terrno; } void 
streamSnapHandleDestroy(SStreamSnapHandle* handle) { @@ -437,9 +435,10 @@ int32_t streamSnapReaderOpen(void* pMeta, int64_t sver, int64_t chkpId, char* pa return -1; } - if (streamSnapHandleInit(&pReader->handle, (char*)path, pMeta) < 0) { + int32_t code = streamSnapHandleInit(&pReader->handle, (char*)path, pMeta); + if (code != 0) { taosMemoryFree(pReader); - return -1; + return code; } *ppReader = pReader; From 7171b6dd6d2bfa92baf76cd13427c10d51875979 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 4 Jul 2024 08:34:42 +0000 Subject: [PATCH 32/92] fix comment --- source/libs/stream/src/streamCheckpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 66dcfec86e..1e77e70efa 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -1135,7 +1135,7 @@ int32_t deleteCheckpoint(const char* id) { if (id == NULL || strlen(id) == 0) { terrno = TSDB_CODE_INVALID_PARA; stError("deleteCheckpoint parameters invalid"); - return -1; + return terrno; } if (strlen(tsSnodeAddress) != 0) { return deleteRsync(id); From 3151d0663c87d9e5df89d0b9da9e4b5e3e704fcc Mon Sep 17 00:00:00 2001 From: xjzhou Date: Thu, 4 Jul 2024 16:45:10 +0800 Subject: [PATCH 33/92] update --- source/libs/parser/src/parInsertSql.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/parser/src/parInsertSql.c b/source/libs/parser/src/parInsertSql.c index cd7288f1f5..d4b9f20f51 100644 --- a/source/libs/parser/src/parInsertSql.c +++ b/source/libs/parser/src/parInsertSql.c @@ -2938,7 +2938,7 @@ int32_t parseInsertSql(SParseContext* pCxt, SQuery** pQuery, SCatalogReq* pCatal .usingDuplicateTable = false, .needRequest = true, .forceUpdate = (NULL != pCatalogReq ? 
pCatalogReq->forceUpdate : false), - .isStmtBind = false}; + .isStmtBind = pCxt->isStmtBind}; int32_t code = initInsertQuery(&context, pCatalogReq, pMetaData, pQuery); if (TSDB_CODE_SUCCESS == code) { From c35c634977390d64165a9f4bd03b0ccfc1855f8a Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 4 Jul 2024 17:08:32 +0800 Subject: [PATCH 34/92] fix(stream): add flag to disable the concurrently started consensus-checkpointId procedure. --- include/libs/stream/tstream.h | 4 ++-- source/libs/stream/src/streamCheckpoint.c | 11 +++++++++++ source/libs/stream/src/streamMeta.c | 23 +++++++++++++++-------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index f24f7da7c3..093a21c999 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -272,9 +272,8 @@ typedef struct SCheckpointInfo { int64_t checkpointTime; // latest checkpoint time int64_t processedVer; int64_t nextProcessVer; // current offset in WAL, not serialize it - + int64_t msgVer; SActiveCheckpointInfo* pActiveInfo; - int64_t msgVer; } SCheckpointInfo; typedef struct SStreamStatus { @@ -289,6 +288,7 @@ typedef struct SStreamStatus { int32_t inScanHistorySentinel; bool appendTranstateBlock; // has append the transfer state data block already bool removeBackendFiles; // remove backend files on disk when free stream tasks + bool sendConsensusChkptId; } SStreamStatus; typedef struct SDataRange { diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 969c2e1795..0f39ca7213 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -1107,6 +1107,17 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { const char* id = pTask->id.idStr; SCheckpointInfo* pInfo = &pTask->chkInfo; + taosThreadMutexLock(&pTask->lock); + if (pTask->status.sendConsensusChkptId == true) { + stDebug("s-task:%s already start to 
consensus-checkpointId, not start again before it completed", id); + taosThreadMutexUnlock(&pTask->lock); + return TSDB_CODE_SUCCESS; + } else { + pTask->status.sendConsensusChkptId = true; + } + + taosThreadMutexUnlock(&pTask->lock); + ASSERT(pTask->pBackend == NULL); SRestoreCheckpointInfo req = { diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 7b94f642e2..19cb2f7854 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1198,7 +1198,7 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { continue; } - if ((pTask->pBackend == NULL) && (pTask->info.fillHistory == 1 || HAS_RELATED_FILLHISTORY_TASK(pTask))) { + if ((pTask->pBackend == NULL) && ((pTask->info.fillHistory == 1) || HAS_RELATED_FILLHISTORY_TASK(pTask))) { code = pMeta->expandTaskFn(pTask); if (code != TSDB_CODE_SUCCESS) { stError("s-task:0x%x vgId:%d failed to expand stream backend", pTaskId->taskId, vgId); @@ -1392,17 +1392,24 @@ int32_t streamMetaAddTaskLaunchResult(SStreamMeta* pMeta, int64_t streamId, int3 streamMetaWLock(pMeta); - if (pStartInfo->startAllTasks != 1) { - int64_t el = endTs - startTs; - stDebug("vgId:%d not start all task(s), not record status, s-task:0x%x launch succ:%d elapsed time:%" PRId64 "ms", - pMeta->vgId, taskId, ready, el); + SStreamTask** p = taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); + if (p == NULL) { // task does not exists in current vnode, not record the complete info + stError("vgId:%d s-task:0x%x not exists discard the check downstream info", pMeta->vgId, taskId); streamMetaWUnLock(pMeta); return 0; } - void* p = taosHashGet(pMeta->pTasksMap, &id, sizeof(id)); - if (p == NULL) { // task does not exists in current vnode, not record the complete info - stError("vgId:%d s-task:0x%x not exists discard the check downstream info", pMeta->vgId, taskId); + // clear the send consensus-checkpointId flag + taosThreadMutexLock(&(*p)->lock); + (*p)->status.sendConsensusChkptId = false; + 
taosThreadMutexUnlock(&(*p)->lock); + + if (pStartInfo->startAllTasks != 1) { + int64_t el = endTs - startTs; + stDebug( + "vgId:%d not in start all task(s) process, not record launch result status, s-task:0x%x launch succ:%d elapsed " + "time:%" PRId64 "ms", + pMeta->vgId, taskId, ready, el); streamMetaWUnLock(pMeta); return 0; } From 79f1e90743e883b8c7b7c034ba139e895fa2e88b Mon Sep 17 00:00:00 2001 From: kailixu Date: Thu, 4 Jul 2024 17:11:53 +0800 Subject: [PATCH 35/92] fix: oom in rpc queue --- source/dnode/mgmt/mgmt_vnode/src/vmWorker.c | 3 ++- source/dnode/mgmt/node_mgmt/src/dmTransport.c | 4 +++- source/util/src/tqueue.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c index 45d1486912..8c1b33cb14 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c @@ -287,7 +287,8 @@ int32_t vmPutRpcMsgToQueue(SVnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) { return -1; } - SRpcMsg *pMsg = taosAllocateQitem(sizeof(SRpcMsg), RPC_QITEM, pRpc->contLen); + EQItype itype = APPLY_QUEUE == qtype ? DEF_QITEM : RPC_QITEM; + SRpcMsg *pMsg = taosAllocateQitem(sizeof(SRpcMsg), itype, pRpc->contLen); if (pMsg == NULL) { rpcFreeCont(pRpc->pCont); pRpc->pCont = NULL; diff --git a/source/dnode/mgmt/node_mgmt/src/dmTransport.c b/source/dnode/mgmt/node_mgmt/src/dmTransport.c index 74bf1f964c..bc269a6410 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmTransport.c +++ b/source/dnode/mgmt/node_mgmt/src/dmTransport.c @@ -208,7 +208,9 @@ static void dmProcessRpcMsg(SDnode *pDnode, SRpcMsg *pRpc, SEpSet *pEpSet) { } pRpc->info.wrapper = pWrapper; - pMsg = taosAllocateQitem(sizeof(SRpcMsg), RPC_QITEM, pRpc->contLen); + + EQItype itype = IsReq(pRpc) ? 
RPC_QITEM : DEF_QITEM; // resp msg is not limited by tsRpcQueueMemoryUsed + pMsg = taosAllocateQitem(sizeof(SRpcMsg), itype, pRpc->contLen); if (pMsg == NULL) goto _OVER; memcpy(pMsg, pRpc, sizeof(SRpcMsg)); diff --git a/source/util/src/tqueue.c b/source/util/src/tqueue.c index 7a4eb09b99..aa8834c89f 100644 --- a/source/util/src/tqueue.c +++ b/source/util/src/tqueue.c @@ -494,6 +494,8 @@ int32_t taosReadAllQitemsFromQset(STaosQset *qset, STaosQall *qall, SQueueInfo * qall->start = queue->head; qall->numOfItems = queue->numOfItems; qall->memOfItems = queue->memOfItems; + qall->unAccessedNumOfItems = queue->numOfItems; + qall->unAccessMemOfItems = queue->memOfItems; code = qall->numOfItems; qinfo->ahandle = queue->ahandle; From e6defda0d06bfee1841cbab56a7daa2d567895d3 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 4 Jul 2024 17:48:58 +0800 Subject: [PATCH 36/92] fix(stream): check for checkpoint interrpution in sendReady monitor. --- source/libs/stream/src/streamCheckpoint.c | 6 ++++-- source/libs/stream/src/streamDispatch.c | 13 ++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 0f39ca7213..d5f7d6ef21 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -405,12 +405,14 @@ int32_t streamTaskProcessCheckpointReadyRsp(SStreamTask* pTask, int32_t upstream void streamTaskClearCheckInfo(SStreamTask* pTask, bool clearChkpReadyMsg) { pTask->chkInfo.startTs = 0; // clear the recorded start time - - streamTaskClearActiveInfo(pTask->chkInfo.pActiveInfo); streamTaskOpenAllUpstreamInput(pTask); // open inputQ for all upstream tasks + + taosThreadMutexLock(&pTask->chkInfo.pActiveInfo->lock); + streamTaskClearActiveInfo(pTask->chkInfo.pActiveInfo); if (clearChkpReadyMsg) { streamClearChkptReadyMsg(pTask->chkInfo.pActiveInfo); } + taosThreadMutexUnlock(&pTask->chkInfo.pActiveInfo->lock); } int32_t 
streamTaskUpdateTaskCheckpointInfo(SStreamTask* pTask, bool restored, SVUpdateCheckpointInfoReq* pReq) { diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 83e73e8c88..5164e20ec9 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -813,9 +813,20 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { taosThreadMutexLock(&pActiveInfo->lock); SArray* pList = pActiveInfo->pReadyMsgList; + int32_t num = taosArrayGetSize(pList); + + // active checkpoint info is cleared for now + if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { + taosThreadMutexUnlock(&pActiveInfo->lock); + int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + stWarn("s-task:0x%x vgId:%d active checkpoint may failed, quit from readyMsg send tmr, ref:%d", id, vgId, ref); + + streamMetaReleaseTask(pTask->pMeta, pTask); + return; + } + SArray* pNotRspList = taosArrayInit(4, sizeof(int32_t)); - int32_t num = taosArrayGetSize(pList); ASSERT(taosArrayGetSize(pTask->upstreamInfo.pList) == num); for (int32_t i = 0; i < num; ++i) { From 998421e5ad79315892fe5ef639e2baf2f30b6237 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 4 Jul 2024 17:49:59 +0800 Subject: [PATCH 37/92] fix(stream): update some logs. 
--- source/libs/stream/src/streamDispatch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 5164e20ec9..8bf0ccca53 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -819,7 +819,7 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { taosThreadMutexUnlock(&pActiveInfo->lock); int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); - stWarn("s-task:0x%x vgId:%d active checkpoint may failed, quit from readyMsg send tmr, ref:%d", id, vgId, ref); + stWarn("s-task:0x%x vgId:%d active checkpoint may be cleared, quit from readyMsg send tmr, ref:%d", id, vgId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); return; From ea7733f9ce1d823ea67113ec80bcadc6d11774fa Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Thu, 4 Jul 2024 18:10:10 +0800 Subject: [PATCH 38/92] fix[TD-30883] tmqParamsTest.py failed in some times --- tests/system-test/7-tmq/tmqParamsTest.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/system-test/7-tmq/tmqParamsTest.py b/tests/system-test/7-tmq/tmqParamsTest.py index b25f23ef11..3d5fb52da5 100644 --- a/tests/system-test/7-tmq/tmqParamsTest.py +++ b/tests/system-test/7-tmq/tmqParamsTest.py @@ -123,6 +123,7 @@ class TDTestCase: tmqCom.insert_data(tdSql,paraDict["dbName"],paraDict["ctbPrefix"],paraDict["ctbNum"],paraDict["rowsPerTbl"],paraDict["batchNum"],int(round(time.time()*1000))) stop_flag = 1 finally: + time.sleep(5) #wait for send heartbeat to update subscription info. 
consumer.unsubscribe() consumer.close() tdSql.checkEqual(consumer_info, expected_parameters) @@ -134,6 +135,8 @@ class TDTestCase: if offset_value != "earliest" and offset_value != "": if offset_value == "latest": offset_value_list = list(map(lambda x: (x[-2].replace("wal:", "").replace("earliest", "0").replace("latest", "0").replace(offset_value, "0")), subscription_info)) + if None in offset_value_list: + continue offset_value_list1 = list(map(lambda x: int(x.split("/")[0]), offset_value_list)) offset_value_list2 = list(map(lambda x: int(x.split("/")[1]), offset_value_list)) tdSql.checkEqual(offset_value_list1 == offset_value_list2, True) @@ -142,6 +145,8 @@ class TDTestCase: tdSql.checkEqual(sum(rows_value_list), expected_res) elif offset_value == "none": offset_value_list = list(map(lambda x: x[-2], subscription_info)) + if None in offset_value_list: + continue offset_value_list1 = list(map(lambda x: (x.split("/")[0]), offset_value_list)) tdSql.checkEqual(offset_value_list1, ['none']*len(subscription_info)) rows_value_list = list(map(lambda x: x[-1], subscription_info)) @@ -155,6 +160,8 @@ class TDTestCase: # tdSql.checkEqual(sum(rows_value_list), expected_res) else: offset_value_list = list(map(lambda x: x[-2], subscription_info)) + if None in offset_value_list: + continue offset_value_list1 = list(map(lambda x: (x.split("/")[0]), offset_value_list)) tdSql.checkEqual(offset_value_list1, [None]*len(subscription_info)) rows_value_list = list(map(lambda x: x[-1], subscription_info)) @@ -162,6 +169,8 @@ class TDTestCase: else: if offset_value != "none": offset_value_list = list(map(lambda x: (x[-2].replace("wal:", "").replace("earliest", "0").replace("latest", "0").replace(offset_value, "0")), subscription_info)) + if None in offset_value_list: + continue offset_value_list1 = list(map(lambda x: int(x.split("/")[0]), offset_value_list)) offset_value_list2 = list(map(lambda x: int(x.split("/")[1]), offset_value_list)) tdSql.checkEqual(offset_value_list1 <= 
offset_value_list2, True) @@ -170,6 +179,8 @@ class TDTestCase: tdSql.checkEqual(sum(rows_value_list), expected_res) else: offset_value_list = list(map(lambda x: x[-2], subscription_info)) + if None in offset_value_list: + continue offset_value_list1 = list(map(lambda x: (x.split("/")[0]), offset_value_list)) tdSql.checkEqual(offset_value_list1, ['none']*len(subscription_info)) rows_value_list = list(map(lambda x: x[-1], subscription_info)) From a9a6747ac09cead5d0104def2a7b963ee1556d95 Mon Sep 17 00:00:00 2001 From: kailixu Date: Thu, 4 Jul 2024 18:12:55 +0800 Subject: [PATCH 39/92] fix: oom in rpc queue --- source/dnode/mgmt/node_mgmt/src/dmTransport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dnode/mgmt/node_mgmt/src/dmTransport.c b/source/dnode/mgmt/node_mgmt/src/dmTransport.c index bc269a6410..c3dfc1a64c 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmTransport.c +++ b/source/dnode/mgmt/node_mgmt/src/dmTransport.c @@ -209,7 +209,7 @@ static void dmProcessRpcMsg(SDnode *pDnode, SRpcMsg *pRpc, SEpSet *pEpSet) { pRpc->info.wrapper = pWrapper; - EQItype itype = IsReq(pRpc) ? RPC_QITEM : DEF_QITEM; // resp msg is not limited by tsRpcQueueMemoryUsed + EQItype itype = IsReq(pRpc) ? 
RPC_QITEM : DEF_QITEM; // rsp msg should not be restricted by tsRpcQueueMemoryUsed pMsg = taosAllocateQitem(sizeof(SRpcMsg), itype, pRpc->contLen); if (pMsg == NULL) goto _OVER; From 81577b82222013e84a485004f5b8a4846d8f9002 Mon Sep 17 00:00:00 2001 From: kailixu Date: Thu, 4 Jul 2024 18:19:08 +0800 Subject: [PATCH 40/92] fix: oom in rpc queue --- source/dnode/mgmt/node_mgmt/src/dmTransport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dnode/mgmt/node_mgmt/src/dmTransport.c b/source/dnode/mgmt/node_mgmt/src/dmTransport.c index c3dfc1a64c..99d641ff3f 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmTransport.c +++ b/source/dnode/mgmt/node_mgmt/src/dmTransport.c @@ -209,7 +209,7 @@ static void dmProcessRpcMsg(SDnode *pDnode, SRpcMsg *pRpc, SEpSet *pEpSet) { pRpc->info.wrapper = pWrapper; - EQItype itype = IsReq(pRpc) ? RPC_QITEM : DEF_QITEM; // rsp msg should not be restricted by tsRpcQueueMemoryUsed + EQItype itype = IsReq(pRpc) ? RPC_QITEM : DEF_QITEM; // rsp msg is not restricted by tsRpcQueueMemoryUsed pMsg = taosAllocateQitem(sizeof(SRpcMsg), itype, pRpc->contLen); if (pMsg == NULL) goto _OVER; From 8428a5be374aef352de78f9101ad4af5a3c809f8 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 4 Jul 2024 10:25:37 +0000 Subject: [PATCH 41/92] refactor backend --- include/util/taoserror.h | 1 + source/dnode/vnode/src/tq/tqStreamStateSnap.c | 11 +- source/libs/stream/src/streamBackendRocksdb.c | 429 +++++++++--------- source/libs/stream/src/streamCheckpoint.c | 58 ++- source/libs/stream/src/streamSnapshot.c | 92 ++-- source/util/src/terror.c | 1 + 6 files changed, 295 insertions(+), 297 deletions(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 2de336d036..359872e8cd 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -136,6 +136,7 @@ int32_t taosGetErrSize(); #define TSDB_CODE_TIMEOUT_ERROR TAOS_DEF_ERROR_CODE(0, 0x012C) #define TSDB_CODE_MSG_ENCODE_ERROR TAOS_DEF_ERROR_CODE(0, 0x012D) #define 
TSDB_CODE_NO_ENOUGH_DISKSPACE TAOS_DEF_ERROR_CODE(0, 0x012E) +#define TSDB_CODE_THIRDPARTY_ERROR TAOS_DEF_ERROR_CODE(0, 0x012F) #define TSDB_CODE_APP_IS_STARTING TAOS_DEF_ERROR_CODE(0, 0x0130) #define TSDB_CODE_APP_IS_STOPPING TAOS_DEF_ERROR_CODE(0, 0x0131) diff --git a/source/dnode/vnode/src/tq/tqStreamStateSnap.c b/source/dnode/vnode/src/tq/tqStreamStateSnap.c index be768e375e..07bfd52a9c 100644 --- a/source/dnode/vnode/src/tq/tqStreamStateSnap.c +++ b/source/dnode/vnode/src/tq/tqStreamStateSnap.c @@ -132,8 +132,7 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS // alloc pWriter = (SStreamStateWriter*)taosMemoryCalloc(1, sizeof(*pWriter)); if (pWriter == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - code = terrno; + code = TSDB_CODE_OUT_OF_MEMORY; goto _err; } pWriter->pTq = pTq; @@ -141,14 +140,14 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS pWriter->ever = ever; if (taosMkDir(pTq->pStreamMeta->path) != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); tqError("vgId:%d, vnode %s snapshot writer failed to create directory %s since %s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, pTq->pStreamMeta->path, tstrerror(terrno)); goto _err; } SStreamSnapWriter* pSnapWriter = NULL; - if (streamSnapWriterOpen(pTq, sver, ever, pTq->pStreamMeta->path, &pSnapWriter) < 0) { + if ((code = streamSnapWriterOpen(pTq, sver, ever, pTq->pStreamMeta->path, &pSnapWriter)) < 0) { goto _err; } @@ -157,14 +156,14 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS pWriter->pWriterImpl = pSnapWriter; *ppWriter = pWriter; - return code; + return 0; _err: tqError("vgId:%d, vnode %s snapshot writer failed to open since %s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, tstrerror(terrno)); taosMemoryFree(pWriter); *ppWriter = NULL; - return -1; + return code; } int32_t streamStateSnapWriterClose(SStreamStateWriter* pWriter, int8_t rollback) { diff --git 
a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 54abba8bdc..057ff56aa9 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -213,38 +213,34 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { char* state = taosMemoryCalloc(1, cap); if (state == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return terrno; + return TSDB_CODE_OUT_OF_MEMORY; } nBytes = snprintf(state, cap, "%s%s%s", path, TD_DIRSEP, "state"); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; taosMemoryFree(state); - return terrno; + return TSDB_CODE_OUT_OF_RANGE; } if (chkpId != 0) { char* chkp = taosMemoryCalloc(1, cap); if (chkp == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(state); - return terrno; + return TSDB_CODE_OUT_OF_MEMORY; } nBytes = snprintf(chkp, cap, "%s%s%s%scheckpoint%" PRId64 "", path, TD_DIRSEP, "checkpoints", TD_DIRSEP, chkpId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; taosMemoryFree(state); taosMemoryFree(chkp); - return terrno; + return TSDB_CODE_OUT_OF_RANGE; } if (taosIsDir(chkp) && isValidCheckpoint(chkp)) { cleanDir(state, ""); code = backendCopyFiles(chkp, state); if (code != 0) { - stError("failed to restart stream backend from %s, reason: %s", chkp, tstrerror(TAOS_SYSTEM_ERROR(terrno))); + stError("failed to restart stream backend from %s, reason: %s", chkp, tstrerror(TAOS_SYSTEM_ERROR(code))); } else { stInfo("start to restart stream backend at checkpoint path: %s", chkp); } @@ -254,8 +250,7 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { tstrerror(TAOS_SYSTEM_ERROR(errno)), state); code = taosMkDir(state); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); - code = terrno; + code = TAOS_SYSTEM_ERROR(errno); } } @@ -278,50 +273,48 @@ typedef struct { } SSChkpMetaOnS3; int32_t remoteChkp_readMetaData(char* path, 
SSChkpMetaOnS3** pMeta) { - int32_t code = -1; + int32_t code = 0; int32_t cap = strlen(path) + 32; TdFilePtr pFile = NULL; char* metaPath = taosMemoryCalloc(1, cap); if (metaPath == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return terrno; + return TSDB_CODE_OUT_OF_MEMORY; } int32_t n = snprintf(metaPath, cap, "%s%s%s", path, TD_DIRSEP, "META"); if (n <= 0 || n >= cap) { - terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(metaPath); - return terrno; + return TSDB_CODE_OUT_OF_MEMORY; } pFile = taosOpenFile(path, TD_FILE_READ); if (pFile == NULL) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } char buf[256] = {0}; if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } SSChkpMetaOnS3* p = taosMemoryCalloc(1, sizeof(SSChkpMetaOnS3)); if (p == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; goto _EXIT; } n = sscanf(buf, META_ON_S3_FORMATE, p->pCurrName, &p->currChkptId, p->pManifestName, &p->manifestChkptId, p->processName, &p->processId); if (n != 6) { - terrno = TSDB_CODE_INVALID_MSG; + code = TSDB_CODE_INVALID_MSG; taosMemoryFree(p); goto _EXIT; } if (p->currChkptId != p->manifestChkptId) { - terrno = TSDB_CODE_INVALID_MSG; + code = TSDB_CODE_INVALID_MSG; taosMemoryFree(p); goto _EXIT; } @@ -330,53 +323,52 @@ int32_t remoteChkp_readMetaData(char* path, SSChkpMetaOnS3** pMeta) { _EXIT: taosCloseFile(&pFile); taosMemoryFree(metaPath); - code = terrno; return code; } int32_t remoteChkp_validAndCvtMeta(char* path, SSChkpMetaOnS3* pMeta, int64_t chkpId) { - int32_t code = -1; + int32_t code = 0; int32_t nBytes = 0; int32_t cap = strlen(path) + 64; char* src = taosMemoryCalloc(1, cap); char* dst = taosMemoryCalloc(1, cap); if (src == NULL || dst == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; goto _EXIT; } if (pMeta->currChkptId != chkpId || pMeta->manifestChkptId != chkpId) { - terrno = 
TSDB_CODE_INVALID_CFG; + code = TSDB_CODE_INVALID_CFG; goto _EXIT; } // rename current_chkp/mainfest to current for (int i = 0; i < 2; i++) { char* key = (i == 0 ? pMeta->pCurrName : pMeta->pManifestName); if (strlen(key) <= 0) { - terrno = TSDB_CODE_INVALID_PARA; + code = TSDB_CODE_INVALID_PARA; goto _EXIT; } nBytes = snprintf(src, cap, "%s%s%s_%" PRId64 "", path, TD_DIRSEP, key, pMeta->currChkptId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _EXIT; } if (taosStatFile(src, NULL, NULL, NULL) != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } nBytes = snprintf(dst, cap, "%s%s%s", path, TD_DIRSEP, key); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _EXIT; } if (taosRenameFile(src, dst) != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } @@ -389,11 +381,10 @@ int32_t remoteChkp_validAndCvtMeta(char* path, SSChkpMetaOnS3* pMeta, int64_t ch _EXIT: taosMemoryFree(src); taosMemoryFree(dst); - code = terrno; return code; } int32_t remoteChkpGetDelFile(char* path, SArray* toDel) { - int32_t code = -1; + int32_t code = 0; int32_t nBytes = 0; SSChkpMetaOnS3* pMeta = NULL; @@ -408,28 +399,24 @@ int32_t remoteChkpGetDelFile(char* path, SArray* toDel) { int32_t cap = strlen(key) + 32; char* p = taosMemoryCalloc(1, cap); if (p == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(pMeta); - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } nBytes = snprintf(p, cap, "%s_%" PRId64 "", key, pMeta->currChkptId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; taosMemoryFree(pMeta); taosMemoryFree(p); - return code; + return TSDB_CODE_OUT_OF_RANGE; } if (taosArrayPush(toDel, &p) == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(pMeta); taosMemoryFree(p); - return code; + return TSDB_CODE_OUT_OF_MEMORY; } } - code = 0; - return code; + 
return 0; } void cleanDir(const char* pPath, const char* id) { @@ -475,19 +462,18 @@ int32_t rebuildDataFromS3(char* chkpPath, int64_t chkpId) { int32_t code = remoteChkp_readMetaData(chkpPath, &pMeta); if (code != 0) { - return -1; + return code; } if (pMeta->currChkptId != chkpId || pMeta->manifestChkptId != chkpId) { taosMemoryFree(pMeta); - terrno = TSDB_CODE_INVALID_PARA; - return -1; + return TSDB_CODE_INVALID_PARA; } code = remoteChkp_validAndCvtMeta(chkpPath, pMeta, chkpId); if (code != 0) { taosMemoryFree(pMeta); - return -1; + return code; } return chkpAddExtraInfo(chkpPath, chkpId, pMeta->processId); @@ -504,22 +490,20 @@ int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId char* defaultTmp = taosMemoryCalloc(1, cap); if (defaultTmp == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } int32_t nBytes = snprintf(defaultPath, cap, "%s%s", defaultPath, "_tmp"); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; taosMemoryFree(defaultPath); - return -1; + return TSDB_CODE_OUT_OF_RANGE; } if (taosIsDir(defaultTmp)) taosRemoveDir(defaultTmp); if (taosIsDir(defaultPath)) { code = taosRenameFile(defaultPath, defaultTmp); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } else { rename = 1; @@ -527,7 +511,7 @@ int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId } else { code = taosMkDir(defaultPath); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } } @@ -593,7 +577,7 @@ int32_t backendFileCopyFilesImpl(const char* src, const char* dst) { const char* info = "info"; size_t infoLen = strlen(info); - int32_t code = -1; + int32_t code = 0; int32_t sLen = strlen(src); int32_t dLen = strlen(dst); int32_t cap = TMAX(sLen, dLen) + 64; @@ -602,14 +586,17 @@ int32_t backendFileCopyFilesImpl(const char* src, const char* dst) { char* srcName = 
taosMemoryCalloc(1, cap); char* dstName = taosMemoryCalloc(1, cap); if (srcName == NULL || dstName == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + taosMemoryFree(srcName); + taosMemoryFree(dstName); + code = TSDB_CODE_OUT_OF_MEMORY; + return code; } // copy file to dst TdDirPtr pDir = taosOpenDir(src); if (pDir == NULL) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); + goto _ERROR; } errno = 0; @@ -622,36 +609,36 @@ int32_t backendFileCopyFilesImpl(const char* src, const char* dst) { nBytes = snprintf(srcName, cap, "%s%s%s", src, TD_DIRSEP, name); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } nBytes = snprintf(dstName, cap, "%s%s%s", dst, TD_DIRSEP, name); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } if (strncmp(name, current, strlen(name) <= currLen ? strlen(name) : currLen) == 0) { code = copyFiles_create(srcName, dstName, 0); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(code)); goto _ERROR; } } else if (strncmp(name, info, strlen(name) <= infoLen ? 
strlen(name) : infoLen) == 0) { code = copyFiles_create(srcName, dstName, 0); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to copy file, detail: %s to %s reason:%s", srcName, dstName, tstrerror(code)); goto _ERROR; } } else { code = copyFiles_hardlink(srcName, dstName, 0); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("failed to hard link file, detail:%s to %s, reason:%s", srcName, dstName, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to hard link file, detail:%s to %s, reason:%s", srcName, dstName, tstrerror(code)); goto _ERROR; } else { stDebug("succ hard link file:%s to %s", srcName, dstName); @@ -688,8 +675,7 @@ static int32_t rebuildFromLocalCheckpoint(const char* pTaskIdStr, const char* ch if (code != TSDB_CODE_SUCCESS) { cleanDir(defaultPath, pTaskIdStr); stError("%s failed to start stream backend from local %s, reason:%s, try download checkpoint from remote", - pTaskIdStr, checkpointPath, tstrerror(TAOS_SYSTEM_ERROR(terrno))); - terrno = 0; + pTaskIdStr, checkpointPath, tstrerror(TAOS_SYSTEM_ERROR(code))); code = TSDB_CODE_SUCCESS; } else { stInfo("%s copy checkpoint data from:%s to:%s succ, try to start stream backend", pTaskIdStr, checkpointPath, @@ -705,7 +691,7 @@ static int32_t rebuildFromLocalCheckpoint(const char* pTaskIdStr, const char* ch int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId, char** dbPrefixPath, char** dbPath, int64_t* processVer) { - int32_t code = -1; + int32_t code = 0; char* prefixPath = NULL; char* defaultPath = NULL; @@ -721,43 +707,43 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId checkpointPath = taosMemoryCalloc(1, cap); checkpointRoot = taosMemoryCalloc(1, cap); if (prefixPath == NULL || defaultPath == NULL || checkpointPath == NULL || checkpointRoot == 
NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; goto _EXIT; } nBytes = snprintf(prefixPath, cap, "%s%s%s", path, TD_DIRSEP, key); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _EXIT; } code = createDirIfNotExist(prefixPath); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } nBytes = snprintf(defaultPath, cap, "%s%s%s", prefixPath, TD_DIRSEP, "state"); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _EXIT; } code = createDirIfNotExist(defaultPath); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } nBytes = snprintf(checkpointRoot, cap, "%s%s%s", prefixPath, TD_DIRSEP, "checkpoints"); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _EXIT; } code = createDirIfNotExist(checkpointRoot); if (code != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); + code = TAOS_SYSTEM_ERROR(errno); goto _EXIT; } @@ -766,19 +752,18 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId nBytes = snprintf(checkpointPath, cap, "%s%s%s%s%s%" PRId64 "", prefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkptId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _EXIT; } code = rebuildFromLocalCheckpoint(key, checkpointPath, chkptId, defaultPath, processVer); if (code != 0) { - terrno = 0; code = rebuildFromRemoteCheckpoint(key, checkpointPath, chkptId, defaultPath); } if (code != 0) { stError("failed to start stream backend at %s, reason: %s, restart from default defaultPath:%s, reason:%s", - checkpointPath, tstrerror(code), defaultPath, tstrerror(terrno)); + checkpointPath, tstrerror(code), defaultPath, tstrerror(code)); code = 0; // reset the error code } } else { // no valid 
checkpoint id @@ -802,21 +787,6 @@ _EXIT: return code; } -bool streamBackendDataIsExist(const char* path, int64_t chkpId, int32_t vgId) { - bool exist = true; - char* state = taosMemoryCalloc(1, strlen(path) + 32); - if (state == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return false; - } - sprintf(state, "%s%s%s", path, TD_DIRSEP, "state"); - if (!taosDirExist(state)) { - exist = false; - } - taosMemoryFree(state); - return exist; -} - void* streamBackendInit(const char* streamPath, int64_t chkpId, int32_t vgId) { char* backendPath = NULL; int32_t code = rebuildDirFromCheckpoint(streamPath, chkpId, &backendPath); @@ -1313,12 +1283,14 @@ int32_t chkpDoDbCheckpoint(rocksdb_t* db, char* path) { if (cp == NULL || err != NULL) { stError("failed to do checkpoint at:%s, reason:%s", path, err); taosMemoryFreeClear(err); + code = TSDB_CODE_THIRDPARTY_ERROR; goto _ERROR; } rocksdb_checkpoint_create(cp, path, UINT64_MAX, &err); if (err != NULL) { stError("failed to do checkpoint at:%s, reason:%s", path, err); taosMemoryFreeClear(err); + code = TSDB_CODE_THIRDPARTY_ERROR; } else { code = 0; } @@ -1332,13 +1304,17 @@ int32_t chkpPreFlushDb(rocksdb_t* db, rocksdb_column_family_handle_t** cf, int32 char* err = NULL; rocksdb_flushoptions_t* flushOpt = rocksdb_flushoptions_create(); + if (flushOpt == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + rocksdb_flushoptions_set_wait(flushOpt, 1); rocksdb_flush_cfs(db, flushOpt, cf, nCf, &err); if (err != NULL) { stError("failed to flush db before streamBackend clean up, reason:%s", err); taosMemoryFree(err); - code = -1; + code = TSDB_CODE_THIRDPARTY_ERROR; } rocksdb_flushoptions_destroy(flushOpt); return code; @@ -1346,28 +1322,47 @@ int32_t chkpPreFlushDb(rocksdb_t* db, rocksdb_column_family_handle_t** cf, int32 int32_t chkpPreBuildDir(char* path, int64_t chkpId, char** chkpDir, char** chkpIdDir) { int32_t code = 0; - char* pChkpDir = taosMemoryCalloc(1, 256); - char* pChkpIdDir = taosMemoryCalloc(1, 256); + int32_t cap = 
strlen(path) + 256; + int32_t nBytes = 0; - sprintf(pChkpDir, "%s%s%s", path, TD_DIRSEP, "checkpoints"); - code = taosMulModeMkDir(pChkpDir, 0755, true); - if (code != 0) { - stError("failed to prepare checkpoint dir, path:%s, reason:%s", path, tstrerror(code)); - taosMemoryFree(pChkpDir); - taosMemoryFree(pChkpIdDir); - code = -1; - return code; + char* pChkpDir = taosMemoryCalloc(1, cap); + char* pChkpIdDir = taosMemoryCalloc(1, cap); + if (pChkpDir == NULL || pChkpIdDir == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + goto _EXIT; + } + + nBytes = snprintf(pChkpDir, cap, "%s%s%s", path, TD_DIRSEP, "checkpoints"); + if (nBytes <= 0 || nBytes >= cap) { + code = TSDB_CODE_OUT_OF_RANGE; + goto _EXIT; + } + + nBytes = snprintf(pChkpIdDir, cap, "%s%s%s%" PRId64, pChkpDir, TD_DIRSEP, "checkpoint", chkpId); + if (nBytes <= 0 || nBytes >= cap) { + code = TSDB_CODE_OUT_OF_RANGE; + goto _EXIT; + } + + code = taosMulModeMkDir(pChkpDir, 0755, true); + if (code != 0) { + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to prepare checkpoint dir, path:%s, reason:%s", path, tstrerror(code)); + goto _EXIT; } - sprintf(pChkpIdDir, "%s%s%s%" PRId64, pChkpDir, TD_DIRSEP, "checkpoint", chkpId); if (taosIsDir(pChkpIdDir)) { stInfo("stream rm exist checkpoint%s", pChkpIdDir); taosRemoveDir(pChkpIdDir); } + *chkpDir = pChkpDir; *chkpIdDir = pChkpIdDir; - return 0; +_EXIT: + taosMemoryFree(pChkpDir); + taosMemoryFree(pChkpIdDir); + return code; } int32_t taskDbBuildSnap(void* arg, SArray* pSnap) { @@ -1396,7 +1391,6 @@ int32_t taskDbBuildSnap(void* arg, SArray* pSnap) { // remove chkpId from in-use-ckpkIdSet taskDbUnRefChkp(pTaskDb, pTaskDb->chkpId); taskDbRemoveRef(pTaskDb); - code = -1; break; } @@ -1409,8 +1403,7 @@ int32_t taskDbBuildSnap(void* arg, SArray* pSnap) { // remove chkpid from chkp-in-use set taskDbUnRefChkp(pTaskDb, pTaskDb->chkpId); taskDbRemoveRef(pTaskDb); - terrno = TSDB_CODE_OUT_OF_MEMORY; - code = -1; + code = TSDB_CODE_OUT_OF_MEMORY; break; } taosArrayPush(pSnap, 
&snap); @@ -1491,29 +1484,29 @@ int64_t taskGetDBRef(void* arg) { int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) { TdFilePtr pFile = NULL; - int32_t code = -1; + int32_t code = 0; char buf[256] = {0}; int32_t nBytes = 0; int32_t len = strlen(pChkpIdDir); if (len == 0) { - terrno = TSDB_CODE_INVALID_PARA; - stError("failed to load extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(terrno)); - return terrno; + code = TSDB_CODE_INVALID_PARA; + stError("failed to load extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(code)); + return code; } int32_t cap = len + 64; char* pDst = taosMemoryCalloc(1, cap); if (pDst == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; stError("failed to alloc memory to load extra info, dir:%s", pChkpIdDir); goto _EXIT; } nBytes = snprintf(pDst, cap, "%s%sinfo", pChkpIdDir, TD_DIRSEP); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; stError("failed to build dst to load extra info, dir:%s", pChkpIdDir); goto _EXIT; } @@ -1526,31 +1519,31 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) code = 0; goto _EXIT; } else { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("failed to open file to load extra info, file:%s", pDst); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to open file to load extra info, file:%s, reason:%s", pDst, tstrerror(code)); } goto _EXIT; } if (taosReadFile(pFile, buf, sizeof(buf)) <= 0) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("failed to read file to load extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to read file to load extra info, file:%s, reason:%s", pDst, tstrerror(code)); goto _EXIT; } if (sscanf(buf, "%" PRId64 " %" PRId64 "", chkpId, processId) < 2) { - terrno = TSDB_CODE_INVALID_PARA; - stError("failed to read file content to load extra info, file:%s, reason:%s", pDst, 
tstrerror(terrno)); + code = TSDB_CODE_INVALID_PARA; + stError("failed to read file content to load extra info, file:%s, reason:%s", pDst, tstrerror(code)); goto _EXIT; } code = 0; _EXIT: taosMemoryFree(pDst); taosCloseFile(&pFile); - return terrno; + return code; } int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { - int32_t code = -1; + int32_t code = 0; TdFilePtr pFile = NULL; @@ -1559,41 +1552,43 @@ int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { int32_t len = strlen(pChkpIdDir); if (len == 0) { - terrno = TSDB_CODE_INVALID_PARA; - stError("failed to add extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(terrno)); - return -1; + code = TSDB_CODE_INVALID_PARA; + stError("failed to add extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(code)); + return code; } + int32_t cap = len + 64; char* pDst = taosMemoryCalloc(1, cap); if (pDst == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; stError("failed to alloc memory to add extra info, dir:%s", pChkpIdDir); goto _EXIT; } nBytes = snprintf(pDst, cap, "%s%sinfo", pChkpIdDir, TD_DIRSEP); if (nBytes <= 0 || nBytes >= cap) { - stError("failed to build dst to add extra info, dir:%s", pChkpIdDir); + code = TSDB_CODE_OUT_OF_RANGE; + stError("failed to build dst to add extra info, dir:%s, reason:%d", pChkpIdDir, tstrerror(code)); goto _EXIT; } pFile = taosOpenFile(pDst, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); if (pFile == NULL) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("failed to open file to add extra info, file:%s", pDst); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to open file to add extra info, file:%s, reason:%s", pDst, tstrerror(code)); goto _EXIT; } nBytes = snprintf(buf, sizeof(buf), "%" PRId64 " %" PRId64 "", chkpId, processId); if (nBytes <= 0 || nBytes >= sizeof(buf)) { - terrno = TSDB_CODE_OUT_OF_RANGE; - stError("failed to build content to add extra info, dir:%s", pChkpIdDir); + code = 
TSDB_CODE_OUT_OF_RANGE; + stError("failed to build content to add extra info, dir:%s,reason:%d", pChkpIdDir, tstrerror(code)); goto _EXIT; } if (nBytes != taosWriteFile(pFile, buf, nBytes)) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("failed to write file to add extra info, file:%s, reason:%s", pDst, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to write file to add extra info, file:%s, reason:%s", pDst, tstrerror(code)); goto _EXIT; } code = 0; @@ -1606,17 +1601,18 @@ _EXIT: int32_t taskDbDoCheckpoint(void* arg, int64_t chkpId, int64_t processId) { STaskDbWrapper* pTaskDb = arg; int64_t st = taosGetTimestampMs(); - int32_t code = -1; + int32_t code = 0; int64_t refId = pTaskDb->refId; if (taosAcquireRef(taskDbWrapperId, refId) == NULL) { - return -1; + code = terrno; + terrno = 0; + return code; } char* pChkpDir = NULL; char* pChkpIdDir = NULL; - if (chkpPreBuildDir(pTaskDb->path, chkpId, &pChkpDir, &pChkpIdDir) != 0) { - code = -1; + if ((code = chkpPreBuildDir(pTaskDb->path, chkpId, &pChkpDir, &pChkpIdDir)) < 0) { goto _EXIT; } // Get all cf and acquire cfWrappter @@ -2404,16 +2400,15 @@ int32_t taskDbBuildFullPath(char* path, char* key, char** dbFullPath, char** sta int32_t code = 0; char* statePath = taosMemoryCalloc(1, strlen(path) + 128); if (statePath == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } sprintf(statePath, "%s%s%s", path, TD_DIRSEP, key); if (!taosDirExist(statePath)) { code = taosMulMkDir(statePath); if (code != 0) { - terrno = errno; - stError("failed to create dir: %s, reason:%s", statePath, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to create dir: %s, reason:%s", statePath, tstrerror(code)); taosMemoryFree(statePath); return code; } @@ -2422,15 +2417,14 @@ int32_t taskDbBuildFullPath(char* path, char* key, char** dbFullPath, char** sta char* dbPath = taosMemoryCalloc(1, strlen(statePath) + 128); if (dbPath == NULL) { 
taosMemoryFree(statePath); - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } sprintf(dbPath, "%s%s%s", statePath, TD_DIRSEP, "state"); if (!taosDirExist(dbPath)) { code = taosMulMkDir(dbPath); if (code != 0) { - terrno = errno; + code = TAOS_SYSTEM_ERROR(errno); stError("failed to create dir: %s, reason:%s", dbPath, tstrerror(code)); taosMemoryFree(statePath); taosMemoryFree(dbPath); @@ -2511,8 +2505,9 @@ _EXIT: STaskDbWrapper* taskDbOpen(const char* path, const char* key, int64_t chkptId, int64_t* processVer) { char* statePath = NULL; char* dbPath = NULL; - - if (restoreCheckpointData(path, key, chkptId, &statePath, &dbPath, processVer) != 0) { + int code = 0; + if ((code = restoreCheckpointData(path, key, chkptId, &statePath, &dbPath, processVer)) < 0) { + terrno = code; stError("failed to restore checkpoint data, path:%s, key:%s, checkpointId: %" PRId64 "reason:%s", path, key, chkptId, tstrerror(terrno)); return NULL; @@ -2521,17 +2516,14 @@ STaskDbWrapper* taskDbOpen(const char* path, const char* key, int64_t chkptId, i STaskDbWrapper* pTaskDb = taskDbOpenImpl(key, statePath, dbPath); if (pTaskDb != NULL) { int64_t chkpId = -1, ver = -1; - if (chkpLoadExtraInfo(dbPath, &chkpId, &ver) == 0) { + if ((code = chkpLoadExtraInfo(dbPath, &chkpId, &ver) == 0)) { *processVer = ver; } else { - if (terrno == TSDB_CODE_OUT_OF_MEMORY) { - taskDbDestroy(pTaskDb, false); - return NULL; - } else { - // not info file exists, caller handle this situation - terrno = 0; - *processVer = -1; - } + terrno = code; + stError("failed to load extra info, path:%s, key:%s, checkpointId: %" PRId64 "reason:%s", path, key, chkptId, + tstrerror(terrno)); + taskDbDestroy(pTaskDb, false); + return NULL; } } @@ -2623,27 +2615,27 @@ void taskDbDestroy(void* pDb, bool flush) { void taskDbDestroy2(void* pDb) { taskDbDestroy(pDb, true); } int32_t taskDbGenChkpUploadData__rsync(STaskDbWrapper* pDb, int64_t chkpId, char** path) { - int32_t code = -1; + int32_t code = 
0; int64_t refId = pDb->refId; int32_t nBytes = 0; if (taosAcquireRef(taskDbWrapperId, refId) == NULL) { - return -1; + code = terrno; + return code; } int32_t cap = strlen(pDb->path) + 128; char* buf = taosMemoryCalloc(1, cap); if (buf == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } nBytes = snprintf(buf, cap, "%s%s%s%s%s%" PRId64 "", pDb->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkpId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; - return -1; + taosMemoryFree(buf); + return TSDB_CODE_OUT_OF_RANGE; } if (taosIsDir(buf)) { @@ -2660,20 +2652,28 @@ int32_t taskDbGenChkpUploadData__rsync(STaskDbWrapper* pDb, int64_t chkpId, char int32_t taskDbGenChkpUploadData__s3(STaskDbWrapper* pDb, void* bkdChkpMgt, int64_t chkpId, char** path, SArray* list, const char* idStr) { int32_t code = 0; + int32_t cap = strlen(pDb->path) + 32; SBkdMgt* p = (SBkdMgt*)bkdChkpMgt; - char* temp = taosMemoryCalloc(1, strlen(pDb->path) + 32); + char* temp = taosMemoryCalloc(1, cap); if (temp == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } - sprintf(temp, "%s%s%s%" PRId64, pDb->path, TD_DIRSEP, "tmp", chkpId); + int32_t nBytes = snprintf(temp, cap, "%s%s%s%" PRId64, pDb->path, TD_DIRSEP, "tmp", chkpId); + if (nBytes <= 0 || nBytes >= cap) { + taosMemoryFree(temp); + return TSDB_CODE_OUT_OF_RANGE; + } if (taosDirExist(temp)) { cleanDir(temp, idStr); } else { - taosMkDir(temp); + code = taosMkDir(temp); + if (code != 0) { + taosMemoryFree(temp); + return TAOS_SYSTEM_ERROR(errno); + } } code = bkdMgtGetDelta(p, pDb->idstr, chkpId, list, temp); @@ -4405,8 +4405,7 @@ int32_t compareHashTableImpl(SHashObj* p1, SHashObj* p2, SArray* diff) { if (!isBkdDataMeta(name, len) && !taosHashGet(p1, name, len)) { char* fname = taosMemoryCalloc(1, len + 1); if (fname == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return terrno; + return TSDB_CODE_OUT_OF_MEMORY; } 
strncpy(fname, name, len); taosArrayPush(diff, &fname); @@ -4483,6 +4482,7 @@ void dbChkpDebugInfo(SDbChkp* pDb) { } } int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { + int32_t code = 0; int32_t nBytes; taosThreadRwlockWrlock(&p->rwLock); @@ -4502,9 +4502,8 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { nBytes = snprintf(p->buf, p->len, "%s%s%s%scheckpoint%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, chkpId); if (nBytes <= 0 || nBytes >= p->len) { - terrno = TSDB_CODE_OUT_OF_RANGE; taosThreadRwlockUnlock(&p->rwLock); - return terrno; + return TSDB_CODE_OUT_OF_RANGE; } taosArrayClearP(p->pAdd, taosMemoryFree); @@ -4513,9 +4512,8 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { TdDirPtr pDir = taosOpenDir(p->buf); if (pDir == NULL) { - terrno = TAOS_SYSTEM_ERROR(errno); taosThreadRwlockUnlock(&p->rwLock); - return terrno; + return TAOS_SYSTEM_ERROR(errno); } TdDirEntryPtr de = NULL; @@ -4528,7 +4526,7 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { p->pCurrent = taosStrdup(name); if (p->pCurrent == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; break; } continue; @@ -4538,7 +4536,7 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { taosMemoryFreeClear(p->pManifest); p->pManifest = taosStrdup(name); if (p->pManifest == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; break; } continue; @@ -4551,9 +4549,9 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { } } taosCloseDir(&pDir); - if (terrno != 0) { + if (code != 0) { taosThreadRwlockUnlock(&p->rwLock); - return terrno; + return code; } if (p->init == 0) { @@ -4564,9 +4562,8 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { if (name != NULL && !isBkdDataMeta(name, len)) { char* fname = taosMemoryCalloc(1, len + 1); if (fname == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; 
taosThreadRwlockUnlock(&p->rwLock); - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } strncpy(fname, name, len); @@ -4587,7 +4584,7 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { taosArrayClearP(p->pDel, taosMemoryFree); taosHashClear(p->pSstTbl[1 - p->idx]); p->update = 0; - return terrno; + return code; } if (taosArrayGetSize(p->pAdd) == 0 && taosArrayGetSize(p->pDel) == 0) { @@ -4604,7 +4601,7 @@ int32_t dbChkpGetDelta(SDbChkp* p, int64_t chkpId, SArray* list) { taosThreadRwlockUnlock(&p->rwLock); - return 0; + return code; } void dbChkpDestroy(SDbChkp* pChkp); @@ -4698,7 +4695,7 @@ int32_t dbChkpInit(SDbChkp* p) { #endif int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { static char* chkpMeta = "META"; - int32_t code = -1; + int32_t code = 0; int32_t cap = p->len + 128; taosThreadRwlockRdlock(&p->rwLock); @@ -4708,30 +4705,33 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { char* srcDir = taosMemoryCalloc(1, cap); char* dstDir = taosMemoryCalloc(1, cap); if (srcBuf == NULL || dstBuf == NULL || srcDir == NULL || dstDir == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; goto _ERROR; } int nBytes = snprintf(srcDir, cap, "%s%s%s%s%s%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", p->curChkpId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } nBytes = snprintf(dstDir, cap, "%s", dname); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } if (!taosDirExist(srcDir)) { stError("failed to dump srcDir %s, reason: not exist such dir", srcDir); + code = TSDB_CODE_INVALID_PARA; goto _ERROR; } int64_t chkpId = 0, processId = -1; - if (chkpLoadExtraInfo(srcDir, &chkpId, &processId) != 0) { - stError("failed to load extra info from %s, reason:%s", srcDir, terrno != 0 ? 
"unkown" : tstrerror(terrno)); + code = chkpLoadExtraInfo(srcDir, &chkpId, &processId); + if (code < 0) { + stError("failed to load extra info from %s, reason:%s", srcDir, code != 0 ? "unkown" : tstrerror(code)); + goto _ERROR; } @@ -4743,19 +4743,19 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { char* filename = taosArrayGetP(p->pAdd, i); nBytes = snprintf(srcBuf, cap, "%s%s%s", srcDir, TD_DIRSEP, filename); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } nBytes = snprintf(dstBuf, cap, "%s%s%s", dstDir, TD_DIRSEP, filename); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } if (taosCopyFile(srcBuf, dstBuf) < 0) { - terrno = errno; - stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(code)); goto _ERROR; } } @@ -4764,7 +4764,7 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { char* filename = taosArrayGetP(p->pDel, i); char* p = taosStrdup(filename); if (p == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; goto _ERROR; } taosArrayPush(list, &p); @@ -4776,19 +4776,19 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { nBytes = snprintf(srcBuf, cap, "%s%s%s", srcDir, TD_DIRSEP, p->pCurrent); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } nBytes = snprintf(dstBuf, cap, "%s%s%s_%" PRId64 "", dstDir, TD_DIRSEP, p->pCurrent, p->curChkpId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } if (taosCopyFile(srcBuf, dstBuf) < 0) { - terrno = errno; - stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(terrno)); + code = 
TAOS_SYSTEM_ERROR(errno); + stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(code)); goto _ERROR; } @@ -4798,33 +4798,33 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { nBytes = snprintf(srcBuf, cap, "%s%s%s", srcDir, TD_DIRSEP, p->pManifest); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } nBytes = snprintf(dstBuf, cap, "%s%s%s_%" PRId64 "", dstDir, TD_DIRSEP, p->pManifest, p->curChkpId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } if (taosCopyFile(srcBuf, dstBuf) < 0) { - terrno = errno; - stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("failed to copy file from %s to %s, reason:%s", srcBuf, dstBuf, tstrerror(code)); goto _ERROR; } memset(dstBuf, 0, cap); nBytes = snprintf(dstDir, cap, "%s%s%s", dstDir, TD_DIRSEP, chkpMeta); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } TdFilePtr pFile = taosOpenFile(dstDir, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC); if (pFile == NULL) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("chkp failed to create meta file: %s, reason:%s", dstDir, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("chkp failed to create meta file: %s, reason:%s", dstDir, tstrerror(code)); goto _ERROR; } @@ -4832,7 +4832,7 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { nBytes = snprintf(content, sizeof(content), META_ON_S3_FORMATE, p->pCurrent, p->curChkpId, p->pManifest, p->curChkpId, "processVer", processId); if (nBytes <= 0 || nBytes >= sizeof(content)) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; stError("chkp failed to format meta file: %s, reason: invalid msg", dstDir); taosCloseFile(&pFile); goto _ERROR; @@ -4840,8 +4840,8 @@ int32_t 
dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { nBytes = taosWriteFile(pFile, content, strlen(content)); if (nBytes != strlen(content)) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("chkp failed to write meta file: %s,reason:%s", dstDir, tstrerror(terrno)); + code = TAOS_SYSTEM_ERROR(errno); + stError("chkp failed to write meta file: %s,reason:%s", dstDir, tstrerror(code)); taosCloseFile(&pFile); goto _ERROR; } @@ -4859,10 +4859,11 @@ _ERROR: taosMemoryFree(srcDir); taosMemoryFree(dstDir); - return terrno; + return code; } SBkdMgt* bkdMgtCreate(char* path) { + terrno = 0; SBkdMgt* p = taosMemoryCalloc(1, sizeof(SBkdMgt)); if (p == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; @@ -4910,7 +4911,6 @@ void bkdMgtDestroy(SBkdMgt* bm) { } int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, char* dname) { int32_t code = 0; - taosThreadRwlockWrlock(&bm->rwLock); SDbChkp** ppChkp = taosHashGet(bm->pDbChkpTbl, taskId, strlen(taskId)); SDbChkp* pChkp = ppChkp != NULL ? 
*ppChkp : NULL; @@ -4919,30 +4919,31 @@ int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, int32_t cap = strlen(bm->path) + 64; char* path = taosMemoryCalloc(1, cap); if (path == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; taosThreadRwlockUnlock(&bm->rwLock); - return terrno; + return TSDB_CODE_OUT_OF_MEMORY; } int32_t nBytes = snprintf(path, cap, "%s%s%s", bm->path, TD_DIRSEP, taskId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; taosMemoryFree(path); taosThreadRwlockUnlock(&bm->rwLock); - return terrno; + code = TSDB_CODE_OUT_OF_RANGE; + return code; } SDbChkp* p = dbChkpCreate(path, chkpId); if (p == NULL) { taosMemoryFree(path); taosThreadRwlockUnlock(&bm->rwLock); - return terrno; + code = terrno; + return code; } if (taosHashPut(bm->pDbChkpTbl, taskId, strlen(taskId), &p, sizeof(void*)) != 0) { dbChkpDestroy(p); taosThreadRwlockUnlock(&bm->rwLock); - return terrno; + code = terrno; + return code; } pChkp = p; @@ -4950,14 +4951,14 @@ int32_t bkdMgtGetDelta(SBkdMgt* bm, char* taskId, int64_t chkpId, SArray* list, taosThreadRwlockUnlock(&bm->rwLock); return code; } else { - terrno = dbChkpGetDelta(pChkp, chkpId, NULL); + code = dbChkpGetDelta(pChkp, chkpId, NULL); if (code == 0) { - terrno = dbChkpDumpTo(pChkp, dname, list); + code = dbChkpDumpTo(pChkp, dname, list); } } taosThreadRwlockUnlock(&bm->rwLock); - return terrno; + return code; } #ifdef BUILD_NO_CALL diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 1e77e70efa..a66c7a7cfa 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -548,15 +548,13 @@ static int32_t getCheckpointDataMeta(const char* id, const char* path, SArray* l char* filePath = taosMemoryCalloc(1, cap); if (filePath == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } int32_t nBytes = snprintf(filePath, cap, "%s%s%s", path, TD_DIRSEP, 
"META_TMP"); if (nBytes <= 0 || nBytes >= cap) { taosMemoryFree(filePath); - terrno = TSDB_CODE_OUT_OF_RANGE; - return -1; + return TSDB_CODE_OUT_OF_RANGE; } code = downloadCheckpointDataByName(id, "META", filePath); @@ -584,19 +582,18 @@ int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t d SArray* toDelFiles = taosArrayInit(4, POINTER_BYTES); if (toDelFiles == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } if ((code = taskDbGenChkpUploadData(pTask->pBackend, pMeta->bkdChkptMgt, checkpointId, type, &path, toDelFiles, pTask->id.idStr)) != 0) { - stError("s-task:%s failed to gen upload checkpoint:%" PRId64 ", reason:%s", idStr, checkpointId, tstrerror(terrno)); + stError("s-task:%s failed to gen upload checkpoint:%" PRId64 ", reason:%s", idStr, checkpointId, tstrerror(code)); } if (type == DATA_UPLOAD_S3) { if (code == TSDB_CODE_SUCCESS && (code = getCheckpointDataMeta(idStr, path, toDelFiles)) != 0) { stError("s-task:%s failed to get checkpointData for checkpointId:%" PRId64 ", reason:%s", idStr, checkpointId, - tstrerror(terrno)); + tstrerror(code)); } } @@ -1003,11 +1000,13 @@ static int32_t uploadCheckpointToS3(const char* id, const char* path) { int32_t nBytes = 0; if (s3Init() != 0) { - return -1; + return TSDB_CODE_THIRDPARTY_ERROR; } TdDirPtr pDir = taosOpenDir(path); - if (pDir == NULL) return -1; + if (pDir == NULL) { + return TAOS_SYSTEM_ERROR(errno); + } TdDirEntryPtr de = NULL; while ((de = taosReadDir(pDir)) != NULL) { @@ -1018,13 +1017,13 @@ static int32_t uploadCheckpointToS3(const char* id, const char* path) { if (path[strlen(path) - 1] == TD_DIRSEP_CHAR) { nBytes = snprintf(filename, sizeof(filename), "%s%s", path, name); if (nBytes <= 0 || nBytes >= sizeof(filename)) { - code = -1; + code = TSDB_CODE_OUT_OF_RANGE; break; } } else { nBytes = snprintf(filename, sizeof(filename), "%s%s%s", path, TD_DIRSEP, name); if (nBytes <= 0 || nBytes >= sizeof(filename)) { - code = -1; + 
code = TSDB_CODE_OUT_OF_RANGE; break; } } @@ -1032,14 +1031,13 @@ static int32_t uploadCheckpointToS3(const char* id, const char* path) { char object[PATH_MAX] = {0}; nBytes = snprintf(object, sizeof(object), "%s%s%s", id, TD_DIRSEP, name); if (nBytes <= 0 || nBytes >= sizeof(object)) { - code = -1; + code = TSDB_CODE_OUT_OF_RANGE; break; } - if (s3PutObjectFromFile2(filename, object, 0) != 0) { - terrno = TAOS_SYSTEM_ERROR(errno); - code = -1; - stError("[s3] failed to upload checkpoint:%s", filename); + code = s3PutObjectFromFile2(filename, object, 0); + if (code != 0) { + stError("[s3] failed to upload checkpoint:%s, reason:%s", filename, tstrerror(code)); } else { stDebug("[s3] upload checkpoint:%s", filename); } @@ -1054,21 +1052,18 @@ int32_t downloadCheckpointByNameS3(const char* id, const char* fname, const char char* buf = taosMemoryCalloc(1, cap); if (buf == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } nBytes = snprintf(buf, cap, "%s/%s", id, fname); if (nBytes <= 0 || nBytes >= cap) { taosMemoryFree(buf); - terrno = TSDB_CODE_OUT_OF_RANGE; - return -1; + return TSDB_CODE_OUT_OF_RANGE; } - - if (s3GetObjectToFile(buf, dstName) != 0) { + int32_t code = s3GetObjectToFile(buf, dstName); + if (code != 0) { taosMemoryFree(buf); - terrno = TAOS_SYSTEM_ERROR(errno); - return -1; + return TAOS_SYSTEM_ERROR(errno); } taosMemoryFree(buf); return 0; @@ -1102,9 +1097,8 @@ int32_t streamTaskUploadCheckpoint(const char* id, const char* path) { // fileName: CURRENT int32_t downloadCheckpointDataByName(const char* id, const char* fname, const char* dstName) { if (id == NULL || fname == NULL || strlen(id) == 0 || strlen(fname) == 0 || strlen(fname) >= PATH_MAX) { - terrno = TSDB_CODE_INVALID_PARA; stError("down load checkpoint data parameters invalid"); - return -1; + return TSDB_CODE_INVALID_PARA; } if (strlen(tsSnodeAddress) != 0) { @@ -1133,9 +1127,8 @@ int32_t streamTaskDownloadCheckpointData(const char* id, char* path) 
{ int32_t deleteCheckpoint(const char* id) { if (id == NULL || strlen(id) == 0) { - terrno = TSDB_CODE_INVALID_PARA; stError("deleteCheckpoint parameters invalid"); - return terrno; + return TSDB_CODE_INVALID_PARA; } if (strlen(tsSnodeAddress) != 0) { return deleteRsync(id); @@ -1156,8 +1149,9 @@ int32_t deleteCheckpointFile(const char* id, const char* name) { char* tmp = object; int32_t code = s3DeleteObjects((const char**)&tmp, 1); if (code != 0) { - return code; + return TSDB_CODE_THIRDPARTY_ERROR; } + return code; } int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { @@ -1180,14 +1174,14 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { tEncodeSize(tEncodeRestoreCheckpointInfo, &req, tlen, code); if (code < 0) { stError("s-task:%s vgId:%d encode stream task latest-checkpoint-id failed, code:%s", id, vgId, tstrerror(code)); - return -1; + return TSDB_CODE_INVALID_MSG; } void* buf = rpcMallocCont(tlen); if (buf == NULL) { stError("s-task:%s vgId:%d encode stream task latest-checkpoint-id msg failed, code:%s", id, vgId, tstrerror(TSDB_CODE_OUT_OF_MEMORY)); - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } SEncoder encoder; diff --git a/source/libs/stream/src/streamSnapshot.c b/source/libs/stream/src/streamSnapshot.c index 57723132d8..878cb2ac71 100644 --- a/source/libs/stream/src/streamSnapshot.c +++ b/source/libs/stream/src/streamSnapshot.c @@ -228,12 +228,12 @@ int32_t snapFileGenMeta(SBackendSnapFile2* pSnapFile) { return 0; } int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { - terrno = 0; + int32_t code = 0; TdDirPtr pDir = taosOpenDir(pSnapFile->path); if (NULL == pDir) { - terrno = TAOS_SYSTEM_ERROR(errno); - stError("%s failed to open %s", STREAM_STATE_TRANSFER, pSnapFile->path); - return terrno; + code = TAOS_SYSTEM_ERROR(errno); + stError("%s failed to open %s, reason:%s", STREAM_STATE_TRANSFER, pSnapFile->path, tstrerror(code)); + return code; } TdDirEntryPtr pDirEntry; @@ -242,7 +242,7 @@ int32_t 
snapFileReadMeta(SBackendSnapFile2* pSnapFile) { if (strlen(name) >= strlen(ROCKSDB_CURRENT) && 0 == strncmp(name, ROCKSDB_CURRENT, strlen(ROCKSDB_CURRENT))) { pSnapFile->pCurrent = taosStrdup(name); if (pSnapFile->pCurrent == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; break; } continue; @@ -250,7 +250,7 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { if (strlen(name) >= strlen(ROCKSDB_MAINFEST) && 0 == strncmp(name, ROCKSDB_MAINFEST, strlen(ROCKSDB_MAINFEST))) { pSnapFile->pMainfest = taosStrdup(name); if (pSnapFile->pMainfest == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; break; } continue; @@ -258,7 +258,7 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { if (strlen(name) >= strlen(ROCKSDB_OPTIONS) && 0 == strncmp(name, ROCKSDB_OPTIONS, strlen(ROCKSDB_OPTIONS))) { pSnapFile->pOptions = taosStrdup(name); if (pSnapFile->pOptions == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; break; } continue; @@ -267,7 +267,7 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { 0 == strncmp(name, ROCKSDB_CHECKPOINT_META, strlen(ROCKSDB_CHECKPOINT_META))) { pSnapFile->pCheckpointMeta = taosStrdup(name); if (pSnapFile->pCheckpointMeta == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; break; } continue; @@ -276,7 +276,7 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { 0 == strncmp(name, ROCKSDB_CHECKPOINT_SELF_CHECK, strlen(ROCKSDB_CHECKPOINT_SELF_CHECK))) { pSnapFile->pCheckpointSelfcheck = taosStrdup(name); if (pSnapFile->pCheckpointSelfcheck == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; break; } continue; @@ -285,17 +285,17 @@ int32_t snapFileReadMeta(SBackendSnapFile2* pSnapFile) { 0 == strncmp(name + strlen(name) - strlen(ROCKSDB_SST), ROCKSDB_SST, strlen(ROCKSDB_SST))) { char* sst = taosStrdup(name); if (sst == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = 
TSDB_CODE_OUT_OF_MEMORY; break; } taosArrayPush(pSnapFile->pSst, &sst); } } taosCloseDir(&pDir); - return terrno; + return code; } int32_t streamBackendSnapInitFile(char* metaPath, SStreamTaskSnap* pSnap, SBackendSnapFile2* pSnapFile) { - terrno = 0; + int32_t code = 0; int32_t nBytes = 0; int32_t cap = strlen(pSnap->dbPrefixPath) + 256; @@ -307,28 +307,28 @@ int32_t streamBackendSnapInitFile(char* metaPath, SStreamTaskSnap* pSnap, SBacke nBytes = snprintf(path, cap, "%s%s%s%s%s%" PRId64 "", pSnap->dbPrefixPath, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", pSnap->chkpId); if (nBytes <= 0 || nBytes >= cap) { - terrno = TSDB_CODE_OUT_OF_RANGE; + code = TSDB_CODE_OUT_OF_RANGE; goto _ERROR; } if (!taosIsDir(path)) { - terrno = TSDB_CODE_INVALID_MSG; + code = TSDB_CODE_INVALID_MSG; goto _ERROR; } pSnapFile->pSst = taosArrayInit(16, sizeof(void*)); pSnapFile->pFileList = taosArrayInit(64, sizeof(SBackendFileItem)); if (pSnapFile->pSst == NULL || pSnapFile->pFileList == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; + code = TSDB_CODE_OUT_OF_MEMORY; goto _ERROR; } pSnapFile->path = path; pSnapFile->snapInfo = *pSnap; - if ((terrno = snapFileReadMeta(pSnapFile)) != 0) { + if ((code = snapFileReadMeta(pSnapFile)) != 0) { goto _ERROR; } - if ((terrno = snapFileGenMeta(pSnapFile)) != 0) { + if ((code = snapFileGenMeta(pSnapFile)) != 0) { goto _ERROR; } @@ -337,7 +337,7 @@ int32_t streamBackendSnapInitFile(char* metaPath, SStreamTaskSnap* pSnap, SBacke _ERROR: taosMemoryFree(path); - return terrno; + return code; } void snapFileDestroy(SBackendSnapFile2* pSnap) { taosMemoryFree(pSnap->pCheckpointMeta); @@ -365,34 +365,32 @@ void snapFileDestroy(SBackendSnapFile2* pSnap) { } int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta) { // impl later - - terrno = 0; + int32_t code = 0; SArray* pSnapInfoSet = taosArrayInit(4, sizeof(SStreamTaskSnap)); if (pSnapInfoSet == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return terrno; + return 
TSDB_CODE_OUT_OF_MEMORY; } - terrno = streamCreateTaskDbSnapInfo(pMeta, path, pSnapInfoSet); - if (terrno != 0) { - stError("failed to do task db snap info, reason:%s", tstrerror(terrno)); + code = streamCreateTaskDbSnapInfo(pMeta, path, pSnapInfoSet); + if (code != 0) { + stError("failed to do task db snap info, reason:%s", tstrerror(code)); taosArrayDestroy(pSnapInfoSet); - return terrno; + return code; } SArray* pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); if (pDbSnapSet == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; taosArrayDestroy(pSnapInfoSet); - return -1; + code = TSDB_CODE_OUT_OF_MEMORY; + return code; } for (int32_t i = 0; i < taosArrayGetSize(pSnapInfoSet); i++) { SStreamTaskSnap* pSnap = taosArrayGet(pSnapInfoSet, i); SBackendSnapFile2 snapFile = {0}; - terrno = streamBackendSnapInitFile(path, pSnap, &snapFile); - ASSERT(terrno == 0); + code = streamBackendSnapInitFile(path, pSnap, &snapFile); + ASSERT(code == 0); taosArrayPush(pDbSnapSet, &snapFile); } pHandle->pDbSnapSet = pDbSnapSet; @@ -403,7 +401,7 @@ int32_t streamSnapHandleInit(SStreamSnapHandle* pHandle, char* path, void* pMeta _err: streamSnapHandleDestroy(pHandle); - return terrno; + return code; } void streamSnapHandleDestroy(SStreamSnapHandle* handle) { @@ -431,8 +429,7 @@ int32_t streamSnapReaderOpen(void* pMeta, int64_t sver, int64_t chkpId, char* pa // impl later SStreamSnapReader* pReader = taosMemoryCalloc(1, sizeof(SStreamSnapReader)); if (pReader == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; + return TSDB_CODE_OUT_OF_MEMORY; } int32_t code = streamSnapHandleInit(&pReader->handle, (char*)path, pMeta); @@ -498,10 +495,10 @@ _NEXT: int64_t nread = taosPReadFile(pSnapFile->fd, buf + sizeof(SStreamSnapBlockHdr), kBlockSize, pSnapFile->offset); if (nread == -1) { taosMemoryFree(buf); - code = TAOS_SYSTEM_ERROR(terrno); + code = TAOS_SYSTEM_ERROR(errno); stError("%s snap failed to read snap, file name:%s, type:%d,reason:%s", STREAM_STATE_TRANSFER, item->name, 
item->type, tstrerror(code)); - return -1; + return code; } else if (nread > 0 && nread <= kBlockSize) { // left bytes less than kBlockSize stDebug("%s read file %s, current offset:%" PRId64 ",size:% " PRId64 ", file no.%d", STREAM_STATE_TRANSFER, @@ -558,6 +555,7 @@ _NEXT: // SMetaSnapWriter ======================================== int32_t streamSnapWriterOpen(void* pMeta, int64_t sver, int64_t ever, char* path, SStreamSnapWriter** ppWriter) { // impl later + int32_t code = 0; SStreamSnapWriter* pWriter = taosMemoryCalloc(1, sizeof(SStreamSnapWriter)); if (pWriter == NULL) { return TSDB_CODE_OUT_OF_MEMORY; @@ -568,23 +566,23 @@ int32_t streamSnapWriterOpen(void* pMeta, int64_t sver, int64_t ever, char* path pHandle->metaPath = taosStrdup(path); if (pHandle->metaPath == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; taosMemoryFree(pWriter); - return terrno; + code = TSDB_CODE_OUT_OF_MEMORY; + return code; } pHandle->pDbSnapSet = taosArrayInit(8, sizeof(SBackendSnapFile2)); if (pHandle->pDbSnapSet == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; streamSnapWriterClose(pWriter, 0); - return terrno; + code = TSDB_CODE_OUT_OF_MEMORY; + return code; } SBackendSnapFile2 snapFile = {0}; if (taosArrayPush(pHandle->pDbSnapSet, &snapFile) == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; streamSnapWriterClose(pWriter, 0); - return terrno; + code = TSDB_CODE_OUT_OF_MEMORY; + return code; } *ppWriter = pWriter; @@ -607,7 +605,7 @@ int32_t streamSnapWriteImpl(SStreamSnapWriter* pWriter, uint8_t* pData, uint32_t if (pSnapFile->fd == 0) { pSnapFile->fd = streamOpenFile(pSnapFile->path, pItem->name, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND); if (pSnapFile->fd == NULL) { - code = TAOS_SYSTEM_ERROR(terrno); + code = TAOS_SYSTEM_ERROR(errno); stError("%s failed to open file name:%s%s%s, reason:%s", STREAM_STATE_TRANSFER, pHandle->metaPath, TD_DIRSEP, pHdr->name, tstrerror(code)); } @@ -615,7 +613,7 @@ int32_t streamSnapWriteImpl(SStreamSnapWriter* pWriter, uint8_t* pData, uint32_t if 
(strlen(pHdr->name) == strlen(pItem->name) && strcmp(pHdr->name, pItem->name) == 0) { int64_t bytes = taosPWriteFile(pSnapFile->fd, pHdr->data, pHdr->size, pSnapFile->offset); if (bytes != pHdr->size) { - code = TAOS_SYSTEM_ERROR(terrno); + code = TAOS_SYSTEM_ERROR(errno); stError("%s failed to write snap, file name:%s, reason:%s", STREAM_STATE_TRANSFER, pHdr->name, tstrerror(code)); return code; } else { @@ -636,12 +634,16 @@ int32_t streamSnapWriteImpl(SStreamSnapWriter* pWriter, uint8_t* pData, uint32_t SBackendFileItem* pItem = taosArrayGet(pSnapFile->pFileList, pSnapFile->currFileIdx); pSnapFile->fd = streamOpenFile(pSnapFile->path, pItem->name, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_APPEND); if (pSnapFile->fd == NULL) { - code = TAOS_SYSTEM_ERROR(terrno); + code = TAOS_SYSTEM_ERROR(errno); stError("%s failed to open file name:%s%s%s, reason:%s", STREAM_STATE_TRANSFER, pSnapFile->path, TD_DIRSEP, pHdr->name, tstrerror(code)); } - taosPWriteFile(pSnapFile->fd, pHdr->data, pHdr->size, pSnapFile->offset); + if (taosPWriteFile(pSnapFile->fd, pHdr->data, pHdr->size, pSnapFile->offset) != pHdr->size) { + code = TAOS_SYSTEM_ERROR(errno); + stError("%s failed to write snap, file name:%s, reason:%s", STREAM_STATE_TRANSFER, pHdr->name, tstrerror(code)); + return code; + } stInfo("succ to write data %s", pItem->name); pSnapFile->offset += pHdr->size; } diff --git a/source/util/src/terror.c b/source/util/src/terror.c index c5bba6fa53..4563e21c6e 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -96,6 +96,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_NOT_FOUND, "Not found") TAOS_DEFINE_ERROR(TSDB_CODE_NO_DISKSPACE, "Out of disk space") TAOS_DEFINE_ERROR(TSDB_CODE_TIMEOUT_ERROR, "Operation timeout") TAOS_DEFINE_ERROR(TSDB_CODE_NO_ENOUGH_DISKSPACE, "No enough disk space") +TAOS_DEFINE_ERROR(TSDB_CODE_THIRDPARTY_ERROR, "third party error, please check the log") TAOS_DEFINE_ERROR(TSDB_CODE_APP_IS_STARTING, "Database is starting up") 
TAOS_DEFINE_ERROR(TSDB_CODE_APP_IS_STOPPING, "Database is closing down") From 92ab689c46a1f3691104a00d4f09c20337254340 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 4 Jul 2024 10:26:58 +0000 Subject: [PATCH 42/92] refactor backend --- source/libs/stream/src/streamBackendRocksdb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 057ff56aa9..cb40864c6e 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1606,7 +1606,6 @@ int32_t taskDbDoCheckpoint(void* arg, int64_t chkpId, int64_t processId) { if (taosAcquireRef(taskDbWrapperId, refId) == NULL) { code = terrno; - terrno = 0; return code; } From 2b9df7b45ce99cc54a8b6f43c6b4ce965199c397 Mon Sep 17 00:00:00 2001 From: kailixu Date: Thu, 4 Jul 2024 18:36:17 +0800 Subject: [PATCH 43/92] fix: oom in rpc queue --- source/util/src/tqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/util/src/tqueue.c b/source/util/src/tqueue.c index aa8834c89f..45a8a462fb 100644 --- a/source/util/src/tqueue.c +++ b/source/util/src/tqueue.c @@ -162,7 +162,7 @@ void *taosAllocateQitem(int32_t size, EQItype itype, int64_t dataSize) { int64_t alloced = atomic_add_fetch_64(&tsRpcQueueMemoryUsed, size + dataSize); if (alloced > tsRpcQueueMemoryAllowed) { uError("failed to alloc qitem, size:%" PRId64 " alloc:%" PRId64 " allowed:%" PRId64, size + dataSize, alloced, - tsRpcQueueMemoryUsed); + tsRpcQueueMemoryAllowed); atomic_sub_fetch_64(&tsRpcQueueMemoryUsed, size + dataSize); taosMemoryFree(pNode); terrno = TSDB_CODE_OUT_OF_RPC_MEMORY_QUEUE; From 0cdfae3a2c2fa34a3bbbd8a8405fc86ecc866220 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Thu, 4 Jul 2024 12:13:17 +0000 Subject: [PATCH 44/92] refactor backend --- source/dnode/vnode/src/tq/tqStreamStateSnap.c | 12 +++++------- source/libs/stream/src/streamMeta.c | 6 +++++- 2 files changed, 10 insertions(+), 8 
deletions(-) diff --git a/source/dnode/vnode/src/tq/tqStreamStateSnap.c b/source/dnode/vnode/src/tq/tqStreamStateSnap.c index 07bfd52a9c..c79fc66a06 100644 --- a/source/dnode/vnode/src/tq/tqStreamStateSnap.c +++ b/source/dnode/vnode/src/tq/tqStreamStateSnap.c @@ -38,8 +38,7 @@ int32_t streamStateSnapReaderOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS // alloc pReader = (SStreamStateReader*)taosMemoryCalloc(1, sizeof(SStreamStateReader)); if (pReader == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - code = terrno; + code = TSDB_CODE_OUT_OF_MEMORY; goto _err; } @@ -52,10 +51,9 @@ int32_t streamStateSnapReaderOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS SStreamSnapReader* pSnapReader = NULL; - if (streamSnapReaderOpen(meta, sver, chkpId, meta->path, &pSnapReader) == 0) { + if ((code = streamSnapReaderOpen(meta, sver, chkpId, meta->path, &pSnapReader)) == 0) { pReader->complete = 1; } else { - code = terrno; taosMemoryFree(pReader); goto _err; } @@ -68,7 +66,7 @@ int32_t streamStateSnapReaderOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS _err: tqError("vgId:%d, vnode %s snapshot reader failed to open since %s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, - tstrerror(terrno)); + tstrerror(code)); *ppReader = NULL; return code; } @@ -142,7 +140,7 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS if (taosMkDir(pTq->pStreamMeta->path) != 0) { code = TAOS_SYSTEM_ERROR(errno); tqError("vgId:%d, vnode %s snapshot writer failed to create directory %s since %s", TD_VID(pTq->pVnode), - STREAM_STATE_TRANSFER, pTq->pStreamMeta->path, tstrerror(terrno)); + STREAM_STATE_TRANSFER, pTq->pStreamMeta->path, tstrerror(code)); goto _err; } @@ -160,7 +158,7 @@ int32_t streamStateSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamS _err: tqError("vgId:%d, vnode %s snapshot writer failed to open since %s", TD_VID(pTq->pVnode), STREAM_STATE_TRANSFER, - tstrerror(terrno)); + tstrerror(code)); taosMemoryFree(pWriter); *ppWriter = NULL; 
return code; diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 2244861bc7..901d91d02d 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -398,6 +398,9 @@ SStreamMeta* streamMetaOpen(const char* path, void* ahandle, FTaskBuild buildTas pMeta->qHandle = taosInitScheduler(32, 1, "stream-chkp", NULL); pMeta->bkdChkptMgt = bkdMgtCreate(tpath); + if (pMeta->bkdChkptMgt == NULL) { + goto _err; + } taosThreadMutexInit(&pMeta->backendMutex, NULL); return pMeta; @@ -413,9 +416,10 @@ _err: if (pMeta->updateInfo.pTasks) taosHashCleanup(pMeta->updateInfo.pTasks); if (pMeta->startInfo.pReadyTaskSet) taosHashCleanup(pMeta->startInfo.pReadyTaskSet); if (pMeta->startInfo.pFailedTaskSet) taosHashCleanup(pMeta->startInfo.pFailedTaskSet); + if (pMeta->bkdChkptMgt) bkdMgtDestroy(pMeta->bkdChkptMgt); taosMemoryFree(pMeta); - stError("failed to open stream meta"); + stError("failed to open stream meta, reason:%s", tstrerror(terrno)); return NULL; } From db4a00c74ed4ac0f58e360af579ce8ec5580884f Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 5 Jul 2024 08:55:40 +0800 Subject: [PATCH 45/92] fix(stream): not restart for reset task status. 
--- source/dnode/mnode/impl/src/mndStream.c | 8 ++++---- source/dnode/vnode/src/tqCommon/tqCommon.c | 8 ++++---- source/libs/stream/src/streamCheckStatus.c | 3 ++- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 536dfab331..f5b944de45 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2701,8 +2701,8 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { } } - mInfo("vgId:%d meta-stored checkpointId for stream:0x%" PRIx64 " %s is:%" PRId64, req.nodeId, req.streamId, - pStream->name, pStream->checkpointId); + mInfo("vgId:%d stream:0x%" PRIx64 " %s meta-stored checkpointId:%" PRId64 " stream:0x%" PRIx64 " %s", req.nodeId, + req.streamId, pStream->name, pStream->checkpointId); int32_t numOfTasks = (pStream == NULL) ? 0 : mndGetNumOfStreamTasks(pStream); if ((pStream != NULL) && (pStream->checkpointId == 0)) { // not generated checkpoint yet, return 0 directly @@ -2730,8 +2730,8 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { } if (chkId == req.checkpointId) { - mDebug("vgId:%d stream:0x%" PRIx64 " %s consensus-checkpointId is:%" PRId64, req.nodeId, req.streamId, - pStream->name, pStream->checkpointId); + mDebug("vgId:%d stream:0x%" PRIx64 " %s consensus-checkpointId is:%" PRId64 ", meta-stored checkpointId:%" PRId64, + req.nodeId, req.streamId, pStream->name, chkId, pStream->checkpointId); mndSendQuickConsensusChkptIdRsp(&req, TSDB_CODE_SUCCESS, req.streamId, chkId, &pMsg->info); taosThreadMutexUnlock(&execInfo.lock); diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index 1999134754..c40332ff39 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -871,7 +871,6 @@ int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { tqDebug("s-task:%s receive task-reset msg 
from mnode, reset status and ready for data processing", pTask->id.idStr); taosThreadMutexLock(&pTask->lock); - streamTaskClearCheckInfo(pTask, true); // clear flag set during do checkpoint, and open inputQ for all upstream tasks @@ -886,9 +885,10 @@ int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { streamTaskSetStatusReady(pTask); } else if (pState->state == TASK_STATUS__UNINIT) { - tqDebug("s-task:%s start task by checking downstream tasks", pTask->id.idStr); - ASSERT(pTask->status.downstreamReady == 0); - tqStreamTaskRestoreCheckpoint(pMeta, pTask->id.streamId, pTask->id.taskId); +// tqDebug("s-task:%s start task by checking downstream tasks", pTask->id.idStr); +// ASSERT(pTask->status.downstreamReady == 0); +// tqStreamTaskRestoreCheckpoint(pMeta, pTask->id.streamId, pTask->id.taskId); + tqDebug("s-task:%s status:%s do nothing after receiving reset-task from mnode", pTask->id.idStr, pState->name); } else { tqDebug("s-task:%s status:%s do nothing after receiving reset-task from mnode", pTask->id.idStr, pState->name); } diff --git a/source/libs/stream/src/streamCheckStatus.c b/source/libs/stream/src/streamCheckStatus.c index 22d336a549..2d8fe4a0e1 100644 --- a/source/libs/stream/src/streamCheckStatus.c +++ b/source/libs/stream/src/streamCheckStatus.c @@ -668,6 +668,7 @@ void rspMonitorFn(void* param, void* tmrId) { streamTaskCompleteCheckRsp(pInfo, true, id); // not record the failed of the current task if try to close current vnode + // otherwise, the put of message operation may incur invalid read of message queue. 
if (!pMeta->closeFlag) { addDownstreamFailedStatusResultAsync(pTask->pMsgCb, vgId, pTask->id.streamId, pTask->id.taskId); } @@ -676,7 +677,7 @@ void rspMonitorFn(void* param, void* tmrId) { return; } - if (state == TASK_STATUS__DROPPING || state == TASK_STATUS__READY || state == TASK_STATUS__PAUSE) { + if (state == TASK_STATUS__DROPPING || state == TASK_STATUS__READY) { int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); stDebug("s-task:%s status:%s vgId:%d quit from monitor check-rsp tmr, ref:%d", id, pStat->name, vgId, ref); From 78993d9c55cef8bbbb6346f15a3da1ac6c8d022c Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 5 Jul 2024 09:00:01 +0800 Subject: [PATCH 46/92] fix(stream): update logs. --- source/dnode/mnode/impl/src/mndStream.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index f5b944de45..a1016ad96c 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2701,8 +2701,8 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { } } - mInfo("vgId:%d stream:0x%" PRIx64 " %s meta-stored checkpointId:%" PRId64 " stream:0x%" PRIx64 " %s", req.nodeId, - req.streamId, pStream->name, pStream->checkpointId); + mInfo("vgId:%d stream:0x%" PRIx64 " %s meta-stored checkpointId:%" PRId64, req.nodeId, req.streamId, pStream->name, + pStream->checkpointId); int32_t numOfTasks = (pStream == NULL) ? 
0 : mndGetNumOfStreamTasks(pStream); if ((pStream != NULL) && (pStream->checkpointId == 0)) { // not generated checkpoint yet, return 0 directly From 8087dbe16d8db8beaa76c134312d45793cc7d9f2 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Fri, 5 Jul 2024 09:41:34 +0800 Subject: [PATCH 47/92] fix[TS-4921] set flag -1 if init monitor failed --- source/client/src/clientEnv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/client/src/clientEnv.c b/source/client/src/clientEnv.c index b227a6bd96..ecfa1e3392 100644 --- a/source/client/src/clientEnv.c +++ b/source/client/src/clientEnv.c @@ -864,10 +864,12 @@ void taos_init_imp(void) { initQueryModuleMsgHandle(); if (taosConvInit() != 0) { + tscInitRes = -1; tscError("failed to init conv"); return; } if (monitorInit() != 0){ + tscInitRes = -1; tscError("failed to init monitor"); return; } From 20c8e3168c8e7924e7abb41693bc2cdcdc64bc6a Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Fri, 5 Jul 2024 09:51:28 +0800 Subject: [PATCH 48/92] fix[TD-30895] heap use after free --- source/client/src/clientMonitor.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/source/client/src/clientMonitor.c b/source/client/src/clientMonitor.c index c3345bf58d..032b7cdeea 100644 --- a/source/client/src/clientMonitor.c +++ b/source/client/src/clientMonitor.c @@ -480,7 +480,6 @@ static void monitorSendSlowLogAtBeginning(int64_t clusterId, char** fileName, Td sendSlowLog(clusterId, data, pFile, offset, SLOW_LOG_READ_BEGINNIG, *fileName, pTransporter, epSet); *fileName = NULL; } - tscDebug("[monitor] monitorSendSlowLogAtBeginning send slow log file:%p, data:%s", pFile, data); } } @@ -511,7 +510,6 @@ static void monitorSendSlowLogAtRunning(int64_t clusterId){ if(data != NULL){ sendSlowLog(clusterId, data, pClient->pFile, pClient->offset, SLOW_LOG_READ_RUNNING, NULL, pInst->pTransporter, &ep); } - tscDebug("[monitor] monitorSendSlowLogAtRunning send slow log:%s", data); } } @@ -542,7 +540,6 @@ static bool 
monitorSendSlowLogAtQuit(int64_t clusterId) { if(data != NULL){ sendSlowLog(clusterId, data, pClient->pFile, pClient->offset, SLOW_LOG_READ_QUIT, NULL, pInst->pTransporter, &ep); } - tscInfo("[monitor] monitorSendSlowLogAtQuit send slow log:%s", data); } return false; } @@ -568,7 +565,6 @@ static void monitorSendAllSlowLogAtQuit(){ if(data != NULL && sendSlowLog(*clusterId, data, NULL, pClient->offset, SLOW_LOG_READ_QUIT, NULL, pInst->pTransporter, &ep) == 0){ quitCnt ++; } - tscInfo("[monitor] monitorSendAllSlowLogAtQuit send slow log :%s", data); } } } @@ -619,7 +615,6 @@ static void monitorSendAllSlowLog(){ if(data != NULL){ sendSlowLog(*clusterId, data, NULL, pClient->offset, SLOW_LOG_READ_RUNNING, NULL, pInst->pTransporter, &ep); } - tscDebug("[monitor] monitorSendAllSlowLog send slow log :%s", data); } } } From f9679feaa72bb51a5587531589bc41f8879bb827 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 5 Jul 2024 02:42:22 +0000 Subject: [PATCH 49/92] fix compile error --- source/libs/stream/src/streamBackendRocksdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index f27462fb08..1f4603c466 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1568,7 +1568,7 @@ int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { nBytes = snprintf(pDst, cap, "%s%sinfo", pChkpIdDir, TD_DIRSEP); if (nBytes <= 0 || nBytes >= cap) { code = TSDB_CODE_OUT_OF_RANGE; - stError("failed to build dst to add extra info, dir:%s, reason:%d", pChkpIdDir, tstrerror(code)); + stError("failed to build dst to add extra info, dir:%s, reason:%s", pChkpIdDir, tstrerror(code)); goto _EXIT; } @@ -1582,7 +1582,7 @@ int32_t chkpAddExtraInfo(char* pChkpIdDir, int64_t chkpId, int64_t processId) { nBytes = snprintf(buf, sizeof(buf), "%" PRId64 " %" PRId64 "", chkpId, processId); if (nBytes <= 0 || 
nBytes >= sizeof(buf)) { code = TSDB_CODE_OUT_OF_RANGE; - stError("failed to build content to add extra info, dir:%s,reason:%d", pChkpIdDir, tstrerror(code)); + stError("failed to build content to add extra info, dir:%s,reason:%s", pChkpIdDir, tstrerror(code)); goto _EXIT; } From 52a80826223543797ab2503370704f1c4bfa6eb4 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 5 Jul 2024 03:40:19 +0000 Subject: [PATCH 50/92] refactor backend --- source/libs/stream/inc/streamBackendRocksdb.h | 2 +- source/libs/stream/src/streamBackendRocksdb.c | 23 +++++++++++++++++++ source/libs/stream/src/streamMeta.c | 10 ++++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/source/libs/stream/inc/streamBackendRocksdb.h b/source/libs/stream/inc/streamBackendRocksdb.h index e4c5787020..f0647f44a3 100644 --- a/source/libs/stream/inc/streamBackendRocksdb.h +++ b/source/libs/stream/inc/streamBackendRocksdb.h @@ -133,7 +133,7 @@ typedef struct { #define META_ON_S3_FORMATE "%s_%" PRId64 "\n%s_%" PRId64 "\n%s_%" PRId64 "" -bool streamBackendDataIsExist(const char* path, int64_t chkpId, int32_t vgId); +bool streamBackendDataIsExist(const char* path, int64_t chkpId); void* streamBackendInit(const char* path, int64_t chkpId, int32_t vgId); void streamBackendCleanup(void* arg); void streamBackendHandleCleanup(void* arg); diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 1f4603c466..8d39db33e8 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -786,6 +786,29 @@ _EXIT: taosMemoryFree(checkpointRoot); return code; } +bool streamBackendDataIsExist(const char* path, int64_t chkpId) { + bool exist = true; + int32_t cap = strlen(path) + 32; + + char* state = taosMemoryCalloc(1, cap); + if (state == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return false; + } + + int16_t nBytes = snprintf(state, cap, "%s%s%s", path, TD_DIRSEP, "state"); + if (nBytes <= 0 || 
nBytes >= cap) { + terrno = TSDB_CODE_OUT_OF_RANGE; + exist = false; + } else { + if (!taosDirExist(state)) { + exist = false; + } + } + + taosMemoryFree(state); + return exist; +} void* streamBackendInit(const char* streamPath, int64_t chkpId, int32_t vgId) { char* backendPath = NULL; diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 901d91d02d..a97b803703 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -183,9 +183,15 @@ int32_t streamMetaCvtDbFormat(SStreamMeta* pMeta) { int32_t code = 0; int64_t chkpId = streamMetaGetLatestCheckpointId(pMeta); - bool exist = streamBackendDataIsExist(pMeta->path, chkpId, pMeta->vgId); + bool exist = streamBackendDataIsExist(pMeta->path, chkpId); if (exist == false) { - stError("failed to check backend data exist, reason:%s", tstrerror(terrno)); + if (terrno != 0) { + code = terrno; + terrno = 0; + stError("failed to check backend data exist, reason:%s", tstrerror(code)); + } else { + stInfo("not need to convert stream backend formate"); + } return code; } From 6c5acdfc4bb5c247fb5c85347e356b3cd9b43387 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Fri, 5 Jul 2024 14:03:12 +0800 Subject: [PATCH 51/92] fix:[TS-4921]refactor code --- include/libs/monitor/clientMonitor.h | 7 +++ source/client/src/clientMonitor.c | 80 ++++++++++++---------------- 2 files changed, 40 insertions(+), 47 deletions(-) diff --git a/include/libs/monitor/clientMonitor.h b/include/libs/monitor/clientMonitor.h index 1e6db8c00a..3bb325921e 100644 --- a/include/libs/monitor/clientMonitor.h +++ b/include/libs/monitor/clientMonitor.h @@ -38,6 +38,13 @@ typedef enum { SLOW_LOG_READ_QUIT = 3, } SLOW_LOG_QUEUE_TYPE; +char* queueTypeStr[] = { + "SLOW_LOG_WRITE", + "SLOW_LOG_READ_RUNNING", + "SLOW_LOG_READ_BEGINNIG", + "SLOW_LOG_READ_QUIT" +}; + #define SLOW_LOG_SEND_SIZE_MAX 1024*1024 typedef struct { diff --git a/source/client/src/clientMonitor.c 
b/source/client/src/clientMonitor.c index 032b7cdeea..d1a9897caa 100644 --- a/source/client/src/clientMonitor.c +++ b/source/client/src/clientMonitor.c @@ -454,6 +454,10 @@ static int64_t getFileSize(char* path){ } static int32_t sendSlowLog(int64_t clusterId, char* data, TdFilePtr pFile, int64_t offset, SLOW_LOG_QUEUE_TYPE type, char* fileName, void* pTransporter, SEpSet *epSet){ + if (data == NULL){ + taosMemoryFree(fileName); + return -1; + } MonitorSlowLogData* pParam = taosMemoryMalloc(sizeof(MonitorSlowLogData)); if(pParam == NULL){ taosMemoryFree(data); @@ -469,17 +473,26 @@ static int32_t sendSlowLog(int64_t clusterId, char* data, TdFilePtr pFile, int64 return sendReport(pTransporter, epSet, data, MONITOR_TYPE_SLOW_LOG, pParam); } -static void monitorSendSlowLogAtBeginning(int64_t clusterId, char** fileName, TdFilePtr pFile, int64_t offset, void* pTransporter, SEpSet *epSet){ +static int32_t monitorReadSend(int64_t clusterId, TdFilePtr pFile, int64_t* offset, int64_t size, SLOW_LOG_QUEUE_TYPE type, char* fileName){ + SAppInstInfo* pInst = getAppInstByClusterId(clusterId); + if(pInst == NULL){ + tscError("failed to get app instance by clusterId:%" PRId64, clusterId); + return -1; + } + SEpSet ep = getEpSet_s(&pInst->mgmtEp); + char* data = readFile(pFile, offset, size); + return sendSlowLog(clusterId, data, (type == SLOW_LOG_READ_BEGINNIG ? 
pFile : NULL), *offset, type, fileName, pInst->pTransporter, &ep); +} + +static void monitorSendSlowLogAtBeginning(int64_t clusterId, char** fileName, TdFilePtr pFile, int64_t offset){ int64_t size = getFileSize(*fileName); if(size <= offset){ processFileInTheEnd(pFile, *fileName); tscDebug("[monitor] monitorSendSlowLogAtBeginning delete file:%s", *fileName); }else{ - char* data = readFile(pFile, &offset, size); - if(data != NULL){ - sendSlowLog(clusterId, data, pFile, offset, SLOW_LOG_READ_BEGINNIG, *fileName, pTransporter, epSet); - *fileName = NULL; - } + int32_t code = monitorReadSend(clusterId, pFile, &offset, size, SLOW_LOG_READ_BEGINNIG, *fileName); + tscDebug("[monitor] monitorSendSlowLogAtBeginning send slow log clusterId:%"PRId64",ret:%d", clusterId, code); + *fileName = NULL; } } @@ -500,16 +513,8 @@ static void monitorSendSlowLogAtRunning(int64_t clusterId){ tscDebug("[monitor] monitorSendSlowLogAtRunning truncate file to 0 file:%p", pClient->pFile); pClient->offset = 0; }else{ - SAppInstInfo* pInst = getAppInstByClusterId(clusterId); - if(pInst == NULL){ - tscError("failed to get app instance by clusterId:%" PRId64, clusterId); - return; - } - SEpSet ep = getEpSet_s(&pInst->mgmtEp); - char* data = readFile(pClient->pFile, &pClient->offset, size); - if(data != NULL){ - sendSlowLog(clusterId, data, pClient->pFile, pClient->offset, SLOW_LOG_READ_RUNNING, NULL, pInst->pTransporter, &ep); - } + int32_t code = monitorReadSend(clusterId, pClient->pFile, &pClient->offset, size, SLOW_LOG_READ_RUNNING, NULL); + tscDebug("[monitor] monitorSendSlowLogAtRunning send slow log clusterId:%"PRId64",ret:%d", clusterId, code); } } @@ -531,15 +536,8 @@ static bool monitorSendSlowLogAtQuit(int64_t clusterId) { return true; } }else{ - SAppInstInfo* pInst = getAppInstByClusterId(clusterId); - if(pInst == NULL) { - return true; - } - SEpSet ep = getEpSet_s(&pInst->mgmtEp); - char* data = readFile(pClient->pFile, &pClient->offset, size); - if(data != NULL){ - 
sendSlowLog(clusterId, data, pClient->pFile, pClient->offset, SLOW_LOG_READ_QUIT, NULL, pInst->pTransporter, &ep); - } + int32_t code = monitorReadSend(clusterId, pClient->pFile, &pClient->offset, size, SLOW_LOG_READ_QUIT, NULL); + tscDebug("[monitor] monitorSendSlowLogAtQuit send slow log clusterId:%"PRId64",ret:%d", clusterId, code); } return false; } @@ -556,13 +554,9 @@ static void monitorSendAllSlowLogAtQuit(){ pClient->pFile = NULL; }else if(pClient->offset == 0){ int64_t* clusterId = (int64_t*)taosHashGetKey(pIter, NULL); - SAppInstInfo* pInst = getAppInstByClusterId(*clusterId); - if(pInst == NULL) { - continue; - } - SEpSet ep = getEpSet_s(&pInst->mgmtEp); - char* data = readFile(pClient->pFile, &pClient->offset, size); - if(data != NULL && sendSlowLog(*clusterId, data, NULL, pClient->offset, SLOW_LOG_READ_QUIT, NULL, pInst->pTransporter, &ep) == 0){ + int32_t code = monitorReadSend(*clusterId, pClient->pFile, &pClient->offset, size, SLOW_LOG_READ_QUIT, NULL); + tscDebug("[monitor] monitorSendAllSlowLogAtQuit send slow log clusterId:%"PRId64",ret:%d", *clusterId, code); + if (code == 0){ quitCnt ++; } } @@ -610,11 +604,8 @@ static void monitorSendAllSlowLog(){ } continue; } - SEpSet ep = getEpSet_s(&pInst->mgmtEp); - char* data = readFile(pClient->pFile, &pClient->offset, size); - if(data != NULL){ - sendSlowLog(*clusterId, data, NULL, pClient->offset, SLOW_LOG_READ_RUNNING, NULL, pInst->pTransporter, &ep); - } + int32_t code = monitorReadSend(*clusterId, pClient->pFile, &pClient->offset, size, SLOW_LOG_READ_RUNNING, NULL); + tscDebug("[monitor] monitorSendAllSlowLog send slow log clusterId:%"PRId64",ret:%d", *clusterId, code); } } } @@ -627,7 +618,7 @@ static void monitorSendAllSlowLogFromTempDir(int64_t clusterId){ return; } char namePrefix[PATH_MAX] = {0}; - if (snprintf(namePrefix, sizeof(namePrefix), "%s%"PRIx64, TD_TMP_FILE_PREFIX, pInst->clusterId) < 0) { + if (snprintf(namePrefix, sizeof(namePrefix), "%s%"PRIx64, TD_TMP_FILE_PREFIX, clusterId) < 0) 
{ tscError("failed to generate slow log file name prefix"); return; } @@ -652,7 +643,7 @@ static void monitorSendAllSlowLogFromTempDir(int64_t clusterId){ if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0 || strstr(name, namePrefix) == NULL) { - tscInfo("skip file:%s, for cluster id:%"PRIx64, name, pInst->clusterId); + tscInfo("skip file:%s, for cluster id:%"PRIx64, name, clusterId); continue; } @@ -668,9 +659,8 @@ static void monitorSendAllSlowLogFromTempDir(int64_t clusterId){ taosCloseFile(&pFile); continue; } - SEpSet ep = getEpSet_s(&pInst->mgmtEp); char *tmp = taosStrdup(filename); - monitorSendSlowLogAtBeginning(pInst->clusterId, &tmp, pFile, 0, pInst->pTransporter, &ep); + monitorSendSlowLogAtBeginning(clusterId, &tmp, pFile, 0); taosMemoryFree(tmp); } @@ -712,11 +702,7 @@ static void* monitorThreadFunc(void *param){ if (slowLogData != NULL) { if (slowLogData->type == SLOW_LOG_READ_BEGINNIG){ if(slowLogData->pFile != NULL){ - SAppInstInfo* pInst = getAppInstByClusterId(slowLogData->clusterId); - if(pInst != NULL) { - SEpSet ep = getEpSet_s(&pInst->mgmtEp); - monitorSendSlowLogAtBeginning(slowLogData->clusterId, &(slowLogData->fileName), slowLogData->pFile, slowLogData->offset, pInst->pTransporter, &ep); - } + monitorSendSlowLogAtBeginning(slowLogData->clusterId, &(slowLogData->fileName), slowLogData->pFile, slowLogData->offset); }else{ monitorSendAllSlowLogFromTempDir(slowLogData->clusterId); } @@ -850,7 +836,7 @@ int32_t monitorPutData2MonitorQueue(MonitorSlowLogData data){ return -1; } *slowLogData = data; - tscDebug("[monitor] write slow log to queue, clusterId:%"PRIx64 " type:%d", slowLogData->clusterId, slowLogData->type); + tscDebug("[monitor] write slow log to queue, clusterId:%"PRIx64 " type:%s, data:%s", slowLogData->clusterId, queueTypeStr[slowLogData->type], slowLogData->data); if (taosWriteQitem(monitorQueue, slowLogData) == 0){ tsem2_post(&monitorSem); }else{ From b8012df90906dfb3702d7df1dfcccadf9539a182 Mon Sep 17 00:00:00 2001 From: 
xjzhou Date: Fri, 5 Jul 2024 14:18:51 +0800 Subject: [PATCH 52/92] handle fixed table name in sql --- source/client/src/clientStmt.c | 6 ++++++ source/libs/parser/src/parInsertSql.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/source/client/src/clientStmt.c b/source/client/src/clientStmt.c index 21d2cbf447..17b52521b8 100644 --- a/source/client/src/clientStmt.c +++ b/source/client/src/clientStmt.c @@ -1015,6 +1015,12 @@ int stmtSetTbTags(TAOS_STMT* stmt, TAOS_MULTI_BIND* tags) { STMT_ERR_RET(stmtSwitchStatus(pStmt, STMT_SETTAGS)); + SBoundColInfo *tags_info = (SBoundColInfo*)pStmt->bInfo.boundTags; + if (tags_info->numOfBound <= 0 || tags_info->numOfCols <= 0) { + tscWarn("no tags bound in sql, will not bound tags"); + return TSDB_CODE_SUCCESS; + } + if (pStmt->bInfo.inExecCache) { return TSDB_CODE_SUCCESS; } diff --git a/source/libs/parser/src/parInsertSql.c b/source/libs/parser/src/parInsertSql.c index d4b9f20f51..313d9449d2 100644 --- a/source/libs/parser/src/parInsertSql.c +++ b/source/libs/parser/src/parInsertSql.c @@ -2444,6 +2444,13 @@ static int32_t checkTableClauseFirstToken(SInsertParseContext* pCxt, SVnodeModif pTbName->n = strlen(tbName); } + if (pCxt->isStmtBind) { + if (TK_NK_ID == pTbName->type || (tbNameAfterDbName != NULL && *(tbNameAfterDbName + 1) != '?')) { + // In SQL statements, the table name has already been specified. 
+ parserWarn("0x%" PRIx64 " table name is specified in sql, ignore the table name in bind param", pCxt->pComCxt->requestId); + } + } + *pHasData = true; return TSDB_CODE_SUCCESS; } From 48560ddf4332fc2c51646147b3b2133e97057e29 Mon Sep 17 00:00:00 2001 From: xjzhou Date: Fri, 5 Jul 2024 14:23:54 +0800 Subject: [PATCH 53/92] recover taoscTest.cpp --- tests/taosc_test/taoscTest.cpp | 201 +++++---------------------------- 1 file changed, 26 insertions(+), 175 deletions(-) diff --git a/tests/taosc_test/taoscTest.cpp b/tests/taosc_test/taoscTest.cpp index d3f6f50547..1b051f555e 100644 --- a/tests/taosc_test/taoscTest.cpp +++ b/tests/taosc_test/taoscTest.cpp @@ -30,31 +30,31 @@ #include "taos.h" class taoscTest : public ::testing::Test { - protected: +protected: static void SetUpTestCase() { -// printf("start test setup.\n"); -// TAOS* taos = taos_connect("localhost", "root", "taosdata", NULL, 0); -// ASSERT_TRUE(taos != nullptr); -// -// TAOS_RES* res = taos_query(taos, "drop database IF EXISTS taosc_test_db;"); -// if (taos_errno(res) != 0) { -// printf("error in drop database taosc_test_db, reason:%s\n", taos_errstr(res)); -// return; -// } -// taosSsleep(5); -// taos_free_result(res); -// printf("drop database taosc_test_db,finished.\n"); -// -// res = taos_query(taos, "create database taosc_test_db;"); -// if (taos_errno(res) != 0) { -// printf("error in create database taosc_test_db, reason:%s\n", taos_errstr(res)); -// return; -// } -// taosSsleep(5); -// taos_free_result(res); -// printf("create database taosc_test_db,finished.\n"); -// -// taos_close(taos); + printf("start test setup.\n"); + TAOS* taos = taos_connect("localhost", "root", "taosdata", NULL, 0); + ASSERT_TRUE(taos != nullptr); + + TAOS_RES* res = taos_query(taos, "drop database IF EXISTS taosc_test_db;"); + if (taos_errno(res) != 0) { + printf("error in drop database taosc_test_db, reason:%s\n", taos_errstr(res)); + return; + } + taosSsleep(5); + taos_free_result(res); + printf("drop database 
taosc_test_db,finished.\n"); + + res = taos_query(taos, "create database taosc_test_db;"); + if (taos_errno(res) != 0) { + printf("error in create database taosc_test_db, reason:%s\n", taos_errstr(res)); + return; + } + taosSsleep(5); + taos_free_result(res); + printf("create database taosc_test_db,finished.\n"); + + taos_close(taos); } static void TearDownTestCase() {} @@ -99,154 +99,6 @@ void queryCallback(void* param, void* res, int32_t code) { taos_fetch_raw_block_a(res, fetchCallback, param); } -/** - * @brief execute sql only. - * - * @param taos - * @param sql - */ -void executeSQL(TAOS *taos, const char *sql) { - TAOS_RES *res = taos_query(taos, sql); - int code = taos_errno(res); - if (code != 0) { - printf("%s\n", taos_errstr(res)); - taos_free_result(res); - taos_close(taos); - exit(EXIT_FAILURE); - } - taos_free_result(res); -} - -/** - * @brief check return status and exit program when error occur. - * - * @param stmt - * @param code - * @param msg - */ -void checkErrorCode(TAOS_STMT *stmt, int code, const char* msg) { - if (code != 0) { - printf("%s. error: %s\n", msg, taos_stmt_errstr(stmt)); - taos_stmt_close(stmt); - exit(EXIT_FAILURE); - } -} - -typedef struct { - int64_t ts; - float current; - int voltage; - float phase; -} Row; - - -/** - * @brief insert data using stmt API - * - * @param taos - */ -void insertData(TAOS *taos) { - // init - TAOS_STMT *stmt = taos_stmt_init(taos); - // prepare -// const char *sql = "INSERT INTO ?.d1001 USING meters TAGS(?, ?) values(?, ?, ?, ?)"; -// const char *sql = "INSERT INTO ?.? USING meters TAGS(?, ?) values(?, ?, ?, ?)"; -// const char *sql = "INSERT INTO power.? USING meters TAGS(?, ?) values(?, ?, ?, ?)"; -// const char *sql = "INSERT INTO ? USING meters TAGS(?, ?) values(?, ?, ?, ?)"; -// const char *sql = "INSERT INTO ? USING meters TAGS(?, ?) values(?, ?, ?, ?)"; - const char *sql = "insert into huawei USING meters TAGS(?, ?) 
values(?, ?, ?, ?)"; - int code = taos_stmt_prepare(stmt, sql, 0); - checkErrorCode(stmt, code, "failed to execute taos_stmt_prepare"); - // bind table name and tags - TAOS_MULTI_BIND tags[2]; - char *location = "California.SanFrancisco"; - int groupId = 2; - tags[0].buffer_type = TSDB_DATA_TYPE_BINARY; - tags[0].buffer_length = strlen(location); - tags[0].length = (int32_t *)&tags[0].buffer_length; - tags[0].buffer = location; - tags[0].is_null = NULL; - - tags[1].buffer_type = TSDB_DATA_TYPE_INT; - tags[1].buffer_length = sizeof(int); - tags[1].length = (int32_t *)&tags[1].buffer_length; - tags[1].buffer = &groupId; - tags[1].is_null = NULL; - -// code = taos_stmt_set_tbname_tags(stmt, "duck", tags); -// checkErrorCode(stmt, code, "failed to execute taos_stmt_set_dbname_tbname_tags"); - - // insert two rows with multi binds - TAOS_MULTI_BIND params[4]; - // values to bind - int64_t ts[] = {1648432611250, 1648432611778}; - float current[] = {10.3, 12.6}; - int voltage[] = {219, 218}; - float phase[] = {0.31, 0.33}; - // is_null array - char is_null[2] = {0}; - // length array - int32_t int64Len[2] = {sizeof(int64_t)}; - int32_t floatLen[2] = {sizeof(float)}; - int32_t intLen[2] = {sizeof(int)}; - - params[0].buffer_type = TSDB_DATA_TYPE_TIMESTAMP; - params[0].buffer_length = sizeof(int64_t); - params[0].buffer = ts; - params[0].length = int64Len; - params[0].is_null = is_null; - params[0].num = 2; - - params[1].buffer_type = TSDB_DATA_TYPE_FLOAT; - params[1].buffer_length = sizeof(float); - params[1].buffer = current; - params[1].length = floatLen; - params[1].is_null = is_null; - params[1].num = 2; - - params[2].buffer_type = TSDB_DATA_TYPE_INT; - params[2].buffer_length = sizeof(int); - params[2].buffer = voltage; - params[2].length = intLen; - params[2].is_null = is_null; - params[2].num = 2; - - params[3].buffer_type = TSDB_DATA_TYPE_FLOAT; - params[3].buffer_length = sizeof(float); - params[3].buffer = phase; - params[3].length = floatLen; - params[3].is_null 
= is_null; - params[3].num = 2; - - code = taos_stmt_bind_param_batch(stmt, params); // bind batch - checkErrorCode(stmt, code, "failed to execute taos_stmt_bind_param_batch"); - code = taos_stmt_add_batch(stmt); // add batch - checkErrorCode(stmt, code, "failed to execute taos_stmt_add_batch"); - // execute - code = taos_stmt_execute(stmt); - checkErrorCode(stmt, code, "failed to execute taos_stmt_execute"); - int affectedRows = taos_stmt_affected_rows(stmt); - printf("successfully inserted %d rows\n", affectedRows); - - // close - taos_stmt_close(stmt); -} - -TEST_F(taoscTest, taos_stmt_test) { - TAOS *taos = taos_connect("localhost", "root", "taosdata", NULL, 6030); - if (taos == NULL) { - printf("failed to connect to server"); - exit(EXIT_FAILURE); - } -// executeSQL(taos, "drop database if exists power"); -// executeSQL(taos, "create database power"); - executeSQL(taos, "use power"); -// executeSQL(taos, "create stable meters (ts timestamp, current float, voltage int, phase float) tags (location binary(64), groupId int)"); - insertData(taos); - taos_close(taos); - taos_cleanup(); -} - TEST_F(taoscTest, taos_query_a_test) { char sql[1024] = {0}; int32_t code = 0; @@ -336,7 +188,7 @@ TEST_F(taoscTest, taos_query_test) { void queryCallback2(void* param, void* res, int32_t code) { ASSERT_TRUE(code == 0); ASSERT_TRUE(param == pUserParam); - // After using taos_query_a to query, using taos_fetch_row in the callback will cause blocking. + // After using taos_query_a to query, using taos_fetch_row in the callback will cause blocking. // Reason: schProcessOnCbBegin SCH_LOCK_TASK(pTask) TAOS_ROW row; row = taos_fetch_row(res); @@ -402,7 +254,7 @@ TEST_F(taoscTest, taos_query_a_fetch_row) { printf("taos_query_a_fetch_row taos_fetch_row start...\n"); while ((row = taos_fetch_row(*pres))) { - getRecordCounts++; + getRecordCounts++; } printf("taos_query_a_fetch_row taos_fetch_row end. 
%p record count:%d.\n", *pres, getRecordCounts); taos_free_result(*pres); @@ -412,4 +264,3 @@ TEST_F(taoscTest, taos_query_a_fetch_row) { printf("taos_query_a_fetch_row test finished.\n"); } - From 3a4412b2829a20b241150d22913647137dfd981d Mon Sep 17 00:00:00 2001 From: dmchen Date: Fri, 5 Jul 2024 06:24:53 +0000 Subject: [PATCH 54/92] fix/TD-30876 --- include/common/tglobal.h | 1 - include/libs/monitor/monitor.h | 1 - source/common/src/tglobal.c | 3 -- source/dnode/mgmt/mgmt_dnode/inc/dmInt.h | 1 - source/dnode/mgmt/mgmt_dnode/src/dmInt.c | 1 - source/dnode/mgmt/mgmt_dnode/src/dmWorker.c | 9 ---- source/dnode/mgmt/node_mgmt/inc/dmMgmt.h | 1 - source/dnode/mgmt/node_mgmt/src/dmEnv.c | 1 - source/dnode/mgmt/node_mgmt/src/dmMonitor.c | 10 ---- source/dnode/mgmt/node_util/inc/dmUtil.h | 1 - source/libs/monitor/src/monMain.c | 54 +++++++++------------ 11 files changed, 23 insertions(+), 60 deletions(-) diff --git a/include/common/tglobal.h b/include/common/tglobal.h index 96b9617fc4..3fd3cc4ca9 100644 --- a/include/common/tglobal.h +++ b/include/common/tglobal.h @@ -134,7 +134,6 @@ extern uint16_t tsMonitorPort; extern int32_t tsMonitorMaxLogs; extern bool tsMonitorComp; extern bool tsMonitorLogProtocol; -extern int32_t tsMonitorIntervalForBasic; extern bool tsMonitorForceV2; // audit diff --git a/include/libs/monitor/monitor.h b/include/libs/monitor/monitor.h index 9d7878ecf7..6007d52bb4 100644 --- a/include/libs/monitor/monitor.h +++ b/include/libs/monitor/monitor.h @@ -226,7 +226,6 @@ void monSetQmInfo(SMonQmInfo *pInfo); void monSetSmInfo(SMonSmInfo *pInfo); void monSetBmInfo(SMonBmInfo *pInfo); void monGenAndSendReport(); -void monGenAndSendReportBasic(); void monSendContent(char *pCont, const char* uri); void tFreeSMonMmInfo(SMonMmInfo *pInfo); diff --git a/source/common/src/tglobal.c b/source/common/src/tglobal.c index 3372c7b1cc..84d4e7b7e7 100644 --- a/source/common/src/tglobal.c +++ b/source/common/src/tglobal.c @@ -111,7 +111,6 @@ uint16_t tsMonitorPort = 
6043; int32_t tsMonitorMaxLogs = 100; bool tsMonitorComp = false; bool tsMonitorLogProtocol = false; -int32_t tsMonitorIntervalForBasic = 30; bool tsMonitorForceV2 = true; // audit @@ -712,7 +711,6 @@ static int32_t taosAddServerCfg(SConfig *pCfg) { if (cfgAddInt32(pCfg, "monitorMaxLogs", tsMonitorMaxLogs, 1, 1000000, CFG_SCOPE_SERVER, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "monitorComp", tsMonitorComp, CFG_SCOPE_SERVER, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "monitorLogProtocol", tsMonitorLogProtocol, CFG_SCOPE_SERVER, CFG_DYN_SERVER) != 0) return -1; - if (cfgAddInt32(pCfg, "monitorIntervalForBasic", tsMonitorIntervalForBasic, 1, 200000, CFG_SCOPE_SERVER, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "monitorForceV2", tsMonitorForceV2, CFG_SCOPE_SERVER, CFG_DYN_NONE) != 0) return -1; if (cfgAddBool(pCfg, "audit", tsEnableAudit, CFG_SCOPE_SERVER, CFG_DYN_ENT_SERVER) != 0) return -1; @@ -1165,7 +1163,6 @@ static int32_t taosSetServerCfg(SConfig *pCfg) { tsMonitorComp = cfgGetItem(pCfg, "monitorComp")->bval; tsQueryRspPolicy = cfgGetItem(pCfg, "queryRspPolicy")->i32; tsMonitorLogProtocol = cfgGetItem(pCfg, "monitorLogProtocol")->bval; - tsMonitorIntervalForBasic = cfgGetItem(pCfg, "monitorIntervalForBasic")->i32; tsMonitorForceV2 = cfgGetItem(pCfg, "monitorForceV2")->i32; tsEnableAudit = cfgGetItem(pCfg, "audit")->bval; diff --git a/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h b/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h index 46f8dd06d4..be9ff56674 100644 --- a/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h +++ b/source/dnode/mgmt/mgmt_dnode/inc/dmInt.h @@ -43,7 +43,6 @@ typedef struct SDnodeMgmt { GetMnodeLoadsFp getMnodeLoadsFp; GetQnodeLoadsFp getQnodeLoadsFp; int32_t statusSeq; - SendMonitorReportFp sendMonitorReportFpBasic; } SDnodeMgmt; // dmHandle.c diff --git a/source/dnode/mgmt/mgmt_dnode/src/dmInt.c b/source/dnode/mgmt/mgmt_dnode/src/dmInt.c index a651fbf060..b9dd45f1c0 100644 --- a/source/dnode/mgmt/mgmt_dnode/src/dmInt.c +++ 
b/source/dnode/mgmt/mgmt_dnode/src/dmInt.c @@ -65,7 +65,6 @@ static int32_t dmOpenMgmt(SMgmtInputOpt *pInput, SMgmtOutputOpt *pOutput) { pMgmt->processDropNodeFp = pInput->processDropNodeFp; pMgmt->sendMonitorReportFp = pInput->sendMonitorReportFp; pMgmt->sendAuditRecordsFp = pInput->sendAuditRecordFp; - pMgmt->sendMonitorReportFpBasic = pInput->sendMonitorReportFpBasic; pMgmt->getVnodeLoadsFp = pInput->getVnodeLoadsFp; pMgmt->getVnodeLoadsLiteFp = pInput->getVnodeLoadsLiteFp; pMgmt->getMnodeLoadsFp = pInput->getMnodeLoadsFp; diff --git a/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c b/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c index c48b614f96..eafa10aa32 100644 --- a/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c +++ b/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c @@ -175,15 +175,6 @@ static void *dmMonitorThreadFp(void *param) { taosMemoryTrim(0); } } - - if(tsMonitorForceV2){ - if (curTime < lastTimeForBasic) lastTimeForBasic = curTime; - float intervalForBasic = (curTime - lastTimeForBasic) / 1000.0f; - if (intervalForBasic >= tsMonitorIntervalForBasic) { - (*pMgmt->sendMonitorReportFpBasic)(); - lastTimeForBasic = curTime; - } - } } return NULL; diff --git a/source/dnode/mgmt/node_mgmt/inc/dmMgmt.h b/source/dnode/mgmt/node_mgmt/inc/dmMgmt.h index 90e44e5acc..bc6a4652e7 100644 --- a/source/dnode/mgmt/node_mgmt/inc/dmMgmt.h +++ b/source/dnode/mgmt/node_mgmt/inc/dmMgmt.h @@ -128,7 +128,6 @@ int32_t dmProcessNodeMsg(SMgmtWrapper *pWrapper, SRpcMsg *pMsg); // dmMonitor.c void dmSendMonitorReport(); void dmSendAuditRecords(); -void dmSendMonitorReportBasic(); void dmGetVnodeLoads(SMonVloadInfo *pInfo); void dmGetVnodeLoadsLite(SMonVloadInfo *pInfo); void dmGetMnodeLoads(SMonMloadInfo *pInfo); diff --git a/source/dnode/mgmt/node_mgmt/src/dmEnv.c b/source/dnode/mgmt/node_mgmt/src/dmEnv.c index 54a118b666..4be1af30b5 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmEnv.c +++ b/source/dnode/mgmt/node_mgmt/src/dmEnv.c @@ -394,7 +394,6 @@ SMgmtInputOpt 
dmBuildMgmtInputOpt(SMgmtWrapper *pWrapper) { .processDropNodeFp = dmProcessDropNodeReq, .sendMonitorReportFp = dmSendMonitorReport, .sendAuditRecordFp = auditSendRecordsInBatch, - .sendMonitorReportFpBasic = dmSendMonitorReportBasic, .getVnodeLoadsFp = dmGetVnodeLoads, .getVnodeLoadsLiteFp = dmGetVnodeLoadsLite, .getMnodeLoadsFp = dmGetMnodeLoads, diff --git a/source/dnode/mgmt/node_mgmt/src/dmMonitor.c b/source/dnode/mgmt/node_mgmt/src/dmMonitor.c index 21e25f5535..d3197282b6 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmMonitor.c +++ b/source/dnode/mgmt/node_mgmt/src/dmMonitor.c @@ -123,16 +123,6 @@ void dmSendMonitorReport() { monGenAndSendReport(); } -void dmSendMonitorReportBasic() { - if (!tsEnableMonitor || tsMonitorFqdn[0] == 0 || tsMonitorPort == 0) return; - dTrace("send monitor report to %s:%u", tsMonitorFqdn, tsMonitorPort); - - SDnode *pDnode = dmInstance(); - dmGetDmMonitorInfoBasic(pDnode); - dmGetMmMonitorInfo(pDnode); - monGenAndSendReportBasic(); -} - //Todo: put this in seperate file in the future void dmSendAuditRecords() { auditSendRecordsInBatch(); diff --git a/source/dnode/mgmt/node_util/inc/dmUtil.h b/source/dnode/mgmt/node_util/inc/dmUtil.h index aea3286d76..d316a82af2 100644 --- a/source/dnode/mgmt/node_util/inc/dmUtil.h +++ b/source/dnode/mgmt/node_util/inc/dmUtil.h @@ -155,7 +155,6 @@ typedef struct { ProcessDropNodeFp processDropNodeFp; SendMonitorReportFp sendMonitorReportFp; SendAuditRecordsFp sendAuditRecordFp; - SendMonitorReportFp sendMonitorReportFpBasic; GetVnodeLoadsFp getVnodeLoadsFp; GetVnodeLoadsFp getVnodeLoadsLiteFp; GetMnodeLoadsFp getMnodeLoadsFp; diff --git a/source/libs/monitor/src/monMain.c b/source/libs/monitor/src/monMain.c index 21c196872c..3389780916 100644 --- a/source/libs/monitor/src/monMain.c +++ b/source/libs/monitor/src/monMain.c @@ -568,6 +568,25 @@ void monSendReport(SMonInfo *pMonitor){ } } +void monSendReportBasic(SMonInfo *pMonitor) { + char *pCont = tjsonToString(pMonitor->pJson); + if 
(tsMonitorLogProtocol) { + if (pCont != NULL) { + uInfoL("report cont basic:\n%s", pCont); + } else { + uInfo("report cont basic is null"); + } + } + if (pCont != NULL) { + EHttpCompFlag flag = tsMonitor.cfg.comp ? HTTP_GZIP : HTTP_FLAT; + if (taosSendHttpReport(tsMonitor.cfg.server, tsMonFwBasicUri, tsMonitor.cfg.port, pCont, strlen(pCont), flag) != + 0) { + uError("failed to send monitor msg"); + } + taosMemoryFree(pCont); + } +} + void monGenAndSendReport() { SMonInfo *pMonitor = monCreateMonitorInfo(); if (pMonitor == NULL) return; @@ -595,38 +614,11 @@ void monGenAndSendReport() { monGenVnodeRoleTable(pMonitor); monSendPromReport(); - } - - monCleanupMonitorInfo(pMonitor); -} - -void monSendReportBasic(SMonInfo *pMonitor){ - char *pCont = tjsonToString(pMonitor->pJson); - if(tsMonitorLogProtocol){ - if(pCont != NULL){ - uInfoL("report cont basic:\n%s", pCont); + if (pMonitor->mmInfo.cluster.first_ep_dnode_id != 0) { + monGenBasicJsonBasic(pMonitor); + monGenClusterJsonBasic(pMonitor); + monSendReportBasic(pMonitor); } - else{ - uInfo("report cont basic is null"); - } - } - if (pCont != NULL) { - EHttpCompFlag flag = tsMonitor.cfg.comp ? 
HTTP_GZIP : HTTP_FLAT; - if (taosSendHttpReport(tsMonitor.cfg.server, tsMonFwBasicUri, tsMonitor.cfg.port, pCont, strlen(pCont), flag) != 0) { - uError("failed to send monitor msg"); - } - taosMemoryFree(pCont); - } -} - -void monGenAndSendReportBasic() { - SMonInfo *pMonitor = monCreateMonitorInfo(); - - monGenBasicJsonBasic(pMonitor); - monGenClusterJsonBasic(pMonitor); - - if (pMonitor->mmInfo.cluster.first_ep_dnode_id != 0) { - monSendReportBasic(pMonitor); } monCleanupMonitorInfo(pMonitor); From 4ac897be44f61e545971f3fed0fd4d9711cd2a53 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 5 Jul 2024 06:51:13 +0000 Subject: [PATCH 55/92] refactor backend --- source/libs/stream/src/streamBackendRocksdb.c | 5 +++-- source/libs/stream/src/streamMeta.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 8d39db33e8..e8a5e30661 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -762,8 +762,8 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId } if (code != 0) { - stError("failed to start stream backend at %s, reason: %s, restart from default defaultPath:%s, reason:%s", - checkpointPath, tstrerror(code), defaultPath, tstrerror(code)); + stError("failed to start stream backend at %s, restart from default defaultPath:%s, reason:%s", checkpointPath, + defaultPath, tstrerror(code)); code = 0; // reset the error code } } else { // no valid checkpoint id @@ -2528,6 +2528,7 @@ STaskDbWrapper* taskDbOpen(const char* path, const char* key, int64_t chkptId, i char* statePath = NULL; char* dbPath = NULL; int code = 0; + terrno = 0; if ((code = restoreCheckpointData(path, key, chkptId, &statePath, &dbPath, processVer)) < 0) { terrno = code; stError("failed to restore checkpoint data, path:%s, key:%s, checkpointId: %" PRId64 "reason:%s", path, key, diff --git 
a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index a97b803703..15aa42e741 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -182,7 +182,7 @@ int32_t streamMetaCheckBackendCompatible(SStreamMeta* pMeta) { int32_t streamMetaCvtDbFormat(SStreamMeta* pMeta) { int32_t code = 0; int64_t chkpId = streamMetaGetLatestCheckpointId(pMeta); - + terrno = 0; bool exist = streamBackendDataIsExist(pMeta->path, chkpId); if (exist == false) { if (terrno != 0) { From d08cd42c8ae72d4c11708a47f4e69d72aec550bc Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Fri, 5 Jul 2024 14:56:42 +0800 Subject: [PATCH 56/92] refact error code --- include/util/taoserror.h | 10 ++++++---- source/util/src/terror.c | 12 ++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index c6eccae1d3..751e35b482 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -18,6 +18,8 @@ #include +#include "osDef.h" + #ifdef __cplusplus extern "C" { #endif @@ -46,11 +48,11 @@ const char* terrstr(); char* taosGetErrMsgReturn(); char* taosGetErrMsg(); int32_t* taosGetErrno(); -int32_t* taosGetErrln(); int32_t taosGetErrSize(); -#define terrno (*taosGetErrno()) -#define terrln (*taosGetErrln()) -#define terrMsg (taosGetErrMsg()) + +extern threadlocal int32_t terrno; +extern threadlocal int32_t terrln; +extern threadlocal char terrMsg[ERR_MSG_LEN]; #define SET_ERROR_MSG(MSG, ...) 
\ snprintf(terrMsg, ERR_MSG_LEN, MSG, ##__VA_ARGS__) diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 14ea448731..bb8c30380f 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -21,14 +21,14 @@ #define TAOS_ERROR_C -static threadlocal int32_t tsErrno; -static threadlocal int32_t tsErrln; -static threadlocal char tsErrMsgDetail[ERR_MSG_LEN] = {0}; +threadlocal int32_t terrno; +threadlocal int32_t terrln; +threadlocal char terrMsg[ERR_MSG_LEN]; + static threadlocal char tsErrMsgReturn[ERR_MSG_LEN] = {0}; -int32_t* taosGetErrno() { return &tsErrno; } -int32_t* taosGetErrln() { return &tsErrln; } -char* taosGetErrMsg() { return tsErrMsgDetail; } +int32_t* taosGetErrno() { return &terrno; } +char* taosGetErrMsg() { return terrMsg; } char* taosGetErrMsgReturn() { return tsErrMsgReturn; } #ifdef TAOS_ERROR_C From 2b9065fbe7a1eb9b3e89679b8b9388a2918b90f1 Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Fri, 5 Jul 2024 15:06:11 +0800 Subject: [PATCH 57/92] reformat and add a error number --- include/common/tmsg.h | 68 ++++++++++++++++++++-------------------- include/util/taoserror.h | 1 + source/util/src/terror.c | 1 + 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/include/common/tmsg.h b/include/common/tmsg.h index a83aa4da44..40fce1d67b 100644 --- a/include/common/tmsg.h +++ b/include/common/tmsg.h @@ -965,20 +965,20 @@ int32_t tSerializeSConnectReq(void* buf, int32_t bufLen, SConnectReq* pReq); int32_t tDeserializeSConnectReq(void* buf, int32_t bufLen, SConnectReq* pReq); typedef struct { - int32_t acctId; - int64_t clusterId; - uint32_t connId; - int32_t dnodeNum; - int8_t superUser; - int8_t sysInfo; - int8_t connType; - SEpSet epSet; - int32_t svrTimestamp; - int32_t passVer; - int32_t authVer; - char sVer[TSDB_VERSION_LEN]; - char sDetailVer[128]; - int64_t whiteListVer; + int32_t acctId; + int64_t clusterId; + uint32_t connId; + int32_t dnodeNum; + int8_t superUser; + int8_t sysInfo; + int8_t connType; + 
SEpSet epSet; + int32_t svrTimestamp; + int32_t passVer; + int32_t authVer; + char sVer[TSDB_VERSION_LEN]; + char sDetailVer[128]; + int64_t whiteListVer; SMonitorParas monitorParas; } SConnectRsp; @@ -1638,15 +1638,15 @@ void tFreeSFuncInfo(SFuncInfo* pInfo); void tFreeSRetrieveFuncRsp(SRetrieveFuncRsp* pRsp); typedef struct { - int32_t statusInterval; - int64_t checkTime; // 1970-01-01 00:00:00.000 - char timezone[TD_TIMEZONE_LEN]; // tsTimezone - char locale[TD_LOCALE_LEN]; // tsLocale - char charset[TD_LOCALE_LEN]; // tsCharset - int8_t ttlChangeOnWrite; - int8_t enableWhiteList; - int8_t encryptionKeyStat; - uint32_t encryptionKeyChksum; + int32_t statusInterval; + int64_t checkTime; // 1970-01-01 00:00:00.000 + char timezone[TD_TIMEZONE_LEN]; // tsTimezone + char locale[TD_LOCALE_LEN]; // tsLocale + char charset[TD_LOCALE_LEN]; // tsCharset + int8_t ttlChangeOnWrite; + int8_t enableWhiteList; + int8_t encryptionKeyStat; + uint32_t encryptionKeyChksum; SMonitorParas monitorParas; } SClusterCfg; @@ -1745,9 +1745,9 @@ typedef enum { } MONITOR_TYPE; typedef struct { - int32_t contLen; - char* pCont; - MONITOR_TYPE type; + int32_t contLen; + char* pCont; + MONITOR_TYPE type; } SStatisReq; int32_t tSerializeSStatisReq(void* buf, int32_t bufLen, SStatisReq* pReq); @@ -3035,8 +3035,8 @@ typedef struct { int8_t source; // TD_REQ_FROM_TAOX-taosX or TD_REQ_FROM_APP-taosClient } SVCreateTbBatchReq; -int tEncodeSVCreateTbBatchReq(SEncoder* pCoder, const SVCreateTbBatchReq* pReq); -int tDecodeSVCreateTbBatchReq(SDecoder* pCoder, SVCreateTbBatchReq* pReq); +int tEncodeSVCreateTbBatchReq(SEncoder* pCoder, const SVCreateTbBatchReq* pReq); +int tDecodeSVCreateTbBatchReq(SDecoder* pCoder, SVCreateTbBatchReq* pReq); void tDeleteSVCreateTbBatchReq(SVCreateTbBatchReq* pReq); typedef struct { @@ -3275,10 +3275,10 @@ typedef struct { } SClientHbRsp; typedef struct { - int64_t reqId; - int64_t rspId; - int32_t svrTimestamp; - SArray* rsps; // SArray + int64_t reqId; + int64_t rspId; 
+ int32_t svrTimestamp; + SArray* rsps; // SArray SMonitorParas monitorParas; } SClientHbBatchRsp; @@ -3514,7 +3514,7 @@ typedef struct SVUpdateCheckpointInfoReq { int64_t checkpointVer; int64_t checkpointTs; int32_t transId; - int64_t hStreamId; // add encode/decode + int64_t hStreamId; // add encode/decode int64_t hTaskId; int8_t dropRelHTask; } SVUpdateCheckpointInfoReq; @@ -3993,7 +3993,7 @@ int32_t tDecodeSTaosxRsp(SDecoder* pDecoder, void* pRsp); void tDeleteSTaosxRsp(void* pRsp); typedef struct SMqBatchMetaRsp { - SMqRspHead head; // not serialize + SMqRspHead head; // not serialize STqOffsetVal rspOffset; SArray* batchMetaLen; SArray* batchMetaReq; diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 751e35b482..0d58dfefaa 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -149,6 +149,7 @@ extern threadlocal char terrMsg[ERR_MSG_LEN]; #define TSDB_CODE_IP_NOT_IN_WHITE_LIST TAOS_DEF_ERROR_CODE(0, 0x0134) #define TSDB_CODE_FAILED_TO_CONNECT_S3 TAOS_DEF_ERROR_CODE(0, 0x0135) #define TSDB_CODE_MSG_PREPROCESSED TAOS_DEF_ERROR_CODE(0, 0x0136) // internal +#define TSDB_CODE_OUT_OF_BUFFER TAOS_DEF_ERROR_CODE(0, 0x0137) //client #define TSDB_CODE_TSC_INVALID_OPERATION TAOS_DEF_ERROR_CODE(0, 0x0200) diff --git a/source/util/src/terror.c b/source/util/src/terror.c index bb8c30380f..956a2552d4 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -106,6 +106,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_INVALID_CFG_VALUE, "Invalid configuration TAOS_DEFINE_ERROR(TSDB_CODE_IP_NOT_IN_WHITE_LIST, "Not allowed to connect") TAOS_DEFINE_ERROR(TSDB_CODE_FAILED_TO_CONNECT_S3, "Failed to connect to s3 server") TAOS_DEFINE_ERROR(TSDB_CODE_MSG_PREPROCESSED, "Message has been processed in preprocess") +TAOS_DEFINE_ERROR(TSDB_CODE_OUT_OF_BUFFER, "Out of buffer") //client TAOS_DEFINE_ERROR(TSDB_CODE_TSC_INVALID_OPERATION, "Invalid operation") From 9d42b31d4a017b9737da7a342e5b988765059a62 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Fri, 
5 Jul 2024 16:12:13 +0800 Subject: [PATCH 58/92] fix:[TS-4921]refactor code --- include/libs/monitor/clientMonitor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/libs/monitor/clientMonitor.h b/include/libs/monitor/clientMonitor.h index 3bb325921e..0085173ecd 100644 --- a/include/libs/monitor/clientMonitor.h +++ b/include/libs/monitor/clientMonitor.h @@ -38,7 +38,7 @@ typedef enum { SLOW_LOG_READ_QUIT = 3, } SLOW_LOG_QUEUE_TYPE; -char* queueTypeStr[] = { +static char* queueTypeStr[] = { "SLOW_LOG_WRITE", "SLOW_LOG_READ_RUNNING", "SLOW_LOG_READ_BEGINNIG", From 8e6bb176c21675e8eb0d762347f931784d7c6403 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 5 Jul 2024 16:17:20 +0800 Subject: [PATCH 59/92] fix(stream): use trans to set the consensus-checkpoint id --- include/common/tmsgdef.h | 3 +- include/dnode/vnode/tqCommon.h | 7 +- include/libs/stream/streamMsg.h | 10 -- include/libs/stream/tstream.h | 3 +- source/dnode/mgmt/mgmt_mnode/src/mmHandle.c | 3 +- source/dnode/mgmt/mgmt_snode/src/smHandle.c | 3 +- source/dnode/mgmt/mgmt_vnode/src/vmHandle.c | 3 +- source/dnode/mnode/impl/inc/mndStream.h | 9 +- source/dnode/mnode/impl/src/mndStream.c | 120 ++++++++++---------- source/dnode/mnode/impl/src/mndStreamUtil.c | 102 +++++++++++++---- source/dnode/snode/src/snode.c | 10 +- source/dnode/vnode/src/inc/vnodeInt.h | 1 + source/dnode/vnode/src/tq/tq.c | 10 +- source/dnode/vnode/src/tqCommon/tqCommon.c | 25 ++-- source/dnode/vnode/src/vnd/vnodeSvr.c | 12 +- source/libs/stream/src/streamCheckpoint.c | 2 +- source/libs/stream/src/streamMsg.c | 20 ---- 17 files changed, 193 insertions(+), 150 deletions(-) diff --git a/include/common/tmsgdef.h b/include/common/tmsgdef.h index 19fe34fe01..7621615278 100644 --- a/include/common/tmsgdef.h +++ b/include/common/tmsgdef.h @@ -250,7 +250,7 @@ TD_DEF_MSG_TYPE(TDMT_MND_DROP_TB_WITH_TSMA, "drop-tb-with-tsma", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_STREAM_UPDATE_CHKPT_EVT, "stream-update-chkpt-evt", NULL, NULL) 
TD_DEF_MSG_TYPE(TDMT_MND_STREAM_CHKPT_REPORT, "stream-chkpt-report", NULL, NULL) - TD_DEF_MSG_TYPE(TDMT_MND_STREAM_CHKPT_CONSEN, "stream-chkpt-consen", NULL, NULL) + TD_DEF_MSG_TYPE(TDMT_MND_STREAM_REQ_CONSEN_CHKPT, "stream-req-consen-chkpt", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_STREAM_CONSEN_TIMER, "stream-consen-tmr", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_MAX_MSG, "mnd-max", NULL, NULL) TD_CLOSE_MSG_SEG(TDMT_END_MND_MSG) @@ -342,6 +342,7 @@ TD_DEF_MSG_TYPE(TDMT_STREAM_CREATE, "stream-create", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_STREAM_DROP, "stream-drop", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_STREAM_RETRIEVE_TRIGGER, "stream-retri-trigger", NULL, NULL) + TD_DEF_MSG_TYPE(TDMT_STREAM_CONSEN_CHKPT, "stream-consen-chkpt", NULL, NULL) TD_CLOSE_MSG_SEG(TDMT_STREAM_MSG) TD_NEW_MSG_SEG(TDMT_MON_MSG) //5 << 8 diff --git a/include/dnode/vnode/tqCommon.h b/include/dnode/vnode/tqCommon.h index 566e8dbbd8..4d5e18520c 100644 --- a/include/dnode/vnode/tqCommon.h +++ b/include/dnode/vnode/tqCommon.h @@ -29,7 +29,8 @@ int32_t tqStreamTaskProcessCheckpointReadyMsg(SStreamMeta* pMeta, SRpcMsg* pMsg) int32_t tqStreamProcessStreamHbRsp(SStreamMeta* pMeta, SRpcMsg* pMsg); int32_t tqStreamProcessReqCheckpointRsp(SStreamMeta* pMeta, SRpcMsg* pMsg); int32_t tqStreamProcessChkptReportRsp(SStreamMeta* pMeta, SRpcMsg* pMsg); -int32_t tqStreamProcessConsensusChkptRsp(SStreamMeta* pMeta, SRpcMsg* pMsg); +int32_t tqStreamProcessConsensusChkptRsp2(SStreamMeta* pMeta, SRpcMsg* pMsg); +int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg); int32_t tqStreamProcessCheckpointReadyRsp(SStreamMeta* pMeta, SRpcMsg* pMsg); int32_t tqStreamTaskProcessDeployReq(SStreamMeta* pMeta, SMsgCb* cb, int64_t sversion, char* msg, int32_t msgLen, bool isLeader, bool restored); @@ -37,12 +38,12 @@ int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen int32_t tqStreamTaskProcessRunReq(SStreamMeta* pMeta, SRpcMsg* pMsg, bool isLeader); int32_t 
tqStartTaskCompleteCallback(SStreamMeta* pMeta); int32_t tqStreamTasksGetTotalNum(SStreamMeta* pMeta); -int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, SRpcMsg* pMsg); +int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, char* msg); int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg); int32_t tqStreamTaskProcessRetrieveTriggerRsp(SStreamMeta* pMeta, SRpcMsg* pMsg); int32_t tqStreamTaskProcessTaskPauseReq(SStreamMeta* pMeta, char* pMsg); int32_t tqStreamTaskProcessTaskResumeReq(void* handle, int64_t sversion, char* pMsg, bool fromVnode); -int32_t tqStreamTaskProcessUpdateCheckpointReq(SStreamMeta* pMeta, bool restored, char* msg, int32_t msgLen); +int32_t tqStreamTaskProcessUpdateCheckpointReq(SStreamMeta* pMeta, bool restored, char* msg); void tqSetRestoreVersionInfo(SStreamTask* pTask); int32_t tqExpandStreamTask(SStreamTask* pTask); diff --git a/include/libs/stream/streamMsg.h b/include/libs/stream/streamMsg.h index 8b6ca2c5cd..bdb8ff7f8e 100644 --- a/include/libs/stream/streamMsg.h +++ b/include/libs/stream/streamMsg.h @@ -223,16 +223,6 @@ typedef struct SRestoreCheckpointInfo { int32_t tEncodeRestoreCheckpointInfo (SEncoder* pEncoder, const SRestoreCheckpointInfo* pReq); int32_t tDecodeRestoreCheckpointInfo(SDecoder* pDecoder, SRestoreCheckpointInfo* pReq); -typedef struct SRestoreCheckpointInfoRsp { - int64_t streamId; - int64_t checkpointId; - int64_t startTs; - int32_t taskId; -} SRestoreCheckpointInfoRsp; - -int32_t tEncodeRestoreCheckpointInfoRsp(SEncoder* pCoder, const SRestoreCheckpointInfoRsp* pInfo); -int32_t tDecodeRestoreCheckpointInfoRsp(SDecoder* pCoder, SRestoreCheckpointInfoRsp* pInfo); - typedef struct { SMsgHead head; int64_t streamId; diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 093a21c999..d0feebf814 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -616,9 +616,8 @@ typedef struct SStreamTaskState { typedef struct 
SCheckpointConsensusInfo { SArray* pTaskList; -// int64_t checkpointId; -// int64_t genTs; int32_t numOfTasks; + int64_t streamId; } SCheckpointConsensusInfo; int32_t streamSetupScheduleTrigger(SStreamTask* pTask); diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c index 677e19d4c1..f3763ef0c5 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c @@ -233,6 +233,7 @@ SArray *mmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_RESUME_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_STOP_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_UPDATE_CHKPT_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_STREAM_CONSEN_CHKPT_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_CREATE, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_DROP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_CREATE_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; @@ -242,7 +243,7 @@ SArray *mmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_TASK_RESET_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_HEARTBEAT, mmPutMsgToReadQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_REPORT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_CONSEN, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CONSEN_CHKPT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CHKPT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_VND_KILL_COMPACT_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; diff --git 
a/source/dnode/mgmt/mgmt_snode/src/smHandle.c b/source/dnode/mgmt/mgmt_snode/src/smHandle.c index 7a0189b7c1..3d0587a11b 100644 --- a/source/dnode/mgmt/mgmt_snode/src/smHandle.c +++ b/source/dnode/mgmt/mgmt_snode/src/smHandle.c @@ -76,6 +76,7 @@ SArray *smGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_TASK_UPDATE, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_DEPLOY, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_UPDATE_CHKPT, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_STREAM_CONSEN_CHKPT, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_DROP, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_RUN, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_DISPATCH, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; @@ -96,7 +97,7 @@ SArray *smGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_HEARTBEAT_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CHKPT_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_REPORT_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_CONSEN_RSP, smPutNodeMsgToMgmtQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CONSEN_CHKPT_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; code = 0; _OVER: diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c index 001696aecc..7d35fd71b7 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c @@ -972,10 +972,11 @@ SArray *vmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_HEARTBEAT_RSP, 
vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CHKPT_RSP, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_REPORT_RSP, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_CONSEN_RSP, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CONSEN_CHKPT_RSP, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_VND_GET_STREAM_PROGRESS, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_UPDATE_CHKPT, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_STREAM_CONSEN_CHKPT, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_VND_ALTER_REPLICA, vmPutMsgToMgmtQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_VND_ALTER_CONFIG, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER; diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index a86e06b486..7b4270462f 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -83,7 +83,7 @@ typedef struct SOrphanTask { typedef struct { SMsgHead head; -} SMStreamReqCheckpointRsp, SMStreamUpdateChkptRsp; +} SMStreamReqCheckpointRsp, SMStreamUpdateChkptRsp, SMStreamReqConsensChkptRsp; typedef struct STaskChkptInfo { int32_t nodeId; @@ -133,9 +133,8 @@ int32_t mndCreateStreamResetStatusTrans(SMnode *pMnode, SStreamObj *pStream) int32_t mndStreamSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamObj *pStream); int32_t mndCreateStreamChkptInfoUpdateTrans(SMnode *pMnode, SStreamObj *pStream, SArray *pChkptInfoList); int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq); -int32_t mndSendQuickConsensusChkptIdRsp(SRestoreCheckpointInfo *pReq, int32_t code, int64_t streamId, - int64_t checkpointId, SRpcHandleInfo *pRpcInfo); - +int32_t 
mndCreateSetConsensusChkptIdTrans(SMnode *pMnode, SStreamObj *pStream, int32_t taskId, int64_t checkpointId, + int64_t ts); void removeTasksInBuf(SArray *pTaskIds, SStreamExecInfo *pExecInfo); SStreamTaskIter *createStreamTaskIter(SStreamObj *pStream); @@ -151,9 +150,7 @@ SCheckpointConsensusInfo *mndGetConsensusInfo(SHashObj *pHash, int64_t streamId, void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo, SRpcHandleInfo *pRpcInfo); void mndClearConsensusRspEntry(SCheckpointConsensusInfo *pInfo); -int32_t doSendConsensusCheckpointRsp(SRestoreCheckpointInfo *pInfo, SRpcMsg *pMsg, int64_t checkpointId); int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId); -int32_t mndRegisterConsensusChkptId(SHashObj* pHash, int64_t streamId); #ifdef __cplusplus } diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index a1016ad96c..a8dc1e42bb 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -61,6 +61,7 @@ static int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq); static int32_t mndProcessCheckpointReport(SRpcMsg *pReq); static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg); static int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg); +static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, int32_t code); static SVgroupChangeInfo mndFindChangedNodeInfo(SMnode *pMnode, const SArray *pPrevNodeList, const SArray *pNodeList); @@ -107,6 +108,7 @@ int32_t mndInitStream(SMnode *pMnode) { mndSetMsgHandle(pMnode, TDMT_VND_STREAM_TASK_UPDATE_RSP, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_VND_STREAM_TASK_RESET_RSP, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_STREAM_TASK_UPDATE_CHKPT_RSP, mndTransProcessRsp); + mndSetMsgHandle(pMnode, TDMT_STREAM_CONSEN_CHKPT_RSP, mndTransProcessRsp); // for msgs inside mnode // TODO change the name @@ -119,7 +121,7 @@ int32_t mndInitStream(SMnode *pMnode) { 
mndSetMsgHandle(pMnode, TDMT_MND_STREAM_BEGIN_CHECKPOINT, mndProcessStreamCheckpoint); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_REQ_CHKPT, mndProcessStreamReqCheckpoint); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_CHKPT_REPORT, mndProcessCheckpointReport); - mndSetMsgHandle(pMnode, TDMT_MND_STREAM_CHKPT_CONSEN, mndProcessConsensusCheckpointId); + mndSetMsgHandle(pMnode, TDMT_MND_STREAM_REQ_CONSEN_CHKPT, mndProcessConsensusCheckpointId); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_UPDATE_CHKPT_EVT, mndScanCheckpointReportInfo); mndSetMsgHandle(pMnode, TDMT_STREAM_TASK_REPORT_CHECKPOINT, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_HEARTBEAT, mndProcessStreamHb); @@ -2611,23 +2613,14 @@ int32_t mndProcessCheckpointReport(SRpcMsg *pReq) { taosThreadMutexUnlock(&execInfo.lock); - { - SRpcMsg rsp = {.code = 0, .info = pReq->info, .contLen = sizeof(SMStreamUpdateChkptRsp)}; - rsp.pCont = rpcMallocCont(rsp.contLen); - SMsgHead *pHead = rsp.pCont; - pHead->vgId = htonl(req.nodeId); - - tmsgSendRsp(&rsp); - pReq->info.handle = NULL; // disable auto rsp - } - + doSendQuickRsp(&pReq->info, sizeof(SMStreamUpdateChkptRsp), req.nodeId, TSDB_CODE_SUCCESS); return 0; } -static int64_t getConsensusId(int64_t streamId, int32_t numOfTasks, bool* pAllEqual) { +static int64_t getConsensusId(int64_t streamId, int32_t numOfTasks, int32_t* pExistedTasks) { int32_t num = 0; int64_t chkId = INT64_MAX; - *pAllEqual = true; + *pExistedTasks = 0; for(int32_t i = 0; i < taosArrayGetSize(execInfo.pTaskList); ++i) { STaskId* p = taosArrayGet(execInfo.pTaskList, i); @@ -2637,16 +2630,12 @@ static int64_t getConsensusId(int64_t streamId, int32_t numOfTasks, bool* pAllEq num += 1; STaskStatusEntry* pe = taosHashGet(execInfo.pTaskMap, p, sizeof(*p)); - - if (chkId != INT64_MAX && chkId != pe->checkpointInfo.latestId) { - *pAllEqual = false; - } - if (chkId > pe->checkpointInfo.latestId) { chkId = pe->checkpointInfo.latestId; } } + *pExistedTasks = num; if (num < numOfTasks) { // not all task 
send info to mnode through hbMsg, no valid checkpoint Id return -1; } @@ -2654,6 +2643,16 @@ static int64_t getConsensusId(int64_t streamId, int32_t numOfTasks, bool* pAllEq return chkId; } +static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, int32_t code) { + SRpcMsg rsp = {.code = code, .info = *pInfo, .contLen = msgSize}; + rsp.pCont = rpcMallocCont(rsp.contLen); + SMsgHead *pHead = rsp.pCont; + pHead->vgId = htonl(vgId); + + tmsgSendRsp(&rsp); + pInfo->handle = NULL; // disable auto rsp +} + static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { SMnode *pMnode = pMsg->info.node; SDecoder decoder = {0}; @@ -2675,9 +2674,8 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { // register to the stream task done map, if all tasks has sent this kinds of message, start the checkpoint trans. taosThreadMutexLock(&execInfo.lock); - SStreamObj *pStream = mndGetStreamObj(pMnode, req.streamId); - // mnode handle the create stream transaction too slow may cause this problem + SStreamObj *pStream = mndGetStreamObj(pMnode, req.streamId); if (pStream == NULL) { mWarn("failed to find the stream:0x%" PRIx64 ", not handle consensus-checkpointId", req.streamId); @@ -2688,11 +2686,9 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { if (p == NULL) { mError("failed to find the stream:0x%" PRIx64 " in buf, not handle consensus-checkpointId", req.streamId); terrno = TSDB_CODE_MND_STREAM_NOT_EXIST; - - mndSendQuickConsensusChkptIdRsp(&req, terrno, req.streamId, 0, &pMsg->info); - taosThreadMutexUnlock(&execInfo.lock); - pMsg->info.handle = NULL; // disable auto rsp + + doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); return -1; } else { mDebug("s-task:0x%" PRIx64 "-0x%x in buf not in mnode/meta, create stream trans may not complete yet", @@ -2706,36 +2702,35 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { int32_t numOfTasks = (pStream == NULL) ? 
0 : mndGetNumOfStreamTasks(pStream); if ((pStream != NULL) && (pStream->checkpointId == 0)) { // not generated checkpoint yet, return 0 directly - mndSendQuickConsensusChkptIdRsp(&req, TSDB_CODE_SUCCESS, req.streamId, 0, &pMsg->info); - taosThreadMutexUnlock(&execInfo.lock); - pMsg->info.handle = NULL; // disable auto rsp + mndCreateSetConsensusChkptIdTrans(pMnode, pStream, req.taskId, 0, req.startTs); + + doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); return TSDB_CODE_SUCCESS; } - bool allEqual = true; - int64_t chkId = getConsensusId(req.streamId, numOfTasks, &allEqual); + int32_t num = 0; + int64_t chkId = getConsensusId(req.streamId, numOfTasks, &num); // some tasks not send hbMsg to mnode yet, wait for 5s. if (chkId == -1) { - mDebug( - "not all task send hbMsg yet, add into list and wait for 10s to check the consensus-checkpointId again, " - "s-task:0x%x", req.taskId); + mDebug("not all(%d/%d) task(s) send hbMsg yet, wait for a while and check again, s-task:0x%x", req.taskId, num, + numOfTasks); SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); mndAddConsensusTasks(pInfo, &req, &pMsg->info); - taosThreadMutexUnlock(&execInfo.lock); - pMsg->info.handle = NULL; // disable auto rsp + taosThreadMutexUnlock(&execInfo.lock); + doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); return 0; } if (chkId == req.checkpointId) { mDebug("vgId:%d stream:0x%" PRIx64 " %s consensus-checkpointId is:%" PRId64 ", meta-stored checkpointId:%" PRId64, req.nodeId, req.streamId, pStream->name, chkId, pStream->checkpointId); - mndSendQuickConsensusChkptIdRsp(&req, TSDB_CODE_SUCCESS, req.streamId, chkId, &pMsg->info); + mndCreateSetConsensusChkptIdTrans(pMnode, pStream, req.taskId, chkId, req.startTs); taosThreadMutexUnlock(&execInfo.lock); - pMsg->info.handle = NULL; // disable auto rsp + doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), 
req.nodeId, terrno); return 0; } @@ -2748,14 +2743,14 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { } taosThreadMutexUnlock(&execInfo.lock); - pMsg->info.handle = NULL; // disable auto rsp - + doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); return 0; } int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { + SMnode *pMnode = pMsg->info.node; int64_t now = taosGetTimestampMs(); - int64_t streamId = -1; // todo: fix only one + SArray *pStreamList = taosArrayInit(4, sizeof(int64_t)); mDebug("start to process consensus-checkpointId in tmr"); taosThreadMutexLock(&execInfo.lock); @@ -2764,40 +2759,42 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { while ((pIter = taosHashIterate(execInfo.pStreamConsensus, pIter)) != NULL) { SCheckpointConsensusInfo *pInfo = (SCheckpointConsensusInfo *)pIter; - int32_t j = 0; + int64_t streamId = -1; int32_t num = taosArrayGetSize(pInfo->pTaskList); - SArray *pList = taosArrayInit(4, sizeof(int32_t)); - for (; j < num; ++j) { - SCheckpointConsensusEntry *pe = taosArrayGet(pInfo->pTaskList, j); + SStreamObj *pStream = mndGetStreamObj(pMnode, pInfo->streamId); + if (pStream == NULL) { // stream has been dropped already + mDebug("stream:0x%"PRIx64" dropped already, continue", pInfo->streamId); + continue; + } - if ((now - pe->ts) > 10 * 1000) { - bool allEqual = true; - int64_t chkId = getConsensusId(pe->req.streamId, pInfo->numOfTasks, &allEqual); + for (int32_t j = 0; j < num; ++j) { + SCheckpointConsensusEntry *pe = taosArrayGet(pInfo->pTaskList, j); + streamId = pe->req.streamId; + + if ((now - pe->ts) >= 10 * 1000) { + int32_t existed = 0; + int64_t chkId = getConsensusId(pe->req.streamId, pInfo->numOfTasks, &existed); if (chkId == -1) { - mDebug("tasks send hbMsg for stream:0x%" PRIx64 ", wait for next round", pe->req.streamId); + mDebug("not all(%d/%d) task(s) send hbMsg yet, wait for a while and check again, s-task:0x%x", existed, + pInfo->numOfTasks, pe->req.taskId); break; } 
- if (allEqual) { - mDebug("all has identical checkpointId for stream:0x%"PRIx64" send checkpointId to s-task:0x%x", - pe->req.streamId, pe->req.taskId); - - mndSendQuickConsensusChkptIdRsp(&pe->req, TSDB_CODE_SUCCESS, pe->req.streamId, chkId, &pe->rspInfo); - } else { - ASSERT(chkId <= pe->req.checkpointId); - mndSendQuickConsensusChkptIdRsp(&pe->req, TSDB_CODE_SUCCESS, pe->req.streamId, chkId, &pe->rspInfo); - } + ASSERT(chkId <= pe->req.checkpointId); + mndCreateSetConsensusChkptIdTrans(pMnode, pStream, pe->req.taskId, chkId, pe->req.startTs); taosArrayPush(pList, &pe->req.taskId); streamId = pe->req.streamId; } else { mDebug("s-task:0x%x sendTs:%" PRId64 " wait %2.fs already, wait for next round to check", pe->req.taskId, - (now - pe->ts)/ 1000.0, pe->ts); + (now - pe->ts) / 1000.0, pe->ts); } } + mndReleaseStream(pMnode, pStream); + if (taosArrayGetSize(pList) > 0) { for (int32_t i = 0; i < taosArrayGetSize(pList); ++i) { int32_t *taskId = taosArrayGet(pList, i); @@ -2815,12 +2812,19 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { if (taosArrayGetSize(pInfo->pTaskList) == 0) { mndClearConsensusRspEntry(pInfo); - mndClearConsensusCheckpointId(execInfo.pStreamConsensus, streamId); + ASSERT(streamId != -1); + taosArrayPush(pStreamList, &streamId); } } + for (int32_t i = 0; i < taosArrayGetSize(pStreamList); ++i) { + int64_t *pStreamId = (int64_t *)taosArrayGet(pStreamList, i); + mndClearConsensusCheckpointId(execInfo.pStreamConsensus, *pStreamId); + } + taosThreadMutexUnlock(&execInfo.lock); + taosArrayDestroy(pStreamList); mDebug("end to process consensus-checkpointId in tmr"); return TSDB_CODE_SUCCESS; } diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 9ee820925c..7d9b9e4571 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -651,6 +651,7 @@ void removeTasksInBuf(SArray *pTaskIds, SStreamExecInfo* pExecInfo) { void 
removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode) { taosThreadMutexLock(&pExecNode->lock); + // 1. remove task entries SStreamTaskIter *pIter = createStreamTaskIter(pStream); while (streamTaskIterNextTask(pIter)) { SStreamTask *pTask = streamTaskIterGetCurrent(pIter); @@ -660,8 +661,11 @@ void removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode) { } ASSERT(taosHashGetSize(pExecNode->pTaskMap) == taosArrayGetSize(pExecNode->pTaskList)); - taosThreadMutexUnlock(&pExecNode->lock); + // 2. remove stream entry in consensus hash table + mndClearConsensusCheckpointId(execInfo.pStreamConsensus, pStream->uid); + + taosThreadMutexUnlock(&pExecNode->lock); destroyStreamTaskIter(pIter); } @@ -835,45 +839,98 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { return TSDB_CODE_SUCCESS; } -int32_t doSendConsensusCheckpointRsp(SRestoreCheckpointInfo* pInfo, SRpcMsg* pMsg, int64_t checkpointId) { +static int32_t mndStreamSetChkptIdAction(SMnode *pMnode, STrans *pTrans, SStreamTask* pTask, int64_t checkpointId, int64_t ts) { + SRestoreCheckpointInfo req = { + .taskId = pTask->id.taskId, + .streamId = pTask->id.streamId, + .checkpointId = checkpointId, + .startTs = ts, + .nodeId = pTask->info.nodeId, + }; + int32_t code = 0; int32_t blen; - - SRestoreCheckpointInfoRsp req = { - .streamId = pInfo->streamId, .taskId = pInfo->taskId, .checkpointId = checkpointId, .startTs = pInfo->startTs}; - - tEncodeSize(tEncodeRestoreCheckpointInfoRsp, &req, blen, code); + tEncodeSize(tEncodeRestoreCheckpointInfo, &req, blen, code); if (code < 0) { terrno = TSDB_CODE_OUT_OF_MEMORY; return -1; } - int32_t tlen = sizeof(SMsgHead) + blen; - void *abuf = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead)); + int32_t tlen = sizeof(SMsgHead) + blen; + + void *pBuf = taosMemoryMalloc(tlen); + if (pBuf == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + + void *abuf = POINTER_SHIFT(pBuf, sizeof(SMsgHead)); SEncoder encoder; tEncoderInit(&encoder, abuf, 
tlen); - tEncodeRestoreCheckpointInfoRsp(&encoder, &req); + tEncodeRestoreCheckpointInfo(&encoder, &req); - SMsgHead *pMsgHead = (SMsgHead *)pMsg->pCont; + SMsgHead *pMsgHead = (SMsgHead *)pBuf; pMsgHead->contLen = htonl(tlen); - pMsgHead->vgId = htonl(pInfo->nodeId); + pMsgHead->vgId = htonl(pTask->info.nodeId); + tEncoderClear(&encoder); - tmsgSendRsp(pMsg); + SEpSet epset = {0}; + bool hasEpset = false; + code = extractNodeEpset(pMnode, &epset, &hasEpset, pTask->id.taskId, pTask->info.nodeId); + if (code != TSDB_CODE_SUCCESS || !hasEpset) { + taosMemoryFree(pBuf); + return code; + } + + code = setTransAction(pTrans, pBuf, tlen, TDMT_STREAM_CONSEN_CHKPT, &epset, 0, TSDB_CODE_VND_INVALID_VGROUP_ID); + if (code != TSDB_CODE_SUCCESS) { + taosMemoryFree(pBuf); + } + return code; } -int32_t mndSendQuickConsensusChkptIdRsp(SRestoreCheckpointInfo *pReq, int32_t code, int64_t streamId, - int64_t checkpointId, SRpcHandleInfo *pRpcInfo) { - SRpcMsg rsp = {.code = code, .info = *pRpcInfo, .contLen = sizeof(SRestoreCheckpointInfoRsp) + sizeof(SMsgHead)}; - rsp.pCont = rpcMallocCont(rsp.contLen); +int32_t mndCreateSetConsensusChkptIdTrans(SMnode *pMnode, SStreamObj *pStream, int32_t taskId, int64_t checkpointId, + int64_t ts) { + char msg[128] = {0}; + snprintf(msg, tListLen(msg), "set consen-chkpt-id for task:0x%x", taskId); - SMsgHead *pHead = rsp.pCont; - pHead->vgId = htonl(pReq->nodeId); + STrans *pTrans = doCreateTrans(pMnode, pStream, NULL, TRN_CONFLICT_NOTHING, MND_STREAM_CHKPT_CONSEN_NAME, msg); + if (pTrans == NULL) { + return terrno; + } - mDebug("stream:0x%" PRIx64 " consensus-checkpointId:%" PRId64 " exists, s-task:0x%x send to vnode", - streamId, checkpointId, pReq->taskId); - return doSendConsensusCheckpointRsp(pReq, &rsp, checkpointId); + STaskId id = {.streamId = pStream->uid, .taskId = taskId}; + SStreamTask *pTask = mndGetStreamTask(&id, pStream); + ASSERT(pTask); + + /*int32_t code = */ mndStreamRegisterTrans(pTrans, MND_STREAM_CHKPT_CONSEN_NAME, 
pStream->uid); + int32_t code = mndStreamSetChkptIdAction(pMnode, pTrans, pTask, checkpointId, ts); + if (code != 0) { + sdbRelease(pMnode->pSdb, pStream); + mndTransDrop(pTrans); + return code; + } + + code = mndPersistTransLog(pStream, pTrans, SDB_STATUS_READY); + if (code != TSDB_CODE_SUCCESS) { + sdbRelease(pMnode->pSdb, pStream); + mndTransDrop(pTrans); + return -1; + } + + if (mndTransPrepare(pMnode, pTrans) != 0) { + mError("trans:%d, failed to prepare set consensus-chkptId trans since %s", pTrans->id, terrstr()); + sdbRelease(pMnode->pSdb, pStream); + mndTransDrop(pTrans); + return -1; + } + + sdbRelease(pMnode->pSdb, pStream); + mndTransDrop(pTrans); + + return TSDB_CODE_ACTION_IN_PROGRESS; } SCheckpointConsensusInfo* mndGetConsensusInfo(SHashObj* pHash, int64_t streamId, int32_t numOfTasks) { @@ -885,6 +942,7 @@ SCheckpointConsensusInfo* mndGetConsensusInfo(SHashObj* pHash, int64_t streamId, SCheckpointConsensusInfo p = { .pTaskList = taosArrayInit(4, sizeof(SCheckpointConsensusEntry)), .numOfTasks = numOfTasks, + .streamId = streamId, }; taosHashPut(pHash, &streamId, sizeof(streamId), &p, sizeof(p)); diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index 69a7bc7ba4..60e57e8a2f 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -128,6 +128,8 @@ int32_t sndProcessStreamMsg(SSnode *pSnode, SRpcMsg *pMsg) { return tqStreamTaskProcessRetrieveTriggerReq(pSnode->pMeta, pMsg); case TDMT_STREAM_RETRIEVE_TRIGGER_RSP: return tqStreamTaskProcessRetrieveTriggerRsp(pSnode->pMeta, pMsg); + case TDMT_MND_STREAM_REQ_CONSEN_CHKPT_RSP: + return tqStreamProcessConsensusChkptRsp2(pSnode->pMeta, pMsg); default: sndError("invalid snode msg:%d", pMsg->msgType); ASSERT(0); @@ -149,15 +151,15 @@ int32_t sndProcessWriteMsg(SSnode *pSnode, SRpcMsg *pMsg, SRpcMsg *pRsp) { case TDMT_VND_STREAM_TASK_UPDATE: return tqStreamTaskProcessUpdateReq(pSnode->pMeta, &pSnode->msgCb, pMsg, true); case TDMT_VND_STREAM_TASK_RESET: - 
return tqStreamTaskProcessTaskResetReq(pSnode->pMeta, pMsg); + return tqStreamTaskProcessTaskResetReq(pSnode->pMeta, pMsg->pCont); case TDMT_STREAM_TASK_PAUSE: return tqStreamTaskProcessTaskPauseReq(pSnode->pMeta, pMsg->pCont); case TDMT_STREAM_TASK_RESUME: return tqStreamTaskProcessTaskResumeReq(pSnode->pMeta, pMsg->info.conn.applyIndex, pMsg->pCont, false); case TDMT_STREAM_TASK_UPDATE_CHKPT: - return tqStreamTaskProcessUpdateCheckpointReq(pSnode->pMeta, true, pMsg->pCont, pMsg->contLen); - case TDMT_MND_STREAM_CHKPT_CONSEN_RSP: - return tqStreamProcessConsensusChkptRsp(pSnode->pMeta, pMsg); + return tqStreamTaskProcessUpdateCheckpointReq(pSnode->pMeta, true, pMsg->pCont); + case TDMT_STREAM_CONSEN_CHKPT: + return tqStreamTaskProcessConsenChkptIdReq(pSnode->pMeta, pMsg); default: ASSERT(0); } diff --git a/source/dnode/vnode/src/inc/vnodeInt.h b/source/dnode/vnode/src/inc/vnodeInt.h index 1bec226489..4a47e08730 100644 --- a/source/dnode/vnode/src/inc/vnodeInt.h +++ b/source/dnode/vnode/src/inc/vnodeInt.h @@ -298,6 +298,7 @@ int32_t tqProcessTaskRetrieveRsp(STQ* pTq, SRpcMsg* pMsg); int32_t tqProcessTaskScanHistory(STQ* pTq, SRpcMsg* pMsg); int32_t tqStreamProgressRetrieveReq(STQ* pTq, SRpcMsg* pMsg); int32_t tqProcessTaskUpdateCheckpointReq(STQ* pTq, char* msg, int32_t msgLen); +int32_t tqProcessTaskConsenChkptIdReq(STQ* pTq, SRpcMsg* pMsg); // sma int32_t smaInit(); diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index 0a64b9c165..ac57a003c5 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1016,7 +1016,11 @@ int32_t tqProcessTaskDropReq(STQ* pTq, char* msg, int32_t msgLen) { } int32_t tqProcessTaskUpdateCheckpointReq(STQ* pTq, char* msg, int32_t msgLen) { - return tqStreamTaskProcessUpdateCheckpointReq(pTq->pStreamMeta, pTq->pVnode->restored, msg, msgLen); + return tqStreamTaskProcessUpdateCheckpointReq(pTq->pStreamMeta, pTq->pVnode->restored, msg); +} + +int32_t tqProcessTaskConsenChkptIdReq(STQ* 
pTq, SRpcMsg* pMsg) { + return tqStreamTaskProcessConsenChkptIdReq(pTq->pStreamMeta, pMsg); } int32_t tqProcessTaskPauseReq(STQ* pTq, int64_t sversion, char* msg, int32_t msgLen) { @@ -1239,7 +1243,7 @@ int32_t tqProcessTaskUpdateReq(STQ* pTq, SRpcMsg* pMsg) { } int32_t tqProcessTaskResetReq(STQ* pTq, SRpcMsg* pMsg) { - return tqStreamTaskProcessTaskResetReq(pTq->pStreamMeta, pMsg); + return tqStreamTaskProcessTaskResetReq(pTq->pStreamMeta, pMsg->pCont); } int32_t tqProcessTaskRetrieveTriggerReq(STQ* pTq, SRpcMsg* pMsg) { @@ -1277,5 +1281,5 @@ int32_t tqProcessTaskChkptReportRsp(STQ* pTq, SRpcMsg* pMsg) { } int32_t tqProcessTaskConsensusChkptRsp(STQ* pTq, SRpcMsg* pMsg) { - return tqStreamProcessConsensusChkptRsp(pTq->pStreamMeta, pMsg); + return tqStreamProcessConsensusChkptRsp2(pTq->pStreamMeta, pMsg); } diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index c40332ff39..cb480d09bb 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -670,7 +670,7 @@ int32_t tqStreamTaskProcessDropReq(SStreamMeta* pMeta, char* msg, int32_t msgLen return 0; } -int32_t tqStreamTaskProcessUpdateCheckpointReq(SStreamMeta* pMeta, bool restored, char* msg, int32_t msgLen) { +int32_t tqStreamTaskProcessUpdateCheckpointReq(SStreamMeta* pMeta, bool restored, char* msg) { SVUpdateCheckpointInfoReq* pReq = (SVUpdateCheckpointInfoReq*)msg; int32_t vgId = pMeta->vgId; @@ -858,8 +858,8 @@ int32_t tqStartTaskCompleteCallback(SStreamMeta* pMeta) { return TSDB_CODE_SUCCESS; } -int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { - SVPauseStreamTaskReq* pReq = (SVPauseStreamTaskReq*)pMsg->pCont; +int32_t tqStreamTaskProcessTaskResetReq(SStreamMeta* pMeta, char* pMsg) { + SVPauseStreamTaskReq* pReq = (SVPauseStreamTaskReq*)pMsg; SStreamTask* pTask = streamMetaAcquireTask(pMeta, pReq->streamId, pReq->taskId); if (pTask == NULL) { @@ -1115,6 +1115,8 @@ int32_t 
tqStreamProcessReqCheckpointRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { ret int32_t tqStreamProcessChkptReportRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { return doProcessDummyRspMsg(pMeta, pMsg); } +int32_t tqStreamProcessConsensusChkptRsp2(SStreamMeta* pMeta, SRpcMsg* pMsg) { return doProcessDummyRspMsg(pMeta, pMsg); } + int32_t tqStreamProcessCheckpointReadyRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { SMStreamCheckpointReadyRspMsg* pRsp = pMsg->pCont; @@ -1130,22 +1132,21 @@ int32_t tqStreamProcessCheckpointReadyRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { return TSDB_CODE_SUCCESS; } -int32_t tqStreamProcessConsensusChkptRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { +int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { int32_t vgId = pMeta->vgId; + int32_t code = 0; + char* msg = POINTER_SHIFT(pMsg->pCont, sizeof(SMsgHead)); int32_t len = pMsg->contLen - sizeof(SMsgHead); - SRpcMsg rsp = {.info = pMsg->info, .code = TSDB_CODE_SUCCESS}; int64_t now = taosGetTimestampMs(); - SRestoreCheckpointInfoRsp req = {0}; + SRestoreCheckpointInfo req = {0}; SDecoder decoder; tDecoderInit(&decoder, (uint8_t*)msg, len); - rsp.info.handle = NULL; - if (tDecodeRestoreCheckpointInfoRsp(&decoder, &req) < 0) { - // rsp.code = TSDB_CODE_MSG_DECODE_ERROR; // disable it temporarily - tqError("vgId:%d failed to decode restore task checkpointId, code:%s", vgId, tstrerror(rsp.code)); + if (tDecodeRestoreCheckpointInfo(&decoder, &req) < 0) { + tqError("vgId:%d failed to decode set consensus checkpointId req, code:%s", vgId, tstrerror(code)); tDecoderClear(&decoder); return TSDB_CODE_SUCCESS; } @@ -1154,7 +1155,7 @@ int32_t tqStreamProcessConsensusChkptRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { SStreamTask* pTask = streamMetaAcquireTask(pMeta, req.streamId, req.taskId); if (pTask == NULL) { - tqError("vgId:%d process restore checkpointId req, failed to acquire task:0x%x, it may have been dropped already", + tqError("vgId:%d process set consensus checkpointId req, failed to acquire 
task:0x%x, it may have been dropped already", pMeta->vgId, req.taskId); streamMetaAddFailedTask(pMeta, req.streamId, req.taskId); return TSDB_CODE_SUCCESS; @@ -1182,7 +1183,7 @@ int32_t tqStreamProcessConsensusChkptRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { pTask->chkInfo.checkpointId = req.checkpointId; tqSetRestoreVersionInfo(pTask); } else { - tqDebug("s-task:%s vgId:%d consensus-checkpointId:%" PRId64 " equals to current checkpointId, no need to update", + tqDebug("s-task:%s vgId:%d consensus-checkpointId:%" PRId64 " equals to current checkpointId, not update", pTask->id.idStr, vgId, req.checkpointId); } diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index 3757cd00bc..04839c3357 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -630,6 +630,11 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t ver, SRpcMsg goto _err; } } break; + case TDMT_STREAM_CONSEN_CHKPT: { + if (pVnode->restored) { + tqProcessTaskConsenChkptIdReq(pVnode->pTq, pMsg); + } + } break; case TDMT_STREAM_TASK_PAUSE: { if (pVnode->restored && vnodeIsLeader(pVnode) && tqProcessTaskPauseReq(pVnode->pTq, ver, pMsg->pCont, pMsg->contLen) < 0) { @@ -647,11 +652,6 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t ver, SRpcMsg tqProcessTaskResetReq(pVnode->pTq, pMsg); } } break; - case TDMT_MND_STREAM_CHKPT_CONSEN_RSP: { - if (pVnode->restored) { - tqProcessTaskConsensusChkptRsp(pVnode->pTq, pMsg); - } - } break; case TDMT_VND_ALTER_CONFIRM: needCommit = pVnode->config.hashChange; if (vnodeProcessAlterConfirmReq(pVnode, ver, pReq, len, pRsp) < 0) { @@ -861,6 +861,8 @@ int32_t vnodeProcessStreamMsg(SVnode *pVnode, SRpcMsg *pMsg, SQueueInfo *pInfo) return tqStreamProgressRetrieveReq(pVnode->pTq, pMsg); case TDMT_MND_STREAM_CHKPT_REPORT_RSP: return tqProcessTaskChkptReportRsp(pVnode->pTq, pMsg); + case TDMT_MND_STREAM_REQ_CONSEN_CHKPT_RSP: + return 
tqProcessTaskConsensusChkptRsp(pVnode->pTq, pMsg); default: vError("unknown msg type:%d in stream queue", pMsg->msgType); return TSDB_CODE_APP_ERROR; diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index d5f7d6ef21..5cd084e6a2 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -1153,7 +1153,7 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { tEncoderClear(&encoder); SRpcMsg msg = {0}; - initRpcMsg(&msg, TDMT_MND_STREAM_CHKPT_CONSEN, buf, tlen); + initRpcMsg(&msg, TDMT_MND_STREAM_REQ_CONSEN_CHKPT, buf, tlen); stDebug("s-task:%s vgId:%d send latest checkpointId:%" PRId64 " to mnode to get the consensus checkpointId", id, vgId, pInfo->checkpointId); diff --git a/source/libs/stream/src/streamMsg.c b/source/libs/stream/src/streamMsg.c index e0435156e2..f02f661143 100644 --- a/source/libs/stream/src/streamMsg.c +++ b/source/libs/stream/src/streamMsg.c @@ -650,23 +650,3 @@ int32_t tDecodeRestoreCheckpointInfo(SDecoder* pDecoder, SRestoreCheckpointInfo* tEndDecode(pDecoder); return 0; } - -int32_t tEncodeRestoreCheckpointInfoRsp(SEncoder* pCoder, const SRestoreCheckpointInfoRsp* pInfo) { - if (tStartEncode(pCoder) < 0) return -1; - if (tEncodeI64(pCoder, pInfo->startTs) < 0) return -1; - if (tEncodeI64(pCoder, pInfo->streamId) < 0) return -1; - if (tEncodeI32(pCoder, pInfo->taskId) < 0) return -1; - if (tEncodeI64(pCoder, pInfo->checkpointId) < 0) return -1; - tEndEncode(pCoder); - return 0; -} - -int32_t tDecodeRestoreCheckpointInfoRsp(SDecoder* pCoder, SRestoreCheckpointInfoRsp* pInfo) { - if (tStartDecode(pCoder) < 0) return -1; - if (tDecodeI64(pCoder, &pInfo->startTs) < 0) return -1; - if (tDecodeI64(pCoder, &pInfo->streamId) < 0) return -1; - if (tDecodeI32(pCoder, &pInfo->taskId) < 0) return -1; - if (tDecodeI64(pCoder, &pInfo->checkpointId) < 0) return -1; - tEndDecode(pCoder); - return 0; -} \ No newline at end of file From 
c9153b8176f31c55568518d1f6221b72f098c182 Mon Sep 17 00:00:00 2001 From: xjzhou Date: Fri, 5 Jul 2024 16:27:40 +0800 Subject: [PATCH 60/92] update CI test case stmt_error --- tests/system-test/1-insert/stmt_error.py | 70 +++++++++++++++++++++--- 1 file changed, 62 insertions(+), 8 deletions(-) diff --git a/tests/system-test/1-insert/stmt_error.py b/tests/system-test/1-insert/stmt_error.py index c6d747c317..0bfbedb9a1 100644 --- a/tests/system-test/1-insert/stmt_error.py +++ b/tests/system-test/1-insert/stmt_error.py @@ -24,7 +24,7 @@ class TDTestCase: case1 : [TD-11899] : this is an test case for check stmt error use . ''' return - + def init(self, conn, logSql, replicaVar=1): self.replicaVar = int(replicaVar) tdLog.debug("start to execute %s" % __file__) @@ -49,7 +49,7 @@ class TDTestCase: ff float, dd double, bb binary(65059), nn nchar(100), tt timestamp)", ) conn.load_table_info("log") - + stmt = conn.statement("insert into log values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)") params = new_bind_params(16) @@ -123,7 +123,7 @@ class TDTestCase: ff float, dd double, bb binary(100), nn nchar(100), tt timestamp , error_data int )", ) conn.load_table_info("log") - + stmt = conn.statement("insert into log values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1000)") params = new_bind_params(16) @@ -195,20 +195,74 @@ class TDTestCase: except Exception as err: conn.close() raise err - + + def test_stmt_nornmal_value_error(self, conn): + # type: (TaosConnection) -> None + dbname = "pytest_taos_stmt_error" + try: + conn.execute("drop database if exists %s" % dbname) + conn.execute("create database if not exists %s" % dbname) + conn.select_db(dbname) + + conn.execute( + "create table if not exists log(ts timestamp, bo bool, nil tinyint, ti tinyint, si smallint, ii int,\ + bi bigint, tu tinyint unsigned, su smallint unsigned, iu int unsigned, bu bigint unsigned, \ + ff float, dd double, bb binary(100), nn nchar(100), tt timestamp , error_data int )", + ) + conn.load_table_info("log") + + + stmt = 
conn.statement("insert into log values(NOW(),?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)") + params = new_bind_params(16) + params[0].timestamp(1626861392589, PrecisionEnum.Milliseconds) + params[1].bool(True) + params[2].tinyint(None) + params[3].tinyint(2) + params[4].smallint(3) + params[5].int(4) + params[6].bigint(5) + params[7].tinyint_unsigned(6) + params[8].smallint_unsigned(7) + params[9].int_unsigned(8) + params[10].bigint_unsigned(9) + params[11].float(10.1) + params[12].double(10.11) + params[13].binary("hello") + params[14].nchar("stmt") + params[15].timestamp(1626861392589, PrecisionEnum.Milliseconds) + + stmt.bind_param(params) + stmt.execute() + + conn.close() + + except Exception as err: + conn.execute("drop database if exists %s" % dbname) + conn.close() + raise err + def run(self): - + self.test_stmt_insert(self.conn()) try: self.test_stmt_insert_error(self.conn()) except Exception as error : - - if str(error)=='[0x0200]: no mix usage for ? and values': + + if str(error)=='[0x0200]: stmt bind param does not support normal value in sql': tdLog.info('=========stmt error occured for bind part column ==============') else: tdLog.exit("expect error(%s) not occured" % str(error)) - try: + try: + self.test_stmt_nornmal_value_error(self.conn()) + except Exception as error : + + if str(error)=='[0x0200]: stmt bind param does not support normal value in sql': + tdLog.info('=========stmt error occured for bind part column ==============') + else: + tdLog.exit("expect error(%s) not occured" % str(error)) + + try: self.test_stmt_insert_error_null_timestamp(self.conn()) tdLog.exit("expect error not occured - 1") except Exception as error : From 0e9205c3143aa61b8796173ebe62543034049b8a Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 5 Jul 2024 09:13:30 +0000 Subject: [PATCH 61/92] fix UAF --- source/libs/transport/src/transCli.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 
ffcb1fbdb5..25e0248095 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -832,6 +832,9 @@ static int32_t allocConnRef(SCliConn* conn, bool update) { taosInitRWLatch(&exh->latch); exh->refId = transAddExHandle(transGetRefMgt(), exh); + SExHandle* self = transAcquireExHandle(transGetRefMgt(), exh->refId); + ASSERT(exh == self); + QUEUE_INIT(&exh->q); taosInitRWLatch(&exh->latch); @@ -2833,6 +2836,9 @@ int64_t transAllocHandle() { taosInitRWLatch(&exh->latch); exh->refId = transAddExHandle(transGetRefMgt(), exh); + SExHandle* self = transAcquireExHandle(transGetRefMgt(), exh->refId); + ASSERT(exh == self); + QUEUE_INIT(&exh->q); taosInitRWLatch(&exh->latch); tDebug("pre alloc refId %" PRId64 "", exh->refId); From 6a1555e893ccd11419b90ff43133bec6f5b7efdd Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 5 Jul 2024 18:06:00 +0800 Subject: [PATCH 62/92] refactor: do some internal refactor. --- source/dnode/mnode/impl/src/mndStream.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index a8dc1e42bb..00909cb36d 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -1138,6 +1138,7 @@ static int32_t mndCheckTaskAndNodeStatus(SMnode *pMnode) { mDebug("s-task:0x%" PRIx64 "-0x%x (nodeId:%d) status:%s, checkpoint not issued", pEntry->id.streamId, (int32_t)pEntry->id.taskId, pEntry->nodeId, streamTaskGetStatusStr(pEntry->status)); ready = false; + break; } if (pEntry->hTaskId != 0) { From 674acd0e9f6664f147b406f948998638a447a125 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 5 Jul 2024 18:53:35 +0800 Subject: [PATCH 63/92] refactor(stream): delay checkpointInterval to generate the checkpoint after stream started. 
--- source/dnode/mnode/impl/src/mndStream.c | 31 +++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index a137c10ed5..20cd415a6f 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -1134,6 +1134,7 @@ static int32_t mndCheckTaskAndNodeStatus(SMnode *pMnode) { mDebug("s-task:0x%" PRIx64 "-0x%x (nodeId:%d) status:%s, checkpoint not issued", pEntry->id.streamId, (int32_t)pEntry->id.taskId, pEntry->nodeId, streamTaskGetStatusStr(pEntry->status)); ready = false; + break; } if (pEntry->hTaskId != 0) { @@ -1153,6 +1154,27 @@ static int32_t mndCheckTaskAndNodeStatus(SMnode *pMnode) { return ready ? 0 : -1; } +int64_t getStreamTaskLastReadyState(SArray *pTaskList, int64_t streamId) { + int64_t ts = -1; + int32_t taskId = -1; + + for (int32_t i = 0; i < taosArrayGetSize(pTaskList); ++i) { + STaskId *p = taosArrayGet(pTaskList, i); + STaskStatusEntry *pEntry = taosHashGet(execInfo.pTaskMap, p, sizeof(*p)); + if (pEntry == NULL || pEntry->id.streamId != streamId) { + continue; + } + + if (pEntry->status == TASK_STATUS__READY && ts < pEntry->startTime) { + ts = pEntry->startTime; + taskId = pEntry->id.taskId; + } + } + + mDebug("stream:0x%" PRIx64 " last ready ts:%" PRId64 " s-task:0x%x", streamId, ts, taskId); + return ts; +} + typedef struct { int64_t streamId; int64_t duration; @@ -1191,6 +1213,15 @@ static int32_t mndProcessStreamCheckpoint(SRpcMsg *pReq) { continue; } + taosThreadMutexLock(&execInfo.lock); + int64_t startTs = getStreamTaskLastReadyState(execInfo.pTaskList, pStream->uid); + if (startTs != -1 && (now - startTs) < tsStreamCheckpointInterval * 1000) { + taosThreadMutexUnlock(&execInfo.lock); + sdbRelease(pSdb, pStream); + continue; + } + taosThreadMutexUnlock(&execInfo.lock); + SCheckpointInterval in = {.streamId = pStream->uid, .duration = duration}; taosArrayPush(pList, &in); From 
8cbe534f931cc3f37eb199a8ed0a542ba296bf25 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Sun, 7 Jul 2024 06:04:59 +0000 Subject: [PATCH 64/92] fix invalid read --- source/libs/stream/src/streamBackendRocksdb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 8c1f962f00..8c390c189c 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -2074,7 +2074,9 @@ int32_t valueEncode(void* value, int32_t vlen, int64_t ttl, char** dest) { len += taosEncodeFixedI32((void**)&buf, key.len); len += taosEncodeFixedI32((void**)&buf, key.rawLen); len += taosEncodeFixedI8((void**)&buf, key.compress); - len += taosEncodeBinary((void**)&buf, (char*)value, key.len); + if (value != NULL && key.len != 0) { + len += taosEncodeBinary((void**)&buf, (char*)value, key.len); + } *dest = p; } else { char* buf = *dest; @@ -2082,7 +2084,9 @@ int32_t valueEncode(void* value, int32_t vlen, int64_t ttl, char** dest) { len += taosEncodeFixedI32((void**)&buf, key.len); len += taosEncodeFixedI32((void**)&buf, key.rawLen); len += taosEncodeFixedI8((void**)&buf, key.compress); - len += taosEncodeBinary((void**)&buf, (char*)value, key.len); + if (value != NULL && key.len != 0) { + len += taosEncodeBinary((void**)&buf, (char*)value, key.len); + } } taosMemoryFree(dst); From 514108c3bfa5f85f9ea94bf7e45d7455bf7df0cc Mon Sep 17 00:00:00 2001 From: zhiyong Date: Sun, 7 Jul 2024 16:33:20 +0800 Subject: [PATCH 65/92] test: move TS_5105 to queryBugs --- tests/army/query/queryBugs.py | 32 +++++++++++++-- tests/army/query/query_last_row_repeatly.py | 44 --------------------- tests/parallel_test/cases.task | 1 - 3 files changed, 28 insertions(+), 49 deletions(-) delete mode 100644 tests/army/query/query_last_row_repeatly.py diff --git a/tests/army/query/queryBugs.py b/tests/army/query/queryBugs.py index ca28ff549c..7583382290 100644 --- 
a/tests/army/query/queryBugs.py +++ b/tests/army/query/queryBugs.py @@ -28,8 +28,7 @@ from frame import * class TDTestCase(TBase): - - # fix + # fix def FIX_TD_30686(self): tdLog.info("check bug TD_30686 ...\n") sqls = [ @@ -49,6 +48,32 @@ class TDTestCase(TBase): ] tdSql.checkDataMem(sql, results) + def FIX_TS_5105(self): + tdLog.info("check bug TS_5105 ...\n") + ts1 = "2024-07-03 10:00:00.000" + ts2 = "2024-07-03 13:00:00.000" + sqls = [ + "drop database if exists ts_5105", + "create database ts_5105 cachemodel 'both';", + "use ts_5105;", + "CREATE STABLE meters (ts timestamp, current float) TAGS (location binary(64), groupId int);", + "CREATE TABLE d1001 USING meters TAGS ('California.B', 2);", + "CREATE TABLE d1002 USING meters TAGS ('California.S', 3);", + f"INSERT INTO d1001 VALUES ('{ts1}', 10);", + f"INSERT INTO d1002 VALUES ('{ts2}', 13);", + ] + tdSql.executes(sqls) + + sql = "select last(ts), last_row(ts) from meters;" + + # 执行多次,有些时候last_row(ts)会返回错误的值,详见TS-5105 + for i in range(1, 10): + tdLog.debug(f"{i}th execute sql: {sql}") + tdSql.query(sql) + tdSql.checkRows(1) + tdSql.checkData(0, 0, ts2) + tdSql.checkData(0, 1, ts2) + # run def run(self): tdLog.debug(f"start to excute {__file__}") @@ -57,11 +82,10 @@ class TDTestCase(TBase): self.FIX_TD_30686() # TS BUGS - + self.FIX_TS_5105() tdLog.success(f"{__file__} successfully executed") - tdCases.addLinux(__file__, TDTestCase()) tdCases.addWindows(__file__, TDTestCase()) diff --git a/tests/army/query/query_last_row_repeatly.py b/tests/army/query/query_last_row_repeatly.py deleted file mode 100644 index 3cca032176..0000000000 --- a/tests/army/query/query_last_row_repeatly.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- - -from frame.log import * -from frame.cases import * -from frame.sql import * -from frame.caseBase import * -from frame import * - - -class TDTestCase(TBase): - def init(self, conn, logSql, replicaVar=1): - self.replicaVar = int(replicaVar) - tdLog.debug("start to execute %s" % 
__file__) - tdSql.init(conn.cursor(), logSql) - - def run(self): - sqls = [ - "drop database if exists ts_5101", - "create database ts_5101 cachemodel 'both';", - "use ts_5101;", - "CREATE STABLE meters (ts timestamp, current float) TAGS (location binary(64), groupId int);", - "CREATE TABLE d1001 USING meters TAGS ('California.B', 2);", - "CREATE TABLE d1002 USING meters TAGS ('California.S', 3);", - "INSERT INTO d1001 VALUES ('2024-07-03 10:00:00.000', 10);", - "INSERT INTO d1002 VALUES ('2024-07-03 13:00:00.000', 13);", - ] - tdSql.executes(sqls) - - # 执行多次,有些时候last_row(ts)会返回错误的值,详见TS-5105 - for i in range(1, 10): - sql = "select last(ts), last_row(ts) from meters;" - tdLog.debug(f"{i}th execute sql: {sql}") - tdSql.query(sql) - tdSql.checkRows(1) - tdSql.checkData(0, 0, "2024-07-03 13:00:00.000") - tdSql.checkData(0, 1, "2024-07-03 13:00:00.000") - - def stop(self): - tdSql.close() - tdLog.success("%s successfully executed" % __file__) - - -tdCases.addWindows(__file__, TDTestCase()) -tdCases.addLinux(__file__, TDTestCase()) diff --git a/tests/parallel_test/cases.task b/tests/parallel_test/cases.task index aff5bedaf8..5667255f9f 100644 --- a/tests/parallel_test/cases.task +++ b/tests/parallel_test/cases.task @@ -20,7 +20,6 @@ ,,y,army,./pytest.sh python3 ./test.py -f insert/test_column_tag_boundary.py ,,y,army,./pytest.sh python3 ./test.py -f query/fill/fill_desc.py -N 3 -L 3 -D 2 ,,y,army,./pytest.sh python3 ./test.py -f query/fill/fill_null.py -,,y,army,./pytest.sh python3 ./test.py -f query/query_last_row_repeatly.py ,,y,army,./pytest.sh python3 ./test.py -f cluster/incSnapshot.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f query/query_basic.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f query/accuracy/test_query_accuracy.py From ce4153b6fcb125059c5d96a5ba364a0732ba90e2 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 09:05:33 +0800 Subject: [PATCH 66/92] fix(stream): use hb to send the consens-checkpointid req. 
--- include/libs/stream/tstream.h | 18 ++++++----- source/dnode/mnode/impl/inc/mndStream.h | 3 +- source/dnode/mnode/impl/src/mndStream.c | 6 ++-- source/dnode/mnode/impl/src/mndStreamHb.c | 35 +++++++++++++++++++-- source/dnode/mnode/impl/src/mndStreamUtil.c | 15 +++++++-- source/dnode/vnode/src/tqCommon/tqCommon.c | 5 +-- source/libs/stream/src/streamCheckpoint.c | 4 ++- source/libs/stream/src/streamHb.c | 6 ++++ source/libs/stream/src/streamMeta.c | 3 +- source/libs/stream/src/streamMsg.c | 2 ++ source/libs/stream/src/streamTask.c | 1 + 11 files changed, 76 insertions(+), 22 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index d0feebf814..5c61265c01 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -289,6 +289,7 @@ typedef struct SStreamStatus { bool appendTranstateBlock; // has append the transfer state data block already bool removeBackendFiles; // remove backend files on disk when free stream tasks bool sendConsensusChkptId; + bool requireConsensusChkptId; } SStreamStatus; typedef struct SDataRange { @@ -568,14 +569,15 @@ typedef struct { } SStreamScanHistoryReq; typedef struct STaskCkptInfo { - int64_t latestId; // saved checkpoint id - int64_t latestVer; // saved checkpoint ver - int64_t latestTime; // latest checkpoint time - int64_t latestSize; // latest checkpoint size - int8_t remoteBackup; // latest checkpoint backup done - int64_t activeId; // current active checkpoint id - int32_t activeTransId; // checkpoint trans id - int8_t failed; // denote if the checkpoint is failed or not + int64_t latestId; // saved checkpoint id + int64_t latestVer; // saved checkpoint ver + int64_t latestTime; // latest checkpoint time + int64_t latestSize; // latest checkpoint size + int8_t remoteBackup; // latest checkpoint backup done + int64_t activeId; // current active checkpoint id + int32_t activeTransId; // checkpoint trans id + int8_t failed; // denote if the checkpoint is failed or not + 
int8_t consensusChkptId; // required the consensus-checkpointId } STaskCkptInfo; typedef struct STaskStatusEntry { diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index 7b4270462f..0b6b6a9ef2 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -147,8 +147,7 @@ int32_t removeExpiredNodeEntryAndTaskInBuf(SArray *pNodeSnapshot); void removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode); SCheckpointConsensusInfo *mndGetConsensusInfo(SHashObj *pHash, int64_t streamId, int32_t numOfTasks); -void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo, - SRpcHandleInfo *pRpcInfo); +void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo); void mndClearConsensusRspEntry(SCheckpointConsensusInfo *pInfo); int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId); diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 00909cb36d..cd99714395 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2718,7 +2718,7 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { mDebug("not all(%d/%d) task(s) send hbMsg yet, wait for a while and check again, s-task:0x%x", req.taskId, num, numOfTasks); SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); - mndAddConsensusTasks(pInfo, &req, &pMsg->info); + mndAddConsensusTasks(pInfo, &req); taosThreadMutexUnlock(&execInfo.lock); doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); @@ -2737,7 +2737,7 @@ static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { // wait for 5s and check again SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); - mndAddConsensusTasks(pInfo, &req, 
&pMsg->info); + mndAddConsensusTasks(pInfo, &req); if (pStream != NULL) { mndReleaseStream(pMnode, pStream); @@ -2789,7 +2789,7 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { taosArrayPush(pList, &pe->req.taskId); streamId = pe->req.streamId; } else { - mDebug("s-task:0x%x sendTs:%" PRId64 " wait %2.fs already, wait for next round to check", pe->req.taskId, + mDebug("s-task:0x%x sendTs:%" PRId64 " wait %.2fs already, wait for next round to check", pe->req.taskId, (now - pe->ts) / 1000.0, pe->ts); } } diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index c7f97b4a62..8a374c99ef 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -246,7 +246,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { } tDecoderClear(&decoder); - mTrace("receive stream-meta hb from vgId:%d, active numOfTasks:%d, msgId:%d", req.vgId, req.numOfTasks, req.msgId); + mDebug("receive stream-meta hb from vgId:%d, active numOfTasks:%d, msgId:%d", req.vgId, req.numOfTasks, req.msgId); pFailedChkpt = taosArrayInit(4, sizeof(SFailedCheckpointInfo)); pOrphanTasks = taosArrayInit(4, sizeof(SOrphanTask)); @@ -284,6 +284,23 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { continue; } + STaskCkptInfo *pChkInfo = &p->checkpointInfo; + if (pChkInfo->consensusChkptId != 0) { + SRestoreCheckpointInfo cp = { + .streamId = p->id.streamId, + .taskId = p->id.taskId, + .checkpointId = p->checkpointInfo.latestId, + .startTs = pTaskEntry->startTime, + }; + + SStreamObj *pStream = mndGetStreamObj(pMnode, p->id.streamId); + int32_t numOfTasks = mndGetNumOfStreamTasks(pStream); + + SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, p->id.streamId, numOfTasks); + mndAddConsensusTasks(pInfo, &cp); + mndReleaseStream(pMnode, pStream); + } + if (pTaskEntry->stage != p->stage && pTaskEntry->stage != -1) { updateStageInfo(pTaskEntry, p->stage); if (pTaskEntry->nodeId == SNODE_HANDLE) { @@ -292,7 
+309,6 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { } else { streamTaskStatusCopy(pTaskEntry, p); - STaskCkptInfo *pChkInfo = &p->checkpointInfo; if ((pChkInfo->activeId != 0) && pChkInfo->failed) { mError("stream task:0x%" PRIx64 " checkpointId:%" PRIx64 " transId:%d failed, kill it", p->id.taskId, pChkInfo->activeId, pChkInfo->activeTransId); @@ -304,6 +320,21 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { // remove failed trans from pChkptStreams taosHashRemove(execInfo.pChkptStreams, &p->id.streamId, sizeof(p->id.streamId)); } + +/* if (pChkInfo->consensusChkptId != 0) { + SRestoreCheckpointInfo cp = { + .streamId = p->id.streamId, + .taskId = p->id.taskId, + .checkpointId = p->checkpointInfo.latestId, + .startTs = pTaskEntry->startTime, + }; + + SStreamObj* pStream = mndGetStreamObj(pMnode, p->id.streamId); + int32_t numOfTasks = mndGetNumOfStreamTasks(pStream); + SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, p->id.streamId, numOfTasks); + mndAddConsensusTasks(pInfo, &cp, NULL); + mndReleaseStream(pMnode, pStream); + }*/ } if (p->status == pTaskEntry->status) { diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 7d9b9e4571..c4797957c2 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -953,12 +953,21 @@ SCheckpointConsensusInfo* mndGetConsensusInfo(SHashObj* pHash, int64_t streamId, // no matter existed or not, add the request into info list anyway, since we need to send rsp mannually // discard the msg may lead to the lost of connections. 
-void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo, SRpcHandleInfo* pRpcInfo) { - SCheckpointConsensusEntry info = {.ts = taosGetTimestampMs(), .rspInfo = *pRpcInfo}; +void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo) { + SCheckpointConsensusEntry info = {.ts = taosGetTimestampMs()}; memcpy(&info.req, pRestoreInfo, sizeof(info.req)); - taosArrayPush(pInfo->pTaskList, &info); + for (int32_t i = 0; i < taosArrayGetSize(pInfo->pTaskList); ++i) { + SCheckpointConsensusEntry *p = taosArrayGet(pInfo->pTaskList, i); + if (p->req.taskId == info.req.taskId) { + mDebug("s-task:0x%x already in consensus-checkpointId list for stream:0x%" PRIx64 + ", ignore this, total existed:%d", + pRestoreInfo->taskId, pRestoreInfo->streamId, (int32_t)taosArrayGetSize(pInfo->pTaskList)); + return; + } + } + taosArrayPush(pInfo->pTaskList, &info); int32_t num = taosArrayGetSize(pInfo->pTaskList); mDebug("s-task:0x%x added into consensus-checkpointId list, stream:0x%" PRIx64 " total waiting:%d", pRestoreInfo->taskId, pRestoreInfo->streamId, num); diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index cb480d09bb..dc55acbb5c 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -742,6 +742,7 @@ static int32_t restartStreamTasks(SStreamMeta* pMeta, bool isLeader) { streamMetaStartAllTasks(pMeta); } else { streamMetaResetStartInfo(&pMeta->startInfo, pMeta->vgId); + pMeta->startInfo.restartCount = 0; streamMetaWUnLock(pMeta); tqInfo("vgId:%d, follower node not start stream tasks or stream is disabled", vgId); } @@ -1160,7 +1161,7 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { streamMetaAddFailedTask(pMeta, req.streamId, req.taskId); return TSDB_CODE_SUCCESS; } - +#if 0 // discard the rsp, since it is expired. 
if (req.startTs < pTask->execInfo.created) { tqWarn("s-task:%s vgId:%d create time:%" PRId64 " recv expired consensus checkpointId:%" PRId64 @@ -1170,7 +1171,7 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { streamMetaReleaseTask(pMeta, pTask); return TSDB_CODE_SUCCESS; } - +#endif tqDebug("s-task:%s vgId:%d checkpointId:%" PRId64 " restore to consensus-checkpointId:%" PRId64 " from mnode", pTask->id.idStr, vgId, pTask->chkInfo.checkpointId, req.checkpointId); diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 5cd084e6a2..cdb5bf0b50 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -1121,7 +1121,8 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { taosThreadMutexUnlock(&pTask->lock); ASSERT(pTask->pBackend == NULL); - + pTask->status.requireConsensusChkptId = true; +#if 0 SRestoreCheckpointInfo req = { .streamId = pTask->id.streamId, .taskId = pTask->id.taskId, @@ -1158,6 +1159,7 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { pInfo->checkpointId); tmsgSendReq(&pTask->info.mnodeEpset, &msg); +#endif return 0; } diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index 691ec44672..6b1a1aca92 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -194,6 +194,12 @@ int32_t streamMetaSendHbHelper(SStreamMeta* pMeta) { } } + if ((*pTask)->status.requireConsensusChkptId) { + entry.checkpointInfo.consensusChkptId = 1; + (*pTask)->status.requireConsensusChkptId = false; + stDebug("s-task:%s vgId:%d set the require consensus-checkpointId in hbMsg", (*pTask)->id.idStr, pMeta->vgId); + } + if ((*pTask)->exec.pWalReader != NULL) { entry.processedVer = walReaderGetCurrentVer((*pTask)->exec.pWalReader) - 1; if (entry.processedVer < 0) { diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 
19cb2f7854..e7fdb7ae2a 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1031,10 +1031,11 @@ void streamMetaResetStartInfo(STaskStartInfo* pStartInfo, int32_t vgId) { taosHashClear(pStartInfo->pFailedTaskSet); pStartInfo->tasksWillRestart = 0; pStartInfo->readyTs = 0; + pStartInfo->elapsedTime = 0; // reset the sentinel flag value to be 0 pStartInfo->startAllTasks = 0; - stDebug("vgId:%d clear all start-all-task info", vgId); + stDebug("vgId:%d clear start-all-task info", vgId); } void streamMetaRLock(SStreamMeta* pMeta) { diff --git a/source/libs/stream/src/streamMsg.c b/source/libs/stream/src/streamMsg.c index f02f661143..f10296f6ff 100644 --- a/source/libs/stream/src/streamMsg.c +++ b/source/libs/stream/src/streamMsg.c @@ -349,6 +349,7 @@ int32_t tEncodeStreamHbMsg(SEncoder* pEncoder, const SStreamHbMsg* pReq) { if (tEncodeI64(pEncoder, ps->checkpointInfo.latestTime) < 0) return -1; if (tEncodeI64(pEncoder, ps->checkpointInfo.latestSize) < 0) return -1; if (tEncodeI8(pEncoder, ps->checkpointInfo.remoteBackup) < 0) return -1; + if (tEncodeI8(pEncoder, ps->checkpointInfo.consensusChkptId) < 0) return -1; if (tEncodeI64(pEncoder, ps->startTime) < 0) return -1; if (tEncodeI64(pEncoder, ps->startCheckpointId) < 0) return -1; if (tEncodeI64(pEncoder, ps->startCheckpointVer) < 0) return -1; @@ -403,6 +404,7 @@ int32_t tDecodeStreamHbMsg(SDecoder* pDecoder, SStreamHbMsg* pReq) { if (tDecodeI64(pDecoder, &entry.checkpointInfo.latestTime) < 0) return -1; if (tDecodeI64(pDecoder, &entry.checkpointInfo.latestSize) < 0) return -1; if (tDecodeI8(pDecoder, &entry.checkpointInfo.remoteBackup) < 0) return -1; + if (tDecodeI8(pDecoder, &entry.checkpointInfo.consensusChkptId) < 0) return -1; if (tDecodeI64(pDecoder, &entry.startTime) < 0) return -1; if (tDecodeI64(pDecoder, &entry.startCheckpointId) < 0) return -1; if (tDecodeI64(pDecoder, &entry.startCheckpointVer) < 0) return -1; diff --git a/source/libs/stream/src/streamTask.c 
b/source/libs/stream/src/streamTask.c index 1decfe198a..5506ed2d45 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -848,6 +848,7 @@ STaskStatusEntry streamTaskGetStatusEntry(SStreamTask* pTask) { .checkpointInfo.latestTime = pTask->chkInfo.checkpointTime, .checkpointInfo.latestSize = 0, .checkpointInfo.remoteBackup = 0, + .checkpointInfo.consensusChkptId = 0, .hTaskId = pTask->hTaskInfo.id.taskId, .procsTotal = SIZE_IN_MiB(pExecInfo->inputDataSize), .outputTotal = SIZE_IN_MiB(pExecInfo->outputDataSize), From df12b725493fc7e08e406887ef59dc522dc9450e Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Mon, 8 Jul 2024 01:37:47 +0000 Subject: [PATCH 67/92] fix invalid read --- source/libs/scheduler/test/schedulerTests.cpp | 134 +++++++++--------- source/libs/transport/src/transCli.c | 2 - 2 files changed, 66 insertions(+), 70 deletions(-) diff --git a/source/libs/scheduler/test/schedulerTests.cpp b/source/libs/scheduler/test/schedulerTests.cpp index 6d2215264e..78e876f82c 100644 --- a/source/libs/scheduler/test/schedulerTests.cpp +++ b/source/libs/scheduler/test/schedulerTests.cpp @@ -36,9 +36,9 @@ #include "tdatablock.h" #include "tdef.h" #include "tglobal.h" +#include "tmisce.h" #include "trpc.h" #include "tvariant.h" -#include "tmisce.h" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wwrite-strings" @@ -54,7 +54,8 @@ namespace { -extern "C" int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDataBuf *pMsg, int32_t rspCode); +extern "C" int32_t schHandleResponseMsg(SSchJob *pJob, SSchTask *pTask, int32_t execId, SDataBuf *pMsg, + int32_t rspCode); extern "C" int32_t schHandleCallback(void *param, const SDataBuf *pMsg, int32_t rspCode); int64_t insertJobRefId = 0; @@ -74,8 +75,9 @@ int32_t schtStartFetch = 0; void schtInitLogFile() { const char *defaultLogFileNamePrefix = "taoslog"; const int32_t maxLogFileNum = 10; - + rpcInit(); tsAsyncLog = 0; + rpcInit(); qDebugFlag = 159; 
strcpy(tsLogDir, TD_LOG_DIR_PATH); @@ -84,12 +86,10 @@ void schtInitLogFile() { } } -void schtQueryCb(SExecResult *pResult, void *param, int32_t code) { - *(int32_t *)param = 1; -} +void schtQueryCb(SExecResult *pResult, void *param, int32_t code) { *(int32_t *)param = 1; } -int32_t schtBuildQueryRspMsg(uint32_t *msize, void** rspMsg) { - SQueryTableRsp rsp = {0}; +int32_t schtBuildQueryRspMsg(uint32_t *msize, void **rspMsg) { + SQueryTableRsp rsp = {0}; rsp.code = 0; rsp.affectedRows = 0; rsp.tbVerInfo = NULL; @@ -99,7 +99,7 @@ int32_t schtBuildQueryRspMsg(uint32_t *msize, void** rspMsg) { qError("tSerializeSQueryTableRsp failed"); return TSDB_CODE_OUT_OF_MEMORY; } - + void *pRsp = taosMemoryCalloc(msgSize, 1); if (NULL == pRsp) { qError("rpcMallocCont %d failed", msgSize); @@ -117,9 +117,8 @@ int32_t schtBuildQueryRspMsg(uint32_t *msize, void** rspMsg) { return TSDB_CODE_SUCCESS; } - -int32_t schtBuildFetchRspMsg(uint32_t *msize, void** rspMsg) { - SRetrieveTableRsp* rsp = (SRetrieveTableRsp*)taosMemoryCalloc(sizeof(SRetrieveTableRsp), 1); +int32_t schtBuildFetchRspMsg(uint32_t *msize, void **rspMsg) { + SRetrieveTableRsp *rsp = (SRetrieveTableRsp *)taosMemoryCalloc(sizeof(SRetrieveTableRsp), 1); rsp->completed = 1; rsp->numOfRows = 10; rsp->compLen = 0; @@ -130,14 +129,14 @@ int32_t schtBuildFetchRspMsg(uint32_t *msize, void** rspMsg) { return TSDB_CODE_SUCCESS; } -int32_t schtBuildSubmitRspMsg(uint32_t *msize, void** rspMsg) { +int32_t schtBuildSubmitRspMsg(uint32_t *msize, void **rspMsg) { SSubmitRsp2 submitRsp = {0}; - int32_t msgSize = 0, ret = 0; - SEncoder ec = {0}; - + int32_t msgSize = 0, ret = 0; + SEncoder ec = {0}; + tEncodeSize(tEncodeSSubmitRsp2, &submitRsp, msgSize, ret); - void* msg = taosMemoryCalloc(1, msgSize); - tEncoderInit(&ec, (uint8_t*)msg, msgSize); + void *msg = taosMemoryCalloc(1, msgSize); + tEncoderInit(&ec, (uint8_t *)msg, msgSize); tEncodeSSubmitRsp2(&ec, &submitRsp); tEncoderClear(&ec); @@ -147,7 +146,6 @@ int32_t 
schtBuildSubmitRspMsg(uint32_t *msize, void** rspMsg) { return TSDB_CODE_SUCCESS; } - void schtBuildQueryDag(SQueryPlan *dag) { uint64_t qId = schtQueryId; @@ -157,8 +155,8 @@ void schtBuildQueryDag(SQueryPlan *dag) { SNodeListNode *scan = (SNodeListNode *)nodesMakeNode(QUERY_NODE_NODE_LIST); SNodeListNode *merge = (SNodeListNode *)nodesMakeNode(QUERY_NODE_NODE_LIST); - SSubplan *scanPlan = (SSubplan*)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); - SSubplan *mergePlan = (SSubplan*)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); + SSubplan *scanPlan = (SSubplan *)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); + SSubplan *mergePlan = (SSubplan *)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); scanPlan->id.queryId = qId; scanPlan->id.groupId = 0x0000000000000002; @@ -210,7 +208,7 @@ void schtBuildQueryFlowCtrlDag(SQueryPlan *dag) { SNodeListNode *scan = (SNodeListNode *)nodesMakeNode(QUERY_NODE_NODE_LIST); SNodeListNode *merge = (SNodeListNode *)nodesMakeNode(QUERY_NODE_NODE_LIST); - SSubplan *mergePlan = (SSubplan*)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); + SSubplan *mergePlan = (SSubplan *)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); merge->pNodeList = nodesMakeList(); scan->pNodeList = nodesMakeList(); @@ -218,7 +216,7 @@ void schtBuildQueryFlowCtrlDag(SQueryPlan *dag) { mergePlan->pChildren = nodesMakeList(); for (int32_t i = 0; i < scanPlanNum; ++i) { - SSubplan *scanPlan = (SSubplan*)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); + SSubplan *scanPlan = (SSubplan *)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); scanPlan->id.queryId = qId; scanPlan->id.groupId = 0x0000000000000002; scanPlan->id.subplanId = 0x0000000000000003 + i; @@ -272,7 +270,7 @@ void schtBuildInsertDag(SQueryPlan *dag) { SNodeListNode *inserta = (SNodeListNode *)nodesMakeNode(QUERY_NODE_NODE_LIST); inserta->pNodeList = nodesMakeList(); - SSubplan *insertPlan = (SSubplan*)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); + SSubplan *insertPlan = (SSubplan *)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); 
insertPlan->id.queryId = qId; insertPlan->id.groupId = 0x0000000000000003; @@ -287,14 +285,14 @@ void schtBuildInsertDag(SQueryPlan *dag) { insertPlan->pChildren = NULL; insertPlan->pParents = NULL; insertPlan->pNode = NULL; - insertPlan->pDataSink = (SDataSinkNode*)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN_INSERT); - ((SDataInserterNode*)insertPlan->pDataSink)->size = 1; - ((SDataInserterNode*)insertPlan->pDataSink)->pData = taosMemoryCalloc(1, 1); + insertPlan->pDataSink = (SDataSinkNode *)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN_INSERT); + ((SDataInserterNode *)insertPlan->pDataSink)->size = 1; + ((SDataInserterNode *)insertPlan->pDataSink)->pData = taosMemoryCalloc(1, 1); insertPlan->msgType = TDMT_VND_SUBMIT; nodesListAppend(inserta->pNodeList, (SNode *)insertPlan); - insertPlan = (SSubplan*)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); + insertPlan = (SSubplan *)nodesMakeNode(QUERY_NODE_PHYSICAL_SUBPLAN); insertPlan->id.queryId = qId; insertPlan->id.groupId = 0x0000000000000003; @@ -309,9 +307,9 @@ void schtBuildInsertDag(SQueryPlan *dag) { insertPlan->pChildren = NULL; insertPlan->pParents = NULL; insertPlan->pNode = NULL; - insertPlan->pDataSink = (SDataSinkNode*)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN_INSERT); - ((SDataInserterNode*)insertPlan->pDataSink)->size = 1; - ((SDataInserterNode*)insertPlan->pDataSink)->pData = taosMemoryCalloc(1, 1); + insertPlan->pDataSink = (SDataSinkNode *)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN_INSERT); + ((SDataInserterNode *)insertPlan->pDataSink)->size = 1; + ((SDataInserterNode *)insertPlan->pDataSink)->pData = taosMemoryCalloc(1, 1); insertPlan->msgType = TDMT_VND_SUBMIT; nodesListAppend(inserta->pNodeList, (SNode *)insertPlan); @@ -389,7 +387,8 @@ void schtSetRpcSendRequest() { } } -int32_t schtAsyncSendMsgToServer(void *pTransporter, SEpSet *epSet, int64_t *pTransporterId, SMsgSendInfo *pInfo, bool persistHandle, void* rpcCtx) { +int32_t schtAsyncSendMsgToServer(void *pTransporter, SEpSet *epSet, int64_t *pTransporterId, 
SMsgSendInfo *pInfo, + bool persistHandle, void *rpcCtx) { if (pInfo) { taosMemoryFreeClear(pInfo->param); taosMemoryFreeClear(pInfo->msgInfo.pData); @@ -439,11 +438,11 @@ void *schtSendRsp(void *param) { SSchTask *task = *(SSchTask **)pIter; SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildSubmitRspMsg(&msg.len, &rmsg); msg.msgType = TDMT_VND_SUBMIT_RSP; msg.pData = rmsg; - + schHandleResponseMsg(pJob, task, task->execId, &msg, 0); pIter = taosHashIterate(pJob->execTasks, pIter); @@ -452,7 +451,7 @@ void *schtSendRsp(void *param) { schReleaseJob(job); schtJobDone = true; - + return NULL; } @@ -462,13 +461,13 @@ void *schtCreateFetchRspThread(void *param) { taosSsleep(1); - int32_t code = 0; + int32_t code = 0; SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildFetchRspMsg(&msg.len, &rmsg); msg.msgType = TDMT_SCH_MERGE_FETCH_RSP; msg.pData = rmsg; - + code = schHandleResponseMsg(pJob, pJob->fetchTask, pJob->fetchTask->execId, &msg, 0); schReleaseJob(job); @@ -529,7 +528,7 @@ void *schtRunJobThread(void *aa) { char *dbname = "1.db1"; char *tablename = "table1"; SVgroupInfo vgInfo = {0}; - SQueryPlan* dag = (SQueryPlan*)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); + SQueryPlan *dag = (SQueryPlan *)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); schtInitLogFile(); @@ -601,7 +600,7 @@ void *schtRunJobThread(void *aa) { param->taskId = task->taskId; SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildQueryRspMsg(&msg.len, &rmsg); msg.msgType = TDMT_SCH_QUERY_RSP; msg.pData = rmsg; @@ -622,7 +621,7 @@ void *schtRunJobThread(void *aa) { param->taskId = task->taskId - 1; SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildQueryRspMsg(&msg.len, &rmsg); msg.msgType = TDMT_SCH_QUERY_RSP; msg.pData = rmsg; @@ -686,7 +685,6 @@ void *schtFreeJobThread(void *aa) { return NULL; } - } // namespace TEST(queryTest, normalCase) { @@ -696,7 +694,7 @@ TEST(queryTest, normalCase) { char *tablename = "table1"; 
SVgroupInfo vgInfo = {0}; int64_t job = 0; - SQueryPlan* dag = (SQueryPlan*)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); + SQueryPlan *dag = (SQueryPlan *)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); SArray *qnodeList = taosArrayInit(1, sizeof(SQueryNodeLoad)); @@ -737,13 +735,13 @@ TEST(queryTest, normalCase) { SSchTask *task = *(SSchTask **)pIter; SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildQueryRspMsg(&msg.len, &rmsg); msg.msgType = TDMT_SCH_QUERY_RSP; msg.pData = rmsg; - + code = schHandleResponseMsg(pJob, task, task->execId, &msg, 0); - + ASSERT_EQ(code, 0); pIter = taosHashIterate(pJob->execTasks, pIter); } @@ -753,13 +751,13 @@ TEST(queryTest, normalCase) { SSchTask *task = *(SSchTask **)pIter; if (JOB_TASK_STATUS_EXEC == task->status) { SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildQueryRspMsg(&msg.len, &rmsg); msg.msgType = TDMT_SCH_QUERY_RSP; msg.pData = rmsg; - + code = schHandleResponseMsg(pJob, task, task->execId, &msg, 0); - + ASSERT_EQ(code, 0); } @@ -793,7 +791,7 @@ TEST(queryTest, normalCase) { taosMemoryFreeClear(data); schReleaseJob(job); - + schedulerDestroy(); schedulerFreeJob(&job, 0); @@ -808,7 +806,7 @@ TEST(queryTest, readyFirstCase) { char *tablename = "table1"; SVgroupInfo vgInfo = {0}; int64_t job = 0; - SQueryPlan* dag = (SQueryPlan*)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); + SQueryPlan *dag = (SQueryPlan *)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); SArray *qnodeList = taosArrayInit(1, sizeof(SQueryNodeLoad)); @@ -816,7 +814,7 @@ TEST(queryTest, readyFirstCase) { load.addr.epSet.numOfEps = 1; strcpy(load.addr.epSet.eps[0].fqdn, "qnode0.ep"); load.addr.epSet.eps[0].port = 6031; - taosArrayPush(qnodeList, &load); + taosArrayPush(qnodeList, &load); int32_t code = schedulerInit(); ASSERT_EQ(code, 0); @@ -848,11 +846,11 @@ TEST(queryTest, readyFirstCase) { SSchTask *task = *(SSchTask **)pIter; SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildQueryRspMsg(&msg.len, &rmsg); 
msg.msgType = TDMT_SCH_QUERY_RSP; msg.pData = rmsg; - + code = schHandleResponseMsg(pJob, task, task->execId, &msg, 0); ASSERT_EQ(code, 0); @@ -865,13 +863,13 @@ TEST(queryTest, readyFirstCase) { if (JOB_TASK_STATUS_EXEC == task->status) { SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildQueryRspMsg(&msg.len, &rmsg); msg.msgType = TDMT_SCH_QUERY_RSP; msg.pData = rmsg; - + code = schHandleResponseMsg(pJob, task, task->execId, &msg, 0); - + ASSERT_EQ(code, 0); } @@ -919,7 +917,7 @@ TEST(queryTest, flowCtrlCase) { char *tablename = "table1"; SVgroupInfo vgInfo = {0}; int64_t job = 0; - SQueryPlan* dag = (SQueryPlan*)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); + SQueryPlan *dag = (SQueryPlan *)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); schtInitLogFile(); @@ -933,7 +931,6 @@ TEST(queryTest, flowCtrlCase) { load.addr.epSet.eps[0].port = 6031; taosArrayPush(qnodeList, &load); - int32_t code = schedulerInit(); ASSERT_EQ(code, 0); @@ -968,13 +965,13 @@ TEST(queryTest, flowCtrlCase) { if (JOB_TASK_STATUS_EXEC == task->status && 0 != task->lastMsgType) { SDataBuf msg = {0}; - void* rmsg = NULL; + void *rmsg = NULL; schtBuildQueryRspMsg(&msg.len, &rmsg); msg.msgType = TDMT_SCH_QUERY_RSP; msg.pData = rmsg; - + code = schHandleResponseMsg(pJob, task, task->execId, &msg, 0); - + ASSERT_EQ(code, 0); } @@ -1005,7 +1002,7 @@ TEST(queryTest, flowCtrlCase) { schedulerFreeJob(&job, 0); - taosThreadJoin(thread1, NULL); + taosThreadJoin(thread1, NULL); } TEST(insertTest, normalCase) { @@ -1014,7 +1011,7 @@ TEST(insertTest, normalCase) { char *dbname = "1.db1"; char *tablename = "table1"; SVgroupInfo vgInfo = {0}; - SQueryPlan* dag = (SQueryPlan*)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); + SQueryPlan *dag = (SQueryPlan *)nodesMakeNode(QUERY_NODE_PHYSICAL_PLAN); uint64_t numOfRows = 0; SArray *qnodeList = taosArrayInit(1, sizeof(SQueryNodeLoad)); @@ -1067,7 +1064,7 @@ TEST(insertTest, normalCase) { schedulerDestroy(); - taosThreadJoin(thread1, NULL); + 
taosThreadJoin(thread1, NULL); } TEST(multiThread, forceFree) { @@ -1076,7 +1073,7 @@ TEST(multiThread, forceFree) { TdThread thread1, thread2, thread3; taosThreadCreate(&(thread1), &thattr, schtRunJobThread, NULL); -// taosThreadCreate(&(thread2), &thattr, schtFreeJobThread, NULL); + // taosThreadCreate(&(thread2), &thattr, schtFreeJobThread, NULL); taosThreadCreate(&(thread3), &thattr, schtFetchRspThread, NULL); while (true) { @@ -1089,7 +1086,7 @@ TEST(multiThread, forceFree) { } schtTestStop = true; - //taosSsleep(3); + // taosSsleep(3); } TEST(otherTest, otherCase) { @@ -1097,12 +1094,13 @@ TEST(otherTest, otherCase) { schReleaseJob(0); schFreeRpcCtx(NULL); - ASSERT_EQ(schDumpEpSet(NULL), (char*)NULL); + ASSERT_EQ(schDumpEpSet(NULL), (char *)NULL); ASSERT_EQ(strcmp(schGetOpStr(SCH_OP_NULL), "NULL"), 0); ASSERT_EQ(strcmp(schGetOpStr((SCH_OP_TYPE)100), "UNKNOWN"), 0); } int main(int argc, char **argv) { + schtInitLogFile(); taosSeedRand(taosGetTimestampSec()); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 25e0248095..9818394a2a 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -2832,8 +2832,6 @@ int transSetDefaultAddr(void* shandle, const char* ip, const char* fqdn) { int64_t transAllocHandle() { SExHandle* exh = taosMemoryCalloc(1, sizeof(SExHandle)); - QUEUE_INIT(&exh->q); - taosInitRWLatch(&exh->latch); exh->refId = transAddExHandle(transGetRefMgt(), exh); SExHandle* self = transAcquireExHandle(transGetRefMgt(), exh->refId); From 647f9f47ef6602509d5330391965cf9b532cd275 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 10:32:23 +0800 Subject: [PATCH 68/92] refactor: do some internal refactor. 
--- include/common/tmsgdef.h | 1 - include/libs/stream/streamMsg.h | 1 - source/dnode/mgmt/mgmt_mnode/src/mmHandle.c | 1 - source/dnode/mgmt/mgmt_snode/src/smHandle.c | 1 - source/dnode/mgmt/mgmt_vnode/src/vmHandle.c | 1 - source/dnode/mnode/impl/src/mndStream.c | 1 - source/dnode/mnode/impl/src/mndStreamUtil.c | 2 +- source/dnode/snode/src/snode.c | 2 -- source/dnode/vnode/src/vnd/vnodeSvr.c | 2 -- 9 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/common/tmsgdef.h b/include/common/tmsgdef.h index 7621615278..3515df3127 100644 --- a/include/common/tmsgdef.h +++ b/include/common/tmsgdef.h @@ -250,7 +250,6 @@ TD_DEF_MSG_TYPE(TDMT_MND_DROP_TB_WITH_TSMA, "drop-tb-with-tsma", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_STREAM_UPDATE_CHKPT_EVT, "stream-update-chkpt-evt", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_STREAM_CHKPT_REPORT, "stream-chkpt-report", NULL, NULL) - TD_DEF_MSG_TYPE(TDMT_MND_STREAM_REQ_CONSEN_CHKPT, "stream-req-consen-chkpt", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_STREAM_CONSEN_TIMER, "stream-consen-tmr", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_MND_MAX_MSG, "mnd-max", NULL, NULL) TD_CLOSE_MSG_SEG(TDMT_END_MND_MSG) diff --git a/include/libs/stream/streamMsg.h b/include/libs/stream/streamMsg.h index bdb8ff7f8e..b253054dfe 100644 --- a/include/libs/stream/streamMsg.h +++ b/include/libs/stream/streamMsg.h @@ -232,7 +232,6 @@ typedef struct { typedef struct SCheckpointConsensusEntry { SRestoreCheckpointInfo req; - SRpcHandleInfo rspInfo; int64_t ts; } SCheckpointConsensusEntry; diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c index f3763ef0c5..9b987b3237 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c @@ -243,7 +243,6 @@ SArray *mmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_VND_STREAM_TASK_RESET_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_HEARTBEAT, mmPutMsgToReadQueue, 0) == NULL) goto 
_OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_REPORT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CONSEN_CHKPT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CHKPT, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_VND_KILL_COMPACT_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; diff --git a/source/dnode/mgmt/mgmt_snode/src/smHandle.c b/source/dnode/mgmt/mgmt_snode/src/smHandle.c index 3d0587a11b..5c2f54fd10 100644 --- a/source/dnode/mgmt/mgmt_snode/src/smHandle.c +++ b/source/dnode/mgmt/mgmt_snode/src/smHandle.c @@ -97,7 +97,6 @@ SArray *smGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_HEARTBEAT_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CHKPT_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_REPORT_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CONSEN_CHKPT_RSP, smPutNodeMsgToStreamQueue, 1) == NULL) goto _OVER; code = 0; _OVER: diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c index 7d35fd71b7..fbe1925e3f 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c @@ -972,7 +972,6 @@ SArray *vmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_HEARTBEAT_RSP, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CHKPT_RSP, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_CHKPT_REPORT_RSP, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_MND_STREAM_REQ_CONSEN_CHKPT_RSP, vmPutMsgToStreamQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_VND_GET_STREAM_PROGRESS, vmPutMsgToStreamQueue, 0) == NULL) goto 
_OVER; if (dmSetMgmtHandle(pArray, TDMT_STREAM_TASK_UPDATE_CHKPT, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER; diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index cd99714395..4faa8bdb58 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -121,7 +121,6 @@ int32_t mndInitStream(SMnode *pMnode) { mndSetMsgHandle(pMnode, TDMT_MND_STREAM_BEGIN_CHECKPOINT, mndProcessStreamCheckpoint); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_REQ_CHKPT, mndProcessStreamReqCheckpoint); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_CHKPT_REPORT, mndProcessCheckpointReport); - mndSetMsgHandle(pMnode, TDMT_MND_STREAM_REQ_CONSEN_CHKPT, mndProcessConsensusCheckpointId); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_UPDATE_CHKPT_EVT, mndScanCheckpointReportInfo); mndSetMsgHandle(pMnode, TDMT_STREAM_TASK_REPORT_CHECKPOINT, mndTransProcessRsp); mndSetMsgHandle(pMnode, TDMT_MND_STREAM_HEARTBEAT, mndProcessStreamHb); diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index c4797957c2..430bfcc3a2 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -103,7 +103,7 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { } else { if (replica != pVgroup->replica) { mInfo("vgId:%d replica:%d inconsistent with other vgroups replica:%d, not ready for stream operations", - pVgroup->vgId); + pVgroup->vgId, pVgroup->replica, replica); *allReady = false; break; } diff --git a/source/dnode/snode/src/snode.c b/source/dnode/snode/src/snode.c index 60e57e8a2f..cfa24b2430 100644 --- a/source/dnode/snode/src/snode.c +++ b/source/dnode/snode/src/snode.c @@ -128,8 +128,6 @@ int32_t sndProcessStreamMsg(SSnode *pSnode, SRpcMsg *pMsg) { return tqStreamTaskProcessRetrieveTriggerReq(pSnode->pMeta, pMsg); case TDMT_STREAM_RETRIEVE_TRIGGER_RSP: return tqStreamTaskProcessRetrieveTriggerRsp(pSnode->pMeta, pMsg); - case 
TDMT_MND_STREAM_REQ_CONSEN_CHKPT_RSP: - return tqStreamProcessConsensusChkptRsp2(pSnode->pMeta, pMsg); default: sndError("invalid snode msg:%d", pMsg->msgType); ASSERT(0); diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index 04839c3357..9dcf7e53c7 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -861,8 +861,6 @@ int32_t vnodeProcessStreamMsg(SVnode *pVnode, SRpcMsg *pMsg, SQueueInfo *pInfo) return tqStreamProgressRetrieveReq(pVnode->pTq, pMsg); case TDMT_MND_STREAM_CHKPT_REPORT_RSP: return tqProcessTaskChkptReportRsp(pVnode->pTq, pMsg); - case TDMT_MND_STREAM_REQ_CONSEN_CHKPT_RSP: - return tqProcessTaskConsensusChkptRsp(pVnode->pTq, pMsg); default: vError("unknown msg type:%d in stream queue", pMsg->msgType); return TSDB_CODE_APP_ERROR; From 322c0633aa0d2c07f85b9dc2e443333678f0a872 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Mon, 8 Jul 2024 10:46:14 +0800 Subject: [PATCH 69/92] fix:[TD-30915]send hb before close in tmq --- source/client/src/clientTmq.c | 65 ++++++++++++----------------------- 1 file changed, 22 insertions(+), 43 deletions(-) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index 21d1a528da..a42b0f75dd 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -27,6 +27,7 @@ #define EMPTY_BLOCK_POLL_IDLE_DURATION 10 #define DEFAULT_AUTO_COMMIT_INTERVAL 5000 #define DEFAULT_HEARTBEAT_INTERVAL 3000 +#define DEFAULT_ASKEP_INTERVAL 1000 struct SMqMgmt { int8_t inited; @@ -99,7 +100,6 @@ struct tmq_t { int64_t totalRows; // timer - tmr_h hbLiveTimer; tmr_h epTimer; tmr_h commitTimer; STscObj* pTscObj; // connection @@ -737,35 +737,33 @@ static void generateTimedTask(int64_t refId, int32_t type) { if (tmq == NULL) return; int8_t* pTaskType = taosAllocateQitem(sizeof(int8_t), DEF_QITEM, 0); - if (pTaskType == NULL) return; + if (pTaskType != NULL){ + *pTaskType = type; + if (taosWriteQitem(tmq->delayedTask, 
pTaskType) == 0){ + tsem2_post(&tmq->rspSem); + } + } - *pTaskType = type; - taosWriteQitem(tmq->delayedTask, pTaskType); - tsem2_post(&tmq->rspSem); taosReleaseRef(tmqMgmt.rsetId, refId); } void tmqAssignAskEpTask(void* param, void* tmrId) { - int64_t refId = *(int64_t*)param; + int64_t refId = (int64_t)param; generateTimedTask(refId, TMQ_DELAYED_TASK__ASK_EP); - taosMemoryFree(param); } void tmqReplayTask(void* param, void* tmrId) { - int64_t refId = *(int64_t*)param; + int64_t refId = (int64_t)param; tmq_t* tmq = taosAcquireRef(tmqMgmt.rsetId, refId); - if (tmq == NULL) goto END; + if (tmq == NULL) return; tsem2_post(&tmq->rspSem); taosReleaseRef(tmqMgmt.rsetId, refId); -END: - taosMemoryFree(param); } void tmqAssignDelayedCommitTask(void* param, void* tmrId) { - int64_t refId = *(int64_t*)param; + int64_t refId = (int64_t)param; generateTimedTask(refId, TMQ_DELAYED_TASK__COMMIT); - taosMemoryFree(param); } int32_t tmqHbCb(void* param, SDataBuf* pMsg, int32_t code) { @@ -802,11 +800,10 @@ int32_t tmqHbCb(void* param, SDataBuf* pMsg, int32_t code) { } void tmqSendHbReq(void* param, void* tmrId) { - int64_t refId = *(int64_t*)param; + int64_t refId = (int64_t)param; tmq_t* tmq = taosAcquireRef(tmqMgmt.rsetId, refId); if (tmq == NULL) { - taosMemoryFree(param); return; } @@ -880,7 +877,9 @@ void tmqSendHbReq(void* param, void* tmrId) { OVER: tDestroySMqHbReq(&req); - taosTmrReset(tmqSendHbReq, DEFAULT_HEARTBEAT_INTERVAL, param, tmqMgmt.timer, &tmq->hbLiveTimer); + if(tmrId != NULL){ + taosTmrReset(tmqSendHbReq, DEFAULT_HEARTBEAT_INTERVAL, param, tmqMgmt.timer, &tmrId); + } taosReleaseRef(tmqMgmt.rsetId, refId); } @@ -908,21 +907,14 @@ int32_t tmqHandleAllDelayedTask(tmq_t* pTmq) { if (*pTaskType == TMQ_DELAYED_TASK__ASK_EP) { askEp(pTmq, NULL, false, false); - int64_t* pRefId = taosMemoryMalloc(sizeof(int64_t)); - *pRefId = pTmq->refId; - tscDebug("consumer:0x%" PRIx64 " retrieve ep from mnode in 1s", pTmq->consumerId); - taosTmrReset(tmqAssignAskEpTask, 1000, 
pRefId, tmqMgmt.timer, &pTmq->epTimer); + taosTmrReset(tmqAssignAskEpTask, DEFAULT_ASKEP_INTERVAL, (void*)(pTmq->refId), tmqMgmt.timer, &pTmq->epTimer); } else if (*pTaskType == TMQ_DELAYED_TASK__COMMIT) { tmq_commit_cb* pCallbackFn = pTmq->commitCb ? pTmq->commitCb : defaultCommitCbFn; - asyncCommitAllOffsets(pTmq, pCallbackFn, pTmq->commitCbUserParam); - int64_t* pRefId = taosMemoryMalloc(sizeof(int64_t)); - *pRefId = pTmq->refId; - tscDebug("consumer:0x%" PRIx64 " next commit to vnode(s) in %.2fs", pTmq->consumerId, pTmq->autoCommitInterval / 1000.0); - taosTmrReset(tmqAssignDelayedCommitTask, pTmq->autoCommitInterval, pRefId, tmqMgmt.timer, &pTmq->commitTimer); + taosTmrReset(tmqAssignDelayedCommitTask, pTmq->autoCommitInterval, (void*)(pTmq->refId), tmqMgmt.timer, &pTmq->commitTimer); } else { tscError("consumer:0x%" PRIx64 " invalid task type:%d", pTmq->consumerId, *pTaskType); } @@ -1171,9 +1163,7 @@ tmq_t* tmq_consumer_new(tmq_conf_t* conf, char* errstr, int32_t errstrLen) { goto _failed; } - int64_t* pRefId = taosMemoryMalloc(sizeof(int64_t)); - *pRefId = pTmq->refId; - pTmq->hbLiveTimer = taosTmrStart(tmqSendHbReq, DEFAULT_HEARTBEAT_INTERVAL, pRefId, tmqMgmt.timer); + taosTmrStart(tmqSendHbReq, DEFAULT_HEARTBEAT_INTERVAL, (void*)pTmq->refId, tmqMgmt.timer); char buf[TSDB_OFFSET_LEN] = {0}; STqOffsetVal offset = {.type = pTmq->resetOffsetCfg}; @@ -1301,18 +1291,9 @@ int32_t tmq_subscribe(tmq_t* tmq, const tmq_list_t* topic_list) { } // init ep timer - if (tmq->epTimer == NULL) { - int64_t* pRefId1 = taosMemoryMalloc(sizeof(int64_t)); - *pRefId1 = tmq->refId; - tmq->epTimer = taosTmrStart(tmqAssignAskEpTask, 1000, pRefId1, tmqMgmt.timer); - } - + tmq->epTimer = taosTmrStart(tmqAssignAskEpTask, DEFAULT_ASKEP_INTERVAL, (void*)(tmq->refId), tmqMgmt.timer); // init auto commit timer - if (tmq->autoCommit && tmq->commitTimer == NULL) { - int64_t* pRefId2 = taosMemoryMalloc(sizeof(int64_t)); - *pRefId2 = tmq->refId; - tmq->commitTimer = 
taosTmrStart(tmqAssignDelayedCommitTask, tmq->autoCommitInterval, pRefId2, tmqMgmt.timer); - } + tmq->commitTimer = taosTmrStart(tmqAssignDelayedCommitTask, tmq->autoCommitInterval, (void*)(tmq->refId), tmqMgmt.timer); FAIL: taosArrayDestroyP(req.topicNames, taosMemoryFree); @@ -2015,9 +1996,7 @@ static void* tmqHandleAllRsp(tmq_t* tmq, int64_t timeout) { pVg->blockReceiveTs = taosGetTimestampMs(); pVg->blockSleepForReplay = pRsp->rsp.sleepTime; if (pVg->blockSleepForReplay > 0) { - int64_t* pRefId1 = taosMemoryMalloc(sizeof(int64_t)); - *pRefId1 = tmq->refId; - taosTmrStart(tmqReplayTask, pVg->blockSleepForReplay, pRefId1, tmqMgmt.timer); + taosTmrStart(tmqReplayTask, pVg->blockSleepForReplay, (void*)(tmq->refId), tmqMgmt.timer); } } tscDebug("consumer:0x%" PRIx64 " process poll rsp, vgId:%d, offset:%s, blocks:%d, rows:%" PRId64 @@ -2274,7 +2253,7 @@ int32_t tmq_consumer_close(tmq_t* tmq) { return code; } } - taosSsleep(2); // sleep 2s for hb to send offset and rows to server + tmqSendHbReq((void*)(tmq->refId), NULL); tmq_list_t* lst = tmq_list_new(); int32_t code = tmq_subscribe(tmq, lst); From 7ff7ef1d73d67eda3f1ed500d4cdb74925b59f35 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Mon, 8 Jul 2024 13:51:30 +0800 Subject: [PATCH 70/92] fix:[TD-30883]send hb before close in tmq --- tests/system-test/7-tmq/tmqParamsTest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system-test/7-tmq/tmqParamsTest.py b/tests/system-test/7-tmq/tmqParamsTest.py index 3d5fb52da5..a323dff19e 100644 --- a/tests/system-test/7-tmq/tmqParamsTest.py +++ b/tests/system-test/7-tmq/tmqParamsTest.py @@ -123,7 +123,6 @@ class TDTestCase: tmqCom.insert_data(tdSql,paraDict["dbName"],paraDict["ctbPrefix"],paraDict["ctbNum"],paraDict["rowsPerTbl"],paraDict["batchNum"],int(round(time.time()*1000))) stop_flag = 1 finally: - time.sleep(5) #wait for send heartbeat to update subscription info. 
consumer.unsubscribe() consumer.close() tdSql.checkEqual(consumer_info, expected_parameters) From 6e15c16cf722d6bb8aff2c3b4edf8591289445d0 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 14:33:28 +0800 Subject: [PATCH 71/92] fix(stream): check the set consensus-checkpointId ts. --- include/libs/stream/tstream.h | 3 +- source/dnode/mnode/impl/src/mndStream.c | 225 +++++++++++--------- source/dnode/mnode/impl/src/mndStreamHb.c | 17 +- source/dnode/mnode/impl/src/mndStreamUtil.c | 13 +- source/dnode/vnode/src/tqCommon/tqCommon.c | 4 +- source/libs/stream/src/streamMsg.c | 2 + source/libs/stream/src/streamTask.c | 1 + 7 files changed, 135 insertions(+), 130 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index 5c61265c01..e275c1511d 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -578,6 +578,7 @@ typedef struct STaskCkptInfo { int32_t activeTransId; // checkpoint trans id int8_t failed; // denote if the checkpoint is failed or not int8_t consensusChkptId; // required the consensus-checkpointId + int64_t consensusTs; // } STaskCkptInfo; typedef struct STaskStatusEntry { @@ -588,8 +589,6 @@ typedef struct STaskStatusEntry { int32_t nodeId; SVersionRange verRange; // start/end version in WAL, only valid for source task int64_t processedVer; // only valid for source task - bool inputQChanging; // inputQ is changing or not - int64_t inputQUnchangeCounter; double inputQUsed; // in MiB double inputRate; double procsThroughput; // duration between one element put into input queue and being processed. 
diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 4faa8bdb58..d2252d1bee 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -59,7 +59,7 @@ static int32_t mndProcessNodeCheckReq(SRpcMsg *pMsg); static int32_t extractNodeListFromStream(SMnode *pMnode, SArray *pNodeList); static int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq); static int32_t mndProcessCheckpointReport(SRpcMsg *pReq); -static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg); +//static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg); static int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg); static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, int32_t code); @@ -2617,10 +2617,11 @@ int32_t mndProcessCheckpointReport(SRpcMsg *pReq) { return 0; } -static int64_t getConsensusId(int64_t streamId, int32_t numOfTasks, int32_t* pExistedTasks) { +static int64_t getConsensusId(int64_t streamId, int32_t numOfTasks, int32_t* pExistedTasks, bool *pAllSame) { int32_t num = 0; int64_t chkId = INT64_MAX; *pExistedTasks = 0; + *pAllSame = true; for(int32_t i = 0; i < taosArrayGetSize(execInfo.pTaskList); ++i) { STaskId* p = taosArrayGet(execInfo.pTaskList, i); @@ -2631,6 +2632,9 @@ static int64_t getConsensusId(int64_t streamId, int32_t numOfTasks, int32_t* pEx num += 1; STaskStatusEntry* pe = taosHashGet(execInfo.pTaskMap, p, sizeof(*p)); if (chkId > pe->checkpointInfo.latestId) { + if (chkId != INT64_MAX) { + *pAllSame = false; + } chkId = pe->checkpointInfo.latestId; } } @@ -2653,99 +2657,99 @@ static void doSendQuickRsp(SRpcHandleInfo *pInfo, int32_t msgSize, int32_t vgId, pInfo->handle = NULL; // disable auto rsp } -static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { - SMnode *pMnode = pMsg->info.node; - SDecoder decoder = {0}; - - SRestoreCheckpointInfo req = {0}; - tDecoderInit(&decoder, pMsg->pCont, pMsg->contLen); - - if 
(tDecodeRestoreCheckpointInfo(&decoder, &req)) { - tDecoderClear(&decoder); - terrno = TSDB_CODE_INVALID_MSG; - mError("invalid task consensus-checkpoint msg received"); - return -1; - } - tDecoderClear(&decoder); - - mDebug("receive stream task consensus-checkpoint msg, vgId:%d, s-task:0x%" PRIx64 "-0x%x, checkpointId:%" PRId64, - req.nodeId, req.streamId, req.taskId, req.checkpointId); - - // register to the stream task done map, if all tasks has sent this kinds of message, start the checkpoint trans. - taosThreadMutexLock(&execInfo.lock); - - // mnode handle the create stream transaction too slow may cause this problem - SStreamObj *pStream = mndGetStreamObj(pMnode, req.streamId); - if (pStream == NULL) { - mWarn("failed to find the stream:0x%" PRIx64 ", not handle consensus-checkpointId", req.streamId); - - // not in meta-store yet, try to acquire the task in exec buffer - // the checkpoint req arrives too soon before the completion of the create stream trans. - STaskId id = {.streamId = req.streamId, .taskId = req.taskId}; - void *p = taosHashGet(execInfo.pTaskMap, &id, sizeof(id)); - if (p == NULL) { - mError("failed to find the stream:0x%" PRIx64 " in buf, not handle consensus-checkpointId", req.streamId); - terrno = TSDB_CODE_MND_STREAM_NOT_EXIST; - taosThreadMutexUnlock(&execInfo.lock); - - doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); - return -1; - } else { - mDebug("s-task:0x%" PRIx64 "-0x%x in buf not in mnode/meta, create stream trans may not complete yet", - req.streamId, req.taskId); - // todo wait for stream is created - } - } - - mInfo("vgId:%d stream:0x%" PRIx64 " %s meta-stored checkpointId:%" PRId64, req.nodeId, req.streamId, pStream->name, - pStream->checkpointId); - - int32_t numOfTasks = (pStream == NULL) ? 
0 : mndGetNumOfStreamTasks(pStream); - if ((pStream != NULL) && (pStream->checkpointId == 0)) { // not generated checkpoint yet, return 0 directly - taosThreadMutexUnlock(&execInfo.lock); - mndCreateSetConsensusChkptIdTrans(pMnode, pStream, req.taskId, 0, req.startTs); - - doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); - return TSDB_CODE_SUCCESS; - } - - int32_t num = 0; - int64_t chkId = getConsensusId(req.streamId, numOfTasks, &num); - - // some tasks not send hbMsg to mnode yet, wait for 5s. - if (chkId == -1) { - mDebug("not all(%d/%d) task(s) send hbMsg yet, wait for a while and check again, s-task:0x%x", req.taskId, num, - numOfTasks); - SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); - mndAddConsensusTasks(pInfo, &req); - - taosThreadMutexUnlock(&execInfo.lock); - doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); - return 0; - } - - if (chkId == req.checkpointId) { - mDebug("vgId:%d stream:0x%" PRIx64 " %s consensus-checkpointId is:%" PRId64 ", meta-stored checkpointId:%" PRId64, - req.nodeId, req.streamId, pStream->name, chkId, pStream->checkpointId); - mndCreateSetConsensusChkptIdTrans(pMnode, pStream, req.taskId, chkId, req.startTs); - - taosThreadMutexUnlock(&execInfo.lock); - doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); - return 0; - } - - // wait for 5s and check again - SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); - mndAddConsensusTasks(pInfo, &req); - - if (pStream != NULL) { - mndReleaseStream(pMnode, pStream); - } - - taosThreadMutexUnlock(&execInfo.lock); - doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); - return 0; -} +//static int32_t mndProcessConsensusCheckpointId(SRpcMsg *pMsg) { +// SMnode *pMnode = pMsg->info.node; +// SDecoder decoder = {0}; +// +// SRestoreCheckpointInfo 
req = {0}; +// tDecoderInit(&decoder, pMsg->pCont, pMsg->contLen); +// +// if (tDecodeRestoreCheckpointInfo(&decoder, &req)) { +// tDecoderClear(&decoder); +// terrno = TSDB_CODE_INVALID_MSG; +// mError("invalid task consensus-checkpoint msg received"); +// return -1; +// } +// tDecoderClear(&decoder); +// +// mDebug("receive stream task consensus-checkpoint msg, vgId:%d, s-task:0x%" PRIx64 "-0x%x, checkpointId:%" PRId64, +// req.nodeId, req.streamId, req.taskId, req.checkpointId); +// +// // register to the stream task done map, if all tasks has sent this kinds of message, start the checkpoint trans. +// taosThreadMutexLock(&execInfo.lock); +// +// // mnode handle the create stream transaction too slow may cause this problem +// SStreamObj *pStream = mndGetStreamObj(pMnode, req.streamId); +// if (pStream == NULL) { +// mWarn("failed to find the stream:0x%" PRIx64 ", not handle consensus-checkpointId", req.streamId); +// +// // not in meta-store yet, try to acquire the task in exec buffer +// // the checkpoint req arrives too soon before the completion of the create stream trans. +// STaskId id = {.streamId = req.streamId, .taskId = req.taskId}; +// void *p = taosHashGet(execInfo.pTaskMap, &id, sizeof(id)); +// if (p == NULL) { +// mError("failed to find the stream:0x%" PRIx64 " in buf, not handle consensus-checkpointId", req.streamId); +// terrno = TSDB_CODE_MND_STREAM_NOT_EXIST; +// taosThreadMutexUnlock(&execInfo.lock); +// +// doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); +// return -1; +// } else { +// mDebug("s-task:0x%" PRIx64 "-0x%x in buf not in mnode/meta, create stream trans may not complete yet", +// req.streamId, req.taskId); +// // todo wait for stream is created +// } +// } +// +// mInfo("vgId:%d stream:0x%" PRIx64 " %s meta-stored checkpointId:%" PRId64, req.nodeId, req.streamId, pStream->name, +// pStream->checkpointId); +// +// int32_t numOfTasks = (pStream == NULL) ? 
0 : mndGetNumOfStreamTasks(pStream); +// if ((pStream != NULL) && (pStream->checkpointId == 0)) { // not generated checkpoint yet, return 0 directly +// taosThreadMutexUnlock(&execInfo.lock); +// mndCreateSetConsensusChkptIdTrans(pMnode, pStream, req.taskId, 0, req.startTs); +// +// doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); +// return TSDB_CODE_SUCCESS; +// } +// +// int32_t num = 0; +// int64_t chkId = getConsensusId(req.streamId, numOfTasks, &num); +// +// // some tasks not send hbMsg to mnode yet, wait for 5s. +// if (chkId == -1) { +// mDebug("not all(%d/%d) task(s) send hbMsg yet, wait for a while and check again, s-task:0x%x", req.taskId, num, +// numOfTasks); +// SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); +// mndAddConsensusTasks(pInfo, &req); +// +// taosThreadMutexUnlock(&execInfo.lock); +// doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); +// return 0; +// } +// +// if (chkId == req.checkpointId) { +// mDebug("vgId:%d stream:0x%" PRIx64 " %s consensus-checkpointId is:%" PRId64 ", meta-stored checkpointId:%" PRId64, +// req.nodeId, req.streamId, pStream->name, chkId, pStream->checkpointId); +// mndCreateSetConsensusChkptIdTrans(pMnode, pStream, req.taskId, chkId, req.startTs); +// +// taosThreadMutexUnlock(&execInfo.lock); +// doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); +// return 0; +// } +// +// // wait for 5s and check again +// SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, req.streamId, numOfTasks); +// mndAddConsensusTasks(pInfo, &req); +// +// if (pStream != NULL) { +// mndReleaseStream(pMnode, pStream); +// } +// +// taosThreadMutexUnlock(&execInfo.lock); +// doSendQuickRsp(&pMsg->info, sizeof(SMStreamReqConsensChkptRsp), req.nodeId, terrno); +// return 0; +//} int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { SMnode *pMnode = 
pMsg->info.node; @@ -2753,6 +2757,15 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { SArray *pStreamList = taosArrayInit(4, sizeof(int64_t)); mDebug("start to process consensus-checkpointId in tmr"); + + bool allReady = true; + SArray *pNodeSnapshot = mndTakeVgroupSnapshot(pMnode, &allReady); + taosArrayDestroy(pNodeSnapshot); + if (!allReady) { + mWarn("not all vnodes are ready, end to process the consensus-checkpointId in tmr process"); + return 0; + } + taosThreadMutexLock(&execInfo.lock); void *pIter = NULL; @@ -2765,7 +2778,8 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { SStreamObj *pStream = mndGetStreamObj(pMnode, pInfo->streamId); if (pStream == NULL) { // stream has been dropped already - mDebug("stream:0x%"PRIx64" dropped already, continue", pInfo->streamId); + mDebug("stream:0x%" PRIx64 " dropped already, continue", pInfo->streamId); + taosArrayDestroy(pList); continue; } @@ -2773,15 +2787,18 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { SCheckpointConsensusEntry *pe = taosArrayGet(pInfo->pTaskList, j); streamId = pe->req.streamId; - if ((now - pe->ts) >= 10 * 1000) { - int32_t existed = 0; - int64_t chkId = getConsensusId(pe->req.streamId, pInfo->numOfTasks, &existed); - if (chkId == -1) { - mDebug("not all(%d/%d) task(s) send hbMsg yet, wait for a while and check again, s-task:0x%x", existed, - pInfo->numOfTasks, pe->req.taskId); - break; - } + int32_t existed = 0; + bool allSame = true; + int64_t chkId = getConsensusId(pe->req.streamId, pInfo->numOfTasks, &existed, &allSame); + if (chkId == -1) { + mDebug("not all(%d/%d) task(s) send hbMsg yet, wait for a while and check again, s-task:0x%x", existed, + pInfo->numOfTasks, pe->req.taskId); + break; + } + if (((now - pe->ts) >= 10 * 1000) || allSame) { + mDebug("s-task:0x%x sendTs:%" PRId64 " wait %.2fs and all tasks have same checkpointId", pe->req.taskId, + (now - pe->ts) / 1000.0, pe->ts); ASSERT(chkId <= pe->req.checkpointId); mndCreateSetConsensusChkptIdTrans(pMnode, pStream, 
pe->req.taskId, chkId, pe->req.startTs); diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index 8a374c99ef..1452ac77d2 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -290,7 +290,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { .streamId = p->id.streamId, .taskId = p->id.taskId, .checkpointId = p->checkpointInfo.latestId, - .startTs = pTaskEntry->startTime, + .startTs = pChkInfo->consensusTs, }; SStreamObj *pStream = mndGetStreamObj(pMnode, p->id.streamId); @@ -320,21 +320,6 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { // remove failed trans from pChkptStreams taosHashRemove(execInfo.pChkptStreams, &p->id.streamId, sizeof(p->id.streamId)); } - -/* if (pChkInfo->consensusChkptId != 0) { - SRestoreCheckpointInfo cp = { - .streamId = p->id.streamId, - .taskId = p->id.taskId, - .checkpointId = p->checkpointInfo.latestId, - .startTs = pTaskEntry->startTime, - }; - - SStreamObj* pStream = mndGetStreamObj(pMnode, p->id.streamId); - int32_t numOfTasks = mndGetNumOfStreamTasks(pStream); - SCheckpointConsensusInfo *pInfo = mndGetConsensusInfo(execInfo.pStreamConsensus, p->id.streamId, numOfTasks); - mndAddConsensusTasks(pInfo, &cp, NULL); - mndReleaseStream(pMnode, pStream); - }*/ } if (p->status == pTaskEntry->status) { diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 430bfcc3a2..be17082200 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -87,7 +87,7 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { int32_t replica = -1; // do the replica check *allReady = true; - SArray *pVgroupListSnapshot = taosArrayInit(4, sizeof(SNodeEntry)); + SArray *pVgroupList = taosArrayInit(4, sizeof(SNodeEntry)); while (1) { pIter = sdbFetch(pSdb, SDB_VGROUP, pIter, (void **)&pVgroup); @@ -133,7 +133,7 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, 
bool *allReady) { epsetToStr(&entry.epset, buf, tListLen(buf)); mDebug("take node snapshot, nodeId:%d %s", entry.nodeId, buf); - taosArrayPush(pVgroupListSnapshot, &entry); + taosArrayPush(pVgroupList, &entry); sdbRelease(pSdb, pVgroup); } @@ -152,11 +152,11 @@ SArray *mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady) { epsetToStr(&entry.epset, buf, tListLen(buf)); mDebug("take snode snapshot, nodeId:%d %s", entry.nodeId, buf); - taosArrayPush(pVgroupListSnapshot, &entry); + taosArrayPush(pVgroupList, &entry); sdbRelease(pSdb, pObj); } - return pVgroupListSnapshot; + return pVgroupList; } SStreamObj *mndGetStreamObj(SMnode *pMnode, int64_t streamId) { @@ -960,9 +960,10 @@ void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpo for (int32_t i = 0; i < taosArrayGetSize(pInfo->pTaskList); ++i) { SCheckpointConsensusEntry *p = taosArrayGet(pInfo->pTaskList, i); if (p->req.taskId == info.req.taskId) { - mDebug("s-task:0x%x already in consensus-checkpointId list for stream:0x%" PRIx64 - ", ignore this, total existed:%d", + mDebug("s-task:0x%x already in consensus-checkpointId list for stream:0x%" PRIx64 ", update ts %" PRId64 + "->%" PRId64 " total existed:%d", pRestoreInfo->taskId, pRestoreInfo->streamId, (int32_t)taosArrayGetSize(pInfo->pTaskList)); + p->req.startTs = info.req.startTs; return; } } diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index dc55acbb5c..a94c17f735 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -1161,7 +1161,7 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { streamMetaAddFailedTask(pMeta, req.streamId, req.taskId); return TSDB_CODE_SUCCESS; } -#if 0 + // discard the rsp, since it is expired. 
if (req.startTs < pTask->execInfo.created) { tqWarn("s-task:%s vgId:%d create time:%" PRId64 " recv expired consensus checkpointId:%" PRId64 @@ -1171,7 +1171,7 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { streamMetaReleaseTask(pMeta, pTask); return TSDB_CODE_SUCCESS; } -#endif + tqDebug("s-task:%s vgId:%d checkpointId:%" PRId64 " restore to consensus-checkpointId:%" PRId64 " from mnode", pTask->id.idStr, vgId, pTask->chkInfo.checkpointId, req.checkpointId); diff --git a/source/libs/stream/src/streamMsg.c b/source/libs/stream/src/streamMsg.c index f10296f6ff..1bc91d6984 100644 --- a/source/libs/stream/src/streamMsg.c +++ b/source/libs/stream/src/streamMsg.c @@ -350,6 +350,7 @@ int32_t tEncodeStreamHbMsg(SEncoder* pEncoder, const SStreamHbMsg* pReq) { if (tEncodeI64(pEncoder, ps->checkpointInfo.latestSize) < 0) return -1; if (tEncodeI8(pEncoder, ps->checkpointInfo.remoteBackup) < 0) return -1; if (tEncodeI8(pEncoder, ps->checkpointInfo.consensusChkptId) < 0) return -1; + if (tEncodeI64(pEncoder, ps->checkpointInfo.consensusTs) < 0) return -1; if (tEncodeI64(pEncoder, ps->startTime) < 0) return -1; if (tEncodeI64(pEncoder, ps->startCheckpointId) < 0) return -1; if (tEncodeI64(pEncoder, ps->startCheckpointVer) < 0) return -1; @@ -405,6 +406,7 @@ int32_t tDecodeStreamHbMsg(SDecoder* pDecoder, SStreamHbMsg* pReq) { if (tDecodeI64(pDecoder, &entry.checkpointInfo.latestSize) < 0) return -1; if (tDecodeI8(pDecoder, &entry.checkpointInfo.remoteBackup) < 0) return -1; if (tDecodeI8(pDecoder, &entry.checkpointInfo.consensusChkptId) < 0) return -1; + if (tDecodeI64(pDecoder, &entry.checkpointInfo.consensusTs) < 0) return -1; if (tDecodeI64(pDecoder, &entry.startTime) < 0) return -1; if (tDecodeI64(pDecoder, &entry.startCheckpointId) < 0) return -1; if (tDecodeI64(pDecoder, &entry.startCheckpointVer) < 0) return -1; diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index 5506ed2d45..f72a5dd434 100644 --- 
a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -849,6 +849,7 @@ STaskStatusEntry streamTaskGetStatusEntry(SStreamTask* pTask) { .checkpointInfo.latestSize = 0, .checkpointInfo.remoteBackup = 0, .checkpointInfo.consensusChkptId = 0, + .checkpointInfo.consensusTs = taosGetTimestampMs(), .hTaskId = pTask->hTaskInfo.id.taskId, .procsTotal = SIZE_IN_MiB(pExecInfo->inputDataSize), .outputTotal = SIZE_IN_MiB(pExecInfo->outputDataSize), From 0cbf88f021ebe431f5e80c8ca4b606888529ec15 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 14:51:08 +0800 Subject: [PATCH 72/92] fix(stream): fix memory leak. --- source/dnode/mnode/impl/src/mndStream.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index d2252d1bee..77a6cb64b2 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2763,6 +2763,7 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { taosArrayDestroy(pNodeSnapshot); if (!allReady) { mWarn("not all vnodes are ready, end to process the consensus-checkpointId in tmr process"); + taosArrayDestroy(pStreamList); return 0; } From 56b7ec3d89556e5ecaff31fe42b079491bd3f89a Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 15:10:00 +0800 Subject: [PATCH 73/92] fix(stream): update some logs. 
--- source/dnode/mnode/impl/src/mndStreamUtil.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index be17082200..e14d2a0557 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -962,7 +962,8 @@ void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpo if (p->req.taskId == info.req.taskId) { mDebug("s-task:0x%x already in consensus-checkpointId list for stream:0x%" PRIx64 ", update ts %" PRId64 "->%" PRId64 " total existed:%d", - pRestoreInfo->taskId, pRestoreInfo->streamId, (int32_t)taosArrayGetSize(pInfo->pTaskList)); + pRestoreInfo->taskId, pRestoreInfo->streamId, p->req.startTs, info.req.startTs, + (int32_t)taosArrayGetSize(pInfo->pTaskList)); p->req.startTs = info.req.startTs; return; } From 982fed581d3be22708c26103368524f74b4f8980 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Mon, 8 Jul 2024 07:34:06 +0000 Subject: [PATCH 74/92] fix invalid read --- source/libs/stream/src/streamBackendRocksdb.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 8c390c189c..231fc2ce5b 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1536,15 +1536,10 @@ int32_t chkpLoadExtraInfo(char* pChkpIdDir, int64_t* chkpId, int64_t* processId) pFile = taosOpenFile(pDst, TD_FILE_READ); if (pFile == NULL) { - if (errno == ENOENT) { - // compatible with previous version - *processId = -1; - code = 0; - goto _EXIT; - } else { - code = TAOS_SYSTEM_ERROR(errno); - stError("failed to open file to load extra info, file:%s, reason:%s", pDst, tstrerror(code)); - } + // compatible with previous version + *processId = -1; + code = 0; + stError("failed to open file to load extra info, file:%s, reason:%s", pDst, 
tstrerror(TAOS_SYSTEM_ERROR(errno))); goto _EXIT; } From b3be47fdbb3ae43cbaf704b01926669e5578f53a Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Mon, 8 Jul 2024 15:39:25 +0800 Subject: [PATCH 75/92] cancel bg tasks when alter stt_trigger --- source/dnode/vnode/src/vnd/vnodeSvr.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index 9006d36e65..29104c6c12 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -738,7 +738,7 @@ int32_t vnodePreprocessQueryMsg(SVnode *pVnode, SRpcMsg *pMsg) { return qWorkerPreprocessQueryMsg(pVnode->pQuery, pMsg, TDMT_SCH_QUERY == pMsg->msgType); } -int32_t vnodeProcessQueryMsg(SVnode *pVnode, SRpcMsg *pMsg, SQueueInfo* pInfo) { +int32_t vnodeProcessQueryMsg(SVnode *pVnode, SRpcMsg *pMsg, SQueueInfo *pInfo) { vTrace("message in vnode query queue is processing"); if ((pMsg->msgType == TDMT_SCH_QUERY || pMsg->msgType == TDMT_VND_TMQ_CONSUME || pMsg->msgType == TDMT_VND_TMQ_CONSUME_PUSH) && @@ -1978,6 +1978,9 @@ _exit: return code; } +extern int32_t tsdbDisableAndCancelAllBgTask(STsdb *pTsdb); +extern int32_t tsdbEnableBgTask(STsdb *pTsdb); + static int32_t vnodeProcessAlterConfigReq(SVnode *pVnode, int64_t ver, void *pReq, int32_t len, SRpcMsg *pRsp) { bool walChanged = false; bool tsdbChanged = false; @@ -2075,7 +2078,14 @@ static int32_t vnodeProcessAlterConfigReq(SVnode *pVnode, int64_t ver, void *pRe } if (req.sttTrigger != -1 && req.sttTrigger != pVnode->config.sttTrigger) { - pVnode->config.sttTrigger = req.sttTrigger; + if (req.sttTrigger > 1 && pVnode->config.sttTrigger > 1) { + pVnode->config.sttTrigger = req.sttTrigger; + } else { + vnodeAWait(&pVnode->commitTask); + tsdbDisableAndCancelAllBgTask(pVnode->pTsdb); + pVnode->config.sttTrigger = req.sttTrigger; + tsdbEnableBgTask(pVnode->pTsdb); + } } if (req.minRows != -1 && req.minRows != pVnode->config.tsdbCfg.minRows) { 
From 7cfdf0c14da8f74d8a7854b5a30540854818787e Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 15:43:29 +0800 Subject: [PATCH 76/92] fix(stream): not check the total number of sub tasks with different status. --- source/libs/stream/src/streamCheckStatus.c | 42 +++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/source/libs/stream/src/streamCheckStatus.c b/source/libs/stream/src/streamCheckStatus.c index 2d8fe4a0e1..226a06be7e 100644 --- a/source/libs/stream/src/streamCheckStatus.c +++ b/source/libs/stream/src/streamCheckStatus.c @@ -703,31 +703,31 @@ void rspMonitorFn(void* param, void* tmrId) { if (pStat->state == TASK_STATUS__UNINIT) { getCheckRspStatus(pInfo, timeoutDuration, &numOfReady, &numOfFault, &numOfNotRsp, pTimeoutList, pNotReadyList, id); + + numOfNotReady = (int32_t)taosArrayGetSize(pNotReadyList); + numOfTimeout = (int32_t)taosArrayGetSize(pTimeoutList); + + // fault tasks detected, not try anymore + ASSERT((numOfReady + numOfFault + numOfNotReady + numOfTimeout + numOfNotRsp) == total); + if (numOfFault > 0) { + int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + stDebug( + "s-task:%s status:%s vgId:%d all rsp. 
quit from monitor rsp tmr, since vnode-transfer/leader-change/restart " + "detected, total:%d, notRsp:%d, notReady:%d, fault:%d, timeout:%d, ready:%d ref:%d", + id, pStat->name, vgId, total, numOfNotRsp, numOfNotReady, numOfFault, numOfTimeout, numOfReady, ref); + + streamTaskCompleteCheckRsp(pInfo, false, id); + taosThreadMutexUnlock(&pInfo->checkInfoLock); + streamMetaReleaseTask(pMeta, pTask); + + taosArrayDestroy(pNotReadyList); + taosArrayDestroy(pTimeoutList); + return; + } } else { // unexpected status stError("s-task:%s unexpected task status:%s during waiting for check rsp", id, pStat->name); } - numOfNotReady = (int32_t)taosArrayGetSize(pNotReadyList); - numOfTimeout = (int32_t)taosArrayGetSize(pTimeoutList); - - // fault tasks detected, not try anymore - ASSERT((numOfReady + numOfFault + numOfNotReady + numOfTimeout + numOfNotRsp) == total); - if (numOfFault > 0) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); - stDebug( - "s-task:%s status:%s vgId:%d all rsp. 
quit from monitor rsp tmr, since vnode-transfer/leader-change/restart " - "detected, total:%d, notRsp:%d, notReady:%d, fault:%d, timeout:%d, ready:%d ref:%d", - id, pStat->name, vgId, total, numOfNotRsp, numOfNotReady, numOfFault, numOfTimeout, numOfReady, ref); - - streamTaskCompleteCheckRsp(pInfo, false, id); - taosThreadMutexUnlock(&pInfo->checkInfoLock); - streamMetaReleaseTask(pMeta, pTask); - - taosArrayDestroy(pNotReadyList); - taosArrayDestroy(pTimeoutList); - return; - } - // checking of downstream tasks has been stopped by other threads if (pInfo->stopCheckProcess == 1) { int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); From 2176fa3b1ad3e9dc2ab2651de71999d3b5cc1572 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Mon, 8 Jul 2024 15:49:00 +0800 Subject: [PATCH 77/92] fix:[TD-30883]send hb before close in tmq --- source/client/src/clientTmq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index a42b0f75dd..10d5140a0b 100644 --- a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -100,6 +100,7 @@ struct tmq_t { int64_t totalRows; // timer + tmr_h hbLiveTimer; tmr_h epTimer; tmr_h commitTimer; STscObj* pTscObj; // connection @@ -878,7 +879,7 @@ void tmqSendHbReq(void* param, void* tmrId) { OVER: tDestroySMqHbReq(&req); if(tmrId != NULL){ - taosTmrReset(tmqSendHbReq, DEFAULT_HEARTBEAT_INTERVAL, param, tmqMgmt.timer, &tmrId); + taosTmrReset(tmqSendHbReq, DEFAULT_HEARTBEAT_INTERVAL, param, tmqMgmt.timer, &tmq->hbLiveTimer); } taosReleaseRef(tmqMgmt.rsetId, refId); } @@ -1163,7 +1164,7 @@ tmq_t* tmq_consumer_new(tmq_conf_t* conf, char* errstr, int32_t errstrLen) { goto _failed; } - taosTmrStart(tmqSendHbReq, DEFAULT_HEARTBEAT_INTERVAL, (void*)pTmq->refId, tmqMgmt.timer); + pTmq->hbLiveTimer = taosTmrStart(tmqSendHbReq, DEFAULT_HEARTBEAT_INTERVAL, (void*)pTmq->refId, tmqMgmt.timer); char buf[TSDB_OFFSET_LEN] = {0}; STqOffsetVal offset = {.type = 
pTmq->resetOffsetCfg}; From de7e25f259042f668962188eaf6e9cfd6a467c7f Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 16:57:13 +0800 Subject: [PATCH 78/92] fix(stream): fix race condition in handling the hbMsg rsp. --- source/libs/stream/src/streamHb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index 6b1a1aca92..16cb23de10 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -332,7 +332,7 @@ int32_t streamProcessHeartbeatRsp(SStreamMeta* pMeta, SMStreamHbRspMsg* pRsp) { stDebug("vgId:%d process hbMsg rsp, msgId:%d rsp confirmed", pMeta->vgId, pRsp->msgId); SMetaHbInfo* pInfo = pMeta->pHbInfo; - streamMetaRLock(pMeta); + streamMetaWLock(pMeta); // current waiting rsp recved if (pRsp->msgId == pInfo->hbCount) { @@ -345,6 +345,6 @@ int32_t streamProcessHeartbeatRsp(SStreamMeta* pMeta, SMStreamHbRspMsg* pRsp) { stWarn("vgId:%d recv expired hb rsp, msgId:%d, discarded", pMeta->vgId, pRsp->msgId); } - streamMetaRUnLock(pMeta); + streamMetaWUnLock(pMeta); return TSDB_CODE_SUCCESS; } \ No newline at end of file From 855c6d6295fb0f7753c5b6c4f35927a12b6fb771 Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Mon, 8 Jul 2024 17:09:15 +0800 Subject: [PATCH 79/92] make it compile --- include/util/taoserror.h | 10 ++++------ source/util/src/terror.c | 12 ++++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 0d58dfefaa..1ca419ec4e 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -18,8 +18,6 @@ #include -#include "osDef.h" - #ifdef __cplusplus extern "C" { #endif @@ -48,11 +46,11 @@ const char* terrstr(); char* taosGetErrMsgReturn(); char* taosGetErrMsg(); int32_t* taosGetErrno(); +int32_t* taosGetErrln(); int32_t taosGetErrSize(); - -extern threadlocal int32_t terrno; -extern threadlocal int32_t terrln; -extern threadlocal char 
terrMsg[ERR_MSG_LEN]; +#define terrno (*taosGetErrno()) +#define terrMsg (taosGetErrMsg()) +#define terrln (*taosGetErrln()) #define SET_ERROR_MSG(MSG, ...) \ snprintf(terrMsg, ERR_MSG_LEN, MSG, ##__VA_ARGS__) diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 956a2552d4..cf41887142 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -21,14 +21,14 @@ #define TAOS_ERROR_C -threadlocal int32_t terrno; -threadlocal int32_t terrln; -threadlocal char terrMsg[ERR_MSG_LEN]; - +static threadlocal int32_t tsErrno; +static threadlocal int32_t tsErrln; +static threadlocal char tsErrMsgDetail[ERR_MSG_LEN] = {0}; static threadlocal char tsErrMsgReturn[ERR_MSG_LEN] = {0}; -int32_t* taosGetErrno() { return &terrno; } -char* taosGetErrMsg() { return terrMsg; } +int32_t* taosGetErrno() { return &tsErrno; } +int32_t* taosGetErrln() { return &tsErrln; } +char* taosGetErrMsg() { return tsErrMsgDetail; } char* taosGetErrMsgReturn() { return tsErrMsgReturn; } #ifdef TAOS_ERROR_C From 50a2ef08bdbd62f1aecc5223fef9ce9801d4cbf7 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 17:10:34 +0800 Subject: [PATCH 80/92] fix(stream): update some logs. 
--- source/dnode/mnode/impl/src/mndStreamUtil.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index e14d2a0557..0e498f20f6 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -971,8 +971,9 @@ void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpo taosArrayPush(pInfo->pTaskList, &info); int32_t num = taosArrayGetSize(pInfo->pTaskList); - mDebug("s-task:0x%x added into consensus-checkpointId list, stream:0x%" PRIx64 " total waiting:%d", - pRestoreInfo->taskId, pRestoreInfo->streamId, num); + mDebug("s-task:0x%x checkpointId:%" PRId64 " added into consensus-checkpointId list, stream:0x%" PRIx64 + " waiting tasks:%d", + pRestoreInfo->taskId, pRestoreInfo->checkpointId, pRestoreInfo->streamId, num); } void mndClearConsensusRspEntry(SCheckpointConsensusInfo* pInfo) { From bdced636b3e611718ff096fd740e650454d5fd91 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 8 Jul 2024 18:20:35 +0800 Subject: [PATCH 81/92] fix(stream): free task state when stopping stream tasks. 
--- include/libs/stream/tstream.h | 3 ++- source/libs/stream/src/streamTask.c | 18 +++++++++++------- source/libs/stream/src/streamTaskSm.c | 22 ++++++++++++++-------- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index e275c1511d..f867a82cbb 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -529,10 +529,11 @@ typedef int32_t (*__state_trans_user_fn)(SStreamTask*, void* param); SStreamTask* tNewStreamTask(int64_t streamId, int8_t taskLevel, SEpSet* pEpset, bool fillHistory, int64_t triggerParam, SArray* pTaskList, bool hasFillhistory, int8_t subtableWithoutMd5); +void tFreeStreamTask(SStreamTask* pTask); int32_t tEncodeStreamTask(SEncoder* pEncoder, const SStreamTask* pTask); int32_t tDecodeStreamTask(SDecoder* pDecoder, SStreamTask* pTask); -void tFreeStreamTask(SStreamTask* pTask); int32_t streamTaskInit(SStreamTask* pTask, SStreamMeta* pMeta, SMsgCb* pMsgCb, int64_t ver); +void streamFreeTaskState(SStreamTask* pTask, ETaskStatus status); int32_t tDecodeStreamTaskChkInfo(SDecoder* pDecoder, SCheckpointInfo* pChkpInfo); int32_t tDecodeStreamTaskId(SDecoder* pDecoder, STaskId* pTaskId); diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index f72a5dd434..9bcad87264 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -268,13 +268,7 @@ void tFreeStreamTask(SStreamTask* pTask) { } streamTaskCleanupCheckInfo(&pTask->taskCheckInfo); - - if (pTask->pState) { - stDebug("s-task:0x%x start to free task state", taskId); - streamStateClose(pTask->pState, status1 == TASK_STATUS__DROPPING); - taskDbRemoveRef(pTask->pBackend); - pTask->pBackend = NULL; - } + streamFreeTaskState(pTask, status1); if (pTask->pNameMap) { tSimpleHashCleanup(pTask->pNameMap); @@ -311,6 +305,16 @@ void tFreeStreamTask(SStreamTask* pTask) { stDebug("s-task:0x%x free task completed", taskId); } +void 
streamFreeTaskState(SStreamTask* pTask, ETaskStatus status) { + if (pTask->pState != NULL) { + stDebug("s-task:0x%x start to free task state", pTask->id.taskId); + streamStateClose(pTask->pState, status == TASK_STATUS__DROPPING); + taskDbRemoveRef(pTask->pBackend); + pTask->pBackend = NULL; + pTask->pState = NULL; + } +} + static void setInitialVersionInfo(SStreamTask* pTask, int64_t ver) { SCheckpointInfo* pChkInfo = &pTask->chkInfo; SDataRange* pRange = &pTask->dataRange; diff --git a/source/libs/stream/src/streamTaskSm.c b/source/libs/stream/src/streamTaskSm.c index 85d3e0068a..f2bd99cdaf 100644 --- a/source/libs/stream/src/streamTaskSm.c +++ b/source/libs/stream/src/streamTaskSm.c @@ -79,6 +79,12 @@ static int32_t attachWaitedEvent(SStreamTask* pTask, SFutureHandleEventInfo* pEv return 0; } +static int32_t stopTaskSuccFn(SStreamTask* pTask) { + SStreamTaskSM* pSM = pTask->status.pSM; + streamFreeTaskState(pTask, pSM->current.state); + return TSDB_CODE_SUCCESS; +} + int32_t streamTaskInitStatus(SStreamTask* pTask) { pTask->execInfo.checkTs = taosGetTimestampMs(); stDebug("s-task:%s start init, and check downstream tasks, set the init ts:%" PRId64, pTask->id.idStr, @@ -634,21 +640,21 @@ void doInitStateTransferTable(void) { // resume is completed by restore status of state-machine // stop related event - trans = createStateTransform(TASK_STATUS__READY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); + trans = createStateTransform(TASK_STATUS__READY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__DROPPING, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); + trans = createStateTransform(TASK_STATUS__DROPPING, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__UNINIT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); + trans = 
createStateTransform(TASK_STATUS__UNINIT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__STOP, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); + trans = createStateTransform(TASK_STATUS__STOP, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__SCAN_HISTORY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); + trans = createStateTransform(TASK_STATUS__SCAN_HISTORY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__HALT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); + trans = createStateTransform(TASK_STATUS__HALT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__PAUSE, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); + trans = createStateTransform(TASK_STATUS__PAUSE, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__CK, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); + trans = createStateTransform(TASK_STATUS__CK, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); taosArrayPush(streamTaskSMTrans, &trans); // dropping related event From 9f4f4f7f9fc19865bc5cfab307ebd5dd01eac58b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 9 Jul 2024 00:01:54 +0800 Subject: [PATCH 82/92] fix(stream): set the null column when extracting data from submit data. 
--- source/libs/executor/src/scanoperator.c | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/source/libs/executor/src/scanoperator.c b/source/libs/executor/src/scanoperator.c index 490f6b86fa..ce4915ca4d 100644 --- a/source/libs/executor/src/scanoperator.c +++ b/source/libs/executor/src/scanoperator.c @@ -2187,6 +2187,17 @@ static void rebuildDeleteBlockData(SSDataBlock* pBlock, STimeWindow* pWindow, co taosMemoryFree(p); } +static int32_t colIdComparFn(const void* param1, const void * param2) { + int32_t p1 = *(int32_t*) param1; + int32_t p2 = *(int32_t*) param2; + + if (p1 == p2) { + return 0; + } else { + return (p1 < p2)? -1:1; + } +} + static int32_t setBlockIntoRes(SStreamScanInfo* pInfo, const SSDataBlock* pBlock, STimeWindow* pTimeWindow, bool filter) { SDataBlockInfo* pBlockInfo = &pInfo->pRes->info; SOperatorInfo* pOperator = pInfo->pStreamScanOp; @@ -2203,6 +2214,8 @@ static int32_t setBlockIntoRes(SStreamScanInfo* pInfo, const SSDataBlock* pBlock STableScanInfo* pTableScanInfo = pInfo->pTableScanOp->info; pBlockInfo->id.groupId = tableListGetTableGroupId(pTableScanInfo->base.pTableListInfo, pBlock->info.id.uid); + SArray* pColList = taosArrayInit(4, sizeof(int32_t)); + // todo extract method for (int32_t i = 0; i < taosArrayGetSize(pInfo->matchInfo.pList); ++i) { SColMatchItem* pColMatchInfo = taosArrayGet(pInfo->matchInfo.pList, i); @@ -2217,6 +2230,7 @@ static int32_t setBlockIntoRes(SStreamScanInfo* pInfo, const SSDataBlock* pBlock SColumnInfoData* pDst = taosArrayGet(pInfo->pRes->pDataBlock, pColMatchInfo->dstSlotId); colDataAssign(pDst, pResCol, pBlock->info.rows, &pInfo->pRes->info); colExists = true; + taosArrayPush(pColList, &pColMatchInfo->dstSlotId); break; } } @@ -2225,6 +2239,7 @@ static int32_t setBlockIntoRes(SStreamScanInfo* pInfo, const SSDataBlock* pBlock if (!colExists) { SColumnInfoData* pDst = taosArrayGet(pInfo->pRes->pDataBlock, pColMatchInfo->dstSlotId); colDataSetNNULL(pDst, 0, pBlockInfo->rows); + 
taosArrayPush(pColList, &pColMatchInfo->dstSlotId); } } @@ -2240,8 +2255,35 @@ static int32_t setBlockIntoRes(SStreamScanInfo* pInfo, const SSDataBlock* pBlock // reset the error code. terrno = 0; + + for(int32_t i = 0; i < pInfo->numOfPseudoExpr; ++i) { + taosArrayPush(pColList, &pInfo->pPseudoExpr[i].base.resSchema.slotId); + } } + taosArraySort(pColList, colIdComparFn); + + int32_t i = 0, j = 0; + while(i < taosArrayGetSize(pColList)) { + int32_t slot1 = *(int32_t*)taosArrayGet(pColList, i); + if (slot1 > j) { + SColumnInfoData* pDst = taosArrayGet(pInfo->pRes->pDataBlock, j); + colDataSetNNULL(pDst, 0, pBlockInfo->rows); + j += 1; + } else { + i += 1; + j += 1; + } + } + + while(j < taosArrayGetSize(pInfo->pRes->pDataBlock)) { + SColumnInfoData* pDst = taosArrayGet(pInfo->pRes->pDataBlock, j); + colDataSetNNULL(pDst, 0, pBlockInfo->rows); + j += 1; + } + + taosArrayDestroy(pColList); + if (filter) { doFilter(pInfo->pRes, pOperator->exprSupp.pFilterInfo, NULL); } From 3efb64ea8cb602d1344bef81add57db1adffde5b Mon Sep 17 00:00:00 2001 From: wangjiaming0909 <604227650@qq.com> Date: Tue, 9 Jul 2024 09:14:43 +0800 Subject: [PATCH 83/92] add doc for SELECT ... FILL multi values --- docs/en/12-taos-sql/12-distinguished.md | 2 +- docs/zh/12-taos-sql/12-distinguished.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/12-taos-sql/12-distinguished.md b/docs/en/12-taos-sql/12-distinguished.md index 5dca92a35c..91fcec5f7b 100644 --- a/docs/en/12-taos-sql/12-distinguished.md +++ b/docs/en/12-taos-sql/12-distinguished.md @@ -80,7 +80,7 @@ These pseudocolumns occur after the aggregation clause. `FILL` clause is used to specify how to fill when there is data missing in any window, including: 1. NONE: No fill (the default fill mode) -2. VALUE: Fill with a fixed value, which should be specified together, for example `FILL(VALUE, 1.23)` Note: The value filled depends on the data type. 
For example, if you run FILL(VALUE 1.23) on an integer column, the value 1 is filled. +2. VALUE: Fill with a fixed value, which should be specified together, for example `FILL(VALUE, 1.23)` Note: The value filled depends on the data type. For example, if you run FILL(VALUE 1.23) on an integer column, the value 1 is filled. If multi columns in select list need to be filled, multi values are needed in fill clause, `SELECT _wstart, min(c1), max(c1) FROM ... FILL(VALUE, 0, 0)`. 3. PREV: Fill with the previous non-NULL value, `FILL(PREV)` 4. NULL: Fill with NULL, `FILL(NULL)` 5. LINEAR: Fill with the closest non-NULL value, `FILL(LINEAR)` diff --git a/docs/zh/12-taos-sql/12-distinguished.md b/docs/zh/12-taos-sql/12-distinguished.md index b979d44a5e..bf24d0adac 100755 --- a/docs/zh/12-taos-sql/12-distinguished.md +++ b/docs/zh/12-taos-sql/12-distinguished.md @@ -76,7 +76,7 @@ window_clause: { FILL 语句指定某一窗口区间数据缺失的情况下的填充模式。填充模式包括以下几种: 1. 不进行填充:NONE(默认填充模式)。 -2. VALUE 填充:固定值填充,此时需要指定填充的数值。例如:FILL(VALUE, 1.23)。这里需要注意,最终填充的值受由相应列的类型决定,如 FILL(VALUE, 1.23),相应列为 INT 类型,则填充值为 1。 +2. VALUE 填充:固定值填充,此时需要指定填充的数值。例如:FILL(VALUE, 1.23)。这里需要注意,最终填充的值受由相应列的类型决定,如 FILL(VALUE, 1.23),相应列为 INT 类型,则填充值为 1, 若查询列表中有多列需要FILL, 则需要给每一个FILL列指定VALUE, 如`SELECT _wstart, min(c1), max(c1) FROM ... FILL(VALUE, 0, 0)`。 3. PREV 填充:使用前一个非 NULL 值填充数据。例如:FILL(PREV)。 4. NULL 填充:使用 NULL 填充数据。例如:FILL(NULL)。 5. 
LINEAR 填充:根据前后距离最近的非 NULL 值做线性插值填充。例如:FILL(LINEAR)。 From b30c2ab621acaa3810989ee57bed8430a7e3ab9f Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 9 Jul 2024 09:32:22 +0800 Subject: [PATCH 84/92] Update 14-stream.md --- docs/zh/12-taos-sql/14-stream.md | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/docs/zh/12-taos-sql/14-stream.md b/docs/zh/12-taos-sql/14-stream.md index 8d814d05a1..c0d14f0455 100644 --- a/docs/zh/12-taos-sql/14-stream.md +++ b/docs/zh/12-taos-sql/14-stream.md @@ -27,7 +27,7 @@ subquery: SELECT select_list from_clause [WHERE condition] [PARTITION BY tag_list] - [window_clause] + window_clause ``` 支持会话窗口、状态窗口、滑动窗口、事件窗口和计数窗口,其中,状态窗口、事件窗口和计数窗口搭配超级表时必须与partition by tbname一起使用。对于数据源表是复合主键的流,不支持状态窗口、事件窗口、计数窗口的计算。 @@ -271,4 +271,23 @@ PAUSE STREAM [IF EXISTS] stream_name; 2.流计算恢复计算任务 RESUME STREAM [IF EXISTS] [IGNORE UNTREATED] stream_name; -没有指定IF EXISTS,如果该stream不存在,则报错,如果存在,则恢复流计算;指定了IF EXISTS,如果stream不存在,则返回成功;如果存在,则恢复流计算。如果指定IGNORE UNTREATED,则恢复流计算时,忽略流计算暂停期间写入的数据。 \ No newline at end of file +没有指定IF EXISTS,如果该stream不存在,则报错,如果存在,则恢复流计算;指定了IF EXISTS,如果stream不存在,则返回成功;如果存在,则恢复流计算。如果指定IGNORE UNTREATED,则恢复流计算时,忽略流计算暂停期间写入的数据。 + +## 状态数据备份与同步 +流计算的中间结果成为计算的状态数据,需要在流计算整个生命周期中进行持久化保存。为了确保流计算中间状态能够在集群环境下在不同的节点间可靠地同步和迁移,至3.3.2.1 版本开始,需要在运行环境中部署 rsync 软件,还需要增加以下的步骤: +1. 在配置文件中配置 snode 的地址(IP+端口)和状态数据备份目录(该目录系 snode 所在的物理节点的目录)。 +2. 
然后创建 snode。 +完成上述两个步骤以后才能创建流。 +如果没有创建 snode 并正确配置 snode 的地址,流计算过程中将无法生成检查点(checkpoint),并可能导致后续的计算结果产生错误。 + +> snodeAddress 127.0.0.1:873 +> +> checkpointBackupDir /home/user/stream/backup/checkpoint/ + + +## 创建 snode 的方式 +使用以下命令创建 snode(stream node), snode 是流计算中有状态的计算节点,可用于部署聚合任务,同时负责备份不同的流计算任务生成的检查点数据。 +```sql +CREATE SNODE ON DNODE [id] +``` +其中的 id 是集群中的 dnode 的序号。请注意选择的dnode,流计算的中间状态将自动在其上进行备份。 From 985d888b6d51df48abb80c193ef66622f738e650 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 9 Jul 2024 10:28:20 +0800 Subject: [PATCH 85/92] Update 14-stream.md --- docs/en/12-taos-sql/14-stream.md | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/en/12-taos-sql/14-stream.md b/docs/en/12-taos-sql/14-stream.md index a6759da858..9c6c57ba6a 100644 --- a/docs/en/12-taos-sql/14-stream.md +++ b/docs/en/12-taos-sql/14-stream.md @@ -30,7 +30,7 @@ subquery: SELECT [DISTINCT] select_list from_clause [WHERE condition] [PARTITION BY tag_list] - [window_clause] + window_clause ``` Session windows, state windows, and sliding windows are supported. When you configure a session or state window for a supertable, you must use PARTITION BY TBNAME. If the source table has a composite primary key, state windows, event windows, and count windows are not supported. @@ -193,11 +193,32 @@ All [scalar functions](../function/#scalar-functions) are available in stream pr - [unique](../function/#unique) - [mode](../function/#mode) -## Pause\Resume stream +## Pause Resume stream 1.pause stream +```sql PAUSE STREAM [IF EXISTS] stream_name; +``` If "IF EXISTS" is not specified and the stream does not exist, an error will be reported; If "IF EXISTS" is specified and the stream does not exist, success is returned; If the stream exists, paused all stream tasks. 2.resume stream +```sql RESUME STREAM [IF EXISTS] [IGNORE UNTREATED] stream_name; +``` If "IF EXISTS" is not specified and the stream does not exist, an error will be reported. 
If "IF EXISTS" is specified and the stream does not exist, success is returned; If the stream exists, all of the stream tasks will be resumed. If "IGNORE UNTREATED" is specified, data written during the pause period of stream is ignored when resuming stream. + +## Stream State Backup +The intermediate processing results of stream, a.k.a. stream state, need to be persisted on the disk properly during stream processing. The stream state, consisting of multiple files on disk, may be transferred between different computing nodes during the stream processing, as a result of a leader/follower switch or physical computing node offline. You need to deploy the rsync on each physical node to enable the backup and restore processing work, since _ver_.3.3.2.1. To ensure it works correctly, please refer to the following instructions: +1. add the option "snodeAddress" in the configuration file +2. add the option "checkpointBackupDir" in the configuration file to set the backup data directory. +3. create a _snode_ before creating a stream to ensure the backup service is activated. Otherwise, the checkpoint may not be generated during the stream procedure. + +>snodeAddress 127.0.0.1:873 +> +>checkpointBackupDir /home/user/stream/backup/checkpoint/ + +## create snode +The snode, stream node for short, on which the aggregate tasks can be deployed, is a stateful computing node dedicated to the stream processing. An important feature is to backup and restore the stream state files. The snode needs to be created before creating stream tasks. Use the following SQL statement to create a snode in a TDengine cluster, and only one snode is allowed in a TDengine cluster for now. +```sql +CREATE SNODE ON DNODE id +``` +`id` is the ordinal number of a dnode, which can be acquired by using ```show dnodes``` statement. 
From 8976ac0c8328d3e7eae5506e6b187948f6c9bd35 Mon Sep 17 00:00:00 2001 From: wade zhang <95411902+gccgdb1234@users.noreply.github.com> Date: Tue, 9 Jul 2024 10:38:06 +0800 Subject: [PATCH 86/92] Update 12-distinguished.md --- docs/en/12-taos-sql/12-distinguished.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/12-taos-sql/12-distinguished.md b/docs/en/12-taos-sql/12-distinguished.md index 91fcec5f7b..818b67db9b 100644 --- a/docs/en/12-taos-sql/12-distinguished.md +++ b/docs/en/12-taos-sql/12-distinguished.md @@ -80,7 +80,7 @@ These pseudocolumns occur after the aggregation clause. `FILL` clause is used to specify how to fill when there is data missing in any window, including: 1. NONE: No fill (the default fill mode) -2. VALUE: Fill with a fixed value, which should be specified together, for example `FILL(VALUE, 1.23)` Note: The value filled depends on the data type. For example, if you run FILL(VALUE 1.23) on an integer column, the value 1 is filled. If multi columns in select list need to be filled, multi values are needed in fill clause, `SELECT _wstart, min(c1), max(c1) FROM ... FILL(VALUE, 0, 0)`. +2. VALUE: Fill with a fixed value, which should be specified together, for example `FILL(VALUE, 1.23)` Note: The value filled depends on the data type. For example, if you run FILL(VALUE 1.23) on an integer column, the value 1 is filled. If multiple columns in select list need to be filled, then in the fill clause there must be a fill value for each of these columns, for example, `SELECT _wstart, min(c1), max(c1) FROM ... FILL(VALUE, 0, 0)`. 3. PREV: Fill with the previous non-NULL value, `FILL(PREV)` 4. NULL: Fill with NULL, `FILL(NULL)` 5. 
LINEAR: Fill with the closest non-NULL value, `FILL(LINEAR)` From d4c6b3447b4c0dcd5dd54616d475821530621617 Mon Sep 17 00:00:00 2001 From: dmchen Date: Tue, 9 Jul 2024 03:02:08 +0000 Subject: [PATCH 87/92] fix/TS-5144 --- source/dnode/mnode/impl/src/mndTrans.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/source/dnode/mnode/impl/src/mndTrans.c b/source/dnode/mnode/impl/src/mndTrans.c index 4669fdfb38..58dab20859 100644 --- a/source/dnode/mnode/impl/src/mndTrans.c +++ b/source/dnode/mnode/impl/src/mndTrans.c @@ -1241,6 +1241,7 @@ static void mndTransResetActions(SMnode *pMnode, STrans *pTrans, SArray *pArray) } } +// execute at bottom half static int32_t mndTransWriteSingleLog(SMnode *pMnode, STrans *pTrans, STransAction *pAction, bool topHalf) { if (pAction->rawWritten) return 0; if (topHalf) { @@ -1267,6 +1268,7 @@ static int32_t mndTransWriteSingleLog(SMnode *pMnode, STrans *pTrans, STransActi return code; } +// execute at top half static int32_t mndTransSendSingleMsg(SMnode *pMnode, STrans *pTrans, STransAction *pAction, bool topHalf) { if (pAction->msgSent) return 0; if (mndCannotExecuteTransAction(pMnode, topHalf)) { @@ -1644,6 +1646,11 @@ static bool mndTransPerformCommitActionStage(SMnode *pMnode, STrans *pTrans, boo pTrans->stage = TRN_STAGE_FINISH; // TRN_STAGE_PRE_FINISH is not necessary mInfo("trans:%d, stage from commitAction to finished", pTrans->id); continueExec = true; + } else if (code == TSDB_CODE_MND_TRANS_CTX_SWITCH && topHalf) { + pTrans->code = 0; + pTrans->stage = TRN_STAGE_COMMIT; + mInfo("trans:%d, back to commit stage", pTrans->id); + continueExec = true; } else { pTrans->code = terrno; pTrans->failedTimes++; @@ -1783,11 +1790,13 @@ void mndTransExecuteImp(SMnode *pMnode, STrans *pTrans, bool topHalf) { mndTransSendRpcRsp(pMnode, pTrans); } +// start trans, pullup, receive rsp, kill void mndTransExecute(SMnode *pMnode, STrans *pTrans) { bool topHalf = true; return mndTransExecuteImp(pMnode, pTrans, topHalf); } +// update trans 
void mndTransRefresh(SMnode *pMnode, STrans *pTrans) { bool topHalf = false; return mndTransExecuteImp(pMnode, pTrans, topHalf); From c71413c2aa3b927252231880f3f7f00af8e053e8 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 9 Jul 2024 14:52:15 +0800 Subject: [PATCH 88/92] fix(stream): discard the repeat send consensus-checkpointId msg. --- include/libs/stream/streamMsg.h | 1 + include/libs/stream/tstream.h | 1 + source/dnode/mnode/impl/src/mndStreamUtil.c | 1 + source/dnode/vnode/src/tqCommon/tqCommon.c | 17 +++++++++++++---- source/libs/stream/src/streamMsg.c | 2 ++ 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/libs/stream/streamMsg.h b/include/libs/stream/streamMsg.h index b253054dfe..34921daac3 100644 --- a/include/libs/stream/streamMsg.h +++ b/include/libs/stream/streamMsg.h @@ -216,6 +216,7 @@ typedef struct SRestoreCheckpointInfo { int64_t startTs; int64_t streamId; int64_t checkpointId; // latest checkpoint id + int32_t transId; // transaction id of the update the consensus-checkpointId transaction int32_t taskId; int32_t nodeId; } SRestoreCheckpointInfo; diff --git a/include/libs/stream/tstream.h b/include/libs/stream/tstream.h index f867a82cbb..5ba0ce454c 100644 --- a/include/libs/stream/tstream.h +++ b/include/libs/stream/tstream.h @@ -273,6 +273,7 @@ typedef struct SCheckpointInfo { int64_t processedVer; int64_t nextProcessVer; // current offset in WAL, not serialize it int64_t msgVer; + int32_t consensusTransId;// consensus checkpoint id SActiveCheckpointInfo* pActiveInfo; } SCheckpointInfo; diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 0e498f20f6..c4adbd0fc3 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -846,6 +846,7 @@ static int32_t mndStreamSetChkptIdAction(SMnode *pMnode, STrans *pTrans, SStream .checkpointId = checkpointId, .startTs = ts, .nodeId = pTask->info.nodeId, + .transId = pTrans->id, 
}; int32_t code = 0; diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index a94c17f735..1f3c049211 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -1178,16 +1178,25 @@ int32_t tqStreamTaskProcessConsenChkptIdReq(SStreamMeta* pMeta, SRpcMsg* pMsg) { taosThreadMutexLock(&pTask->lock); ASSERT(pTask->chkInfo.checkpointId >= req.checkpointId); + if (pTask->chkInfo.consensusTransId >= req.transId) { + tqDebug("s-task:%s vgId:%d latest consensus transId:%d, expired consensus trans:%d, discard", + pTask->id.idStr, vgId, pTask->chkInfo.consensusTransId, req.transId); + taosThreadMutexUnlock(&pTask->lock); + streamMetaReleaseTask(pMeta, pTask); + return TSDB_CODE_SUCCESS; + } + if (pTask->chkInfo.checkpointId != req.checkpointId) { - tqDebug("s-task:%s vgId:%d update the checkpoint from %" PRId64 " to %" PRId64, pTask->id.idStr, vgId, - pTask->chkInfo.checkpointId, req.checkpointId); + tqDebug("s-task:%s vgId:%d update the checkpoint from %" PRId64 " to %" PRId64" transId:%d", pTask->id.idStr, vgId, + pTask->chkInfo.checkpointId, req.checkpointId, req.transId); pTask->chkInfo.checkpointId = req.checkpointId; tqSetRestoreVersionInfo(pTask); } else { - tqDebug("s-task:%s vgId:%d consensus-checkpointId:%" PRId64 " equals to current checkpointId, not update", - pTask->id.idStr, vgId, req.checkpointId); + tqDebug("s-task:%s vgId:%d consensus-checkpointId:%" PRId64 " equals to current id, transId:%d not update", + pTask->id.idStr, vgId, req.checkpointId, req.transId); } + pTask->chkInfo.consensusTransId = req.transId; taosThreadMutexUnlock(&pTask->lock); if (pMeta->role == NODE_ROLE_LEADER) { diff --git a/source/libs/stream/src/streamMsg.c b/source/libs/stream/src/streamMsg.c index 1bc91d6984..40582b5144 100644 --- a/source/libs/stream/src/streamMsg.c +++ b/source/libs/stream/src/streamMsg.c @@ -638,6 +638,7 @@ int32_t tEncodeRestoreCheckpointInfo (SEncoder* pEncoder, 
const SRestoreCheckpoi if (tEncodeI64(pEncoder, pReq->startTs) < 0) return -1; if (tEncodeI64(pEncoder, pReq->streamId) < 0) return -1; if (tEncodeI64(pEncoder, pReq->checkpointId) < 0) return -1; + if (tEncodeI32(pEncoder, pReq->transId) < 0) return -1; if (tEncodeI32(pEncoder, pReq->taskId) < 0) return -1; if (tEncodeI32(pEncoder, pReq->nodeId) < 0) return -1; tEndEncode(pEncoder); @@ -649,6 +650,7 @@ int32_t tDecodeRestoreCheckpointInfo(SDecoder* pDecoder, SRestoreCheckpointInfo* if (tDecodeI64(pDecoder, &pReq->startTs) < 0) return -1; if (tDecodeI64(pDecoder, &pReq->streamId) < 0) return -1; if (tDecodeI64(pDecoder, &pReq->checkpointId) < 0) return -1; + if (tDecodeI32(pDecoder, &pReq->transId) < 0) return -1; if (tDecodeI32(pDecoder, &pReq->taskId) < 0) return -1; if (tDecodeI32(pDecoder, &pReq->nodeId) < 0) return -1; tEndDecode(pDecoder); From 773f70a907e9ee84d4f8d9c8ff072aabfec68f52 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 9 Jul 2024 15:55:56 +0800 Subject: [PATCH 89/92] fix(stream): fix syntax error. 
--- source/libs/stream/src/streamDispatch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 8bf0ccca53..617adaa016 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -819,7 +819,7 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { taosThreadMutexUnlock(&pActiveInfo->lock); int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); - stWarn("s-task:0x%x vgId:%d active checkpoint may be cleared, quit from readyMsg send tmr, ref:%d", id, vgId, ref); + stWarn("s-task:%s vgId:%d active checkpoint may be cleared, quit from readyMsg send tmr, ref:%d", id, vgId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); return; From 8db4722be475f5ce7162b9a0a7424acffbf16787 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 9 Jul 2024 16:08:35 +0800 Subject: [PATCH 90/92] fix(stream):fix syntax error. 
--- source/dnode/mnode/impl/src/mndStream.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index a515ed1d7d..415d1ff9f0 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2829,7 +2829,7 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { if (((now - pe->ts) >= 10 * 1000) || allSame) { mDebug("s-task:0x%x sendTs:%" PRId64 " wait %.2fs and all tasks have same checkpointId", pe->req.taskId, - (now - pe->ts) / 1000.0, pe->ts); + pe->req.startTs, (now - pe->ts) / 1000.0); ASSERT(chkId <= pe->req.checkpointId); mndCreateSetConsensusChkptIdTrans(pMnode, pStream, pe->req.taskId, chkId, pe->req.startTs); @@ -2837,7 +2837,7 @@ int32_t mndProcessConsensusInTmr(SRpcMsg *pMsg) { streamId = pe->req.streamId; } else { mDebug("s-task:0x%x sendTs:%" PRId64 " wait %.2fs already, wait for next round to check", pe->req.taskId, - (now - pe->ts) / 1000.0, pe->ts); + pe->req.startTs, (now - pe->ts) / 1000.0); } } From ee09e26f470a03ea22149b44f47a715c7bf13fd0 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Tue, 9 Jul 2024 08:24:30 +0000 Subject: [PATCH 91/92] refactor code --- source/libs/stream/src/streamBackendRocksdb.c | 24 ++++++++++--------- source/libs/stream/src/streamCheckpoint.c | 16 +++++++------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 231fc2ce5b..8b87019ee0 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -475,6 +475,7 @@ int32_t rebuildDataFromS3(char* chkpPath, int64_t chkpId) { taosMemoryFree(pMeta); return code; } + taosMemoryFree(pMeta); return chkpAddExtraInfo(chkpPath, chkpId, pMeta->processId); } @@ -2648,6 +2649,7 @@ int32_t taskDbGenChkpUploadData__rsync(STaskDbWrapper* pDb, int64_t chkpId, char char* buf = taosMemoryCalloc(1, 
cap); if (buf == NULL) { + taosReleaseRef(taskDbWrapperId, refId); return TSDB_CODE_OUT_OF_MEMORY; } @@ -2655,6 +2657,7 @@ int32_t taskDbGenChkpUploadData__rsync(STaskDbWrapper* pDb, int64_t chkpId, char snprintf(buf, cap, "%s%s%s%s%s%" PRId64 "", pDb->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", chkpId); if (nBytes <= 0 || nBytes >= cap) { taosMemoryFree(buf); + taosReleaseRef(taskDbWrapperId, refId); return TSDB_CODE_OUT_OF_RANGE; } @@ -4716,19 +4719,22 @@ int32_t dbChkpInit(SDbChkp* p) { int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { static char* chkpMeta = "META"; int32_t code = 0; - int32_t cap = p->len + 128; taosThreadRwlockRdlock(&p->rwLock); - char* srcBuf = taosMemoryCalloc(1, cap); - char* dstBuf = taosMemoryCalloc(1, cap); - char* srcDir = taosMemoryCalloc(1, cap); - char* dstDir = taosMemoryCalloc(1, cap); - if (srcBuf == NULL || dstBuf == NULL || srcDir == NULL || dstDir == NULL) { + int32_t cap = p->len + 128; + + char* buffer = taosMemoryCalloc(4, cap); + if (buffer == NULL) { code = TSDB_CODE_OUT_OF_MEMORY; goto _ERROR; } + char* srcBuf = buffer; + char* dstBuf = &srcBuf[cap]; + char* srcDir = &dstBuf[cap]; + char* dstDir = &srcDir[cap]; + int nBytes = snprintf(srcDir, cap, "%s%s%s%s%s%" PRId64 "", p->path, TD_DIRSEP, "checkpoints", TD_DIRSEP, "checkpoint", p->curChkpId); if (nBytes <= 0 || nBytes >= cap) { @@ -4872,12 +4878,8 @@ int32_t dbChkpDumpTo(SDbChkp* p, char* dname, SArray* list) { code = 0; _ERROR: + taosMemoryFree(buffer); taosThreadRwlockUnlock(&p->rwLock); - taosMemoryFree(srcBuf); - taosMemoryFree(dstBuf); - taosMemoryFree(srcDir); - taosMemoryFree(dstDir); - return code; } diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index a66c7a7cfa..731b6e9586 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -541,10 +541,8 @@ void streamTaskSetFailedCheckpointId(SStreamTask* pTask) { } static int32_t 
getCheckpointDataMeta(const char* id, const char* path, SArray* list) { - TdFilePtr pFile = NULL; - int32_t cap = strlen(path) + 64; - char buf[128] = {0}; - int32_t code = 0; + int32_t code = 0; + int32_t cap = strlen(path) + 64; char* filePath = taosMemoryCalloc(1, cap); if (filePath == NULL) { @@ -603,7 +601,7 @@ int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t d stDebug("s-task:%s upload checkpointId:%" PRId64 " to remote succ", idStr, checkpointId); } else { stError("s-task:%s failed to upload checkpointId:%" PRId64 " path:%s,reason:%s", idStr, checkpointId, path, - tstrerror(errno)); + tstrerror(code)); } } @@ -1080,13 +1078,17 @@ ECHECKPOINT_BACKUP_TYPE streamGetCheckpointBackupType() { } int32_t streamTaskUploadCheckpoint(const char* id, const char* path) { + int32_t code = 0; if (id == NULL || path == NULL || strlen(id) == 0 || strlen(path) == 0 || strlen(path) >= PATH_MAX) { stError("invalid parameters in upload checkpoint, %s", id); - return -1; + return TSDB_CODE_INVALID_CFG; } if (strlen(tsSnodeAddress) != 0) { - return uploadByRsync(id, path); + code = uploadByRsync(id, path); + if (code != 0) { + return TAOS_SYSTEM_ERROR(errno); + } } else if (tsS3StreamEnabled) { return uploadCheckpointToS3(id, path); } From 41f8553d53c43059b2a0191074bfcd4d90e34570 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Tue, 9 Jul 2024 08:35:04 +0000 Subject: [PATCH 92/92] refactor code --- source/libs/stream/src/streamMeta.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 15aa42e741..b38b8d73c0 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -185,13 +185,7 @@ int32_t streamMetaCvtDbFormat(SStreamMeta* pMeta) { terrno = 0; bool exist = streamBackendDataIsExist(pMeta->path, chkpId); if (exist == false) { - if (terrno != 0) { - code = terrno; - terrno = 0; - stError("failed to check backend data 
exist, reason:%s", tstrerror(code)); - } else { - stInfo("not need to convert stream backend formate"); - } + code = terrno; return code; }