From 8b269ca955660a96907f3e0f9e123a61ac692745 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 12 Jul 2024 15:12:43 +0800 Subject: [PATCH 001/103] refactor: do some internal refactor. --- source/libs/stream/src/streamCheckpoint.c | 49 ++--------------------- 1 file changed, 4 insertions(+), 45 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index b490b0e02a..25974375e1 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -433,8 +433,8 @@ int32_t streamTaskUpdateTaskCheckpointInfo(SStreamTask* pTask, bool restored, SV taosThreadMutexLock(&pTask->lock); if (pReq->checkpointId <= pInfo->checkpointId) { - stDebug("s-task:%s vgId:%d latest checkpointId:%" PRId64 " checkpointVer:%" PRId64 - " no need to update the checkpoint info, updated checkpointId:%" PRId64 " checkpointVer:%" PRId64 + stDebug("s-task:%s vgId:%d latest checkpointId:%" PRId64 " Ver:%" PRId64 + " no need to update checkpoint info, updated checkpointId:%" PRId64 " Ver:%" PRId64 " transId:%d ignored", id, vgId, pInfo->checkpointId, pInfo->checkpointVer, pReq->checkpointId, pReq->checkpointVer, pReq->transId); @@ -1114,12 +1114,7 @@ int32_t deleteCheckpointFile(const char* id, const char* name) { } int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { - int32_t code; - int32_t tlen = 0; - int32_t vgId = pTask->pMeta->vgId; - const char* id = pTask->id.idStr; - SCheckpointInfo* pInfo = &pTask->chkInfo; - + const char* id = pTask->id.idStr; taosThreadMutexLock(&pTask->lock); if (pTask->status.sendConsensusChkptId == true) { stDebug("s-task:%s already start to consensus-checkpointId, not start again before it completed", id); @@ -1133,44 +1128,8 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { ASSERT(pTask->pBackend == NULL); pTask->status.requireConsensusChkptId = true; -#if 0 - SRestoreCheckpointInfo req = { - .streamId = pTask->id.streamId, - .taskId = pTask->id.taskId, - .nodeId = vgId, - .checkpointId = pInfo->checkpointId, - .startTs = pTask->execInfo.created, - }; - tEncodeSize(tEncodeRestoreCheckpointInfo, &req, tlen, code); - if (code < 0) { - stError("s-task:%s vgId:%d encode stream task latest-checkpoint-id failed, code:%s", id, vgId, tstrerror(code)); - return -1; - } - - void* buf = rpcMallocCont(tlen); - if (buf == NULL) { - stError("s-task:%s vgId:%d encode stream task latest-checkpoint-id msg failed, code:%s", id, vgId, - tstrerror(TSDB_CODE_OUT_OF_MEMORY)); - return -1; - } - - SEncoder encoder; - tEncoderInit(&encoder, buf, tlen); - if ((code = tEncodeRestoreCheckpointInfo(&encoder, &req)) < 0) { - rpcFreeCont(buf); - stError("s-task:%s vgId:%d encode stream task latest-checkpoint-id msg failed, code:%s", id, vgId, tstrerror(code)); - return -1; - } - tEncoderClear(&encoder); - - SRpcMsg msg = {0}; - initRpcMsg(&msg, TDMT_MND_STREAM_REQ_CONSEN_CHKPT, buf, tlen); - stDebug("s-task:%s vgId:%d send latest checkpointId:%" PRId64 " to mnode to get the consensus checkpointId", id, vgId, - pInfo->checkpointId); - - tmsgSendReq(&pTask->info.mnodeEpset, &msg); -#endif + stDebug("s-task:%s set the require consensus-checkpointId flag", id); return 0; } From 142f9132a51ffc42937ffa9a518bb37c6868ee83 Mon Sep 17 00:00:00 2001 From: Yihao Deng Date: Fri, 12 Jul 2024 12:08:57 +0000 Subject: [PATCH 002/103] fix failed to load task --- source/dnode/vnode/src/inc/vnodeInt.h | 2 +- source/dnode/vnode/src/tq/tqStreamTaskSnap.c | 6 +++++- source/dnode/vnode/src/vnd/vnodeSnapshot.c | 3 ++- 3 files 
changed, 8 insertions(+), 3 deletions(-) diff --git a/source/dnode/vnode/src/inc/vnodeInt.h b/source/dnode/vnode/src/inc/vnodeInt.h index 4a47e08730..4b16a076cc 100644 --- a/source/dnode/vnode/src/inc/vnodeInt.h +++ b/source/dnode/vnode/src/inc/vnodeInt.h @@ -381,7 +381,7 @@ int32_t streamTaskSnapReaderClose(SStreamTaskReader* pReader); int32_t streamTaskSnapRead(SStreamTaskReader* pReader, uint8_t** ppData); int32_t streamTaskSnapWriterOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamTaskWriter** ppWriter); -int32_t streamTaskSnapWriterClose(SStreamTaskWriter* ppWriter, int8_t rollback); +int32_t streamTaskSnapWriterClose(SStreamTaskWriter* ppWriter, int8_t rollback, int8_t loadTask); int32_t streamTaskSnapWrite(SStreamTaskWriter* pWriter, uint8_t* pData, uint32_t nData); int32_t streamStateSnapReaderOpen(STQ* pTq, int64_t sver, int64_t ever, SStreamStateReader** ppReader); diff --git a/source/dnode/vnode/src/tq/tqStreamTaskSnap.c b/source/dnode/vnode/src/tq/tqStreamTaskSnap.c index dda5173ad9..167d20ef54 100644 --- a/source/dnode/vnode/src/tq/tqStreamTaskSnap.c +++ b/source/dnode/vnode/src/tq/tqStreamTaskSnap.c @@ -192,7 +192,7 @@ _err: return 0; } -int32_t streamTaskSnapWriterClose(SStreamTaskWriter* pWriter, int8_t rollback) { +int32_t streamTaskSnapWriterClose(SStreamTaskWriter* pWriter, int8_t rollback, int8_t loadTask) { int32_t code = 0; STQ* pTq = pWriter->pTq; @@ -214,6 +214,10 @@ int32_t streamTaskSnapWriterClose(SStreamTaskWriter* pWriter, int8_t rollback) { } streamMetaWUnLock(pTq->pStreamMeta); taosMemoryFree(pWriter); + + if (loadTask == 1) { + streamMetaLoadAllTasks(pTq->pStreamMeta); + } return code; _err: diff --git a/source/dnode/vnode/src/vnd/vnodeSnapshot.c b/source/dnode/vnode/src/vnd/vnodeSnapshot.c index 611a603c63..0951e56d66 100644 --- a/source/dnode/vnode/src/vnd/vnodeSnapshot.c +++ b/source/dnode/vnode/src/vnd/vnodeSnapshot.c @@ -752,7 +752,8 @@ int32_t vnodeSnapWriterClose(SVSnapWriter *pWriter, int8_t rollback, SSnapshot * } if (pWriter->pStreamTaskWriter) { - code = streamTaskSnapWriterClose(pWriter->pStreamTaskWriter, rollback); + code = streamTaskSnapWriterClose(pWriter->pStreamTaskWriter, rollback, pWriter->pStreamStateWriter == NULL ? 1 : 0); + if (code) goto _exit; } From c4cde6f26881069675baa944ffed26eef029b516 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 15 Jul 2024 14:23:33 +0800 Subject: [PATCH 003/103] fix(stream): mark the timer launched by which checkpoint procedure. 
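The checkpoint-ready-msg monitor timer can keep firing after the checkpoint that armed it has been superseded. This change records the checkpointId that launched the timer (sendReadyTmrChkptId) so that checkpointReadyMsgSendMonitorFn can detect it was started by an earlier checkpoint procedure, release its task reference and quit instead of resending ready messages; the active-checkpoint lock is now released only after the timer has been re-armed and stamped.

A minimal sketch of the guard pattern, with illustrative names only (MonitorCtx, launchedChkptId and onMonitorTimer are not the real TDengine symbols; see the diff below for the actual code):

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
      int64_t activeId;         /* checkpoint currently in progress         */
      int64_t launchedChkptId;  /* checkpoint that armed this monitor timer */
    } MonitorCtx;

    /* returns true if the timer should re-arm itself */
    static bool onMonitorTimer(MonitorCtx *ctx) {
      if (ctx->launchedChkptId < ctx->activeId) {
        /* stale timer armed by an earlier checkpoint: release refs, quit */
        return false;
      }
      /* ... resend checkpoint-ready msgs for the active checkpoint ... */
      return true;
    }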
--- source/libs/stream/inc/streamInt.h | 8 ++------ source/libs/stream/src/streamDispatch.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/source/libs/stream/inc/streamInt.h b/source/libs/stream/inc/streamInt.h index 008d066717..d31f720411 100644 --- a/source/libs/stream/inc/streamInt.h +++ b/source/libs/stream/inc/streamInt.h @@ -63,11 +63,7 @@ struct SActiveCheckpointInfo { tmr_h pChkptTriggerTmr; int32_t sendReadyCheckCounter; tmr_h pSendReadyMsgTmr; -}; - -struct SConsensusCheckpoint { - int8_t inProcess; - + int64_t sendReadyTmrChkptId; }; typedef struct { @@ -227,7 +223,7 @@ int32_t streamMetaSendHbHelper(SStreamMeta* pMeta); ECHECKPOINT_BACKUP_TYPE streamGetCheckpointBackupType(); -int32_t streamTaskDownloadCheckpointData(const char* id, char* path); +int32_t streamTaskDownloadCheckpointData(const char* id, char* path, int64_t checkpointId); int32_t streamTaskOnNormalTaskReady(SStreamTask* pTask); int32_t streamTaskOnScanHistoryTaskReady(SStreamTask* pTask); diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 617adaa016..1948b04186 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -815,6 +815,16 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { SArray* pList = pActiveInfo->pReadyMsgList; int32_t num = taosArrayGetSize(pList); + if (pActiveInfo->sendReadyTmrChkptId < pActiveInfo->activeId) { + taosThreadMutexUnlock(&pActiveInfo->lock); + int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + stWarn("s-task:%s vgId:%d tmr launched by previous checkpoint procedure, checkpointId:%" PRId64 ", quit, ref:%d", + id, vgId, pActiveInfo->sendReadyTmrChkptId, ref); + + streamMetaReleaseTask(pTask->pMeta, pTask); + return; + } + // active checkpoint info is cleared for now if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { taosThreadMutexUnlock(&pActiveInfo->lock); @@ -902,7 +912,6 @@ int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask) { pInfo->upstreamTaskId); } - taosThreadMutexUnlock(&pActiveInfo->lock); stDebug("s-task:%s level:%d checkpoint-ready msg sent to all %d upstreams", id, pTask->info.taskLevel, num); // start to check if checkpoint ready msg has successfully received by upstream tasks. @@ -916,8 +925,12 @@ int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask) { } else { taosTmrReset(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pSendReadyMsgTmr); } + + // mark the timer monitor checkpointId + pActiveInfo->sendReadyTmrChkptId = pActiveInfo->activeId; } + taosThreadMutexUnlock(&pActiveInfo->lock); return TSDB_CODE_SUCCESS; } From a88635129841692ac9cd4be75d058215c33901bb Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 15 Jul 2024 14:52:28 +0800 Subject: [PATCH 004/103] fix(stream): update checkpoint into different dir. 
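Checkpoint data shipped to the snode over rsync used to go into one shared data/ directory per task. uploadByRsync and the renamed downloadByRsync now take the checkpointId and use a per-checkpoint remote directory, and the download path still tries the old data/ layout as a fallback.

A short sketch of the remote path layout this switches to (buildRemoteChkptDir and its parameters are illustrative only; the real commands also add logging, bandwidth limits and Windows path translation):

    #include <inttypes.h>
    #include <stdio.h>

    static void buildRemoteChkptDir(char *buf, size_t cap, const char *snodeAddr,
                                    const char *taskId, int64_t checkpointId) {
      /* before: rsync://<snode>/checkpoint/<taskId>/data/           */
      /* after : rsync://<snode>/checkpoint/<taskId>/<checkpointId>/ */
      snprintf(buf, cap, "rsync://%s/checkpoint/%s/%" PRId64 "/",
               snodeAddr, taskId, checkpointId);
    }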
--- include/common/rsync.h | 4 +- source/common/src/rsync.c | 74 ++++++++++++++----- source/libs/stream/src/streamBackendRocksdb.c | 4 +- source/libs/stream/src/streamCheckpoint.c | 12 +-- 4 files changed, 64 insertions(+), 30 deletions(-) diff --git a/include/common/rsync.h b/include/common/rsync.h index 0840b51793..4221fb432f 100644 --- a/include/common/rsync.h +++ b/include/common/rsync.h @@ -13,8 +13,8 @@ extern "C" { void stopRsync(); void startRsync(); -int32_t uploadByRsync(const char* id, const char* path); -int32_t downloadRsync(const char* id, const char* path); +int32_t uploadByRsync(const char* id, const char* path, int64_t checkpointId); +int32_t downloadByRsync(const char* id, const char* path, int64_t checkpointId); int32_t deleteRsync(const char* id); #ifdef __cplusplus diff --git a/source/common/src/rsync.c b/source/common/src/rsync.c index d0b10b7f41..36d634c305 100644 --- a/source/common/src/rsync.c +++ b/source/common/src/rsync.c @@ -157,7 +157,7 @@ void startRsync() { } } -int32_t uploadByRsync(const char* id, const char* path) { +int32_t uploadByRsync(const char* id, const char* path, int64_t checkpointId) { int64_t st = taosGetTimestampMs(); char command[PATH_MAX] = {0}; @@ -197,11 +197,11 @@ int32_t uploadByRsync(const char* id, const char* path) { // prepare the data directory int32_t code = execCommand(command); if (code != 0) { - uError("[rsync] s-task:%s prepare checkpoint data in %s to %s failed, code:%d," ERRNO_ERR_FORMAT, id, path, + uError("[rsync] s-task:%s prepare checkpoint dir in %s to %s failed, code:%d," ERRNO_ERR_FORMAT, id, path, tsSnodeAddress, code, ERRNO_ERR_DATA); } else { int64_t el = (taosGetTimestampMs() - st); - uDebug("[rsync] s-task:%s prepare checkpoint data in:%s to %s successfully, elapsed time:%" PRId64 "ms", id, path, + uDebug("[rsync] s-task:%s prepare checkpoint dir in:%s to %s successfully, elapsed time:%" PRId64 "ms", id, path, tsSnodeAddress, el); } @@ -215,7 +215,7 @@ int32_t uploadByRsync(const char* id, const char* path) { #endif snprintf(command, PATH_MAX, "rsync -av --debug=all --log-file=%s/rsynclog --delete --timeout=10 --bwlimit=100000 %s/ " - "rsync://%s/checkpoint/%s/data/", + "rsync://%s/checkpoint/%s/%" PRId64 "/", tsLogDir, #ifdef WINDOWS pathTransform @@ -223,11 +223,11 @@ int32_t uploadByRsync(const char* id, const char* path) { path #endif , - tsSnodeAddress, id); + tsSnodeAddress, id, checkpointId); } else { snprintf(command, PATH_MAX, "rsync -av --debug=all --log-file=%s/rsynclog --delete --timeout=10 --bwlimit=100000 %s " - "rsync://%s/checkpoint/%s/data/", + "rsync://%s/checkpoint/%s/%" PRId64 "/", tsLogDir, #ifdef WINDOWS pathTransform @@ -235,7 +235,7 @@ int32_t uploadByRsync(const char* id, const char* path) { path #endif , - tsSnodeAddress, id); + tsSnodeAddress, id, checkpointId); } code = execCommand(command); @@ -252,7 +252,7 @@ int32_t uploadByRsync(const char* id, const char* path) { } // abort from retry if quit -int32_t downloadRsync(const char* id, const char* path) { +int32_t downloadByRsync(const char* id, const char* path, int64_t checkpointId) { int64_t st = taosGetTimestampMs(); int32_t MAX_RETRY = 10; int32_t times = 0; @@ -264,6 +264,42 @@ int32_t downloadRsync(const char* id, const char* path) { #endif char command[PATH_MAX] = {0}; + snprintf( + command, PATH_MAX, + "rsync -av --debug=all --log-file=%s/rsynclog --timeout=10 --bwlimit=100000 rsync://%s/checkpoint/%s/%" PRId64 + "/ %s", + tsLogDir, tsSnodeAddress, id, checkpointId, +#ifdef WINDOWS + pathTransform +#else + path +#endif + ); + + 
uDebug("[rsync] %s start to sync data from remote to:%s, cmd:%s", id, path, command); + +// while (times++ < MAX_RETRY) { + code = execCommand(command); + if (code != TSDB_CODE_SUCCESS) { + uError("[rsync] %s download checkpointId:%" PRId64 + " data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, + id, checkpointId, path, times, code, ERRNO_ERR_DATA); +// taosSsleep(1); + } else { + int32_t el = taosGetTimestampMs() - st; + uDebug("[rsync] %s download checkpointId:%" PRId64 " data:%s successfully, elapsed time:%dms", id, checkpointId, + path, el); +// break; + } +// } + + // if failed, try to load it from data directory +#ifdef WINDOWS + memset(pathTransform, 0, PATH_MAX); + changeDirFromWindowsToLinux(path, pathTransform); +#endif + + memset(command, 0, PATH_MAX); snprintf( command, PATH_MAX, "rsync -av --debug=all --log-file=%s/rsynclog --timeout=10 --bwlimit=100000 rsync://%s/checkpoint/%s/data/ %s", @@ -275,19 +311,17 @@ int32_t downloadRsync(const char* id, const char* path) { #endif ); - uDebug("[rsync] %s start to sync data from remote to:%s, %s", id, path, command); + uDebug("[rsync] %s start to sync data from remote data dir to:%s, cmd:%s", id, path, command); - while (times++ < MAX_RETRY) { - code = execCommand(command); - if (code != TSDB_CODE_SUCCESS) { - uError("[rsync] %s download checkpoint data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, id, - path, times, code, ERRNO_ERR_DATA); - taosSsleep(1); - } else { - int32_t el = taosGetTimestampMs() - st; - uDebug("[rsync] %s download checkpoint data:%s successfully, elapsed time:%dms", id, path, el); - break; - } + code = execCommand(command); + if (code != TSDB_CODE_SUCCESS) { + uError("[rsync] %s download checkpointId:%" PRId64 + " data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, + id, checkpointId, path, times, code, ERRNO_ERR_DATA); + } else { + int32_t el = taosGetTimestampMs() - st; + uDebug("[rsync] %s download checkpointId:%" PRId64 " data:%s successfully, elapsed time:%dms", id, checkpointId, + path, el); } return code; diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 8b87019ee0..15a8be6eaa 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -447,7 +447,7 @@ int32_t rebuildFromRemoteChkp_rsync(const char* key, char* checkpointPath, int64 cleanDir(defaultPath, key); stDebug("clear local default dir before downloading checkpoint data:%s succ", defaultPath); - code = streamTaskDownloadCheckpointData(key, checkpointPath); + code = streamTaskDownloadCheckpointData(key, checkpointPath, checkpointId); if (code != 0) { stError("failed to download checkpoint data:%s", key); return code; @@ -482,7 +482,7 @@ int32_t rebuildDataFromS3(char* chkpPath, int64_t chkpId) { int32_t rebuildFromRemoteChkp_s3(const char* key, char* chkpPath, int64_t chkpId, char* defaultPath) { int8_t rename = 0; - int32_t code = streamTaskDownloadCheckpointData(key, chkpPath); + int32_t code = streamTaskDownloadCheckpointData(key, chkpPath, chkpId); if (code != 0) { return code; } diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index bcdd1a047c..87c2af5207 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -20,7 +20,7 @@ static int32_t downloadCheckpointDataByName(const char* id, const char* fname, const char* dstName); static int32_t deleteCheckpointFile(const char* 
id, const char* name); -static int32_t streamTaskUploadCheckpoint(const char* id, const char* path); +static int32_t streamTaskUploadCheckpoint(const char* id, const char* path, int64_t checkpointId); static int32_t deleteCheckpoint(const char* id); static int32_t downloadCheckpointByNameS3(const char* id, const char* fname, const char* dstName); static int32_t continueDispatchCheckpointTriggerBlock(SStreamDataBlock* pBlock, SStreamTask* pTask); @@ -601,7 +601,7 @@ int32_t uploadCheckpointData(SStreamTask* pTask, int64_t checkpointId, int64_t d } if (code == TSDB_CODE_SUCCESS) { - code = streamTaskUploadCheckpoint(idStr, path); + code = streamTaskUploadCheckpoint(idStr, path, checkpointId); if (code == TSDB_CODE_SUCCESS) { stDebug("s-task:%s upload checkpointId:%" PRId64 " to remote succ", idStr, checkpointId); } else { @@ -1082,7 +1082,7 @@ ECHECKPOINT_BACKUP_TYPE streamGetCheckpointBackupType() { } } -int32_t streamTaskUploadCheckpoint(const char* id, const char* path) { +int32_t streamTaskUploadCheckpoint(const char* id, const char* path, int64_t checkpointId) { int32_t code = 0; if (id == NULL || path == NULL || strlen(id) == 0 || strlen(path) == 0 || strlen(path) >= PATH_MAX) { stError("invalid parameters in upload checkpoint, %s", id); @@ -1090,7 +1090,7 @@ int32_t streamTaskUploadCheckpoint(const char* id, const char* path) { } if (strlen(tsSnodeAddress) != 0) { - code = uploadByRsync(id, path); + code = uploadByRsync(id, path, checkpointId); if (code != 0) { return TAOS_SYSTEM_ERROR(errno); } @@ -1117,14 +1117,14 @@ int32_t downloadCheckpointDataByName(const char* id, const char* fname, const ch return 0; } -int32_t streamTaskDownloadCheckpointData(const char* id, char* path) { +int32_t streamTaskDownloadCheckpointData(const char* id, char* path, int64_t checkpointId) { if (id == NULL || path == NULL || strlen(id) == 0 || strlen(path) == 0 || strlen(path) >= PATH_MAX) { stError("down checkpoint data parameters invalid"); return -1; } if (strlen(tsSnodeAddress) != 0) { - return downloadRsync(id, path); + return downloadByRsync(id, path, checkpointId); } else if (tsS3StreamEnabled) { return s3GetObjectsByPrefix(id, path); } From 14a7cebc56cb2034d383cb6a3ecc6a230878322b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 15 Jul 2024 18:51:58 +0800 Subject: [PATCH 005/103] fix(stream): add explicit create table into sink cache. 
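When the sink sends the explicit create-table request for a result sub-table, the destination table is now also put into the sink table cache right away: doBuildAndSendCreateTableMsg checks the cache and, on a miss, builds the entry and caches it. The allocation is factored into a new doCreateSinkInfo() helper shared with setDstTableDataUid(), and the hard cap on cached entries in doPutIntoCache() is dropped.

A rough sketch of the new helper with simplified types (SinkInfo and createSinkInfo are illustrative; the real struct is STableSinkInfo):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
      uint64_t uid;      /* 0 until the dst table uid has been resolved */
      int32_t  nameLen;
      char     name[];   /* table name stored right behind the struct   */
    } SinkInfo;

    static int32_t createSinkInfo(const char *dstTableName, SinkInfo **ppInfo) {
      size_t    nameLen = strlen(dstTableName);
      SinkInfo *p = calloc(1, sizeof(SinkInfo) + nameLen + 1);
      if (p == NULL) {
        return -1;       /* TSDB_CODE_OUT_OF_MEMORY in the real helper */
      }
      p->nameLen = (int32_t)nameLen;
      memcpy(p->name, dstTableName, nameLen);
      *ppInfo = p;
      return 0;
    }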
--- source/dnode/vnode/src/tq/tqSink.c | 41 ++++++++++++++++++------------ 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/source/dnode/vnode/src/tq/tqSink.c b/source/dnode/vnode/src/tq/tqSink.c index 5f3e1e3d14..34bd39f6e7 100644 --- a/source/dnode/vnode/src/tq/tqSink.c +++ b/source/dnode/vnode/src/tq/tqSink.c @@ -46,6 +46,7 @@ static int32_t initCreateTableMsg(SVCreateTbReq* pCreateTableReq, uint64_t suid, static SArray* createDefaultTagColName(); static void setCreateTableMsgTableName(SVCreateTbReq* pCreateTableReq, SSDataBlock* pDataBlock, const char* stbFullName, int64_t gid, bool newSubTableRule); +static int32_t doCreateSinkInfo(const char* pDstTableName, STableSinkInfo** pInfo); int32_t tqBuildDeleteReq(STQ* pTq, const char* stbFullName, const SSDataBlock* pDataBlock, SBatchDeleteReq* deleteReq, const char* pIdStr, bool newSubTableRule) { @@ -269,6 +270,14 @@ static int32_t doBuildAndSendCreateTableMsg(SVnode* pVnode, char* stbFullName, S pTask->ver >= SSTREAM_TASK_SUBTABLE_CHANGED_VER && pTask->subtableWithoutMd5 != 1); taosArrayPush(reqs.pArray, pCreateTbReq); + + STableSinkInfo* pInfo = NULL; + bool alreadyCached = tqGetTableInfo(pTask->outputInfo.tbSink.pTblInfo, gid, &pInfo); + if (!alreadyCached) { + code = doCreateSinkInfo(pCreateTbReq->name, &pInfo); + doPutIntoCache(pTask->outputInfo.tbSink.pTblInfo, pInfo, gid, pTask->id.idStr); + } + tqDebug("s-task:%s build create table:%s msg complete", pTask->id.idStr, pCreateTbReq->name); } @@ -631,6 +640,18 @@ int32_t doWaitForDstTableCreated(SVnode* pVnode, SStreamTask* pTask, STableSinkI return TSDB_CODE_SUCCESS; } +int32_t doCreateSinkInfo(const char* pDstTableName, STableSinkInfo** pInfo) { + int32_t nameLen = strlen(pDstTableName); + (*pInfo) = taosMemoryCalloc(1, sizeof(STableSinkInfo) + nameLen + 1); + if (*pInfo == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + (*pInfo)->name.len = nameLen; + memcpy((*pInfo)->name.data, pDstTableName, nameLen); + return TSDB_CODE_SUCCESS; +} + int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDataBlock, char* stbFullName, SSubmitTbData* pTableData) { uint64_t groupId = pDataBlock->info.id.groupId; @@ -667,22 +688,15 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat if (pTask->subtableWithoutMd5 != 1 && !isAutoTableName(dstTableName) && !alreadyAddGroupId(dstTableName, groupId) && groupId != 0) { tqDebug("s-task:%s append groupId:%" PRId64 " for generated dstTable:%s", id, groupId, dstTableName); - if(pTask->ver == SSTREAM_TASK_SUBTABLE_CHANGED_VER){ + if (pTask->ver == SSTREAM_TASK_SUBTABLE_CHANGED_VER) { buildCtbNameAddGroupId(NULL, dstTableName, groupId); - }else if(pTask->ver > SSTREAM_TASK_SUBTABLE_CHANGED_VER && stbFullName) { + } else if (pTask->ver > SSTREAM_TASK_SUBTABLE_CHANGED_VER && stbFullName) { buildCtbNameAddGroupId(stbFullName, dstTableName, groupId); } } } - int32_t nameLen = strlen(dstTableName); - pTableSinkInfo = taosMemoryCalloc(1, sizeof(STableSinkInfo) + nameLen + 1); - if (pTableSinkInfo == NULL) { - return TSDB_CODE_OUT_OF_MEMORY; - } - - pTableSinkInfo->name.len = nameLen; - memcpy(pTableSinkInfo->name.data, dstTableName, nameLen); + int32_t code = doCreateSinkInfo(dstTableName, &pTableSinkInfo); tqDebug("s-task:%s build new sinkTableInfo to add cache, dstTable:%s", id, dstTableName); } @@ -690,7 +704,7 @@ int32_t setDstTableDataUid(SVnode* pVnode, SStreamTask* pTask, SSDataBlock* pDat pTableData->uid = pTableSinkInfo->uid; if (pTableData->uid == 0) { - tqTrace("s-task:%s cached tableInfo 
uid is invalid, acquire it from meta", id); + tqTrace("s-task:%s cached tableInfo:%s uid is invalid, acquire it from meta", id, pTableSinkInfo->name.data); return doWaitForDstTableCreated(pVnode, pTask, pTableSinkInfo, dstTableName, &pTableData->uid); } else { tqTrace("s-task:%s set the dstTable uid from cache:%" PRId64, id, pTableData->uid); @@ -926,11 +940,6 @@ bool hasOnlySubmitData(const SArray* pBlocks, int32_t numOfBlocks) { } int32_t doPutIntoCache(SSHashObj* pSinkTableMap, STableSinkInfo* pTableSinkInfo, uint64_t groupId, const char* id) { - if (tSimpleHashGetSize(pSinkTableMap) > MAX_CACHE_TABLE_INFO_NUM) { - taosMemoryFreeClear(pTableSinkInfo); // too many items, failed to cache it - return TSDB_CODE_FAILED; - } - int32_t code = tSimpleHashPut(pSinkTableMap, &groupId, sizeof(uint64_t), &pTableSinkInfo, POINTER_BYTES); if (code != TSDB_CODE_SUCCESS) { taosMemoryFreeClear(pTableSinkInfo); From a46b7b3a414dacc3417934c92f81d00a7965d414 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 15 Jul 2024 19:19:58 +0800 Subject: [PATCH 006/103] fix(stream): adjust the time to free task backend. --- source/libs/stream/src/streamCheckpoint.c | 8 ++++++++ source/libs/stream/src/streamMeta.c | 1 - source/libs/stream/src/streamTaskSm.c | 22 ++++++++-------------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 87c2af5207..d1ea72370d 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -1163,7 +1163,10 @@ int32_t deleteCheckpointFile(const char* id, const char* name) { int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { const char* id = pTask->id.idStr; + taosThreadMutexLock(&pTask->lock); + ETaskStatus p = streamTaskGetStatus(pTask)->state; + if (pTask->status.sendConsensusChkptId == true) { stDebug("s-task:%s already start to consensus-checkpointId, not start again before it completed", id); taosThreadMutexUnlock(&pTask->lock); @@ -1174,6 +1177,11 @@ int32_t streamTaskSendRestoreChkptMsg(SStreamTask* pTask) { taosThreadMutexUnlock(&pTask->lock); + if (pTask->pBackend != NULL) { + streamFreeTaskState(pTask, p); + pTask->pBackend = NULL; + } + ASSERT(pTask->pBackend == NULL); pTask->status.requireConsensusChkptId = true; diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index d0b1f6ca93..d2c957422b 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1265,7 +1265,6 @@ int32_t streamMetaStartAllTasks(SStreamMeta* pMeta) { } // negotiate the consensus checkpoint id for current task - ASSERT(pTask->pBackend == NULL); code = streamTaskSendRestoreChkptMsg(pTask); // this task may has no checkpoint, but others tasks may generate checkpoint already? 
diff --git a/source/libs/stream/src/streamTaskSm.c b/source/libs/stream/src/streamTaskSm.c index f2bd99cdaf..85d3e0068a 100644 --- a/source/libs/stream/src/streamTaskSm.c +++ b/source/libs/stream/src/streamTaskSm.c @@ -79,12 +79,6 @@ static int32_t attachWaitedEvent(SStreamTask* pTask, SFutureHandleEventInfo* pEv return 0; } -static int32_t stopTaskSuccFn(SStreamTask* pTask) { - SStreamTaskSM* pSM = pTask->status.pSM; - streamFreeTaskState(pTask, pSM->current.state); - return TSDB_CODE_SUCCESS; -} - int32_t streamTaskInitStatus(SStreamTask* pTask) { pTask->execInfo.checkTs = taosGetTimestampMs(); stDebug("s-task:%s start init, and check downstream tasks, set the init ts:%" PRId64, pTask->id.idStr, @@ -640,21 +634,21 @@ void doInitStateTransferTable(void) { // resume is completed by restore status of state-machine // stop related event - trans = createStateTransform(TASK_STATUS__READY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__READY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__DROPPING, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__DROPPING, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__UNINIT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__UNINIT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__STOP, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__STOP, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__SCAN_HISTORY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__SCAN_HISTORY, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__HALT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__HALT, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__PAUSE, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__PAUSE, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); - trans = createStateTransform(TASK_STATUS__CK, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, stopTaskSuccFn, NULL); + trans = createStateTransform(TASK_STATUS__CK, TASK_STATUS__STOP, TASK_EVENT_STOP, NULL, NULL, NULL); taosArrayPush(streamTaskSMTrans, &trans); // dropping related event From 9aadc5e4acf886120094c87d47cffe5c7c9576ed Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 17 Jul 2024 10:18:58 +0800 Subject: [PATCH 007/103] fix(stream): fix race condition for dispatch msg. 
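setDispatchRspInfo() now records a dispatch response and decides, while still holding pMsgInfo->lock, whether all downstream branches have answered and how many are still pending, instead of letting the caller derive that from the array size after the lock was dropped. handleDispatchSuccessRsp() also reads the dispatch start timestamp before the buffered message is cleared.

A simplified sketch of the pattern (DispatchInfo, Entry and recordRsp are illustrative names, and pthread is used here instead of the taos thread wrappers):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
      int64_t rspTs;   /* -1 until this downstream branch has answered */
    } Entry;

    typedef struct {
      pthread_mutex_t lock;
      Entry          *entries;
      int             num;
    } DispatchInfo;

    static bool recordRsp(DispatchInfo *info, int idx, int64_t now) {
      pthread_mutex_lock(&info->lock);
      info->entries[idx].rspTs = now;          /* mark this branch as answered */
      int numOfRsp = 0;
      for (int i = 0; i < info->num; ++i) {
        if (info->entries[i].rspTs != -1) numOfRsp++;
      }
      bool allRsp = (numOfRsp == info->num);
      pthread_mutex_unlock(&info->lock);
      return allRsp;                           /* decided under the lock */
    }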
--- source/libs/stream/src/streamDispatch.c | 56 +++++++++++++++---------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 1948b04186..9615ed49e0 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -1175,10 +1175,10 @@ void streamClearChkptReadyMsg(SActiveCheckpointInfo* pActiveInfo) { static int32_t handleDispatchSuccessRsp(SStreamTask* pTask, int32_t downstreamId, int32_t downstreamNodeId) { stDebug("s-task:%s destroy dispatch msg:%p", pTask->id.idStr, pTask->msgInfo.pData); - bool delayDispatch = (pTask->msgInfo.dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); - clearBufferedDispatchMsg(pTask); - int64_t el = taosGetTimestampMs() - pTask->msgInfo.startTs; + bool delayDispatch = (pTask->msgInfo.dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); + + clearBufferedDispatchMsg(pTask); // put data into inputQ of current task is also allowed if (pTask->inputq.status == TASK_INPUT_STATUS__BLOCKED) { @@ -1202,13 +1202,24 @@ static int32_t handleDispatchSuccessRsp(SStreamTask* pTask, int32_t downstreamId return 0; } -static int32_t setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t code, int64_t now, const char* id) { +static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t code, int64_t now, int32_t* pNotRsp, const char* id) { int32_t numOfRsp = 0; bool alreadySet = false; bool updated = false; + bool allRsp = false; + *pNotRsp = 0; taosThreadMutexLock(&pMsgInfo->lock); - for (int32_t j = 0; j < taosArrayGetSize(pMsgInfo->pSendInfo); ++j) { + int32_t numOfDispatchBranch = taosArrayGetSize(pMsgInfo->pSendInfo); + + for(int32_t i = 0; i < numOfDispatchBranch; ++i) { + SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, i); + if (pEntry->rspTs != -1) { + numOfRsp += 1; + } + } + + for (int32_t j = 0; j < numOfDispatchBranch; ++j) { SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, j); if (pEntry->nodeId == vgId) { ASSERT(!alreadySet); @@ -1216,18 +1227,20 @@ static int32_t setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int3 pEntry->status = code; alreadySet = true; updated = true; - stDebug("s-task:%s record the rsp recv, ts:%" PRId64 " code:%d, idx:%d", id, now, code, j); - } - - if (pEntry->rspTs != -1) { numOfRsp += 1; + + stDebug("s-task:%s record the rsp recv, ts:%" PRId64 " code:%d, idx:%d, total recv:%d/%d", id, now, code, j, + numOfRsp, numOfDispatchBranch); } } - taosThreadMutexUnlock(&pMsgInfo->lock); - ASSERT(updated); + *pNotRsp = numOfDispatchBranch - numOfRsp; + allRsp = (numOfRsp == numOfDispatchBranch); - return numOfRsp; + taosThreadMutexUnlock(&pMsgInfo->lock); + + ASSERT(updated); + return allRsp; } bool isDispatchRspTimeout(SDispatchEntry* pEntry, int64_t now) { @@ -1253,7 +1266,8 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i int32_t vgId = pTask->pMeta->vgId; SDispatchMsgInfo* pMsgInfo = &pTask->msgInfo; int64_t now = taosGetTimestampMs(); - int32_t totalRsp = 0; + bool allRsp = false; + int32_t notRsp = 0; taosThreadMutexLock(&pMsgInfo->lock); int32_t msgId = pMsgInfo->msgId; @@ -1282,18 +1296,18 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i if (code == TSDB_CODE_STREAM_TASK_NOT_EXIST) { // destination task does not exist, not retry anymore stError("s-task:%s failed to dispatch msg to task:0x%x(vgId:%d), msgId:%d no retry, since task destroyed already", id, 
pRsp->downstreamTaskId, pRsp->downstreamNodeId, msgId); - totalRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, ¬Rsp, id); } else { stError("s-task:%s failed to dispatch msgId:%d to task:0x%x(vgId:%d), code:%s, add to retry list", id, msgId, pRsp->downstreamTaskId, pRsp->downstreamNodeId, tstrerror(code)); - totalRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, code, now, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, code, now, ¬Rsp, id); } } else { // code == 0 if (pRsp->inputStatus == TASK_INPUT_STATUS__BLOCKED) { pTask->inputq.status = TASK_INPUT_STATUS__BLOCKED; // block the input of current task, to push pressure to upstream - totalRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, pRsp->inputStatus, now, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, pRsp->inputStatus, now, ¬Rsp, id); stTrace("s-task:%s inputQ of downstream task:0x%x(vgId:%d) is full, wait for retry dispatch", id, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } else { @@ -1305,7 +1319,7 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i id, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } - totalRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, ¬Rsp, id); { bool delayDispatch = (pMsgInfo->dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); @@ -1330,13 +1344,11 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i } } - int32_t notRsp = taosArrayGetSize(pMsgInfo->pSendInfo) - totalRsp; if (pTask->outputInfo.type == TASK_OUTPUT__SHUFFLE_DISPATCH) { - if (notRsp > 0) { + if (!allRsp) { stDebug( "s-task:%s recv dispatch rsp, msgId:%d from 0x%x(vgId:%d), downstream task input status:%d code:%s, " - "waiting " - "for %d rsp", + "waiting for %d rsp", id, msgId, pRsp->downstreamTaskId, pRsp->downstreamNodeId, pRsp->inputStatus, tstrerror(code), notRsp); } else { stDebug( @@ -1350,7 +1362,7 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i } // all msg rsp already, continue - if (notRsp == 0) { + if (allRsp) { ASSERT(pTask->outputq.status == TASK_OUTPUT_STATUS__WAIT); // we need to re-try send dispatch msg to downstream tasks From fb3fe03c1fb118a0435bc1134e226f8a7bb63b66 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 18 Jul 2024 14:57:39 +0800 Subject: [PATCH 008/103] fix(stream): to avoid repeatly start checkpoint timer if previous timer is not started yet. --- source/libs/stream/inc/streamInt.h | 36 +++++++------ source/libs/stream/src/streamCheckpoint.c | 44 ++++++++++------ source/libs/stream/src/streamDispatch.c | 61 +++++++++++++---------- source/libs/stream/src/streamTask.c | 14 +++--- source/libs/stream/src/streamTimer.c | 10 ++++ 5 files changed, 102 insertions(+), 63 deletions(-) diff --git a/source/libs/stream/inc/streamInt.h b/source/libs/stream/inc/streamInt.h index d31f720411..dc19d8c5b0 100644 --- a/source/libs/stream/inc/streamInt.h +++ b/source/libs/stream/inc/streamInt.h @@ -48,24 +48,30 @@ extern "C" { #define stTrace(...) 
do { if (stDebugFlag & DEBUG_TRACE) { taosPrintLog("STM ", DEBUG_TRACE, stDebugFlag, __VA_ARGS__); }} while(0) // clang-format on +typedef struct SStreamTmrInfo { + int32_t activeCounter; // make sure only launch one checkpoint trigger check tmr + tmr_h tmrHandle; + int64_t launchChkptId; + int8_t isActive; +} SStreamTmrInfo; + struct SActiveCheckpointInfo { - TdThreadMutex lock; - int32_t transId; - int64_t firstRecvTs; // first time to recv checkpoint trigger info - int64_t activeId; // current active checkpoint id - int64_t failedId; - bool dispatchTrigger; - SArray* pDispatchTriggerList; // SArray - SArray* pReadyMsgList; // SArray - int8_t allUpstreamTriggerRecv; - SArray* pCheckpointReadyRecvList; // SArray - int32_t checkCounter; - tmr_h pChkptTriggerTmr; - int32_t sendReadyCheckCounter; - tmr_h pSendReadyMsgTmr; - int64_t sendReadyTmrChkptId; + TdThreadMutex lock; + int32_t transId; + int64_t firstRecvTs; // first time to recv checkpoint trigger info + int64_t activeId; // current active checkpoint id + int64_t failedId; + bool dispatchTrigger; + SArray* pDispatchTriggerList; // SArray + SArray* pReadyMsgList; // SArray + int8_t allUpstreamTriggerRecv; + SArray* pCheckpointReadyRecvList; // SArray + SStreamTmrInfo chkptTriggerMsgTmr; + SStreamTmrInfo chkptReadyMsgTmr; }; +int32_t streamCleanBeforeQuitTmr(SStreamTmrInfo* pInfo, SStreamTask* pTask); + typedef struct { int8_t type; SSDataBlock* pBlock; diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index d1ea72370d..96a614f6a4 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -265,14 +265,26 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock return code; } - int32_t ref = atomic_add_fetch_32(&pTask->status.timerActive, 1); - stDebug("s-task:%s start checkpoint-trigger monitor in 10s, ref:%d ", pTask->id.idStr, ref); - streamMetaAcquireOneTask(pTask); + // if previous launched timer not started yet, not start a new timer + // todo: fix this bug: previous set checkpoint-trigger check tmr is running, while we happen to try to launch + // a new checkpoint-trigger timer right now. + // And if we don't start a new timer, and the lost of checkpoint-trigger message may cause the whole checkpoint + // procedure to be stucked. 
+ SStreamTmrInfo* pTmrInfo = &pActiveInfo->chkptTriggerMsgTmr; + int8_t old = atomic_val_compare_exchange_8(&pTmrInfo->isActive, 0, 1); + if (old == 0) { + int32_t ref = atomic_add_fetch_32(&pTask->status.timerActive, 1); + stDebug("s-task:%s start checkpoint-trigger monitor in 10s, ref:%d ", pTask->id.idStr, ref); + streamMetaAcquireOneTask(pTask); - if (pActiveInfo->pChkptTriggerTmr == NULL) { - pActiveInfo->pChkptTriggerTmr = taosTmrStart(checkpointTriggerMonitorFn, 100, pTask, streamTimer); - } else { - taosTmrReset(checkpointTriggerMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pChkptTriggerTmr); + if (pTmrInfo->tmrHandle == NULL) { + pTmrInfo->tmrHandle = taosTmrStart(checkpointTriggerMonitorFn, 200, pTask, streamTimer); + } else { + taosTmrReset(checkpointTriggerMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); + } + pTmrInfo->launchChkptId = pActiveInfo->activeId; + } else { // already launched, do nothing + stError("s-task:%s previous checkpoint-trigger monitor tmr is set, not start new one", pTask->id.idStr); } } @@ -741,27 +753,28 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { const char* id = pTask->id.idStr; SActiveCheckpointInfo* pActiveInfo = pTask->chkInfo.pActiveInfo; + SStreamTmrInfo* pTmrInfo = &pActiveInfo->chkptTriggerMsgTmr; // check the status every 100ms if (streamTaskShouldStop(pTask)) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d quit from monitor checkpoint-trigger, ref:%d", id, vgId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); return; } - if (++pActiveInfo->checkCounter < 100) { - taosTmrReset(checkpointTriggerMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pChkptTriggerTmr); + if (++pTmrInfo->activeCounter < 50) { + taosTmrReset(checkpointTriggerMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); return; } - pActiveInfo->checkCounter = 0; + pTmrInfo->activeCounter = 0; stDebug("s-task:%s vgId:%d checkpoint-trigger monitor in tmr, ts:%" PRId64, id, vgId, now); taosThreadMutexLock(&pTask->lock); SStreamTaskState* pState = streamTaskGetStatus(pTask); if (pState->state != TASK_STATUS__CK) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d not in checkpoint status, quit from monitor checkpoint-trigger, ref:%d", id, vgId, ref); taosThreadMutexUnlock(&pTask->lock); @@ -771,7 +784,7 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { // checkpoint-trigger recv flag is set, quit if (pActiveInfo->allUpstreamTriggerRecv) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d all checkpoint-trigger recv, quit from monitor checkpoint-trigger, ref:%d", id, vgId, ref); @@ -779,7 +792,6 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { streamMetaReleaseTask(pTask->pMeta, pTask); return; } - taosThreadMutexUnlock(&pTask->lock); taosThreadMutexLock(&pActiveInfo->lock); @@ -820,9 +832,9 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { // check every 100ms if (size > 0) { stDebug("s-task:%s start to monitor checkpoint-trigger in 10s", id); - taosTmrReset(checkpointTriggerMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pChkptTriggerTmr); + taosTmrReset(checkpointTriggerMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); } else { - int32_t ref = 
atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s all checkpoint-trigger recved, quit from monitor checkpoint-trigger tmr, ref:%d", id, ref); streamMetaReleaseTask(pTask->pMeta, pTask); } diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 9615ed49e0..006e55374e 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -777,31 +777,32 @@ int32_t initCheckpointReadyMsg(SStreamTask* pTask, int32_t upstreamNodeId, int32 } static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { - SStreamTask* pTask = param; - int32_t vgId = pTask->pMeta->vgId; - const char* id = pTask->id.idStr; + SStreamTask* pTask = param; + int32_t vgId = pTask->pMeta->vgId; + const char* id = pTask->id.idStr; + SActiveCheckpointInfo* pActiveInfo = pTask->chkInfo.pActiveInfo; + SStreamTmrInfo* pTmrInfo = &pActiveInfo->chkptReadyMsgTmr; // check the status every 100ms if (streamTaskShouldStop(pTask)) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d status:stop, quit from monitor checkpoint-trigger, ref:%d", id, vgId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); return; } - SActiveCheckpointInfo* pActiveInfo = pTask->chkInfo.pActiveInfo; - if (++pActiveInfo->sendReadyCheckCounter < 100) { - taosTmrReset(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pSendReadyMsgTmr); + if (++pTmrInfo->activeCounter < 50) { + taosTmrReset(checkpointReadyMsgSendMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); return; } - pActiveInfo->sendReadyCheckCounter = 0; - stDebug("s-task:%s in sending checkpoint-ready msg monitor timer", id); + pTmrInfo->activeCounter = 0; + stDebug("s-task:%s in sending checkpoint-ready msg monitor tmr", id); taosThreadMutexLock(&pTask->lock); SStreamTaskState* pState = streamTaskGetStatus(pTask); if (pState->state != TASK_STATUS__CK) { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug("s-task:%s vgId:%d status:%s not in checkpoint, quit from monitor checkpoint-ready send, ref:%d", id, vgId, pState->name, ref); taosThreadMutexUnlock(&pTask->lock); @@ -815,11 +816,12 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { SArray* pList = pActiveInfo->pReadyMsgList; int32_t num = taosArrayGetSize(pList); - if (pActiveInfo->sendReadyTmrChkptId < pActiveInfo->activeId) { + if (pTmrInfo->launchChkptId < pActiveInfo->activeId) { taosThreadMutexUnlock(&pActiveInfo->lock); - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); - stWarn("s-task:%s vgId:%d tmr launched by previous checkpoint procedure, checkpointId:%" PRId64 ", quit, ref:%d", - id, vgId, pActiveInfo->sendReadyTmrChkptId, ref); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); + stWarn("s-task:%s vgId:%d ready-msg send tmr launched by previous checkpoint procedure, checkpointId:%" PRId64 + ", quit, ref:%d", + id, vgId, pTmrInfo->launchChkptId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); return; @@ -828,7 +830,7 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { // active checkpoint info is cleared for now if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { taosThreadMutexUnlock(&pActiveInfo->lock); - int32_t ref = 
atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stWarn("s-task:%s vgId:%d active checkpoint may be cleared, quit from readyMsg send tmr, ref:%d", id, vgId, ref); streamMetaReleaseTask(pTask->pMeta, pTask); @@ -871,10 +873,10 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { } } - taosTmrReset(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pSendReadyMsgTmr); + taosTmrReset(checkpointReadyMsgSendMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); taosThreadMutexUnlock(&pActiveInfo->lock); } else { - int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stDebug( "s-task:%s vgId:%d recv of checkpoint-ready msg confirmed by all upstream task(s), clear checkpoint-ready msg " "and quit from timer, ref:%d", @@ -916,18 +918,25 @@ int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask) { // start to check if checkpoint ready msg has successfully received by upstream tasks. if (pTask->info.taskLevel == TASK_LEVEL__SINK || pTask->info.taskLevel == TASK_LEVEL__AGG) { - int32_t ref = atomic_add_fetch_32(&pTask->status.timerActive, 1); - stDebug("s-task:%s start checkpoint-ready monitor in 10s, ref:%d ", pTask->id.idStr, ref); - streamMetaAcquireOneTask(pTask); + SStreamTmrInfo* pTmrInfo = &pActiveInfo->chkptReadyMsgTmr; - if (pActiveInfo->pSendReadyMsgTmr == NULL) { - pActiveInfo->pSendReadyMsgTmr = taosTmrStart(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer); + int8_t old = atomic_val_compare_exchange_8(&pTmrInfo->isActive, 0, 1); + if (old == 0) { + int32_t ref = atomic_add_fetch_32(&pTask->status.timerActive, 1); + stDebug("s-task:%s start checkpoint-ready monitor in 10s, ref:%d ", pTask->id.idStr, ref); + streamMetaAcquireOneTask(pTask); + + if (pTmrInfo->tmrHandle == NULL) { + pTmrInfo->tmrHandle = taosTmrStart(checkpointReadyMsgSendMonitorFn, 200, pTask, streamTimer); + } else { + taosTmrReset(checkpointReadyMsgSendMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); + } + + // mark the timer monitor checkpointId + pTmrInfo->launchChkptId = pActiveInfo->activeId; } else { - taosTmrReset(checkpointReadyMsgSendMonitorFn, 100, pTask, streamTimer, &pActiveInfo->pSendReadyMsgTmr); + stError("s-task:%s previous checkpoint-ready monitor tmr is set, not start new one", pTask->id.idStr); } - - // mark the timer monitor checkpointId - pActiveInfo->sendReadyTmrChkptId = pActiveInfo->activeId; } taosThreadMutexUnlock(&pActiveInfo->lock); diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index 4cbe0cb136..d63b6ea935 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -1064,14 +1064,16 @@ void streamTaskDestroyActiveChkptInfo(SActiveCheckpointInfo* pInfo) { taosArrayDestroy(pInfo->pCheckpointReadyRecvList); pInfo->pCheckpointReadyRecvList = NULL; - if (pInfo->pChkptTriggerTmr != NULL) { - taosTmrStop(pInfo->pChkptTriggerTmr); - pInfo->pChkptTriggerTmr = NULL; + SStreamTmrInfo* pTriggerTmr = &pInfo->chkptTriggerMsgTmr; + if (pTriggerTmr->tmrHandle != NULL) { + taosTmrStop(pTriggerTmr->tmrHandle); + pTriggerTmr->tmrHandle = NULL; } - if (pInfo->pSendReadyMsgTmr != NULL) { - taosTmrStop(pInfo->pSendReadyMsgTmr); - pInfo->pSendReadyMsgTmr = NULL; + SStreamTmrInfo* pReadyTmr = &pInfo->chkptReadyMsgTmr; + if (pReadyTmr->tmrHandle != NULL) { + taosTmrStop(pReadyTmr->tmrHandle); + pReadyTmr->tmrHandle = NULL; } taosMemoryFree(pInfo); 
diff --git a/source/libs/stream/src/streamTimer.c b/source/libs/stream/src/streamTimer.c index 6e956e2682..4838d76fe0 100644 --- a/source/libs/stream/src/streamTimer.c +++ b/source/libs/stream/src/streamTimer.c @@ -38,3 +38,13 @@ void streamTimerCleanUp() { tmr_h streamTimerGetInstance() { return streamTimer; } + +int32_t streamCleanBeforeQuitTmr(SStreamTmrInfo* pInfo, SStreamTask* pTask) { + pInfo->activeCounter = 0; + pInfo->launchChkptId = 0; + atomic_store_8(&pInfo->isActive, 0); + + int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); + ASSERT(ref >= 0); + return ref; +} \ No newline at end of file From 0cca12ab52aa1506b80154d8784ed3b6b1da3daa Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 18 Jul 2024 15:49:49 +0800 Subject: [PATCH 009/103] fix(stream): add some logs. --- source/common/src/rsync.c | 67 ++++++++++++++--------------- source/libs/stream/src/streamMeta.c | 14 +++++- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/source/common/src/rsync.c b/source/common/src/rsync.c index 36d634c305..f2f6796fb0 100644 --- a/source/common/src/rsync.c +++ b/source/common/src/rsync.c @@ -278,41 +278,6 @@ int32_t downloadByRsync(const char* id, const char* path, int64_t checkpointId) uDebug("[rsync] %s start to sync data from remote to:%s, cmd:%s", id, path, command); -// while (times++ < MAX_RETRY) { - code = execCommand(command); - if (code != TSDB_CODE_SUCCESS) { - uError("[rsync] %s download checkpointId:%" PRId64 - " data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, - id, checkpointId, path, times, code, ERRNO_ERR_DATA); -// taosSsleep(1); - } else { - int32_t el = taosGetTimestampMs() - st; - uDebug("[rsync] %s download checkpointId:%" PRId64 " data:%s successfully, elapsed time:%dms", id, checkpointId, - path, el); -// break; - } -// } - - // if failed, try to load it from data directory -#ifdef WINDOWS - memset(pathTransform, 0, PATH_MAX); - changeDirFromWindowsToLinux(path, pathTransform); -#endif - - memset(command, 0, PATH_MAX); - snprintf( - command, PATH_MAX, - "rsync -av --debug=all --log-file=%s/rsynclog --timeout=10 --bwlimit=100000 rsync://%s/checkpoint/%s/data/ %s", - tsLogDir, tsSnodeAddress, id, -#ifdef WINDOWS - pathTransform -#else - path -#endif - ); - - uDebug("[rsync] %s start to sync data from remote data dir to:%s, cmd:%s", id, path, command); - code = execCommand(command); if (code != TSDB_CODE_SUCCESS) { uError("[rsync] %s download checkpointId:%" PRId64 @@ -324,6 +289,38 @@ int32_t downloadByRsync(const char* id, const char* path, int64_t checkpointId) path, el); } + if (code != TSDB_CODE_SUCCESS) { // if failed, try to load it from data directory +#ifdef WINDOWS + memset(pathTransform, 0, PATH_MAX); + changeDirFromWindowsToLinux(path, pathTransform); +#endif + + memset(command, 0, PATH_MAX); + snprintf( + command, PATH_MAX, + "rsync -av --debug=all --log-file=%s/rsynclog --timeout=10 --bwlimit=100000 rsync://%s/checkpoint/%s/data/ %s", + tsLogDir, tsSnodeAddress, id, +#ifdef WINDOWS + pathTransform +#else + path +#endif + ); + + uDebug("[rsync] %s start to sync data from remote data dir to:%s, cmd:%s", id, path, command); + + code = execCommand(command); + if (code != TSDB_CODE_SUCCESS) { + uError("[rsync] %s download checkpointId:%" PRId64 + " data:%s failed, retry after 1sec, times:%d, code:%d," ERRNO_ERR_FORMAT, + id, checkpointId, path, times, code, ERRNO_ERR_DATA); + } else { + int32_t el = taosGetTimestampMs() - st; + uDebug("[rsync] %s download checkpointId:%" PRId64 " data:%s successfully, elapsed 
time:%dms", id, checkpointId, + path, el); + } + } + return code; } diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index d2c957422b..ebc0a864fc 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -273,7 +273,19 @@ int32_t streamTaskSetDb(SStreamMeta* pMeta, SStreamTask* pTask, const char* key) pBackend->pTask = pTask; pBackend->pMeta = pMeta; - if (processVer != -1) pTask->chkInfo.processedVer = processVer; + if (processVer != -1) { + if (pTask->chkInfo.processedVer != processVer) { + stWarn("s-task:%s vgId:%d update checkpointVer:%" PRId64 "->%" PRId64 " for checkpointId:%" PRId64, + pTask->id.idStr, pTask->pMeta->vgId, pTask->chkInfo.processedVer, processVer, pTask->chkInfo.checkpointId); + pTask->chkInfo.processedVer = processVer; + pTask->chkInfo.checkpointVer = processVer; + pTask->chkInfo.nextProcessVer = processVer + 1; + } else { + stInfo("s-task:%s vgId:%d processedVer:%" PRId64 + " in task meta equals to data in checkpoint data for checkpointId:%" PRId64, + pTask->id.idStr, pTask->pMeta->vgId, pTask->chkInfo.processedVer, pTask->chkInfo.checkpointId); + } + } taosHashPut(pMeta->pTaskDbUnique, key, strlen(key), &pBackend, sizeof(void*)); taosThreadMutexUnlock(&pMeta->backendMutex); From aaf67a42eb06bf9bd52eb3acb7d7459378f74d8d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 19 Jul 2024 16:41:39 +0800 Subject: [PATCH 010/103] fix(stream): fix race condition in handling dispatch rsp. --- source/libs/stream/src/streamCheckpoint.c | 15 +-- source/libs/stream/src/streamDispatch.c | 113 ++++++++++++---------- 2 files changed, 70 insertions(+), 58 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 96a614f6a4..59075c47b2 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -799,10 +799,15 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { // send msg to retrieve checkpoint trigger msg SArray* pList = pTask->upstreamInfo.pList; ASSERT(pTask->info.taskLevel > TASK_LEVEL__SOURCE); + SArray* pNotSendList = taosArrayInit(4, sizeof(SStreamUpstreamEpInfo)); if (pNotSendList == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - stDebug("s-task:%s start to triggerMonitor, reason:%s", id, tstrerror(terrno)); + stError("s-task:%s quit tmr function due to out of memory", id); + taosThreadMutexUnlock(&pActiveInfo->lock); + + stDebug("s-task:%s start to monitor checkpoint-trigger in 10s", id); + taosTmrReset(checkpointTriggerMonitorFn, 200, pTask, streamTimer, &pTmrInfo->tmrHandle); return; } @@ -967,18 +972,14 @@ void streamTaskInitTriggerDispatchInfo(SStreamTask* pTask) { taosThreadMutexUnlock(&pInfo->lock); } -int32_t streamTaskGetNumOfConfirmed(SStreamTask* pTask) { - SActiveCheckpointInfo* pInfo = pTask->chkInfo.pActiveInfo; - +int32_t streamTaskGetNumOfConfirmed(SActiveCheckpointInfo* pInfo) { int32_t num = 0; - taosThreadMutexLock(&pInfo->lock); for (int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { STaskTriggerSendInfo* p = taosArrayGet(pInfo->pDispatchTriggerList, i); if (p->recved) { num++; } } - taosThreadMutexUnlock(&pInfo->lock); return num; } @@ -1000,9 +1001,9 @@ void streamTaskSetTriggerDispatchConfirmed(SStreamTask* pTask, int32_t vgId) { } } + int32_t numOfConfirmed = streamTaskGetNumOfConfirmed(pInfo); taosThreadMutexUnlock(&pInfo->lock); - int32_t numOfConfirmed = streamTaskGetNumOfConfirmed(pTask); int32_t total = streamTaskGetNumOfDownstream(pTask); 
stDebug("s-task:%s set downstream:0x%x(vgId:%d) checkpoint-trigger dispatch confirmed, total confirmed:%d/%d", pTask->id.idStr, taskId, vgId, numOfConfirmed, total); diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 006e55374e..6fec79eb04 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -501,7 +501,7 @@ static void doMonitorDispatchData(void* param, void* tmrId) { int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); stDebug("s-task:%s not in dispatch procedure, abort from timer, ref:%d", pTask->id.idStr, ref); - pTask->msgInfo.inMonitor = 0; + pTask->msgInfo.inMonitor = 0; // set not in dispatch monitor taosThreadMutexUnlock(&pMsgInfo->lock); return; } @@ -1211,44 +1211,51 @@ static int32_t handleDispatchSuccessRsp(SStreamTask* pTask, int32_t downstreamId return 0; } -static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t code, int64_t now, int32_t* pNotRsp, const char* id) { +static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t code, int64_t now, int32_t* pNotRsp, + int32_t* pFailed, const char* id) { int32_t numOfRsp = 0; - bool alreadySet = false; - bool updated = false; - bool allRsp = false; - *pNotRsp = 0; + int32_t numOfFailed = 0; - taosThreadMutexLock(&pMsgInfo->lock); + bool allRsp = false; int32_t numOfDispatchBranch = taosArrayGetSize(pMsgInfo->pSendInfo); - for(int32_t i = 0; i < numOfDispatchBranch; ++i) { + *pNotRsp = 0; + *pFailed = 0; + + for (int32_t i = 0; i < numOfDispatchBranch; ++i) { SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, i); if (pEntry->rspTs != -1) { numOfRsp += 1; + } else { + if (pEntry->status != TSDB_CODE_SUCCESS || isDispatchRspTimeout(pEntry, now)) { + numOfFailed += 1; + } } } for (int32_t j = 0; j < numOfDispatchBranch; ++j) { SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, j); if (pEntry->nodeId == vgId) { - ASSERT(!alreadySet); - pEntry->rspTs = now; - pEntry->status = code; - alreadySet = true; - updated = true; - numOfRsp += 1; + if (pEntry->rspTs != -1) { + stDebug("s-task:%s dispatch rsp has already recved at:%" PRId64 ", ignore this rsp, msgId:%d", id, + pEntry->rspTs, pMsgInfo->msgId); + allRsp = false; + } else { + pEntry->rspTs = now; + pEntry->status = code; + numOfRsp += 1; + allRsp = (numOfRsp == numOfDispatchBranch); - stDebug("s-task:%s record the rsp recv, ts:%" PRId64 " code:%d, idx:%d, total recv:%d/%d", id, now, code, j, - numOfRsp, numOfDispatchBranch); + stDebug("s-task:%s record the rsp recv, ts:%" PRId64 " code:%d, idx:%d, total recv:%d/%d", id, now, code, j, + numOfRsp, numOfDispatchBranch); + } + break; } } + *pFailed = numOfFailed; *pNotRsp = numOfDispatchBranch - numOfRsp; - allRsp = (numOfRsp == numOfDispatchBranch); - taosThreadMutexUnlock(&pMsgInfo->lock); - - ASSERT(updated); return allRsp; } @@ -1277,15 +1284,23 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i int64_t now = taosGetTimestampMs(); bool allRsp = false; int32_t notRsp = 0; + int32_t numOfFailed = 0; + bool triggerDispatchRsp = false; + + // we only set the dispatch msg info for current checkpoint trans + taosThreadMutexLock(&pTask->lock); + triggerDispatchRsp = (streamTaskGetStatus(pTask)->state == TASK_STATUS__CK) && + (pTask->chkInfo.pActiveInfo->activeId == pMsgInfo->checkpointId); + taosThreadMutexUnlock(&pTask->lock); taosThreadMutexLock(&pMsgInfo->lock); - int32_t msgId = pMsgInfo->msgId; - 
taosThreadMutexUnlock(&pMsgInfo->lock); + int32_t msgId = pMsgInfo->msgId; // follower not handle the dispatch rsp if ((pTask->pMeta->role == NODE_ROLE_FOLLOWER) || (pTask->status.downstreamReady != 1)) { stError("s-task:%s vgId:%d is follower or task just re-launched, not handle the dispatch rsp, discard it", id, vgId); + taosThreadMutexUnlock(&pMsgInfo->lock); return TSDB_CODE_STREAM_TASK_NOT_EXIST; } @@ -1294,6 +1309,7 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i stError("s-task:%s vgId:%d not expect rsp, expected: msgId:%d, stage:%" PRId64 " actual msgId:%d, stage:%" PRId64 " discard it", id, vgId, msgId, pTask->pMeta->stage, pRsp->msgId, pRsp->stage); + taosThreadMutexUnlock(&pMsgInfo->lock); return TSDB_CODE_INVALID_MSG; } @@ -1305,18 +1321,18 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i if (code == TSDB_CODE_STREAM_TASK_NOT_EXIST) { // destination task does not exist, not retry anymore stError("s-task:%s failed to dispatch msg to task:0x%x(vgId:%d), msgId:%d no retry, since task destroyed already", id, pRsp->downstreamTaskId, pRsp->downstreamNodeId, msgId); - allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, ¬Rsp, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, ¬Rsp, &numOfFailed, id); } else { stError("s-task:%s failed to dispatch msgId:%d to task:0x%x(vgId:%d), code:%s, add to retry list", id, msgId, pRsp->downstreamTaskId, pRsp->downstreamNodeId, tstrerror(code)); - allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, code, now, ¬Rsp, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, code, now, ¬Rsp, &numOfFailed, id); } } else { // code == 0 if (pRsp->inputStatus == TASK_INPUT_STATUS__BLOCKED) { pTask->inputq.status = TASK_INPUT_STATUS__BLOCKED; // block the input of current task, to push pressure to upstream - allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, pRsp->inputStatus, now, ¬Rsp, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, pRsp->inputStatus, now, ¬Rsp, &numOfFailed, id); stTrace("s-task:%s inputQ of downstream task:0x%x(vgId:%d) is full, wait for retry dispatch", id, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } else { @@ -1328,15 +1344,13 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i id, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } - allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, ¬Rsp, id); + allRsp = setDispatchRspInfo(pMsgInfo, pRsp->downstreamNodeId, TSDB_CODE_SUCCESS, now, ¬Rsp, &numOfFailed, id); { bool delayDispatch = (pMsgInfo->dispatchMsgType == STREAM_INPUT__CHECKPOINT_TRIGGER); if (delayDispatch) { - taosThreadMutexLock(&pTask->lock); // we only set the dispatch msg info for current checkpoint trans - if (streamTaskGetStatus(pTask)->state == TASK_STATUS__CK && - pTask->chkInfo.pActiveInfo->activeId == pMsgInfo->checkpointId) { + if (triggerDispatchRsp) { ASSERT(pTask->chkInfo.pActiveInfo->transId == pMsgInfo->transId); stDebug("s-task:%s checkpoint-trigger msg to 0x%x rsp for checkpointId:%" PRId64 " transId:%d confirmed", pTask->id.idStr, pRsp->downstreamTaskId, pMsgInfo->checkpointId, pMsgInfo->transId); @@ -1347,12 +1361,13 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i " transId:%d discard, since expired", pTask->id.idStr, pMsgInfo->checkpointId, pMsgInfo->transId); } - taosThreadMutexUnlock(&pTask->lock); } } } 
} + taosThreadMutexUnlock(&pMsgInfo->lock); + if (pTask->outputInfo.type == TASK_OUTPUT__SHUFFLE_DISPATCH) { if (!allRsp) { stDebug( @@ -1371,29 +1386,25 @@ int32_t streamProcessDispatchRsp(SStreamTask* pTask, SStreamDispatchRsp* pRsp, i } // all msg rsp already, continue - if (allRsp) { - ASSERT(pTask->outputq.status == TASK_OUTPUT_STATUS__WAIT); + // we need to re-try send dispatch msg to downstream tasks + if (allRsp && (numOfFailed == 0)) { + // trans-state msg has been sent to downstream successfully. let's transfer the fill-history task state + if (pMsgInfo->dispatchMsgType == STREAM_INPUT__TRANS_STATE) { + stDebug("s-task:%s dispatch trans-state msgId:%d to downstream successfully, start to prepare transfer state", id, + msgId); + ASSERT(pTask->info.fillHistory == 1); - // we need to re-try send dispatch msg to downstream tasks - int32_t numOfFailed = getFailedDispatchInfo(pMsgInfo, now); - if (numOfFailed == 0) { // this message has been sent successfully, let's try next one. - // trans-state msg has been sent to downstream successfully. let's transfer the fill-history task state - if (pMsgInfo->dispatchMsgType == STREAM_INPUT__TRANS_STATE) { - stDebug("s-task:%s dispatch trans-state msgId:%d to downstream successfully, start to prepare transfer state", - id, msgId); - ASSERT(pTask->info.fillHistory == 1); - - code = streamTransferStatePrepare(pTask); - if (code != TSDB_CODE_SUCCESS) { // todo: do nothing if error happens - } - - clearBufferedDispatchMsg(pTask); - - // now ready for next data output - atomic_store_8(&pTask->outputq.status, TASK_OUTPUT_STATUS__NORMAL); - } else { - handleDispatchSuccessRsp(pTask, pRsp->downstreamTaskId, pRsp->downstreamNodeId); + code = streamTransferStatePrepare(pTask); + if (code != TSDB_CODE_SUCCESS) { // todo: do nothing if error happens } + + clearBufferedDispatchMsg(pTask); + + // now ready for next data output + atomic_store_8(&pTask->outputq.status, TASK_OUTPUT_STATUS__NORMAL); + } else { + // this message has been sent successfully, let's try next one. + handleDispatchSuccessRsp(pTask, pRsp->downstreamTaskId, pRsp->downstreamNodeId); } } From da4018931b73736f373ac74af7ac6bfc86e5550b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 20 Jul 2024 15:35:15 +0800 Subject: [PATCH 011/103] fix(stream): calculate the error code after set current rsp status. --- source/libs/stream/src/streamDispatch.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 6fec79eb04..dd55884689 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -1226,10 +1226,6 @@ static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, i); if (pEntry->rspTs != -1) { numOfRsp += 1; - } else { - if (pEntry->status != TSDB_CODE_SUCCESS || isDispatchRspTimeout(pEntry, now)) { - numOfFailed += 1; - } } } @@ -1253,6 +1249,14 @@ static bool setDispatchRspInfo(SDispatchMsgInfo* pMsgInfo, int32_t vgId, int32_t } } + // this code may be error code. 
+ for (int32_t i = 0; i < numOfDispatchBranch; ++i) { + SDispatchEntry* pEntry = taosArrayGet(pMsgInfo->pSendInfo, i); + if (pEntry->status != TSDB_CODE_SUCCESS || isDispatchRspTimeout(pEntry, now)) { + numOfFailed += 1; + } + } + *pFailed = numOfFailed; *pNotRsp = numOfDispatchBranch - numOfRsp; From ad96333336a8e16b1e8c3bdfbbbb845061399505 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Tue, 23 Jul 2024 17:16:40 +0800 Subject: [PATCH 012/103] fix(stream): discard the processed hbmsg in the mnode. --- source/dnode/mnode/impl/inc/mndStream.h | 1 + source/dnode/mnode/impl/src/mndStreamHb.c | 24 +++++++++++++++++++++ source/dnode/mnode/impl/src/mndStreamUtil.c | 3 +++ 3 files changed, 28 insertions(+) diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index 0b6b6a9ef2..b7aa398e59 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -73,6 +73,7 @@ typedef struct SNodeEntry { bool stageUpdated; // the stage has been updated due to the leader/follower change or node reboot. SEpSet epset; // compare the epset to identify the vgroup tranferring between different dnodes. int64_t hbTimestamp; // second + int32_t lastHbMsgId; // latest hb msgId } SNodeEntry; typedef struct SOrphanTask { diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index bc10ec211d..04dd135320 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -265,6 +265,30 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { return -1; } + for(int32_t i = 0; i < taosArrayGetSize(execInfo.pNodeList); ++i) { + SNodeEntry* pEntry = taosArrayGet(execInfo.pNodeList, i); + if (pEntry == NULL) { + continue; + } + + if (pEntry->nodeId != req.vgId) { + continue; + } + + if (pEntry->lastHbMsgId == req.msgId) { + mError("vgId:%d Hb msgId:%d already handled, discard", pEntry->nodeId, req.msgId); + + terrno = TSDB_CODE_INVALID_MSG; + doSendHbMsgRsp(terrno, &pReq->info, req.vgId, req.msgId); + + taosThreadMutexUnlock(&execInfo.lock); + cleanupAfterProcessHbMsg(&req, pFailedChkpt, pOrphanTasks); + return -1; + } else { + pEntry->lastHbMsgId = req.msgId; + } + } + int32_t numOfUpdated = taosArrayGetSize(req.pUpdateNodes); if (numOfUpdated > 0) { mDebug("%d stream node(s) need updated from hbMsg(vgId:%d)", numOfUpdated, req.vgId); diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index e4e30bdf10..23eb3656da 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -607,7 +607,10 @@ void removeExpiredNodeInfo(const SArray *pNodeSnapshot) { for (int32_t j = 0; j < size; ++j) { SNodeEntry *pEntry = taosArrayGet(pNodeSnapshot, j); if (pEntry->nodeId == p->nodeId) { + p->hbTimestamp = pEntry->hbTimestamp; + taosArrayPush(pValidList, p); + mDebug("vgId:%d ts:%"PRId64" HbMsgId:%d is valid", p->nodeId, p->hbTimestamp, p->lastHbMsgId); break; } } From 4364fa8a1eaedc212f46fac7ede99c383ad70956 Mon Sep 17 00:00:00 2001 From: haoranchen Date: Wed, 31 Jul 2024 10:48:06 +0800 Subject: [PATCH 013/103] recover test case for TD-31057 --- tests/parallel_test/cases.task | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/parallel_test/cases.task b/tests/parallel_test/cases.task index e3f5e54698..dc2897baca 100644 --- a/tests/parallel_test/cases.task +++ b/tests/parallel_test/cases.task @@ -27,9 +27,9 @@ ,,y,army,./pytest.sh python3 ./test.py -f 
insert/insert_basic.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f cluster/splitVgroupByLearner.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f authorith/authBasic.py -N 3 -# ,,n,army,python3 ./test.py -f cmdline/fullopt.py -,,n,army,python3 ./test.py -f query/show.py -N 3 -,,n,army,python3 ./test.py -f alter/alterConfig.py -N 3 +,,y,army,./pytest.sh python3 ./test.py -f cmdline/fullopt.py +,,y,army,./pytest.sh python3 ./test.py -f query/show.py -N 3 +,,y,army,./pytest.sh python3 ./test.py -f alter/alterConfig.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f query/subquery/subqueryBugs.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f storage/oneStageComp.py -N 3 -L 3 -D 1 ,,y,army,./pytest.sh python3 ./test.py -f storage/compressBasic.py -N 3 @@ -279,8 +279,8 @@ ,,y,system-test,./pytest.sh python3 ./test.py -f 7-tmq/tmq3mnodeSwitch.py -N 6 -M 3 -n 3 -i True ,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeTransform-db-removewal.py -N 2 -n 1 ,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeTransform-stb-removewal.py -N 6 -n 3 -#,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeTransform-stb.py -N 2 -n 1 -#,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeTransform-stb.py -N 6 -n 3 +,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeTransform-stb.py -N 2 -n 1 +,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeTransform-stb.py -N 6 -n 3 #,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeTransform-db.py -N 6 -n 3 ,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeSplit-stb-select.py -N 2 -n 1 ,,y,system-test,./pytest.sh python3 test.py -f 7-tmq/tmqVnodeSplit-stb-select-duplicatedata.py -N 3 -n 3 From 6f8657ed08dff8029919518e14af3bd382bb64c9 Mon Sep 17 00:00:00 2001 From: haoranchen Date: Wed, 31 Jul 2024 10:49:41 +0800 Subject: [PATCH 014/103] fix test case in fullopt.py --- tests/army/cmdline/fullopt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/army/cmdline/fullopt.py b/tests/army/cmdline/fullopt.py index 6fc8e858a3..b80d7eac4a 100644 --- a/tests/army/cmdline/fullopt.py +++ b/tests/army/cmdline/fullopt.py @@ -60,7 +60,7 @@ class TDTestCase(TBase): "enableCoreFile 1", "fqdn 127.0.0.1", "firstEp 127.0.0.1", - "locale ENG", + "locale en_US.UTF-8", "metaCacheMaxSize 10000", "minimalTmpDirGB 5", "minimalLogDirGB 1", From fdc5b5709eca7bf1efe1d8eea0b85291a76ed698 Mon Sep 17 00:00:00 2001 From: haoranchen Date: Wed, 31 Jul 2024 11:32:08 +0800 Subject: [PATCH 015/103] Update cases.task --- tests/parallel_test/cases.task | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/parallel_test/cases.task b/tests/parallel_test/cases.task index dc2897baca..7397c45772 100644 --- a/tests/parallel_test/cases.task +++ b/tests/parallel_test/cases.task @@ -27,7 +27,7 @@ ,,y,army,./pytest.sh python3 ./test.py -f insert/insert_basic.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f cluster/splitVgroupByLearner.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f authorith/authBasic.py -N 3 -,,y,army,./pytest.sh python3 ./test.py -f cmdline/fullopt.py +,,n,army,python3 ./test.py -f cmdline/fullopt.py ,,y,army,./pytest.sh python3 ./test.py -f query/show.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f alter/alterConfig.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f query/subquery/subqueryBugs.py -N 3 From b57b263534b4d0368ea69cffc1e4586a7b2e7d0c Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 14:40:10 +0800 Subject: [PATCH 016/103] fix(stream): add check. 
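The guard added below in streamTaskStop() only fires when a task has no query executor attached: sink tasks never own one, and presumably a task can reach the stop path before its executor was ever created. A minimal standalone sketch of the same guard, using hypothetical stand-in types (SMiniTask, miniStop) rather than the real SStreamTask:

/* illustrative sketch only; SMiniTask is a stand-in, not the real stream task */
#include <stdio.h>

typedef enum { LEVEL_SOURCE, LEVEL_AGG, LEVEL_SINK } EMiniTaskLevel;

typedef struct {
  EMiniTaskLevel level;
  void          *pExecutor; /* NULL for sink tasks, or when init never completed */
} SMiniTask;

static void miniStop(SMiniTask *pTask) {
  /* only attempt to kill the query handle when one actually exists */
  if (pTask->level != LEVEL_SINK && pTask->pExecutor != NULL) {
    printf("killing query handle %p\n", pTask->pExecutor);
  } else {
    printf("no executor attached, skip the kill step\n");
  }
}

int main(void) {
  SMiniTask t = {.level = LEVEL_AGG, .pExecutor = NULL};
  miniStop(&t); /* without the extra condition the kill was attempted on a NULL handle */
  return 0;
}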
--- source/libs/stream/src/streamTask.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index 979d1960f1..e8ff1552e8 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -602,7 +602,7 @@ int32_t streamTaskStop(SStreamTask* pTask) { stError("failed to handle STOP event, s-task:%s", id); } - if (pTask->info.taskLevel != TASK_LEVEL__SINK) { + if ((pTask->info.taskLevel != TASK_LEVEL__SINK) && (pTask->exec.pExecutor != NULL)) { code = qKillTask(pTask->exec.pExecutor, TSDB_CODE_SUCCESS); if (code) { stError("s-task:%s failed to kill task related query handle", id); From aefb9d275e295e446ee621d71a13adb8fcb192a2 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 15:22:37 +0800 Subject: [PATCH 017/103] fix(stream): add ts in HbMsg. --- include/libs/stream/streamMsg.h | 1 + source/dnode/mnode/impl/inc/mndStream.h | 1 + source/dnode/mnode/impl/src/mndStream.c | 4 ++-- source/dnode/mnode/impl/src/mndStreamHb.c | 11 +++++++---- source/libs/stream/src/streamHb.c | 7 ++++--- source/libs/stream/src/streamMsg.c | 2 ++ tests/script/tsim/stream/checkpointInterval0.sim | 2 ++ 7 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/libs/stream/streamMsg.h b/include/libs/stream/streamMsg.h index 34921daac3..0ceaa93a72 100644 --- a/include/libs/stream/streamMsg.h +++ b/include/libs/stream/streamMsg.h @@ -164,6 +164,7 @@ int32_t tDecodeStreamTaskCheckpointReq(SDecoder* pDecoder, SStreamTaskCheckpoint typedef struct SStreamHbMsg { int32_t vgId; int32_t msgId; + int64_t ts; int32_t numOfTasks; SArray* pTaskStatus; // SArray SArray* pUpdateNodes; // SArray, needs update the epsets in stream tasks for those nodes. diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index d253e58703..89343ce37c 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -80,6 +80,7 @@ typedef struct SNodeEntry { SEpSet epset; // compare the epset to identify the vgroup tranferring between different dnodes. 
int64_t hbTimestamp; // second int32_t lastHbMsgId; // latest hb msgId + int64_t lastHbMsgTs; } SNodeEntry; typedef struct { diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index b7ab76984a..a1fd75c774 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2085,7 +2085,7 @@ static int32_t refreshNodeListFromExistedStreams(SMnode *pMnode, SArray *pNodeLi break; } - SNodeEntry entry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId}; + SNodeEntry entry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId, .lastHbMsgId = -1}; epsetAssign(&entry.epset, &pTask->info.epSet); (void)taosHashPut(pHash, &entry.nodeId, sizeof(entry.nodeId), &entry, sizeof(entry)); } @@ -2265,7 +2265,7 @@ void saveTaskAndNodeInfoIntoBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode) } if (!exist) { - SNodeEntry nodeEntry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId}; + SNodeEntry nodeEntry = {.hbTimestamp = -1, .nodeId = pTask->info.nodeId, .lastHbMsgId = -1}; epsetAssign(&nodeEntry.epset, &pTask->info.epSet); void* px = taosArrayPush(pExecNode->pNodeList, &nodeEntry); diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index bba39d0c98..50db903520 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -333,7 +333,8 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { } tDecoderClear(&decoder); - mDebug("receive stream-meta hb from vgId:%d, active numOfTasks:%d, msgId:%d", req.vgId, req.numOfTasks, req.msgId); + mDebug("receive stream-meta hb from vgId:%d, active numOfTasks:%d, HbMsgId:%d, HbMsgTs:%" PRId64, req.vgId, + req.numOfTasks, req.msgId, req.ts); pFailedChkpt = taosArrayInit(4, sizeof(SFailedCheckpointInfo)); pOrphanTasks = taosArrayInit(4, sizeof(SOrphanTask)); @@ -366,17 +367,18 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { continue; } - if (pEntry->lastHbMsgId == req.msgId) { - mError("vgId:%d Hb msgId:%d already handled, discard", pEntry->nodeId, req.msgId); + if ((pEntry->lastHbMsgId == req.msgId) && (pEntry->lastHbMsgTs == req.ts)) { + mError("vgId:%d HbMsgId:%d already handled, bh msg discard", pEntry->nodeId, req.msgId); terrno = TSDB_CODE_INVALID_MSG; doSendHbMsgRsp(terrno, &pReq->info, req.vgId, req.msgId); streamMutexUnlock(&execInfo.lock); cleanupAfterProcessHbMsg(&req, pFailedChkpt, pOrphanTasks); - return -1; + return terrno; } else { pEntry->lastHbMsgId = req.msgId; + pEntry->lastHbMsgTs = req.ts; } } @@ -417,6 +419,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { SStreamObj *pStream = NULL; code = mndGetStreamObj(pMnode, p->id.streamId, &pStream); if (code) { + mError("stream obj not exist, failed to handle consensus checkpoint-info req, code:%s", tstrerror(code)); continue; } diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index 9804943ec2..a158d6e4bb 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -142,11 +142,12 @@ int32_t streamMetaSendHbHelper(SStreamMeta* pMeta) { } SStreamHbMsg* pMsg = &pInfo->hbMsg; - stDebug("vgId:%d build stream hbMsg, leader:%d msgId:%d", pMeta->vgId, (pMeta->role == NODE_ROLE_LEADER), - pMeta->pHbInfo->hbCount); - pMsg->vgId = pMeta->vgId; pMsg->msgId = pMeta->pHbInfo->hbCount; + pMsg->ts = taosGetTimestampMs(); + + stDebug("vgId:%d build stream hbMsg, leader:%d HbMsgId:%d, HbMsgTs:%" PRId64, pMeta->vgId, + (pMeta->role == NODE_ROLE_LEADER), pMsg->msgId, pMsg->ts); pMsg->pTaskStatus = 
taosArrayInit(numOfTasks, sizeof(STaskStatusEntry)); pMsg->pUpdateNodes = taosArrayInit(numOfTasks, sizeof(int32_t)); diff --git a/source/libs/stream/src/streamMsg.c b/source/libs/stream/src/streamMsg.c index bc0faacb32..75cb0e6683 100644 --- a/source/libs/stream/src/streamMsg.c +++ b/source/libs/stream/src/streamMsg.c @@ -382,6 +382,7 @@ int32_t tEncodeStreamHbMsg(SEncoder* pEncoder, const SStreamHbMsg* pReq) { } if (tEncodeI32(pEncoder, pReq->msgId) < 0) return -1; + if (tEncodeI64(pEncoder, pReq->ts) < 0) return -1; tEndEncode(pEncoder); return pEncoder->pos; } @@ -454,6 +455,7 @@ int32_t tDecodeStreamHbMsg(SDecoder* pDecoder, SStreamHbMsg* pReq) { } if (tDecodeI32(pDecoder, &pReq->msgId) < 0) return -1; + if (tDecodeI64(pDecoder, &pReq->ts) < 0) return -1; tEndDecode(pDecoder); return 0; diff --git a/tests/script/tsim/stream/checkpointInterval0.sim b/tests/script/tsim/stream/checkpointInterval0.sim index a548f05c82..a5e5c87704 100644 --- a/tests/script/tsim/stream/checkpointInterval0.sim +++ b/tests/script/tsim/stream/checkpointInterval0.sim @@ -76,6 +76,8 @@ system sh/stop_dnodes.sh system sh/exec.sh -n dnode1 -s start +run tsim/stream/checkTaskStatus.sim + sql insert into t1 values(1648791213002,3,2,3,1.1); $loop_count = 0 From 298a72bd8e4059c36b3010720c940671d4f1eccf Mon Sep 17 00:00:00 2001 From: charles Date: Thu, 1 Aug 2024 15:40:28 +0800 Subject: [PATCH 018/103] add test case for ts-5239 by charles --- tests/army/query/queryBugs.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/army/query/queryBugs.py b/tests/army/query/queryBugs.py index 20ecb23881..a7a6f35372 100644 --- a/tests/army/query/queryBugs.py +++ b/tests/army/query/queryBugs.py @@ -113,6 +113,34 @@ class TDTestCase(TBase): tdSql.checkData(0, 0, f"nihao{num + 2}") tdSql.checkData(0, 1, f"{11*i}") + def FIX_TS_5239(self): + tdLog.info("check bug TS_5239 ...\n") + sqls = [ + "drop database if exists ts_5239", + "create database ts_5239 cachemodel 'both' stt_trigger 1;", + "use ts_5239;", + "CREATE STABLE st (ts timestamp, c1 int) TAGS (groupId int);", + "CREATE TABLE ct1 USING st TAGS (1);" + ] + tdSql.executes(sqls) + # 2024-07-03 06:00:00.000 + start_ts = 1719957600000 + # insert 100 rows + sql = "insert into ct1 values " + for i in range(100): + sql += f"('{start_ts+i * 100}', {i+1})" + sql += ";" + tdSql.execute(sql) + tdSql.execute("flush database ts_5239;") + tdSql.execute("alter database ts_5239 stt_trigger 3;") + tdSql.execute(f"insert into ct1(ts) values({start_ts - 100 * 100})") + tdSql.execute("flush database ts_5239;") + tdSql.execute(f"insert into ct1(ts) values({start_ts + 100 * 200})") + tdSql.execute("flush database ts_5239;") + tdSql.query("select count(*) from ct1;") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 102) + # run def run(self): tdLog.debug(f"start to excute {__file__}") @@ -123,6 +151,7 @@ class TDTestCase(TBase): # TS BUGS self.FIX_TS_5105() self.FIX_TS_5143() + self.FIX_TS_5239() tdLog.success(f"{__file__} successfully executed") From 6539760c647292f9c98988120eb43dc95962dc42 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 31 Jul 2024 19:15:17 +0800 Subject: [PATCH 019/103] fix(stream): fix dead lock caused by refactor. 
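The deadlock came from early-return paths introduced by the recent error-handling refactor: several loops over pReadyMsgList / pDispatchTriggerList returned on a NULL array element while still holding pInfo->lock, and streamTaskAlreadySendTrigger() unlocked pTask->lock instead of the pInfo->lock it had actually taken. A self-contained, hedged sketch of the discipline the fix restores, with stand-in names and a plain pthread mutex in place of streamMutex:

/* illustrative only: every exit path must release exactly the mutex it acquired */
#include <pthread.h>
#include <stddef.h>

typedef struct { int id; } SEntry;

static pthread_mutex_t infoLock = PTHREAD_MUTEX_INITIALIZER;

static SEntry *getEntry(SEntry *pList, int num, int idx) {
  return (pList == NULL || idx < 0 || idx >= num) ? NULL : &pList[idx];
}

static int findEntry(SEntry *pList, int num, int target) {
  pthread_mutex_lock(&infoLock);

  for (int i = 0; i < num; ++i) {
    SEntry *p = getEntry(pList, num, i);
    if (p == NULL) {
      pthread_mutex_unlock(&infoLock); /* error return still releases the lock ... */
      return -1;
    }
    if (p->id == target) {
      pthread_mutex_unlock(&infoLock); /* ... and so does the early "found" return */
      return i;
    }
  }

  pthread_mutex_unlock(&infoLock); /* normal fall-through path */
  return -1;
}

int main(void) {
  SEntry list[3] = {{1}, {2}, {3}};
  return findEntry(list, 3, 2) == 1 ? 0 : 1;
}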
--- source/libs/stream/src/streamCheckpoint.c | 13 +++++++++---- source/libs/stream/src/streamDispatch.c | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 12453f8b0e..d777883015 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -402,6 +402,7 @@ int32_t streamProcessCheckpointReadyMsg(SStreamTask* pTask, int64_t checkpointId for (int32_t i = 0; i < size; ++i) { STaskDownstreamReadyInfo* p = taosArrayGet(pInfo->pCheckpointReadyRecvList, i); if (p == NULL) { + streamMutexUnlock(&pInfo->lock); return TSDB_CODE_INVALID_PARA; } @@ -445,6 +446,7 @@ int32_t streamTaskProcessCheckpointReadyRsp(SStreamTask* pTask, int32_t upstream for (int32_t i = 0; i < taosArrayGetSize(pInfo->pReadyMsgList); ++i) { STaskCheckpointReadyInfo* pReadyInfo = taosArrayGet(pInfo->pReadyMsgList, i); if (pReadyInfo == NULL) { + streamMutexUnlock(&pInfo->lock); return TSDB_CODE_INVALID_PARA; } @@ -459,6 +461,7 @@ int32_t streamTaskProcessCheckpointReadyRsp(SStreamTask* pTask, int32_t upstream for (int32_t i = 0; i < taosArrayGetSize(pInfo->pReadyMsgList); ++i) { STaskCheckpointReadyInfo* pReadyInfo = taosArrayGet(pInfo->pReadyMsgList, i); if (pReadyInfo == NULL) { + streamMutexUnlock(&pInfo->lock); return TSDB_CODE_INVALID_PARA; } @@ -843,7 +846,7 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { SArray* pNotSendList = taosArrayInit(4, sizeof(SStreamUpstreamEpInfo)); if (pNotSendList == NULL) { terrno = TSDB_CODE_OUT_OF_MEMORY; - stError("s-task:%s quit tmr function due to out of memory", id); + stDebug("s-task:%s start to triggerMonitor, reason:%s", id, tstrerror(terrno)); streamMutexUnlock(&pActiveInfo->lock); stDebug("s-task:%s start to monitor checkpoint-trigger in 10s", id); @@ -956,13 +959,14 @@ bool streamTaskAlreadySendTrigger(SStreamTask* pTask, int32_t downstreamNodeId) streamMutexLock(&pInfo->lock); if (!pInfo->dispatchTrigger) { - streamMutexUnlock(&pTask->lock); + streamMutexUnlock(&pInfo->lock); return false; } for (int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { STaskTriggerSendInfo* pSendInfo = taosArrayGet(pInfo->pDispatchTriggerList, i); if (pSendInfo == NULL) { + streamMutexUnlock(&pInfo->lock); return TSDB_CODE_INVALID_PARA; } @@ -982,11 +986,11 @@ bool streamTaskAlreadySendTrigger(SStreamTask* pTask, int32_t downstreamNodeId) id, pSendInfo->sendTs, before, pInfo->activeId, pInfo->transId); } - streamMutexUnlock(&pTask->lock); + streamMutexUnlock(&pInfo->lock); return true; } - ASSERT(0); + streamMutexUnlock(&pInfo->lock); return false; } @@ -1043,6 +1047,7 @@ int32_t streamTaskGetNumOfConfirmed(SActiveCheckpointInfo* pInfo) { for (int32_t i = 0; i < taosArrayGetSize(pInfo->pDispatchTriggerList); ++i) { STaskTriggerSendInfo* p = taosArrayGet(pInfo->pDispatchTriggerList, i); if (p == NULL) { + streamMutexUnlock(&pInfo->lock); return num; } diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 96a9f2d297..f6e827b745 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -919,8 +919,8 @@ int32_t streamTaskSendCheckpointReadyMsg(SStreamTask* pTask) { STaskCheckpointReadyInfo* pInfo = taosArrayGet(pList, i); SRpcMsg msg = {0}; - int32_t code = initCheckpointReadyMsg(pTask, pInfo->upstreamNodeId, pInfo->upstreamTaskId, pInfo->childId, pInfo->checkpointId, - &msg); + int32_t code = initCheckpointReadyMsg(pTask, 
pInfo->upstreamNodeId, pInfo->upstreamTaskId, pInfo->childId, + pInfo->checkpointId, &msg); if (code == TSDB_CODE_SUCCESS) { code = tmsgSendReq(&pInfo->upstreamNodeEpset, &msg); if (code == TSDB_CODE_SUCCESS) { From e8f6454d179fa3f754350b80b77ccb12c9cb431b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 31 Jul 2024 19:24:41 +0800 Subject: [PATCH 020/103] fix(stream): compare vg replica according to different db. --- source/dnode/mnode/impl/src/mndStreamUtil.c | 114 +++++++++++++------- 1 file changed, 74 insertions(+), 40 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index c375b46627..b5a612f058 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -88,18 +88,48 @@ void destroyStreamTaskIter(SStreamTaskIter* pIter) { taosMemoryFree(pIter); } -int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { - SSdb *pSdb = pMnode->pSdb; - void *pIter = NULL; - SVgObj *pVgroup = NULL; - int32_t replica = -1; // do the replica check - int32_t code = 0; +static bool checkStatusForEachReplica(SVgObj *pVgroup) { + for (int32_t i = 0; i < pVgroup->replica; ++i) { + if (!pVgroup->vnodeGid[i].syncRestore) { + mInfo("vgId:%d not restored, not ready for checkpoint or other operations", pVgroup->vgId); + return false; + } + + ESyncState state = pVgroup->vnodeGid[i].syncState; + if (state == TAOS_SYNC_STATE_OFFLINE || state == TAOS_SYNC_STATE_ERROR || state == TAOS_SYNC_STATE_LEARNER || + state == TAOS_SYNC_STATE_CANDIDATE) { + mInfo("vgId:%d state:%d , not ready for checkpoint or other operations, not check other vgroups", pVgroup->vgId, + state); + return false; + } + } + + return true; +} + +int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { + SSdb *pSdb = pMnode->pSdb; + void *pIter = NULL; + SVgObj *pVgroup = NULL; + int32_t code = 0; + SArray *pVgroupList = NULL; + SHashObj *pHash = NULL; + + pVgroupList = taosArrayInit(4, sizeof(SNodeEntry)); + if (pVgroupList == NULL) { + mError("failed to prepare arraylist during take vgroup snapshot, code:%s", tstrerror(terrno)); + code = terrno; + goto _err; + } + + pHash = taosHashInit(10, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_NO_LOCK); + if (pHash == NULL) { + mError("failed to prepare hashmap during take vgroup snapshot, code:%s", tstrerror(terrno)); + code = terrno; + goto _err; + } *allReady = true; - SArray *pVgroupList = taosArrayInit(4, sizeof(SNodeEntry)); - if (pVgroupList == NULL) { - return terrno; - } while (1) { pIter = sdbFetch(pSdb, SDB_VGROUP, pIter, (void **)&pVgroup); @@ -110,44 +140,37 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { SNodeEntry entry = {.nodeId = pVgroup->vgId, .hbTimestamp = pVgroup->updateTime}; entry.epset = mndGetVgroupEpset(pMnode, pVgroup); - if (replica == -1) { - replica = pVgroup->replica; - } else { - if (replica != pVgroup->replica) { - mInfo("vgId:%d replica:%d inconsistent with other vgroups replica:%d, not ready for stream operations", - pVgroup->vgId, pVgroup->replica, replica); - *allReady = false; + int8_t *pReplica = taosHashGet(pHash, &pVgroup->dbUid, sizeof(pVgroup->dbUid)); + if (pReplica == NULL) { // not exist, add it into hash map + code = taosHashPut(pHash, &pVgroup->dbUid, sizeof(pVgroup->dbUid), &pVgroup->replica, sizeof(pVgroup->replica)); + if (code) { + mError("failed to put info into hashmap during task vgroup snapshot, code:%s", tstrerror(code)); 
sdbRelease(pSdb, pVgroup); - break; + goto _err; // take snapshot failed, and not all ready + } + } else { + if (*pReplica != pVgroup->replica) { + mInfo("vgId:%d replica:%d inconsistent with other vgroups replica:%d, not ready for stream operations", + pVgroup->vgId, pVgroup->replica, *pReplica); + *allReady = false; // task snap success, but not all ready } } // if not all ready till now, no need to check the remaining vgroups. + // but still we need to put the info of the existed vgroups into the snapshot list if (*allReady) { - for (int32_t i = 0; i < pVgroup->replica; ++i) { - if (!pVgroup->vnodeGid[i].syncRestore) { - mInfo("vgId:%d not restored, not ready for checkpoint or other operations", pVgroup->vgId); - *allReady = false; - break; - } - - ESyncState state = pVgroup->vnodeGid[i].syncState; - if (state == TAOS_SYNC_STATE_OFFLINE || state == TAOS_SYNC_STATE_ERROR || state == TAOS_SYNC_STATE_LEARNER || - state == TAOS_SYNC_STATE_CANDIDATE) { - mInfo("vgId:%d state:%d , not ready for checkpoint or other operations, not check other vgroups", - pVgroup->vgId, state); - *allReady = false; - break; - } - } + *allReady = checkStatusForEachReplica(pVgroup); } char buf[256] = {0}; - (void) epsetToStr(&entry.epset, buf, tListLen(buf)); + (void)epsetToStr(&entry.epset, buf, tListLen(buf)); - void* p = taosArrayPush(pVgroupList, &entry); + void *p = taosArrayPush(pVgroupList, &entry); if (p == NULL) { mError("failed to put entry in vgroup list, nodeId:%d code:out of memory", entry.nodeId); + code = terrno; + sdbRelease(pSdb, pVgroup); + goto _err; } else { mDebug("take node snapshot, nodeId:%d %s", entry.nodeId, buf); } @@ -166,15 +189,19 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { code = addEpIntoEpSet(&entry.epset, pObj->pDnode->fqdn, pObj->pDnode->port); if (code) { sdbRelease(pSdb, pObj); - continue; + mError("failed to extract epset for fqdn:%s during task vgroup snapshot", pObj->pDnode->fqdn); + goto _err; } char buf[256] = {0}; - (void) epsetToStr(&entry.epset, buf, tListLen(buf)); + (void)epsetToStr(&entry.epset, buf, tListLen(buf)); - void* p = taosArrayPush(pVgroupList, &entry); + void *p = taosArrayPush(pVgroupList, &entry); if (p == NULL) { - mError("failed to put entry in vgroup list, nodeId:%d code:out of memory", entry.nodeId); + code = terrno; + sdbRelease(pSdb, pObj); + mError("failed to put entry in vgroup list, nodeId:%d code:%s", entry.nodeId, tstrerror(code)); + goto _err; } else { mDebug("take snode snapshot, nodeId:%d %s", entry.nodeId, buf); } @@ -184,6 +211,13 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray** pList) { *pList = pVgroupList; return code; + +_err: + *allReady = false; + taosArrayDestroy(pVgroupList); + taosHashCleanup(pHash); + + return code; } int32_t mndGetStreamObj(SMnode *pMnode, int64_t streamId, SStreamObj **pStream) { From 400ed18c6ae9e442482a221b8f1ecb014e69d14d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 31 Jul 2024 19:55:26 +0800 Subject: [PATCH 021/103] fix(stream): fix memory leak. 
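mndTakeVgroupSnapshot() allocates a temporary hash map (pHash) to compare per-database replica counts; the previous commit released it only on the _err path, so every successful snapshot leaked it. The one-line fix below frees it on the success path as well. A small hedged sketch of the pattern, with plain malloc'd buffers standing in for the SHashObj and the returned vgroup list:

/* illustrative only; the buffers stand in for the temporary hash map and the result array */
#include <stdlib.h>

static int takeSnapshot(int **ppResult) {
  int *pScratch = malloc(16 * sizeof(int)); /* temporary, never handed to the caller */
  int *pResult  = malloc(16 * sizeof(int)); /* ownership passes to the caller on success */
  if (pScratch == NULL || pResult == NULL) {
    free(pScratch); /* error path frees both */
    free(pResult);
    return -1;
  }

  /* ... build pResult, consulting pScratch along the way ... */

  *ppResult = pResult;
  free(pScratch); /* the success path must free the scratch structure too */
  return 0;
}

int main(void) {
  int *pRes = NULL;
  int  rc   = takeSnapshot(&pRes);
  free(pRes);
  return rc == 0 ? 0 : 1;
}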
--- source/dnode/mnode/impl/src/mndStreamUtil.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index b5a612f058..7b4b82fcfe 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -210,6 +210,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { } *pList = pVgroupList; + taosHashCleanup(pHash); return code; _err: From 3a1a528028bc284bda5479b342b7ea2b12a8f1bc Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 09:21:54 +0800 Subject: [PATCH 022/103] fix(stream): cancel fetch --- source/dnode/mnode/impl/src/mndStreamUtil.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 7b4b82fcfe..030b14ea0d 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -146,6 +146,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { if (code) { mError("failed to put info into hashmap during task vgroup snapshot, code:%s", tstrerror(code)); sdbRelease(pSdb, pVgroup); + sdbCancelFetch(pSdb, pIter); goto _err; // take snapshot failed, and not all ready } } else { @@ -170,6 +171,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { mError("failed to put entry in vgroup list, nodeId:%d code:out of memory", entry.nodeId); code = terrno; sdbRelease(pSdb, pVgroup); + sdbCancelFetch(pSdb, pIter); goto _err; } else { mDebug("take node snapshot, nodeId:%d %s", entry.nodeId, buf); @@ -189,6 +191,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { code = addEpIntoEpSet(&entry.epset, pObj->pDnode->fqdn, pObj->pDnode->port); if (code) { sdbRelease(pSdb, pObj); + sdbCancelFetch(pSdb, pIter); mError("failed to extract epset for fqdn:%s during task vgroup snapshot", pObj->pDnode->fqdn); goto _err; } @@ -200,6 +203,7 @@ int32_t mndTakeVgroupSnapshot(SMnode *pMnode, bool *allReady, SArray **pList) { if (p == NULL) { code = terrno; sdbRelease(pSdb, pObj); + sdbCancelFetch(pSdb, pIter); mError("failed to put entry in vgroup list, nodeId:%d code:%s", entry.nodeId, tstrerror(code)); goto _err; } else { From ede7f23b0e1e40887d2a6a398192a8c4bc3bbe7a Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 09:31:51 +0800 Subject: [PATCH 023/103] fix(stream): fix dead lock caused by refactor. --- source/libs/stream/src/streamCheckpoint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index d777883015..7b205a16a1 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -253,6 +253,7 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock for (int32_t i = 0; i < taosArrayGetSize(pActiveInfo->pReadyMsgList); ++i) { STaskCheckpointReadyInfo* p = taosArrayGet(pActiveInfo->pReadyMsgList, i); if (p == NULL) { + streamMutexUnlock(&pTask->lock); return TSDB_CODE_INVALID_PARA; } From 2dae0bf423604ca118f4a5e56407c04ee9d209ae Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 14:15:08 +0800 Subject: [PATCH 024/103] fix(stream): add more check. 
--- source/libs/stream/src/streamTask.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index e8ff1552e8..040bbb4f00 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -602,9 +602,9 @@ int32_t streamTaskStop(SStreamTask* pTask) { stError("failed to handle STOP event, s-task:%s", id); } - if ((pTask->info.taskLevel != TASK_LEVEL__SINK) && (pTask->exec.pExecutor != NULL)) { + if (pTask->info.taskLevel != TASK_LEVEL__SINK && pTask->exec.pExecutor != NULL) { code = qKillTask(pTask->exec.pExecutor, TSDB_CODE_SUCCESS); - if (code) { + if (code != TSDB_CODE_SUCCESS) { stError("s-task:%s failed to kill task related query handle", id); } } From 3e1074aea37aed264cbf79d1bb4f5d167912367d Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 15:48:09 +0800 Subject: [PATCH 025/103] fix(stream): set correct return value. --- source/libs/stream/src/streamMeta.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 8c5faf006f..fe4b626325 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1211,14 +1211,17 @@ void streamMetaWUnLock(SStreamMeta* pMeta) { } int32_t streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta, SArray** pList) { - *pList = NULL; + QRY_OPTR_CHECK(pList); + int32_t code = 0; SArray* pTaskList = taosArrayDup(pMeta->pTaskList, NULL); if (pTaskList == NULL) { stError("failed to generate the task list during send hbMsg to mnode, vgId:%d, code: out of memory", pMeta->vgId); - return TSDB_CODE_OUT_OF_MEMORY; + return terrno; } + *pList = pTaskList; + bool sendMsg = pMeta->sendMsgBeforeClosing; if (!sendMsg) { stDebug("vgId:%d no need to send msg to mnode before closing tasks", pMeta->vgId); @@ -1251,9 +1254,9 @@ int32_t streamMetaSendMsgBeforeCloseTasks(SStreamMeta* pMeta, SArray** pList) { streamMetaReleaseTask(pMeta, pTask); } - code = streamMetaSendHbHelper(pMeta); + (void)streamMetaSendHbHelper(pMeta); pMeta->sendMsgBeforeClosing = false; - return code; + return TSDB_CODE_SUCCESS; // always return true } void streamMetaUpdateStageRole(SStreamMeta* pMeta, int64_t stage, bool isLeader) { From c5bac71b3224a479c1d20dba3101d3800fabc460 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 15:59:57 +0800 Subject: [PATCH 026/103] fix(stream): remove invalid return code check. 
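The check-and-return on streamTaskUpdateEpsetInfo() is dropped here, presumably because its failure should not abort building the epset-update transaction; as with streamMetaSendHbHelper() in the previous commit, the call is cast to (void) to mark the return value as intentionally discarded rather than forgotten. A hedged sketch of that convention, with hypothetical helper names:

/* illustrative convention sketch; refreshCache/handleRequest are hypothetical names */
#include <stdio.h>

static int refreshCache(int key) { /* best-effort helper; its failure is tolerable */
  return (key % 2 == 0) ? 0 : -1;
}

static int handleRequest(int key) {
  (void)refreshCache(key); /* explicitly discarded: a stale cache must not fail the request */
  printf("request %d handled\n", key);
  return 0; /* the caller's own status is what gets propagated */
}

int main(void) {
  return handleRequest(7);
}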
--- source/dnode/mnode/impl/src/mndStreamUtil.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 030b14ea0d..512862b37e 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -638,12 +638,9 @@ static int32_t doBuildStreamTaskUpdateMsg(void **pBuf, int32_t *pLen, SVgroupCha static int32_t doSetUpdateTaskAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTask, SVgroupChangeInfo *pInfo) { void *pBuf = NULL; int32_t len = 0; - int32_t code = streamTaskUpdateEpsetInfo(pTask, pInfo->pUpdateNodeList); - if (code) { - return code; - } + (void)streamTaskUpdateEpsetInfo(pTask, pInfo->pUpdateNodeList); - code = doBuildStreamTaskUpdateMsg(&pBuf, &len, pInfo, pTask->info.nodeId, &pTask->id, pTrans->id); + int32_t code = doBuildStreamTaskUpdateMsg(&pBuf, &len, pInfo, pTask->info.nodeId, &pTask->id, pTrans->id); if (code) { return code; } From 3b2d1ae101e2e97df84500bdf4287f4297d5bad1 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Thu, 1 Aug 2024 18:32:01 +0800 Subject: [PATCH 027/103] fix(test): wait for a little longer. --- tests/system-test/8-stream/state_window_case.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system-test/8-stream/state_window_case.py b/tests/system-test/8-stream/state_window_case.py index 5ecf8d7832..3015b0db42 100644 --- a/tests/system-test/8-stream/state_window_case.py +++ b/tests/system-test/8-stream/state_window_case.py @@ -30,14 +30,14 @@ class TDTestCase: tdSql.execute("CREATE STREAM stream_device_alarm2 TRIGGER AT_ONCE DELETE_MARK 30d INTO st_device_alarm2 tags(factory_id varchar(20), device_code varchar(80), var_name varchar(200))\ as select _wstart start_time, last(load_time) end_time, first(var_value) var_value, 1 state_flag from st_variable_data\ PARTITION BY tbname tname, factory_id, device_code, var_name STATE_WINDOW(case when lower(var_value)=lower(trigger_value) then '1' else '0' end)") - time.sleep(2) + time.sleep(5) def insert_data(self): try: tdSql.execute("insert into aaa values('2024-07-15 14:00:00', '2024-07-15 14:00:00', 'a8')", queryTimes=5, show=True) time.sleep(0.01) tdSql.execute("insert into aaa values('2024-07-15 14:10:00', '2024-07-15 14:10:00', 'a9')", queryTimes=5, show=True) - time.sleep(1) + time.sleep(5) except Exception as error: tdLog.exit(f"insert data failed {error}") From 02b59d0b33abf19562c3b92a4561b8822c94c168 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sat, 3 Aug 2024 16:34:26 +0800 Subject: [PATCH 028/103] fix(stream): add more check in tmr. 
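checkpointTriggerMonitorFn() and checkpointReadyMsgSendMonitorFn() are armed for one specific checkpoint, but the timer can still fire after that checkpoint failed or a newer one became active; the extra checks make the callback re-validate the checkpoint it was launched for (launchChkptId vs. the current activeId, plus a cleared active-info) before touching shared state, and the stage-mismatch path in streamTaskCheckStatus() no longer marks the active checkpoint as failed. A minimal hedged sketch of the stale-timer guard, with stand-in structs and field names:

/* hedged sketch only; struct and field names here are illustrative */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  int64_t activeId; /* 0 when no checkpoint is in progress */
  int64_t transId;
} SActiveInfo;

typedef struct {
  int64_t launchChkptId; /* checkpoint that armed this timer */
} STmrInfo;

/* returns true only when the timer body may still act on the shared state */
static bool tmrStillValid(const STmrInfo *pTmr, const SActiveInfo *pActive) {
  if (pActive->activeId == 0 || pActive->transId == 0) {
    printf("active checkpoint already cleared, quit timer\n");
    return false;
  }
  if (pTmr->launchChkptId != pActive->activeId) {
    printf("timer armed by an older checkpoint, quit timer\n");
    return false;
  }
  return true;
}

int main(void) {
  SActiveInfo active = {.activeId = 5, .transId = 11};
  STmrInfo    tmr    = {.launchChkptId = 4};
  return tmrStillValid(&tmr, &active) ? 1 : 0; /* exits 0: the stale timer is rejected */
}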
--- source/libs/stream/src/streamCheckStatus.c | 7 ------- source/libs/stream/src/streamCheckpoint.c | 22 ++++++++++++++++++++++ source/libs/stream/src/streamDispatch.c | 4 ++-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/source/libs/stream/src/streamCheckStatus.c b/source/libs/stream/src/streamCheckStatus.c index c9ba6ffcfe..b7661e72d4 100644 --- a/source/libs/stream/src/streamCheckStatus.c +++ b/source/libs/stream/src/streamCheckStatus.c @@ -74,13 +74,6 @@ int32_t streamTaskCheckStatus(SStreamTask* pTask, int32_t upstreamTaskId, int32_ } if (pInfo->stage != stage) { - streamMutexLock(&pTask->lock); - ETaskStatus status = streamTaskGetStatus(pTask).state; - if (status == TASK_STATUS__CK) { - streamTaskSetFailedCheckpointId(pTask); - } - streamMutexUnlock(&pTask->lock); - return TASK_UPSTREAM_NEW_STAGE; } else if (pTask->status.downstreamReady != 1) { stDebug("s-task:%s vgId:%d leader:%d, downstream not ready", id, vgId, (pTask->pMeta->role == NODE_ROLE_LEADER)); diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 7b205a16a1..d638e28c8d 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -855,6 +855,28 @@ void checkpointTriggerMonitorFn(void* param, void* tmrId) { return; } + if ((pTmrInfo->launchChkptId != pActiveInfo->activeId) || (pActiveInfo->activeId == 0)) { + streamMutexUnlock(&pActiveInfo->lock); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); + stWarn("s-task:%s vgId:%d checkpoint-trigger retrieve by previous checkpoint procedure, checkpointId:%" PRId64 + ", quit, ref:%d", + id, vgId, pTmrInfo->launchChkptId, ref); + + streamMetaReleaseTask(pTask->pMeta, pTask); + return; + } + + // active checkpoint info is cleared for now + if ((pActiveInfo->activeId == 0) || (pActiveInfo->transId == 0) || (pTask->chkInfo.startTs == 0)) { + streamMutexUnlock(&pActiveInfo->lock); + int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); + stWarn("s-task:%s vgId:%d active checkpoint may be cleared, quit from retrieve checkpoint-trigger send tmr, ref:%d", + id, vgId, ref); + + streamMetaReleaseTask(pTask->pMeta, pTask); + return; + } + for (int32_t i = 0; i < taosArrayGetSize(pList); ++i) { SStreamUpstreamEpInfo* pInfo = taosArrayGetP(pList, i); diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index f6e827b745..010f6f006f 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -820,7 +820,7 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { SArray* pList = pActiveInfo->pReadyMsgList; int32_t num = taosArrayGetSize(pList); - if (pTmrInfo->launchChkptId < pActiveInfo->activeId) { + if (pTmrInfo->launchChkptId != pActiveInfo->activeId) { streamMutexUnlock(&pActiveInfo->lock); int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stWarn("s-task:%s vgId:%d ready-msg send tmr launched by previous checkpoint procedure, checkpointId:%" PRId64 @@ -832,7 +832,7 @@ static void checkpointReadyMsgSendMonitorFn(void* param, void* tmrId) { } // active checkpoint info is cleared for now - if ((pActiveInfo->activeId == 0) && (pActiveInfo->transId == 0) && (num == 0) && (pTask->chkInfo.startTs == 0)) { + if ((pActiveInfo->activeId == 0) || (pActiveInfo->transId == 0) || (num == 0) || (pTask->chkInfo.startTs == 0)) { streamMutexUnlock(&pActiveInfo->lock); int32_t ref = streamCleanBeforeQuitTmr(pTmrInfo, pTask); stWarn("s-task:%s vgId:%d active checkpoint may be 
cleared, quit from readyMsg send tmr, ref:%d", id, vgId, ref); From 170a074de829155ba336dabca09caaaa0f3c75a5 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Sun, 4 Aug 2024 11:37:23 +0800 Subject: [PATCH 029/103] fix(stream): add check for checkpointId in retrieve-checkpoint id msg. --- source/dnode/vnode/src/tqCommon/tqCommon.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index b56c474ed5..11d38dde87 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -989,7 +989,12 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) int64_t checkpointId = 0; streamTaskGetActiveCheckpointInfo(pTask, &transId, &checkpointId); - ASSERT(checkpointId == pReq->checkpointId); + if (checkpointId != pReq->checkpointId) { + tqError("s-task:%s invalid checkpoint-trigger retrieve msg from %x, current checkpointId:%"PRId64" req:%"PRId64, + pTask->id.idStr, pReq->downstreamTaskId, checkpointId, pReq->checkpointId); + streamMetaReleaseTask(pMeta, pTask); + return TSDB_CODE_INVALID_MSG; + } if (streamTaskAlreadySendTrigger(pTask, pReq->downstreamNodeId)) { // re-send the lost checkpoint-trigger msg to downstream task From 79d4596b72f1c55b2e073c4faff0be4b092556dc Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 5 Aug 2024 19:26:42 +0800 Subject: [PATCH 030/103] fix(stream): fix syntax error. --- source/dnode/vnode/src/tqCommon/tqCommon.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index a77d3462de..ba911fa76d 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -997,8 +997,9 @@ int32_t tqStreamTaskProcessRetrieveTriggerReq(SStreamMeta* pMeta, SRpcMsg* pMsg) streamTaskGetActiveCheckpointInfo(pTask, &transId, &checkpointId); if (checkpointId != pReq->checkpointId) { - tqError("s-task:%s invalid checkpoint-trigger retrieve msg from %x, current checkpointId:%"PRId64" req:%"PRId64, - pTask->id.idStr, pReq->downstreamTaskId, checkpointId, pReq->checkpointId); + tqError("s-task:%s invalid checkpoint-trigger retrieve msg from 0x%" PRIx64 ", current checkpointId:%" PRId64 + " req:%" PRId64, + pTask->id.idStr, pReq->downstreamTaskId, checkpointId, pReq->checkpointId); streamMetaReleaseTask(pMeta, pTask); return TSDB_CODE_INVALID_MSG; } From 94a2ea1ad5835a8dbef7595752ac5720bb6bacad Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Mon, 5 Aug 2024 19:32:29 +0800 Subject: [PATCH 031/103] fix(stream): clear the freed ptr --- source/libs/stream/src/streamBackendRocksdb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 207bcdcac5..a408ef5872 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -910,6 +910,7 @@ void streamBackendCleanup(void* arg) { if (pHandle->db) { rocksdb_close(pHandle->db); + pHandle->db = NULL; } rocksdb_options_destroy(pHandle->dbOpt); rocksdb_env_destroy(pHandle->env); @@ -2508,6 +2509,7 @@ STaskDbWrapper* taskDbOpenImpl(const char* key, char* statePath, char* dbPath) { } rocksdb_close(pTaskDb->db); + pTaskDb->db = NULL; if (cfNames != NULL) { rocksdb_list_column_families_destroy(cfNames, nCf); @@ -2617,6 +2619,7 @@ void taskDbDestroy(void* pDb, bool flush) { if (wrapper->db) { 
rocksdb_close(wrapper->db); + wrapper->db = NULL; } rocksdb_options_destroy(wrapper->dbOpt); From 255faa0eac8688dd0fa07e19970640736c08cd26 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 7 Aug 2024 11:43:38 +0800 Subject: [PATCH 032/103] fix(stream): update acceptable code. --- source/dnode/mnode/impl/src/mndStreamUtil.c | 2 +- source/libs/stream/src/streamBackendRocksdb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 512862b37e..739bb0ca37 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -542,7 +542,7 @@ static int32_t doSetDropActionFromId(SMnode *pMnode, STrans *pTrans, SOrphanTask } // The epset of nodeId of this task may have been expired now, let's use the newest epset from mnode. - code = setTransAction(pTrans, pReq, sizeof(SVDropStreamTaskReq), TDMT_STREAM_TASK_DROP, &epset, 0, 0); + code = setTransAction(pTrans, pReq, sizeof(SVDropStreamTaskReq), TDMT_STREAM_TASK_DROP, &epset, 0, TSDB_CODE_VND_INVALID_VGROUP_ID); if (code != 0) { taosMemoryFree(pReq); return code; diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index a80b8ef4eb..fa09191854 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -683,7 +683,7 @@ static int32_t rebuildFromLocalCheckpoint(const char* pTaskIdStr, const char* ch defaultPath); } } else { - code = TSDB_CODE_FAILED; + code = terrno; stError("%s no valid data for checkpointId:%" PRId64 " in %s", pTaskIdStr, checkpointId, checkpointPath); } From 1fbb3a63bcdc5311e250598fe71de466576a3bdc Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 7 Aug 2024 14:59:39 +0800 Subject: [PATCH 033/103] refactor: do some internal refactor. --- source/libs/stream/src/streamBackendRocksdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index fa09191854..41bacab667 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -763,7 +763,7 @@ int32_t restoreCheckpointData(const char* path, const char* key, int64_t chkptId } if (code != 0) { - stError("failed to start stream backend at %s, restart from default defaultPath:%s, reason:%s", checkpointPath, + stError("failed to start stream backend at %s, restart from defaultPath:%s, reason:%s", checkpointPath, defaultPath, tstrerror(code)); code = 0; // reset the error code } From 26a770f61edb44ea131e7cc15eb19a2950077e35 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 7 Aug 2024 15:54:48 +0800 Subject: [PATCH 034/103] fix(stream):update log. 
--- source/libs/stream/src/streamHb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index a158d6e4bb..8513a8ba06 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -293,7 +293,7 @@ void streamMetaHbToMnode(void* param, void* tmrId) { streamMetaRLock(pMeta); code = streamMetaSendHbHelper(pMeta); if (code) { - stError("vgId:%d failed to send hmMsg to mnode, try again in 5s, code:%s", pMeta->vgId, strerror(code)); + stError("vgId:%d failed to send hmMsg to mnode, try again in 5s, code:%s", pMeta->vgId, tstrerror(code)); } streamMetaRUnLock(pMeta); From 31c21f6f6d8d27281c783904300f6901bd1834ed Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Wed, 7 Aug 2024 16:01:28 +0800 Subject: [PATCH 035/103] fix(rpc): update log. --- source/dnode/mgmt/node_mgmt/src/dmTransport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dnode/mgmt/node_mgmt/src/dmTransport.c b/source/dnode/mgmt/node_mgmt/src/dmTransport.c index 3d758e1fd3..cc57b04d47 100644 --- a/source/dnode/mgmt/node_mgmt/src/dmTransport.c +++ b/source/dnode/mgmt/node_mgmt/src/dmTransport.c @@ -109,7 +109,7 @@ static void dmProcessRpcMsg(SDnode *pDnode, SRpcMsg *pRpc, SEpSet *pEpSet) { int32_t svrVer = 0; (void)taosVersionStrToInt(version, &svrVer); if ((code = taosCheckVersionCompatible(pRpc->info.cliVer, svrVer, 3)) != 0) { - dError("Version not compatible, cli ver: %d, svr ver: %d", pRpc->info.cliVer, svrVer); + dError("Version not compatible, cli ver: %d, svr ver: %d, ip:0x%x", pRpc->info.cliVer, svrVer, pRpc->info.conn.clientIp); goto _OVER; } From ea715a21b1b96dd64d98cfaa6b78e014f4b759bf Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Wed, 7 Aug 2024 15:25:27 +0800 Subject: [PATCH 036/103] fix invalid remove --- source/libs/stream/src/streamBackendRocksdb.c | 3 +++ source/libs/stream/src/streamTask.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 41bacab667..537aa72d91 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1144,6 +1144,8 @@ int32_t chkpMayDelObsolete(void* arg, int64_t chkpId, char* path) { int64_t id = *(int64_t*)taosArrayGet(chkpDel, i); char tbuf[256] = {0}; sprintf(tbuf, "%s%scheckpoint%" PRId64 "", path, TD_DIRSEP, id); + + stInfo("backend remove obsolete checkpoint: %s", tbuf); if (taosIsDir(tbuf)) { taosRemoveDir(tbuf); } @@ -2661,6 +2663,7 @@ void taskDbDestroy(void* pDb, bool flush) { if (wrapper->removeAllFiles) { char* err = NULL; + stInfo("drop task remove backend dat:%s", wrapper->path); taosRemoveDir(wrapper->path); } taosMemoryFree(wrapper->path); diff --git a/source/libs/stream/src/streamTask.c b/source/libs/stream/src/streamTask.c index c7f3bd264d..c0b2b16d30 100644 --- a/source/libs/stream/src/streamTask.c +++ b/source/libs/stream/src/streamTask.c @@ -321,7 +321,7 @@ void streamFreeTaskState(SStreamTask* pTask, int8_t remove) { stDebug("s-task:0x%x start to free task state", pTask->id.taskId); streamStateClose(pTask->pState, remove); - taskDbSetClearFileFlag(pTask->pBackend); + if (remove)taskDbSetClearFileFlag(pTask->pBackend); taskDbRemoveRef(pTask->pBackend); pTask->pBackend = NULL; pTask->pState = NULL; From b8b6eb49207237d2681eea875f7734d288f4b94f Mon Sep 17 00:00:00 2001 From: Shungang Li Date: Wed, 7 Aug 2024 18:34:59 +0800 Subject: [PATCH 037/103] enh: adjust error msg 
format --- source/os/src/osTimezone.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/source/os/src/osTimezone.c b/source/os/src/osTimezone.c index c801347fc2..b3f7c9a368 100644 --- a/source/os/src/osTimezone.c +++ b/source/os/src/osTimezone.c @@ -871,14 +871,14 @@ void taosGetSystemTimezone(char *outTimezoneStr, enum TdTimezone *tsTimezone) { { int n = readlink("/etc/localtime", buf, sizeof(buf)); if (n < 0) { - printf("read /etc/localtime error, reason:%s", strerror(errno)); + printf("read /etc/localtime error, reason:%s\n", strerror(errno)); return; } buf[n] = '\0'; char *zi = strstr(buf, "zoneinfo"); if (!zi) { - printf("parsing /etc/localtime failed"); + printf("parsing /etc/localtime failed\n"); return; } tz = zi + strlen("zoneinfo") + 1; @@ -893,7 +893,7 @@ void taosGetSystemTimezone(char *outTimezoneStr, enum TdTimezone *tsTimezone) { // } // } // if (!tz || 0 == strchr(tz, '/')) { - // printf("parsing /etc/localtime failed"); + // printf("parsing /etc/localtime failed\n"); // return; // } @@ -927,7 +927,7 @@ void taosGetSystemTimezone(char *outTimezoneStr, enum TdTimezone *tsTimezone) { { int n = readlink("/etc/localtime", buf, sizeof(buf)-1); if (n < 0) { - (void)printf("read /etc/localtime error, reason:%s", strerror(errno)); + (void)printf("read /etc/localtime error, reason:%s\n", strerror(errno)); if (taosCheckExistFile("/etc/timezone")) { /* @@ -947,7 +947,7 @@ void taosGetSystemTimezone(char *outTimezoneStr, enum TdTimezone *tsTimezone) { int len = taosReadFile(pFile, buf, 64); if (len < 0) { (void)taosCloseFile(&pFile); - (void)printf("read /etc/timezone error, reason:%s", strerror(errno)); + (void)printf("read /etc/timezone error, reason:%s\n", strerror(errno)); return; } @@ -994,7 +994,7 @@ void taosGetSystemTimezone(char *outTimezoneStr, enum TdTimezone *tsTimezone) { char *zi = strstr(buf, "zoneinfo"); if (!zi) { - (void)printf("parsing /etc/localtime failed"); + (void)printf("parsing /etc/localtime failed\n"); return; } tz = zi + strlen("zoneinfo") + 1; From 5d4569ce75dc1f71ccf179462960e637481c165a Mon Sep 17 00:00:00 2001 From: wangjiaming0909 <604227650@qq.com> Date: Wed, 7 Aug 2024 16:45:05 +0800 Subject: [PATCH 038/103] replace sprintf with strcat --- include/util/tutil.h | 14 +- source/libs/function/src/functionMgt.c | 2 +- source/libs/parser/src/parAstCreater.c | 2 + source/libs/parser/src/parTranslater.c | 4 +- source/libs/planner/src/planOptimizer.c | 6 +- source/libs/planner/src/planPhysiCreater.c | 202 ++++++++++++++------- source/libs/planner/src/planSpliter.c | 4 +- source/libs/planner/src/planUtil.c | 2 +- 8 files changed, 161 insertions(+), 75 deletions(-) diff --git a/include/util/tutil.h b/include/util/tutil.h index f1f2914eed..1ee3bb0e83 100644 --- a/include/util/tutil.h +++ b/include/util/tutil.h @@ -80,6 +80,11 @@ static FORCE_INLINE void taosEncryptPass_c(uint8_t *inBuf, size_t len, char *tar (void)memcpy(target, buf, TSDB_PASSWORD_LEN); } +static FORCE_INLINE int32_t taosHashBinary(char* pBuf, int32_t len) { + uint64_t hashVal = MurmurHash3_64(pBuf, len); + return sprintf(pBuf, "%" PRIu64, hashVal); +} + static FORCE_INLINE int32_t taosCreateMD5Hash(char *pBuf, int32_t len) { T_MD5_CTX ctx; tMD5Init(&ctx); @@ -87,11 +92,10 @@ static FORCE_INLINE int32_t taosCreateMD5Hash(char *pBuf, int32_t len) { tMD5Final(&ctx); char *p = pBuf; int32_t resLen = 0; - for (uint8_t i = 0; i < tListLen(ctx.digest); ++i) { - resLen += snprintf(p, 3, "%02x", ctx.digest[i]); - p += 2; - } - return resLen; + return sprintf(pBuf, 
"%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x", ctx.digest[0], ctx.digest[1], + ctx.digest[2], ctx.digest[3], ctx.digest[4], ctx.digest[5], ctx.digest[6], ctx.digest[7], + ctx.digest[8], ctx.digest[9], ctx.digest[10], ctx.digest[11], ctx.digest[12], ctx.digest[13], + ctx.digest[14], ctx.digest[15]); } static FORCE_INLINE int32_t taosGetTbHashVal(const char *tbname, int32_t tblen, int32_t method, int32_t prefix, diff --git a/source/libs/function/src/functionMgt.c b/source/libs/function/src/functionMgt.c index 49d700e8c8..e50dbf8b14 100644 --- a/source/libs/function/src/functionMgt.c +++ b/source/libs/function/src/functionMgt.c @@ -434,7 +434,7 @@ static int32_t createPartialFunction(const SFunctionNode* pSrcFunc, SFunctionNod (*pPartialFunc)->originalFuncId = pSrcFunc->hasOriginalFunc ? pSrcFunc->originalFuncId : pSrcFunc->funcId; char name[TSDB_FUNC_NAME_LEN + TSDB_NAME_DELIMITER_LEN + TSDB_POINTER_PRINT_BYTES + 1] = {0}; int32_t len = snprintf(name, sizeof(name) - 1, "%s.%p", (*pPartialFunc)->functionName, pSrcFunc); - (void)taosCreateMD5Hash(name, len); + (void)taosHashBinary(name, len); (void)strncpy((*pPartialFunc)->node.aliasName, name, TSDB_COL_NAME_LEN - 1); (*pPartialFunc)->hasPk = pSrcFunc->hasPk; (*pPartialFunc)->pkBytes = pSrcFunc->pkBytes; diff --git a/source/libs/parser/src/parAstCreater.c b/source/libs/parser/src/parAstCreater.c index cd7cda01e0..fab58814d5 100644 --- a/source/libs/parser/src/parAstCreater.c +++ b/source/libs/parser/src/parAstCreater.c @@ -325,6 +325,8 @@ SNode* releaseRawExprNode(SAstCreateContext* pCxt, SNode* pNode) { sprintf(p, "%02x", ctx.digest[i]); p += 2; } + uint64_t a = MurmurHash3_64(pRawExpr->p, pRawExpr->n); + sprintf(pExpr->aliasName, "%"PRIu64, a); strncpy(pExpr->userAlias, pRawExpr->p, len); pExpr->userAlias[len] = '\0'; } diff --git a/source/libs/parser/src/parTranslater.c b/source/libs/parser/src/parTranslater.c index 7a2a73d013..3ca31fc573 100755 --- a/source/libs/parser/src/parTranslater.c +++ b/source/libs/parser/src/parTranslater.c @@ -4805,7 +4805,7 @@ static int32_t createMultiResFunc(SFunctionNode* pSrcFunc, SExprNode* pExpr, SNo strcpy(pFunc->node.aliasName, pCol->colName); } else { len = snprintf(buf, sizeof(buf) - 1, "%s(%s.%s)", pSrcFunc->functionName, pCol->tableAlias, pCol->colName); - (void)taosCreateMD5Hash(buf, len); + (void)taosHashBinary(buf, len); strncpy(pFunc->node.aliasName, buf, TSDB_COL_NAME_LEN - 1); len = snprintf(buf, sizeof(buf) - 1, "%s(%s)", pSrcFunc->functionName, pCol->colName); // note: userAlias could be truncated here @@ -4813,7 +4813,7 @@ static int32_t createMultiResFunc(SFunctionNode* pSrcFunc, SExprNode* pExpr, SNo } } else { len = snprintf(buf, sizeof(buf) - 1, "%s(%s)", pSrcFunc->functionName, pExpr->aliasName); - (void)taosCreateMD5Hash(buf, len); + (void)taosHashBinary(buf, len); strncpy(pFunc->node.aliasName, buf, TSDB_COL_NAME_LEN - 1); len = snprintf(buf, sizeof(buf) - 1, "%s(%s)", pSrcFunc->functionName, pExpr->userAlias); // note: userAlias could be truncated here diff --git a/source/libs/planner/src/planOptimizer.c b/source/libs/planner/src/planOptimizer.c index 27d3e80ef0..c7ce9aed76 100644 --- a/source/libs/planner/src/planOptimizer.c +++ b/source/libs/planner/src/planOptimizer.c @@ -3164,7 +3164,7 @@ static void partTagsSetAlias(char* pAlias, const char* pTableAlias, const char* char name[TSDB_COL_FNAME_LEN + 1] = {0}; int32_t len = snprintf(name, TSDB_COL_FNAME_LEN, "%s.%s", pTableAlias, pColName); - (void)taosCreateMD5Hash(name, len); + (void)taosHashBinary(name, len); 
strncpy(pAlias, name, TSDB_COL_NAME_LEN - 1); } @@ -3827,7 +3827,7 @@ static int32_t rewriteUniqueOptCreateFirstFunc(SFunctionNode* pSelectValue, SNod int64_t pointer = (int64_t)pFunc; char name[TSDB_FUNC_NAME_LEN + TSDB_POINTER_PRINT_BYTES + TSDB_NAME_DELIMITER_LEN + 1] = {0}; int32_t len = snprintf(name, sizeof(name) - 1, "%s.%" PRId64 "", pFunc->functionName, pointer); - (void)taosCreateMD5Hash(name, len); + (void)taosHashBinary(name, len); strncpy(pFunc->node.aliasName, name, TSDB_COL_NAME_LEN - 1); } SNode* pNew = NULL; @@ -7197,7 +7197,7 @@ static int32_t tsmaOptCreateWStart(int8_t precision, SFunctionNode** pWStartOut) int64_t pointer = (int64_t)pWStart; char name[TSDB_COL_NAME_LEN + TSDB_POINTER_PRINT_BYTES + TSDB_NAME_DELIMITER_LEN + 1] = {0}; int32_t len = snprintf(name, sizeof(name) - 1, "%s.%" PRId64 "", pWStart->functionName, pointer); - (void)taosCreateMD5Hash(name, len); + (void)taosHashBinary(name, len); strncpy(pWStart->node.aliasName, name, TSDB_COL_NAME_LEN - 1); pWStart->node.resType.precision = precision; diff --git a/source/libs/planner/src/planPhysiCreater.c b/source/libs/planner/src/planPhysiCreater.c index d75e02bc6b..e50e574f01 100644 --- a/source/libs/planner/src/planPhysiCreater.c +++ b/source/libs/planner/src/planPhysiCreater.c @@ -39,47 +39,101 @@ typedef struct SPhysiPlanContext { bool hasSysScan; } SPhysiPlanContext; -static int32_t getSlotKey(SNode* pNode, const char* pStmtName, char* pKey, int32_t keyBufSize) { - int32_t len = 0; +static int32_t getSlotKey(SNode* pNode, const char* pStmtName, char** ppKey, int32_t *pLen) { + int32_t code = 0; if (QUERY_NODE_COLUMN == nodeType(pNode)) { SColumnNode* pCol = (SColumnNode*)pNode; if (NULL != pStmtName) { if ('\0' != pStmtName[0]) { - len = snprintf(pKey, keyBufSize, "%s.%s", pStmtName, pCol->node.aliasName); - return taosCreateMD5Hash(pKey, len); + *ppKey = taosMemoryCalloc(1, TSDB_TABLE_NAME_LEN + 1 + TSDB_COL_NAME_LEN + 1); + if (!*ppKey) { + return terrno; + } + strcat(*ppKey, pStmtName); + strcat(*ppKey, "."); + strcat(*ppKey, pCol->node.aliasName); + *pLen = taosHashBinary(*ppKey, strlen(*ppKey)); + return code; } else { - return snprintf(pKey, keyBufSize, "%s", pCol->node.aliasName); + *ppKey = taosMemoryCalloc(1, TSDB_COL_NAME_LEN + 1); + if (!*ppKey) { + return terrno; + } + strcat(*ppKey, pCol->node.aliasName); + *pLen = strlen(*ppKey); + return code; } } if ('\0' == pCol->tableAlias[0]) { - return snprintf(pKey, keyBufSize, "%s", pCol->colName); + *ppKey = taosMemoryCalloc(1, TSDB_COL_NAME_LEN + 1); + if (!*ppKey) { + return terrno; + } + strcat(*ppKey, pCol->colName); + *pLen = strlen(*ppKey); + return code; } - len = snprintf(pKey, keyBufSize, "%s.%s", pCol->tableAlias, pCol->colName); - return taosCreateMD5Hash(pKey, len); + *ppKey = taosMemoryCalloc(1, TSDB_TABLE_NAME_LEN + 1 + TSDB_COL_NAME_LEN + 1); + if (!*ppKey) { + return terrno; + } + strcat(*ppKey, pCol->tableAlias); + strcat(*ppKey, "."); + strcat(*ppKey, pCol->colName); + *pLen = taosHashBinary(*ppKey, strlen(*ppKey)); + return code; } else if (QUERY_NODE_FUNCTION == nodeType(pNode)) { SFunctionNode* pFunc = (SFunctionNode*)pNode; if (FUNCTION_TYPE_TBNAME == pFunc->funcType) { SValueNode* pVal = (SValueNode*)nodesListGetNode(pFunc->pParameterList, 0); if (pVal) { if (NULL != pStmtName && '\0' != pStmtName[0]) { - len = snprintf(pKey, keyBufSize, "%s.%s", pStmtName, ((SExprNode*)pNode)->aliasName); - return taosCreateMD5Hash(pKey, len); + *ppKey = taosMemoryCalloc(1, TSDB_TABLE_NAME_LEN + 1 + TSDB_COL_NAME_LEN + 1); + if (!*ppKey) { + 
return terrno; + } + strcat(*ppKey, pStmtName); + strcat(*ppKey, "."); + strcat(*ppKey, ((SExprNode*)pNode)->aliasName); + *pLen = taosHashBinary(*ppKey, strlen(*ppKey)); + return code; } - len = snprintf(pKey, keyBufSize, "%s.%s", pVal->literal, ((SExprNode*)pNode)->aliasName); - return taosCreateMD5Hash(pKey, len); + *ppKey = taosMemoryCalloc(1, strlen(pVal->literal) + 1 + TSDB_COL_NAME_LEN + 1); + if (!*ppKey) { + return terrno; + } + strcat(*ppKey, pVal->literal); + strcat(*ppKey, "."); + strcat(*ppKey, ((SExprNode*)pNode)->aliasName); + *pLen = taosHashBinary(*ppKey, strlen(*ppKey)); + return code; } } } if (NULL != pStmtName && '\0' != pStmtName[0]) { - len = snprintf(pKey, keyBufSize, "%s.%s", pStmtName, ((SExprNode*)pNode)->aliasName); - return taosCreateMD5Hash(pKey, len); + *ppKey = taosMemoryCalloc(1, TSDB_TABLE_NAME_LEN + 1 + TSDB_COL_NAME_LEN + 1); + if (!*ppKey) { + return terrno; + } + strcat(*ppKey, pStmtName); + strcat(*ppKey, "."); + strcat(*ppKey, ((SExprNode*)pNode)->aliasName); + *pLen = taosHashBinary(*ppKey, strlen(*ppKey)); + return code; } - return snprintf(pKey, keyBufSize, "%s", ((SExprNode*)pNode)->aliasName); + *ppKey = taosMemoryCalloc(1, TSDB_COL_NAME_LEN + 1); + if (!*ppKey) { + return terrno; + } + strcat(*ppKey, ((SExprNode*)pNode)->aliasName); + *pLen = strlen(*ppKey); + return code; } + static SNode* createSlotDesc(SPhysiPlanContext* pCxt, const char* pName, const SNode* pNode, int16_t slotId, bool output, bool reserve) { SSlotDescNode* pSlot = NULL; @@ -132,8 +186,8 @@ static int32_t putSlotToHashImpl(int16_t dataBlockId, int16_t slotId, const char return taosHashPut(pHash, pName, len, &index, sizeof(SSlotIndex)); } -static int32_t putSlotToHash(const char* pName, int16_t dataBlockId, int16_t slotId, SNode* pNode, SHashObj* pHash) { - return putSlotToHashImpl(dataBlockId, slotId, pName, strlen(pName), pHash); +static int32_t putSlotToHash(const char* pName, int32_t len, int16_t dataBlockId, int16_t slotId, SNode* pNode, SHashObj* pHash) { + return putSlotToHashImpl(dataBlockId, slotId, pName, len, pHash); } static int32_t createDataBlockDescHash(SPhysiPlanContext* pCxt, int32_t capacity, int16_t dataBlockId, @@ -162,12 +216,16 @@ static int32_t buildDataBlockSlots(SPhysiPlanContext* pCxt, SNodeList* pList, SD int16_t slotId = 0; SNode* pNode = NULL; FOREACH(pNode, pList) { - char name[TSDB_COL_FNAME_LEN + 1] = {0}; - (void)getSlotKey(pNode, NULL, name, TSDB_COL_FNAME_LEN); - code = nodesListStrictAppend(pDataBlockDesc->pSlots, createSlotDesc(pCxt, name, pNode, slotId, true, false)); + char* name = NULL; + int32_t len = 0; + code = getSlotKey(pNode, NULL, &name, &len); if (TSDB_CODE_SUCCESS == code) { - code = putSlotToHash(name, pDataBlockDesc->dataBlockId, slotId, pNode, pHash); + code = nodesListStrictAppend(pDataBlockDesc->pSlots, createSlotDesc(pCxt, name, pNode, slotId, true, false)); } + if (TSDB_CODE_SUCCESS == code) { + code = putSlotToHash(name, len, pDataBlockDesc->dataBlockId, slotId, pNode, pHash); + } + taosMemoryFree(name); if (TSDB_CODE_SUCCESS == code) { pDataBlockDesc->totalRowSize += ((SExprNode*)pNode)->resType.bytes; pDataBlockDesc->outputRowSize += ((SExprNode*)pNode)->resType.bytes; @@ -226,25 +284,29 @@ static int32_t addDataBlockSlotsImpl(SPhysiPlanContext* pCxt, SNodeList* pList, SNode* pNode = NULL; FOREACH(pNode, pList) { SNode* pExpr = QUERY_NODE_ORDER_BY_EXPR == nodeType(pNode) ? 
((SOrderByExprNode*)pNode)->pExpr : pNode; - char name[TSDB_COL_FNAME_LEN + 1] = {0}; - int32_t len = getSlotKey(pExpr, pStmtName, name, TSDB_COL_FNAME_LEN); - SSlotIndex* pIndex = taosHashGet(pHash, name, len); - if (NULL == pIndex) { - code = + char *name = NULL; + int32_t len = 0; + code = getSlotKey(pExpr, pStmtName, &name, &len); + if (TSDB_CODE_SUCCESS == code) { + SSlotIndex* pIndex = taosHashGet(pHash, name, len); + if (NULL == pIndex) { + code = nodesListStrictAppend(pDataBlockDesc->pSlots, createSlotDesc(pCxt, name, pExpr, nextSlotId, output, reserve)); - if (TSDB_CODE_SUCCESS == code) { - code = putSlotToHashImpl(pDataBlockDesc->dataBlockId, nextSlotId, name, len, pHash); + if (TSDB_CODE_SUCCESS == code) { + code = putSlotToHashImpl(pDataBlockDesc->dataBlockId, nextSlotId, name, len, pHash); + } + pDataBlockDesc->totalRowSize += ((SExprNode*)pExpr)->resType.bytes; + if (output) { + pDataBlockDesc->outputRowSize += ((SExprNode*)pExpr)->resType.bytes; + } + slotId = nextSlotId; + ++nextSlotId; + } else { + slotId = getUnsetSlotId(pIndex->pSlotIdsInfo); } - pDataBlockDesc->totalRowSize += ((SExprNode*)pExpr)->resType.bytes; - if (output) { - pDataBlockDesc->outputRowSize += ((SExprNode*)pExpr)->resType.bytes; - } - slotId = nextSlotId; - ++nextSlotId; - } else { - slotId = getUnsetSlotId(pIndex->pSlotIdsInfo); } + taosMemoryFree(name); if (TSDB_CODE_SUCCESS == code) { SNode* pTarget = NULL; code = createTarget(pNode, pDataBlockDesc->dataBlockId, slotId, &pTarget); @@ -315,8 +377,12 @@ static void dumpSlots(const char* pName, SHashObj* pHash) { static EDealRes doSetSlotId(SNode* pNode, void* pContext) { if (QUERY_NODE_COLUMN == nodeType(pNode) && 0 != strcmp(((SColumnNode*)pNode)->colName, "*")) { SSetSlotIdCxt* pCxt = (SSetSlotIdCxt*)pContext; - char name[TSDB_COL_FNAME_LEN + 1] = {0}; - int32_t len = getSlotKey(pNode, NULL, name, TSDB_COL_FNAME_LEN); + char *name = NULL; + int32_t len = 0; + pCxt->errCode = getSlotKey(pNode, NULL, &name, &len); + if (TSDB_CODE_SUCCESS != pCxt->errCode) { + return DEAL_RES_ERROR; + } SSlotIndex* pIndex = taosHashGet(pCxt->pLeftHash, name, len); if (NULL == pIndex) { pIndex = taosHashGet(pCxt->pRightHash, name, len); @@ -327,8 +393,10 @@ static EDealRes doSetSlotId(SNode* pNode, void* pContext) { dumpSlots("left datablock desc", pCxt->pLeftHash); dumpSlots("right datablock desc", pCxt->pRightHash); pCxt->errCode = TSDB_CODE_PLAN_INTERNAL_ERROR; + taosMemoryFree(name); return DEAL_RES_ERROR; } + taosMemoryFree(name); ((SColumnNode*)pNode)->dataBlockId = pIndex->dataBlockId; ((SColumnNode*)pNode)->slotId = ((SSlotIdInfo*)taosArrayGet(pIndex->pSlotIdsInfo, 0))->slotId; return DEAL_RES_IGNORE_CHILD; @@ -1174,7 +1242,6 @@ static int32_t createHashJoinColList(int16_t lBlkId, int16_t rBlkId, SNode* pEq1 static int32_t sortHashJoinTargets(int16_t lBlkId, int16_t rBlkId, SHashJoinPhysiNode* pJoin) { SNode* pNode = NULL; - char name[TSDB_COL_FNAME_LEN + 1] = {0}; SSHashObj* pHash = tSimpleHashInit(pJoin->pTargets->length, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY)); if (NULL == pHash) { return TSDB_CODE_OUT_OF_MEMORY; @@ -1185,8 +1252,13 @@ static int32_t sortHashJoinTargets(int16_t lBlkId, int16_t rBlkId, SHashJoinPhys if (TSDB_CODE_SUCCESS == code) { FOREACH(pNode, pJoin->pTargets) { SColumnNode* pCol = (SColumnNode*)pNode; - int32_t len = getSlotKey(pNode, NULL, name, TSDB_COL_FNAME_LEN); - code = tSimpleHashPut(pHash, name, len, &pCol, POINTER_BYTES); + char *pName = NULL; + int32_t len = 0; + code = getSlotKey(pNode, NULL, &pName, &len); + if 
(TSDB_CODE_SUCCESS == code) { + code = tSimpleHashPut(pHash, pName, len, &pCol, POINTER_BYTES); + } + taosMemoryFree(pName); if (TSDB_CODE_SUCCESS != code) { break; } @@ -1197,36 +1269,44 @@ static int32_t sortHashJoinTargets(int16_t lBlkId, int16_t rBlkId, SHashJoinPhys pJoin->pTargets = pNew; FOREACH(pNode, pJoin->pOnLeft) { + char* pName = NULL; SColumnNode* pCol = (SColumnNode*)pNode; - int32_t len = getSlotKey(pNode, NULL, name, TSDB_COL_FNAME_LEN); - SNode** p = tSimpleHashGet(pHash, name, len); - if (p) { - code = nodesListStrictAppend(pJoin->pTargets, *p); - if (TSDB_CODE_SUCCESS != code) { - break; - } - code = tSimpleHashRemove(pHash, name, len); - if (TSDB_CODE_SUCCESS != code) { - break; + int32_t len = 0; + code = getSlotKey(pNode, NULL, &pName, &len); + if (TSDB_CODE_SUCCESS == code) { + SNode** p = tSimpleHashGet(pHash, pName, len); + if (p) { + code = nodesListStrictAppend(pJoin->pTargets, *p); + if (TSDB_CODE_SUCCESS == code) { + code = tSimpleHashRemove(pHash, pName, len); + } } } + taosMemoryFree(pName); + if (TSDB_CODE_SUCCESS != code) { + break; + } } } if (TSDB_CODE_SUCCESS == code) { FOREACH(pNode, pJoin->pOnRight) { + char* pName = NULL; SColumnNode* pCol = (SColumnNode*)pNode; - int32_t len = getSlotKey(pNode, NULL, name, TSDB_COL_FNAME_LEN); - SNode** p = tSimpleHashGet(pHash, name, len); - if (p) { - code = nodesListStrictAppend(pJoin->pTargets, *p); - if (TSDB_CODE_SUCCESS != code) { - break; - } - code = tSimpleHashRemove(pHash, name, len); - if (TSDB_CODE_SUCCESS != code) { - break; + int32_t len = 0; + code = getSlotKey(pNode, NULL, &pName, &len); + if (TSDB_CODE_SUCCESS == code) { + SNode** p = tSimpleHashGet(pHash, pName, len); + if (p) { + code = nodesListStrictAppend(pJoin->pTargets, *p); + if (TSDB_CODE_SUCCESS == code) { + code = tSimpleHashRemove(pHash, pName, len); + } } } + taosMemoryFree(pName); + if (TSDB_CODE_SUCCESS != code) { + break; + } } } if (TSDB_CODE_SUCCESS == code) { diff --git a/source/libs/planner/src/planSpliter.c b/source/libs/planner/src/planSpliter.c index efbcd79b69..706394507a 100644 --- a/source/libs/planner/src/planSpliter.c +++ b/source/libs/planner/src/planSpliter.c @@ -432,7 +432,7 @@ static int32_t stbSplAppendWStart(SNodeList* pFuncs, int32_t* pIndex, uint8_t pr int64_t pointer = (int64_t)pWStart; char name[TSDB_COL_NAME_LEN + TSDB_POINTER_PRINT_BYTES + TSDB_NAME_DELIMITER_LEN + 1] = {0}; int32_t len = snprintf(name, sizeof(name) - 1, "%s.%" PRId64 "", pWStart->functionName, pointer); - (void)taosCreateMD5Hash(name, len); + (void)taosHashBinary(name, len); strncpy(pWStart->node.aliasName, name, TSDB_COL_NAME_LEN - 1); pWStart->node.resType.precision = precision; @@ -464,7 +464,7 @@ static int32_t stbSplAppendWEnd(SWindowLogicNode* pWin, int32_t* pIndex) { int64_t pointer = (int64_t)pWEnd; char name[TSDB_COL_NAME_LEN + TSDB_POINTER_PRINT_BYTES + TSDB_NAME_DELIMITER_LEN + 1] = {0}; int32_t len = snprintf(name, sizeof(name) - 1, "%s.%" PRId64 "", pWEnd->functionName, pointer); - (void)taosCreateMD5Hash(name, len); + (void)taosHashBinary(name, len); strncpy(pWEnd->node.aliasName, name, TSDB_COL_NAME_LEN - 1); code = fmGetFuncInfo(pWEnd, NULL, 0); diff --git a/source/libs/planner/src/planUtil.c b/source/libs/planner/src/planUtil.c index 91dc36b99f..e1e98f221f 100644 --- a/source/libs/planner/src/planUtil.c +++ b/source/libs/planner/src/planUtil.c @@ -631,7 +631,7 @@ SFunctionNode* createGroupKeyAggFunc(SColumnNode* pGroupCol) { if (TSDB_CODE_SUCCESS == code) { char name[TSDB_FUNC_NAME_LEN + TSDB_NAME_DELIMITER_LEN + 
TSDB_POINTER_PRINT_BYTES + 1] = {0};
 int32_t len = snprintf(name, sizeof(name) - 1, "%s.%p", pFunc->functionName, pFunc);
- (void)taosCreateMD5Hash(name, len);
+ (void)taosHashBinary(name, len);
 strncpy(pFunc->node.aliasName, name, TSDB_COL_NAME_LEN - 1);
 }
 }

From 0a7e4a170662cd38ef0c8564660f2214d115360a Mon Sep 17 00:00:00 2001
From: yihaoDeng
Date: Thu, 8 Aug 2024 10:08:00 +0800
Subject: [PATCH 039/103] fix mem leak

---
 source/libs/transport/src/transCli.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c
index 4b324e52c6..dc0af79676 100644
--- a/source/libs/transport/src/transCli.c
+++ b/source/libs/transport/src/transCli.c
@@ -1620,15 +1620,17 @@ static void cliHandleFreeById(SCliMsg* pMsg, SCliThrd* pThrd) {
 if (size == 0) {
 // already recv, and notify upper layer
 TAOS_CHECK_GOTO(TSDB_CODE_REF_INVALID_ID, NULL, _exception);
- return;
 } else {
- while (T_REF_VAL_GET(conn) >= 1) transUnrefCliHandle(conn);
+ while (T_REF_VAL_GET(conn) >= 1) {
+ transUnrefCliHandle(conn);
+ }
+ return;
 }
- return;
 _exception:
 tDebug("already free conn %p by id %" PRId64"", conn, refId);
 (void)transReleaseExHandle(transGetRefMgt(), refId);
+ (void)transRemoveExHandle(transGetRefMgt(), refId);
 destroyCmsg(pMsg);
 }
@@ -2225,6 +2227,10 @@ static FORCE_INLINE void destroyCmsgAndAhandle(void* param) {
 pThrd->destroyAhandleFp(pMsg->ctx->ahandle);
 }
+ if (pMsg->msg.info.handle !=0) {
+ transReleaseExHandle(transGetRefMgt(), (int64_t)pMsg->msg.info.handle);
+ }
+
 transDestroyConnCtx(pMsg->ctx);
 transFreeMsg(pMsg->msg.pCont);
 taosMemoryFree(pMsg);

From 1cde8c61d6661e59d9e627fbd25cf265618b0cd1 Mon Sep 17 00:00:00 2001
From: yihaoDeng
Date: Thu, 8 Aug 2024 10:28:00 +0800
Subject: [PATCH 040/103] just add test case

---
 tests/army/storage/compressRatio.py | 98 +++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 tests/army/storage/compressRatio.py

diff --git a/tests/army/storage/compressRatio.py b/tests/army/storage/compressRatio.py
new file mode 100644
index 0000000000..a5875a02b0
--- /dev/null
+++ b/tests/army/storage/compressRatio.py
@@ -0,0 +1,98 @@
+from util.log import *
+from util.sql import *
+from util.cases import *
+from util.dnodes import *
+from util.common import *
+import json
+import random
+
+
+class TDTestCase:
+    def init(self, conn, logSql, replicaVar=1):
+        self.replicaVar = int(replicaVar)
+        tdLog.debug(f"start to excute {__file__}")
+        tdSql.init(conn.cursor(), True)
+
+
+    def checksql(self, sql):
+        result = os.popen(f"taos -s \"{sql}\" ")
+        res = result.read()
+        print(res)
+        if ("Query OK" in res):
+            tdLog.info(f"checkEqual success")
+        else :
+            tdLog.exit(f"checkEqual error")
+    def generate_random_str(self,randomlength=32):
+        """
+        Generate a random string of the specified length.
+        """
+        random_str = ''
+        base_str = 'ABCDEFGHIGKLMNOPQRSTUVWXYZabcdefghigklmnopqrstuvwxyz1234567890'
+        #base_str = 'ABCDEFGHIGKLMNOPQRSTUVWXYZabcdefghigklmnopqrstuvwxyz'
+        length = len(base_str) - 1
+        count = 0
+        for i in range(randomlength):
+            count = count + 1
+            random_str += base_str[random.randint(0, length)]
+        return random_str
+    def check(self):
+        # tdSql.execute("create database db" )
+        # tdSql.execute("create table db.jtable (ts timestamp, c1 VARCHAR(64000))",queryTimes=2)
+        # with open('./1-insert/temp.json', 'r') as f:
+        # data = json.load(f)
+        # json_str=json.dumps(data)
+        # print(data,type(data),type(json_str))
+        # json_str=json_str.replace('"','\\"')
+        # # sql = f"insert into db.jtable
values(now,\"{json_str}\") " + # # os.system(f"taos -s {sql} ") + # rowNum = 100 + # step = 1000 + # self.ts = 1537146000000 + # for j in range(1000): + # sql = "insert into db.jtable values" + # for k in range(rowNum): + # self.ts += step + # sql += f"({self.ts},\"{json_str}\") " + # tdSql.execute(sql,queryTimes=2) + # tdSql.execute("flush database db",queryTimes=2) + + tdSql.execute("create database db1" ) + tdSql.execute("create table db1.jtable (ts timestamp, c1 VARCHAR(6400) compress 'zstd')",queryTimes=2) + # with open('./1-insert/seedStr.json', 'r') as f: + # data = f.read() + # json_str=str(data) + # print(data,type(data),type(json_str)) + # json_str=json_str.replace('"','\\"') + + + rowNum = 100 + step = 1000 + self.ts = 1657146000000 + f=self.generate_random_str(5750) + json_str=f.replace('"','\\"') + for j in range(1000): + sql = "insert into db1.jtable values" + # f=self.generate_random_str(5750) + # json_str=f.replace('"','\\"') + for k in range(rowNum): + self.ts += step + f=self.generate_random_str(5750) + json_str=f.replace('"','\\"') + sql += f"({self.ts},\"{json_str}\") " + #print(sql) + tdSql.execute(sql,queryTimes=2) + tdSql.execute("flush database db1",queryTimes=2) + + + def run(self): + self.check() + + + def stop(self): + tdSql.close() + tdLog.success(f"{__file__} successfully executed") + + + +tdCases.addLinux(__file__, TDTestCase()) +tdCases.addWindows(__file__, TDTestCase()) From 7271e040d44851757c8b785926a44bd1aff306a5 Mon Sep 17 00:00:00 2001 From: Alex Duan <417921451@qq.com> Date: Thu, 8 Aug 2024 12:56:13 +0800 Subject: [PATCH 041/103] fix: restore delete_check.py case --- tests/parallel_test/cases.task | 2 +- tests/system-test/0-others/delete_check.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/parallel_test/cases.task b/tests/parallel_test/cases.task index e70042001d..2b0ddc8166 100644 --- a/tests/parallel_test/cases.task +++ b/tests/parallel_test/cases.task @@ -340,7 +340,7 @@ ,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/splitVGroup.py -N 3 -n 3 ,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/splitVGroupWal.py -N 3 -n 3 ,,n,system-test,python3 ./test.py -f 0-others/timeRangeWise.py -N 3 -#,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/delete_check.py +,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/delete_check.py ,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/test_hot_refresh_configurations.py ,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/subscribe_stream_privilege.py ,,y,system-test,./pytest.sh python3 ./test.py -f 0-others/empty_identifier.py diff --git a/tests/system-test/0-others/delete_check.py b/tests/system-test/0-others/delete_check.py index ec589295e4..468ca63919 100644 --- a/tests/system-test/0-others/delete_check.py +++ b/tests/system-test/0-others/delete_check.py @@ -47,9 +47,9 @@ class TDTestCase: def compactDatbase(self): # compact database tdSql.execute(f"compact database {self.dbname}", show=True) - waitSeconds = 20 - if self.waitTranslation(waitSeconds) == False: - tdLog.exit(f"translation can not finish after wait {waitSeconds} seconds") + waitSeconds = 60 + if self.waitCompacts(waitSeconds) == False: + tdLog.exit(f"compacts can not finish after wait {waitSeconds} seconds") return # check tsdb folder empty @@ -108,6 +108,19 @@ class TDTestCase: return False + def waitCompacts(self, waitSeconds): + # wait end + for i in range(waitSeconds): + sql ="show compacts;" + rows = tdSql.query(sql) + if rows == 0: + return True + 
tdLog.info(f"i={i} wait for translation finish ...") + time.sleep(1) + + return False + + # run def run(self): # seed From 43f191511de555244e33c092c20f9f62a16c3ad8 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Thu, 8 Aug 2024 13:41:35 +0800 Subject: [PATCH 042/103] fix mem leak --- source/libs/transport/src/transCli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index dc0af79676..02ff91e073 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -2228,7 +2228,7 @@ static FORCE_INLINE void destroyCmsgAndAhandle(void* param) { } if (pMsg->msg.info.handle !=0) { - transReleaseExHandle(transGetRefMgt(), (int64_t)pMsg->msg.info.handle); + transRemoveExHandle(transGetRefMgt(), (int64_t)pMsg->msg.info.handle); } transDestroyConnCtx(pMsg->ctx); From eafdaac3e5f0ecc6b09205796427e6cd842ad0d2 Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Thu, 8 Aug 2024 14:13:26 +0800 Subject: [PATCH 043/103] fix mem leak --- source/libs/transport/src/transCli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 02ff91e073..1be77e6c02 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -2228,7 +2228,7 @@ static FORCE_INLINE void destroyCmsgAndAhandle(void* param) { } if (pMsg->msg.info.handle !=0) { - transRemoveExHandle(transGetRefMgt(), (int64_t)pMsg->msg.info.handle); + (void)transRemoveExHandle(transGetRefMgt(), (int64_t)pMsg->msg.info.handle); } transDestroyConnCtx(pMsg->ctx); From 09f6411f6640437e08fbdb2037a8ba560aab73b7 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Thu, 8 Aug 2024 16:22:49 +0800 Subject: [PATCH 044/103] fix: format udf example codes --- tests/script/sh/bit_and.c | 2 +- tests/script/sh/l2norm.c | 30 +++----- tests/script/sh/max_vol.c | 149 ++++++++++++++++++-------------------- 3 files changed, 82 insertions(+), 99 deletions(-) diff --git a/tests/script/sh/bit_and.c b/tests/script/sh/bit_and.c index 2cf2157e1c..84485d396b 100644 --- a/tests/script/sh/bit_and.c +++ b/tests/script/sh/bit_and.c @@ -55,7 +55,7 @@ DLL_EXPORT int32_t bit_and(SUdfDataBlock* block, SUdfColumn* resultCol) { } resultData->numOfRows = block->numOfRows; - udfTrace("block:%p, processing completed, rows:%d, cols:%d,", block, block->numOfRows, block->numOfCols); + udfTrace("block:%p, processing completed", block); return TSDB_CODE_SUCCESS; } diff --git a/tests/script/sh/l2norm.c b/tests/script/sh/l2norm.c index 0b7f5bf7f6..865d9ee9a5 100644 --- a/tests/script/sh/l2norm.c +++ b/tests/script/sh/l2norm.c @@ -1,32 +1,26 @@ -#include -#include -#include #include - +#include +#include +#include #include "taosudf.h" -DLL_EXPORT int32_t l2norm_init() { - return 0; -} +DLL_EXPORT int32_t l2norm_init() { return 0; } -DLL_EXPORT int32_t l2norm_destroy() { - return 0; -} +DLL_EXPORT int32_t l2norm_destroy() { return 0; } -DLL_EXPORT int32_t l2norm_start(SUdfInterBuf *buf) { +DLL_EXPORT int32_t l2norm_start(SUdfInterBuf* buf) { *(int64_t*)(buf->buf) = 0; buf->bufLen = sizeof(double); buf->numOfResult = 1; return 0; } -DLL_EXPORT int32_t l2norm(SUdfDataBlock* block, SUdfInterBuf *interBuf, SUdfInterBuf *newInterBuf) { +DLL_EXPORT int32_t l2norm(SUdfDataBlock* block, SUdfInterBuf* interBuf, SUdfInterBuf* newInterBuf) { double sumSquares = *(double*)interBuf->buf; int8_t numNotNull = 0; for (int32_t i = 0; i < block->numOfCols; ++i) { SUdfColumn* col = 
block->udfCols[i]; - if (!(col->colMeta.type == TSDB_DATA_TYPE_INT || - col->colMeta.type == TSDB_DATA_TYPE_DOUBLE)) { + if (!(col->colMeta.type == TSDB_DATA_TYPE_INT || col->colMeta.type == TSDB_DATA_TYPE_DOUBLE)) { return TSDB_CODE_UDF_INVALID_INPUT; } } @@ -38,18 +32,18 @@ DLL_EXPORT int32_t l2norm(SUdfDataBlock* block, SUdfInterBuf *interBuf, SUdfInte } switch (col->colMeta.type) { case TSDB_DATA_TYPE_INT: { - char* cell = udfColDataGetData(col, j); + char* cell = udfColDataGetData(col, j); int32_t num = *(int32_t*)cell; sumSquares += (double)num * num; break; } case TSDB_DATA_TYPE_DOUBLE: { - char* cell = udfColDataGetData(col, j); + char* cell = udfColDataGetData(col, j); double num = *(double*)cell; sumSquares += num * num; break; } - default: + default: break; } ++numNotNull; @@ -62,7 +56,7 @@ DLL_EXPORT int32_t l2norm(SUdfDataBlock* block, SUdfInterBuf *interBuf, SUdfInte return 0; } -DLL_EXPORT int32_t l2norm_finish(SUdfInterBuf* buf, SUdfInterBuf *resultData) { +DLL_EXPORT int32_t l2norm_finish(SUdfInterBuf* buf, SUdfInterBuf* resultData) { double sumSquares = *(double*)(buf->buf); *(double*)(resultData->buf) = sqrt(sumSquares); resultData->bufLen = sizeof(double); diff --git a/tests/script/sh/max_vol.c b/tests/script/sh/max_vol.c index 4f9ecd33a7..0a57a26d1c 100644 --- a/tests/script/sh/max_vol.c +++ b/tests/script/sh/max_vol.c @@ -1,101 +1,90 @@ -#include -#include -#include #include - +#include +#include +#include #include "taosudf.h" -#define STR_MAX_LEN 256 // inter buffer length +#define STR_MAX_LEN 256 // inter buffer length // init -DLL_EXPORT int32_t max_vol_init() -{ - return 0; -} +DLL_EXPORT int32_t max_vol_init() { return 0; } // destory -DLL_EXPORT int32_t max_vol_destroy() -{ - return 0; -} +DLL_EXPORT int32_t max_vol_destroy() { return 0; } -// start -DLL_EXPORT int32_t max_vol_start(SUdfInterBuf *buf) -{ - memset(buf->buf, 0, sizeof(float) + STR_MAX_LEN); - // set init value - *((float*)buf->buf) = -10000000; - buf->bufLen = sizeof(float) + STR_MAX_LEN; - buf->numOfResult = 0; - return 0; +// start +DLL_EXPORT int32_t max_vol_start(SUdfInterBuf *buf) { + memset(buf->buf, 0, sizeof(float) + STR_MAX_LEN); + // set init value + *((float *)buf->buf) = -10000000; + buf->bufLen = sizeof(float) + STR_MAX_LEN; + buf->numOfResult = 0; + return 0; } DLL_EXPORT int32_t max_vol(SUdfDataBlock *block, SUdfInterBuf *interBuf, SUdfInterBuf *newInterBuf) { - float maxValue = *(float *)interBuf->buf; - char strBuff[STR_MAX_LEN] = "inter1buf"; - - if (block->numOfCols < 2) - { + float maxValue = *(float *)interBuf->buf; + char strBuff[STR_MAX_LEN] = "inter1buf"; + + if (block->numOfCols < 2) { + return TSDB_CODE_UDF_INVALID_INPUT; + } + + // check data type + for (int32_t i = 0; i < block->numOfCols; ++i) { + SUdfColumn *col = block->udfCols[i]; + if (i == block->numOfCols - 1) { + // last column is device id , must varchar + if (col->colMeta.type != TSDB_DATA_TYPE_VARCHAR) { return TSDB_CODE_UDF_INVALID_INPUT; + } + } else { + if (col->colMeta.type != TSDB_DATA_TYPE_FLOAT) { + return TSDB_CODE_UDF_INVALID_INPUT; + } } + } - // check data type - for (int32_t i = 0; i < block->numOfCols; ++i) - { - SUdfColumn *col = block->udfCols[i]; - if( i == block->numOfCols - 1) { - // last column is device id , must varchar - if (col->colMeta.type != TSDB_DATA_TYPE_VARCHAR ) { - return TSDB_CODE_UDF_INVALID_INPUT; - } - } else { - if (col->colMeta.type != TSDB_DATA_TYPE_FLOAT) { - return TSDB_CODE_UDF_INVALID_INPUT; - } - } + // calc max voltage + SUdfColumn *lastCol = 
block->udfCols[block->numOfCols - 1]; + for (int32_t i = 0; i < (block->numOfCols - 1); ++i) { + for (int32_t j = 0; j < block->numOfRows; ++j) { + SUdfColumn *col = block->udfCols[i]; + if (udfColDataIsNull(col, j)) { + continue; + } + char *data = udfColDataGetData(col, j); + float voltage = *(float *)data; + if (voltage > maxValue) { + maxValue = voltage; + char *valData = udfColDataGetData(lastCol, j); + // get device id + char *deviceId = valData + sizeof(uint16_t); + sprintf(strBuff, "%s_(%d,%d)_%f", deviceId, j, i, maxValue); + } } + } - // calc max voltage - SUdfColumn *lastCol = block->udfCols[block->numOfCols - 1]; - for (int32_t i = 0; i < (block->numOfCols - 1); ++i) { - for (int32_t j = 0; j < block->numOfRows; ++j) { - SUdfColumn *col = block->udfCols[i]; - if (udfColDataIsNull(col, j)) { - continue; - } - char *data = udfColDataGetData(col, j); - float voltage = *(float *)data; - if (voltage > maxValue) { - maxValue = voltage; - char *valData = udfColDataGetData(lastCol, j); - // get device id - char *deviceId = valData + sizeof(uint16_t); - sprintf(strBuff, "%s_(%d,%d)_%f", deviceId, j, i, maxValue); - } - } - } - - *(float*)newInterBuf->buf = maxValue; - strcpy(newInterBuf->buf + sizeof(float), strBuff); - newInterBuf->bufLen = sizeof(float) + strlen(strBuff)+1; - newInterBuf->numOfResult = 1; - return 0; + *(float *)newInterBuf->buf = maxValue; + strcpy(newInterBuf->buf + sizeof(float), strBuff); + newInterBuf->bufLen = sizeof(float) + strlen(strBuff) + 1; + newInterBuf->numOfResult = 1; + return 0; } -DLL_EXPORT int32_t max_vol_finish(SUdfInterBuf *buf, SUdfInterBuf *resultData) -{ - char * str = buf->buf + sizeof(float); - // copy to des - char * des = resultData->buf + sizeof(uint16_t); - strcpy(des, str); +DLL_EXPORT int32_t max_vol_finish(SUdfInterBuf *buf, SUdfInterBuf *resultData) { + char *str = buf->buf + sizeof(float); + // copy to des + char *des = resultData->buf + sizeof(uint16_t); + strcpy(des, str); - // set binary type len - uint16_t len = strlen(str); - *((uint16_t*)resultData->buf) = len; + // set binary type len + uint16_t len = strlen(str); + *((uint16_t *)resultData->buf) = len; - // set buf len - resultData->bufLen = len + sizeof(uint16_t); - // set row count - resultData->numOfResult = 1; - return 0; + // set buf len + resultData->bufLen = len + sizeof(uint16_t); + // set row count + resultData->numOfResult = 1; + return 0; } From fe75c72666055d63432d2e96c19a9b972732b45c Mon Sep 17 00:00:00 2001 From: 54liuyao <54liuyao> Date: Thu, 8 Aug 2024 16:24:18 +0800 Subject: [PATCH 045/103] fix issue --- source/dnode/vnode/src/tsdb/tsdbRead2.c | 6 ++-- source/dnode/vnode/src/tsdb/tsdbReadUtil.c | 6 +++- source/libs/executor/src/cachescanoperator.c | 6 ++-- source/libs/executor/src/projectoperator.c | 4 +-- source/libs/executor/src/scanoperator.c | 38 ++++++++++++-------- 5 files changed, 38 insertions(+), 22 deletions(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbRead2.c b/source/dnode/vnode/src/tsdb/tsdbRead2.c index 09ca1cdc84..84ca2c36ea 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRead2.c +++ b/source/dnode/vnode/src/tsdb/tsdbRead2.c @@ -3519,8 +3519,10 @@ static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBl resetTableListIndex(&pReader->status); } - // set the correct start position according to the query time window - initBlockDumpInfo(pReader, pBlockIter); + if (code == TSDB_CODE_SUCCESS) { + // set the correct start position according to the query time window + initBlockDumpInfo(pReader, pBlockIter); + } 
taosArrayDestroy(pTableList); return code; } diff --git a/source/dnode/vnode/src/tsdb/tsdbReadUtil.c b/source/dnode/vnode/src/tsdb/tsdbReadUtil.c index 1d0cfecdd0..4dabffc10a 100644 --- a/source/dnode/vnode/src/tsdb/tsdbReadUtil.c +++ b/source/dnode/vnode/src/tsdb/tsdbReadUtil.c @@ -1074,8 +1074,12 @@ int32_t doAdjustValidDataIters(SArray* pLDIterList, int32_t numOfFileObj) { int32_t inc = numOfFileObj - size; for (int32_t k = 0; k < inc; ++k) { SLDataIter* pIter = taosMemoryCalloc(1, sizeof(SLDataIter)); - void* px = taosArrayPush(pLDIterList, &pIter); + if (!pIter) { + return terrno; + } + void* px = taosArrayPush(pLDIterList, &pIter); if (px == NULL) { + taosMemoryFree(pIter); return TSDB_CODE_OUT_OF_MEMORY; } } diff --git a/source/libs/executor/src/cachescanoperator.c b/source/libs/executor/src/cachescanoperator.c index 5c8ca49813..2751cf2851 100644 --- a/source/libs/executor/src/cachescanoperator.c +++ b/source/libs/executor/src/cachescanoperator.c @@ -245,8 +245,10 @@ _error: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } - pInfo->pTableList = NULL; - destroyCacheScanOperator(pInfo); + if (pInfo != NULL) { + pInfo->pTableList = NULL; + destroyCacheScanOperator(pInfo); + } if (pOperator != NULL) { pOperator->info = NULL; destroyOperator(pOperator); diff --git a/source/libs/executor/src/projectoperator.c b/source/libs/executor/src/projectoperator.c index 3f1eb43578..66a7408b13 100644 --- a/source/libs/executor/src/projectoperator.c +++ b/source/libs/executor/src/projectoperator.c @@ -179,7 +179,7 @@ int32_t createProjectOperatorInfo(SOperatorInfo* downstream, SProjectPhysiNode* return code; _error: - destroyProjectOperatorInfo(pInfo); + if (pInfo != NULL) destroyProjectOperatorInfo(pInfo); if (pOperator != NULL) { pOperator->info = NULL; destroyOperator(pOperator); @@ -531,7 +531,7 @@ int32_t createIndefinitOutputOperatorInfo(SOperatorInfo* downstream, SPhysiNode* return code; _error: - destroyIndefinitOperatorInfo(pInfo); + if (pInfo != NULL) destroyIndefinitOperatorInfo(pInfo); if (pOperator != NULL) { pOperator->info = NULL; destroyOperator(pOperator); diff --git a/source/libs/executor/src/scanoperator.c b/source/libs/executor/src/scanoperator.c index 647909cc13..1a508d9082 100644 --- a/source/libs/executor/src/scanoperator.c +++ b/source/libs/executor/src/scanoperator.c @@ -1568,6 +1568,8 @@ void resetTableScanInfo(STableScanInfo* pTableScanInfo, STimeWindow* pWin, uint6 static SSDataBlock* readPreVersionData(SOperatorInfo* pTableScanOp, uint64_t tbUid, TSKEY startTs, TSKEY endTs, int64_t maxVersion) { + int32_t code = TSDB_CODE_SUCCESS; + int32_t lino = 0; STableKeyInfo tblInfo = {.uid = tbUid, .groupId = 0}; STableScanInfo* pTableScanInfo = pTableScanOp->info; @@ -1584,35 +1586,31 @@ static SSDataBlock* readPreVersionData(SOperatorInfo* pTableScanOp, uint64_t tbU STsdbReader* pReader = NULL; int32_t code = pAPI->tsdReader.tsdReaderOpen(pTableScanInfo->base.readHandle.vnode, &cond, &tblInfo, 1, pBlock, (void**)&pReader, GET_TASKID(pTaskInfo), NULL); - if (code != TSDB_CODE_SUCCESS) { - terrno = code; - T_LONG_JMP(pTaskInfo->env, code); - return NULL; - } + QUERY_CHECK_CODE(code, lino, _end); bool hasNext = false; code = pAPI->tsdReader.tsdNextDataBlock(pReader, &hasNext); - if (code != TSDB_CODE_SUCCESS) { - terrno = code; - T_LONG_JMP(pTaskInfo->env, code); - return NULL; - } + QUERY_CHECK_CODE(code, lino, _end); if (hasNext) { SSDataBlock* p = NULL; code = pAPI->tsdReader.tsdReaderRetrieveDataBlock(pReader, &p, NULL); 
- if (code != TSDB_CODE_SUCCESS) { - return NULL; - } + QUERY_CHECK_CODE(code, lino, _end); doSetTagColumnData(&pTableScanInfo->base, pBlock, pTaskInfo, pBlock->info.rows); pBlock->info.id.groupId = tableListGetTableGroupId(pTableScanInfo->base.pTableListInfo, pBlock->info.id.uid); } +_end: pAPI->tsdReader.tsdReaderClose(pReader); qDebug("retrieve prev rows:%" PRId64 ", skey:%" PRId64 ", ekey:%" PRId64 " uid:%" PRIu64 ", max ver:%" PRId64 ", suid:%" PRIu64, pBlock->info.rows, startTs, endTs, tbUid, maxVersion, cond.suid); + if (code != TSDB_CODE_SUCCESS) { + qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); + terrno = code; + return NULL; + } return pBlock->info.rows > 0 ? pBlock : NULL; } @@ -2259,6 +2257,10 @@ static int32_t generatePartitionDelResBlock(SStreamScanInfo* pInfo, SSDataBlock* uint64_t srcUid = srcUidData[delI]; char tbname[VARSTR_HEADER_SIZE + TSDB_TABLE_NAME_LEN] = {0}; SSDataBlock* pPreRes = readPreVersionData(pInfo->pTableScanOp, srcUid, srcStartTsCol[delI], srcEndTsCol[delI], ver); + if (!pPreRes) { + qError("%s failed at line %d since %s", __func__, __LINE__, tstrerror(terrno)); + continue; + } code = blockDataEnsureCapacity(pDestBlock, pDestBlock->info.rows + pPreRes->info.rows); QUERY_CHECK_CODE(code, lino, _end); for (int32_t preJ = 0; preJ < pPreRes->info.rows; preJ++) { @@ -2331,6 +2333,10 @@ static int32_t generateDeleteResultBlockImpl(SStreamScanInfo* pInfo, SSDataBlock if (winCode != TSDB_CODE_SUCCESS) { SSDataBlock* pPreRes = readPreVersionData(pInfo->pTableScanOp, srcUid, srcStartTsCol[i], srcStartTsCol[i], ver); + if (!pPreRes) { + qError("%s failed at line %d since %s", __func__, __LINE__, tstrerror(terrno)); + continue; + } printDataBlock(pPreRes, "pre res", GET_TASKID(pInfo->pStreamScanOp->pTaskInfo)); code = calBlockTbName(pInfo, pPreRes, 0); QUERY_CHECK_CODE(code, lino, _end); @@ -5952,8 +5958,10 @@ _error: qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } pTaskInfo->code = code; - pInfo->base.pTableListInfo = NULL; - if (pInfo != NULL) destroyTableMergeScanOperatorInfo(pInfo); + if (pInfo != NULL) { + pInfo->base.pTableListInfo = NULL; + destroyTableMergeScanOperatorInfo(pInfo); + } if (pOperator != NULL) { pOperator->info = NULL; destroyOperator(pOperator); From c02ac1fe4c1a320519042193f2b865bbcfb0db0c Mon Sep 17 00:00:00 2001 From: xiao-77 Date: Thu, 8 Aug 2024 16:48:17 +0800 Subject: [PATCH 046/103] test: Adding test case for TD-31203 --- tests/parallel_test/cases.task | 1 + tests/system-test/2-query/agg_null.py | 162 ++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 tests/system-test/2-query/agg_null.py diff --git a/tests/parallel_test/cases.task b/tests/parallel_test/cases.task index e70042001d..d45627fcba 100644 --- a/tests/parallel_test/cases.task +++ b/tests/parallel_test/cases.task @@ -162,6 +162,7 @@ ,,y,system-test,./pytest.sh python3 ./test.py -f 1-insert/stt_blocks_check.py ,,y,system-test,./pytest.sh python3 ./test.py -f 2-query/out_of_order.py -Q 3 ,,y,system-test,./pytest.sh python3 ./test.py -f 2-query/out_of_order.py +,,y,system-test,./pytest.sh python3 ./test.py -f 2-query/agg_null.py ,,y,system-test,./pytest.sh python3 ./test.py -f 2-query/insert_null_none.py ,,y,system-test,./pytest.sh python3 ./test.py -f 2-query/insert_null_none.py -R ,,y,system-test,./pytest.sh python3 ./test.py -f 2-query/insert_null_none.py -Q 2 diff --git a/tests/system-test/2-query/agg_null.py b/tests/system-test/2-query/agg_null.py new file mode 100644 index 
0000000000..bb4fbf41a2
--- /dev/null
+++ b/tests/system-test/2-query/agg_null.py
@@ -0,0 +1,162 @@
+###################################################################
+# Copyright (c) 2016 by TAOS Technologies, Inc.
+# All rights reserved.
+#
+# This file is proprietary and confidential to TAOS Technologies.
+# No part of this file may be reproduced, stored, transmitted,
+# disclosed or used in any form or by any means other than as
+# expressly provided by the written permission from Jianhui Tao
+#
+###################################################################
+
+# -*- coding: utf-8 -*-
+
+import numpy as np
+from util.log import *
+from util.cases import *
+from util.sql import *
+from util.common import *
+from util.sqlset import *
+from scipy.stats import gaussian_kde
+from hyperloglog import HyperLogLog
+'''
+Test case for TS-5150
+'''
+def approximate_percentile(data, percentile):
+    """
+    Approximate a percentile using KDE.
+
+    Parameters:
+    - data: a list or array containing the data
+    - percentile: the percentile to compute (between 0 and 100)
+
+    Returns:
+    - the approximate value of the percentile
+    """
+    # Estimate the probability density with a Gaussian kernel
+    kde = gaussian_kde(data)
+
+    # Generate a sufficiently dense set of points and compute the cumulative distribution function
+    min_val = min(data)
+    max_val = max(data)
+    x = np.linspace(min_val, max_val, 1000)
+    cdf = np.cumsum(kde(x) / kde(x).sum())
+
+    # Find the value closest to the requested percentile
+    idx = np.abs(cdf - percentile / 100.0).argmin()
+    approximate_value = x[idx]
+
+    return approximate_value
+class TDTestCase:
+    def init(self, conn, logSql, replicaVar=1):
+        self.replicaVar = int(replicaVar)
+        tdLog.debug("start to execute %s" % __file__)
+        tdSql.init(conn.cursor())
+        self.ts = 1537146000000
+    def initdabase(self):
+        tdSql.execute('create database if not exists db_test vgroups 2 buffer 10')
+        tdSql.execute('use db_test')
+        tdSql.execute('create stable stb(ts timestamp, delay int) tags(groupid int)')
+        tdSql.execute('create table t1 using stb tags(1)')
+        tdSql.execute('create table t2 using stb tags(2)')
+        tdSql.execute('create table t3 using stb tags(3)')
+        tdSql.execute('create table t4 using stb tags(4)')
+        tdSql.execute('create table t5 using stb tags(5)')
+        tdSql.execute('create table t6 using stb tags(6)')
+    def insert_data(self):
+        for i in range(5000):
+            tdSql.execute(f"insert into t1 values({self.ts + i * 1000}, {i%5})")
+            tdSql.execute(f"insert into t2 values({self.ts + i * 1000}, {i%5})")
+            tdSql.execute(f"insert into t3 values({self.ts + i * 1000}, {i%5})")
+
+    def verify_agg_null(self):
+        for i in range(20):
+            col_val_list = []
+            tdSql.query(f'select CASE WHEN delay != 0 THEN delay ELSE NULL END from stb where ts between {1537146000000 + i * 1000} and {1537146000000 + (i+10) * 1000}')
+            for col_va in tdSql.queryResult:
+                if col_va[0] is not None:
+                    col_val_list.append(col_va[0])
+            tdSql.query(f'SELECT APERCENTILE(CASE WHEN delay != 0 THEN delay ELSE NULL END,50) AS apercentile,\
+                MAX(CASE WHEN delay != 0 THEN delay ELSE NULL END) AS maxDelay,\
+                MIN(CASE WHEN delay != 0 THEN delay ELSE NULL END) AS minDelay,\
+                AVG(CASE WHEN delay != 0 THEN delay ELSE NULL END) AS avgDelay,\
+                STDDEV(CASE WHEN delay != 0 THEN delay ELSE NULL END) AS jitter,\
+                COUNT(CASE WHEN delay = 0 THEN 1 ELSE NULL END) AS timeoutCount,\
+                COUNT(*) AS totalCount ,\
+                ELAPSED(ts) AS elapsed_time,\
+                SPREAD(CASE WHEN delay != 0 THEN delay ELSE NULL END) AS spread,\
+                SUM(CASE WHEN delay != 0 THEN delay ELSE NULL END) AS sum,\
+                HYPERLOGLOG(CASE WHEN delay != 0 THEN delay ELSE NULL END) AS hyperloglog from stb where ts between {1537146000000 + i * 1000} and {1537146000000 + (i+10) * 1000}')
+            #verify apercentile
+            apercentile_res = tdSql.queryResult[0][0]
+
approximate_median = approximate_percentile(col_val_list, 50) + assert np.abs(apercentile_res - approximate_median) < 1 + #verify max + max_res = tdSql.queryResult[0][1] + tdSql.checkEqual(max_res,max(col_val_list)) + #verify min + min_res = tdSql.queryResult[0][2] + tdSql.checkEqual(min_res,min(col_val_list)) + #verify avg + avg_res = tdSql.queryResult[0][3] + tdSql.checkEqual(avg_res,np.average(col_val_list)) + #verify stddev + stddev_res = tdSql.queryResult[0][4] + assert np.abs(stddev_res - np.std(col_val_list)) < 0.0001 + #verify count of 0 + count of !0 == count(*) + count_res = tdSql.queryResult[0][6] + tdSql.checkEqual(count_res,len(col_val_list)+tdSql.queryResult[0][5]) + #verify elapsed + elapsed_res = tdSql.queryResult[0][7] + assert elapsed_res == 10000 + #verify spread + spread_res = tdSql.queryResult[0][8] + tdSql.checkEqual(spread_res,max(col_val_list) - min(col_val_list)) + #verify sum + sum_res = tdSql.queryResult[0][9] + tdSql.checkEqual(sum_res,sum(col_val_list)) + #verify hyperloglog + error_rate = 0.01 + hll = HyperLogLog(error_rate) + for col_val in col_val_list: + hll.add(col_val) + hll_res = tdSql.queryResult[0][10] + assert np.abs(hll_res - hll.card()) < 0.01 + #verify leastsquares + tdSql.query(f'SELECT leastsquares(CASE WHEN delay != 0 THEN delay ELSE NULL END,1,1) from stb where ts between {1537146000000 + i * 1000} and {1537146000000 + (i+10) * 1000}') + cleaned_data = tdSql.queryResult[0][0].strip('{}').replace(' ', '') + pairs = cleaned_data.split(',') + slope = None + intercept = None + for pair in pairs: + key, value = pair.split(':') + key = key.strip() + value = float(value.strip()) + if key == 'slop': + slope = value + elif key == 'intercept': + intercept = value + assert slope != 0 + assert intercept != 0 + #verify histogram + tdSql.query(f'SELECT histogram(CASE WHEN delay != 0 THEN delay ELSE NULL END, "user_input", "[1,3,5,7]", 1) from stb where ts between {1537146000000 + i * 1000} and {1537146000000 + (i+10) * 1000}') + cleaned_data = tdSql.queryResult[0][0].strip('{}').replace(' ', '') + pairs = cleaned_data.split(',') + count = None + for pair in pairs: + key, value = pair.split(':') + key = key.strip() + if key == 'count': + count = float(value.strip()) + assert count != 0 + def run(self): + self.initdabase() + self.insert_data() + self.verify_agg_null() + def stop(self): + tdSql.close() + tdLog.success(f"{__file__} successfully executed") + + +tdCases.addLinux(__file__, TDTestCase()) +tdCases.addWindows(__file__, TDTestCase()) From 340886b9be70a53ea1046ed3f2994432514cb813 Mon Sep 17 00:00:00 2001 From: 54liuyao <54liuyao> Date: Thu, 8 Aug 2024 17:23:25 +0800 Subject: [PATCH 047/103] fix issue --- source/libs/executor/src/scanoperator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/executor/src/scanoperator.c b/source/libs/executor/src/scanoperator.c index 1a508d9082..e15dcf806a 100644 --- a/source/libs/executor/src/scanoperator.c +++ b/source/libs/executor/src/scanoperator.c @@ -1584,7 +1584,7 @@ static SSDataBlock* readPreVersionData(SOperatorInfo* pTableScanOp, uint64_t tbU SSDataBlock* pBlock = pTableScanInfo->pResBlock; STsdbReader* pReader = NULL; - int32_t code = pAPI->tsdReader.tsdReaderOpen(pTableScanInfo->base.readHandle.vnode, &cond, &tblInfo, 1, pBlock, + code = pAPI->tsdReader.tsdReaderOpen(pTableScanInfo->base.readHandle.vnode, &cond, &tblInfo, 1, pBlock, (void**)&pReader, GET_TASKID(pTaskInfo), NULL); QUERY_CHECK_CODE(code, lino, _end); From 132500a102dfaaa3f9253b815b817119b99c57f0 Mon Sep 17 
00:00:00 2001 From: yihaoDeng Date: Thu, 8 Aug 2024 17:31:06 +0800 Subject: [PATCH 048/103] fix mem leak --- source/libs/transport/src/transCli.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/libs/transport/src/transCli.c b/source/libs/transport/src/transCli.c index 1be77e6c02..874fbd7733 100644 --- a/source/libs/transport/src/transCli.c +++ b/source/libs/transport/src/transCli.c @@ -1629,6 +1629,7 @@ static void cliHandleFreeById(SCliMsg* pMsg, SCliThrd* pThrd) { _exception: tDebug("already free conn %p by id %" PRId64"", conn, refId); + (void)transReleaseExHandle(transGetRefMgt(), refId); (void)transReleaseExHandle(transGetRefMgt(), refId); (void)transRemoveExHandle(transGetRefMgt(), refId); destroyCmsg(pMsg); @@ -2228,6 +2229,7 @@ static FORCE_INLINE void destroyCmsgAndAhandle(void* param) { } if (pMsg->msg.info.handle !=0) { + (void)transReleaseExHandle(transGetRefMgt(), (int64_t)pMsg->msg.info.handle); (void)transRemoveExHandle(transGetRefMgt(), (int64_t)pMsg->msg.info.handle); } From efd1ece152b250cbe274ce0f43f3ff7e4bcf72fa Mon Sep 17 00:00:00 2001 From: wangjiaming0909 <604227650@qq.com> Date: Thu, 8 Aug 2024 18:06:37 +0800 Subject: [PATCH 049/103] 1. fix exchange operator dead lock due to no ret check for add ref 2. fix null pointer of FetchRpcHandle of exchangeoperator --- source/libs/executor/src/exchangeoperator.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/source/libs/executor/src/exchangeoperator.c b/source/libs/executor/src/exchangeoperator.c index e4ebb74252..78c0d939ad 100644 --- a/source/libs/executor/src/exchangeoperator.c +++ b/source/libs/executor/src/exchangeoperator.c @@ -391,7 +391,11 @@ static int32_t initExchangeOperator(SExchangePhysiNode* pExNode, SExchangeInfo* initLimitInfo(pExNode->node.pLimit, pExNode->node.pSlimit, &pInfo->limitInfo); pInfo->self = taosAddRef(exchangeObjRefPool, pInfo); - + if (pInfo->self < 0) { + int32_t code = terrno; + qError("%s failed at line %d since %s", __func__, __LINE__, tstrerror(code)); + return code; + } return initDataSource(numOfSources, pInfo, id); } @@ -480,14 +484,16 @@ void freeSourceDataInfo(void* p) { void doDestroyExchangeOperatorInfo(void* param) { SExchangeInfo* pExInfo = (SExchangeInfo*)param; - for (int32_t i = 0; i < pExInfo->pFetchRpcHandles->size; ++i) { - int64_t* pRpcHandle = taosArrayGet(pExInfo->pFetchRpcHandles, i); - if (*pRpcHandle > 0) { - SDownstreamSourceNode* pSource = taosArrayGet(pExInfo->pSources, i); - (void)asyncFreeConnById(pExInfo->pTransporter, *pRpcHandle); + if (pExInfo->pFetchRpcHandles) { + for (int32_t i = 0; i < pExInfo->pFetchRpcHandles->size; ++i) { + int64_t* pRpcHandle = taosArrayGet(pExInfo->pFetchRpcHandles, i); + if (*pRpcHandle > 0) { + SDownstreamSourceNode* pSource = taosArrayGet(pExInfo->pSources, i); + (void)asyncFreeConnById(pExInfo->pTransporter, *pRpcHandle); + } } + taosArrayDestroy(pExInfo->pFetchRpcHandles); } - taosArrayDestroy(pExInfo->pFetchRpcHandles); taosArrayDestroy(pExInfo->pSources); taosArrayDestroyEx(pExInfo->pSourceDataInfo, freeSourceDataInfo); From d1ed3b3abbdeb25ceb7a44dacb296f0339f0c971 Mon Sep 17 00:00:00 2001 From: wangjiaming0909 <604227650@qq.com> Date: Thu, 8 Aug 2024 18:10:11 +0800 Subject: [PATCH 050/103] 1. fix interval query with month interval day slidng returning wrong window 2. 
fix error msg for creating tsma --- source/common/src/ttime.c | 6 ++++++ source/libs/parser/src/parTranslater.c | 2 +- source/util/src/terror.c | 2 +- tests/system-test/2-query/tsma.py | 8 ++++---- tests/system-test/2-query/tsma2.py | 14 ++++++++++++-- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/source/common/src/ttime.c b/source/common/src/ttime.c index efabe5cf07..2a8e7951b1 100644 --- a/source/common/src/ttime.c +++ b/source/common/src/ttime.c @@ -815,6 +815,12 @@ int64_t taosTimeTruncate(int64_t ts, const SInterval* pInterval) { if (IS_CALENDAR_TIME_DURATION(pInterval->intervalUnit)) { int64_t news = (ts / pInterval->sliding) * pInterval->sliding; ASSERT(news <= ts); + if (pInterval->slidingUnit == 'd' || pInterval->slidingUnit == 'w') { +#if defined(WINDOWS) && _MSC_VER >= 1900 + int64_t timezone = _timezone; +#endif + news += (int64_t)(timezone * TSDB_TICK_PER_SECOND(precision)); + } if (news <= ts) { int64_t prev = news; diff --git a/source/libs/parser/src/parTranslater.c b/source/libs/parser/src/parTranslater.c index 7a2a73d013..2cef23e1cb 100755 --- a/source/libs/parser/src/parTranslater.c +++ b/source/libs/parser/src/parTranslater.c @@ -11864,7 +11864,7 @@ static int32_t buildCreateTSMAReq(STranslateContext* pCxt, SCreateTSMAStmt* pStm if (checkRecursiveTsmaInterval(pRecursiveTsma->interval, pRecursiveTsma->unit, pInterval->datum.i, pInterval->unit, pDbInfo.precision, true)) { } else { - code = TSDB_CODE_TSMA_INVALID_PARA; + code = TSDB_CODE_TSMA_INVALID_INTERVAL; } } } diff --git a/source/util/src/terror.c b/source/util/src/terror.c index e9bdafcd5a..4166b418bf 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -753,7 +753,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_INVALID_STAT, "Invalid tsma state" TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_INVALID_PTR, "Invalid tsma pointer") TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_INVALID_PARA, "Invalid tsma parameters") TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_INVALID_TB, "Invalid table to create tsma, only stable or normal table allowed") -TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_INVALID_INTERVAL, "Invalid tsma interval, 1m ~ 1h is allowed") +TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_INVALID_INTERVAL, "Invalid tsma interval, 1m ~ 1y is allowed") TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_INVALID_FUNC_PARAM, "Invalid tsma func param, only one non-tag column allowed") TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_UNSUPPORTED_FUNC, "Tsma func not supported") TAOS_DEFINE_ERROR(TSDB_CODE_TSMA_MUST_BE_DROPPED, "Tsma must be dropped first") diff --git a/tests/system-test/2-query/tsma.py b/tests/system-test/2-query/tsma.py index fccf6291b5..2bf908e250 100644 --- a/tests/system-test/2-query/tsma.py +++ b/tests/system-test/2-query/tsma.py @@ -1476,18 +1476,18 @@ class TDTestCase: tdSql.error(sql, -2147473920) # syntax error sql = 'create recursive tsma tsma2 on test.tsma1 interval(1m)' - tdSql.error(sql, -2147471099) # invalid tsma parameter + tdSql.error(sql, -2147471097) # invalid tsma interval sql = 'create recursive tsma tsma2 on test.tsma1 interval(7m)' - tdSql.error(sql, -2147471099) # invalid tsma parameter + tdSql.error(sql, -2147471097) # invalid tsma interval sql = 'create recursive tsma tsma2 on test.tsma1 interval(11m)' - tdSql.error(sql, -2147471099) # invalid tsma parameter + tdSql.error(sql, -2147471097) # invalid tsma interval self.create_recursive_tsma('tsma1', 'tsma2', 'test', '20m', 'meters') sql = 'create recursive tsma tsma3 on test.tsma2 interval(30m)' - tdSql.error(sql, -2147471099) # invalid tsma parameter + tdSql.error(sql, -2147471097) # invalid tsma 
interval self.create_recursive_tsma('tsma2', 'tsma3', 'test', '40m', 'meters') diff --git a/tests/system-test/2-query/tsma2.py b/tests/system-test/2-query/tsma2.py index 5af75b6fb9..6ea6e6c36f 100644 --- a/tests/system-test/2-query/tsma2.py +++ b/tests/system-test/2-query/tsma2.py @@ -842,7 +842,7 @@ class TDTestCase: ] tdSql.execute('use db') for (i, ri, ret) in examples: - self.test_create_recursive_tsma_interval(db, tb, func, i, ri, ret, -2147471099) + self.test_create_recursive_tsma_interval(db, tb, func, i, ri, ret, -2147471097) self.create_tsma('tsma1', db, tb, func, '1h') self.create_recursive_tsma('tsma1', 'tsma2', db, '1n', tb, func) @@ -898,7 +898,17 @@ class TDTestCase: .get_qc()) self.check(ctxs) - tdSql.execute('drop database db') + + sql = 'select count(*), _wstart, _wend from db.meters interval(1n) sliding(1d) limit 1' + tdSql.query(sql) + first_win: datetime = tdSql.queryResult[0][1] + if first_win.hour != 0: + tdLog.exit("day sliding should always aligned with current timezone") + sql = 'select /*+skip_tsma()*/count(*), _wstart, _wend from db.meters interval(1n) sliding(1d) limit 1' + tdSql.query(sql) + first_win: datetime = tdSql.queryResult[0][1] + if first_win.hour != 0: + tdLog.exit("day sliding should always aligned with current timezone") def stop(self): tdSql.close() From 566bab1fb28236874ca6a2e94b7895d253ee890a Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Thu, 8 Aug 2024 18:25:43 +0800 Subject: [PATCH 051/103] fix: disable process write request when vnode is in snapshot mode --- include/util/taoserror.h | 1 + source/dnode/vnode/src/inc/vnodeInt.h | 1 + source/dnode/vnode/src/vnd/vnodeOpen.c | 1 + source/dnode/vnode/src/vnd/vnodeSnapshot.c | 8 +++++++- source/dnode/vnode/src/vnd/vnodeSvr.c | 8 ++++++++ source/util/src/terror.c | 1 + 6 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/util/taoserror.h b/include/util/taoserror.h index f0cb30e7e0..b091d870ec 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -529,6 +529,7 @@ int32_t taosGetErrSize(); #define TSDB_CODE_VND_META_DATA_UNSAFE_DELETE TAOS_DEF_ERROR_CODE(0, 0x0535) #define TSDB_CODE_VND_COLUMN_COMPRESS_ALREADY_EXIST TAOS_DEF_ERROR_CODE(0, 0x0536) #define TSDB_CODE_VND_ARB_NOT_SYNCED TAOS_DEF_ERROR_CODE(0, 0x0537) // internal +#define TSDB_CODE_VND_WRITE_DISABLED TAOS_DEF_ERROR_CODE(0, 0x0538) // internal // tsdb #define TSDB_CODE_TDB_INVALID_TABLE_ID TAOS_DEF_ERROR_CODE(0, 0x0600) diff --git a/source/dnode/vnode/src/inc/vnodeInt.h b/source/dnode/vnode/src/inc/vnodeInt.h index 416438b2ec..3eb3f09cee 100644 --- a/source/dnode/vnode/src/inc/vnodeInt.h +++ b/source/dnode/vnode/src/inc/vnodeInt.h @@ -473,6 +473,7 @@ struct SVnode { STfs* pTfs; int32_t diskPrimary; SMsgCb msgCb; + bool disableWrite; // Buffer Pool TdThreadMutex mutex; diff --git a/source/dnode/vnode/src/vnd/vnodeOpen.c b/source/dnode/vnode/src/vnd/vnodeOpen.c index 8a2b10d2ef..4f5d7c24e1 100644 --- a/source/dnode/vnode/src/vnd/vnodeOpen.c +++ b/source/dnode/vnode/src/vnd/vnodeOpen.c @@ -403,6 +403,7 @@ SVnode *vnodeOpen(const char *path, int32_t diskPrimary, STfs *pTfs, SMsgCb msgC pVnode->msgCb = msgCb; (void)taosThreadMutexInit(&pVnode->lock, NULL); pVnode->blocked = false; + pVnode->disableWrite = false; (void)tsem_init(&pVnode->syncSem, 0, 0); (void)taosThreadMutexInit(&pVnode->mutex, NULL); diff --git a/source/dnode/vnode/src/vnd/vnodeSnapshot.c b/source/dnode/vnode/src/vnd/vnodeSnapshot.c index 92e2ffeb7f..c347750d65 100644 --- a/source/dnode/vnode/src/vnd/vnodeSnapshot.c +++ 
b/source/dnode/vnode/src/vnd/vnodeSnapshot.c @@ -609,7 +609,10 @@ int32_t vnodeSnapWriterOpen(SVnode *pVnode, SSnapshotParam *pParam, SVSnapWriter int64_t sver = pParam->start; int64_t ever = pParam->end; - // cancel and disable all bg task + // disable write, cancel and disable all bg tasks + (void)taosThreadMutexLock(&pVnode->mutex); + pVnode->disableWrite = true; + (void)taosThreadMutexUnlock(&pVnode->mutex); (void)vnodeCancelAndDisableAllBgTask(pVnode); // alloc @@ -741,6 +744,9 @@ int32_t vnodeSnapWriterClose(SVSnapWriter *pWriter, int8_t rollback, SSnapshot * } (void)vnodeBegin(pVnode); + (void)taosThreadMutexLock(&pVnode->mutex); + pVnode->disableWrite = false; + (void)taosThreadMutexUnlock(&pVnode->mutex); _exit: if (code) { diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index feaad0f46d..3f6ca053cd 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -518,6 +518,14 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t ver, SRpcMsg void *pReq; int32_t len; + (void)taosThreadMutexLock(&pVnode->mutex); + if (pVnode->disableWrite) { + (void)taosThreadMutexUnlock(&pVnode->mutex); + vError("vgId:%d write is disabled for snapshot, version:%" PRId64, TD_VID(pVnode), ver); + return TSDB_CODE_VND_WRITE_DISABLED; + } + (void)taosThreadMutexUnlock(&pVnode->mutex); + if (ver <= pVnode->state.applied) { vError("vgId:%d, duplicate write request. ver: %" PRId64 ", applied: %" PRId64 "", TD_VID(pVnode), ver, pVnode->state.applied); diff --git a/source/util/src/terror.c b/source/util/src/terror.c index e9bdafcd5a..1d2e8a9100 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -409,6 +409,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_VND_ALREADY_IS_VOTER, "Vnode already is a vo TAOS_DEFINE_ERROR(TSDB_CODE_VND_DIR_ALREADY_EXIST, "Vnode directory already exist") TAOS_DEFINE_ERROR(TSDB_CODE_VND_META_DATA_UNSAFE_DELETE, "Single replica vnode data will lost permanently after this operation, if you make sure this, please use drop dnode unsafe to execute") TAOS_DEFINE_ERROR(TSDB_CODE_VND_ARB_NOT_SYNCED, "Vgroup peer is not synced") +TAOS_DEFINE_ERROR(TSDB_CODE_VND_WRITE_DISABLED, "Vnode write is disabled for snapshot") TAOS_DEFINE_ERROR(TSDB_CODE_VND_COLUMN_COMPRESS_ALREADY_EXIST,"Same with old param") From 1c5db0c4692b31efce48a7fb348022188ee5e905 Mon Sep 17 00:00:00 2001 From: wangjiaming0909 <604227650@qq.com> Date: Thu, 8 Aug 2024 18:26:41 +0800 Subject: [PATCH 052/103] remove unused md5hash --- source/libs/parser/src/parAstCreater.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/source/libs/parser/src/parAstCreater.c b/source/libs/parser/src/parAstCreater.c index fab58814d5..ee5f215d72 100644 --- a/source/libs/parser/src/parAstCreater.c +++ b/source/libs/parser/src/parAstCreater.c @@ -316,17 +316,8 @@ SNode* releaseRawExprNode(SAstCreateContext* pCxt, SNode* pNode) { // See TS-3398. // Len of pRawExpr->p could be larger than len of aliasName[TSDB_COL_NAME_LEN]. // If aliasName is truncated, hash value of aliasName could be the same. 
- T_MD5_CTX ctx; - tMD5Init(&ctx); - tMD5Update(&ctx, (uint8_t*)pRawExpr->p, pRawExpr->n); - tMD5Final(&ctx); - char* p = pExpr->aliasName; - for (uint8_t i = 0; i < tListLen(ctx.digest); ++i) { - sprintf(p, "%02x", ctx.digest[i]); - p += 2; - } - uint64_t a = MurmurHash3_64(pRawExpr->p, pRawExpr->n); - sprintf(pExpr->aliasName, "%"PRIu64, a); + uint64_t hashVal = MurmurHash3_64(pRawExpr->p, pRawExpr->n); + sprintf(pExpr->aliasName, "%"PRIu64, hashVal); strncpy(pExpr->userAlias, pRawExpr->p, len); pExpr->userAlias[len] = '\0'; } From bde16f4667bd70b8421b23d2dcfe1a60aef4f657 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Thu, 8 Aug 2024 18:34:02 +0800 Subject: [PATCH 053/103] opti:[TD-30453] change trans policy from rollback to retry in tmq --- source/dnode/mnode/impl/src/mndConsumer.c | 2 +- source/dnode/mnode/impl/src/mndSubscribe.c | 4 ++-- source/dnode/mnode/impl/src/mndTopic.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndConsumer.c b/source/dnode/mnode/impl/src/mndConsumer.c index 6116d2da19..8f2523d50e 100644 --- a/source/dnode/mnode/impl/src/mndConsumer.c +++ b/source/dnode/mnode/impl/src/mndConsumer.c @@ -142,7 +142,7 @@ static int32_t mndProcessConsumerClearMsg(SRpcMsg *pMsg) { mndConsumerStatusName(pConsumer->status)); MND_TMQ_RETURN_CHECK(tNewSMqConsumerObj(pConsumer->consumerId, pConsumer->cgroup, -1, NULL, NULL, &pConsumerNew)); - pTrans = mndTransCreate(pMnode, TRN_POLICY_ROLLBACK, TRN_CONFLICT_NOTHING, pMsg, "clear-csm"); + pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, TRN_CONFLICT_NOTHING, pMsg, "clear-csm"); MND_TMQ_NULL_CHECK(pTrans); MND_TMQ_RETURN_CHECK(mndSetConsumerDropLogs(pTrans, pConsumerNew)); code = mndTransPrepare(pMnode, pTrans); diff --git a/source/dnode/mnode/impl/src/mndSubscribe.c b/source/dnode/mnode/impl/src/mndSubscribe.c index bff313dbaf..b3f8cf0aea 100644 --- a/source/dnode/mnode/impl/src/mndSubscribe.c +++ b/source/dnode/mnode/impl/src/mndSubscribe.c @@ -643,7 +643,7 @@ static int32_t mndPersistRebResult(SMnode *pMnode, SRpcMsg *pMsg, const SMqRebOu char cgroup[TSDB_CGROUP_LEN] = {0}; mndSplitSubscribeKey(pOutput->pSub->key, topic, cgroup, true); - pTrans = mndTransCreate(pMnode, TRN_POLICY_ROLLBACK, TRN_CONFLICT_DB_INSIDE, pMsg, "tmq-reb"); + pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, TRN_CONFLICT_DB_INSIDE, pMsg, "tmq-reb"); if (pTrans == NULL) { code = TSDB_CODE_MND_RETURN_VALUE_NULL; if (terrno != 0) code = terrno; @@ -1079,7 +1079,7 @@ static int32_t mndProcessDropCgroupReq(SRpcMsg *pMsg) { goto END; } - pTrans = mndTransCreate(pMnode, TRN_POLICY_ROLLBACK, TRN_CONFLICT_DB, pMsg, "drop-cgroup"); + pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, TRN_CONFLICT_DB, pMsg, "drop-cgroup"); MND_TMQ_NULL_CHECK(pTrans); mInfo("trans:%d, used to drop cgroup:%s on topic %s", pTrans->id, dropReq.cgroup, dropReq.topic); mndTransSetDbName(pTrans, pSub->dbName, NULL); diff --git a/source/dnode/mnode/impl/src/mndTopic.c b/source/dnode/mnode/impl/src/mndTopic.c index c27cc04ae8..dcac86a7d4 100644 --- a/source/dnode/mnode/impl/src/mndTopic.c +++ b/source/dnode/mnode/impl/src/mndTopic.c @@ -431,7 +431,7 @@ static int32_t mndCreateTopic(SMnode *pMnode, SRpcMsg *pReq, SCMCreateTopicReq * SQueryPlan *pPlan = NULL; SMqTopicObj topicObj = {0}; - pTrans = mndTransCreate(pMnode, TRN_POLICY_ROLLBACK, TRN_CONFLICT_DB, pReq, "create-topic"); + pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, TRN_CONFLICT_DB, pReq, "create-topic"); MND_TMQ_NULL_CHECK(pTrans); mndTransSetDbName(pTrans, pDb->name, NULL); 
MND_TMQ_RETURN_CHECK(mndTransCheckConflict(pMnode, pTrans)); From 7a788382100cfaf77d98e5313f3ce5681fbf793a Mon Sep 17 00:00:00 2001 From: Chris Zhai Date: Thu, 8 Aug 2024 18:37:19 +0800 Subject: [PATCH 054/103] add test cases for TD-31308 --- tests/army/tmq/tmqBugs.py | 98 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 tests/army/tmq/tmqBugs.py diff --git a/tests/army/tmq/tmqBugs.py b/tests/army/tmq/tmqBugs.py new file mode 100644 index 0000000000..f2ef433665 --- /dev/null +++ b/tests/army/tmq/tmqBugs.py @@ -0,0 +1,98 @@ + +import taos +import sys +import time +import socket +import os +import threading + +from frame.log import * +from frame.cases import * +from frame.sql import * +from frame.caseBase import * +from frame import * +from taos.tmq import * +import frame.etool + +class TDTestCase: + updatecfgDict = {'debugFlag': 135, 'asynclog': 0} + def init(self, conn, logSql, replicaVar=1): + self.replicaVar = int(replicaVar) + tdLog.debug(f"start to excute {__file__}") + tdSql.init(conn.cursor()) + #tdSql.init(conn.cursor(), logSql) # output sql.txt file + + def td_31283_test(self): + tdSql.execute(f'create database if not exists d1 vgroups 1') + tdSql.execute(f'use d1') + tdSql.execute(f'create table st(ts timestamp, i int) tags(t int)') + tdSql.execute(f'insert into t1 using st tags(1) values(now, 1) (now+1s, 2)') + tdSql.execute(f'insert into t2 using st tags(2) values(now, 1) (now+1s, 2)') + tdSql.execute(f'insert into t3 using st tags(3) values(now, 1) (now+1s, 2)') + tdSql.execute(f'insert into t1 using st tags(1) values(now+5s, 11) (now+10s, 12)') + + tdSql.query("select * from st") + tdSql.checkRows(8) + + tdSql.error(f'create topic t1 with meta as database d2', expectErrInfo="Database not exist") + tdSql.error(f'create topic t1 as database d2', expectErrInfo="Database not exist") + tdSql.error(f'create topic t2 as select * from st2', expectErrInfo="Fail to get table info, error: Table does not exist") + tdSql.error(f'create topic t3 as stable st2', expectErrInfo="STable not exist") + tdSql.error(f'create topic t3 with meta as stable st2', expectErrInfo="STable not exist") + + tdSql.execute(f'create topic t1 with meta as database d1') + + consumer_dict = { + "group.id": "g1", + "td.connect.user": "root", + "td.connect.pass": "taosdata", + "auto.offset.reset": "earliest", + # "msg.enable.batchmeta": "true", + "experimental.snapshot.enable": "true", + } + consumer1 = Consumer(consumer_dict) + + try: + consumer1.subscribe(["t1"]) + except TmqError: + tdLog.exit(f"subscribe error") + + index = 0 + try: + while True: + res = consumer1.poll(1) + if not res: + if index != 1: + tdLog.exit("consume error") + break + val = res.value() + if val is None: + continue + cnt = 0; + for block in val: + cnt += len(block.fetchall()) + + if cnt != 8: + tdLog.exit("consume error") + + index += 1 + finally: + consumer1.close() + + + tdSql.query(f'show consumers') + tdSql.checkRows(0) + + tdSql.execute(f'drop topic t1') + tdSql.execute(f'drop database d1') + + def run(self): + self.td_31283_test() + + + def stop(self): + tdSql.close() + tdLog.success(f"{__file__} successfully executed") + +tdCases.addLinux(__file__, TDTestCase()) +tdCases.addWindows(__file__, TDTestCase()) From ed710653eb0146809c8936a6418f2b077d687d75 Mon Sep 17 00:00:00 2001 From: dmchen Date: Thu, 8 Aug 2024 10:37:49 +0000 Subject: [PATCH 055/103] fix/TS-5262-log-conflict --- source/dnode/mnode/impl/src/mndTrans.c | 33 +++++++++++++++----------- 1 file changed, 19 insertions(+), 14 
deletions(-) diff --git a/source/dnode/mnode/impl/src/mndTrans.c b/source/dnode/mnode/impl/src/mndTrans.c index 8ed3f66009..7f8d63c8e0 100644 --- a/source/dnode/mnode/impl/src/mndTrans.c +++ b/source/dnode/mnode/impl/src/mndTrans.c @@ -817,6 +817,17 @@ static bool mndCheckStbConflict(const char *conflict, STrans *pTrans) { return false; } +static void mndTransLogConflict(STrans *pNew, STrans *pTrans, bool conflict, bool *globalConflict) { + if (conflict) { + mError("trans:%d, db:%s stb:%s type:%d, can't execute since conflict with trans:%d db:%s stb:%s type:%d", pNew->id, + pNew->dbname, pNew->stbname, pNew->conflict, pTrans->id, pTrans->dbname, pTrans->stbname, pTrans->conflict); + *globalConflict = true; + } else { + mInfo("trans:%d, db:%s stb:%s type:%d, not conflict with trans:%d db:%s stb:%s type:%d", pNew->id, pNew->dbname, + pNew->stbname, pNew->conflict, pTrans->id, pTrans->dbname, pTrans->stbname, pTrans->conflict); + } +} + static bool mndCheckTransConflict(SMnode *pMnode, STrans *pNew) { STrans *pTrans = NULL; void *pIter = NULL; @@ -832,18 +843,18 @@ static bool mndCheckTransConflict(SMnode *pMnode, STrans *pNew) { if (pNew->conflict == TRN_CONFLICT_DB) { if (pTrans->conflict == TRN_CONFLICT_GLOBAL) conflict = true; if (pTrans->conflict == TRN_CONFLICT_DB || pTrans->conflict == TRN_CONFLICT_DB_INSIDE) { - if (mndCheckDbConflict(pNew->dbname, pTrans)) conflict = true; - if (mndCheckStbConflict(pNew->stbname, pTrans)) conflict = true; + mndTransLogConflict(pNew, pTrans, mndCheckDbConflict(pNew->dbname, pTrans), &conflict); + mndTransLogConflict(pNew, pTrans, mndCheckStbConflict(pNew->stbname, pTrans), &conflict); } } if (pNew->conflict == TRN_CONFLICT_DB_INSIDE) { if (pTrans->conflict == TRN_CONFLICT_GLOBAL) conflict = true; if (pTrans->conflict == TRN_CONFLICT_DB) { - if (mndCheckDbConflict(pNew->dbname, pTrans)) conflict = true; - if (mndCheckStbConflict(pNew->stbname, pTrans)) conflict = true; + mndTransLogConflict(pNew, pTrans, mndCheckDbConflict(pNew->dbname, pTrans), &conflict); + mndTransLogConflict(pNew, pTrans, mndCheckStbConflict(pNew->stbname, pTrans), &conflict); } if (pTrans->conflict == TRN_CONFLICT_DB_INSIDE) { - if (mndCheckStbConflict(pNew->stbname, pTrans)) conflict = true; // for stb + mndTransLogConflict(pNew, pTrans, mndCheckStbConflict(pNew->stbname, pTrans), &conflict); // for stb } } @@ -871,22 +882,16 @@ static bool mndCheckTransConflict(SMnode *pMnode, STrans *pNew) { int32_t groupId = *(int32_t *)pGidIter; if (taosHashGet(pTrans->arbGroupIds, &groupId, sizeof(int32_t)) != NULL) { taosHashCancelIterate(pNew->arbGroupIds, pGidIter); - conflict = true; + mndTransLogConflict(pNew, pTrans, true, &conflict); break; + } else { + mndTransLogConflict(pNew, pTrans, false, &conflict); } pGidIter = taosHashIterate(pNew->arbGroupIds, pGidIter); } } } - if (conflict) { - mError("trans:%d, db:%s stb:%s type:%d, can't execute since conflict with trans:%d db:%s stb:%s type:%d", - pNew->id, pNew->dbname, pNew->stbname, pNew->conflict, pTrans->id, pTrans->dbname, pTrans->stbname, - pTrans->conflict); - } else { - mInfo("trans:%d, db:%s stb:%s type:%d, not conflict with trans:%d db:%s stb:%s type:%d", pNew->id, pNew->dbname, - pNew->stbname, pNew->conflict, pTrans->id, pTrans->dbname, pTrans->stbname, pTrans->conflict); - } sdbRelease(pMnode->pSdb, pTrans); } From ff67dd24bce386b8c2824f34eb39445f3d6996f9 Mon Sep 17 00:00:00 2001 From: Chris Zhai Date: Thu, 8 Aug 2024 18:39:37 +0800 Subject: [PATCH 056/103] update cases.task --- tests/parallel_test/cases.task | 1 + 1 file 
changed, 1 insertion(+) diff --git a/tests/parallel_test/cases.task b/tests/parallel_test/cases.task index e70042001d..1b5b73cd82 100644 --- a/tests/parallel_test/cases.task +++ b/tests/parallel_test/cases.task @@ -35,6 +35,7 @@ ,,y,army,./pytest.sh python3 ./test.py -f storage/compressBasic.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f grant/grantBugs.py -N 3 ,,y,army,./pytest.sh python3 ./test.py -f query/queryBugs.py -N 3 +,,y,army,./pytest.sh python3 ./test.py -f tmq/tmqBugs.py -N 3 # # system test From 8e81abe2248b3e3f74e68f7136688293251b22aa Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Thu, 8 Aug 2024 19:18:50 +0800 Subject: [PATCH 057/103] enh: get vardata len from udf api --- include/libs/function/taosudf.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/libs/function/taosudf.h b/include/libs/function/taosudf.h index 0b59d7c2f5..91487e5d1d 100644 --- a/include/libs/function/taosudf.h +++ b/include/libs/function/taosudf.h @@ -131,6 +131,14 @@ static FORCE_INLINE char *udfColDataGetData(const SUdfColumn *pColumn, int32_t r } } +static FORCE_INLINE int32_t udfColDataGetDataLen(const SUdfColumn *pColumn, int32_t row) { + if (IS_VAR_DATA_TYPE(pColumn->colMeta.type)) { + return *(uint16_t*)(pColumn->colData.varLenCol.payload + pColumn->colData.varLenCol.varOffsets[row]); + } else { + return pColumn->colMeta.bytes; + } +} + static FORCE_INLINE bool udfColDataIsNull(const SUdfColumn *pColumn, int32_t row) { if (IS_VAR_DATA_TYPE(pColumn->colMeta.type)) { if (pColumn->colMeta.type == TSDB_DATA_TYPE_JSON) { From 7be4d5b592cfb29757d16d418c45db735b62caf4 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Thu, 8 Aug 2024 19:19:23 +0800 Subject: [PATCH 058/103] enh: adjust udf example codes --- tests/script/sh/bit_and.c | 4 ++-- tests/script/sh/l2norm.c | 32 +++++++++++++++++++++----- tests/script/sh/max_vol.c | 47 +++++++++++++++++++++++++++++---------- 3 files changed, 64 insertions(+), 19 deletions(-) diff --git a/tests/script/sh/bit_and.c b/tests/script/sh/bit_and.c index 84485d396b..c35f1da171 100644 --- a/tests/script/sh/bit_and.c +++ b/tests/script/sh/bit_and.c @@ -27,8 +27,8 @@ DLL_EXPORT int32_t bit_and(SUdfDataBlock* block, SUdfColumn* resultCol) { for (int32_t i = 0; i < block->numOfRows; ++i) { if (udfColDataIsNull(block->udfCols[0], i)) { - udfTrace("block:%p, row:%d result is null since col:0 is null", block, i); udfColDataSetNull(resultCol, i); + udfTrace("block:%p, row:%d result is null since col:0 is null", block, i); continue; } @@ -38,8 +38,8 @@ DLL_EXPORT int32_t bit_and(SUdfDataBlock* block, SUdfColumn* resultCol) { int32_t j = 1; for (; j < block->numOfCols; ++j) { if (udfColDataIsNull(block->udfCols[j], i)) { - udfTrace("block:%p, row:%d result is null since col:%d is null", block, i, j); udfColDataSetNull(resultCol, i); + udfTrace("block:%p, row:%d result is null since col:%d is null", block, i, j); break; } diff --git a/tests/script/sh/l2norm.c b/tests/script/sh/l2norm.c index 865d9ee9a5..e2f379fd29 100644 --- a/tests/script/sh/l2norm.c +++ b/tests/script/sh/l2norm.c @@ -9,38 +9,55 @@ DLL_EXPORT int32_t l2norm_init() { return 0; } DLL_EXPORT int32_t l2norm_destroy() { return 0; } DLL_EXPORT int32_t l2norm_start(SUdfInterBuf* buf) { + int32_t bufLen = sizeof(double); + if (buf->bufLen < bufLen) { + udfError("failed to execute udf since input buflen:%d < %d", buf->bufLen, bufLen); + return TSDB_CODE_UDF_INVALID_BUFSIZE; + } + + udfTrace("start aggregation, buflen:%d used:%d", buf->bufLen, bufLen); *(int64_t*)(buf->buf) = 0; - buf->bufLen = 
sizeof(double); - buf->numOfResult = 1; + buf->bufLen = bufLen; + buf->numOfResult = 0; return 0; } DLL_EXPORT int32_t l2norm(SUdfDataBlock* block, SUdfInterBuf* interBuf, SUdfInterBuf* newInterBuf) { - double sumSquares = *(double*)interBuf->buf; - int8_t numNotNull = 0; + udfTrace("block:%p, processing begins, cols:%d rows:%d", block, block->numOfCols, block->numOfRows); + for (int32_t i = 0; i < block->numOfCols; ++i) { SUdfColumn* col = block->udfCols[i]; - if (!(col->colMeta.type == TSDB_DATA_TYPE_INT || col->colMeta.type == TSDB_DATA_TYPE_DOUBLE)) { + if (col->colMeta.type != TSDB_DATA_TYPE_INT && col->colMeta.type != TSDB_DATA_TYPE_DOUBLE) { + udfError("block:%p, col:%d type:%d should be int(%d) or double(%d)", block, i, col->colMeta.type, + TSDB_DATA_TYPE_INT, TSDB_DATA_TYPE_DOUBLE); return TSDB_CODE_UDF_INVALID_INPUT; } } + + double sumSquares = *(double*)interBuf->buf; + int8_t numNotNull = 0; + for (int32_t i = 0; i < block->numOfCols; ++i) { for (int32_t j = 0; j < block->numOfRows; ++j) { SUdfColumn* col = block->udfCols[i]; if (udfColDataIsNull(col, j)) { + udfTrace("block:%p, col:%d row:%d is null", block, i, j); continue; } + switch (col->colMeta.type) { case TSDB_DATA_TYPE_INT: { char* cell = udfColDataGetData(col, j); int32_t num = *(int32_t*)cell; sumSquares += (double)num * num; + udfTrace("block:%p, col:%d row:%d data:%d", block, i, j, num); break; } case TSDB_DATA_TYPE_DOUBLE: { char* cell = udfColDataGetData(col, j); double num = *(double*)cell; sumSquares += num * num; + udfTrace("block:%p, col:%d row:%d data:%f", block, i, j, num); break; } default: @@ -48,11 +65,14 @@ DLL_EXPORT int32_t l2norm(SUdfDataBlock* block, SUdfInterBuf* interBuf, SUdfInte } ++numNotNull; } + udfTrace("block:%p, col:%d result is %f", block, i, sumSquares); } *(double*)(newInterBuf->buf) = sumSquares; newInterBuf->bufLen = sizeof(double); newInterBuf->numOfResult = 1; + + udfTrace("block:%p, result is %f", block, sumSquares); return 0; } @@ -61,5 +81,7 @@ DLL_EXPORT int32_t l2norm_finish(SUdfInterBuf* buf, SUdfInterBuf* resultData) { *(double*)(resultData->buf) = sqrt(sumSquares); resultData->bufLen = sizeof(double); resultData->numOfResult = 1; + + udfTrace("end aggregation, result is %f", *(double*)(resultData->buf)); return 0; } diff --git a/tests/script/sh/max_vol.c b/tests/script/sh/max_vol.c index 0a57a26d1c..1a7a3f8210 100644 --- a/tests/script/sh/max_vol.c +++ b/tests/script/sh/max_vol.c @@ -6,27 +6,33 @@ #define STR_MAX_LEN 256 // inter buffer length -// init DLL_EXPORT int32_t max_vol_init() { return 0; } -// destory DLL_EXPORT int32_t max_vol_destroy() { return 0; } -// start DLL_EXPORT int32_t max_vol_start(SUdfInterBuf *buf) { + int32_t bufLen = sizeof(float) + STR_MAX_LEN; + if (buf->bufLen < bufLen) { + udfError("failed to execute udf since input buflen:%d < %d", buf->bufLen, bufLen); + return TSDB_CODE_UDF_INVALID_BUFSIZE; + } + + udfTrace("start aggregation, buflen:%d used:%d", buf->bufLen, bufLen); memset(buf->buf, 0, sizeof(float) + STR_MAX_LEN); - // set init value - *((float *)buf->buf) = -10000000; - buf->bufLen = sizeof(float) + STR_MAX_LEN; + *((float *)buf->buf) = INT32_MIN; + buf->bufLen = bufLen; buf->numOfResult = 0; return 0; } DLL_EXPORT int32_t max_vol(SUdfDataBlock *block, SUdfInterBuf *interBuf, SUdfInterBuf *newInterBuf) { + udfTrace("block:%p, processing begins, cols:%d rows:%d", block, block->numOfCols, block->numOfRows); + float maxValue = *(float *)interBuf->buf; char strBuff[STR_MAX_LEN] = "inter1buf"; if (block->numOfCols < 2) { + udfError("block:%p, 
cols:%d needs to be greater than 2", block, block->numOfCols); return TSDB_CODE_UDF_INVALID_INPUT; } @@ -36,10 +42,12 @@ DLL_EXPORT int32_t max_vol(SUdfDataBlock *block, SUdfInterBuf *interBuf, SUdfInt if (i == block->numOfCols - 1) { // last column is device id , must varchar if (col->colMeta.type != TSDB_DATA_TYPE_VARCHAR) { + udfError("block:%p, col:%d type:%d should be varchar(%d)", block, i, col->colMeta.type, TSDB_DATA_TYPE_VARCHAR); return TSDB_CODE_UDF_INVALID_INPUT; } } else { if (col->colMeta.type != TSDB_DATA_TYPE_FLOAT) { + udfError("block:%p, col:%d type:%d should be float(%d)", block, i, col->colMeta.type, TSDB_DATA_TYPE_FLOAT); return TSDB_CODE_UDF_INVALID_INPUT; } } @@ -47,28 +55,41 @@ DLL_EXPORT int32_t max_vol(SUdfDataBlock *block, SUdfInterBuf *interBuf, SUdfInt // calc max voltage SUdfColumn *lastCol = block->udfCols[block->numOfCols - 1]; - for (int32_t i = 0; i < (block->numOfCols - 1); ++i) { + for (int32_t i = 0; i < block->numOfCols - 1; ++i) { for (int32_t j = 0; j < block->numOfRows; ++j) { SUdfColumn *col = block->udfCols[i]; if (udfColDataIsNull(col, j)) { + udfTrace("block:%p, col:%d row:%d is null", block, i, j); continue; } + char *data = udfColDataGetData(col, j); float voltage = *(float *)data; - if (voltage > maxValue) { + + if (voltage <= maxValue) { + udfTrace("block:%p, col:%d row:%d data:%f", block, i, j, voltage); + } else { maxValue = voltage; - char *valData = udfColDataGetData(lastCol, j); + char *valData = udfColDataGetData(lastCol, j); + int32_t valDataLen = udfColDataGetDataLen(lastCol, j); + // get device id - char *deviceId = valData + sizeof(uint16_t); - sprintf(strBuff, "%s_(%d,%d)_%f", deviceId, j, i, maxValue); + char *deviceId = valData + sizeof(uint16_t); + int32_t deviceIdLen = valDataLen < (STR_MAX_LEN - 1) ? 
valDataLen : (STR_MAX_LEN - 1); + + strncpy(strBuff, deviceId, deviceIdLen); + snprintf(strBuff + deviceIdLen, STR_MAX_LEN - deviceIdLen, "_(%d,%d)_%f", j, i, maxValue); + udfTrace("block:%p, col:%d row:%d data:%f, as max_val:%s", block, i, j, voltage, strBuff); } } } *(float *)newInterBuf->buf = maxValue; - strcpy(newInterBuf->buf + sizeof(float), strBuff); + strncpy(newInterBuf->buf + sizeof(float), strBuff, STR_MAX_LEN); newInterBuf->bufLen = sizeof(float) + strlen(strBuff) + 1; newInterBuf->numOfResult = 1; + + udfTrace("block:%p, result is %s", block, strBuff); return 0; } @@ -86,5 +107,7 @@ DLL_EXPORT int32_t max_vol_finish(SUdfInterBuf *buf, SUdfInterBuf *resultData) { resultData->bufLen = len + sizeof(uint16_t); // set row count resultData->numOfResult = 1; + + udfTrace("end aggregation, result is %s", str); return 0; } From 645b49023bfa092a2d1aac008a9db7d079205510 Mon Sep 17 00:00:00 2001 From: xiao-77 Date: Thu, 8 Aug 2024 19:39:43 +0800 Subject: [PATCH 059/103] add hyperloglog to requirements.txt --- tests/requirements.txt | 1 + tests/system-test/2-query/agg_null.py | 28 +-------------------------- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/tests/requirements.txt b/tests/requirements.txt index 5cdd9e02be..c6dd044c86 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -9,3 +9,4 @@ requests pexpect faker pyopenssl +hyperloglog \ No newline at end of file diff --git a/tests/system-test/2-query/agg_null.py b/tests/system-test/2-query/agg_null.py index bb4fbf41a2..bec879abbe 100644 --- a/tests/system-test/2-query/agg_null.py +++ b/tests/system-test/2-query/agg_null.py @@ -17,36 +17,10 @@ from util.cases import * from util.sql import * from util.common import * from util.sqlset import * -from scipy.stats import gaussian_kde from hyperloglog import HyperLogLog ''' Test case for TS-5150 ''' -def approximate_percentile(data, percentile): - """ - 使用 KDE 近似计算百分位数。 - - Parameters: - - data: 包含数据的列表或数组 - - percentile: 要计算的百分位数(0到100之间) - - Returns: - - 近似百分位数的值 - """ - # 使用高斯核估计概率密度 - kde = gaussian_kde(data) - - # 生成一组足够密集的点,计算累积分布函数 - min_val = min(data) - max_val = max(data) - x = np.linspace(min_val, max_val, 1000) - cdf = np.cumsum(kde(x) / kde(x).sum()) - - # 找到最接近所需百分位数的值 - idx = np.abs(cdf - percentile / 100.0).argmin() - approximate_value = x[idx] - - return approximate_value class TDTestCase: def init(self, conn, logSql, replicaVar=1): self.replicaVar = int(replicaVar) @@ -89,7 +63,7 @@ class TDTestCase: HYPERLOGLOG(CASE WHEN delay != 0 THEN delay ELSE NULL END) AS hyperloglog from stb where ts between {1537146000000 + i * 1000} and {1537146000000 + (i+10) * 1000}') #verify apercentile apercentile_res = tdSql.queryResult[0][0] - approximate_median = approximate_percentile(col_val_list, 50) + approximate_median = np.percentile(col_val_list, 50) assert np.abs(apercentile_res - approximate_median) < 1 #verify max max_res = tdSql.queryResult[0][1] From dfa74f82d78b4bfbfcbba4768cf8d0794de45ccc Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 00:40:05 +0800 Subject: [PATCH 060/103] fix(stream): avoid repeat send checkpoint-report msg. 
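Each stream now keeps an SChkptReportInfo entry (the task list plus the last checkpoint id that was already turned into an update trans) instead of a bare task array, and every incoming checkpoint-report is validated and de-duplicated per task before it is recorded, so a re-sent report can no longer be counted twice. The standalone sketch below only illustrates the acceptance rule that validateChkptReport() applies; the type and function names in the sketch are invented for illustration and are not part of this patch:

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct {
      int64_t latestId;    /* checkpoint id already persisted for the task          */
      int64_t activeId;    /* id of the checkpoint currently in progress, 0 if none */
      int64_t reportChkpt; /* last id already consumed from the report list         */
    } ChkptState;

    /* A report is recorded only when it is newer than everything already known. */
    static bool shouldAcceptReport(const ChkptState *s, int64_t reportedId) {
      if (reportedId <= s->latestId) return false;                    /* already persisted      */
      if (s->activeId != 0 && s->activeId > reportedId) return false; /* older than active ckpt */
      if (reportedId <= s->reportChkpt) return false;                 /* already in the list    */
      return true;
    }
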
--- source/dnode/mnode/impl/inc/mndStream.h | 12 ++- source/dnode/mnode/impl/src/mndStream.c | 104 ++++++++++++++------ source/dnode/mnode/impl/src/mndStreamHb.c | 6 +- source/dnode/mnode/impl/src/mndStreamUtil.c | 68 +++++++++---- source/dnode/vnode/src/tqCommon/tqCommon.c | 2 +- source/libs/stream/src/streamCheckpoint.c | 23 +++-- 6 files changed, 155 insertions(+), 60 deletions(-) diff --git a/source/dnode/mnode/impl/inc/mndStream.h b/source/dnode/mnode/impl/inc/mndStream.h index 89343ce37c..a5d91c8aa8 100644 --- a/source/dnode/mnode/impl/inc/mndStream.h +++ b/source/dnode/mnode/impl/inc/mndStream.h @@ -57,6 +57,12 @@ typedef struct SStreamTaskResetMsg { int32_t transId; } SStreamTaskResetMsg; +typedef struct SChkptReportInfo { + SArray* pTaskList; + int64_t reportChkpt; + int64_t streamId; +} SChkptReportInfo; + typedef struct SStreamExecInfo { bool initTaskList; SArray *pNodeList; @@ -66,9 +72,9 @@ typedef struct SStreamExecInfo { SArray *pTaskList; TdThreadMutex lock; SHashObj *pTransferStateStreams; - SHashObj *pChkptStreams; + SHashObj *pChkptStreams; // use to update the checkpoint info, if all tasks send the checkpoint-report msgs SHashObj *pStreamConsensus; - SArray *pKilledChkptTrans; // SArray + SArray *pKilledChkptTrans; // SArray } SStreamExecInfo; extern SStreamExecInfo execInfo; @@ -153,6 +159,8 @@ int32_t mndGetConsensusInfo(SHashObj *pHash, int64_t streamId, int32_t numOfTask void mndAddConsensusTasks(SCheckpointConsensusInfo *pInfo, const SRestoreCheckpointInfo *pRestoreInfo); void mndClearConsensusRspEntry(SCheckpointConsensusInfo *pInfo); int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId); +int64_t mndClearChkptReportInfo(SHashObj* pHash, int64_t streamId); +int32_t mndResetChkptReportInfo(SHashObj* pHash, int64_t streamId); int32_t setStreamAttrInResBlock(SStreamObj *pStream, SSDataBlock *pBlock, int32_t numOfRows); int32_t setTaskAttrInResBlock(SStreamObj *pStream, SStreamTask *pTask, SSDataBlock *pBlock, int32_t numOfRows); diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index 90ef7daa60..a8d35993c7 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2454,8 +2454,45 @@ int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq) { return 0; } -static void doAddReportStreamTask(SArray* pList, const SCheckpointReport* pReport) { - bool existed = false; +// valid the info according to the HbMsg +static bool validateChkptReport(const SCheckpointReport *pReport, int64_t reportChkptId) { + STaskId id = {.streamId = pReport->streamId, .taskId = pReport->taskId}; + STaskStatusEntry *pTaskEntry = taosHashGet(execInfo.pTaskMap, &id, sizeof(id)); + if (pTaskEntry == NULL) { + mError("invalid checkpoint-report msg from task:0x%x, discard", pReport->taskId); + return false; + } + + if (pTaskEntry->checkpointInfo.latestId >= pReport->checkpointId) { + mError("s-task:0x%x invalid checkpoint-report msg, checkpointId:%" PRId64 " saved checkpointId:%" PRId64 " discard", + pReport->taskId, pReport->checkpointId, pTaskEntry->checkpointInfo.activeId); + return false; + } + + // now the task in checkpoint procedure + if ((pTaskEntry->checkpointInfo.activeId != 0) && (pTaskEntry->checkpointInfo.activeId > pReport->checkpointId)) { + mError("s-task:0x%x invalid checkpoint-report msg, checkpointId:%" PRId64 " active checkpointId:%" PRId64 + " discard", + pReport->taskId, pReport->checkpointId, pTaskEntry->checkpointInfo.activeId); + return false; + } + + if (reportChkptId >= 
pReport->checkpointId) { + mError("s-task:0x%x expired checkpoint-report msg, checkpointId:%" PRId64 " already update checkpointId:%" PRId64 + " discard", + pReport->taskId, pReport->checkpointId, reportChkptId); + return false; + } + + return true; +} + +static void doAddReportStreamTask(SArray *pList, int64_t reportChkptId, const SCheckpointReport *pReport) { + bool valid = validateChkptReport(pReport, reportChkptId); + if (!valid) { + return; + } + for (int32_t i = 0; i < taosArrayGetSize(pList); ++i) { STaskChkptInfo *p = taosArrayGet(pList, i); if (p == NULL) { @@ -2463,27 +2500,38 @@ static void doAddReportStreamTask(SArray* pList, const SCheckpointReport* pRepor } if (p->taskId == pReport->taskId) { - existed = true; - break; + if (p->checkpointId > pReport->checkpointId) { + mError("s-task:0x%x invalid checkpoint-report msg, existed:%" PRId64 " req checkpointId:%" PRId64 ", discard", + pReport->taskId, p->checkpointId, pReport->checkpointId); + } else if (p->checkpointId < pReport->checkpointId) { // expired checkpoint-report msg, update it + mDebug("s-task:0x%x expired checkpoint-report msg in checkpoint-report list update from %" PRId64 "->%" PRId64, + pReport->taskId, p->checkpointId, pReport->checkpointId); + + memcpy(p, pReport, sizeof(STaskChkptInfo)); + } else { + mWarn("taskId:0x%x already in checkpoint-report list", pReport->taskId); + } + return; } } - if (!existed) { - STaskChkptInfo info = { - .streamId = pReport->streamId, - .taskId = pReport->taskId, - .transId = pReport->transId, - .dropHTask = pReport->dropHTask, - .version = pReport->checkpointVer, - .ts = pReport->checkpointTs, - .checkpointId = pReport->checkpointId, - .nodeId = pReport->nodeId, - }; + STaskChkptInfo info = { + .streamId = pReport->streamId, + .taskId = pReport->taskId, + .transId = pReport->transId, + .dropHTask = pReport->dropHTask, + .version = pReport->checkpointVer, + .ts = pReport->checkpointTs, + .checkpointId = pReport->checkpointId, + .nodeId = pReport->nodeId, + }; - void* p = taosArrayPush(pList, &info); - if (p == NULL) { - mError("failed to put into task list, taskId:0x%x", pReport->taskId); - } + void *p = taosArrayPush(pList, &info); + if (p == NULL) { + mError("failed to put into task list, taskId:0x%x", pReport->taskId); + } else { + int32_t size = taosArrayGetSize(pList); + mDebug("stream:0x%"PRIx64" %d tasks has send checkpoint-report", pReport->streamId, size); } } @@ -2530,23 +2578,23 @@ int32_t mndProcessCheckpointReport(SRpcMsg *pReq) { int32_t numOfTasks = (pStream == NULL) ? 
0 : mndGetNumOfStreamTasks(pStream); - SArray **pReqTaskList = (SArray **)taosHashGet(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId)); - if (pReqTaskList == NULL) { - SArray *pList = taosArrayInit(4, sizeof(STaskChkptInfo)); - if (pList != NULL) { - doAddReportStreamTask(pList, &req); - code = taosHashPut(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId), &pList, POINTER_BYTES); + SChkptReportInfo *pInfo = (SChkptReportInfo*)taosHashGet(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId)); + if (pInfo == NULL) { + SChkptReportInfo info = {.pTaskList = taosArrayInit(4, sizeof(STaskChkptInfo)), .streamId = req.streamId}; + if (info.pTaskList != NULL) { + doAddReportStreamTask(info.pTaskList, info.reportChkpt, &req); + code = taosHashPut(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId), &info, sizeof(info)); if (code) { mError("stream:0x%" PRIx64 " failed to put into checkpoint stream", req.streamId); } - pReqTaskList = (SArray **)taosHashGet(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId)); + pInfo = (SChkptReportInfo *)taosHashGet(execInfo.pChkptStreams, &req.streamId, sizeof(req.streamId)); } } else { - doAddReportStreamTask(*pReqTaskList, &req); + doAddReportStreamTask(pInfo->pTaskList, pInfo->reportChkpt, &req); } - int32_t total = taosArrayGetSize(*pReqTaskList); + int32_t total = taosArrayGetSize(pInfo->pTaskList); if (total == numOfTasks) { // all tasks has send the reqs mInfo("stream:0x%" PRIx64 " %s all %d tasks send checkpoint-report, checkpoint meta-info for checkpointId:%" PRId64 " will be issued soon", diff --git a/source/dnode/mnode/impl/src/mndStreamHb.c b/source/dnode/mnode/impl/src/mndStreamHb.c index 50db903520..59f07ce977 100644 --- a/source/dnode/mnode/impl/src/mndStreamHb.c +++ b/source/dnode/mnode/impl/src/mndStreamHb.c @@ -211,6 +211,10 @@ int32_t mndProcessResetStatusReq(SRpcMsg *pReq) { SStreamTaskResetMsg* pMsg = pReq->pCont; mndKillTransImpl(pMnode, pMsg->transId, ""); + streamMutexLock(&execInfo.lock); + (void) mndResetChkptReportInfo(execInfo.pChkptStreams, pMsg->streamId); + streamMutexUnlock(&execInfo.lock); + code = mndGetStreamObj(pMnode, pMsg->streamId, &pStream); if (pStream == NULL || code != 0) { code = TSDB_CODE_STREAM_TASK_NOT_EXIST; @@ -453,7 +457,7 @@ int32_t mndProcessStreamHb(SRpcMsg *pReq) { addIntoCheckpointList(pFailedChkpt, &info); // remove failed trans from pChkptStreams - code = taosHashRemove(execInfo.pChkptStreams, &p->id.streamId, sizeof(p->id.streamId)); + code = mndResetChkptReportInfo(execInfo.pChkptStreams, p->id.streamId); if (code) { mError("failed to remove stream:0x%"PRIx64" in checkpoint stream list", p->id.streamId); } diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 739bb0ca37..649cab91c1 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -904,8 +904,9 @@ void removeStreamTasksInBuf(SStreamObj *pStream, SStreamExecInfo *pExecNode) { ASSERT(taosHashGetSize(pExecNode->pTaskMap) == taosArrayGetSize(pExecNode->pTaskList)); - // 2. remove stream entry in consensus hash table + // 2. 
remove stream entry in consensus hash table and checkpoint-report hash table (void) mndClearConsensusCheckpointId(execInfo.pStreamConsensus, pStream->uid); + (void) mndClearChkptReportInfo(execInfo.pChkptStreams, pStream->uid); streamMutexUnlock(&pExecNode->lock); destroyStreamTaskIter(pIter); @@ -973,9 +974,8 @@ int32_t removeExpiredNodeEntryAndTaskInBuf(SArray *pNodeSnapshot) { static int32_t doSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamTask *pTask) { SVUpdateCheckpointInfoReq *pReq = taosMemoryCalloc(1, sizeof(SVUpdateCheckpointInfoReq)); if (pReq == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; mError("failed to malloc in reset stream, size:%" PRIzu ", code:%s", sizeof(SVUpdateCheckpointInfoReq), - tstrerror(TSDB_CODE_OUT_OF_MEMORY)); + tstrerror(terrno)); return terrno; } @@ -983,12 +983,14 @@ static int32_t doSetUpdateChkptAction(SMnode *pMnode, STrans *pTrans, SStreamTas pReq->taskId = pTask->id.taskId; pReq->streamId = pTask->id.streamId; - SArray **pReqTaskList = (SArray **)taosHashGet(execInfo.pChkptStreams, &pTask->id.streamId, sizeof(pTask->id.streamId)); - ASSERT(pReqTaskList); + SChkptReportInfo *pStreamItem = (SChkptReportInfo*)taosHashGet(execInfo.pChkptStreams, &pTask->id.streamId, sizeof(pTask->id.streamId)); + if (pStreamItem == NULL) { + return TSDB_CODE_INVALID_PARA; + } - int32_t size = taosArrayGetSize(*pReqTaskList); + int32_t size = taosArrayGetSize(pStreamItem->pTaskList); for(int32_t i = 0; i < size; ++i) { - STaskChkptInfo* pInfo = taosArrayGet(*pReqTaskList, i); + STaskChkptInfo* pInfo = taosArrayGet(pStreamItem->pTaskList, i); if (pInfo == NULL) { continue; } @@ -1063,11 +1065,12 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { } mDebug("start to scan checkpoint report info"); + streamMutexLock(&execInfo.lock); while ((pIter = taosHashIterate(execInfo.pChkptStreams, pIter)) != NULL) { - SArray *pList = *(SArray **)pIter; + SChkptReportInfo* px = (SChkptReportInfo *)pIter; - STaskChkptInfo *pInfo = taosArrayGet(pList, 0); + STaskChkptInfo *pInfo = taosArrayGet(px->pTaskList, 0); if (pInfo == NULL) { continue; } @@ -1080,12 +1083,11 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { if (p == NULL) { mError("failed to put stream into drop list:0x%" PRIx64, pInfo->streamId); } - continue; } int32_t total = mndGetNumOfStreamTasks(pStream); - int32_t existed = (int32_t)taosArrayGetSize(pList); + int32_t existed = (int32_t)taosArrayGetSize(px->pTaskList); if (total == existed) { mDebug("stream:0x%" PRIx64 " %s all %d tasks send checkpoint-report, start to update checkpoint-info", @@ -1093,14 +1095,11 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { bool conflict = mndStreamTransConflictCheck(pMnode, pStream->uid, MND_STREAM_CHKPT_UPDATE_NAME, false); if (!conflict) { - code = mndCreateStreamChkptInfoUpdateTrans(pMnode, pStream, pList); + code = mndCreateStreamChkptInfoUpdateTrans(pMnode, pStream, px->pTaskList); if (code == TSDB_CODE_SUCCESS || code == TSDB_CODE_ACTION_IN_PROGRESS) { // remove this entry - void* p = taosArrayPush(pDropped, &pInfo->streamId); - if (p == NULL) { - mError("failed to remove stream:0x%" PRIx64, pInfo->streamId); - } else { - mDebug("stream:0x%" PRIx64 " removed", pInfo->streamId); - } + taosArrayClear(px->pTaskList); + px->reportChkpt = pInfo->checkpointId; + mDebug("stream:0x%" PRIx64 " clear checkpoint-report list", pInfo->streamId); } else { mDebug("stream:0x%" PRIx64 " not launch chkpt-meta update trans, due to checkpoint not finished yet", pInfo->streamId); @@ -1135,6 +1134,8 @@ int32_t 
mndScanCheckpointReportInfo(SRpcMsg *pReq) { mDebug("drop %d stream(s) in checkpoint-report list, remain:%d", size, numOfStreams); } + streamMutexUnlock(&execInfo.lock); + taosArrayDestroy(pDropped); return TSDB_CODE_SUCCESS; } @@ -1319,7 +1320,7 @@ int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId) { int32_t code = 0; int32_t numOfStreams = taosHashGetSize(pHash); if (numOfStreams == 0) { - return TSDB_CODE_SUCCESS; + return code; } code = taosHashRemove(pHash, &streamId, sizeof(streamId)); @@ -1332,6 +1333,35 @@ int64_t mndClearConsensusCheckpointId(SHashObj* pHash, int64_t streamId) { return code; } +int64_t mndClearChkptReportInfo(SHashObj* pHash, int64_t streamId) { + int32_t code = 0; + int32_t numOfStreams = taosHashGetSize(pHash); + if (numOfStreams == 0) { + return code; + } + + code = taosHashRemove(pHash, &streamId, sizeof(streamId)); + if (code == 0) { + mDebug("drop stream:0x%" PRIx64 " in chkpt-report list, remain:%d", streamId, numOfStreams); + } else { + mError("failed to remove stream:0x%"PRIx64" in chkpt-report list, remain:%d", streamId, numOfStreams); + } + + return code; +} + +int32_t mndResetChkptReportInfo(SHashObj* pHash, int64_t streamId) { + SChkptReportInfo* pInfo = taosHashGet(pHash, &streamId, sizeof(streamId)); + if (pInfo != NULL) { + taosArrayClear(pInfo->pTaskList); + mDebug("stream:0x%" PRIx64 " checkpoint-report list cleared, prev report checkpointId:%" PRId64, streamId, + pInfo->reportChkpt); + return 0; + } + + return TSDB_CODE_MND_STREAM_NOT_EXIST; +} + static void mndShowStreamStatus(char *dst, SStreamObj *pStream) { int8_t status = atomic_load_8(&pStream->status); if (status == STREAM_STATUS__NORMAL) { diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index ba911fa76d..faca2020c5 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -563,7 +563,7 @@ int32_t tqStreamTaskProcessCheckpointReadyMsg(SStreamMeta* pMeta, SRpcMsg* pMsg) pTask->id.idStr, req.downstreamTaskId, req.downstreamNodeId); } - code = streamProcessCheckpointReadyMsg(pTask, req.checkpointId, req.downstreamTaskId, req.downstreamNodeId); + code = streamProcessCheckpointReadyMsg(pTask, req.checkpointId, req.downstreamNodeId, req.downstreamTaskId); streamMetaReleaseTask(pMeta, pTask); if (code) { return code; diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 6c0f8ec6cb..65e5c475b4 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -361,7 +361,6 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock (void)streamTaskBuildCheckpoint(pTask); // todo: not handle error yet } else { // source & agg tasks need to forward the checkpoint msg downwards stDebug("s-task:%s process checkpoint-trigger block, all %d upstreams sent, forwards to downstream", id, num); - flushStateDataInExecutor(pTask, (SStreamQueueItem*)pBlock); // Put the checkpoint-trigger block into outputQ, to make sure all blocks with less version have been handled by @@ -376,8 +375,8 @@ int32_t streamProcessCheckpointTriggerBlock(SStreamTask* pTask, SStreamDataBlock // only when all downstream tasks are send checkpoint rsp, we can start the checkpoint procedure for the agg task static int32_t processCheckpointReadyHelp(SActiveCheckpointInfo* pInfo, int32_t numOfDownstream, int32_t downstreamNodeId, int64_t streamId, int32_t downstreamTaskId, - const char* id, int32_t* 
pNotReady, int32_t* pTransId) { - bool received = false; + const char* id, int32_t* pNotReady, int32_t* pTransId, bool* alreadyRecv) { + *alreadyRecv = false; int32_t size = taosArrayGetSize(pInfo->pCheckpointReadyRecvList); for (int32_t i = 0; i < size; ++i) { STaskDownstreamReadyInfo* p = taosArrayGet(pInfo->pCheckpointReadyRecvList, i); @@ -386,12 +385,12 @@ static int32_t processCheckpointReadyHelp(SActiveCheckpointInfo* pInfo, int32_t } if (p->downstreamTaskId == downstreamTaskId) { - received = true; + (*alreadyRecv) = true; break; } } - if (received) { + if (*alreadyRecv) { stDebug("s-task:%s already recv checkpoint-ready msg from downstream:0x%x, ignore. %d/%d downstream not ready", id, downstreamTaskId, (int32_t)(numOfDownstream - taosArrayGetSize(pInfo->pCheckpointReadyRecvList)), numOfDownstream); @@ -427,6 +426,7 @@ int32_t streamProcessCheckpointReadyMsg(SStreamTask* pTask, int64_t checkpointId int32_t code = 0; int32_t notReady = 0; int32_t transId = 0; + bool alreadyHandled = false; // 1. not in checkpoint status now SStreamTaskState pStat = streamTaskGetStatus(pTask); @@ -445,12 +445,17 @@ int32_t streamProcessCheckpointReadyMsg(SStreamTask* pTask, int64_t checkpointId streamMutexLock(&pInfo->lock); code = processCheckpointReadyHelp(pInfo, total, downstreamNodeId, pTask->id.streamId, downstreamTaskId, id, ¬Ready, - &transId); + &transId, &alreadyHandled); streamMutexUnlock(&pInfo->lock); - if ((notReady == 0) && (code == 0)) { - stDebug("s-task:%s all downstream tasks have completed build checkpoint, do checkpoint for current task", id); - (void)appendCheckpointIntoInputQ(pTask, STREAM_INPUT__CHECKPOINT, checkpointId, transId, -1); + if (alreadyHandled) { + stDebug("s-task:%s checkpoint-ready msg checkpointId:%" PRId64 " from task:0x%x already handled, not handle again", + id, checkpointId, downstreamTaskId); + } else { + if ((notReady == 0) && (code == 0) && (!alreadyHandled)) { + stDebug("s-task:%s all downstream tasks have completed build checkpoint, do checkpoint for current task", id); + (void)appendCheckpointIntoInputQ(pTask, STREAM_INPUT__CHECKPOINT, checkpointId, transId, -1); + } } return code; From 8472f25a6ea71d985951777b44ae9710a2c9053b Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 00:46:20 +0800 Subject: [PATCH 061/103] fix(stream): send kill checkpoint trans msg before close task. --- source/libs/stream/src/streamMeta.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 321027c293..2285784c97 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1140,6 +1140,16 @@ void streamMetaNotifyClose(SStreamMeta* pMeta) { taosMsleep(100); } + SArray* pTaskList = NULL; + int32_t code = streamMetaSendMsgBeforeCloseTasks(pMeta, &pTaskList); + if (code != TSDB_CODE_SUCCESS) { +// return code; + } + + if (pTaskList != NULL) { + taosArrayDestroy(pTaskList); + } + int64_t el = taosGetTimestampMs() - st; stDebug("vgId:%d all stream tasks are not in timer, continue close, elapsed time:%" PRId64 " ms", pMeta->vgId, el); } From e3a2733feede060e5d594c9be78f1b90219688d3 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 00:47:42 +0800 Subject: [PATCH 062/103] fix(stream): send kill checkpoint trans msg before close task. 
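Follow-up to the previous commit: the pre-close notification in streamMetaNotifyClose() is now issued while holding the stream meta read lock. A rough sketch of the resulting call sequence, using only the functions this patch already touches (error handling and logging trimmed for brevity):

    streamMetaRLock(pMeta);

    SArray* pTaskList = NULL;
    /* ask mnode to kill in-flight checkpoint trans before the tasks are closed */
    int32_t code = streamMetaSendMsgBeforeCloseTasks(pMeta, &pTaskList);

    streamMetaRUnLock(pMeta);

    if (pTaskList != NULL) {
      taosArrayDestroy(pTaskList);
    }
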
--- source/libs/stream/src/streamMeta.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 2285784c97..fbcc338f09 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -1140,12 +1140,16 @@ void streamMetaNotifyClose(SStreamMeta* pMeta) { taosMsleep(100); } + streamMetaRLock(pMeta); + SArray* pTaskList = NULL; int32_t code = streamMetaSendMsgBeforeCloseTasks(pMeta, &pTaskList); if (code != TSDB_CODE_SUCCESS) { // return code; } + streamMetaRUnLock(pMeta); + if (pTaskList != NULL) { taosArrayDestroy(pTaskList); } From 506a72d50f317db5ff3f5295b5c19ae672065504 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 02:02:41 +0800 Subject: [PATCH 063/103] fix(stream): update checkpoint info only it is in ck status. --- source/libs/stream/src/streamCheckpoint.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index 65e5c475b4..c555da9865 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -567,14 +567,13 @@ int32_t streamTaskUpdateTaskCheckpointInfo(SStreamTask* pTask, bool restored, SV ASSERT(pInfo->checkpointId <= pReq->checkpointId && pInfo->checkpointVer <= pReq->checkpointVer && pInfo->processedVer <= pReq->checkpointVer); - pInfo->checkpointId = pReq->checkpointId; - pInfo->checkpointVer = pReq->checkpointVer; - pInfo->checkpointTime = pReq->checkpointTs; - - streamTaskClearCheckInfo(pTask, true); - + // update only it is in checkpoint status. if (pStatus.state == TASK_STATUS__CK) { - // todo handle error + pInfo->checkpointId = pReq->checkpointId; + pInfo->checkpointVer = pReq->checkpointVer; + pInfo->checkpointTime = pReq->checkpointTs; + + streamTaskClearCheckInfo(pTask, true); code = streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_CHECKPOINT_DONE); } else { stDebug("s-task:0x%x vgId:%d not handle checkpoint-done event, status:%s", pReq->taskId, vgId, pStatus.name); From 1920307047dd01992b340b7c744548f64d4245e9 Mon Sep 17 00:00:00 2001 From: gccgdb1234 Date: Fri, 9 Aug 2024 08:04:10 +0800 Subject: [PATCH 064/103] doc: correct error in disaster recovery page --- docs/zh/07-operation/10-disaster.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/07-operation/10-disaster.md b/docs/zh/07-operation/10-disaster.md index 71589cf07e..b274e1373b 100644 --- a/docs/zh/07-operation/10-disaster.md +++ b/docs/zh/07-operation/10-disaster.md @@ -11,7 +11,7 @@ toc_max_heading_level: 4 TDengine 支持 WAL 机制,实现数据的容错能力,保证数据的高可靠。TDengine 接收到应用程序的请求数据包时,会先将请求的原始数据包写入数据库日志文件,等数据成功写入数据库数据文件后,再删除相应的 WAL。这样保证了 TDengine 能够在断电等因素导致的服务重启时,从数据库日志文件中恢复数据,避免数据丢失。涉及的配置参数有如下两个: - wal_level :WAL 级别,1 表示写 WAL,但不执行 fsync ; 2 表示写 WAL,而且执行 fsync。默认值为 1。 -- wal_fsync_period:当 wal_level 设置为 2 时,执行 fsync 的周期;当 wal-level 设置为 0 时,表示每次写入,立即执行 fsync。 +- wal_fsync_period:当 wal_level 设置为 2 时,执行 fsync 的周期;当 wal_fsync_period 设置为 0 时,表示每次写入,立即执行 fsync。 如果要 100% 保证数据不丢失,则需要将 wal_level 设置为 2,wal_fsync_period 设置为 0。这时写入速度将会下降。但如果应用程序侧启动的写数据的线程数达到一定的数量(超过 50),那么写入数据的性能也会很不错,只会比 wal_fsync_period 设置为 3000ms 下降 30% 左右。 From 1c243603e02087e6ca31fc226abf7a3fe519bf0c Mon Sep 17 00:00:00 2001 From: gccgdb1234 Date: Fri, 9 Aug 2024 08:08:28 +0800 Subject: [PATCH 065/103] doc: add more explanation for lossyColumn --- docs/zh/14-reference/01-components/01-taosd.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/docs/zh/14-reference/01-components/01-taosd.md b/docs/zh/14-reference/01-components/01-taosd.md index 0e9c4eb926..b46dc21d40 100644 --- a/docs/zh/14-reference/01-components/01-taosd.md +++ b/docs/zh/14-reference/01-components/01-taosd.md @@ -197,7 +197,7 @@ charset 的有效值是 UTF-8。 | compressMsgSize | 是否对 RPC 消息进行压缩;-1: 所有消息都不压缩; 0: 所有消息都压缩; N (N>0): 只有大于 N 个字节的消息才压缩;缺省值 -1 | | fPrecision | 设置 float 类型浮点数压缩精度 ,取值范围:0.1 ~ 0.00000001 ,默认值 0.00000001 , 小于此值的浮点数尾数部分将被截断 | |dPrecision | 设置 double 类型浮点数压缩精度 , 取值范围:0.1 ~ 0.0000000000000001 , 缺省值 0.0000000000000001 , 小于此值的浮点数尾数部分将被截取 | -|lossyColumn | 对 float 和/或 double 类型启用 TSZ 有损压缩;取值范围: float, double, none;缺省值: none,表示关闭无损压缩 | +|lossyColumn | 对 float 和/或 double 类型启用 TSZ 有损压缩;取值范围: float, double, none;缺省值: none,表示关闭无损压缩。**注意:此参数在 3.3.0.0 及更高版本中不再使用**| |ifAdtFse | 在启用 TSZ 有损压缩时,使用 FSE 算法替换 HUFFMAN 算法, FSE 算法压缩速度更快,但解压稍慢,追求压缩速度可选用此算法; 0: 关闭,1:打开;默认值为 0 | From 30f0759c718290de4b9a3f8b7654c1bef170c2ea Mon Sep 17 00:00:00 2001 From: gccgdb1234 Date: Fri, 9 Aug 2024 08:55:52 +0800 Subject: [PATCH 066/103] doc: resolve some format errors --- docs/zh/07-operation/03-deployment.md | 1678 ++++++++--------- .../zh/14-reference/01-components/01-taosd.md | 2 +- 2 files changed, 840 insertions(+), 840 deletions(-) diff --git a/docs/zh/07-operation/03-deployment.md b/docs/zh/07-operation/03-deployment.md index 829d57eeb2..7cf79636ed 100644 --- a/docs/zh/07-operation/03-deployment.md +++ b/docs/zh/07-operation/03-deployment.md @@ -1,840 +1,840 @@ ---- -sidebar_label: 集群部署 -title: 集群部署 -toc_max_heading_level: 4 ---- - -由于 TDengine 设计之初就采用了分布式架构,具有强大的水平扩展能力,以满足不断增长的数据处理需求,因此 TDengine 支持集群,并将此核心功能开源。用户可以根据实际环境和需求选择 4 种部署方式—手动部署、Docker 部署、Kubernetes 部署和 Helm 部署。 - -## 手动部署 - -### 部署 taosd - -taosd 是 TDengine 集群中最主要的服务组件,本节介绍手动部署 taosd 集群的步骤。 - -#### 1. 清除数据 - -如果搭建集群的物理节点中存在之前的测试数据或者装过其他版本(如 1.x/2.x)的TDengine,请先将其删除,并清空所有数据。 - -#### 2. 检查环境 - -在进行 TDengine 集群部署之前,全面检查所有 dnode 以及应用程序所在物理节点的网络设置至关重要。以下是检查步骤: - -第 1 步,在每个物理节点上执行 hostname -f 命令,以查看并确认所有节点的hostname 是唯一的。对于应用程序驱动所在的节点,这一步骤可以省略。 -第 2 步,在每个物理节点上执行 ping host 命令,其中 host 是其他物理节点的 hostname。这一步骤旨在检测当前节点与其他物理节点之间的网络连通性。如果发现无法 ping 通,请立即检查网络和 DNS 设置。对于 Linux 操作系统,请检查 /etc/hosts 文件;对于 Windows 操作系统,请检查C:\Windows\system32\drivers\etc\hosts 文件。网络不通畅将导致无法组建集群,请务必解决此问题。 -第 3 步,在应用程序运行的物理节点上重复上述网络监测步骤。如果发现网络不通畅,应用程序将无法连接到 taosd 服务。此时,请仔细检查应用程序所在物理节点的DNS 设置或 hosts 文件,确保其配置正确无误。 -第 4 步,检查端口,确保集群中所有主机在端口 6030 上的 TCP 能够互通。 - -通过以上步骤,你可以确保所有节点在网络层面顺利通信,从而为成功部署TDengine 集群奠定坚实基础 - -#### 3. 安装 - -为了确保集群内各物理节点的一致性和稳定性,请在所有物理节点上安装相同版本的 TDengine。 - -#### 4. 修改配置 - -修改 TDengine 的配置文件(所有节点的配置文件都需要修改)。假设准备启动的第 1 个 dnode 的 endpoint 为 h1.taosdata.com:6030,其与集群配置相关参数如下。 - -```shell -# firstEp 是每个 dnode 首次启动后连接的第 1 个 dnode -firstEp h1.taosdata.com:6030 -# 必须配置为本 dnode 的 FQDN,如果本机只有一个 hostname,可注释或删除如下这行代码 -fqdn h1.taosdata.com -# 配置本 dnode 的端口,默认是 6030 -serverPort 6030 -``` - -一定要修改的参数是 f irstEp 和 fqdn。对于每个 dnode,f irstEp 配置应该保持一致,但 fqdn 一定要配置成其所在 dnode 的值。其他参数可不做任何修改,除非你很清楚为什么要修改。 - -对于希望加入集群的 dnode 节点,必须确保下表所列的与 TDengine 集群相关的参数设置完全一致。任何参数的不匹配都可能导致 dnode 节点无法成功加入集群。 - -| 参数名称 | 含义 | -|:---------------:|:----------------------------------------------------------:| -|statusInterval | dnode 向 mnode 报告状态的间隔 | -|timezone | 时区 | -|locale | 系统区位信息及编码格式 | -|charset | 字符集编码 | -|ttlChangeOnWrite | ttl 到期时间是否伴随表的修改操作而改变 | - -#### 5. 
启动 - -按照前述步骤启动第 1 个 dnode,例如 h1.taosdata.com。接着在终端中执行 taos,启动 TDengine 的 CLI 程序 taos,并在其中执行 show dnodes 命令,以查看当前集群中的所有 dnode 信息。 - -```shell -taos> show dnodes; - id | endpoint | vnodes|support_vnodes|status| create_time | note | -=================================================================================== - 1| h1.taosdata.com:6030 | 0| 1024| ready| 2022-07-16 10:50:42.673 | | -``` - -可以看到,刚刚启动的 dnode 节点的 endpoint 为 h1.taosdata.com:6030。这个地址就是新建集群的 first Ep。 - -#### 6. 添加 dnode - -按照前述步骤,在每个物理节点启动 taosd。每个 dnode 都需要在 taos.cfg 文件中将 firstEp 参数配置为新建集群首个节点的 endpoint,在本例中是 h1.taosdata.com:6030。在第 1 个 dnode 所在机器,在终端中运行 taos,打开 TDengine 的 CLI 程序 taos,然后登录TDengine 集群,执行如下 SQL。 - -```shell -create dnode "h2.taosdata.com:6030" -``` - -将新 dnode 的 endpoint 添加进集群的 endpoint 列表。需要为 `fqdn:port` 加上双引号,否则运行时出错。请注意将示例的 h2.taosdata.com:6030 替换为这个新 dnode 的 endpoint。然后执行如下 SQL 查看新节点是否成功加入。若要加入的 dnode 当前处于离线状态,请参考本节后面的 “常见问题”部分进行解决。 - -```shell -show dnodes; -``` - -在日志中,请确认输出的 dnode 的 fqdn 和端口是否与你刚刚尝试添加的 endpoint 一致。如果不一致,请修正为正确的 endpoint。遵循上述步骤,你可以持续地将新的 dnode 逐个加入集群,从而扩展集群规模并提高整体性能。确保在添加新节点时遵循正确的流程,这有助于维持集群的稳定性和可靠性。 - -**Tips** -- 任何已经加入集群的 dnode 都可以作为后续待加入节点的 firstEp。firstEp 参数仅仅在该 dnode 首次加入集群时起作用,加入集群后,该 dnode 会保存最新的 mnode 的 endpoint 列表,后续不再依赖这个参数。之后配置文件中的 firstEp 参数主要用于客户端连接,如果没有为 TDengine 的 CLI 设置参数,则默认连接由 firstEp 指定的节点。 -- 两个没有配置 firstEp 参数的 dnode 在启动后会独立运行。这时无法将其中一个dnode 加入另外一个 dnode,形成集群。 -- TDengine 不允许将两个独立的集群合并成新的集群。 - -#### 7. 添加 mnode - -在创建 TDengine 集群时,首个 dnode 将自动成为集群的 mnode,负责集群的管理和协调工作。为了实现 mnode 的高可用性,后续添加的 dnode 需要手动创建 mnode。请注意,一个集群最多允许创建 3 个 mnode,且每个 dnode 上只能创建一个 mnode。当集群中的 dnode 数量达到或超过 3 个时,你可以为现有集群创建 mnode。在第 1个 dnode 中,首先通过 TDengine 的 CLI 程序 taos 登录 TDengine,然后执行如下 SQL。 - -```shell -create mnode on dnode -``` - -请注意将上面示例中的 dnodeId 替换为刚创建 dnode 的序号(可以通过执行 `show dnodes` 命令获得)。最后执行如下 `show mnodes`,查看新创建的 mnode 是否成功加入集群。 - - -**Tips** - -在搭建 TDengine 集群的过程中,如果在执行 create dnode 命令以添加新节点后,新节点始终显示为离线状态,请按照以下步骤进行排查。 - -- 第 1 步,检查新节点上的 taosd 服务是否已经正常启动。你可以通过查看日志文件或使用 ps 命令来确认。 -- 第 2 步,如果 taosd 服务已启动,接下来请检查新节点的网络连接是否畅通,并确认防火墙是否已关闭。网络不通或防火墙设置可能会阻止节点与集群的其他节点通信。 -- 第 3 步,使用 taos -h fqdn 命令尝试连接到新节点,然后执行 show dnodes 命令。这将显示新节点作为独立集群的运行状态。如果显示的列表与主节点上显示的不一致,说明新节点可能已自行组成一个单节点集群。要解决这个问题,请按照以下步骤操作。首先,停止新节点上的 taosd 服务。其次,清空新节点上 taos.cfg 配置文件中指定的 dataDir 目录下的所有文件。这将删除与该节点相关的所有数据和配置信息。最后,重新启动新节点上的 taosd 服务。这将使新节点恢复到初始状态,并准备好重新加入主集群。 - -### 部署 taosAdapter - -本节讲述如何部署 taosAdapter,taosAdapter 为 TDengine 集群提供 RESTful 和 WebSocket 接入能力,因而在集群中扮演着很重要的角色。 - -1. 安装 - -TDengine Enterprise 安装完成后,即可使用 taosAdapter。如果想在不同的服务器上分别部署 taosAdapter,需要在这些服务器上都安装 TDengine Enterprise。 - -2. 单一实例部署 - -部署 taosAdapter 的单一实例非常简单,具体命令和配置参数请参考手册中 taosAdapter 部分。 - -3. 
多实例部署 - -部署 taosAdapter 的多个实例的主要目的如下: -- 提升集群的吞吐量,避免 taosAdapter 成为系统瓶颈。 -- 提升集群的健壮性和高可用能力,当有一个实例因某种故障而不再提供服务时,可以将进入业务系统的请求自动路由到其他实例。 - -在部署 taosAdapter 的多个实例时,需要解决负载均衡问题,以避免某个节点过载而其他节点闲置。在部署过程中,需要分别部署多个单一实例,每个实例的部署步骤与部署单一实例完全相同。接下来关键的部分是配置 Nginx。以下是一个经过验证的较佳实践配置,你只须将其中的 endpoint 替换为实际环境中的正确地址即可。关于各参数的含义,请参考 Nginx 的官方文档。 - -```json -user root; -worker_processes auto; -error_log /var/log/nginx_error.log; - - -events { - use epoll; - worker_connections 1024; -} - -http { - - access_log off; - - map $http_upgrade $connection_upgrade { - default upgrade; - '' close; - } - - server { - listen 6041; - location ~* { - proxy_pass http://dbserver; - proxy_read_timeout 600s; - proxy_send_timeout 600s; - proxy_connect_timeout 600s; - proxy_next_upstream error http_502 non_idempotent; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $http_connection; - } - } - server { - listen 6043; - location ~* { - proxy_pass http://keeper; - proxy_read_timeout 60s; - proxy_next_upstream error http_502 http_500 non_idempotent; - } - } - - server { - listen 6060; - location ~* { - proxy_pass http://explorer; - proxy_read_timeout 60s; - proxy_next_upstream error http_502 http_500 non_idempotent; - } - } - upstream dbserver { - least_conn; - server 172.16.214.201:6041 max_fails=0; - server 172.16.214.202:6041 max_fails=0; - server 172.16.214.203:6041 max_fails=0; - } - upstream keeper { - ip_hash; - server 172.16.214.201:6043 ; - server 172.16.214.202:6043 ; - server 172.16.214.203:6043 ; - } - upstream explorer{ - ip_hash; - server 172.16.214.201:6060 ; - server 172.16.214.202:6060 ; - server 172.16.214.203:6060 ; - } -} -``` - -### 部署 taosKeeper - -如果要想使用 TDegnine 的监控功能,taosKeeper 是一个必要的组件,关于监控请参考[TDinsight](../../reference/components/tdinsight),关于部署 taosKeeper 的细节请参考[taosKeeper参考手册](../../reference/components/taoskeeper)。 - -### 部署 taosX - -如果想使用 TDengine 的数据接入能力,需要部署 taosX 服务,关于它的详细说明和部署请参考[taosX 参考手册](../../reference/components/taosx)。 - -### 部署 taosX-Agent - -有些数据源如 Pi, OPC 等,因为网络条件和数据源访问的限制,taosX 无法直接访问数据源,这种情况下需要部署一个代理服务 taosX-Agent,关于它的详细说明和部署请参考[taosX-Agent 参考手册](../../reference/components/taosx-agent)。 - -### 部署 taos-Explorer - -TDengine 提供了可视化管理 TDengine 集群的能力,要想使用图形化界面需要部署 taos-Explorer 服务,关于它的详细说明和部署请参考[taos-Explorer 参考手册](../../reference/components/explorer) - - -## Docker 部署 - -本节将介绍如何在 Docker 容器中启动 TDengine 服务并对其进行访问。你可以在 docker run 命令行或者 docker-compose 文件中使用环境变量来控制容器中服务的行为。 - -### 启动 TDengine - -TDengine 镜像启动时默认激活 HTTP 服务,使用下列命令便可创建一个带有 HTTP 服务的容器化 TDengine 环境。 -```shell -docker run -d --name tdengine \ --v ~/data/taos/dnode/data:/var/lib/taos \ --v ~/data/taos/dnode/log:/var/log/taos \ --p 6041:6041 tdengine/tdengine -``` - -详细的参数说明如下。 -- /var/lib/taos:TDengine 默认数据文件目录,可通过配置文件修改位置。 -- /var/log/taos:TDengine 默认日志文件目录,可通过配置文件修改位置。 - -以上命令启动了一个名为 tdengine 的容器,并把其中的 HTTP 服务的端口 6041 映射到主机端口 6041。如下命令可以验证该容器中提供的 HTTP 服务是否可用。 - -```shell -curl -u root:taosdata -d "show databases" localhost:6041/rest/sql -``` - -运行如下命令可在容器中访问 TDengine。 -```shell -$ docker exec -it tdengine taos - -taos> show databases; - name | -================================= - information_schema | - performance_schema | -Query OK, 2 rows in database (0.033802s) -``` - -在容器中,TDengine CLI 或者各种连接器(例如 JDBC-JNI)与服务器通过容器的 hostname 建立连接。从容器外访问容器内的 TDengine 比较复杂,通过 RESTful/WebSocket 连接方式是最简单的方法。 - -### 在 host 网络模式下启动 TDengine - -运行以下命令可以在 host 网络模式下启动 TDengine,这样可以使用主机的 FQDN 建立连接,而不是使用容器的 hostname。 -```shell -docker run -d --name tdengine --network host tdengine/tdengine -``` - -这种方式与在主机上使用 
systemctl 命令启动 TDengine 的效果相同。在主机上已安装 TDengine 客户端的情况下,可以直接使用下面的命令访问 TDengine 服务。 -```shell -$ taos - -taos> show dnodes; - id | endpoint | vnodes | support_vnodes | status | create_time | note | -================================================================================================================================================= - 1 | vm98:6030 | 0 | 32 | ready | 2022-08-19 14:50:05.337 | | -Query OK, 1 rows in database (0.010654s) -``` - -### 以指定的 hostname 和 port 启动 TDengine - -使用如下命令可以利用 TAOS_FQDN 环境变量或者 taos.cfg 中的 fqdn 配置项使TDengine 在指定的 hostname 上建立连接。这种方式为部署 TDengine 提供了更大的灵活性。 - -```shell -docker run -d \ - --name tdengine \ - -e TAOS_FQDN=tdengine \ - -p 6030:6030 \ - -p 6041-6049:6041-6049 \ - -p 6041-6049:6041-6049/udp \ - tdengine/tdengine -``` - -首先,上面的命令在容器中启动一个 TDengine 服务,其所监听的 hostname 为tdengine,并将容器的端口 6030 映射到主机的端口 6030,将容器的端口段 6041~6049 映射到主机的端口段 6041~6049。如果主机上该端口段已经被占用,可以修改上述命令以指定一个主机上空闲的端口段。 - -其次,要确保 tdengine 这个 hostname 在 /etc/hosts 中可解析。通过如下命令可将正确的配置信息保存到 hosts 文件中。 -```shell -echo 127.0.0.1 tdengine |sudo tee -a /etc/hosts -``` - -最后,可以通过 TDengine CLI 以 tdengine 为服务器地址访问 TDengine 服务,命令如下。 -```shell -taos -h tdengine -P 6030 -``` - -如果 TAOS_FQDN 被设置为与所在主机名相同,则效果与“在 host 网络模式下启动TDengine”相同。 - -## Kubernetes 部署 - -作为面向云原生架构设计的时序数据库,TDengine 本身就支持 Kubernetes 部署。这里介绍如何使用 YAML 文件从头一步一步创建一个可用于生产使用的高可用 TDengine 集群,并重点介绍 Kubernetes 环境下 TDengine 的常用操作。本小节要求读者对 Kubernetes 有一定的了解,可以熟练运行常见的 kubectl 命令,了解 statefulset、service、pvc 等概念,对这些概念不熟悉的读者,可以先参考 Kubernetes 的官网进行学习。 -为了满足高可用的需求,集群需要满足如下要求: -- 3 个及以上 dnode :TDengine 的同一个 vgroup 中的多个 vnode ,不允许同时分布在一个 dnode ,所以如果创建 3 副本的数据库,则 dnode 数大于等于 3 -- 3 个 mnode :mnode 负责整个集群的管理工作,TDengine 默认是一个 mnode。如果这个 mnode 所在的 dnode 掉线,则整个集群不可用。 -- 数据库的 3 副本:TDengine 的副本配置是数据库级别,所以数据库 3 副本可满足在 3 个 dnode 的集群中,任意一个 dnode 下线,都不影响集群的正常使用。如果下线 dnode 个数为 2 时,此时集群不可用,因为 RAFT 无法完成选举。(企业版:在灾难恢复场景,任一节点数据文件损坏,都可以通过重新拉起 dnode 进行恢复) - -### 前置条件 - -要使用 Kubernetes 部署管理 TDengine 集群,需要做好如下准备工作。 -- 本文适用 Kubernetes v1.19 以上版本 -- 本文使用 kubectl 工具进行安装部署,请提前安装好相应软件 -- Kubernetes 已经安装部署并能正常访问使用或更新必要的容器仓库或其他服务 - -### 配置 Service 服务 - -创建一个 Service 配置文件:taosd-service.yaml,服务名称 metadata.name (此处为 "taosd") 将在下一步中使用到。首先添加 TDengine 所用到的端口,然后在选择器设置确定的标签 app (此处为 “tdengine”)。 - -```yaml ---- -apiVersion: v1 -kind: Service -metadata: - name: "taosd" - labels: - app: "tdengine" -spec: - ports: - - name: tcp6030 - protocol: "TCP" - port: 6030 - - name: tcp6041 - protocol: "TCP" - port: 6041 - selector: - app: "tdengine" -``` - -### 有状态服务 StatefulSet - -根据 Kubernetes 对各类部署的说明,我们将使用 StatefulSet 作为 TDengine 的部署资源类型。 创建文件 tdengine.yaml,其中 replicas 定义集群节点的数量为 3。节点时区为中国(Asia/Shanghai),每个节点分配 5G 标准(standard)存储,你也可以根据实际情况进行相应修改。 - -请特别注意 startupProbe 的配置,在 dnode 的 Pod 掉线一段时间后,再重新启动,这个时候新上线的 dnode 会短暂不可用。如果 startupProbe 配置过小,Kubernetes 会认为该 Pod 处于不正常的状态,并尝试重启该 Pod,该 dnode 的 Pod 会频繁重启,始终无法恢复到正常状态。 - -```yaml ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: "tdengine" - labels: - app: "tdengine" -spec: - serviceName: "taosd" - replicas: 3 - updateStrategy: - type: RollingUpdate - selector: - matchLabels: - app: "tdengine" - template: - metadata: - name: "tdengine" - labels: - app: "tdengine" - spec: - containers: - - name: "tdengine" - image: "tdengine/tdengine:3.2.3.0" - imagePullPolicy: "IfNotPresent" - ports: - - name: tcp6030 - protocol: "TCP" - containerPort: 6030 - - name: tcp6041 - protocol: "TCP" - containerPort: 6041 - env: - # POD_NAME for FQDN config - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - # SERVICE_NAME and NAMESPACE for 
fqdn resolve - - name: SERVICE_NAME - value: "taosd" - - name: STS_NAME - value: "tdengine" - - name: STS_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - # TZ for timezone settings, we recommend to always set it. - - name: TZ - value: "Asia/Shanghai" - # Environment variables with prefix TAOS_ will be parsed and converted into corresponding parameter in taos.cfg. For example, serverPort in taos.cfg should be configured by TAOS_SERVER_PORT when using K8S to deploy - - name: TAOS_SERVER_PORT - value: "6030" - # Must set if you want a cluster. - - name: TAOS_FIRST_EP - value: "$(STS_NAME)-0.$(SERVICE_NAME).$(STS_NAMESPACE).svc.cluster.local:$(TAOS_SERVER_PORT)" - # TAOS_FQND should always be set in k8s env. - - name: TAOS_FQDN - value: "$(POD_NAME).$(SERVICE_NAME).$(STS_NAMESPACE).svc.cluster.local" - volumeMounts: - - name: taosdata - mountPath: /var/lib/taos - startupProbe: - exec: - command: - - taos-check - failureThreshold: 360 - periodSeconds: 10 - readinessProbe: - exec: - command: - - taos-check - initialDelaySeconds: 5 - timeoutSeconds: 5000 - livenessProbe: - exec: - command: - - taos-check - initialDelaySeconds: 15 - periodSeconds: 20 - volumeClaimTemplates: - - metadata: - name: taosdata - spec: - accessModes: - - "ReadWriteOnce" - storageClassName: "standard" - resources: - requests: - storage: "5Gi" -``` - -### 使用 kubectl 命令部署 TDengine 集群 - -首先创建对应的 namespace dengine-test,以及 pvc,并保证 storageClassName 是 standard 的剩余空间足够。然后顺序执行以下命令: -```shell -kubectl apply -f taosd-service.yaml -n tdengine-test -``` - -上面的配置将生成一个三节点的 TDengine 集群,dnode 为自动配置,可以使用 show dnodes 命令查看当前集群的节点: -```shell -kubectl exec -it tdengine-0 -n tdengine-test -- taos -s "show dnodes" -kubectl exec -it tdengine-1 -n tdengine-test -- taos -s "show dnodes" -kubectl exec -it tdengine-2 -n tdengine-test -- taos -s "show dnodes" -``` - -输出如下: -```shell -taos show dnodes - id | endpoint | vnodes | support_vnodes | status | create_time | reboot_time | note | active_code | c_active_code | -============================================================================================================================================================================================================================================= - 1 | tdengine-0.ta... | 0 | 16 | ready | 2023-07-19 17:54:18.552 | 2023-07-19 17:54:18.469 | | | | - 2 | tdengine-1.ta... | 0 | 16 | ready | 2023-07-19 17:54:37.828 | 2023-07-19 17:54:38.698 | | | | - 3 | tdengine-2.ta... 
| 0 | 16 | ready | 2023-07-19 17:55:01.141 | 2023-07-19 17:55:02.039 | | | | -Query OK, 3 row(s) in set (0.001853s) -``` - -查看当前 mnode -```shell -kubectl exec -it tdengine-1 -n tdengine-test -- taos -s "show mnodes\G" -taos> show mnodes\G -*************************** 1.row *************************** - id: 1 - endpoint: tdengine-0.taosd.tdengine-test.svc.cluster.local:6030 - role: leader - status: ready -create_time: 2023-07-19 17:54:18.559 -reboot_time: 2023-07-19 17:54:19.520 -Query OK, 1 row(s) in set (0.001282s) -``` - -创建 mnode -```shell -kubectl exec -it tdengine-0 -n tdengine-test -- taos -s "create mnode on dnode 2" -kubectl exec -it tdengine-0 -n tdengine-test -- taos -s "create mnode on dnode 3" -``` - -查看 mnode -```shell -kubectl exec -it tdengine-1 -n tdengine-test -- taos -s "show mnodes\G" - -taos> show mnodes\G -*************************** 1.row *************************** - id: 1 - endpoint: tdengine-0.taosd.tdengine-test.svc.cluster.local:6030 - role: leader - status: ready -create_time: 2023-07-19 17:54:18.559 -reboot_time: 2023-07-20 09:19:36.060 -*************************** 2.row *************************** - id: 2 - endpoint: tdengine-1.taosd.tdengine-test.svc.cluster.local:6030 - role: follower - status: ready -create_time: 2023-07-20 09:22:05.600 -reboot_time: 2023-07-20 09:22:12.838 -*************************** 3.row *************************** - id: 3 - endpoint: tdengine-2.taosd.tdengine-test.svc.cluster.local:6030 - role: follower - status: ready -create_time: 2023-07-20 09:22:20.042 -reboot_time: 2023-07-20 09:22:23.271 -Query OK, 3 row(s) in set (0.003108s) -``` - -### 端口转发 - -利用 kubectl 端口转发功能可以使应用可以访问 Kubernetes 环境运行的 TDengine 集群。 - -```shell -kubectl port-forward -n tdengine-test tdengine-0 6041:6041 & -``` - -使用 curl 命令验证 TDengine REST API 使用的 6041 接口。 -```shell -curl -u root:taosdata -d "show databases" 127.0.0.1:6041/rest/sql -{"code":0,"column_meta":[["name","VARCHAR",64]],"data":[["information_schema"],["performance_schema"],["test"],["test1"]],"rows":4} -``` - -### 集群扩容 - -TDengine 支持集群扩容: -```shell -kubectl scale statefulsets tdengine -n tdengine-test --replicas=4 -``` - -上面命令行中参数 `--replica=4` 表示要将 TDengine 集群扩容到 4 个节点,执行后首先检查 POD 的状态: -```shell -kubectl get pod -l app=tdengine -n tdengine-test -o wide -``` - -输出如下: -```text -NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES -tdengine-0 1/1 Running 4 (6h26m ago) 6h53m 10.244.2.75 node86 -tdengine-1 1/1 Running 1 (6h39m ago) 6h53m 10.244.0.59 node84 -tdengine-2 1/1 Running 0 5h16m 10.244.1.224 node85 -tdengine-3 1/1 Running 0 3m24s 10.244.2.76 node86 -``` - -此时 Pod 的状态仍然是 Running,TDengine 集群中的 dnode 状态要等 Pod 状态为 ready 之后才能看到: -```shell -kubectl exec -it tdengine-3 -n tdengine-test -- taos -s "show dnodes" -``` - -扩容后的四节点 TDengine 集群的 dnode 列表: -```text -taos> show dnodes - id | endpoint | vnodes | support_vnodes | status | create_time | reboot_time | note | active_code | c_active_code | -============================================================================================================================================================================================================================================= - 1 | tdengine-0.ta... | 10 | 16 | ready | 2023-07-19 17:54:18.552 | 2023-07-20 09:39:04.297 | | | | - 2 | tdengine-1.ta... | 10 | 16 | ready | 2023-07-19 17:54:37.828 | 2023-07-20 09:28:24.240 | | | | - 3 | tdengine-2.ta... | 10 | 16 | ready | 2023-07-19 17:55:01.141 | 2023-07-20 10:48:43.445 | | | | - 4 | tdengine-3.ta... 
| 0 | 16 | ready | 2023-07-20 16:01:44.007 | 2023-07-20 16:01:44.889 | | | | -Query OK, 4 row(s) in set (0.003628s) -``` - -### 清理集群 - -**Warning** -删除 pvc 时需要注意下 pv persistentVolumeReclaimPolicy 策略,建议改为 Delete,这样在删除 pvc 时才会自动清理 pv,同时会清理底层的 csi 存储资源,如果没有配置删除 pvc 自动清理 pv 的策略,再删除 pvc 后,在手动清理 pv 时,pv 对应的 csi 存储资源可能不会被释放。 - -完整移除 TDengine 集群,需要分别清理 statefulset、svc、pvc,最后删除命名空间。 - -```shell -kubectl delete statefulset -l app=tdengine -n tdengine-test -kubectl delete svc -l app=tdengine -n tdengine-test -kubectl delete pvc -l app=tdengine -n tdengine-test -kubectl delete namespace tdengine-test -``` - -### 集群灾备能力 - -对于在 Kubernetes 环境下 TDengine 的高可用和高可靠来说,对于硬件损坏、灾难恢复,分为两个层面来讲: -- 底层的分布式块存储具备的灾难恢复能力,块存储的多副本,当下流行的分布式块存储如 Ceph,就具备多副本能力,将存储副本扩展到不同的机架、机柜、机房、数据中心(或者直接使用公有云厂商提供的块存储服务) -- TDengine 的灾难恢复,在 TDengine Enterprise 中,本身具备了当一个 dnode 永久下线(物理机磁盘损坏,数据分拣丢失)后,重新拉起一个空白的 dnode 来恢复原 dnode 的工作。 - -## 使用 Helm 部署 TDengine 集群 - -Helm 是 Kubernetes 的包管理器。 -上一节使用 Kubernetes 部署 TDengine 集群的操作已经足够简单,但 Helm 可以提供更强大的能力。 - -### 安装 Helm - -```shell -curl -fsSL -o get_helm.sh \ - https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 -chmod +x get_helm.sh -./get_helm.sh -``` - -Helm 会使用 kubectl 和 kubeconfig 的配置来操作 Kubernetes,可以参考 Rancher 安装 Kubernetes 的配置来进行设置。 - -### 安装 TDengine Chart - -TDengine Chart 尚未发布到 Helm 仓库,当前可以从 GitHub 直接下载: -```shell -wget https://github.com/taosdata/TDengine-Operator/raw/3.0/helm/tdengine-3.0.2.tgz -``` - -获取当前 Kubernetes 的存储类: -```shell -kubectl get storageclass -``` - -在 minikube 默认为 standard。之后,使用 helm 命令安装: -```shell -helm install tdengine tdengine-3.0.2.tgz \ - --set storage.className= \ - --set image.tag=3.2.3.0 - -``` - -在 minikube 环境下,可以设置一个较小的容量避免超出磁盘可用空间: -```shell -helm install tdengine tdengine-3.0.2.tgz \ - --set storage.className=standard \ - --set storage.dataSize=2Gi \ - --set storage.logSize=10Mi \ - --set image.tag=3.2.3.0 -``` - -部署成功后,TDengine Chart 将会输出操作 TDengine 的说明: -```shell -export POD_NAME=$(kubectl get pods --namespace default \ - -l "app.kubernetes.io/name=tdengine,app.kubernetes.io/instance=tdengine" \ - -o jsonpath="{.items[0].metadata.name}") -kubectl --namespace default exec $POD_NAME -- taos -s "show dnodes; show mnodes" -kubectl --namespace default exec -it $POD_NAME -- taos -``` - -可以创建一个表进行测试: -```shell -kubectl --namespace default exec $POD_NAME -- \ - taos -s "create database test; - use test; - create table t1 (ts timestamp, n int); - insert into t1 values(now, 1)(now + 1s, 2); - select * from t1;" -``` - -### 配置 values - -TDengine 支持 `values.yaml` 自定义。 -通过 helm show values 可以获取 TDengine Chart 支持的全部 values 列表: -```shell -helm show values tdengine-3.0.2.tgz -``` - -你可以将结果保存为 values.yaml,之后可以修改其中的各项参数,如 replica 数量,存储类名称,容量大小,TDengine 配置等,然后使用如下命令安装 TDengine 集群: -```shell -helm install tdengine tdengine-3.0.2.tgz -f values.yaml -``` - -全部参数如下: -```yaml -# Default values for tdengine. -# This is a YAML-formatted file. -# Declare variables to be passed into helm templates. - -replicaCount: 1 - -image: - prefix: tdengine/tdengine - #pullPolicy: Always - # Overrides the image tag whose default is the chart appVersion. -# tag: "3.0.2.0" - -service: - # ClusterIP is the default service type, use NodeIP only if you know what you are doing. 
- type: ClusterIP - ports: - # TCP range required - tcp: [6030, 6041, 6042, 6043, 6044, 6046, 6047, 6048, 6049, 6060] - # UDP range - udp: [6044, 6045] - - -# Set timezone here, not in taoscfg -timezone: "Asia/Shanghai" - -resources: - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - -storage: - # Set storageClassName for pvc. K8s use default storage class if not set. - # - className: "" - dataSize: "100Gi" - logSize: "10Gi" - -nodeSelectors: - taosd: - # node selectors - -clusterDomainSuffix: "" -# Config settings in taos.cfg file. -# -# The helm/k8s support will use environment variables for taos.cfg, -# converting an upper-snake-cased variable like `TAOS_DEBUG_FLAG`, -# to a camelCase taos config variable `debugFlag`. -# -# See the variable list at https://www.taosdata.com/cn/documentation/administrator . -# -# Note: -# 1. firstEp/secondEp: should not be set here, it's auto generated at scale-up. -# 2. serverPort: should not be set, we'll use the default 6030 in many places. -# 3. fqdn: will be auto generated in kubernetes, user should not care about it. -# 4. role: currently role is not supported - every node is able to be mnode and vnode. -# -# Btw, keep quotes "" around the value like below, even the value will be number or not. -taoscfg: - # Starts as cluster or not, must be 0 or 1. - # 0: all pods will start as a separate TDengine server - # 1: pods will start as TDengine server cluster. 
[default] - CLUSTER: "1" - - # number of replications, for cluster only - TAOS_REPLICA: "1" - - - # TAOS_NUM_OF_RPC_THREADS: number of threads for RPC - #TAOS_NUM_OF_RPC_THREADS: "2" - - # - # TAOS_NUM_OF_COMMIT_THREADS: number of threads to commit cache data - #TAOS_NUM_OF_COMMIT_THREADS: "4" - - # enable/disable installation / usage report - #TAOS_TELEMETRY_REPORTING: "1" - - # time interval of system monitor, seconds - #TAOS_MONITOR_INTERVAL: "30" - - # time interval of dnode status reporting to mnode, seconds, for cluster only - #TAOS_STATUS_INTERVAL: "1" - - # time interval of heart beat from shell to dnode, seconds - #TAOS_SHELL_ACTIVITY_TIMER: "3" - - # minimum sliding window time, milli-second - #TAOS_MIN_SLIDING_TIME: "10" - - # minimum time window, milli-second - #TAOS_MIN_INTERVAL_TIME: "1" - - # the compressed rpc message, option: - # -1 (no compression) - # 0 (all message compressed), - # > 0 (rpc message body which larger than this value will be compressed) - #TAOS_COMPRESS_MSG_SIZE: "-1" - - # max number of connections allowed in dnode - #TAOS_MAX_SHELL_CONNS: "50000" - - # stop writing logs when the disk size of the log folder is less than this value - #TAOS_MINIMAL_LOG_DIR_G_B: "0.1" - - # stop writing temporary files when the disk size of the tmp folder is less than this value - #TAOS_MINIMAL_TMP_DIR_G_B: "0.1" - - # if disk free space is less than this value, taosd service exit directly within startup process - #TAOS_MINIMAL_DATA_DIR_G_B: "0.1" - - # One mnode is equal to the number of vnode consumed - #TAOS_MNODE_EQUAL_VNODE_NUM: "4" - - # enbale/disable http service - #TAOS_HTTP: "1" - - # enable/disable system monitor - #TAOS_MONITOR: "1" - - # enable/disable async log - #TAOS_ASYNC_LOG: "1" - - # - # time of keeping log files, days - #TAOS_LOG_KEEP_DAYS: "0" - - # The following parameters are used for debug purpose only. - # debugFlag 8 bits mask: FILE-SCREEN-UNUSED-HeartBeat-DUMP-TRACE_WARN-ERROR - # 131: output warning and error - # 135: output debug, warning and error - # 143: output trace, debug, warning and error to log - # 199: output debug, warning and error to both screen and file - # 207: output trace, debug, warning and error to both screen and file - # - # debug flag for all log type, take effect when non-zero value\ - #TAOS_DEBUG_FLAG: "143" - - # generate core file when service crash - #TAOS_ENABLE_CORE_FILE: "1" -``` - -### 扩容 - -关于扩容可参考上一节的说明,有一些额外的操作需要从 helm 的部署中获取。 -首先,从部署中获取 StatefulSet 的名称。 -```shell -export STS_NAME=$(kubectl get statefulset \ - -l "app.kubernetes.io/name=tdengine" \ - -o jsonpath="{.items[0].metadata.name}") -``` - -扩容操作极其简单,增加 replica 即可。以下命令将 TDengine 扩充到三节点: -```shell -kubectl scale --replicas 3 statefulset/$STS_NAME -``` - -使用命令 `show dnodes` 和 `show mnodes` 检查是否扩容成功。 - -### 清理集群 - -Helm 管理下,清理操作也变得简单: - -```shell -helm uninstall tdengine -``` - +--- +sidebar_label: 集群部署 +title: 集群部署 +toc_max_heading_level: 4 +--- + +由于 TDengine 设计之初就采用了分布式架构,具有强大的水平扩展能力,以满足不断增长的数据处理需求,因此 TDengine 支持集群,并将此核心功能开源。用户可以根据实际环境和需求选择 4 种部署方式—手动部署、Docker 部署、Kubernetes 部署和 Helm 部署。 + +## 手动部署 + +### 部署 taosd + +taosd 是 TDengine 集群中最主要的服务组件,本节介绍手动部署 taosd 集群的步骤。 + +#### 1. 清除数据 + +如果搭建集群的物理节点中存在之前的测试数据或者装过其他版本(如 1.x/2.x)的TDengine,请先将其删除,并清空所有数据。 + +#### 2. 
检查环境 + +在进行 TDengine 集群部署之前,全面检查所有 dnode 以及应用程序所在物理节点的网络设置至关重要。以下是检查步骤: + +- 第 1 步,在每个物理节点上执行 hostname -f 命令,以查看并确认所有节点的hostname 是唯一的。对于应用程序驱动所在的节点,这一步骤可以省略。 +- 第 2 步,在每个物理节点上执行 ping host 命令,其中 host 是其他物理节点的 hostname。这一步骤旨在检测当前节点与其他物理节点之间的网络连通性。如果发现无法 ping 通,请立即检查网络和 DNS 设置。对于 Linux 操作系统,请检查 /etc/hosts 文件;对于 Windows 操作系统,请检查C:\Windows\system32\drivers\etc\hosts 文件。网络不通畅将导致无法组建集群,请务必解决此问题。 +- 第 3 步,在应用程序运行的物理节点上重复上述网络监测步骤。如果发现网络不通畅,应用程序将无法连接到 taosd 服务。此时,请仔细检查应用程序所在物理节点的DNS 设置或 hosts 文件,确保其配置正确无误。 +- 第 4 步,检查端口,确保集群中所有主机在端口 6030 上的 TCP 能够互通。 + +通过以上步骤,你可以确保所有节点在网络层面顺利通信,从而为成功部署TDengine 集群奠定坚实基础 + +#### 3. 安装 + +为了确保集群内各物理节点的一致性和稳定性,请在所有物理节点上安装相同版本的 TDengine。 + +#### 4. 修改配置 + +修改 TDengine 的配置文件(所有节点的配置文件都需要修改)。假设准备启动的第 1 个 dnode 的 endpoint 为 h1.taosdata.com:6030,其与集群配置相关参数如下。 + +```shell +# firstEp 是每个 dnode 首次启动后连接的第 1 个 dnode +firstEp h1.taosdata.com:6030 +# 必须配置为本 dnode 的 FQDN,如果本机只有一个 hostname,可注释或删除如下这行代码 +fqdn h1.taosdata.com +# 配置本 dnode 的端口,默认是 6030 +serverPort 6030 +``` + +一定要修改的参数是 f irstEp 和 fqdn。对于每个 dnode,f irstEp 配置应该保持一致,但 fqdn 一定要配置成其所在 dnode 的值。其他参数可不做任何修改,除非你很清楚为什么要修改。 + +对于希望加入集群的 dnode 节点,必须确保下表所列的与 TDengine 集群相关的参数设置完全一致。任何参数的不匹配都可能导致 dnode 节点无法成功加入集群。 + +| 参数名称 | 含义 | +|:---------------:|:----------------------------------------------------------:| +|statusInterval | dnode 向 mnode 报告状态的间隔 | +|timezone | 时区 | +|locale | 系统区位信息及编码格式 | +|charset | 字符集编码 | +|ttlChangeOnWrite | ttl 到期时间是否伴随表的修改操作而改变 | + +#### 5. 启动 + +按照前述步骤启动第 1 个 dnode,例如 h1.taosdata.com。接着在终端中执行 taos,启动 TDengine 的 CLI 程序 taos,并在其中执行 show dnodes 命令,以查看当前集群中的所有 dnode 信息。 + +```shell +taos> show dnodes; + id | endpoint | vnodes|support_vnodes|status| create_time | note | +=================================================================================== + 1| h1.taosdata.com:6030 | 0| 1024| ready| 2022-07-16 10:50:42.673 | | +``` + +可以看到,刚刚启动的 dnode 节点的 endpoint 为 h1.taosdata.com:6030。这个地址就是新建集群的 first Ep。 + +#### 6. 添加 dnode + +按照前述步骤,在每个物理节点启动 taosd。每个 dnode 都需要在 taos.cfg 文件中将 firstEp 参数配置为新建集群首个节点的 endpoint,在本例中是 h1.taosdata.com:6030。在第 1 个 dnode 所在机器,在终端中运行 taos,打开 TDengine 的 CLI 程序 taos,然后登录TDengine 集群,执行如下 SQL。 + +```shell +create dnode "h2.taosdata.com:6030" +``` + +将新 dnode 的 endpoint 添加进集群的 endpoint 列表。需要为 `fqdn:port` 加上双引号,否则运行时出错。请注意将示例的 h2.taosdata.com:6030 替换为这个新 dnode 的 endpoint。然后执行如下 SQL 查看新节点是否成功加入。若要加入的 dnode 当前处于离线状态,请参考本节后面的 “常见问题”部分进行解决。 + +```shell +show dnodes; +``` + +在日志中,请确认输出的 dnode 的 fqdn 和端口是否与你刚刚尝试添加的 endpoint 一致。如果不一致,请修正为正确的 endpoint。遵循上述步骤,你可以持续地将新的 dnode 逐个加入集群,从而扩展集群规模并提高整体性能。确保在添加新节点时遵循正确的流程,这有助于维持集群的稳定性和可靠性。 + +**Tips** +- 任何已经加入集群的 dnode 都可以作为后续待加入节点的 firstEp。firstEp 参数仅仅在该 dnode 首次加入集群时起作用,加入集群后,该 dnode 会保存最新的 mnode 的 endpoint 列表,后续不再依赖这个参数。之后配置文件中的 firstEp 参数主要用于客户端连接,如果没有为 TDengine 的 CLI 设置参数,则默认连接由 firstEp 指定的节点。 +- 两个没有配置 firstEp 参数的 dnode 在启动后会独立运行。这时无法将其中一个dnode 加入另外一个 dnode,形成集群。 +- TDengine 不允许将两个独立的集群合并成新的集群。 + +#### 7. 
添加 mnode + +在创建 TDengine 集群时,首个 dnode 将自动成为集群的 mnode,负责集群的管理和协调工作。为了实现 mnode 的高可用性,后续添加的 dnode 需要手动创建 mnode。请注意,一个集群最多允许创建 3 个 mnode,且每个 dnode 上只能创建一个 mnode。当集群中的 dnode 数量达到或超过 3 个时,你可以为现有集群创建 mnode。在第 1个 dnode 中,首先通过 TDengine 的 CLI 程序 taos 登录 TDengine,然后执行如下 SQL。 + +```shell +create mnode on dnode +``` + +请注意将上面示例中的 dnodeId 替换为刚创建 dnode 的序号(可以通过执行 `show dnodes` 命令获得)。最后执行如下 `show mnodes`,查看新创建的 mnode 是否成功加入集群。 + + +**Tips** + +在搭建 TDengine 集群的过程中,如果在执行 create dnode 命令以添加新节点后,新节点始终显示为离线状态,请按照以下步骤进行排查。 + +- 第 1 步,检查新节点上的 taosd 服务是否已经正常启动。你可以通过查看日志文件或使用 ps 命令来确认。 +- 第 2 步,如果 taosd 服务已启动,接下来请检查新节点的网络连接是否畅通,并确认防火墙是否已关闭。网络不通或防火墙设置可能会阻止节点与集群的其他节点通信。 +- 第 3 步,使用 taos -h fqdn 命令尝试连接到新节点,然后执行 show dnodes 命令。这将显示新节点作为独立集群的运行状态。如果显示的列表与主节点上显示的不一致,说明新节点可能已自行组成一个单节点集群。要解决这个问题,请按照以下步骤操作。首先,停止新节点上的 taosd 服务。其次,清空新节点上 taos.cfg 配置文件中指定的 dataDir 目录下的所有文件。这将删除与该节点相关的所有数据和配置信息。最后,重新启动新节点上的 taosd 服务。这将使新节点恢复到初始状态,并准备好重新加入主集群。 + +### 部署 taosAdapter + +本节讲述如何部署 taosAdapter,taosAdapter 为 TDengine 集群提供 RESTful 和 WebSocket 接入能力,因而在集群中扮演着很重要的角色。 + +1. 安装 + +TDengine Enterprise 安装完成后,即可使用 taosAdapter。如果想在不同的服务器上分别部署 taosAdapter,需要在这些服务器上都安装 TDengine Enterprise。 + +2. 单一实例部署 + +部署 taosAdapter 的单一实例非常简单,具体命令和配置参数请参考手册中 taosAdapter 部分。 + +3. 多实例部署 + +部署 taosAdapter 的多个实例的主要目的如下: +- 提升集群的吞吐量,避免 taosAdapter 成为系统瓶颈。 +- 提升集群的健壮性和高可用能力,当有一个实例因某种故障而不再提供服务时,可以将进入业务系统的请求自动路由到其他实例。 + +在部署 taosAdapter 的多个实例时,需要解决负载均衡问题,以避免某个节点过载而其他节点闲置。在部署过程中,需要分别部署多个单一实例,每个实例的部署步骤与部署单一实例完全相同。接下来关键的部分是配置 Nginx。以下是一个经过验证的较佳实践配置,你只须将其中的 endpoint 替换为实际环境中的正确地址即可。关于各参数的含义,请参考 Nginx 的官方文档。 + +```json +user root; +worker_processes auto; +error_log /var/log/nginx_error.log; + + +events { + use epoll; + worker_connections 1024; +} + +http { + + access_log off; + + map $http_upgrade $connection_upgrade { + default upgrade; + '' close; + } + + server { + listen 6041; + location ~* { + proxy_pass http://dbserver; + proxy_read_timeout 600s; + proxy_send_timeout 600s; + proxy_connect_timeout 600s; + proxy_next_upstream error http_502 non_idempotent; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $http_connection; + } + } + server { + listen 6043; + location ~* { + proxy_pass http://keeper; + proxy_read_timeout 60s; + proxy_next_upstream error http_502 http_500 non_idempotent; + } + } + + server { + listen 6060; + location ~* { + proxy_pass http://explorer; + proxy_read_timeout 60s; + proxy_next_upstream error http_502 http_500 non_idempotent; + } + } + upstream dbserver { + least_conn; + server 172.16.214.201:6041 max_fails=0; + server 172.16.214.202:6041 max_fails=0; + server 172.16.214.203:6041 max_fails=0; + } + upstream keeper { + ip_hash; + server 172.16.214.201:6043 ; + server 172.16.214.202:6043 ; + server 172.16.214.203:6043 ; + } + upstream explorer{ + ip_hash; + server 172.16.214.201:6060 ; + server 172.16.214.202:6060 ; + server 172.16.214.203:6060 ; + } +} +``` + +### 部署 taosKeeper + +如果要想使用 TDegnine 的监控功能,taosKeeper 是一个必要的组件,关于监控请参考[TDinsight](../../reference/components/tdinsight),关于部署 taosKeeper 的细节请参考[taosKeeper参考手册](../../reference/components/taoskeeper)。 + +### 部署 taosX + +如果想使用 TDengine 的数据接入能力,需要部署 taosX 服务,关于它的详细说明和部署请参考[taosX 参考手册](../../reference/components/taosx)。 + +### 部署 taosX-Agent + +有些数据源如 Pi, OPC 等,因为网络条件和数据源访问的限制,taosX 无法直接访问数据源,这种情况下需要部署一个代理服务 taosX-Agent,关于它的详细说明和部署请参考[taosX-Agent 参考手册](../../reference/components/taosx-agent)。 + +### 部署 taos-Explorer + +TDengine 提供了可视化管理 TDengine 集群的能力,要想使用图形化界面需要部署 taos-Explorer 服务,关于它的详细说明和部署请参考[taos-Explorer 
参考手册](../../reference/components/explorer) + + +## Docker 部署 + +本节将介绍如何在 Docker 容器中启动 TDengine 服务并对其进行访问。你可以在 docker run 命令行或者 docker-compose 文件中使用环境变量来控制容器中服务的行为。 + +### 启动 TDengine + +TDengine 镜像启动时默认激活 HTTP 服务,使用下列命令便可创建一个带有 HTTP 服务的容器化 TDengine 环境。 +```shell +docker run -d --name tdengine \ +-v ~/data/taos/dnode/data:/var/lib/taos \ +-v ~/data/taos/dnode/log:/var/log/taos \ +-p 6041:6041 tdengine/tdengine +``` + +详细的参数说明如下。 +- /var/lib/taos:TDengine 默认数据文件目录,可通过配置文件修改位置。 +- /var/log/taos:TDengine 默认日志文件目录,可通过配置文件修改位置。 + +以上命令启动了一个名为 tdengine 的容器,并把其中的 HTTP 服务的端口 6041 映射到主机端口 6041。如下命令可以验证该容器中提供的 HTTP 服务是否可用。 + +```shell +curl -u root:taosdata -d "show databases" localhost:6041/rest/sql +``` + +运行如下命令可在容器中访问 TDengine。 +```shell +$ docker exec -it tdengine taos + +taos> show databases; + name | +================================= + information_schema | + performance_schema | +Query OK, 2 rows in database (0.033802s) +``` + +在容器中,TDengine CLI 或者各种连接器(例如 JDBC-JNI)与服务器通过容器的 hostname 建立连接。从容器外访问容器内的 TDengine 比较复杂,通过 RESTful/WebSocket 连接方式是最简单的方法。 + +### 在 host 网络模式下启动 TDengine + +运行以下命令可以在 host 网络模式下启动 TDengine,这样可以使用主机的 FQDN 建立连接,而不是使用容器的 hostname。 +```shell +docker run -d --name tdengine --network host tdengine/tdengine +``` + +这种方式与在主机上使用 systemctl 命令启动 TDengine 的效果相同。在主机上已安装 TDengine 客户端的情况下,可以直接使用下面的命令访问 TDengine 服务。 +```shell +$ taos + +taos> show dnodes; + id | endpoint | vnodes | support_vnodes | status | create_time | note | +================================================================================================================================================= + 1 | vm98:6030 | 0 | 32 | ready | 2022-08-19 14:50:05.337 | | +Query OK, 1 rows in database (0.010654s) +``` + +### 以指定的 hostname 和 port 启动 TDengine + +使用如下命令可以利用 TAOS_FQDN 环境变量或者 taos.cfg 中的 fqdn 配置项使TDengine 在指定的 hostname 上建立连接。这种方式为部署 TDengine 提供了更大的灵活性。 + +```shell +docker run -d \ + --name tdengine \ + -e TAOS_FQDN=tdengine \ + -p 6030:6030 \ + -p 6041-6049:6041-6049 \ + -p 6041-6049:6041-6049/udp \ + tdengine/tdengine +``` + +首先,上面的命令在容器中启动一个 TDengine 服务,其所监听的 hostname 为tdengine,并将容器的端口 6030 映射到主机的端口 6030,将容器的端口段 [6041, 6049] 映射到主机的端口段 [6041, 6049]。如果主机上该端口段已经被占用,可以修改上述命令以指定一个主机上空闲的端口段。 + +其次,要确保 tdengine 这个 hostname 在 /etc/hosts 中可解析。通过如下命令可将正确的配置信息保存到 hosts 文件中。 +```shell +echo 127.0.0.1 tdengine |sudo tee -a /etc/hosts +``` + +最后,可以通过 TDengine CLI 以 tdengine 为服务器地址访问 TDengine 服务,命令如下。 +```shell +taos -h tdengine -P 6030 +``` + +如果 TAOS_FQDN 被设置为与所在主机名相同,则效果与“在 host 网络模式下启动TDengine”相同。 + +## Kubernetes 部署 + +作为面向云原生架构设计的时序数据库,TDengine 本身就支持 Kubernetes 部署。这里介绍如何使用 YAML 文件从头一步一步创建一个可用于生产使用的高可用 TDengine 集群,并重点介绍 Kubernetes 环境下 TDengine 的常用操作。本小节要求读者对 Kubernetes 有一定的了解,可以熟练运行常见的 kubectl 命令,了解 statefulset、service、pvc 等概念,对这些概念不熟悉的读者,可以先参考 Kubernetes 的官网进行学习。 +为了满足高可用的需求,集群需要满足如下要求: +- 3 个及以上 dnode :TDengine 的同一个 vgroup 中的多个 vnode ,不允许同时分布在一个 dnode ,所以如果创建 3 副本的数据库,则 dnode 数大于等于 3 +- 3 个 mnode :mnode 负责整个集群的管理工作,TDengine 默认是一个 mnode。如果这个 mnode 所在的 dnode 掉线,则整个集群不可用。 +- 数据库的 3 副本:TDengine 的副本配置是数据库级别,所以数据库 3 副本可满足在 3 个 dnode 的集群中,任意一个 dnode 下线,都不影响集群的正常使用。如果下线 dnode 个数为 2 时,此时集群不可用,因为 RAFT 无法完成选举。(企业版:在灾难恢复场景,任一节点数据文件损坏,都可以通过重新拉起 dnode 进行恢复) + +### 前置条件 + +要使用 Kubernetes 部署管理 TDengine 集群,需要做好如下准备工作。 +- 本文适用 Kubernetes v1.19 以上版本 +- 本文使用 kubectl 工具进行安装部署,请提前安装好相应软件 +- Kubernetes 已经安装部署并能正常访问使用或更新必要的容器仓库或其他服务 + +### 配置 Service 服务 + +创建一个 Service 配置文件:taosd-service.yaml,服务名称 metadata.name (此处为 "taosd") 将在下一步中使用到。首先添加 TDengine 所用到的端口,然后在选择器设置确定的标签 app (此处为 “tdengine”)。 + +```yaml +--- +apiVersion: v1 +kind: Service 
+metadata: + name: "taosd" + labels: + app: "tdengine" +spec: + ports: + - name: tcp6030 + protocol: "TCP" + port: 6030 + - name: tcp6041 + protocol: "TCP" + port: 6041 + selector: + app: "tdengine" +``` + +### 有状态服务 StatefulSet + +根据 Kubernetes 对各类部署的说明,我们将使用 StatefulSet 作为 TDengine 的部署资源类型。 创建文件 tdengine.yaml,其中 replicas 定义集群节点的数量为 3。节点时区为中国(Asia/Shanghai),每个节点分配 5G 标准(standard)存储,你也可以根据实际情况进行相应修改。 + +请特别注意 startupProbe 的配置,在 dnode 的 Pod 掉线一段时间后,再重新启动,这个时候新上线的 dnode 会短暂不可用。如果 startupProbe 配置过小,Kubernetes 会认为该 Pod 处于不正常的状态,并尝试重启该 Pod,该 dnode 的 Pod 会频繁重启,始终无法恢复到正常状态。 + +```yaml +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: "tdengine" + labels: + app: "tdengine" +spec: + serviceName: "taosd" + replicas: 3 + updateStrategy: + type: RollingUpdate + selector: + matchLabels: + app: "tdengine" + template: + metadata: + name: "tdengine" + labels: + app: "tdengine" + spec: + containers: + - name: "tdengine" + image: "tdengine/tdengine:3.2.3.0" + imagePullPolicy: "IfNotPresent" + ports: + - name: tcp6030 + protocol: "TCP" + containerPort: 6030 + - name: tcp6041 + protocol: "TCP" + containerPort: 6041 + env: + # POD_NAME for FQDN config + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + # SERVICE_NAME and NAMESPACE for fqdn resolve + - name: SERVICE_NAME + value: "taosd" + - name: STS_NAME + value: "tdengine" + - name: STS_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + # TZ for timezone settings, we recommend to always set it. + - name: TZ + value: "Asia/Shanghai" + # Environment variables with prefix TAOS_ will be parsed and converted into corresponding parameter in taos.cfg. For example, serverPort in taos.cfg should be configured by TAOS_SERVER_PORT when using K8S to deploy + - name: TAOS_SERVER_PORT + value: "6030" + # Must set if you want a cluster. + - name: TAOS_FIRST_EP + value: "$(STS_NAME)-0.$(SERVICE_NAME).$(STS_NAMESPACE).svc.cluster.local:$(TAOS_SERVER_PORT)" + # TAOS_FQND should always be set in k8s env. + - name: TAOS_FQDN + value: "$(POD_NAME).$(SERVICE_NAME).$(STS_NAMESPACE).svc.cluster.local" + volumeMounts: + - name: taosdata + mountPath: /var/lib/taos + startupProbe: + exec: + command: + - taos-check + failureThreshold: 360 + periodSeconds: 10 + readinessProbe: + exec: + command: + - taos-check + initialDelaySeconds: 5 + timeoutSeconds: 5000 + livenessProbe: + exec: + command: + - taos-check + initialDelaySeconds: 15 + periodSeconds: 20 + volumeClaimTemplates: + - metadata: + name: taosdata + spec: + accessModes: + - "ReadWriteOnce" + storageClassName: "standard" + resources: + requests: + storage: "5Gi" +``` + +### 使用 kubectl 命令部署 TDengine 集群 + +首先创建对应的 namespace dengine-test,以及 pvc,并保证 storageClassName 是 standard 的剩余空间足够。然后顺序执行以下命令: +```shell +kubectl apply -f taosd-service.yaml -n tdengine-test +``` + +上面的配置将生成一个三节点的 TDengine 集群,dnode 为自动配置,可以使用 show dnodes 命令查看当前集群的节点: +```shell +kubectl exec -it tdengine-0 -n tdengine-test -- taos -s "show dnodes" +kubectl exec -it tdengine-1 -n tdengine-test -- taos -s "show dnodes" +kubectl exec -it tdengine-2 -n tdengine-test -- taos -s "show dnodes" +``` + +输出如下: +```shell +taos show dnodes + id | endpoint | vnodes | support_vnodes | status | create_time | reboot_time | note | active_code | c_active_code | +============================================================================================================================================================================================================================================= + 1 | tdengine-0.ta... 
| 0 | 16 | ready | 2023-07-19 17:54:18.552 | 2023-07-19 17:54:18.469 | | | | + 2 | tdengine-1.ta... | 0 | 16 | ready | 2023-07-19 17:54:37.828 | 2023-07-19 17:54:38.698 | | | | + 3 | tdengine-2.ta... | 0 | 16 | ready | 2023-07-19 17:55:01.141 | 2023-07-19 17:55:02.039 | | | | +Query OK, 3 row(s) in set (0.001853s) +``` + +查看当前 mnode +```shell +kubectl exec -it tdengine-1 -n tdengine-test -- taos -s "show mnodes\G" +taos> show mnodes\G +*************************** 1.row *************************** + id: 1 + endpoint: tdengine-0.taosd.tdengine-test.svc.cluster.local:6030 + role: leader + status: ready +create_time: 2023-07-19 17:54:18.559 +reboot_time: 2023-07-19 17:54:19.520 +Query OK, 1 row(s) in set (0.001282s) +``` + +创建 mnode +```shell +kubectl exec -it tdengine-0 -n tdengine-test -- taos -s "create mnode on dnode 2" +kubectl exec -it tdengine-0 -n tdengine-test -- taos -s "create mnode on dnode 3" +``` + +查看 mnode +```shell +kubectl exec -it tdengine-1 -n tdengine-test -- taos -s "show mnodes\G" + +taos> show mnodes\G +*************************** 1.row *************************** + id: 1 + endpoint: tdengine-0.taosd.tdengine-test.svc.cluster.local:6030 + role: leader + status: ready +create_time: 2023-07-19 17:54:18.559 +reboot_time: 2023-07-20 09:19:36.060 +*************************** 2.row *************************** + id: 2 + endpoint: tdengine-1.taosd.tdengine-test.svc.cluster.local:6030 + role: follower + status: ready +create_time: 2023-07-20 09:22:05.600 +reboot_time: 2023-07-20 09:22:12.838 +*************************** 3.row *************************** + id: 3 + endpoint: tdengine-2.taosd.tdengine-test.svc.cluster.local:6030 + role: follower + status: ready +create_time: 2023-07-20 09:22:20.042 +reboot_time: 2023-07-20 09:22:23.271 +Query OK, 3 row(s) in set (0.003108s) +``` + +### 端口转发 + +利用 kubectl 端口转发功能可以使应用可以访问 Kubernetes 环境运行的 TDengine 集群。 + +```shell +kubectl port-forward -n tdengine-test tdengine-0 6041:6041 & +``` + +使用 curl 命令验证 TDengine REST API 使用的 6041 接口。 +```shell +curl -u root:taosdata -d "show databases" 127.0.0.1:6041/rest/sql +{"code":0,"column_meta":[["name","VARCHAR",64]],"data":[["information_schema"],["performance_schema"],["test"],["test1"]],"rows":4} +``` + +### 集群扩容 + +TDengine 支持集群扩容: +```shell +kubectl scale statefulsets tdengine -n tdengine-test --replicas=4 +``` + +上面命令行中参数 `--replica=4` 表示要将 TDengine 集群扩容到 4 个节点,执行后首先检查 POD 的状态: +```shell +kubectl get pod -l app=tdengine -n tdengine-test -o wide +``` + +输出如下: +```text +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +tdengine-0 1/1 Running 4 (6h26m ago) 6h53m 10.244.2.75 node86 +tdengine-1 1/1 Running 1 (6h39m ago) 6h53m 10.244.0.59 node84 +tdengine-2 1/1 Running 0 5h16m 10.244.1.224 node85 +tdengine-3 1/1 Running 0 3m24s 10.244.2.76 node86 +``` + +此时 Pod 的状态仍然是 Running,TDengine 集群中的 dnode 状态要等 Pod 状态为 ready 之后才能看到: +```shell +kubectl exec -it tdengine-3 -n tdengine-test -- taos -s "show dnodes" +``` + +扩容后的四节点 TDengine 集群的 dnode 列表: +```text +taos> show dnodes + id | endpoint | vnodes | support_vnodes | status | create_time | reboot_time | note | active_code | c_active_code | +============================================================================================================================================================================================================================================= + 1 | tdengine-0.ta... | 10 | 16 | ready | 2023-07-19 17:54:18.552 | 2023-07-20 09:39:04.297 | | | | + 2 | tdengine-1.ta... 
| 10 | 16 | ready | 2023-07-19 17:54:37.828 | 2023-07-20 09:28:24.240 | | | | + 3 | tdengine-2.ta... | 10 | 16 | ready | 2023-07-19 17:55:01.141 | 2023-07-20 10:48:43.445 | | | | + 4 | tdengine-3.ta... | 0 | 16 | ready | 2023-07-20 16:01:44.007 | 2023-07-20 16:01:44.889 | | | | +Query OK, 4 row(s) in set (0.003628s) +``` + +### 清理集群 + +**Warning** +删除 pvc 时需要注意下 pv persistentVolumeReclaimPolicy 策略,建议改为 Delete,这样在删除 pvc 时才会自动清理 pv,同时会清理底层的 csi 存储资源,如果没有配置删除 pvc 自动清理 pv 的策略,再删除 pvc 后,在手动清理 pv 时,pv 对应的 csi 存储资源可能不会被释放。 + +完整移除 TDengine 集群,需要分别清理 statefulset、svc、pvc,最后删除命名空间。 + +```shell +kubectl delete statefulset -l app=tdengine -n tdengine-test +kubectl delete svc -l app=tdengine -n tdengine-test +kubectl delete pvc -l app=tdengine -n tdengine-test +kubectl delete namespace tdengine-test +``` + +### 集群灾备能力 + +对于在 Kubernetes 环境下 TDengine 的高可用和高可靠来说,对于硬件损坏、灾难恢复,分为两个层面来讲: +- 底层的分布式块存储具备的灾难恢复能力,块存储的多副本,当下流行的分布式块存储如 Ceph,就具备多副本能力,将存储副本扩展到不同的机架、机柜、机房、数据中心(或者直接使用公有云厂商提供的块存储服务) +- TDengine 的灾难恢复,在 TDengine Enterprise 中,本身具备了当一个 dnode 永久下线(物理机磁盘损坏,数据分拣丢失)后,重新拉起一个空白的 dnode 来恢复原 dnode 的工作。 + +## 使用 Helm 部署 TDengine 集群 + +Helm 是 Kubernetes 的包管理器。 +上一节使用 Kubernetes 部署 TDengine 集群的操作已经足够简单,但 Helm 可以提供更强大的能力。 + +### 安装 Helm + +```shell +curl -fsSL -o get_helm.sh \ + https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 +chmod +x get_helm.sh +./get_helm.sh +``` + +Helm 会使用 kubectl 和 kubeconfig 的配置来操作 Kubernetes,可以参考 Rancher 安装 Kubernetes 的配置来进行设置。 + +### 安装 TDengine Chart + +TDengine Chart 尚未发布到 Helm 仓库,当前可以从 GitHub 直接下载: +```shell +wget https://github.com/taosdata/TDengine-Operator/raw/3.0/helm/tdengine-3.0.2.tgz +``` + +获取当前 Kubernetes 的存储类: +```shell +kubectl get storageclass +``` + +在 minikube 默认为 standard。之后,使用 helm 命令安装: +```shell +helm install tdengine tdengine-3.0.2.tgz \ + --set storage.className= \ + --set image.tag=3.2.3.0 + +``` + +在 minikube 环境下,可以设置一个较小的容量避免超出磁盘可用空间: +```shell +helm install tdengine tdengine-3.0.2.tgz \ + --set storage.className=standard \ + --set storage.dataSize=2Gi \ + --set storage.logSize=10Mi \ + --set image.tag=3.2.3.0 +``` + +部署成功后,TDengine Chart 将会输出操作 TDengine 的说明: +```shell +export POD_NAME=$(kubectl get pods --namespace default \ + -l "app.kubernetes.io/name=tdengine,app.kubernetes.io/instance=tdengine" \ + -o jsonpath="{.items[0].metadata.name}") +kubectl --namespace default exec $POD_NAME -- taos -s "show dnodes; show mnodes" +kubectl --namespace default exec -it $POD_NAME -- taos +``` + +可以创建一个表进行测试: +```shell +kubectl --namespace default exec $POD_NAME -- \ + taos -s "create database test; + use test; + create table t1 (ts timestamp, n int); + insert into t1 values(now, 1)(now + 1s, 2); + select * from t1;" +``` + +### 配置 values + +TDengine 支持 `values.yaml` 自定义。 +通过 helm show values 可以获取 TDengine Chart 支持的全部 values 列表: +```shell +helm show values tdengine-3.0.2.tgz +``` + +你可以将结果保存为 values.yaml,之后可以修改其中的各项参数,如 replica 数量,存储类名称,容量大小,TDengine 配置等,然后使用如下命令安装 TDengine 集群: +```shell +helm install tdengine tdengine-3.0.2.tgz -f values.yaml +``` + +全部参数如下: +```yaml +# Default values for tdengine. +# This is a YAML-formatted file. +# Declare variables to be passed into helm templates. + +replicaCount: 1 + +image: + prefix: tdengine/tdengine + #pullPolicy: Always + # Overrides the image tag whose default is the chart appVersion. +# tag: "3.0.2.0" + +service: + # ClusterIP is the default service type, use NodeIP only if you know what you are doing. 
+ type: ClusterIP + ports: + # TCP range required + tcp: [6030, 6041, 6042, 6043, 6044, 6046, 6047, 6048, 6049, 6060] + # UDP range + udp: [6044, 6045] + + +# Set timezone here, not in taoscfg +timezone: "Asia/Shanghai" + +resources: + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +storage: + # Set storageClassName for pvc. K8s use default storage class if not set. + # + className: "" + dataSize: "100Gi" + logSize: "10Gi" + +nodeSelectors: + taosd: + # node selectors + +clusterDomainSuffix: "" +# Config settings in taos.cfg file. +# +# The helm/k8s support will use environment variables for taos.cfg, +# converting an upper-snake-cased variable like `TAOS_DEBUG_FLAG`, +# to a camelCase taos config variable `debugFlag`. +# +# See the variable list at https://www.taosdata.com/cn/documentation/administrator . +# +# Note: +# 1. firstEp/secondEp: should not be set here, it's auto generated at scale-up. +# 2. serverPort: should not be set, we'll use the default 6030 in many places. +# 3. fqdn: will be auto generated in kubernetes, user should not care about it. +# 4. role: currently role is not supported - every node is able to be mnode and vnode. +# +# Btw, keep quotes "" around the value like below, even the value will be number or not. +taoscfg: + # Starts as cluster or not, must be 0 or 1. + # 0: all pods will start as a separate TDengine server + # 1: pods will start as TDengine server cluster. 
[default] + CLUSTER: "1" + + # number of replications, for cluster only + TAOS_REPLICA: "1" + + + # TAOS_NUM_OF_RPC_THREADS: number of threads for RPC + #TAOS_NUM_OF_RPC_THREADS: "2" + + # + # TAOS_NUM_OF_COMMIT_THREADS: number of threads to commit cache data + #TAOS_NUM_OF_COMMIT_THREADS: "4" + + # enable/disable installation / usage report + #TAOS_TELEMETRY_REPORTING: "1" + + # time interval of system monitor, seconds + #TAOS_MONITOR_INTERVAL: "30" + + # time interval of dnode status reporting to mnode, seconds, for cluster only + #TAOS_STATUS_INTERVAL: "1" + + # time interval of heart beat from shell to dnode, seconds + #TAOS_SHELL_ACTIVITY_TIMER: "3" + + # minimum sliding window time, milli-second + #TAOS_MIN_SLIDING_TIME: "10" + + # minimum time window, milli-second + #TAOS_MIN_INTERVAL_TIME: "1" + + # the compressed rpc message, option: + # -1 (no compression) + # 0 (all message compressed), + # > 0 (rpc message body which larger than this value will be compressed) + #TAOS_COMPRESS_MSG_SIZE: "-1" + + # max number of connections allowed in dnode + #TAOS_MAX_SHELL_CONNS: "50000" + + # stop writing logs when the disk size of the log folder is less than this value + #TAOS_MINIMAL_LOG_DIR_G_B: "0.1" + + # stop writing temporary files when the disk size of the tmp folder is less than this value + #TAOS_MINIMAL_TMP_DIR_G_B: "0.1" + + # if disk free space is less than this value, taosd service exit directly within startup process + #TAOS_MINIMAL_DATA_DIR_G_B: "0.1" + + # One mnode is equal to the number of vnode consumed + #TAOS_MNODE_EQUAL_VNODE_NUM: "4" + + # enbale/disable http service + #TAOS_HTTP: "1" + + # enable/disable system monitor + #TAOS_MONITOR: "1" + + # enable/disable async log + #TAOS_ASYNC_LOG: "1" + + # + # time of keeping log files, days + #TAOS_LOG_KEEP_DAYS: "0" + + # The following parameters are used for debug purpose only. 
+ # debugFlag 8 bits mask: FILE-SCREEN-UNUSED-HeartBeat-DUMP-TRACE_WARN-ERROR + # 131: output warning and error + # 135: output debug, warning and error + # 143: output trace, debug, warning and error to log + # 199: output debug, warning and error to both screen and file + # 207: output trace, debug, warning and error to both screen and file + # + # debug flag for all log type, take effect when non-zero value\ + #TAOS_DEBUG_FLAG: "143" + + # generate core file when service crash + #TAOS_ENABLE_CORE_FILE: "1" +``` + +### 扩容 + +关于扩容可参考上一节的说明,有一些额外的操作需要从 helm 的部署中获取。 +首先,从部署中获取 StatefulSet 的名称。 +```shell +export STS_NAME=$(kubectl get statefulset \ + -l "app.kubernetes.io/name=tdengine" \ + -o jsonpath="{.items[0].metadata.name}") +``` + +扩容操作极其简单,增加 replica 即可。以下命令将 TDengine 扩充到三节点: +```shell +kubectl scale --replicas 3 statefulset/$STS_NAME +``` + +使用命令 `show dnodes` 和 `show mnodes` 检查是否扩容成功。 + +### 清理集群 + +Helm 管理下,清理操作也变得简单: + +```shell +helm uninstall tdengine +``` + 但 Helm 也不会自动移除 PVC,需要手动获取 PVC 然后删除掉。 \ No newline at end of file diff --git a/docs/zh/14-reference/01-components/01-taosd.md b/docs/zh/14-reference/01-components/01-taosd.md index b46dc21d40..d85839e62f 100644 --- a/docs/zh/14-reference/01-components/01-taosd.md +++ b/docs/zh/14-reference/01-components/01-taosd.md @@ -197,7 +197,7 @@ charset 的有效值是 UTF-8。 | compressMsgSize | 是否对 RPC 消息进行压缩;-1: 所有消息都不压缩; 0: 所有消息都压缩; N (N>0): 只有大于 N 个字节的消息才压缩;缺省值 -1 | | fPrecision | 设置 float 类型浮点数压缩精度 ,取值范围:0.1 ~ 0.00000001 ,默认值 0.00000001 , 小于此值的浮点数尾数部分将被截断 | |dPrecision | 设置 double 类型浮点数压缩精度 , 取值范围:0.1 ~ 0.0000000000000001 , 缺省值 0.0000000000000001 , 小于此值的浮点数尾数部分将被截取 | -|lossyColumn | 对 float 和/或 double 类型启用 TSZ 有损压缩;取值范围: float, double, none;缺省值: none,表示关闭无损压缩。**注意:此参数在 3.3.0.0 及更高版本中不再使用**| +|lossyColumn | 对 float 和/或 double 类型启用 TSZ 有损压缩;取值范围: float, double, none;缺省值: none,表示关闭无损压缩。**注意:此参数在 3.3.0.0 及更高版本中不再使用** | |ifAdtFse | 在启用 TSZ 有损压缩时,使用 FSE 算法替换 HUFFMAN 算法, FSE 算法压缩速度更快,但解压稍慢,追求压缩速度可选用此算法; 0: 关闭,1:打开;默认值为 0 | From dd83d0f256a9167885cf6df139fd66855ccf7fde Mon Sep 17 00:00:00 2001 From: sheyanjie-qq <249478495@qq.com> Date: Fri, 9 Aug 2024 09:16:55 +0800 Subject: [PATCH 067/103] update tdinsight doc --- .../01-components/12-tdinsight/index.mdx | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/docs/zh/14-reference/01-components/12-tdinsight/index.mdx b/docs/zh/14-reference/01-components/12-tdinsight/index.mdx index 224fd5908d..2d4b69f46b 100644 --- a/docs/zh/14-reference/01-components/12-tdinsight/index.mdx +++ b/docs/zh/14-reference/01-components/12-tdinsight/index.mdx @@ -17,6 +17,12 @@ TDengine 通过 taosKeeper 将服务器的 CPU、内存、硬盘空间、带宽 - TDengine 已经安装并正常运行,此仪表盘需要 TDengine 3.0.0.0 及以上,并开启监控上报配置,具体配置请参考:[TDengine 监控配置](../taosd/#监控相关)。 - taosAdapter 已经安装并正常运行。具体细节请参考:[taosAdapter 参考手册](../taosadapter) - taosKeeper 已安装并正常运行。具体细节请参考:[taosKeeper 参考手册](../taoskeeper) +- Grafana 服务已安装并正常运行。我们建议您使用最新的 Grafana 版本,TDInsight 支持 Grafana 7.5 及以上版本。 + :::info + + 下文介绍中,都以 Grafana v11.0.0 版本为例,其他版本功能可能有差异,请参考 [Grafana 官网](https://grafana.com/docs/grafana/latest/)。 + + ::: 然后记录以下信息: @@ -24,22 +30,11 @@ TDengine 通过 taosKeeper 将服务器的 CPU、内存、硬盘空间、带宽 - taosAdapter 集群认证信息,可使用用户名及密码。 - taosKeeper 记录监控指标的数据库名称。 -## 安装和启动 Grafana -我们建议您使用最新的 Grafana 版本,TDInsight 支持 Grafana 7.5 及以上版本。您可以在任何[支持的操作系统](https://grafana.com/docs/grafana/latest/installation/requirements/#supported-operating-systems)中,按照 [Grafana 官方文档安装说明](https://grafana.com/docs/grafana/latest/installation/) 安装 Grafana。 -安装后请参考 [启动 
Grafana](https://grafana.com/docs/grafana/latest/setup-grafana/start-restart-grafana/) 启动 Grafana 服务。 - -安装完成后就可以在 Web 浏览器中打开 Grafana 网址,默认是:`http://localhost:3000`。 默认用户名/密码都是 `admin`。Grafana 会要求在首次登录后更改密码。 - -:::info - -下文介绍中,都以 Grafana v11.0.0 版本为例,其他版本功能可能有差异,请参考 [Grafana 官网](https://grafana.com/docs/grafana/latest/)。 - -::: ## 安装 TDengine 数据源插件 -TDInsight 支持图形界面安装、手动安装和脚本安装三种安装方式,一般建议图形界面安装。对于 Grafana 8.5 以下版本可以使用手动安装和脚本安装方式。 +TDengine 数据源插件支持图形界面安装、手动安装和脚本安装三种安装方式,一般建议图形界面安装。对于 Grafana 8.5 以下版本可以使用手动安装和脚本安装方式。 From 47a7707cff316331f6fce015f9db8fe7591e5f68 Mon Sep 17 00:00:00 2001 From: gccgdb1234 Date: Fri, 9 Aug 2024 09:26:34 +0800 Subject: [PATCH 068/103] doc: add 6050 for taosX and correct typos --- docs/zh/07-operation/02-planning.md | 2 +- docs/zh/07-operation/03-deployment.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/zh/07-operation/02-planning.md b/docs/zh/07-operation/02-planning.md index dc119b5166..ef2c7ce97c 100644 --- a/docs/zh/07-operation/02-planning.md +++ b/docs/zh/07-operation/02-planning.md @@ -152,5 +152,5 @@ TDengine 的多级存储功能在使用上还具备以下优点。 |RESTful 接口 | 6041 | |WebSocket 接口 |6041 | |taosKeeper | 6043 | -|taosX | 6055 | +|taosX | 6050, 6055 | |taosExplorer | 6060 | \ No newline at end of file diff --git a/docs/zh/07-operation/03-deployment.md b/docs/zh/07-operation/03-deployment.md index 7cf79636ed..c137f346e3 100644 --- a/docs/zh/07-operation/03-deployment.md +++ b/docs/zh/07-operation/03-deployment.md @@ -44,7 +44,7 @@ fqdn h1.taosdata.com serverPort 6030 ``` -一定要修改的参数是 f irstEp 和 fqdn。对于每个 dnode,f irstEp 配置应该保持一致,但 fqdn 一定要配置成其所在 dnode 的值。其他参数可不做任何修改,除非你很清楚为什么要修改。 +一定要修改的参数是 firstEp 和 fqdn。对于每个 dnode,firstEp 配置应该保持一致,但 fqdn 一定要配置成其所在 dnode 的值。其他参数可不做任何修改,除非你很清楚为什么要修改。 对于希望加入集群的 dnode 节点,必须确保下表所列的与 TDengine 集群相关的参数设置完全一致。任何参数的不匹配都可能导致 dnode 节点无法成功加入集群。 From 3826fec13de2d5cc7107075e8f2fe1904e125cd1 Mon Sep 17 00:00:00 2001 From: 54liuyao <54liuyao> Date: Fri, 9 Aug 2024 09:28:13 +0800 Subject: [PATCH 069/103] adj error code --- source/dnode/vnode/src/tsdb/tsdbRead2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbRead2.c b/source/dnode/vnode/src/tsdb/tsdbRead2.c index 84ca2c36ea..880e73c5c0 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRead2.c +++ b/source/dnode/vnode/src/tsdb/tsdbRead2.c @@ -4708,8 +4708,7 @@ int32_t tsdbReaderOpen2(void* pVnode, SQueryTableDataCond* pCond, void* pTableLi pReader->pSchemaMap = tSimpleHashInit(8, taosFastHash); if (pReader->pSchemaMap == NULL) { tsdbError("failed init schema hash for reader %s", pReader->idStr); - code = TSDB_CODE_OUT_OF_MEMORY; - goto _err; + TSDB_CHECK_NULL(pReader->pSchemaMap, code, lino, _err, terrno); } tSimpleHashSetFreeFp(pReader->pSchemaMap, freeSchemaFunc); From 489472deef14c4858186c0ec590765a63460a900 Mon Sep 17 00:00:00 2001 From: gccgdb1234 Date: Fri, 9 Aug 2024 10:23:57 +0800 Subject: [PATCH 070/103] correct based on comments from Dong yanqiong --- docs/zh/03-intro.md | 4 ++-- docs/zh/04-get-started/01-docker.md | 2 +- docs/zh/04-get-started/03-package.md | 2 +- docs/zh/05-basic/03-query.md | 4 ++-- docs/zh/06-advanced/02-cache.md | 10 +++++----- docs/zh/07-operation/02-planning.md | 2 +- docs/zh/07-operation/03-deployment.md | 2 +- docs/zh/07-operation/14-user.md | 2 +- docs/zh/07-operation/16-security.md | 6 +++--- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/zh/03-intro.md b/docs/zh/03-intro.md index 790af4f861..d62b906277 100644 --- a/docs/zh/03-intro.md +++ b/docs/zh/03-intro.md @@ -6,7 +6,7 
@@ toc_max_heading_level: 4 TDengine 是一个高性能、分布式的时序数据库。通过集成的缓存、数据订阅、流计算和数据清洗与转换等功能,TDengine 已经发展成为一个专为物联网、工业互联网、金融和 IT 运维等关键行业量身定制的时序大数据平台。该平台能够高效地汇聚、存储、分析、计算和分发来自海量数据采集点的大规模数据流,每日处理能力可达 TB 乃至 PB 级别。借助 TDengine,企业可以实现实时的业务监控和预警,进而发掘出有价值的商业洞察。 -自 2019 年 7 月 以 来, 涛 思 数 据 陆 续 将 TDengine 的 不 同 版 本 开 源, 包 括 单 机版(2019 年 7 月)、集群版(2020 年 8 月)以及云原生版(2022 年 8 月)。开源之后,TDengine 迅速获得了全球开发者的关注,多次在 GitHub 网站全球趋势排行榜上位居榜首,最新的关注热度见[涛思数据首页](https://www.taosdata.com/)。 +自 2019 年 7 月 以来, 涛思数据陆续将 TDengine 的不同版本开源,包括单机版(2019 年 7 月)、集群版(2020 年 8 月)以及云原生版(2022 年 8 月)。开源之后,TDengine 迅速获得了全球开发者的关注,多次在 GitHub 网站全球趋势排行榜上位居榜首,最新的关注热度见[涛思数据首页](https://www.taosdata.com/)。 ## TDengine 产品 @@ -20,7 +20,7 @@ TDengine OSS 是一个开源的高性能时序数据库,与其他时序数据 ## TDengine 主要功能与特性 -TDengine 经过特别优化,以适应时间序列数据的独特需求,引入了“一个数据采集点一张表”和“超级表”的创新数据组织策略。这些策略背后的支撑是一个革命性的存储引擎,它极大地提升了数据处理的速度和效率,无论是在数据的写入、查询还是存储方面。接下来,逐一探索 TDengine 的众多功能,帮助您全面了解这个为高效处理时间序列数据而生的大数据平台。 +TDengine 经过特别优化,以适应时间序列数据的独特需求,引入了 “一个数据采集点一张表” 和 “超级表” 的创新数据组织策略。这些策略背后的支撑是一个革命性的存储引擎,它极大地提升了数据处理的速度和效率,无论是在数据的写入、查询还是存储方面。接下来,逐一探索 TDengine 的众多功能,帮助您全面了解这个为高效处理时间序列数据而生的大数据平台。 1. 写入数据:TDengine 支持多种数据写入方式。首先,它完全兼容 SQL,允许用户使用标准的 SQL 语法进行数据写入。而且 TDengine 还支持无模式(Schemaless)写入,包括流行的 InfluxDB Line 协议、OpenTSDB 的 Telnet 和 JSON 协议,这些协议的加入使得数据的导入变得更加灵活和高效。更进一步,TDengine 与众多第三方工具实现了无缝集成,例如 Telegraf、Prometheus、EMQX、StatsD、collectd 和 HiveMQ 等。在 TDengine Enterprise 中, 还提供了 MQTT、OPC-UA、OPC-DA、PI、Wonderware、Kafka、InfluxDB、OpenTSDB、MySQL、Oracle 和 SQL Server 等连接器。这些工具通过简单的配置,无需一行代码,就可以将来自各种数据源的数据源源不断的写入数据库,极大地简化了数据收集和存储的过程。 diff --git a/docs/zh/04-get-started/01-docker.md b/docs/zh/04-get-started/01-docker.md index c73f415f31..364e00f8f2 100644 --- a/docs/zh/04-get-started/01-docker.md +++ b/docs/zh/04-get-started/01-docker.md @@ -81,7 +81,7 @@ taosBenchmark 是一个专为测试 TDengine 性能而设计的工具,它能 taosBenchmark -y ``` -系统将自动在数据库 test 下创建一张名为 meters的超级表。这张超级表将包含 10 000 张子表,表名从 d0 到 d9999,每张表包含 10,000条记录。每条记录包含 ts(时间戳)、current(电流)、voltage(电压)和 phase(相位)4个字段。时间戳范围从“2017-07-14 10:40:00 000”到“2017-07-14 10:40:09 999”。每张表还带有 location 和 groupId 两个标签,其中,groupId 设置为 1 到 10,而 location 则设置为 California.Campbell、California.Cupertino 等城市信息。 +系统将自动在数据库 test 下创建一张名为 meters的超级表。这张超级表将包含 10,000 张子表,表名从 d0 到 d9999,每张表包含 10,000条记录。每条记录包含 ts(时间戳)、current(电流)、voltage(电压)和 phase(相位)4个字段。时间戳范围从“2017-07-14 10:40:00 000” 到 “2017-07-14 10:40:09 999”。每张表还带有 location 和 groupId 两个标签,其中,groupId 设置为 1 到 10,而 location 则设置为 California.Campbell、California.Cupertino 等城市信息。 执行该命令后,系统将迅速完成 1 亿条记录的写入过程。实际所需时间取决于硬件性能,但即便在普通 PC 服务器上,这个过程通常也只需要十几秒。 diff --git a/docs/zh/04-get-started/03-package.md b/docs/zh/04-get-started/03-package.md index 20166dd9ca..4906a2fcfa 100644 --- a/docs/zh/04-get-started/03-package.md +++ b/docs/zh/04-get-started/03-package.md @@ -277,7 +277,7 @@ taosBenchmark 是一个专为测试 TDengine 性能而设计的工具,它能 taosBenchmark -y ``` -系统将自动在数据库 test 下创建一张名为 meters的超级表。这张超级表将包含 10 000 张子表,表名从 d0 到 d9999,每张表包含 10,000条记录。每条记录包含 ts(时间戳)、current(电流)、voltage(电压)和 phase(相位)4个字段。时间戳范围从“2017-07-14 10:40:00 000”到“2017-07-14 10:40:09 999”。每张表还带有 location 和 groupId 两个标签,其中,groupId 设置为 1 到 10,而 location 则设置为 California.Campbell、California.Cupertino 等城市信息。 +系统将自动在数据库 test 下创建一张名为 meters的超级表。这张超级表将包含 10,000 张子表,表名从 d0 到 d9999,每张表包含 10,000条记录。每条记录包含 ts(时间戳)、current(电流)、voltage(电压)和 phase(相位)4个字段。时间戳范围从 “2017-07-14 10:40:00 000” 到 “2017-07-14 10:40:09 999”。每张表还带有 location 和 groupId 两个标签,其中,groupId 设置为 1 到 10,而 location 则设置为 California.Campbell、California.Cupertino 等城市信息。 执行该命令后,系统将迅速完成 1 亿条记录的写入过程。实际所需时间取决于硬件性能,但即便在普通 PC 服务器上,这个过程通常也只需要十几秒。 diff --git 
a/docs/zh/05-basic/03-query.md b/docs/zh/05-basic/03-query.md index e7ef86888f..1b4c3731e6 100644 --- a/docs/zh/05-basic/03-query.md +++ b/docs/zh/05-basic/03-query.md @@ -41,7 +41,7 @@ LIMIT 5 TDengine 支持通过 GROUP BY 子句,对数据进行聚合查询。SQL 语句包含 GROUP BY 子句时,SELECT 列表只能包含如下表达式: 1. 常量 -2. 聚集函数 +2. 聚合函数 3. 与 GROUP BY 后表达式相同的表达式 4. 包含前面表达式的表达式 @@ -158,7 +158,7 @@ window_clause: { **注意** 在使用窗口子句时应注意以下规则: 1. 窗口子句位于数据切分子句之后,不可以和 GROUP BY 子句一起使用。 -2. 窗口子句将数据按窗口进行切分,对每个窗口进行 SELECT 列表中的表达式的计算,SELECT 列表中的表达式只能包含:常量;伪列:_wstart 伪列、_wend 伪列和 _wduration 伪列;聚集函数(包括选择函数和可以由参数确定输出行数的时序特有函数) +2. 窗口子句将数据按窗口进行切分,对每个窗口进行 SELECT 列表中的表达式的计算,SELECT 列表中的表达式只能包含:常量;伪列:_wstart 伪列、_wend 伪列和 _wduration 伪列;聚合函数(包括选择函数和可以由参数确定输出行数的时序特有函数) 3. WHERE 语句可以指定查询的起止时间和其他过滤条件。 ### 时间戳伪列 diff --git a/docs/zh/06-advanced/02-cache.md b/docs/zh/06-advanced/02-cache.md index d1ad356c17..ca1da30dbf 100644 --- a/docs/zh/06-advanced/02-cache.md +++ b/docs/zh/06-advanced/02-cache.md @@ -16,12 +16,12 @@ TDengine 采用了一种创新的时间驱动缓存管理策略,亦称为写 为了实现数据的分布式存储和高可用性,TDengine 引入了虚拟节点(vnode)的概念。每个 vnode 可以拥有多达 3 个副本,这些副本共同组成一个 vnode group,简称 vgroup。在创建数据库时,用户需要确定每个 vnode 的写入缓存大小,以确保数据的合理分配和高效存储。 -创建数据库时的两个关键参数—vgroups 和 buffer—分别决定了数据库中的数据由多少个 vgroup 进行处理,以及为每个 vnode 分配多少写入缓存。通过合理配置这两个 +创建数据库时的两个关键参数 `vgroups` 和 `buffer` 分别决定了数据库中的数据由多少个 vgroup 进行处理,以及为每个 vnode 分配多少写入缓存。通过合理配置这两个 参数,用户可以根据实际需求调整数据库的性能和存储容量,从而实现最佳的性能和成本效益。 例 如, 下面的 SQL 创建了包含 10 个 vgroup,每个 vnode 占 用 256MB 内存的数据库。 -```ssql -create database power vgroups 10 buffer 256 cachemodel 'none' pages 128 pagesize 16 +```sql +CREATE DATABASE POWER VGROUPS 10 BUFFER 256 CACHEMODEL 'NONE' PAGES 128 PAGESIZE 16; ``` 缓存越大越好,但超过一定阈值后再增加缓存对写入性能提升并无帮助。 @@ -43,7 +43,7 @@ create database power vgroups 10 buffer 256 cachemodel 'none' pages 128 pagesize 为了提升查询和写入操作的效率,每个 vnode 都配备了缓存机制,用于存储其曾经获取过的元数据。这一元数据缓存的大小由创建数据库时的两个参数 pages 和 pagesize 共同决定。其中,pagesize 参数的单位是 KB,用于指定每个缓存页的大小。如下 SQL 会为数据库 power 的每个 vnode 创建 128 个 page、每个 page 16KB 的元数据缓存 ```sql -create database power pages 128 pagesize 16 +CREATE DATABASE POWER PAGES 128 PAGESIZE 16; ``` ## 文件系统缓存 @@ -57,7 +57,7 @@ TDengine 利用这些日志文件实现故障前的状态恢复。在写入 WAL - wal_fsync_period:当 wal_level 设置为 2 时,这个参数控制执行 fsync 的频率。设置为 0 表示每次写入后立即执行 fsync,这可以确保数据的安全性,但可能会牺牲一些性能。当设置为大于 0 的数值时,表示 fsync 周期,默认为 3000,范围是[1, 180000],单位毫秒。 ```sql -create database power wal_level 1 wal_fsync_period 3000 +CREATE DATABASE POWER WAL_LEVEL 1 WAL_FSYNC_PERIOD 3000; ``` 在创建数据库时可以选择不同的参数类型,来选择性能优先或者可靠性优先。 diff --git a/docs/zh/07-operation/02-planning.md b/docs/zh/07-operation/02-planning.md index ef2c7ce97c..66da1df8bf 100644 --- a/docs/zh/07-operation/02-planning.md +++ b/docs/zh/07-operation/02-planning.md @@ -63,7 +63,7 @@ M = (T × S × 3 + (N / 4096) + 100) TDengine 用户对 CPU 的需求主要受以下 3 个因素影响: - 数据分片:在 TDengine 中,每个 CPU 核心可以服务 1 至 2 个 vnode。假设一个集群配置了 100 个 vgroup,并且采用三副本策略,那么建议该集群的 CPU 核心数量为 150~300 个,以实现最佳性能。 -- 数据写入:TDengine 的单核每秒至少能处理 10 000 个写入请求。值得注意的是,每个写入请求可以包含多条记录,而且一次写入一条记录与同时写入 10 条记录相比,消耗的计算资源相差无几。因此,每次写入的记录数越多,写入效率越高。例如,如果一个写入请求包含 200 条以上记录,单核就能实现每秒写入 100 万条记录的速度。然而,这要求前端数据采集系统具备更高的能力,因为它需要缓存记录,然后批量写入。 +- 数据写入:TDengine 的单核每秒至少能处理 10,000 个写入请求。值得注意的是,每个写入请求可以包含多条记录,而且一次写入一条记录与同时写入 10 条记录相比,消耗的计算资源相差无几。因此,每次写入的记录数越多,写入效率越高。例如,如果一个写入请求包含 200 条以上记录,单核就能实现每秒写入 100 万条记录的速度。然而,这要求前端数据采集系统具备更高的能力,因为它需要缓存记录,然后批量写入。 - 查询需求:虽然 TDengine 提供了高效的查询功能,但由于每个应用场景的查询差异较大,且查询频次也会发生变化,因此很难给出一个具体的数字来衡量查询所需的计算资源。用户需要根据自己的实际场景编写一些查询语句,以便更准确地确定所需的计算资源。 综上所述,对于数据分片和数据写入,CPU 的需求是可以预估的。然而,查询需求所消耗的计算资源则难以预测。在实际运行过程中,建议保持 CPU 使用率不超过 50%,以确保系统的稳定性和性能。一旦 CPU 使用率超过这一阈值,就需要考虑增加新的节点或增加 CPU 核心数量,以提供更多的计算资源。 diff --git 
a/docs/zh/07-operation/03-deployment.md b/docs/zh/07-operation/03-deployment.md index c137f346e3..83b2c91843 100644 --- a/docs/zh/07-operation/03-deployment.md +++ b/docs/zh/07-operation/03-deployment.md @@ -22,7 +22,7 @@ taosd 是 TDengine 集群中最主要的服务组件,本节介绍手动部署 - 第 1 步,在每个物理节点上执行 hostname -f 命令,以查看并确认所有节点的hostname 是唯一的。对于应用程序驱动所在的节点,这一步骤可以省略。 - 第 2 步,在每个物理节点上执行 ping host 命令,其中 host 是其他物理节点的 hostname。这一步骤旨在检测当前节点与其他物理节点之间的网络连通性。如果发现无法 ping 通,请立即检查网络和 DNS 设置。对于 Linux 操作系统,请检查 /etc/hosts 文件;对于 Windows 操作系统,请检查C:\Windows\system32\drivers\etc\hosts 文件。网络不通畅将导致无法组建集群,请务必解决此问题。 -- 第 3 步,在应用程序运行的物理节点上重复上述网络监测步骤。如果发现网络不通畅,应用程序将无法连接到 taosd 服务。此时,请仔细检查应用程序所在物理节点的DNS 设置或 hosts 文件,确保其配置正确无误。 +- 第 3 步,在应用程序运行的物理节点上重复上述网络检测步骤。如果发现网络不通畅,应用程序将无法连接到 taosd 服务。此时,请仔细检查应用程序所在物理节点的DNS 设置或 hosts 文件,确保其配置正确无误。 - 第 4 步,检查端口,确保集群中所有主机在端口 6030 上的 TCP 能够互通。 通过以上步骤,你可以确保所有节点在网络层面顺利通信,从而为成功部署TDengine 集群奠定坚实基础 diff --git a/docs/zh/07-operation/14-user.md b/docs/zh/07-operation/14-user.md index f1d80f5246..03a838462f 100644 --- a/docs/zh/07-operation/14-user.md +++ b/docs/zh/07-operation/14-user.md @@ -76,7 +76,7 @@ drop user user_name 在 TDengine 中,库和表的权限分为 read (读)和 write (写)两种。这些权限可以单独授予,也可以同时授予用户。 - read 权限:拥有 read 权限的用户仅能查询库或表中的数据,而无法对数据进行修改或删除。这种权限适用于需要访问数据但不需要对数据进行写入操作的场景,如数据分析师、报表生成器等。 -- write 权限:拥有 write 权限的用户既可以查询库或表中的数据,也可以向库或表中写入数据。这种权限适用于需要对数据进行写入操作的场景,如数据采集器、数据处理器等。 +- write 权限:拥有 write 权限的用户可以向库或表中写入数据。这种权限适用于需要对数据进行写入操作的场景,如数据采集器、数据处理器等。如果只拥有 write 权限而没有 read 权限,则只能写入数据但不能查询数据。 对某个用户进行库和表访问授权的语法如下。 diff --git a/docs/zh/07-operation/16-security.md b/docs/zh/07-operation/16-security.md index cfe922d3df..4f47a644f7 100644 --- a/docs/zh/07-operation/16-security.md +++ b/docs/zh/07-operation/16-security.md @@ -18,13 +18,13 @@ alter user test add host host_name1 查询 IP 白名单的 SQL 如下。 ```sql -select test, allowed_host from ins_user_privileges; -show users; +SELECT TEST, ALLOWED_HOST FROM INS_USERS; +SHOW USERS; ``` 删除 IP 白名单的命令如下。 ```sql -alter user test drop host host_name1 +ALTER USER TEST DROP HOST HOST_NAME1 ``` ## 审计日志 From be4effd0c8e53a4250a3706ec1da2d960cc40095 Mon Sep 17 00:00:00 2001 From: wangjiaming0909 <604227650@qq.com> Date: Fri, 9 Aug 2024 10:25:04 +0800 Subject: [PATCH 071/103] fix test when using -R --- tests/system-test/2-query/tsma2.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/system-test/2-query/tsma2.py b/tests/system-test/2-query/tsma2.py index 6ea6e6c36f..404fb3f00c 100644 --- a/tests/system-test/2-query/tsma2.py +++ b/tests/system-test/2-query/tsma2.py @@ -830,11 +830,21 @@ class TDTestCase: ).ignore_res_order(sql_generator.can_ignore_res_order()).get_qc()) return ctxs + def test_query_interval(self): + sql = 'select count(*), _wstart, _wend from db.meters interval(1n) sliding(1d) limit 1' + tdSql.query(sql) + tdSql.checkData(0, 1, '2017-06-15 00:00:00') + sql = 'select /*+skip_tsma()*/count(*), _wstart, _wend from db.meters interval(1n) sliding(1d) limit 1' + tdSql.query(sql) + tdSql.checkData(0, 1, '2017-06-15 00:00:00') + def test_bigger_tsma_interval(self): db = 'db' tb = 'meters' func = ['max(c1)', 'min(c1)', 'min(c2)', 'max(c2)', 'avg(c1)', 'count(ts)'] self.init_data(db,10, 10000, 1500000000000, 11000000) + self.test_query_interval() + examples = [ ('10m', '1h', True), ('10m','1d',True), ('1m', '120s', True), ('1h','1d',True), ('12h', '1y', False), ('1h', '1n', True), ('1h', '1y', True), @@ -899,16 +909,6 @@ class TDTestCase: self.check(ctxs) - sql = 'select count(*), _wstart, _wend from db.meters interval(1n) sliding(1d) limit 1' - 
tdSql.query(sql) - first_win: datetime = tdSql.queryResult[0][1] - if first_win.hour != 0: - tdLog.exit("day sliding should always aligned with current timezone") - sql = 'select /*+skip_tsma()*/count(*), _wstart, _wend from db.meters interval(1n) sliding(1d) limit 1' - tdSql.query(sql) - first_win: datetime = tdSql.queryResult[0][1] - if first_win.hour != 0: - tdLog.exit("day sliding should always aligned with current timezone") def stop(self): tdSql.close() From bc14da28e5277b1fc373ca5cc506c5e6d1468db9 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 10:31:05 +0800 Subject: [PATCH 072/103] refactor: do some internal refactor. --- source/dnode/mnode/impl/src/mndStream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dnode/mnode/impl/src/mndStream.c b/source/dnode/mnode/impl/src/mndStream.c index dc8f494914..20f0e7b105 100644 --- a/source/dnode/mnode/impl/src/mndStream.c +++ b/source/dnode/mnode/impl/src/mndStream.c @@ -2420,7 +2420,7 @@ int32_t mndProcessStreamReqCheckpoint(SRpcMsg *pReq) { if (pStream != NULL) { // TODO:handle error code = mndProcessStreamCheckpointTrans(pMnode, pStream, checkpointId, 0, false); if (code) { - mError("failed to create checkpoint trans, code:%s", strerror(code)); + mError("failed to create checkpoint trans, code:%s", tstrerror(code)); } } else { // todo: wait for the create stream trans completed, and launch the checkpoint trans From 10acd19e714ece29a0d81e2250bf2195513259d4 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 10:36:10 +0800 Subject: [PATCH 073/103] refactor: do some internal refactor. --- source/dnode/mnode/impl/src/mndMain.c | 2 +- source/libs/stream/src/streamSched.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndMain.c b/source/dnode/mnode/impl/src/mndMain.c index 37a171e9a4..11787a015b 100644 --- a/source/dnode/mnode/impl/src/mndMain.c +++ b/source/dnode/mnode/impl/src/mndMain.c @@ -443,7 +443,7 @@ static int32_t mndInitTimer(SMnode *pMnode) { (void)taosThreadAttrInit(&thAttr); (void)taosThreadAttrSetDetachState(&thAttr, PTHREAD_CREATE_JOINABLE); if ((code = taosThreadCreate(&pMnode->thread, &thAttr, mndThreadFp, pMnode)) != 0) { - mError("failed to create timer thread since %s", strerror(errno)); + mError("failed to create timer thread since %s", tstrerror(code)); TAOS_RETURN(code); } diff --git a/source/libs/stream/src/streamSched.c b/source/libs/stream/src/streamSched.c index 6506d449a6..e8c7be5204 100644 --- a/source/libs/stream/src/streamSched.c +++ b/source/libs/stream/src/streamSched.c @@ -107,7 +107,7 @@ void streamTaskResumeHelper(void* param, void* tmrId) { int32_t code = streamTaskSchedTask(pTask->pMsgCb, pTask->info.nodeId, pId->streamId, pId->taskId, STREAM_EXEC_T_RESUME_TASK); int32_t ref = atomic_sub_fetch_32(&pTask->status.timerActive, 1); if (code) { - stError("s-task:%s sched task failed, code:%s, ref:%d", pId->idStr, strerror(code), ref); + stError("s-task:%s sched task failed, code:%s, ref:%d", pId->idStr, tstrerror(code), ref); } else { stDebug("trigger to resume s-task:%s after being idled for %dms, ref:%d", pId->idStr, pTask->status.schedIdleTime, ref); From 78a4a93d9c034d917d48ec2722c7be8f23aa7aa4 Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Fri, 9 Aug 2024 11:04:04 +0800 Subject: [PATCH 074/103] fix: handle error code --- source/dnode/vnode/src/meta/metaQuery.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/dnode/vnode/src/meta/metaQuery.c 
b/source/dnode/vnode/src/meta/metaQuery.c index 1d284aaf1f..27a4179172 100644 --- a/source/dnode/vnode/src/meta/metaQuery.c +++ b/source/dnode/vnode/src/meta/metaQuery.c @@ -315,12 +315,14 @@ int32_t metaTbCursorNext(SMTbCursor *pTbCur, ETableType jumpTableType) { for (;;) { ret = tdbTbcNext((TBC *)pTbCur->pDbc, &pTbCur->pKey, &pTbCur->kLen, &pTbCur->pVal, &pTbCur->vLen); if (ret < 0) { - return -1; + return ret; } tDecoderClear(&pTbCur->mr.coder); - (void)metaGetTableEntryByVersion(&pTbCur->mr, ((SUidIdxVal *)pTbCur->pVal)[0].version, *(tb_uid_t *)pTbCur->pKey); + ret = metaGetTableEntryByVersion(&pTbCur->mr, ((SUidIdxVal *)pTbCur->pVal)[0].version, *(tb_uid_t *)pTbCur->pKey); + if (ret) return ret; + if (pTbCur->mr.me.type == jumpTableType) { continue; } From b170feeb226798d57d6b3ae42cca24d2d79efb52 Mon Sep 17 00:00:00 2001 From: t_max <1172915550@qq.com> Date: Fri, 9 Aug 2024 11:29:37 +0800 Subject: [PATCH 075/103] docs: update java demo and grafana alert --- .../com/taos/example/WSConnectExample.java | 4 +- docs/zh/08-develop/01-connect/index.md | 12 +- docs/zh/08-develop/02-sql.md | 2 +- docs/zh/08-develop/04-schemaless.md | 4 +- docs/zh/14-reference/05-connector/14-java.mdx | 2 +- docs/zh/14-reference/05-connector/index.md | 49 ++--- .../20-third-party/03-visual/01-grafana.mdx | 184 ++++++++++-------- .../java/com/taosdata/example/HikariDemo.java | 2 +- 8 files changed, 139 insertions(+), 120 deletions(-) diff --git a/docs/examples/java/src/main/java/com/taos/example/WSConnectExample.java b/docs/examples/java/src/main/java/com/taos/example/WSConnectExample.java index d683cc64a6..f920e77037 100644 --- a/docs/examples/java/src/main/java/com/taos/example/WSConnectExample.java +++ b/docs/examples/java/src/main/java/com/taos/example/WSConnectExample.java @@ -11,9 +11,9 @@ public class WSConnectExample { // ANCHOR: main public static void main(String[] args) throws SQLException { // use - // String jdbcUrl = "jdbc:TAOS-RS://localhost:6041/dbName?user=root&password=taosdata"; + // String jdbcUrl = "jdbc:TAOS-RS://localhost:6041/dbName?user=root&password=taosdata&batchfetch=true"; // if you want to connect a specified database named "dbName". 
- String jdbcUrl = "jdbc:TAOS-RS://localhost:6041?user=root&password=taosdata"; + String jdbcUrl = "jdbc:TAOS-RS://localhost:6041?user=root&password=taosdata&batchfetch=true"; Properties connProps = new Properties(); connProps.setProperty(TSDBDriver.PROPERTY_KEY_BATCH_LOAD, "true"); connProps.setProperty(TSDBDriver.PROPERTY_KEY_ENABLE_AUTO_RECONNECT, "true"); diff --git a/docs/zh/08-develop/01-connect/index.md b/docs/zh/08-develop/01-connect/index.md index 2e0f284eeb..8e279e586e 100644 --- a/docs/zh/08-develop/01-connect/index.md +++ b/docs/zh/08-develop/01-connect/index.md @@ -259,17 +259,19 @@ dotnet add package TDengine.Connector ## 建立连接 在执行这一步之前,请确保有一个正在运行的,且可以访问到的 TDengine,而且服务端的 FQDN 配置正确。以下示例代码,都假设 TDengine 安装在本机,且 FQDN(默认 localhost) 和 serverPort(默认 6030) 都使用默认配置。 + ### 连接参数 连接的配置项较多,因此在建立连接之前,我们能先介绍一下各语言连接器建立连接使用的参数。 + Java 连接器建立连接的参数有 URL 和 Properties。 + TDengine 的 JDBC URL 规范格式为: + `jdbc:[TAOS|TAOS-RS]://[host_name]:[port]/[database_name]?[user={user}|&password={password}|&charset={charset}|&cfgdir={config_dir}|&locale={locale}|&timezone={timezone}|&batchfetch={batchfetch}]` -Java 连接器建立连接的参数有 URL 和 Properties。 -TDengine 的 JDBC URL 规范格式为: -`jdbc:[TAOS|TAOS-RS]://[host_name]:[port]/[database_name]?[user={user}|&password={password}|&charset={charset}|&cfgdir={config_dir}|&locale={locale}|&timezone={timezone}]` + URL 和 Properties 的详细参数说明和如何使用详见 [url 规范](../../reference/connector/java/#url-规范) -URL 和 Properties 的详细参数说明和如何使用详见 [url 规范](../../reference/connector/java/#url-规范) + **注**:REST 连接中增加 `batchfetch` 参数并设置为 true,将开启 WebSocket 连接。 @@ -343,7 +345,7 @@ DSN 的详细说明和如何使用详见 [连接功能](../../reference/connecto - **database**: 数据库名称。 - **params**: 其他参数。 例如token。 - - 完整 D 示例: + - 完整 DSN 示例: ```js ws://root:taosdata@localhost:6041 diff --git a/docs/zh/08-develop/02-sql.md b/docs/zh/08-develop/02-sql.md index 18d1072ce9..3bebe5e7a4 100644 --- a/docs/zh/08-develop/02-sql.md +++ b/docs/zh/08-develop/02-sql.md @@ -90,7 +90,7 @@ curl --location -uroot:taosdata 'http://127.0.0.1:6041/rest/sql/power' \ -> **注意**:如果不使用 `USE power` 指定数据库,则后续对表的操作都需要增加数据库名称作为前缀,如 `power.meters`。 +> **注意**:建议采用 `.` 的格式构造SQL语句,不推荐在应用中采用 `USE DBName`方式访问。 ## 插入数据 下面以智能电表为例,展示如何使用连接器执行 SQL 来插入数据到 `power` 数据库的 `meters` 超级表。样例使用 TDengine 自动建表 SQL 语法,写入 d1001 子表中 3 条数据,写入 d1002 子表中 1 条数据,然后打印出实际插入数据条数。 diff --git a/docs/zh/08-develop/04-schemaless.md b/docs/zh/08-develop/04-schemaless.md index 2c5279f1d6..1bc750c3cb 100644 --- a/docs/zh/08-develop/04-schemaless.md +++ b/docs/zh/08-develop/04-schemaless.md @@ -169,7 +169,7 @@ st,t1=3,t2=4,t3=t3 c1=3i64,c6="passit" 1626006833640000000 ``` -执行带有 reqId 的无模式写入,此 reqId 可用于请求链路追踪。 +执行带有 reqId 的无模式写入,最后一个参数 reqId 可用于请求链路追踪。 ```java writer.write(lineDemo, SchemalessProtocolType.LINE, SchemalessTimestampType.NANO_SECONDS, 1L); @@ -213,7 +213,7 @@ writer.write(lineDemo, SchemalessProtocolType.LINE, SchemalessTimestampType.NANO {{#include examples/JDBC/JDBCDemo/src/main/java/com/taosdata/example/SchemalessJniTest.java:schemaless}} ``` -执行带有 reqId 的无模式写入,此 reqId 可用于请求链路追踪。 +执行带有 reqId 的无模式写入,最后一个参数 reqId 可用于请求链路追踪。 ```java writer.write(lineDemo, SchemalessProtocolType.LINE, SchemalessTimestampType.NANO_SECONDS, 1L); diff --git a/docs/zh/14-reference/05-connector/14-java.mdx b/docs/zh/14-reference/05-connector/14-java.mdx index 8d19745668..0dc85feb13 100644 --- a/docs/zh/14-reference/05-connector/14-java.mdx +++ b/docs/zh/14-reference/05-connector/14-java.mdx @@ -199,7 +199,7 @@ Websocket 和 REST 连接使用驱动类 `com.taosdata.jdbc.rs.RestfulDriver`。 #### URL 规范 TDengine 的 JDBC URL 规范格式为: 
-`jdbc:[TAOS|TAOS-RS]://[host_name]:[port]/[database_name]?[user={user}|&password={password}|&charset={charset}|&cfgdir={config_dir}|&locale={locale}|&timezone={timezone}]` +`jdbc:[TAOS|TAOS-RS]://[host_name]:[port]/[database_name]?[user={user}|&password={password}|&charset={charset}|&cfgdir={config_dir}|&locale={locale}|&timezone={timezone}|&batchfetch={batchfetch}]` 对于建立连接,原生连接与 REST 连接有细微不同。 Websocket 和 REST 连接使用驱动类 `com.taosdata.jdbc.rs.RestfulDriver`。原生连接使用驱动类 `com.taosdata.jdbc.TSDBDriver`。 diff --git a/docs/zh/14-reference/05-connector/index.md b/docs/zh/14-reference/05-connector/index.md index be25521a30..bc63bdff93 100644 --- a/docs/zh/14-reference/05-connector/index.md +++ b/docs/zh/14-reference/05-connector/index.md @@ -13,7 +13,7 @@ TDengine 提供了丰富的应用程序开发接口,为了便于用户快速 目前 TDengine 的原生接口连接器可支持的平台包括:X64/ARM64 等硬件平台,以及 Linux/Win64 等开发环境。对照矩阵如下: | **CPU** | **OS** | **Java** | **Python** | **Go** | **Node.js** | **C#** | **Rust** | C/C++ | -| ------------- | --------- | -------- | ---------- | ------ | ----------- | ------ | -------- | ----- | +|---------------|-----------|----------|------------|--------|-------------|--------|----------|-------| | **X86 64bit** | **Linux** | ● | ● | ● | ● | ● | ● | ● | | **X86 64bit** | **Win64** | ● | ● | ● | ● | ● | ● | ● | | **X86 64bit** | **macOS** | ● | ● | ● | ○ | ○ | ● | ● | @@ -28,14 +28,14 @@ TDengine 提供了丰富的应用程序开发接口,为了便于用户快速 TDengine 版本更新往往会增加新的功能特性,列表中的连接器版本为连接器最佳适配版本。 -| **TDengine 版本** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | -| ---------------------- | ------------- | ------------------------------------------- | ------------ | ------------- | --------------- | -------- | -| **3.3.0.0 及以上** | 3.3.2.0及以上 | taospy 2.7.15及以上,taos-ws-py 0.3.2及以上 | 3.5.5及以上 | 3.1.3及以上 | 3.1.0及以上 | 当前版本 | -| **3.0.0.0 及以上** | 3.0.2以上 | 当前版本 | 3.0 分支 | 3.0.0 | 3.1.0 | 当前版本 | -| **2.4.0.14 及以上** | 2.0.38 | 当前版本 | develop 分支 | 1.0.2 - 1.0.6 | 2.0.10 - 2.0.12 | 当前版本 | -| **2.4.0.4 - 2.4.0.13** | 2.0.37 | 当前版本 | develop 分支 | 1.0.2 - 1.0.6 | 2.0.10 - 2.0.12 | 当前版本 | -| **2.2.x.x ** | 2.0.36 | 当前版本 | master 分支 | n/a | 2.0.7 - 2.0.9 | 当前版本 | -| **2.0.x.x ** | 2.0.34 | 当前版本 | master 分支 | n/a | 2.0.1 - 2.0.6 | 当前版本 | +| **TDengine 版本** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | +|------------------------|----------|--------------------------------------|------------|---------------|-----------------|----------| +| **3.3.0.0 及以上** | 3.3.0及以上 | taospy 2.7.15及以上,taos-ws-py 0.3.2及以上 | 3.5.5及以上 | 3.1.3及以上 | 3.1.0及以上 | 当前版本 | +| **3.0.0.0 及以上** | 3.0.2以上 | 当前版本 | 3.0 分支 | 3.0.0 | 3.1.0 | 当前版本 | +| **2.4.0.14 及以上** | 2.0.38 | 当前版本 | develop 分支 | 1.0.2 - 1.0.6 | 2.0.10 - 2.0.12 | 当前版本 | +| **2.4.0.4 - 2.4.0.13** | 2.0.37 | 当前版本 | develop 分支 | 1.0.2 - 1.0.6 | 2.0.10 - 2.0.12 | 当前版本 | +| **2.2.x.x ** | 2.0.36 | 当前版本 | master 分支 | n/a | 2.0.7 - 2.0.9 | 当前版本 | +| **2.0.x.x ** | 2.0.34 | 当前版本 | master 分支 | n/a | 2.0.1 - 2.0.6 | 当前版本 | ## 功能特性 @@ -43,13 +43,13 @@ TDengine 版本更新往往会增加新的功能特性,列表中的连接器 ### 使用原生接口(taosc) -| **功能特性** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | -| ------------------- | -------- | ---------- | ------ | ------ | ----------- | -------- | -| **连接管理** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | -| **普通查询** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | -| **参数绑定** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | -| **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | -| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | +| **功能特性** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | 
+|----------------|----------|------------|--------|--------|-------------|----------| +| **连接管理** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | +| **普通查询** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | +| **参数绑定** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | +| **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | +| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | :::info 由于不同编程语言数据库框架规范不同,并不意味着所有 C/C++ 接口都需要对应封装支持。 @@ -57,17 +57,18 @@ TDengine 版本更新往往会增加新的功能特性,列表中的连接器 ### 使用 http (REST 或 WebSocket) 接口 -| **功能特性** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | -| ------------------------------ | -------- | ---------- | ------ | ------ | ----------- | -------- | -| **连接管理** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **普通查询** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **参数绑定** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **批量拉取(基于 WebSocket)** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **功能特性** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | +|------------------------|----------|------------|--------|--------|-------------|----------| +| **连接管理** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **普通查询** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **参数绑定** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **批量拉取(基于 WebSocket)** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | :::warning +- 参数绑定、数据订阅、Schemaless 和批量拉取功能仅在 WebSocket 连接下支持。 - 无论选用何种编程语言的连接器,2.0 及以上版本的 TDengine 推荐数据库应用的每个线程都建立一个独立的连接,或基于线程建立连接池,以避免连接内的“USE statement”状态量在线程之间相互干扰(但连接的查询和写入操作都是线程安全的)。 ::: diff --git a/docs/zh/20-third-party/03-visual/01-grafana.mdx b/docs/zh/20-third-party/03-visual/01-grafana.mdx index 9d12e1db04..860087b901 100644 --- a/docs/zh/20-third-party/03-visual/01-grafana.mdx +++ b/docs/zh/20-third-party/03-visual/01-grafana.mdx @@ -22,8 +22,6 @@ TDengine 能够与开源数据可视化系统 [Grafana](https://www.grafana.com/ - TDengine 集群已经部署并正常运行。 - taosAdapter 已经安装并正常运行。具体细节请参考 [taosAdapter 的使用手册](../../../reference/components/taosadapter) - - 记录以下信息: - TDengine 集群 REST API 地址,如:`http://tdengine.local:6041`。 @@ -173,7 +171,6 @@ docker run -d \ - :::info 下文介绍中,都以 Grafana v11.0.0 版本为例,其他版本功能可能有差异,请参考 [Grafana 官网](https://grafana.com/docs/grafana/latest/)。 @@ -181,6 +178,7 @@ docker run -d \ ::: ## 内置变量和自定义变量 + Grafana 中的 Variable(变量)功能非常强大,可以在 Dashboard 的查询、面板标题、标签等地方使用,用来创建更加动态和交互式的 Dashbord,提高用户体验和效率。 变量的主要作用和特点包括: @@ -191,26 +189,30 @@ Grafana 中的 Variable(变量)功能非常强大,可以在 Dashboard 的 - 灵活的配置选项:变量提供了多种配置选项,如预定义的静态值列表、从数据源动态查询值、正则表达式过滤等,使得变量的应用更加灵活和强大。 - Grafana 提供了内置变量和自定义变量,它们都可以可以在编写 SQL 时引用,引用的方式是 `$variableName`,`variableName` 是变量的名字,其他引用方式请参考 [引用方式](https://grafana.com/docs/grafana/latest/dashboards/variables/variable-syntax/)。 ### 内置变量 + Grafana 内置了 `from`、`to` 和 `interval` 等变量,都取自于 Grafana 插件面板。其含义如下: - `from` 查询范围的起始时间 - `to` 查询范围的结束时间 - `interval` 窗口切分间隔 -对于每个查询都建议设置查询范围的起始时间和结束时间,可以有效的减少 TDengine 服务端执行查询扫描的数据量。`interval` 是窗口切分的大小,在 Grafana 11 版本中,其大小为时间间隔和返回点数计算而得。 +对于每个查询都建议设置查询范围的起始时间和结束时间,可以有效的减少 TDengine 服务端执行查询扫描的数据量。`interval` 是窗口切分的大小,在 Grafana 11 版本中,其大小为时间间隔和返回点数计算而得。 + 除了上述三个常用变量,Grafana 还提供了如 `__timezone`, `__org`, `__user` 等变量,详情请参考 [内置变量](https://grafana.com/docs/grafana/latest/dashboards/variables/add-template-variables/#global-variables)。 ### 自定义变量 + 我们可以在 Dashbord 中增加自定义变量。自定义变量和内置变量的使用方式没有区别,都是在 SQL 中用 `$variableName` 进行引用。 自定义变量支持多种类型,常见的类型包括 `Query`(查询)、`Constant`(常量)、`Interval`(间隔)、`Data source`(数据源)等。 自定义变量可以引用其他自定义变量,比如一个变量表示区域,另一个变量可以引用区域的值,来查询这个区域的设备。 -#### 添加查询类型变量 + +#### 添加查询类型变量 + 在 
Dashbord 的配置中,选择 【Variables】,然后点击 【New variable】: -1. 在 “Name“ 字段中,输入你的变量名,此处我们设置变量名为 `selected_groups`。 -2. 在 【Select variable type】下拉菜单中,选择 ”Query“(查询)。 +1. 在 “Name” 字段中,输入你的变量名,此处我们设置变量名为 `selected_groups`。 +2. 在 【Select variable type】下拉菜单中,选择 “Query”(查询)。 根据选择的变量类型,配置相应的选项。例如,如果选择了 “Query” 类型,你需要指定数据源和用于获取变量值的查询语句。此处我们还以智能电表为例,设置查询类型,选择数据源后,配置 SQL 为 `select distinct(groupid) from power.meters where groupid < 3 and ts > $from and ts < $to;` 3. 点击底部的【Run Query】后,可以在 “Preview of values”(值预览)部分,查看到根据你的配置生成的变量值。 4. 还有其他配置不再赘述,完成配置后,点击页面底部的【Apply】(应用)按钮,然后点击右上角的【Save dashboard】保存。 @@ -219,9 +221,10 @@ Grafana 内置了 `from`、`to` 和 `interval` 等变量,都取自于 Grafana 我们还可以再新增自定义变量来引用这个 `selected_groups` 变量,比如我们新增一个名为 `tbname_max_current` 的查询变量,其 SQL 为 `select tbname from power.meters where groupid = $selected_groups and ts > $from and ts < $to;` -#### 添加间隔类型变量 +#### 添加间隔类型变量 + 我们可以自定义时间窗口间隔,可以更加贴合业务需求。 -1. 在 “Name“ 字段中,输入变量名为 `interval`。 +1. 在 “Name” 字段中,输入变量名为 `interval`。 2. 在 【Select variable type】下拉菜单中,选择 “Interval”(间隔)。 3. 在 【Interval options】选项中输入 `1s,2s,5s,10s,15s,30s,1m`。 4. 还有其他配置不再赘述,完成配置后,点击页面底部的【Apply】(应用)按钮,然后点击右上角的【Save dashboard】保存。 @@ -235,6 +238,7 @@ Grafana 内置了 `from`、`to` 和 `interval` 等变量,都取自于 Grafana ::: ## TDengine 时间序列查询支持 + TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序业务场景需求的特色查询语法,这些语法能够为时序场景的应用的开发带来极大的便利。 - `partition by` 子句可以按一定的维度对数据进行切分,然后在切分出的数据空间内再进行一系列的计算。绝大多数情况可以替代 `group by`。 - `interval` 子句用于产生相等时间周期的窗口 @@ -249,13 +253,14 @@ TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序 ![TDengine Database Grafana plugin create dashboard](./create_dashboard1.webp) -如上图所示,在 ”Query“ 中选中 `TDengine` 数据源,在下方查询框可输入相应 SQL 进行查询。 我们继续用智能电表来举例,为了展示曲线美观,此处**用了虚拟数据**。 +如上图所示,在 “Query” 中选中 `TDengine` 数据源,在下方查询框可输入相应 SQL 进行查询。 我们继续用智能电表来举例,为了展示曲线美观,此处**用了虚拟数据**。 ## 时间序列数据展示 + 假设我们想查询一段时间内的平均电流大小,时间窗口按 `$interval` 切分,若某一时间窗口区间数据缺失,填充 null。 -- “INPUT SQL“:输入要查询的语句(该 SQL 语句的结果集应为两列多行),此处输入:`select _wstart as ts, avg(current) as current from power.meters where groupid in ($selected_groups) and ts > $from and ts < $to interval($interval) fill(null)` ,其中,from、to 和 interval 为 Grafana 内置变量,selected_groups 为自定义变量。 -- “ALIAS BY“:可设置当前查询别名。 -- “GENERATE SQL“: 点击该按钮会自动替换相应变量,并生成最终执行的语句。 +- “INPUT SQL”:输入要查询的语句(该 SQL 语句的结果集应为两列多行),此处输入:`select _wstart as ts, avg(current) as current from power.meters where groupid in ($selected_groups) and ts > $from and ts < $to interval($interval) fill(null)` ,其中,from、to 和 interval 为 Grafana 内置变量,selected_groups 为自定义变量。 +- “ALIAS BY”:可设置当前查询别名。 +- “GENERATE SQL”: 点击该按钮会自动替换相应变量,并生成最终执行的语句。 在顶部的自定义变量中,若选择 `selected_groups` 的值为 1,则查询 `meters` 超级表中 `groupid` 为 1 的所有设备电流平均值变化如下图: @@ -268,10 +273,11 @@ TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序 ::: ## 时间序列数据分组展示 + 假设我们想查询一段时间内的平均电流大小,按 `groupid` 分组展示,我们可以修改之前的 SQL 为 `select _wstart as ts, groupid, avg(current) as current from power.meters where ts > $from and ts < $to partition by groupid interval($interval) fill(null)` -- “Group by column(s)“: **半角**逗号分隔的 `group by` 或 `partition by` 列名。如果是 `group by` 或 `partition by` 查询语句,设置 “Group by“ 列,可以展示多维数据。此处设置 “Group by“ 列名为 `groupid`,可以按 `groupid` 分组展示数据。 -- “Group By Format“: `Group by` 或 `Partition by` 场景下多维数据 legend 格式化格式。例如上述 INPUT SQL,将 “Group By Format“ 设置为 `groupid-{{groupid}}`,展示的 legend 名字为格式化的分组名。 +- “Group by column(s)”: **半角**逗号分隔的 `group by` 或 `partition by` 列名。如果是 `group by` 或 `partition by` 查询语句,设置 “Group by” 列,可以展示多维数据。此处设置 “Group by” 列名为 `groupid`,可以按 `groupid` 分组展示数据。 +- “Group By Format”: `Group by` 或 `Partition by` 场景下多维数据 legend 格式化格式。例如上述 INPUT SQL,将 “Group By Format” 设置为 
`groupid-{{groupid}}`,展示的 legend 名字为格式化的分组名。 完成设置后,按照 `groupid` 分组展示如下图: @@ -280,10 +286,10 @@ TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序 > 关于如何使用 Grafana 创建相应的监测界面以及更多有关使用 Grafana 的信息,请参考 Grafana 官方的[文档](https://grafana.com/docs/)。 ## 性能建议 + - **所有查询加上时间范围**,在时序数据库中,如果不加查询的时间范围,会扫表导致性能低下。常见的 SQL 写法是 `select column_name from db.table where ts > $from and ts < $to;` - 对于最新状态类型的查询,我们一般建议在**创建数据库的时候打开缓存**(`CACHEMODEL` 设置为 last_row 或者 both),常见的 SQL 写法是 `select last(column_name) from db.table where ts > $from and ts < $to;` - ## 导入 Dashboard 在数据源配置页面,您可以为该数据源导入 TDinsight 面板,作为 TDengine 集群的监控可视化工具。如果 TDengine 服务端为 3.0 版本请选择 `TDinsight for 3.x` 导入。注意 TDinsight for 3.x 需要运行和配置 taoskeeper。 @@ -299,30 +305,36 @@ TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序 - [15167](https://grafana.com/grafana/dashboards/15167): TDinsight - [16388](https://grafana.com/grafana/dashboards/16388): Telegraf 采集节点信息的数据展示 -## 告警配置简介 -### 告警配置流程 +## 告警配置 + TDengine Grafana 插件支持告警,如果要配置告警,需要以下几个步骤: -1. 配置联络点(“Contact points“):配置通知渠道,包括 DingDing、Email、Slack、WebHook、Prometheus Alertmanager 等 -2. 配置告警通知策略(“Notification policies“):配置告警发送到哪个通道的路由,以及发送通知的时间和重复频率 -3. 配置 “Alert rules“:配置详细的告警规则 +1. 配置联络点(“Contact points”):配置通知渠道,包括 DingDing、Email、Slack、WebHook、Prometheus Alertmanager 等 +2. 配置告警通知策略(“Notification policies”):配置告警发送到哪个通道的路由,以及发送通知的时间和重复频率 +3. 配置告警规则(“Alert rules”):配置详细的告警规则 3.1 配置告警名称 - 3.2 配置查询及告警触发条件 + 3.2 配置查询和告警触发条件 3.3 配置规则评估策略 3.4 配置标签和告警通道 3.5 配置通知文案 ### 告警配置界面 -在Grafana 11 告警界面一共有 6 个 Tab,分别是 “Alert rules“、“Contact points“、“Notification policies“、“Silences“、 “Groups“ 和 “Settings“。 -- “Alert rules“ 告警规则列表,用于展示和配置告警规则 -- “Contact points“ 通知渠道,包括 DingDing、Email、Slack、WebHook、Prometheus Alertmanager 等 -- “Notification policies“ 配置告警发送到哪个通道的路由,以及发送通知的时间和重复频率 -- “Silences“ 配置告警静默时间段 -- “Groups“ 告警组,配置的告警触发后会在这里分组显示 -- “Settings“ 提供通过 JSON 方式修改告警配置 -## 配置邮件联络点 -### Grafana Server 配置文件修改 -在 Grafana 服务的配置文件中添加 SMTP/Emailing 和 Alerting 模块,以 Linux 系统为例,其配置文件一般位于 `/etc/grafana/grafana.ini` +在Grafana 11 告警界面一共有 6 个 Tab,分别是 “Alert rules”、“Contact points”、“Notification policies”、“Silences”、 “Groups” 和 “Settings”。 +- “Alert rules” 告警规则列表,用于展示和配置告警规则 +- “Contact points” 通知渠道,包括 DingDing、Email、Slack、WebHook、Prometheus Alertmanager 等 +- “Notification policies” 配置告警发送到哪个通道的路由,以及发送通知的时间和重复频率 +- “Silences” 配置告警静默时间段 +- “Groups” 告警组,配置的告警触发后会在这里分组显示 +- “Settings” 提供通过 JSON 方式修改告警配置 + +### 配置联络点 + +以邮件和飞书为例配置联络点。 + +#### 配置邮件联络点 + +在 Grafana 服务的配置文件中添加 SMTP/Emailing 和 Alerting 模块。(以 Linux 系统为例,其配置文件一般位于 `/etc/grafana/grafana.ini`) + 在配置文件中增加下面内容: ```ini @@ -336,106 +348,110 @@ skip_verify = true from_address = sender@foxmail.com ``` -然后重启 Grafana 服务即可, 以 Linux 系统为例,执行 `systemctl restart grafana-server.service` +然后重启 Grafana 服务(以 Linux 系统为例,执行 `systemctl restart grafana-server.service`)即可添加完成 -### Grafana 页面创建新联络点 - -在 Grafana 页面找到 “Home“ -> “Alerting“ -> “Contact points“,创建新联络点 -”Name“: Email Contact Point -“Integration“:选择联络类型,这里选择 Email,填写邮件接收地址,完成后保存联络点 +在 Grafana 页面找到 “Home” -> “Alerting” -> “Contact points”,创建新联络点 +“Name”: Email Contact Point +“Integration”:选择联络类型,这里选择 Email,填写邮件接收地址,完成后保存联络点 ![TDengine Database Grafana plugin alert email](./alert-email.webp) -## 配置飞书联络点 +#### 配置飞书联络点 -### 飞书机器人配置 -1. “飞书工作台“ -> “获取应用“ -> “搜索飞书机器人助手“ -> “新建指令“ +按照以下步骤配置飞书机器人: + +1. “飞书工作台” -> “获取应用” -> “搜索飞书机器人助手” -> “新建指令” 2. 选择触发器:Grafana 3. 
选择操作:通过官方机器人发送消息,填写发送对象和发送内容 ![TDengine Database Grafana plugin feishu robot](./alert-feishu1.webp) -### Grafana 配置飞书联络点 - -在 Grafana 页面找到 “Home“ -> “Alerting“ -> “Contact points“ 创建新联络点 -“Name“:Feishu Contact Point -“Integration“:选择联络类型,这里选择 Webhook,并填写 URL (在飞书机器人助手的 Grafana 触发器 Webhook 地址),完成后保存联络点 +在 Grafana 页面找到 “Home” -> “Alerting” -> “Contact points” 创建新联络点 +“Name”:Feishu Contact Point +“Integration”:选择联络类型,这里选择 Webhook,并填写 URL (在飞书机器人助手的 Grafana 触发器 Webhook 地址),完成后保存联络点 ![TDengine Database Grafana plugin feishu contact point](./alert-feishu2.webp) -## 通知策略 +### 配置告警通知策略 + 配置好联络点后,可以看到已有一个Default Policy ![TDengine Database Grafana plugin Notification default policy](./alert-notification1.webp) -点击右侧 ”...“ -> ”Edit“,然后编辑默认通知策略,弹出配置窗口: +点击右侧 “...” -> ”Edit”,然后编辑默认通知策略,弹出配置窗口: ![TDengine Database Grafana plugin Notification](./alert-notification2.webp) 然后配置下列参数: -- “Group wait“: 发送首次告警之前的等待时间。 -- “Group interval“: 发送第一个告警后,为该组发送下一批新告警的等待时间。 -- “Repeat interval“: 成功发送告警后再次重复发送告警的等待时间。 +- “Group wait”: 发送首次告警之前的等待时间。 +- “Group interval”: 发送第一个告警后,为该组发送下一批新告警的等待时间。 +- “Repeat interval”: 成功发送告警后再次重复发送告警的等待时间。 -## 配置告警规则 +### 配置告警规则 -### 配置查询和告警触发条件 +以配置智能电表告警为例,告警规则的配置主要包括告警名称、查询和告警触发条件、规则评估策略、标签和告警通道、通知文案。 -在需要配置告警的面板中选择 “Edit“ -> “Alert“ -> “New alert rule“。 +#### 配置告警名称 + +在需要配置告警的面板中选择 “Edit” -> “Alert” -> “New alert rule”。 + +“Enter alert rule name“ (输入告警规则名称):此处以智能电表为例输入 `power meters alert` + +#### 配置查询和告警触发条件 + +在 “Define query and alert condition” (定义查询和告警触发条件) 中配置告警规则。 +1. 选择数据源:`TDengine Datasource` +2. 查询语句: -1. “Enter alert rule name“ (输入告警规则名称):此处以智能电表为例输入 `power meters alert` -2. “Define query and alert condition“ (定义查询和告警触发条件) - 2.1 选择数据源:`TDengine Datasource` - 2.2 查询语句: ```sql select _wstart as ts, groupid, avg(current) as current from power.meters where ts > $from and ts < $to partition by groupid interval($interval) fill(null) ``` - 2.3 设置 ”Expression“(表达式):`Threshold is above 100` - 2.4 点击【Set as alert condition】 - 2.5 “Preview“:查看设置的规则的结果 + +3. 设置 “Expression”(表达式):`Threshold is above 100` +4. 点击【Set as alert condition】 +5. “Preview”:查看设置的规则的结果 完成设置后可以看到下面图片展示: ![TDengine Database Grafana plugin Alert Rules](./alert-rules1.webp) -### 配置表达式和计算规则 - -Grafana 的 “Expression“(表达式)支持对数据做各种操作和计算,其类型分为: -1. “Reduce“:将所选时间范围内的时间序列值聚合为单个值 - 1.1 “Function“ 用来设置聚合方法,支持 Min、Max、Last、Mean、Sum 和 Count。 - 1.2 “Mode“ 支持下面三种: - - “Strict“:如果查询不到数据,数据会赋值为 NaN。 - - “Drop Non-numeric Value“:去掉非法数据结果。 - - “Replace Non-numeric Value“:如果是非法数据,使用固定值进行替换。 -2. “Threshold“:检查时间序列数据是否符合阈值判断条件。当条件为假时返回 0,为真则返回1。支持下列方式: +Grafana 的 “Expression”(表达式)支持对数据做各种操作和计算,其类型分为: +1. “Reduce”:将所选时间范围内的时间序列值聚合为单个值 + 1.1 “Function” 用来设置聚合方法,支持 Min、Max、Last、Mean、Sum 和 Count。 + 1.2 “Mode” 支持下面三种: + - “Strict”:如果查询不到数据,数据会赋值为 NaN。 + - “Drop Non-numeric Value”:去掉非法数据结果。 + - “Replace Non-numeric Value”:如果是非法数据,使用固定值进行替换。 +2. “Threshold”:检查时间序列数据是否符合阈值判断条件。当条件为假时返回 0,为真则返回1。支持下列方式: - Is above (x > y) - Is below (x < y) - Is within range (x > y1 AND x < y2) - Is outside range (x < y1 AND x > y2) -3. “Math“:对时间序列的数据进行数学运算。 -4. “Resample“:更改每个时间序列中的时间戳使其具有一致的时间间隔,以便在它们之间执行数学运算。 -5. “Classic condition (legacy)“: 可配置多个逻辑条件,判断是否触发告警。 +3. “Math”:对时间序列的数据进行数学运算。 +4. “Resample”:更改每个时间序列中的时间戳使其具有一致的时间间隔,以便在它们之间执行数学运算。 +5. 
“Classic condition (legacy)”: 可配置多个逻辑条件,判断是否触发告警。 如上节截图显示,此处我们设置最大值超过 100 触发告警。 -### 配置评估策略 +#### 配置规则评估策略 ![TDengine Database Grafana plugin Alert Evaluation Behavior](./alert-evaluation.webp) 完成下面配置: -- “Folder“:设置告警规则所属目录。 -- “Evaluation group“:设置告警规则评估组。“Evaluation group“ 可以选择已有组或者新建组,新建组可以设置组名和评估时间间隔。 -- “Pending period“:在告警规则的阈值被触发后,异常值持续多长时间可以触发告警,合理设置可以避免误报。 +- “Folder”:设置告警规则所属目录。 +- “Evaluation group”:设置告警规则评估组。“Evaluation group” 可以选择已有组或者新建组,新建组可以设置组名和评估时间间隔。 +- “Pending period”:在告警规则的阈值被触发后,异常值持续多长时间可以触发告警,合理设置可以避免误报。 + +#### 配置标签和告警通道 -### 配置标签和告警通道 ![TDengine Database Grafana plugin Alert Labels and Notifications](./alert-labels.webp) 完成下面配置: -- “Labels“ 将标签添加到规则中,以便进行搜索、静默或路由到通知策略。 -- “Contact point“ 选择联络点,当告警发生时通过设置的联络点进行通知。 +- “Labels” 将标签添加到规则中,以便进行搜索、静默或路由到通知策略。 +- “Contact point” 选择联络点,当告警发生时通过设置的联络点进行通知。 -### 配置通知文案 +#### 配置通知文案 ![TDengine Database Grafana plugin Alert Labels and Notifications](./alert-annotations.webp) -设置 “Summary” 和 ”Description” 后,若告警触发,将会收到告警通知。 +设置 “Summary” 和 “Description” 后,若告警触发,将会收到告警通知。 diff --git a/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/HikariDemo.java b/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/HikariDemo.java index 79f636c428..4480cbc7c4 100644 --- a/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/HikariDemo.java +++ b/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/HikariDemo.java @@ -21,7 +21,7 @@ public static void main(String[] args) throws SQLException { config.setConnectionTimeout(30000); //maximum wait milliseconds for get connection from pool config.setMaxLifetime(0); // maximum life time for each connection config.setIdleTimeout(0); // max idle time for recycle idle connection - config.setConnectionTestQuery("SELECT SERVER_STATUS()"); //validation query + config.setConnectionTestQuery("SELECT SERVER_VERSION()"); //validation query HikariDataSource ds = new HikariDataSource(config); //create datasource From 71b8f67ea681263212b5b4cd10ea687efafc2b3a Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 11:37:34 +0800 Subject: [PATCH 076/103] refactor: do some internal refactor. 
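
The mndStreamUtil.c hunk below guards against reading element 0 of an empty per-stream task list while iterating the checkpoint-report hash. A minimal sketch of the same guard, assuming only the SArray helpers already used in that file; the helper name `firstOf` is illustrative and not part of the tree:

```c
// Illustrative sketch: return NULL instead of touching element 0 of an empty list.
static STaskChkptInfo *firstOf(SArray *pTaskList) {
  if (pTaskList == NULL || taosArrayGetSize(pTaskList) == 0) {
    return NULL;  // caller should skip (continue) this entry
  }
  return taosArrayGet(pTaskList, 0);
}
```
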
--- source/dnode/mnode/impl/src/mndStreamUtil.c | 3 +++ source/dnode/vnode/src/tqCommon/tqCommon.c | 1 - source/libs/stream/src/streamDispatch.c | 8 ++++++-- source/libs/stream/src/streamHb.c | 10 +++++++--- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/source/dnode/mnode/impl/src/mndStreamUtil.c b/source/dnode/mnode/impl/src/mndStreamUtil.c index 47b0fdb412..383ffe16da 100644 --- a/source/dnode/mnode/impl/src/mndStreamUtil.c +++ b/source/dnode/mnode/impl/src/mndStreamUtil.c @@ -1069,6 +1069,9 @@ int32_t mndScanCheckpointReportInfo(SRpcMsg *pReq) { while ((pIter = taosHashIterate(execInfo.pChkptStreams, pIter)) != NULL) { SChkptReportInfo* px = (SChkptReportInfo *)pIter; + if (taosArrayGetSize(px->pTaskList) == 0) { + continue; + } STaskChkptInfo *pInfo = taosArrayGet(px->pTaskList, 0); if (pInfo == NULL) { diff --git a/source/dnode/vnode/src/tqCommon/tqCommon.c b/source/dnode/vnode/src/tqCommon/tqCommon.c index faca2020c5..7164c7f543 100644 --- a/source/dnode/vnode/src/tqCommon/tqCommon.c +++ b/source/dnode/vnode/src/tqCommon/tqCommon.c @@ -417,7 +417,6 @@ int32_t tqStreamTaskProcessDispatchRsp(SStreamMeta* pMeta, SRpcMsg* pMsg) { return code; } else { tqDebug("vgId:%d failed to handle the dispatch rsp, since find task:0x%x failed", vgId, pRsp->upstreamTaskId); - terrno = TSDB_CODE_STREAM_TASK_NOT_EXIST; return TSDB_CODE_STREAM_TASK_NOT_EXIST; } } diff --git a/source/libs/stream/src/streamDispatch.c b/source/libs/stream/src/streamDispatch.c index 493b5013d0..4da108507a 100644 --- a/source/libs/stream/src/streamDispatch.c +++ b/source/libs/stream/src/streamDispatch.c @@ -1083,7 +1083,7 @@ int32_t streamAddBlockIntoDispatchMsg(const SSDataBlock* pBlock, SStreamDispatch int32_t doSendDispatchMsg(SStreamTask* pTask, const SStreamDispatchReq* pReq, int32_t vgId, SEpSet* pEpSet) { void* buf = NULL; - int32_t code = -1; + int32_t code = 0; SRpcMsg msg = {0}; // serialize @@ -1093,9 +1093,9 @@ int32_t doSendDispatchMsg(SStreamTask* pTask, const SStreamDispatchReq* pReq, in goto FAIL; } - code = -1; buf = rpcMallocCont(sizeof(SMsgHead) + tlen); if (buf == NULL) { + code = terrno; goto FAIL; } @@ -1119,6 +1119,10 @@ FAIL: rpcFreeCont(buf); } + if (code == -1) { + code = TSDB_CODE_INVALID_MSG; + } + return code; } diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index 8513a8ba06..898e2bbc0b 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -295,10 +295,14 @@ void streamMetaHbToMnode(void* param, void* tmrId) { if (code) { stError("vgId:%d failed to send hmMsg to mnode, try again in 5s, code:%s", pMeta->vgId, tstrerror(code)); } - streamMetaRUnLock(pMeta); - streamTmrReset(streamMetaHbToMnode, META_HB_CHECK_INTERVAL, param, streamTimer, &pMeta->pHbInfo->hbTmr, pMeta->vgId, - "meta-hb-tmr"); + + if (code != TSDB_CODE_APP_IS_STOPPING) { + streamTmrReset(streamMetaHbToMnode, META_HB_CHECK_INTERVAL, param, streamTimer, &pMeta->pHbInfo->hbTmr, pMeta->vgId, + "meta-hb-tmr"); + } else { + stDebug("vgId:%d is stopping, not start hb again", pMeta->vgId); + } code = taosReleaseRef(streamMetaId, rid); if (code) { From ab2007adb75689dd3e3152f2562463e08e172600 Mon Sep 17 00:00:00 2001 From: wangmm0220 Date: Fri, 9 Aug 2024 11:41:17 +0800 Subject: [PATCH 077/103] fix:[TD-31146]memory leak --- source/client/src/clientTmq.c | 2 ++ source/common/src/tmsg.c | 1 + 2 files changed, 3 insertions(+) diff --git a/source/client/src/clientTmq.c b/source/client/src/clientTmq.c index c3867f2821..b8474dd9f3 100644 --- 
a/source/client/src/clientTmq.c +++ b/source/client/src/clientTmq.c @@ -2088,6 +2088,7 @@ static void* tmqHandleAllRsp(tmq_t* tmq, int64_t timeout) { taosWUnLockLatch(&tmq->lock); } setVgIdle(tmq, pollRspWrapper->topicName, pollRspWrapper->vgId); + tmqFreeRspWrapper(pRspWrapper); taosFreeQitem(pRspWrapper); } else if (pRspWrapper->tmqRspType == TMQ_MSG_TYPE__POLL_DATA_RSP) { SMqPollRspWrapper* pollRspWrapper = (SMqPollRspWrapper*)pRspWrapper; @@ -2789,6 +2790,7 @@ int32_t askEpCb(void* param, SDataBuf* pMsg, int32_t code) { pWrapper->epoch = head->epoch; (void)memcpy(&pWrapper->msg, pMsg->pData, sizeof(SMqRspHead)); if (tDecodeSMqAskEpRsp(POINTER_SHIFT(pMsg->pData, sizeof(SMqRspHead)), &pWrapper->msg) == NULL){ + tmqFreeRspWrapper((SMqRspWrapper*)pWrapper); taosFreeQitem(pWrapper); }else{ (void)taosWriteQitem(tmq->mqueue, pWrapper); diff --git a/source/common/src/tmsg.c b/source/common/src/tmsg.c index 4015701a29..3b2672f5a2 100644 --- a/source/common/src/tmsg.c +++ b/source/common/src/tmsg.c @@ -10068,6 +10068,7 @@ void *tDecodeMqSubTopicEp(void *buf, SMqSubTopicEp *pTopicEp) { buf = tDecodeSMqSubVgEp(buf, &vgEp); if (taosArrayPush(pTopicEp->vgs, &vgEp) == NULL) { taosArrayDestroy(pTopicEp->vgs); + pTopicEp->vgs = NULL; return NULL; } } From aa4b4a609f85b368c167b19e008695f64fc3eb4e Mon Sep 17 00:00:00 2001 From: wangjiaming0909 <604227650@qq.com> Date: Fri, 9 Aug 2024 11:44:13 +0800 Subject: [PATCH 078/103] fix ret check caused crash --- include/libs/nodes/querynodes.h | 2 +- source/libs/executor/src/executil.c | 28 ++++++++---------- source/libs/nodes/src/nodesUtilFuncs.c | 22 ++++++++++---- source/util/src/tworker.c | 41 +++++++++++++++----------- 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/include/libs/nodes/querynodes.h b/include/libs/nodes/querynodes.h index bb06b65898..198163582b 100644 --- a/include/libs/nodes/querynodes.h +++ b/include/libs/nodes/querynodes.h @@ -636,7 +636,7 @@ bool nodesExprsHasColumn(SNodeList* pList); void* nodesGetValueFromNode(SValueNode* pNode); int32_t nodesSetValueNodeValue(SValueNode* pNode, void* value); char* nodesGetStrValueFromNode(SValueNode* pNode); -void nodesValueNodeToVariant(const SValueNode* pNode, SVariant* pVal); +int32_t nodesValueNodeToVariant(const SValueNode* pNode, SVariant* pVal); int32_t nodesMakeValueNodeFromString(char* literal, SValueNode** ppValNode); int32_t nodesMakeValueNodeFromBool(bool b, SValueNode** ppValNode); int32_t nodesMakeValueNodeFromInt32(int32_t value, SNode** ppNode); diff --git a/source/libs/executor/src/executil.c b/source/libs/executor/src/executil.c index 127e0f18f1..616c2593cb 100644 --- a/source/libs/executor/src/executil.c +++ b/source/libs/executor/src/executil.c @@ -1781,7 +1781,7 @@ int32_t createExprFromOneNode(SExprInfo* pExp, SNode* pNode, int16_t slotId) { pExp->base.resSchema = createResSchema(pType->type, pType->bytes, slotId, pType->scale, pType->precision, pValNode->node.aliasName); pExp->base.pParam[0].type = FUNC_PARAM_TYPE_VALUE; - nodesValueNodeToVariant(pValNode, &pExp->base.pParam[0].param); + code = nodesValueNodeToVariant(pValNode, &pExp->base.pParam[0].param); } else if (type == QUERY_NODE_FUNCTION) { pExp->pExpr->nodeType = QUERY_NODE_FUNCTION; SFunctionNode* pFuncNode = (SFunctionNode*)pNode; @@ -1811,12 +1811,10 @@ int32_t createExprFromOneNode(SExprInfo* pExp, SNode* pNode, int16_t slotId) { if (TSDB_CODE_SUCCESS == code) { code = nodesMakeNode(QUERY_NODE_VALUE, (SNode**)&res); } - if (TSDB_CODE_SUCCESS != code) { // todo handle error - } else { - 
res->node.resType = (SDataType){.bytes = sizeof(int64_t), .type = TSDB_DATA_TYPE_BIGINT}; - code = nodesListAppend(pFuncNode->pParameterList, (SNode*)res); - QUERY_CHECK_CODE(code, lino, _end); - } + QUERY_CHECK_CODE(code, lino, _end); + res->node.resType = (SDataType){.bytes = sizeof(int64_t), .type = TSDB_DATA_TYPE_BIGINT}; + code = nodesListAppend(pFuncNode->pParameterList, (SNode*)res); + QUERY_CHECK_CODE(code, lino, _end); } #endif @@ -1826,7 +1824,7 @@ int32_t createExprFromOneNode(SExprInfo* pExp, SNode* pNode, int16_t slotId) { QUERY_CHECK_NULL(pExp->base.pParam, code, lino, _end, terrno); pExp->base.numOfParams = numOfParam; - for (int32_t j = 0; j < numOfParam; ++j) { + for (int32_t j = 0; j < numOfParam && TSDB_CODE_SUCCESS == code; ++j) { SNode* p1 = nodesListGetNode(pFuncNode->pParameterList, j); QUERY_CHECK_NULL(p1, code, lino, _end, terrno); if (p1->type == QUERY_NODE_COLUMN) { @@ -1839,7 +1837,8 @@ int32_t createExprFromOneNode(SExprInfo* pExp, SNode* pNode, int16_t slotId) { } else if (p1->type == QUERY_NODE_VALUE) { SValueNode* pvn = (SValueNode*)p1; pExp->base.pParam[j].type = FUNC_PARAM_TYPE_VALUE; - nodesValueNodeToVariant(pvn, &pExp->base.pParam[j].param); + code = nodesValueNodeToVariant(pvn, &pExp->base.pParam[j].param); + QUERY_CHECK_CODE(code, lino, _end); } } } else if (type == QUERY_NODE_OPERATOR) { @@ -1871,13 +1870,10 @@ int32_t createExprFromOneNode(SExprInfo* pExp, SNode* pNode, int16_t slotId) { SLogicConditionNode* pCond = (SLogicConditionNode*)pNode; pExp->base.pParam = taosMemoryCalloc(1, sizeof(SFunctParam)); QUERY_CHECK_NULL(pExp->base.pParam, code, lino, _end, terrno); - - if (TSDB_CODE_SUCCESS == code) { - pExp->base.numOfParams = 1; - SDataType* pType = &pCond->node.resType; - pExp->base.resSchema = createResSchema(pType->type, pType->bytes, slotId, pType->scale, pType->precision, pCond->node.aliasName); - pExp->pExpr->_optrRoot.pRootNode = pNode; - } + pExp->base.numOfParams = 1; + SDataType* pType = &pCond->node.resType; + pExp->base.resSchema = createResSchema(pType->type, pType->bytes, slotId, pType->scale, pType->precision, pCond->node.aliasName); + pExp->pExpr->_optrRoot.pRootNode = pNode; } else { ASSERT(0); } diff --git a/source/libs/nodes/src/nodesUtilFuncs.c b/source/libs/nodes/src/nodesUtilFuncs.c index 6e69c56687..6b06530b3e 100644 --- a/source/libs/nodes/src/nodesUtilFuncs.c +++ b/source/libs/nodes/src/nodesUtilFuncs.c @@ -141,7 +141,7 @@ static int32_t callocNodeChunk(SNodeAllocator* pAllocator, SNodeMemChunk** pOutC static int32_t nodesCallocImpl(int32_t size, void** pOut) { if (NULL == g_pNodeAllocator) { *pOut = taosMemoryCalloc(1, size); - if (!pOut) return TSDB_CODE_OUT_OF_MEMORY; + if (!*pOut) return TSDB_CODE_OUT_OF_MEMORY; return TSDB_CODE_SUCCESS; } @@ -2638,11 +2638,12 @@ int32_t nodesGetOutputNumFromSlotList(SNodeList* pSlots) { return num; } -void nodesValueNodeToVariant(const SValueNode* pNode, SVariant* pVal) { +int32_t nodesValueNodeToVariant(const SValueNode* pNode, SVariant* pVal) { + int32_t code = 0; if (pNode->isNull) { pVal->nType = TSDB_DATA_TYPE_NULL; pVal->nLen = tDataTypes[TSDB_DATA_TYPE_NULL].bytes; - return; + return code; } pVal->nType = pNode->node.resType.type; pVal->nLen = pNode->node.resType.bytes; @@ -2676,13 +2677,21 @@ void nodesValueNodeToVariant(const SValueNode* pNode, SVariant* pVal) { case TSDB_DATA_TYPE_VARBINARY: case TSDB_DATA_TYPE_GEOMETRY: pVal->pz = taosMemoryMalloc(pVal->nLen + 1); - memcpy(pVal->pz, pNode->datum.p, pVal->nLen); - pVal->pz[pVal->nLen] = 0; + if (pVal->pz) { + 
memcpy(pVal->pz, pNode->datum.p, pVal->nLen); + pVal->pz[pVal->nLen] = 0; + } else { + code = terrno; + } break; case TSDB_DATA_TYPE_JSON: pVal->nLen = getJsonValueLen(pNode->datum.p); pVal->pz = taosMemoryMalloc(pVal->nLen); - memcpy(pVal->pz, pNode->datum.p, pVal->nLen); + if (pVal->pz) { + memcpy(pVal->pz, pNode->datum.p, pVal->nLen); + } else { + code = terrno; + } break; case TSDB_DATA_TYPE_DECIMAL: case TSDB_DATA_TYPE_BLOB: @@ -2690,6 +2699,7 @@ void nodesValueNodeToVariant(const SValueNode* pNode, SVariant* pVal) { default: break; } + return code; } int32_t nodesMergeConds(SNode** pDst, SNodeList** pSrc) { diff --git a/source/util/src/tworker.c b/source/util/src/tworker.c index edf5b0b970..b2064d6787 100644 --- a/source/util/src/tworker.c +++ b/source/util/src/tworker.c @@ -792,6 +792,16 @@ bool tQueryAutoQWorkerTryRecycleWorker(SQueryAutoQWorkerPool *pPool, SQueryAutoQ int32_t tQueryAutoQWorkerInit(SQueryAutoQWorkerPool *pool) { int32_t code; + + (void)taosThreadMutexInit(&pool->poolLock, NULL); + (void)taosThreadMutexInit(&pool->backupLock, NULL); + (void)taosThreadMutexInit(&pool->waitingAfterBlockLock, NULL); + (void)taosThreadMutexInit(&pool->waitingBeforeProcessMsgLock, NULL); + + (void)taosThreadCondInit(&pool->waitingBeforeProcessMsgCond, NULL); + (void)taosThreadCondInit(&pool->waitingAfterBlockCond, NULL); + (void)taosThreadCondInit(&pool->backupCond, NULL); + code = taosOpenQset(&pool->qset); if (code) return terrno = code; pool->workers = tdListNew(sizeof(SQueryAutoQWorker)); @@ -802,14 +812,6 @@ int32_t tQueryAutoQWorkerInit(SQueryAutoQWorkerPool *pool) { if (!pool->exitedWorkers) return TSDB_CODE_OUT_OF_MEMORY; pool->maxInUse = pool->max * 2 + 2; - (void)taosThreadMutexInit(&pool->poolLock, NULL); - (void)taosThreadMutexInit(&pool->backupLock, NULL); - (void)taosThreadMutexInit(&pool->waitingAfterBlockLock, NULL); - (void)taosThreadMutexInit(&pool->waitingBeforeProcessMsgLock, NULL); - - (void)taosThreadCondInit(&pool->waitingBeforeProcessMsgCond, NULL); - (void)taosThreadCondInit(&pool->waitingAfterBlockCond, NULL); - (void)taosThreadCondInit(&pool->backupCond, NULL); if (!pool->pCb) { pool->pCb = taosMemoryCalloc(1, sizeof(SQueryAutoQWorkerPoolCB)); @@ -824,13 +826,17 @@ int32_t tQueryAutoQWorkerInit(SQueryAutoQWorkerPool *pool) { void tQueryAutoQWorkerCleanup(SQueryAutoQWorkerPool *pPool) { (void)taosThreadMutexLock(&pPool->poolLock); pPool->exit = true; - int32_t size = listNEles(pPool->workers); - for (int32_t i = 0; i < size; ++i) { - taosQsetThreadResume(pPool->qset); + int32_t size = 0; + if (pPool->workers) { + size = listNEles(pPool->workers); } - size = listNEles(pPool->backupWorkers); - for (int32_t i = 0; i < size; ++i) { - taosQsetThreadResume(pPool->qset); + if (pPool->backupWorkers) { + size += listNEles(pPool->backupWorkers); + } + if (pPool->qset) { + for (int32_t i = 0; i < size; ++i) { + taosQsetThreadResume(pPool->qset); + } } (void)taosThreadMutexUnlock(&pPool->poolLock); @@ -848,7 +854,7 @@ void tQueryAutoQWorkerCleanup(SQueryAutoQWorkerPool *pPool) { int32_t idx = 0; SQueryAutoQWorker *worker = NULL; - while (true) { + while (pPool->workers) { (void)taosThreadMutexLock(&pPool->poolLock); if (listNEles(pPool->workers) == 0) { (void)taosThreadMutexUnlock(&pPool->poolLock); @@ -864,7 +870,7 @@ void tQueryAutoQWorkerCleanup(SQueryAutoQWorkerPool *pPool) { taosMemoryFree(pNode); } - while (listNEles(pPool->backupWorkers) > 0) { + while (pPool->backupWorkers && listNEles(pPool->backupWorkers) > 0) { SListNode *pNode = 
tdListPopHead(pPool->backupWorkers); worker = (SQueryAutoQWorker *)pNode->data; if (worker && taosCheckPthreadValid(worker->thread)) { @@ -874,7 +880,7 @@ void tQueryAutoQWorkerCleanup(SQueryAutoQWorkerPool *pPool) { taosMemoryFree(pNode); } - while (listNEles(pPool->exitedWorkers) > 0) { + while (pPool->exitedWorkers && listNEles(pPool->exitedWorkers) > 0) { SListNode *pNode = tdListPopHead(pPool->exitedWorkers); worker = (SQueryAutoQWorker *)pNode->data; if (worker && taosCheckPthreadValid(worker->thread)) { @@ -935,7 +941,6 @@ STaosQueue *tQueryAutoQWorkerAllocQueue(SQueryAutoQWorkerPool *pool, void *ahand if (taosThreadCreate(&pWorker->thread, &thAttr, (ThreadFp)tQueryAutoQWorkerThreadFp, pWorker) != 0) { taosCloseQueue(queue); - terrno = TSDB_CODE_OUT_OF_MEMORY; queue = NULL; break; } From d763e559e18dbaea9f9e9e96f53fccabf6b43d9c Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Fri, 9 Aug 2024 13:57:57 +0800 Subject: [PATCH 079/103] refactor errno code --- source/dnode/mgmt/mgmt_dnode/src/dmWorker.c | 2 +- source/libs/stream/src/streamBackendRocksdb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c b/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c index a3e1a64012..aeb519596d 100644 --- a/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c +++ b/source/dnode/mgmt/mgmt_dnode/src/dmWorker.c @@ -293,7 +293,7 @@ int32_t dmStartNotifyThread(SDnodeMgmt *pMgmt) { (void)taosThreadAttrSetDetachState(&thAttr, PTHREAD_CREATE_JOINABLE); if (taosThreadCreate(&pMgmt->notifyThread, &thAttr, dmNotifyThreadFp, pMgmt) != 0) { code = TAOS_SYSTEM_ERROR(errno); - dError("failed to create notify thread since %s", strerror(code)); + dError("failed to create notify thread since %s", tstrerror(code)); return code; } diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index 537aa72d91..ee87d3b897 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -247,7 +247,7 @@ int32_t rebuildDirFromCheckpoint(const char* path, int64_t chkpId, char** dst) { } else { stError("failed to start stream backend at %s, reason: %s, restart from default state dir:%s", chkp, - tstrerror(TAOS_SYSTEM_ERROR(errno)), state); + tstrerror(terrno), state); code = taosMkDir(state); if (code != 0) { code = TAOS_SYSTEM_ERROR(errno); From a145d9db22518ace3680fa154d0a18ee9e7d74a6 Mon Sep 17 00:00:00 2001 From: Haojun Liao Date: Fri, 9 Aug 2024 14:08:08 +0800 Subject: [PATCH 080/103] fix(stream): allowed continue hb msg. 
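
The streamHb.c hunk below drops the app-is-stopping special case and goes back to re-arming the heartbeat timer after every attempt, so a single failed send cannot permanently silence heartbeats. A reduced sketch of that shape, reusing streamTmrReset, streamTimer and META_HB_CHECK_INTERVAL as they appear in the diff; `sendHbOnce` is a hypothetical stand-in for the real build-and-send logic, and ref handling is omitted:

```c
// Illustrative sketch: always schedule the next tick, even if this send failed.
static void hbTick(void *param, void *tmrId) {
  SStreamMeta *pMeta = (SStreamMeta *)param;
  int32_t      code = sendHbOnce(pMeta);  // hypothetical send attempt
  if (code != 0) {
    // log and fall through; the next tick retries
  }
  streamTmrReset(hbTick, META_HB_CHECK_INTERVAL, param, streamTimer,
                 &pMeta->pHbInfo->hbTmr, pMeta->vgId, "meta-hb-tmr");
}
```
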
--- source/libs/stream/src/streamCheckpoint.c | 20 +++++++++++++------- source/libs/stream/src/streamHb.c | 10 +++------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/source/libs/stream/src/streamCheckpoint.c b/source/libs/stream/src/streamCheckpoint.c index c555da9865..270f678d26 100644 --- a/source/libs/stream/src/streamCheckpoint.c +++ b/source/libs/stream/src/streamCheckpoint.c @@ -558,10 +558,17 @@ int32_t streamTaskUpdateTaskCheckpointInfo(SStreamTask* pTask, bool restored, SV id, vgId, pStatus.name, pInfo->checkpointId, pReq->checkpointId, pInfo->checkpointVer, pReq->checkpointVer, pInfo->checkpointTime, pReq->checkpointTs); } else { // not in restore status, must be in checkpoint status - stDebug("s-task:%s vgId:%d status:%s start to update the checkpoint-info, checkpointId:%" PRId64 "->%" PRId64 - " checkpointVer:%" PRId64 "->%" PRId64 " checkpointTs:%" PRId64 "->%" PRId64, - id, vgId, pStatus.name, pInfo->checkpointId, pReq->checkpointId, pInfo->checkpointVer, pReq->checkpointVer, - pInfo->checkpointTime, pReq->checkpointTs); + if (pStatus.state == TASK_STATUS__CK) { + stDebug("s-task:%s vgId:%d status:%s start to update the checkpoint-info, checkpointId:%" PRId64 "->%" PRId64 + " checkpointVer:%" PRId64 "->%" PRId64 " checkpointTs:%" PRId64 "->%" PRId64, + id, vgId, pStatus.name, pInfo->checkpointId, pReq->checkpointId, pInfo->checkpointVer, + pReq->checkpointVer, pInfo->checkpointTime, pReq->checkpointTs); + } else { + stDebug("s-task:%s vgId:%d status:%s NOT update the checkpoint-info, checkpointId:%" PRId64 "->%" PRId64 + " checkpointVer:%" PRId64 "->%" PRId64, + id, vgId, pStatus.name, pInfo->checkpointId, pReq->checkpointId, pInfo->checkpointVer, + pReq->checkpointVer); + } } ASSERT(pInfo->checkpointId <= pReq->checkpointId && pInfo->checkpointVer <= pReq->checkpointVer && @@ -573,12 +580,11 @@ int32_t streamTaskUpdateTaskCheckpointInfo(SStreamTask* pTask, bool restored, SV pInfo->checkpointVer = pReq->checkpointVer; pInfo->checkpointTime = pReq->checkpointTs; - streamTaskClearCheckInfo(pTask, true); code = streamTaskHandleEvent(pTask->status.pSM, TASK_EVENT_CHECKPOINT_DONE); - } else { - stDebug("s-task:0x%x vgId:%d not handle checkpoint-done event, status:%s", pReq->taskId, vgId, pStatus.name); } + streamTaskClearCheckInfo(pTask, true); + if (pReq->dropRelHTask) { stDebug("s-task:0x%x vgId:%d drop the related fill-history task:0x%" PRIx64 " after update checkpoint", pReq->taskId, vgId, pReq->hTaskId); diff --git a/source/libs/stream/src/streamHb.c b/source/libs/stream/src/streamHb.c index 898e2bbc0b..d2c5cb05b7 100644 --- a/source/libs/stream/src/streamHb.c +++ b/source/libs/stream/src/streamHb.c @@ -297,14 +297,10 @@ void streamMetaHbToMnode(void* param, void* tmrId) { } streamMetaRUnLock(pMeta); - if (code != TSDB_CODE_APP_IS_STOPPING) { - streamTmrReset(streamMetaHbToMnode, META_HB_CHECK_INTERVAL, param, streamTimer, &pMeta->pHbInfo->hbTmr, pMeta->vgId, - "meta-hb-tmr"); - } else { - stDebug("vgId:%d is stopping, not start hb again", pMeta->vgId); - } - + streamTmrReset(streamMetaHbToMnode, META_HB_CHECK_INTERVAL, param, streamTimer, &pMeta->pHbInfo->hbTmr, pMeta->vgId, + "meta-hb-tmr"); code = taosReleaseRef(streamMetaId, rid); + if (code) { stError("vgId:%d in meta timer, failed to release the meta rid:%" PRId64, pMeta->vgId, rid); } From 75e9c027a66b3304b58269b32724db794e2a4653 Mon Sep 17 00:00:00 2001 From: 54liuyao <54liuyao> Date: Fri, 9 Aug 2024 14:40:34 +0800 Subject: [PATCH 081/103] adj error code --- source/util/src/tscalablebf.c | 22 
+++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/source/util/src/tscalablebf.c b/source/util/src/tscalablebf.c index 72a97fee45..80b633f5e8 100644 --- a/source/util/src/tscalablebf.c +++ b/source/util/src/tscalablebf.c @@ -33,7 +33,7 @@ int32_t tScalableBfInit(uint64_t expectedEntries, double errorRate, SScalableBf* int32_t lino = 0; const uint32_t defaultSize = 8; if (expectedEntries < 1 || errorRate <= 0 || errorRate >= 1.0) { - code = TSDB_CODE_FAILED; + code = TSDB_CODE_INVALID_PARA; QUERY_CHECK_CODE(code, lino, _error); } SScalableBf* pSBf = taosMemoryCalloc(1, sizeof(SScalableBf)); @@ -71,7 +71,7 @@ int32_t tScalableBfPutNoCheck(SScalableBf* pSBf, const void* keyBuf, uint32_t le int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (pSBf->status == SBF_INVALID) { - code = TSDB_CODE_FAILED; + code = TSDB_CODE_OUT_OF_BUFFER; QUERY_CHECK_CODE(code, lino, _error); } int32_t size = taosArrayGetSize(pSBf->bfArray); @@ -92,7 +92,7 @@ int32_t tScalableBfPutNoCheck(SScalableBf* pSBf, const void* keyBuf, uint32_t le _error: if (code != TSDB_CODE_SUCCESS) { - uError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); + uDebug("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } @@ -101,7 +101,7 @@ int32_t tScalableBfPut(SScalableBf* pSBf, const void* keyBuf, uint32_t len, int3 int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (pSBf->status == SBF_INVALID) { - code = TSDB_CODE_FAILED; + code = TSDB_CODE_OUT_OF_BUFFER; QUERY_CHECK_CODE(code, lino, _end); } uint64_t h1 = (uint64_t)pSBf->hashFn1(keyBuf, len); @@ -153,7 +153,7 @@ static int32_t tScalableBfAddFilter(SScalableBf* pSBf, uint64_t expectedEntries, int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (taosArrayGetSize(pSBf->bfArray) >= pSBf->maxBloomFilters) { - code = TSDB_CODE_FAILED; + code = TSDB_CODE_OUT_OF_BUFFER; QUERY_CHECK_CODE(code, lino, _error); } @@ -163,7 +163,7 @@ static int32_t tScalableBfAddFilter(SScalableBf* pSBf, uint64_t expectedEntries, if (taosArrayPush(pSBf->bfArray, &pNormalBf) == NULL) { tBloomFilterDestroy(pNormalBf); - code = TSDB_CODE_OUT_OF_MEMORY; + code = terrno; QUERY_CHECK_CODE(code, lino, _error); } pSBf->numBits += pNormalBf->numBits; @@ -217,7 +217,7 @@ int32_t tScalableBfDecode(SDecoder* pDecoder, SScalableBf** ppSBf) { pSBf->bfArray = NULL; int32_t size = 0; if (tDecodeI32(pDecoder, &size) < 0) { - code = TSDB_CODE_FAILED; + code = terrno; QUERY_CHECK_CODE(code, lino, _error); } if (size == 0) { @@ -242,19 +242,19 @@ int32_t tScalableBfDecode(SDecoder* pDecoder, SScalableBf** ppSBf) { } } if (tDecodeU32(pDecoder, &pSBf->growth) < 0) { - code = TSDB_CODE_FAILED; + code = terrno; QUERY_CHECK_CODE(code, lino, _error); } if (tDecodeU64(pDecoder, &pSBf->numBits) < 0) { - code = TSDB_CODE_FAILED; + code = terrno; QUERY_CHECK_CODE(code, lino, _error); } if (tDecodeU32(pDecoder, &pSBf->maxBloomFilters) < 0) { - code = TSDB_CODE_FAILED; + code = terrno; QUERY_CHECK_CODE(code, lino, _error); } if (tDecodeI8(pDecoder, &pSBf->status) < 0) { - code = TSDB_CODE_FAILED; + code = terrno; QUERY_CHECK_CODE(code, lino, _error); } (*ppSBf) = pSBf; From 4fd86887953857702971b7ca485e90aac82e0ae3 Mon Sep 17 00:00:00 2001 From: wangjiaming0909 <604227650@qq.com> Date: Fri, 9 Aug 2024 14:43:50 +0800 Subject: [PATCH 082/103] postfix exchange operator blocking due to addref failed --- source/libs/executor/src/exchangeoperator.c | 6 ++++-- source/libs/executor/src/tfill.c | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff 
--git a/source/libs/executor/src/exchangeoperator.c b/source/libs/executor/src/exchangeoperator.c index 78c0d939ad..5afae596a4 100644 --- a/source/libs/executor/src/exchangeoperator.c +++ b/source/libs/executor/src/exchangeoperator.c @@ -390,11 +390,13 @@ static int32_t initExchangeOperator(SExchangePhysiNode* pExNode, SExchangeInfo* } initLimitInfo(pExNode->node.pLimit, pExNode->node.pSlimit, &pInfo->limitInfo); - pInfo->self = taosAddRef(exchangeObjRefPool, pInfo); - if (pInfo->self < 0) { + int64_t refId = taosAddRef(exchangeObjRefPool, pInfo); + if (refId < 0) { int32_t code = terrno; qError("%s failed at line %d since %s", __func__, __LINE__, tstrerror(code)); return code; + } else { + pInfo->self = refId; } return initDataSource(numOfSources, pInfo, id); diff --git a/source/libs/executor/src/tfill.c b/source/libs/executor/src/tfill.c index 3158c85987..957a5d1d2e 100644 --- a/source/libs/executor/src/tfill.c +++ b/source/libs/executor/src/tfill.c @@ -758,7 +758,10 @@ SFillColInfo* createFillColInfo(SExprInfo* pExpr, int32_t numOfFillExpr, SExprIn SValueNode* pv = (SValueNode*)nodesListGetNode(pValNode->pNodeList, index); QUERY_CHECK_NULL(pv, code, lino, _end, terrno); - nodesValueNodeToVariant(pv, &pFillCol[i].fillVal); + code = nodesValueNodeToVariant(pv, &pFillCol[i].fillVal); + } + if (TSDB_CODE_SUCCESS != code) { + goto _end; } } pFillCol->numOfFillExpr = numOfFillExpr; From acf17054d3b65443d99115ab273fa7b13b92e537 Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Fri, 9 Aug 2024 15:20:33 +0800 Subject: [PATCH 083/103] fix: possible error handle in syncPipeline.c --- source/libs/sync/src/syncPipeline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index ef2cbece79..782d97f789 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -892,7 +892,7 @@ int32_t syncLogReplRecover(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEn if (pMsg->matchIndex < pNode->pLogBuf->matchIndex) { code = syncLogReplGetPrevLogTerm(pMgr, pNode, index + 1, &term); - if (term < 0 && (errno == ENFILE || errno == EMFILE)) { + if (term < 0 && (errno == ENFILE || errno == EMFILE || errno == ENOENT)) { sError("vgId:%d, failed to get prev log term since %s. index:%" PRId64, pNode->vgId, tstrerror(code), index + 1); TAOS_RETURN(code); } From fe48c405709589a2ab930d37fbf8c35bd5b5eaba Mon Sep 17 00:00:00 2001 From: gccgdb1234 Date: Fri, 9 Aug 2024 15:49:54 +0800 Subject: [PATCH 084/103] doc: correct join error --- docs/zh/14-reference/03-taos-sql/30-join.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/14-reference/03-taos-sql/30-join.md b/docs/zh/14-reference/03-taos-sql/30-join.md index c0daeb41c0..60b634d310 100644 --- a/docs/zh/14-reference/03-taos-sql/30-join.md +++ b/docs/zh/14-reference/03-taos-sql/30-join.md @@ -202,7 +202,7 @@ SELECT ... FROM table_name1 LEFT|RIGHT ASOF JOIN table_name2 [ON ...] 
[JLIMIT jl 表 d1001 电压值大于 220V 且表 d1002 中同一时刻或稍早前最后时刻出现电压大于 220V 的时间及各自的电压值: ```sql -SELECT a.ts, a.voltage, a.ts, b.voltage FROM d1001 a LEFT ASOF JOIN d1002 b ON a.ts >= b.ts where a.voltage > 220 and b.voltage > 220 +SELECT a.ts, a.voltage, b.ts, b.voltage FROM d1001 a LEFT ASOF JOIN d1002 b ON a.ts >= b.ts where a.voltage > 220 and b.voltage > 220 ``` ### Left/Right Window Join From 2553bb1745bd1ad308e548c58386000c8305492e Mon Sep 17 00:00:00 2001 From: Hongze Cheng Date: Fri, 9 Aug 2024 15:51:58 +0800 Subject: [PATCH 085/103] fix: remove an invalid assert in syncMain.c --- source/libs/sync/src/syncMain.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 171b73eba7..fd1d3e371e 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -1287,7 +1287,7 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo, int32_t vnodeVersion) { } // tools - (void)syncRespMgrCreate(pSyncNode, SYNC_RESP_TTL_MS, &pSyncNode->pSyncRespMgr); // TODO: check return value + (void)syncRespMgrCreate(pSyncNode, SYNC_RESP_TTL_MS, &pSyncNode->pSyncRespMgr); // TODO: check return value if (pSyncNode->pSyncRespMgr == NULL) { sError("vgId:%d, failed to create SyncRespMgr", pSyncNode->vgId); goto _error; @@ -1407,7 +1407,8 @@ int32_t syncNodeRestore(SSyncNode* pSyncNode) { pSyncNode->commitIndex = TMAX(pSyncNode->commitIndex, commitIndex); sInfo("vgId:%d, restore sync until commitIndex:%" PRId64, pSyncNode->vgId, pSyncNode->commitIndex); - if (pSyncNode->fsmState != SYNC_FSM_STATE_INCOMPLETE && (code = syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, pSyncNode->commitIndex)) < 0) { + if (pSyncNode->fsmState != SYNC_FSM_STATE_INCOMPLETE && + (code = syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, pSyncNode->commitIndex)) < 0) { TAOS_RETURN(code); } @@ -2187,7 +2188,7 @@ void syncNodeCandidate2Leader(SSyncNode* pSyncNode) { } SyncIndex lastIndex = pSyncNode->pLogStore->syncLogLastIndex(pSyncNode->pLogStore); - ASSERT(lastIndex >= 0); + // ASSERT(lastIndex >= 0); sInfo("vgId:%d, become leader. term:%" PRId64 ", commit index:%" PRId64 ", last index:%" PRId64 "", pSyncNode->vgId, raftStoreGetTerm(pSyncNode), pSyncNode->commitIndex, lastIndex); } From 64874f6e40287fe676a0bf85db0c5ecf6446d23d Mon Sep 17 00:00:00 2001 From: sima Date: Fri, 9 Aug 2024 16:00:05 +0800 Subject: [PATCH 086/103] fix:[TD-31355] use correct way to handle error. 
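
The filter.c hunk below resets colRangeNum when the paired colRange allocation fails, keeping the count consistent with the pointer so later cleanup that iterates colRangeNum entries never touches a NULL colRange. A minimal sketch of the invariant, with a hypothetical `SRanges` struct standing in for the real filter info:

```c
// Illustrative sketch: never leave count > 0 while the paired pointer is NULL.
typedef struct SRanges {
  int32_t num;
  void  **items;
} SRanges;  // hypothetical container

static int32_t rangesAlloc(SRanges *pRanges, int32_t n) {
  pRanges->num = n;
  pRanges->items = taosMemoryCalloc(n, POINTER_BYTES);
  if (pRanges->items == NULL) {
    pRanges->num = 0;  // keep count consistent with the failed allocation
    return TSDB_CODE_OUT_OF_MEMORY;
  }
  return TSDB_CODE_SUCCESS;
}
```
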
--- source/libs/scalar/src/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/libs/scalar/src/filter.c b/source/libs/scalar/src/filter.c index e5d0fe594a..729a3ff840 100644 --- a/source/libs/scalar/src/filter.c +++ b/source/libs/scalar/src/filter.c @@ -2908,6 +2908,7 @@ int32_t filterGenerateColRange(SFilterInfo *info, SFilterGroupCtx **gRes, int32_ info->colRangeNum = colNum; info->colRange = taosMemoryCalloc(colNum, POINTER_BYTES); if (info->colRange == NULL) { + info->colRangeNum = 0; FLT_ERR_JRET(TSDB_CODE_OUT_OF_MEMORY); } From 3b99e077ef47cf6acb8f5c44de4f38cf71cee64f Mon Sep 17 00:00:00 2001 From: yihaoDeng Date: Fri, 9 Aug 2024 16:57:10 +0800 Subject: [PATCH 087/103] fix return error --- source/libs/stream/src/streamBackendRocksdb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/source/libs/stream/src/streamBackendRocksdb.c b/source/libs/stream/src/streamBackendRocksdb.c index ee87d3b897..83e55791d2 100644 --- a/source/libs/stream/src/streamBackendRocksdb.c +++ b/source/libs/stream/src/streamBackendRocksdb.c @@ -1452,8 +1452,14 @@ int32_t taskDbBuildSnap(void* arg, SArray* pSnap) { code = TSDB_CODE_OUT_OF_MEMORY; break; } - (void)taosArrayPush(pSnap, &snap); + if (taosArrayPush(pSnap, &snap) == NULL) { + taskDbUnRefChkp(pTaskDb, pTaskDb->chkpId); + taskDbRemoveRef(pTaskDb); + code = terrno; + break; + } + taskDbRemoveRef(pTaskDb); pIter = taosHashIterate(pMeta->pTaskDbUnique, pIter); } streamMutexUnlock(&pMeta->backendMutex); From 105594848cfd15f2f51c4b367219333618b047b4 Mon Sep 17 00:00:00 2001 From: Minglei Jin Date: Fri, 9 Aug 2024 17:00:18 +0800 Subject: [PATCH 088/103] fix(tsdb/cache): return oom if array or push failed --- source/dnode/vnode/src/tsdb/tsdbCache.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/source/dnode/vnode/src/tsdb/tsdbCache.c b/source/dnode/vnode/src/tsdb/tsdbCache.c index b7e3bb6a07..1216f0da81 100644 --- a/source/dnode/vnode/src/tsdb/tsdbCache.c +++ b/source/dnode/vnode/src/tsdb/tsdbCache.c @@ -1703,8 +1703,14 @@ int32_t tsdbCacheGetBatch(STsdb *pTsdb, tb_uid_t uid, SArray *pLastArray, SCache if (!remainCols) { remainCols = taosArrayInit(num_keys, sizeof(SIdxKey)); + if (!remainCols) { + TAOS_RETURN(TSDB_CODE_OUT_OF_MEMORY); + } } - (void)taosArrayPush(remainCols, &(SIdxKey){i, key}); + if (NULL == taosArrayPush(remainCols, &(SIdxKey){i, key})) { + taosArrayDestroy(remainCols); + TAOS_RETURN(TSDB_CODE_OUT_OF_MEMORY); + }; } } From b8cd001dfb4c114cdb6307ca1cb5284c97469ee4 Mon Sep 17 00:00:00 2001 From: Minglei Jin Date: Fri, 9 Aug 2024 17:06:35 +0800 Subject: [PATCH 089/103] fix(meta/query): return oom with null ctb cursor --- source/dnode/vnode/src/meta/metaQuery.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/dnode/vnode/src/meta/metaQuery.c b/source/dnode/vnode/src/meta/metaQuery.c index 27a4179172..576324d7c2 100644 --- a/source/dnode/vnode/src/meta/metaQuery.c +++ b/source/dnode/vnode/src/meta/metaQuery.c @@ -1441,6 +1441,9 @@ int32_t metaGetTableTagsByUids(void *pVnode, int64_t suid, SArray *uidList) { int32_t metaGetTableTags(void *pVnode, uint64_t suid, SArray *pUidTagInfo) { SMCtbCursor *pCur = metaOpenCtbCursor(pVnode, suid, 1); + if (!pCur) { + TAOS_RETURN(TSDB_CODE_OUT_OF_MEMORY); + } // If len > 0 means there already have uids, and we only want the // tags of the specified tables, of which uid in the uid list. 
Otherwise, all table tags are retrieved and kept From 2d2ba4fd2905c481e14cd3ca564a7e075065fbf3 Mon Sep 17 00:00:00 2001 From: Minglei Jin Date: Fri, 9 Aug 2024 17:09:19 +0800 Subject: [PATCH 090/103] fix(meta/query): return oom if malloc failed --- source/dnode/vnode/src/meta/metaQuery.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/dnode/vnode/src/meta/metaQuery.c b/source/dnode/vnode/src/meta/metaQuery.c index 27a4179172..f5b03b340e 100644 --- a/source/dnode/vnode/src/meta/metaQuery.c +++ b/source/dnode/vnode/src/meta/metaQuery.c @@ -1234,6 +1234,9 @@ int32_t metaFilterTableIds(void *pVnode, SMetaFltParam *arg, SArray *pUids) { SIdxCursor *pCursor = NULL; pCursor = (SIdxCursor *)taosMemoryCalloc(1, sizeof(SIdxCursor)); + if (!pCursor) { + TAOS_RETURN(TSDB_CODE_OUT_OF_MEMORY); + } pCursor->pMeta = pMeta; pCursor->suid = param->suid; pCursor->cid = param->cid; From 0035c1b06a577f2b14229599db19d341cd0a01b0 Mon Sep 17 00:00:00 2001 From: Minglei Jin Date: Fri, 9 Aug 2024 17:13:37 +0800 Subject: [PATCH 091/103] fix(meta/query): return oom if malloc failed --- source/dnode/vnode/src/meta/metaQuery.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/dnode/vnode/src/meta/metaQuery.c b/source/dnode/vnode/src/meta/metaQuery.c index 27a4179172..ffa4067ab9 100644 --- a/source/dnode/vnode/src/meta/metaQuery.c +++ b/source/dnode/vnode/src/meta/metaQuery.c @@ -1427,6 +1427,11 @@ int32_t metaGetTableTagsByUids(void *pVnode, int64_t suid, SArray *uidList) { int32_t len = 0; if (metaGetTableTagByUid(pMeta, suid, p->uid, &val, &len, false) == 0) { p->pTagVal = taosMemoryMalloc(len); + if (!p->pTagVal) { + if (isLock) metaULock(pMeta); + + TAOS_RETURN(TSDB_CODE_OUT_OF_MEMORY); + } memcpy(p->pTagVal, val, len); tdbFree(val); } else { From 353bcc5a4754225269419a8d498c32f3833b85e4 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 9 Aug 2024 17:58:01 +0800 Subject: [PATCH 092/103] doc: adjust typo --- docs/zh/08-develop/09-udf.md | 62 ++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/zh/08-develop/09-udf.md b/docs/zh/08-develop/09-udf.md index 700bbb2ae0..4606e968d9 100644 --- a/docs/zh/08-develop/09-udf.md +++ b/docs/zh/08-develop/09-udf.md @@ -410,7 +410,7 @@ def finish(buf: bytes) -> output_type: #### 示例一 编写一个只接收一个整数的 UDF 函数: 输入 n, 输出 ln(n^2 + 1)。 -首先编写一个 Python 文件,存在系统某个目录,比如 /root/udf/myfun.py 内容如下 +首先编写一个 Python 文件,存在系统某个目录,比如 /root/udf/myfun.py 内容如下。 ```python from math import log @@ -426,23 +426,23 @@ def process(block): return [log(block.data(i, 0) ** 2 + 1) for i in range(rows)] ``` -这个文件包含 3 个函数, init 和 destroy 都是空函数,它们是 UDF 的生命周期函数,即使什么都不做也要定义。最关键的是 process 函数, 它接受一个数据块,这个数据块对象有两个方法: +这个文件包含 3 个函数, init 和 destroy 都是空函数,它们是 UDF 的生命周期函数,即使什么都不做也要定义。最关键的是 process 函数, 它接受一个数据块,这个数据块对象有两个方法。 1. shape() 返回数据块的行数和列数 2. 
data(i, j) 返回 i 行 j 列的数据 -标量函数的 process 方法传人的数据块有多少行,就需要返回多少个数据。上述代码中我们忽略的列数,因为我们只想对每行的第一个数做计算。 -接下来我们创建对应的 UDF 函数,在 TDengine CLI 中执行下面语句: +标量函数的 process 方法传入的数据块有多少行,就需要返回多少行数据。上述代码忽略列数,因为只需对每行的第一个数做计算。 +接下来我建对应的 UDF 函数,在 TDengine CLI 中执行下面语句。 ```sql create function myfun as '/root/udf/myfun.py' outputtype double language 'Python' ``` -其输出如下 +其输出如下。 ```shell taos> create function myfun as '/root/udf/myfun.py' outputtype double language 'Python'; Create OK, 0 row(s) affected (0.005202s) ``` -看起来很顺利,接下来 show 一下系统中所有的自定义函数,确认创建成功: +看起来很顺利,接下来 show 一下系统中所有的自定义函数,确认创建成功。 ```text taos> show functions; @@ -452,7 +452,7 @@ taos> show functions; Query OK, 1 row(s) in set (0.005767s) ``` -接下来就来测试一下这个函数,测试之前先执行下面的 SQL 命令,制造些测试数据,在 TDengine CLI 中执行下述命令 +接下来就来测试一下这个函数,测试之前先执行下面的 SQL 命令,制造些测试数据,在 TDengine CLI 中执行下述命令。 ```sql create database test; @@ -462,7 +462,7 @@ insert into t values('2023-05-03 08:09:10', 2, 3, 4); insert into t values('2023-05-10 07:06:05', 3, 4, 5); ``` -测试 myfun 函数: +测试 myfun 函数。 ```sql taos> select myfun(v1, v2) from t; @@ -471,13 +471,13 @@ DB error: udf function execution failure (0.011088s) ``` 不幸的是执行失败了,什么原因呢? -查看 udfd 进程的日志 +查看 udfd 进程的日志。 ```shell tail -10 /var/log/taos/udfd.log ``` -发现以下错误信息: +发现以下错误信息。 ```text 05/24 22:46:28.733545 01665799 UDF ERROR can not load library libtaospyudf.so. error: operation not permitted @@ -486,7 +486,7 @@ tail -10 /var/log/taos/udfd.log 错误很明确:没有加载到 Python 插件 libtaospyudf.so,如果遇到此错误,请参考前面的准备环境一节。 -修复环境错误后再次执行,如下: +修复环境错误后再次执行,如下。 ```sql taos> select myfun(v1) from t; @@ -501,7 +501,7 @@ taos> select myfun(v1) from t; #### 示例二 -上面的 myfun 虽然测试测试通过了,但是有两个缺点: +上面的 myfun 虽然测试测试通过了,但是有两个缺点。 1. 这个标量函数只接受 1 列数据作为输入,如果用户传入了多列也不会抛异常。 @@ -515,7 +515,7 @@ taos> select myfun(v1, v2) from t; ``` 2. 没有处理 null 值。我们期望如果输入有 null,则会抛异常终止执行。 -因此 process 函数改进如下: +因此 process 函数改进如下。 ```python def process(block): @@ -525,13 +525,13 @@ def process(block): return [ None if block.data(i, 0) is None else log(block.data(i, 0) ** 2 + 1) for i in range(rows)] ``` -然后执行下面的语句更新已有的 UDF: +然后执行下面的语句更新已有的 UDF。 ```sql create or replace function myfun as '/root/udf/myfun.py' outputtype double language 'Python'; ``` -再传入 myfun 两个参数,就会执行失败了 +再传入 myfun 两个参数,就会执行失败了。 ```sql taos> select myfun(v1, v2) from t; @@ -539,7 +539,7 @@ taos> select myfun(v1, v2) from t; DB error: udf function execution failure (0.014643s) ``` -但遗憾的是我们自定义的异常信息没有展示给用户,而是在插件的日志文件 /var/log/taos/taospyudf.log 中: +但遗憾的是我们自定义的异常信息没有展示给用户,而是在插件的日志文件 /var/log/taos/taospyudf.log 中。 ```text 2023-05-24 23:21:06.790 ERROR [1666188] [doPyUdfScalarProc@507] call pyUdfScalar proc function. context 0x7faade26d180. error: Exception: require 1 parameter but given 2 @@ -555,7 +555,7 @@ At: #### 示例三 编写一个 UDF:输入(x1, x2, ..., xn), 输出每个值和它们的序号的乘积的和: 1 * x1 + 2 * x2 + ... 
+ n * xn。如果 x1 至 xn 中包含 null,则结果为 null。 -这个示例与示例一的区别是,可以接受任意多列作为输入,且要处理每一列的值。编写 UDF 文件 /root/udf/nsum.py: +这个示例与示例一的区别是,可以接受任意多列作为输入,且要处理每一列的值。编写 UDF 文件 /root/udf/nsum.py。 ```python def init(): @@ -581,13 +581,13 @@ def process(block): return result ``` -创建 UDF: +创建 UDF。 ```sql create function nsum as '/root/udf/nsum.py' outputtype double language 'Python'; ``` -测试 UDF: +测试 UDF。 ```sql taos> insert into t values('2023-05-25 09:09:15', 6, null, 8); @@ -606,13 +606,13 @@ Query OK, 4 row(s) in set (0.010653s) #### 示例四 编写一个 UDF,输入一个时间戳,输出距离这个时间最近的下一个周日。比如今天是 2023-05-25, 则下一个周日是 2023-05-28。 -完成这个函数要用到第三方库 momen。先安装这个库: +完成这个函数要用到第三方库 momen。先安装这个库。 ```shell pip3 install moment ``` -然后编写 UDF 文件 /root/udf/nextsunday.py +然后编写 UDF 文件 /root/udf/nextsunday.py。 ```python import moment @@ -636,13 +636,13 @@ def process(block): for i in range(rows)] ``` -UDF 框架会将 TDengine 的 timestamp 类型映射为 Python 的 int 类型,所以这个函数只接受一个表示毫秒数的整数。process 方法先做参数检查,然后用 moment 包替换时间的星期为星期日,最后格式化输出。输出的字符串长度是固定的 10 个字符长,因此可以这样创建 UDF 函数: +UDF 框架会将 TDengine 的 timestamp 类型映射为 Python 的 int 类型,所以这个函数只接受一个表示毫秒数的整数。process 方法先做参数检查,然后用 moment 包替换时间的星期为星期日,最后格式化输出。输出的字符串长度是固定的 10 个字符长,因此可以这样创建 UDF 函数。 ```sql create function nextsunday as '/root/udf/nextsunday.py' outputtype binary(10) language 'Python'; ``` -此时测试函数,如果你是用 systemctl 启动的 taosd,肯定会遇到错误: +此时测试函数,如果你是用 systemctl 启动的 taosd,肯定会遇到错误。 ```sql taos> select ts, nextsunday(ts) from t; @@ -655,7 +655,7 @@ DB error: udf function execution failure (1.123615s) 2023-05-25 11:42:34.541 ERROR [1679419] [PyUdf::PyUdf@217] py udf load module failure. error ModuleNotFoundError: No module named 'moment' ``` -这是因为 “moment” 所在位置不在 python udf 插件默认的库搜索路径中。怎么确认这一点呢?通过以下命令搜索 taospyudf.log: +这是因为 “moment” 所在位置不在 python udf 插件默认的库搜索路径中。怎么确认这一点呢?通过以下命令搜索 taospyudf.log。 ```shell grep 'sys path' taospyudf.log | tail -1 @@ -668,7 +668,7 @@ grep 'sys path' taospyudf.log | tail -1 ``` 发现 python udf 插件默认搜索的第三方库安装路径是: /lib/python3/dist-packages,而 moment 默认安装到了 /usr/local/lib/python3.8/dist-packages。下面我们修改 python udf 插件默认的库搜索路径。 -先打开 python3 命令行,查看当前的 sys.path +先打开 python3 命令行,查看当前的 sys.path。 ```python >>> import sys @@ -676,13 +676,13 @@ grep 'sys path' taospyudf.log | tail -1 '/usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages' ``` -复制上面脚本的输出的字符串,然后编辑 /var/taos/taos.cfg 加入以下配置: +复制上面脚本的输出的字符串,然后编辑 /var/taos/taos.cfg 加入以下配置。 ```shell UdfdLdLibPath /usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages ``` -保存后执行 systemctl restart taosd, 再测试就不报错了: +保存后执行 systemctl restart taosd, 再测试就不报错了。 ```sql taos> select ts, nextsunday(ts) from t; @@ -698,7 +698,7 @@ Query OK, 4 row(s) in set (1.011474s) #### 示例五 编写一个聚合函数,计算某一列最大值和最小值的差。 -聚合函数与标量函数的区别是:标量函数是多行输入对应多个输出,聚合函数是多行输入对应一个输出。聚合函数的执行过程有点像经典的 map-reduce 框架的执行过程,框架把数据分成若干块,每个 mapper 处理一个块,reducer 再把 mapper 的结果做聚合。不一样的地方在于,对于 TDengine Python UDF 中的 reduce 函数既有 map 的功能又有 reduce 的功能。reduce 函数接受两个参数:一个是自己要处理的数据,一个是别的任务执行 reduce 函数的处理结果。如下面的示例 /root/udf/myspread.py: +聚合函数与标量函数的区别是:标量函数是多行输入对应多个输出,聚合函数是多行输入对应一个输出。聚合函数的执行过程有点像经典的 map-reduce 框架的执行过程,框架把数据分成若干块,每个 mapper 处理一个块,reducer 再把 mapper 的结果做聚合。不一样的地方在于,对于 TDengine Python UDF 中的 reduce 函数既有 map 的功能又有 reduce 的功能。reduce 函数接受两个参数:一个是自己要处理的数据,一个是别的任务执行 reduce 函数的处理结果。如下面的示例 /root/udf/myspread.py。 ```python import io @@ -747,20 +747,20 @@ def finish(buf): return max_number - min_number ``` -在这个示例中我们不光定义了一个聚合函数,还添加记录执行日志的功能,讲解如下: +在这个示例中我们不光定义了一个聚合函数,还添加记录执行日志的功能,讲解如下。 1. 
init 函数不再是空函数,而是打开了一个文件用于写执行日志 2. log 函数是记录日志的工具,自动将传入的对象转成字符串,加换行符输出 3. destroy 函数用来在执行结束关闭文件 4. start 返回了初始的 buffer,用来存聚合函数的中间结果,我们把最大值初始化为负无穷大,最小值初始化为正无穷大 5. reduce 处理每个数据块并聚合结果 6. finish 函数将最终的 buffer 转换成最终的输出 -执行下面的 SQL语句创建对应的 UDF: +执行下面的 SQL语句创建对应的 UDF。 ```sql create or replace aggregate function myspread as '/root/udf/myspread.py' outputtype double bufsize 128 language 'Python'; ``` -这个 SQL 语句与创建标量函数的 SQL 语句有两个重要区别: +这个 SQL 语句与创建标量函数的 SQL 语句有两个重要区别。 1. 增加了 aggregate 关键字 2. 增加了 bufsize 关键字,用来指定存储中间结果的内存大小,这个数值可以大于实际使用的数值。本例中间结果是两个浮点数组成的 tuple,序列化后实际占用大小只有 32 个字节,但指定的 bufsize 是128,可以用 python 命令行打印实际占用的字节数 From 8790d5c02b7f4de72249e4a83ba54bdc33c9215e Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 9 Aug 2024 17:59:01 +0800 Subject: [PATCH 093/103] doc: minor changes --- docs/zh/08-develop/09-udf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/08-develop/09-udf.md b/docs/zh/08-develop/09-udf.md index 4606e968d9..bc62dbfa3c 100644 --- a/docs/zh/08-develop/09-udf.md +++ b/docs/zh/08-develop/09-udf.md @@ -430,7 +430,7 @@ def process(block): 1. shape() 返回数据块的行数和列数 2. data(i, j) 返回 i 行 j 列的数据 标量函数的 process 方法传入的数据块有多少行,就需要返回多少行数据。上述代码忽略列数,因为只需对每行的第一个数做计算。 -接下来我建对应的 UDF 函数,在 TDengine CLI 中执行下面语句。 +接下来创建对应的 UDF 函数,在 TDengine CLI 中执行下面语句。 ```sql create function myfun as '/root/udf/myfun.py' outputtype double language 'Python' From 5e10f461ad1c8b0e3ce8556372a2bf08e2eebd44 Mon Sep 17 00:00:00 2001 From: gccgdb1234 Date: Fri, 9 Aug 2024 18:01:45 +0800 Subject: [PATCH 094/103] docs: reorganize the order of create database parameters --- .../14-reference/03-taos-sql/02-database.md | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/zh/14-reference/03-taos-sql/02-database.md b/docs/zh/14-reference/03-taos-sql/02-database.md index 8a366011ae..d2e9ba0646 100644 --- a/docs/zh/14-reference/03-taos-sql/02-database.md +++ b/docs/zh/14-reference/03-taos-sql/02-database.md @@ -13,26 +13,26 @@ database_options: database_option ... 
database_option: { - BUFFER value + VGROUPS value + | PRECISION {'ms' | 'us' | 'ns'} + | REPLICA value + | BUFFER value + | PAGES value + | PAGESIZE value | CACHEMODEL {'none' | 'last_row' | 'last_value' | 'both'} | CACHESIZE value | COMP {0 | 1 | 2} | DURATION value - | WAL_FSYNC_PERIOD value | MAXROWS value | MINROWS value | KEEP value - | PAGES value - | PAGESIZE value - | PRECISION {'ms' | 'us' | 'ns'} - | REPLICA value - | WAL_LEVEL {1 | 2} - | VGROUPS value - | SINGLE_STABLE {0 | 1} | STT_TRIGGER value + | SINGLE_STABLE {0 | 1} | TABLE_PREFIX value | TABLE_SUFFIX value | TSDB_PAGESIZE value + | WAL_LEVEL {1 | 2} + | WAL_FSYNC_PERIOD value | WAL_RETENTION_PERIOD value | WAL_RETENTION_SIZE value } @@ -40,7 +40,14 @@ database_option: { ### 参数说明 +- VGROUPS:数据库中初始 vgroup 的数目。 +- PRECISION:数据库的时间戳精度。ms 表示毫秒,us 表示微秒,ns 表示纳秒,默认 ms 毫秒。 +- REPLICA:表示数据库副本数,取值为 1、2 或 3,默认为 1; 2 仅在企业版 3.3.0.0 及以后版本中可用。在集群中使用,副本数必须小于或等于 DNODE 的数目。且使用时存在以下限制: + - 暂不支持对双副本数据库相关 Vgroup 进行 SPLITE VGROUP 或 REDISTRIBUTE VGROUP 操作 + - 单副本数据库可变更为双副本数据库,但不支持从双副本变更为其它副本数,也不支持从三副本变更为双副本 - BUFFER: 一个 VNODE 写入内存池大小,单位为 MB,默认为 256,最小为 3,最大为 16384。 +- PAGES:一个 VNODE 中元数据存储引擎的缓存页个数,默认为 256,最小 64。一个 VNODE 元数据存储占用 PAGESIZE \* PAGES,默认情况下为 1MB 内存。 +- PAGESIZE:一个 VNODE 中元数据存储引擎的页大小,单位为 KB,默认为 4 KB。范围为 1 到 16384,即 1 KB 到 16 MB。 - CACHEMODEL:表示是否在内存中缓存子表的最近数据。默认为 none。 - none:表示不缓存。 - last_row:表示缓存子表最近一行数据。这将显著改善 LAST_ROW 函数的性能表现。 @@ -53,27 +60,20 @@ database_option: { - 1:表示一阶段压缩。 - 2:表示两阶段压缩。 - DURATION:数据文件存储数据的时间跨度。可以使用加单位的表示形式,如 DURATION 100h、DURATION 10d 等,支持 m(分钟)、h(小时)和 d(天)三个单位。不加时间单位时默认单位为天,如 DURATION 50 表示 50 天。 -- WAL_FSYNC_PERIOD:当 WAL_LEVEL 参数设置为 2 时,用于设置落盘的周期。默认为 3000,单位毫秒。最小为 0,表示每次写入立即落盘;最大为 180000,即三分钟。 - MAXROWS:文件块中记录的最大条数,默认为 4096 条。 - MINROWS:文件块中记录的最小条数,默认为 100 条。 - KEEP:表示数据文件保存的天数,缺省值为 3650,取值范围 [1, 365000],且必须大于或等于3倍的 DURATION 参数值。数据库会自动删除保存时间超过 KEEP 值的数据。KEEP 可以使用加单位的表示形式,如 KEEP 100h、KEEP 10d 等,支持 m(分钟)、h(小时)和 d(天)三个单位。也可以不写单位,如 KEEP 50,此时默认单位为天。企业版支持[多级存储](https://docs.taosdata.com/tdinternal/arch/#%E5%A4%9A%E7%BA%A7%E5%AD%98%E5%82%A8)功能, 因此, 可以设置多个保存时间(多个以英文逗号分隔,最多 3 个,满足 keep 0 \<= keep 1 \<= keep 2,如 KEEP 100h,100d,3650d); 社区版不支持多级存储功能(即使配置了多个保存时间, 也不会生效, KEEP 会取最大的保存时间)。 -- PAGES:一个 VNODE 中元数据存储引擎的缓存页个数,默认为 256,最小 64。一个 VNODE 元数据存储占用 PAGESIZE \* PAGES,默认情况下为 1MB 内存。 -- PAGESIZE:一个 VNODE 中元数据存储引擎的页大小,单位为 KB,默认为 4 KB。范围为 1 到 16384,即 1 KB 到 16 MB。 -- PRECISION:数据库的时间戳精度。ms 表示毫秒,us 表示微秒,ns 表示纳秒,默认 ms 毫秒。 -- REPLICA:表示数据库副本数,取值为 1、2 或 3,默认为 1; 2 仅在企业版 3.3.0.0 及以后版本中可用。在集群中使用,副本数必须小于或等于 DNODE 的数目。且使用时存在以下限制: - - 暂不支持对双副本数据库相关 Vgroup 进行 SPLITE VGROUP 或 REDISTRIBUTE VGROUP 操作 - - 单副本数据库可变更为双副本数据库,但不支持从双副本变更为其它副本数,也不支持从三副本变更为双副本 -- WAL_LEVEL:WAL 级别,默认为 1。 - - 1:写 WAL,但不执行 fsync。 - - 2:写 WAL,而且执行 fsync。 -- VGROUPS:数据库中初始 vgroup 的数目。 +- STT_TRIGGER:表示落盘文件触发文件合并的个数。默认为 1,范围 1 到 16。对于少表高频场景,此参数建议使用默认配置,或较小的值;而对于多表低频场景,此参数建议配置较大的值。 - SINGLE_STABLE:表示此数据库中是否只可以创建一个超级表,用于超级表列非常多的情况。 - 0:表示可以创建多张超级表。 - 1:表示只可以创建一张超级表。 -- STT_TRIGGER:表示落盘文件触发文件合并的个数。默认为 1,范围 1 到 16。对于少表高频场景,此参数建议使用默认配置,或较小的值;而对于多表低频场景,此参数建议配置较大的值。 - TABLE_PREFIX:当其为正值时,在决定把一个表分配到哪个 vgroup 时要忽略表名中指定长度的前缀;当其为负值时,在决定把一个表分配到哪个 vgroup 时只使用表名中指定长度的前缀;例如,假定表名为 "v30001",当 TSDB_PREFIX = 2 时 使用 "0001" 来决定分配到哪个 vgroup ,当 TSDB_PREFIX = -2 时使用 "v3" 来决定分配到哪个 vgroup - TABLE_SUFFIX:当其为正值时,在决定把一个表分配到哪个 vgroup 时要忽略表名中指定长度的后缀;当其为负值时,在决定把一个表分配到哪个 vgroup 时只使用表名中指定长度的后缀;例如,假定表名为 "v30001",当 TSDB_SUFFIX = 2 时 使用 "v300" 来决定分配到哪个 vgroup ,当 TSDB_SUFFIX = -2 时使用 "01" 来决定分配到哪个 vgroup。 - TSDB_PAGESIZE:一个 VNODE 中时序数据存储引擎的页大小,单位为 KB,默认为 4 KB。范围为 1 到 16384,即 1 KB到 16 MB。 +- 
WAL_LEVEL:WAL 级别,默认为 1。 + - 1:写 WAL,但不执行 fsync。 + - 2:写 WAL,而且执行 fsync。 +- WAL_FSYNC_PERIOD:当 WAL_LEVEL 参数设置为 2 时,用于设置落盘的周期。默认为 3000,单位毫秒。最小为 0,表示每次写入立即落盘;最大为 180000,即三分钟。 - WAL_RETENTION_PERIOD: 为了数据订阅消费,需要WAL日志文件额外保留的最大时长策略。WAL日志清理,不受订阅客户端消费状态影响。单位为 s。默认为 3600,表示在 WAL 保留最近 3600 秒的数据,请根据数据订阅的需要修改这个参数为适当值。 - WAL_RETENTION_SIZE:为了数据订阅消费,需要WAL日志文件额外保留的最大累计大小策略。单位为 KB。默认为 0,表示累计大小无上限。 ### 创建数据库示例 From 43448bee6e21def9acd9ac4fdcbc4d8b9c58e6ad Mon Sep 17 00:00:00 2001 From: sheyanjie-qq <249478495@qq.com> Date: Fri, 9 Aug 2024 18:20:50 +0800 Subject: [PATCH 095/103] fix doc issue --- .../com/taos/example/WSConnectExample.java | 48 +++++++-------- docs/zh/08-develop/02-sql.md | 2 +- .../01-components/12-tdinsight/index.mdx | 4 +- docs/zh/14-reference/05-connector/index.md | 58 ++++++++++--------- .../20-third-party/03-visual/01-grafana.mdx | 46 ++++++++++----- examples/JDBC/connectionPools/pom.xml | 2 +- .../java/com/taosdata/example/DruidDemo.java | 2 +- .../example/pool/DruidPoolBuilder.java | 2 +- .../example/pool/HikariCpBuilder.java | 2 +- .../src/main/resources/proxool.xml | 2 +- 10 files changed, 94 insertions(+), 74 deletions(-) diff --git a/docs/examples/java/src/main/java/com/taos/example/WSConnectExample.java b/docs/examples/java/src/main/java/com/taos/example/WSConnectExample.java index f920e77037..21f184b45a 100644 --- a/docs/examples/java/src/main/java/com/taos/example/WSConnectExample.java +++ b/docs/examples/java/src/main/java/com/taos/example/WSConnectExample.java @@ -8,31 +8,33 @@ import java.sql.SQLException; import java.util.Properties; public class WSConnectExample { -// ANCHOR: main -public static void main(String[] args) throws SQLException { - // use - // String jdbcUrl = "jdbc:TAOS-RS://localhost:6041/dbName?user=root&password=taosdata&batchfetch=true"; - // if you want to connect a specified database named "dbName". - String jdbcUrl = "jdbc:TAOS-RS://localhost:6041?user=root&password=taosdata&batchfetch=true"; - Properties connProps = new Properties(); - connProps.setProperty(TSDBDriver.PROPERTY_KEY_BATCH_LOAD, "true"); - connProps.setProperty(TSDBDriver.PROPERTY_KEY_ENABLE_AUTO_RECONNECT, "true"); - connProps.setProperty(TSDBDriver.PROPERTY_KEY_CHARSET, "UTF-8"); - connProps.setProperty(TSDBDriver.PROPERTY_KEY_TIME_ZONE, "UTC-8"); + // ANCHOR: main + public static void main(String[] args) throws SQLException { + // use + // String jdbcUrl = + // "jdbc:TAOS-RS://localhost:6041/dbName?user=root&password=taosdata&batchfetch=true"; + // if you want to connect a specified database named "dbName". 
+ String jdbcUrl = "jdbc:TAOS-RS://localhost:6041?user=root&password=taosdata&batchfetch=true"; + Properties connProps = new Properties(); + connProps.setProperty(TSDBDriver.PROPERTY_KEY_ENABLE_AUTO_RECONNECT, "true"); + connProps.setProperty(TSDBDriver.PROPERTY_KEY_CHARSET, "UTF-8"); + connProps.setProperty(TSDBDriver.PROPERTY_KEY_TIME_ZONE, "UTC-8"); - try (Connection conn = DriverManager.getConnection(jdbcUrl, connProps)){ - System.out.println("Connected to " + jdbcUrl + " successfully."); + try (Connection conn = DriverManager.getConnection(jdbcUrl, connProps)) { + System.out.println("Connected to " + jdbcUrl + " successfully."); - // you can use the connection for execute SQL here + // you can use the connection for execute SQL here - } catch (SQLException ex) { - // handle any errors, please refer to the JDBC specifications for detailed exceptions info - System.out.println("Failed to connect to " + jdbcUrl + "; ErrCode:" + ex.getErrorCode() + "; ErrMessage: " + ex.getMessage()); - throw ex; - } catch (Exception ex){ - System.out.println("Failed to connect to " + jdbcUrl + "; ErrMessage: " + ex.getMessage()); - throw ex; + } catch (SQLException ex) { + // handle any errors, please refer to the JDBC specifications for detailed + // exceptions info + System.out.println("Failed to connect to " + jdbcUrl + "; ErrCode:" + ex.getErrorCode() + "; ErrMessage: " + + ex.getMessage()); + throw ex; + } catch (Exception ex) { + System.out.println("Failed to connect to " + jdbcUrl + "; ErrMessage: " + ex.getMessage()); + throw ex; + } } -} -// ANCHOR_END: main + // ANCHOR_END: main } diff --git a/docs/zh/08-develop/02-sql.md b/docs/zh/08-develop/02-sql.md index 3bebe5e7a4..3cad007078 100644 --- a/docs/zh/08-develop/02-sql.md +++ b/docs/zh/08-develop/02-sql.md @@ -90,7 +90,7 @@ curl --location -uroot:taosdata 'http://127.0.0.1:6041/rest/sql/power' \ -> **注意**:建议采用 `.` 的格式构造SQL语句,不推荐在应用中采用 `USE DBName`方式访问。 +> **注意**:建议采用 `.` 的格式构造SQL语句,不推荐在应用中采用 `USE DBName` 方式访问。 ## 插入数据 下面以智能电表为例,展示如何使用连接器执行 SQL 来插入数据到 `power` 数据库的 `meters` 超级表。样例使用 TDengine 自动建表 SQL 语法,写入 d1001 子表中 3 条数据,写入 d1002 子表中 1 条数据,然后打印出实际插入数据条数。 diff --git a/docs/zh/14-reference/01-components/12-tdinsight/index.mdx b/docs/zh/14-reference/01-components/12-tdinsight/index.mdx index 2d4b69f46b..34a06e13c4 100644 --- a/docs/zh/14-reference/01-components/12-tdinsight/index.mdx +++ b/docs/zh/14-reference/01-components/12-tdinsight/index.mdx @@ -103,13 +103,13 @@ chmod +x TDinsight.sh 点击 `Save & Test` 进行测试,成功会提示:`TDengine Data source is working`。 -## 导入 TDengine V3 仪表盘 +## 导入 TDinsightV3 仪表盘 在配置 TDengine 数据源界面,点击 “Dashboards” tab,再点击 ”import” 导入 ”TDengine for 3.x” 仪表盘。 导入成功后可以进入这个 dashboard,在左上角 ”Log from“ 选项中选择 taosKeeper 中设置的记录监控指标的数据库就可以看到监控结果。 -## TDengine V3 仪表盘详情 +## TDinsightV3 仪表盘详情 TDinsight 仪表盘旨在提供 TDengine 相关资源的使用情况和状态,比如 dnodes、 mnodes、 vnodes 和数据库等。 主要分为集群状态、DNodes 概述、MNode 概述、请求、数据库、DNode 资源使用情况和 taosAdapter 监控信息。下面我们分别详细介绍。 diff --git a/docs/zh/14-reference/05-connector/index.md b/docs/zh/14-reference/05-connector/index.md index bc63bdff93..f9e1bd837d 100644 --- a/docs/zh/14-reference/05-connector/index.md +++ b/docs/zh/14-reference/05-connector/index.md @@ -13,7 +13,7 @@ TDengine 提供了丰富的应用程序开发接口,为了便于用户快速 目前 TDengine 的原生接口连接器可支持的平台包括:X64/ARM64 等硬件平台,以及 Linux/Win64 等开发环境。对照矩阵如下: | **CPU** | **OS** | **Java** | **Python** | **Go** | **Node.js** | **C#** | **Rust** | C/C++ | -|---------------|-----------|----------|------------|--------|-------------|--------|----------|-------| +| ------------- | --------- | -------- | ---------- | 
------ | ----------- | ------ | -------- | ----- | | **X86 64bit** | **Linux** | ● | ● | ● | ● | ● | ● | ● | | **X86 64bit** | **Win64** | ● | ● | ● | ● | ● | ● | ● | | **X86 64bit** | **macOS** | ● | ● | ● | ○ | ○ | ● | ● | @@ -28,14 +28,14 @@ TDengine 提供了丰富的应用程序开发接口,为了便于用户快速 TDengine 版本更新往往会增加新的功能特性,列表中的连接器版本为连接器最佳适配版本。 -| **TDengine 版本** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | -|------------------------|----------|--------------------------------------|------------|---------------|-----------------|----------| -| **3.3.0.0 及以上** | 3.3.0及以上 | taospy 2.7.15及以上,taos-ws-py 0.3.2及以上 | 3.5.5及以上 | 3.1.3及以上 | 3.1.0及以上 | 当前版本 | -| **3.0.0.0 及以上** | 3.0.2以上 | 当前版本 | 3.0 分支 | 3.0.0 | 3.1.0 | 当前版本 | -| **2.4.0.14 及以上** | 2.0.38 | 当前版本 | develop 分支 | 1.0.2 - 1.0.6 | 2.0.10 - 2.0.12 | 当前版本 | -| **2.4.0.4 - 2.4.0.13** | 2.0.37 | 当前版本 | develop 分支 | 1.0.2 - 1.0.6 | 2.0.10 - 2.0.12 | 当前版本 | -| **2.2.x.x ** | 2.0.36 | 当前版本 | master 分支 | n/a | 2.0.7 - 2.0.9 | 当前版本 | -| **2.0.x.x ** | 2.0.34 | 当前版本 | master 分支 | n/a | 2.0.1 - 2.0.6 | 当前版本 | +| **TDengine 版本** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | +| ---------------------- | ----------- | ------------------------------------------- | ------------ | ------------- | --------------- | -------- | +| **3.3.0.0 及以上** | 3.3.0及以上 | taospy 2.7.15及以上,taos-ws-py 0.3.2及以上 | 3.5.5及以上 | 3.1.3及以上 | 3.1.0及以上 | 当前版本 | +| **3.0.0.0 及以上** | 3.0.2以上 | 当前版本 | 3.0 分支 | 3.0.0 | 3.1.0 | 当前版本 | +| **2.4.0.14 及以上** | 2.0.38 | 当前版本 | develop 分支 | 1.0.2 - 1.0.6 | 2.0.10 - 2.0.12 | 当前版本 | +| **2.4.0.4 - 2.4.0.13** | 2.0.37 | 当前版本 | develop 分支 | 1.0.2 - 1.0.6 | 2.0.10 - 2.0.12 | 当前版本 | +| **2.2.x.x ** | 2.0.36 | 当前版本 | master 分支 | n/a | 2.0.7 - 2.0.9 | 当前版本 | +| **2.0.x.x ** | 2.0.34 | 当前版本 | master 分支 | n/a | 2.0.1 - 2.0.6 | 当前版本 | ## 功能特性 @@ -43,32 +43,36 @@ TDengine 版本更新往往会增加新的功能特性,列表中的连接器 ### 使用原生接口(taosc) -| **功能特性** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | -|----------------|----------|------------|--------|--------|-------------|----------| -| **连接管理** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | -| **普通查询** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | -| **参数绑定** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | -| **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | -| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 不支持 | 支持 | +| **功能特性** | **Java** | **Python** | **Go** | **C#** | **Rust** | +| ------------------- | -------- | ---------- | ------ | ------ | -------- | +| **连接管理** | 支持 | 支持 | 支持 | 支持 | 支持 | +| **普通查询** | 支持 | 支持 | 支持 | 支持 | 支持 | +| **参数绑定** | 支持 | 支持 | 支持 | 支持 | 支持 | +| **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 支持 | +| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 支持 | :::info 由于不同编程语言数据库框架规范不同,并不意味着所有 C/C++ 接口都需要对应封装支持。 ::: -### 使用 http (REST 或 WebSocket) 接口 +### 使用 http REST 接口 -| **功能特性** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | -|------------------------|----------|------------|--------|--------|-------------|----------| -| **连接管理** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **普通查询** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **参数绑定** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **批量拉取(基于 WebSocket)** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **功能特性** | **Java** | **Python** | **Go** | **C#** | +| ------------ | -------- | ---------- | ------ | ------ | +| **连接管理** | 支持 | 支持 | 支持 | 支持 | +| **普通查询** | 支持 | 支持 | 支持 | 支持 | + +### 使用 Websocket 接口 + +| **功能特性** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | +| ------------------- | 
-------- | ---------- | ------ | ------ | ----------- | -------- | +| **连接管理** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **普通查询** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **参数绑定** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | :::warning - -- 参数绑定、数据订阅、Schemaless 和批量拉取功能仅在 WebSocket 连接下支持。 - 无论选用何种编程语言的连接器,2.0 及以上版本的 TDengine 推荐数据库应用的每个线程都建立一个独立的连接,或基于线程建立连接池,以避免连接内的“USE statement”状态量在线程之间相互干扰(但连接的查询和写入操作都是线程安全的)。 ::: diff --git a/docs/zh/20-third-party/03-visual/01-grafana.mdx b/docs/zh/20-third-party/03-visual/01-grafana.mdx index 860087b901..93f0cf6eaa 100644 --- a/docs/zh/20-third-party/03-visual/01-grafana.mdx +++ b/docs/zh/20-third-party/03-visual/01-grafana.mdx @@ -7,13 +7,18 @@ toc_max_heading_level: 4 import Tabs from "@theme/Tabs"; import TabItem from "@theme/TabItem"; -TDengine 能够与开源数据可视化系统 [Grafana](https://www.grafana.com/) 快速集成搭建数据监测报警系统,整个过程无需任何代码开发,TDengine 中数据表的内容可以在仪表盘(DashBoard)上进行可视化展现。关于 TDengine 插件的使用您可以在 [GitHub](https://github.com/taosdata/grafanaplugin/blob/master/README.md) 中了解更多。 +## 概述 +本文档介绍如何将 TDengine 数据源与开源数据可视化系统 [Grafana](https://www.grafana.com/) 集成,以实现数据的可视化和监测报警系统的搭建。通过 TDengine 插件,您可以轻松地将 TDengine 数据表的数据展示在 Grafana 仪表盘上,且无需进行复杂的开发工作。 + +## Grafana 版本要求 +当前 TDengine 支持 Grafana 7.5 及以上版本,建议使用最新版本。请根据您的系统环境下载并安装对应版本的 Grafana。 + ## 前置条件 要让 Grafana 能正常添加 TDengine 数据源,需要以下几方面的准备工作。 -- Grafana 服务已经部署并正常运行。目前 TDengine 支持 Grafana 7.5 以上的版本。用户可以根据当前的操作系统,到 Grafana 官网下载安装包,并执行安装。下载地址如下:[https://grafana.com/grafana/download](https://grafana.com/grafana/download) 。 +- Grafana 服务已经部署并正常运行。 :::info **注意**:要确保启动 Grafana 的账号有其安装目录的写权限,否则可能后面无法安装插件。 @@ -177,7 +182,15 @@ docker run -d \ ::: -## 内置变量和自定义变量 +## Dashboard 使用指南 + +本节内容按如下方式组织: +1. 介绍基础知识,包括 Grafana 的内置变量和自定义变量,TDengine 对与时序查询的特色语法支持。 +2. 介绍如何使用 TDengine 数据源在 Grafana 中创建 Dashboard,然后给出使用时序查询的特色语法和如何分组展示数据。 +3. 由于配置的 Dashbord 在页面显示时会定时查询 TDengine 来刷新显示,如果 SQL 编写不当会导致严重的性能问题,我们给出了性能优化建议。 +4. 最后我们以 TDengine 监控面板 TDinsight 为例介绍了如何导入我们提供的 DashBoard。 + +### Grafana 内置变量和自定义变量 Grafana 中的 Variable(变量)功能非常强大,可以在 Dashboard 的查询、面板标题、标签等地方使用,用来创建更加动态和交互式的 Dashbord,提高用户体验和效率。 @@ -191,7 +204,7 @@ Grafana 中的 Variable(变量)功能非常强大,可以在 Dashboard 的 Grafana 提供了内置变量和自定义变量,它们都可以可以在编写 SQL 时引用,引用的方式是 `$variableName`,`variableName` 是变量的名字,其他引用方式请参考 [引用方式](https://grafana.com/docs/grafana/latest/dashboards/variables/variable-syntax/)。 -### 内置变量 +#### 内置变量 Grafana 内置了 `from`、`to` 和 `interval` 等变量,都取自于 Grafana 插件面板。其含义如下: - `from` 查询范围的起始时间 @@ -202,13 +215,13 @@ Grafana 内置了 `from`、`to` 和 `interval` 等变量,都取自于 Grafana 除了上述三个常用变量,Grafana 还提供了如 `__timezone`, `__org`, `__user` 等变量,详情请参考 [内置变量](https://grafana.com/docs/grafana/latest/dashboards/variables/add-template-variables/#global-variables)。 -### 自定义变量 +#### 自定义变量 我们可以在 Dashbord 中增加自定义变量。自定义变量和内置变量的使用方式没有区别,都是在 SQL 中用 `$variableName` 进行引用。 自定义变量支持多种类型,常见的类型包括 `Query`(查询)、`Constant`(常量)、`Interval`(间隔)、`Data source`(数据源)等。 自定义变量可以引用其他自定义变量,比如一个变量表示区域,另一个变量可以引用区域的值,来查询这个区域的设备。 -#### 添加查询类型变量 +##### 添加查询类型变量 在 Dashbord 的配置中,选择 【Variables】,然后点击 【New variable】: 1. 在 “Name” 字段中,输入你的变量名,此处我们设置变量名为 `selected_groups`。 @@ -221,7 +234,7 @@ Grafana 内置了 `from`、`to` 和 `interval` 等变量,都取自于 Grafana 我们还可以再新增自定义变量来引用这个 `selected_groups` 变量,比如我们新增一个名为 `tbname_max_current` 的查询变量,其 SQL 为 `select tbname from power.meters where groupid = $selected_groups and ts > $from and ts < $to;` -#### 添加间隔类型变量 +##### 添加间隔类型变量 我们可以自定义时间窗口间隔,可以更加贴合业务需求。 1. 
在 “Name” 字段中,输入变量名为 `interval`。 @@ -237,7 +250,7 @@ Grafana 内置了 `from`、`to` 和 `interval` 等变量,都取自于 Grafana ::: -## TDengine 时间序列查询支持 +### TDengine 时序查询支持 TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序业务场景需求的特色查询语法,这些语法能够为时序场景的应用的开发带来极大的便利。 - `partition by` 子句可以按一定的维度对数据进行切分,然后在切分出的数据空间内再进行一系列的计算。绝大多数情况可以替代 `group by`。 @@ -247,15 +260,16 @@ TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序 上述特性详细介绍可以参考 [特色查询](../../../reference/taos-sql/distinguished/)。 -## 创建 Dashboard +### 创建 Dashboard -回到主界面创建 Dashboard,点击【Add Query】进入面板查询页面: +有了前面的基础知识,我们可以配置基于 TDengine 数据源的时间序列数据展示 Dashbord。 +在 Grafana 主界面创建 Dashboard,点击【Add Query】进入面板查询页面: ![TDengine Database Grafana plugin create dashboard](./create_dashboard1.webp) 如上图所示,在 “Query” 中选中 `TDengine` 数据源,在下方查询框可输入相应 SQL 进行查询。 我们继续用智能电表来举例,为了展示曲线美观,此处**用了虚拟数据**。 -## 时间序列数据展示 +#### 时间序列数据展示 假设我们想查询一段时间内的平均电流大小,时间窗口按 `$interval` 切分,若某一时间窗口区间数据缺失,填充 null。 - “INPUT SQL”:输入要查询的语句(该 SQL 语句的结果集应为两列多行),此处输入:`select _wstart as ts, avg(current) as current from power.meters where groupid in ($selected_groups) and ts > $from and ts < $to interval($interval) fill(null)` ,其中,from、to 和 interval 为 Grafana 内置变量,selected_groups 为自定义变量。 @@ -272,7 +286,7 @@ TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序 ::: -## 时间序列数据分组展示 +#### 时间序列数据分组展示 假设我们想查询一段时间内的平均电流大小,按 `groupid` 分组展示,我们可以修改之前的 SQL 为 `select _wstart as ts, groupid, avg(current) as current from power.meters where ts > $from and ts < $to partition by groupid interval($interval) fill(null)` @@ -285,12 +299,12 @@ TDengine 在支持标准 SQL 的基础之上,还提供了一系列满足时序 > 关于如何使用 Grafana 创建相应的监测界面以及更多有关使用 Grafana 的信息,请参考 Grafana 官方的[文档](https://grafana.com/docs/)。 -## 性能建议 +### 性能优化建议 - **所有查询加上时间范围**,在时序数据库中,如果不加查询的时间范围,会扫表导致性能低下。常见的 SQL 写法是 `select column_name from db.table where ts > $from and ts < $to;` - 对于最新状态类型的查询,我们一般建议在**创建数据库的时候打开缓存**(`CACHEMODEL` 设置为 last_row 或者 both),常见的 SQL 写法是 `select last(column_name) from db.table where ts > $from and ts < $to;` -## 导入 Dashboard +### 导入 Dashboard 在数据源配置页面,您可以为该数据源导入 TDinsight 面板,作为 TDengine 集群的监控可视化工具。如果 TDengine 服务端为 3.0 版本请选择 `TDinsight for 3.x` 导入。注意 TDinsight for 3.x 需要运行和配置 taoskeeper。 @@ -317,7 +331,7 @@ TDengine Grafana 插件支持告警,如果要配置告警,需要以下几个 3.4 配置标签和告警通道 3.5 配置通知文案 -### 告警配置界面 +### 告警配置界面介绍 在Grafana 11 告警界面一共有 6 个 Tab,分别是 “Alert rules”、“Contact points”、“Notification policies”、“Silences”、 “Groups” 和 “Settings”。 - “Alert rules” 告警规则列表,用于展示和配置告警规则 @@ -329,7 +343,7 @@ TDengine Grafana 插件支持告警,如果要配置告警,需要以下几个 ### 配置联络点 -以邮件和飞书为例配置联络点。 +本节以邮件和飞书为例配置联络点。 #### 配置邮件联络点 diff --git a/examples/JDBC/connectionPools/pom.xml b/examples/JDBC/connectionPools/pom.xml index 61717cf112..855d531f4c 100644 --- a/examples/JDBC/connectionPools/pom.xml +++ b/examples/JDBC/connectionPools/pom.xml @@ -18,7 +18,7 @@ com.taosdata.jdbc taos-jdbcdriver - 3.0.0 + 3.3.0 diff --git a/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/DruidDemo.java b/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/DruidDemo.java index c7df4e0dec..dd96f5c632 100644 --- a/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/DruidDemo.java +++ b/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/DruidDemo.java @@ -22,7 +22,7 @@ public static void main(String[] args) throws Exception { dataSource.setMinIdle(10); dataSource.setMaxActive(10); dataSource.setMaxWait(30000); - dataSource.setValidationQuery("SELECT SERVER_STATUS()"); + dataSource.setValidationQuery("SELECT SERVER_VERSION()"); Connection connection = dataSource.getConnection(); // get connection Statement statement = connection.createStatement(); // get 
statement diff --git a/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/pool/DruidPoolBuilder.java b/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/pool/DruidPoolBuilder.java index 500f0e9e97..a4581ac4b6 100644 --- a/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/pool/DruidPoolBuilder.java +++ b/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/pool/DruidPoolBuilder.java @@ -20,7 +20,7 @@ public class DruidPoolBuilder { dataSource.setMinIdle(poolSize); dataSource.setMaxActive(poolSize); dataSource.setMaxWait(30000); - dataSource.setValidationQuery("select server_status()"); + dataSource.setValidationQuery("select SERVER_VERSION()"); return dataSource; } diff --git a/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/pool/HikariCpBuilder.java b/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/pool/HikariCpBuilder.java index 7e151de3e0..ffd43cc3ea 100644 --- a/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/pool/HikariCpBuilder.java +++ b/examples/JDBC/connectionPools/src/main/java/com/taosdata/example/pool/HikariCpBuilder.java @@ -20,7 +20,7 @@ public class HikariCpBuilder { config.setConnectionTimeout(30000); //maximum wait milliseconds for get connection from pool config.setMaxLifetime(0); // maximum life time for each connection config.setIdleTimeout(0); // max idle time for recycle idle connection - config.setConnectionTestQuery("select server_status()"); //validation query + config.setConnectionTestQuery("select SERVER_VERSION()"); //validation query HikariDataSource ds = new HikariDataSource(config); return ds; diff --git a/examples/JDBC/connectionPools/src/main/resources/proxool.xml b/examples/JDBC/connectionPools/src/main/resources/proxool.xml index 67baa1c393..0e2ac6368a 100644 --- a/examples/JDBC/connectionPools/src/main/resources/proxool.xml +++ b/examples/JDBC/connectionPools/src/main/resources/proxool.xml @@ -22,6 +22,6 @@ 30000 - select server_status() + select server_version() \ No newline at end of file From 8b8e217130e545aa31029419431b98956fac3143 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 9 Aug 2024 19:14:13 +0800 Subject: [PATCH 096/103] doc: update udf --- docs/zh/08-develop/09-udf.md | 54 +++++++++++++++--------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/docs/zh/08-develop/09-udf.md b/docs/zh/08-develop/09-udf.md index bc62dbfa3c..b16700b460 100644 --- a/docs/zh/08-develop/09-udf.md +++ b/docs/zh/08-develop/09-udf.md @@ -429,7 +429,9 @@ def process(block): 这个文件包含 3 个函数, init 和 destroy 都是空函数,它们是 UDF 的生命周期函数,即使什么都不做也要定义。最关键的是 process 函数, 它接受一个数据块,这个数据块对象有两个方法。 1. shape() 返回数据块的行数和列数 2. data(i, j) 返回 i 行 j 列的数据 -标量函数的 process 方法传入的数据块有多少行,就需要返回多少行数据。上述代码忽略列数,因为只需对每行的第一个数做计算。 + +标量函数的 process 方法传入的数据块有多少行,就需要返回多少行数据。上述代码忽略列数,因为只需对每行的第一列做计算。 + 接下来创建对应的 UDF 函数,在 TDengine CLI 中执行下面语句。 ```sql @@ -442,7 +444,7 @@ taos> create function myfun as '/root/udf/myfun.py' outputtype double language ' Create OK, 0 row(s) affected (0.005202s) ``` -看起来很顺利,接下来 show 一下系统中所有的自定义函数,确认创建成功。 +看起来很顺利,接下来查看系统中所有的自定义函数,确认创建成功。 ```text taos> show functions; @@ -452,7 +454,7 @@ taos> show functions; Query OK, 1 row(s) in set (0.005767s) ``` -接下来就来测试一下这个函数,测试之前先执行下面的 SQL 命令,制造些测试数据,在 TDengine CLI 中执行下述命令。 +生成测试数据,可以在 TDengine CLI 中执行下述命令。 ```sql create database test; @@ -470,8 +472,7 @@ taos> select myfun(v1, v2) from t; DB error: udf function execution failure (0.011088s) ``` -不幸的是执行失败了,什么原因呢? 
-查看 udfd 进程的日志。 +不幸的是执行失败了,什么原因呢?查看 udfd 进程的日志。 ```shell tail -10 /var/log/taos/udfd.log @@ -514,8 +515,7 @@ taos> select myfun(v1, v2) from t; 2.302585093 | ``` -2. 没有处理 null 值。我们期望如果输入有 null,则会抛异常终止执行。 -因此 process 函数改进如下。 +2. 没有处理 null 值。我们期望如果输入有 null,则会抛异常终止执行。因此 process 函数改进如下。 ```python def process(block): @@ -525,7 +525,7 @@ def process(block): return [ None if block.data(i, 0) is None else log(block.data(i, 0) ** 2 + 1) for i in range(rows)] ``` -然后执行下面的语句更新已有的 UDF。 +执行如下语句更新已有的 UDF。 ```sql create or replace function myfun as '/root/udf/myfun.py' outputtype double language 'Python'; @@ -539,7 +539,7 @@ taos> select myfun(v1, v2) from t; DB error: udf function execution failure (0.014643s) ``` -但遗憾的是我们自定义的异常信息没有展示给用户,而是在插件的日志文件 /var/log/taos/taospyudf.log 中。 +自定义的异常信息打印在插件的日志文件 /var/log/taos/taospyudf.log 中。 ```text 2023-05-24 23:21:06.790 ERROR [1666188] [doPyUdfScalarProc@507] call pyUdfScalar proc function. context 0x7faade26d180. error: Exception: require 1 parameter but given 2 @@ -554,18 +554,17 @@ At: #### 示例三 -编写一个 UDF:输入(x1, x2, ..., xn), 输出每个值和它们的序号的乘积的和: 1 * x1 + 2 * x2 + ... + n * xn。如果 x1 至 xn 中包含 null,则结果为 null。 -这个示例与示例一的区别是,可以接受任意多列作为输入,且要处理每一列的值。编写 UDF 文件 /root/udf/nsum.py。 +输入(x1, x2, ..., xn), 输出每个值和它们的序号的乘积的和:1 * x1 + 2 * x2 + ... + n * xn。如果 x1 至 xn 中包含 null,则结果为 null。 + +本例与示例一的区别是,可以接受任意多列作为输入,且要处理每一列的值。编写 UDF 文件 /root/udf/nsum.py。 ```python def init(): pass - def destroy(): pass - def process(block): rows, cols = block.shape() result = [] @@ -617,11 +616,9 @@ pip3 install moment ```python import moment - def init(): pass - def destroy(): pass @@ -651,7 +648,7 @@ DB error: udf function execution failure (1.123615s) ``` ```shell - tail -20 taospyudf.log +tail -20 taospyudf.log 2023-05-25 11:42:34.541 ERROR [1679419] [PyUdf::PyUdf@217] py udf load module failure. error ModuleNotFoundError: No module named 'moment' ``` @@ -707,26 +704,21 @@ import pickle LOG_FILE: io.TextIOBase = None - def init(): global LOG_FILE LOG_FILE = open("/var/log/taos/spread.log", "wt") log("init function myspead success") - def log(o): LOG_FILE.write(str(o) + '\n') - def destroy(): log("close log file: spread.log") LOG_FILE.close() - def start(): return pickle.dumps((-math.inf, math.inf)) - def reduce(block, buf): max_number, min_number = pickle.loads(buf) log(f"initial max_number={max_number}, min_number={min_number}") @@ -741,20 +733,20 @@ def reduce(block, buf): min_number = v return pickle.dumps((max_number, min_number)) - def finish(buf): max_number, min_number = pickle.loads(buf) return max_number - min_number ``` -在这个示例中我们不光定义了一个聚合函数,还添加记录执行日志的功能,讲解如下。 -1. init 函数不再是空函数,而是打开了一个文件用于写执行日志 -2. log 函数是记录日志的工具,自动将传入的对象转成字符串,加换行符输出 -3. destroy 函数用来在执行结束关闭文件 -4. start 返回了初始的 buffer,用来存聚合函数的中间结果,我们把最大值初始化为负无穷大,最小值初始化为正无穷大 -5. reduce 处理每个数据块并聚合结果 -6. finish 函数将最终的 buffer 转换成最终的输出 -执行下面的 SQL语句创建对应的 UDF。 +在这个示例中,我们不但定义了一个聚合函数,还增加了记录执行日志的功能。 +1. init 函数打开一个文件用于记录日志 +2. log 函数记录日志,自动将传入的对象转成字符串,加换行符输出 +3. destroy 函数在执行结束后关闭日志文件 +4. start 函数返回初始的 buffer,用来存聚合函数的中间结果,把最大值初始化为负无穷大,最小值初始化为正无穷大 +5. reduce 函数处理每个数据块并聚合结果 +6. 
finish 函数将 buffer 转换成最终的输出 + +执行下面 SQL 语句创建对应的 UDF。 ```sql create or replace aggregate function myspread as '/root/udf/myspread.py' outputtype double bufsize 128 language 'Python'; @@ -785,7 +777,7 @@ taos> select spread(v1) from t; Query OK, 1 row(s) in set (0.005501s) ``` -最后,查看我们自己打印的执行日志,从日志可以看出,reduce 函数被执行了 3 次。执行过程中 max 值被更新了 4 次, min 值只被更新 1 次。 +最后,查看执行日志,可以看到 reduce 函数被执行了 3 次,执行过程中 max 值被更新了 4 次,min 值只被更新 1 次。 ```shell root@slave11 /var/log/taos $ cat spread.log From 809d54599f4311a1e8ddf80f2580fe8bb74a75dd Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 9 Aug 2024 19:27:28 +0800 Subject: [PATCH 097/103] doc: minor changes --- docs/zh/08-develop/09-udf.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/zh/08-develop/09-udf.md b/docs/zh/08-develop/09-udf.md index b16700b460..fab59cc2eb 100644 --- a/docs/zh/08-develop/09-udf.md +++ b/docs/zh/08-develop/09-udf.md @@ -846,7 +846,7 @@ pycumsum 使用 numpy 计算输入列所有数据的累积和。 创建标量函数的 SQL 语法如下。 ```sql -CREATE OR REPLACE FUNCTION function_name AS library_path OUTPUTTYPE output_type LANGUAGE 'Python'; +CREATE [OR REPLACE] FUNCTION function_name AS library_path OUTPUTTYPE output_type LANGUAGE 'Python'; ``` 各参数说明如下。 - or replace:如果函数已经存在,则会修改已有的函数属性。 @@ -859,8 +859,9 @@ CREATE OR REPLACE FUNCTION function_name AS library_path OUTPUTTYPE output_type ### 创建聚合函数 创建聚合函数的 SQL 语法如下。 + ```sql -CREATE OR REPLACE AGGREGATE FUNCTION function_name library_path OUTPUTTYPE output_type LANGUAGE 'Python'; +CREATE [OR REPLACE] AGGREGATE FUNCTION function_name library_path OUTPUTTYPE output_type BUFSIZE 其中,buffer_size LANGUAGE 'Python'; ``` 其中,buffer_size 表示中间计算结果的缓冲区大小,单位是字节。其他参数的含义与标量函数相同。 @@ -872,7 +873,7 @@ CREATE AGGREGATE FUNCTION l2norm AS "/home/taos/udf_example/libl2norm.so" OUTPUT ### 删除 UDF -删除指定名称的 UDF 的 SQL 语法如下: +删除指定名称的 UDF 的 SQL 语法如下。 ```sql DROP FUNCTION function_name; ``` From 3300877339e2ad9b28e5426cefab8881b343c465 Mon Sep 17 00:00:00 2001 From: sheyanjie-qq <249478495@qq.com> Date: Fri, 9 Aug 2024 19:28:18 +0800 Subject: [PATCH 098/103] improve tdinsight doc --- .../01-components/12-tdinsight/index.mdx | 72 +------------------ 1 file changed, 2 insertions(+), 70 deletions(-) diff --git a/docs/zh/14-reference/01-components/12-tdinsight/index.mdx b/docs/zh/14-reference/01-components/12-tdinsight/index.mdx index 34a06e13c4..e5f44a1080 100644 --- a/docs/zh/14-reference/01-components/12-tdinsight/index.mdx +++ b/docs/zh/14-reference/01-components/12-tdinsight/index.mdx @@ -32,76 +32,8 @@ TDengine 通过 taosKeeper 将服务器的 CPU、内存、硬盘空间、带宽 -## 安装 TDengine 数据源插件 - -TDengine 数据源插件支持图形界面安装、手动安装和脚本安装三种安装方式,一般建议图形界面安装。对于 Grafana 8.5 以下版本可以使用手动安装和脚本安装方式。 - - - - -使用 Grafana 最新版本(8.5+),您可以在 Grafana 中[浏览和管理插件](https://grafana.com/docs/grafana/next/administration/plugin-management/#plugin-catalog)。在 Grafana 管理界面中的 **Configurations > Plugins** 页面直接搜索 `TDengine` 并按照提示安装。 - - - - -从 GitHub 安装 TDengine 最新版数据源插件。 - -```bash -get_latest_release() { - curl --silent "https://api.github.com/repos/taosdata/grafanaplugin/releases/latest" | - grep '"tag_name":' | - sed -E 's/.*"v([^"]+)".*/\1/' -} -TDENGINE_PLUGIN_VERSION=$(get_latest_release) -sudo grafana-cli \ - --pluginUrl https://github.com/taosdata/grafanaplugin/releases/download/v$TDENGINE_PLUGIN_VERSION/tdengine-datasource-$TDENGINE_PLUGIN_VERSION.zip \ - plugins install tdengine-datasource -``` - -:::note -3.1.6 和更早版本插件需要在配置文件 `/etc/grafana/grafana.ini` 中添加如下设置,以启用未签名插件。 - -```ini -[plugins] -allow_loading_unsigned_plugins = tdengine-datasource -``` - -::: - - - - - 
-我们提供了一个自动化安装脚本 [TDinsight.sh](https://github.com/taosdata/grafanaplugin/releases/latest/download/TDinsight.sh) 脚本以便用户快速进行安装配置。 - -您可以通过 `wget` 或其他工具下载该脚本: - -```bash -wget https://github.com/taosdata/grafanaplugin/releases/latest/download/TDinsight.sh -chmod +x TDinsight.sh -./TDinsight.sh -``` - -这个脚本会自动下载最新的[Grafana TDengine 数据源插件](https://github.com/taosdata/grafanaplugin/releases/latest) 和 [TDinsight 仪表盘](https://github.com/taosdata/grafanaplugin/blob/master/dashboards/TDinsightV3.json) ,将命令行选项中的可配置参数转为 [Grafana Provisioning](https://grafana.com/docs/grafana/latest/administration/provisioning/) 配置文件,以进行自动化部署及更新等操作。 - -1. 假设您在同一台主机上使用 TDengine 和 Grafana 服务。 运行 `./TDinsight.sh` 并打开 Grafana 页面就可以看到 TDinsight 仪表盘了。 -2. 假设您在主机 `tdengine` 上启动 TDengine 数据库,taosAdapter 的 HTTP 监听端口为 `6041`,用户为 `root1`,密码为 `pass5ord`。执行脚本:`./TDinsight.sh -a http://tdengine:6041 -u root1 -p pass5ord` - -详细的使用方法请参考 [TDinsight.sh 详细说明](./#附录) - - - - - -## 添加 TDengine 数据源 - -安装完毕后, 点击 “Connections” -> “Data sources“, 然后选择 ”tdengine-datasource“,输入 TDengine 相关配置: -- Host: TDengine 集群中提供 REST 服务的 IP 地址与端口号,默认 `http://localhost:6041` -- User:TDengine 用户名。 -- Password:TDengine 用户密码。 - -点击 `Save & Test` 进行测试,成功会提示:`TDengine Data source is working`。 - +## 安装 TDengine 数据源插件和配置数据源 +安装 Grafana TDengine 数据源插件和配置数据源的步骤请参考:[与 Grafana 集成](../../../third-party/visual/grafana/#安装-grafana-plugin-并配置数据源) ## 导入 TDinsightV3 仪表盘 From fc90d17248bba050e98f0d9b0d7590ee40a7f807 Mon Sep 17 00:00:00 2001 From: Shengliang Guan Date: Fri, 9 Aug 2024 19:37:39 +0800 Subject: [PATCH 099/103] docs: minor changes --- docs/zh/08-develop/09-udf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/08-develop/09-udf.md b/docs/zh/08-develop/09-udf.md index fab59cc2eb..72291b65e6 100644 --- a/docs/zh/08-develop/09-udf.md +++ b/docs/zh/08-develop/09-udf.md @@ -861,7 +861,7 @@ CREATE [OR REPLACE] FUNCTION function_name AS library_path OUTPUTTYPE output_typ 创建聚合函数的 SQL 语法如下。 ```sql -CREATE [OR REPLACE] AGGREGATE FUNCTION function_name library_path OUTPUTTYPE output_type BUFSIZE 其中,buffer_size LANGUAGE 'Python'; +CREATE [OR REPLACE] AGGREGATE FUNCTION function_name library_path OUTPUTTYPE output_type BUFSIZE buffer_size LANGUAGE 'Python'; ``` 其中,buffer_size 表示中间计算结果的缓冲区大小,单位是字节。其他参数的含义与标量函数相同。 From 2f1f8605d9ed7d50f1acf0310d66bc6aff40ee93 Mon Sep 17 00:00:00 2001 From: sheyanjie-qq <249478495@qq.com> Date: Sat, 10 Aug 2024 10:39:22 +0800 Subject: [PATCH 100/103] mod validation sql in jdbc demo properties --- examples/JDBC/springbootdemo/readme.md | 2 +- .../springbootdemo/src/main/resources/application.properties | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/JDBC/springbootdemo/readme.md b/examples/JDBC/springbootdemo/readme.md index a89e21c009..625d43e4ed 100644 --- a/examples/JDBC/springbootdemo/readme.md +++ b/examples/JDBC/springbootdemo/readme.md @@ -22,7 +22,7 @@ spring.datasource.druid.max-active=5 # max wait time for get connection, ms spring.datasource.druid.max-wait=60000 -spring.datasource.druid.validation-query=select server_status(); +spring.datasource.druid.validation-query=select SERVER_VERSION(); spring.datasource.druid.validation-query-timeout=5000 spring.datasource.druid.test-on-borrow=false spring.datasource.druid.test-on-return=false diff --git a/examples/JDBC/springbootdemo/src/main/resources/application.properties b/examples/JDBC/springbootdemo/src/main/resources/application.properties index c523952fb6..00a06a5098 100644 --- 
a/examples/JDBC/springbootdemo/src/main/resources/application.properties +++ b/examples/JDBC/springbootdemo/src/main/resources/application.properties @@ -12,7 +12,7 @@ spring.datasource.druid.initial-size=5 spring.datasource.druid.min-idle=5 spring.datasource.druid.max-active=5 spring.datasource.druid.max-wait=30000 -spring.datasource.druid.validation-query=select server_status(); +spring.datasource.druid.validation-query=select SERVER_VERSION(); spring.aop.auto=true spring.aop.proxy-target-class=true #mybatis From 0ea143d319c03c7db57f52eab84f78ae3c349024 Mon Sep 17 00:00:00 2001 From: sheyanjie-qq <249478495@qq.com> Date: Sat, 10 Aug 2024 11:47:13 +0800 Subject: [PATCH 101/103] improve Preconditions --- docs/zh/07-operation/05-monitor.md | 33 ++++++++++++++---------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/docs/zh/07-operation/05-monitor.md b/docs/zh/07-operation/05-monitor.md index cd8a9772cd..abbd54736b 100644 --- a/docs/zh/07-operation/05-monitor.md +++ b/docs/zh/07-operation/05-monitor.md @@ -40,21 +40,21 @@ taosKeeper 的配置文件默认位于 `/etc/taos/taoskeeper.toml`。 详细配 #### 导入仪表盘 -TDengine 数据源插件已被提交至 Grafana 官网,完成插件的安装和数据源的创建后,可以进行 TDinsight 仪表盘的导入。 +TDengine 数据源插件已提交至 Grafana 官网,如何安装 TDengine 数据源插件和配置数据源请参考:[安装 Grafana Plugin 并配置数据源](../../third-party/visual/grafana/#安装-grafana-plugin-并配置数据源)。完成插件的安装和数据源的创建后,可以进行 TDinsight 仪表盘的导入。 -在 Grafana 的 ”Home“ -> ”Dashboards“ 页面,点击位于右上角的 ”New“ -> ”import“ 按钮,即可进入 Dashboard 的导入页面,它支持以下两种导入方式。 +在 Grafana 的 “Home” -> “Dashboards” 页面,点击位于右上角的 “New” -> “import” 按钮,即可进入 Dashboard 的导入页面,它支持以下两种导入方式。 - Dashboard ID:18180。 - Dashboard URL:https://grafana.com/grafana/dashboards/18180-tdinsight-for-3-x/ -填写以上 Dashboard ID 或 Dashboard URL 以后,点击 ”Load“ 按钮,按照向导操作,即可完成导入。导入成功后,Dashboards 列表页面会出现 ”TDinsight for 3.x“ 仪表盘,点击进入后,就可以看到 TDinsight 中已创建的各个指标的面板,如下图所示: +填写以上 Dashboard ID 或 Dashboard URL 以后,点击 “Load” 按钮,按照向导操作,即可完成导入。导入成功后,Dashboards 列表页面会出现 “TDinsight for 3.x” 仪表盘,点击进入后,就可以看到 TDinsight 中已创建的各个指标的面板,如下图所示: ![TDinsight 界面示例](./TDinsight-1-cluster-status.webp) -**注意** 在 TDinsight 界面左上角的 ”Log from“ 下拉列表中可以选择 `log` 数据库。 +**注意** 在 TDinsight 界面左上角的 “Log from” 下拉列表中可以选择 `log` 数据库。 ### TDengine V3 监控数据 -TDinsight dashboard 数据来源于 `log` 库(存放监控数据的默认数据库,可以在 taoskeeper 配置文件中修改)。”TDinsight for 3.x“ 仪表盘查询了 taosd 和 TaosAdapter 的监控指标。 +TDinsight dashboard 数据来源于 `log` 库(存放监控数据的默认数据库,可以在 taoskeeper 配置文件中修改)。“TDinsight for 3.x” 仪表盘查询了 taosd 和 TaosAdapter 的监控指标。 - taosd 的监控指标请参考 [taosd 监控指标](../../reference/components/taosd/#taosd-监控指标) - taosAdapter 的监控指标请参考 [taosAdapter 监控指标](../../reference/components/taosadapter/#taosadapter-监控指标) @@ -66,18 +66,15 @@ taosX 是 TDengine 中提供零代码数据接入能力的核心组件,对它 3. 运行在 taosX 端或 taosx-agent 端的各个连接器子进程 4. 运行中的各类数据写入任务 -### 版本支持 +### 前置条件 -1. TDengine 企业版本 3.2.3.0 或以上版本包含的 taosX 才包含此功能。如果单独安装 taosX,需要 taosX 1.5.0 或以上版本。 -2. 需要安装 Grafana 插件 [TDengie Datasource v3.5.0](https://grafana.com/grafana/plugins/tdengine-datasource/) 或以上版本。 +1. taosd,taosAdapter 和 taosKeeper 都已经部署完成并启动成功。 +2. taosX 服务监控配置正确,如何配置可以参考下文 “配置 taosX 监控”,服务启动成功。 + **注意**:TDengine 企业版本 3.2.3.0 或以上版本包含的 taosX 才包含此功能。如果单独安装 taosX,需要 taosX 1.5.0 或以上版本。 +3. 部署 Grafana ,安装 TDengine Datasource 插件,配置好数据源。可以参考:[安装 Grafana Plugin 并配置数据源](../../third-party/visual/grafana/#安装-grafana-plugin-并配置数据源)。 + **注意**:需要安装 Grafana 插件 [TDengie Datasource v3.5.0](https://grafana.com/grafana/plugins/tdengine-datasource/) 或以上版本。 -### 准备工作 - -假设你已经部署好了 taosd,taosAdapter 和 taosAdapter。 那么还需要: -1. 启动 taosX 服务。 -2. 
部署 Grafana ,安装 TDengine Datasource 插件,配置好数据源。 - -### 配置 taosX +### 配置 taosX 监控 toasX 的配置文件(默认 /etc/taos/taosx.toml) 中与 monitor 相关的配置如下: @@ -101,12 +98,12 @@ toasX 的配置文件(默认 /etc/taos/taosx.toml) 中与 monitor 相关的配 ### 基于 TDinsight 监控 tasoX -"TDinsight for taosX" 是专门为 taosX 监控创建的 Grafana 面板。使用前需要先导入这个面板。 +“TDinsight for taosX” 是专门为 taosX 监控创建的 Grafana 面板。使用前需要先导入这个面板。 #### 进入面板 -1. 在 Grafana 界面菜单中点击 ”Data sources“, 然后选择已经配置好的 TDengine 数据源。 -2. 在数据源配置界面选择 “Dashboard” Tab, 然后导入 ”TDinsight for taosX“ 面板(第一次使用需要先导入)。 下面是一个示例图: +1. 在 Grafana 界面菜单中点击 “Data sources”, 然后选择已经配置好的 TDengine 数据源。 +2. 在数据源配置界面选择 “Dashboard” Tab, 然后导入 “TDinsight for taosX” 面板(第一次使用需要先导入)。 下面是一个示例图: ![monitor rows](./pic/monitor-04.jpg) From f09ddfa815ce63e834a2149098f35c95781c8c2c Mon Sep 17 00:00:00 2001 From: sheyanjie-qq <249478495@qq.com> Date: Sat, 10 Aug 2024 16:47:56 +0800 Subject: [PATCH 102/103] improve a note in grafana doc --- docs/zh/20-third-party/03-visual/01-grafana.mdx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/zh/20-third-party/03-visual/01-grafana.mdx b/docs/zh/20-third-party/03-visual/01-grafana.mdx index 93f0cf6eaa..d7406352c9 100644 --- a/docs/zh/20-third-party/03-visual/01-grafana.mdx +++ b/docs/zh/20-third-party/03-visual/01-grafana.mdx @@ -18,12 +18,8 @@ import TabItem from "@theme/TabItem"; 要让 Grafana 能正常添加 TDengine 数据源,需要以下几方面的准备工作。 -- Grafana 服务已经部署并正常运行。 - :::info - +- Grafana 服务已经部署并正常运行。 **注意**:要确保启动 Grafana 的账号有其安装目录的写权限,否则可能后面无法安装插件。 - - ::: - TDengine 集群已经部署并正常运行。 - taosAdapter 已经安装并正常运行。具体细节请参考 [taosAdapter 的使用手册](../../../reference/components/taosadapter) From c1727ef2aeaeebca96596676fc8ffc3040ef31dc Mon Sep 17 00:00:00 2001 From: sheyanjie-qq <249478495@qq.com> Date: Mon, 12 Aug 2024 08:50:31 +0800 Subject: [PATCH 103/103] add tab for not support language --- .../03-insert-data/01-sql-writing.mdx | 4 +- .../03-insert-data/30-influxdb-line.mdx | 2 +- .../03-insert-data/40-opentsdb-telnet.mdx | 2 +- .../03-insert-data/50-opentsdb-json.mdx | 2 +- docs/en/07-develop/04-query-data/index.mdx | 2 +- docs/zh/08-develop/01-connect/index.md | 42 ++++++++- docs/zh/08-develop/02-sql.md | 8 +- docs/zh/08-develop/04-schemaless.md | 16 +++- docs/zh/08-develop/05-stmt.md | 13 ++- docs/zh/08-develop/07-tmq.md | 86 ++++++++++++++++--- docs/zh/14-reference/05-connector/index.md | 18 ++-- 11 files changed, 159 insertions(+), 36 deletions(-) diff --git a/docs/en/07-develop/03-insert-data/01-sql-writing.mdx b/docs/en/07-develop/03-insert-data/01-sql-writing.mdx index b9ec36e3ac..8f7e573995 100644 --- a/docs/en/07-develop/03-insert-data/01-sql-writing.mdx +++ b/docs/en/07-develop/03-insert-data/01-sql-writing.mdx @@ -88,7 +88,7 @@ For more details about `INSERT` please refer to [INSERT](../../../taos-sql/inser - + @@ -128,7 +128,7 @@ Parameter binding is available only with native connection. 
- + diff --git a/docs/en/07-develop/03-insert-data/30-influxdb-line.mdx b/docs/en/07-develop/03-insert-data/30-influxdb-line.mdx index ddcaad8914..abba88e70f 100644 --- a/docs/en/07-develop/03-insert-data/30-influxdb-line.mdx +++ b/docs/en/07-develop/03-insert-data/30-influxdb-line.mdx @@ -60,7 +60,7 @@ For more details please refer to [InfluxDB Line Protocol](https://docs.influxdat - + diff --git a/docs/en/07-develop/03-insert-data/40-opentsdb-telnet.mdx b/docs/en/07-develop/03-insert-data/40-opentsdb-telnet.mdx index ed2659042f..0a79823d0e 100644 --- a/docs/en/07-develop/03-insert-data/40-opentsdb-telnet.mdx +++ b/docs/en/07-develop/03-insert-data/40-opentsdb-telnet.mdx @@ -52,7 +52,7 @@ meters.current 1648432611250 11.3 location=California.LosAngeles groupid=3 - + diff --git a/docs/en/07-develop/03-insert-data/50-opentsdb-json.mdx b/docs/en/07-develop/03-insert-data/50-opentsdb-json.mdx index fc54421daf..39004be254 100644 --- a/docs/en/07-develop/03-insert-data/50-opentsdb-json.mdx +++ b/docs/en/07-develop/03-insert-data/50-opentsdb-json.mdx @@ -67,7 +67,7 @@ Please refer to [OpenTSDB HTTP API](http://opentsdb.net/docs/build/html/api_http - + diff --git a/docs/en/07-develop/04-query-data/index.mdx b/docs/en/07-develop/04-query-data/index.mdx index 8e21fd325c..164ec0d0a6 100644 --- a/docs/en/07-develop/04-query-data/index.mdx +++ b/docs/en/07-develop/04-query-data/index.mdx @@ -142,7 +142,7 @@ In the section describing [Insert](../insert-data/sql-writing), a database named - + diff --git a/docs/zh/08-develop/01-connect/index.md b/docs/zh/08-develop/01-connect/index.md index fdde4aea2e..755a9e7f74 100644 --- a/docs/zh/08-develop/01-connect/index.md +++ b/docs/zh/08-develop/01-connect/index.md @@ -20,7 +20,7 @@ import VerifyLinux from "../../14-reference/05-connector/_verify_linux.mdx"; import VerifyMacOS from "../../14-reference/05-connector/_verify_macos.mdx"; import VerifyWindows from "../../14-reference/05-connector/_verify_windows.mdx"; -TDengine 提供了丰富的应用程序开发接口,为了便于用户快速开发自己的应用,TDengine 支持了多种编程语言的连接器,其中官方连接器包括支持 C/C++、Java、Python、Go、Node.js、C#、Rust、Lua(社区贡献)和 PHP (社区贡献)的连接器。这些连接器支持使用原生接口(taosc)和 REST 接口(部分语言暂不支持)连接 TDengine 集群。社区开发者也贡献了多个非官方连接器,例如 ADO.NET 连接器、Lua 连接器和 PHP 连接器。 +TDengine 提供了丰富的应用程序开发接口,为了便于用户快速开发自己的应用,TDengine 支持了多种编程语言的连接器,其中官方连接器包括支持 C/C++、Java、Python、Go、Node.js、C#、Rust、Lua(社区贡献)和 PHP (社区贡献)的连接器。这些连接器支持使用原生接口(taosc)和 REST 接口(部分语言暂不支持)连接 TDengine 集群。社区开发者也贡献了多个非官方连接器,例如 ADO.NET 连接器、Lua 连接器和 PHP 连接器。另外 TDengine 还可以直接调用 taosadapter 提供的 REST API 接口,进行数据写入和查询操作。 ## 连接方式 @@ -33,6 +33,7 @@ TDengine 提供了丰富的应用程序开发接口,为了便于用户快速 ![TDengine connection type](connection-type-zh.webp) 无论使用何种方式建立连接,连接器都提供了相同或相似的 API 操作数据库,都可以执行 SQL 语句,只是初始化连接的方式稍有不同,用户在使用上不会感到什么差别。 +各种连接方式和各语言连接器支持情况请参考:[连接器功能特性](../../reference/connector/#功能特性) 关键不同点在于: @@ -251,7 +252,10 @@ dotnet add package TDengine.Connector 如果已经安装了 TDengine 服务端软件或 TDengine 客户端驱动 taosc, 那么已经安装了 C 连接器,无需额外操作。 -
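If the client driver is in place, a one-file program can confirm that the native C connector links and loads. This is only a sketch — build with something like `gcc check.c -ltaos`, and the header location depends on the installation:

```c
#include <stdio.h>
#include <taos.h>

int main(void) {
  /* Prints the client library version; no running server is required. */
  printf("TDengine client version: %s\n", taos_get_client_info());
  return 0;
}
```

If the program prints a version string, the native-connection examples that follow can be expected to link against the same library.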
+ +
+ +使用 REST API 方式访问 TDengine,无需安装任何驱动和连接器。 @@ -394,7 +398,10 @@ C/C++ 语言连接器使用 `taos_connect()` 函数用于建立与 TDengine 数 还提供了 `taos_connect_auth()` 函数用于使用 MD5 加密的密码建立与 TDengine 数据库的连接。此函数与 `taos_connect` 功能相同,不同之处在于密码的处理方式,`taos_connect_auth` 需要的是密码的 MD5 加密字符串。
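Before the per-connector examples that follow, here is a hedged sketch of the bare REST approach, in which the application opens the HTTP connection itself instead of calling `taos_connect()`. It assumes taosAdapter is reachable at `localhost:6041` with the default `root:taosdata` account; adjust both for a real deployment.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class RestConnectSketch {
    public static void main(String[] args) throws Exception {
        // Basic-auth token for the assumed default account.
        String token = Base64.getEncoder()
                .encodeToString("root:taosdata".getBytes(StandardCharsets.UTF_8));
        // POST a SQL statement to taosAdapter's /rest/sql endpoint.
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:6041/rest/sql"))
                .header("Authorization", "Basic " + token)
                .POST(HttpRequest.BodyPublishers.ofString("SELECT SERVER_VERSION()"))
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode());
        System.out.println(response.body()); // JSON result set returned by taosAdapter
    }
}
```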
+ +使用 REST API 方式访问 TDengine,由应用程序去建立 HTTP 连接,自己控制 HTTP 连接参数。 + ### Websocket 连接 @@ -431,6 +438,13 @@ C/C++ 语言连接器使用 `taos_connect()` 函数用于建立与 TDengine 数 {{#include docs/examples/csharp/wsConnect/Program.cs:main}} ```
+ +不支持 + + +不支持 + + ### 原生连接 @@ -455,6 +469,9 @@ C/C++ 语言连接器使用 `taos_connect()` 函数用于建立与 TDengine 数 {{#include docs/examples/rust/nativeexample/examples/connect.rs}} ```
+ +不支持 + ```csharp {{#include docs/examples/csharp/connect/Program.cs:main}} @@ -464,6 +481,10 @@ C/C++ 语言连接器使用 `taos_connect()` 函数用于建立与 TDengine 数 + +不支持 + + ### REST 连接 @@ -485,6 +506,23 @@ C/C++ 语言连接器使用 `taos_connect()` 函数用于建立与 TDengine 数 {{#include docs/examples/go/connect/restexample/main.go}} ```
+ +不支持 + + +不支持 + + +不支持 + + +不支持 + + + +使用 REST API 方式访问 TDengine,由应用程序自主去建立 HTTP 连接。 + + diff --git a/docs/zh/08-develop/02-sql.md b/docs/zh/08-develop/02-sql.md index 3cad007078..1651bf21cd 100644 --- a/docs/zh/08-develop/02-sql.md +++ b/docs/zh/08-develop/02-sql.md @@ -57,7 +57,7 @@ REST API:直接调用 `taosadapter` 提供的 REST API 接口,进行数据 ```
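As a hedged illustration of the execute-SQL flow introduced here, the Java sketch below creates a database and a super table through the REST-style JDBC driver. The database name, table name, and column types are assumptions chosen to match the smart-meter examples used elsewhere in these docs.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateDbSketch {
    public static void main(String[] args) throws Exception {
        // Assumed REST-style JDBC URL and default credentials.
        String url = "jdbc:TAOS-RS://localhost:6041/?user=root&password=taosdata";
        try (Connection conn = DriverManager.getConnection(url);
             Statement stmt = conn.createStatement()) {
            // Create a database and a super table; names and types are illustrative only.
            stmt.executeUpdate("CREATE DATABASE IF NOT EXISTS power");
            stmt.executeUpdate(
                "CREATE STABLE IF NOT EXISTS power.meters "
                    + "(ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) "
                    + "TAGS (groupid INT, location BINARY(24))");
        }
    }
}
```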
- + ```js {{#include docs/examples/node/websocketexample/sql_example.js:create_db_and_table}} ``` @@ -133,7 +133,7 @@ NOW 为系统内部函数,默认为客户端所在计算机当前时间。 NOW ``` - + ```js {{#include docs/examples/node/websocketexample/sql_example.js:insertData}} ``` @@ -207,7 +207,7 @@ rust 连接器还支持使用 **serde** 进行反序列化行为结构体的结 ``` - + ```js {{#include docs/examples/node/websocketexample/sql_example.js:queryData}} ``` @@ -282,7 +282,7 @@ reqId 可用于请求链路追踪,reqId 就像分布式系统中的 traceId ``` - + ```js {{#include docs/examples/node/websocketexample/sql_example.js:sqlWithReqid}} ``` diff --git a/docs/zh/08-develop/04-schemaless.md b/docs/zh/08-develop/04-schemaless.md index 1bc750c3cb..06dec726e9 100644 --- a/docs/zh/08-develop/04-schemaless.md +++ b/docs/zh/08-develop/04-schemaless.md @@ -194,7 +194,7 @@ writer.write(lineDemo, SchemalessProtocolType.LINE, SchemalessTimestampType.NANO ``` - + ```js {{#include docs/examples/node/websocketexample/line_example.js}} ``` @@ -204,6 +204,12 @@ writer.write(lineDemo, SchemalessProtocolType.LINE, SchemalessTimestampType.NANO {{#include docs/examples/csharp/wssml/Program.cs:main}} ``` + +不支持 + + +不支持 + ### 原生连接 @@ -237,7 +243,9 @@ writer.write(lineDemo, SchemalessProtocolType.LINE, SchemalessTimestampType.NANO {{#include docs/examples/rust/nativeexample/examples/schemaless.rs}} ``` - + + +不支持 ```csharp @@ -249,7 +257,9 @@ writer.write(lineDemo, SchemalessProtocolType.LINE, SchemalessTimestampType.NANO {{#include docs/examples/c/sml_insert_demo.c:schemaless}} ``` - + +不支持 + ## 查询写入的数据 diff --git a/docs/zh/08-develop/05-stmt.md b/docs/zh/08-develop/05-stmt.md index 0e94af4a34..4b8dbbce9b 100644 --- a/docs/zh/08-develop/05-stmt.md +++ b/docs/zh/08-develop/05-stmt.md @@ -63,6 +63,12 @@ import TabItem from "@theme/TabItem"; {{#include docs/examples/csharp/wsStmt/Program.cs:main}} ``` + +不支持 + + +不支持 + ## 原生连接 @@ -93,6 +99,9 @@ import TabItem from "@theme/TabItem"; {{#include docs/examples/rust/nativeexample/examples/stmt.rs}} ``` + + +不支持 ```csharp @@ -104,5 +113,7 @@ import TabItem from "@theme/TabItem"; {{#include docs/examples/c/stmt_insert_demo.c}} ``` - + +不支持 + diff --git a/docs/zh/08-develop/07-tmq.md b/docs/zh/08-develop/07-tmq.md index 3ee606741c..0711ab9f28 100644 --- a/docs/zh/08-develop/07-tmq.md +++ b/docs/zh/08-develop/07-tmq.md @@ -96,6 +96,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 同通用基础配置项。 + +不支持 + ### Websocket 连接 @@ -148,6 +151,12 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 {{#include docs/examples/csharp/wssubscribe/Program.cs:create_consumer}} ``` + +不支持 + + +不支持 + @@ -187,7 +196,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 ```
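A consumer can only subscribe to a topic that already exists, so a brief, hedged sketch of topic creation may help here. The topic name, database, and query below are assumptions that mirror the smart-meter examples; they are not defined by this patch.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateTopicSketch {
    public static void main(String[] args) throws Exception {
        // Assumed connection details; adjust for a real deployment.
        String url = "jdbc:TAOS-RS://localhost:6041/?user=root&password=taosdata";
        try (Connection conn = DriverManager.getConnection(url);
             Statement stmt = conn.createStatement()) {
            // A topic is the unit of subscription in TMQ; here it exposes a simple projection query.
            stmt.executeUpdate(
                "CREATE TOPIC IF NOT EXISTS topic_meters AS "
                    + "SELECT ts, current, voltage, phase FROM power.meters");
        }
    }
}
```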
- + +不支持 + ```csharp {{#include docs/examples/csharp/subscribe/Program.cs:create_consumer}} @@ -206,7 +217,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 调用 `build_consumer` 函数尝试获取消费者实例 `tmq`。成功则打印成功日志,失败则打印失败日志。 - + +不支持 + ## 订阅消费数据 @@ -267,7 +280,12 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 {{#include docs/examples/csharp/wssubscribe/Program.cs:subscribe}} ```
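For comparison with the per-language examples above, here is a hedged Java sketch of the subscribe-and-poll loop. The `com.taosdata.jdbc.tmq` class names and property keys follow the pattern used in the Java connector's examples, but treat them, and especially the deserializer class, as assumptions to verify against the connector documentation.

```java
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

import com.taosdata.jdbc.tmq.ConsumerRecords;
import com.taosdata.jdbc.tmq.TaosConsumer;

public class PollSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:6030"); // assumed server endpoint
        props.setProperty("group.id", "group1");
        // Assumed built-in deserializer class; confirm the exact name in the connector docs.
        props.setProperty("value.deserializer", "com.taosdata.jdbc.tmq.MapDeserializer");

        TaosConsumer<Object> consumer = new TaosConsumer<>(props);
        consumer.subscribe(Collections.singletonList("topic_meters")); // assumed topic name
        for (int i = 0; i < 10; i++) { // bounded loop for the sketch; a service would poll indefinitely
            ConsumerRecords<Object> records = consumer.poll(Duration.ofMillis(500));
            records.forEach(record -> System.out.println(record));
        }
        consumer.unsubscribe();
        consumer.close();
    }
}
```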
- + +不支持 + + +不支持 + ### 原生连接 @@ -311,7 +329,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 - `Record` 是我们自定义的一个结构体,其字段名和数据类型与列的名称和数据类型一一对应,这样可以通过 `serde` 反序列化出 `Record` 类型的对象。
- + +不支持 + ```csharp {{#include docs/examples/csharp/subscribe/Program.cs:subscribe}} @@ -343,7 +363,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 5. 调用 `basic_consume_loop` 函数开始基本的消费循环,处理订阅的消息。 - + +不支持 + ## 指定订阅的 Offset @@ -402,6 +424,12 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 {{#include docs/examples/csharp/wssubscribe/Program.cs:seek}} ```
+ +不支持 + + +不支持 + ### 原生连接 @@ -445,7 +473,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 2. 在所有分区的偏移量调整完成后,再次获取并记录消费者的分区分配信息,以确认偏移量调整后的状态。
- + +不支持 + ```csharp {{#include docs/examples/csharp/subscribe/Program.cs:seek}} @@ -465,6 +495,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 6. 调用 `basic_consume_loop` 函数开始新的的消费循环,处理消息。 + +不支持 + @@ -518,6 +551,12 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 {{#include docs/examples/csharp/wssubscribe/Program.cs:commit_offset}} ```
+ +不支持 + + +不支持 + ### 原生连接 @@ -551,7 +590,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 可以通过 `consumer.commit` 方法来手工提交消费进度。
- + +不支持 + ```csharp {{#include docs/examples/csharp/subscribe/Program.cs:commit_offset}} @@ -566,6 +607,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 可以通过 `tmq_commit_sync` 函数来手工提交消费进度。 + +不支持 + @@ -615,6 +659,12 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 {{#include docs/examples/csharp/wssubscribe/Program.cs:close}} ``` + +不支持 + + +不支持 + ### 原生连接 @@ -646,7 +696,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 **注意**:消费者取消订阅后无法重用,如果想订阅新的 `topic`, 请重新创建消费者。 - + +不支持 + ```csharp {{#include docs/examples/csharp/subscribe/Program.cs:close}} @@ -658,7 +710,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 {{#include docs/examples/c/tmq_demo.c:unsubscribe_and_close}} ``` - + +不支持 + @@ -720,7 +774,12 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 ``` - + +不支持 + + +不支持 + ### 原生连接 @@ -765,7 +824,9 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 ``` - + +不支持 +
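To tie the commit and shutdown steps together, here is a hedged Java sketch that disables auto-commit, commits manually after each processed batch, and then unsubscribes and closes the consumer. As with the earlier TMQ sketch, the class names, property keys, and deserializer are assumptions to check against the Java connector documentation.

```java
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

import com.taosdata.jdbc.tmq.ConsumerRecords;
import com.taosdata.jdbc.tmq.TaosConsumer;

public class CommitCloseSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:6030"); // assumed server endpoint
        props.setProperty("group.id", "group1");
        props.setProperty("enable.auto.commit", "false"); // commit manually below
        // Assumed deserializer class; confirm the exact name in the connector docs.
        props.setProperty("value.deserializer", "com.taosdata.jdbc.tmq.MapDeserializer");

        TaosConsumer<Object> consumer = new TaosConsumer<>(props);
        try {
            consumer.subscribe(Collections.singletonList("topic_meters")); // assumed topic name
            for (int i = 0; i < 10; i++) {
                ConsumerRecords<Object> records = consumer.poll(Duration.ofMillis(500));
                records.forEach(record -> System.out.println(record));
                consumer.commitSync(); // commit only after the batch has been processed
            }
        } finally {
            consumer.unsubscribe(); // a consumer cannot be reused after unsubscribing
            consumer.close();
        }
    }
}
```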
完整代码示例 @@ -783,4 +844,7 @@ Rust 连接器创建消费者的参数为 DSN, 可以设置的参数列表请 ```
+ +不支持 + diff --git a/docs/zh/14-reference/05-connector/index.md b/docs/zh/14-reference/05-connector/index.md index 5c58a4e7bc..6cca1960c3 100644 --- a/docs/zh/14-reference/05-connector/index.md +++ b/docs/zh/14-reference/05-connector/index.md @@ -46,10 +46,10 @@ TDengine 版本更新往往会增加新的功能特性,列表中的连接器 | **功能特性** | **Java** | **Python** | **Go** | **C#** | **Rust** | | ------------------- | -------- | ---------- | ------ | ------ | -------- | | **连接管理** | 支持 | 支持 | 支持 | 支持 | 支持 | -| **普通查询** | 支持 | 支持 | 支持 | 支持 | 支持 | +| **执行 SQL** | 支持 | 支持 | 支持 | 支持 | 支持 | | **参数绑定** | 支持 | 支持 | 支持 | 支持 | 支持 | | **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 支持 | -| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 支持 | +| **无模式写入** | 支持 | 支持 | 支持 | 支持 | 支持 | :::info 由于不同编程语言数据库框架规范不同,并不意味着所有 C/C++ 接口都需要对应封装支持。 @@ -57,20 +57,20 @@ TDengine 版本更新往往会增加新的功能特性,列表中的连接器 ### 使用 http REST 接口 -| **功能特性** | **Java** | **Python** | **Go** | **C#** | -| ------------ | -------- | ---------- | ------ | ------ | -| **连接管理** | 支持 | 支持 | 支持 | 支持 | -| **普通查询** | 支持 | 支持 | 支持 | 支持 | +| **功能特性** | **Java** | **Python** | **Go** | +| ------------ | -------- | ---------- | ------ | +| **连接管理** | 支持 | 支持 | 支持 | +| **执行 SQL** | 支持 | 支持 | 支持 | ### 使用 Websocket 接口 | **功能特性** | **Java** | **Python** | **Go** | **C#** | **Node.js** | **Rust** | | ------------------- | -------- | ---------- | ------ | ------ | ----------- | -------- | | **连接管理** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **普通查询** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **执行 SQL** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | | **参数绑定** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | | **数据订阅(TMQ)** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | -| **Schemaless** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | +| **无模式写入** | 支持 | 支持 | 支持 | 支持 | 支持 | 支持 | :::warning - 无论选用何种编程语言的连接器,2.0 及以上版本的 TDengine 推荐数据库应用的每个线程都建立一个独立的连接,或基于线程建立连接池,以避免连接内的“USE statement”状态量在线程之间相互干扰(但连接的查询和写入操作都是线程安全的)。 @@ -128,4 +128,4 @@ import DocCardList from '@theme/DocCardList'; import {useCurrentSidebarCategory} from '@docusaurus/theme-common'; -``` \ No newline at end of file +```