diff --git a/include/common/tcommon.h b/include/common/tcommon.h index 6f4f15d1e8..72aab9adf0 100644 --- a/include/common/tcommon.h +++ b/include/common/tcommon.h @@ -55,8 +55,8 @@ typedef struct SSessionKey { } SSessionKey; typedef struct SVersionRange { - uint64_t minVer; - uint64_t maxVer; + int64_t minVer; + int64_t maxVer; } SVersionRange; static inline int winKeyCmprImpl(const void* pKey1, const void* pKey2) { diff --git a/include/common/tgrant.h b/include/common/tgrant.h index edbc74bf18..cfc6c13c48 100644 --- a/include/common/tgrant.h +++ b/include/common/tgrant.h @@ -52,9 +52,9 @@ typedef enum { int32_t grantCheck(EGrantType grant); #ifndef TD_GRANT_OPTIMIZE -int32_t grantAlterActiveCode(const char* old, const char* new, char* out, int8_t type); +int32_t grantAlterActiveCode(const char* old, const char* newer, char* out, int8_t type); #else -int32_t grantAlterActiveCode(int32_t did, const char* old, const char* new, char* out, int8_t type); +int32_t grantAlterActiveCode(int32_t did, const char* old, const char* newer, char* out, int8_t type); #endif #ifndef GRANTS_CFG @@ -114,4 +114,4 @@ int32_t grantAlterActiveCode(int32_t did, const char* old, const char* new, char } #endif -#endif /*_TD_COMMON_GRANT_H_*/ \ No newline at end of file +#endif /*_TD_COMMON_GRANT_H_*/ diff --git a/include/common/tmsgdef.h b/include/common/tmsgdef.h index 4a2ae18765..b92bba831c 100644 --- a/include/common/tmsgdef.h +++ b/include/common/tmsgdef.h @@ -299,8 +299,8 @@ enum { // WARN: new msg should be appended to segment tail TD_DEF_MSG_TYPE(TDMT_SYNC_HEARTBEAT, "sync-heartbeat", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_SYNC_HEARTBEAT_REPLY, "sync-heartbeat-reply", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_SYNC_LOCAL_CMD, "sync-local-cmd", NULL, NULL) - TD_DEF_MSG_TYPE(TDMT_SYNC_PRE_SNAPSHOT, "sync-pre-snapshot", NULL, NULL) // no longer used - TD_DEF_MSG_TYPE(TDMT_SYNC_PRE_SNAPSHOT_REPLY, "sync-pre-snapshot-reply", NULL, NULL) // no longer used + TD_DEF_MSG_TYPE(TDMT_SYNC_PREP_SNAPSHOT, 
"sync-prep-snapshot", NULL, NULL) + TD_DEF_MSG_TYPE(TDMT_SYNC_PREP_SNAPSHOT_REPLY, "sync-prep-snapshot-reply", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_SYNC_MAX_MSG, "sync-max", NULL, NULL) TD_DEF_MSG_TYPE(TDMT_SYNC_FORCE_FOLLOWER, "sync-force-become-follower", NULL, NULL) diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index f69afbd71b..ad525a2aa7 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -36,8 +36,7 @@ extern "C" { #define SYNC_DEL_WAL_MS (1000 * 60) #define SYNC_ADD_QUORUM_COUNT 3 #define SYNC_VNODE_LOG_RETENTION (TSDB_SYNC_LOG_BUFFER_RETENTION + 1) -#define SNAPSHOT_MAX_CLOCK_SKEW_MS 1000 * 10 -#define SNAPSHOT_WAIT_MS 1000 * 30 +#define SNAPSHOT_WAIT_MS (1000 * 5) #define SYNC_MAX_RETRY_BACKOFF 5 #define SYNC_LOG_REPL_RETRY_WAIT_MS 100 @@ -87,6 +86,11 @@ typedef enum { TAOS_SYNC_ROLE_ERROR = 2, } ESyncRole; +typedef enum { + SYNC_FSM_STATE_COMPLETE = 0, + SYNC_FSM_STATE_INCOMPLETE, +} ESyncFsmState; + typedef struct SNodeInfo { int64_t clusterId; int32_t nodeId; @@ -95,6 +99,12 @@ typedef struct SNodeInfo { ESyncRole nodeRole; } SNodeInfo; +typedef struct SSyncTLV { + int32_t typ; + int32_t len; + char val[]; +} SSyncTLV; + typedef struct SSyncCfg { int32_t totalReplicaNum; int32_t replicaNum; @@ -139,10 +149,13 @@ typedef struct SReConfigCbMeta { typedef struct SSnapshotParam { SyncIndex start; SyncIndex end; + SSyncTLV* data; } SSnapshotParam; typedef struct SSnapshot { - void* data; + int32_t type; + SSyncTLV* data; + ESyncFsmState state; SyncIndex lastApplyIndex; SyncTerm lastApplyTerm; SyncIndex lastConfigIndex; @@ -171,7 +184,7 @@ typedef struct SSyncFSM { void (*FpBecomeLearnerCb)(const struct SSyncFSM* pFsm); int32_t (*FpGetSnapshot)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot, void* pReaderParam, void** ppReader); - void (*FpGetSnapshotInfo)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot); + int32_t (*FpGetSnapshotInfo)(const struct SSyncFSM* pFsm, SSnapshot* pSnapshot); int32_t 
(*FpSnapshotStartRead)(const struct SSyncFSM* pFsm, void* pReaderParam, void** ppReader); void (*FpSnapshotStopRead)(const struct SSyncFSM* pFsm, void* pReader); diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 39ae3fb97a..6fbe4422ac 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -557,7 +557,7 @@ int32_t* taosGetErrno(); // #define TSDB_CODE_SYN_TOO_MANY_FWDINFO TAOS_DEF_ERROR_CODE(0, 0x0904) // 2.x // #define TSDB_CODE_SYN_MISMATCHED_PROTOCOL TAOS_DEF_ERROR_CODE(0, 0x0905) // 2.x // #define TSDB_CODE_SYN_MISMATCHED_CLUSTERID TAOS_DEF_ERROR_CODE(0, 0x0906) // 2.x -// #define TSDB_CODE_SYN_MISMATCHED_SIGNATURE TAOS_DEF_ERROR_CODE(0, 0x0907) // 2.x +#define TSDB_CODE_SYN_MISMATCHED_SIGNATURE TAOS_DEF_ERROR_CODE(0, 0x0907) // #define TSDB_CODE_SYN_INVALID_CHECKSUM TAOS_DEF_ERROR_CODE(0, 0x0908) // 2.x // #define TSDB_CODE_SYN_INVALID_MSGLEN TAOS_DEF_ERROR_CODE(0, 0x0909) // 2.x // #define TSDB_CODE_SYN_INVALID_MSGTYPE TAOS_DEF_ERROR_CODE(0, 0x090A) // 2.x diff --git a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c index 24b5b2566c..d5488da770 100644 --- a/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c +++ b/source/dnode/mgmt/mgmt_mnode/src/mmHandle.c @@ -238,7 +238,7 @@ SArray *mmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_SYNC_APPEND_ENTRIES_BATCH, mmPutMsgToSyncQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_APPEND_ENTRIES_REPLY, mmPutMsgToSyncQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_SNAPSHOT_SEND, mmPutMsgToSyncQueue, 1) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_SYNC_PRE_SNAPSHOT, mmPutMsgToSyncQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_SYNC_PREP_SNAPSHOT, mmPutMsgToSyncQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_FORCE_FOLLOWER_RSP, mmPutMsgToWriteQueue, 0) == NULL) goto _OVER; @@ -246,7 +246,7 @@ SArray *mmGetMsgHandles() { if (dmSetMgmtHandle(pArray, 
TDMT_SYNC_HEARTBEAT, mmPutMsgToSyncRdQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_HEARTBEAT_REPLY, mmPutMsgToSyncRdQueue, 1) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_SNAPSHOT_RSP, mmPutMsgToSyncRdQueue, 1) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_SYNC_PRE_SNAPSHOT_REPLY, mmPutMsgToSyncRdQueue, 1) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_SYNC_PREP_SNAPSHOT_REPLY, mmPutMsgToSyncRdQueue, 1) == NULL) goto _OVER; code = 0; diff --git a/source/dnode/mgmt/mgmt_vnode/inc/vmInt.h b/source/dnode/mgmt/mgmt_vnode/inc/vmInt.h index cddf132bce..34f2b5c446 100644 --- a/source/dnode/mgmt/mgmt_vnode/inc/vmInt.h +++ b/source/dnode/mgmt/mgmt_vnode/inc/vmInt.h @@ -56,6 +56,7 @@ typedef struct { int32_t vgVersion; int32_t refCount; int8_t dropped; + int8_t failed; int8_t disable; int32_t diskPrimary; int32_t toVgId; diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c index 0e17d2b75f..cc542f51ce 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c @@ -30,9 +30,11 @@ void vmGetVnodeLoads(SVnodeMgmt *pMgmt, SMonVloadInfo *pInfo, bool isReset) { if (ppVnode == NULL || *ppVnode == NULL) continue; SVnodeObj *pVnode = *ppVnode; - SVnodeLoad vload = {0}; - vnodeGetLoad(pVnode->pImpl, &vload); - if (isReset) vnodeResetLoad(pVnode->pImpl, &vload); + SVnodeLoad vload = {.vgId = pVnode->vgId}; + if (!pVnode->failed) { + vnodeGetLoad(pVnode->pImpl, &vload); + if (isReset) vnodeResetLoad(pVnode->pImpl, &vload); + } taosArrayPush(pInfo->pVloads, &vload); pIter = taosHashIterate(pMgmt->hash, pIter); } @@ -52,9 +54,11 @@ void vmGetVnodeLoadsLite(SVnodeMgmt *pMgmt, SMonVloadInfo *pInfo) { if (ppVnode == NULL || *ppVnode == NULL) continue; SVnodeObj *pVnode = *ppVnode; - SVnodeLoadLite vload = {0}; - if (vnodeGetLoadLite(pVnode->pImpl, &vload) == 0) { - taosArrayPush(pInfo->pVloads, &vload); + if (!pVnode->failed) { + 
SVnodeLoadLite vload = {0}; + if (vnodeGetLoadLite(pVnode->pImpl, &vload) == 0) { + taosArrayPush(pInfo->pVloads, &vload); + } } pIter = taosHashIterate(pMgmt->hash, pIter); } @@ -278,7 +282,7 @@ int32_t vmProcessCreateVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { vmGenerateWrapperCfg(pMgmt, &req, &wrapperCfg); SVnodeObj *pVnode = vmAcquireVnode(pMgmt, req.vgId); - if (pVnode != NULL) { + if (pVnode != NULL && !pVnode->failed) { dError("vgId:%d, already exist", req.vgId); tFreeSCreateVnodeReq(&req); vmReleaseVnode(pMgmt, pVnode); @@ -287,7 +291,9 @@ int32_t vmProcessCreateVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { return 0; } - wrapperCfg.diskPrimary = vmAllocPrimaryDisk(pMgmt, vnodeCfg.vgId); + ASSERT(pVnode == NULL || pVnode->failed); + + wrapperCfg.diskPrimary = pVnode ? pVnode->diskPrimary : vmAllocPrimaryDisk(pMgmt, vnodeCfg.vgId); int32_t diskPrimary = wrapperCfg.diskPrimary; snprintf(path, TSDB_FILENAME_LEN, "vnode%svnode%d", TD_DIRSEP, vnodeCfg.vgId); @@ -299,7 +305,7 @@ int32_t vmProcessCreateVnodeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { goto _OVER; } - SVnode *pImpl = vnodeOpen(path, diskPrimary, pMgmt->pTfs, pMgmt->msgCb); + SVnode *pImpl = vnodeOpen(path, diskPrimary, pMgmt->pTfs, pMgmt->msgCb, true); if (pImpl == NULL) { dError("vgId:%d, failed to open vnode since %s", req.vgId, terrstr()); code = terrno; @@ -364,9 +370,10 @@ int32_t vmProcessAlterVnodeTypeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { TMSG_INFO(pMsg->msgType)); SVnodeObj *pVnode = vmAcquireVnode(pMgmt, req.vgId); - if (pVnode == NULL) { + if (pVnode == NULL || pVnode->failed) { dError("vgId:%d, failed to alter vnode type since %s", req.vgId, terrstr()); terrno = TSDB_CODE_VND_NOT_EXIST; + if (pVnode) vmReleaseVnode(pMgmt, pVnode); return -1; } @@ -445,7 +452,7 @@ int32_t vmProcessAlterVnodeTypeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { } dInfo("vgId:%d, begin to open vnode", vgId); - SVnode *pImpl = vnodeOpen(path, diskPrimary, pMgmt->pTfs, pMgmt->msgCb); + SVnode *pImpl = vnodeOpen(path, 
diskPrimary, pMgmt->pTfs, pMgmt->msgCb, false); if (pImpl == NULL) { dError("vgId:%d, failed to open vnode at %s since %s", vgId, path, terrstr()); return -1; @@ -481,9 +488,10 @@ int32_t vmProcessCheckLearnCatchupReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { req.vgId, TMSG_INFO(pMsg->msgType)); SVnodeObj *pVnode = vmAcquireVnode(pMgmt, req.vgId); - if (pVnode == NULL) { + if (pVnode == NULL || pVnode->failed) { dError("vgId:%d, failed to alter vnode type since %s", req.vgId, terrstr()); terrno = TSDB_CODE_VND_NOT_EXIST; + if (pVnode) vmReleaseVnode(pMgmt, pVnode); return -1; } @@ -523,9 +531,10 @@ int32_t vmProcessDisableVnodeWriteReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { dInfo("vgId:%d, vnode write disable:%d", req.vgId, req.disable); SVnodeObj *pVnode = vmAcquireVnode(pMgmt, req.vgId); - if (pVnode == NULL) { + if (pVnode == NULL || pVnode->failed) { dError("vgId:%d, failed to disable write since %s", req.vgId, terrstr()); terrno = TSDB_CODE_VND_NOT_EXIST; + if (pVnode) vmReleaseVnode(pMgmt, pVnode); return -1; } @@ -555,9 +564,10 @@ int32_t vmProcessAlterHashRangeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { dInfo("vgId:%d, start to alter vnode hashrange:[%u, %u], dstVgId:%d", req.srcVgId, req.hashBegin, req.hashEnd, req.dstVgId); pVnode = vmAcquireVnode(pMgmt, srcVgId); - if (pVnode == NULL) { + if (pVnode == NULL || pVnode->failed) { dError("vgId:%d, failed to alter hashrange since %s", srcVgId, terrstr()); terrno = TSDB_CODE_VND_NOT_EXIST; + if (pVnode) vmReleaseVnode(pMgmt, pVnode); return -1; } @@ -592,7 +602,7 @@ int32_t vmProcessAlterHashRangeReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { } dInfo("vgId:%d, open vnode", dstVgId); - SVnode *pImpl = vnodeOpen(dstPath, diskPrimary, pMgmt->pTfs, pMgmt->msgCb); + SVnode *pImpl = vnodeOpen(dstPath, diskPrimary, pMgmt->pTfs, pMgmt->msgCb, false); if (pImpl == NULL) { dError("vgId:%d, failed to open vnode at %s since %s", dstVgId, dstPath, terrstr()); @@ -669,9 +679,10 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, 
SRpcMsg *pMsg) { } SVnodeObj *pVnode = vmAcquireVnode(pMgmt, vgId); - if (pVnode == NULL) { + if (pVnode == NULL || pVnode->failed) { dError("vgId:%d, failed to alter replica since %s", vgId, terrstr()); terrno = TSDB_CODE_VND_NOT_EXIST; + if (pVnode) vmReleaseVnode(pMgmt, pVnode); return -1; } @@ -696,7 +707,7 @@ int32_t vmProcessAlterVnodeReplicaReq(SVnodeMgmt *pMgmt, SRpcMsg *pMsg) { } dInfo("vgId:%d, begin to open vnode", vgId); - SVnode *pImpl = vnodeOpen(path, diskPrimary, pMgmt->pTfs, pMgmt->msgCb); + SVnode *pImpl = vnodeOpen(path, diskPrimary, pMgmt->pTfs, pMgmt->msgCb, false); if (pImpl == NULL) { dError("vgId:%d, failed to open vnode at %s since %s", vgId, path, terrstr()); return -1; @@ -848,14 +859,14 @@ SArray *vmGetMsgHandles() { if (dmSetMgmtHandle(pArray, TDMT_SYNC_APPEND_ENTRIES_BATCH, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_APPEND_ENTRIES_REPLY, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_SNAPSHOT_SEND, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_SYNC_PRE_SNAPSHOT, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_SYNC_PREP_SNAPSHOT, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_FORCE_FOLLOWER, vmPutMsgToSyncQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_TIMEOUT, vmPutMsgToSyncRdQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_HEARTBEAT, vmPutMsgToSyncRdQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_HEARTBEAT_REPLY, vmPutMsgToSyncRdQueue, 0) == NULL) goto _OVER; if (dmSetMgmtHandle(pArray, TDMT_SYNC_SNAPSHOT_RSP, vmPutMsgToSyncRdQueue, 0) == NULL) goto _OVER; - if (dmSetMgmtHandle(pArray, TDMT_SYNC_PRE_SNAPSHOT_REPLY, vmPutMsgToSyncRdQueue, 0) == NULL) goto _OVER; + if (dmSetMgmtHandle(pArray, TDMT_SYNC_PREP_SNAPSHOT_REPLY, vmPutMsgToSyncRdQueue, 0) == NULL) goto _OVER; code = 0; diff --git 
a/source/dnode/mgmt/mgmt_vnode/src/vmInt.c b/source/dnode/mgmt/mgmt_vnode/src/vmInt.c index 963bfa3197..d2093ff77c 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmInt.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmInt.c @@ -112,6 +112,7 @@ int32_t vmOpenVnode(SVnodeMgmt *pMgmt, SWrapperCfg *pCfg, SVnode *pImpl) { pVnode->diskPrimary = pCfg->diskPrimary; pVnode->refCount = 0; pVnode->dropped = 0; + pVnode->failed = 0; pVnode->path = taosStrdup(pCfg->path); pVnode->pImpl = pImpl; @@ -121,11 +122,15 @@ int32_t vmOpenVnode(SVnodeMgmt *pMgmt, SWrapperCfg *pCfg, SVnode *pImpl) { return -1; } - if (vmAllocQueue(pMgmt, pVnode) != 0) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - taosMemoryFree(pVnode->path); - taosMemoryFree(pVnode); - return -1; + if (pImpl) { + if (vmAllocQueue(pMgmt, pVnode) != 0) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + taosMemoryFree(pVnode->path); + taosMemoryFree(pVnode); + return -1; + } + } else { + pVnode->failed = 1; } taosThreadRwlockWrlock(&pMgmt->lock); @@ -267,12 +272,14 @@ static void *vmOpenVnodeInThread(void *param) { int32_t diskPrimary = pCfg->diskPrimary; snprintf(path, TSDB_FILENAME_LEN, "vnode%svnode%d", TD_DIRSEP, pCfg->vgId); - SVnode *pImpl = vnodeOpen(path, diskPrimary, pMgmt->pTfs, pMgmt->msgCb); + SVnode *pImpl = vnodeOpen(path, diskPrimary, pMgmt->pTfs, pMgmt->msgCb, false); if (pImpl == NULL) { dError("vgId:%d, failed to open vnode by thread:%d since %s", pCfg->vgId, pThread->threadIndex, terrstr()); - pThread->failed++; - continue; + if (terrno != TSDB_CODE_NEED_RETRY) { + pThread->failed++; + continue; + } } if (vmOpenVnode(pMgmt, pCfg, pImpl) != 0) { @@ -379,6 +386,7 @@ static void *vmCloseVnodeInThread(void *param) { for (int32_t v = 0; v < pThread->vnodeNum; ++v) { SVnodeObj *pVnode = pThread->ppVnodes[v]; + if (pVnode->failed) continue; char stepDesc[TSDB_STEP_DESC_LEN] = {0}; snprintf(stepDesc, TSDB_STEP_DESC_LEN, "vgId:%d, start to close, %d of %d have been closed", pVnode->vgId, @@ -473,7 +481,9 @@ static void 
vmCheckSyncTimeout(SVnodeMgmt *pMgmt) { if (ppVnodes != NULL) { for (int32_t i = 0; i < numOfVnodes; ++i) { SVnodeObj *pVnode = ppVnodes[i]; - vnodeSyncCheckTimeout(pVnode->pImpl); + if (!pVnode->failed) { + vnodeSyncCheckTimeout(pVnode->pImpl); + } vmReleaseVnode(pMgmt, pVnode); } taosMemoryFree(ppVnodes); @@ -605,6 +615,12 @@ static void *vmRestoreVnodeInThread(void *param) { for (int32_t v = 0; v < pThread->vnodeNum; ++v) { SVnodeObj *pVnode = pThread->ppVnodes[v]; + if (pVnode->failed) { + dError("vgId:%d, skip restoring vnode in failure mode.", pVnode->vgId); + continue; + } + + ASSERT(pVnode->pImpl); char stepDesc[TSDB_STEP_DESC_LEN] = {0}; snprintf(stepDesc, TSDB_STEP_DESC_LEN, "vgId:%d, start to restore, %d of %d have been restored", pVnode->vgId, diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c index 696107ca90..4b18ec4fb0 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmWorker.c @@ -187,9 +187,10 @@ static int32_t vmPutMsgToQueue(SVnodeMgmt *pMgmt, SRpcMsg *pMsg, EQueueType qtyp pHead->vgId = ntohl(pHead->vgId); SVnodeObj *pVnode = vmAcquireVnode(pMgmt, pHead->vgId); - if (pVnode == NULL) { - dGWarn("vgId:%d, msg:%p failed to put into vnode queue since %s, type:%s qtype:%d contLen:%d", pHead->vgId, pMsg, - terrstr(), TMSG_INFO(pMsg->msgType), qtype, pHead->contLen); + if (pVnode == NULL || pVnode->failed) { + dGDebug("vgId:%d, msg:%p failed to put into vnode queue since %s, type:%s qtype:%d contLen:%d", pHead->vgId, pMsg, + terrstr(), TMSG_INFO(pMsg->msgType), qtype, pHead->contLen); terrno = (terrno != 0) ? 
terrno : -1; + if (pVnode) vmReleaseVnode(pMgmt, pVnode); /* drop the ref acquired above when the vnode is in failed state */ return terrno; } @@ -316,7 +317,7 @@ int32_t vmPutRpcMsgToQueue(SVnodeMgmt *pMgmt, EQueueType qtype, SRpcMsg *pRpc) { int32_t vmGetQueueSize(SVnodeMgmt *pMgmt, int32_t vgId, EQueueType qtype) { int32_t size = -1; SVnodeObj *pVnode = vmAcquireVnode(pMgmt, vgId); - if (pVnode != NULL) { + if (pVnode != NULL && !pVnode->failed) { switch (qtype) { case WRITE_QUEUE: size = taosQueueItemSize(pVnode->pWriteW.queue); @@ -339,8 +340,8 @@ int32_t vmGetQueueSize(SVnodeMgmt *pMgmt, int32_t vgId, EQueueType qtype) { default: break; } - vmReleaseVnode(pMgmt, pVnode); } + if (pVnode) vmReleaseVnode(pMgmt, pVnode); if (size < 0) { dTrace("vgId:%d, can't get size from queue since %s, qtype:%d", vgId, terrstr(), qtype); size = 0; diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index 5759737a6a..7f6a0397ad 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c @@ -286,9 +286,10 @@ int32_t mndSyncGetSnapshot(const SSyncFSM *pFsm, SSnapshot *pSnapshot, void *pRe return 0; } -static void mndSyncGetSnapshotInfo(const SSyncFSM *pFsm, SSnapshot *pSnapshot) { +static int32_t mndSyncGetSnapshotInfo(const SSyncFSM *pFsm, SSnapshot *pSnapshot) { SMnode *pMnode = pFsm->data; sdbGetCommitInfo(pMnode->pSdb, &pSnapshot->lastApplyIndex, &pSnapshot->lastApplyTerm, &pSnapshot->lastConfigIndex); + return 0; } void mndRestoreFinish(const SSyncFSM *pFsm, const SyncIndex commitIdx) { diff --git a/source/dnode/vnode/inc/vnode.h b/source/dnode/vnode/inc/vnode.h index c40e2657f9..6a0c991be4 100644 --- a/source/dnode/vnode/inc/vnode.h +++ b/source/dnode/vnode/inc/vnode.h @@ -58,7 +58,7 @@ int32_t vnodeAlterHashRange(const char *srcPath, const char *dstPath, SAlterVnod int32_t vnodeRestoreVgroupId(const char *srcPath, const char *dstPath, int32_t srcVgId, int32_t dstVgId, int32_t diskPrimary, STfs *pTfs); void vnodeDestroy(int32_t vgId, const char *path, STfs *pTfs); -SVnode *vnodeOpen(const char *path, int32_t 
diskPrimary, STfs *pTfs, SMsgCb msgCb); +SVnode *vnodeOpen(const char *path, int32_t diskPrimary, STfs *pTfs, SMsgCb msgCb, bool force); void vnodePreClose(SVnode *pVnode); void vnodePostClose(SVnode *pVnode); void vnodeSyncCheckTimeout(SVnode *pVnode); @@ -69,7 +69,7 @@ int32_t vnodeBegin(SVnode *pVnode); int32_t vnodeStart(SVnode *pVnode); void vnodeStop(SVnode *pVnode); int64_t vnodeGetSyncHandle(SVnode *pVnode); -void vnodeGetSnapshot(SVnode *pVnode, SSnapshot *pSnapshot); +int32_t vnodeGetSnapshot(SVnode *pVnode, SSnapshot *pSnapshot); void vnodeGetInfo(void *pVnode, const char **dbname, int32_t *vgId, int64_t *numOfTables, int64_t *numOfNormalTables); int32_t vnodeProcessCreateTSma(SVnode *pVnode, void *pCont, uint32_t contLen); int32_t vnodeGetTableList(void *pVnode, int8_t type, SArray *pList); @@ -259,11 +259,11 @@ int32_t vnodeEnqueueStreamMsg(SVnode *pVnode, SRpcMsg *pMsg); int32_t smaGetTSmaDays(SVnodeCfg *pCfg, void *pCont, uint32_t contLen, int32_t *days); // SVSnapReader -int32_t vnodeSnapReaderOpen(SVnode *pVnode, int64_t sver, int64_t ever, SVSnapReader **ppReader); +int32_t vnodeSnapReaderOpen(SVnode *pVnode, SSnapshotParam *pParam, SVSnapReader **ppReader); void vnodeSnapReaderClose(SVSnapReader *pReader); int32_t vnodeSnapRead(SVSnapReader *pReader, uint8_t **ppData, uint32_t *nData); // SVSnapWriter -int32_t vnodeSnapWriterOpen(SVnode *pVnode, int64_t sver, int64_t ever, SVSnapWriter **ppWriter); +int32_t vnodeSnapWriterOpen(SVnode *pVnode, SSnapshotParam *pParam, SVSnapWriter **ppWriter); int32_t vnodeSnapWriterClose(SVSnapWriter *pWriter, int8_t rollback, SSnapshot *pSnapshot); int32_t vnodeSnapWrite(SVSnapWriter *pWriter, uint8_t *pData, uint32_t nData); diff --git a/source/dnode/vnode/src/inc/tsdb.h b/source/dnode/vnode/src/inc/tsdb.h index e83f47f7b6..79112babc3 100644 --- a/source/dnode/vnode/src/inc/tsdb.h +++ b/source/dnode/vnode/src/inc/tsdb.h @@ -263,6 +263,7 @@ int32_t tsdbFSRollback(STsdb *pTsdb); int32_t tsdbFSPrepareCommit(STsdb 
*pTsdb, STsdbFS *pFS); int32_t tsdbFSRef(STsdb *pTsdb, STsdbFS *pFS); void tsdbFSUnref(STsdb *pTsdb, STsdbFS *pFS); +void tsdbGetCurrentFName(STsdb *pTsdb, char *current, char *current_t); int32_t tsdbFSUpsertFSet(STsdbFS *pFS, SDFileSet *pSet); int32_t tsdbFSUpsertDelFile(STsdbFS *pFS, SDelFile *pDelFile); @@ -672,6 +673,42 @@ struct SDelFWriter { typedef struct STFileSet STFileSet; typedef TARRAY2(STFileSet *) TFileSetArray; +typedef struct STSnapRange STSnapRange; +typedef TARRAY2(STSnapRange *) TSnapRangeArray; // disjoint snap ranges + +// util +int32_t tSerializeSnapRangeArray(void *buf, int32_t bufLen, TSnapRangeArray *pSnapR); +int32_t tDeserializeSnapRangeArray(void *buf, int32_t bufLen, TSnapRangeArray *pSnapR); +void tsdbSnapRangeArrayDestroy(TSnapRangeArray **ppSnap); +SHashObj *tsdbGetSnapRangeHash(TSnapRangeArray *pRanges); + +// snap partition list +typedef TARRAY2(SVersionRange) SVerRangeList; +typedef struct STsdbSnapPartition STsdbSnapPartition; +typedef TARRAY2(STsdbSnapPartition *) STsdbSnapPartList; +// util +STsdbSnapPartList *tsdbSnapPartListCreate(); +void tsdbSnapPartListDestroy(STsdbSnapPartList **ppList); +int32_t tSerializeTsdbSnapPartList(void *buf, int32_t bufLen, STsdbSnapPartList *pList); +int32_t tDeserializeTsdbSnapPartList(void *buf, int32_t bufLen, STsdbSnapPartList *pList); +int32_t tsdbSnapPartListToRangeDiff(STsdbSnapPartList *pList, TSnapRangeArray **ppRanges); + +enum { + TSDB_SNAP_RANGE_TYP_HEAD = 0, + TSDB_SNAP_RANGE_TYP_DATA, + TSDB_SNAP_RANGE_TYP_SMA, + TSDB_SNAP_RANGE_TYP_TOMB, + TSDB_SNAP_RANGE_TYP_STT, + TSDB_SNAP_RANGE_TYP_MAX, +}; + +struct STsdbSnapPartition { + int64_t fid; + int8_t stat; + SVerRangeList verRanges[TSDB_SNAP_RANGE_TYP_MAX]; +}; + +// snap read struct STsdbReadSnap { SMemTable *pMem; SQueryNode *pNode; @@ -989,6 +1026,15 @@ struct STsdbFilterInfo { TABLEID tbid; }; +typedef enum { + TSDB_FS_STATE_NORMAL = 0, + TSDB_FS_STATE_INCOMPLETE, +} ETsdbFsState; + +// utils +ETsdbFsState 
tsdbSnapGetFsState(SVnode *pVnode); +int32_t tsdbSnapGetDetails(SVnode *pVnode, SSnapshot *pSnap); + #ifdef __cplusplus } #endif diff --git a/source/dnode/vnode/src/inc/vnodeInt.h b/source/dnode/vnode/src/inc/vnodeInt.h index 823e9d57f6..12e273c32d 100644 --- a/source/dnode/vnode/src/inc/vnodeInt.h +++ b/source/dnode/vnode/src/inc/vnodeInt.h @@ -202,7 +202,7 @@ typedef struct SMetaInfo { int32_t metaGetInfo(SMeta* pMeta, int64_t uid, SMetaInfo* pInfo, SMetaReader* pReader); // tsdb -int tsdbOpen(SVnode* pVnode, STsdb** ppTsdb, const char* dir, STsdbKeepCfg* pKeepCfg, int8_t rollback); +int tsdbOpen(SVnode* pVnode, STsdb** ppTsdb, const char* dir, STsdbKeepCfg* pKeepCfg, int8_t rollback, bool force); int tsdbClose(STsdb** pTsdb); int32_t tsdbBegin(STsdb* pTsdb); // int32_t tsdbPrepareCommit(STsdb* pTsdb); @@ -267,7 +267,7 @@ int32_t tqProcessTaskScanHistoryFinishRsp(STQ* pTq, SRpcMsg* pMsg); // sma int32_t smaInit(); void smaCleanUp(); -int32_t smaOpen(SVnode* pVnode, int8_t rollback); +int32_t smaOpen(SVnode* pVnode, int8_t rollback, bool force); int32_t smaClose(SSma* pSma); int32_t smaBegin(SSma* pSma); int32_t smaPrepareAsyncCommit(SSma* pSma); @@ -295,11 +295,12 @@ int32_t metaSnapWriterOpen(SMeta* pMeta, int64_t sver, int64_t ever, SMetaSnapWr int32_t metaSnapWrite(SMetaSnapWriter* pWriter, uint8_t* pData, uint32_t nData); int32_t metaSnapWriterClose(SMetaSnapWriter** ppWriter, int8_t rollback); // STsdbSnapReader ======================================== -int32_t tsdbSnapReaderOpen(STsdb* pTsdb, int64_t sver, int64_t ever, int8_t type, STsdbSnapReader** ppReader); +int32_t tsdbSnapReaderOpen(STsdb* pTsdb, int64_t sver, int64_t ever, int8_t type, void* pRanges, + STsdbSnapReader** ppReader); int32_t tsdbSnapReaderClose(STsdbSnapReader** ppReader); int32_t tsdbSnapRead(STsdbSnapReader* pReader, uint8_t** ppData); // STsdbSnapWriter ======================================== -int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, STsdbSnapWriter** 
ppWriter); +int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, void* pRanges, STsdbSnapWriter** ppWriter); int32_t tsdbSnapWrite(STsdbSnapWriter* pWriter, SSnapDataHdr* pHdr); int32_t tsdbSnapWriterPrepareClose(STsdbSnapWriter* pWriter); int32_t tsdbSnapWriterClose(STsdbSnapWriter** ppWriter, int8_t rollback); @@ -356,8 +357,9 @@ int32_t rsmaSnapReaderOpen(SSma* pSma, int64_t sver, int64_t ever, SRSmaSnapRead int32_t rsmaSnapReaderClose(SRSmaSnapReader** ppReader); int32_t rsmaSnapRead(SRSmaSnapReader* pReader, uint8_t** ppData); // SRSmaSnapWriter ======================================== -int32_t rsmaSnapWriterOpen(SSma* pSma, int64_t sver, int64_t ever, SRSmaSnapWriter** ppWriter); +int32_t rsmaSnapWriterOpen(SSma* pSma, int64_t sver, int64_t ever, void** ppRanges, SRSmaSnapWriter** ppWriter); int32_t rsmaSnapWrite(SRSmaSnapWriter* pWriter, uint8_t* pData, uint32_t nData); +int32_t rsmaSnapWriterPrepareClose(SRSmaSnapWriter* pWriter); int32_t rsmaSnapWriterClose(SRSmaSnapWriter** ppWriter, int8_t rollback); typedef struct { @@ -497,6 +499,7 @@ struct SSma { #define SMA_RSMA_TSDB0(s) ((s)->pVnode->pTsdb) #define SMA_RSMA_TSDB1(s) ((s)->pRSmaTsdb[TSDB_RETENTION_L0]) #define SMA_RSMA_TSDB2(s) ((s)->pRSmaTsdb[TSDB_RETENTION_L1]) +#define SMA_RSMA_GET_TSDB(pVnode, level) (((level) == 0) ? 
(pVnode)->pTsdb : (pVnode)->pSma->pRSmaTsdb[(level) - 1]) /* level is evaluated twice; pass a side-effect-free expression */ // sma void smaHandleRes(void* pVnode, int64_t smaId, const SArray* data); diff --git a/source/dnode/vnode/src/sma/smaOpen.c b/source/dnode/vnode/src/sma/smaOpen.c index 09929d138e..49f25c0b0a 100644 --- a/source/dnode/vnode/src/sma/smaOpen.c +++ b/source/dnode/vnode/src/sma/smaOpen.c @@ -30,7 +30,7 @@ static int32_t rsmaRestore(SSma *pSma); pKeepCfg->keepTimeOffset = 0; \ } while (0) -#define SMA_OPEN_RSMA_IMPL(v, l) \ +#define SMA_OPEN_RSMA_IMPL(v, l, force) \ do { \ SRetention *r = (SRetention *)VND_RETENTIONS(v) + l; \ if (!RETENTION_VALID(r)) { \ @@ -42,7 +42,7 @@ static int32_t rsmaRestore(SSma *pSma); } \ code = smaSetKeepCfg(v, &keepCfg, pCfg, TSDB_TYPE_RSMA_L##l); \ TSDB_CHECK_CODE(code, lino, _exit); \ - if (tsdbOpen(v, &SMA_RSMA_TSDB##l(pSma), VNODE_RSMA##l##_DIR, &keepCfg, rollback) < 0) { \ + if (tsdbOpen(v, &SMA_RSMA_TSDB##l(pSma), VNODE_RSMA##l##_DIR, &keepCfg, rollback, force) < 0) { \ code = terrno; \ TSDB_CHECK_CODE(code, lino, _exit); \ } \ @@ -118,7 +118,7 @@ int smaSetKeepCfg(SVnode *pVnode, STsdbKeepCfg *pKeepCfg, STsdbCfg *pCfg, int ty return terrno; } -int32_t smaOpen(SVnode *pVnode, int8_t rollback) { +int32_t smaOpen(SVnode *pVnode, int8_t rollback, bool force) { int32_t code = 0; int32_t lino = 0; STsdbCfg *pCfg = &pVnode->config.tsdbCfg; @@ -139,11 +139,11 @@ int32_t smaOpen(SVnode *pVnode, int8_t rollback) { STsdbKeepCfg keepCfg = {0}; for (int32_t i = 0; i < TSDB_RETENTION_MAX; ++i) { if (i == TSDB_RETENTION_L0) { - SMA_OPEN_RSMA_IMPL(pVnode, 0); + SMA_OPEN_RSMA_IMPL(pVnode, 0, force); } else if (i == TSDB_RETENTION_L1) { - SMA_OPEN_RSMA_IMPL(pVnode, 1); + SMA_OPEN_RSMA_IMPL(pVnode, 1, force); } else if (i == TSDB_RETENTION_L2) { - SMA_OPEN_RSMA_IMPL(pVnode, 2); + SMA_OPEN_RSMA_IMPL(pVnode, 2, force); } } diff --git a/source/dnode/vnode/src/sma/smaSnapshot.c b/source/dnode/vnode/src/sma/smaSnapshot.c index e01a33936b..c93d9a7de6 100644 --- a/source/dnode/vnode/src/sma/smaSnapshot.c 
+++ b/source/dnode/vnode/src/sma/smaSnapshot.c @@ -48,7 +48,7 @@ int32_t rsmaSnapReaderOpen(SSma* pSma, int64_t sver, int64_t ever, SRSmaSnapRead // open rsma1/rsma2 for (int32_t i = 0; i < TSDB_RETENTION_L2; ++i) { if (pSma->pRSmaTsdb[i]) { - code = tsdbSnapReaderOpen(pSma->pRSmaTsdb[i], sver, ever, i == 0 ? SNAP_DATA_RSMA1 : SNAP_DATA_RSMA2, + code = tsdbSnapReaderOpen(pSma->pRSmaTsdb[i], sver, ever, (i == 0 ? SNAP_DATA_RSMA1 : SNAP_DATA_RSMA2), NULL, &pReader->pDataReader[i]); TSDB_CHECK_CODE(code, lino, _exit); } @@ -128,7 +128,7 @@ struct SRSmaSnapWriter { STsdbSnapWriter* pDataWriter[TSDB_RETENTION_L2]; }; -int32_t rsmaSnapWriterOpen(SSma* pSma, int64_t sver, int64_t ever, SRSmaSnapWriter** ppWriter) { +int32_t rsmaSnapWriterOpen(SSma* pSma, int64_t sver, int64_t ever, void** ppRanges, SRSmaSnapWriter** ppWriter) { int32_t code = 0; int32_t lino = 0; SVnode* pVnode = pSma->pVnode; @@ -147,7 +147,7 @@ int32_t rsmaSnapWriterOpen(SSma* pSma, int64_t sver, int64_t ever, SRSmaSnapWrit // rsma1/rsma2 for (int32_t i = 0; i < TSDB_RETENTION_L2; ++i) { if (pSma->pRSmaTsdb[i]) { - code = tsdbSnapWriterOpen(pSma->pRSmaTsdb[i], sver, ever, &pWriter->pDataWriter[i]); + code = tsdbSnapWriterOpen(pSma->pRSmaTsdb[i], sver, ever, ((void**)ppRanges)[i], &pWriter->pDataWriter[i]); TSDB_CHECK_CODE(code, lino, _exit); } } @@ -165,6 +165,21 @@ _exit: return code; } +int32_t rsmaSnapWriterPrepareClose(SRSmaSnapWriter* pWriter) { + int32_t code = 0; + for (int32_t i = 0; i < TSDB_RETENTION_L2; ++i) { + if (pWriter->pDataWriter[i]) { + code = tsdbSnapWriterPrepareClose(pWriter->pDataWriter[i]); + if (code) { + smaError("vgId:%d, failed to prepare close tsdbSnapWriter since %s. 
i: %d", SMA_VID(pWriter->pSma), terrstr(), + i); + return -1; + } + } + } + return code; +} + int32_t rsmaSnapWriterClose(SRSmaSnapWriter** ppWriter, int8_t rollback) { int32_t code = 0; int32_t lino = 0; diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index 98464d082c..a5832d3c66 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -1200,6 +1200,9 @@ int32_t tqProcessTaskScanHistory(STQ* pTq, SRpcMsg* pMsg) { pStreamTask->status.taskStatus = TASK_STATUS__HALT; nextProcessedVer = walReaderGetCurrentVer(pStreamTask->exec.pWalReader); + if (nextProcessedVer == -1) { + nextProcessedVer = pStreamTask->dataRange.range.maxVer + 1; + } tqDebug("s-task:%s level:%d nextProcessedVer:%" PRId64 ", sched-status:%d is halt by fill-history task:%s", pStreamTask->id.idStr, pStreamTask->info.taskLevel, nextProcessedVer, pStreamTask->status.schedStatus, @@ -1975,4 +1978,4 @@ int32_t tqProcessTaskResetReq(STQ* pTq, SRpcMsg* pMsg) { streamMetaReleaseTask(pMeta, pTask); return TSDB_CODE_SUCCESS; -} \ No newline at end of file +} diff --git a/source/dnode/vnode/src/tsdb/tsdbDataFileRW.c b/source/dnode/vnode/src/tsdb/tsdbDataFileRW.c index 6e4cb517ff..df6b85a889 100644 --- a/source/dnode/vnode/src/tsdb/tsdbDataFileRW.c +++ b/source/dnode/vnode/src/tsdb/tsdbDataFileRW.c @@ -15,10 +15,6 @@ #include "tsdbDataFileRW.h" -extern int32_t tsdbFileWriteTombBlock(STsdbFD *fd, STombBlock *tombBlock, int8_t cmprAlg, int64_t *fileSize, - TTombBlkArray *tombBlkArray, uint8_t **bufArr); -extern int32_t tsdbFileWriteTombBlk(STsdbFD *fd, const TTombBlkArray *tombBlkArray, SFDataPtr *ptr, int64_t *fileSize); - // SDataFileReader ============================================= struct SDataFileReader { SDataFileReaderConfig config[1]; @@ -491,6 +487,9 @@ struct SDataFileWriter { int32_t tombBlkArrayIdx; STombBlock tombBlock[1]; int32_t tombBlockIdx; + // range + SVersionRange range; + SVersionRange tombRange; } ctx[1]; STFile files[TSDB_FTYPE_MAX]; 
@@ -589,6 +588,8 @@ static int32_t tsdbDataFileWriterDoOpen(SDataFileWriter *writer) { .fid = writer->config->fid, .cid = writer->config->cid, .size = 0, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; // .data @@ -602,6 +603,8 @@ static int32_t tsdbDataFileWriterDoOpen(SDataFileWriter *writer) { .fid = writer->config->fid, .cid = writer->config->cid, .size = 0, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; } @@ -616,6 +619,8 @@ static int32_t tsdbDataFileWriterDoOpen(SDataFileWriter *writer) { .fid = writer->config->fid, .cid = writer->config->cid, .size = 0, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; } @@ -627,8 +632,14 @@ static int32_t tsdbDataFileWriterDoOpen(SDataFileWriter *writer) { .fid = writer->config->fid, .cid = writer->config->cid, .size = 0, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; + // range + writer->ctx->range = (SVersionRange){.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; + writer->ctx->tombRange = (SVersionRange){.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; + writer->ctx->opened = true; _exit: @@ -638,8 +649,14 @@ _exit: return code; } +int32_t tsdbWriterUpdVerRange(SVersionRange *range, int64_t minVer, int64_t maxVer) { + range->minVer = TMIN(range->minVer, minVer); + range->maxVer = TMAX(range->maxVer, maxVer); + return 0; +} + int32_t tsdbFileWriteBrinBlock(STsdbFD *fd, SBrinBlock *brinBlock, int8_t cmprAlg, int64_t *fileSize, - TBrinBlkArray *brinBlkArray, uint8_t **bufArr) { + TBrinBlkArray *brinBlkArray, uint8_t **bufArr, SVersionRange *range) { if (BRIN_BLOCK_SIZE(brinBlock) == 0) return 0; int32_t code; @@ -678,6 +695,8 @@ int32_t tsdbFileWriteBrinBlock(STsdbFD *fd, SBrinBlock *brinBlock, int8_t cmprAl } } + tsdbWriterUpdVerRange(range, brinBlk->minVer, brinBlk->maxVer); + // write to file for (int32_t i = 0; i < ARRAY_SIZE(brinBlock->dataArr1); i++) { code = tsdbCmprData((uint8_t *)TARRAY2_DATA(brinBlock->dataArr1 + i), TARRAY2_DATA_LEN(brinBlock->dataArr1 + i), @@ -728,7 +747,8 @@ static int32_t 
tsdbDataFileWriteBrinBlock(SDataFileWriter *writer) { int32_t lino = 0; code = tsdbFileWriteBrinBlock(writer->fd[TSDB_FTYPE_HEAD], writer->brinBlock, writer->config->cmprAlg, - &writer->files[TSDB_FTYPE_HEAD].size, writer->brinBlkArray, writer->config->bufArr); + &writer->files[TSDB_FTYPE_HEAD].size, writer->brinBlkArray, writer->config->bufArr, + &writer->ctx->range); TSDB_CHECK_CODE(code, lino, _exit); _exit: @@ -795,6 +815,8 @@ static int32_t tsdbDataFileDoWriteBlockData(SDataFileWriter *writer, SBlockData } } + tsdbWriterUpdVerRange(&writer->ctx->range, record->minVer, record->maxVer); + // to .data file int32_t sizeArr[5] = {0}; @@ -1143,6 +1165,64 @@ int32_t tsdbFileWriteHeadFooter(STsdbFD *fd, int64_t *fileSize, const SHeadFoote return 0; } +int32_t tsdbFileWriteTombBlock(STsdbFD *fd, STombBlock *tombBlock, int8_t cmprAlg, int64_t *fileSize, + TTombBlkArray *tombBlkArray, uint8_t **bufArr, SVersionRange *range) { + int32_t code; + + if (TOMB_BLOCK_SIZE(tombBlock) == 0) return 0; + + STombBlk tombBlk[1] = {{ + .dp[0] = + { + .offset = *fileSize, + .size = 0, + }, + .minTbid = + { + .suid = TARRAY2_FIRST(tombBlock->suid), + .uid = TARRAY2_FIRST(tombBlock->uid), + }, + .maxTbid = + { + .suid = TARRAY2_LAST(tombBlock->suid), + .uid = TARRAY2_LAST(tombBlock->uid), + }, + .minVer = TARRAY2_FIRST(tombBlock->version), + .maxVer = TARRAY2_FIRST(tombBlock->version), + .numRec = TOMB_BLOCK_SIZE(tombBlock), + .cmprAlg = cmprAlg, + }}; + + for (int32_t i = 1; i < TOMB_BLOCK_SIZE(tombBlock); i++) { + if (tombBlk->minVer > TARRAY2_GET(tombBlock->version, i)) { + tombBlk->minVer = TARRAY2_GET(tombBlock->version, i); + } + if (tombBlk->maxVer < TARRAY2_GET(tombBlock->version, i)) { + tombBlk->maxVer = TARRAY2_GET(tombBlock->version, i); + } + } + + tsdbWriterUpdVerRange(range, tombBlk->minVer, tombBlk->maxVer); + + for (int32_t i = 0; i < ARRAY_SIZE(tombBlock->dataArr); i++) { + code = tsdbCmprData((uint8_t *)TARRAY2_DATA(&tombBlock->dataArr[i]), 
TARRAY2_DATA_LEN(&tombBlock->dataArr[i]), + TSDB_DATA_TYPE_BIGINT, tombBlk->cmprAlg, &bufArr[0], 0, &tombBlk->size[i], &bufArr[1]); + if (code) return code; + + code = tsdbWriteFile(fd, *fileSize, bufArr[0], tombBlk->size[i]); + if (code) return code; + + tombBlk->dp->size += tombBlk->size[i]; + *fileSize += tombBlk->size[i]; + } + + code = TARRAY2_APPEND_PTR(tombBlkArray, tombBlk); + if (code) return code; + + tTombBlockClear(tombBlock); + return 0; +} + static int32_t tsdbDataFileWriteHeadFooter(SDataFileWriter *writer) { int32_t code = 0; int32_t lino = 0; @@ -1164,7 +1244,8 @@ static int32_t tsdbDataFileDoWriteTombBlock(SDataFileWriter *writer) { int32_t lino = 0; code = tsdbFileWriteTombBlock(writer->fd[TSDB_FTYPE_TOMB], writer->tombBlock, writer->config->cmprAlg, - &writer->files[TSDB_FTYPE_TOMB].size, writer->tombBlkArray, writer->config->bufArr); + &writer->files[TSDB_FTYPE_TOMB].size, writer->tombBlkArray, writer->config->bufArr, + &writer->ctx->tombRange); TSDB_CHECK_CODE(code, lino, _exit); _exit: @@ -1174,6 +1255,21 @@ _exit: return code; } +int32_t tsdbFileWriteTombBlk(STsdbFD *fd, const TTombBlkArray *tombBlkArray, SFDataPtr *ptr, int64_t *fileSize) { + ptr->size = TARRAY2_DATA_LEN(tombBlkArray); + if (ptr->size > 0) { + ptr->offset = *fileSize; + + int32_t code = tsdbWriteFile(fd, *fileSize, (const uint8_t *)TARRAY2_DATA(tombBlkArray), ptr->size); + if (code) { + return code; + } + + *fileSize += ptr->size; + } + return 0; +} + static int32_t tsdbDataFileDoWriteTombBlk(SDataFileWriter *writer) { ASSERT(TARRAY2_SIZE(writer->tombBlkArray) > 0); @@ -1306,6 +1402,12 @@ _exit: return code; } +int32_t tsdbTFileUpdVerRange(STFile *f, SVersionRange range) { + f->minVer = TMIN(f->minVer, range.minVer); + f->maxVer = TMAX(f->maxVer, range.maxVer); + return 0; +} + static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArray *opArr) { int32_t code = 0; int32_t lino = 0; @@ -1334,6 +1436,8 @@ static int32_t 
tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr code = tsdbDataFileWriteHeadFooter(writer); TSDB_CHECK_CODE(code, lino, _exit); + SVersionRange ofRange = {.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; + // .head ftype = TSDB_FTYPE_HEAD; if (writer->config->files[ftype].exist) { @@ -1342,6 +1446,7 @@ static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr .fid = writer->config->fid, .of = writer->config->files[ftype].file, }; + ofRange = (SVersionRange){.minVer = op.of.minVer, .maxVer = op.of.maxVer}; code = TARRAY2_APPEND(opArr, op); TSDB_CHECK_CODE(code, lino, _exit); } @@ -1350,6 +1455,8 @@ static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr .fid = writer->config->fid, .nf = writer->files[ftype], }; + tsdbTFileUpdVerRange(&op.nf, ofRange); + tsdbTFileUpdVerRange(&op.nf, writer->ctx->range); code = TARRAY2_APPEND(opArr, op); TSDB_CHECK_CODE(code, lino, _exit); @@ -1361,6 +1468,7 @@ static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr .fid = writer->config->fid, .nf = writer->files[ftype], }; + tsdbTFileUpdVerRange(&op.nf, writer->ctx->range); code = TARRAY2_APPEND(opArr, op); TSDB_CHECK_CODE(code, lino, _exit); } else if (writer->config->files[ftype].file.size != writer->files[ftype].size) { @@ -1370,6 +1478,7 @@ static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr .of = writer->config->files[ftype].file, .nf = writer->files[ftype], }; + tsdbTFileUpdVerRange(&op.nf, writer->ctx->range); code = TARRAY2_APPEND(opArr, op); TSDB_CHECK_CODE(code, lino, _exit); } @@ -1382,6 +1491,7 @@ static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr .fid = writer->config->fid, .nf = writer->files[ftype], }; + tsdbTFileUpdVerRange(&op.nf, writer->ctx->range); code = TARRAY2_APPEND(opArr, op); TSDB_CHECK_CODE(code, lino, _exit); } else if (writer->config->files[ftype].file.size != writer->files[ftype].size) { @@ -1391,6 +1501,7 @@ 
static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr .of = writer->config->files[ftype].file, .nf = writer->files[ftype], }; + tsdbTFileUpdVerRange(&op.nf, writer->ctx->range); code = TARRAY2_APPEND(opArr, op); TSDB_CHECK_CODE(code, lino, _exit); } @@ -1415,6 +1526,8 @@ static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr code = tsdbDataFileWriteTombFooter(writer); TSDB_CHECK_CODE(code, lino, _exit); + SVersionRange ofRange = (SVersionRange){.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; + ftype = TSDB_FTYPE_TOMB; if (writer->config->files[ftype].exist) { op = (STFileOp){ @@ -1422,6 +1535,7 @@ static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr .fid = writer->config->fid, .of = writer->config->files[ftype].file, }; + ofRange = (SVersionRange){.minVer = op.of.minVer, .maxVer = op.of.maxVer}; code = TARRAY2_APPEND(opArr, op); TSDB_CHECK_CODE(code, lino, _exit); } @@ -1430,6 +1544,8 @@ static int32_t tsdbDataFileWriterCloseCommit(SDataFileWriter *writer, TFileOpArr .fid = writer->config->fid, .nf = writer->files[ftype], }; + tsdbTFileUpdVerRange(&op.nf, ofRange); + tsdbTFileUpdVerRange(&op.nf, writer->ctx->tombRange); code = TARRAY2_APPEND(opArr, op); TSDB_CHECK_CODE(code, lino, _exit); } @@ -1598,6 +1714,7 @@ int32_t tsdbDataFileWriteBlockData(SDataFileWriter *writer, SBlockData *bData) { ) { code = tsdbDataFileDoWriteBlockData(writer, bData); TSDB_CHECK_CODE(code, lino, _exit); + } else { for (int32_t i = 0; i < bData->nRow; ++i) { TSDBROW row[1] = {tsdbRowFromBlockData(bData, i)}; diff --git a/source/dnode/vnode/src/tsdb/tsdbDataFileRW.h b/source/dnode/vnode/src/tsdb/tsdbDataFileRW.h index 827b58fb4a..c4aed6e787 100644 --- a/source/dnode/vnode/src/tsdb/tsdbDataFileRW.h +++ b/source/dnode/vnode/src/tsdb/tsdbDataFileRW.h @@ -95,10 +95,25 @@ int32_t tsdbDataFileWriteRow(SDataFileWriter *writer, SRowInfo *row); int32_t tsdbDataFileWriteBlockData(SDataFileWriter *writer, SBlockData 
*bData); int32_t tsdbDataFileFlush(SDataFileWriter *writer); +// head +int32_t tsdbFileWriteBrinBlock(STsdbFD *fd, SBrinBlock *brinBlock, int8_t cmprAlg, int64_t *fileSize, + TBrinBlkArray *brinBlkArray, uint8_t **bufArr, SVersionRange *range); +int32_t tsdbFileWriteBrinBlk(STsdbFD *fd, TBrinBlkArray *brinBlkArray, SFDataPtr *ptr, int64_t *fileSize); +int32_t tsdbFileWriteHeadFooter(STsdbFD *fd, int64_t *fileSize, const SHeadFooter *footer); + +// tomb int32_t tsdbDataFileWriteTombRecord(SDataFileWriter *writer, const STombRecord *record); +int32_t tsdbFileWriteTombBlock(STsdbFD *fd, STombBlock *tombBlock, int8_t cmprAlg, int64_t *fileSize, + TTombBlkArray *tombBlkArray, uint8_t **bufArr, SVersionRange *range); +int32_t tsdbFileWriteTombBlk(STsdbFD *fd, const TTombBlkArray *tombBlkArray, SFDataPtr *ptr, int64_t *fileSize); +int32_t tsdbFileWriteTombFooter(STsdbFD *fd, const STombFooter *footer, int64_t *fileSize); + +// utils +int32_t tsdbWriterUpdVerRange(SVersionRange *range, int64_t minVer, int64_t maxVer); +int32_t tsdbTFileUpdVerRange(STFile *f, SVersionRange range); #ifdef __cplusplus } #endif -#endif /*_TSDB_DATA_FILE_RW_H*/ \ No newline at end of file +#endif /*_TSDB_DATA_FILE_RW_H*/ diff --git a/source/dnode/vnode/src/tsdb/tsdbFS2.c b/source/dnode/vnode/src/tsdb/tsdbFS2.c index afe6ef6e1a..93a16b5502 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFS2.c +++ b/source/dnode/vnode/src/tsdb/tsdbFS2.c @@ -38,13 +38,6 @@ typedef struct { STFileHashEntry **buckets; } STFileHash; -enum { - TSDB_FS_STATE_NONE = 0, - TSDB_FS_STATE_OPEN, - TSDB_FS_STATE_EDIT, - TSDB_FS_STATE_CLOSE, -}; - static const char *gCurrentFname[] = { [TSDB_FCURRENT] = "current.json", [TSDB_FCURRENT_C] = "current.c.json", @@ -57,7 +50,7 @@ static int32_t create_fs(STsdb *pTsdb, STFileSystem **fs) { fs[0]->tsdb = pTsdb; tsem_init(&fs[0]->canEdit, 0, 1); - fs[0]->state = TSDB_FS_STATE_NONE; + fs[0]->fsstate = TSDB_FS_STATE_NORMAL; fs[0]->neid = 0; TARRAY2_INIT(fs[0]->fSetArr); 
TARRAY2_INIT(fs[0]->fSetArrTmp); @@ -258,14 +251,6 @@ _exit: return code; } -static bool is_same_file(const STFile *f1, const STFile f2) { - if (f1->type != f2.type) return false; - if (f1->did.level != f2.did.level) return false; - if (f1->did.id != f2.did.id) return false; - if (f1->cid != f2.cid) return false; - return true; -} - static int32_t apply_commit(STFileSystem *fs) { int32_t code = 0; TFileSetArray *fsetArray1 = fs->fSetArr; @@ -504,6 +489,7 @@ static void tsdbFSDestroyFileObjHash(STFileHash *hash) { static int32_t tsdbFSDoSanAndFix(STFileSystem *fs) { int32_t code = 0; int32_t lino = 0; + int32_t corrupt = false; { // scan each file STFileSet *fset = NULL; @@ -511,8 +497,12 @@ static int32_t tsdbFSDoSanAndFix(STFileSystem *fs) { // data file for (int32_t ftype = 0; ftype < TSDB_FTYPE_MAX; ftype++) { if (fset->farr[ftype] == NULL) continue; - code = tsdbFSDoScanAndFixFile(fs, fset->farr[ftype]); - TSDB_CHECK_CODE(code, lino, _exit); + STFileObj *fobj = fset->farr[ftype]; + code = tsdbFSDoScanAndFixFile(fs, fobj); + if (code) { + fset->maxVerValid = (fobj->f->minVer <= fobj->f->maxVer) ? TMIN(fset->maxVerValid, fobj->f->minVer - 1) : -1; + corrupt = true; + } } // stt file @@ -521,12 +511,22 @@ static int32_t tsdbFSDoSanAndFix(STFileSystem *fs) { STFileObj *fobj; TARRAY2_FOREACH(lvl->fobjArr, fobj) { code = tsdbFSDoScanAndFixFile(fs, fobj); - TSDB_CHECK_CODE(code, lino, _exit); + if (code) { + fset->maxVerValid = (fobj->f->minVer <= fobj->f->maxVer) ? 
TMIN(fset->maxVerValid, fobj->f->minVer - 1) : -1; + corrupt = true; + } } } } } + if (corrupt) { + tsdbError("vgId:%d, not to clear dangling files due to fset incompleteness", TD_VID(fs->tsdb->pVnode)); + fs->fsstate = TSDB_FS_STATE_INCOMPLETE; + code = 0; + goto _exit; + } + { // clear unreferenced files STfsDir *dir = tfsOpendir(fs->tsdb->pVnode->pTfs, fs->tsdb->path); if (dir == NULL) { @@ -961,6 +961,13 @@ int32_t tsdbFSDestroyCopySnapshot(TFileSetArray **fsetArr) { } int32_t tsdbFSCreateRefSnapshot(STFileSystem *fs, TFileSetArray **fsetArr) { + taosThreadRwlockRdlock(&fs->tsdb->rwLock); + int32_t code = tsdbFSCreateRefSnapshotWithoutLock(fs, fsetArr); + taosThreadRwlockUnlock(&fs->tsdb->rwLock); + return code; +} + +int32_t tsdbFSCreateRefSnapshotWithoutLock(STFileSystem *fs, TFileSetArray **fsetArr) { int32_t code = 0; STFileSet *fset, *fset1; @@ -991,6 +998,142 @@ int32_t tsdbFSDestroyRefSnapshot(TFileSetArray **fsetArr) { return 0; } +int32_t tsdbFSCreateCopyRangedSnapshot(STFileSystem *fs, TSnapRangeArray *pRanges, TFileSetArray **fsetArr, + TFileOpArray *fopArr) { + int32_t code = 0; + STFileSet *fset; + STFileSet *fset1; + SHashObj *pHash = NULL; + + fsetArr[0] = taosMemoryMalloc(sizeof(TFileSetArray)); + if (fsetArr == NULL) return TSDB_CODE_OUT_OF_MEMORY; + TARRAY2_INIT(fsetArr[0]); + + if (pRanges) { + pHash = tsdbGetSnapRangeHash(pRanges); + if (pHash == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + goto _out; + } + } + + taosThreadRwlockRdlock(&fs->tsdb->rwLock); + TARRAY2_FOREACH(fs->fSetArr, fset) { + int64_t ever = VERSION_MAX; + if (pHash) { + int32_t fid = fset->fid; + STSnapRange *u = taosHashGet(pHash, &fid, sizeof(fid)); + if (u) { + ever = u->sver - 1; + } + } + + code = tsdbTFileSetFilteredInitDup(fs->tsdb, fset, ever, &fset1, fopArr); + if (code) break; + + code = TARRAY2_APPEND(fsetArr[0], fset1); + if (code) break; + } + taosThreadRwlockUnlock(&fs->tsdb->rwLock); + +_out: + if (code) { + TARRAY2_DESTROY(fsetArr[0], tsdbTFileSetClear); + 
taosMemoryFree(fsetArr[0]); + fsetArr[0] = NULL; + } + if (pHash) { + taosHashCleanup(pHash); + pHash = NULL; + } + return code; +} + +SHashObj *tsdbGetSnapRangeHash(TSnapRangeArray *pRanges) { + int32_t capacity = TARRAY2_SIZE(pRanges) * 2; + SHashObj *pHash = taosHashInit(capacity, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_ENTRY_LOCK); + if (pHash == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + + for (int32_t i = 0; i < TARRAY2_SIZE(pRanges); i++) { + STSnapRange *u = TARRAY2_GET(pRanges, i); + int32_t fid = u->fid; + int32_t code = taosHashPut(pHash, &fid, sizeof(fid), u, sizeof(*u)); + ASSERT(code == 0); + tsdbDebug("range diff hash fid:%d, sver:%" PRId64 ", ever:%" PRId64, u->fid, u->sver, u->ever); + } + return pHash; +} + +int32_t tsdbFSCreateRefRangedSnapshot(STFileSystem *fs, int64_t sver, int64_t ever, TSnapRangeArray *pRanges, + TSnapRangeArray **fsrArr) { + int32_t code = 0; + STFileSet *fset; + STSnapRange *fsr1 = NULL; + SHashObj *pHash = NULL; + + fsrArr[0] = taosMemoryCalloc(1, sizeof(*fsrArr[0])); + if (fsrArr[0] == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + goto _out; + } + + tsdbInfo("pRanges size:%d", (pRanges == NULL ? 
0 : TARRAY2_SIZE(pRanges))); + if (pRanges) { + pHash = tsdbGetSnapRangeHash(pRanges); + if (pHash == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + goto _out; + } + } + + taosThreadRwlockRdlock(&fs->tsdb->rwLock); + TARRAY2_FOREACH(fs->fSetArr, fset) { + int64_t sver1 = sver; + int64_t ever1 = ever; + + if (pHash) { + int32_t fid = fset->fid; + STSnapRange *u = taosHashGet(pHash, &fid, sizeof(fid)); + if (u) { + sver1 = u->sver; + tsdbDebug("range hash get fid:%d, sver:%" PRId64 ", ever:%" PRId64, u->fid, u->sver, u->ever); + } + } + + if (sver1 > ever1) { + tsdbDebug("skip fid:%d, sver:%" PRId64 ", ever:%" PRId64, fset->fid, sver1, ever1); + continue; + } + + tsdbDebug("fsrArr:%p, fid:%d, sver:%" PRId64 ", ever:%" PRId64, fsrArr, fset->fid, sver1, ever1); + + code = tsdbTSnapRangeInitRef(fs->tsdb, fset, sver1, ever1, &fsr1); + if (code) break; + + code = TARRAY2_APPEND(fsrArr[0], fsr1); + if (code) break; + + fsr1 = NULL; + } + taosThreadRwlockUnlock(&fs->tsdb->rwLock); + + if (code) { + tsdbTSnapRangeClear(&fsr1); + TARRAY2_DESTROY(fsrArr[0], tsdbTSnapRangeClear); + fsrArr[0] = NULL; + } + +_out: + if (pHash) { + taosHashCleanup(pHash); + pHash = NULL; + } + return code; +} + const char *gFSBgTaskName[] = {NULL, "MERGE", "RETENTION", "COMPACT"}; static int32_t tsdbFSRunBgTask(void *arg) { @@ -1148,4 +1291,4 @@ int32_t tsdbFSEnableBgTask(STFileSystem *fs) { fs->stop = false; taosThreadMutexUnlock(fs->mutex); return 0; -} \ No newline at end of file +} diff --git a/source/dnode/vnode/src/tsdb/tsdbFS2.h b/source/dnode/vnode/src/tsdb/tsdbFS2.h index b0f42a0c48..31b98e5656 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFS2.h +++ b/source/dnode/vnode/src/tsdb/tsdbFS2.h @@ -52,7 +52,15 @@ int32_t tsdbCloseFS(STFileSystem **fs); int32_t tsdbFSCreateCopySnapshot(STFileSystem *fs, TFileSetArray **fsetArr); int32_t tsdbFSDestroyCopySnapshot(TFileSetArray **fsetArr); int32_t tsdbFSCreateRefSnapshot(STFileSystem *fs, TFileSetArray **fsetArr); +int32_t 
tsdbFSCreateRefSnapshotWithoutLock(STFileSystem *fs, TFileSetArray **fsetArr); int32_t tsdbFSDestroyRefSnapshot(TFileSetArray **fsetArr); + +int32_t tsdbFSCreateCopyRangedSnapshot(STFileSystem *fs, TSnapRangeArray *pExclude, TFileSetArray **fsetArr, + TFileOpArray *fopArr); +int32_t tsdbFSDestroyCopyRangedSnapshot(TFileSetArray **fsetArr, TFileOpArray *fopArr); +int32_t tsdbFSCreateRefRangedSnapshot(STFileSystem *fs, int64_t sver, int64_t ever, TSnapRangeArray *pRanges, + TSnapRangeArray **fsrArr); +int32_t tsdbFSDestroyRefRangedSnapshot(TSnapRangeArray **fsrArr); // txn int64_t tsdbFSAllocEid(STFileSystem *fs); int32_t tsdbFSEditBegin(STFileSystem *fs, const TFileOpArray *opArray, EFEditT etype); @@ -68,6 +76,9 @@ int32_t tsdbFSEnableBgTask(STFileSystem *fs); // other int32_t tsdbFSGetFSet(STFileSystem *fs, int32_t fid, STFileSet **fset); int32_t tsdbFSCheckCommit(STFileSystem *fs); +// utils +int32_t save_fs(const TFileSetArray *arr, const char *fname); +int32_t current_fname(STsdb *pTsdb, char *fname, EFCurrentT ftype); struct STFSBgTask { EFSBgTaskT type; @@ -91,7 +102,7 @@ struct STFSBgTask { struct STFileSystem { STsdb *tsdb; tsem_t canEdit; - int32_t state; + int32_t fsstate; int64_t neid; EFEditT etype; TFileSetArray fSetArr[1]; diff --git a/source/dnode/vnode/src/tsdb/tsdbFSet2.c b/source/dnode/vnode/src/tsdb/tsdbFSet2.c index cd47a54973..620fcb3a47 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFSet2.c +++ b/source/dnode/vnode/src/tsdb/tsdbFSet2.c @@ -65,6 +65,34 @@ static int32_t tsdbSttLvlInitRef(STsdb *pTsdb, const SSttLvl *lvl1, SSttLvl **lv return 0; } +static int32_t tsdbSttLvlFilteredInitEx(STsdb *pTsdb, const SSttLvl *lvl1, int64_t ever, SSttLvl **lvl, + TFileOpArray *fopArr) { + int32_t code = tsdbSttLvlInit(lvl1->level, lvl); + if (code) return code; + + const STFileObj *fobj1; + TARRAY2_FOREACH(lvl1->fobjArr, fobj1) { + if (fobj1->f->maxVer <= ever) { + STFileObj *fobj; + code = tsdbTFileObjInit(pTsdb, fobj1->f, &fobj); + if (code) { + 
tsdbSttLvlClear(lvl); + return code; + } + + TARRAY2_APPEND(lvl[0]->fobjArr, fobj); + } else { + STFileOp op = { + .optype = TSDB_FOP_REMOVE, + .fid = fobj1->f->fid, + .of = fobj1->f[0], + }; + TARRAY2_APPEND(fopArr, op); + } + } + return 0; +} + static void tsdbSttLvlRemoveFObj(void *data) { tsdbTFileObjRemove(*(STFileObj **)data); } static void tsdbSttLvlRemove(SSttLvl **lvl) { TARRAY2_DESTROY(lvl[0]->fobjArr, tsdbSttLvlRemoveFObj); @@ -424,6 +452,7 @@ int32_t tsdbTFileSetInit(int32_t fid, STFileSet **fset) { if (fset[0] == NULL) return TSDB_CODE_OUT_OF_MEMORY; fset[0]->fid = fid; + fset[0]->maxVerValid = VERSION_MAX; TARRAY2_INIT(fset[0]->lvlArr); return 0; } @@ -458,6 +487,61 @@ int32_t tsdbTFileSetInitDup(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fs return 0; } +int32_t tsdbTFileSetFilteredInitDup(STsdb *pTsdb, const STFileSet *fset1, int64_t ever, STFileSet **fset, + TFileOpArray *fopArr) { + int32_t code = tsdbTFileSetInit(fset1->fid, fset); + if (code) return code; + + for (int32_t ftype = TSDB_FTYPE_MIN; ftype < TSDB_FTYPE_MAX; ++ftype) { + if (fset1->farr[ftype] == NULL) continue; + STFileObj *fobj = fset1->farr[ftype]; + if (fobj->f->maxVer <= ever) { + code = tsdbTFileObjInit(pTsdb, fobj->f, &fset[0]->farr[ftype]); + if (code) { + tsdbTFileSetClear(fset); + return code; + } + } else { + STFileOp op = { + .optype = TSDB_FOP_REMOVE, + .fid = fobj->f->fid, + .of = fobj->f[0], + }; + TARRAY2_APPEND(fopArr, op); + } + } + + const SSttLvl *lvl1; + TARRAY2_FOREACH(fset1->lvlArr, lvl1) { + SSttLvl *lvl; + code = tsdbSttLvlFilteredInitEx(pTsdb, lvl1, ever, &lvl, fopArr); + if (code) { + tsdbTFileSetClear(fset); + return code; + } + + code = TARRAY2_APPEND(fset[0]->lvlArr, lvl); + if (code) return code; + } + + return 0; +} + +int32_t tsdbTSnapRangeInitRef(STsdb *pTsdb, const STFileSet *fset1, int64_t sver, int64_t ever, STSnapRange **fsr) { + fsr[0] = taosMemoryCalloc(1, sizeof(*fsr[0])); + if (fsr[0] == NULL) return TSDB_CODE_OUT_OF_MEMORY; + 
fsr[0]->fid = fset1->fid; + fsr[0]->sver = sver; + fsr[0]->ever = ever; + + int32_t code = tsdbTFileSetInitRef(pTsdb, fset1, &fsr[0]->fset); + if (code) { + taosMemoryFree(fsr[0]); + fsr[0] = NULL; + } + return code; +} + int32_t tsdbTFileSetInitRef(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fset) { int32_t code = tsdbTFileSetInit(fset1->fid, fset); if (code) return code; @@ -485,6 +569,15 @@ int32_t tsdbTFileSetInitRef(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fs return 0; } +int32_t tsdbTSnapRangeClear(STSnapRange **fsr) { + if (!fsr[0]) return 0; + + tsdbTFileSetClear(&fsr[0]->fset); + taosMemoryFree(fsr[0]); + fsr[0] = NULL; + return 0; +} + int32_t tsdbTFileSetClear(STFileSet **fset) { if (!fset[0]) return 0; @@ -545,4 +638,4 @@ bool tsdbTFileSetIsEmpty(const STFileSet *fset) { if (fset->farr[ftype] != NULL) return false; } return TARRAY2_SIZE(fset->lvlArr) == 0; -} \ No newline at end of file +} diff --git a/source/dnode/vnode/src/tsdb/tsdbFSet2.h b/source/dnode/vnode/src/tsdb/tsdbFSet2.h index d7b3c1fc8c..ea0f99f68e 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFSet2.h +++ b/source/dnode/vnode/src/tsdb/tsdbFSet2.h @@ -45,6 +45,13 @@ int32_t tsdbTFileSetInitDup(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fs int32_t tsdbTFileSetInitRef(STsdb *pTsdb, const STFileSet *fset1, STFileSet **fset); int32_t tsdbTFileSetClear(STFileSet **fset); int32_t tsdbTFileSetRemove(STFileSet **fset); + +int32_t tsdbTFileSetFilteredInitDup(STsdb *pTsdb, const STFileSet *fset1, int64_t ever, STFileSet **fset, + TFileOpArray *fopArr); + +int32_t tsdbTSnapRangeInitRef(STsdb *pTsdb, const STFileSet *fset1, int64_t sver, int64_t ever, STSnapRange **fsr); +int32_t tsdbTSnapRangeClear(STSnapRange **fsr); + // to/from json int32_t tsdbTFileSetToJson(const STFileSet *fset, cJSON *json); int32_t tsdbJsonToTFileSet(STsdb *pTsdb, const cJSON *json, STFileSet **fset); @@ -59,6 +66,9 @@ int64_t tsdbTFileSetMaxCid(const STFileSet *fset); SSttLvl 
*tsdbTFileSetGetSttLvl(STFileSet *fset, int32_t level); // is empty bool tsdbTFileSetIsEmpty(const STFileSet *fset); +// stt +int32_t tsdbSttLvlInit(int32_t level, SSttLvl **lvl); +int32_t tsdbSttLvlClear(SSttLvl **lvl); struct STFileOp { tsdb_fop_t optype; @@ -74,12 +84,20 @@ struct SSttLvl { struct STFileSet { int32_t fid; + int64_t maxVerValid; STFileObj *farr[TSDB_FTYPE_MAX]; // file array TSttLvlArray lvlArr[1]; // level array }; +struct STSnapRange { + int32_t fid; + int64_t sver; + int64_t ever; + STFileSet *fset; +}; + #ifdef __cplusplus } #endif -#endif /*_TSDB_FILE_SET2_H*/ \ No newline at end of file +#endif /*_TSDB_FILE_SET2_H*/ diff --git a/source/dnode/vnode/src/tsdb/tsdbFSetRW.c b/source/dnode/vnode/src/tsdb/tsdbFSetRW.c index 83ae8c2429..e6b3cf8f54 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFSetRW.c +++ b/source/dnode/vnode/src/tsdb/tsdbFSetRW.c @@ -292,4 +292,4 @@ _exit: TSDB_ERROR_LOG(TD_VID(writer->config->tsdb->pVnode), lino, code); } return code; -} \ No newline at end of file +} diff --git a/source/dnode/vnode/src/tsdb/tsdbFSetRW.h b/source/dnode/vnode/src/tsdb/tsdbFSetRW.h index b5710407cf..0a8049cded 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFSetRW.h +++ b/source/dnode/vnode/src/tsdb/tsdbFSetRW.h @@ -52,4 +52,4 @@ int32_t tsdbFSetWriteTombRecord(SFSetWriter *writer, const STombRecord *tombReco } #endif -#endif /*_TSDB_FSET_RW_H*/ \ No newline at end of file +#endif /*_TSDB_FSET_RW_H*/ diff --git a/source/dnode/vnode/src/tsdb/tsdbFile2.c b/source/dnode/vnode/src/tsdb/tsdbFile2.c index 3d8964d41b..963c5bad34 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFile2.c +++ b/source/dnode/vnode/src/tsdb/tsdbFile2.c @@ -76,6 +76,17 @@ static int32_t tfile_to_json(const STFile *file, cJSON *json) { return TSDB_CODE_OUT_OF_MEMORY; } + if (file->minVer <= file->maxVer) { + /* minVer */ + if (cJSON_AddNumberToObject(json, "minVer", file->minVer) == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + + /* maxVer */ + if (cJSON_AddNumberToObject(json, 
"maxVer", file->maxVer) == NULL) { + return TSDB_CODE_OUT_OF_MEMORY; + } + } return 0; } @@ -122,6 +133,19 @@ static int32_t tfile_from_json(const cJSON *json, STFile *file) { return TSDB_CODE_FILE_CORRUPTED; } + /* minVer */ + file->minVer = VERSION_MAX; + item = cJSON_GetObjectItem(json, "minVer"); + if (cJSON_IsNumber(item)) { + file->minVer = item->valuedouble; + } + + /* maxVer */ + file->maxVer = VERSION_MIN; + item = cJSON_GetObjectItem(json, "maxVer"); + if (cJSON_IsNumber(item)) { + file->maxVer = item->valuedouble; + } return 0; } @@ -296,4 +320,4 @@ int32_t tsdbTFileObjCmpr(const STFileObj **fobj1, const STFileObj **fobj2) { } else { return 0; } -} \ No newline at end of file +} diff --git a/source/dnode/vnode/src/tsdb/tsdbFile2.h b/source/dnode/vnode/src/tsdb/tsdbFile2.h index 11d08e45e6..33d8ac5478 100644 --- a/source/dnode/vnode/src/tsdb/tsdbFile2.h +++ b/source/dnode/vnode/src/tsdb/tsdbFile2.h @@ -61,6 +61,8 @@ struct STFile { int32_t fid; // file id int64_t cid; // commit id int64_t size; + int64_t minVer; + int64_t maxVer; union { struct { int32_t level; @@ -80,4 +82,4 @@ struct STFileObj { } #endif -#endif /*_TSDB_FILE_H*/ \ No newline at end of file +#endif /*_TSDB_FILE_H*/ diff --git a/source/dnode/vnode/src/tsdb/tsdbMerge.c b/source/dnode/vnode/src/tsdb/tsdbMerge.c index 42a8b5bb3f..e659cedba3 100644 --- a/source/dnode/vnode/src/tsdb/tsdbMerge.c +++ b/source/dnode/vnode/src/tsdb/tsdbMerge.c @@ -313,6 +313,7 @@ static int32_t tsdbMergeFileSetBeginOpenWriter(SMerger *merger) { if (merger->ctx->fset->farr[ftype]) { config.files[ftype].exist = true; config.files[ftype].file = merger->ctx->fset->farr[ftype]->f[0]; + } else { config.files[ftype].exist = false; } diff --git a/source/dnode/vnode/src/tsdb/tsdbOpen.c b/source/dnode/vnode/src/tsdb/tsdbOpen.c index b060edbd91..6dd66c7a40 100644 --- a/source/dnode/vnode/src/tsdb/tsdbOpen.c +++ b/source/dnode/vnode/src/tsdb/tsdbOpen.c @@ -35,7 +35,7 @@ int32_t tsdbSetKeepCfg(STsdb *pTsdb, STsdbCfg *pCfg) { * 
@param dir * @return int */ -int tsdbOpen(SVnode *pVnode, STsdb **ppTsdb, const char *dir, STsdbKeepCfg *pKeepCfg, int8_t rollback) { +int tsdbOpen(SVnode *pVnode, STsdb **ppTsdb, const char *dir, STsdbKeepCfg *pKeepCfg, int8_t rollback, bool force) { STsdb *pTsdb = NULL; int slen = 0; @@ -72,6 +72,11 @@ int tsdbOpen(SVnode *pVnode, STsdb **ppTsdb, const char *dir, STsdbKeepCfg *pKee goto _err; } + if (pTsdb->pFS->fsstate == TSDB_FS_STATE_INCOMPLETE && force == false) { + terrno = TSDB_CODE_NEED_RETRY; + goto _err; + } + if (tsdbOpenCache(pTsdb) < 0) { goto _err; } diff --git a/source/dnode/vnode/src/tsdb/tsdbRead2.c b/source/dnode/vnode/src/tsdb/tsdbRead2.c index 1139524cb3..22d2a2098c 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRead2.c +++ b/source/dnode/vnode/src/tsdb/tsdbRead2.c @@ -4947,7 +4947,7 @@ int32_t tsdbTakeReadSnap2(STsdbReader* pReader, _query_reseek_func_t reseek, STs } // fs - code = tsdbFSCreateRefSnapshot(pTsdb->pFS, &pSnap->pfSetArray); + code = tsdbFSCreateRefSnapshotWithoutLock(pTsdb->pFS, &pSnap->pfSetArray); // unlock taosThreadRwlockUnlock(&pTsdb->rwLock); diff --git a/source/dnode/vnode/src/tsdb/tsdbRetention.c b/source/dnode/vnode/src/tsdb/tsdbRetention.c index c3b1a18fd8..f2665dcf26 100644 --- a/source/dnode/vnode/src/tsdb/tsdbRetention.c +++ b/source/dnode/vnode/src/tsdb/tsdbRetention.c @@ -151,6 +151,8 @@ static int32_t tsdbDoMigrateFileObj(SRTNer *rtner, const STFileObj *fobj, const .type = fobj->f->type, .did = did[0], .fid = fobj->f->fid, + .minVer = fobj->f->minVer, + .maxVer = fobj->f->maxVer, .cid = fobj->f->cid, .size = fobj->f->size, .stt[0] = @@ -198,6 +200,8 @@ static int32_t tsdbMigrateDataFileS3(SRTNer *rtner, const STFileObj *fobj, const .type = fobj->f->type, .did = did[0], .fid = fobj->f->fid, + .minVer = fobj->f->minVer, + .maxVer = fobj->f->maxVer, .cid = fobj->f->cid, .size = fobj->f->size, .stt[0] = diff --git a/source/dnode/vnode/src/tsdb/tsdbSnapshot.c b/source/dnode/vnode/src/tsdb/tsdbSnapshot.c index 
e4011ca400..3b4827a6be 100644 --- a/source/dnode/vnode/src/tsdb/tsdbSnapshot.c +++ b/source/dnode/vnode/src/tsdb/tsdbSnapshot.c @@ -32,12 +32,12 @@ struct STsdbSnapReader { uint8_t* aBuf[5]; SSkmInfo skmTb[1]; - TFileSetArray* fsetArr; + TSnapRangeArray* fsrArr; // context struct { - int32_t fsetArrIdx; - STFileSet* fset; + int32_t fsrArrIdx; + STSnapRange* fsr; bool isDataDone; bool isTombDone; } ctx[1]; @@ -72,10 +72,10 @@ static int32_t tsdbSnapReadFileSetOpenReader(STsdbSnapReader* reader) { }; bool hasDataFile = false; for (int32_t ftype = 0; ftype < TSDB_FTYPE_MAX; ftype++) { - if (reader->ctx->fset->farr[ftype] != NULL) { + if (reader->ctx->fsr->fset->farr[ftype] != NULL) { hasDataFile = true; config.files[ftype].exist = true; - config.files[ftype].file = reader->ctx->fset->farr[ftype]->f[0]; + config.files[ftype].file = reader->ctx->fsr->fset->farr[ftype]->f[0]; } } @@ -86,7 +86,7 @@ static int32_t tsdbSnapReadFileSetOpenReader(STsdbSnapReader* reader) { // stt SSttLvl* lvl; - TARRAY2_FOREACH(reader->ctx->fset->lvlArr, lvl) { + TARRAY2_FOREACH(reader->ctx->fsr->fset->lvlArr, lvl) { STFileObj* fobj; TARRAY2_FOREACH(lvl->fobjArr, fobj) { SSttFileReader* sttReader; @@ -138,8 +138,8 @@ static int32_t tsdbSnapReadFileSetOpenIter(STsdbSnapReader* reader) { STsdbIter* iter; STsdbIterConfig config = { .filterByVersion = true, - .verRange[0] = reader->sver, - .verRange[1] = reader->ever, + .verRange[0] = reader->ctx->fsr->sver, + .verRange[1] = reader->ctx->fsr->ever, }; // data file @@ -211,14 +211,14 @@ static int32_t tsdbSnapReadFileSetCloseIter(STsdbSnapReader* reader) { return 0; } -static int32_t tsdbSnapReadFileSetBegin(STsdbSnapReader* reader) { +static int32_t tsdbSnapReadRangeBegin(STsdbSnapReader* reader) { int32_t code = 0; int32_t lino = 0; - ASSERT(reader->ctx->fset == NULL); + ASSERT(reader->ctx->fsr == NULL); - if (reader->ctx->fsetArrIdx < TARRAY2_SIZE(reader->fsetArr)) { - reader->ctx->fset = TARRAY2_GET(reader->fsetArr, reader->ctx->fsetArrIdx++); 
+ if (reader->ctx->fsrArrIdx < TARRAY2_SIZE(reader->fsrArr)) { + reader->ctx->fsr = TARRAY2_GET(reader->fsrArr, reader->ctx->fsrArrIdx++); reader->ctx->isDataDone = false; reader->ctx->isTombDone = false; @@ -236,10 +236,10 @@ _exit: return code; } -static int32_t tsdbSnapReadFileSetEnd(STsdbSnapReader* reader) { +static int32_t tsdbSnapReadRangeEnd(STsdbSnapReader* reader) { tsdbSnapReadFileSetCloseIter(reader); tsdbSnapReadFileSetCloseReader(reader); - reader->ctx->fset = NULL; + reader->ctx->fsr = NULL; return 0; } @@ -412,7 +412,8 @@ _exit: return code; } -int32_t tsdbSnapReaderOpen(STsdb* tsdb, int64_t sver, int64_t ever, int8_t type, STsdbSnapReader** reader) { +int32_t tsdbSnapReaderOpen(STsdb* tsdb, int64_t sver, int64_t ever, int8_t type, void* pRanges, + STsdbSnapReader** reader) { int32_t code = 0; int32_t lino = 0; @@ -424,22 +425,19 @@ int32_t tsdbSnapReaderOpen(STsdb* tsdb, int64_t sver, int64_t ever, int8_t type, reader[0]->ever = ever; reader[0]->type = type; - taosThreadRwlockRdlock(&tsdb->rwLock); - code = tsdbFSCreateRefSnapshot(tsdb->pFS, &reader[0]->fsetArr); - taosThreadRwlockUnlock(&tsdb->rwLock); - + code = tsdbFSCreateRefRangedSnapshot(tsdb->pFS, sver, ever, (TSnapRangeArray*)pRanges, &reader[0]->fsrArr); TSDB_CHECK_CODE(code, lino, _exit); _exit: if (code) { tsdbError("vgId:%d %s failed at line %d since %s, sver:%" PRId64 " ever:%" PRId64 " type:%d", TD_VID(tsdb->pVnode), __func__, lino, tstrerror(code), sver, ever, type); - tsdbFSDestroyRefSnapshot(&reader[0]->fsetArr); + tsdbSnapRangeArrayDestroy(&reader[0]->fsrArr); taosMemoryFree(reader[0]); reader[0] = NULL; } else { - tsdbInfo("vgId:%d %s done, sver:%" PRId64 " ever:%" PRId64 " type:%d", TD_VID(tsdb->pVnode), __func__, sver, ever, - type); + tsdbInfo("vgId:%d tsdb snapshot reader opened. 
sver:%" PRId64 " ever:%" PRId64 " type:%d", TD_VID(tsdb->pVnode), + sver, ever, type); } return code; } @@ -462,7 +460,7 @@ int32_t tsdbSnapReaderClose(STsdbSnapReader** reader) { TARRAY2_DESTROY(reader[0]->sttReaderArr, tsdbSttFileReaderClose); tsdbDataFileReaderClose(&reader[0]->dataReader); - tsdbFSDestroyRefSnapshot(&reader[0]->fsetArr); + tsdbSnapRangeArrayDestroy(&reader[0]->fsrArr); tDestroyTSchema(reader[0]->skmTb->pTSchema); for (int32_t i = 0; i < ARRAY_SIZE(reader[0]->aBuf); ++i) { @@ -488,11 +486,11 @@ int32_t tsdbSnapRead(STsdbSnapReader* reader, uint8_t** data) { data[0] = NULL; for (;;) { - if (reader->ctx->fset == NULL) { - code = tsdbSnapReadFileSetBegin(reader); + if (reader->ctx->fsr == NULL) { + code = tsdbSnapReadRangeBegin(reader); TSDB_CHECK_CODE(code, lino, _exit); - if (reader->ctx->fset == NULL) { + if (reader->ctx->fsr == NULL) { break; } } @@ -517,7 +515,7 @@ int32_t tsdbSnapRead(STsdbSnapReader* reader, uint8_t** data) { } } - code = tsdbSnapReadFileSetEnd(reader); + code = tsdbSnapReadRangeEnd(reader); TSDB_CHECK_CODE(code, lino, _exit); } @@ -1030,7 +1028,7 @@ _exit: return code; } -int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, STsdbSnapWriter** writer) { +int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, void* pRanges, STsdbSnapWriter** writer) { int32_t code = 0; int32_t lino = 0; @@ -1054,7 +1052,7 @@ int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, STsdbSnapWr writer[0]->compactVersion = INT64_MAX; writer[0]->now = taosGetTimestampMs(); - code = tsdbFSCreateCopySnapshot(pTsdb->pFS, &writer[0]->fsetArr); + code = tsdbFSCreateCopyRangedSnapshot(pTsdb->pFS, (TSnapRangeArray*)pRanges, &writer[0]->fsetArr, writer[0]->fopArr); TSDB_CHECK_CODE(code, lino, _exit); _exit: @@ -1105,6 +1103,8 @@ int32_t tsdbSnapWriterClose(STsdbSnapWriter** writer, int8_t rollback) { TSDB_CHECK_CODE(code, lino, _exit); } + writer[0]->tsdb->pFS->fsstate = TSDB_FS_STATE_NORMAL; + 
taosThreadRwlockUnlock(&writer[0]->tsdb->rwLock); } tsdbFSEnableBgTask(tsdb->pFS); @@ -1159,3 +1159,438 @@ _exit: } return code; } + +// snap part +static int32_t tsdbSnapPartCmprFn(STsdbSnapPartition* x, STsdbSnapPartition* y) { + if (x->fid < y->fid) return -1; + if (x->fid > y->fid) return 1; + return 0; +} + +static int32_t tVersionRangeCmprFn(SVersionRange* x, SVersionRange* y) { + if (x->minVer < y->minVer) return -1; + if (x->minVer > y->minVer) return 1; + if (x->maxVer < y->maxVer) return -1; + if (x->maxVer > y->maxVer) return 1; + return 0; +} + +static int32_t tsdbSnapRangeCmprFn(STSnapRange* x, STSnapRange* y) { + if (x->fid < y->fid) return -1; + if (x->fid > y->fid) return 1; + return 0; +} + +STsdbSnapPartition* tsdbSnapPartitionCreate() { + STsdbSnapPartition* pSP = taosMemoryCalloc(1, sizeof(STsdbSnapPartition)); + if (pSP == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + for (int32_t i = 0; i < TSDB_SNAP_RANGE_TYP_MAX; i++) { + TARRAY2_INIT(&pSP->verRanges[i]); + } + return pSP; +} + +void tsdbSnapPartitionClear(STsdbSnapPartition** ppSP) { + if (ppSP == NULL || ppSP[0] == NULL) { + return; + } + for (int32_t i = 0; i < TSDB_SNAP_RANGE_TYP_MAX; i++) { + TARRAY2_DESTROY(&ppSP[0]->verRanges[i], NULL); + } + taosMemoryFree(ppSP[0]); + ppSP[0] = NULL; +} + +static int32_t tsdbFTypeToSRangeTyp(tsdb_ftype_t ftype) { + switch (ftype) { + case TSDB_FTYPE_HEAD: + return TSDB_SNAP_RANGE_TYP_HEAD; + case TSDB_FTYPE_DATA: + return TSDB_SNAP_RANGE_TYP_DATA; + case TSDB_FTYPE_SMA: + return TSDB_SNAP_RANGE_TYP_SMA; + case TSDB_FTYPE_TOMB: + return TSDB_SNAP_RANGE_TYP_TOMB; + case TSDB_FTYPE_STT: + return TSDB_SNAP_RANGE_TYP_STT; + } + return TSDB_SNAP_RANGE_TYP_MAX; +} + +static int32_t tsdbTFileSetToSnapPart(STFileSet* fset, STsdbSnapPartition** ppSP) { + STsdbSnapPartition* p = tsdbSnapPartitionCreate(); + if (p == NULL) { + goto _err; + } + + p->fid = fset->fid; + + int32_t code = 0; + int32_t typ = 0; + int32_t corrupt = false; + int32_t 
count = 0; + for (int32_t ftype = TSDB_FTYPE_MIN; ftype < TSDB_FTYPE_MAX; ++ftype) { + if (fset->farr[ftype] == NULL) continue; + typ = tsdbFTypeToSRangeTyp(ftype); + ASSERT(typ < TSDB_SNAP_RANGE_TYP_MAX); + STFile* f = fset->farr[ftype]->f; + if (f->maxVer > fset->maxVerValid) { + corrupt = true; + tsdbError("skip incomplete data file: fid:%d, maxVerValid:%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 + ", ftype: %d", + fset->fid, fset->maxVerValid, f->minVer, f->maxVer, ftype); + continue; + } + count++; + SVersionRange vr = {.minVer = f->minVer, .maxVer = f->maxVer}; + code = TARRAY2_SORT_INSERT(&p->verRanges[typ], vr, tVersionRangeCmprFn); + ASSERT(code == 0); + } + + typ = TSDB_SNAP_RANGE_TYP_STT; + const SSttLvl* lvl; + TARRAY2_FOREACH(fset->lvlArr, lvl) { + STFileObj* fobj; + TARRAY2_FOREACH(lvl->fobjArr, fobj) { + STFile* f = fobj->f; + if (f->maxVer > fset->maxVerValid) { + corrupt = true; + tsdbError("skip incomplete stt file.fid:%d, maxVerValid:%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 + ", ftype: %d", + fset->fid, fset->maxVerValid, f->minVer, f->maxVer, typ); + continue; + } + count++; + SVersionRange vr = {.minVer = f->minVer, .maxVer = f->maxVer}; + code = TARRAY2_SORT_INSERT(&p->verRanges[typ], vr, tVersionRangeCmprFn); + ASSERT(code == 0); + } + } + if (corrupt && count == 0) { + SVersionRange vr = {.minVer = VERSION_MIN, .maxVer = fset->maxVerValid}; + code = TARRAY2_SORT_INSERT(&p->verRanges[typ], vr, tVersionRangeCmprFn); + ASSERT(code == 0); + } + ppSP[0] = p; + return 0; + +_err: + tsdbSnapPartitionClear(&p); + return -1; +} + +STsdbSnapPartList* tsdbSnapPartListCreate() { + STsdbSnapPartList* pList = taosMemoryCalloc(1, sizeof(STsdbSnapPartList)); + if (pList == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return NULL; + } + TARRAY2_INIT(pList); + return pList; +} + +static STsdbSnapPartList* tsdbGetSnapPartList(STFileSystem* fs) { + STsdbSnapPartList* pList = tsdbSnapPartListCreate(); + if (pList == NULL) { + return NULL; + } + + 
int32_t code = 0; + taosThreadRwlockRdlock(&fs->tsdb->rwLock); + STFileSet* fset; + TARRAY2_FOREACH(fs->fSetArr, fset) { + STsdbSnapPartition* pItem = NULL; + if (tsdbTFileSetToSnapPart(fset, &pItem) < 0) { + code = -1; + break; + } + ASSERT(pItem != NULL); + code = TARRAY2_SORT_INSERT(pList, pItem, tsdbSnapPartCmprFn); + ASSERT(code == 0); + } + taosThreadRwlockUnlock(&fs->tsdb->rwLock); + + if (code) { + TARRAY2_DESTROY(pList, tsdbSnapPartitionClear); + taosMemoryFree(pList); + pList = NULL; + } + return pList; +} + +int32_t tTsdbSnapPartListDataLenCalc(STsdbSnapPartList* pList) { + int32_t hdrLen = sizeof(int32_t); + int32_t datLen = 0; + + int8_t msgVer = 1; + int32_t len = TARRAY2_SIZE(pList); + hdrLen += sizeof(msgVer); + hdrLen += sizeof(len); + datLen += hdrLen; + + for (int32_t u = 0; u < len; u++) { + STsdbSnapPartition* p = TARRAY2_GET(pList, u); + int32_t typMax = TSDB_SNAP_RANGE_TYP_MAX; + int32_t uItem = 0; + uItem += sizeof(STsdbSnapPartition); + uItem += sizeof(typMax); + + for (int32_t i = 0; i < typMax; i++) { + int32_t iLen = TARRAY2_SIZE(&p->verRanges[i]); + int32_t jItem = 0; + jItem += sizeof(SVersionRange); + jItem += sizeof(int64_t); + uItem += sizeof(iLen) + jItem * iLen; + } + datLen += uItem; + } + return datLen; +} + +int32_t tSerializeTsdbSnapPartList(void* buf, int32_t bufLen, STsdbSnapPartList* pList) { + SEncoder encoder = {0}; + tEncoderInit(&encoder, buf, bufLen); + + int8_t reserved8 = 0; + int16_t reserved16 = 0; + int64_t reserved64 = 0; + + int8_t msgVer = 1; + int32_t len = TARRAY2_SIZE(pList); + + if (tStartEncode(&encoder) < 0) goto _err; + if (tEncodeI8(&encoder, msgVer) < 0) goto _err; + if (tEncodeI32(&encoder, len) < 0) goto _err; + + for (int32_t u = 0; u < len; u++) { + STsdbSnapPartition* p = TARRAY2_GET(pList, u); + if (tEncodeI64(&encoder, p->fid) < 0) goto _err; + if (tEncodeI8(&encoder, p->stat) < 0) goto _err; + if (tEncodeI8(&encoder, reserved8) < 0) goto _err; + if (tEncodeI16(&encoder, reserved16) < 0) goto 
_err; + + int32_t typMax = TSDB_SNAP_RANGE_TYP_MAX; + if (tEncodeI32(&encoder, typMax) < 0) goto _err; + + for (int32_t i = 0; i < typMax; i++) { + SVerRangeList* iList = &p->verRanges[i]; + int32_t iLen = TARRAY2_SIZE(iList); + + if (tEncodeI32(&encoder, iLen) < 0) goto _err; + for (int32_t j = 0; j < iLen; j++) { + SVersionRange r = TARRAY2_GET(iList, j); + if (tEncodeI64(&encoder, r.minVer) < 0) goto _err; + if (tEncodeI64(&encoder, r.maxVer) < 0) goto _err; + if (tEncodeI64(&encoder, reserved64) < 0) goto _err; + } + } + } + + tEndEncode(&encoder); + int32_t tlen = encoder.pos; + tEncoderClear(&encoder); + return tlen; + +_err: + tEncoderClear(&encoder); + return -1; +} + +int32_t tDeserializeTsdbSnapPartList(void* buf, int32_t bufLen, STsdbSnapPartList* pList) { + SDecoder decoder = {0}; + tDecoderInit(&decoder, buf, bufLen); + + int8_t reserved8 = 0; + int16_t reserved16 = 0; + int64_t reserved64 = 0; + + STsdbSnapPartition* p = NULL; + + int8_t msgVer = 0; + int32_t len = 0; + if (tStartDecode(&decoder) < 0) goto _err; + if (tDecodeI8(&decoder, &msgVer) < 0) goto _err; + if (tDecodeI32(&decoder, &len) < 0) goto _err; + + for (int32_t u = 0; u < len; u++) { + p = tsdbSnapPartitionCreate(); + if (p == NULL) goto _err; + if (tDecodeI64(&decoder, &p->fid) < 0) goto _err; + if (tDecodeI8(&decoder, &p->stat) < 0) goto _err; + if (tDecodeI8(&decoder, &reserved8) < 0) goto _err; + if (tDecodeI16(&decoder, &reserved16) < 0) goto _err; + + int32_t typMax = 0; + if (tDecodeI32(&decoder, &typMax) < 0) goto _err; + + for (int32_t i = 0; i < typMax; i++) { + SVerRangeList* iList = &p->verRanges[i]; + int32_t iLen = 0; + if (tDecodeI32(&decoder, &iLen) < 0) goto _err; + for (int32_t j = 0; j < iLen; j++) { + SVersionRange r = {0}; + if (tDecodeI64(&decoder, &r.minVer) < 0) goto _err; + if (tDecodeI64(&decoder, &r.maxVer) < 0) goto _err; + if (tDecodeI64(&decoder, &reserved64) < 0) goto _err; + TARRAY2_APPEND(iList, r); + } + } + TARRAY2_APPEND(pList, p); + p = NULL; + } + 
+ tEndDecode(&decoder); + tDecoderClear(&decoder); + return 0; + +_err: + if (p) { + tsdbSnapPartitionClear(&p); + } + tDecoderClear(&decoder); + return -1; +} + +int32_t tsdbSnapPartListToRangeDiff(STsdbSnapPartList* pList, TSnapRangeArray** ppRanges) { + TSnapRangeArray* pDiff = taosMemoryCalloc(1, sizeof(TSnapRangeArray)); + if (pDiff == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _err; + } + TARRAY2_INIT(pDiff); + + STsdbSnapPartition* part; + TARRAY2_FOREACH(pList, part) { + STSnapRange* r = taosMemoryCalloc(1, sizeof(STSnapRange)); + if (r == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _err; + } + int64_t maxVerValid = -1; + int32_t typMax = TSDB_SNAP_RANGE_TYP_MAX; + for (int32_t i = 0; i < typMax; i++) { + SVerRangeList* iList = &part->verRanges[i]; + SVersionRange vr = {0}; + TARRAY2_FOREACH(iList, vr) { + if (vr.maxVer < vr.minVer) { + continue; + } + maxVerValid = TMAX(maxVerValid, vr.maxVer); + } + } + r->fid = part->fid; + r->sver = maxVerValid + 1; + r->ever = VERSION_MAX; + tsdbDebug("range diff fid:%" PRId64 ", sver:%" PRId64 ", ever:%" PRId64, part->fid, r->sver, r->ever); + int32_t code = TARRAY2_SORT_INSERT(pDiff, r, tsdbSnapRangeCmprFn); + ASSERT(code == 0); + } + ppRanges[0] = pDiff; + + tsdbInfo("pDiff size:%d", TARRAY2_SIZE(pDiff)); + return 0; + +_err: + if (pDiff) { + tsdbSnapRangeArrayDestroy(&pDiff); + } + return -1; +} + +void tsdbSnapRangeArrayDestroy(TSnapRangeArray** ppSnap) { + if (ppSnap && ppSnap[0]) { + TARRAY2_DESTROY(ppSnap[0], tsdbTSnapRangeClear); + taosMemoryFree(ppSnap[0]); + ppSnap[0] = NULL; + } +} + +void tsdbSnapPartListDestroy(STsdbSnapPartList** ppList) { + if (ppList == NULL || ppList[0] == NULL) return; + + TARRAY2_DESTROY(ppList[0], tsdbSnapPartitionClear); + taosMemoryFree(ppList[0]); + ppList[0] = NULL; +} + +ETsdbFsState tsdbSnapGetFsState(SVnode* pVnode) { + if (!VND_IS_RSMA(pVnode)) { + return pVnode->pTsdb->pFS->fsstate; + } + for (int32_t lvl = 0; lvl < TSDB_RETENTION_MAX; ++lvl) { + if 
(SMA_RSMA_GET_TSDB(pVnode, lvl)->pFS->fsstate != TSDB_FS_STATE_NORMAL) { + return TSDB_FS_STATE_INCOMPLETE; + } + } + return TSDB_FS_STATE_NORMAL; +} + +int32_t tsdbSnapGetDetails(SVnode* pVnode, SSnapshot* pSnap) { + int code = -1; + int32_t tsdbMaxCnt = (!VND_IS_RSMA(pVnode) ? 1 : TSDB_RETENTION_MAX); + int32_t subTyps[TSDB_RETENTION_MAX] = {SNAP_DATA_TSDB, SNAP_DATA_RSMA1, SNAP_DATA_RSMA2}; + STsdbSnapPartList* pLists[TSDB_RETENTION_MAX] = {0}; + + for (int32_t j = 0; j < tsdbMaxCnt; ++j) { + STsdb* pTsdb = SMA_RSMA_GET_TSDB(pVnode, j); + pLists[j] = tsdbGetSnapPartList(pTsdb->pFS); + if (pLists[j] == NULL) goto _out; + } + + // estimate bufLen and prepare + int32_t bufLen = sizeof(SSyncTLV); // typ: TDMT_SYNC_PREP_SNAPSHOT or TDMT_SYNC_PREP_SNAPSOT_REPLY + for (int32_t j = 0; j < tsdbMaxCnt; ++j) { + bufLen += sizeof(SSyncTLV); // subTyps[j] + bufLen += tTsdbSnapPartListDataLenCalc(pLists[j]); + } + + tsdbInfo("vgId:%d, allocate %d bytes for data of snapshot info.", TD_VID(pVnode), bufLen); + + void* data = taosMemoryRealloc(pSnap->data, bufLen); + if (data == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + tsdbError("vgId:%d, failed to realloc memory for data of snapshot info. 
bytes:%d", TD_VID(pVnode), bufLen); + goto _out; + } + pSnap->data = data; + + // header + SSyncTLV* head = data; + head->len = 0; + head->typ = pSnap->type; + int32_t offset = sizeof(SSyncTLV); + int32_t tlen = 0; + + // fill snapshot info + for (int32_t j = 0; j < tsdbMaxCnt; ++j) { + if (pSnap->type == TDMT_SYNC_PREP_SNAPSHOT_REPLY) { + } + + // subHead + SSyncTLV* subHead = (void*)((char*)data + offset); + subHead->typ = subTyps[j]; + ASSERT(subHead->val == (char*)data + offset + sizeof(SSyncTLV)); + + if ((tlen = tSerializeTsdbSnapPartList(subHead->val, bufLen - offset - sizeof(SSyncTLV), pLists[j])) < 0) { + tsdbError("vgId:%d, failed to serialize snap partition list of tsdb %d since %s", TD_VID(pVnode), j, terrstr()); + goto _out; + } + subHead->len = tlen; + offset += sizeof(SSyncTLV) + tlen; + } + + head->len = offset - sizeof(SSyncTLV); + ASSERT(offset <= bufLen); + code = 0; + +_out: + for (int32_t j = 0; j < tsdbMaxCnt; ++j) { + if (pLists[j] == NULL) continue; + tsdbSnapPartListDestroy(&pLists[j]); + } + + return code; +} diff --git a/source/dnode/vnode/src/tsdb/tsdbSttFileRW.c b/source/dnode/vnode/src/tsdb/tsdbSttFileRW.c index fa8d2d5ba4..7c3b185e20 100644 --- a/source/dnode/vnode/src/tsdb/tsdbSttFileRW.c +++ b/source/dnode/vnode/src/tsdb/tsdbSttFileRW.c @@ -14,6 +14,7 @@ */ #include "tsdbSttFileRW.h" +#include "tsdbDataFileRW.h" // SSttFReader ============================================================ struct SSttFileReader { @@ -383,6 +384,8 @@ struct SSttFileWriter { struct { bool opened; TABLEID tbid[1]; + // range + SVersionRange range; } ctx[1]; // file STsdbFD *fd; @@ -401,8 +404,8 @@ struct SSttFileWriter { uint8_t *bufArr[5]; }; -int32_t tsdbFileDoWriteBlockData(STsdbFD *fd, SBlockData *blockData, int8_t cmprAlg, int64_t *fileSize, - TSttBlkArray *sttBlkArray, uint8_t **bufArr) { +static int32_t tsdbFileDoWriteSttBlockData(STsdbFD *fd, SBlockData *blockData, int8_t cmprAlg, int64_t *fileSize, + TSttBlkArray *sttBlkArray, uint8_t **bufArr, 
SVersionRange *range) { if (blockData->nRow == 0) return 0; int32_t code = 0; @@ -425,6 +428,8 @@ int32_t tsdbFileDoWriteBlockData(STsdbFD *fd, SBlockData *blockData, int8_t cmpr if (sttBlk->maxVer < blockData->aVersion[iRow]) sttBlk->maxVer = blockData->aVersion[iRow]; } + tsdbWriterUpdVerRange(range, sttBlk->minVer, sttBlk->maxVer); + int32_t sizeArr[5] = {0}; code = tCmprBlockData(blockData, cmprAlg, NULL, NULL, bufArr, sizeArr); if (code) return code; @@ -455,8 +460,8 @@ static int32_t tsdbSttFileDoWriteBlockData(SSttFileWriter *writer) { int32_t code = 0; int32_t lino = 0; - code = tsdbFileDoWriteBlockData(writer->fd, writer->blockData, writer->config->cmprAlg, &writer->file->size, - writer->sttBlkArray, writer->config->bufArr); + code = tsdbFileDoWriteSttBlockData(writer->fd, writer->blockData, writer->config->cmprAlg, &writer->file->size, + writer->sttBlkArray, writer->config->bufArr, &writer->ctx->range); TSDB_CHECK_CODE(code, lino, _exit); _exit: @@ -517,62 +522,6 @@ _exit: return code; } -int32_t tsdbFileWriteTombBlock(STsdbFD *fd, STombBlock *tombBlock, int8_t cmprAlg, int64_t *fileSize, - TTombBlkArray *tombBlkArray, uint8_t **bufArr) { - int32_t code; - - if (TOMB_BLOCK_SIZE(tombBlock) == 0) return 0; - - STombBlk tombBlk[1] = {{ - .dp[0] = - { - .offset = *fileSize, - .size = 0, - }, - .minTbid = - { - .suid = TARRAY2_FIRST(tombBlock->suid), - .uid = TARRAY2_FIRST(tombBlock->uid), - }, - .maxTbid = - { - .suid = TARRAY2_LAST(tombBlock->suid), - .uid = TARRAY2_LAST(tombBlock->uid), - }, - .minVer = TARRAY2_FIRST(tombBlock->version), - .maxVer = TARRAY2_FIRST(tombBlock->version), - .numRec = TOMB_BLOCK_SIZE(tombBlock), - .cmprAlg = cmprAlg, - }}; - - for (int32_t i = 1; i < TOMB_BLOCK_SIZE(tombBlock); i++) { - if (tombBlk->minVer > TARRAY2_GET(tombBlock->version, i)) { - tombBlk->minVer = TARRAY2_GET(tombBlock->version, i); - } - if (tombBlk->maxVer < TARRAY2_GET(tombBlock->version, i)) { - tombBlk->maxVer = TARRAY2_GET(tombBlock->version, i); - } - } - 
- for (int32_t i = 0; i < ARRAY_SIZE(tombBlock->dataArr); i++) { - code = tsdbCmprData((uint8_t *)TARRAY2_DATA(&tombBlock->dataArr[i]), TARRAY2_DATA_LEN(&tombBlock->dataArr[i]), - TSDB_DATA_TYPE_BIGINT, tombBlk->cmprAlg, &bufArr[0], 0, &tombBlk->size[i], &bufArr[1]); - if (code) return code; - - code = tsdbWriteFile(fd, *fileSize, bufArr[0], tombBlk->size[i]); - if (code) return code; - - tombBlk->dp->size += tombBlk->size[i]; - *fileSize += tombBlk->size[i]; - } - - code = TARRAY2_APPEND_PTR(tombBlkArray, tombBlk); - if (code) return code; - - tTombBlockClear(tombBlock); - return 0; -} - static int32_t tsdbSttFileDoWriteTombBlock(SSttFileWriter *writer) { if (TOMB_BLOCK_SIZE(writer->tombBlock) == 0) return 0; @@ -580,7 +529,7 @@ static int32_t tsdbSttFileDoWriteTombBlock(SSttFileWriter *writer) { int32_t lino = 0; code = tsdbFileWriteTombBlock(writer->fd, writer->tombBlock, writer->config->cmprAlg, &writer->file->size, - writer->tombBlkArray, writer->config->bufArr); + writer->tombBlkArray, writer->config->bufArr, &writer->ctx->range); TSDB_CHECK_CODE(code, lino, _exit); _exit: @@ -639,21 +588,6 @@ _exit: return code; } -int32_t tsdbFileWriteTombBlk(STsdbFD *fd, const TTombBlkArray *tombBlkArray, SFDataPtr *ptr, int64_t *fileSize) { - ptr->size = TARRAY2_DATA_LEN(tombBlkArray); - if (ptr->size > 0) { - ptr->offset = *fileSize; - - int32_t code = tsdbWriteFile(fd, *fileSize, (const uint8_t *)TARRAY2_DATA(tombBlkArray), ptr->size); - if (code) { - return code; - } - - *fileSize += ptr->size; - } - return 0; -} - static int32_t tsdbSttFileDoWriteTombBlk(SSttFileWriter *writer) { int32_t code = 0; int32_t lino = 0; @@ -694,6 +628,8 @@ static int32_t tsdbSttFWriterDoOpen(SSttFileWriter *writer) { .fid = writer->config->fid, .cid = writer->config->cid, .size = 0, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, .stt[0] = { .level = writer->config->level, @@ -713,6 +649,9 @@ static int32_t tsdbSttFWriterDoOpen(SSttFileWriter *writer) { TSDB_CHECK_CODE(code, lino, 
_exit); writer->file->size += sizeof(hdr); + // range + writer->ctx->range = (SVersionRange){.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; + writer->ctx->opened = true; _exit: @@ -782,6 +721,7 @@ static int32_t tsdbSttFWriterCloseCommit(SSttFileWriter *writer, TFileOpArray *o .fid = writer->config->fid, .nf = writer->file[0], }; + tsdbTFileUpdVerRange(&op.nf, writer->ctx->range); code = TARRAY2_APPEND(opArray, op); TSDB_CHECK_CODE(code, lino, _exit); diff --git a/source/dnode/vnode/src/tsdb/tsdbSttFileRW.h b/source/dnode/vnode/src/tsdb/tsdbSttFileRW.h index 242b55795c..0051a6cd92 100644 --- a/source/dnode/vnode/src/tsdb/tsdbSttFileRW.h +++ b/source/dnode/vnode/src/tsdb/tsdbSttFileRW.h @@ -71,6 +71,9 @@ int32_t tsdbSttFileWriteBlockData(SSttFileWriter *writer, SBlockData *pBlockData int32_t tsdbSttFileWriteTombRecord(SSttFileWriter *writer, const STombRecord *record); bool tsdbSttFileWriterIsOpened(SSttFileWriter *writer); +int32_t tsdbFileWriteSttBlk(STsdbFD *fd, const TSttBlkArray *sttBlkArray, SFDataPtr *ptr, int64_t *fileSize); +int32_t tsdbFileWriteSttFooter(STsdbFD *fd, const SSttFooter *footer, int64_t *fileSize); + struct SSttFileWriterConfig { STsdb *tsdb; int32_t maxRow; @@ -90,4 +93,4 @@ struct SSttFileWriterConfig { } #endif -#endif /*_TSDB_STT_FILE_RW_H*/ \ No newline at end of file +#endif /*_TSDB_STT_FILE_RW_H*/ diff --git a/source/dnode/vnode/src/tsdb/tsdbUpgrade.c b/source/dnode/vnode/src/tsdb/tsdbUpgrade.c index 0884c32385..876c0df4a0 100644 --- a/source/dnode/vnode/src/tsdb/tsdbUpgrade.c +++ b/source/dnode/vnode/src/tsdb/tsdbUpgrade.c @@ -16,24 +16,15 @@ #include "tsdbUpgrade.h" // old -extern void tsdbGetCurrentFName(STsdb *pTsdb, char *current, char *current_t); -extern int32_t tsdbReadDataBlockEx(SDataFReader *pReader, SDataBlk *pDataBlk, SBlockData *pBlockData); +#include "tsdb.h" +// extern void tsdbGetCurrentFName(STsdb *pTsdb, char *current, char *current_t); // new -extern int32_t save_fs(const TFileSetArray *arr, const char *fname); 
-extern int32_t current_fname(STsdb *pTsdb, char *fname, EFCurrentT ftype); -extern int32_t tsdbFileWriteBrinBlock(STsdbFD *fd, SBrinBlock *brinBlock, int8_t cmprAlg, int64_t *fileSize, - TBrinBlkArray *brinBlkArray, uint8_t **bufArr); -extern int32_t tsdbFileWriteBrinBlk(STsdbFD *fd, TBrinBlkArray *brinBlkArray, SFDataPtr *ptr, int64_t *fileSize); -extern int32_t tsdbFileWriteHeadFooter(STsdbFD *fd, int64_t *fileSize, const SHeadFooter *footer); -extern int32_t tsdbSttLvlInit(int32_t level, SSttLvl **lvl); -extern int32_t tsdbSttLvlClear(SSttLvl **lvl); -extern int32_t tsdbFileWriteSttBlk(STsdbFD *fd, const TSttBlkArray *sttBlkArray, SFDataPtr *ptr, int64_t *fileSize); -extern int32_t tsdbFileWriteSttFooter(STsdbFD *fd, const SSttFooter *footer, int64_t *fileSize); -extern int32_t tsdbFileWriteTombBlock(STsdbFD *fd, STombBlock *tombBlock, int8_t cmprAlg, int64_t *fileSize, - TTombBlkArray *tombBlkArray, uint8_t **bufArr); -extern int32_t tsdbFileWriteTombBlk(STsdbFD *fd, const TTombBlkArray *tombBlkArray, SFDataPtr *ptr, int64_t *fileSize); -extern int32_t tsdbFileWriteTombFooter(STsdbFD *fd, const STombFooter *footer, int64_t *fileSize); +#include "tsdbDataFileRW.h" +#include "tsdbFS2.h" +#include "tsdbSttFileRW.h" +// extern int32_t save_fs(const TFileSetArray *arr, const char *fname); +// extern int32_t current_fname(STsdb *pTsdb, char *fname, EFCurrentT ftype); static int32_t tsdbUpgradeHead(STsdb *tsdb, SDFileSet *pDFileSet, SDataFReader *reader, STFileSet *fset) { int32_t code = 0; @@ -78,6 +69,8 @@ static int32_t tsdbUpgradeHead(STsdb *tsdb, SDFileSet *pDFileSet, SDataFReader * .fid = fset->fid, .cid = pDFileSet->pHeadF->commitID, .size = pDFileSet->pHeadF->size, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; code = tsdbTFileObjInit(tsdb, &file, &fset->farr[TSDB_FTYPE_HEAD]); @@ -127,16 +120,18 @@ static int32_t tsdbUpgradeHead(STsdb *tsdb, SDFileSet *pDFileSet, SDataFReader * TSDB_CHECK_CODE(code, lino, _exit); if (BRIN_BLOCK_SIZE(ctx->brinBlock) >= 
ctx->maxRow) { + SVersionRange range = {.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; code = tsdbFileWriteBrinBlock(ctx->fd, ctx->brinBlock, ctx->cmprAlg, &fset->farr[TSDB_FTYPE_HEAD]->f->size, - ctx->brinBlkArray, ctx->bufArr); + ctx->brinBlkArray, ctx->bufArr, &range); TSDB_CHECK_CODE(code, lino, _exit); } } } if (BRIN_BLOCK_SIZE(ctx->brinBlock) > 0) { + SVersionRange range = {.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; code = tsdbFileWriteBrinBlock(ctx->fd, ctx->brinBlock, ctx->cmprAlg, &fset->farr[TSDB_FTYPE_HEAD]->f->size, - ctx->brinBlkArray, ctx->bufArr); + ctx->brinBlkArray, ctx->bufArr, &range); TSDB_CHECK_CODE(code, lino, _exit); } @@ -182,6 +177,8 @@ static int32_t tsdbUpgradeData(STsdb *tsdb, SDFileSet *pDFileSet, SDataFReader * .fid = fset->fid, .cid = pDFileSet->pDataF->commitID, .size = pDFileSet->pDataF->size, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; code = tsdbTFileObjInit(tsdb, &file, &fset->farr[TSDB_FTYPE_DATA]); @@ -208,6 +205,8 @@ static int32_t tsdbUpgradeSma(STsdb *tsdb, SDFileSet *pDFileSet, SDataFReader *r .fid = fset->fid, .cid = pDFileSet->pSmaF->commitID, .size = pDFileSet->pSmaF->size, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; code = tsdbTFileObjInit(tsdb, &file, &fset->farr[TSDB_FTYPE_SMA]); @@ -253,6 +252,8 @@ static int32_t tsdbUpgradeSttFile(STsdb *tsdb, SDFileSet *pDFileSet, SDataFReade .fid = fset->fid, .cid = pSttF->commitID, .size = pSttF->size, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; code = tsdbTFileObjInit(tsdb, &file, &fobj); TSDB_CHECK_CODE(code, lino, _exit1); @@ -382,6 +383,8 @@ static int32_t tsdbUpgradeOpenTombFile(STsdb *tsdb, STFileSet *fset, STsdbFD **f .fid = fset->fid, .cid = 0, .size = 0, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; code = tsdbTFileObjInit(tsdb, &file, fobj); @@ -398,6 +401,8 @@ static int32_t tsdbUpgradeOpenTombFile(STsdb *tsdb, STFileSet *fset, STsdbFD **f .fid = fset->fid, .cid = 0, .size = 0, + .minVer = VERSION_MAX, + .maxVer = VERSION_MIN, }; 
code = tsdbTFileObjInit(tsdb, &file, fobj); @@ -481,8 +486,9 @@ static int32_t tsdbDumpTombDataToFSet(STsdb *tsdb, SDelFReader *reader, SArray * code = tsdbUpgradeOpenTombFile(tsdb, fset, &ctx->fd, &ctx->fobj, &ctx->toStt); TSDB_CHECK_CODE(code, lino, _exit); } + SVersionRange tombRange = {.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; code = tsdbFileWriteTombBlock(ctx->fd, ctx->tombBlock, ctx->cmprAlg, &ctx->fobj->f->size, ctx->tombBlkArray, - ctx->bufArr); + ctx->bufArr, &tombRange); TSDB_CHECK_CODE(code, lino, _exit); } } @@ -493,8 +499,9 @@ static int32_t tsdbDumpTombDataToFSet(STsdb *tsdb, SDelFReader *reader, SArray * code = tsdbUpgradeOpenTombFile(tsdb, fset, &ctx->fd, &ctx->fobj, &ctx->toStt); TSDB_CHECK_CODE(code, lino, _exit); } + SVersionRange tombRange = {.minVer = VERSION_MAX, .maxVer = VERSION_MIN}; code = tsdbFileWriteTombBlock(ctx->fd, ctx->tombBlock, ctx->cmprAlg, &ctx->fobj->f->size, ctx->tombBlkArray, - ctx->bufArr); + ctx->bufArr, &tombRange); TSDB_CHECK_CODE(code, lino, _exit); } diff --git a/source/dnode/vnode/src/vnd/vnodeCommit.c b/source/dnode/vnode/src/vnd/vnodeCommit.c index 136168c5cc..50ca2f5d03 100644 --- a/source/dnode/vnode/src/vnd/vnodeCommit.c +++ b/source/dnode/vnode/src/vnd/vnodeCommit.c @@ -285,6 +285,7 @@ static int32_t vnodePrepareCommit(SVnode *pVnode, SCommitInfo *pInfo) { int32_t code = 0; int32_t lino = 0; char dir[TSDB_FILENAME_LEN] = {0}; + int64_t lastCommitted = pInfo->info.state.committed; tsem_wait(&pVnode->canCommit); diff --git a/source/dnode/vnode/src/vnd/vnodeOpen.c b/source/dnode/vnode/src/vnd/vnodeOpen.c index db94f32459..3bdecee79b 100644 --- a/source/dnode/vnode/src/vnd/vnodeOpen.c +++ b/source/dnode/vnode/src/vnd/vnodeOpen.c @@ -13,6 +13,8 @@ * along with this program. If not, see . 
*/ +#include "sync.h" +#include "tsdb.h" #include "vnd.h" #include "vndCos.h" @@ -62,6 +64,13 @@ int32_t vnodeCreate(const char *path, SVnodeCfg *pCfg, int32_t diskPrimary, STfs info.state.applied = -1; info.state.commitID = 0; + SVnodeInfo oldInfo = {0}; + oldInfo.config = vnodeCfgDefault; + if (vnodeLoadInfo(dir, &oldInfo) == 0) { + vWarn("vgId:%d, vnode config info already exists at %s.", oldInfo.config.vgId, dir); + return (oldInfo.config.dbId == info.config.dbId) ? 0 : -1; + } + vInfo("vgId:%d, save config while create", info.config.vgId); if (vnodeSaveInfo(dir, &info) < 0 || vnodeCommitInfo(dir) < 0) { vError("vgId:%d, failed to save vnode config since %s", pCfg ? pCfg->vgId : 0, tstrerror(terrno)); @@ -321,12 +330,13 @@ static int32_t vnodeCheckDisk(int32_t diskPrimary, STfs *pTfs) { return 0; } -SVnode *vnodeOpen(const char *path, int32_t diskPrimary, STfs *pTfs, SMsgCb msgCb) { +SVnode *vnodeOpen(const char *path, int32_t diskPrimary, STfs *pTfs, SMsgCb msgCb, bool force) { SVnode *pVnode = NULL; SVnodeInfo info = {0}; char dir[TSDB_FILENAME_LEN] = {0}; char tdir[TSDB_FILENAME_LEN * 2] = {0}; int32_t ret = 0; + terrno = TSDB_CODE_SUCCESS; if (vnodeCheckDisk(diskPrimary, pTfs)) { vError("failed to open vnode from %s since %s. 
diskPrimary:%d", path, terrstr(), diskPrimary); @@ -340,9 +350,14 @@ SVnode *vnodeOpen(const char *path, int32_t diskPrimary, STfs *pTfs, SMsgCb msgC ret = vnodeLoadInfo(dir, &info); if (ret < 0) { vError("failed to open vnode from %s since %s", path, tstrerror(terrno)); + terrno = TSDB_CODE_NEED_RETRY; return NULL; } + if (vnodeMkDir(pTfs, path)) { + vError("vgId:%d, failed to prepare vnode dir since %s, path: %s", info.config.vgId, strerror(errno), path); + return NULL; + } // save vnode info on dnode ep changed bool updated = false; SSyncCfg *pCfg = &info.config.syncCfg; @@ -404,7 +419,7 @@ SVnode *vnodeOpen(const char *path, int32_t diskPrimary, STfs *pTfs, SMsgCb msgC } // open tsdb - if (!VND_IS_RSMA(pVnode) && tsdbOpen(pVnode, &VND_TSDB(pVnode), VNODE_TSDB_DIR, NULL, rollback) < 0) { + if (!VND_IS_RSMA(pVnode) && tsdbOpen(pVnode, &VND_TSDB(pVnode), VNODE_TSDB_DIR, NULL, rollback, force) < 0) { vError("vgId:%d, failed to open vnode tsdb since %s", TD_VID(pVnode), tstrerror(terrno)); goto _err; } @@ -438,7 +453,7 @@ SVnode *vnodeOpen(const char *path, int32_t diskPrimary, STfs *pTfs, SMsgCb msgC } // open sma - if (smaOpen(pVnode, rollback)) { + if (smaOpen(pVnode, rollback, force)) { vError("vgId:%d, failed to open vnode sma since %s", TD_VID(pVnode), tstrerror(terrno)); goto _err; } @@ -508,7 +523,10 @@ void vnodeClose(SVnode *pVnode) { } // start the sync timer after the queue is ready -int32_t vnodeStart(SVnode *pVnode) { return vnodeSyncStart(pVnode); } +int32_t vnodeStart(SVnode *pVnode) { + ASSERT(pVnode); + return vnodeSyncStart(pVnode); +} int32_t vnodeIsCatchUp(SVnode *pVnode) { return syncIsCatchUp(pVnode->sync); } @@ -517,10 +535,3 @@ ESyncRole vnodeGetRole(SVnode *pVnode) { return syncGetRole(pVnode->sync); } void vnodeStop(SVnode *pVnode) {} int64_t vnodeGetSyncHandle(SVnode *pVnode) { return pVnode->sync; } - -void vnodeGetSnapshot(SVnode *pVnode, SSnapshot *pSnapshot) { - pSnapshot->data = NULL; - pSnapshot->lastApplyIndex = 
pVnode->state.committed; - pSnapshot->lastApplyTerm = pVnode->state.commitTerm; - pSnapshot->lastConfigIndex = -1; -} diff --git a/source/dnode/vnode/src/vnd/vnodeSnapshot.c b/source/dnode/vnode/src/vnd/vnodeSnapshot.c index 3abcf79839..87b407efcb 100644 --- a/source/dnode/vnode/src/vnd/vnodeSnapshot.c +++ b/source/dnode/vnode/src/vnd/vnodeSnapshot.c @@ -14,6 +14,7 @@ */ #include "vnd.h" +#include "tsdb.h" // SVSnapReader ======================================================== struct SVSnapReader { @@ -28,6 +29,7 @@ struct SVSnapReader { SMetaSnapReader *pMetaReader; // tsdb int8_t tsdbDone; + TSnapRangeArray *pRanges; STsdbSnapReader *pTsdbReader; // tq int8_t tqHandleDone; @@ -43,11 +45,84 @@ struct SVSnapReader { SStreamStateReader *pStreamStateReader; // rsma int8_t rsmaDone; + TSnapRangeArray *pRsmaRanges[TSDB_RETENTION_L2]; SRSmaSnapReader *pRsmaReader; }; -int32_t vnodeSnapReaderOpen(SVnode *pVnode, int64_t sver, int64_t ever, SVSnapReader **ppReader) { +static int32_t vnodeExtractSnapInfoDiff(void *buf, int32_t bufLen, TSnapRangeArray **ppRanges) { + int32_t code = -1; + STsdbSnapPartList *pList = tsdbSnapPartListCreate(); + if (pList == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + goto _out; + } + if (tDeserializeTsdbSnapPartList(buf, bufLen, pList) < 0) { + terrno = TSDB_CODE_INVALID_DATA_FMT; + goto _out; + } + if (tsdbSnapPartListToRangeDiff(pList, ppRanges) < 0) { + goto _out; + } + code = 0; +_out: + tsdbSnapPartListDestroy(&pList); + return code; +} + +static TSnapRangeArray **vnodeSnapReaderGetTsdbRanges(SVSnapReader *pReader, int32_t tsdbTyp) { + ASSERTS(sizeof(pReader->pRsmaRanges) / sizeof(pReader->pRsmaRanges[0]) == 2, "Unexpected array size"); + switch (tsdbTyp) { + case SNAP_DATA_TSDB: + return &pReader->pRanges; + case SNAP_DATA_RSMA1: + return &pReader->pRsmaRanges[0]; + case SNAP_DATA_RSMA2: + return &pReader->pRsmaRanges[1]; + default: + return NULL; + } +} + +static int32_t vnodeSnapReaderDoSnapInfo(SVSnapReader *pReader, SSnapshotParam 
*pParam) { + SVnode *pVnode = pReader->pVnode; + int32_t code = -1; + + if (pParam->data) { + SSyncTLV *datHead = (void *)pParam->data; + if (datHead->typ != TDMT_SYNC_PREP_SNAPSHOT_REPLY) { + terrno = TSDB_CODE_INVALID_DATA_FMT; + goto _out; + } + + TSnapRangeArray **ppRanges = NULL; + int32_t offset = 0; + + while (offset + sizeof(SSyncTLV) < datHead->len) { + SSyncTLV *subField = (void *)(datHead->val + offset); + offset += sizeof(SSyncTLV) + subField->len; + void *buf = subField->val; + int32_t bufLen = subField->len; + ppRanges = vnodeSnapReaderGetTsdbRanges(pReader, subField->typ); + if (ppRanges == NULL) { + vError("vgId:%d, unexpected subfield type in data of snapshot param. subtyp:%d", TD_VID(pVnode), subField->typ); + goto _out; + } + if (vnodeExtractSnapInfoDiff(buf, bufLen, ppRanges) < 0) { + vError("vgId:%d, failed to get range diff since %s", TD_VID(pVnode), terrstr()); + goto _out; + } + } + } + + code = 0; +_out: + return code; +} + +int32_t vnodeSnapReaderOpen(SVnode *pVnode, SSnapshotParam *pParam, SVSnapReader **ppReader) { int32_t code = 0; + int64_t sver = pParam->start; + int64_t ever = pParam->end; SVSnapReader *pReader = NULL; pReader = (SVSnapReader *)taosMemoryCalloc(1, sizeof(*pReader)); @@ -59,6 +134,11 @@ int32_t vnodeSnapReaderOpen(SVnode *pVnode, int64_t sver, int64_t ever, SVSnapRe pReader->sver = sver; pReader->ever = ever; + // snapshot info + if (vnodeSnapReaderDoSnapInfo(pReader, pParam) < 0) { + goto _err; + } + vInfo("vgId:%d, vnode snapshot reader opened, sver:%" PRId64 " ever:%" PRId64, TD_VID(pVnode), sver, ever); *ppReader = pReader; return code; @@ -69,8 +149,19 @@ _err: return code; } +static void vnodeSnapReaderDestroyTsdbRanges(SVSnapReader *pReader) { + int32_t tsdbTyps[TSDB_RETENTION_MAX] = {SNAP_DATA_TSDB, SNAP_DATA_RSMA1, SNAP_DATA_RSMA2}; + for (int32_t j = 0; j < TSDB_RETENTION_MAX; ++j) { + TSnapRangeArray **ppRanges = vnodeSnapReaderGetTsdbRanges(pReader, tsdbTyps[j]); + if (ppRanges == NULL) continue; + 
tsdbSnapRangeArrayDestroy(ppRanges); + } +} + void vnodeSnapReaderClose(SVSnapReader *pReader) { vInfo("vgId:%d, close vnode snapshot reader", TD_VID(pReader->pVnode)); + vnodeSnapReaderDestroyTsdbRanges(pReader); + if (pReader->pRsmaReader) { rsmaSnapReaderClose(&pReader->pRsmaReader); } @@ -175,7 +266,7 @@ int32_t vnodeSnapRead(SVSnapReader *pReader, uint8_t **ppData, uint32_t *nData) if (!pReader->tsdbDone) { // open if not if (pReader->pTsdbReader == NULL) { - code = tsdbSnapReaderOpen(pReader->pVnode->pTsdb, pReader->sver, pReader->ever, SNAP_DATA_TSDB, + code = tsdbSnapReaderOpen(pReader->pVnode->pTsdb, pReader->sver, pReader->ever, SNAP_DATA_TSDB, pReader->pRanges, &pReader->pTsdbReader); if (code) goto _err; } @@ -364,6 +455,7 @@ struct SVSnapWriter { // meta SMetaSnapWriter *pMetaSnapWriter; // tsdb + TSnapRangeArray *pRanges; STsdbSnapWriter *pTsdbSnapWriter; // tq STqSnapWriter *pTqSnapWriter; @@ -373,12 +465,65 @@ struct SVSnapWriter { SStreamTaskWriter *pStreamTaskWriter; SStreamStateWriter *pStreamStateWriter; // rsma + TSnapRangeArray *pRsmaRanges[TSDB_RETENTION_L2]; SRSmaSnapWriter *pRsmaSnapWriter; }; -int32_t vnodeSnapWriterOpen(SVnode *pVnode, int64_t sver, int64_t ever, SVSnapWriter **ppWriter) { +TSnapRangeArray **vnodeSnapWriterGetTsdbRanges(SVSnapWriter *pWriter, int32_t tsdbTyp) { + ASSERTS(sizeof(pWriter->pRsmaRanges) / sizeof(pWriter->pRsmaRanges[0]) == 2, "Unexpected array size"); + switch (tsdbTyp) { + case SNAP_DATA_TSDB: + return &pWriter->pRanges; + case SNAP_DATA_RSMA1: + return &pWriter->pRsmaRanges[0]; + case SNAP_DATA_RSMA2: + return &pWriter->pRsmaRanges[1]; + default: + return NULL; + } +} + +static int32_t vnodeSnapWriterDoSnapInfo(SVSnapWriter *pWriter, SSnapshotParam *pParam) { + SVnode *pVnode = pWriter->pVnode; + int32_t code = -1; + + if (pParam->data) { + SSyncTLV *datHead = (void *)pParam->data; + if (datHead->typ != TDMT_SYNC_PREP_SNAPSHOT_REPLY) { + terrno = TSDB_CODE_INVALID_DATA_FMT; + goto _out; + } + + 
TSnapRangeArray **ppRanges = NULL; + int32_t offset = 0; + + while (offset + sizeof(SSyncTLV) < datHead->len) { + SSyncTLV *subField = (void *)(datHead->val + offset); + offset += sizeof(SSyncTLV) + subField->len; + void *buf = subField->val; + int32_t bufLen = subField->len; + ppRanges = vnodeSnapWriterGetTsdbRanges(pWriter, subField->typ); + if (ppRanges == NULL) { + vError("vgId:%d, unexpected subfield type in data of snapshot param. subtyp:%d", TD_VID(pVnode), subField->typ); + goto _out; + } + if (vnodeExtractSnapInfoDiff(buf, bufLen, ppRanges) < 0) { + vError("vgId:%d, failed to get range diff since %s", TD_VID(pVnode), terrstr()); + goto _out; + } + } + } + + code = 0; +_out: + return code; +} + +int32_t vnodeSnapWriterOpen(SVnode *pVnode, SSnapshotParam *pParam, SVSnapWriter **ppWriter) { int32_t code = 0; SVSnapWriter *pWriter = NULL; + int64_t sver = pParam->start; + int64_t ever = pParam->end; // commit memory data vnodeAsyncCommit(pVnode); @@ -397,6 +542,11 @@ int32_t vnodeSnapWriterOpen(SVnode *pVnode, int64_t sver, int64_t ever, SVSnapWr // inc commit ID pWriter->commitID = ++pVnode->state.commitID; + // snapshot info + if (vnodeSnapWriterDoSnapInfo(pWriter, pParam) < 0) { + goto _err; + } + vInfo("vgId:%d, vnode snapshot writer opened, sver:%" PRId64 " ever:%" PRId64 " commit id:%" PRId64, TD_VID(pVnode), sver, ever, pWriter->commitID); *ppWriter = pWriter; @@ -408,15 +558,30 @@ _err: return code; } +static void vnodeSnapWriterDestroyTsdbRanges(SVSnapWriter *pWriter) { + int32_t tsdbTyps[TSDB_RETENTION_MAX] = {SNAP_DATA_TSDB, SNAP_DATA_RSMA1, SNAP_DATA_RSMA2}; + for (int32_t j = 0; j < TSDB_RETENTION_MAX; ++j) { + TSnapRangeArray **ppRanges = vnodeSnapWriterGetTsdbRanges(pWriter, tsdbTyps[j]); + if (ppRanges == NULL) continue; + tsdbSnapRangeArrayDestroy(ppRanges); + } +} + int32_t vnodeSnapWriterClose(SVSnapWriter *pWriter, int8_t rollback, SSnapshot *pSnapshot) { int32_t code = 0; SVnode *pVnode = pWriter->pVnode; + 
vnodeSnapWriterDestroyTsdbRanges(pWriter); + // prepare if (pWriter->pTsdbSnapWriter) { tsdbSnapWriterPrepareClose(pWriter->pTsdbSnapWriter); } + if (pWriter->pRsmaSnapWriter) { + rsmaSnapWriterPrepareClose(pWriter->pRsmaSnapWriter); + } + // commit json if (!rollback) { pWriter->info.state.committed = pWriter->ever; @@ -430,7 +595,9 @@ int32_t vnodeSnapWriterClose(SVSnapWriter *pWriter, int8_t rollback, SSnapshot * char dir[TSDB_FILENAME_LEN] = {0}; vnodeGetPrimaryDir(pVnode->path, pVnode->diskPrimary, pVnode->pTfs, dir, TSDB_FILENAME_LEN); - vnodeCommitInfo(dir); + code = vnodeCommitInfo(dir); + if (code) goto _exit; + } else { vnodeRollback(pWriter->pVnode); } @@ -561,7 +728,8 @@ int32_t vnodeSnapWrite(SVSnapWriter *pWriter, uint8_t *pData, uint32_t nData) { case SNAP_DATA_DEL: { // tsdb if (pWriter->pTsdbSnapWriter == NULL) { - code = tsdbSnapWriterOpen(pVnode->pTsdb, pWriter->sver, pWriter->ever, &pWriter->pTsdbSnapWriter); + code = tsdbSnapWriterOpen(pVnode->pTsdb, pWriter->sver, pWriter->ever, pWriter->pRanges, + &pWriter->pTsdbSnapWriter); if (code) goto _err; } @@ -621,7 +789,8 @@ int32_t vnodeSnapWrite(SVSnapWriter *pWriter, uint8_t *pData, uint32_t nData) { case SNAP_DATA_QTASK: { // rsma1/rsma2/qtask for rsma if (pWriter->pRsmaSnapWriter == NULL) { - code = rsmaSnapWriterOpen(pVnode->pSma, pWriter->sver, pWriter->ever, &pWriter->pRsmaSnapWriter); + code = rsmaSnapWriterOpen(pVnode->pSma, pWriter->sver, pWriter->ever, (void **)pWriter->pRsmaRanges, + &pWriter->pRsmaSnapWriter); if (code) goto _err; } diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index a6c743c87d..6c03ed68e9 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -15,6 +15,8 @@ #define _DEFAULT_SOURCE #include "tq.h" +#include "sync.h" +#include "tsdb.h" #include "vnd.h" #define BATCH_ENABLE 0 @@ -416,8 +418,8 @@ static int32_t vnodeSyncSendMsg(const SEpSet *pEpSet, SRpcMsg *pMsg) { return code; } 
-static void vnodeSyncGetSnapshotInfo(const SSyncFSM *pFsm, SSnapshot *pSnapshot) { - vnodeGetSnapshot(pFsm->data, pSnapshot); +static int32_t vnodeSyncGetSnapshotInfo(const SSyncFSM *pFsm, SSnapshot *pSnapshot) { + return vnodeGetSnapshot(pFsm->data, pSnapshot); } static int32_t vnodeSyncApplyMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { @@ -475,8 +477,7 @@ static void vnodeSyncRollBackMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, SFsmCbMeta static int32_t vnodeSnapshotStartRead(const SSyncFSM *pFsm, void *pParam, void **ppReader) { SVnode *pVnode = pFsm->data; - SSnapshotParam *pSnapshotParam = pParam; - int32_t code = vnodeSnapReaderOpen(pVnode, pSnapshotParam->start, pSnapshotParam->end, (SVSnapReader **)ppReader); + int32_t code = vnodeSnapReaderOpen(pVnode, (SSnapshotParam *)pParam, (SVSnapReader **)ppReader); return code; } @@ -492,8 +493,7 @@ static int32_t vnodeSnapshotDoRead(const SSyncFSM *pFsm, void *pReader, void **p } static int32_t vnodeSnapshotStartWrite(const SSyncFSM *pFsm, void *pParam, void **ppWriter) { - SVnode *pVnode = pFsm->data; - SSnapshotParam *pSnapshotParam = pParam; + SVnode *pVnode = pFsm->data; do { int32_t itemSize = tmsgGetQueueSize(&pVnode->msgCb, pVnode->config.vgId, APPLY_QUEUE); @@ -506,7 +506,7 @@ static int32_t vnodeSnapshotStartWrite(const SSyncFSM *pFsm, void *pParam, void } } while (true); - int32_t code = vnodeSnapWriterOpen(pVnode, pSnapshotParam->start, pSnapshotParam->end, (SVSnapWriter **)ppWriter); + int32_t code = vnodeSnapWriterOpen(pVnode, (SSnapshotParam *)pParam, (SVSnapWriter **)ppWriter); return code; } @@ -642,6 +642,7 @@ static SSyncFSM *vnodeSyncMakeFsm(SVnode *pVnode) { pFsm->FpAppliedIndexCb = vnodeSyncAppliedIndex; pFsm->FpPreCommitCb = vnodeSyncPreCommitMsg; pFsm->FpRollBackCb = vnodeSyncRollBackMsg; + pFsm->FpGetSnapshot = NULL; pFsm->FpGetSnapshotInfo = vnodeSyncGetSnapshotInfo; pFsm->FpRestoreFinishCb = vnodeRestoreFinish; pFsm->FpLeaderTransferCb = NULL; @@ -784,3 +785,20 @@ bool 
vnodeIsLeader(SVnode *pVnode) { return true; } + +int32_t vnodeGetSnapshot(SVnode *pVnode, SSnapshot *pSnap) { + int code = 0; + pSnap->lastApplyIndex = pVnode->state.committed; + pSnap->lastApplyTerm = pVnode->state.commitTerm; + pSnap->lastConfigIndex = -1; + pSnap->state = SYNC_FSM_STATE_COMPLETE; + + if (tsdbSnapGetFsState(pVnode) != TSDB_FS_STATE_NORMAL) { + pSnap->state = SYNC_FSM_STATE_INCOMPLETE; + } + + if (pSnap->type == TDMT_SYNC_PREP_SNAPSHOT || pSnap->type == TDMT_SYNC_PREP_SNAPSHOT_REPLY) { + code = tsdbSnapGetDetails(pVnode, pSnap); + } + return code; +} diff --git a/source/libs/stream/src/streamExec.c b/source/libs/stream/src/streamExec.c index 12b51e6c93..c49c647906 100644 --- a/source/libs/stream/src/streamExec.c +++ b/source/libs/stream/src/streamExec.c @@ -309,7 +309,9 @@ int32_t streamDoTransferStateToStreamTask(SStreamTask* pTask) { pStreamTask->id.idStr); } - ASSERT(pStreamTask->hTaskInfo.id.taskId == pTask->id.taskId && pTask->status.appendTranstateBlock == true); + ASSERT(((pStreamTask->status.taskStatus == TASK_STATUS__STOP) || + (pStreamTask->hTaskInfo.id.taskId == pTask->id.taskId)) && + pTask->status.appendTranstateBlock == true); STimeWindow* pTimeWindow = &pStreamTask->dataRange.window; diff --git a/source/libs/sync/inc/syncInt.h b/source/libs/sync/inc/syncInt.h index 870cdd6a72..cec1a12024 100644 --- a/source/libs/sync/inc/syncInt.h +++ b/source/libs/sync/inc/syncInt.h @@ -139,6 +139,7 @@ typedef struct SSyncNode { SSyncFSM* pFsm; int32_t quorum; SRaftId leaderCache; + ESyncFsmState fsmState; // life cycle int64_t rid; diff --git a/source/libs/sync/inc/syncMessage.h b/source/libs/sync/inc/syncMessage.h index f8c96d8be2..9054f47d37 100644 --- a/source/libs/sync/inc/syncMessage.h +++ b/source/libs/sync/inc/syncMessage.h @@ -116,7 +116,7 @@ typedef struct SyncAppendEntriesReply { SyncIndex matchIndex; SyncIndex lastSendIndex; int64_t startTime; - int16_t reserved; + int16_t fsmState; } SyncAppendEntriesReply; typedef struct 
SyncHeartbeat { @@ -200,7 +200,7 @@ typedef struct SyncSnapshotSend { SSyncCfg lastConfig; int64_t startTime; int32_t seq; - int16_t reserved; + int16_t payloadType; uint32_t dataLen; char data[]; } SyncSnapshotSend; @@ -219,7 +219,8 @@ typedef struct SyncSnapshotRsp { int32_t ack; int32_t code; SyncIndex snapBeginIndex; // when ack = SYNC_SNAPSHOT_SEQ_BEGIN, it's valid - int16_t reserved; + int16_t payloadType; + char data[]; } SyncSnapshotRsp; typedef struct SyncLeaderTransfer { @@ -267,7 +268,7 @@ int32_t syncBuildPreSnapshot(SRpcMsg* pMsg, int32_t vgId); int32_t syncBuildPreSnapshotReply(SRpcMsg* pMsg, int32_t vgId); int32_t syncBuildApplyMsg(SRpcMsg* pMsg, const SRpcMsg* pOriginal, int32_t vgId, SFsmCbMeta* pMeta); int32_t syncBuildSnapshotSend(SRpcMsg* pMsg, int32_t dataLen, int32_t vgId); -int32_t syncBuildSnapshotSendRsp(SRpcMsg* pMsg, int32_t vgId); +int32_t syncBuildSnapshotSendRsp(SRpcMsg* pMsg, int32_t dataLen, int32_t vgId); int32_t syncBuildLeaderTransfer(SRpcMsg* pMsg, int32_t vgId); int32_t syncBuildLocalCmd(SRpcMsg* pMsg, int32_t vgId); diff --git a/source/libs/sync/inc/syncSnapshot.h b/source/libs/sync/inc/syncSnapshot.h index 063b4f51f5..95382132b5 100644 --- a/source/libs/sync/inc/syncSnapshot.h +++ b/source/libs/sync/inc/syncSnapshot.h @@ -31,7 +31,7 @@ extern "C" { #define SYNC_SNAPSHOT_RETRY_MS 5000 typedef struct SSyncSnapshotSender { - bool start; + int8_t start; int32_t seq; int32_t ack; void *pReader; @@ -43,7 +43,7 @@ typedef struct SSyncSnapshotSender { int64_t sendingMS; SyncTerm term; int64_t startTime; - int64_t endTime; + int64_t waitTime; int64_t lastSendTime; bool finish; @@ -60,8 +60,8 @@ void snapshotSenderStop(SSyncSnapshotSender *pSender, bool finis int32_t snapshotReSend(SSyncSnapshotSender *pSender); typedef struct SSyncSnapshotReceiver { - // update when pre snapshot - bool start; + // update when prep snapshot + int8_t start; int32_t ack; SyncTerm term; SRaftId fromId; diff --git a/source/libs/sync/src/syncAppendEntries.c 
b/source/libs/sync/src/syncAppendEntries.c index 925988f43a..51a0679889 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -155,6 +155,13 @@ int32_t syncNodeOnAppendEntries(SSyncNode* ths, const SRpcMsg* pRpcMsg) { pMsg->vgId, pMsg->prevLogIndex + 1, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->commitIndex, pEntry->term); + if (ths->fsmState == SYNC_FSM_STATE_INCOMPLETE) { + pReply->fsmState = ths->fsmState; + sWarn("vgId:%d, unable to accept, due to incomplete fsm state. index:%" PRId64, ths->vgId, pEntry->index); + syncEntryDestroy(pEntry); + goto _SEND_RESPONSE; + } + // accept if (syncLogBufferAccept(ths->pLogBuf, ths, pEntry, pMsg->prevLogTerm) < 0) { goto _SEND_RESPONSE; @@ -175,7 +182,7 @@ _SEND_RESPONSE: (void)syncNodeSendMsgById(&pReply->destId, ths, &rpcRsp); // commit index, i.e. leader notice me - if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { + if (ths->fsmState != SYNC_FSM_STATE_INCOMPLETE && syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { sError("vgId:%d, failed to commit raft fsm log since %s.", ths->vgId, terrstr()); } diff --git a/source/libs/sync/src/syncElection.c b/source/libs/sync/src/syncElection.c index 86e28db90c..c57e7e273f 100644 --- a/source/libs/sync/src/syncElection.c +++ b/source/libs/sync/src/syncElection.c @@ -71,6 +71,11 @@ static int32_t syncNodeRequestVotePeers(SSyncNode* pNode) { } int32_t syncNodeElect(SSyncNode* pSyncNode) { + if (pSyncNode->fsmState == SYNC_FSM_STATE_INCOMPLETE) { + sNError(pSyncNode, "skip leader election due to incomplete fsm state"); + return -1; + } + sNInfo(pSyncNode, "begin election"); pSyncNode->electNum++; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index eca499cf28..f9dc10da02 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -1009,6 +1009,13 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo, int32_t vnodeVersion) { commitIndex = 
snapshot.lastApplyIndex; sNTrace(pSyncNode, "reset commit index by snapshot"); } + pSyncNode->fsmState = snapshot.state; + if (pSyncNode->fsmState == SYNC_FSM_STATE_INCOMPLETE) { + sError("vgId:%d, fsm state is incomplete.", pSyncNode->vgId); + if (pSyncNode->replicaNum == 1) { + goto _error; + } + } } pSyncNode->commitIndex = commitIndex; sInfo("vgId:%d, sync node commitIndex initialized as %" PRId64, pSyncNode->vgId, pSyncNode->commitIndex); @@ -1163,7 +1170,8 @@ int32_t syncNodeRestore(SSyncNode* pSyncNode) { pSyncNode->commitIndex = TMAX(pSyncNode->commitIndex, commitIndex); sInfo("vgId:%d, restore sync until commitIndex:%" PRId64, pSyncNode->vgId, pSyncNode->commitIndex); - if (syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, pSyncNode->commitIndex) < 0) { + if (pSyncNode->fsmState != SYNC_FSM_STATE_INCOMPLETE && + syncLogBufferCommit(pSyncNode->pLogBuf, pSyncNode, pSyncNode->commitIndex) < 0) { return -1; } @@ -1455,10 +1463,9 @@ int32_t syncNodeSendMsgById(const SRaftId* destRaftId, SSyncNode* pNode, SRpcMsg } if (code < 0) { - sError("vgId:%d, sync send msg by id error, epset:%p dnode:%d addr:%" PRId64 " err:0x%x", pNode->vgId, epSet, - DID(destRaftId), destRaftId->addr, terrno); + sError("vgId:%d, failed to send sync msg since %s. 
epset:%p dnode:%d addr:%" PRId64, pNode->vgId, terrstr(), epSet, + DID(destRaftId), destRaftId->addr); rpcFreeCont(pMsg->pCont); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; } return code; @@ -2895,7 +2902,7 @@ _out:; // single replica (void)syncNodeUpdateCommitIndex(ths, matchIndex); - if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { + if (ths->fsmState != SYNC_FSM_STATE_INCOMPLETE && syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { sError("vgId:%d, failed to commit until commitIndex:%" PRId64 "", ths->vgId, ths->commitIndex); code = -1; } @@ -3139,7 +3146,7 @@ int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) { if (pMsg->currentTerm == matchTerm) { (void)syncNodeUpdateCommitIndex(ths, pMsg->commitIndex); } - if (syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { + if (ths->fsmState != SYNC_FSM_STATE_INCOMPLETE && syncLogBufferCommit(ths->pLogBuf, ths, ths->commitIndex) < 0) { sError("vgId:%d, failed to commit raft log since %s. 
commit index:%" PRId64 "", ths->vgId, terrstr(), ths->commitIndex); } diff --git a/source/libs/sync/src/syncMessage.c b/source/libs/sync/src/syncMessage.c index 72c8887803..9e035f60c2 100644 --- a/source/libs/sync/src/syncMessage.c +++ b/source/libs/sync/src/syncMessage.c @@ -216,42 +216,6 @@ int32_t syncBuildHeartbeatReply(SRpcMsg* pMsg, int32_t vgId) { return 0; } -#if 0 -int32_t syncBuildPreSnapshot(SRpcMsg* pMsg, int32_t vgId) { - int32_t bytes = sizeof(SyncPreSnapshot); - pMsg->pCont = rpcMallocCont(bytes); - pMsg->msgType = TDMT_SYNC_PRE_SNAPSHOT; - pMsg->contLen = bytes; - if (pMsg->pCont == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; - } - - SyncPreSnapshot* pPreSnapshot = pMsg->pCont; - pPreSnapshot->bytes = bytes; - pPreSnapshot->msgType = TDMT_SYNC_PRE_SNAPSHOT; - pPreSnapshot->vgId = vgId; - return 0; -} - -int32_t syncBuildPreSnapshotReply(SRpcMsg* pMsg, int32_t vgId) { - int32_t bytes = sizeof(SyncPreSnapshotReply); - pMsg->pCont = rpcMallocCont(bytes); - pMsg->msgType = TDMT_SYNC_PRE_SNAPSHOT_REPLY; - pMsg->contLen = bytes; - if (pMsg->pCont == NULL) { - terrno = TSDB_CODE_OUT_OF_MEMORY; - return -1; - } - - SyncPreSnapshotReply* pPreSnapshotReply = pMsg->pCont; - pPreSnapshotReply->bytes = bytes; - pPreSnapshotReply->msgType = TDMT_SYNC_PRE_SNAPSHOT_REPLY; - pPreSnapshotReply->vgId = vgId; - return 0; -} -#endif - int32_t syncBuildSnapshotSend(SRpcMsg* pMsg, int32_t dataLen, int32_t vgId) { int32_t bytes = sizeof(SyncSnapshotSend) + dataLen; pMsg->pCont = rpcMallocCont(bytes); @@ -270,8 +234,8 @@ int32_t syncBuildSnapshotSend(SRpcMsg* pMsg, int32_t dataLen, int32_t vgId) { return 0; } -int32_t syncBuildSnapshotSendRsp(SRpcMsg* pMsg, int32_t vgId) { - int32_t bytes = sizeof(SyncSnapshotRsp); +int32_t syncBuildSnapshotSendRsp(SRpcMsg* pMsg, int32_t dataLen, int32_t vgId) { + int32_t bytes = sizeof(SyncSnapshotRsp) + dataLen; pMsg->pCont = rpcMallocCont(bytes); pMsg->msgType = TDMT_SYNC_SNAPSHOT_RSP; pMsg->contLen = bytes; diff --git 
a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 019f8f7e62..a7ee37cc3b 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -839,14 +839,16 @@ int32_t syncLogReplRecover(SSyncLogReplMgr* pMgr, SSyncNode* pNode, SyncAppendEn return 0; } - if (pMsg->success == false && pMsg->matchIndex >= pMsg->lastSendIndex) { - sWarn("vgId:%d, failed to rollback match index. peer: dnode:%d, match index:%" PRId64 ", last sent:%" PRId64, - pNode->vgId, DID(&destId), pMsg->matchIndex, pMsg->lastSendIndex); + if (pMsg->fsmState == SYNC_FSM_STATE_INCOMPLETE || (!pMsg->success && pMsg->matchIndex >= pMsg->lastSendIndex)) { + char* msg1 = " rollback match index failure"; + char* msg2 = " incomplete fsm state"; + sInfo("vgId:%d, snapshot replication to dnode:%d. reason:%s, match index:%" PRId64 ", last sent:%" PRId64, + pNode->vgId, DID(&destId), (pMsg->fsmState == SYNC_FSM_STATE_INCOMPLETE ? msg2 : msg1), pMsg->matchIndex, + pMsg->lastSendIndex); if (syncNodeStartSnapshot(pNode, &destId) < 0) { sError("vgId:%d, failed to start snapshot for peer dnode:%d", pNode->vgId, DID(&destId)); return -1; } - sInfo("vgId:%d, snapshot replication to peer dnode:%d", pNode->vgId, DID(&destId)); return 0; } } @@ -1000,10 +1002,9 @@ int32_t syncLogReplAttempt(SSyncLogReplMgr* pMgr, SSyncNode* pNode) { pMgr->endIndex = index + 1; if (barrier) { - sInfo("vgId:%d, replicated sync barrier to dest:%" PRIx64 ". index:%" PRId64 ", term:%" PRId64 + sInfo("vgId:%d, replicated sync barrier to dnode:%d. 
index:%" PRId64 ", term:%" PRId64 ", repl mgr: rs(%d) [%" PRId64 " %" PRId64 ", %" PRId64 ")", - pNode->vgId, pDestId->addr, index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, - pMgr->endIndex); + pNode->vgId, DID(pDestId), index, term, pMgr->restored, pMgr->startIndex, pMgr->matchIndex, pMgr->endIndex); break; } } diff --git a/source/libs/sync/src/syncSnapshot.c b/source/libs/sync/src/syncSnapshot.c index 763d4ec5d6..924813eb98 100644 --- a/source/libs/sync/src/syncSnapshot.c +++ b/source/libs/sync/src/syncSnapshot.c @@ -44,8 +44,8 @@ SSyncSnapshotSender *snapshotSenderCreate(SSyncNode *pSyncNode, int32_t replicaI pSender->pSyncNode = pSyncNode; pSender->replicaIndex = replicaIndex; pSender->term = raftStoreGetTerm(pSyncNode); - pSender->startTime = 0; - pSender->endTime = 0; + pSender->startTime = -1; + pSender->waitTime = -1; pSender->pSyncNode->pFsm->FpGetSnapshotInfo(pSender->pSyncNode->pFsm, &pSender->snapshot); pSender->finish = false; @@ -71,10 +71,14 @@ void snapshotSenderDestroy(SSyncSnapshotSender *pSender) { taosMemoryFree(pSender); } -bool snapshotSenderIsStart(SSyncSnapshotSender *pSender) { return pSender->start; } +bool snapshotSenderIsStart(SSyncSnapshotSender *pSender) { return atomic_load_8(&pSender->start); } int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { - pSender->start = true; + int32_t code = -1; + + int8_t started = atomic_val_compare_exchange_8(&pSender->start, false, true); + if (started) return 0; + pSender->seq = SYNC_SNAPSHOT_SEQ_BEGIN; pSender->ack = SYNC_SNAPSHOT_SEQ_INVALID; pSender->pReader = NULL; @@ -91,15 +95,33 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { memset(&pSender->lastConfig, 0, sizeof(pSender->lastConfig)); pSender->sendingMS = 0; pSender->term = raftStoreGetTerm(pSender->pSyncNode); - pSender->startTime = taosGetTimestampMs(); - pSender->lastSendTime = pSender->startTime; + pSender->startTime = taosGetMonoTimestampMs(); + pSender->lastSendTime = taosGetTimestampMs(); 
pSender->finish = false; - // build begin msg + // Get full snapshot info + SSyncNode *pSyncNode = pSender->pSyncNode; + SSnapshot snapInfo = {.type = TDMT_SYNC_PREP_SNAPSHOT}; + if (pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapInfo) != 0) { + sSError(pSender, "snapshot get info failure since %s", terrstr()); + goto _out; + } + + int dataLen = 0; + if (snapInfo.data) { + SSyncTLV *datHead = snapInfo.data; + if (datHead->typ != TDMT_SYNC_PREP_SNAPSHOT) { + sSError(pSender, "unexpected data typ in data of snapshot info. typ: %d", datHead->typ); + terrno = TSDB_CODE_INVALID_DATA_FMT; + goto _out; + } + dataLen = sizeof(SSyncTLV) + datHead->len; + } + SRpcMsg rpcMsg = {0}; - if (syncBuildSnapshotSend(&rpcMsg, 0, pSender->pSyncNode->vgId) != 0) { + if (syncBuildSnapshotSend(&rpcMsg, dataLen, pSender->pSyncNode->vgId) != 0) { sSError(pSender, "snapshot sender build msg failed since %s", terrstr()); - return -1; + goto _out; } SyncSnapshotSend *pMsg = rpcMsg.pCont; @@ -114,25 +136,38 @@ int32_t snapshotSenderStart(SSyncSnapshotSender *pSender) { pMsg->startTime = pSender->startTime; pMsg->seq = SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT; + if (dataLen > 0) { + pMsg->payloadType = snapInfo.type; + memcpy(pMsg->data, snapInfo.data, dataLen); + } + // event log syncLogSendSyncSnapshotSend(pSender->pSyncNode, pMsg, "snapshot sender start"); // send msg if (syncNodeSendMsgById(&pMsg->destId, pSender->pSyncNode, &rpcMsg) != 0) { sSError(pSender, "snapshot sender send msg failed since %s", terrstr()); - return -1; + goto _out; } - return 0; + code = 0; +_out: + if (snapInfo.data) { + taosMemoryFree(snapInfo.data); + snapInfo.data = NULL; + } + return code; } void snapshotSenderStop(SSyncSnapshotSender *pSender, bool finish) { sSDebug(pSender, "snapshot sender stop, finish:%d reader:%p", finish, pSender->pReader); // update flag - pSender->start = false; + int8_t stopped = !atomic_val_compare_exchange_8(&pSender->start, true, false); + if (stopped) return; + pSender->finish = 
finish; - pSender->endTime = taosGetTimestampMs(); + pSender->waitTime = -1; // close reader if (pSender->pReader != NULL) { @@ -193,6 +228,7 @@ static int32_t snapshotSend(SSyncSnapshotSender *pSender) { pMsg->lastTerm = pSender->snapshot.lastApplyTerm; pMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex; pMsg->lastConfig = pSender->lastConfig; + pMsg->startTime = pSender->startTime; pMsg->seq = pSender->seq; if (pSender->pCurrentBlock != NULL) { @@ -234,6 +270,7 @@ int32_t snapshotReSend(SSyncSnapshotSender *pSender) { pMsg->lastTerm = pSender->snapshot.lastApplyTerm; pMsg->lastConfigIndex = pSender->snapshot.lastConfigIndex; pMsg->lastConfig = pSender->lastConfig; + pMsg->startTime = pSender->startTime; pMsg->seq = pSender->seq; if (pSender->pCurrentBlock != NULL && pSender->blockLen > 0) { @@ -256,7 +293,7 @@ int32_t snapshotReSend(SSyncSnapshotSender *pSender) { static int32_t snapshotSenderUpdateProgress(SSyncSnapshotSender *pSender, SyncSnapshotRsp *pMsg) { if (pMsg->ack != pSender->seq) { sSError(pSender, "snapshot sender update seq failed, ack:%d seq:%d", pMsg->ack, pSender->seq); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; return -1; } @@ -271,8 +308,6 @@ static int32_t snapshotSenderUpdateProgress(SSyncSnapshotSender *pSender, SyncSn // return 1, last snapshot finish ok // return -1, error int32_t syncNodeStartSnapshot(SSyncNode *pSyncNode, SRaftId *pDestId) { - sNInfo(pSyncNode, "snapshot sender starting ..."); - SSyncSnapshotSender *pSender = syncNodeGetSnapshotSender(pSyncNode, pDestId); if (pSender == NULL) { sNError(pSyncNode, "snapshot sender start error since get failed"); @@ -280,12 +315,16 @@ int32_t syncNodeStartSnapshot(SSyncNode *pSyncNode, SRaftId *pDestId) { } if (snapshotSenderIsStart(pSender)) { - sSInfo(pSender, "snapshot sender already start, ignore"); + sSDebug(pSender, "snapshot sender already start, ignore"); return 0; } - if (pSender->finish && taosGetTimestampMs() - pSender->endTime 
< SNAPSHOT_WAIT_MS) { - sSInfo(pSender, "snapshot sender start too frequently, ignore"); + int64_t timeNow = taosGetTimestampMs(); + if (pSender->waitTime <= 0) { + pSender->waitTime = timeNow + SNAPSHOT_WAIT_MS; + } + if (timeNow < pSender->waitTime) { + sSDebug(pSender, "snapshot sender waitTime not expired yet, ignore"); return 0; } @@ -312,6 +351,7 @@ SSyncSnapshotReceiver *snapshotReceiverCreate(SSyncNode *pSyncNode, SRaftId from } pReceiver->start = false; + pReceiver->startTime = 0; pReceiver->ack = SYNC_SNAPSHOT_SEQ_BEGIN; pReceiver->pWriter = NULL; pReceiver->pSyncNode = pSyncNode; @@ -338,12 +378,31 @@ void snapshotReceiverDestroy(SSyncSnapshotReceiver *pReceiver) { pReceiver->pWriter = NULL; } + // free data of snapshot info + if (pReceiver->snapshotParam.data) { + taosMemoryFree(pReceiver->snapshotParam.data); + pReceiver->snapshotParam.data = NULL; + } + + if (pReceiver->snapshot.data) { + taosMemoryFree(pReceiver->snapshot.data); + pReceiver->snapshot.data = NULL; + } + // free receiver taosMemoryFree(pReceiver); } bool snapshotReceiverIsStart(SSyncSnapshotReceiver *pReceiver) { - return (pReceiver != NULL ? pReceiver->start : false); + return (pReceiver != NULL ? 
atomic_load_8(&pReceiver->start) : false); +} + +static int32_t snapshotReceiverSignatureCmp(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pMsg) { + if (pReceiver->term < pMsg->term) return -1; + if (pReceiver->term > pMsg->term) return 1; + if (pReceiver->startTime < pMsg->startTime) return -1; + if (pReceiver->startTime > pMsg->startTime) return 1; + return 0; } static int32_t snapshotReceiverStartWriter(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *pBeginMsg) { @@ -382,11 +441,14 @@ void snapshotReceiverStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *p return; } - pReceiver->start = true; + int8_t started = atomic_val_compare_exchange_8(&pReceiver->start, false, true); + if (started) return; + pReceiver->ack = SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT; - pReceiver->term = raftStoreGetTerm(pReceiver->pSyncNode); + pReceiver->term = pPreMsg->term; pReceiver->fromId = pPreMsg->srcId; pReceiver->startTime = pPreMsg->startTime; + ASSERT(pReceiver->startTime); // event log sRInfo(pReceiver, "snapshot receiver is start"); @@ -397,6 +459,9 @@ void snapshotReceiverStart(SSyncSnapshotReceiver *pReceiver, SyncSnapshotSend *p void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) { sRInfo(pReceiver, "snapshot receiver stop, not apply, writer:%p", pReceiver->pWriter); + int8_t stopped = !atomic_val_compare_exchange_8(&pReceiver->start, true, false); + if (stopped) return; + if (pReceiver->pWriter != NULL) { int32_t ret = pReceiver->pSyncNode->pFsm->FpSnapshotStopWrite(pReceiver->pSyncNode->pFsm, pReceiver->pWriter, false, &pReceiver->snapshot); @@ -407,8 +472,6 @@ void snapshotReceiverStop(SSyncSnapshotReceiver *pReceiver) { } else { sRInfo(pReceiver, "snapshot receiver stop, writer is null"); } - - pReceiver->start = false; } // when recv last snapshot block, apply data into snapshot @@ -458,6 +521,10 @@ static int32_t snapshotReceiverFinish(SSyncSnapshotReceiver *pReceiver, SyncSnap // update progress pReceiver->ack = SYNC_SNAPSHOT_SEQ_END; + SSnapshot 
snapshot = {0}; + pReceiver->pSyncNode->pFsm->FpGetSnapshotInfo(pReceiver->pSyncNode->pFsm, &snapshot); + pReceiver->pSyncNode->fsmState = snapshot.state; + } else { sRError(pReceiver, "snapshot receiver finish error since writer is null"); return -1; @@ -529,19 +596,26 @@ static int32_t syncNodeOnSnapshotPrep(SSyncNode *pSyncNode, SyncSnapshotSend *pM if (snapshotReceiverIsStart(pReceiver)) { // already start - if (pMsg->startTime > pReceiver->startTime) { - sRInfo(pReceiver, "snapshot receiver startTime:%" PRId64 " > msg startTime:%" PRId64 " start receiver", - pReceiver->startTime, pMsg->startTime); + int32_t order = 0; + if ((order = snapshotReceiverSignatureCmp(pReceiver, pMsg)) < 0) { + sRInfo(pReceiver, + "received a new snapshot preparation. restart receiver" + "receiver signature: (%" PRId64 ", %" PRId64 "), msg signature:(%" PRId64 ", %" PRId64 ")", + pReceiver->term, pReceiver->startTime, pMsg->term, pMsg->startTime); goto _START_RECEIVER; - } else if (pMsg->startTime == pReceiver->startTime) { - sRInfo(pReceiver, "snapshot receiver startTime:%" PRId64 " == msg startTime:%" PRId64 " send reply", - pReceiver->startTime, pMsg->startTime); + } else if (order == 0) { + sRInfo(pReceiver, + "received a duplicate snapshot preparation. send reply" + "receiver signature: (%" PRId64 ", %" PRId64 "), msg signature:(%" PRId64 ", %" PRId64 ")", + pReceiver->term, pReceiver->startTime, pMsg->term, pMsg->startTime); goto _SEND_REPLY; } else { // ignore - sRError(pReceiver, "snapshot receiver startTime:%" PRId64 " < msg startTime:%" PRId64 " ignore", - pReceiver->startTime, pMsg->startTime); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + sRError(pReceiver, + "received a stale snapshot preparation. 
ignore" + "receiver signature: (%" PRId64 ", %" PRId64 "), msg signature:(%" PRId64 ", %" PRId64 ")", + pReceiver->term, pReceiver->startTime, pMsg->term, pMsg->startTime); + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; code = terrno; goto _SEND_REPLY; } @@ -552,36 +626,45 @@ static int32_t syncNodeOnSnapshotPrep(SSyncNode *pSyncNode, SyncSnapshotSend *pM } _START_RECEIVER: - if (timeNow - pMsg->startTime > SNAPSHOT_MAX_CLOCK_SKEW_MS) { - sRError(pReceiver, "snapshot receiver time skew too much, now:%" PRId64 " msg startTime:%" PRId64, timeNow, - pMsg->startTime); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; - code = terrno; - } else { - // waiting for clock match - while (timeNow < pMsg->startTime) { - sRInfo(pReceiver, "snapshot receiver pre waitting for true time, now:%" PRId64 ", startTime:%" PRId64, timeNow, - pMsg->startTime); - taosMsleep(10); - timeNow = taosGetTimestampMs(); - } - - if (snapshotReceiverIsStart(pReceiver)) { - sRInfo(pReceiver, "snapshot receiver already start and force stop pre one"); - snapshotReceiverStop(pReceiver); - } - - snapshotReceiverStart(pReceiver, pMsg); // set start-time same with sender + if (snapshotReceiverIsStart(pReceiver)) { + sRInfo(pReceiver, "snapshot receiver already start and force stop pre one"); + snapshotReceiverStop(pReceiver); } + snapshotReceiverStart(pReceiver, pMsg); // set start-time same with sender + _SEND_REPLY: // build msg ; // make complier happy + SSnapshot snapInfo = {.type = TDMT_SYNC_PREP_SNAPSHOT_REPLY}; + int32_t dataLen = 0; + if (pMsg->dataLen > 0) { + void *data = taosMemoryCalloc(1, pMsg->dataLen); + if (data == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + code = terrno; + goto _out; + } + memcpy(data, pMsg->data, pMsg->dataLen); + snapInfo.data = data; + data = NULL; + pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapInfo); + + SSyncTLV *datHead = snapInfo.data; + if (datHead->typ != TDMT_SYNC_PREP_SNAPSHOT_REPLY) { + sRError(pReceiver, "unexpected data typ in data of snapshot info. 
typ: %d", datHead->typ); + code = TSDB_CODE_INVALID_DATA_FMT; + goto _out; + } + dataLen = sizeof(SSyncTLV) + datHead->len; + } + SRpcMsg rpcMsg = {0}; - if (syncBuildSnapshotSendRsp(&rpcMsg, pSyncNode->vgId) != 0) { + if (syncBuildSnapshotSendRsp(&rpcMsg, dataLen, pSyncNode->vgId) != 0) { sRError(pReceiver, "snapshot receiver failed to build resp since %s", terrstr()); - return -1; + code = terrno; + goto _out; } SyncSnapshotRsp *pRspMsg = rpcMsg.pCont; @@ -590,18 +673,40 @@ _SEND_REPLY: pRspMsg->term = raftStoreGetTerm(pSyncNode); pRspMsg->lastIndex = pMsg->lastIndex; pRspMsg->lastTerm = pMsg->lastTerm; - pRspMsg->startTime = pReceiver->startTime; + pRspMsg->startTime = pMsg->startTime; pRspMsg->ack = pMsg->seq; // receiver maybe already closed pRspMsg->code = code; pRspMsg->snapBeginIndex = syncNodeGetSnapBeginIndex(pSyncNode); - // send msg - syncLogSendSyncSnapshotRsp(pSyncNode, pRspMsg, "snapshot receiver pre-snapshot"); - if (syncNodeSendMsgById(&pRspMsg->destId, pSyncNode, &rpcMsg) != 0) { - sRError(pReceiver, "snapshot receiver failed to build resp since %s", terrstr()); - return -1; + if (snapInfo.data) { + pRspMsg->payloadType = snapInfo.type; + memcpy(pRspMsg->data, snapInfo.data, dataLen); + + // save snapshot info + SSnapshotParam *pParam = &pReceiver->snapshotParam; + void *data = taosMemoryRealloc(pParam->data, dataLen); + if (data == NULL) { + sError("vgId:%d, failed to realloc memory for snapshot prep due to %s. 
dataLen:%d", pSyncNode->vgId, + strerror(errno), dataLen); + terrno = TSDB_CODE_OUT_OF_MEMORY; + code = terrno; + goto _out; + } + pParam->data = data; + memcpy(pParam->data, snapInfo.data, dataLen); } + // send msg + if (syncNodeSendMsgById(&pRspMsg->destId, pSyncNode, &rpcMsg) != 0) { + sRError(pReceiver, "failed to send resp since %s", terrstr()); + code = terrno; + } + +_out: + if (snapInfo.data) { + taosMemoryFree(snapInfo.data); + snapInfo.data = NULL; + } return code; } @@ -611,19 +716,19 @@ static int32_t syncNodeOnSnapshotBegin(SSyncNode *pSyncNode, SyncSnapshotSend *p int32_t code = TSDB_CODE_SYN_INTERNAL_ERROR; if (!snapshotReceiverIsStart(pReceiver)) { - sRError(pReceiver, "snapshot receiver begin failed since not start"); + sRError(pReceiver, "failed to begin snapshot receiver since not started"); goto _SEND_REPLY; } - if (pReceiver->startTime != pMsg->startTime) { - sRError(pReceiver, "snapshot receiver begin failed since startTime:%" PRId64 " not equal to msg startTime:%" PRId64, - pReceiver->startTime, pMsg->startTime); + if (snapshotReceiverSignatureCmp(pReceiver, pMsg) != 0) { + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; + sRError(pReceiver, "failed to begin snapshot receiver since %s", terrstr()); goto _SEND_REPLY; } // start writer if (snapshotReceiverStartWriter(pReceiver, pMsg) != 0) { - sRError(pReceiver, "snapshot receiver begin failed since start writer failed"); + sRError(pReceiver, "failed to start snapshot writer since %s", terrstr()); goto _SEND_REPLY; } @@ -635,8 +740,8 @@ _SEND_REPLY: // build msg SRpcMsg rpcMsg = {0}; - if (syncBuildSnapshotSendRsp(&rpcMsg, pSyncNode->vgId) != 0) { - sRError(pReceiver, "snapshot receiver build resp failed since %s", terrstr()); + if (syncBuildSnapshotSendRsp(&rpcMsg, 0, pSyncNode->vgId) != 0) { + sRError(pReceiver, "failed to build snapshot receiver resp since %s", terrstr()); return -1; } @@ -646,15 +751,14 @@ _SEND_REPLY: pRspMsg->term = raftStoreGetTerm(pSyncNode); pRspMsg->lastIndex = 
pMsg->lastIndex; pRspMsg->lastTerm = pMsg->lastTerm; - pRspMsg->startTime = pReceiver->startTime; + pRspMsg->startTime = pMsg->startTime; pRspMsg->ack = pReceiver->ack; // receiver maybe already closed pRspMsg->code = code; pRspMsg->snapBeginIndex = pReceiver->snapshotParam.start; // send msg - syncLogSendSyncSnapshotRsp(pSyncNode, pRspMsg, "snapshot receiver begin"); if (syncNodeSendMsgById(&pRspMsg->destId, pSyncNode, &rpcMsg) != 0) { - sRError(pReceiver, "snapshot receiver send resp failed since %s", terrstr()); + sRError(pReceiver, "failed to send snapshot receiver resp since %s", terrstr()); return -1; } @@ -665,17 +769,16 @@ static int32_t syncNodeOnSnapshotReceive(SSyncNode *pSyncNode, SyncSnapshotSend // condition 4 // transfering SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver; - - // waiting for clock match int64_t timeNow = taosGetTimestampMs(); - while (timeNow < pMsg->startTime) { - sRInfo(pReceiver, "snapshot receiver receiving waitting for true time, now:%" PRId64 ", stime:%" PRId64, timeNow, - pMsg->startTime); - taosMsleep(10); - timeNow = taosGetTimestampMs(); + int32_t code = 0; + + if (snapshotReceiverSignatureCmp(pReceiver, pMsg) != 0) { + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; + sRError(pReceiver, "failed to receive snapshot data since %s.", terrstr()); + code = terrno; + goto _SEND_REPLY; } - int32_t code = 0; if (snapshotReceiverGotData(pReceiver, pMsg) != 0) { code = terrno; if (code >= SYNC_SNAPSHOT_SEQ_INVALID) { @@ -683,10 +786,12 @@ static int32_t syncNodeOnSnapshotReceive(SSyncNode *pSyncNode, SyncSnapshotSend } } +_SEND_REPLY:; + // build msg SRpcMsg rpcMsg = {0}; - if (syncBuildSnapshotSendRsp(&rpcMsg, pSyncNode->vgId)) { - sRError(pReceiver, "snapshot receiver build resp failed since %s", terrstr()); + if (syncBuildSnapshotSendRsp(&rpcMsg, 0, pSyncNode->vgId)) { + sRError(pReceiver, "failed to build snapshot receiver resp since %s", terrstr()); return -1; } @@ -696,15 +801,14 @@ static int32_t 
syncNodeOnSnapshotReceive(SSyncNode *pSyncNode, SyncSnapshotSend pRspMsg->term = raftStoreGetTerm(pSyncNode); pRspMsg->lastIndex = pMsg->lastIndex; pRspMsg->lastTerm = pMsg->lastTerm; - pRspMsg->startTime = pReceiver->startTime; + pRspMsg->startTime = pMsg->startTime; pRspMsg->ack = pReceiver->ack; // receiver maybe already closed pRspMsg->code = code; pRspMsg->snapBeginIndex = pReceiver->snapshotParam.start; // send msg - syncLogSendSyncSnapshotRsp(pSyncNode, pRspMsg, "snapshot receiver received"); if (syncNodeSendMsgById(&pRspMsg->destId, pSyncNode, &rpcMsg) != 0) { - sRError(pReceiver, "snapshot receiver send resp failed since %s", terrstr()); + sRError(pReceiver, "failed to send snapshot receiver resp since %s", terrstr()); return -1; } @@ -715,24 +819,27 @@ static int32_t syncNodeOnSnapshotEnd(SSyncNode *pSyncNode, SyncSnapshotSend *pMs // condition 2 // end, finish FSM SSyncSnapshotReceiver *pReceiver = pSyncNode->pNewNodeReceiver; - - // waiting for clock match int64_t timeNow = taosGetTimestampMs(); - while (timeNow < pMsg->startTime) { - sRInfo(pReceiver, "snapshot receiver finish waitting for true time, now:%" PRId64 ", stime:%" PRId64, timeNow, - pMsg->startTime); - taosMsleep(10); - timeNow = taosGetTimestampMs(); + int32_t code = 0; + + if (snapshotReceiverSignatureCmp(pReceiver, pMsg) != 0) { + sRError(pReceiver, "snapshot end failed since startTime:%" PRId64 " not equal to msg startTime:%" PRId64, + pReceiver->startTime, pMsg->startTime); + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; + code = terrno; + goto _SEND_REPLY; } - int32_t code = snapshotReceiverFinish(pReceiver, pMsg); + code = snapshotReceiverFinish(pReceiver, pMsg); if (code == 0) { snapshotReceiverStop(pReceiver); } +_SEND_REPLY:; + // build msg SRpcMsg rpcMsg = {0}; - if (syncBuildSnapshotSendRsp(&rpcMsg, pSyncNode->vgId) != 0) { + if (syncBuildSnapshotSendRsp(&rpcMsg, 0, pSyncNode->vgId) != 0) { sRError(pReceiver, "snapshot receiver build rsp failed since %s", terrstr()); return -1; } 
@@ -743,7 +850,7 @@ static int32_t syncNodeOnSnapshotEnd(SSyncNode *pSyncNode, SyncSnapshotSend *pMs pRspMsg->term = raftStoreGetTerm(pSyncNode); pRspMsg->lastIndex = pMsg->lastIndex; pRspMsg->lastTerm = pMsg->lastTerm; - pRspMsg->startTime = pReceiver->startTime; + pRspMsg->startTime = pMsg->startTime; pRspMsg->ack = pReceiver->ack; // receiver maybe already closed pRspMsg->code = code; pRspMsg->snapBeginIndex = pReceiver->snapshotParam.start; @@ -785,13 +892,13 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { // if already drop replica, do not process if (!syncNodeInRaftGroup(pSyncNode, &pMsg->srcId)) { syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "not in my config"); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; return -1; } if (pMsg->term < raftStoreGetTerm(pSyncNode)) { syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "reject since small term"); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; return -1; } @@ -809,13 +916,16 @@ int32_t syncNodeOnSnapshot(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { if (pSyncNode->state == TAOS_SYNC_STATE_FOLLOWER || pSyncNode->state == TAOS_SYNC_STATE_LEARNER) { if (pMsg->term == raftStoreGetTerm(pSyncNode)) { if (pMsg->seq == SYNC_SNAPSHOT_SEQ_PREP_SNAPSHOT) { - syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq pre-snapshot"); + sInfo("vgId:%d, receive pre-snapshot msg of snapshot replication. signature:(%" PRId64 ", %" PRId64 ")", + pSyncNode->vgId, pMsg->term, pMsg->startTime); code = syncNodeOnSnapshotPrep(pSyncNode, pMsg); } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_BEGIN) { - syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq begin"); + sInfo("vgId:%d, receive begin msg of snapshot replication. 
signature:(%" PRId64 ", %" PRId64 ")", + pSyncNode->vgId, pMsg->term, pMsg->startTime); code = syncNodeOnSnapshotBegin(pSyncNode, pMsg); } else if (pMsg->seq == SYNC_SNAPSHOT_SEQ_END) { - syncLogRecvSyncSnapshotSend(pSyncNode, pMsg, "process seq end"); + sInfo("vgId:%d, receive end msg of snapshot replication. signature: (%" PRId64 ", %" PRId64 ")", + pSyncNode->vgId, pMsg->term, pMsg->startTime); code = syncNodeOnSnapshotEnd(pSyncNode, pMsg); if (syncLogBufferReInit(pSyncNode->pLogBuf, pSyncNode) != 0) { sRError(pReceiver, "failed to reinit log buffer since %s", terrstr()); @@ -859,17 +969,21 @@ static int32_t syncNodeOnSnapshotPrepRsp(SSyncNode *pSyncNode, SSyncSnapshotSend sSInfo(pSender, "prepare snapshot, recv-begin:%" PRId64 ", snapshot.last:%" PRId64 ", snapshot.term:%" PRId64, pMsg->snapBeginIndex, snapshot.lastApplyIndex, snapshot.lastApplyTerm); - if (pMsg->snapBeginIndex > snapshot.lastApplyIndex) { - sSError(pSender, "prepare snapshot failed since beginIndex:%" PRId64 " larger than applyIndex:%" PRId64, - pMsg->snapBeginIndex, snapshot.lastApplyIndex); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; - return -1; - } - // update sender pSender->snapshot = snapshot; // start reader + if (pMsg->payloadType == TDMT_SYNC_PREP_SNAPSHOT_REPLY) { + SSyncTLV *datHead = (void *)pMsg->data; + if (datHead->typ != pMsg->payloadType) { + sSError(pSender, "unexpected data type in data of SyncSnapshotRsp. typ: %d", datHead->typ); + terrno = TSDB_CODE_INVALID_DATA_FMT; + return -1; + } + pSender->snapshotParam.data = (void *)pMsg->data; + sSInfo(pSender, "data of snapshot param. 
len: %d", datHead->len); + } + int32_t code = pSyncNode->pFsm->FpSnapshotStartRead(pSyncNode->pFsm, &pSender->snapshotParam, &pSender->pReader); if (code != 0) { sSError(pSender, "prepare snapshot failed since %s", terrstr()); @@ -901,6 +1015,11 @@ static int32_t syncNodeOnSnapshotPrepRsp(SSyncNode *pSyncNode, SSyncSnapshotSend pSendMsg->startTime = pSender->startTime; pSendMsg->seq = SYNC_SNAPSHOT_SEQ_BEGIN; + ASSERT(pSendMsg->startTime); + + sSInfo(pSender, "begin snapshot replication to dnode %d. startTime:%" PRId64, DID(&pSendMsg->destId), + pSendMsg->startTime); + // send msg syncLogSendSyncSnapshotSend(pSyncNode, pSendMsg, "snapshot sender reply pre"); if (syncNodeSendMsgById(&pSendMsg->destId, pSender->pSyncNode, &rpcMsg) != 0) { @@ -911,6 +1030,14 @@ static int32_t syncNodeOnSnapshotPrepRsp(SSyncNode *pSyncNode, SSyncSnapshotSend return 0; } +static int32_t snapshotSenderSignatureCmp(SSyncSnapshotSender *pSender, SyncSnapshotRsp *pMsg) { + if (pSender->term < pMsg->term) return -1; + if (pSender->term > pMsg->term) return 1; + if (pSender->startTime < pMsg->startTime) return -1; + if (pSender->startTime > pMsg->startTime) return 1; + return 0; +} + // sender on message // // condition 1 sender receives SYNC_SNAPSHOT_SEQ_END, close sender @@ -923,7 +1050,7 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { // if already drop replica, do not process if (!syncNodeInRaftGroup(pSyncNode, &pMsg->srcId)) { syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "maybe replica already dropped"); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; return -1; } @@ -935,6 +1062,27 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { return -1; } + if (!snapshotSenderIsStart(pSender)) { + sSError(pSender, "snapshot sender not started yet. 
sender startTime:%" PRId64 ", msg startTime:%" PRId64, + pSender->startTime, pMsg->startTime); + return -1; + } + + // check signature + int32_t order = 0; + if ((order = snapshotSenderSignatureCmp(pSender, pMsg)) > 0) { + sSError(pSender, + "received a stale snapshot rsp. ignore it" + "sender signature: (%" PRId64 ", %" PRId64 "), msg signature:(%" PRId64 ", %" PRId64 ")", + pSender->term, pSender->startTime, pMsg->term, pMsg->startTime); + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; + return -1; + } else if (order < 0) { + sSError(pSender, "snapshot sender is stale. stop"); + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; + goto _ERROR; + } + // state, term, seq/ack if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "snapshot sender not leader"); @@ -943,20 +1091,12 @@ int32_t syncNodeOnSnapshotRsp(SSyncNode *pSyncNode, const SRpcMsg *pRpcMsg) { goto _ERROR; } - if (pMsg->startTime != pSender->startTime) { - syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "snapshot sender and receiver time not match"); - sSError(pSender, "sender:%" PRId64 " receiver:%" PRId64 " time not match, error:%s 0x%x", pMsg->startTime, - pSender->startTime, tstrerror(pMsg->code), pMsg->code); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; - goto _ERROR; - } - SyncTerm currentTerm = raftStoreGetTerm(pSyncNode); if (pMsg->term != currentTerm) { syncLogRecvSyncSnapshotRsp(pSyncNode, pMsg, "snapshot sender and receiver term not match"); sSError(pSender, "snapshot sender term not equal, msg term:%" PRId64 " currentTerm:%" PRId64, pMsg->term, currentTerm); - terrno = TSDB_CODE_SYN_INTERNAL_ERROR; + terrno = TSDB_CODE_SYN_MISMATCHED_SIGNATURE; goto _ERROR; } diff --git a/source/libs/wal/src/walMeta.c b/source/libs/wal/src/walMeta.c index e700ef3d0a..f5e5427c68 100644 --- a/source/libs/wal/src/walMeta.c +++ b/source/libs/wal/src/walMeta.c @@ -339,8 +339,9 @@ bool walLogEntriesComplete(const SWal* pWal) { } if (!complete) { - wError("vgId:%d, WAL log entries 
incomplete in range [%" PRId64 ", %" PRId64 "], aligned with snaphotVer:%" PRId64, - pWal->cfg.vgId, pWal->vers.firstVer, pWal->vers.lastVer, pWal->vers.snapshotVer); + wError("vgId:%d, WAL log entries incomplete in range [%" PRId64 ", %" PRId64 "], index:%" PRId64 + ", snaphotVer:%" PRId64, + pWal->cfg.vgId, pWal->vers.firstVer, pWal->vers.lastVer, index, pWal->vers.snapshotVer); terrno = TSDB_CODE_WAL_LOG_INCOMPLETE; } diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 383e4e9d8a..4cc86d51b7 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -440,6 +440,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_GRANT_GEN_ENC_IVLD_KLEN, "Invalid klen to encod // sync TAOS_DEFINE_ERROR(TSDB_CODE_SYN_TIMEOUT, "Sync timeout") +TAOS_DEFINE_ERROR(TSDB_CODE_SYN_MISMATCHED_SIGNATURE, "Sync signature mismatch") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_NOT_LEADER, "Sync leader is unreachable") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_NEW_CONFIG_ERROR, "Sync new config error") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_PROPOSE_NOT_READY, "Sync not ready to propose") diff --git a/tests/system-test/0-others/multilevel.py b/tests/system-test/0-others/multilevel.py index 66434fff67..def2c3152b 100644 --- a/tests/system-test/0-others/multilevel.py +++ b/tests/system-test/0-others/multilevel.py @@ -17,6 +17,28 @@ from util.cases import * from util.sql import * from util.common import * from util.sqlset import * +import glob + +def scanFiles(pattern): + res = [] + for f in glob.iglob(pattern): + res += [f] + return res + +def checkFiles(pattern, state): + res = scanFiles(pattern) + tdLog.info(res) + num = len(res) + if num: + if state: + tdLog.info("%s: %d files exist. expect: files exist" % (pattern, num)) + else: + tdLog.exit("%s: %d files exist. expect: files not exist." % (pattern, num)) + else: + if state: + tdLog.exit("%s: %d files exist. expect: files exist" % (pattern, num)) + else: + tdLog.info("%s: %d files exist. expect: files not exist." 
% (pattern, num)) class TDTestCase: def init(self, conn, logSql, replicaVar=1): @@ -41,8 +63,8 @@ class TDTestCase: tdDnodes.start(1) tdLog.info("================= step2") - tdSql.haveFile('/mnt/data1/',1) - tdSql.haveFile('/mnt/data2/',0) + checkFiles(r'/mnt/data1/*/*',1) + checkFiles(r'/mnt/data2/*/*',0) tdDnodes.stop(1) def dir_not_exist(self): tdLog.info("============== dir_not_exist test ===============") @@ -156,9 +178,9 @@ class TDTestCase: tdDnodes.start(1) for i in dir_list: if i == '/mnt/data000': - tdSql.haveFile(i,1) + checkFiles("%s/*/*" % i, 1) else: - tdSql.haveFile(i,0) + checkFiles("%s/*/*" % i, 0) def more_than_16_disks(self): tdLog.info("============== more_than_16_disks test ===============") @@ -223,7 +245,8 @@ class TDTestCase: for i in range(10,30): tdSql.execute(f'insert into tb1 values(now-{i}d,10)') tdSql.execute('flush database dbtest') - tdSql.haveFile('/mnt/data1/',1) + time.sleep(3) + checkFiles('/mnt/data1/vnode/*/tsdb/v*',1) tdDnodes.stop(1) cfg={ '/mnt/data1 0 1' : 'dataDir', @@ -234,14 +257,14 @@ class TDTestCase: tdSql.createDir('/mnt/data3') tdDnodes.deploy(1,cfg) tdDnodes.start(1) - tdSql.haveFile('/mnt/data1/',1) - tdSql.haveFile('/mnt/data2/',0) - tdSql.haveFile('/mnt/data3/',0) + checkFiles('/mnt/data1/vnode/*/tsdb/v*',1) + checkFiles('/mnt/data2/vnode/*/tsdb/v*',0) + checkFiles('/mnt/data3/vnode/*/tsdb/v*',0) tdSql.execute('alter database dbtest keep 10d,365d,3650d') tdSql.execute('trim database dbtest') time.sleep(3) - tdSql.haveFile('/mnt/data1/',1) - tdSql.haveFile('/mnt/data2/',1) + checkFiles('/mnt/data1/vnode/*/tsdb/v*',1) + checkFiles('/mnt/data2/vnode/*/tsdb/v*',1) def run(self): self.basic()