diff --git a/include/libs/sync/sync.h b/include/libs/sync/sync.h index 189484d1a6..08a6be8015 100644 --- a/include/libs/sync/sync.h +++ b/include/libs/sync/sync.h @@ -143,10 +143,11 @@ typedef struct SSyncFSM { void* data; int32_t (*FpCommitCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SFsmCbMeta* pMeta); + SyncIndex (*FpAppliedIndexCb)(const struct SSyncFSM* pFsm); int32_t (*FpPreCommitCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SFsmCbMeta* pMeta); void (*FpRollBackCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SFsmCbMeta* pMeta); - void (*FpRestoreFinishCb)(const struct SSyncFSM* pFsm); + void (*FpRestoreFinishCb)(const struct SSyncFSM* pFsm, const SyncIndex commitIdx); void (*FpReConfigCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SReConfigCbMeta* pMeta); void (*FpLeaderTransferCb)(const struct SSyncFSM* pFsm, SRpcMsg* pMsg, const SFsmCbMeta* pMeta); bool (*FpApplyQueueEmptyCb)(const struct SSyncFSM* pFsm); diff --git a/include/util/taoserror.h b/include/util/taoserror.h index 900a6ce57e..f626f49661 100644 --- a/include/util/taoserror.h +++ b/include/util/taoserror.h @@ -542,7 +542,8 @@ int32_t* taosGetErrno(); #define TSDB_CODE_SYN_BATCH_ERROR TAOS_DEF_ERROR_CODE(0, 0x0913) #define TSDB_CODE_SYN_RESTORING TAOS_DEF_ERROR_CODE(0, 0x0914) #define TSDB_CODE_SYN_INVALID_SNAPSHOT_MSG TAOS_DEF_ERROR_CODE(0, 0x0915) // internal -#define TSDB_CODE_SYN_BUFFER_FULL TAOS_DEF_ERROR_CODE(0, 0x0916) // +#define TSDB_CODE_SYN_BUFFER_FULL TAOS_DEF_ERROR_CODE(0, 0x0916) +#define TSDB_CODE_SYN_WRITE_STALL TAOS_DEF_ERROR_CODE(0, 0x0917) #define TSDB_CODE_SYN_INTERNAL_ERROR TAOS_DEF_ERROR_CODE(0, 0x09FF) // tq diff --git a/source/dnode/mnode/impl/inc/mndInt.h b/source/dnode/mnode/impl/inc/mndInt.h index 006c74ef3d..ffb2443808 100644 --- a/source/dnode/mnode/impl/inc/mndInt.h +++ b/source/dnode/mnode/impl/inc/mndInt.h @@ -113,6 +113,7 @@ typedef struct SMnode { bool deploy; char *path; int64_t checkTime; + SyncIndex applied; SSdb *pSdb; SArray *pSteps; SQHandle *pQuery; diff --git a/source/dnode/mnode/impl/src/mndMain.c b/source/dnode/mnode/impl/src/mndMain.c index d83b969e2d..c32212dfc1 100644 --- a/source/dnode/mnode/impl/src/mndMain.c +++ b/source/dnode/mnode/impl/src/mndMain.c @@ -380,11 +380,13 @@ static int32_t mndInitSdb(SMnode *pMnode) { } static int32_t mndOpenSdb(SMnode *pMnode) { + int32_t code = 0; if (!pMnode->deploy) { - return sdbReadFile(pMnode->pSdb); - } else { - return 0; + code = sdbReadFile(pMnode->pSdb); } + + atomic_store_64(&pMnode->applied, pMnode->pSdb->commitIndex); + return code; } static void mndCleanupSdb(SMnode *pMnode) { diff --git a/source/dnode/mnode/impl/src/mndSync.c b/source/dnode/mnode/impl/src/mndSync.c index f702d8f148..18548db56f 100644 --- a/source/dnode/mnode/impl/src/mndSync.c +++ b/source/dnode/mnode/impl/src/mndSync.c @@ -129,6 +129,14 @@ int32_t mndProcessWriteMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta int32_t mndSyncCommitMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { int32_t code = 0; + pMsg->info.conn.applyIndex = pMeta->index; + pMsg->info.conn.applyTerm = pMeta->term; + + if (pMsg->code == 0) { + SMnode *pMnode = pFsm->data; + atomic_store_64(&pMnode->applied, pMsg->info.conn.applyIndex); + } + if (!syncUtilUserCommit(pMsg->msgType)) { goto _out; } @@ -140,6 +148,11 @@ _out: return code; } +SyncIndex mndSyncAppliedIndex(const SSyncFSM *pFSM) { + SMnode *pMnode = pFSM->data; + return atomic_load_64(&pMnode->applied); +} + int32_t mndSyncGetSnapshot(const SSyncFSM *pFsm, SSnapshot *pSnapshot, void *pReaderParam, void **ppReader) { mInfo("start to read snapshot from sdb in atomic way"); SMnode *pMnode = pFsm->data; @@ -153,7 +166,7 @@ static void mndSyncGetSnapshotInfo(const SSyncFSM *pFsm, SSnapshot *pSnapshot) { sdbGetCommitInfo(pMnode->pSdb, &pSnapshot->lastApplyIndex, &pSnapshot->lastApplyTerm, &pSnapshot->lastConfigIndex); } -void mndRestoreFinish(const SSyncFSM *pFsm) { +void mndRestoreFinish(const SSyncFSM *pFsm, const SyncIndex commitIdx) { SMnode *pMnode = pFsm->data; if (!pMnode->deploy) { @@ -167,6 +180,8 @@ void mndRestoreFinish(const SSyncFSM *pFsm) { } else { mInfo("vgId:1, sync restore finished"); } + + ASSERT(commitIdx == mndSyncAppliedIndex(pFsm)); } int32_t mndSnapshotStartRead(const SSyncFSM *pFsm, void *pParam, void **ppReader) { @@ -253,6 +268,7 @@ SSyncFSM *mndSyncMakeFsm(SMnode *pMnode) { SSyncFSM *pFsm = taosMemoryCalloc(1, sizeof(SSyncFSM)); pFsm->data = pMnode; pFsm->FpCommitCb = mndSyncCommitMsg; + pFsm->FpAppliedIndexCb = mndSyncAppliedIndex; pFsm->FpPreCommitCb = NULL; pFsm->FpRollBackCb = NULL; pFsm->FpRestoreFinishCb = mndRestoreFinish; diff --git a/source/dnode/vnode/src/vnd/vnodeSvr.c b/source/dnode/vnode/src/vnd/vnodeSvr.c index 45113d55d3..d3ba7ad608 100644 --- a/source/dnode/vnode/src/vnd/vnodeSvr.c +++ b/source/dnode/vnode/src/vnd/vnodeSvr.c @@ -306,13 +306,7 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t version, SRp void *pReq; int32_t len; int32_t ret; - /* - if (!pVnode->inUse) { - terrno = TSDB_CODE_VND_NO_AVAIL_BUFPOOL; - vError("vgId:%d, not ready to write since %s", TD_VID(pVnode), terrstr()); - return -1; - } - */ + if (version <= pVnode->state.applied) { vError("vgId:%d, duplicate write request. version: %" PRId64 ", applied: %" PRId64 "", TD_VID(pVnode), version, pVnode->state.applied); @@ -326,8 +320,8 @@ int32_t vnodeProcessWriteMsg(SVnode *pVnode, SRpcMsg *pMsg, int64_t version, SRp ASSERT(pVnode->state.applyTerm <= pMsg->info.conn.applyTerm); ASSERT(pVnode->state.applied + 1 == version); - pVnode->state.applied = version; - pVnode->state.applyTerm = pMsg->info.conn.applyTerm; + atomic_store_64(&pVnode->state.applied, version); + atomic_store_64(&pVnode->state.applyTerm, pMsg->info.conn.applyTerm); if (!syncUtilUserCommit(pMsg->msgType)) goto _exit; diff --git a/source/dnode/vnode/src/vnd/vnodeSync.c b/source/dnode/vnode/src/vnd/vnodeSync.c index e71b03d2af..b49ca70bfa 100644 --- a/source/dnode/vnode/src/vnd/vnodeSync.c +++ b/source/dnode/vnode/src/vnd/vnodeSync.c @@ -433,7 +433,23 @@ static int32_t vnodeSyncApplyMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsm } static int32_t vnodeSyncCommitMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { - return vnodeSyncApplyMsg(pFsm, pMsg, pMeta); + if (pMsg->code == 0) { + return vnodeSyncApplyMsg(pFsm, pMsg, pMeta); + } + + const STraceId *trace = &pMsg->info.traceId; + SVnode *pVnode = pFsm->data; + vnodePostBlockMsg(pVnode, pMsg); + + SRpcMsg rsp = {.code = pMsg->code, .info = pMsg->info}; + if (rsp.info.handle != NULL) { + tmsgSendRsp(&rsp); + } + + vGTrace("vgId:%d, msg:%p is freed, code:0x%x index:%" PRId64, TD_VID(pVnode), pMsg, rsp.code, pMeta->index); + rpcFreeCont(pMsg->pCont); + pMsg->pCont = NULL; + return 0; } static int32_t vnodeSyncPreCommitMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { @@ -443,6 +459,11 @@ static int32_t vnodeSyncPreCommitMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const return 0; } +static SyncIndex vnodeSyncAppliedIndex(const SSyncFSM *pFSM) { + SVnode *pVnode = pFSM->data; + return atomic_load_64(&pVnode->state.applied); +} + static void vnodeSyncRollBackMsg(const SSyncFSM *pFsm, SRpcMsg *pMsg, const SFsmCbMeta *pMeta) { SVnode *pVnode = pFsm->data; vTrace("vgId:%d, rollback-cb is excuted, fsm:%p, index:%" PRId64 ", weak:%d, code:%d, state:%d %s, type:%s", @@ -505,21 +526,26 @@ static int32_t vnodeSnapshotDoWrite(const SSyncFSM *pFsm, void *pWriter, void *p return code; } -static void vnodeRestoreFinish(const SSyncFSM *pFsm) { +static void vnodeRestoreFinish(const SSyncFSM *pFsm, const SyncIndex commitIdx) { SVnode *pVnode = pFsm->data; + SyncIndex appliedIdx = -1; do { - int32_t itemSize = tmsgGetQueueSize(&pVnode->msgCb, pVnode->config.vgId, APPLY_QUEUE); - if (itemSize == 0) { - vInfo("vgId:%d, apply queue is empty, restore finish", pVnode->config.vgId); + appliedIdx = vnodeSyncAppliedIndex(pFsm); + ASSERT(appliedIdx <= commitIdx); + if (appliedIdx == commitIdx) { + vInfo("vgId:%d, no items to be applied, restore finish", pVnode->config.vgId); break; } else { - vInfo("vgId:%d, restore not finish since %d items in apply queue", pVnode->config.vgId, itemSize); + vInfo("vgId:%d, restore not finish since %" PRId64 " items to be applied. commit-index:%" PRId64 + ", applied-index:%" PRId64, + pVnode->config.vgId, commitIdx - appliedIdx, commitIdx, appliedIdx); taosMsleep(10); } } while (true); - walApplyVer(pVnode->pWal, pVnode->state.applied); + ASSERT(commitIdx == vnodeSyncAppliedIndex(pFsm)); + walApplyVer(pVnode->pWal, commitIdx); pVnode->restored = true; vInfo("vgId:%d, sync restore finished", pVnode->config.vgId); @@ -569,6 +595,7 @@ static SSyncFSM *vnodeSyncMakeFsm(SVnode *pVnode) { SSyncFSM *pFsm = taosMemoryCalloc(1, sizeof(SSyncFSM)); pFsm->data = pVnode; pFsm->FpCommitCb = vnodeSyncCommitMsg; + pFsm->FpAppliedIndexCb = vnodeSyncAppliedIndex; pFsm->FpPreCommitCb = vnodeSyncPreCommitMsg; pFsm->FpRollBackCb = vnodeSyncRollBackMsg; pFsm->FpGetSnapshotInfo = vnodeSyncGetSnapshotInfo; diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index 90f79fd93c..e5fe4a2369 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -2160,8 +2160,8 @@ int32_t syncNodeAppend(SSyncNode* ths, SSyncRaftEntry* pEntry) { // append to log buffer if (syncLogBufferAppend(ths->pLogBuf, ths, pEntry) < 0) { sError("vgId:%d, failed to enqueue sync log buffer, index:%" PRId64, ths->vgId, pEntry->index); - terrno = TSDB_CODE_SYN_BUFFER_FULL; - (void)syncLogFsmExecute(ths, ths->pFsm, ths->state, raftStoreGetTerm(ths), pEntry, TSDB_CODE_SYN_BUFFER_FULL); + ASSERT(terrno != 0); + (void)syncLogFsmExecute(ths, ths->pFsm, ths->state, raftStoreGetTerm(ths), pEntry, terrno); syncEntryDestroy(pEntry); return -1; } diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index cc1a40a430..6600b505c1 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -48,7 +48,16 @@ int32_t syncLogBufferAppend(SSyncLogBuffer* pBuf, SSyncNode* pNode, SSyncRaftEnt SyncIndex index = pEntry->index; if (index - pBuf->startIndex >= pBuf->size) { - sError("vgId:%d, failed to append due to sync log buffer full. index:%" PRId64 "", pNode->vgId, index); + terrno = TSDB_CODE_SYN_BUFFER_FULL; + sError("vgId:%d, failed to append since %s. index:%" PRId64 "", pNode->vgId, terrstr(), index); + goto _err; + } + + SyncIndex appliedIndex = pNode->pFsm->FpAppliedIndexCb(pNode->pFsm); + if (pNode->restoreFinish && pBuf->commitIndex - appliedIndex >= pBuf->size) { + terrno = TSDB_CODE_SYN_WRITE_STALL; + sError("vgId:%d, failed to append since %s. index:%" PRId64 ", commit-index:%" PRId64 ", applied-index:%" PRId64, + pNode->vgId, terrstr(), index, pBuf->commitIndex, appliedIndex); goto _err; } @@ -475,7 +484,7 @@ _out: int32_t syncLogFsmExecute(SSyncNode* pNode, SSyncFSM* pFsm, ESyncState role, SyncTerm term, SSyncRaftEntry* pEntry, int32_t applyCode) { - if ((pNode->replicaNum == 1) && pNode->restoreFinish && pNode->vgId != 1) { + if (pNode->replicaNum == 1 && pNode->restoreFinish && pNode->vgId != 1) { return 0; } @@ -587,10 +596,10 @@ _out: // mark as restored if needed if (!pNode->restoreFinish && pBuf->commitIndex >= pNode->commitIndex && pEntry != NULL && currentTerm <= pEntry->term) { - pNode->pFsm->FpRestoreFinishCb(pNode->pFsm); + pNode->pFsm->FpRestoreFinishCb(pNode->pFsm, pBuf->commitIndex); pNode->restoreFinish = true; - sInfo("vgId:%d, restore finished. log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", pNode->vgId, - pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); + sInfo("vgId:%d, restore finished. term:%" PRId64 ", log buffer: [%" PRId64 " %" PRId64 " %" PRId64 ", %" PRId64 ")", + pNode->vgId, currentTerm, pBuf->startIndex, pBuf->commitIndex, pBuf->matchIndex, pBuf->endIndex); } if (!inBuf) { diff --git a/source/util/src/terror.c b/source/util/src/terror.c index 61071796df..b84d58bfe8 100644 --- a/source/util/src/terror.c +++ b/source/util/src/terror.c @@ -423,6 +423,7 @@ TAOS_DEFINE_ERROR(TSDB_CODE_SYN_BATCH_ERROR, "Sync batch error") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_RESTORING, "Sync leader is restoring") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_INVALID_SNAPSHOT_MSG, "Sync invalid snapshot msg") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_BUFFER_FULL, "Sync buffer is full") +TAOS_DEFINE_ERROR(TSDB_CODE_SYN_WRITE_STALL, "Sync write stall") TAOS_DEFINE_ERROR(TSDB_CODE_SYN_INTERNAL_ERROR, "Sync internal error") //tq